nvidia-texture-tools/extern/CMP_Core/shaders/BC7_Encode_Kernel.cpp

//=====================================================================
// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//=====================================================================
// Ref: GPUOpen-Tools/Compressonator
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2016, Intel Corporation
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of
// the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//--------------------------------------
// Common BC7 Header
//--------------------------------------
#include "BC7_Encode_Kernel.h"
#ifndef ASPM
//---------------------------------------------
// Predefinitions for GPU and CPU compiled code
//---------------------------------------------
#define ENABLE_CODE
#ifndef ASPM_GPU
// using code for CPU or hybrid (CPU/GPU)
//#include "BC7.h"
#endif
INLINE CGU_INT a_compare( const void *arg1, const void *arg2 )
{
if (((CMP_di* )arg1)->image-((CMP_di* )arg2)->image > 0 ) return 1;
if (((CMP_di* )arg1)->image-((CMP_di* )arg2)->image < 0 ) return -1;
return 0;
};
#endif
#ifndef ASPM_GPU
CMP_GLOBAL BC7_EncodeRamps BC7EncodeRamps
#ifndef ASPM
= {0}
#endif
;
//---------------------------------------------
// CPU: Computes max of two float values
//---------------------------------------------
float bc7_maxf(float l1, float r1)
{
return (l1 > r1 ? l1 : r1);
}
//---------------------------------------------
// CPU: Computes min of two float values
//---------------------------------------------
float bc7_minf(float l1, float r1)
{
return (l1 < r1 ? l1 : r1);
}
#endif
INLINE CGV_EPOCODE shift_right_epocode(CGV_EPOCODE v, CGU_INT bits)
{
return v>>bits; // (perf warning expected)
}
INLINE CGV_EPOCODE expand_epocode(CGV_EPOCODE v, CGU_INT bits)
{
CGV_EPOCODE vv = v<<(8-bits);
return vv + shift_right_epocode(vv, bits);
}
// valid bit range is 4..8 (the shift amount 2*bits-8 must be non-negative)
CGU_INT expandbits(CGU_INT bits, CGU_INT v)
{
return ( v << (8-bits) | v >> (2* bits - 8));
}
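// Editorial note (illustrative): both expand_epocode() and expandbits() widen a
// quantized endpoint to 8 bits by replicating its top bits into the low bits,
// e.g. expandbits(5, 0x1F) = (0x1F<<3) | (0x1F>>2) = 0xF8 | 0x07 = 0xFF, and
// expandbits(5, 0x10) = 0x80 | 0x04 = 0x84, so the 5-bit range 0..31 maps onto
// the full 0..255 range with 31 -> 255.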
CMP_EXPORT CGU_INT bc7_isa() {
#if defined(ISPC_TARGET_SSE2)
ASPM_PRINT(("SSE2"));
return 0;
#elif defined(ISPC_TARGET_SSE4)
ASPM_PRINT(("SSE4"));
return 1;
#elif defined(ISPC_TARGET_AVX)
ASPM_PRINT(("AVX"));
return 2;
#elif defined(ISPC_TARGET_AVX2)
ASPM_PRINT(("AVX2"));
return 3;
#else
ASPM_PRINT(("CPU"));
return -1;
#endif
}
CMP_EXPORT void init_BC7ramps()
{
#ifdef ASPM_GPU
#else
CMP_STATIC CGU_BOOL g_rampsInitialized = FALSE;
if (g_rampsInitialized == TRUE) return;
g_rampsInitialized = TRUE;
BC7EncodeRamps.ramp_init = TRUE;
//bc7_isa(); ASPM_PRINT((" INIT Ramps\n"));
CGU_INT bits;
CGU_INT p1;
CGU_INT p2;
CGU_INT clogBC7;
CGU_INT index;
CGU_INT j;
CGU_INT o1;
CGU_INT o2;
CGU_INT maxi = 0;
for (bits = BIT_BASE; bits<BIT_RANGE; bits++)
{
for (p1 = 0; p1<(1 << bits); p1++)
{
BC7EncodeRamps.ep_d[BTT(bits)][p1] = expandbits(bits, p1);
} //p1
}//bits<BIT_RANGE
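// Editorial note (illustrative): the long index expressions below flatten
// multi-dimensional tables into 1-D arrays. Assuming CLT() and BTT() map
// clogBC7 and bits down to zero-based table slots (as they are used throughout
// this file), the layouts are:
//   ramp  [CLT][BTT][p1:256][p2:256][index:16]
//   sp_idx[CLT][BTT][value:256][o1:2][o2:2][index:16][endpoint:2]
//   sp_err[CLT][BTT][value:256][o1:2][o2:2][index:16]
// so e.g. (CLT(clogBC7)*4*256*256*16)+(BTT(bits)*256*256*16)+(p1*256*16)+(p2*16)+index
// is just ramp[CLT(clogBC7)][BTT(bits)][p1][p2][index].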
for (clogBC7 = LOG_CL_BASE; clogBC7<LOG_CL_RANGE; clogBC7++)
{
for (bits = BIT_BASE; bits<BIT_RANGE; bits++)
{
#ifdef USE_BC7_RAMP
for (p1 = 0; p1<(1 << bits); p1++)
{
for (p2 = 0; p2<(1 << bits); p2++)
{
for (index = 0; index<(1 << clogBC7); index++)
{
if (index > maxi) maxi = index;
BC7EncodeRamps.ramp[(CLT(clogBC7)*4*256*256*16)+(BTT(bits)*256*256*16)+(p1*256*16)+(p2*16)+index] =
//floor((CGV_IMAGE)BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] * (CGV_IMAGE)((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1]))+ 0.5F);
floor(BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] *((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1]))+ 0.5F);
}//index<(1 << clogBC7)
}//p2<(1 << bits)
}//p1<(1 << bits)
#endif
#ifdef USE_BC7_SP_ERR_IDX
for (j = 0; j<256; j++)
{
for (o1 = 0; o1<2; o1++)
{
for (o2 = 0; o2<2; o2++)
{
for (index = 0; index<16; index++) {
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] = 0;
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1] = 255;
BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(j*2*2*16)+(o1*2*16)+(o2*16)+index] = 255;
} // i<16
}//o2<2;
}//o1<2
} //j<256
for (p1 = 0; p1<(1 << bits); p1++)
{
for (p2 = 0; p2<(1 << bits); p2++)
{
for (index = 0; index<(1 << clogBC7); index++)
{
#ifdef USE_BC7_RAMP
CGV_EPOCODE floatf = (CGV_EPOCODE)BC7EncodeRamps.ramp[(CLT(clogBC7)*4*256*256*16)+(BTT(bits)*256*256*16)+(p1*256*16)+(p2*16)+index];
#else
CGV_EPOCODE floatf = floor((CGV_IMAGE)BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] * (CGV_IMAGE)((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1]))+ 0.5F);
#endif
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(floatf*2*2*16*2)+((p1 & 0x1)*2*16*2)+((p2 & 0x1)*16*2)+(index*2)+0] = p1;
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(floatf*2*2*16*2)+((p1 & 0x1)*2*16*2)+((p2 & 0x1)*16*2)+(index*2)+1] = p2;
BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(floatf*2*2*16)+((p1 & 0x1)*2*16)+((p2 & 0x1)*16)+index] = 0;
} //i<(1 << clogBC7)
} //p2
}//p1<(1 << bits)
for (j = 0; j<256; j++)
{
for (o1 = 0; o1<2; o1++)
{
for (o2 = 0; o2<2; o2++)
{
for (index = 0; index<(1 << clogBC7); index++)
{
if ( // check for uninitialized sp_idx
(BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] == 0) &&
(BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1] == 255)
)
{
CGU_INT k;
CGU_INT tf;
CGU_INT tc;
for (k = 1; k<256; k++)
{
tf = j - k;
tc = j + k;
if ((tf >= 0 && BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(tf*2*2*16)+(o1*2*16)+(o2*16)+index] == 0))
{
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] = BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(tf*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0];
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1] = BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(tf*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1];
break;
}
else if ((tc < 256 && BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(tc*2*2*16)+(o1*2*16)+(o2*16)+index] == 0))
{
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] = BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(tc*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0];
// mirror the tf branch above: copy the second endpoint as well, so the pair is not left half-initialized
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1] = BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(tc*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1];
break;
}
}
//BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(j*2*2*16)+(o1*2*16)+(o2*16)+index] = (CGV_ERROR) k;
BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(j*2*2*16)+(o1*2*16)+(o2*16)+index] = (CGU_UINT8)k;
} //sp_idx < 0
}//i<(1 << clogBC7)
}//o2
}//o1
}//j
#endif
} //bits<BIT_RANGE
} //clogBC7<LOG_CL_RANGE
#endif
}
//----------------------------------------------------------
//====== Common BC7 ASPM Code used for SPMD (CPU/GPU) ======
//----------------------------------------------------------
#ifndef ASPM_GPU
//#define USE_ICMP
#endif
#define SOURCE_BLOCK_SIZE 16 // Size of a source block in pixels (each pixel has RGBA:8888 channels)
#define COMPRESSED_BLOCK_SIZE 16 // Size of a compressed block in bytes
#define MAX_CHANNELS 4
#define MAX_SUBSETS 3 // Maximum number of possible subsets
#define MAX_SUBSET_SIZE 16 // Largest possible size for an individual subset
#ifndef ASPM_GPU
extern "C" CGU_INT timerStart(CGU_INT id);
extern "C" CGU_INT timerEnd(CGU_INT id);
#define TIMERSTART(x) timerStart(x)
#define TIMEREND(x) timerEnd(x)
#else
#define TIMERSTART(x)
#define TIMEREND(x)
#endif
#ifdef ASPM_GPU
#define GATHER_UINT8(x,y) x[y]
#else
#define GATHER_UINT8(x,y) gather_uint8(x,y)
#endif
// INLINE CGV_BYTE gather_uint8 (CMP_CONSTANT CGU_UINT8 * __constant uniform ptr, CGV_INT idx)
// {
// return ptr[idx]; // (perf warning expected)
// }
//
// INLINE CGV_CMPOUT gather_cmpout(CMP_CONSTANT CGV_CMPOUT * __constant uniform ptr, CGU_INT idx)
// {
// return ptr[idx]; // (perf warning expected)
// }
//
//INLINE CGV_INDEX gather_index(CMP_CONSTANT varying CGV_INDEX* __constant uniform ptr, CGV_INT idx)
//{
// return ptr[idx]; // (perf warning expected)
//}
//
//INLINE void scatter_index(CGV_INDEX* ptr, CGV_INT idx, CGV_INDEX value)
//{
// ptr[idx] = value; // (perf warning expected)
//}
//
#ifdef USE_VARYING
INLINE CGV_EPOCODE gather_epocode(CMP_CONSTANT CGV_EPOCODE* ptr, CGV_TYPEINT idx)
{
return ptr[idx]; // (perf warning expected)
}
#endif
INLINE CGV_SHIFT32 gather_partid(CMP_CONSTANT CGV_SHIFT32 * uniform ptr, CGV_PARTID idx)
{
return ptr[idx]; // (perf warning expected)
}
//INLINE CGV_BYTE gather_vuint8(CMP_CONSTANT varying CGV_BYTE* __constant uniform ptr, CGV_INT idx)
//{
// return ptr[idx]; // (perf warning expected)
//}
INLINE void cmp_swap_epo(CGV_EPOCODE u[], CGV_EPOCODE v[], CGV_EPOCODE n)
{
for (CGU_INT i=0; i<n; i++)
{
CGV_EPOCODE t = u[i];
u[i] = v[i];
v[i] = t;
}
}
INLINE void cmp_swap_index(CGV_INDEX u[], CGV_INDEX v[], CGU_INT n)
{
for (CGU_INT i=0; i<n; i++)
{
CGV_INDEX t = u[i];
u[i] = v[i];
v[i] = t;
}
}
void cmp_memsetBC7(CGV_BYTE ptr[], CGV_BYTE value, CGU_UINT32 size)
{
for (CGV_SHIFT32 i=0; i<size; i++)
{
ptr[i] = value;
}
}
void cmp_memcpy(CMP_GLOBAL CGU_UINT8 dst[],CGU_UINT8 src[],CGU_UINT32 size)
{
#ifdef ASPM_GPU
for (CGV_INT i=0; i<size; i++)
{
dst[i] = src[i];
}
#else
memcpy(dst,src,size);
#endif
}
INLINE CGV_IMAGE sq_image(CGV_IMAGE v)
{
return v*v;
}
INLINE CGV_EPOCODE clampEPO(CGV_EPOCODE v, CGV_EPOCODE a, CGV_EPOCODE b)
{
if (v < a)
return a;
else
if (v > b)
return b;
return v;
}
INLINE CGV_INDEX clampIndex(CGV_INDEX v, CGV_INDEX a, CGV_INDEX b)
{
if (v < a)
return a;
else
if (v > b)
return b;
return v;
}
INLINE CGV_SHIFT32 shift_right_uint32(CGV_SHIFT32 v, CGU_INT bits)
{
return v>>bits; // (perf warning expected)
}
INLINE CGV_BYTE shift_right_uint8(CGV_BYTE v, CGU_UINT8 bits)
{
return v>>bits; // (perf warning expected)
}
INLINE CGV_BYTE shift_right_uint8V(CGV_BYTE v, CGV_UINT8 bits)
{
return v>>bits; // (perf warning expected)
}
// valid bit range is 0..8
INLINE CGV_EPOCODE expandEPObits(CGV_EPOCODE v, uniform CGV_EPOCODE bits)
{
CGV_EPOCODE vv = v<<(8-bits);
return vv + shift_right_uint32(vv, bits);
}
CGV_ERROR err_absf(CGV_ERROR a) { return a>0.0F?a:-a;}
CGV_IMAGE img_absf(CGV_IMAGE a) { return a>0.0F?a:-a;}
CGU_UINT8 min8(CGU_UINT8 a, CGU_UINT8 b) { return a<b?a:b;}
CGU_UINT8 max8(CGU_UINT8 a, CGU_UINT8 b) { return a>b?a:b;}
void pack_index(CGV_INDEXPACKED packed_index[2], CGV_INDEX src_index[MAX_SUBSET_SIZE])
{
// Converts from unpacked index to packed index
packed_index[0] = 0x0000;
packed_index[1] = 0x0000;
CGV_BYTE shift = 0; // was CGV_UINT8
for (CGU_INT k=0; k<16; k++)
{
packed_index[k/8] |= (CGV_UINT32)(src_index[k]&0x0F) << shift;
if (k == 7)
shift = 0; // restart the nibble offset for the second 32-bit word (mirrors unpack_index)
else
shift +=4;
}
}
void unpack_index(CGV_INDEX unpacked_index[MAX_SUBSET_SIZE],CGV_INDEXPACKED src_packed[2])
{
// Converts from packed index to unpacked index
CGV_BYTE shift = 0; // was CGV_UINT8
for (CGV_BYTE k=0; k<16; k++)
{
unpacked_index[k] = (CGV_BYTE)(src_packed[k/8] >> shift)&0xF;
if (k == 7)
shift = 0;
else
shift +=4;
}
}
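// Editorial note (illustrative): pack_index()/unpack_index() are inverses.
// For src_index = {0,1,2,...,15} the packed words come out as
// packed[0] = 0x76543210 and packed[1] = 0xFEDCBA98 (one 4-bit index per
// nibble, lowest nibble first), and unpack_index() recovers the original list.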
//====================================== CMP MATH UTILS ============================================
CGV_ERROR err_Total(
CGV_IMAGE image_src1[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_IMAGE image_src2[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_ENTRIES numEntries, // < 16
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
{
CGV_ERROR err_t=0.0F;
for (CGU_CHANNEL ch=0;ch<channels3or4; ch++)
for (CGV_ENTRIES k=0;k<numEntries;k++)
{
err_t = err_t + sq_image(image_src1[k+ch*SOURCE_BLOCK_SIZE]-image_src2[k+ch*SOURCE_BLOCK_SIZE]);
}
return err_t;
};
void GetImageCentered(
CGV_IMAGE image_centered_out[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_IMAGE mean_out[MAX_CHANNELS],
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_ENTRIES numEntries, // < 16
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
{
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
mean_out[ch]=0.0F;
if (numEntries > 0)
{
for (CGV_ENTRIES k=0;k<numEntries;k++)
{
mean_out[ch] = mean_out[ch] + image_src[k+(ch*SOURCE_BLOCK_SIZE)];
}
mean_out[ch] /= numEntries;
for (CGV_ENTRIES k=0;k<numEntries;k++)
image_centered_out[k+(ch*SOURCE_BLOCK_SIZE)] = image_src[k+(ch*SOURCE_BLOCK_SIZE)] - mean_out[ch];
}
}
}
void GetCovarianceVector(
CGV_IMAGE covariance_out[MAX_CHANNELS*MAX_CHANNELS], // OUT: Covariance vector
CGV_IMAGE image_centered[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_ENTRIES numEntries, // < 16
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
{
for (CGU_CHANNEL ch1=0; ch1<channels3or4; ch1++)
for (CGU_CHANNEL ch2=0;ch2<=ch1;ch2++)
{
covariance_out[ch1+ch2*4]=0;
for (CGV_ENTRIES k=0;k<numEntries;k++)
covariance_out[ch1+ch2*4] += image_centered[k+(ch1*SOURCE_BLOCK_SIZE)]*image_centered[k+(ch2*SOURCE_BLOCK_SIZE)];
}
for (CGU_CHANNEL ch1=0; ch1<channels3or4; ch1++)
for (CGU_CHANNEL ch2=ch1+1;ch2<channels3or4;ch2++)
covariance_out[ch1+ch2*4] = covariance_out[ch2+ch1*4];
}
void GetProjecedImage(
CGV_IMAGE projection_out[SOURCE_BLOCK_SIZE], //output projected data
CGV_IMAGE image_centered[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_ENTRIES numEntries, // < 16
CGV_IMAGE EigenVector[MAX_CHANNELS],
CGU_CHANNEL channels3or4) // 3 = RGB or 4 = RGBA
{
projection_out[0] = 0.0F;
// EigenVector must be normalized
for (CGV_ENTRIES k=0; k<numEntries; k++)
{
projection_out[k]=0.0F;
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
projection_out[k] = projection_out[k] + (image_centered[k+(ch*SOURCE_BLOCK_SIZE)]*EigenVector[ch]);
}
}
}
INLINE CGV_UINT8 get_partition_subset(CGV_INT part_id, CGU_INT maxSubsets, CGV_INT index)
{
CMP_STATIC uniform CMP_CONSTANT CGU_UINT32 subset_mask_table[] = {
// 2 subset region patterns
0x0000CCCCu, // 0 1100 1100 1100 1100 (MSB..LSB)
0x00008888u, // 1 1000 1000 1000 1000
0x0000EEEEu, // 2 1110 1110 1110 1110
0x0000ECC8u, // 3 1110 1100 1100 1000
0x0000C880u, // 4 1100 1000 1000 0000
0x0000FEECu, // 5 1111 1110 1110 1100
0x0000FEC8u, // 6 1111 1110 1100 1000
0x0000EC80u, // 7 1110 1100 1000 0000
0x0000C800u, // 8 1100 1000 0000 0000
0x0000FFECu, // 9 1111 1111 1110 1100
0x0000FE80u, // 10 1111 1110 1000 0000
0x0000E800u, // 11 1110 1000 0000 0000
0x0000FFE8u, // 12 1111 1111 1110 1000
0x0000FF00u, // 13 1111 1111 0000 0000
0x0000FFF0u, // 14 1111 1111 1111 0000
0x0000F000u, // 15 1111 0000 0000 0000
0x0000F710u, // 16 1111 0111 0001 0000
0x0000008Eu, // 17 0000 0000 1000 1110
0x00007100u, // 18 0111 0001 0000 0000
0x000008CEu, // 19 0000 1000 1100 1110
0x0000008Cu, // 20 0000 0000 1000 1100
0x00007310u, // 21 0111 0011 0001 0000
0x00003100u, // 22 0011 0001 0000 0000
0x00008CCEu, // 23 1000 1100 1100 1110
0x0000088Cu, // 24 0000 1000 1000 1100
0x00003110u, // 25 0011 0001 0001 0000
0x00006666u, // 26 0110 0110 0110 0110
0x0000366Cu, // 27 0011 0110 0110 1100
0x000017E8u, // 28 0001 0111 1110 1000
0x00000FF0u, // 29 0000 1111 1111 0000
0x0000718Eu, // 30 0111 0001 1000 1110
0x0000399Cu, // 31 0011 1001 1001 1100
0x0000AAAAu, // 32 1010 1010 1010 1010
0x0000F0F0u, // 33 1111 0000 1111 0000
0x00005A5Au, // 34 0101 1010 0101 1010
0x000033CCu, // 35 0011 0011 1100 1100
0x00003C3Cu, // 36 0011 1100 0011 1100
0x000055AAu, // 37 0101 0101 1010 1010
0x00009696u, // 38 1001 0110 1001 0110
0x0000A55Au, // 39 1010 0101 0101 1010
0x000073CEu, // 40 0111 0011 1100 1110
0x000013C8u, // 41 0001 0011 1100 1000
0x0000324Cu, // 42 0011 0010 0100 1100
0x00003BDCu, // 43 0011 1011 1101 1100
0x00006996u, // 44 0110 1001 1001 0110
0x0000C33Cu, // 45 1100 0011 0011 1100
0x00009966u, // 46 1001 1001 0110 0110
0x00000660u, // 47 0000 0110 0110 0000
0x00000272u, // 48 0000 0010 0111 0010
0x000004E4u, // 49 0000 0100 1110 0100
0x00004E40u, // 50 0100 1110 0100 0000
0x00002720u, // 51 0010 0111 0010 0000
0x0000C936u, // 52 1100 1001 0011 0110
0x0000936Cu, // 53 1001 0011 0110 1100
0x000039C6u, // 54 0011 1001 1100 0110
0x0000639Cu, // 55 0110 0011 1001 1100
0x00009336u, // 56 1001 0011 0011 0110
0x00009CC6u, // 57 1001 1100 1100 0110
0x0000817Eu, // 58 1000 0001 0111 1110
0x0000E718u, // 59 1110 0111 0001 1000
0x0000CCF0u, // 60 1100 1100 1111 0000
0x00000FCCu, // 61 0000 1111 1100 1100
0x00007744u, // 62 0111 0111 0100 0100
0x0000EE22u, // 63 1110 1110 0010 0010
// 3 Subset region patterns
0xF60008CCu,// 0 1111 0110 0000 0000 : 0000 1000 1100 1100 = 2222122011001100 (MSB...LSB)
0x73008CC8u,// 1 0111 0011 0000 0000 : 1000 1100 1100 1000 = 1222112211001000
0x3310CC80u,// 2 0011 0011 0001 0000 : 1100 1100 1000 0000 = 1122112210020000
0x00CEEC00u,// 3 0000 0000 1100 1110 : 1110 1100 0000 0000 = 1110110022002220
0xCC003300u,// 4 1100 1100 0000 0000 : 0011 0011 0000 0000 = 2211221100000000
0xCC0000CCu,// 5 1100 1100 0000 0000 : 0000 0000 1100 1100 = 2200220011001100
0x00CCFF00u,// 6 0000 0000 1100 1100 : 1111 1111 0000 0000 = 1111111122002200
0x3300CCCCu,// 7 0011 0011 0000 0000 : 1100 1100 1100 1100 = 1122112211001100
0xF0000F00u,// 8 1111 0000 0000 0000 : 0000 1111 0000 0000 = 2222111100000000
0xF0000FF0u,// 9 1111 0000 0000 0000 : 0000 1111 1111 0000 = 2222111111110000
0xFF0000F0u,// 10 1111 1111 0000 0000 : 0000 0000 1111 0000 = 2222222211110000
0x88884444u,// 11 1000 1000 1000 1000 : 0100 0100 0100 0100 = 2100210021002100
0x88886666u,// 12 1000 1000 1000 1000 : 0110 0110 0110 0110 = 2110211021102110
0xCCCC2222u,// 13 1100 1100 1100 1100 : 0010 0010 0010 0010 = 2210221022102210
0xEC80136Cu,// 14 1110 1100 1000 0000 : 0001 0011 0110 1100 = 2221221121101100
0x7310008Cu,// 15 0111 0011 0001 0000 : 0000 0000 1000 1100 = 0222002210021100
0xC80036C8u,// 16 1100 1000 0000 0000 : 0011 0110 1100 1000 = 2211211011001000
0x310008CEu,// 17 0011 0001 0000 0000 : 0000 1000 1100 1110 = 0022100211001110
0xCCC03330u,// 18 1100 1100 1100 0000 : 0011 0011 0011 0000 = 2211221122110000
0x0CCCF000u,// 19 0000 1100 1100 1100 : 1111 0000 0000 0000 = 1111220022002200
0xEE0000EEu,// 20 1110 1110 0000 0000 : 0000 0000 1110 1110 = 2220222011101110
0x77008888u,// 21 0111 0111 0000 0000 : 1000 1000 1000 1000 = 1222122210001000
0xCC0022C0u,// 22 1100 1100 0000 0000 : 0010 0010 1100 0000 = 2210221011000000
0x33004430u,// 23 0011 0011 0000 0000 : 0100 0100 0011 0000 = 0122012200110000
0x00CC0C22u,// 24 0000 0000 1100 1100 : 0000 1100 0010 0010 = 0000110022102210
0xFC880344u,// 25 1111 1100 1000 1000 : 0000 0011 0100 0100 = 2222221121002100
0x06606996u,// 26 0000 0110 0110 0000 : 0110 1001 1001 0110 = 0110122112210110
0x66009960u,// 27 0110 0110 0000 0000 : 1001 1001 0110 0000 = 1221122101100000
0xC88C0330u,// 28 1100 1000 1000 1100 : 0000 0011 0011 0000 = 2200201120112200
0xF9000066u,// 29 1111 1001 0000 0000 : 0000 0000 0110 0110 = 2222200201100110
0x0CC0C22Cu,// 30 0000 1100 1100 0000 : 1100 0010 0010 1100 = 1100221022101100
0x73108C00u,// 31 0111 0011 0001 0000 : 1000 1100 0000 0000 = 1222112200020000
0xEC801300u,// 32 1110 1100 1000 0000 : 0001 0011 0000 0000 = 2221221120000000
0x08CEC400u,// 33 0000 1000 1100 1110 : 1100 0100 0000 0000 = 1100210022002220
0xEC80004Cu,// 34 1110 1100 1000 0000 : 0000 0000 0100 1100 = 2220220021001100
0x44442222u,// 35 0100 0100 0100 0100 : 0010 0010 0010 0010 = 0210021002100210
0x0F0000F0u,// 36 0000 1111 0000 0000 : 0000 0000 1111 0000 = 0000222211110000
0x49242492u,// 37 0100 1001 0010 0100 : 0010 0100 1001 0010 = 0210210210210210
0x42942942u,// 38 0100 0010 1001 0100 : 0010 1001 0100 0010 = 0210102121020210
0x0C30C30Cu,// 39 0000 1100 0011 0000 : 1100 0011 0000 1100 = 1100221100221100
0x03C0C03Cu,// 40 0000 0011 1100 0000 : 1100 0000 0011 1100 = 1100002222111100
0xFF0000AAu,// 41 1111 1111 0000 0000 : 0000 0000 1010 1010 = 2222222210101010
0x5500AA00u,// 42 0101 0101 0000 0000 : 1010 1010 0000 0000 = 1212121200000000
0xCCCC3030u,// 43 1100 1100 1100 1100 : 0011 0000 0011 0000 = 2211220022112200
0x0C0CC0C0u,// 44 0000 1100 0000 1100 : 1100 0000 1100 0000 = 1100220011002200
0x66669090u,// 45 0110 0110 0110 0110 : 1001 0000 1001 0000 = 1221022012210220
0x0FF0A00Au,// 46 0000 1111 1111 0000 : 1010 0000 0000 1010 = 1010222222221010
0x5550AAA0u,// 47 0101 0101 0101 0000 : 1010 1010 1010 0000 = 1212121212120000
0xF0000AAAu,// 48 1111 0000 0000 0000 : 0000 1010 1010 1010 = 2222101010101010
0x0E0EE0E0u,// 49 0000 1110 0000 1110 : 1110 0000 1110 0000 = 1110222011102220
0x88887070u,// 50 1000 1000 1000 1000 : 0111 0000 0111 0000 = 2111200021112000
0x99906660u,// 51 1001 1001 1001 0000 : 0110 0110 0110 0000 = 2112211221120000
0xE00E0EE0u,// 52 1110 0000 0000 1110 : 0000 1110 1110 0000 = 2220111011102220
0x88880770u,// 53 1000 1000 1000 1000 : 0000 0111 0111 0000 = 2000211121112000
0xF0000666u,// 54 1111 0000 0000 0000 : 0000 0110 0110 0110 = 2222011001100110
0x99006600u,// 55 1001 1001 0000 0000 : 0110 0110 0000 0000 = 2112211200000000
0xFF000066u,// 56 1111 1111 0000 0000 : 0000 0000 0110 0110 = 2222222201100110
0xC00C0CC0u,// 57 1100 0000 0000 1100 : 0000 1100 1100 0000 = 2200110011002200
0xCCCC0330u,// 58 1100 1100 1100 1100 : 0000 0011 0011 0000 = 2200221122112200
0x90006000u,// 59 1001 0000 0000 0000 : 0110 0000 0000 0000 = 2112000000000000
0x08088080u,// 60 0000 1000 0000 1000 : 1000 0000 1000 0000 = 1000200010002000
0xEEEE1010u,// 61 1110 1110 1110 1110 : 0001 0000 0001 0000 = 2221222022212220
0xFFF0000Au,// 62 1111 1111 1111 0000 : 0000 0000 0000 1010 = 2222222222221010
0x731008CEu,// 63 0111 0011 0001 0000 : 0000 1000 1100 1110 = 0222102211021110
};
if (maxSubsets == 2)
{
CGV_UINT32 mask_packed = subset_mask_table[part_id];
return ((mask_packed & (0x01<<index))?1:0); // This can be moved to caller, just return mask!!
}
// 3 region subsets
part_id += 64;
CGV_UINT32 mask0 = subset_mask_table[part_id] & 0xFFFF;
CGV_UINT32 mask1 = subset_mask_table[part_id] >> 16;
CGV_UINT32 mask = 0x01 << index;
return (((mask1 & mask)?2:0) + ((mask0 & mask)?1:0)); // This can be moved to caller, just return mask!!
}
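// Editorial note (illustrative): each 2-subset entry stores one bit per pixel,
// LSB = pixel 0. For partition 0 (0x0000CCCC = 1100 1100 1100 1100b) pixels
// 2,3,6,7,10,11,14,15 land in subset 1 and the rest in subset 0. For 3-subset
// shapes the low and high 16-bit halves hold the two bits of the subset id,
// so subset = ((mask1>>i)&1)*2 + ((mask0>>i)&1), giving values 0..2.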
void GetPartitionSubSet_mode01237(
CGV_IMAGE subsets_out[MAX_SUBSETS][SOURCE_BLOCK_SIZE][MAX_CHANNELS], // OUT: Subset pattern mapped with image src colors
CGV_ENTRIES entryCount_out[MAX_SUBSETS], // OUT: Number of entries per subset
CGV_UINT8 partition, // Partition Shape 0..63
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS], // Image colors
CGU_INT blockMode, // [0,1,2,3 or 7]
CGU_CHANNEL channels3or4) // 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
{
CGU_UINT8 maxSubsets = 2; if (blockMode == 0 || blockMode == 2) maxSubsets = 3;
entryCount_out[0] = 0;
entryCount_out[1] = 0;
entryCount_out[2] = 0;
for (CGV_INT i = 0; i < MAX_SUBSET_SIZE; i++)
{
CGV_UINT8 subset = get_partition_subset(partition,maxSubsets,i);
for (CGU_INT ch = 0; ch<3; ch++)
subsets_out[subset][entryCount_out[subset]][ch] = image_src[i+(ch*SOURCE_BLOCK_SIZE)];
//subsets_out[subset*64+(entryCount_out[subset]*MAX_CHANNELS+ch)] = image_src[i+(ch*SOURCE_BLOCK_SIZE)];
// if we have only 3 channels then set the alpha subset to 0
if (channels3or4 == 3)
subsets_out[subset][entryCount_out[subset]][3] = 0.0F;
else
subsets_out[subset][entryCount_out[subset]][3] = image_src[i+(COMP_ALPHA*SOURCE_BLOCK_SIZE)];
entryCount_out[subset]++;
}
}
INLINE void GetClusterMean(
CGV_IMAGE cluster_mean_out[SOURCE_BLOCK_SIZE][MAX_CHANNELS],
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_INDEX index_in[MAX_SUBSET_SIZE],
CGV_ENTRIES numEntries, // < 16
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
{
// unused index values are undefined
CGV_INDEX i_cnt[MAX_SUBSET_SIZE];
CGV_INDEX i_comp[MAX_SUBSET_SIZE];
for (CGV_ENTRIES i=0;i< numEntries;i++)
for (CGU_CHANNEL ch=0; ch< channels3or4; ch++)
{
CGV_INDEX idx = index_in[i]&0x0F;
cluster_mean_out[idx][ch] = 0;
i_cnt[idx]=0;
}
CGV_INDEX ic = 0; // was CGV_INT
for (CGV_ENTRIES i=0;i< numEntries;i++)
{
CGV_INDEX idx = index_in[i]&0x0F;
if (i_cnt[idx]==0)
i_comp[ic++]=idx;
i_cnt[idx]++;
for (CGU_CHANNEL ch=0; ch< channels3or4; ch++)
{
cluster_mean_out[idx][ch] += image_src[i+(ch*SOURCE_BLOCK_SIZE)];
}
}
for (CGU_CHANNEL ch=0; ch< channels3or4; ch++)
for (CGU_INT i=0;i < ic;i++)
{
if (i_cnt[i_comp[i]] != 0)
{
CGV_INDEX icmp = i_comp[i];
cluster_mean_out[icmp][ch] = (CGV_IMAGE) floor( (cluster_mean_out[icmp][ch] / (CGV_IMAGE) i_cnt[icmp]) +0.5F);
}
}
}
INLINE void GetImageMean(
CGV_IMAGE image_mean_out[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_ENTRIES numEntries,
CGU_CHANNEL channels)
{
for (CGU_CHANNEL ch=0; ch< channels; ch++)
image_mean_out[ch] =0;
for (CGV_ENTRIES i=0;i< numEntries;i++)
for (CGU_CHANNEL ch=0; ch< channels; ch++)
image_mean_out[ch] += image_src[i+ch*SOURCE_BLOCK_SIZE];
for (CGU_CHANNEL ch=0; ch< channels; ch++)
image_mean_out[ch] /=(CGV_IMAGE) numEntries; // Performance Warning: Conversion from unsigned int to float is slow. Use "int" if possible
}
// calculate an eigenvector corresponding to the biggest eigenvalue
// will work for non-zero, non-negative matrices only
void GetEigenVector(
CGV_IMAGE EigenVector_out[MAX_CHANNELS], // Normalized Eigen Vector output
CGV_IMAGE CovarianceVector[MAX_CHANNELS*MAX_CHANNELS], // Covariance Vector
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA
{
CGV_IMAGE vector_covIn[MAX_CHANNELS*MAX_CHANNELS];
CGV_IMAGE vector_covOut[MAX_CHANNELS*MAX_CHANNELS];
CGV_IMAGE vector_maxCovariance;
for (CGU_CHANNEL ch1=0; ch1<channels3or4; ch1++)
for (CGU_CHANNEL ch2=0; ch2<channels3or4; ch2++)
{
vector_covIn[ch1+ch2*4] = CovarianceVector[ch1+ch2*4];
}
vector_maxCovariance = 0;
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
if (vector_covIn[ch+ch*4] > vector_maxCovariance)
vector_maxCovariance = vector_covIn[ch+ch*4];
}
// Normalize Input Covariance Vector
for (CGU_CHANNEL ch1=0; ch1<channels3or4; ch1++)
for (CGU_CHANNEL ch2=0; ch2<channels3or4; ch2++)
{
if (vector_maxCovariance > 0)
vector_covIn[ch1+ch2*4] = vector_covIn[ch1+ch2*4] / vector_maxCovariance;
}
for (CGU_CHANNEL ch1=0; ch1<channels3or4; ch1++)
{
for (CGU_CHANNEL ch2=0; ch2<channels3or4; ch2++)
{
CGV_IMAGE vector_temp_cov=0;
for (CGU_CHANNEL ch3=0; ch3<channels3or4; ch3++)
{
vector_temp_cov = vector_temp_cov + vector_covIn[ch1+ch3*4]*vector_covIn[ch3+ch2*4];
}
vector_covOut[ch1+ch2*4] = vector_temp_cov;
}
}
vector_maxCovariance = 0;
CGV_TYPEINT maxCovariance_channel = 0;
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
if (vector_covOut[ch+ch*4] > vector_maxCovariance)
{
maxCovariance_channel = ch;
vector_maxCovariance = vector_covOut[ch+ch*4];
}
}
CGV_IMAGE vector_t = 0;
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
vector_t = vector_t + vector_covOut[maxCovariance_channel+ch*4]*vector_covOut[maxCovariance_channel+ch*4];
EigenVector_out[ch] = vector_covOut[maxCovariance_channel+ch*4];
}
// Normalize the Eigen Vector
vector_t= sqrt(vector_t);
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
if (vector_t > 0)
EigenVector_out[ch] = EigenVector_out[ch] / vector_t;
}
}
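// Editorial note (illustrative): GetEigenVector() approximates power
// iteration: the covariance matrix is scaled by its largest diagonal entry,
// squared once (vector_covOut = vector_covIn * vector_covIn), and the row
// with the largest diagonal entry of the squared matrix is taken as the
// dominant eigenvector, then normalized. Squaring boosts the largest
// eigenvalue's share, giving a cheap estimate without an iterative solver.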
CGV_INDEX index_collapse(
CGV_INDEX index[MAX_SUBSET_SIZE],
CGV_ENTRIES numEntries)
{
CGV_INDEX minIndex=index[0];
CGV_INDEX MaxIndex=index[0];
for (CGV_ENTRIES k=1;k<numEntries;k++) {
if (index[k] < minIndex)
minIndex = index[k];
if (index[k] > MaxIndex)
MaxIndex = index[k];
}
CGV_INDEX D=1;
for (CGV_INDEX d=2; d<= MaxIndex-minIndex; d++)
{
CGV_ENTRIES ent;
for (ent=0;ent<numEntries;ent++)
{
if ((index[ent]-minIndex) % d != 0)
break;
}
// if no entry broke out, every offset is divisible by d: collapse the stride
if (ent >= numEntries)
D = d;
}
for (CGV_ENTRIES k=0;k<numEntries;k++)
{
index[k] = (index[k]- minIndex) / D;
}
for (CGV_ENTRIES k=1;k<numEntries;k++) {
if (index[k] > MaxIndex)
MaxIndex = index[k];
}
return (MaxIndex);
}
void sortProjected_indexs(
CGV_INDEX index_ordered[MAX_SUBSET_SIZE],
CGV_IMAGE projection[SOURCE_BLOCK_SIZE],
CGV_ENTRIES numEntries // max 16
)
{
CMP_di what[SOURCE_BLOCK_SIZE];
for (CGV_INDEX i=0; i < numEntries;i++)
{
what[i].index = i;
what[i].image = projection[i];
}
CGV_INDEX tmp_index;
CGV_IMAGE tmp_image;
for (CGV_ENTRIES i = 1; i < numEntries; i++)
{
for (CGV_ENTRIES j=i; j>0; j--)
{
if (what[j - 1].image > what[j].image)
{
tmp_index = what[j].index;
tmp_image = what[j].image;
what[j].index = what[j - 1].index;
what[j].image = what[j - 1].image;
what[j - 1].index = tmp_index;
what[j - 1].image = tmp_image;
}
}
}
for (CGV_ENTRIES i=0; i < numEntries;i++)
index_ordered[i]=what[i].index;
};
void sortPartitionProjection(
CGV_IMAGE projection[MAX_PARTITION_ENTRIES],
CGV_UINT8 order[MAX_PARTITION_ENTRIES],
CGU_UINT8 numPartitions // max 64
)
{
CMP_du what[MAX_PARTITION_ENTRIES];
for (CGU_UINT8 Parti=0; Parti < numPartitions;Parti++)
{
what[Parti].index = Parti;
what[Parti].image = projection[Parti];
}
CGV_UINT8 index;
CGV_IMAGE data;
for (CGU_UINT8 Parti = 1; Parti < numPartitions; Parti++)
{
for (CGU_UINT8 Partj=Parti; Partj>0; Partj--)
{
if (what[Partj - 1].image > what[Partj].image)
{
index = what[Partj].index;
data = what[Partj].image;
what[Partj].index = what[Partj - 1].index;
what[Partj].image = what[Partj - 1].image;
what[Partj - 1].index = index;
what[Partj - 1].image = data;
}
}
}
for (CGU_UINT8 Parti=0; Parti < numPartitions;Parti++)
order[Parti]=what[Parti].index;
};
void cmp_Write8Bit(
CGV_CMPOUT base[],
CGU_INT* uniform offset,
CGU_INT bits,
CGV_BYTE bitVal)
{
base[*offset/8] |= bitVal << (*offset%8);
if (*offset%8+bits>8)
{
base[*offset/8+1] |= shift_right_uint8(bitVal, 8-*offset%8);
}
*offset += bits;
}
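// Editorial note (illustrative): cmp_Write8Bit() appends up to 8 bits at an
// arbitrary bit offset, spilling into the next byte when the value straddles a
// byte boundary. E.g. writing bits=3, bitVal=0b101 at *offset=6 puts bit 0 of
// the value into base[0] bit 6, bit 1 into base[0] bit 7, and bit 2 into
// base[1] bit 0, then advances *offset to 9. This assumes CGV_CMPOUT is an
// 8-bit type so the shifted-out high bits are truncated on store.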
void cmp_Write8BitV(
CGV_CMPOUT base[],
CGV_INT offset,
CGU_INT bits,
CGV_BYTE bitVal)
{
base[offset/8] |= bitVal << (offset%8);
if (offset%8+bits>8)
{
base[offset/8+1] |= shift_right_uint8V(bitVal, 8-offset%8);
}
}
INLINE CGV_EPOCODE ep_find_floor(
CGV_IMAGE v,
CGU_UINT8 bits,
CGV_BYTE use_par,
CGV_BYTE odd)
{
CGV_EPOCODE i1=0;
CGV_EPOCODE i2=1<<(bits-use_par);
odd = use_par ? odd : 0;
while (i2-i1>1)
{
CGV_EPOCODE j = (i1+i2)/2; // (perf warning expected in ASPM code)
CGV_EPOCODE ep_d = expandEPObits((j<<use_par)+odd,bits);
if (v >= ep_d )
i1=j;
else
i2=j;
}
return (i1<<use_par)+odd;
}
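// Editorial note (illustrative): ep_find_floor() binary-searches for the
// largest quantized code whose 8-bit expansion does not exceed v, honouring a
// shared parity bit when use_par is set. E.g. with v=100.0, bits=5, use_par=0:
// expandEPObits(12,5)=99 <= 100 < expandEPObits(13,5)=107, so it returns 12.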
//==========================================================
// Not used for Modes 4&5
INLINE CGV_IMAGE GetRamp(
CGU_INT clogBC7, // ramp bits Valid range 2..4
CGU_INT bits, // Component Valid range 5..8
CGV_EPOCODE p1, // 0..255
CGV_EPOCODE p2, // 0..255
CGV_INDEX index) // 0..15
{
#ifdef ASPM_GPU // GPU Code
CGV_FLOAT rampf = 0.0F;
CMP_CONSTANT CGV_EPOCODE rampI[5*SOURCE_BLOCK_SIZE] = {
0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , // 0 bit index
0 ,64,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , // 1 bit index
0 ,21,43,64,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , // 2 bit index
0 ,9 ,18,27,37,46,55,64,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , // 3 bit index
0 ,4 ,9 ,13,17,21,26,30,34,38,43,47,51,55,60,64 // 4 bit index
};
CGV_EPOCODE e1 = expand_epocode(p1, bits);
CGV_EPOCODE e2 = expand_epocode(p2,bits);
CGV_FLOAT ramp = gather_epocode(rampI,clogBC7*16+index)/64.0F;
rampf = floor(e1 + ramp * (e2 - e1) + 0.5F);
return rampf;
#else // CPU ASPM Code
#ifdef USE_BC7_RAMP
return BC7EncodeRamps.ramp[(CLT(clogBC7)*4*256*256*16)+(BTT(bits)*256*256*16)+(p1*256*16)+(p2*16)+index];
#else
return (CGV_IMAGE)floor((CGV_IMAGE)BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] * (CGV_IMAGE)((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1]))+ 0.5F);
#endif
#endif
}
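// Editorial note (illustrative): GetRamp() returns the BC7 interpolation
// floor(e1 + w*(e2-e1) + 0.5) with w = rampI[clogBC7][index]/64 (the standard
// 64-denominator BC7 weights). E.g. clogBC7=2, index=1 (w=21/64) with expanded
// endpoints e1=0, e2=255 gives floor(21*255/64 + 0.5) = 84.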
// Not used for Modes 4&5
INLINE CGV_ERROR get_sperr(
CGU_INT clogBC7, // ramp bits Valid range 2..4
CGU_INT bits, // Component Valid range 5..8
CGV_EPOCODE p1, // 0..255
CGU_INT t1,
CGU_INT t2,
CGV_INDEX index)
{
#ifdef ASPM_GPU
return 0.0F;
#else
#ifdef USE_BC7_SP_ERR_IDX
if (BC7EncodeRamps.ramp_init)
return BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(p1*2*2*16)+(t1*2*16)+(t2*16)+index];
else
return 0.0F;
#else
return 0.0F;
#endif
#endif
}
INLINE void get_fixuptable(CGV_FIXUPINDEX fixup[3], CGV_PARTID part_id)
{
// same as CMP SDK v3.1 BC7_FIXUPINDEX1 & BC7_FIXUPINDEX2 for each partition range 0..63
// The data is saved as a packed INT = (BC7_FIXUPINDEX1 << 4) | BC7_FIXUPINDEX2
CMP_STATIC uniform __constant CGV_FIXUPINDEX FIXUPINDEX[] = {
// 2 subset partitions 0..63
0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u,
0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0x20u, 0x20u,
0xf0u, 0xf0u, 0x60u, 0x80u, 0x20u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0x60u,
0x60u, 0x20u, 0x60u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u,
// 3 subset partitions 64..128
0x3fu, 0x38u, 0xf8u, 0xf3u, 0x8fu, 0x3fu, 0xf3u, 0xf8u, 0x8fu, 0x8fu, 0x6fu, 0x6fu, 0x6fu, 0x5fu, 0x3fu, 0x38u,
0x3fu, 0x38u, 0x8fu, 0xf3u, 0x3fu, 0x38u, 0x6fu, 0xa8u, 0x53u, 0x8fu, 0x86u, 0x6au, 0x8fu, 0x5fu, 0xfau, 0xf8u,
0x8fu, 0xf3u, 0x3fu, 0x5au, 0x6au, 0xa8u, 0x89u, 0xfau, 0xf6u, 0x3fu, 0xf8u, 0x5fu, 0xf3u, 0xf6u, 0xf6u, 0xf8u,
0x3fu, 0xf3u, 0x5fu, 0x5fu, 0x5fu, 0x8fu, 0x5fu, 0xafu, 0x5fu, 0xafu, 0x8fu, 0xdfu, 0xf3u, 0xcfu, 0x3fu, 0x38u
};
CGV_FIXUPINDEX skip_packed = FIXUPINDEX[part_id];// gather_int2(FIXUPINDEX, part_id);
fixup[0] = 0;
fixup[1] = skip_packed>>4;
fixup[2] = skip_packed&15;
}
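// Editorial note (illustrative): e.g. part_id=0 reads 0xf0, so the anchor
// (fixup) indices are {0, 15, 0}: subset 0 is always anchored at pixel 0 and
// subset 1 of 2-subset partition 0 is anchored at pixel 15. part_id=17 reads
// 0x20, anchoring subset 1 at pixel 2.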
//===================================== COMPRESS CODE =============================================
INLINE void SetDefaultIndex(CGV_INDEX index_io[MAX_SUBSET_SIZE])
{
// Use this as a final call
for (CGU_INT i=0; i<MAX_SUBSET_SIZE; i++)
index_io[i] = 0;
}
INLINE void SetDefaultEPOCode(CGV_EPOCODE epo_code_io[8], CGV_EPOCODE R,CGV_EPOCODE G,CGV_EPOCODE B,CGV_EPOCODE A)
{
epo_code_io[0] = R;
epo_code_io[1] = G;
epo_code_io[2] = B;
epo_code_io[3] = A;
epo_code_io[4] = R;
epo_code_io[5] = G;
epo_code_io[6] = B;
epo_code_io[7] = A;
}
void GetProjectedIndex(
CGV_INDEX projected_index_out[MAX_SUBSET_SIZE], //output: index, uncentered, in the range 0..clusters-1
CGV_IMAGE image_projected[SOURCE_BLOCK_SIZE], // image_block points, might be uncentered
CGV_INT clusters, // clusters: number of points in the ramp (max 16)
CGV_ENTRIES numEntries) // n - number of points in v_ max 15
{
CMP_di what[SOURCE_BLOCK_SIZE];
CGV_IMAGE image_v[SOURCE_BLOCK_SIZE];
CGV_IMAGE image_z[SOURCE_BLOCK_SIZE];
CGV_IMAGE image_l;
CGV_IMAGE image_mm;
CGV_IMAGE image_r = 0.0F;
CGV_IMAGE image_dm = 0.0F;
CGV_IMAGE image_min;
CGV_IMAGE image_max;
CGV_IMAGE image_s;
SetDefaultIndex(projected_index_out);
image_min=image_projected[0];
image_max=image_projected[0];
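// NOTE (editorial): the two comparisons below look inverted (image_min ends up
// tracking the largest projected value and image_max the smallest). The math
// stays consistent: image_s becomes negative, the indices simply come out in
// reverse ramp order, and since the endpoints are re-fit from the indices
// afterwards the final result is unaffected.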
for (CGV_ENTRIES i=1; i < numEntries;i++)
{
if (image_min < image_projected[i])
image_min = image_projected[i];
if (image_max > image_projected[i])
image_max = image_projected[i];
}
CGV_IMAGE img_diff = image_max-image_min;
if (img_diff == 0.0f) return;
if (isnan(img_diff)) return;
image_s = (clusters-1)/img_diff;
for (CGV_INDEX i=0; i < numEntries;i++)
{
image_v[i] = image_projected[i]*image_s;
image_z[i] = floor(image_v[i] + 0.5F - image_min *image_s);
projected_index_out[i] = (CGV_INDEX)image_z[i];
what[i].image = image_v[i]-image_z[i]- image_min *image_s;
what[i].index = i;
image_dm+= what[i].image;
image_r += what[i].image*what[i].image;
}
if (numEntries*image_r- image_dm*image_dm >= (CGV_IMAGE)(numEntries-1)/8)
{
image_dm /= numEntries;
for (CGV_INT i=0; i < numEntries;i++)
what[i].image -= image_dm;
CGV_INDEX tmp_index;
CGV_IMAGE tmp_image;
for (CGV_ENTRIES i = 1; i < numEntries; i++)
{
for (CGV_ENTRIES j=i; j>0; j--)
{
if (what[j - 1].image > what[j].image)
{
tmp_index = what[j].index;
tmp_image = what[j].image;
what[j].index = what[j - 1].index;
what[j].image = what[j - 1].image;
what[j - 1].index = tmp_index;
what[j - 1].image = tmp_image;
}
}
}
// got into fundamental simplex
// move coordinate system origin to its center
// (numEntries > 0 here, so the division below cannot be by zero)
for (CGV_ENTRIES i=0; i < numEntries;i++)
{
what[i].image = what[i].image - (CGV_IMAGE) (((2.0f*i+1)-numEntries)/(2.0f*numEntries));
}
image_mm=0.0F;
image_l=0.0F;
CGV_INT j = -1;
for (CGV_ENTRIES i=0; i < numEntries;i++)
{
image_l += what[i].image;
if (image_l < image_mm)
{
image_mm = image_l;
j=i;
}
}
j = j + 1;
// equivalent to j = j % numEntries, written as a loop to avoid a varying integer modulo
while (j > numEntries) j = j - numEntries;
for (CGV_ENTRIES i=j; i < numEntries;i++)
{
CGV_INDEX idx = what[i].index;
CGV_INDEX pidx = projected_index_out[idx] + 1; //gather_index(projected_index_out,idx)+1;
projected_index_out[idx] = pidx; // scatter_index(projected_index_out,idx,pidx);
}
}
// get minimum index
CGV_INDEX index_min=projected_index_out[0];
for (CGV_ENTRIES i=1; i < numEntries;i++)
{
if (projected_index_out[i] < index_min)
index_min = projected_index_out[i];
}
// reposition all index by min index (using min index as 0)
for (CGV_ENTRIES i=0; i < numEntries;i++)
{
projected_index_out[i] = clampIndex(projected_index_out[i] - index_min,0,15);
}
}
CGV_ERROR GetQuantizeIndex(
CGV_INDEXPACKED index_packed_out[2],
CGV_INDEX index_out[MAX_SUBSET_SIZE], // OUT:
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_ENTRIES numEntries, //IN: range 0..15 (MAX_SUBSET_SIZE)
CGU_INT numClusters,
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
{
CGV_IMAGE image_centered[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
CGV_IMAGE image_mean[MAX_CHANNELS];
CGV_IMAGE eigen_vector[MAX_CHANNELS];
CGV_IMAGE covariance_vector[MAX_CHANNELS*MAX_CHANNELS];
GetImageCentered(image_centered,image_mean, image_src, numEntries, channels3or4);
GetCovarianceVector(covariance_vector, image_centered, numEntries, channels3or4);
//-----------------------------------------------------
// check if all covariances are the same
// if so then set all index to same value 0 and return
// use EPSILON to set the limit for all same limit
//-----------------------------------------------------
CGV_IMAGE image_covt=0.0F;
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
image_covt = image_covt + covariance_vector[ch+ch*4];
if (image_covt < EPSILON)
{
SetDefaultIndex(index_out);
index_packed_out[0] = 0;
index_packed_out[1] = 0;
return 0.;
}
GetEigenVector(eigen_vector, covariance_vector,channels3or4);
CGV_IMAGE image_projected[SOURCE_BLOCK_SIZE];
GetProjecedImage(image_projected,image_centered, numEntries, eigen_vector, channels3or4);
GetProjectedIndex(index_out, image_projected, numClusters,numEntries);
//==========================================
// Refine
//==========================================
CGV_IMAGE image_q = 0.0F;
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
eigen_vector[ch]=0;
for (CGV_ENTRIES k=0;k<numEntries;k++)
eigen_vector[ch] = eigen_vector[ch] + image_centered[k+(ch*SOURCE_BLOCK_SIZE)]*index_out[k];
image_q = image_q + eigen_vector[ch]* eigen_vector[ch];
}
image_q = sqrt(image_q);
// direction needs to be normalized
if (image_q != 0.0F)
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
eigen_vector[ch] = eigen_vector[ch] / image_q;
// Get new projected data
GetProjecedImage(image_projected, image_centered, numEntries, eigen_vector, channels3or4);
GetProjectedIndex(index_out, image_projected, numClusters,numEntries);
// pack the index for use in icmp
pack_index(index_packed_out, index_out);
//===========================
// Calc Error
//===========================
// Get the new image based on new index
CGV_IMAGE image_t = 0.0F;
CGV_IMAGE index_average = 0.0F;
for (CGV_ENTRIES ik=0;ik<numEntries;ik++)
{
index_average = index_average + index_out[ik];
image_t = image_t + index_out[ik]*index_out[ik];
}
index_average = index_average / (CGV_IMAGE) numEntries;
image_t = image_t - index_average * index_average * (CGV_IMAGE) numEntries;
if (image_t != 0.0F)
image_t = 1.0F/image_t;
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
eigen_vector[ch]=0;
for (CGV_ENTRIES nk=0; nk<numEntries; nk++)
eigen_vector[ch] = eigen_vector[ch] + image_centered[nk+(ch*SOURCE_BLOCK_SIZE)]*index_out[nk];
}
CGV_IMAGE image_decomp[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
for (CGV_ENTRIES i=0;i<numEntries;i++)
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
image_decomp[i+(ch*SOURCE_BLOCK_SIZE)] = image_mean[ch] + eigen_vector[ch]*image_t*(index_out[i]-index_average);
CGV_ERROR err_1 = err_Total(image_src,image_decomp,numEntries, channels3or4);
return err_1;
// return 0.0F;
}
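// Editorial note (illustrative): GetQuantizeIndex() is a small PCA pipeline:
// center the pixels, build the covariance matrix, extract the dominant
// eigenvector (by squaring the normalized matrix once rather than running an
// iterative solver), project the pixels onto it, quantize the projections to
// numClusters levels, then refine the axis once using the quantized indices
// and report the squared error of the re-decompressed block.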
CGV_ERROR quant_solid_color(
CGV_INDEX index_out[MAX_SUBSET_SIZE],
CGV_EPOCODE epo_code_out[2*MAX_CHANNELS],
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_ENTRIES numEntries,
CGU_UINT8 Mi_, // last cluster
CGU_UINT8 bits[3], // including parity
CGU_INT type,
CGU_CHANNEL channels3or4 // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
)
{
CGU_INT clogBC7 = 0;
CGU_INT iv = Mi_ + 1;
while (iv >>= 1)
clogBC7++;
// init epo_0
CGV_EPOCODE epo_0[2*MAX_CHANNELS];
SetDefaultEPOCode(epo_0,0xFF,0,0,0);
CGV_INDEX image_log = 0;
CGV_INDEX image_idx = 0;
CGU_BOOL use_par = FALSE;
if (type != 0)
use_par = TRUE;
CGV_ERROR error_1 = CMP_FLOAT_MAX;
for (CGU_INT pn = 0; pn<npv_nd[channels3or4-3][type] && (error_1 != 0.0F); pn++)
{ //1
CGU_INT o1[2*MAX_CHANNELS]; // = { 0,2 };
CGU_INT o2[2*MAX_CHANNELS]; // = { 0,2 };
for (CGU_CHANNEL ch = 0; ch<channels3or4; ch++)
{ //A
o2[ ch] = o1[ ch] = 0;
o2[4+ch] = o1[4+ch] = 2;
if (use_par == TRUE)
{
if (par_vectors_nd[channels3or4-3][type][pn][0][ch])
o1[ch] = 1;
else
o1[4+ch] = 1;
if (par_vectors_nd[channels3or4-3][type][pn][1][ch])
o2[ch] = 1;
else
o2[4+ch] = 1;
}
} //A
CGV_EPOCODE image_tcr[MAX_CHANNELS];
CGV_EPOCODE epo_dr_0[MAX_CHANNELS];
CGV_ERROR error_tr;
CGV_ERROR error_0 = CMP_FLOAT_MAX;
for (CGV_INDEX iclogBC7 = 0; iclogBC7< (1 << clogBC7) && (error_0 != 0); iclogBC7++)
{ //E
CGV_ERROR error_t = 0;
CGV_EPOCODE t1o[MAX_CHANNELS], t2o[MAX_CHANNELS];
for (CGU_CHANNEL ch1 = 0; ch1<channels3or4; ch1++)
{ // D
CGV_ERROR error_ta = CMP_FLOAT_MAX;
for (CGU_INT t1 = o1[ch1]; t1<o1[4+ch1]; t1++)
{ // C
// This is needed for non-integer mean points of "collapsed" sets
for (CGU_INT t2 = o2[ch1]; t2<o2[4+ch1]; t2++)
{ // B
CGV_EPOCODE image_tf;
CGV_EPOCODE image_tc;
image_tf = (CGV_EPOCODE)floor(image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)]);
image_tc = (CGV_EPOCODE) ceil(image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)]);
#ifdef USE_BC7_SP_ERR_IDX
CGV_ERROR err_tf = get_sperr(clogBC7,bits[ch1],image_tf,t1,t2,iclogBC7);
CGV_ERROR err_tc = get_sperr(clogBC7,bits[ch1],image_tc,t1,t2,iclogBC7);
if (err_tf > err_tc)
image_tcr[ch1] = image_tc;
else if (err_tf < err_tc)
image_tcr[ch1] = image_tf;
else
image_tcr[ch1] = (CGV_EPOCODE)floor(image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)] + 0.5F);
//image_tcr[ch1] = image_tf + (image_tc - image_tf)/2;
//===============================
// Refine this for better quality!
//===============================
error_tr = get_sperr(clogBC7,bits[ch1],image_tcr[ch1],t1,t2,iclogBC7);
error_tr = (error_tr*error_tr)
+ 2 * error_tr
* img_absf(image_tcr[ch1]- image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)])
+ (image_tcr[ch1] - image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)])
* (image_tcr[ch1] - image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)]);
if (error_tr < error_ta)
{
error_ta = error_tr;
t1o[ch1] = t1;
t2o[ch1] = t2;
epo_dr_0[ch1] = clampEPO(image_tcr[ch1],0,255);
}
#else
image_tcr[ch1] = floor(image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)] + 0.5F);
error_ta = 0;
t1o[ch1] = t1;
t2o[ch1] = t2;
epo_dr_0[ch1] = clampEPO(image_tcr[ch1],0,255);
#endif
} // B
} //C
error_t += error_ta;
} // D
if (error_t < error_0)
{
image_log = iclogBC7;
image_idx = image_log;
CGU_BOOL srcIsWhite = FALSE;
if ((image_src[0] == 255.0f)&&(image_src[1] == 255.0f)&&(image_src[2] == 255.0f)) srcIsWhite = TRUE;
for (CGU_CHANNEL ch = 0; ch<channels3or4; ch++)
{
#ifdef ASPM_GPU
if (srcIsWhite == TRUE)
{
// Default White block!
epo_0[ ch] = 0x7F;
epo_0[4+ch] = 0x7F;
}
else
{
// Default black block!
epo_0[ ch] = 0;
epo_0[4+ch] = 0;
}
#else
#ifdef USE_BC7_SP_ERR_IDX
if (BC7EncodeRamps.ramp_init) {
CGV_EPOCODE index = (CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits[ch])*256*2*2*16*2)+(epo_dr_0[ch]*2*2*16*2)+(t1o[ch]*2*16*2)+(t2o[ch]*16*2)+(iclogBC7*2);
epo_0[ ch] = BC7EncodeRamps.sp_idx[index+0]&0xFF;// gather_epocode(u_BC7Encode->sp_idx,index+0)&0xFF;
epo_0[4+ch] = BC7EncodeRamps.sp_idx[index+1]&0xFF;// gather_epocode(u_BC7Encode->sp_idx,index+1)&0xFF;
}
else {
epo_0[ch] = 0;
epo_0[4 + ch] = 0;
}
#else
epo_0[ ch] = 0;
epo_0[4+ch] = 0;
#endif
#endif
}
error_0 = error_t;
}
//if (error_0 == 0)
// break;
} // E
if (error_0 < error_1)
{
image_idx = image_log;
for (CGU_CHANNEL chE = 0; chE<channels3or4; chE++)
{
epo_code_out[chE] = epo_0[chE];
epo_code_out[4+chE] = epo_0[4+chE];
}
error_1 = error_0;
}
} //1
// Get Image error
CGV_IMAGE image_decomp[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
for (CGV_ENTRIES i = 0; i< numEntries; i++)
{
index_out[i] = image_idx;
for (CGU_CHANNEL ch = 0; ch<channels3or4; ch++)
{
image_decomp[i+(ch*SOURCE_BLOCK_SIZE)] = GetRamp(clogBC7,bits[ch],epo_code_out[ch],epo_code_out[4+ch],image_idx);
}
}
// Do we need to do this rather than err_1 * numEntries?
CGV_ERROR error_quant;
error_quant = err_Total(image_src, image_decomp, numEntries, channels3or4);
return error_quant;
//return err_1 * numEntries;
}
CGV_ERROR requantized_image_err(
CGV_INDEX index_out[MAX_SUBSET_SIZE],
CGV_EPOCODE epo_code[2*MAX_CHANNELS],
CGU_INT clogBC7,
CGU_UINT8 max_bits[MAX_CHANNELS],
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_ENTRIES numEntries, // max 16
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
{
//=========================================
// requantized image based on new epo_code
//=========================================
CGV_IMAGE image_requantize[SOURCE_BLOCK_SIZE][MAX_CHANNELS];
CGV_ERROR err_r=0.0F;
for (CGU_CHANNEL ch = 0; ch<channels3or4; ch++)
{
for (CGU_INT k = 0; k<SOURCE_BLOCK_SIZE; k++)
{
image_requantize[k][ch] = GetRamp(clogBC7,max_bits[ch],epo_code[ch],epo_code[4+ch],(CGV_INDEX)k);
}
}
//=========================================
// Calc the error for the requantized image
//=========================================
for (CGV_ENTRIES k =0; k < numEntries; k++)
{
CGV_ERROR err_cmin = CMP_FLOAT_MAX;
CGV_TYPEINT hold_index_j = 0;
for (CGV_TYPEINT iclogBC7=0; iclogBC7 < (1<<clogBC7); iclogBC7++)
{
CGV_IMAGE image_err = 0.0F;
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
image_err+= sq_image(image_requantize[iclogBC7][ch]-image_src[k+(ch*SOURCE_BLOCK_SIZE)]);
}
if(image_err < err_cmin)
{
err_cmin = image_err;
hold_index_j = iclogBC7;
}
}
index_out[k]=(CGV_INDEX)hold_index_j;
err_r +=err_cmin;
}
return err_r;
}
CGU_BOOL get_ideal_cluster(
CGV_IMAGE image_out[2*MAX_CHANNELS],
CGV_INDEX index_in[MAX_SUBSET_SIZE],
CGU_INT Mi_,
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_ENTRIES numEntries,
CGU_CHANNEL channels3or4 )
{
// get ideal cluster centers
CGV_IMAGE image_cluster_mean[SOURCE_BLOCK_SIZE][MAX_CHANNELS];
GetClusterMean(image_cluster_mean, image_src, index_in, numEntries, channels3or4); // unrounded
CGV_IMAGE image_matrix0[2] = {0,0}; // matrix /inverse matrix
CGV_IMAGE image_matrix1[2] = {0,0}; // matrix /inverse matrix
CGV_IMAGE image_rp[2*MAX_CHANNELS]; // right part for RMS fit problem
for (CGU_INT i=0; i<2*MAX_CHANNELS; i++) image_rp[i]=0;
// weight with cnt if running on compacted index
for (CGV_ENTRIES k=0;k<numEntries;k++)
{
image_matrix0[0] += (Mi_- index_in[k])* (Mi_-index_in[k]);
image_matrix0[1] += index_in[k] * (Mi_-index_in[k]); // im is symmetric
image_matrix1[1] += index_in[k] * index_in[k];
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
image_rp[ ch] += (Mi_-index_in[k]) * image_cluster_mean[index_in[k]][ch];
image_rp[4+ch] += index_in[k] * image_cluster_mean[index_in[k]][ch];
}
}
CGV_IMAGE matrix_dd = image_matrix0[0]*image_matrix1[1]- image_matrix0[1]*image_matrix0[1];
// assert(matrix_dd !=0);
// matrix_dd=0 means that index_cidx[k] and (Mi_-index_cidx[k]) collinear which implies only one active index;
// taken care of separately
if (matrix_dd == 0)
{
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
image_out[ ch]=0;
image_out[4+ch]=0;
}
return FALSE;
}
image_matrix1[0] = image_matrix0[0];
image_matrix0[0] = image_matrix1[1]/matrix_dd;
image_matrix1[1] = image_matrix1[0]/matrix_dd;
image_matrix1[0] = image_matrix0[1]=-image_matrix0[1]/matrix_dd;
CGV_IMAGE Mif = (CGV_IMAGE)Mi_;
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
image_out[ ch]=(image_matrix0[0]*image_rp[ch]+image_matrix0[1]*image_rp[4+ch])*Mif;
image_out[4+ch]=(image_matrix1[0]*image_rp[ch]+image_matrix1[1]*image_rp[4+ch])*Mif;
}
return TRUE;
}
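// Editorial note (illustrative): get_ideal_cluster() solves a small
// least-squares fit. Each pixel with index k is modelled as
//   p_k ~= ep0*(Mi_-k)/Mi_ + ep1*k/Mi_,
// which yields the symmetric 2x2 normal equations accumulated above
// (image_matrix0/1 hold sum((Mi_-k)^2), sum(k*(Mi_-k)) and sum(k^2);
// image_rp holds the right-hand sides). The matrix is inverted in closed form
// and the solution is scaled by Mi_ to recover the endpoint colors;
// matrix_dd==0 means all pixels share one index and the fit is degenerate.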
CGV_ERROR shake(
CGV_EPOCODE epo_code_shaker_out[2*MAX_CHANNELS],
CGV_IMAGE image_ep[2*MAX_CHANNELS],
CGV_INDEX index_cidx[MAX_SUBSET_SIZE],
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGU_INT clogBC7,
CGU_INT type,
CGU_UINT8 max_bits[MAX_CHANNELS],
CGU_UINT8 use_par,
CGV_ENTRIES numEntries, // max 16
CGU_CHANNEL channels3or4 )
{
#define SHAKESIZE1 1
#define SHAKESIZE2 2
// shake single                                    - cartesian
// shake odd/odd and even/even                     - same parity
// shake odd/odd, odd/even, even/odd and even/even - bcc
CGV_ERROR best_err = CMP_FLOAT_MAX;
CGV_ERROR err_ed[16] = {0};
CGV_EPOCODE epo_code_par[2][2][2][MAX_CHANNELS];
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
CGU_UINT8 ppA = 0;
CGU_UINT8 ppB = 0;
CGU_UINT8 rr = (use_par ? 2:1);
CGV_EPOCODE epo_code_epi[2][2]; // [first/second endpoint][begin/end of range]
for (ppA=0; ppA<rr; ppA++) // loop max =2
{
for (ppB=0; ppB<rr; ppB++) //loop max =2
{
// set default ranges
epo_code_epi[0][0] = epo_code_epi[0][1]= ep_find_floor( image_ep[ ch],max_bits[ch], use_par, ppA);
epo_code_epi[1][0] = epo_code_epi[1][1]= ep_find_floor( image_ep[4+ch],max_bits[ch], use_par, ppB);
// set begin range
epo_code_epi[0][0] -= ( (epo_code_epi[0][0] < SHAKESIZE1 ? epo_code_epi[0][0]:SHAKESIZE1))&(~use_par);
epo_code_epi[1][0] -= ( (epo_code_epi[1][0] < SHAKESIZE1 ? epo_code_epi[1][0]:SHAKESIZE1))&(~use_par);
// set end range
epo_code_epi[0][1] += ((1<<max_bits[ch])-1 - epo_code_epi[0][1] < SHAKESIZE2 ? (1<<max_bits[ch])-1-epo_code_epi[0][1]:SHAKESIZE2)&(~use_par);
epo_code_epi[1][1] += ((1<<max_bits[ch])-1 - epo_code_epi[1][1] < SHAKESIZE2 ? (1<<max_bits[ch])-1-epo_code_epi[1][1]:SHAKESIZE2)&(~use_par);
CGV_EPOCODE step = (1<<use_par);
err_ed[(ppA*8)+(ppB*4)+ch]=CMP_FLOAT_MAX;
for (CGV_EPOCODE epo_p1=epo_code_epi[0][0]; epo_p1<=epo_code_epi[0][1]; epo_p1+=step)
for (CGV_EPOCODE epo_p2=epo_code_epi[1][0]; epo_p2<=epo_code_epi[1][1]; epo_p2+=step)
{
CGV_IMAGE image_square_diff =0.0F;
CGV_ENTRIES _mc = numEntries;
CGV_IMAGE image_ramp;
while(_mc > 0)
{
image_ramp = GetRamp(clogBC7,max_bits[ch],epo_p1,epo_p2,index_cidx[_mc-1]);
image_square_diff += sq_image(image_ramp-image_src[(_mc-1)+(ch*SOURCE_BLOCK_SIZE)]);
_mc--;
}
if (image_square_diff < err_ed[(ppA*8)+(ppB*4)+ch])
{
err_ed[(ppA*8)+(ppB*4)+ch] = image_square_diff;
epo_code_par[ppA][ppB][0][ch] = epo_p1;
epo_code_par[ppA][ppB][1][ch] = epo_p2;
}
}
} // pp1
} // pp0
} // j
//---------------------------------------------------------
for (CGU_INT pn=0; pn < npv_nd[channels3or4-3][type]; pn++)
{
CGV_ERROR err_2=0.0F;
CGU_INT d1;
CGU_INT d2;
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
d1 = par_vectors_nd[channels3or4-3][type][pn][0][ch];
d2 = par_vectors_nd[channels3or4-3][type][pn][1][ch];
err_2+=err_ed[(d1*8)+(d2*4)+ch];
}
if (err_2 < best_err)
{
best_err = err_2;
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
d1 = par_vectors_nd[channels3or4-3][type][pn][0][ch];
d2 = par_vectors_nd[channels3or4-3][type][pn][1][ch];
epo_code_shaker_out[ ch]=epo_code_par[d1][d2][0][ch];
epo_code_shaker_out[4+ch]=epo_code_par[d1][d2][1][ch];
}
}
}
return best_err;
}
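// Editorial note (illustrative): shake() quantizes the ideal floating-point
// endpoints, then tries small perturbations around them (up to SHAKESIZE1
// codes below and SHAKESIZE2 above each endpoint, stepping by 2 when a parity
// bit forces same-parity codes) and keeps, per channel and per parity
// combination, the pair with the lowest squared ramp error; the best parity
// vector is then chosen across channels.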
CGV_ERROR optimize_IndexAndEndPoints(
CGV_INDEX index_io[MAX_SUBSET_SIZE],
CGV_EPOCODE epo_code_out[8],
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
CGV_ENTRIES numEntries, // max 16
CGU_UINT8 Mi_, // last cluster; should be no larger than 16
CGU_UINT8 bits, // total for all components
CGU_CHANNEL channels3or4, // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
{
CGV_ERROR err_best = CMP_FLOAT_MAX;
CGU_INT type;
CGU_CHANNEL channels2 = 2*channels3or4;
type = bits % channels2;
CGU_UINT8 use_par =(type !=0);
CGU_UINT8 max_bits[MAX_CHANNELS];
for (CGU_UINT8 ch=0; ch<channels3or4; ch++)
max_bits[ch] = (bits+channels2-1) / channels2;
CGU_INT iv;
CGU_INT clogBC7=0;
iv = Mi_;
while (iv>>=1)
clogBC7++;
CGU_INT clt_clogBC7 = CLT(clogBC7);
if (clt_clogBC7 > 3)
{
ASPM_PRINT(("Err: optimize_IndexAndEndPoints, clt_clogBC7\n"));
return CMP_FLOAT_MAX;
}
Mi_ = Mi_ - 1;
CGV_INDEX MaxIndex;
CGV_INDEX index_tmp[MAX_SUBSET_SIZE];
CGU_INT maxTry = MAX_TRY_SHAKER;
CGV_INDEX index_best[MAX_SUBSET_SIZE];
for (CGV_ENTRIES k=0;k<numEntries;k++)
{
index_best[k] = index_tmp[k] = clampIndex(index_io[k],0,15);
}
CGV_EPOCODE epo_code_best[2*MAX_CHANNELS];
SetDefaultEPOCode(epo_code_out ,0xFF,0,0,0);
SetDefaultEPOCode(epo_code_best,0,0,0,0);
CGV_ERROR err_requant = 0.0F;
MaxIndex = index_collapse(index_tmp, numEntries);
//===============================
// we have a solid color 4x4 block
//===============================
if (MaxIndex == 0)
{
return quant_solid_color(index_io, epo_code_out, image_src, numEntries, Mi_, max_bits,type, channels3or4);
}
do {
//===============================
// We have ramp colors to process
//===============================
CGV_ERROR err_cluster = CMP_FLOAT_MAX;
CGV_ERROR err_shake;
CGV_INDEX index_cluster[MAX_PARTITION_ENTRIES];
for (CGV_INDEX index_slope=1; (MaxIndex != 0) && (index_slope*MaxIndex <= Mi_); index_slope++)
{
for (CGV_INDEX index_offset=0; index_offset<=Mi_-index_slope*MaxIndex; index_offset++)
{
//-------------------------------------
// set a new index data to try
//-------------------------------------
for (CGV_ENTRIES k=0;k<numEntries;k++)
index_cluster[k] = index_tmp[k] * index_slope + index_offset;
CGV_IMAGE image_cluster[2*MAX_CHANNELS];
CGV_EPOCODE epo_code_shake[2*MAX_CHANNELS];
SetDefaultEPOCode(epo_code_shake,0,0,0xFF,0);
if (get_ideal_cluster( image_cluster,
index_cluster,
Mi_,
image_src,
numEntries,
channels3or4) == FALSE)
{
break;
}
err_shake = shake( epo_code_shake, // return new epo
image_cluster,
index_cluster,
image_src,
clogBC7,
type,
max_bits,
use_par,
numEntries, // max 16
channels3or4);
if (err_shake < err_cluster)
{
err_cluster = err_shake;
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
epo_code_best[ ch] = clampEPO(epo_code_shake[ ch], 0, 255);
epo_code_best[4+ch] = clampEPO(epo_code_shake[4+ch], 0, 255);
}
}
}
}
CGV_TYPEINT change = 0;
CGV_TYPEINT better = 0;
if ((err_cluster != CMP_FLOAT_MAX))
{
//=========================
// test results for quality
//=========================
err_requant = requantized_image_err(
index_best, // new index results
epo_code_best, // prior result input
clogBC7,
max_bits,
image_src,
numEntries,
channels3or4);
// change/better
// Have the index values changed from the last set?
for (CGV_ENTRIES k=0;k<numEntries;k++)
change = change || (index_cluster[k] != index_best[k]);
if (err_requant < err_best)
{
better = 1;
for (CGV_ENTRIES k=0;k<numEntries;k++)
{
index_io[k]=index_tmp[k]=index_best[k];
}
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
{
epo_code_out[ ch]=epo_code_best[0*4+ch];
epo_code_out[4+ch]=epo_code_best[1*4+ch];
}
err_best=err_requant;
}
}
// Early out if we have our target err
if( err_best <= u_BC7Encode->errorThreshold)
{
break;
}
CGV_TYPEINT done;
done = !(change && better);
if ((maxTry > 0)&&(!done))
{
maxTry--;
MaxIndex = index_collapse(index_tmp, numEntries);
}
else
{
maxTry = 0;
}
} while (maxTry);
if (err_best == CMP_FLOAT_MAX)
{
ASPM_PRINT(("Err: requantized_image_err\n"));
}
return err_best;
}
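// Maps the encoder quality setting to the number of partition shapes that
// will be searched. Below the fast-quality threshold the search fraction
// starts from a 30% floor and grows with quality; above it the fraction
// tracks quality directly. Illustrative example (assuming, for
// illustration only, BC7_qFAST_THRESHOLD = 0.5): quality 0.25 searches
// 64 * (0.30 + 0.25*0.5) = 27 of the 64 two-subset partitions.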
CGU_UINT8 get_partitionsToTry(uniform CMP_GLOBAL BC7_Encode u_BC7Encode[],CGU_UINT8 maxPartitions)
{
CGU_FLOAT u_minPartitionSearchSize = 0.30f;
if(u_BC7Encode->quality <= BC7_qFAST_THRESHOLD) // Using this to match performance and quality of CPU code
{
u_minPartitionSearchSize = u_minPartitionSearchSize + ( u_BC7Encode->quality*BC7_qFAST_THRESHOLD);
}
else
{
u_minPartitionSearchSize = u_BC7Encode->quality;
}
return (CGU_UINT8)(maxPartitions * u_minPartitionSearchSize);
}
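// BC7 stores the first (anchor) index with its MSB dropped, which is only
// valid when that bit is zero. If the anchor index falls in the upper half
// of the range, swap the endpoints and replace every index k with
// (levels-1)-k; the block decodes identically but the anchor MSB is now 0.
// Example with 2-bit indices (levels = 4): anchor 3 -> swap -> anchor 0.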
INLINE void cmp_encode_swap(CGV_EPOCODE endpoint[], CGU_INT channels, CGV_INDEX block_index[MAX_SUBSET_SIZE], CGU_INT bits)
{
CGU_INT levels = 1 << bits;
if (block_index[0]>=levels/2)
{
cmp_swap_epo(&endpoint[0], &endpoint[channels], channels);
for (CGU_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
block_index[k] = CGV_INDEX(levels-1) - block_index[k];
}
}
void cmp_encode_index(CGV_CMPOUT data[16], CGU_INT* uniform pPos, CGV_INDEX block_index[MAX_SUBSET_SIZE], CGU_INT bits)
{
cmp_Write8Bit(data,pPos,bits-1,block_index[0]);
for (CGU_INT j=1;j<SOURCE_BLOCK_SIZE;j++)
{
CGV_INDEX qbits = block_index[j]&0xFF;
cmp_Write8Bit(data,pPos,bits,qbits);
}
}
void encode_endpoint(CGV_CMPOUT data[16], CGU_INT* uniform pPos, CGV_BYTE block_index[16], CGU_INT bits, CGV_SHIFT32 flips)
{
CGU_INT levels = 1 << bits;
CGV_TYPEINT flips_shifted = flips;
for (CGU_INT k1=0; k1<16; k1++)
{
CGV_BYTE qbits_shifted = block_index[k1];
for (CGU_INT k2=0; k2<8; k2++)
{
CGV_TYPEINT q = qbits_shifted&15;
if ((flips_shifted&1)>0) q = (levels-1)-q;
if (k1==0 && k2==0) cmp_Write8Bit(data, pPos, bits - 1, static_cast <CGV_BYTE>(q));
else cmp_Write8Bit(data, pPos, bits, static_cast<CGV_BYTE>(q));
qbits_shifted >>= 4;
flips_shifted >>= 1;
}
}
}
INLINE CGV_SHIFT32 pow32(CGV_SHIFT32 x)
{
return 1<<x;
}
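// Packs the partitioned modes. Field sizes per the BC7 specification:
//   mode  subsets  partitionBits  endpointBits  indexBits  p-bits
//    0       3          4              4            3      per endpoint
//    1       2          6              6            3      per subset
//    2       3          6              5            2      none
//    3       2          6              7            2      per endpoint
//    7       2          6              5 (RGBA)     2      per endpoint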
void Encode_mode02137(
CGU_INT blockMode,
CGV_UINT8 bestPartition,
CGV_TYPEUINT32 packedEndpoints[MAX_SUBSETS*2],
CGV_BYTE index16[16],
CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE])
{
CGU_INT partitionBits;
CGU_UINT32 componentBits;
CGU_UINT8 maxSubsets;
CGU_INT channels;
CGU_BYTE indexBits;
switch(blockMode)
{
case 0:
componentBits = 4;
maxSubsets = 3;
partitionBits = 4;
channels = 3;
indexBits = 3;
break;
case 2:
componentBits = 5;
maxSubsets = 3;
partitionBits = 6;
channels = 3;
indexBits = 2;
break;
case 3:
componentBits = 7;
maxSubsets = 2;
partitionBits = 6;
channels = 3;
indexBits = 2;
break;
case 7:
componentBits = 5;
maxSubsets = 2;
partitionBits = 6;
channels = 4;
indexBits = 2;
break;
default:
case 1:
componentBits = 6;
maxSubsets = 2;
partitionBits = 6;
channels = 3;
indexBits = 3;
break;
}
CGV_BYTE blockindex[SOURCE_BLOCK_SIZE];
CGV_INT indexBitsV = indexBits;
for (CGU_INT k=0; k<COMPRESSED_BLOCK_SIZE; k++) cmp_out[k] = 0;
// mode 0 = 1, mode 1 = 01, mode 2 = 001, mode 3 = 0001, ...
CGU_INT bitPosition = blockMode;
cmp_Write8Bit(cmp_out,&bitPosition,1,1);
// Write partition bits
cmp_Write8Bit(cmp_out,&bitPosition,partitionBits,bestPartition);
// Sort out the index set and tag whether we need to flip the
// endpoints to get the correct state in the implicit index bits
// The implicitly encoded MSB of the fixup index must be 0
CGV_FIXUPINDEX fixup[3];
get_fixuptable(fixup,(maxSubsets==2?bestPartition:bestPartition+64));
// Extract indices and mark subsets that need to have their colours flipped to get the
// right state for the implicit MSB of the fixup index
CGV_INT flipColours[3] = {0, 0, 0};
for (CGV_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
{
blockindex[k] = index16[k];
for (CGU_UINT8 j=0;j<maxSubsets;j++)
{
if(k==fixup[j])
{
if(blockindex[k] & (1<<(indexBitsV-1)))
{
flipColours[j] = 1;
}
}
}
}
// Now we must flip the endpoints where necessary so that the implicitly encoded
// index bits have the correct state
for (CGU_INT subset=0; subset<maxSubsets; subset++)
{
if(flipColours[subset] == 1)
{
CGV_TYPEUINT32 temp = packedEndpoints[subset*2+0];
packedEndpoints[subset*2+0] = packedEndpoints[subset*2+1];
packedEndpoints[subset*2+1] = temp;
}
}
// ...next flip the indices where necessary
for (CGV_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
{
CGV_UINT8 partsub = get_partition_subset(bestPartition,maxSubsets,k);
if(flipColours[partsub] == 1)
{
blockindex[k] = ((1 << indexBitsV) - 1) - blockindex[k];
}
}
// Endpoints are stored in the following order RRRR GGGG BBBB (AAAA) (PPPP)
// i.e. components are packed together
CGV_SHIFT32 unpackedColours[MAX_SUBSETS*2*MAX_CHANNELS];
CGV_BYTE parityBits[MAX_SUBSETS][2];
// Unpack the colour values for the subsets
for (CGU_INT subset=0; subset<maxSubsets; subset++)
{
CGV_SHIFT32 packedColours[2] = {packedEndpoints[subset*2+0],packedEndpoints[subset*2+1]};
if(blockMode == 0 || blockMode == 3 || blockMode == 7) // TWO_PBIT
{
parityBits[subset][0] = packedColours[0] & 1;
parityBits[subset][1] = packedColours[1] & 1;
packedColours[0] >>= 1;
packedColours[1] >>= 1;
}
else
if(blockMode == 1) // ONE_PBIT
{
parityBits[subset][0] = packedColours[1] & 1;
parityBits[subset][1] = packedColours[1] & 1;
packedColours[0] >>= 1;
packedColours[1] >>= 1;
}
else
if(blockMode == 2)
{
parityBits[subset][0] = 0;
parityBits[subset][1] = 0;
}
for (CGU_INT ch=0; ch<channels;ch++)
{
unpackedColours[(subset*2+0)*MAX_CHANNELS+ch] = packedColours[0] & ((1 << componentBits) - 1);
unpackedColours[(subset*2+1)*MAX_CHANNELS+ch] = packedColours[1] & ((1 << componentBits) - 1);
packedColours[0] >>= componentBits;
packedColours[1] >>= componentBits;
}
}
// Loop over component
for (CGU_INT ch=0; ch < channels; ch++)
{
// loop over subsets
for (CGU_INT subset=0; subset<maxSubsets; subset++)
{
cmp_Write8Bit(cmp_out,&bitPosition,componentBits,unpackedColours[(subset*2+0)*MAX_CHANNELS+ch]&0xFF);
cmp_Write8Bit(cmp_out,&bitPosition,componentBits,unpackedColours[(subset*2+1)*MAX_CHANNELS+ch]&0xFF);
}
}
// write parity bits
if (blockMode != 2)
{
for (CGV_INT subset=0; subset<maxSubsets; subset++)
{
if(blockMode == 1) // ONE_PBIT
{
cmp_Write8Bit(cmp_out,&bitPosition,1,parityBits[subset][0]&0x01);
}
else // TWO_PBIT
{
cmp_Write8Bit(cmp_out,&bitPosition,1,parityBits[subset][0]&0x01);
cmp_Write8Bit(cmp_out,&bitPosition,1,parityBits[subset][1]&0x01);
}
}
}
// Encode the index bits
CGV_INT bitPositionV = bitPosition;
for (CGV_FIXUPINDEX k=0; k<SOURCE_BLOCK_SIZE; k++)
{
CGV_UINT8 partsub = get_partition_subset(bestPartition,maxSubsets,k);
// If this is a fixup index then drop the MSB which is implicitly 0
if(k == fixup[partsub])
{
cmp_Write8BitV(cmp_out, bitPositionV, indexBits-1,blockindex[k]&0x07F);
bitPositionV += indexBits-1;
}
else
{
cmp_Write8BitV(cmp_out,bitPositionV, indexBits,blockindex[k]);
bitPositionV += indexBits;
}
}
}
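// Mode 4 layout: 5-bit mode prefix (00001), 2-bit channel rotation,
// 1-bit index-mode selector, 5-bit RGB endpoints, 6-bit alpha endpoints,
// then a 2-bit and a 3-bit index set. When idxMode is set the colour and
// alpha index sets (and their bit widths) trade places.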
void Encode_mode4( CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE],
varying cmp_mode_parameters* uniform params )
{
CGU_INT bitPosition = 4; // Position the pointer at the LSB
for (CGU_INT k=0; k<COMPRESSED_BLOCK_SIZE; k++) cmp_out[k] = 0;
// mode 4 (5 bits) 00001
cmp_Write8Bit(cmp_out,&bitPosition,1,1);
// rotation 2 bits
cmp_Write8Bit(cmp_out, &bitPosition, 2, static_cast <CGV_BYTE> (params->rotated_channel));
// idxMode 1 bit
cmp_Write8Bit(cmp_out, &bitPosition, 1, static_cast <CGV_BYTE> (params->idxMode));
CGU_INT idxBits[2] = {2,3};
if(params->idxMode)
{
idxBits[0] = 3;
idxBits[1] = 2;
// idxMode swaps the roles of the colour and alpha index sets
cmp_swap_index(params->color_index,params->alpha_index,16);
cmp_encode_swap(params->alpha_qendpoint, 4, params->color_index,2);
cmp_encode_swap(params->color_qendpoint, 4, params->alpha_index,3);
}
else
{
cmp_encode_swap(params->color_qendpoint, 4, params->color_index,2);
cmp_encode_swap(params->alpha_qendpoint, 4, params->alpha_index,3);
}
// color endpoints 5 bits each
// R0 : R1
// G0 : G1
// B0 : B1
for (CGU_INT component=0; component < 3; component++)
{
cmp_Write8Bit(cmp_out, &bitPosition, 5, static_cast<CGV_BYTE> (params->color_qendpoint[component]));
cmp_Write8Bit(cmp_out, &bitPosition, 5, static_cast <CGV_BYTE> (params->color_qendpoint[4 + component]));
}
// alpha endpoints (6 bits each)
// A0 : A1
cmp_Write8Bit(cmp_out, &bitPosition, 6, static_cast<CGV_BYTE> (params->alpha_qendpoint[0]));
cmp_Write8Bit(cmp_out, &bitPosition, 6, static_cast<CGV_BYTE> (params->alpha_qendpoint[4]));
// index 2 bits each (31 bits total)
cmp_encode_index(cmp_out, &bitPosition, params->color_index, 2);
// index 3 bits each (47 bits total)
cmp_encode_index(cmp_out, &bitPosition, params->alpha_index, 3);
}
void Encode_mode5( CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE],
varying cmp_mode_parameters* uniform params)
{
for (CGU_INT k=0; k<COMPRESSED_BLOCK_SIZE; k++) cmp_out[k] = 0;
// mode 5 bits = 000001
CGU_INT bitPosition = 5; // Position the pointer at the LSB
cmp_Write8Bit(cmp_out,&bitPosition,1,1);
// Write 2 bit rotation
cmp_Write8Bit(cmp_out, &bitPosition, 2, static_cast<CGV_BYTE> (params->rotated_channel));
cmp_encode_swap(params->color_qendpoint, 4, params->color_index,2);
cmp_encode_swap(params->alpha_qendpoint, 4, params->alpha_index,2);
// color endpoints (7 bits each)
// R0 : R1
// G0 : G1
// B0 : B1
for (CGU_INT component=0; component < 3; component++)
{
cmp_Write8Bit(cmp_out, &bitPosition, 7, static_cast<CGV_BYTE> (params->color_qendpoint[component]));
cmp_Write8Bit(cmp_out, &bitPosition, 7, static_cast <CGV_BYTE> (params->color_qendpoint[4 + component]));
}
// alpha endpoints (8 bits each)
// A0 : A1
cmp_Write8Bit(cmp_out, &bitPosition, 8, static_cast<CGV_BYTE> (params->alpha_qendpoint[0]));
cmp_Write8Bit(cmp_out, &bitPosition, 8, static_cast<CGV_BYTE> (params->alpha_qendpoint[4]));
// color index 2 bits each (31 bits total)
// alpha index 2 bits each (31 bits total)
cmp_encode_index(cmp_out, &bitPosition, params->color_index, 2);
cmp_encode_index(cmp_out, &bitPosition, params->alpha_index, 2);
}
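// Mode 6 layout: 7-bit mode prefix (0000001), a single subset with 7-bit
// RGBA endpoints plus one p-bit per endpoint (the endpoint LSB, split out
// before the 7-bit fields are written), and 4-bit indices.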
void Encode_mode6(
CGV_INDEX index[MAX_SUBSET_SIZE],
CGV_EPOCODE epo_code[8],
CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE])
{
for (CGU_INT k=0; k<COMPRESSED_BLOCK_SIZE; k++) cmp_out[k] = 0;
cmp_encode_swap(epo_code, 4, index,4);
// Mode = 6 bits = 0000001
CGU_INT bitPosition = 6; // Position the pointer at the LSB
cmp_Write8Bit(cmp_out,&bitPosition,1, 1);
// endpoints
for (CGU_INT p=0; p<4; p++)
{
cmp_Write8Bit(cmp_out, &bitPosition, 7, static_cast<CGV_BYTE> (epo_code[0 + p] >> 1));
cmp_Write8Bit(cmp_out, &bitPosition, 7, static_cast<CGV_BYTE> (epo_code[4 + p] >> 1));
}
// p bits
cmp_Write8Bit(cmp_out, &bitPosition, 1, epo_code[0]&1);
cmp_Write8Bit(cmp_out, &bitPosition, 1, epo_code[4]&1);
// quantized values
cmp_encode_index(cmp_out, &bitPosition, index, 4);
}
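//=====================================================================
// Driver for the partitioned modes (0,1,2,3,7): every candidate
// partition is split into subsets and given a quick quantization error;
// partitions are sorted by that error and only the most promising few
// (scaled by the quality setting) are passed to the expensive
// index/endpoint shaker. The winner is packed by Encode_mode02137.
//=====================================================================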
void Compress_mode01237(
CGU_INT blockMode,
BC7_EncodeState EncodeState[],
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
{
CGV_INDEX storedBestindex[MAX_PARTITIONS][MAX_SUBSETS][MAX_SUBSET_SIZE];
CGV_ERROR storedError[MAX_PARTITIONS];
CGV_UINT8 sortedPartition[MAX_PARTITIONS];
EncodeState->numPartitionModes = 64;
EncodeState->maxSubSets = 2;
if (blockMode == 0)
{
EncodeState->numPartitionModes = 16;
EncodeState->channels3or4 = 3;
EncodeState->bits = 26;
EncodeState->clusters = 8;
EncodeState->componentBits = 4;
EncodeState->maxSubSets = 3;
}
else
if (blockMode == 2)
{
EncodeState->channels3or4 = 3;
EncodeState->bits = 30;
EncodeState->clusters = 4;
EncodeState->componentBits = 5;
EncodeState->maxSubSets = 3;
}
else
if (blockMode == 1)
{
EncodeState->channels3or4 = 3;
EncodeState->bits = 37;
EncodeState->clusters = 8;
EncodeState->componentBits = 6;
}
else
if (blockMode == 3)
{
EncodeState->channels3or4 = 3;
EncodeState->bits = 44;
EncodeState->clusters = 4;
EncodeState->componentBits = 7;
}
else
if (blockMode == 7)
{
EncodeState->channels3or4 = 4;
EncodeState->bits = 42; // (2* (R 5 + G 5 + B 5 + A 5)) + 2 parity bits
EncodeState->clusters = 4;
EncodeState->componentBits = 5; // 5 bit components
}
CGV_IMAGE image_subsets[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_CHANNELS];
CGV_ENTRIES subset_entryCount[MAX_SUBSETS] = {0,0,0};
// Loop over the available partitions for the block mode and quantize them
// to figure out the best candidates for further refinement
CGU_UINT8 mode_partitionsToTry;
mode_partitionsToTry = get_partitionsToTry(u_BC7Encode,EncodeState->numPartitionModes);
CGV_UINT8 bestPartition = 0;
for (CGU_INT mode_blockPartition = 0; mode_blockPartition < mode_partitionsToTry; mode_blockPartition++)
{
GetPartitionSubSet_mode01237(
image_subsets,
subset_entryCount,
static_cast<CGV_UINT8>(mode_blockPartition),
EncodeState->image_src,
blockMode,
EncodeState->channels3or4);
CGV_IMAGE subset_image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
CGV_INDEX index_out1[SOURCE_BLOCK_SIZE];
CGV_ERROR err_quant = 0.0F;
// Store the quantize error for this partition so it can be sorted and processed later
for (CGU_INT subset=0; subset < EncodeState->maxSubSets; subset++)
{
CGV_ENTRIES numEntries = subset_entryCount[subset];
for (CGU_INT ii=0; ii<SOURCE_BLOCK_SIZE; ii++)
{
subset_image_src[ii+COMP_RED *SOURCE_BLOCK_SIZE] = image_subsets[subset][ii][0];
subset_image_src[ii+COMP_GREEN*SOURCE_BLOCK_SIZE] = image_subsets[subset][ii][1];
subset_image_src[ii+COMP_BLUE *SOURCE_BLOCK_SIZE] = image_subsets[subset][ii][2];
subset_image_src[ii+COMP_ALPHA*SOURCE_BLOCK_SIZE] = image_subsets[subset][ii][3];
}
CGV_INDEXPACKED color_index2[2];
err_quant += GetQuantizeIndex(
color_index2,
index_out1,
subset_image_src,
numEntries,
EncodeState->clusters,
EncodeState->channels3or4);
for (CGV_INT idx=0; idx < numEntries; idx++)
{
storedBestindex[mode_blockPartition][subset][idx] = index_out1[idx];
}
}
storedError[mode_blockPartition] = err_quant;
}
// Sort the results
sortPartitionProjection( storedError,
sortedPartition,
mode_partitionsToTry);
CGV_EPOCODE epo_code[MAX_SUBSETS*2*MAX_CHANNELS];
CGV_EPOCODE bestEndpoints[MAX_SUBSETS*2*MAX_CHANNELS];
CGV_BYTE bestindex[MAX_SUBSETS*MAX_SUBSET_SIZE];
CGV_ENTRIES bestEntryCount[MAX_SUBSETS];
CGV_BYTE bestindex16[MAX_SUBSET_SIZE];
// Extensive shaking is most important when the ramp is short and
// when we have fewer indices. On a long ramp the quality of the
// initial quantization is relatively more important.
// We modulate the shake size according to the number of ramp indices:
// the more indices we have, the less shaking should be required to
// find a near-optimal match.
CGU_UINT8 numShakeAttempts = max8(1, min8((CGU_UINT8)floor(8 * u_BC7Encode->quality + 0.5), mode_partitionsToTry));
CGV_ERROR err_best = CMP_FLOAT_MAX;
// Now do the endpoint shaking
for (CGU_INT nSA =0; nSA < numShakeAttempts; nSA++)
{
CGV_ERROR err_optimized = 0.0F;
CGV_UINT8 sortedBlockPartition;
sortedBlockPartition = sortedPartition[nSA];
//********************************************
// Get the partition shape for the given mode
//********************************************
GetPartitionSubSet_mode01237(
image_subsets,
subset_entryCount,
sortedBlockPartition,
EncodeState->image_src,
blockMode,
EncodeState->channels3or4);
//*****************************
// Process the partition shape
//*****************************
for (CGU_INT subset=0; subset < EncodeState->maxSubSets; subset++)
{
CGV_ENTRIES numEntries = subset_entryCount[subset];
CGV_IMAGE src_image_block[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
CGV_INDEX index_io[MAX_SUBSET_SIZE];
CGV_EPOCODE tmp_epo_code[8];
for (CGU_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
{
src_image_block[k+COMP_RED*SOURCE_BLOCK_SIZE] = image_subsets[subset][k][0];
src_image_block[k+COMP_GREEN*SOURCE_BLOCK_SIZE] = image_subsets[subset][k][1];
src_image_block[k+COMP_BLUE*SOURCE_BLOCK_SIZE] = image_subsets[subset][k][2];
src_image_block[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] = image_subsets[subset][k][3];
}
for (CGU_INT k=0; k<MAX_SUBSET_SIZE; k++)
{
index_io[k] = storedBestindex[sortedBlockPartition][subset][k];
}
err_optimized += optimize_IndexAndEndPoints(
index_io,
tmp_epo_code,
src_image_block,
numEntries,
static_cast<CGU_UINT8>(EncodeState->clusters), // Mi_
EncodeState->bits,
EncodeState->channels3or4,
u_BC7Encode);
for (CGU_INT k=0; k < MAX_SUBSET_SIZE; k++)
{
storedBestindex[sortedBlockPartition][subset][k] = index_io[k];
}
for (CGU_INT ch=0; ch<MAX_CHANNELS; ch++)
{
epo_code[(subset*2+0)*4+ch] = tmp_epo_code[ ch];
epo_code[(subset*2+1)*4+ch] = tmp_epo_code[4+ch];
}
}
//****************************************
// Check if result is better than the last
//****************************************
if(err_optimized < err_best)
{
bestPartition = sortedBlockPartition;
CGV_INT bestIndexCount = 0;
for (CGU_INT subset=0; subset < EncodeState->maxSubSets; subset++)
{
CGV_ENTRIES numEntries = subset_entryCount[subset];
bestEntryCount[subset] = numEntries;
if(numEntries)
{
for (CGU_INT ch=0; ch < EncodeState->channels3or4; ch++)
{
bestEndpoints[(subset*2+0)*4+ch] = epo_code[(subset*2+0)*4+ch];
bestEndpoints[(subset*2+1)*4+ch] = epo_code[(subset*2+1)*4+ch];
}
for (CGV_ENTRIES k=0; k< numEntries; k++)
{
bestindex[subset*MAX_SUBSET_SIZE+k] = storedBestindex[sortedBlockPartition][subset][k];
bestindex16[bestIndexCount++] = storedBestindex[sortedBlockPartition][subset][k];
}
}
}
err_best = err_optimized;
// Early out if we found we can compress with error below the quality threshold
if(err_best <= u_BC7Encode->errorThreshold)
{
break;
}
}
}
if (blockMode != 7)
err_best += EncodeState->opaque_err;
if(err_best > EncodeState->best_err)
return;
//**************************
// Save the encoded block
//**************************
EncodeState->best_err = err_best;
// Now we have all the data needed to encode the block
// We need to pack the endpoints prior to encoding
CGV_TYPEUINT32 packedEndpoints[MAX_SUBSETS*2] = {0,0,0,0,0,0};
for (CGU_INT subset=0; subset<EncodeState->maxSubSets; subset++)
{
packedEndpoints[(subset*2)+0] = 0;
packedEndpoints[(subset*2)+1] = 0;
if(bestEntryCount[subset])
{
CGU_UINT32 rightAlignment = 0;
// Sort out parity bits
if(blockMode != 2)
{
// Extract the p-bits from the endpoint LSBs
packedEndpoints[(subset*2)+0] = bestEndpoints[(subset*2+0)*4+0] & 1;
packedEndpoints[(subset*2)+1] = bestEndpoints[(subset*2+1)*4+0] & 1;
for (CGU_INT ch=0; ch<EncodeState->channels3or4; ch++)
{
bestEndpoints[(subset*2+0)*4+ch] >>= 1;
bestEndpoints[(subset*2+1)*4+ch] >>= 1;
}
rightAlignment++;
}
// Fixup endpoints
for (CGU_INT ch=0; ch<EncodeState->channels3or4; ch++)
{
packedEndpoints[(subset*2)+0] |= bestEndpoints[((subset*2)+0)*4+ch] << rightAlignment;
packedEndpoints[(subset*2)+1] |= bestEndpoints[((subset*2)+1)*4+ch] << rightAlignment;
rightAlignment += EncodeState->componentBits;
}
}
}
CGV_UINT8 idxCount[3] = {0, 0, 0};
for (CGV_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
{
CGV_UINT8 partsub = get_partition_subset(bestPartition,EncodeState->maxSubSets,k);
CGV_UINT8 idxC = idxCount[partsub];
bestindex16[k] = bestindex[partsub*MAX_SUBSET_SIZE+idxC];
idxCount[partsub] = idxC + 1;
}
Encode_mode02137(
blockMode,
bestPartition,
packedEndpoints,
bestindex16,
EncodeState->cmp_out);
}
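//=====================================================================
// Driver for modes 4 and 5: alpha is coded as a separate scalar channel,
// so the encoder tries all four channel rotations (and, for mode 4, both
// index-mode assignments), quantizes colour and alpha independently, and
// only shakes configurations that beat the best quantizer error so far.
//=====================================================================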
void Compress_mode45(
CGU_INT blockMode,
BC7_EncodeState EncodeState[],
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
{
cmp_mode_parameters best_candidate;
EncodeState->channels3or4 = 4;
cmp_memsetBC7((CGV_BYTE *)&best_candidate, 0, sizeof(cmp_mode_parameters));
if (blockMode == 4)
{
EncodeState->max_idxMode = 2;
EncodeState->modeBits[0] = 30; // bits = 2 * (Red 5 + Grn 5 + Blu 5)
EncodeState->modeBits[1] = 36; // bits = 2 * (Alpha 6+6+6), alpha replicated to 3 channels
EncodeState->numClusters0[0] = 4;
EncodeState->numClusters0[1] = 8;
EncodeState->numClusters1[0] = 8;
EncodeState->numClusters1[1] = 4;
}
else
{
EncodeState->max_idxMode = 1;
EncodeState->modeBits[0] = 42; // bits = 2 * (Red 7 + Grn 7 + Blu 7)
EncodeState->modeBits[1] = 48; // bits = 2 * (Alpha 8+8+8), alpha replicated to 3 channels
EncodeState->numClusters0[0] = 4;
EncodeState->numClusters0[1] = 4;
EncodeState->numClusters1[0] = 4;
EncodeState->numClusters1[1] = 4;
}
CGV_IMAGE src_color_Block[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
CGV_IMAGE src_alpha_Block[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
// Go through each possible rotation and selection of index rotation bits
for (CGU_CHANNEL rotated_channel = 0; rotated_channel < EncodeState->channels3or4; rotated_channel++)
{ // A
for (CGU_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
{
for (CGU_INT p=0; p<3; p++)
{
src_color_Block[k+p*SOURCE_BLOCK_SIZE] = EncodeState->image_src[k+componentRotations[rotated_channel][p+1]*SOURCE_BLOCK_SIZE];
src_alpha_Block[k+p*SOURCE_BLOCK_SIZE] = EncodeState->image_src[k+componentRotations[rotated_channel][0]*SOURCE_BLOCK_SIZE];
}
}
CGV_ERROR err_quantizer;
CGV_ERROR err_bestQuantizer = CMP_FLOAT_MAX;
for (CGU_INT idxMode = 0; idxMode < EncodeState->max_idxMode; idxMode++)
{ // B
CGV_INDEXPACKED color_index2[2]; // reserved, not used
err_quantizer = GetQuantizeIndex(
color_index2,
best_candidate.color_index,
src_color_Block,
SOURCE_BLOCK_SIZE,
EncodeState->numClusters0[idxMode],
3);
err_quantizer += GetQuantizeIndex(
color_index2,
best_candidate.alpha_index,
src_alpha_Block,
SOURCE_BLOCK_SIZE,
EncodeState->numClusters1[idxMode],
3) / 3.0F;
// If quality is high then run the full shaking for this config and
// store the result if it beats the best overall error.
// Otherwise only run the shaking if the error is better than the best
// quantizer error.
if(err_quantizer <= err_bestQuantizer)
{
err_bestQuantizer = err_quantizer;
// Shake size gives the size of the shake cube
CGV_ERROR err_overallError;
err_overallError = optimize_IndexAndEndPoints(
best_candidate.color_index,
best_candidate.color_qendpoint,
src_color_Block,
SOURCE_BLOCK_SIZE,
EncodeState->numClusters0[idxMode],
static_cast<CGU_UINT8>(EncodeState->modeBits[0]),
3,
u_BC7Encode);
// Alpha scalar block
err_overallError += optimize_IndexAndEndPoints(
best_candidate.alpha_index,
best_candidate.alpha_qendpoint,
src_alpha_Block,
SOURCE_BLOCK_SIZE,
EncodeState->numClusters1[idxMode],
static_cast<CGU_UINT8>(EncodeState->modeBits[1]),
3,
u_BC7Encode) / 3.0f;
// If we beat the previous best then encode the block
if(err_overallError < EncodeState->best_err)
{
best_candidate.idxMode = idxMode;
best_candidate.rotated_channel = rotated_channel;
if (blockMode == 4)
Encode_mode4( EncodeState->cmp_out, &best_candidate);
else
Encode_mode5( EncodeState->cmp_out, &best_candidate);
EncodeState->best_err = err_overallError;
}
}
} // B
} // A
}
void Compress_mode6( BC7_EncodeState EncodeState[],
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
{
CGV_ERROR err;
CGV_EPOCODE epo_code_out[8] = {0};
CGV_INDEX best_index_out[MAX_SUBSET_SIZE];
CGV_INDEXPACKED best_packedindex_out[2];
// CGV_IMAGE block_endpoints[8];
// icmp_get_block_endpoints(block_endpoints, EncodeState->image_src, -1, 4);
// icmp_GetQuantizedEpoCode(epo_code_out, block_endpoints, 6,4);
// err = icmp_GetQuantizeIndex(best_packedindex_out, best_index_out, EncodeState->image_src, 4, block_endpoints, 0,4);
err = GetQuantizeIndex(
best_packedindex_out,
best_index_out,
EncodeState->image_src,
16, // numEntries
16, // clusters
4); // channels3or4
//*****************************
// Process the partition shape
//*****************************
err = optimize_IndexAndEndPoints(
best_index_out,
epo_code_out,
EncodeState->image_src,
16, //numEntries
16, // Mi_ = clusters
58, // bits
4, // channels3or4
u_BC7Encode);
//**************************
// Save the encoded block
//**************************
if (err < EncodeState->best_err)
{
EncodeState->best_err = err;
Encode_mode6(
best_index_out,
epo_code_out,
EncodeState->cmp_out);
}
}
void copy_BC7_Encode_settings(BC7_EncodeState EncodeState[], uniform CMP_GLOBAL BC7_Encode settings [])
{
EncodeState->best_err = CMP_FLOAT_MAX;
EncodeState->validModeMask = settings->validModeMask;
#ifdef USE_ICMP
EncodeState->part_count = settings->part_count;
EncodeState->channels = settings->channels;
#endif
}
//===================================== ICMP CODE =========================================================
#ifdef USE_ICMP
//========================================
// Modified Intel Texture Compression Code
//========================================
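// Appends 'bits' bits of bitVal into a little-endian array of packed
// 32-bit words, advancing *offset. A value straddling a word boundary is
// split across two words: e.g. writing 7 bits at offset 30 places the low
// 2 bits in base[0] and the remaining 5 bits in base[1].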
void icmp_Write32Bit(CGV_CMPOUTPACKED base[], CGU_INT* uniform offset, CGU_INT bits, CGV_CMPOUTPACKED bitVal)
{
base[*offset / 32] |= ((CGV_CMPOUTPACKED)bitVal) << (*offset % 32);
if (*offset % 32 + bits > 32)
{
base[*offset / 32 + 1] |= shift_right_uint32(bitVal, 32 - *offset % 32);
}
*offset += bits;
}
//================ 32 bit cmp_out mode encoders ===============
INLINE void icmp_swap_epocode(CGV_EPOCODE u[], CGV_EPOCODE v[], CGU_INT n)
{
for (CGU_INT i = 0; i < n; i++)
{
CGV_EPOCODE t = u[i];
u[i] = v[i];
v[i] = t;
}
}
void icmp_encode_apply_swap(CGV_EPOCODE endpoint[], CGU_INT channel, CGV_INDEXPACKED block_index[2], CGU_INT bits)
{
CGU_INT levels = 1 << bits;
if ((block_index[0] & 15) >= levels / 2)
{
icmp_swap_epocode(&endpoint[0], &endpoint[channel], channel);
for (CGU_INT k = 0; k < 2; k++)
block_index[k] = (CGV_INDEXPACKED)(0x11111111 * (levels - 1)) - block_index[k];
}
}
void icmp_encode_index(CGV_CMPOUTPACKED data[5], CGU_INT* uniform pPos, CGV_INDEXPACKED block_index[2], CGU_INT bits, CGV_MASK flips)
{
CGU_INT levels = 1 << bits;
CGV_MASK flips_shifted = flips;
for (CGU_INT k1 = 0; k1 < 2; k1++)
{
CGV_CMPOUTPACKED qbits_shifted = block_index[k1];
for (CGU_INT k2 = 0; k2 < 8; k2++)
{
CGV_CMPOUTPACKED q = qbits_shifted & 15;
if ((flips_shifted & 1) > 0) q = (levels - 1) - q;
if (k1 == 0 && k2 == 0) icmp_Write32Bit(data, pPos, bits - 1, q);
else icmp_Write32Bit(data, pPos, bits, q);
qbits_shifted >>= 4;
flips_shifted >>= 1;
}
}
}
void icmp_bc7_encode_endpoint2(CGV_CMPOUTPACKED data[5], CGU_INT* uniform pPos, CGV_INDEXPACKED color_index[2], CGU_INT bits, CGV_MASK flips)
{
CGU_INT levels = 1 << bits;
CGV_MASK flips_shifted = flips;
for (CGU_INT k1 = 0; k1 < 2; k1++)
{
CGV_INDEXPACKED qbits_shifted = color_index[k1];
for (CGU_INT k2 = 0; k2 < 8; k2++)
{
CGV_INDEXPACKED q = qbits_shifted & 15;
if ((flips_shifted & 1) > 0) q = (levels - 1) - q;
if (k1 == 0 && k2 == 0) icmp_Write32Bit(data, pPos, bits - 1, q);
else icmp_Write32Bit(data, pPos, bits, q);
qbits_shifted >>= 4;
flips_shifted >>= 1;
}
}
}
INLINE CGV_CMPOUTPACKED icmp_pow2Packed(CGV_FIXUPINDEX x)
{
return 1 << x;
}
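// Deletes one bit at position 'from' inside the packed 128-bit block by
// shifting all higher bits down one place. Used to drop the implicitly
// zero MSB of the extra anchor indices in the 2- and 3-subset modes.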
INLINE void icmp_encode_data_shl_1bit_from(CGV_CMPOUTPACKED data[5], CGV_FIXUPINDEX from)
{
if (from < 96)
{
//assert(from > 64+10);
CGV_CMPOUTPACKED shifted = (data[2] >> 1) | (data[3] << 31);
CGV_CMPOUTPACKED mask = (icmp_pow2Packed(from - 64) - 1) >> 1;
data[2] = (mask&data[2]) | (~mask&shifted);
data[3] = (data[3] >> 1) | (data[4] << 31);
data[4] = data[4] >> 1;
}
else if (from < 128)
{
CGV_CMPOUTPACKED shifted = (data[3] >> 1) | (data[4] << 31);
CGV_CMPOUTPACKED mask = (icmp_pow2Packed(from - 96) - 1) >> 1;
data[3] = (mask&data[3]) | (~mask&shifted);
data[4] = data[4] >> 1;
}
}
INLINE void icmp_get_fixuptable(CGV_FIXUPINDEX fixup[3], CGV_PARTID part_id)
{
// same as CMP SDK v3.1 BC7_FIXUPINDEX1 & BC7_FIXUPINDEX2 for each partition range 0..63
// The data is saved as a packed INT = (BC7_FIXUPINDEX1 << 4) | BC7_FIXUPINDEX2
CMP_STATIC uniform __constant CGV_FIXUPINDEX FIXUPINDEX[] = {
// 2 subset partitions 0..63
0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u,
0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0x20u, 0x20u,
0xf0u, 0xf0u, 0x60u, 0x80u, 0x20u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0x60u,
0x60u, 0x20u, 0x60u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u,
// 3 subset partitions 64..128
0x3fu, 0x38u, 0xf8u, 0xf3u, 0x8fu, 0x3fu, 0xf3u, 0xf8u, 0x8fu, 0x8fu, 0x6fu, 0x6fu, 0x6fu, 0x5fu, 0x3fu, 0x38u,
0x3fu, 0x38u, 0x8fu, 0xf3u, 0x3fu, 0x38u, 0x6fu, 0xa8u, 0x53u, 0x8fu, 0x86u, 0x6au, 0x8fu, 0x5fu, 0xfau, 0xf8u,
0x8fu, 0xf3u, 0x3fu, 0x5au, 0x6au, 0xa8u, 0x89u, 0xfau, 0xf6u, 0x3fu, 0xf8u, 0x5fu, 0xf3u, 0xf6u, 0xf6u, 0xf8u,
0x3fu, 0xf3u, 0x5fu, 0x5fu, 0x5fu, 0x8fu, 0x5fu, 0xafu, 0x5fu, 0xafu, 0x8fu, 0xdfu, 0xf3u, 0xcfu, 0x3fu, 0x38u
};
CGV_FIXUPINDEX skip_packed = FIXUPINDEX[part_id];// gather_int2(FIXUPINDEX, part_id);
fixup[0] = 0;
fixup[1] = skip_packed >> 4;
fixup[2] = skip_packed & 15;
}
void icmp_bc7_encode_adjust_skip_mode01237_2(CGV_CMPOUTPACKED data[5], CGU_INT mode, CGV_PARTID part_id)
{
CGU_INT bits = 2; if (mode == 0 || mode == 1) bits = 3;
CGU_INT maxSubSets = 2; if (mode == 0 || mode == 2) maxSubSets = 3;
CGV_FIXUPINDEX fixup[3];
icmp_get_fixuptable(fixup, part_id);
if (maxSubSets > 2 && fixup[1] < fixup[2])
{
CGV_FIXUPINDEX t = fixup[1]; fixup[1] = fixup[2]; fixup[2] = t;
}
for (CGU_INT j = 1; j < maxSubSets; j++)
{
CGV_FIXUPINDEX k = fixup[j];
icmp_encode_data_shl_1bit_from(data, 128 + (maxSubSets - 1) - (15 - k)*bits);
}
}
INLINE CGV_UINT32 gather_uint32(__constant CGU_UINT32 * const uniform ptr, CGV_INT idx)
{
return ptr[idx]; // (perf warning expected)
}
INLINE CGV_MASK icmp_get_partition_mask(CGV_PARTID part_id, CGU_INT subset)
{
CMP_STATIC uniform __constant CGV_SHIFT32 pattern_mask_table[] = {
// 2 subset partitions
0xCCCC3333u, 0x88887777u, 0xEEEE1111u, 0xECC81337u, 0xC880377Fu, 0xFEEC0113u, 0xFEC80137u, 0xEC80137Fu,
0xC80037FFu, 0xFFEC0013u, 0xFE80017Fu, 0xE80017FFu, 0xFFE80017u, 0xFF0000FFu, 0xFFF0000Fu, 0xF0000FFFu,
0xF71008EFu, 0x008EFF71u, 0x71008EFFu, 0x08CEF731u, 0x008CFF73u, 0x73108CEFu, 0x3100CEFFu, 0x8CCE7331u,
0x088CF773u, 0x3110CEEFu, 0x66669999u, 0x366CC993u, 0x17E8E817u, 0x0FF0F00Fu, 0x718E8E71u, 0x399CC663u,
0xAAAA5555u, 0xF0F00F0Fu, 0x5A5AA5A5u, 0x33CCCC33u, 0x3C3CC3C3u, 0x55AAAA55u, 0x96966969u, 0xA55A5AA5u,
0x73CE8C31u, 0x13C8EC37u, 0x324CCDB3u, 0x3BDCC423u, 0x69969669u, 0xC33C3CC3u, 0x99666699u, 0x0660F99Fu,
0x0272FD8Du, 0x04E4FB1Bu, 0x4E40B1BFu, 0x2720D8DFu, 0xC93636C9u, 0x936C6C93u, 0x39C6C639u, 0x639C9C63u,
0x93366CC9u, 0x9CC66339u, 0x817E7E81u, 0xE71818E7u, 0xCCF0330Fu, 0x0FCCF033u, 0x774488BBu, 0xEE2211DDu,
// 3 subset partitions
0x08CC0133u, 0x8CC80037u, 0xCC80006Fu, 0xEC001331u, 0x330000FFu, 0x00CC3333u, 0xFF000033u, 0xCCCC0033u,
0x0F0000FFu, 0x0FF0000Fu, 0x00F0000Fu, 0x44443333u, 0x66661111u, 0x22221111u, 0x136C0013u, 0x008C8C63u,
0x36C80137u, 0x08CEC631u, 0x3330000Fu, 0xF0000333u, 0x00EE1111u, 0x88880077u, 0x22C0113Fu, 0x443088CFu,
0x0C22F311u, 0x03440033u, 0x69969009u, 0x9960009Fu, 0x03303443u, 0x00660699u, 0xC22C3113u, 0x8C0000EFu,
0x1300007Fu, 0xC4003331u, 0x004C1333u, 0x22229999u, 0x00F0F00Fu, 0x24929249u, 0x29429429u, 0xC30C30C3u,
0xC03C3C03u, 0x00AA0055u, 0xAA0000FFu, 0x30300303u, 0xC0C03333u, 0x90900909u, 0xA00A5005u, 0xAAA0000Fu,
0x0AAA0555u, 0xE0E01111u, 0x70700707u, 0x6660000Fu, 0x0EE01111u, 0x07707007u, 0x06660999u, 0x660000FFu,
0x00660099u, 0x0CC03333u, 0x03303003u, 0x60000FFFu, 0x80807777u, 0x10100101u, 0x000A0005u, 0x08CE8421u
};
CGV_MASK mask_packed = gather_uint32(pattern_mask_table, part_id);
CGV_MASK mask0 = mask_packed & 0xFFFF;
CGV_MASK mask1 = mask_packed >> 16;
CGV_MASK mask = (subset == 2) ? (~mask0)&(~mask1) : ((subset == 0) ? mask0 : mask1);
return mask;
}
#ifdef USE_VARYING
#ifdef ASPM_GPU
INLINE CGV_INDEXPACKED gather_packedindex(CGV_INDEXPACKED* ptr, CGV_FIXUPINDEX idx)
{
return ptr[idx];
}
#else
INLINE CGV_INDEXPACKED gather_packedindex(CMP_CONSTANT varying CGV_INDEXPACKED* CMP_CONSTANT uniform ptr, CGV_FIXUPINDEX idx)
{
return ptr[idx]; // (perf warning expected)
}
#endif
#endif
CGV_MASK icmp_encode_apply_swap_mode01237(CGV_EPOCODE qep[], CGV_INDEXPACKED color_index[2], CGU_INT blockMode, CGV_PARTID part_id)
{
CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3;
CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3;
CGV_MASK flips = 0;
CGU_INT levels = 1 << bits;
CGV_FIXUPINDEX fixup[3];
icmp_get_fixuptable(fixup, part_id);
for (CGU_INT j = 0; j < maxSubSets; j++)
{
CGV_FIXUPINDEX k0 = fixup[j];
#ifdef USE_VARYING
CGV_INDEXPACKED q = ((gather_packedindex(color_index, k0 >> 3) << (28 - (k0 & 7) * 4)) >> 28);
#else
CGV_INDEXPACKED q = ((color_index[k0 >> 3] << (28 - (k0 & 7) * 4)) >> 28);
#endif
if (q >= levels / 2)
{
icmp_swap_epocode(&qep[8 * j], &qep[8 * j + 4], 4);
CGV_MASK partition_mask = icmp_get_partition_mask(part_id, j);
flips |= partition_mask;
}
}
return flips;
}
void icmp_encode_mode01237(CGV_CMPOUTPACKED cmp_out[5], CGV_EPOCODE color_qendpoint[], CGV_INDEXPACKED color_index[2], CGV_PARTID part_id, CGU_INT blockMode)
{
CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3;
CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3;
CGU_INT channels = 3; if (blockMode == 7) channels = 4;
CGV_MASK flips = icmp_encode_apply_swap_mode01237(color_qendpoint, color_index, blockMode, part_id);
for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0;
CGU_INT pos = 0;
// mode 0-3, 7
icmp_Write32Bit(cmp_out, &pos, blockMode + 1, 1 << blockMode);
// partition
if (blockMode == 0)
{
icmp_Write32Bit(cmp_out, &pos, 4, part_id & 15);
}
else
{
icmp_Write32Bit(cmp_out, &pos, 6, part_id & 63);
}
// endpoints
for (CGU_INT ch = 0; ch < channels; ch++)
for (CGU_INT j = 0; j < maxSubSets * 2; j++)
{
if (blockMode == 0)
{
icmp_Write32Bit(cmp_out, &pos, 4, color_qendpoint[j * 4 + 0 + ch] >> 1);
}
else if (blockMode == 1)
{
icmp_Write32Bit(cmp_out, &pos, 6, color_qendpoint[j * 4 + 0 + ch] >> 1);
}
else if (blockMode == 2)
{
icmp_Write32Bit(cmp_out, &pos, 5, color_qendpoint[j * 4 + 0 + ch]);
}
else if (blockMode == 3)
{
icmp_Write32Bit(cmp_out, &pos, 7, color_qendpoint[j * 4 + 0 + ch] >> 1);
}
else if (blockMode == 7)
{
icmp_Write32Bit(cmp_out, &pos, 5, color_qendpoint[j * 4 + 0 + ch] >> 1);
}
//else
//{
// assert(false);
//}
}
// p bits
if (blockMode == 1)
for (CGU_INT j = 0; j < 2; j++)
{
icmp_Write32Bit(cmp_out, &pos, 1, color_qendpoint[j * 8] & 1);
}
if (blockMode == 0 || blockMode == 3 || blockMode == 7)
for (CGU_INT j = 0; j < maxSubSets * 2; j++)
{
icmp_Write32Bit(cmp_out, &pos, 1, color_qendpoint[j * 4] & 1);
}
// quantized values
icmp_bc7_encode_endpoint2(cmp_out, &pos, color_index, bits, flips);
icmp_bc7_encode_adjust_skip_mode01237_2(cmp_out, blockMode, part_id);
}
INLINE void icmp_swap_indexpacked(CGV_INDEXPACKED u[], CGV_INDEXPACKED v[], CGU_INT n)
{
for (CGU_INT i = 0; i < n; i++)
{
CGV_INDEXPACKED t = u[i];
u[i] = v[i];
v[i] = t;
}
}
void icmp_encode_mode4(CGV_CMPOUTPACKED cmp_out[5], varying cmp_mode_parameters* uniform params)
{
CGV_EPOCODE color_qendpoint[8];
CGV_INDEXPACKED color_index[2];
CGV_EPOCODE alpha_qendpoint[2];
CGV_INDEXPACKED alpha_index[2];
CGV_CMPOUTPACKED rotated_channel = params->rotated_channel;
CGV_SHIFT32 idxMode = params->idxMode;
icmp_swap_epocode(params->color_qendpoint, color_qendpoint, 8);
icmp_swap_indexpacked(params->best_color_index, color_index, 2);
icmp_swap_epocode(params->alpha_qendpoint, alpha_qendpoint, 2);
icmp_swap_indexpacked(params->best_alpha_index, alpha_index, 2);
for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0;
CGU_INT pos = 0;
// mode 4 (5 bits) 00001
icmp_Write32Bit(cmp_out, &pos, 5, 16);
// rotation channel 2 bits
icmp_Write32Bit(cmp_out, &pos, 2, (rotated_channel + 1) & 3);
// idxMode 1 bit
icmp_Write32Bit(cmp_out, &pos, 1, idxMode);
if (!idxMode)
{
icmp_encode_apply_swap(color_qendpoint, 4, color_index, 2);
icmp_encode_apply_swap(alpha_qendpoint, 1, alpha_index, 3);
}
else
{
icmp_swap_indexpacked(color_index, alpha_index, 2);
icmp_encode_apply_swap(alpha_qendpoint, 1, color_index, 2);
icmp_encode_apply_swap(color_qendpoint, 4, alpha_index, 3);
}
// color endpoints 5 bits each
// R0 : R1
// G0 : G1
// B0 : B1
for (CGU_INT p = 0; p < 3; p++)
{
CGV_EPOCODE c0 = color_qendpoint[0 + p];
CGV_EPOCODE c1 = color_qendpoint[4 + p];
icmp_Write32Bit(cmp_out, &pos, 5, c0); // 0
icmp_Write32Bit(cmp_out, &pos, 5, c1); // 1
}
// alpha endpoints (6 bits each)
// A0 : A1
icmp_Write32Bit(cmp_out, &pos, 6, alpha_qendpoint[0]);
icmp_Write32Bit(cmp_out, &pos, 6, alpha_qendpoint[1]);
// index data (color index 2 bits each) 31 bits total
icmp_encode_index(cmp_out, &pos, color_index, 2, 0);
// index data (alpha index 3 bits each) 47 bits total
icmp_encode_index(cmp_out, &pos, alpha_index, 3, 0);
}
void icmp_Encode_mode5(CGV_CMPOUTPACKED cmp_out[5], varying cmp_mode_parameters* uniform params)
{
CGV_EPOCODE qep[8];
CGV_INDEXPACKED color_index[2];
CGV_EPOCODE alpha_qendpoint[2];
CGV_INDEXPACKED alpha_index[2];
icmp_swap_epocode(params->color_qendpoint, qep, 8);
icmp_swap_indexpacked(params->best_color_index, color_index, 2);
icmp_swap_epocode(params->alpha_qendpoint, alpha_qendpoint, 2);
icmp_swap_indexpacked(params->best_alpha_index, alpha_index, 2);
CGV_CMPOUTPACKED rotated_channel = params->rotated_channel;
icmp_encode_apply_swap(qep, 4, color_index, 2);
icmp_encode_apply_swap(alpha_qendpoint, 1, alpha_index, 2);
for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0;
CGU_INT pos = 0;
// mode 5
icmp_Write32Bit(cmp_out, &pos, 6, 1 << 5);
// rotated channel
icmp_Write32Bit(cmp_out, &pos, 2, (rotated_channel + 1) & 3);
// endpoints
for (CGU_INT p = 0; p < 3; p++)
{
icmp_Write32Bit(cmp_out, &pos, 7, qep[0 + p]);
icmp_Write32Bit(cmp_out, &pos, 7, qep[4 + p]);
}
// alpha endpoints
icmp_Write32Bit(cmp_out, &pos, 8, alpha_qendpoint[0]);
icmp_Write32Bit(cmp_out, &pos, 8, alpha_qendpoint[1]);
// quantized values
icmp_encode_index(cmp_out, &pos, color_index, 2, 0);
icmp_encode_index(cmp_out, &pos, alpha_index, 2, 0);
}
void icmp_encode_mode6(CGV_CMPOUTPACKED cmp_out[5], CGV_EPOCODE qep[8], CGV_INDEXPACKED color_index[2])
{
icmp_encode_apply_swap(qep, 4, color_index, 4);
for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0;
CGU_INT pos = 0;
// mode 6
icmp_Write32Bit(cmp_out, &pos, 7, 64);
// endpoints
for (CGU_INT p = 0; p < 4; p++)
{
icmp_Write32Bit(cmp_out, &pos, 7, qep[0 + p] >> 1);
icmp_Write32Bit(cmp_out, &pos, 7, qep[4 + p] >> 1);
}
// p bits
icmp_Write32Bit(cmp_out, &pos, 1, qep[0] & 1);
icmp_Write32Bit(cmp_out, &pos, 1, qep[4] & 1);
// quantized values
icmp_encode_index(cmp_out, &pos, color_index, 4, 0);
}
///////////////////////////
// PCA helpers
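// Accumulates masked statistics over the 16 texels in one flat array:
// stats[0..9] = upper-triangular second moments (RR,RG,RB,RA,GG,GB,GA,
// BB,BA,AA), stats[10..13] = per-channel sums, stats[14] = texel count.
// icmp_covar_from_stats then forms the (unnormalized) covariance as
// covar_xy = sum(x*y) - sum(x)*sum(y)/n.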
INLINE void icmp_compute_stats_masked(CGV_IMAGE stats[15], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_CHANNEL channels)
{
for (CGU_INT i = 0; i < 15; i++) stats[i] = 0;
CGV_MASK mask_shifted = mask << 1;
for (CGU_INT k = 0; k < 16; k++)
{
mask_shifted >>= 1;
//if ((mask_shifted&1) == 0) continue;
CGV_MASK flag = (mask_shifted & 1);
CGV_IMAGE rgba[4];
for (CGU_CHANNEL ch = 0; ch < channels; ch++) rgba[ch] = image_src[k + ch * 16];
for (CGU_CHANNEL ch = 0; ch < channels; ch++) rgba[ch] *= flag;
stats[14] += flag;
stats[10] += rgba[0];
stats[11] += rgba[1];
stats[12] += rgba[2];
stats[0] += rgba[0] * rgba[0];
stats[1] += rgba[0] * rgba[1];
stats[2] += rgba[0] * rgba[2];
stats[4] += rgba[1] * rgba[1];
stats[5] += rgba[1] * rgba[2];
stats[7] += rgba[2] * rgba[2];
if (channels == 4)
{
stats[13] += rgba[3];
stats[3] += rgba[0] * rgba[3];
stats[6] += rgba[1] * rgba[3];
stats[8] += rgba[2] * rgba[3];
stats[9] += rgba[3] * rgba[3];
}
}
}
INLINE void icmp_covar_from_stats(CGV_IMAGE covar[10], CGV_IMAGE stats[15], CGU_CHANNEL channels3or4)
{
covar[0] = stats[0] - stats[10 + 0] * stats[10 + 0] / stats[14];
covar[1] = stats[1] - stats[10 + 0] * stats[10 + 1] / stats[14];
covar[2] = stats[2] - stats[10 + 0] * stats[10 + 2] / stats[14];
covar[4] = stats[4] - stats[10 + 1] * stats[10 + 1] / stats[14];
covar[5] = stats[5] - stats[10 + 1] * stats[10 + 2] / stats[14];
covar[7] = stats[7] - stats[10 + 2] * stats[10 + 2] / stats[14];
if (channels3or4 == 4)
{
covar[3] = stats[3] - stats[10 + 0] * stats[10 + 3] / stats[14];
covar[6] = stats[6] - stats[10 + 1] * stats[10 + 3] / stats[14];
covar[8] = stats[8] - stats[10 + 2] * stats[10 + 3] / stats[14];
covar[9] = stats[9] - stats[10 + 3] * stats[10 + 3] / stats[14];
}
}
INLINE void icmp_compute_covar_dc_masked(CGV_IMAGE covar[6], CGV_IMAGE dc[3], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_INT channels3or4)
{
CGV_IMAGE stats[15];
icmp_compute_stats_masked(stats, image_src, mask, channels3or4);
icmp_covar_from_stats(covar, stats, channels3or4);
for (CGU_INT ch = 0; ch < channels3or4; ch++) dc[ch] = stats[10 + ch] / stats[14];
}
INLINE void icmp_ssymv3(CGV_IMAGE a[4], CGV_IMAGE covar[10], CGV_IMAGE b[4])
{
a[0] = covar[0] * b[0] + covar[1] * b[1] + covar[2] * b[2];
a[1] = covar[1] * b[0] + covar[4] * b[1] + covar[5] * b[2];
a[2] = covar[2] * b[0] + covar[5] * b[1] + covar[7] * b[2];
}
INLINE void icmp_ssymv4_2(CGV_IMAGE a[4], CGV_IMAGE covar[10], CGV_IMAGE b[4])
{
a[0] = covar[0] * b[0] + covar[1] * b[1] + covar[2] * b[2] + covar[3] * b[3];
a[1] = covar[1] * b[0] + covar[4] * b[1] + covar[5] * b[2] + covar[6] * b[3];
a[2] = covar[2] * b[0] + covar[5] * b[1] + covar[7] * b[2] + covar[8] * b[3];
a[3] = covar[3] * b[0] + covar[6] * b[1] + covar[8] * b[2] + covar[9] * b[3];
}
#ifndef ASPM
// Computes inverse square root over an implementation-defined range. The maximum error is implementation-defined.
CGV_IMAGE Image_rsqrt(CGV_IMAGE f)
{
CGV_IMAGE sf = sqrt(f);
if (sf != 0)
return 1 / sf;
else
return 0.0f;
}
#endif
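// Power iteration: repeatedly multiply a start vector by the covariance
// matrix; the product converges to the dominant eigenvector, i.e. the
// principal axis of the masked texel set. Renormalizing only every other
// pass keeps values in range without paying for an rsqrt on every step.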
INLINE void icmp_compute_axis(CGV_IMAGE axis[4],
CGV_IMAGE covar[10],
#ifdef ASPM_GPU
CGV_ITTERATIONS powerIterations,
#else
uniform __constant CGV_ITTERATIONS powerIterations,
#endif
CGU_CHANNEL channels)
{
CGV_IMAGE vec[4] = { 1,1,1,1 };
for (CGU_INT i = 0; i < powerIterations; i++)
{
if (channels == 3) icmp_ssymv3(axis, covar, vec);
if (channels == 4) icmp_ssymv4_2(axis, covar, vec);
for (CGU_CHANNEL ch = 0; ch < channels; ch++) vec[ch] = axis[ch];
if (i % 2 == 1) // renormalize every other iteration
{
CGV_IMAGE norm_sq = 0;
for (CGU_CHANNEL ch = 0; ch < channels; ch++)
norm_sq += axis[ch] * axis[ch];
#ifndef ASPM
CGV_IMAGE rnorm = Image_rsqrt(norm_sq);
#else
CGV_IMAGE rnorm = rsqrt(norm_sq);
#endif
for (CGU_CHANNEL ch = 0; ch < channels; ch++) vec[ch] *= rnorm;
}
}
for (CGU_CHANNEL ch = 0; ch < channels; ch++) axis[ch] = vec[ch];
}
void icmp_block_pca_axis(CGV_IMAGE axis[4], CGV_IMAGE dc[4], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_INT channels3or4)
{
uniform __constant CGV_ITTERATIONS powerIterations = 8; // 4 not enough for HQ
CGV_IMAGE covar[10];
icmp_compute_covar_dc_masked(covar, dc, image_src, mask, channels3or4);
CGV_IMAGE inv_var = 1.0 / (256 * 256);
for (CGU_INT k = 0; k < 10; k++)
{
covar[k] *= inv_var;
}
CGV_IMAGE eps = sq_image(0.001F);
covar[0] += eps;
covar[4] += eps;
covar[7] += eps;
covar[9] += eps;
icmp_compute_axis(axis, covar, powerIterations, channels3or4);
}
CGV_IMAGE minImage(CGV_IMAGE a, CGV_IMAGE b) { return a < b ? a : b; }
CGV_IMAGE maxImage(CGV_IMAGE a, CGV_IMAGE b) { return a > b ? a : b; }
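// PCA endpoint seeding: compute the masked subset's principal axis and
// mean, project each selected texel onto the axis, and place the two
// endpoints at the minimum and maximum projections (nudged apart when
// the block is flat and the extents collapse).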
void icmp_block_segment_core(CGV_IMAGE epo_code[], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_INT channels3or4)
{
CGV_IMAGE axis[4];
CGV_IMAGE dc[4];
icmp_block_pca_axis(axis, dc, image_src, mask, channels3or4);
CGV_IMAGE ext[2];
ext[0] = +1e32;
ext[1] = -1e32;
// find min/max
CGV_MASK mask_shifted = mask << 1;
for (CGU_INT k = 0; k < 16; k++)
{
mask_shifted >>= 1;
if ((mask_shifted & 1) == 0) continue;
CGV_IMAGE dot = 0;
for (CGU_INT ch = 0; ch < channels3or4; ch++)
dot += axis[ch] * (image_src[16 * ch + k] - dc[ch]);
ext[0] = minImage(ext[0], dot);
ext[1] = maxImage(ext[1], dot);
}
// create some distance if the endpoints collapse
if (ext[1] - ext[0] < 1.0f)
{
ext[0] -= 0.5f;
ext[1] += 0.5f;
}
for (CGU_INT i = 0; i < 2; i++)
for (CGU_INT ch = 0; ch < channels3or4; ch++)
{
epo_code[4 * i + ch] = ext[i] * axis[ch] + dc[ch];
}
}
INLINE CGV_IMAGE clampf(CGV_IMAGE v, CGV_IMAGE a, CGV_IMAGE b)
{
if (v < a)
return a;
else
if (v > b)
return b;
return v;
}
void icmp_get_block_endpoints(CGV_IMAGE block_endpoints[], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_CHANNEL channels3or4)
{
icmp_block_segment_core(block_endpoints, image_src, mask, channels3or4);
for (CGU_INT i = 0; i < 2; i++)
for (CGU_INT ch = 0; ch < channels3or4; ch++)
{
block_endpoints[4 * i + ch] = clampf(block_endpoints[4 * i + ch], 0.0f, 255.0f);
}
}
void icmp_ep_quant0367_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT blockMode, CGU_INT channels)
{
CGU_INT bits = 7;
if (blockMode == 0) bits = 4;
if (blockMode == 7) bits = 5;
CGU_INT levels = 1 << bits;
CGU_INT levels2 = levels * 2 - 1;
for (CGU_INT i = 0; i < 2; i++)
{
CGV_EPOCODE qep_b[8];
for (CGU_INT b = 0; b < 2; b++)
for (CGU_INT p = 0; p < 4; p++)
{
CGV_EPOCODE v = (CGV_TYPEINT)((ep[i * 4 + p] / 255.0f*levels2 - b) / 2.0f + 0.5f) * 2 + b;
qep_b[b * 4 + p] = clampEPO(v, b, levels2 - 1 + b);
}
CGV_IMAGE ep_b[8];
for (CGU_INT j = 0; j < 8; j++)
ep_b[j] = qep_b[j];
if (blockMode == 0)
for (CGU_INT j = 0; j < 8; j++)
ep_b[j] = expandEPObits(qep_b[j], 5);
CGV_ERROR err0 = 0.0f;
CGV_ERROR err1 = 0.0f;
for (CGU_INT ch = 0; ch < channels; ch++)
{
err0 += sq_image(ep[i * 4 + ch] - ep_b[0 + ch]);
err1 += sq_image(ep[i * 4 + ch] - ep_b[4 + ch]);
}
for (CGU_INT p = 0; p < 4; p++)
qep[i * 4 + p] = (err0 < err1) ? qep_b[0 + p] : qep_b[4 + p];
}
}
void icmp_ep_quant245_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT mode)
{
CGU_INT bits = 5;
if (mode == 5) bits = 7;
CGU_INT levels = 1 << bits;
for (CGU_INT i = 0; i < 8; i++)
{
CGV_EPOCODE v = ((CGV_TYPEINT)(ep[i] / 255.0f*(levels - 1) + 0.5));
qep[i] = clampEPO(v, 0, levels - 1);
}
}
void icmp_ep_quant1_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT mode)
{
CGV_EPOCODE qep_b[16];
for (CGU_INT b = 0; b < 2; b++)
for (CGU_INT i = 0; i < 8; i++)
{
CGV_EPOCODE v = ((CGV_TYPEINT)((ep[i] / 255.0f*127.0f - b) / 2 + 0.5)) * 2 + b;
qep_b[b * 8 + i] = clampEPO(v, b, 126 + b);
}
// dequant
CGV_IMAGE ep_b[16];
for (CGU_INT k = 0; k < 16; k++)
ep_b[k] = expandEPObits(qep_b[k], 7);
CGV_ERROR err0 = 0.0f;
CGV_ERROR err1 = 0.0f;
for (CGU_INT j = 0; j < 2; j++)
for (CGU_INT p = 0; p < 3; p++)
{
err0 += sq_image(ep[j * 4 + p] - ep_b[0 + j * 4 + p]);
err1 += sq_image(ep[j * 4 + p] - ep_b[8 + j * 4 + p]);
}
for (CGU_INT i = 0; i < 8; i++)
qep[i] = (err0 < err1) ? qep_b[0 + i] : qep_b[8 + i];
}
void icmp_ep_quant2_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT blockMode, CGU_INT channels3or4)
{
//assert(mode <= 7);
CMP_STATIC uniform __constant CGV_SUBSETS SubSetTable[] = { 3,2,3,2,1,1,1,2 };
#ifndef ASPM_GPU
uniform CMP_CONSTANT
#endif
CGV_SUBSETS maxSubSets = SubSetTable[blockMode];
if (blockMode == 0 || blockMode == 3 || blockMode == 6 || blockMode == 7)
{
for (CGU_INT i = 0; i < maxSubSets; i++)
icmp_ep_quant0367_2(&qep[i * 8], &ep[i * 8], blockMode, channels3or4);
}
else
if (blockMode == 1)
{
for (CGU_INT i = 0; i < maxSubSets; i++)
icmp_ep_quant1_2(&qep[i * 8], &ep[i * 8], blockMode);
}
else
if (blockMode == 2 || blockMode == 4 || blockMode == 5)
{
for (CGU_INT i = 0; i < maxSubSets; i++)
icmp_ep_quant245_2(&qep[i * 8], &ep[i * 8], blockMode);
}
// else
// assert(false);
}
void icmp_ep_dequant2(CGV_IMAGE ep[], CGV_EPOCODE qep[], CGU_INT blockMode)
{
//assert(mode <= 7);
CMP_STATIC uniform __constant CGV_SUBSETS subSetTable[] = { 3,2,3,2,1,1,1,2 };
#ifndef ASPM_GPU
uniform CMP_CONSTANT
#endif
CGV_SUBSETS maxSubSets = subSetTable[blockMode];
// mode 3, 6 are 8-bit
if (blockMode == 3 || blockMode == 6)
{
for (CGU_INT i = 0; i < 8 * maxSubSets; i++)
ep[i] = qep[i];
}
else
if (blockMode == 1 || blockMode == 5)
{
for (CGU_INT i = 0; i < 8 * maxSubSets; i++)
ep[i] = expandEPObits(qep[i], 7);
}
else
if (blockMode == 0 || blockMode == 2 || blockMode == 4)
{
for (CGU_INT i = 0; i < 8 * maxSubSets; i++)
ep[i] = expandEPObits(qep[i], 5);
}
else
if (blockMode == 7)
{
for (CGU_INT i = 0; i < 8 * maxSubSets; i++)
ep[i] = expandEPObits(qep[i], 6);
}
//else
// assert(false);
}
void icmp_GetQuantizedEpoCode(CGV_EPOCODE epo_code_out[], CGV_IMAGE block_endpoints[], CGU_INT blockMode, CGU_CHANNEL channels3or4)
{
icmp_ep_quant2_2(epo_code_out, block_endpoints, blockMode, channels3or4);
icmp_ep_dequant2(block_endpoints, epo_code_out, blockMode);
}
void icmp_ep_quant_dequant_mode4(CGV_EPOCODE qep[], CGV_IMAGE ep[])
{
icmp_ep_quant2_2(qep, ep, 4, 3);
icmp_ep_dequant2(ep, qep, 4);
}
///////////////////////////
// pixel quantization
//========================================
// Modified Intel Texture Compression Code
//========================================
INLINE uniform __constant CGV_RAMP* uniform icmp_GetRamp(CGU_INT bits)
{
//assert(bits>=2 && bits<=4); // invalid bit size
CMP_STATIC uniform __constant CGV_RAMP unquant_table_2bits[] = { 0, 21, 43, 64 };
CMP_STATIC uniform __constant CGV_RAMP unquant_table_3bits[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
CMP_STATIC uniform __constant CGV_RAMP unquant_table_4bits[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
uniform __constant CGV_RAMP* uniform unquant_tables[] = { unquant_table_2bits, unquant_table_3bits, unquant_table_4bits };
return unquant_tables[bits - 2];
}
#ifdef USE_VARYING
INLINE CGV_IMAGE gather_image(varying CGV_IMAGE* uniform ptr, CGV_SHIFT32 idx)
{
return ptr[idx]; // (perf warning expected)
}
#endif
INLINE CGV_RAMP gather_ramp(
#ifdef ASPM_GPU
CMP_CONSTANT CGV_RAMP* ptr,
#else
CMP_CONSTANT CGV_RAMP* CMP_CONSTANT uniform ptr,
#endif
CGV_INDEX idx)
{
return ptr[idx]; // (perf warning expected)
}
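// Projects each texel onto its subset's endpoint segment, rounds the
// projection to the nearest ramp level, then tests that level against its
// lower neighbour and keeps whichever decodes with the smaller squared
// error. Decoding follows the BC7 interpolation rule:
//   decoded = ((64 - ramp)*ep_a + ramp*ep_b + 32) / 64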
CGV_ERROR icmp_GetQuantizeIndex(
CGV_INDEXPACKED index_packed_out[2],
CGV_INDEX index_out[MAX_SUBSET_SIZE],
CGV_IMAGE image_src[64],
CGU_INT bits,
CGV_IMAGE image_block[],
CGV_SHIFT32 pattern,
CGU_CHANNEL channels3or4)
{
CGV_ERROR total_err = 0;
uniform __constant CGV_RAMP* uniform Ramp = icmp_GetRamp(bits);
CGV_LEVELS levels = 1 << bits;
// 64-bit color_qendpoint: 5% overhead in this function
for (CGU_INT k = 0; k < 2; k++) index_packed_out[k] = 0;
CGV_SHIFT32 pattern_shifted = pattern;
for (CGU_INT k = 0; k < 16; k++)
{
CGV_SHIFT32 j = pattern_shifted & 3;
pattern_shifted >>= 2;
CGV_IMAGE proj = 0;
CGV_IMAGE div = 0;
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++)
{
#ifdef USE_VARYING
CGV_IMAGE ep_a = gather_image(image_block, 8 * j + 0 + ch);
CGV_IMAGE ep_b = gather_image(image_block, 8 * j + 4 + ch);
#else
CGV_IMAGE ep_a = image_block[8 * j + 0 + ch];
CGV_IMAGE ep_b = image_block[8 * j + 4 + ch];
#endif
proj += (image_src[k + ch * 16] - ep_a)*(ep_b - ep_a);
div += sq_image(ep_b - ep_a);
}
proj /= div;
CGV_INDEX index_q1 = (CGV_INDEX)(proj*levels + 0.5);
index_q1 = clampIndex(index_q1, 1, levels - 1);
CGV_ERROR err0 = 0;
CGV_ERROR err1 = 0;
CGV_RAMP ramp0 = gather_ramp(Ramp, index_q1 - 1);
CGV_RAMP ramp1 = gather_ramp(Ramp, index_q1);
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++)
{
#ifdef USE_VARYING
CGV_IMAGE ep_a = gather_image(image_block, 8 * j + 0 + ch);
CGV_IMAGE ep_b = gather_image(image_block, 8 * j + 4 + ch);
#else
CGV_IMAGE ep_a = image_block[8 * j + 0 + ch];
CGV_IMAGE ep_b = image_block[8 * j + 4 + ch];
#endif
CGV_IMAGE dec_v0 = (CGV_TYPEINT)(((64 - ramp0)*ep_a + ramp0 * ep_b + 32) / 64);
CGV_IMAGE dec_v1 = (CGV_TYPEINT)(((64 - ramp1)*ep_a + ramp1 * ep_b + 32) / 64);
err0 += sq_image(dec_v0 - image_src[k + ch * 16]);
err1 += sq_image(dec_v1 - image_src[k + ch * 16]);
}
CGV_ERROR best_err = err1;
CGV_INDEX best_index = index_q1;
if (err0 < err1)
{
best_err = err0;
best_index = index_q1 - 1;
}
index_out[k] = best_index;
index_packed_out[k / 8] += ((CGV_INDEXPACKED)best_index) << 4 * (k % 8);
total_err += best_err;
}
return total_err;
}
///////////////////////////
// LS endpoint refinement
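// Least-squares endpoint refinement. With t = q/(levels-1) a decoded
// texel is (1-t)*ep0 + t*ep1, so minimizing the total squared error over
// (ep0, ep1) yields the 2x2 normal equations
//   [Cxx Cxy] [ep0]   [Atb1]
//   [Cxy Cyy] [ep1] = [Atb2]
// solved in closed form below; the (levels-1) factor in 'scale'
// compensates for accumulating the sums on the raw q values.
// Near-singular systems (flat blocks) fall back to the masked mean.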
void icmp_opt_endpoints(CGV_IMAGE ep[], CGV_IMAGE image_src[64], CGU_INT bits, CGV_INDEXPACKED color_qendpoint[2], CGV_MASK mask, CGU_CHANNEL channels3or4)
{
CGU_INT levels = 1 << bits;
CGV_IMAGE Atb1[4] = { 0,0,0,0 };
CGV_IMAGE sum_q = 0;
CGV_IMAGE sum_qq = 0;
CGV_IMAGE sum[5] = { 0,0,0,0,0 };
CGV_MASK mask_shifted = mask << 1;
for (CGU_INT k1 = 0; k1 < 2; k1++)
{
CGV_INDEXPACKED qbits_shifted = color_qendpoint[k1];
for (CGU_INT k2 = 0; k2 < 8; k2++)
{
CGU_INT k = k1 * 8 + k2;
CGV_IMAGE q = (CGV_TYPEINT)(qbits_shifted & 15);
qbits_shifted >>= 4;
mask_shifted >>= 1;
if ((mask_shifted & 1) == 0) continue;
CGV_LEVELS x = (levels - 1) - q;
CGV_LEVELS y = q;
sum_q += q;
sum_qq += q * q;
sum[4] += 1;
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) sum[ch] += image_src[k + ch * 16];
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) Atb1[ch] += x * image_src[k + ch * 16];
}
}
CGV_IMAGE Atb2[4];
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++)
{
//sum[ch] = dc[ch]*16;
Atb2[ch] = (levels - 1)*sum[ch] - Atb1[ch];
}
CGV_IMAGE Cxx = sum[4] * sq_image(levels - 1) - 2 * (levels - 1)*sum_q + sum_qq;
CGV_IMAGE Cyy = sum_qq;
CGV_IMAGE Cxy = (levels - 1)*sum_q - sum_qq;
CGV_IMAGE scale = (levels - 1) / (Cxx*Cyy - Cxy * Cxy);
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++)
{
ep[0 + ch] = (Atb1[ch] * Cyy - Atb2[ch] * Cxy)*scale;
ep[4 + ch] = (Atb2[ch] * Cxx - Atb1[ch] * Cxy)*scale;
//ep[0+ch] = clamp(ep[0+ch], 0, 255);
//ep[4+ch] = clamp(ep[4+ch], 0, 255);
}
if (img_absf(Cxx*Cyy - Cxy * Cxy) < 0.001f)
{
// flatten
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++)
{
ep[0 + ch] = sum[ch] / sum[4];
ep[4 + ch] = ep[0 + ch];
}
}
}
//////////////////////////
// parameter estimation
void icmp_channel_quant_dequant2(CGV_EPOCODE qep[2], CGV_IMAGE ep[2], CGU_INT epbits)
{
CGV_LEVELS elevels = (1 << epbits);
for (CGU_INT i = 0; i < 2; i++)
{
CGV_EPOCODE v = ((CGV_EPOCODE)(ep[i] / 255.0f*(elevels - 1) + 0.5f));
qep[i] = clampEPO(v, 0, elevels - 1);
ep[i] = expandEPObits(qep[i], epbits);
}
}
void icmp_refineEndpoints(CGV_IMAGE ep[2], CGV_IMAGE block[16], CGU_INT bits, CGV_INDEXPACKED color_index[2])
{
CGU_INT levels = 1 << bits;
CGV_IMAGE Atb1 = 0;
CGV_IMAGE sum_q = 0;
CGV_IMAGE sum_qq = 0;
CGV_IMAGE sum = 0;
for (CGU_INT k1 = 0; k1 < 2; k1++)
{
CGV_INDEXPACKED qbits_shifted = color_index[k1];
for (CGU_INT k2 = 0; k2 < 8; k2++)
{
CGU_INT k = k1 * 8 + k2;
CGV_IMAGE q = (CGV_TYPEINT)(qbits_shifted & 15);
qbits_shifted >>= 4;
CGV_TYPEINT x = (levels - 1) - q;
CGV_TYPEINT y = q;
sum_q += q;
sum_qq += q * q;
sum += block[k];
Atb1 += x * block[k];
}
}
CGV_IMAGE Atb2 = (levels - 1)*sum - Atb1;
CGV_IMAGE Cxx = 16 * sq_image(levels - 1) - 2 * (levels - 1)*sum_q + sum_qq;
CGV_IMAGE Cyy = sum_qq;
CGV_IMAGE Cxy = (levels - 1)*sum_q - sum_qq;
CGV_IMAGE scale = (levels - 1) / (Cxx*Cyy - Cxy * Cxy);
ep[0] = (Atb1*Cyy - Atb2 * Cxy)*scale;
ep[1] = (Atb2*Cxx - Atb1 * Cxy)*scale;
ep[0] = clampf(ep[0], 0.0f, 255.0f);
ep[1] = clampf(ep[1], 0.0f, 255.0f);
if (img_absf(Cxx*Cyy - Cxy * Cxy) < 0.001)
{
ep[0] = sum / 16;
ep[1] = ep[0];
}
}
CGV_ERROR icmp_channelQuantizeIndex(CGV_INDEXPACKED color_index[2], CGV_INDEX index[MAX_SUBSET_SIZE], CGV_IMAGE block[16], CGU_INT bits, CGV_IMAGE ep[])
{
uniform __constant CGV_RAMP* uniform Ramp = icmp_GetRamp(bits);
CGV_LEVELS levels = (1 << bits);
color_index[0] = 0;
color_index[1] = 0;
CGV_ERROR total_err = 0;
for (CGU_INT k = 0; k < 16; k++)
{
CGV_IMAGE proj = (block[k] - ep[0]) / (ep[1] - ep[0] + 0.001f);
CGV_INDEX q1 = (CGV_TYPEINT)(proj*levels + 0.5);
q1 = clampEPO(q1, 1, levels - 1);
CGV_ERROR err0 = 0;
CGV_ERROR err1 = 0;
CGV_RAMP ramp0 = gather_ramp(Ramp, q1 - 1);
CGV_RAMP ramp1 = gather_ramp(Ramp, q1);
CGV_IMAGE dec_v0 = (CGV_TYPEINT)(((64 - ramp0)*ep[0] + ramp0 * ep[1] + 32) / 64);
CGV_IMAGE dec_v1 = (CGV_TYPEINT)(((64 - ramp1)*ep[0] + ramp1 * ep[1] + 32) / 64);
err0 += sq_image(dec_v0 - block[k]);
err1 += sq_image(dec_v1 - block[k]);
CGV_TYPEINT best_err = err1;
CGV_INDEX best_q = q1;
if (err0 < err1)
{
best_err = err0;
best_q = q1 - 1;
}
index[k] = best_q;
color_index[k / 8] += ((CGV_INDEXPACKED)best_q) << 4 * (k % 8);
total_err += best_err;
}
return total_err;
}
CGV_ERROR icmp_optQuantizeIndex(BC7_EncodeState EncodeState[], CGV_INDEXPACKED color_index[2], CGV_INDEX index[MAX_SUBSET_SIZE], CGV_EPOCODE qep[2], CGV_IMAGE block[16], CGU_INT bits, CGU_INT epbits)
{
CGV_IMAGE ep[2] = { 255,0 };
for (CGU_INT k = 0; k < 16; k++)
{
ep[0] = minImage(ep[0], block[k]);
ep[1] = maxImage(ep[1], block[k]);
}
icmp_channel_quant_dequant2(qep, ep, epbits);
CGV_ERROR err = icmp_channelQuantizeIndex(color_index, index, block, bits, ep);
// refine
#ifndef ASPM_GPU
uniform CMP_CONSTANT
#endif
CGV_ITTERATIONS refineIterations = EncodeState->refineIterations;
for (CGU_INT i = 0; i < refineIterations; i++)
{
icmp_refineEndpoints(ep, block, bits, color_index);
icmp_channel_quant_dequant2(qep, ep, epbits);
err = icmp_channelQuantizeIndex(color_index, index, block, bits, ep);
}
return err;
}
INLINE CGV_SHIFT32 icmp_get_pattern2(CGV_PARTID part_id)
{
CMP_STATIC uniform __constant CGV_SHIFT32 pattern_table[] = {
0x50505050u, 0x40404040u, 0x54545454u, 0x54505040u, 0x50404000u, 0x55545450u, 0x55545040u, 0x54504000u,
0x50400000u, 0x55555450u, 0x55544000u, 0x54400000u, 0x55555440u, 0x55550000u, 0x55555500u, 0x55000000u,
0x55150100u, 0x00004054u, 0x15010000u, 0x00405054u, 0x00004050u, 0x15050100u, 0x05010000u, 0x40505054u,
0x00404050u, 0x05010100u, 0x14141414u, 0x05141450u, 0x01155440u, 0x00555500u, 0x15014054u, 0x05414150u,
0x44444444u, 0x55005500u, 0x11441144u, 0x05055050u, 0x05500550u, 0x11114444u, 0x41144114u, 0x44111144u,
0x15055054u, 0x01055040u, 0x05041050u, 0x05455150u, 0x14414114u, 0x50050550u, 0x41411414u, 0x00141400u,
0x00041504u, 0x00105410u, 0x10541000u, 0x04150400u, 0x50410514u, 0x41051450u, 0x05415014u, 0x14054150u,
0x41050514u, 0x41505014u, 0x40011554u, 0x54150140u, 0x50505500u, 0x00555050u, 0x15151010u, 0x54540404u,
0xAA685050u, 0x6A5A5040u, 0x5A5A4200u, 0x5450A0A8u, 0xA5A50000u, 0xA0A05050u, 0x5555A0A0u, 0x5A5A5050u,
0xAA550000u, 0xAA555500u, 0xAAAA5500u, 0x90909090u, 0x94949494u, 0xA4A4A4A4u, 0xA9A59450u, 0x2A0A4250u,
0xA5945040u, 0x0A425054u, 0xA5A5A500u, 0x55A0A0A0u, 0xA8A85454u, 0x6A6A4040u, 0xA4A45000u, 0x1A1A0500u,
0x0050A4A4u, 0xAAA59090u, 0x14696914u, 0x69691400u, 0xA08585A0u, 0xAA821414u, 0x50A4A450u, 0x6A5A0200u,
0xA9A58000u, 0x5090A0A8u, 0xA8A09050u, 0x24242424u, 0x00AA5500u, 0x24924924u, 0x24499224u, 0x50A50A50u,
0x500AA550u, 0xAAAA4444u, 0x66660000u, 0xA5A0A5A0u, 0x50A050A0u, 0x69286928u, 0x44AAAA44u, 0x66666600u,
0xAA444444u, 0x54A854A8u, 0x95809580u, 0x96969600u, 0xA85454A8u, 0x80959580u, 0xAA141414u, 0x96960000u,
0xAAAA1414u, 0xA05050A0u, 0xA0A5A5A0u, 0x96000000u, 0x40804080u, 0xA9A8A9A8u, 0xAAAAAA44u, 0x2A4A5254u
};
return gather_uint32(pattern_table, part_id);
}
CGV_IMAGE icmp_get_pca_bound(CGV_IMAGE covar[10], CGU_CHANNEL channels)
{
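// The covariance trace is the total variance; subtracting an estimate of the
// largest eigenvalue (from a few power iterations) leaves the variance
// orthogonal to the principal axis - a cheap proxy bound for the squared error
// of fitting this subset with a single endpoint pair.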
uniform __constant CGV_TYPEINT powerIterations = 4; // quite approximate, but enough for a bound
CGV_IMAGE inv_var = 1.0 / (256 * 256);
for (CGU_INT k = 0; k < 10; k++)
{
covar[k] *= inv_var;
}
CGV_IMAGE eps = sq_image(0.001);
covar[0] += eps;
covar[4] += eps;
covar[7] += eps;
CGV_IMAGE axis[4];
icmp_compute_axis(axis, covar, powerIterations, channels);
CGV_IMAGE vec[4];
if (channels == 3) icmp_ssymv3(vec, covar, axis);
if (channels == 4) icmp_ssymv4_2(vec, covar, axis);
CGV_IMAGE sq_sum = 0.0f;
for (CGU_INT p = 0; p < channels; p++) sq_sum += sq_image(vec[p]);
CGV_IMAGE lambda = sqrt(sq_sum);
CGV_IMAGE bound = covar[0] + covar[4] + covar[7];
if (channels == 4) bound += covar[9];
bound -= lambda;
bound = maxImage(bound, 0.0f);
return bound;
}
CGV_IMAGE icmp_block_pca_bound_split2(CGV_IMAGE image_src[64], CGV_MASK mask, CGV_IMAGE full_stats[15], CGU_CHANNEL channels)
{
CGV_IMAGE stats[15];
icmp_compute_stats_masked(stats, image_src, mask, channels);
CGV_IMAGE covar1[10];
icmp_covar_from_stats(covar1, stats, channels);
for (CGU_INT i = 0; i < 15; i++)
stats[i] = full_stats[i] - stats[i];
CGV_IMAGE covar2[10];
icmp_covar_from_stats(covar2, stats, channels);
CGV_IMAGE bound = 0.0f;
bound += icmp_get_pca_bound(covar1, channels);
bound += icmp_get_pca_bound(covar2, channels);
return sqrt(bound) * 256;
}
#ifdef USE_VARYING
INLINE void scatter_partid(varying CGV_PARTID* uniform ptr, CGV_TYPEINT idx, CGV_PARTID value)
{
ptr[idx] = value; // (perf warning expected)
}
#endif
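// Partial selection sort: only the first partial_count entries end up sorted
// (ascending). Callers pack entries as (pca_bound << 6) | partition_id, so the
// front of the list holds the most promising partitions.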
void icmp_sort_partlist(CGV_PARTID list[], CGU_INT length, CGU_INT partial_count)
{
for (CGU_INT k = 0; k < partial_count; k++)
{
CGV_TYPEINT best_idx = k;
CGV_PARTID best_value = list[k];
for (CGU_INT i = k + 1; i < length; i++)
{
if (best_value > list[i])
{
best_value = list[i];
best_idx = i;
}
}
// swap
#ifdef USE_VARYING
scatter_partid(list, best_idx, list[k]);
#else
list[best_idx] = list[k];
#endif
list[k] = best_value;
}
}
INLINE void copy_epocode(CGV_EPOCODE u[], CGV_EPOCODE v[], CGU_INT n)
{
for (CGU_INT i = 0; i < n; i++)
{
u[i] = v[i];
}
}
INLINE void copy_indexpacked(CGV_INDEXPACKED u[], CGV_INDEXPACKED v[], CGU_INT n)
{
for (CGU_INT i = 0; i < n; i++)
{
u[i] = v[i];
}
}
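// Evaluate one mode 4 candidate: the rotated channel is swapped into alpha and
// encoded separately with 6-bit alpha endpoints. idxMode 0 uses 2-bit colour /
// 3-bit alpha indices; idxMode 1 swaps that to 3-bit colour / 2-bit alpha.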
void icmp_enc_mode4_candidate(
BC7_EncodeState EncodeState[],
cmp_mode_parameters best_candidate[],
CGV_ERROR best_err[],
CGU_INT rotated_channel,
CGU_INT idxMode)
{
CGU_INT bits = 2;
CGU_INT abits = 3;
CGU_INT aepbits = 6;
if (idxMode == 1)
{
bits = 3;
abits = 2;
}
CGV_IMAGE src_block[48];
for (CGU_INT k = 0; k < 16; k++)
{
for (CGU_INT p = 0; p < 3; p++)
src_block[k + p * 16] = EncodeState->image_src[k + p * 16];
if (rotated_channel < 3)
{
// apply channel rotation
if (EncodeState->channels == 4) src_block[k + rotated_channel * 16] = EncodeState->image_src[k + 3 * 16];
if (EncodeState->channels == 3) src_block[k + rotated_channel * 16] = 255;
}
}
CGV_IMAGE block_endpoints[8];
CGV_INDEXPACKED color_index[2];
CGV_INDEX c_index[MAX_SUBSET_SIZE];
CGV_EPOCODE color_qendpoint[8];
icmp_get_block_endpoints(block_endpoints, src_block, -1, 3);
icmp_ep_quant_dequant_mode4(color_qendpoint, block_endpoints);
CGV_ERROR err = icmp_GetQuantizeIndex(color_index, c_index, src_block, bits, block_endpoints, 0, 3);
// refine
CGU_INT refineIterations = EncodeState->refineIterations;
for (CGU_INT i = 0; i < refineIterations; i++)
{
icmp_opt_endpoints(block_endpoints, src_block, bits, color_index, -1, 3);
icmp_ep_quant_dequant_mode4(color_qendpoint, block_endpoints);
err = icmp_GetQuantizeIndex(color_index, c_index, src_block, bits, block_endpoints, 0, 3);
}
// encoding selected channel
CGV_EPOCODE alpha_qendpoint[2];
CGV_INDEXPACKED alpha_index[2];
CGV_INDEX a_index[MAX_SUBSET_SIZE];
err += icmp_optQuantizeIndex(EncodeState, alpha_index, a_index, alpha_qendpoint, &EncodeState->image_src[rotated_channel * 16], abits, aepbits);
if (err < *best_err)
{
copy_epocode(best_candidate->color_qendpoint, color_qendpoint, 8);
copy_epocode(best_candidate->alpha_qendpoint, alpha_qendpoint, 2);
copy_indexpacked(best_candidate->best_color_index, color_index, 2);
copy_indexpacked(best_candidate->best_alpha_index, alpha_index, 2);
best_candidate->rotated_channel = rotated_channel;
best_candidate->idxMode = idxMode;
*best_err = err;
}
}
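// Evaluate one mode 5 candidate: a single subset with channel rotation,
// 2-bit colour and 2-bit alpha indices, and 8-bit alpha endpoints.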
void icmp_mode5_candidate(
BC7_EncodeState EncodeState[],
cmp_mode_parameters best_candidate[],
CGV_ERROR best_err[],
CGU_INT rotated_channel)
{
CGU_INT bits = 2;
CGU_INT abits = 2;
CGU_INT aepbits = 8;
CGV_IMAGE block[48];
for (CGU_INT k = 0; k < 16; k++)
{
for (CGU_INT p = 0; p < 3; p++)
block[k + p * 16] = EncodeState->image_src[k + p * 16];
if (rotated_channel < 3)
{
// apply channel rotation
if (EncodeState->channels == 4) block[k + rotated_channel * 16] = EncodeState->image_src[k + 3 * 16];
if (EncodeState->channels == 3) block[k + rotated_channel * 16] = 255;
}
}
CGV_IMAGE block_endpoints[8];
CGV_EPOCODE color_qendpoint[8];
CGV_INDEXPACKED color_index[2];
CGV_INDEX c_index[MAX_SUBSET_SIZE];
icmp_get_block_endpoints(block_endpoints, block, -1, 3);
icmp_GetQuantizedEpoCode(color_qendpoint, block_endpoints, 5, 3);
CGV_ERROR err = icmp_GetQuantizeIndex(color_index, c_index, block, bits, block_endpoints, 0, 3);
// refine
CGU_INT refineIterations = EncodeState->refineIterations;
for (CGU_INT i = 0; i < refineIterations; i++)
{
icmp_opt_endpoints(block_endpoints, block, bits, color_index, -1, 3);
icmp_GetQuantizedEpoCode(color_qendpoint, block_endpoints, 5, 3);
err = icmp_GetQuantizeIndex(color_index, c_index, block, bits, block_endpoints, 0, 3);
}
// encoding selected channel
CGV_EPOCODE alpha_qendpoint[2];
CGV_INDEXPACKED alpha_index[2];
CGV_INDEX a_index[MAX_SUBSET_SIZE];
err += icmp_optQuantizeIndex(EncodeState, alpha_index, a_index, alpha_qendpoint, &EncodeState->image_src[rotated_channel * 16], abits, aepbits);
if (err < *best_err)
{
icmp_swap_epocode(best_candidate->color_qendpoint, color_qendpoint, 8);
icmp_swap_indexpacked(best_candidate->best_color_index, color_index, 2);
icmp_swap_epocode(best_candidate->alpha_qendpoint, alpha_qendpoint, 2);
icmp_swap_indexpacked(best_candidate->best_alpha_index, alpha_index, 2);
best_candidate->rotated_channel = rotated_channel;
*best_err = err;
}
}
// =============== Mode Compression
CGV_ERROR icmp_enc_mode01237_part_fast(
CGV_EPOCODE qep[24],
CGV_INDEXPACKED color_index[2],
CGV_INDEX index[MAX_SUBSET_SIZE],
CGV_IMAGE image_src[64],
CGV_PARTID part_id,
CGU_INT blockMode)
{
CGV_SHIFT32 pattern = icmp_get_pattern2(part_id);
CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3;
CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3;
CGU_CHANNEL channels = 3; if (blockMode == 7) channels = 4;
CGV_IMAGE block_endpoints[24];
for (CGU_INT subset = 0; subset < maxSubSets; subset++)
{
CGV_MASK partition_mask = icmp_get_partition_mask(part_id, subset);
icmp_get_block_endpoints(&block_endpoints[subset * 8], image_src, partition_mask, channels);
}
icmp_GetQuantizedEpoCode(qep, block_endpoints, blockMode, channels);
CGV_ERROR total_err = icmp_GetQuantizeIndex(color_index, index, image_src, bits, block_endpoints, pattern, channels);
return total_err;
}
void icmp_enc_mode01237(BC7_EncodeState EncodeState[], CGU_INT blockMode, CGV_PARTID part_list[], CGU_INT part_count)
{
if (part_count == 0) return;
CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3;
CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3;
CGU_CHANNEL channels = 3; if (blockMode == 7) channels = 4;
CGV_EPOCODE best_qep[24];
CGV_INDEXPACKED best_index[2]; // packed colour indices of the best candidate
CGV_PARTID best_part_id = -1;
CGV_ERROR best_err = 1e99;
for (CGU_INT part = 0; part < part_count; part++)
{
CGV_PARTID part_id = part_list[part] & 63;
if (maxSubSets == 3) part_id += 64;
CGV_EPOCODE qep[24];
CGV_INDEXPACKED color_index[2];
CGV_INDEX index[MAX_SUBSET_SIZE];
CGV_ERROR err = icmp_enc_mode01237_part_fast(qep, color_index, index, EncodeState->image_src, part_id, blockMode);
if (err < best_err)
{
for (CGU_INT subset = 0; subset < 8 * maxSubSets; subset++) best_qep[subset] = qep[subset];
for (CGU_INT k = 0; k < 2; k++) best_index[k] = color_index[k];
best_part_id = part_id;
best_err = err;
}
}
// refine
CGU_INT refineIterations = EncodeState->refineIterations;
for (CGU_INT _i = 0; _i < refineIterations; _i++)
{
CGV_IMAGE ep[24];
for (CGU_INT subset = 0; subset < maxSubSets; subset++)
{
CGV_SHIFT32 partition_mask = icmp_get_partition_mask(best_part_id, subset);
icmp_opt_endpoints(&ep[subset * 8], EncodeState->image_src, bits, best_index, partition_mask, channels);
}
CGV_EPOCODE qep[24];
CGV_INDEXPACKED color_index[2];
CGV_INDEX index[MAX_SUBSET_SIZE];
icmp_GetQuantizedEpoCode(qep, ep, blockMode, channels);
CGV_SHIFT32 pattern = icmp_get_pattern2(best_part_id);
CGV_ERROR err = icmp_GetQuantizeIndex(color_index, index, EncodeState->image_src, bits, ep, pattern, channels);
if (err < best_err)
{
for (CGU_INT subset = 0; subset < 8 * maxSubSets; subset++) best_qep[subset] = qep[subset];
for (CGU_INT k = 0; k < 2; k++) best_index[k] = color_index[k];
best_err = err;
}
}
if (blockMode != 7) best_err += EncodeState->opaque_err; // take into account alpha channel
if (best_err < EncodeState->best_err)
{
EncodeState->best_err = best_err;
icmp_encode_mode01237(EncodeState->best_cmp_out, best_qep, best_index, best_part_id, blockMode);
}
}
void icmp_mode5(BC7_EncodeState EncodeState[])
{
cmp_mode_parameters best_candidate;
CGV_ERROR best_err = EncodeState->best_err;
#ifdef ASPM_GPU
cmp_memsetBC7((CGV_BYTE *)&best_candidate, 0, sizeof(cmp_mode_parameters));
#else
memset(&best_candidate, 0, sizeof(cmp_mode_parameters));
#endif
for (CGU_CHANNEL ch = 0; ch < EncodeState->channels; ch++)
{
icmp_mode5_candidate(EncodeState, &best_candidate, &best_err, ch);
}
if (best_err < EncodeState->best_err)
{
EncodeState->best_err = best_err;
EncodeState->cmp_isout16Bytes = FALSE;
icmp_Encode_mode5(EncodeState->best_cmp_out, &best_candidate);
}
}
void icmp_mode6(BC7_EncodeState EncodeState[])
{
CGV_IMAGE block_endpoints[8];
icmp_get_block_endpoints(block_endpoints, EncodeState->image_src, -1, 4);
CGV_EPOCODE epo_code[8];
icmp_GetQuantizedEpoCode(epo_code, block_endpoints, 6, 4);
CGV_INDEXPACKED color_index[2];
CGV_INDEX index[MAX_SUBSET_SIZE];
CGV_ERROR err = icmp_GetQuantizeIndex(color_index, index, EncodeState->image_src, 4, block_endpoints, 0, 4);
// refine
CGU_INT refineIterations = EncodeState->refineIterations;
for (CGU_INT i = 0; i < refineIterations; i++)
{
icmp_opt_endpoints(block_endpoints, EncodeState->image_src, 4, color_index, -1, 4);
icmp_GetQuantizedEpoCode(epo_code, block_endpoints, 6, 4); // mode 6 always encodes 4 channels
err = icmp_GetQuantizeIndex(color_index, index, EncodeState->image_src, 4, block_endpoints, 0, 4);
}
if (err < EncodeState->best_err)
{
EncodeState->best_err = err;
EncodeState->cmp_isout16Bytes = FALSE;
icmp_encode_mode6(EncodeState->best_cmp_out, epo_code, color_index);
}
}
void icmp_mode02(BC7_EncodeState EncodeState[])
{
CGV_PARTID part_list[64];
for (CGU_INT part = 0; part < 64; part++)
part_list[part] = part;
if (EncodeState->validModeMask & 0x01)
icmp_enc_mode01237(EncodeState, 0, part_list, 16);
if (EncodeState->validModeMask & 0x04)
icmp_enc_mode01237(EncodeState, 2, part_list, 64); // usually not worth the time
}
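// Rank all 64 two-subset partitions by their PCA error bound and encode only the
// EncodeState->part_count most promising ones; icmp_mode13 below applies the
// same pre-selection to modes 1 and 3.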
void icmp_mode7(BC7_EncodeState EncodeState[])
{
CGV_IMAGE full_stats[15];
icmp_compute_stats_masked(full_stats, EncodeState->image_src, -1, EncodeState->channels);
CGV_PARTID part_list[64];
for (CGU_INT part = 0; part < 64; part++)
{
CGV_MASK partition_mask = icmp_get_partition_mask(part + 0, 0);
CGV_IMAGE bound12 = icmp_block_pca_bound_split2(EncodeState->image_src, partition_mask, full_stats, EncodeState->channels);
CGV_PARTID bound = (CGV_TYPEINT)(bound12);
part_list[part] = part + bound * 64;
}
icmp_sort_partlist(part_list, 64, EncodeState->part_count);
icmp_enc_mode01237(EncodeState, 7, part_list, EncodeState->part_count);
}
void icmp_mode13(BC7_EncodeState EncodeState[])
{
CGV_IMAGE full_stats[15];
icmp_compute_stats_masked(full_stats, EncodeState->image_src, -1, 3);
CGV_PARTID part_list[64];
for (CGU_INT part = 0; part < 64; part++)
{
CGV_MASK partition_mask = icmp_get_partition_mask(part + 0, 0);
CGV_IMAGE bound12 = icmp_block_pca_bound_split2(EncodeState->image_src, partition_mask, full_stats, 3);
CGV_PARTID bound = (CGV_TYPEINT)(bound12);
part_list[part] = part + bound * 64;
}
icmp_sort_partlist(part_list, 64, EncodeState->part_count);
if (EncodeState->validModeMask & 0x02)
icmp_enc_mode01237(EncodeState, 1, part_list, EncodeState->part_count);
if (EncodeState->validModeMask & 0x08)
icmp_enc_mode01237(EncodeState, 3, part_list, EncodeState->part_count);
}
void icmp_mode4(BC7_EncodeState EncodeState[])
{
cmp_mode_parameters best_candidate;
CGV_ERROR best_err = EncodeState->best_err;
#ifdef ASPM_GPU
cmp_memsetBC7((CGV_BYTE *)&best_candidate, 0, sizeof(cmp_mode_parameters));
#else
memset(&best_candidate, 0, sizeof(cmp_mode_parameters));
#endif
for (CGU_CHANNEL rotated_channel = 0; rotated_channel < EncodeState->channels; rotated_channel++)
{
icmp_enc_mode4_candidate(EncodeState, &best_candidate, &best_err, rotated_channel, 0);
icmp_enc_mode4_candidate(EncodeState, &best_candidate, &best_err, rotated_channel, 1);
}
// mode 4
if (best_err < EncodeState->best_err)
{
EncodeState->best_err = best_err;
icmp_encode_mode4(EncodeState->best_cmp_out, &best_candidate);
}
}
#endif
//===================================== COMPRESS CODE =============================================
bool notValidBlockForMode(
CGU_UINT32 blockMode,
CGU_BOOL blockNeedsAlpha,
CGU_BOOL blockAlphaZeroOne,
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
{
// Skip the alpha-capable modes (4..7) when the block does not need alpha
if((blockNeedsAlpha == FALSE) && (blockMode > 3))
{
return TRUE;
}
// Optional restriction for colour-only blocks: avoid the modes with combined
// colour+alpha endpoints (6 and 7). Due to endpoint parity the encoder could
// otherwise pick an alpha slightly below 1.0 and make an opaque block
// accidentally semi-transparent (applications encoding 3-component textures
// often assume the 4th component is exactly 1.0 everywhere).
if ((blockNeedsAlpha == FALSE) &&
(u_BC7Encode->colourRestrict == TRUE) &&
((blockMode == 6)||(blockMode == 7))) // COMBINED_ALPHA
{
return TRUE;
}
// Optional restriction for blocks with alpha to avoid issues with
// punch-through or thresholded alpha encoding
if((blockNeedsAlpha == TRUE) &&
(u_BC7Encode->alphaRestrict == TRUE) &&
(blockAlphaZeroOne == TRUE) &&
((blockMode == 6)||(blockMode == 7))) // COMBINED_ALPHA
{
return TRUE;
}
return FALSE;
}
void BC7_CompressBlock(
BC7_EncodeState EncodeState[],
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
{
CGU_BOOL blockNeedsAlpha = FALSE;
CGU_BOOL blockAlphaZeroOne = FALSE;
CGV_ERROR alpha_err = 0.0f;
CGV_IMAGE alpha_min = 255.0F;
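// Scan the alpha channel once: track the minimum alpha (anything below 255 means
// the block needs an alpha-capable mode), accumulate the squared error of forcing
// alpha to 255 (opaque_err, charged to the colour-only modes), and note whether
// any texel has an alpha of exactly 0 or 255 (used by the punch-through restriction).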
for (CGU_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
{
if ( EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] < alpha_min)
alpha_min = EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE];
alpha_err += sq_image( EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE]-255.0F);
if (blockAlphaZeroOne == FALSE)
{
if(( EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] == 255.0F) ||
( EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] == 0.0F))
{
blockAlphaZeroOne = TRUE;
}
}
}
if (alpha_min != 255.0F)
{
blockNeedsAlpha = TRUE;
}
EncodeState->best_err = CMP_FLOAT_MAX;
EncodeState->opaque_err = alpha_err;
#ifdef USE_ICMP
EncodeState->refineIterations = 4;
EncodeState->fastSkipTreshold = 4;
EncodeState->channels = 4;
EncodeState->part_count = 64;
EncodeState->cmp_isout16Bytes = FALSE;
#else
EncodeState->cmp_isout16Bytes = TRUE;
#endif
// Visit the block modes in an order that maximizes the chance of an early out.
// This is a significant performance optimization for the lower quality settings,
// where the exit threshold is higher, and it also tends to improve quality: the
// generally higher-quality modes are enumerated first, so the first encoding that
// passes the threshold tends to pass it by a greater margin than with a naive
// ordering, reducing the overall error.
CGU_INT blockModeOrder[NUM_BLOCK_TYPES] = {4, 6, 1, 3, 0, 2, 7, 5};
for (CGU_INT block=0; block < NUM_BLOCK_TYPES; block++)
{
CGU_INT blockMode = blockModeOrder[block];
if (u_BC7Encode->quality < BC7_qFAST_THRESHOLD)
{
if ( notValidBlockForMode(blockMode,blockNeedsAlpha,blockAlphaZeroOne,u_BC7Encode) )
continue;
}
CGU_INT Mode = 0x0001 << blockMode;
if (!(u_BC7Encode->validModeMask & Mode))
continue;
switch (blockMode)
{
// image processing with no alpha
case 0:
#ifdef USE_ICMP
icmp_mode02(EncodeState);
#else
Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
#endif
break;
case 1:
#ifdef USE_ICMP
icmp_mode13(EncodeState);
#else
Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
#endif
break;
case 2:
#ifdef USE_ICMP
icmp_mode13(EncodeState);
#else
Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
#endif
break;
case 3:
#ifdef USE_ICMP
icmp_mode13(EncodeState);
#else
Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
#endif
break;
// image processing with alpha
case 4:
#ifdef USE_ICMP
icmp_mode4(EncodeState);
#else
Compress_mode45(blockMode, EncodeState, u_BC7Encode);
#endif
break;
case 5:
#ifdef USE_ICMP
icmp_mode5(EncodeState);
#else
Compress_mode45(blockMode, EncodeState, u_BC7Encode);
#endif
break;
case 6:
#ifdef USE_ICMP
icmp_mode6(EncodeState);
#else
Compress_mode6( EncodeState, u_BC7Encode);
#endif
break;
case 7:
#ifdef USE_ICMP
icmp_mode7(EncodeState);
#else
Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
#endif
break;
}
// Early out if we found we can compress with error below the quality threshold
if( EncodeState->best_err <= u_BC7Encode->errorThreshold)
{
break;
}
}
}
//====================================== BC7_ENCODECLASS END =============================================
#ifndef ASPM_GPU
INLINE void load_block_interleaved_rgba2(CGV_IMAGE image_src[64], uniform texture_surface* uniform src, CGUV_BLOCKWIDTH block_xx, CGU_INT block_yy)
{
for (CGU_INT y=0; y<4; y++)
for (CGU_INT x=0; x<4; x++)
{
CGU_UINT32 * uniform src_ptr = (CGV_SHIFT32*)&src->ptr[(block_yy*4+y)*src->stride];
#ifdef USE_VARYING
CGV_SHIFT32 rgba = gather_partid(src_ptr, block_xx*4+x);
image_src[16*0+y*4+x] = (CGV_FLOAT)((rgba>> 0)&255);
image_src[16*1+y*4+x] = (CGV_FLOAT)((rgba>> 8)&255);
image_src[16*2+y*4+x] = (CGV_FLOAT)((rgba>>16)&255);
image_src[16*3+y*4+x] = (CGV_FLOAT)((rgba>>24)&255);
#else
CGV_SHIFT32 rgba = src_ptr[block_xx*4+x];
image_src[16*0+y*4+x] = (CGU_FLOAT)((rgba>> 0)&255);
image_src[16*1+y*4+x] = (CGU_FLOAT)((rgba>> 8)&255);
image_src[16*2+y*4+x] = (CGU_FLOAT)((rgba>>16)&255);
image_src[16*3+y*4+x] = (CGU_FLOAT)((rgba>>24)&255);
#endif
}
}
#if defined(CMP_USE_FOREACH_ASPM) || defined(USE_VARYING)
INLINE void scatter_uint2(CGU_UINT32 * ptr, CGUV_BLOCKWIDTH idx, CGV_SHIFT32 value)
{
ptr[idx] = value; // (perf warning expected)
}
#endif
INLINE void store_data_uint32(CGU_UINT8 dst[], CGU_INT width, CGUV_BLOCKWIDTH v_xx, CGU_INT yy, CGV_SHIFT32 data[], CGU_INT data_size)
{
for (CGU_INT k=0; k<data_size; k++)
{
CGU_UINT32 * dst_ptr = (CGV_SHIFT32*)&dst[(yy)*width*data_size];
#ifdef USE_VARYING
scatter_uint2(dst_ptr, v_xx*data_size+k, data[k]);
#else
dst_ptr[v_xx*data_size+k] = data[k];
#endif
}
}
#ifdef USE_VARYING
INLINE void scatter_uint8(CGU_UINT8* ptr, CGV_SHIFT32 idx, CGV_CMPOUT value)
{
ptr[idx] = value; // (perf warning expected)
}
#endif
INLINE void store_data_uint8(CGU_UINT8 u_dstptr[], CGU_INT src_width, CGUV_BLOCKWIDTH block_x, CGU_INT block_y, CGUV_CMPOUT data[], CGU_INT data_size)
{
for (CGU_INT k=0; k<data_size; k++)
{
#ifdef USE_VARYING
CGU_UINT8* dst_blockptr = (CGUV_DSTPTR*)&u_dstptr[(block_y*src_width*4)];
scatter_uint8(dst_blockptr,k+(block_x*data_size),data[k]);
#else
u_dstptr[(block_y*src_width*4)+k+(block_x*data_size)] = data[k];
#endif
}
}
INLINE void store_data_uint32(CGU_UINT8 dst[], CGV_SHIFT32 width, CGUV_BLOCKWIDTH v_xx, CGU_INT yy, CGV_SHIFT32 data[], CGU_INT data_size)
{
for (CGU_INT k = 0; k < data_size; k++)
{
#if defined(CMP_USE_FOREACH_ASPM) || defined(USE_VARYING)
CGU_UINT32 * dst_ptr = (CGV_SHIFT32*)&dst[(yy)*width*data_size];
scatter_uint2(dst_ptr, v_xx*data_size + k, data[k]);
#else
dst[((yy)*width*data_size) + v_xx * data_size + k] = data[k];
#endif
}
}
void CompressBlockBC7_XY(uniform texture_surface u_srcptr[], CGUV_BLOCKWIDTH block_x, CGU_INT block_y, CGU_UINT8 u_dst[], uniform BC7_Encode u_settings[])
{
BC7_EncodeState _state;
varying BC7_EncodeState* uniform state = &_state;
copy_BC7_Encode_settings(state, u_settings);
load_block_interleaved_rgba2(state->image_src,u_srcptr, block_x, block_y);
BC7_CompressBlock(state, u_settings);
if (state->cmp_isout16Bytes)
store_data_uint8(u_dst, u_srcptr->width, block_x, block_y, state->cmp_out, 16);
else
store_data_uint32(u_dst, u_srcptr->width, block_x, block_y, state->best_cmp_out, 4);
}
CMP_EXPORT void CompressBlockBC7_encode( uniform texture_surface src[], CGU_UINT8 dst[], uniform BC7_Encode settings[])
{
// bc7_isa(); ASPM_PRINT(("ASPM encode [%d,%d]\n",bc7_isa(),src->width,src->height));
for (CGU_INT u_yy = 0; u_yy<src->height/4; u_yy++)
#ifdef CMP_USE_FOREACH_ASPM
foreach (v_xx = 0 ... src->width/4)
{
#else
for (CGUV_BLOCKWIDTH v_xx = 0; v_xx<src->width/4; v_xx++)
{
#endif
CompressBlockBC7_XY(src, v_xx, u_yy, dst, settings);
}
}
#endif
#ifndef ASPM_GPU
#ifndef ASPM
//======================= DECOMPRESS =========================================
#ifndef USE_HIGH_PRECISION_INTERPOLATION_BC7
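// Standard BC7 interpolation weights for 2-, 3- and 4-bit indices; interpolate()
// implements the spec formula ((64 - w)*e0 + w*e1 + 32) >> 6.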
CGU_UINT16 aWeight2[] = { 0, 21, 43, 64 };
CGU_UINT16 aWeight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
CGU_UINT16 aWeight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
CGU_UINT8 interpolate(CGU_UINT8 e0, CGU_UINT8 e1, CGU_UINT8 index, CGU_UINT8 indexprecision)
{
if (indexprecision == 2)
return (CGU_UINT8)(((64 - aWeight2[index])*CGU_UINT16(e0) + aWeight2[index] * CGU_UINT16(e1) + 32) >> 6);
else if (indexprecision == 3)
return (CGU_UINT8)(((64 - aWeight3[index])*CGU_UINT16(e0) + aWeight3[index] * CGU_UINT16(e1) + 32) >> 6);
else // indexprecision == 4
return (CGU_UINT8)(((64 - aWeight4[index])*CGU_UINT16(e0) + aWeight4[index] * CGU_UINT16(e1) + 32) >> 6);
}
#endif
void GetBC7Ramp(CGU_UINT32 endpoint[][MAX_DIMENSION_BIG],
CGU_FLOAT ramp[MAX_DIMENSION_BIG][(1<<MAX_INDEX_BITS)],
CGU_UINT32 clusters[2],
CGU_UINT32 componentBits[MAX_DIMENSION_BIG])
{
CGU_UINT32 ep[2][MAX_DIMENSION_BIG];
CGU_UINT32 i;
// Expand each endpoint component to 8 bits by shifting the MSB to bit 7
// and then replicating the high bits to the low bits revealed by
// the shift
for(i=0; i<MAX_DIMENSION_BIG; i++)
{
ep[0][i] = 0;
ep[1][i] = 0;
if(componentBits[i])
{
ep[0][i] = (CGU_UINT32)(endpoint[0][i] << (8 - componentBits[i]));
ep[1][i] = (CGU_UINT32)(endpoint[1][i] << (8 - componentBits[i]));
ep[0][i] += (CGU_UINT32)(ep[0][i] >> componentBits[i]);
ep[1][i] += (CGU_UINT32)(ep[1][i] >> componentBits[i]);
ep[0][i] = min8(255, max8(0, static_cast<CGU_UINT8>(ep[0][i])));
ep[1][i] = min8(255, max8(0, static_cast<CGU_UINT8>(ep[1][i])));
}
}
// If this block type has no explicit alpha channel
// then make sure alpha is 1.0 for all points on the ramp
if(!componentBits[COMP_ALPHA])
{
ep[0][COMP_ALPHA] = ep[1][COMP_ALPHA] = 255;
}
CGU_UINT32 rampIndex = clusters[0];
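// clusters[n] is a power of two (1 << indexBits), so this recovers the index
// precision in bits as log2(clusters[n])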
rampIndex = (CGU_UINT32)(log((double)rampIndex) / log(2.0));
// Generate colours for the RGB ramp
for(i=0; i < clusters[0]; i++)
{
#ifdef USE_HIGH_PRECISION_INTERPOLATION_BC7
ramp[COMP_RED][i] = (CGU_FLOAT)floor((ep[0][COMP_RED] * (1.0-rampLerpWeightsBC7[rampIndex][i])) +
(ep[1][COMP_RED] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
ramp[COMP_RED][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_RED][i]));
ramp[COMP_GREEN][i] = (CGU_FLOAT)floor((ep[0][COMP_GREEN] * (1.0-rampLerpWeightsBC7[rampIndex][i])) +
(ep[1][COMP_GREEN] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
ramp[COMP_GREEN][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_GREEN][i]));
ramp[COMP_BLUE][i] = (CGU_FLOAT)floor((ep[0][COMP_BLUE] * (1.0-rampLerpWeightsBC7[rampIndex][i])) +
(ep[1][COMP_BLUE] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
ramp[COMP_BLUE][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_BLUE][i]));
#else
ramp[COMP_RED][i] = interpolate(ep[0][COMP_RED], ep[1][COMP_RED], i, rampIndex);
ramp[COMP_GREEN][i] = interpolate(ep[0][COMP_GREEN], ep[1][COMP_GREEN], i, rampIndex);
ramp[COMP_BLUE][i] = interpolate(ep[0][COMP_BLUE], ep[1][COMP_BLUE], i, rampIndex);
#endif
}
rampIndex = clusters[1];
rampIndex = (CGU_UINT32)(log((CGU_FLOAT)rampIndex) / log(2.0));
if(!componentBits[COMP_ALPHA])
{
for(i=0; i < clusters[1]; i++)
{
ramp[COMP_ALPHA][i] = 255.;
}
}
else
{
// Generate alphas
for(i=0; i < clusters[1]; i++)
{
#ifdef USE_HIGH_PRECISION_INTERPOLATION_BC7
ramp[COMP_ALPHA][i] = (CGU_FLOAT)floor((ep[0][COMP_ALPHA] * (1.0-rampLerpWeightsBC7[rampIndex][i])) +
(ep[1][COMP_ALPHA] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
ramp[COMP_ALPHA][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_ALPHA][i]));
#else
ramp[COMP_ALPHA][i] = interpolate(ep[0][COMP_ALPHA], ep[1][COMP_ALPHA], i, rampIndex);
#endif
}
}
}
//
// Bit reader - reads one bit from a buffer at the current bit offset
// and increments the offset
//
CGU_UINT32 ReadBit(const CGU_UINT8 base[],CGU_UINT32 &m_bitPosition)
{
int byteLocation;
int remainder;
CGU_UINT32 bit = 0;
byteLocation = m_bitPosition/8;
remainder = m_bitPosition % 8;
bit = base[byteLocation];
bit >>= remainder;
bit &= 0x1;
// Increment bit position
m_bitPosition++;
return (bit);
}
void DecompressDualIndexBlock(
CGU_UINT8 out[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG],
const CGU_UINT8 in[COMPRESSED_BLOCK_SIZE],
CGU_UINT32 endpoint[2][MAX_DIMENSION_BIG],
CGU_UINT32 &m_bitPosition,
CGU_UINT32 m_rotation,
CGU_UINT32 m_blockMode,
CGU_UINT32 m_indexSwap,
CGU_UINT32 m_componentBits[MAX_DIMENSION_BIG])
{
CGU_UINT32 i, j, k;
CGU_FLOAT ramp[MAX_DIMENSION_BIG][1<<MAX_INDEX_BITS];
CGU_UINT32 blockIndices[2][MAX_SUBSET_SIZE];
CGU_UINT32 clusters[2];
clusters[0] = 1 << bti[m_blockMode].indexBits[0];
clusters[1] = 1 << bti[m_blockMode].indexBits[1];
if(m_indexSwap)
{
CGU_UINT32 temp = clusters[0];
clusters[0] = clusters[1];
clusters[1] = temp;
}
GetBC7Ramp(endpoint,
ramp,
clusters,
m_componentBits);
// Extract the indices
for(i=0;i<2;i++)
{
for(j=0;j<MAX_SUBSET_SIZE;j++)
{
blockIndices[i][j] = 0;
// If this is a fixup index then clear the implicit bit
if(j==0)
{
blockIndices[i][j] &= ~(1 << (bti[m_blockMode].indexBits[i]-1));
for(k=0;k<static_cast <CGU_UINT32>(bti[m_blockMode].indexBits[i] - 1); k++)
{
blockIndices[i][j] |= (CGU_UINT32)ReadBit(in,m_bitPosition) << k;
}
}
else
{
for(k=0;k<bti[m_blockMode].indexBits[i]; k++)
{
blockIndices[i][j] |= (CGU_UINT32)ReadBit(in,m_bitPosition) << k;
}
}
}
}
// Generate block colours
for(i=0;i<MAX_SUBSET_SIZE;i++)
{
out[i][COMP_ALPHA] = (CGU_UINT8)ramp[COMP_ALPHA][blockIndices[m_indexSwap^1][i]];
out[i][COMP_RED] = (CGU_UINT8)ramp[COMP_RED][blockIndices[m_indexSwap][i]];
out[i][COMP_GREEN] = (CGU_UINT8)ramp[COMP_GREEN][blockIndices[m_indexSwap][i]];
out[i][COMP_BLUE] = (CGU_UINT8)ramp[COMP_BLUE][blockIndices[m_indexSwap][i]];
}
// Resolve the component rotation
CGU_UINT8 swap; // unsigned: channel values are 0..255
for(i=0; i<MAX_SUBSET_SIZE; i++)
{
switch(m_rotation)
{
case 0:
// Do nothing
break;
case 1:
// Swap A and R
swap = out[i][COMP_ALPHA];
out[i][COMP_ALPHA] = out[i][COMP_RED];
out[i][COMP_RED] = swap;
break;
case 2:
// Swap A and G
swap = out[i][COMP_ALPHA];
out[i][COMP_ALPHA] = out[i][COMP_GREEN];
out[i][COMP_GREEN] = swap;
break;
case 3:
// Swap A and B
swap = out[i][COMP_ALPHA];
out[i][COMP_ALPHA] = out[i][COMP_BLUE];
out[i][COMP_BLUE] = swap;
break;
}
}
}
void DecompressBC7_internal(CGU_UINT8 out[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG], const CGU_UINT8 in[COMPRESSED_BLOCK_SIZE], const BC7_Encode *u_BC7Encode)
{
if (u_BC7Encode) {}
CGU_UINT32 i, j;
CGU_UINT32 blockIndices[MAX_SUBSET_SIZE];
CGU_UINT32 endpoint[MAX_SUBSETS][2][MAX_DIMENSION_BIG];
CGU_UINT32 m_blockMode;
CGU_UINT32 m_partition;
CGU_UINT32 m_rotation;
CGU_UINT32 m_indexSwap;
CGU_UINT32 m_bitPosition;
CGU_UINT32 m_componentBits[MAX_DIMENSION_BIG];
m_blockMode = 0;
m_partition = 0;
m_rotation = 0;
m_indexSwap = 0;
// Position the read pointer at the LSB of the block
m_bitPosition = 0;
while (!ReadBit(in, m_bitPosition) && (m_blockMode < 8))
{
m_blockMode++;
}
if (m_blockMode > 7)
{
// Something really bad happened...
return;
}
for (i = 0; i < bti[m_blockMode].rotationBits; i++)
{
m_rotation |= ReadBit(in, m_bitPosition) << i;
}
for (i = 0; i < bti[m_blockMode].indexModeBits; i++)
{
m_indexSwap |= ReadBit(in, m_bitPosition) << i;
}
for (i = 0; i < bti[m_blockMode].partitionBits; i++)
{
m_partition |= ReadBit(in, m_bitPosition) << i;
}
if (bti[m_blockMode].encodingType == NO_ALPHA)
{
m_componentBits[COMP_ALPHA] = 0;
m_componentBits[COMP_RED] =
m_componentBits[COMP_GREEN] =
m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 3;
}
else if (bti[m_blockMode].encodingType == COMBINED_ALPHA)
{
m_componentBits[COMP_ALPHA] =
m_componentBits[COMP_RED] =
m_componentBits[COMP_GREEN] =
m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 4;
}
else if (bti[m_blockMode].encodingType == SEPARATE_ALPHA)
{
m_componentBits[COMP_ALPHA] = bti[m_blockMode].scalarBits;
m_componentBits[COMP_RED] =
m_componentBits[COMP_GREEN] =
m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 3;
}
CGU_UINT32 subset, ep, component;
// Endpoints are stored in the following order RRRR GGGG BBBB (AAAA) (PPPP)
// i.e. components are packed together
// Loop over components
for (component = 0; component < MAX_DIMENSION_BIG; component++)
{
// loop over subsets
for (subset = 0; subset < (int)bti[m_blockMode].subsetCount; subset++)
{
// Loop over endpoints
for (ep = 0; ep < 2; ep++)
{
endpoint[subset][ep][component] = 0;
for (j = 0; j < m_componentBits[component]; j++)
{
endpoint[subset][ep][component] |= ReadBit(in, m_bitPosition) << j;
}
}
}
}
// Now get any parity bits
if (bti[m_blockMode].pBitType != NO_PBIT)
{
for (subset = 0; subset < (int)bti[m_blockMode].subsetCount; subset++)
{
CGU_UINT32 pBit[2];
if (bti[m_blockMode].pBitType == ONE_PBIT)
{
pBit[0] = ReadBit(in, m_bitPosition);
pBit[1] = pBit[0];
}
else if (bti[m_blockMode].pBitType == TWO_PBIT)
{
pBit[0] = ReadBit(in, m_bitPosition);
pBit[1] = ReadBit(in, m_bitPosition);
}
for (component = 0; component < MAX_DIMENSION_BIG; component++)
{
if (m_componentBits[component])
{
endpoint[subset][0][component] <<= 1;
endpoint[subset][1][component] <<= 1;
endpoint[subset][0][component] |= pBit[0];
endpoint[subset][1][component] |= pBit[1];
}
}
}
}
if (bti[m_blockMode].pBitType != NO_PBIT)
{
// Now that we've unpacked the parity bits, update the component size information
// for the ramp generator
for (j = 0; j < MAX_DIMENSION_BIG; j++)
{
if (m_componentBits[j])
{
m_componentBits[j] += 1;
}
}
}
// If this block has two independent sets of indices then put it to that decoder
if (bti[m_blockMode].encodingType == SEPARATE_ALPHA)
{
DecompressDualIndexBlock(out, in, endpoint[0], m_bitPosition, m_rotation, m_blockMode, m_indexSwap, m_componentBits);
return;
}
CGU_UINT32 fixup[MAX_SUBSETS] = { 0, 0, 0 };
switch (bti[m_blockMode].subsetCount)
{
case 3:
fixup[1] = BC7_FIXUPINDICES_LOCAL[2][m_partition][1];
fixup[2] = BC7_FIXUPINDICES_LOCAL[2][m_partition][2];
break;
case 2:
fixup[1] = BC7_FIXUPINDICES_LOCAL[1][m_partition][1];
break;
default:
break;
}
//--------------------------------------------------------------------
// New Code : Possible replacement for BC7_PARTITIONS for CPU code
//--------------------------------------------------------------------
// Extract index bits
// for (i = 0; i < MAX_SUBSET_SIZE; i++)
// {
// CGV_UINT8 p = get_partition_subset(m_partition, bti[m_blockMode].subsetCount - 1, i);
// //CGU_UINT32 p = partitionTable[i];
// blockIndices[i] = 0;
// CGU_UINT32 bitsToRead = bti[m_blockMode].indexBits[0];
//
// // If this is a fixup index then set the implicit bit
// if (i == fixup[p])
// {
// blockIndices[i] &= ~(1 << (bitsToRead - 1));
// bitsToRead--;
// }
//
// for (j = 0; j < bitsToRead; j++)
// {
// blockIndices[i] |= ReadBit(in, m_bitPosition) << j;
// }
// }
CGU_BYTE *partitionTable = (CGU_BYTE*)BC7_PARTITIONS[bti[m_blockMode].subsetCount-1][m_partition];
// Extract index bits
for(i=0; i < MAX_SUBSET_SIZE; i++)
{
CGU_BYTE p = partitionTable[i];
blockIndices[i] = 0;
CGU_BYTE bitsToRead = bti[m_blockMode].indexBits[0];
// If this is a fixup index then set the implicit bit
if(i==fixup[p])
{
blockIndices[i] &= ~(1 << (bitsToRead-1));
bitsToRead--;
}
for(j=0;j<bitsToRead; j++)
{
blockIndices[i] |= ReadBit(in,m_bitPosition) << j;
}
}
// Get the ramps
CGU_UINT32 clusters[2];
clusters[0] = clusters[1] = 1 << bti[m_blockMode].indexBits[0];
// Colour Ramps
CGU_FLOAT c[MAX_SUBSETS][MAX_DIMENSION_BIG][1 << MAX_INDEX_BITS];
for (i = 0; i < (int)bti[m_blockMode].subsetCount; i++)
{
// Unpack the colours
GetBC7Ramp(endpoint[i],
c[i],
clusters,
m_componentBits);
}
//--------------------------------------------------------------------
// New Code : Possible replacement for BC7_PARTITIONS for CPU code
//--------------------------------------------------------------------
// Generate the block colours.
// for (i = 0; i < MAX_SUBSET_SIZE; i++)
// {
// CGV_UINT8 p = get_partition_subset(m_partition, bti[m_blockMode].subsetCount - 1, i);
// out[i][0] = c[p][0][blockIndices[i]];
// out[i][1] = c[p][1][blockIndices[i]];
// out[i][2] = c[p][2][blockIndices[i]];
// out[i][3] = c[p][3][blockIndices[i]];
// }
// Generate the block colours.
for(i=0; i<MAX_SUBSET_SIZE; i++)
{
for(j=0; j < MAX_DIMENSION_BIG; j++)
{
out[i][j] = (CGU_UINT8)c[partitionTable[i]][j][blockIndices[i]];
}
}
}
void CompressBlockBC7_Internal(
CGU_UINT8 image_src[SOURCE_BLOCK_SIZE][4],
CMP_GLOBAL CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE],
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
{
BC7_EncodeState _state = { 0 };
varying BC7_EncodeState* uniform state = &_state;
copy_BC7_Encode_settings(state, u_BC7Encode);
CGU_UINT8 offsetR = 0;
CGU_UINT8 offsetG = 16;
CGU_UINT8 offsetB = 32;
CGU_UINT8 offsetA = 48;
for (CGU_UINT8 i = 0; i < SOURCE_BLOCK_SIZE; i++)
{
state->image_src[offsetR++] = (CGV_IMAGE)image_src[i][0];
state->image_src[offsetG++] = (CGV_IMAGE)image_src[i][1];
state->image_src[offsetB++] = (CGV_IMAGE)image_src[i][2];
state->image_src[offsetA++] = (CGV_IMAGE)image_src[i][3];
}
BC7_CompressBlock(state, u_BC7Encode);
if (state->cmp_isout16Bytes)
{
for (CGU_UINT8 i = 0; i < COMPRESSED_BLOCK_SIZE; i++)
{
cmp_out[i] = state->cmp_out[i];
}
}
else
{
#ifdef ASPM_GPU
cmp_memcpy(cmp_out, (CGU_UINT8 *)state->best_cmp_out, 16);
#else
memcpy(cmp_out, state->best_cmp_out, 16);
#endif
}
}
//======================= CPU USER INTERFACES ====================================
int CMP_CDECL CreateOptionsBC7(void **options)
{
if (!options) return CGU_CORE_ERR_INVALIDPTR;
(*options) = new BC7_Encode;
if (!(*options)) return CGU_CORE_ERR_NEWMEM;
init_BC7ramps();
SetDefaultBC7Options((BC7_Encode *)(*options));
return CGU_CORE_OK;
}
int CMP_CDECL DestroyOptionsBC7(void *options)
{
if (!options) return CGU_CORE_ERR_INVALIDPTR;
BC7_Encode *BCOptions = reinterpret_cast <BC7_Encode *>(options);
delete BCOptions;
return CGU_CORE_OK;
}
int CMP_CDECL SetErrorThresholdBC7(void *options, CGU_FLOAT minThreshold, CGU_FLOAT maxThreshold)
{
if (!options) return CGU_CORE_ERR_INVALIDPTR;
BC7_Encode *BC7optionsDefault = (BC7_Encode *)options;
if (minThreshold < 0.0f) minThreshold = 0.0f;
if (maxThreshold < 0.0f) maxThreshold = 0.0f;
BC7optionsDefault->minThreshold = minThreshold;
BC7optionsDefault->maxThreshold = maxThreshold;
return CGU_CORE_OK;
}
int CMP_CDECL SetQualityBC7(void *options, CGU_FLOAT fquality)
{
if (!options) return CGU_CORE_ERR_INVALIDPTR;
BC7_Encode *BC7optionsDefault = (BC7_Encode *)options;
if (fquality < 0.0f) fquality = 0.0f;
else
if (fquality > 1.0f) fquality = 1.0f;
BC7optionsDefault->quality = fquality;
// Set Error Thresholds
BC7optionsDefault->errorThreshold = BC7optionsDefault->maxThreshold * (1.0f - fquality);
if(fquality > BC7_qFAST_THRESHOLD)
BC7optionsDefault->errorThreshold += BC7optionsDefault->minThreshold;
return CGU_CORE_OK;
}
int CMP_CDECL SetMaskBC7(void *options, CGU_UINT8 mask)
{
if (!options) return CGU_CORE_ERR_INVALIDPTR;
BC7_Encode *BC7options = (BC7_Encode *)options;
BC7options->validModeMask = mask;
return CGU_CORE_OK;
}
int CMP_CDECL SetAlphaOptionsBC7(void *options, CGU_BOOL imageNeedsAlpha, CGU_BOOL colourRestrict, CGU_BOOL alphaRestrict)
{
if (!options) return CGU_CORE_ERR_INVALIDPTR;
BC7_Encode *u_BC7Encode = (BC7_Encode *)options;
u_BC7Encode->imageNeedsAlpha = imageNeedsAlpha;
u_BC7Encode->colourRestrict = colourRestrict;
u_BC7Encode->alphaRestrict = alphaRestrict;
return CGU_CORE_OK;
}
int CMP_CDECL CompressBlockBC7( const unsigned char *srcBlock,
unsigned int srcStrideInBytes,
CMP_GLOBAL unsigned char cmpBlock[16],
const void* options = NULL)
{
CMP_Vec4uc inBlock[SOURCE_BLOCK_SIZE];
//----------------------------------
// Fill the inBlock with source data
//----------------------------------
CGU_INT srcpos = 0;
CGU_INT dstptr = 0;
for (CGU_UINT8 row = 0; row < 4; row++)
{
srcpos = row * srcStrideInBytes;
for (CGU_UINT8 col = 0; col < 4; col++)
{
inBlock[dstptr].x = CGU_UINT8(srcBlock[srcpos++]);
inBlock[dstptr].y = CGU_UINT8(srcBlock[srcpos++]);
inBlock[dstptr].z = CGU_UINT8(srcBlock[srcpos++]);
inBlock[dstptr].w = CGU_UINT8(srcBlock[srcpos++]);
dstptr++;
}
}
BC7_Encode *u_BC7Encode = (BC7_Encode *)options;
BC7_Encode BC7EncodeDefault = { 0 };
if (u_BC7Encode == NULL)
{
u_BC7Encode = &BC7EncodeDefault;
SetDefaultBC7Options(u_BC7Encode);
init_BC7ramps();
}
BC7_EncodeState EncodeState
#ifndef ASPM
= { 0 }
#endif
;
EncodeState.best_err = CMP_FLOAT_MAX;
EncodeState.validModeMask = u_BC7Encode->validModeMask;
EncodeState.part_count = u_BC7Encode->part_count;
EncodeState.channels = static_cast<CGU_CHANNEL>(u_BC7Encode->channels);
CGU_UINT8 offsetR = 0;
CGU_UINT8 offsetG = 16;
CGU_UINT8 offsetB = 32;
CGU_UINT8 offsetA = 48;
CGU_UINT32 offsetSRC = 0;
for (CGU_UINT8 i = 0; i < SOURCE_BLOCK_SIZE; i++)
{
EncodeState.image_src[offsetR++] = (CGV_IMAGE)inBlock[offsetSRC].x;
EncodeState.image_src[offsetG++] = (CGV_IMAGE)inBlock[offsetSRC].y;
EncodeState.image_src[offsetB++] = (CGV_IMAGE)inBlock[offsetSRC].z;
EncodeState.image_src[offsetA++] = (CGV_IMAGE)inBlock[offsetSRC].w;
offsetSRC++;
}
BC7_CompressBlock(&EncodeState, u_BC7Encode);
if (EncodeState.cmp_isout16Bytes)
{
for (CGU_UINT8 i = 0; i < COMPRESSED_BLOCK_SIZE; i++)
{
cmpBlock[i] = EncodeState.cmp_out[i];
}
}
else
{
memcpy(cmpBlock, EncodeState.best_cmp_out, 16);
}
return CGU_CORE_OK;
}
int CMP_CDECL DecompressBlockBC7(const unsigned char cmpBlock[16],
unsigned char srcBlock[64],
const void *options = NULL) {
BC7_Encode *u_BC7Encode = (BC7_Encode *)options;
BC7_Encode BC7EncodeDefault = { 0 };
if (u_BC7Encode == NULL)
{
// no user options supplied: fall back to the defaults
u_BC7Encode = &BC7EncodeDefault;
SetDefaultBC7Options(u_BC7Encode);
init_BC7ramps();
}
DecompressBC7_internal((CGU_UINT8(*)[4])srcBlock, (CGU_UINT8 *)cmpBlock,u_BC7Encode);
return CGU_CORE_OK;
}
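//---------------------------------------------------------------------------------
// Minimal CPU usage sketch for the interfaces above (illustrative only; 'srcRGBA'
// stands for a caller-supplied 4x4 RGBA8 block with 16 bytes per row, and error
// checks are omitted for brevity):
//
//   void *options = NULL;
//   CreateOptionsBC7(&options);
//   SetQualityBC7(options, 0.8f);
//   unsigned char cmp[16];
//   CompressBlockBC7(srcRGBA, 16, cmp, options);
//   unsigned char decoded[64];
//   DecompressBlockBC7(cmp, decoded, options);
//   DestroyOptionsBC7(options);
//---------------------------------------------------------------------------------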
#endif
#endif
//============================================== OpenCL USER INTERFACE ====================================================
#ifdef ASPM_GPU
CMP_STATIC CMP_KERNEL void CMP_GPUEncoder(uniform CMP_GLOBAL const CGU_Vec4uc ImageSource[],
CMP_GLOBAL CGV_CMPOUT ImageDestination[],
uniform CMP_GLOBAL Source_Info SourceInfo[],
uniform CMP_GLOBAL BC7_Encode BC7Encode[] )
{
CGU_INT xID=0;
CGU_INT yID=0;
xID = get_global_id(0); // ToDo: define a size_t (32-bit or 64-bit) based on clGetDeviceInfo
yID = get_global_id(1);
CGU_INT srcWidth = SourceInfo->m_src_width;
CGU_INT srcHeight = SourceInfo->m_src_height;
if (xID >= (srcWidth / BlockX)) return;
if (yID >= (srcHeight / BlockY)) return;
CGU_INT destI = (xID*COMPRESSED_BLOCK_SIZE) + (yID*(srcWidth / BlockX)*COMPRESSED_BLOCK_SIZE);
CGU_INT srcindex = 4 * (yID * srcWidth + xID);
CGU_INT blkindex = 0;
BC7_EncodeState EncodeState;
varying BC7_EncodeState* uniform state = &EncodeState;
copy_BC7_Encode_settings(state, BC7Encode);
//Check if it is a complete 4X4 block
if (((xID + 1)*BlockX <= srcWidth) && ((yID + 1)*BlockY <= srcHeight))
{
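// The inner loops read 4 texels per row; reducing srcWidth by 4 makes
// "srcindex += srcWidth" below advance exactly one image row.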
srcWidth = srcWidth - 4;
for (CGU_INT j = 0; j < 4; j++) {
for (CGU_INT i = 0; i < 4; i++) {
state->image_src[blkindex+0*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].x;
state->image_src[blkindex+1*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].y;
state->image_src[blkindex+2*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].z;
state->image_src[blkindex+3*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].w;
blkindex++;
srcindex++;
}
srcindex += srcWidth;
}
BC7_CompressBlock(&EncodeState, BC7Encode);
for (CGU_INT i=0; i<COMPRESSED_BLOCK_SIZE; i++)
{
ImageDestination[destI+i] = state->cmp_out[i];
}
}
else
{
ASPM_PRINT(("[ASPM_GPU] Unable to process; make sure the image size is divisible by 4\n"));
}
}
#endif