nvidia-texture-tools/extern/CMP_Core/shaders/BC6_Encode_kernel.cpp

//=====================================================================
// Copyright (c) 2020    Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//=====================================================================
#include "BC6_Encode_kernel.h"

#ifdef ASPM_GPU
// Byte fill compiled only when ASPM_GPU is defined.
// Stores `value` into each of the first `size` bytes starting at srcdata;
// a size of zero or less writes nothing.
void memset(CGU_UINT8 *srcdata, CGU_UINT8 value, CGU_INT size)
{
    CGU_INT offset;

    for (offset = 0; offset < size; ++offset)
    {
        srcdata[offset] = value;
    }
}

// Byte copy compiled only when ASPM_GPU is defined.
// Copies `size` bytes from srcdata to dstdata in ascending address order.
void memcpy(CGU_UINT8 *dstdata, CGU_UINT8 *srcdata, CGU_INT size)
{
    CGU_INT offset;

    for (offset = 0; offset < size; ++offset)
    {
        dstdata[offset] = srcdata[offset];
    }
}

// Exchanges the values of A and B.
// NOTE(review): both arguments are received by value, so the exchange below only
// affects the local copies; the caller's variables are left unchanged.
void swap(CGU_INT A, CGU_INT B)
{
    CGU_INT hold = A;
    A = B;
    B = hold;
}

// Under ASPM_GPU the suffixed / C-runtime math names used in this file are
// redirected to the unsuffixed function names.
// NOTE(review): abs is mapped to fabs, so any integer abs() call goes through
// the floating-point routine — confirm this is intended.
#define abs      fabs
#define floorf   floor
#define sqrtf    sqrt
#define logf     log
#define ceilf    ceil

#endif

// Two-subset partition shapes. Row = partition (shape) number, column = entry
// position inside the block; each value is the subset (0 or 1) that entry
// belongs to. Values are grouped in fours purely for readability.
__constant CGU_UINT8   BC6_PARTITIONS[MAX_BC6H_PARTITIONS][MAX_SUBSET_SIZE] = {
    { 0,0,1,1, 0,0,1,1, 0,0,1,1, 0,0,1,1 },   //  0
    { 0,0,0,1, 0,0,0,1, 0,0,0,1, 0,0,0,1 },   //  1
    { 0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1 },   //  2
    { 0,0,0,1, 0,0,1,1, 0,0,1,1, 0,1,1,1 },   //  3
    { 0,0,0,0, 0,0,0,1, 0,0,0,1, 0,0,1,1 },   //  4
    { 0,0,1,1, 0,1,1,1, 0,1,1,1, 1,1,1,1 },   //  5
    { 0,0,0,1, 0,0,1,1, 0,1,1,1, 1,1,1,1 },   //  6
    { 0,0,0,0, 0,0,0,1, 0,0,1,1, 0,1,1,1 },   //  7
    { 0,0,0,0, 0,0,0,0, 0,0,0,1, 0,0,1,1 },   //  8
    { 0,0,1,1, 0,1,1,1, 1,1,1,1, 1,1,1,1 },   //  9
    { 0,0,0,0, 0,0,0,1, 0,1,1,1, 1,1,1,1 },   // 10
    { 0,0,0,0, 0,0,0,0, 0,0,0,1, 0,1,1,1 },   // 11
    { 0,0,0,1, 0,1,1,1, 1,1,1,1, 1,1,1,1 },   // 12
    { 0,0,0,0, 0,0,0,0, 1,1,1,1, 1,1,1,1 },   // 13
    { 0,0,0,0, 1,1,1,1, 1,1,1,1, 1,1,1,1 },   // 14
    { 0,0,0,0, 0,0,0,0, 0,0,0,0, 1,1,1,1 },   // 15
    { 0,0,0,0, 1,0,0,0, 1,1,1,0, 1,1,1,1 },   // 16
    { 0,1,1,1, 0,0,0,1, 0,0,0,0, 0,0,0,0 },   // 17
    { 0,0,0,0, 0,0,0,0, 1,0,0,0, 1,1,1,0 },   // 18
    { 0,1,1,1, 0,0,1,1, 0,0,0,1, 0,0,0,0 },   // 19
    { 0,0,1,1, 0,0,0,1, 0,0,0,0, 0,0,0,0 },   // 20
    { 0,0,0,0, 1,0,0,0, 1,1,0,0, 1,1,1,0 },   // 21
    { 0,0,0,0, 0,0,0,0, 1,0,0,0, 1,1,0,0 },   // 22
    { 0,1,1,1, 0,0,1,1, 0,0,1,1, 0,0,0,1 },   // 23
    { 0,0,1,1, 0,0,0,1, 0,0,0,1, 0,0,0,0 },   // 24
    { 0,0,0,0, 1,0,0,0, 1,0,0,0, 1,1,0,0 },   // 25
    { 0,1,1,0, 0,1,1,0, 0,1,1,0, 0,1,1,0 },   // 26
    { 0,0,1,1, 0,1,1,0, 0,1,1,0, 1,1,0,0 },   // 27
    { 0,0,0,1, 0,1,1,1, 1,1,1,0, 1,0,0,0 },   // 28
    { 0,0,0,0, 1,1,1,1, 1,1,1,1, 0,0,0,0 },   // 29
    { 0,1,1,1, 0,0,0,1, 1,0,0,0, 1,1,1,0 },   // 30
    { 0,0,1,1, 1,0,0,1, 1,0,0,1, 1,1,0,0 },   // 31
};

// Looks up which subset entry `index` belongs to for partition `partI`.
// When `subset` is zero the table is not consulted and subset 0 is returned.
CGU_DWORD get_partition_subset(CGU_INT subset, CGU_INT partI, CGU_INT index)
{
    return subset ? BC6_PARTITIONS[partI][index] : 0;
}

// Distributes the MAX_SUBSET_SIZE input entries of `in` into per-subset lists.
//
//   shape            partition number, used only when ShapeTableToUse == 2
//   in               input entries, `dimension` components each
//   subsets          receives the entries grouped by subset
//   count            receives the number of entries placed in each subset
//   ShapeTableToUse  0 or 1: everything goes to subset 0;
//                    2: subsets are taken from BC6_PARTITIONS[shape];
//                    anything else: count is zeroed and nothing is copied
//   dimension        number of components copied per entry; when it is smaller
//                    than MAX_DIMENSION_BIG the component right after the
//                    copied ones is set to 0
void Partition(CGU_INT      shape,
               CGU_FLOAT    in[][MAX_DIMENSION_BIG],
               CGU_FLOAT    subsets[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG], //[3][16][4]
               CGU_INT      count[MAX_SUBSETS],
               CGU_INT8     ShapeTableToUse,
               CGU_INT      dimension)
{
    int entry, comp;
    int useTable;
    int tableRow;

    for (entry = 0; entry < MAX_SUBSETS; entry++)
    {
        count[entry] = 0;
    }

    if (ShapeTableToUse == 0 || ShapeTableToUse == 1)
    {
        useTable = 0;
        tableRow = 0;
    }
    else if (ShapeTableToUse == 2)
    {
        useTable = 1;
        tableRow = shape;
    }
    else
    {
        // Unknown table selector: nothing to do (no error is reported to the caller).
        return;
    }

    for (entry = 0; entry < MAX_SUBSET_SIZE; entry++)
    {
        int region = get_partition_subset(useTable, tableRow, entry);
        int slot   = count[region];

        for (comp = 0; comp < dimension; comp++)
        {
            subsets[region][slot][comp] = in[entry][comp];
        }

        // comp now indexes the first component that was not copied.
        if (dimension < MAX_DIMENSION_BIG)
        {
            subsets[region][slot][comp] = 0.0;
        }

        count[region] = slot + 1;
    }
}

// Picks, for every subset, the entries of outB with the smallest and the largest
// sum of their first three components and stores them as the subset's end points:
//   EndPoints[subset][0] <- entry with the minimum sum
//   EndPoints[subset][1] <- entry with the maximum sum
//
//   max_subsets  number of subsets to process; values above MAX_SUBSETS make the
//                function return without writing anything (no error is reported)
//   entryCount   number of valid entries in each subset of outB
//
// The running minimum and maximum are seeded from the first entry rather than from
// fixed constants, so the search is also correct when every sum is at or above
// CMP_HALF_MAX or when every sum is zero or negative. When several entries share
// the extreme value the first one wins. An empty subset uses entry 0 for both
// end points.
void GetEndPoints(CGU_FLOAT EndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_FLOAT outB[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG], CGU_INT max_subsets, int entryCount[MAX_SUBSETS])
{
    // Should have some sort of error notification!
    if (max_subsets > MAX_SUBSETS) return;

    for (int subset = 0; subset < max_subsets; subset++)
    {
        CGU_FLOAT min = 0;
        CGU_FLOAT max = 0;
        int mini = 0;
        int maxi = 0;

        for (int i = 0; i < entryCount[subset]; i++)
        {
            CGU_FLOAT val = outB[subset][i][0] + outB[subset][i][1] + outB[subset][i][2];

            // i == 0 seeds both extremes from real data.
            if (i == 0 || val < min)
            {
                min = val;
                mini = i;
            }
            if (i == 0 || val > max)
            {
                max = val;
                maxi = i;
            }
        }

        for (int c = 0; c < MAX_DIMENSION_BIG; c++)
        {
            EndPoints[subset][0][c] = outB[subset][mini][c];
            EndPoints[subset][1][c] = outB[subset][maxi][c];
        }
    }
}

// Fills cov with the symmetric matrix of component products summed over all entries:
//   cov[a][b] = sum over k of data[k][a] * data[k][b]
// No mean is subtracted and no division by numEntries is performed here.
// Only the leading dimension x dimension part of cov is written.
void covariance_d(CGU_FLOAT data[][MAX_DIMENSION_BIG], CGU_INT numEntries, CGU_FLOAT cov[MAX_DIMENSION_BIG][MAX_DIMENSION_BIG], CGU_INT dimension)
{
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif
    CGU_INT row, col, entry;

    for (row = 0; row < dimension; row++)
    {
        for (col = 0; col <= row; col++)
        {
            cov[row][col] = 0;
            for (entry = 0; entry < numEntries; entry++)
            {
                cov[row][col] += data[entry][row] * data[entry][col];
            }
            // Mirror the lower-triangle value into the upper triangle.
            cov[col][row] = cov[row][col];
        }
    }
}

// Computes the per-component mean of the entries and subtracts it from data in place.
//   mean  receives the mean of each of the `dimension` components
// With numEntries == 0 the mean is set to zero and data is not touched.
void centerInPlace_d(CGU_FLOAT data[][MAX_DIMENSION_BIG], int numEntries, CGU_FLOAT mean[MAX_DIMENSION_BIG], CGU_INT dimension)
{
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif
    int comp, entry;

    for (comp = 0; comp < dimension; comp++)
    {
        mean[comp] = 0;
        for (entry = 0; entry < numEntries; entry++)
        {
            mean[comp] += data[entry][comp];
        }

        if (numEntries != 0)
        {
            mean[comp] /= numEntries;
            for (entry = 0; entry < numEntries; entry++)
            {
                data[entry][comp] -= mean[comp];
            }
        }
    }
}

// Estimates the eigenvector of cov that belongs to its largest eigenvalue by
// repeatedly squaring the matrix and rescaling it.
//
//   cov        input matrix; only the leading dimension x dimension part is read
//   vector     receives the result (row of the powered matrix with the largest
//              diagonal element, scaled to unit length)
//   dimension  number of rows/columns used
//
// NOTE(review): when the largest diagonal element is <= 0 the function returns
// before anything is written to vector, so the caller keeps whatever was in it.
// When the length of the selected row is <= 0 the row is copied but not scaled.
void eigenVector_d(CGU_FLOAT cov[MAX_DIMENSION_BIG][MAX_DIMENSION_BIG], CGU_FLOAT vector[MAX_DIMENSION_BIG], CGU_INT dimension)
{
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif
    // calculate an eigenvector corresponding to the biggest eigenvalue
    // will work for non-zero non-negative matrices only

#define EV_ITERATION_NUMBER 20
#define EV_SLACK            2        /* additive for exp base 2)*/


    CGU_INT i, j, k, l, m, n, p, q;
    // Two work matrices; l selects the current one, 1 - l receives the next square.
    CGU_FLOAT c[2][MAX_DIMENSION_BIG][MAX_DIMENSION_BIG];
    CGU_FLOAT maxDiag;

    for (i = 0; i < dimension; i++)
        for (j = 0; j < dimension; j++)
            c[0][i][j] = cov[i][j];

    // p = number of squarings done between two rescalings, derived from
    // FLT_MAX_EXP, EV_SLACK and dimension (presumably chosen so the squarings
    // stay inside the float exponent range — TODO confirm).
    p = (int)floorf(log((FLT_MAX_EXP - EV_SLACK) / ceilf(logf((CGU_FLOAT)dimension) / logf(2.0f))) / logf(2.0f));

    //assert(p>0);

    p = p > 0 ? p : 1;

    // q = number of rescale rounds, i.e. EV_ITERATION_NUMBER / p rounded up.
    q = (EV_ITERATION_NUMBER + p - 1) / p;

    l = 0;

    for (n = 0; n < q; n++)
    {
        maxDiag = 0;

        for (i = 0; i < dimension; i++)
            maxDiag = c[l][i][i] > maxDiag ? c[l][i][i] : maxDiag;

        if (maxDiag <= 0)
        {
            // Nothing usable on the diagonal: give up without writing vector.
            return;
        }

        //assert(maxDiag >0);

        // Rescale so the largest diagonal element becomes 1.
        for (i = 0; i < dimension; i++)
            for (j = 0; j < dimension; j++)
                c[l][i][j] /= maxDiag;

        // Square the current matrix p times, alternating between the two buffers.
        for (m = 0; m < p; m++) {
            for (i = 0; i < dimension; i++)
                for (j = 0; j < dimension; j++) {
                    CGU_FLOAT temp = 0;
                    for (k = 0; k < dimension; k++)
                    {
                        // Notes:
                        // This is the most time-consuming portion of the code and needs optimizing for performance
                        temp += c[l][i][k] * c[l][k][j];
                    }
                    c[1 - l][i][j] = temp;
                }
            l = 1 - l;
        }
    }

    // Select the row k whose diagonal element is the largest.
    maxDiag = 0;
    k = 0;

    for (i = 0; i < dimension; i++)
    {
        k = c[l][i][i] > maxDiag ? i : k;
        maxDiag = c[l][i][i] > maxDiag ? c[l][i][i] : maxDiag;
    }
    CGU_FLOAT t;
    t = 0;
    for (i = 0; i < dimension; i++)
    {
        t += c[l][k][i] * c[l][k][i];
        vector[i] = c[l][k][i];
    }
    // normalization is really optional
    t = sqrtf(t);
    //assert(t>0);

    if (t <= 0)
    {
        return;
    }
    for (i = 0; i < dimension; i++)
        vector[i] /= t;
}

// Writes into projection[k] the dot product of entry k of data with `vector`,
// using the first `dimension` components. The vector is expected to be
// normalized by the caller; no normalization happens here.
void project_d(CGU_FLOAT data[][MAX_DIMENSION_BIG], CGU_INT numEntries, CGU_FLOAT vector[MAX_DIMENSION_BIG], CGU_FLOAT projection[MAX_ENTRIES], CGU_INT dimension)
{
#ifdef USE_DBGTRACE
    DbgTrace(());
#endif
    int entry;

    for (entry = 0; entry < numEntries; entry++)
    {
        int comp = 0;

        projection[entry] = 0;
        while (comp < dimension)
        {
            projection[entry] += data[entry][comp] * vector[comp];
            comp++;
        }
    }
}

// Value/index pair used when sorting: `d` is the key that is compared and `i`
// records the element's position before sorting (see sortProjection and
// quant_AnD_Shell).
typedef struct {
    CGU_FLOAT d;
    int i;
} a;

// qsort-style comparator for `a` records, ordering them by ascending `d`.
// Returns 1, -1 or 0 according to the sign of (arg1->d - arg2->d).
inline CGU_INT a_compare(const void *arg1, const void *arg2)
{
    const a *lhs = (const a*)arg1;
    const a *rhs = (const a*)arg2;

    if (lhs->d - rhs->d > 0)
        return 1;
    return (lhs->d - rhs->d < 0) ? -1 : 0;
}

// Produces in order[] the entry numbers 0..numEntries-1 arranged by ascending
// projection value. projection[] itself is not modified.
void sortProjection(CGU_FLOAT projection[MAX_ENTRIES], CGU_INT order[MAX_ENTRIES], CGU_INT numEntries)
{
    int pos;
    a what[MAX_ENTRIES + MAX_PARTITIONS_TABLE];

    // Pair every projection value with its original position.
    for (pos = 0; pos < numEntries; pos++)
    {
        what[pos].i = pos;
        what[pos].d = projection[pos];
    }

#ifdef USE_QSORT
    qsort((void*)&what, numEntries, sizeof(a), a_compare);
#else
    {
        // Insertion-style sort: sink each new element towards the front by
        // exchanging neighbouring records that are out of order.
        int scan;
        for (pos = 1; pos < numEntries; pos++)
        {
            for (scan = pos; scan > 0; scan--)
            {
                if (what[scan - 1].d > what[scan].d)
                {
                    a held = what[scan];
                    what[scan] = what[scan - 1];
                    what[scan - 1] = held;
                }
            }
        }
    }
#endif

    for (pos = 0; pos < numEntries; pos++)
    {
        order[pos] = what[pos].i;
    }
}

// Returns the sum of squared component differences between data and data2 over
// the first numEntries entries and the first `dimension` components.
CGU_FLOAT totalError_d(CGU_FLOAT data[MAX_ENTRIES][MAX_DIMENSION_BIG], CGU_FLOAT data2[MAX_ENTRIES][MAX_DIMENSION_BIG], CGU_INT numEntries, CGU_INT dimension)
{
    CGU_FLOAT total = 0;
    int entry, comp;

    for (entry = 0; entry < numEntries; entry++)
    {
        for (comp = 0; comp < dimension; comp++)
        {
            total += (data[entry][comp] - data2[entry][comp])*(data[entry][comp] - data2[entry][comp]);
        }
    }

    return total;
}

// input:
//
// v_  points, might be uncentered
// k - number of points in the ramp
// n - number of points in v_
//
// output:
// index, uncentered, in the range 0..k-1
//

// Assigns each of the n scalar values in v_ an index on a k-point ramp that
// spans the range [min(v_), max(v_)].
//
// Steps, as performed below:
//   1. find the minimum m and maximum M of v_; if they are equal every index is 0;
//   2. scale by s = (k - 1) / (M - m) and round each value to the nearest index;
//   3. if the spread of the rounding residuals is large enough, sort the
//      residuals, locate the position where their running sum is smallest and
//      bump the index of every entry from that position to the end of the
//      sorted list;
//   4. subtract the smallest index so the result starts at 0.
//
// NOTE(review): v_[0] and idx[0] are read unconditionally and step 3 computes
// j % n, so the function expects n >= 1 — confirm callers guarantee this.
void quant_AnD_Shell(CGU_FLOAT* v_, CGU_INT k, CGU_INT n, CGU_INT idx[MAX_ENTRIES])
{
#define MAX_BLOCK MAX_ENTRIES
    CGU_INT i, j;
    CGU_FLOAT v[MAX_BLOCK];     // scaled input values
    CGU_FLOAT z[MAX_BLOCK];     // scaled values shifted by +0.5 - m*s, before truncation
    a d[MAX_BLOCK];             // rounding residual per entry, paired with the entry number
    CGU_FLOAT l;
    CGU_FLOAT mm;
    CGU_FLOAT r = 0;            // sum of squared residuals
    CGU_INT mi;

    CGU_FLOAT m, M, s, dm = 0.; // dm: sum (later mean) of the residuals
    m = M = v_[0];

    for (i = 1; i < n; i++) {
        m = m < v_[i] ? m : v_[i];
        M = M > v_[i] ? M : v_[i];
    }
    if (M == m) {
        // All values identical: everything maps to index 0.
        for (i = 0; i < n; i++)
            idx[i] = 0;
        return;
    }

    //assert(M - m >0);
    s = (k - 1) / (M - m);
    for (i = 0; i < n; i++) {
        v[i] = v_[i] * s;

        idx[i] = (int)(z[i] = (v[i] + 0.5f /* stabilizer*/ - m * s));  //floorf(v[i] + 0.5f /* stabilizer*/ - m *s));

        d[i].d = v[i] - z[i] - m * s;
        d[i].i = i;
        dm += d[i].d;
        r += d[i].d*d[i].d;
    }
    // n*r - dm*dm is n^2 times the variance of the residuals; only refine when it
    // reaches the threshold (n - 1) / 8.
    if (n*r - dm * dm >= (CGU_FLOAT)(n - 1) / 4 /*slack*/ / 2) {

        dm /= (CGU_FLOAT)n;

        for (i = 0; i < n; i++)
            d[i].d -= dm;


        //!!! Need an OpenCL version of qsort
#ifdef USE_QSORT
        qsort((void*)&d, n, sizeof(a), a_compare);
#else
        {
            // Sort the residual records by ascending d.
            CGU_INT tmp;
            CGU_FLOAT tmp_d;
            for (i = 1; i < n; i++) {
                for (j = i; j > 0; j--)
                {
                    if (d[j - 1].d > d[j].d)
                    {
                        tmp = d[j].i;
                        tmp_d = d[j].d;
                        d[j].i = d[j - 1].i;
                        d[j].d = d[j - 1].d;
                        d[j - 1].i = tmp;
                        d[j - 1].d = tmp_d;
                    }
                }
            }
        }
#endif
        // got into fundamental simplex
        // move coordinate system origin to its center
        for (i = 0; i < n; i++)
            d[i].d -= (2.0f*(CGU_FLOAT)i + 1.0f - (CGU_FLOAT)n) / 2.0f / (CGU_FLOAT)n;

        // Find the position j at which the running sum of residuals is smallest
        // (j stays -1 when the running sum never drops below 0).
        mm = l = 0.;
        j = -1;
        for (i = 0; i < n; i++) {
            l += d[i].d;
            if (l < mm) {
                mm = l;
                j = i;
            }
        }

        // position which should be in 0
        j = j + 1;
        j = j % n;

        // Raise the index of every entry from sorted position j onwards.
        for (i = j; i < n; i++)
            idx[d[i].i]++;
    }
    // get rid of an offset in idx
    mi = idx[0];
    for (i = 1; i < n; i++)
        mi = mi < idx[i] ? mi : idx[i];

    for (i = 0; i < n; i++)
        idx[i] -= mi;
}

// Quantizes the entries of `data` onto an evenly spaced ramp of `numClusters`
// points along a single direction and returns the resulting squared error.
//
//   data         input entries, `dimension` components each
//   numEntries   number of entries; 0 returns 0.0f without writing any output
//   numClusters  number of points on the ramp
//   index        receives the ramp index chosen for every entry
//   out          receives the reconstructed entry for every input entry
//   direction    receives the ramp direction (unit length unless it is zero)
//   step         receives the distance between neighbouring ramp points
//   dimension    number of components used
//   quality      scales the number of refinement tries (MAX_TRY * quality)
//
// Outline: center the data, take the dominant eigenvector of the product matrix
// as the first direction, quantize the projections with quant_AnD_Shell, then
// alternate between re-fitting the direction to the current indices and
// re-assigning indices until the assignment stops changing or the try budget
// is used up.
//
// Returns totalError_d(data, out, ...).
CGU_FLOAT optQuantAnD_d(CGU_FLOAT data[MAX_ENTRIES][MAX_DIMENSION_BIG],
                        CGU_INT numEntries,
                        CGU_INT numClusters,
                        CGU_INT index[MAX_ENTRIES],
                        CGU_FLOAT out[MAX_ENTRIES][MAX_DIMENSION_BIG],
                        CGU_FLOAT direction[MAX_DIMENSION_BIG], CGU_FLOAT *step,
                        CGU_INT dimension,
                        CGU_FLOAT quality)
{
    // Refined indices of the previous try, used for the convergence test.
    CGU_INT index_[MAX_ENTRIES];

    CGU_INT maxTry = (int)(MAX_TRY * quality);
    // Budget of inner refinement passes, shared by all tries.
    CGU_INT try_two = 50;

    CGU_INT i, j, k;
    CGU_FLOAT t, s, q;

    CGU_FLOAT centered[MAX_ENTRIES][MAX_DIMENSION_BIG];
    CGU_FLOAT mean[MAX_DIMENSION_BIG];
    CGU_FLOAT cov[MAX_DIMENSION_BIG][MAX_DIMENSION_BIG];
    CGU_FLOAT projected[MAX_ENTRIES];
    CGU_INT order_[MAX_ENTRIES];

    // Nothing to quantize; also avoids the divisions by numEntries below.
    if (numEntries == 0)
        return 0.0f;

    for (i = 0; i < numEntries; i++)
        for (j = 0; j < dimension; j++)
            centered[i][j] = data[i][j];

    centerInPlace_d(centered, numEntries, mean, dimension);
    covariance_d(centered, numEntries, cov, dimension);

    eigenVector_d(cov, direction, dimension);
    project_d(centered, numEntries, direction, projected, dimension);

    for (i = 0; i < maxTry; i++)
    {
        CGU_INT done = 0;

        // The first try only runs quant_AnD_Shell; later tries refine first.
        if (i)
        {
            do
            {
                q = s = t = 0;

                for (k = 0; k < numEntries; k++)
                {
                    s += index[k];
                    t += index[k] * index[k];
                }

                // Re-fit the direction to the current index assignment.
                for (j = 0; j < dimension; j++)
                {
                    direction[j] = 0;
                    for (k = 0; k < numEntries; k++)
                        direction[j] += centered[k][j] * index[k];
                    q += direction[j] * direction[j];
                }

                s /= (CGU_FLOAT)numEntries;
                t = t - s * s * (CGU_FLOAT)numEntries;
                //assert(t != 0);
                t = (t == 0.0f ? 0.0f : 1.0f / t);
                // We need to requantize

                q = sqrtf(q);
                t *= q;

                if (q != 0)
                    for (j = 0; j < dimension; j++)
                        direction[j] /= q;

                // direction normalized

                project_d(centered, numEntries, direction, projected, dimension);
                sortProjection(projected, order_, numEntries);

                CGU_INT index__[MAX_ENTRIES];

                // it's projected and centered; cluster centers are (index[i]-s)*t (*dir)
                k = 0;
                for (j = 0; j < numEntries; j++)
                {
                    while (projected[order_[j]] > (k + 0.5 - s)*t  && k < numClusters - 1)
                        k++;
                    index__[order_[j]] = k;
                }
                done = 1;
                for (j = 0; j < numEntries; j++)
                {
                    done = (done && (index__[j] == index[j]));
                    index[j] = index__[j];
                }
                // "> 0" keeps the budget effective after it has run out; a bare
                // "try_two--" becomes true again once the counter is negative.
            } while (!done && try_two-- > 0);

            if (i == 1)
                for (j = 0; j < numEntries; j++)
                    index_[j] = index[j];
            else
            {
                // Converged when this try's refined indices equal the previous
                // try's; otherwise remember the new ones for the next comparison.
                done = 1;
                for (j = 0; j < numEntries; j++)
                {
                    done = (done && (index_[j] == index[j]));
                    index_[j] = index[j];
                }
                if (done)
                    break;

            }
        }

        quant_AnD_Shell(projected, numClusters, numEntries, index);
    }

    // Final fit of direction and step to the chosen indices.
    s = t = 0;
    q = 0;

    for (k = 0; k < numEntries; k++)
    {
        s += index[k];
        t += index[k] * index[k];
    }

    for (j = 0; j < dimension; j++)
    {
        direction[j] = 0;
        for (k = 0; k < numEntries; k++)
            direction[j] += centered[k][j] * index[k];
        q += direction[j] * direction[j];
    }

    s /= (CGU_FLOAT)numEntries;

    t = t - s * s * (CGU_FLOAT)numEntries;

    //assert(t != 0);

    t = (t == 0.0 ? 0.0f : 1.0f / t);

    for (i = 0; i < numEntries; i++)
        for (j = 0; j < dimension; j++)
            out[i][j] = mean[j] + direction[j] * t*(index[i] - s);

    // normalize direction for output

    q = sqrtf(q);
    *step = t * q;
    // A zero-length direction (e.g. all entries identical) is left as zeros
    // instead of being divided by zero.
    if (q != 0)
        for (j = 0; j < dimension; j++)
            direction[j] /= q;

    return totalError_d(data, out, numEntries, dimension);
}

// Clamps the first three components of both end points of the first two
// regions to the representable range:
//   signed   : [-FLT16_MAX, FLT16_MAX]
//   unsigned : [0, FLT16_MAX]
void clampF16Max(CGU_FLOAT EndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_BOOL isSigned)
{
    CGU_INT region, ab, rgb;

    for (region = 0; region < 2; region++)
    {
        for (ab = 0; ab < 2; ab++)
        {
            for (rgb = 0; rgb < 3; rgb++)
            {
                if (EndPoints[region][ab][rgb] > FLT16_MAX)
                {
                    EndPoints[region][ab][rgb] = FLT16_MAX;
                }
                else if (isSigned)
                {
                    if (EndPoints[region][ab][rgb] < -FLT16_MAX)
                        EndPoints[region][ab][rgb] = -FLT16_MAX;
                }
                else if (EndPoints[region][ab][rgb] < 0.0)
                {
                    EndPoints[region][ab][rgb] = 0.0;
                }
            }
        }
    }
}

//=====================================================================================================================
// Ramp table sizing constants and the helpers that turn a bit count / cluster
// log into a zero-based offset. LOG_CL_BASE is also added back when indexing
// rampLerpWeightsBC6 in rampf().
#define LOG_CL_BASE         2
#define BIT_BASE            5
#define LOG_CL_RANGE        5
#define BIT_RANGE           9
#define MAX_CLUSTERS_BIG    16
// Arguments are parenthesized so that expressions such as BTT(a | b) are
// evaluated before the subtraction.
#define BTT(bits)           ((bits) - BIT_BASE)
#define CLT(cl)             ((cl) - LOG_CL_BASE)

#ifdef USE_BC6RAMPS

// Placeholder for the single-point index lookup (the BC7 sp_idx table is meant
// to be used instead). Ignores every argument and always yields 0.
int spidx(int in_data, int in_clogs, int in_bits, int in_p2, int in_o1, int in_o2, int in_i)
{
    (void)in_data;
    (void)in_clogs;
    (void)in_bits;
    (void)in_p2;
    (void)in_o1;
    (void)in_o2;
    (void)in_i;

    // use BC7 sp_idx
    return 0;
}

// Placeholder for the single-point error lookup (the BC7 sp_err table is meant
// to be used instead). Ignores every argument and always yields 0.0f.
float sperr(int in_data, int clogs, int bits, int p2, int o1, int o2)
{
    (void)in_data;
    (void)clogs;
    (void)bits;
    (void)p2;
    (void)o1;
    (void)o2;

    // use BC7 sp_err
    // Was "0,0f": a comma instead of a decimal point, which is not a valid literal.
    return 0.0f;
}
#endif

// Interpolation weights between the two end points of a ramp.
// Row = number of index bits (0..4), column = index value; a weight of 0.0
// selects the first end point and 1.0 the second (see rampf). Columns beyond
// the 2^bits listed values are implicitly zero-initialized.
__constant CGU_FLOAT rampLerpWeightsBC6[5][16] =
{
    { 0.0 }, // 0 bit index
    { 0.0, 1.0 }, // 1 bit index
    { 0.0, 21.0 / 64.0, 43.0 / 64.0, 1.0 }, // 2 bit index
    { 0.0, 9.0 / 64.0, 18.0 / 64.0, 27.0 / 64.0, 37.0 / 64.0, 46.0 / 64.0, 55.0 / 64.0, 1.0 }, // 3 bit index
    { 0.0, 4.0 / 64.0, 9.0 / 64.0, 13.0 / 64.0, 17.0 / 64.0, 21.0 / 64.0, 26.0 / 64.0, 30.0 / 64.0,
    34.0 / 64.0, 38.0 / 64.0, 43.0 / 64.0, 47.0 / 64.0, 51.0 / 64.0, 55.0 / 64.0, 60.0 / 64.0, 1.0 } // 4 bit index
};


// Returns the ramp value at position indexPos between end points p1 and p2,
// using the weight row selected by (clogs + LOG_CL_BASE).
CGU_FLOAT rampf(CGU_INT clogs, CGU_FLOAT p1, CGU_FLOAT p2, CGU_INT indexPos)
{
    // (clogs + LOG_CL_BASE) ranges from 2 to 4
    CGU_INT weightRow = clogs + LOG_CL_BASE;

    return (CGU_FLOAT)p1 + rampLerpWeightsBC6[weightRow][indexPos] * (p2 - p1);
}

// Returns 1 when every one of the n entries equals entry 0 in all `dimension`
// components, otherwise 0. With n <= 1 the answer is always 1.
CGU_INT all_same_d(CGU_FLOAT d[][MAX_DIMENSION_BIG], CGU_INT n, CGU_INT dimension)
{
    CGU_INT entry, comp;

    for (entry = 1; entry < n; entry++)
    {
        for (comp = 0; comp < dimension; comp++)
        {
            if (!(d[0][comp] == d[entry][comp]))
                return 0;
        }
    }

    return 1;
}

// return the max index from a set of indexes
// Returns the largest of the first n values in a[].
// a[0] is read even when n is 0, so the array must hold at least one element.
CGU_INT max_index(CGU_INT a[], CGU_INT n)
{
    CGU_INT largest = a[0];
    CGU_INT pos;

    for (pos = 1; pos < n; pos++)
    {
        if (!(largest > a[pos]))
            largest = a[pos];
    }

    return largest;
}

// Averages the entries of d per cluster.
//
//   d       input entries
//   mean    mean[c] receives the average of all entries whose index is c;
//           rows of clusters that do not occur are left untouched
//   index   cluster number of every entry
//   i_comp  receives the distinct cluster numbers in order of first appearance
//   i_cnt   i_cnt[c] receives the number of entries in cluster c
//   n       number of entries
//
// Returns the number of distinct clusters written to i_comp.
CGU_INT cluster_mean_d_d(CGU_FLOAT d[MAX_ENTRIES][MAX_DIMENSION_BIG], CGU_FLOAT mean[MAX_ENTRIES][MAX_DIMENSION_BIG], CGU_INT index[], CGU_INT i_comp[], CGU_INT i_cnt[], CGU_INT n, CGU_INT dimension)
{
    CGU_INT entry, comp;
    CGU_INT used = 0;
    //assert(n!=0);

    // Reset the accumulators of every cluster that is referenced.
    for (entry = 0; entry < n; entry++)
    {
        for (comp = 0; comp < dimension; comp++)
        {
            // assert(index[entry]<MAX_CLUSTERS_BIG);
            mean[index[entry]][comp] = 0;
            i_cnt[index[entry]] = 0;
        }
    }

    // Accumulate sums and counts, noting each cluster the first time it is seen.
    for (entry = 0; entry < n; entry++)
    {
        for (comp = 0; comp < dimension; comp++)
        {
            mean[index[entry]][comp] += d[entry][comp];
        }
        if (i_cnt[index[entry]] == 0)
        {
            i_comp[used] = index[entry];
            used++;
        }
        i_cnt[index[entry]]++;
    }

    // Turn the sums into means.
    for (entry = 0; entry < used; entry++)
    {
        for (comp = 0; comp < dimension; comp++)
        {
            mean[i_comp[entry]][comp] /= (CGU_FLOAT)i_cnt[i_comp[entry]];
        }
    }

    return used;
}

// Stores in mean[] the per-component average of the first n entries of d.
// The sum is divided by n without a zero check.
void mean_d_d(CGU_FLOAT d[][MAX_DIMENSION_BIG], CGU_FLOAT mean[MAX_DIMENSION_BIG], CGU_INT n, CGU_INT dimension)
{
    CGU_INT entry, comp;

    for (comp = 0; comp < dimension; comp++)
    {
        mean[comp] = 0;
        for (entry = 0; entry < n; entry++)
        {
            mean[comp] += d[entry][comp];
        }
        mean[comp] /= (CGU_FLOAT)n;
    }
}

// Normalizes a set of indices in place: subtracts the smallest index and then
// divides by the largest integer that divides every shifted index.
// Does nothing when numEntries is 0.
void index_collapse_kernel(CGU_INT index[], CGU_INT numEntries)
{
    CGU_INT pos;
    CGU_INT lowest, highest;
    CGU_INT candidate;
    CGU_INT divisor = 1;

    if (numEntries == 0)
        return;

    lowest = highest = index[0];
    for (pos = 1; pos < numEntries; pos++)
    {
        if (!(lowest < index[pos]))
            lowest = index[pos];
        if (!(highest > index[pos]))
            highest = index[pos];
    }

    // Keep the largest candidate that divides every shifted index.
    for (candidate = 2; candidate <= highest - lowest; candidate++)
    {
        pos = 0;
        while (pos < numEntries && (index[pos] - lowest) % candidate == 0)
            pos++;

        if (pos >= numEntries)
            divisor = candidate;
    }

    for (pos = 0; pos < numEntries; pos++)
    {
        index[pos] = (index[pos] - lowest) / divisor;
    }
}

// Returns the largest of the first n values in a[].
// a[0] is read even when n is 0, so the array must hold at least one element.
CGU_INT max_int(CGU_INT a[], CGU_INT n)
{
    CGU_INT best = a[0];
    CGU_INT pos = 0;

    while (pos < n)
    {
        best = (a[pos] < best) ? best : a[pos];
        pos++;
    }

    return best;
}

// Number of parity vectors to try, indexed as npv_nd[dimension - 3][type].
// quant_single_point_d uses the value as the upper bound of its loop over
// par_vectors_nd[dimension - 3][type][...]. A value of 0 means no vectors are
// tried for that type.
__constant CGU_INT npv_nd[2][2 * MAX_DIMENSION_BIG] =
{
    { 1,2,4,8,16,32,0,0 }, //dimension = 3
    { 1,2,4,0,0,0,0,0 }    //dimension = 4
};

// Parity vector tables, indexed as
//   par_vectors_nd[dimension - 3][type][vector number][end point 0/1][component].
// Each entry is 0 or 1. Entries not listed in an initializer are implicitly zero.
// The number of vectors used per [dimension - 3][type] comes from npv_nd; that
// table holds 0 for dimension 4 with type 3 and above, so the "3 PBIT" and
// "4 PBIT" tables below are not iterated by a loop bounded by npv_nd.
__constant short par_vectors_nd[2][8][128][2][MAX_DIMENSION_BIG] =
{
    { // Dimension = 3
        {
            { { 0,0,0,0 },{ 0,0,0,0 } },
            { { 0,0,0,0 },{ 0,0,0,0 } }
        },

    // 3*n+1    BCC          3*n+1        Cartesian 3*n            //same parity
        { // SAME_PAR
            { { 0,0,0 },{ 0,0,0 } },
            { { 1,1,1 },{ 1,1,1 } }
        },
    // 3*n+2    BCC          3*n+1        BCC          3*n+1
        { // BCC
            { { 0,0,0 },{ 0,0,0 } },
            { { 0,0,0 },{ 1,1,1 } },
            { { 1,1,1 },{ 0,0,0 } },
            { { 1,1,1 },{ 1,1,1 } }
        },
    // 3*n+3    FCC                    ???                        // ??????
    // BCC with FCC same or inverted, symmetric
        { // BCC_SAME_FCC
            { { 0,0,0 },{ 0,0,0 } },
            { { 1,1,0 },{ 1,1,0 } },
            { { 1,0,1 },{ 1,0,1 } },
            { { 0,1,1 },{ 0,1,1 } },

            { { 0,0,0 },{ 1,1,1 } },
            { { 1,1,1 },{ 0,0,0 } },
            { { 0,1,0 },{ 0,1,0 } },  // ??
            { { 1,1,1 },{ 1,1,1 } },

        },
        // 3*n+4    FCC          3*n+2        FCC          3*n+2
        {

            { { 0,0,0 },{ 0,0,0 } },
            { { 1,1,0 },{ 0,0,0 } },
            { { 1,0,1 },{ 0,0,0 } },
            { { 0,1,1 },{ 0,0,0 } },

            { { 0,0,0 },{ 1,1,0 } },
            { { 1,1,0 },{ 1,1,0 } },
            { { 1,0,1 },{ 1,1,0 } },
            { { 0,1,1 },{ 1,1,0 } },

            { { 0,0,0 },{ 1,0,1 } },
            { { 1,1,0 },{ 1,0,1 } },
            { { 1,0,1 },{ 1,0,1 } },
            { { 0,1,1 },{ 1,0,1 } },

            { { 0,0,0 },{ 0,1,1 } },
            { { 1,1,0 },{ 0,1,1 } },
            { { 1,0,1 },{ 0,1,1 } },
            { { 0,1,1 },{ 0,1,1 } }
        },


    // 3*n+5    Cartesian 3*n+3        FCC          3*n+2            //D^*[6]
        {

            { { 0,0,0 },{ 0,0,0 } },
            { { 1,1,0 },{ 0,0,0 } },
            { { 1,0,1 },{ 0,0,0 } },
            { { 0,1,1 },{ 0,0,0 } },

            { { 0,0,0 },{ 1,1,0 } },
            { { 1,1,0 },{ 1,1,0 } },
            { { 1,0,1 },{ 1,1,0 } },
            { { 0,1,1 },{ 1,1,0 } },

            { { 0,0,0 },{ 1,0,1 } },
            { { 1,1,0 },{ 1,0,1 } },
            { { 1,0,1 },{ 1,0,1 } },
            { { 0,1,1 },{ 1,0,1 } },

            { { 0,0,0 },{ 0,1,1 } },
            { { 1,1,0 },{ 0,1,1 } },
            { { 1,0,1 },{ 0,1,1 } },
            { { 0,1,1 },{ 0,1,1 } },


            { { 1,0,0 },{ 1,1,1 } },
            { { 0,1,0 },{ 1,1,1 } },
            { { 0,0,1 },{ 1,1,1 } },
            { { 1,1,1 },{ 1,1,1 } },

            { { 1,0,0 },{ 0,0,1 } },
            { { 0,1,0 },{ 0,0,1 } },
            { { 0,0,1 },{ 0,0,1 } },
            { { 1,1,1 },{ 0,0,1 } },

            { { 1,0,0 },{ 1,0,0 } },
            { { 0,1,0 },{ 1,0,0 } },
            { { 0,0,1 },{ 1,0,0 } },
            { { 1,1,1 },{ 1,0,0 } },

            { { 1,0,0 },{ 0,1,0 } },
            { { 0,1,0 },{ 0,1,0 } },
            { { 0,0,1 },{ 0,1,0 } },
            { { 1,1,1 },{ 0,1,0 } }
        }
    },// Dimension = 3
    { // Dimension = 4
        {
            { { 0,0,0,0 },{ 0,0,0,0 } },
            { { 0,0,0,0 },{ 0,0,0,0 } }
        },

    // 3*n+1    BCC          3*n+1        Cartesian 3*n            //same parity
        { // SAME_PAR
            { { 0,0,0,0 },{ 0,0,0,0 } },
            { { 1,1,1,1 },{ 1,1,1,1 } }
        },
    // 3*n+2    BCC          3*n+1        BCC          3*n+1
        { // BCC
            { { 0,0,0,0 },{ 0,0,0,0 } },
            { { 0,0,0,0 },{ 1,1,1,1 } },
            { { 1,1,1,1 },{ 0,0,0,0 } },
            { { 1,1,1,1 },{ 1,1,1,1 } }
        },
    // 3 PBIT
        {
            { { 0,0,0,0 },{ 0,0,0,0 } },
            { { 0,0,0,0 },{ 0,1,1,1 } },
            { { 0,1,1,1 },{ 0,0,0,0 } },
            { { 0,1,1,1 },{ 0,1,1,1 } },

            { { 1,0,0,0 },{ 1,0,0,0 } },
            { { 1,0,0,0 },{ 1,1,1,1 } },
            { { 1,1,1,1 },{ 1,0,0,0 } },
            { { 1,1,1,1 },{ 1,1,1,1 } }
        },

    // 4 PBIT
        {
            { { 0,0,0,0 },{ 0,0,0,0 } },
            { { 0,0,0,0 },{ 0,1,1,1 } },
            { { 0,1,1,1 },{ 0,0,0,0 } },
            { { 0,1,1,1 },{ 0,1,1,1 } },

            { { 1,0,0,0 },{ 1,0,0,0 } },
            { { 1,0,0,0 },{ 1,1,1,1 } },
            { { 1,1,1,1 },{ 1,0,0,0 } },
            { { 1,1,1,1 },{ 1,1,1,1 } },

            { { 0,0,0,0 },{ 0,0,0,0 } },
            { { 0,0,0,0 },{ 0,0,1,1 } },
            { { 0,0,1,1 },{ 0,0,0,0 } },
            { { 0,1,0,1 },{ 0,1,0,1 } },

            { { 1,0,0,0 },{ 1,0,0,0 } },
            { { 1,0,0,0 },{ 1,0,1,1 } },
            { { 1,0,1,1 },{ 1,0,0,0 } },
            { { 1,1,0,1 },{ 1,1,0,1 } },

        },

    } // Dimension = 4

};

// Accessor for par_vectors_nd: returns the entry at
// [dim1][dim2][dim3][dim4][dim5] widened to CGU_INT. No bounds checking is done.
CGU_INT get_par_vector(CGU_INT dim1, CGU_INT dim2, CGU_INT dim3, CGU_INT dim4, CGU_INT dim5)
{
    CGU_INT parity = par_vectors_nd[dim1][dim2][dim3][dim4][dim5];

    return parity;
}

CGU_FLOAT quant_single_point_d(CGU_FLOAT data[MAX_ENTRIES][MAX_DIMENSION_BIG],
                               CGU_INT numEntries, CGU_INT index[MAX_ENTRIES],
                               CGU_FLOAT out[MAX_ENTRIES][MAX_DIMENSION_BIG],
                               CGU_INT epo_1[2][MAX_DIMENSION_BIG],
                               CGU_INT Mi_,                // last cluster
                               CGU_INT type,
                               CGU_INT dimension)
{
    if (dimension < 3) return CMP_FLOAT_MAX;

    CGU_INT i, j;

    CGU_FLOAT err_0 = CMP_FLOAT_MAX;
    CGU_FLOAT err_1 = CMP_FLOAT_MAX;

    CGU_INT idx = 0;
    CGU_INT idx_1 = 0;

    CGU_INT epo_0[2][MAX_DIMENSION_BIG];

    CGU_INT use_par = (type != 0);

    CGU_INT clogs = 0;
    i = Mi_ + 1;
    while (i >>= 1)
        clogs++;

    //    assert((1<<clogs)== Mi_+1);

    CGU_INT pn;
    for (pn = 0; pn < npv_nd[dimension - 3][type]; pn++)
    { //1

        CGU_INT dim1 = dimension - 3;
        CGU_INT dim2 = type;
        CGU_INT dim3 = pn;


        CGU_INT o1[2][MAX_DIMENSION_BIG]; // = { 0,2 };
        CGU_INT o2[2][MAX_DIMENSION_BIG]; // = { 0,2 };

        for (j = 0; j < dimension; j++)
        { //A
            o2[0][j] = o1[0][j] = 0;
            o2[1][j] = o1[1][j] = 2;

            if (use_par)
            {
                if (get_par_vector(dim1, dim2, dim3, 0, j))
                    o1[0][j] = 1;
                else
                    o1[1][j] = 1;
                if (get_par_vector(dim1, dim2, dim3, 1, j))
                    o2[0][j] = 1;
                else
                    o2[1][j] = 1;
            }
        } //A

        CGU_INT t1, t2;

        CGU_INT dr[MAX_DIMENSION_BIG];
        CGU_INT dr_0[MAX_DIMENSION_BIG];
        //CGU_FLOAT tr;

        for (i = 0; i < (1 << clogs); i++)
        { //E
            CGU_FLOAT t = 0;
            CGU_INT t1o[MAX_DIMENSION_BIG], t2o[MAX_DIMENSION_BIG];

            for (j = 0; j < dimension; j++)
            { // D
                CGU_FLOAT t_ = CMP_FLOAT_MAX;

                for (t1 = o1[0][j]; t1 < o1[1][j]; t1++)
                { // C
                    for (t2 = o2[0][j]; t2 < o2[1][j]; t2++)
                        // This is needed for non-integer mean points of "collapsed" sets
                    { // B

#ifdef USE_BC6RAMPS
                        CGU_INT tf = (int)floorf(data[0][j]);
                        CGU_INT tc = (int)ceilf(data[0][j]);
                        // if they are not equal, the same representalbe point is used for
                        // both of them, as all representable points are integers in the rage
                        if (sperr(tf, CLT(clogs), BTT(bits[j]), t1, t2, i) > sperr(tc, CLT(clogs), BTT(bits[j]), t1, t2, i))
                            dr[j] = tc;
                        else if (sperr(tf, CLT(clogs), BTT(bits[j]), t1, t2, i) < sperr(tc, CLT(clogs), BTT(bits[j]), t1, t2, i))
                            dr[j] = tf;
                        else
#endif
                            dr[j] = (int)floorf(data[0][j] + 0.5f);

#ifdef USE_BC6RAMPS
                        tr = sperr(dr[j], CLT(clogs), BTT(bits[j]), t1, t2, i) + 2.0f * sqrtf(sperr(dr[j], CLT(clogs), BTT(bits[j]), t1, t2, i)) * fabsf((float)dr[j] - data[0][j]) +
                            (dr[j] - data[0][j])* (dr[j] - data[0][j]);
                        if (tr < t_)
                        {
                            t_ = tr;
#else
                        t_ = 0;
#endif

                        t1o[j] = t1;
                        t2o[j] = t2;
                        dr_0[j] = dr[j];
#ifdef USE_BC6RAMPS
                        if ((dr_0[j] < 0) || (dr_0[j] > 255))
                        {
                            dr_0[j] = 0; // Error!
                        }
                        }
#endif
                    } // B
                } //C

            t += t_;
            } // D


        if (t < err_0)
        {

            idx = i;

            for (j = 0; j < dimension; j++)
            {
#ifdef USE_BC6RAMPS
                CGU_INT p1 = CLT(clogs);        // < 3
                CGU_INT p2 = BTT(bits[j]);     // < 4
                CGU_INT in_data = dr_0[j];          // < SP_ERRIDX_MAX
                CGU_INT p4 = t1o[j];           // < 2
                CGU_INT p5 = t2o[j];           // < 2
                CGU_INT p6 = i;                // < 16

                                           // New spidx
                epo_0[0][j] = spidx(in_data, p1, p2, p4, p5, p6, 0);
                epo_0[1][j] = spidx(in_data, p1, p2, p4, p5, p6, 1);

                if (epo_0[1][j] >= SP_ERRIDX_MAX)
                {
                    epo_0[1][j] = 0; // Error!!
                }
#else
                epo_0[0][j] = 0;
                epo_0[1][j] = 0;
#endif
            }
            err_0 = t;
        }
        if (err_0 == 0)
            break;
        } // E

    if (err_0 < err_1)
    {
        idx_1 = idx;
        for (j = 0; j < dimension; j++)
        {
            epo_1[0][j] = epo_0[0][j];
            epo_1[1][j] = epo_0[1][j];
        }
        err_1 = err_0;
    }

    if (err_1 == 0)
        break;
    } //1

for (i = 0; i < numEntries; i++)
{
    index[i] = idx_1;
    for (j = 0; j < dimension; j++)
    {
        CGU_INT p1 = CLT(clogs);        // < 3
        CGU_INT p3 = epo_1[0][j];      // < SP_ERRIDX_MAX
        CGU_INT p4 = epo_1[1][j];      // < SP_ERRIDX_MAX
        CGU_INT p5 = idx_1;            // < 16
#pragma warning( push )
#pragma warning(disable:4244)
        out[i][j] = (int)rampf(p1, p3, p4, p5);
#pragma warning( pop )
    }
}
return err_1 * numEntries;
}

//========================================================================================================================

// ep_shaker_HD
//
// Refines a quantized ramp: starting from the per-entry ramp indices in index_, it
// re-fits the two ramp endpoints by least squares, then tries the neighbouring
// endpoint values per channel and keeps whichever combination
// gives the smallest summed squared error against data.
//
//   data          input points, numEntries rows, channels3or4 used columns
//   numEntries    number of valid rows in data / index_ / out
//   index_        in: starting ramp index per entry; out: best index found (only
//                 overwritten when an improvement over the running best is found)
//   out           out: reconstructed (quantized) value per entry for the best fit
//   epo_code_out  out: chosen endpoint codes [endpoint][channel] for the best fit
//   Mi_           last (highest) ramp index available
//   bits          bit widths; bits[0] also feeds the "type" passed to the single point
//                 quantizer. NOTE(review): declared with 3 elements but indexed with
//                 j < channels3or4 below - confirm callers never pass 4 channels.
//   channels3or4  number of channels processed
//
// Returns the best total error found (CMP_FLOAT_MAX if no candidate was accepted).
CGU_FLOAT ep_shaker_HD(CGU_FLOAT   data[MAX_ENTRIES][MAX_DIMENSION_BIG],
                       CGU_INT     numEntries,
                       CGU_INT     index_[MAX_ENTRIES],
                       CGU_FLOAT   out[MAX_ENTRIES][MAX_DIMENSION_BIG],
                       CGU_INT     epo_code_out[2][MAX_DIMENSION_BIG],
                       CGU_INT     Mi_,                // last cluster
                       CGU_INT     bits[3],            // including parity
                       CGU_INT     channels3or4
)
{
    CGU_INT i, j, k;
    CGU_INT use_par = 0;    // parity handling is fixed off in this function
    CGU_INT clogs = 0;

    // clogs = number of right shifts needed to clear (Mi_ + 1), i.e. floor(log2(Mi_ + 1))
    i = Mi_ + 1;
    while (i >>= 1)
        clogs++;

    CGU_FLOAT mean[MAX_DIMENSION_BIG];
    CGU_INT index[MAX_ENTRIES];     // working copy of index_, modified freely below
    CGU_INT Mi;

    // Starts at 1 and is decremented at the end of the loop body, so the do/while
    // below executes its body exactly once.
    CGU_INT maxTry = 1;

    for (k = 0; k < numEntries; k++)
    {
        index[k] = index_[k];
    }

    CGU_INT done;
    CGU_INT change;

    CGU_INT better;

    // err_o / out (caller) : best result over the whole call
    // err_2 / out_2 / idx_2 / epo_2 : best result over all (q, p) remaps of one pass
    CGU_FLOAT   err_o = CMP_FLOAT_MAX;
    CGU_FLOAT   out_2[MAX_ENTRIES][MAX_DIMENSION_BIG];
    CGU_INT     idx_2[MAX_ENTRIES];
    CGU_INT     epo_2[2][MAX_DIMENSION_BIG];

    // max_bits is filled in here but not read anywhere else in this function.
    CGU_INT max_bits[MAX_DIMENSION_BIG];
    CGU_INT type = bits[0] % (2 * channels3or4);

    for (j = 0; j < channels3or4; j++)
        max_bits[j] = (bits[0] + 2 * channels3or4 - 1) / (2 * channels3or4);


    // handled below automatically
    CGU_INT alls = all_same_d(data, numEntries, channels3or4);

    mean_d_d(data, mean, numEntries, channels3or4);

    do {
        // presumably normalizes the index set in place - see index_collapse_kernel
        index_collapse_kernel(index, numEntries);

        Mi = max_index(index, numEntries);  // index can be from requantizer

        CGU_INT p, q;
        CGU_INT p0 = -1, q0 = -1;           // (p, q) of the best remap; -1 = none found yet

        CGU_FLOAT err_2 = CMP_FLOAT_MAX;

        // Highest index is 0: there is no ramp to fit, so quantize a single point
        // and return straight away.
        if (Mi == 0) {
            CGU_FLOAT t;
            CGU_INT    epo_0[2][MAX_DIMENSION_BIG];
            // either single point from the beginning or collapsed index
            if (alls) {
                t = quant_single_point_d(data, numEntries, index, out_2, epo_0, Mi_, type, channels3or4);
            }
            else
            {
                // Entries differ: quantize their mean (passed as a one-row array) and
                // measure the real error of that result against the full data set.
                quant_single_point_d(&mean, numEntries, index, out_2, epo_0, Mi_, type, channels3or4);
                t = totalError_d(data, out_2, numEntries, channels3or4);
            }

            if (t < err_o) {
                for (k = 0; k < numEntries; k++) {
                    index_[k] = index[k];
                    for (j = 0; j < channels3or4; j++) {
                        out[k][j] = out_2[k][j];
                        epo_code_out[0][j] = epo_0[0][j];
                        epo_code_out[1][j] = epo_0[1][j];
                    }
                };
                err_o = t;
            }
            return err_o;
        }

        //===============================
        // We have ramp colors to process
        //===============================

        // Try every scale q and offset p for which the remapped indices
        // index[k] * q + p still fit inside [0, Mi_].
        for (q = 1; Mi != 0 && q*Mi <= Mi_; q++) // does not work for single point collapsed index!!!
        {
            for (p = 0; p <= Mi_ - q * Mi; p++)
            {

                //-------------------------------------
                // set a new index data to try
                //-------------------------------------
                CGU_INT cidx[MAX_ENTRIES];

                for (k = 0; k < numEntries; k++)
                {
                    cidx[k] = index[k] * q + p;
                }

                CGU_FLOAT epa[2][MAX_DIMENSION_BIG];        // fitted endpoints [endpoint][channel]

                //
                // solve RMS problem for center
                //

                CGU_FLOAT im[2][2] = { { 0,0 },{ 0,0 } };   // matrix /inverse matrix
                CGU_FLOAT rp[2][MAX_DIMENSION_BIG];            // right part for RMS fit problem

                                                           // get ideal cluster centers
                CGU_FLOAT cc[MAX_CLUSTERS_BIG][MAX_DIMENSION_BIG];
                CGU_INT index_cnt[MAX_CLUSTERS_BIG];                        // count of index entries
                CGU_INT index_comp[MAX_CLUSTERS_BIG];                       // compacted index
                CGU_INT index_ncl;                                            // number of unique indexes

                index_ncl = cluster_mean_d_d(data, cc, cidx, index_comp, index_cnt, numEntries, channels3or4); // unrounded

                // round each used cluster centre to the nearest integer
                for (i = 0; i < index_ncl; i++)
                    for (j = 0; j < channels3or4; j++)
                        cc[index_comp[i]][j] = (CGU_FLOAT)floorf(cc[index_comp[i]][j] + 0.5f); // more or less ideal location

                for (j = 0; j < channels3or4; j++)
                {
                    rp[0][j] = rp[1][j] = 0;
                }

                // Accumulate the 2x2 normal-equation matrix (im) and right-hand side (rp)
                // using the weights (Mi_ - cidx) and cidx of the two endpoints.
                // weight with cnt if running on compacted index
                for (k = 0; k < numEntries; k++)
                {
                    im[0][0] += (Mi_ - cidx[k])* (Mi_ - cidx[k]);
                    im[0][1] += cidx[k] * (Mi_ - cidx[k]);           // im is symmetric
                    im[1][1] += cidx[k] * cidx[k];

                    for (j = 0; j < channels3or4; j++)
                    {
                        rp[0][j] += (Mi_ - cidx[k]) * cc[cidx[k]][j];
                        rp[1][j] += cidx[k] * cc[cidx[k]][j];
                    }
                }

                // determinant of im; not checked for zero before the divisions below
                CGU_FLOAT dd = im[0][0] * im[1][1] - im[0][1] * im[0][1];

                //assert(dd !=0);

                // dd=0 means that cidx[k] and (Mi_-cidx[k]) collinear which implies only one active index;
                // taken care of separately

                // In-place 2x2 inverse; im[1][0] is used as scratch for the old im[0][0].
                im[1][0] = im[0][0];
                im[0][0] = im[1][1] / dd;
                im[1][1] = im[1][0] / dd;
                im[1][0] = im[0][1] = -im[0][1] / dd;

                // endpoints = inverse * right-hand side, scaled by Mi_
                for (j = 0; j < channels3or4; j++) {
                    epa[0][j] = (im[0][0] * rp[0][j] + im[0][1] * rp[1][j])*Mi_;
                    epa[1][j] = (im[1][0] * rp[0][j] + im[1][1] * rp[1][j])*Mi_;
                }

                // err_1 / out_1 / idx_1 / epo_1 : best result over the endpoint search
                // for this particular (q, p)
                CGU_FLOAT err_1 = CMP_FLOAT_MAX;
                CGU_FLOAT out_1[MAX_ENTRIES][MAX_DIMENSION_BIG];
                CGU_INT idx_1[MAX_ENTRIES];
                CGU_INT epo_1[2][MAX_DIMENSION_BIG];
                CGU_INT s1 = 0;                           // search state (s) that produced err_1
                CGU_FLOAT epd[2][MAX_DIMENSION_BIG][2];   // first second, coord, begin range end range

                // For every endpoint/channel build two candidates: [0] the fitted value and
                // [1] the fitted value plus a step. With use_par == 0 the step is
                // min((1 << bits[j]) - 1 - (int)value, 1) and the final mask is a no-op.
                for (j = 0; j < channels3or4; j++)
                {
                    for (i = 0; i < 2; i++)
                    {     // set range
                        epd[i][j][0] = epd[i][j][1] = epa[i][j];
                        epd[i][j][1] += ((1 << bits[j]) - 1 - (int)epd[i][j][1] < (1 << use_par) ?
                            (1 << bits[j]) - 1 - (int)epd[i][j][1] : (1 << use_par)) & (~use_par);
                    }
                }

                // ce[entry][ramp index][channel] caches the squared error of one channel so
                // that only the channel whose endpoint changed has to be recomputed per step.
                CGU_FLOAT ce[MAX_ENTRIES][MAX_CLUSTERS_BIG][MAX_DIMENSION_BIG];
                CGU_FLOAT err_0 = 0;
                CGU_FLOAT out_0[MAX_ENTRIES][MAX_DIMENSION_BIG];
                CGU_INT idx_0[MAX_ENTRIES];

                // Prime the cache for the starting state (candidate [0] for every endpoint).
                for (i = 0; i < numEntries; i++)
                {
                    CGU_FLOAT d[4];
                    d[0] = data[i][0];
                    d[1] = data[i][1];
                    d[2] = data[i][2];
                    d[3] = data[i][3];
                    for (j = 0; j < (1 << clogs); j++)
                        for (k = 0; k < channels3or4; k++)
                        {
                            ce[i][j][k] = (rampf(CLT(clogs), epd[0][k][0], epd[1][k][0], j) - d[k])*
                                (rampf(CLT(clogs), epd[0][k][0], epd[1][k][0], j) - d[k]);
                        }
                }

                // s packs the current choice as two bits per channel:
                // bit 2*j selects the candidate for endpoint 0, bit 2*j+1 for endpoint 1.
                CGU_INT s = 0, p1, g;
                CGU_INT ei0 = 0, ei1 = 0;

                // 64 steps = all states of 6 bits (two bits for each of three channels).
                // NOTE(review): a 4-channel call would have 8 state bits - confirm 64 is intended.
                for (p1 = 0; p1 < 64; p1++)
                {
                    CGU_INT j0 = 0;     // channel whose endpoint choice changes in this step

                    // Gray code increment
                    g = p1 & (-p1);     // lowest set bit of p1 (0 on the first step)

                    err_0 = 0;

                    // find the channel that owns the toggled bit and read its new choices
                    for (j = 0; j < channels3or4; j++)
                    {
                        if (((g >> (2 * j)) & 0x3) != 0)
                        {
                            j0 = j;
                            // new cords
                            ei0 = (((s^g) >> (2 * j)) & 0x1);
                            ei1 = (((s^g) >> (2 * j + 1)) & 0x1);
                        }
                    }
                    s = s ^ g;
                    err_0 = 0;

                    for (i = 0; i < numEntries; i++)
                    {
                        CGU_FLOAT d[4];
                        d[0] = data[i][0];
                        d[1] = data[i][1];
                        d[2] = data[i][2];
                        d[3] = data[i][3];
                        CGU_INT    ci = 0;                  // best ramp index for this entry
                        CGU_FLOAT cmin = CMP_FLOAT_MAX;     // its summed squared error

                        for (j = 0; j < (1 << clogs); j++)
                        {
                            float t_ = 0.;
                            // refresh only the cached term of the channel that changed
                            ce[i][j][j0] = (rampf(CLT(clogs), epd[0][j0][ei0], epd[1][j0][ei1], j) - d[j0])*
                                (rampf(CLT(clogs), epd[0][j0][ei0], epd[1][j0][ei1], j) - d[j0]);
                            for (k = 0; k < channels3or4; k++)
                            {
                                t_ += ce[i][j][k];
                            }

                            if (t_ < cmin)
                            {
                                cmin = t_;
                                ci = j;
                            }
                        }

                        idx_0[i] = ci;
                        // NOTE(review): ei0/ei1 belong to channel j0 only, yet they select the
                        // candidate for every channel k here, whereas the error above used each
                        // channel's own state - confirm this is intended.
                        for (k = 0; k < channels3or4; k++)
                        {
                            out_0[i][k] = rampf(CLT(clogs), epd[0][k][ei0], epd[1][k][ei1], ci);
                        }
                        err_0 += cmin;
                    }

                    if (err_0 < err_1)
                    {
                        // best in the current ep cube run
                        for (i = 0; i < numEntries; i++)
                        {
                            idx_1[i] = idx_0[i];
                            for (j = 0; j < channels3or4; j++)
                                out_1[i][j] = out_0[i][j];
                        }
                        err_1 = err_0;

                        s1 = s; // epo coding
                    }
                }

                // reconstruct epo: decode the winning state s1 back into endpoint values
                for (j = 0; j < channels3or4; j++)
                {
                    {
                        // new cords
                        ei0 = ((s1 >> (2 * j)) & 0x1);
                        ei1 = ((s1 >> (2 * j + 1)) & 0x1);
                        epo_1[0][j] = (int)epd[0][j][ei0];
                        epo_1[1][j] = (int)epd[1][j][ei1];
                    }
                }

                if (err_1 < err_2)
                {
                    // best over all (q, p) remaps tried so far
                    for (i = 0; i < numEntries; i++)
                    {
                        idx_2[i] = idx_1[i];
                        for (j = 0; j < channels3or4; j++)
                            out_2[i][j] = out_1[i][j];
                    }
                    err_2 = err_1;
                    for (j = 0; j < channels3or4; j++)
                    {
                        epo_2[0][j] = epo_1[0][j];
                        epo_2[1][j] = epo_1[1][j];
                    }
                    p0 = p;
                    q0 = q;
                }
            }
        }

        // change/better
        // change: the best assignment differs from the remapped input indices.
        // NOTE(review): if no (q, p) candidate was accepted above, p0/q0 are still -1 and
        // idx_2 has not been written in this pass - confirm that case cannot occur.
        change = 0;
        for (k = 0; k < numEntries; k++)
            change = change || (index[k] * q0 + p0 != idx_2[k]);

        better = err_2 < err_o;

        if (better)
        {
            // publish the improved result to the caller and use it as the next start point
            for (k = 0; k < numEntries; k++)
            {
                index_[k] = index[k] = idx_2[k];
                for (j = 0; j < channels3or4; j++)
                {
                    out[k][j] = out_2[k][j];
                    epo_code_out[0][j] = epo_2[0][j];
                    epo_code_out[1][j] = epo_2[1][j];
                }
            }
            err_o = err_2;
        }

        // another pass is only worthwhile if the indices moved and the error improved
        done = !(change  &&  better);

        if (maxTry > 0) maxTry--;
        else maxTry = 0;

    } while (!done && maxTry);

    return err_o;
}


// lerpf
//
// Interpolates between a and b with the fixed 6-bit (sum = 64) ramp weights.
//
//   a, b    ramp endpoints
//   i       ramp position, 0 .. denom
//   denom   last ramp position: 7 (3-bit index), 15 (4-bit index) or 3 (2-bit index)
//
// A 2-bit ramp has no table of its own: position i of 3 is rescaled to position
// i*5 of 15 and looked up in the 16-entry 4-bit table (entries 0, 5, 10, 15 =
// 0, 21, 43, 64). It must not be looked up in the 8-entry 3-bit table, which
// would be indexed out of bounds (up to 15).
//
// Returns (a * w[denom - i] + b * w[i]) / 64.
#ifndef ASPM_GPU
static CGU_INT g_aWeights3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };                                // 3 bit color Indices
static CGU_INT g_aWeights4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; // 4 bit color indices

CGU_FLOAT lerpf(CGU_FLOAT a, CGU_FLOAT b, CGU_INT i, CGU_INT denom)
{
    assert(denom == 3 || denom == 7 || denom == 15);
    assert(i >= 0 && i <= denom);

    CGU_INT *weights = NULL;

    switch (denom)
    {
    case 3:     denom *= 5; i *= 5;    // rescale to the 15-step ramp, then fall through
    case 15:    weights = g_aWeights4; break;
    case 7:     weights = g_aWeights3; break;
    default:    assert(0); return a;   // unsupported ramp size: never dereference a NULL table
    }
    return (a*weights[denom - i] + b * weights[i]) / 64.0f;
}
#else

CGU_FLOAT lerpf(CGU_FLOAT a, CGU_FLOAT b, CGU_INT i, CGU_INT denom)
{
    CGU_INT g_aWeights3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };                                // 3 bit color Indices
    CGU_INT g_aWeights4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; // 4 bit color indices
    switch (denom)
    {
    case 7:     return ((a*g_aWeights3[denom - i] + b * g_aWeights3[i]) / 64.0f); break;
    case 15:    return ((a*g_aWeights4[denom - i] + b * g_aWeights4[i]) / 64.0f); break;
    default:
    case 3:     // rescale to the 15-step ramp and use the 16-entry table
        denom *= 5;
        i *= 5;
        return ((a*g_aWeights4[denom - i] + b * g_aWeights4[i]) / 64.0f);   break;
    }
}
#endif

// palitizeEndPointsF
//
// Expands the endpoint pairs in fEndPoints into the interpolated palette
// BC6H_data->Paletef, one ramp per subset:
//   - region == 1 : subset 0 only, 16 entries (lerpf with denom 15)
//   - otherwise   : subsets 0 and 1, 8 entries each (lerpf with denom 7)
// fEndPoints is indexed [subset][endpoint 0/1][channel], channels 0..2 being
// written to the .x/.y/.z palette members respectively.
void palitizeEndPointsF(BC6H_Encode_local *BC6H_data, CGU_FLOAT fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG])
{
    CGU_INT subsetCount;
    CGU_INT rampEntries;

    if (BC6H_data->region == 1)
    {
        subsetCount = 1;
        rampEntries = 16;
    }
    else
    {
        subsetCount = 2;
        rampEntries = 8;
    }

    // lerpf expects the last valid ramp position, not the entry count
    CGU_INT rampLast = rampEntries - 1;

    for (CGU_INT s = 0; s < subsetCount; s++)
    {
        // Snapshot both endpoints of this subset before filling its ramp
        CGU_FLOAT r0 = fEndPoints[s][0][0];
        CGU_FLOAT g0 = fEndPoints[s][0][1];
        CGU_FLOAT b0 = fEndPoints[s][0][2];
        CGU_FLOAT r1 = fEndPoints[s][1][0];
        CGU_FLOAT g1 = fEndPoints[s][1][1];
        CGU_FLOAT b1 = fEndPoints[s][1][2];

        for (CGU_INT n = 0; n < rampEntries; n++)
        {
            BC6H_data->Paletef[s][n].x = lerpf(r0, r1, n, rampLast);   // red
            BC6H_data->Paletef[s][n].y = lerpf(g0, g1, n, rampLast);   // green
            BC6H_data->Paletef[s][n].z = lerpf(b0, b1, n, rampLast);   // blue
        }
    }
}

// CalcShapeError
//
// Sums, over all MAX_SUBSET_SIZE input pixels, the smallest absolute (L1) RGB
// distance between the pixel and the palette of the subset it belongs to.
//
//   BC6H_data   encoder state; reads region, d_shape_index, din and Paletef
//   fEndPoints  endpoints used to rebuild the palette when SkipPallet is false
//   SkipPallet  when true the palette already in BC6H_data is used as is
//
// The palette scan starts at entry 0 and walks upwards; it stops at the first
// entry that is worse than the best seen so far (or once the error reaches 0),
// so it does not necessarily visit every palette entry.
CGU_FLOAT CalcShapeError(BC6H_Encode_local *BC6H_data, CGU_FLOAT fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_BOOL SkipPallet)
{
    // region is stored 1-based: 1 => single region (16 entries), else two regions (8 entries)
    CGU_INT   regionIdx   = (BC6H_data->region - 1);
    CGU_INT   paletteSize = (regionIdx == 0) ? 16 : 8;
    CGU_FLOAT errorSum    = 0.0f;

    if (!SkipPallet)
        palitizeEndPointsF(BC6H_data, fEndPoints);

    for (CGU_INT px = 0; px < MAX_SUBSET_SIZE; px++)
    {
        // Single region blocks always use subset 0; otherwise look the pixel up in the shape table
        CGU_INT part = 0;
        if (regionIdx != 0)
            part = BC6_PARTITIONS[BC6H_data->d_shape_index][px];

        // Distance to the first palette entry is the starting point
        CGU_FLOAT nearest = abs(BC6H_data->din[px][0] - BC6H_data->Paletef[part][0].x) +
            abs(BC6H_data->din[px][1] - BC6H_data->Paletef[part][0].y) +
            abs(BC6H_data->din[px][2] - BC6H_data->Paletef[part][0].z);

        CGU_INT entry = 1;
        while (entry < paletteSize && nearest > 0)
        {
            CGU_FLOAT candidate = abs(BC6H_data->din[px][0] - BC6H_data->Paletef[part][entry].x) +
                abs(BC6H_data->din[px][1] - BC6H_data->Paletef[part][entry].y) +
                abs(BC6H_data->din[px][2] - BC6H_data->Paletef[part][entry].z);

            // stop as soon as the distance is no longer "less than or equal"
            if (!(candidate <= nearest))
                break;

            nearest = candidate;
            entry++;
        }

        errorSum += nearest;
    }

    return errorSum;
}

// FindBestPattern
//
// Evaluates one partition shape for the current block: splits the input into
// subsets, quantizes every subset onto a ramp, derives endpoints from the best
// candidate and stores the winning indices and endpoints in BC6H_data.
//
//   BC6H_data        encoder state, reset and then filled in for this shape
//   TwoRegionShapes  true: two subsets with 8-entry ramps; false: one subset, 16 entries
//   shape_pattern    partition shape to evaluate
//   quality          forwarded to the quantizer; with USE_SHAKERHD it also gates the
//                    extra endpoint refinement (two-region shapes, quality > 0.80)
//
// Returns the shape error computed by CalcShapeError for the clamped endpoints.
CGU_FLOAT FindBestPattern(BC6H_Encode_local * BC6H_data, CGU_BOOL TwoRegionShapes, CGU_INT8 shape_pattern, CGU_FLOAT quality)
{
    // Two-region shapes use 3 index bits per pixel (index < 8),
    // one-region shapes use 4 index bits per pixel (index < 16).
    CGU_INT8   rampEntries;
    CGU_INT8   subsetCount;
    if (TwoRegionShapes)
    {
        rampEntries = 8;
        subsetCount = 2;
    }
    else
    {
        rampEntries = 16;
        subsetCount = 1;
    }

    CGU_FLOAT  rampDirection[NCHANNELS];    // quantizer output, not used afterwards
    CGU_FLOAT  rampStep;                    // quantizer output, not used afterwards

    // Reset the per-shape state
    BC6H_data->region = subsetCount;
    BC6H_data->index = 0;
    BC6H_data->d_shape_index = shape_pattern;
    memset((CGU_UINT8 *)BC6H_data->partition, 0, sizeof(BC6H_data->partition));
    memset((CGU_UINT8 *)BC6H_data->shape_indices, 0, sizeof(BC6H_data->shape_indices));

    // Distribute the input pixels over the subsets of this shape (RGB only, 3 channels)
    Partition(shape_pattern, BC6H_data->din, BC6H_data->partition, BC6H_data->entryCount, subsetCount, 3);

    // Candidate 0 comes from the direction quantizer, candidate 1 from the optional shaker
    CGU_FLOAT  candidateOut[2][2][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
    CGU_INT    candidateIdx[2][MAX_SUBSETS][MAX_SUBSET_SIZE];

    CGU_FLOAT  quantError = 0.0;
    CGU_INT    winner = 0;
    CGU_FLOAT  lowestError;

    for (CGU_INT s = 0; s < subsetCount; s++)
    {
        // quality controls how many retries the quantizer spends on finding end points
        quantError += optQuantAnD_d(BC6H_data->partition[s],
                                    BC6H_data->entryCount[s],
                                    rampEntries,
                                    candidateIdx[0][s],
                                    candidateOut[0][s],
                                    rampDirection,
                                    &rampStep,
                                    3,
                                    quality);
    }

    lowestError = quantError;
    winner = 0;

    // Optional refinement: slow, and only attempted for two-region shapes at high quality.
#ifdef USE_SHAKERHD
    if ((subsetCount > 1) && (quality > 0.80))
    {
        CGU_INT     bits[3] = { 8,8,8 };     // Channel index bit size
        CGU_INT     epo_code[MAX_SUBSETS][2][MAX_DIMENSION_BIG];
        CGU_FLOAT   shakerError = 0.0;

        for (CGU_INT s = 0; s < subsetCount; s++)
        {
            // Seed candidate 1 with the quantizer's indices; the shaker updates them in place
            for (CGU_INT k = 0; k < BC6H_data->entryCount[s]; k++)
            {
                candidateIdx[1][s][k] = candidateIdx[0][s][k];
            }

            shakerError += ep_shaker_HD(BC6H_data->partition[s],
                                        BC6H_data->entryCount[s],
                                        candidateIdx[1][s],
                                        candidateOut[1][s],
                                        epo_code[s],
                                        BC6H_data->entryCount[s] - 1,
                                        bits,
                                        3);
        }

        if (lowestError > shakerError)
        {
            lowestError = shakerError;
            winner = 1;

            // The shaker delivers endpoint codes directly
            for (CGU_INT s = 0; s < subsetCount; s++)
            {
                for (CGU_INT k = 0; k < MAX_DIMENSION_BIG; k++)
                {
                    BC6H_data->fEndPoints[s][0][k] = (CGU_FLOAT)epo_code[s][0][k];
                    BC6H_data->fEndPoints[s][1][k] = (CGU_FLOAT)epo_code[s][1][k];
                }
            }
        }
    }
#endif

    // The quantizer result carries no endpoints of its own: extract them from its output
    if (winner == 0)
        GetEndPoints(BC6H_data->fEndPoints, candidateOut[winner], subsetCount, BC6H_data->entryCount);

    // Keep the winning indices for the later BC6H processing
    memcpy((CGU_UINT8 *)BC6H_data->shape_indices, (CGU_UINT8 *)candidateIdx[winner], sizeof(BC6H_data->shape_indices));
    clampF16Max(BC6H_data->fEndPoints, BC6H_data->issigned);

    // Error of the shape measured with the palette built from the final endpoints
    return CalcShapeError(BC6H_data, BC6H_data->fEndPoints, false);
}

#ifndef ASPM_GPU
// Packs the already-encoded fields of `bc6h_format` into a compressed block
// and transfers the result into `cmpout` (CPU build only).
//
// Steps, in order:
//   1. A switch on bc6h_format->m_mode (1..14) writes the mode id into the
//      first 2 or 5 bits and then scatters the endpoint fields
//      (rw/gw/bw, rx/gx/bx, ry/gy/by, rz/gz/bz) across the header.
//      Any other mode value returns early and leaves `cmpout` untouched.
//   2. Modes >= MIN_MODE_FOR_ONE_REGION write 16 indices starting at
//      ONE_REGION_INDEX_OFFSET; the other modes write the 5-bit shape index
//      at bit 77 followed by 16 indices starting at TWO_REGION_INDEX_OFFSET.
//   3. header.transferbits() copies the assembled bits to `cmpout`.
//
// NOTE(review): BitHeader is declared elsewhere. From the per-line comments
// below, setvalue(start, count, value[, shift]) presumably stores `count` bits
// of `value`, taken from bit `shift` upwards (default 0), at header bit
// `start` -- confirm against the BitHeader definition.
// The leading "N:" in a trailing comment appears to be the total precision of
// that field in the mode; several of those numbers look copy-pasted, so rely
// on the bit ranges (e.g. gy[3:0]) rather than the "N:" prefix.
void SaveDataBlock(BC6H_Encode_local *bc6h_format, CMP_GLOBAL CGU_UINT8 cmpout[COMPRESSED_BLOCK_SIZE])
{
    BitHeader header(NULL, COMPRESSED_BLOCK_SIZE);

    // Save the RGB end point values
    switch (bc6h_format->m_mode)
    {
    case 1: //0x00
        header.setvalue(0, 2, 0x00);
        header.setvalue(2, 1, bc6h_format->gy, 4);        //        gy[4]
        header.setvalue(3, 1, bc6h_format->by, 4);        //        by[4]
        header.setvalue(4, 1, bc6h_format->bz, 4);        //        bz[4]
        header.setvalue(5, 10, bc6h_format->rw);          // 10:    rw[9:0]
        header.setvalue(15, 10, bc6h_format->gw);          // 10:    gw[9:0]
        header.setvalue(25, 10, bc6h_format->bw);          // 10:    bw[9:0]
        header.setvalue(35, 5, bc6h_format->rx);          // 5:     rx[4:0]
        header.setvalue(40, 1, bc6h_format->gz, 4);        //        gz[4]
        header.setvalue(41, 4, bc6h_format->gy);          // 5:     gy[3:0]
        header.setvalue(45, 5, bc6h_format->gx);          // 5:     gx[4:0]
        header.setvalue(50, 1, bc6h_format->bz);          // 5:     bz[0]
        header.setvalue(51, 4, bc6h_format->gz);          // 5:     gz[3:0]
        header.setvalue(55, 5, bc6h_format->bx);          // 5:     bx[4:0]
        header.setvalue(60, 1, bc6h_format->bz, 1);        //        bz[1]
        header.setvalue(61, 4, bc6h_format->by);          // 5:     by[3:0]
        header.setvalue(65, 5, bc6h_format->ry);          // 5:     ry[4:0]
        header.setvalue(70, 1, bc6h_format->bz, 2);        //        bz[2]
        header.setvalue(71, 5, bc6h_format->rz);          // 5:     rz[4:0]
        header.setvalue(76, 1, bc6h_format->bz, 3);        //        bz[3]
        break;
    case 2: // 0x01
        header.setvalue(0, 2, 0x01);
        header.setvalue(2, 1, bc6h_format->gy, 5);        //        gy[5]
        header.setvalue(3, 1, bc6h_format->gz, 4);        //        gz[4]
        header.setvalue(4, 1, bc6h_format->gz, 5);        //        gz[5]
        header.setvalue(5, 7, bc6h_format->rw);          //        rw[6:0]
        header.setvalue(12, 1, bc6h_format->bz);          //        bz[0]
        header.setvalue(13, 1, bc6h_format->bz, 1);        //        bz[1]
        header.setvalue(14, 1, bc6h_format->by, 4);        //        by[4]
        header.setvalue(15, 7, bc6h_format->gw);          //        gw[6:0]
        header.setvalue(22, 1, bc6h_format->by, 5);        //        by[5]
        header.setvalue(23, 1, bc6h_format->bz, 2);        //        bz[2]
        header.setvalue(24, 1, bc6h_format->gy, 4);        //        gy[4]
        header.setvalue(25, 7, bc6h_format->bw);          // 7:     bw[6:0]
        header.setvalue(32, 1, bc6h_format->bz, 3);        //        bz[3]
        header.setvalue(33, 1, bc6h_format->bz, 5);        //        bz[5]
        header.setvalue(34, 1, bc6h_format->bz, 4);        //        bz[4]
        header.setvalue(35, 6, bc6h_format->rx);          // 6:     rx[5:0]
        header.setvalue(41, 4, bc6h_format->gy);          // 6:     gy[3:0]
        header.setvalue(45, 6, bc6h_format->gx);          // 6:     gx[5:0]
        header.setvalue(51, 4, bc6h_format->gz);          // 6:     gz[3:0]
        header.setvalue(55, 6, bc6h_format->bx);          // 6:     bx[5:0]
        header.setvalue(61, 4, bc6h_format->by);          // 6:     by[3:0]
        header.setvalue(65, 6, bc6h_format->ry);          // 6:     ry[5:0]
        header.setvalue(71, 6, bc6h_format->rz);          // 6:     rz[5:0]
        break;
    case 3: // 0x02
        header.setvalue(0, 5, 0x02);
        header.setvalue(5, 10, bc6h_format->rw);          // 11:    rw[9:0]
        header.setvalue(15, 10, bc6h_format->gw);          // 11:    gw[9:0]
        header.setvalue(25, 10, bc6h_format->bw);          // 11:    bw[9:0]
        header.setvalue(35, 5, bc6h_format->rx);          // 5:     rx[4:0]
        header.setvalue(40, 1, bc6h_format->rw, 10);       //        rw[10]
        header.setvalue(41, 4, bc6h_format->gy);          // 4:     gy[3:0]
        header.setvalue(45, 4, bc6h_format->gx);          // 4:     gx[3:0]
        header.setvalue(49, 1, bc6h_format->gw, 10);       //        gw[10]
        header.setvalue(50, 1, bc6h_format->bz);          // 4:     bz[0]
        header.setvalue(51, 4, bc6h_format->gz);          // 4:     gz[3:0]
        header.setvalue(55, 4, bc6h_format->bx);          // 4:     bx[3:0]
        header.setvalue(59, 1, bc6h_format->bw, 10);       //        bw[10]
        header.setvalue(60, 1, bc6h_format->bz, 1);        //        bz[1]
        header.setvalue(61, 4, bc6h_format->by);          // 4:     by[3:0]
        header.setvalue(65, 5, bc6h_format->ry);          // 5:     ry[4:0]
        header.setvalue(70, 1, bc6h_format->bz, 2);        //        bz[2]
        header.setvalue(71, 5, bc6h_format->rz);          // 5:     rz[4:0]
        header.setvalue(76, 1, bc6h_format->bz, 3);        //        bz[3]
        break;
    case 4: // 0x06
        header.setvalue(0, 5, 0x06);
        header.setvalue(5, 10, bc6h_format->rw);          // 11:    rw[9:0]
        header.setvalue(15, 10, bc6h_format->gw);          // 11:    gw[9:0]
        header.setvalue(25, 10, bc6h_format->bw);          // 11:    bw[9:0]
        header.setvalue(35, 4, bc6h_format->rx);          //        rx[3:0]
        header.setvalue(39, 1, bc6h_format->rw, 10);       //        rw[10]
        header.setvalue(40, 1, bc6h_format->gz, 4);        //        gz[4]
        header.setvalue(41, 4, bc6h_format->gy);          // 5:     gy[3:0]
        header.setvalue(45, 5, bc6h_format->gx);          //        gx[4:0]
        header.setvalue(50, 1, bc6h_format->gw, 10);       // 5:     gw[10]
        header.setvalue(51, 4, bc6h_format->gz);          // 5:     gz[3:0]
        header.setvalue(55, 4, bc6h_format->bx);          // 4:     bx[3:0]
        header.setvalue(59, 1, bc6h_format->bw, 10);       //        bw[10]
        header.setvalue(60, 1, bc6h_format->bz, 1);        //        bz[1]
        header.setvalue(61, 4, bc6h_format->by);          // 4:     by[3:0]
        header.setvalue(65, 4, bc6h_format->ry);          // 4:     ry[3:0]
        header.setvalue(69, 1, bc6h_format->bz);          // 4:     bz[0]
        header.setvalue(70, 1, bc6h_format->bz, 2);        //        bz[2]
        header.setvalue(71, 4, bc6h_format->rz);          // 4:     rz[3:0]
        header.setvalue(75, 1, bc6h_format->gy, 4);        //        gy[4]
        header.setvalue(76, 1, bc6h_format->bz, 3);        //        bz[3]
        break;
    case 5: // 0x0A
        header.setvalue(0, 5, 0x0A);
        header.setvalue(5, 10, bc6h_format->rw);           // 11:   rw[9:0]
        header.setvalue(15, 10, bc6h_format->gw);           // 11:   gw[9:0]
        header.setvalue(25, 10, bc6h_format->bw);           // 11:   bw[9:0]
        header.setvalue(35, 4, bc6h_format->rx);           // 4:    rx[3:0]
        header.setvalue(39, 1, bc6h_format->rw, 10);        //       rw[10]
        header.setvalue(40, 1, bc6h_format->by, 4);         //       by[4]
        header.setvalue(41, 4, bc6h_format->gy);           // 4:    gy[3:0]
        header.setvalue(45, 4, bc6h_format->gx);           // 4:    gx[3:0]
        header.setvalue(49, 1, bc6h_format->gw, 10);        //       gw[10]
        header.setvalue(50, 1, bc6h_format->bz);           // 5:    bz[0]
        header.setvalue(51, 4, bc6h_format->gz);           // 4:    gz[3:0]
        header.setvalue(55, 5, bc6h_format->bx);           // 5:    bx[4:0]
        header.setvalue(60, 1, bc6h_format->bw, 10);        //       bw[10]
        header.setvalue(61, 4, bc6h_format->by);           // 5:    by[3:0]
        header.setvalue(65, 4, bc6h_format->ry);           // 4:    ry[3:0]
        header.setvalue(69, 1, bc6h_format->bz, 1);         //       bz[1]
        header.setvalue(70, 1, bc6h_format->bz, 2);         //       bz[2]
        header.setvalue(71, 4, bc6h_format->rz);           // 4:    rz[3:0]
        header.setvalue(75, 1, bc6h_format->bz, 4);         //       bz[4]
        header.setvalue(76, 1, bc6h_format->bz, 3);         //       bz[3]
        break;
    case 6: // 0x0E
        header.setvalue(0, 5, 0x0E);
        header.setvalue(5, 9, bc6h_format->rw);           // 9:    rw[8:0]
        header.setvalue(14, 1, bc6h_format->by, 4);         //       by[4]
        header.setvalue(15, 9, bc6h_format->gw);           // 9:    gw[8:0]
        header.setvalue(24, 1, bc6h_format->gy, 4);         //       gy[4]
        header.setvalue(25, 9, bc6h_format->bw);           // 9:    bw[8:0]
        header.setvalue(34, 1, bc6h_format->bz, 4);         //       bz[4]
        header.setvalue(35, 5, bc6h_format->rx);           // 5:    rx[4:0]
        header.setvalue(40, 1, bc6h_format->gz, 4);         //       gz[4]
        header.setvalue(41, 4, bc6h_format->gy);           // 5:    gy[3:0]
        header.setvalue(45, 5, bc6h_format->gx);           // 5:    gx[4:0]
        header.setvalue(50, 1, bc6h_format->bz);           // 5:    bz[0]
        header.setvalue(51, 4, bc6h_format->gz);           // 5:    gz[3:0]
        header.setvalue(55, 5, bc6h_format->bx);           // 5:    bx[4:0]
        header.setvalue(60, 1, bc6h_format->bz, 1);         //       bz[1]
        header.setvalue(61, 4, bc6h_format->by);           // 5:    by[3:0]
        header.setvalue(65, 5, bc6h_format->ry);           // 5:    ry[4:0]
        header.setvalue(70, 1, bc6h_format->bz, 2);         //       bz[2]
        header.setvalue(71, 5, bc6h_format->rz);           // 5:    rz[4:0]
        header.setvalue(76, 1, bc6h_format->bz, 3);         //       bz[3]
        break;
    case 7: // 0x12
        header.setvalue(0, 5, 0x12);
        header.setvalue(5, 8, bc6h_format->rw);           // 8:    rw[7:0]
        header.setvalue(13, 1, bc6h_format->gz, 4);         //       gz[4]
        header.setvalue(14, 1, bc6h_format->by, 4);         //       by[4]
        header.setvalue(15, 8, bc6h_format->gw);           // 8:    gw[7:0]
        header.setvalue(23, 1, bc6h_format->bz, 2);         //       bz[2]
        header.setvalue(24, 1, bc6h_format->gy, 4);         //       gy[4]
        header.setvalue(25, 8, bc6h_format->bw);           // 8:    bw[7:0]
        header.setvalue(33, 1, bc6h_format->bz, 3);         //       bz[3]
        header.setvalue(34, 1, bc6h_format->bz, 4);         //       bz[4]
        header.setvalue(35, 6, bc6h_format->rx);           // 6:    rx[5:0]
        header.setvalue(41, 4, bc6h_format->gy);           // 5:    gy[3:0]
        header.setvalue(45, 5, bc6h_format->gx);           // 5:    gx[4:0]
        header.setvalue(50, 1, bc6h_format->bz);           // 5:    bz[0]
        header.setvalue(51, 4, bc6h_format->gz);           // 5:    gz[3:0]
        header.setvalue(55, 5, bc6h_format->bx);           // 5:    bx[4:0]
        header.setvalue(60, 1, bc6h_format->bz, 1);         //       bz[1]
        header.setvalue(61, 4, bc6h_format->by);           // 5:    by[3:0]
        header.setvalue(65, 6, bc6h_format->ry);           // 6:    ry[5:0]
        header.setvalue(71, 6, bc6h_format->rz);           // 6:    rz[5:0]
        break;
    case 8: // 0x16
        header.setvalue(0, 5, 0x16);
        header.setvalue(5, 8, bc6h_format->rw);            // 8:   rw[7:0]
        header.setvalue(13, 1, bc6h_format->bz);            // 5:   bz[0]
        header.setvalue(14, 1, bc6h_format->by, 4);          //      by[4]
        header.setvalue(15, 8, bc6h_format->gw);            // 8:   gw[7:0]
        header.setvalue(23, 1, bc6h_format->gy, 5);          //      gy[5]
        header.setvalue(24, 1, bc6h_format->gy, 4);          //      gy[4]
        header.setvalue(25, 8, bc6h_format->bw);            // 8:   bw[7:0]
        header.setvalue(33, 1, bc6h_format->gz, 5);          //      gz[5]
        header.setvalue(34, 1, bc6h_format->bz, 4);          //      bz[4]
        header.setvalue(35, 5, bc6h_format->rx);            // 5:   rx[4:0]
        header.setvalue(40, 1, bc6h_format->gz, 4);          //      gz[4]
        header.setvalue(41, 4, bc6h_format->gy);            // 6:   gy[3:0]
        header.setvalue(45, 6, bc6h_format->gx);            // 6:   gx[5:0]
        header.setvalue(51, 4, bc6h_format->gz);            // 6:   gz[3:0]
        header.setvalue(55, 5, bc6h_format->bx);            // 5:   bx[4:0]
        header.setvalue(60, 1, bc6h_format->bz, 1);          //      bz[1]
        header.setvalue(61, 4, bc6h_format->by);            // 5:   by[3:0]
        header.setvalue(65, 5, bc6h_format->ry);            // 5:   ry[4:0]
        header.setvalue(70, 1, bc6h_format->bz, 2);          //      bz[2]
        header.setvalue(71, 5, bc6h_format->rz);            // 5:   rz[4:0]
        header.setvalue(76, 1, bc6h_format->bz, 3);          //      bz[3]
        break;
    case 9: // 0x1A
        header.setvalue(0, 5, 0x1A);
        header.setvalue(5, 8, bc6h_format->rw);            // 8:   rw[7:0]
        header.setvalue(13, 1, bc6h_format->bz, 1);          //      bz[1]
        header.setvalue(14, 1, bc6h_format->by, 4);          //      by[4]
        header.setvalue(15, 8, bc6h_format->gw);            // 8:   gw[7:0]
        header.setvalue(23, 1, bc6h_format->by, 5);          //      by[5]
        header.setvalue(24, 1, bc6h_format->gy, 4);          //      gy[4]
        header.setvalue(25, 8, bc6h_format->bw);            // 8:   bw[7:0]
        header.setvalue(33, 1, bc6h_format->bz, 5);          //      bz[5]
        header.setvalue(34, 1, bc6h_format->bz, 4);          //      bz[4]
        header.setvalue(35, 5, bc6h_format->rx);            // 5:   rx[4:0]
        header.setvalue(40, 1, bc6h_format->gz, 4);          //      gz[4]
        header.setvalue(41, 4, bc6h_format->gy);            // 5:   gy[3:0]
        header.setvalue(45, 5, bc6h_format->gx);            // 5:   gx[4:0]
        header.setvalue(50, 1, bc6h_format->bz);            // 6:   bz[0]
        header.setvalue(51, 4, bc6h_format->gz);            // 5:   gz[3:0]
        header.setvalue(55, 6, bc6h_format->bx);            // 6:   bx[5:0]
        header.setvalue(61, 4, bc6h_format->by);            // 6:   by[3:0]
        header.setvalue(65, 5, bc6h_format->ry);            // 5:   ry[4:0]
        header.setvalue(70, 1, bc6h_format->bz, 2);          //      bz[2]
        header.setvalue(71, 5, bc6h_format->rz);            // 5:   rz[4:0]
        header.setvalue(76, 1, bc6h_format->bz, 3);          //      bz[3]
        break;
    case 10: // 0x1E
        header.setvalue(0, 5, 0x1E);
        header.setvalue(5, 6, bc6h_format->rw);            // 6:   rw[5:0]
        header.setvalue(11, 1, bc6h_format->gz, 4);          //      gz[4]
        header.setvalue(12, 1, bc6h_format->bz);            // 6:   bz[0]
        header.setvalue(13, 1, bc6h_format->bz, 1);          //      bz[1]
        header.setvalue(14, 1, bc6h_format->by, 4);          //      by[4]
        header.setvalue(15, 6, bc6h_format->gw);            // 6:   gw[5:0]
        header.setvalue(21, 1, bc6h_format->gy, 5);          //      gy[5]
        header.setvalue(22, 1, bc6h_format->by, 5);          //      by[5]
        header.setvalue(23, 1, bc6h_format->bz, 2);          //      bz[2]
        header.setvalue(24, 1, bc6h_format->gy, 4);          //      gy[4]
        header.setvalue(25, 6, bc6h_format->bw);            // 6:   bw[5:0]
        header.setvalue(31, 1, bc6h_format->gz, 5);          //      gz[5]
        header.setvalue(32, 1, bc6h_format->bz, 3);          //      bz[3]
        header.setvalue(33, 1, bc6h_format->bz, 5);          //      bz[5]
        header.setvalue(34, 1, bc6h_format->bz, 4);          //      bz[4]
        header.setvalue(35, 6, bc6h_format->rx);            // 6:   rx[5:0]
        header.setvalue(41, 4, bc6h_format->gy);            // 6:   gy[3:0]
        header.setvalue(45, 6, bc6h_format->gx);            // 6:   gx[5:0]
        header.setvalue(51, 4, bc6h_format->gz);            // 6:   gz[3:0]
        header.setvalue(55, 6, bc6h_format->bx);            // 6:   bx[5:0]
        header.setvalue(61, 4, bc6h_format->by);            // 6:   by[3:0]
        header.setvalue(65, 6, bc6h_format->ry);            // 6:   ry[5:0]
        header.setvalue(71, 6, bc6h_format->rz);            // 6:   rz[5:0]
        break;

        // Single regions Modes
    case 11: // 0x03
        header.setvalue(0, 5, 0x03);
        header.setvalue(5, 10, bc6h_format->rw);            // 10:   rw[9:0]
        header.setvalue(15, 10, bc6h_format->gw);            // 10:   gw[9:0]
        header.setvalue(25, 10, bc6h_format->bw);            // 10:   bw[9:0]
        header.setvalue(35, 10, bc6h_format->rx);            // 10:   rx[9:0]
        header.setvalue(45, 10, bc6h_format->gx);            // 10:   gx[9:0]
        header.setvalue(55, 10, bc6h_format->bx);            // 10:   bx[9:0]
        break;
    case 12: // 0x07
        header.setvalue(0, 5, 0x07);
        header.setvalue(5, 10, bc6h_format->rw);            // 11:   rw[9:0]
        header.setvalue(15, 10, bc6h_format->gw);            // 11:   gw[9:0]
        header.setvalue(25, 10, bc6h_format->bw);            // 11:   bw[9:0]
        header.setvalue(35, 9, bc6h_format->rx);            // 9:    rx[8:0]
        header.setvalue(44, 1, bc6h_format->rw, 10);         //       rw[10]
        header.setvalue(45, 9, bc6h_format->gx);            // 9:    gx[8:0]
        header.setvalue(54, 1, bc6h_format->gw, 10);         //       gw[10]
        header.setvalue(55, 9, bc6h_format->bx);            // 9:    bx[8:0]
        header.setvalue(64, 1, bc6h_format->bw, 10);         //       bw[10]
        break;
    case 13: // 0x0B
        header.setvalue(0, 5, 0x0B);
        header.setvalue(5, 10, bc6h_format->rw);            // 12:   rw[9:0]
        header.setvalue(15, 10, bc6h_format->gw);            // 12:   gw[9:0]
        header.setvalue(25, 10, bc6h_format->bw);            // 12:   bw[9:0]
        header.setvalue(35, 8, bc6h_format->rx);            // 8:    rx[7:0]
        header.setvalue(43, 1, bc6h_format->rw, 11);         //       rw[11]
        header.setvalue(44, 1, bc6h_format->rw, 10);         //       rw[10]
        header.setvalue(45, 8, bc6h_format->gx);            // 8:    gx[7:0]
        header.setvalue(53, 1, bc6h_format->gw, 11);         //       gw[11]
        header.setvalue(54, 1, bc6h_format->gw, 10);         //       gw[10]
        header.setvalue(55, 8, bc6h_format->bx);            // 8:    bx[7:0]
        header.setvalue(63, 1, bc6h_format->bw, 11);         //       bw[11]
        header.setvalue(64, 1, bc6h_format->bw, 10);         //       bw[10]
        break;
    case 14: // 0x0F
        // NOTE(review): the upper six bits of rw/gw/bw are written here as one
        // 6-bit field each; whether their bit order matches the block format
        // depends on setvalue() and on how the caller prepared these values --
        // confirm before changing.
        header.setvalue(0, 5, 0x0F);
        header.setvalue(5, 10, bc6h_format->rw);            // 16:   rw[9:0]
        header.setvalue(15, 10, bc6h_format->gw);            // 16:   gw[9:0]
        header.setvalue(25, 10, bc6h_format->bw);            // 16:   bw[9:0]
        header.setvalue(35, 4, bc6h_format->rx);            //  4:   rx[3:0]
        header.setvalue(39, 6, bc6h_format->rw, 10);         //       rw[15:10]
        header.setvalue(45, 4, bc6h_format->gx);            //  4:   gx[3:0]
        header.setvalue(49, 6, bc6h_format->gw, 10);         //       gw[15:10]
        header.setvalue(55, 4, bc6h_format->bx);            //  4:   bx[3:0]
        header.setvalue(59, 6, bc6h_format->bw, 10);         //       bw[15:10]
        break;
    default: // Need to indicate error!
        // Unknown mode: bail out without writing anything to cmpout.
        return;
    }

    // Each format in the mode table can be uniquely identified by the mode bits.
    // The first ten modes are used for two-region tiles, and the mode bit field
    // can be either two or five bits long. These blocks also have fields for
    // the compressed color endpoints (72 or 75 bits), the partition (5 bits),
    // and the partition indices (46 bits).

    if (bc6h_format->m_mode >= MIN_MODE_FOR_ONE_REGION)
    {
        // One-region layout: the first index is stored with 3 bits, the
        // remaining fifteen with 4 bits each.
        CGU_INT startbit = ONE_REGION_INDEX_OFFSET;
        header.setvalue(startbit, 3, bc6h_format->indices16[0]);
        startbit += 3;
        for (CGU_INT i = 1; i < 16; i++)
        {
            header.setvalue(startbit, 4, bc6h_format->indices16[i]);
            startbit += 4;
        }
    }
    else
    {
        // Two-region layout: 5-bit shape index, then the indices. Index 0 and
        // the texel named by g_indexfixups[] for this shape use 2 bits; every
        // other index uses 3 bits.
        header.setvalue(77, 5, bc6h_format->d_shape_index);            // Shape Index
        CGU_INT startbit = TWO_REGION_INDEX_OFFSET,
            nbits = 2;
        header.setvalue(startbit, nbits, bc6h_format->indices16[0]);
        for (CGU_INT i = 1; i < 16; i++)
        {
            startbit += nbits; // offset start bit for next index using prior nbits used
            nbits = g_indexfixups[bc6h_format->d_shape_index] == i ? 2 : 3; // get new number of bit to save index with
            header.setvalue(startbit, nbits, bc6h_format->indices16[i]);
        }
    }

    // save to output buffer our new bit values
    // this can be optimized if header is part of bc6h_format struct
    header.transferbits(cmpout, 16);
}
#else
// Variant compiled when ASPM_GPU is defined. Block packing is not implemented
// for that build yet: the body is empty, so `out` is left untouched and
// `bc6h_format` is not read.
void SaveDataBlock(BC6H_Encode_local *bc6h_format, CMP_GLOBAL CGU_UINT8 out[COMPRESSED_BLOCK_SIZE])
{
    // ToDo
}
#endif

// Makes sure the fix-up index of every subset has its high bit clear.
//
// For each subset the fix-up position is index 0 for subset 0 and
// g_Region2FixUp[shape_pattern] for any other subset. When the index stored at
// that position has its top bit set, the subset's two endpoints are exchanged
// (first three channels) and every index in the subset is mirrored
// (idx -> uNumIndices - 1 - idx).
//
//   iEndPoints    in/out  endpoints per subset; [subset][0] and [subset][1] may be exchanged
//   iIndices      in/out  per-subset index lists; mirrored when a swap happens
//   entryCount    in      number of valid indices in each subset
//   max_subsets   in      number of subsets to process
//   mode          in      selects ModePartition[mode].IndexPrec (index bit width)
//   shape_pattern in      selects the fix-up position for the second subset
//
// The exchange is written out explicitly instead of calling swap(): under
// ASPM_GPU the file-local swap(CGU_INT, CGU_INT) helper receives its arguments
// by value and therefore changes nothing, which would leave the endpoints in
// place while the indices were mirrored.
void SwapIndices(CGU_INT32 iEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT32 iIndices[3][MAX_SUBSET_SIZE], CGU_INT  entryCount[MAX_SUBSETS], CGU_INT max_subsets, CGU_INT mode, CGU_INT shape_pattern)
{

    CGU_UINT32 uNumIndices = 1 << ModePartition[mode].IndexPrec;
    CGU_UINT32 uHighIndexBit = uNumIndices >> 1;

    for (CGU_INT subset = 0; subset < max_subsets; ++subset)
    {
        // region 0 (subset = 0) The fix-up index for this subset is always index 0
        // region 1 (subset = 1) The fix-up index for this subset varies based on the shape
        size_t i = subset ? g_Region2FixUp[shape_pattern] : 0;

        if (iIndices[subset][i] & uHighIndexBit)
        {
            // high bit is set: exchange the two endpoints of this region in place ...
            for (CGU_INT channel = 0; channel < 3; ++channel)
            {
                CGU_INT32 hold = iEndPoints[subset][0][channel];
                iEndPoints[subset][0][channel] = iEndPoints[subset][1][channel];
                iEndPoints[subset][1][channel] = hold;
            }

            // ... and mirror the indices so they still select the same colors
            for (size_t j = 0; j < (size_t)entryCount[subset]; ++j)
            {
                iIndices[subset][j] = uNumIndices - 1 - iIndices[subset][j];
            }
        }

    }
}

// Helper for the endpoint transform: tells whether `endpoint` fits in a signed
// field of `nbit` bits, i.e. lies inside [-2^(nbit-1), 2^(nbit-1) - 1].
// Returns true when the value is outside that range (overflow), false otherwise.
// todo: check overflow by checking against sign
CGU_BOOL isOverflow(CGU_INT endpoint, CGU_INT nbit)
{
    const CGU_INT highest = (int)pow(2.0f, (CGU_FLOAT)nbit - 1.0f) - 1;
    const CGU_INT lowest  = (int)-(pow(2.0f, (CGU_FLOAT)nbit - 1.0f));

    // overflow exactly when the value escapes the closed range on either side
    return (endpoint < lowest) || (endpoint > highest);
}

// Converts quantized endpoints into the form stored for `mode`.
//
// Non-transformed modes: every endpoint is simply masked -- the first endpoint
// of subset 0 with MASK(nbits), all others with MASK(prec[channel]).
// Transformed modes: the first endpoint of subset 0 is masked with MASK(nbits);
// every other endpoint is replaced by its difference to that first endpoint.
// Each difference must pass isOverflow() for prec[channel] bits, otherwise the
// function returns false straight away (oEndPoints is then only partly filled).
//
// BC6H_data->istransformed is set to match the mode before any endpoint is
// processed, so it is already updated when an overflow causes an early exit.
// Returns true when all endpoints were written.
//
// Reads and writes are kept in the original order on purpose, in case a caller
// passes the same array for iEndPoints and oEndPoints.
CGU_BOOL TransformEndPoints(BC6H_Encode_local *BC6H_data, CGU_INT iEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT oEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT max_subsets, CGU_INT mode)
{
    if (!ModePartition[mode].transformed)
    {
        // Plain storage: no deltas, just clip every value to its field width.
        BC6H_data->istransformed = false;
        for (CGU_INT ch = 0; ch < 3; ++ch)
        {
            const CGU_INT baseMask = MASK(ModePartition[mode].nbits);
            const CGU_INT precMask = MASK(ModePartition[mode].prec[ch]);

            oEndPoints[0][0][ch] = iEndPoints[0][0][ch] & baseMask;
            oEndPoints[0][1][ch] = iEndPoints[0][1][ch] & precMask;

            if (max_subsets > 1)
            {
                oEndPoints[1][0][ch] = iEndPoints[1][0][ch] & precMask;
                oEndPoints[1][1][ch] = iEndPoints[1][1][ch] & precMask;
            }
        }
        return true;
    }

    // Delta storage relative to endpoint [0][A].
    BC6H_data->istransformed = true;
    for (CGU_INT ch = 0; ch < 3; ++ch)
    {
        const CGU_INT baseMask = MASK(ModePartition[mode].nbits);
        const CGU_INT precMask = MASK(ModePartition[mode].prec[ch]);

        oEndPoints[0][0][ch] = iEndPoints[0][0][ch] & baseMask;             // [0][A]

        oEndPoints[0][1][ch] = iEndPoints[0][1][ch] - iEndPoints[0][0][ch]; // [0][B] - [0][A]
        if (isOverflow(oEndPoints[0][1][ch], ModePartition[mode].prec[ch]))
        {
            return false;
        }
        oEndPoints[0][1][ch] = (oEndPoints[0][1][ch] & precMask);

        if (max_subsets <= 1)
        {
            // One region: the masked delta is range-checked a second time.
            // NOTE(review): assuming MASK(n) keeps the low n bits, the masked
            // value is non-negative, so this rejects every delta whose masked
            // form has its top bit set -- confirm that this is intended.
            if (isOverflow(oEndPoints[0][1][ch], ModePartition[mode].prec[ch]))
            {
                return false;
            }
        }
        else
        {
            oEndPoints[1][0][ch] = iEndPoints[1][0][ch] - iEndPoints[0][0][ch];  // [1][A] - [0][A]
            if (isOverflow(oEndPoints[1][0][ch], ModePartition[mode].prec[ch]))
            {
                return false;
            }
            oEndPoints[1][0][ch] = (oEndPoints[1][0][ch] & precMask);

            oEndPoints[1][1][ch] = iEndPoints[1][1][ch] - iEndPoints[0][0][ch];  // [1][B] - [0][A]
            if (isOverflow(oEndPoints[1][1][ch], ModePartition[mode].prec[ch]))
            {
                return false;
            }
            oEndPoints[1][1][ch] = (oEndPoints[1][1][ch] & precMask);
        }
    }

    return true;
}

// Stores the chosen mode, endpoints and indices in BC6H_data.
//
//   oEndPoints   endpoints to store; subset 0 goes to the w/x fields, and when
//                max_subsets > 1 subset 1 goes to the y/z fields
//   iIndices     per-subset index lists, in the order the texels of each subset
//                occur in the 4x4 block
//   max_subsets  1 for one-region shapes, more than 1 for two-region shapes
//   mode         mode number, written to BC6H_data->m_mode
//
// Also increments BC6H_data->index and fills BC6H_data->indices16 with one
// index per texel.
void SaveCompressedBlockData(BC6H_Encode_local *BC6H_data,
    CGU_INT oEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG],
    CGU_INT iIndices[2][MAX_SUBSET_SIZE],
    CGU_INT8 max_subsets,
    CGU_INT8 mode)
{
    BC6H_data->m_mode = mode;
    BC6H_data->index++;

    // Subset 0: endpoint A -> (rw, gw, bw), endpoint B -> (rx, gx, bx)
    BC6H_data->rw = oEndPoints[0][0][0];
    BC6H_data->gw = oEndPoints[0][0][1];
    BC6H_data->bw = oEndPoints[0][0][2];
    BC6H_data->rx = oEndPoints[0][1][0];
    BC6H_data->gx = oEndPoints[0][1][1];
    BC6H_data->bx = oEndPoints[0][1][2];

    // Subset 1 (two-region shapes only): endpoint A -> (ry, gy, by), endpoint B -> (rz, gz, bz)
    if (max_subsets > 1)
    {
        BC6H_data->ry = oEndPoints[1][0][0];
        BC6H_data->gy = oEndPoints[1][0][1];
        BC6H_data->by = oEndPoints[1][0][2];
        BC6H_data->rz = oEndPoints[1][1][0];
        BC6H_data->gz = oEndPoints[1][1][1];
        BC6H_data->bz = oEndPoints[1][1][2];
    }

    // Interleave the per-subset index lists back into texel order: each texel
    // takes the next unread index of the subset it belongs to.
    CGU_INT cursor[2] = { 0, 0 };
    for (CGU_INT texel = 0; texel < MAX_SUBSET_SIZE; texel++)
    {
        // One-region shapes put every texel in subset 0; two-region shapes
        // look the subset up in the partition table.
        const CGU_INT owner = (max_subsets > 1) ? BC6_PARTITIONS[BC6H_data->d_shape_index][texel] : 0;

        BC6H_data->indices16[texel] = (CGU_UINT8)iIndices[owner][cursor[owner]++];
    }

}

// Accumulates an absolute-difference error between the source texels
// (BC6H_data->din) and a value derived from the subset-0 endpoints, summed over
// every texel, both endpoints and NCHANNELS channels. Returns that sum.
//
// NOTE(review): the interpolation term below subtracts an endpoint component
// from itself and divides the integer index by the integer 15, so for finite
// endpoints it contributes nothing and each texel is effectively compared with
// the raw endpoint value. This looks unintended (probably meant to blend
// between the two endpoints by index/15) but is kept as-is; confirm the
// intended formula before changing it.
CGU_FLOAT CalcOneRegionEndPtsError(BC6H_Encode_local *BC6H_data, CGU_FLOAT fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE])
{
    CGU_FLOAT total = 0;

    for (CGU_INT texel = 0; texel < MAX_SUBSET_SIZE; texel++)
    {
        for (CGU_INT ep = 0; ep < MAX_END_POINTS; ep++)
        {
            for (CGU_INT ch = 0; ch < NCHANNELS; ch++)
            {
                CGU_FLOAT estimate = fEndPoints[0][ep][ch] + (abs(fEndPoints[0][ep][ch] - fEndPoints[0][ep][ch]) * (shape_indices[0][texel] / 15));

                total += abs(BC6H_data->din[texel][ch] - estimate);
            }
        }
    }

    return total;
}

// Re-assigns every texel to the closest palette entry of its subset.
//
// For each of the MAX_SUBSET_SIZE texels the subset is determined (always 0 for
// one-region blocks, otherwise taken from BC6_PARTITIONS for the block's
// d_shape_index), the palette BC6H_data->Paletef[subset] is searched for the
// entry with the smallest sum of absolute channel differences to
// BC6H_data->din[texel], and the winning entry number is appended to
// shape_indices[subset]. Ties keep the lowest entry number.
//
// The palette has 16 entries for one-region blocks and 8 for two-region blocks.
//
// Both subsets start the search from CMP_FLOAT_MAX. The subset-1 search used to
// start from CMP_HALF_MAX; the three-channel error sum can be larger than that,
// in which case no entry was ever accepted and index 0 was stored regardless
// of distance.
void ReIndexShapef(BC6H_Encode_local *BC6H_data, CGU_INT shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE])
{
    // region is 0 for one-region blocks, non-zero for two-region blocks
    const CGU_INT region = (BC6H_data->region - 1);
    const CGU_INT MaxPallet = (region == 0) ? 16 : 8;

    // next free slot in shape_indices[0] and shape_indices[1]
    CGU_INT nextSlot[2] = { 0, 0 };

    for (CGU_INT i = 0; i < MAX_SUBSET_SIZE; i++)
    {
        // subset 0 or subset 1
        CGU_INT subset = 0;
        if (region && BC6_PARTITIONS[BC6H_data->d_shape_index][i])
            subset = 1;

        CGU_FLOAT bestError = CMP_FLOAT_MAX;
        CGU_INT bestIndex = 0;

        for (CGU_INT j = 0; j < MaxPallet; j++)
        {
            // Calculate error from original
            CGU_FLOAT error = abs(BC6H_data->din[i][0] - BC6H_data->Paletef[subset][j].x) +
                abs(BC6H_data->din[i][1] - BC6H_data->Paletef[subset][j].y) +
                abs(BC6H_data->din[i][2] - BC6H_data->Paletef[subset][j].z);
            if (error < bestError)
            {
                bestError = error;
                bestIndex = j;
            }
        }

        shape_indices[subset][nextSlot[subset]] = bestIndex;
        nextSlot[subset]++;
    }

}

// Expands a quantized endpoint component of `uBitsPerComp` bits to the full
// intermediate range.
//
// Unsigned: widths of 15 bits or more pass through unchanged; 0 stays 0; the
// all-ones value maps to 0xFFFF; anything else becomes
// ((comp << 16) + 0x8000) >> uBitsPerComp.
// Signed: widths of 16 bits or more pass through unchanged; otherwise the
// magnitude is expanded (0 stays 0, magnitudes at or above
// (1 << (uBitsPerComp - 1)) - 1 map to 0x7FFF, the rest become
// ((mag << 15) + 0x4000) >> (uBitsPerComp - 1)) and the sign is re-applied.
CGU_INT Unquantize(CGU_INT comp, unsigned char uBitsPerComp, CGU_BOOL bSigned)
{
    if (!bSigned)
    {
        if (uBitsPerComp >= 15)
            return comp;
        if (comp == 0)
            return 0;
        if (comp == ((1 << uBitsPerComp) - 1))
            return 0xFFFF;
        return ((comp << 16) + 0x8000) >> uBitsPerComp;
    }

    if (uBitsPerComp >= 16)
        return comp;

    // work on the magnitude, remember the sign for later
    const CGU_INT negative = (comp < 0) ? 1 : 0;
    const CGU_INT magnitude = negative ? -comp : comp;

    CGU_INT expanded;
    if (magnitude == 0)
        expanded = 0;
    else if (magnitude >= ((1 << (uBitsPerComp - 1)) - 1))
        expanded = 0x7FFF;
    else
        expanded = ((magnitude << 15) + 0x4000) >> (uBitsPerComp - 1);

    return negative ? -expanded : expanded;
}

// Final scaling step applied to an unquantized value `q`.
//   F16 unsigned: (q * 31) >> 6, i.e. scaled by 31/64.
//   F16 signed:   the magnitude is scaled by 31/32 and the sign is kept.
// Note: for an undefined format q should be returned as is; that case is not
// handled here.
CGU_INT finish_unquantizef16(CGU_INT q, CGU_BOOL isSigned)
{
    if (!isSigned)
    {
        return (q * 31) >> 6;
    }

    if (q < 0)
    {
        // scale the magnitude, then restore the sign
        return -(((-q) * 31) >> 5);
    }

    return (q * 31) >> 5;
}

// Decodes the two endpoints of a single-region block back to float values.
//
//  bc6h_format : supplies the issigned / istransformed flags that pick the decode path
//  oEndPoints  : quantized endpoints; only subset 0 is read
//  outf        : receives the decoded endpoints in outf[0][0..1][channel]
//  mode        : index into ModePartition[] (nbits and per-channel delta precision)
//
// Per channel: recover endpoint A and endpoint B (B is stored as a delta from A
// when istransformed is set), then run both through Unquantize() and
// finish_unquantizef16().
//
// NOTE(review): the signed paths also pass `false` to Unquantize() and
// finish_unquantizef16(), whereas decompress_endpoints2() passes `true` for
// signed transformed data - confirm this difference is intended.
void decompress_endpoints1(BC6H_Encode_local * bc6h_format, CGU_INT oEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_FLOAT outf[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT mode)
{
    CGU_INT   ch;
    CGU_INT   sum;
    CGU_FLOAT endA;
    CGU_FLOAT endB;

    for (ch = 0; ch < NCHANNELS; ch++)
    {
        if (bc6h_format->issigned)
        {
            endA = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[0][0][ch], ModePartition[mode].nbits);

            if (bc6h_format->istransformed)
            {
                // B is a signed delta relative to A, wrapped to nbits
                sum  = SIGN_EXTEND_TYPELESS(oEndPoints[0][1][ch], ModePartition[mode].prec[ch]);
                sum  = (sum + oEndPoints[0][0][ch]) & MASK(ModePartition[mode].nbits);
                endB = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(sum, ModePartition[mode].nbits);
            }
            else
            {
                endB = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[0][1][ch], ModePartition[mode].prec[ch]);
            }
        }
        else
        {
            endA = (CGU_FLOAT)oEndPoints[0][0][ch];

            if (bc6h_format->istransformed)
            {
                sum  = SIGN_EXTEND_TYPELESS(oEndPoints[0][1][ch], ModePartition[mode].prec[ch]);
                endB = (CGU_FLOAT)((sum + oEndPoints[0][0][ch]) & MASK(ModePartition[mode].nbits));
            }
            else
            {
                endB = (CGU_FLOAT)oEndPoints[0][1][ch];
            }
        }

        // Unquantize both points from nbits
        endA = (CGU_FLOAT)Unquantize((int)endA, (unsigned char)ModePartition[mode].nbits, false);
        endB = (CGU_FLOAT)Unquantize((int)endB, (unsigned char)ModePartition[mode].nbits, false);

        // Final F16 scaling
        outf[0][0][ch] = (CGU_FLOAT)finish_unquantizef16((int)endA, false);
        outf[0][1][ch] = (CGU_FLOAT)finish_unquantizef16((int)endB, false);
    }
}

// Decodes the four endpoints of a two-region block back to float values.
//
//  bc6h_format : supplies the issigned / istransformed flags that pick the decode path
//  oEndPoints  : quantized endpoints for subsets 0 and 1
//  outf        : receives the decoded endpoints in outf[0..1][0..1][channel]
//  mode        : index into ModePartition[] (nbits and per-channel delta precision)
//
// Endpoint [0][0] is the base value; when istransformed is set the other three
// are stored as deltas from it. The signed rules of Unquantize() and
// finish_unquantizef16() are used only when the data is both signed and
// transformed; every other combination uses the unsigned rules.
void decompress_endpoints2(BC6H_Encode_local * bc6h_format, CGU_INT oEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_FLOAT outf[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT mode)
{
    CGU_INT   ch;
    CGU_INT   k;
    CGU_INT   sum;
    CGU_FLOAT q[4];     // q[2 * subset + endpoint] for the current channel
    CGU_BOOL  useSignedRule = (bc6h_format->issigned && bc6h_format->istransformed);

    for (ch = 0; ch < NCHANNELS; ch++)
    {
        if (bc6h_format->issigned)
        {
            q[0] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[0][0][ch], ModePartition[mode].nbits);

            if (bc6h_format->istransformed)
            {
                // each remaining endpoint is a signed delta from the base, wrapped to nbits
                sum  = SIGN_EXTEND_TYPELESS(oEndPoints[0][1][ch], ModePartition[mode].prec[ch]);
                sum  = (sum + oEndPoints[0][0][ch]) & MASK(ModePartition[mode].nbits);
                q[1] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(sum, ModePartition[mode].nbits);

                sum  = SIGN_EXTEND_TYPELESS(oEndPoints[1][0][ch], ModePartition[mode].prec[ch]);
                sum  = (sum + oEndPoints[0][0][ch]) & MASK(ModePartition[mode].nbits);
                q[2] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(sum, ModePartition[mode].nbits);

                sum  = SIGN_EXTEND_TYPELESS(oEndPoints[1][1][ch], ModePartition[mode].prec[ch]);
                sum  = (sum + oEndPoints[0][0][ch]) & MASK(ModePartition[mode].nbits);
                q[3] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(sum, ModePartition[mode].nbits);
            }
            else
            {
                q[1] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[0][1][ch], ModePartition[mode].prec[ch]);
                q[2] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[1][0][ch], ModePartition[mode].prec[ch]);
                q[3] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[1][1][ch], ModePartition[mode].prec[ch]);
            }
        }
        else
        {
            q[0] = (CGU_FLOAT)oEndPoints[0][0][ch];

            if (bc6h_format->istransformed)
            {
                sum  = SIGN_EXTEND_TYPELESS(oEndPoints[0][1][ch], ModePartition[mode].prec[ch]);
                q[1] = (CGU_FLOAT)((sum + oEndPoints[0][0][ch]) & MASK(ModePartition[mode].nbits));

                sum  = SIGN_EXTEND_TYPELESS(oEndPoints[1][0][ch], ModePartition[mode].prec[ch]);
                q[2] = (CGU_FLOAT)((sum + oEndPoints[0][0][ch]) & MASK(ModePartition[mode].nbits));

                sum  = SIGN_EXTEND_TYPELESS(oEndPoints[1][1][ch], ModePartition[mode].prec[ch]);
                q[3] = (CGU_FLOAT)((sum + oEndPoints[0][0][ch]) & MASK(ModePartition[mode].nbits));
            }
            else
            {
                q[1] = (CGU_FLOAT)oEndPoints[0][1][ch];
                q[2] = (CGU_FLOAT)oEndPoints[1][0][ch];
                q[3] = (CGU_FLOAT)oEndPoints[1][1][ch];
            }
        }

        // Unquantize from nbits, then apply the final F16 scaling
        for (k = 0; k < 4; k++)
        {
            q[k] = (CGU_FLOAT)Unquantize((int)q[k], (unsigned char)ModePartition[mode].nbits, useSignedRule);
            outf[k >> 1][k & 1][ch] = (CGU_FLOAT)finish_unquantizef16((int)q[k], useSignedRule);
        }
    }
}

// Reverses the endpoint transform so the result can be compared with the
// original quantized endpoints (see endpts_fit).
//
//  in       : endpoints as they would be stored for `mode`
//  out      : receives the reconstructed endpoints
//  mode     : index into ModePartition[]
//  issigned : when set, reconstructed values are sign-extended
//
// R_0..R_3 are accessor macros defined elsewhere; they appear to select the four
// endpoints of channel `i`, so the loop variable must keep that name.
//
// For transformed modes R_1..R_3 are signed deltas (prec[i] bits) relative to
// R_0 and the sum is wrapped to nbits. The base endpoint R_0 is sign-extended at
// nbits, the same width used by the untransformed branch and by the delta mask;
// extending it at IndexPrec (as was done before) corrupts signed base values and
// makes the round-trip comparison fail.
static void decompress_endpts(const CGU_INT in[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT out[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], const CGU_INT mode, CGU_BOOL issigned)
{

    if (ModePartition[mode].transformed)
    {
        for (CGU_INT i = 0; i < 3; ++i)
        {
            // base endpoint: stored at full endpoint precision
            R_0(out) = issigned ? SIGN_EXTEND_TYPELESS(R_0(in), ModePartition[mode].nbits) : R_0(in);
            CGU_INT t;

            // remaining endpoints: delta + base, wrapped to nbits
            t = SIGN_EXTEND_TYPELESS(R_1(in), ModePartition[mode].prec[i]);
            t = (t + R_0(in)) & MASK(ModePartition[mode].nbits);
            R_1(out) = issigned ? SIGN_EXTEND_TYPELESS(t, ModePartition[mode].nbits) : t;

            t = SIGN_EXTEND_TYPELESS(R_2(in), ModePartition[mode].prec[i]);
            t = (t + R_0(in)) & MASK(ModePartition[mode].nbits);
            R_2(out) = issigned ? SIGN_EXTEND_TYPELESS(t, ModePartition[mode].nbits) : t;

            t = SIGN_EXTEND_TYPELESS(R_3(in), ModePartition[mode].prec[i]);
            t = (t + R_0(in)) & MASK(ModePartition[mode].nbits);
            R_3(out) = issigned ? SIGN_EXTEND_TYPELESS(t, ModePartition[mode].nbits) : t;
        }
    }
    else
    {
        // untransformed: every endpoint is stored directly
        for (CGU_INT i = 0; i < 3; ++i)
        {
            R_0(out) = issigned ? SIGN_EXTEND_TYPELESS(R_0(in), ModePartition[mode].nbits) : R_0(in);
            R_1(out) = issigned ? SIGN_EXTEND_TYPELESS(R_1(in), ModePartition[mode].prec[i]) : R_1(in);
            R_2(out) = issigned ? SIGN_EXTEND_TYPELESS(R_2(in), ModePartition[mode].prec[i]) : R_2(in);
            R_3(out) = issigned ? SIGN_EXTEND_TYPELESS(R_3(in), ModePartition[mode].prec[i]) : R_3(in);
        }
    }
}

// Reports whether the compressed endpoints reproduce the originals exactly.
//
//  orig        : endpoints before compression
//  compressed  : endpoints after the mode's transform/packing
//  mode        : index into ModePartition[]
//  max_subsets : number of subsets to compare
//  issigned    : forwarded to decompress_endpts()
//
// Returns true only when the first three channels of both endpoints match in
// every compared subset, i.e. the compression was lossless.
static CGU_BOOL endpts_fit(const CGU_INT orig[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], const CGU_INT compressed[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], const CGU_INT mode, CGU_INT max_subsets, CGU_BOOL issigned)
{
    CGU_INT roundTrip[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];
    CGU_INT subset;
    CGU_INT channel;

    decompress_endpts(compressed, roundTrip, mode, issigned);

    for (subset = 0; subset < max_subsets; ++subset)
    {
        for (channel = 0; channel < 3; ++channel)
        {
            if (orig[subset][0][channel] != roundTrip[subset][0][channel] ||
                orig[subset][1][channel] != roundTrip[subset][1][channel])
            {
                return false;
            }
        }
    }

    return true;
}

// Quantizes a 16-bit half-float bit pattern to `prec` bits.
//
//  value         : half-float bits reinterpreted as a signed 16-bit integer
//  prec          : target precision in bits; values <= 1 always quantize to 0
//  signedfloat16 : true  -> quantize the magnitude to (prec - 1) bits and
//                           re-apply the sign
//                  false -> negative inputs are clamped to 0
//
// Returns the quantized integer.
//
// The input is widened to CGU_INT before any sign handling so that the negation
// and the clamp act on the value that is actually scaled (and so that negating
// the most negative short cannot wrap).
//
// todo: check overflow
CGU_INT QuantizeToInt(short value, CGU_INT prec, CGU_BOOL signedfloat16)
{

    if (prec <= 1) return 0;
    CGU_BOOL negvalue = false;

    // move data to use extra bits for processing
    CGU_INT ivalue = value;

    if (signedfloat16)
    {
        if (ivalue < 0)
        {
            negvalue = true;
            ivalue = -ivalue;
        }
        // one bit is reserved for the sign
        prec--;
    }
    else
    {
        // clamp -ve
        if (ivalue < 0)
            ivalue = 0;
    }

    CGU_INT iQuantized;
    CGU_INT bias = (prec > 10 && prec != 16) ? ((1 << (prec - 11)) - 1) : 0;
    bias = (prec == 16) ? 15 : bias;

    // ivalue is non-negative here, so the shift is well defined
    iQuantized = ((ivalue << prec) + bias) / (FLT16_MAX + 1);

    return (negvalue ? -iQuantized : iQuantized);
}

// Quantizes the RGB components of both endpoints of every subset.
//
//  EndPoints   : float endpoints; each component is truncated to short before quantizing
//  iEndPoints  : receives the quantized integers at the same [subset][endpoint][channel]
//  max_subsets : number of subsets to process
//  prec        : precision in bits, forwarded to QuantizeToInt()
//  isSigned    : forwarded to QuantizeToInt()
//
// todo: check overflow
void QuantizeEndPointToF16Prec(CGU_FLOAT EndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT iEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT max_subsets, CGU_INT prec, CGU_BOOL isSigned)
{
    CGU_INT subset;
    CGU_INT endpoint;   // 0 = A, 1 = B
    CGU_INT channel;    // 0 = Red, 1 = Green, 2 = Blue

    for (subset = 0; subset < max_subsets; ++subset)
    {
        for (endpoint = 0; endpoint < 2; ++endpoint)
        {
            for (channel = 0; channel < 3; ++channel)
            {
                iEndPoints[subset][endpoint][channel] =
                    QuantizeToInt((short)EndPoints[subset][endpoint][channel], prec, isSigned);
            }
        }
    }
}

// Picks the block mode for the shape currently held in BC6H_data and saves the result.
//
// Every candidate slot in [min_mode, max_mode] (mapped to a real mode through
// ModeFitOrder[]) has its float endpoints quantized, index-swapped and transformed.
// Slots that pass both TransformEndPoints() and endpts_fit() are decoded again,
// re-palettized and scored with CalcShapeError(); the best slot is handed to
// SaveCompressedBlockData().
//
//  BC6H_data : working encoder state (region, fEndPoints, shape_indices, entryCount, ...)
//  error     : returned unchanged when no slot is accepted
//
// Returns opt_toterr[bestFit] of the accepted slot, or the incoming `error`.
CGU_FLOAT  EncodePattern(BC6H_Encode_local *BC6H_data, CGU_FLOAT  error)
{
    CGU_INT8        max_subsets = BC6H_data->region;

    // now we have input colors (in), output colors (outB) mapped to a line of ends (EndPoints)
    // and a set of colors on the line equally spaced (indexedcolors)
    // Lets assign indices

    //CGU_FLOAT SrcEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];                  // temp endpoints used during calculations

    // Quantize the EndPoints
    // Both arrays are indexed by slot number; slot 0 is never written in this function.
    CGU_INT F16EndPoints[MAX_BC6H_MODES + 1][MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];                    // temp endpoints used during calculations
    CGU_INT quantEndPoints[MAX_BC6H_MODES + 1][MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];                    // endpoints to save for a given mode

    // ModePartition[] starts from 1 to 14
    // If we have a shape pattern set the loop to check modes from 1 to 10 else from 11 to 14
    // of the ModePartition table
    CGU_INT     min_mode = (BC6H_data->region == 2) ? 1 : 11;
    CGU_INT     max_mode = (BC6H_data->region == 2) ? MAX_TWOREGION_MODES : MAX_BC6H_MODES;

    // NOTE(review): sized with a literal 15 while the sibling arrays use MAX_BC6H_MODES + 1;
    // it is indexed by the same slot number - confirm the two agree.
    CGU_BOOL    fits[15];
    memset((CGU_UINT8 *)fits, 0, sizeof(fits));

    CGU_INT bestFit = 0;                            // slot that will be saved
    CGU_INT bestEndpointMode = 0;                   // one-region only: slot with the lowest endpoint error so far
    CGU_FLOAT bestError = CMP_FLOAT_MAX;
    CGU_FLOAT bestEndpointsErr = CMP_FLOAT_MAX;
    CGU_FLOAT endPointErr = 0;

    // Try Optimization for the Mode
    CGU_FLOAT       best_EndPoints[MAX_BC6H_MODES + 1][MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];
    CGU_INT         best_Indices[MAX_BC6H_MODES + 1][MAX_SUBSETS][MAX_SUBSET_SIZE];
    CGU_FLOAT      opt_toterr[MAX_BC6H_MODES + 1] = { 0 };

    // redundant with the zero initializer above
    memset((CGU_UINT8 *)opt_toterr, 0, sizeof(opt_toterr));

    CGU_INT numfits = 0;
    //
    // Notes;  Only the endpoints are varying; the indices stay fixed in values!
    // so to optimize which mode we need only check the endpoints error against our original to pick the mode to save
    //
    for (CGU_INT modes = min_mode; modes <= max_mode; ++modes)
    {
        // every slot starts from the same float endpoints and indices
        memcpy((CGU_UINT8 *)best_EndPoints[modes], (CGU_UINT8 *)BC6H_data->fEndPoints, sizeof(BC6H_data->fEndPoints));
        memcpy((CGU_UINT8 *)best_Indices[modes]  , (CGU_UINT8 *)BC6H_data->shape_indices, sizeof(BC6H_data->shape_indices));

        {
            QuantizeEndPointToF16Prec(best_EndPoints[modes], F16EndPoints[modes], max_subsets, ModePartition[ModeFitOrder[modes]].nbits, BC6H_data->issigned);
        }

        // Indices data to save for given mode
        SwapIndices(F16EndPoints[modes], best_Indices[modes], BC6H_data->entryCount, max_subsets, ModeFitOrder[modes], BC6H_data->d_shape_index);
        CGU_BOOL transformfit = TransformEndPoints(BC6H_data, F16EndPoints[modes], quantEndPoints[modes], max_subsets, ModeFitOrder[modes]);
        // lossless round-trip check of the transformed endpoints
        fits[modes] = endpts_fit(F16EndPoints[modes], quantEndPoints[modes], ModeFitOrder[modes], max_subsets, BC6H_data->issigned);

        if (fits[modes] && transformfit)
        {
            numfits++;

            // The new compressed end points fit the mode
            // recalculate the error for this mode with a new set of indices
            // since we have shifted the end points from what we originally calculated
            // in find_bestpattern
            CGU_FLOAT uncompressed[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];
            if (BC6H_data->region == 1)
                decompress_endpoints1(BC6H_data, quantEndPoints[modes], uncompressed, ModeFitOrder[modes]);
            else
                decompress_endpoints2(BC6H_data, quantEndPoints[modes], uncompressed, ModeFitOrder[modes]);
            // Takes the end points and creates a palette of colors
            // based on preset weights along a vector formed by the two end points
            palitizeEndPointsF(BC6H_data, uncompressed);

            // Once we have the palette - recalculate the optimal indices using the palette
            // and the original image data stored in BC6H_data.din[]
            // (skipped for signed data)
            if (!BC6H_data->issigned)
                ReIndexShapef(BC6H_data, best_Indices[modes]);

            // Calculate the error of the new tile vs the old tile data
            opt_toterr[modes] = CalcShapeError(BC6H_data, uncompressed, true);
            if (BC6H_data->region == 1)
            {
                // track which one-region slot has the smallest endpoint error
                endPointErr = CalcOneRegionEndPtsError(BC6H_data, uncompressed, best_Indices[modes]);
                if (endPointErr < bestEndpointsErr)
                {
                    bestEndpointsErr = endPointErr;
                    bestEndpointMode = modes;
                }
            }

            CGU_BOOL transformFit = true;
            // Keep this slot's data if it is better than the last one checked.
            if (opt_toterr[modes] < bestError)
            {
                if (!BC6H_data->issigned)
                {
                    // unsigned only: re-quantize from the decoded endpoints and redo the
                    // swap/transform, overwriting this slot's F16EndPoints, indices and quantEndPoints
                    QuantizeEndPointToF16Prec(uncompressed, F16EndPoints[modes], max_subsets, ModePartition[ModeFitOrder[modes]].nbits, BC6H_data->issigned);
                    SwapIndices(F16EndPoints[modes], best_Indices[modes], BC6H_data->entryCount, max_subsets, ModeFitOrder[modes], BC6H_data->d_shape_index);
                    transformFit = TransformEndPoints(BC6H_data, F16EndPoints[modes], quantEndPoints[modes], max_subsets, ModeFitOrder[modes]);
                }
                if (transformFit)
                {
                    if (BC6H_data->region == 1)
                    {
                        // evaluates to the lower-numbered of the current slot and bestEndpointMode
                        bestFit = (modes == bestEndpointMode) ? modes : ((modes < bestEndpointMode) ? modes : bestEndpointMode);
                    }
                    else
                    {
                        bestFit = modes;
                    }
                    // note: bestFit may differ from `modes` in the one-region case
                    bestError = opt_toterr[bestFit];
                    error = bestError;
                }
            }

        }
    }

    if (numfits > 0)
    {
        // NOTE(review): if slots fit but none passed the second transform check above,
        // bestFit is still 0 and slot 0 was never filled in by this function - confirm
        // whether that case can occur.
        SaveCompressedBlockData(BC6H_data, quantEndPoints[bestFit], best_Indices[bestFit], max_subsets, ModeFitOrder[bestFit]);
        return error;
    }

    // Should not get here!
    return error;
}

void CompressBlockBC6_Internal(CMP_GLOBAL  unsigned char*outdata,
                               CGU_UINT32 destIdx,
                               BC6H_Encode_local * BC6HEncode_local,
                               CMP_GLOBAL const BC6H_Encode *BC6HEncode)
{
    //printf("---SRC---\n");
    //CGU_UINT8    blkindex = 0;
    //CGU_UINT8    srcindex = 0;
    //for ( CGU_INT32 j = 0; j < 16; j++) {
    //    printf("%5.0f,",BC6HEncode_local->din[j][0]);// R
    //    printf("%5.0f,",BC6HEncode_local->din[j][1]);// G
    //    printf("%5.0f,",BC6HEncode_local->din[j][2]);// B
    //    printf("%5.0f\n,",BC6HEncode_local->din[j][3]);// No Alpha
    //}

    CGU_UINT8 Cmp_Red_Block[16] = { 0xc2,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xe0,0x03,0x00,0x00,0x00,0x00,0x00 };

    CGU_FLOAT bestError = CMP_FLOAT_MAX;
    CGU_FLOAT error = CMP_FLOAT_MAX;
    CGU_INT8 bestShape = 0;
    CGU_FLOAT quality = BC6HEncode->m_quality;
    BC6HEncode_local->issigned = BC6HEncode->m_isSigned;
    // run through no partition first
    error = FindBestPattern(BC6HEncode_local, false, 0, quality);
    if (error < bestError)
    {
        bestError = error;
        bestShape = -1;

        memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_shape_indices,(CGU_UINT8 *) BC6HEncode_local->shape_indices, sizeof(BC6HEncode_local->shape_indices));
        memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_partition    ,(CGU_UINT8 *) BC6HEncode_local->partition, sizeof(BC6HEncode_local->partition));
        memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_fEndPoints   ,(CGU_UINT8 *) BC6HEncode_local->fEndPoints, sizeof(BC6HEncode_local->fEndPoints));
        memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_entryCount   ,(CGU_UINT8 *) BC6HEncode_local->entryCount, sizeof(BC6HEncode_local->entryCount));
        BC6HEncode_local->d_shape_index = bestShape;
    }


    // run through 32 possible partition set
    for (CGU_INT8 shape = 0; shape < MAX_BC6H_PARTITIONS; shape++)
    {
        error = FindBestPattern(BC6HEncode_local, true, shape, quality);
        if (error < bestError)
        {
            bestError = error;
            bestShape = shape;

            memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_shape_indices, (CGU_UINT8 *)BC6HEncode_local->shape_indices, sizeof(BC6HEncode_local->shape_indices));
            memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_partition    , (CGU_UINT8 *)BC6HEncode_local->partition, sizeof(BC6HEncode_local->partition));
            memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_fEndPoints   , (CGU_UINT8 *)BC6HEncode_local->fEndPoints, sizeof(BC6HEncode_local->fEndPoints));
            memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_entryCount   , (CGU_UINT8 *)BC6HEncode_local->entryCount, sizeof(BC6HEncode_local->entryCount));
            BC6HEncode_local->d_shape_index = bestShape;
        }
        else
        {
            if (bestShape != -1)
            {
                BC6HEncode_local->d_shape_index = bestShape;
                memcpy((CGU_UINT8 *)BC6HEncode_local->shape_indices, (CGU_UINT8 *)BC6HEncode_local->cur_best_shape_indices, sizeof(BC6HEncode_local->shape_indices));
                memcpy((CGU_UINT8 *)BC6HEncode_local->partition    , (CGU_UINT8 *)BC6HEncode_local->cur_best_partition, sizeof(BC6HEncode_local->partition));
                memcpy((CGU_UINT8 *)BC6HEncode_local->fEndPoints   , (CGU_UINT8 *)BC6HEncode_local->cur_best_fEndPoints, sizeof(BC6HEncode_local->fEndPoints));
                memcpy((CGU_UINT8 *)BC6HEncode_local->entryCount   , (CGU_UINT8 *)BC6HEncode_local->cur_best_entryCount, sizeof(BC6HEncode_local->entryCount));
            }
        }
    }

    bestError = EncodePattern(BC6HEncode_local, bestError);


    // used for debugging modes, set the value you want to debug with
    if (BC6HEncode_local->m_mode != 0)
    {
        // do final encoding and save to output block
        SaveDataBlock(BC6HEncode_local, &outdata[destIdx]);
    }
   else
   {
       for (CGU_INT i = 0; i < 16; i++)
           outdata[destIdx + i] = Cmp_Red_Block[i];
   }
}

//============================================== USER INTERFACES ========================================================

#ifndef ASPM_GPU
#ifndef ASPM
//======================= DECOMPRESS =========================================
using namespace std;

// Decode the header of one compressed BC6H block into an AMD_BC6H_Format.
//
// in      : the compressed block; fields are read through a BitHeader built
//           over `in` with a size of 16.
// returns : a zero-initialised AMD_BC6H_Format filled with
//             - m_mode (1..14), wBits and tBits[] for the detected mode,
//             - the raw endpoint fields rw/rx/ry/rz, gw/.., bw/..,
//             - region, d_shape_index and istransformed,
//             - EC[0..1].A/B (the raw endpoint fields converted to CGU_FLOAT),
//             - indices16[] (the 16 per-pixel palette indices).
//           When the mode bits match no known mode, m_mode is 0 and every
//           other field is left zeroed.
//
// Fields that this function does not assign (they stay at the memset value of
// zero) must be set by the caller if needed.
static AMD_BC6H_Format extract_format(const CGU_UINT8 in[COMPRESSED_BLOCK_SIZE])
{
    AMD_BC6H_Format bc6h_format;
    unsigned short decvalue;

    memset(&bc6h_format,0,sizeof(AMD_BC6H_Format));

    // The mode field is either 2 or 5 bits wide:
    //   bit 1 of the first byte set   -> the low five bits select the mode
    //   bit 1 of the first byte clear -> only bit 0 is significant
    if ((in[0]&0x02) > 0)
    {
        decvalue = (in[0]&0x1F);    // first five bits
    }
    else
    {
        decvalue = (in[0]&0x01);    // first two bits (bit 1 is known to be 0)
    }

    BitHeader header(in,16);

    // header.getvalue(start, count) is used below to pull `count` bits beginning
    // at bit `start`; endpoint fields that are scattered over the block are
    // reassembled by shifting each piece to its position and OR-ing them together.
    switch (decvalue)
    {
    case 0x00:
                bc6h_format.m_mode          = 1; // 10:5:5:5
                bc6h_format.wBits           = 10;
                bc6h_format.tBits[C_RED]    = 5;
                bc6h_format.tBits[C_GREEN]  = 5;
                bc6h_format.tBits[C_BLUE]   = 5;
                bc6h_format.rw = header.getvalue(5 ,10);            // 10:   rw[9:0]
                bc6h_format.rx = header.getvalue(35,5);             // 5:    rx[4:0]
                bc6h_format.ry = header.getvalue(65,5);             // 5:    ry[4:0]
                bc6h_format.rz = header.getvalue(71,5);             // 5:    rz[4:0]
                bc6h_format.gw = header.getvalue(15,10);            // 10:   gw[9:0]
                bc6h_format.gx = header.getvalue(45,5);             // 5:    gx[4:0]
                bc6h_format.gy = header.getvalue(41,4) |            // 5:    gy[3:0]
                                (header.getvalue(2,1) << 4);        //       gy[4]
                bc6h_format.gz = header.getvalue(51,4) |            // 5:    gz[3:0]
                                (header.getvalue(40,1) << 4);       //       gz[4]
                bc6h_format.bw = header.getvalue(25,10);            // 10:   bw[9:0]
                bc6h_format.bx = header.getvalue(55,5);             // 5:    bx[4:0]
                bc6h_format.by = header.getvalue(61,4) |            // 5:    by[3:0]
                                (header.getvalue(3,1) << 4);        //       by[4]
                bc6h_format.bz = header.getvalue(50,1) |            // 5:    bz[0]
                                (header.getvalue(60,1) << 1) |      //       bz[1]
                                (header.getvalue(70,1) << 2) |      //       bz[2]
                                (header.getvalue(76,1) << 3) |      //       bz[3]
                                (header.getvalue(4 ,1) << 4);       //       bz[4]
                break;
    case 0x01:
                bc6h_format.m_mode          = 2;    // 7:6:6:6
                bc6h_format.wBits           = 7;
                bc6h_format.tBits[C_RED]    = 6;
                bc6h_format.tBits[C_GREEN]  = 6;
                bc6h_format.tBits[C_BLUE]   = 6;
                bc6h_format.rw = header.getvalue(5,7);               // 7:    rw[6:0]
                bc6h_format.rx = header.getvalue(35,6);              // 6:    rx[5:0]
                bc6h_format.ry = header.getvalue(65,6);              // 6:    ry[5:0]
                bc6h_format.rz = header.getvalue(71,6);              // 6:    rz[5:0]
                bc6h_format.gw = header.getvalue(15,7);              // 7:    gw[6:0]
                bc6h_format.gx = header.getvalue(45,6);              // 6:    gx[5:0]
                bc6h_format.gy = header.getvalue(41,4)    |          // 6:    gy[3:0]
                                (header.getvalue(24,1) << 4) |       //       gy[4]
                                (header.getvalue(2,1)   << 5);       //       gy[5]
                bc6h_format.gz = header.getvalue(51,4)    |          // 6:    gz[3:0]
                                (header.getvalue(3,1) << 4) |        //       gz[4]
                                (header.getvalue(4,1) << 5);         //       gz[5]
                bc6h_format.bw = header.getvalue(25,7);              // 7:    bw[6:0]
                bc6h_format.bx = header.getvalue(55,6);              // 6:    bx[5:0]
                bc6h_format.by = header.getvalue(61,4)    |          // 6:    by[3:0]
                                (header.getvalue(14,1) << 4) |       //       by[4]
                                (header.getvalue(22,1) << 5);        //       by[5]
                bc6h_format.bz = header.getvalue(12,1)    |          // 6:    bz[0]
                                (header.getvalue(13,1) << 1) |       //       bz[1]
                                (header.getvalue(23,1) << 2) |       //       bz[2]
                                (header.getvalue(32,1) << 3) |       //       bz[3]
                                (header.getvalue(34,1) << 4) |       //       bz[4]
                                (header.getvalue(33,1) << 5);        //       bz[5]
                break;
    case 0x02:
                bc6h_format.m_mode          = 3;  // 11:5:4:4
                bc6h_format.wBits           = 11;
                bc6h_format.tBits[C_RED]    = 5;
                bc6h_format.tBits[C_GREEN]  = 4;
                bc6h_format.tBits[C_BLUE]   = 4;
                bc6h_format.rw = header.getvalue(5,10)  |            //11:    rw[9:0]
                                (header.getvalue(40,1) << 10);       //       rw[10]
                bc6h_format.rx = header.getvalue(35,5);              // 5:    rx[4:0]
                bc6h_format.ry = header.getvalue(65,5);              // 5:    ry[4:0]
                bc6h_format.rz = header.getvalue(71,5);              // 5:    rz[4:0]
                bc6h_format.gw = header.getvalue(15,10) |            //11:    gw[9:0]
                                (header.getvalue(49,1) << 10);       //       gw[10]
                bc6h_format.gx = header.getvalue(45,4);              //4:     gx[3:0]
                bc6h_format.gy = header.getvalue(41,4);              //4:     gy[3:0]
                bc6h_format.gz = header.getvalue(51,4);              //4:     gz[3:0]
                bc6h_format.bw = header.getvalue(25,10) |            //11:    bw[9:0]
                                (header.getvalue(59,1) << 10);       //       bw[10]
                bc6h_format.bx = header.getvalue(55,4);              //4:     bx[3:0]
                bc6h_format.by = header.getvalue(61,4);              //4:     by[3:0]
                bc6h_format.bz = header.getvalue(50,1) |             //4:     bz[0]
                                (header.getvalue(60,1) << 1) |       //       bz[1]
                                (header.getvalue(70,1) << 2) |       //       bz[2]
                                (header.getvalue(76,1) << 3);        //       bz[3]
                break;
    case 0x06:
                bc6h_format.m_mode          = 4;  // 11:4:5:4
                bc6h_format.wBits           = 11;
                bc6h_format.tBits[C_RED]    = 4;
                bc6h_format.tBits[C_GREEN]  = 5;
                bc6h_format.tBits[C_BLUE]   = 4;
                bc6h_format.rw = header.getvalue(5,10)  |             //11:   rw[9:0]
                                (header.getvalue(39,1) << 10);        //      rw[10]
                bc6h_format.rx = header.getvalue(35,4);               //4:    rx[3:0]
                bc6h_format.ry = header.getvalue(65,4);               //4:    ry[3:0]
                bc6h_format.rz = header.getvalue(71,4);               //4:    rz[3:0]
                bc6h_format.gw = header.getvalue(15,10) |             //11:   gw[9:0]
                                (header.getvalue(50,1) << 10);        //      gw[10]
                bc6h_format.gx = header.getvalue(45,5);               //5:    gx[4:0]
                bc6h_format.gy = header.getvalue(41,4) |              //5:    gy[3:0]
                                (header.getvalue(75,1) << 4);         //      gy[4]
                bc6h_format.gz = header.getvalue(51,4) |              //5:    gz[3:0]
                                (header.getvalue(40,1) << 4);         //      gz[4]
                bc6h_format.bw = header.getvalue(25,10) |             //11:   bw[9:0]
                                (header.getvalue(59,1) << 10);        //      bw[10]
                bc6h_format.bx = header.getvalue(55,4);               //4:    bx[3:0]
                bc6h_format.by = header.getvalue(61,4);               //4:    by[3:0]
                bc6h_format.bz = header.getvalue(69,1) |              //4:    bz[0]
                                (header.getvalue(60,1) << 1) |        //      bz[1]
                                (header.getvalue(70,1) << 2) |        //      bz[2]
                                (header.getvalue(76,1) << 3);         //      bz[3]
                break;
    case 0x0A:
                bc6h_format.m_mode          = 5; // 11:4:4:5
                bc6h_format.wBits           = 11;
                bc6h_format.tBits[C_RED]    = 4;
                bc6h_format.tBits[C_GREEN]  = 4;
                bc6h_format.tBits[C_BLUE]   = 5;
                bc6h_format.rw = header.getvalue(5,10)  |             //11:   rw[9:0]
                                (header.getvalue(39,1) << 10);        //      rw[10]
                bc6h_format.rx = header.getvalue(35,4);               //4:    rx[3:0]
                bc6h_format.ry = header.getvalue(65,4);               //4:    ry[3:0]
                bc6h_format.rz = header.getvalue(71,4);               //4:    rz[3:0]
                bc6h_format.gw = header.getvalue(15,10) |             //11:   gw[9:0]
                                (header.getvalue(49,1) << 10);        //      gw[10]
                bc6h_format.gx = header.getvalue(45,4);               //4:    gx[3:0]
                bc6h_format.gy = header.getvalue(41,4);               //4:    gy[3:0]
                bc6h_format.gz = header.getvalue(51,4);               //4:    gz[3:0]
                bc6h_format.bw = header.getvalue(25,10) |             //11:   bw[9:0]
                                (header.getvalue(60,1) << 10);        //      bw[10]
                bc6h_format.bx = header.getvalue(55,5);               //5:    bx[4:0]
                // by is 5 bits wide in this mode: the top bit sits at bit 40 and
                // has to be OR-ed into the low nibble read from bits 61..64.
                bc6h_format.by = header.getvalue(61,4) |              //5:    by[3:0]
                                (header.getvalue(40,1) << 4);         //      by[4]
                bc6h_format.bz = header.getvalue(50,1) |              //5:    bz[0]
                                (header.getvalue(69,1) << 1) |        //      bz[1]
                                (header.getvalue(70,1) << 2) |        //      bz[2]
                                (header.getvalue(76,1) << 3) |        //      bz[3]
                                (header.getvalue(75,1) << 4);         //      bz[4]
                break;
    case 0x0E:
                bc6h_format.m_mode          = 6;  // 9:5:5:5
                bc6h_format.wBits           = 9;
                bc6h_format.tBits[C_RED]    = 5;
                bc6h_format.tBits[C_GREEN]  = 5;
                bc6h_format.tBits[C_BLUE]   = 5;
                bc6h_format.rw = header.getvalue(5,9);                 //9:   rw[8:0]
                bc6h_format.gw = header.getvalue(15,9);                //9:   gw[8:0]
                bc6h_format.bw = header.getvalue(25,9);                //9:   bw[8:0]
                bc6h_format.rx = header.getvalue(35,5);                //5:   rx[4:0]
                bc6h_format.gx = header.getvalue(45,5);                //5:   gx[4:0]
                bc6h_format.bx = header.getvalue(55,5);                //5:   bx[4:0]
                bc6h_format.ry = header.getvalue(65,5);                //5:   ry[4:0]
                bc6h_format.gy = header.getvalue(41,4) |               //5:   gy[3:0]
                                (header.getvalue(24,1) << 4);          //     gy[4]
                bc6h_format.by = header.getvalue(61,4) |               //5:   by[3:0]
                                (header.getvalue(14,1) << 4);          //     by[4]
                bc6h_format.rz = header.getvalue(71,5);                //5:   rz[4:0]
                bc6h_format.gz = header.getvalue(51,4) |               //5:   gz[3:0]
                                (header.getvalue(40,1) << 4);          //     gz[4]
                bc6h_format.bz = header.getvalue(50,1) |               //5:   bz[0]
                                (header.getvalue(60,1) << 1) |         //     bz[1]
                                (header.getvalue(70,1) << 2) |         //     bz[2]
                                (header.getvalue(76,1) << 3) |         //     bz[3]
                                (header.getvalue(34,1) << 4);          //     bz[4]
                break;
    case 0x12:
                bc6h_format.m_mode          = 7;  // 8:6:5:5
                bc6h_format.wBits           = 8;
                bc6h_format.tBits[C_RED]    = 6;
                bc6h_format.tBits[C_GREEN]  = 5;
                bc6h_format.tBits[C_BLUE]   = 5;
                bc6h_format.rw = header.getvalue(5,8);                 //8:    rw[7:0]
                bc6h_format.gw = header.getvalue(15,8);                //8:    gw[7:0]
                bc6h_format.bw = header.getvalue(25,8);                //8:    bw[7:0]
                bc6h_format.rx = header.getvalue(35,6);                //6:    rx[5:0]
                bc6h_format.gx = header.getvalue(45,5);                //5:    gx[4:0]
                bc6h_format.bx = header.getvalue(55,5);                //5:    bx[4:0]
                bc6h_format.ry = header.getvalue(65,6);                //6:    ry[5:0]
                bc6h_format.gy = header.getvalue(41,4) |               //5:    gy[3:0]
                                (header.getvalue(24,1) << 4);          //      gy[4]
                bc6h_format.by = header.getvalue(61,4) |               //5:    by[3:0]
                                (header.getvalue(14,1) << 4);          //      by[4]
                bc6h_format.rz = header.getvalue(71,6);                //6:    rz[5:0]
                bc6h_format.gz = header.getvalue(51,4) |               //5:    gz[3:0]
                                (header.getvalue(13,1) << 4);          //      gz[4]
                bc6h_format.bz = header.getvalue(50,1) |               //5:    bz[0]
                                (header.getvalue(60,1) << 1) |         //      bz[1]
                                (header.getvalue(23,1) << 2) |         //      bz[2]
                                (header.getvalue(33,1) << 3) |         //      bz[3]
                                (header.getvalue(34,1) << 4);          //      bz[4]
                break;
    case 0x16:
                bc6h_format.m_mode          = 8;  // 8:5:6:5
                bc6h_format.wBits           = 8;
                bc6h_format.tBits[C_RED]    = 5;
                bc6h_format.tBits[C_GREEN]  = 6;
                bc6h_format.tBits[C_BLUE]   = 5;
                bc6h_format.rw = header.getvalue(5,8);                 //8:    rw[7:0]
                bc6h_format.gw = header.getvalue(15,8);                //8:    gw[7:0]
                bc6h_format.bw = header.getvalue(25,8);                //8:    bw[7:0]
                bc6h_format.rx = header.getvalue(35,5);                //5:    rx[4:0]
                bc6h_format.gx = header.getvalue(45,6);                //6:    gx[5:0]
                bc6h_format.bx = header.getvalue(55,5);                //5:    bx[4:0]
                bc6h_format.ry = header.getvalue(65,5);                //5:    ry[4:0]
                bc6h_format.gy = header.getvalue(41,4) |               //6:    gy[3:0]
                                (header.getvalue(24,1) << 4) |         //      gy[4]
                                (header.getvalue(23,1) << 5);          //      gy[5]
                bc6h_format.by = header.getvalue(61,4) |               //5:    by[3:0]
                                (header.getvalue(14,1) << 4);          //      by[4]
                bc6h_format.rz = header.getvalue(71,5);                //5:    rz[4:0]
                bc6h_format.gz = header.getvalue(51,4) |               //6:    gz[3:0]
                                (header.getvalue(40,1) << 4) |         //      gz[4]
                                (header.getvalue(33,1) << 5);          //      gz[5]
                bc6h_format.bz = header.getvalue(13,1) |               //5:    bz[0]
                                (header.getvalue(60,1) << 1) |         //      bz[1]
                                (header.getvalue(70,1) << 2) |         //      bz[2]
                                (header.getvalue(76,1) << 3) |         //      bz[3]
                                (header.getvalue(34,1) << 4);          //      bz[4]
                break;
    case 0x1A:
                bc6h_format.m_mode          = 9;  // 8:5:5:6
                bc6h_format.wBits           = 8;
                bc6h_format.tBits[C_RED]    = 5;
                bc6h_format.tBits[C_GREEN]  = 5;
                bc6h_format.tBits[C_BLUE]   = 6;
                bc6h_format.rw = header.getvalue(5,8);                 //8:    rw[7:0]
                bc6h_format.gw = header.getvalue(15,8);                //8:    gw[7:0]
                bc6h_format.bw = header.getvalue(25,8);                //8:    bw[7:0]
                bc6h_format.rx = header.getvalue(35,5);                //5:    rx[4:0]
                bc6h_format.gx = header.getvalue(45,5);                //5:    gx[4:0]
                bc6h_format.bx = header.getvalue(55,6);                //6:    bx[5:0]
                bc6h_format.ry = header.getvalue(65,5);                //5:    ry[4:0]
                bc6h_format.gy = header.getvalue(41,4) |               //5:    gy[3:0]
                                (header.getvalue(24,1) << 4);          //      gy[4]
                bc6h_format.by = header.getvalue(61,4)    |            //6:    by[3:0]
                                (header.getvalue(14,1) << 4) |         //      by[4]
                                (header.getvalue(23,1) << 5);          //      by[5]
                bc6h_format.rz = header.getvalue(71,5);                //5:    rz[4:0]
                bc6h_format.gz = header.getvalue(51,4) |               //5:    gz[3:0]
                                (header.getvalue(40,1) << 4);          //      gz[4]
                bc6h_format.bz = header.getvalue(50,1) |               //6:    bz[0]
                                (header.getvalue(13,1) << 1) |         //      bz[1]
                                (header.getvalue(70,1) << 2) |         //      bz[2]
                                (header.getvalue(76,1) << 3) |         //      bz[3]
                                (header.getvalue(34,1) << 4) |         //      bz[4]
                                (header.getvalue(33,1) << 5);          //      bz[5]
                break;
    case 0x1E:
                bc6h_format.m_mode          = 10;  // 6:6:6:6
                bc6h_format.istransformed   = FALSE;
                bc6h_format.wBits           = 6;
                bc6h_format.tBits[C_RED]    = 6;
                bc6h_format.tBits[C_GREEN]  = 6;
                bc6h_format.tBits[C_BLUE]   = 6;
                bc6h_format.rw = header.getvalue(5,6);                 //6:    rw[5:0]
                bc6h_format.gw = header.getvalue(15,6);                //6:    gw[5:0]
                bc6h_format.bw = header.getvalue(25,6);                //6:    bw[5:0]
                bc6h_format.rx = header.getvalue(35,6);                //6:    rx[5:0]
                bc6h_format.gx = header.getvalue(45,6);                //6:    gx[5:0]
                bc6h_format.bx = header.getvalue(55,6);                //6:    bx[5:0]
                bc6h_format.ry = header.getvalue(65,6);                //6:    ry[5:0]
                bc6h_format.gy = header.getvalue(41,4) |               //6:    gy[3:0]
                                (header.getvalue(24,1) << 4) |         //      gy[4]
                                (header.getvalue(21,1) << 5);          //      gy[5]
                bc6h_format.by = header.getvalue(61,4)    |            //6:    by[3:0]
                                (header.getvalue(14,1) << 4) |         //      by[4]
                                (header.getvalue(22,1) << 5);          //      by[5]
                bc6h_format.rz = header.getvalue(71,6);                //6:    rz[5:0]
                bc6h_format.gz = header.getvalue(51,4) |               //6:    gz[3:0]
                                (header.getvalue(11,1) << 4) |         //      gz[4]
                                (header.getvalue(31,1) << 5);          //      gz[5]
                bc6h_format.bz = header.getvalue(12,1) |               //6:    bz[0]
                                (header.getvalue(13,1) << 1) |         //      bz[1]
                                (header.getvalue(23,1) << 2) |         //      bz[2]
                                (header.getvalue(32,1) << 3) |         //      bz[3]
                                (header.getvalue(34,1) << 4) |         //      bz[4]
                                (header.getvalue(33,1) << 5);          //      bz[5]
                break;

    // Single region modes
    case 0x03:
                bc6h_format.m_mode            = 11;  // 10:10
                bc6h_format.wBits             = 10;
                bc6h_format.tBits[C_RED]      = 10;
                bc6h_format.tBits[C_GREEN]    = 10;
                bc6h_format.tBits[C_BLUE]     = 10;
                bc6h_format.rw = header.getvalue(5,10);             // 10: rw[9:0]
                bc6h_format.gw = header.getvalue(15,10);            // 10: gw[9:0]
                bc6h_format.bw = header.getvalue(25,10);            // 10: bw[9:0]
                bc6h_format.rx = header.getvalue(35,10);            // 10: rx[9:0]
                bc6h_format.gx = header.getvalue(45,10);            // 10: gx[9:0]
                bc6h_format.bx = header.getvalue(55,10);            // 10: bx[9:0]
                break;
    case 0x07:
                bc6h_format.m_mode              = 12;  // 11:9
                bc6h_format.wBits               = 11;
                bc6h_format.tBits[C_RED]        = 9;
                bc6h_format.tBits[C_GREEN]      = 9;
                bc6h_format.tBits[C_BLUE]       = 9;
                bc6h_format.rw = header.getvalue(5,10) |               // 10:   rw[9:0]
                                (header.getvalue(44,1) << 10);         //       rw[10]
                bc6h_format.gw = header.getvalue(15,10) |              // 10:   gw[9:0]
                                (header.getvalue(54,1) << 10);         //       gw[10]
                bc6h_format.bw = header.getvalue(25,10) |              // 10:   bw[9:0]
                                (header.getvalue(64,1) << 10);         //       bw[10]
                bc6h_format.rx = header.getvalue(35,9);                // 9:    rx[8:0]
                bc6h_format.gx = header.getvalue(45,9);                // 9:    gx[8:0]
                bc6h_format.bx = header.getvalue(55,9);                // 9:    bx[8:0]
                break;
    case 0x0B:
                bc6h_format.m_mode              = 13;  // 12:8
                bc6h_format.wBits               = 12;
                bc6h_format.tBits[C_RED]        = 8;
                bc6h_format.tBits[C_GREEN]      = 8;
                bc6h_format.tBits[C_BLUE]       = 8;
                bc6h_format.rw = header.getvalue(5, 10) |               // 12:   rw[9:0]
                                 (header.getvalue(43, 1) << 11) |       //       rw[11]
                                 (header.getvalue(44, 1) << 10);        //       rw[10]
                bc6h_format.gw = header.getvalue(15, 10) |              // 12:   gw[9:0]
                                 (header.getvalue(53, 1) << 11) |       //       gw[11]
                                 (header.getvalue(54, 1) << 10);        //       gw[10]
                bc6h_format.bw = header.getvalue(25,10) |               // 12:   bw[9:0]
                                 (header.getvalue(63, 1) << 11) |       //       bw[11]
                                 (header.getvalue(64,1) << 10);         //       bw[10]
                bc6h_format.rx = header.getvalue(35,8);                 //  8:   rx[7:0]
                bc6h_format.gx = header.getvalue(45,8);                 //  8:   gx[7:0]
                bc6h_format.bx = header.getvalue(55,8);                 //  8:   bx[7:0]
                break;
    case 0x0F:
                bc6h_format.m_mode          = 14;  // 16:4
                bc6h_format.wBits           = 16;
                bc6h_format.tBits[C_RED]    = 4;
                bc6h_format.tBits[C_GREEN]  = 4;
                bc6h_format.tBits[C_BLUE]   = 4;
                bc6h_format.rw = header.getvalue(5,10) |                // 16:   rw[9:0]
                                 (header.getvalue(39, 1) << 15) |       //       rw[15]
                                 (header.getvalue(40, 1) << 14) |       //       rw[14]
                                 (header.getvalue(41, 1) << 13) |       //       rw[13]
                                 (header.getvalue(42, 1) << 12) |       //       rw[12]
                                 (header.getvalue(43, 1) << 11) |       //       rw[11]
                                 (header.getvalue(44, 1) << 10);        //       rw[10]
                bc6h_format.gw = header.getvalue(15,10) |               // 16:   gw[9:0]
                                 (header.getvalue(49, 1) << 15) |       //       gw[15]
                                 (header.getvalue(50, 1) << 14) |       //       gw[14]
                                 (header.getvalue(51, 1) << 13) |       //       gw[13]
                                 (header.getvalue(52, 1) << 12) |       //       gw[12]
                                 (header.getvalue(53, 1) << 11) |       //       gw[11]
                                 (header.getvalue(54, 1) << 10);        //       gw[10]
                bc6h_format.bw = header.getvalue(25,10) |               // 16:   bw[9:0]
                                 (header.getvalue(59, 1) << 15) |       //       bw[15]
                                 (header.getvalue(60, 1) << 14) |       //       bw[14]
                                 (header.getvalue(61, 1) << 13) |       //       bw[13]
                                 (header.getvalue(62, 1) << 12) |       //       bw[12]
                                 (header.getvalue(63, 1) << 11) |       //       bw[11]
                                 (header.getvalue(64, 1) << 10);        //       bw[10]
                bc6h_format.rx = header.getvalue(35,4);                 // 4:    rx[3:0]
                bc6h_format.gx = header.getvalue(45,4);                 // 4:    gx[3:0]
                bc6h_format.bx = header.getvalue(55,4);                 // 4:    bx[3:0]
                break;
    default:
                // Unrecognised mode bits: report mode 0 and leave everything else zeroed.
                bc6h_format.m_mode = 0;
                return bc6h_format;
    }

    // Each format in the mode table can be uniquely identified by the mode bits.
    // The first ten modes are used for two-region tiles, and the mode bit field
    // can be either two or five bits long. These blocks also have fields for
    // the compressed color endpoints (72 or 75 bits), the partition (5 bits),
    // and the partition indices (46 bits).

    if (bc6h_format.m_mode <= 10)
    {
        bc6h_format.region = BC6_TWO;
        // Get the shape index bits 77 to 81
        bc6h_format.d_shape_index = (unsigned short) header.getvalue(77,5);
        // Of the two-region modes only mode 10 stores untransformed endpoints.
        bc6h_format.istransformed = (bc6h_format.m_mode < 10) ? TRUE : FALSE;
    }
    else
    {
        bc6h_format.region           = BC6_ONE;
        bc6h_format.d_shape_index    = 0;
        // Of the one-region modes only mode 11 stores untransformed endpoints.
        bc6h_format.istransformed    = (bc6h_format.m_mode > 11) ? TRUE : FALSE;
    }

    // Save the points in a form easy to compute with
    bc6h_format.EC[0].A[0] = (CGU_FLOAT)bc6h_format.rw;
    bc6h_format.EC[0].B[0] = (CGU_FLOAT)bc6h_format.rx;
    bc6h_format.EC[1].A[0] = (CGU_FLOAT)bc6h_format.ry;
    bc6h_format.EC[1].B[0] = (CGU_FLOAT)bc6h_format.rz;
    bc6h_format.EC[0].A[1] = (CGU_FLOAT)bc6h_format.gw;
    bc6h_format.EC[0].B[1] = (CGU_FLOAT)bc6h_format.gx;
    bc6h_format.EC[1].A[1] = (CGU_FLOAT)bc6h_format.gy;
    bc6h_format.EC[1].B[1] = (CGU_FLOAT)bc6h_format.gz;
    bc6h_format.EC[0].A[2] = (CGU_FLOAT)bc6h_format.bw;
    bc6h_format.EC[0].B[2] = (CGU_FLOAT)bc6h_format.bx;
    bc6h_format.EC[1].A[2] = (CGU_FLOAT)bc6h_format.by;
    bc6h_format.EC[1].B[2] = (CGU_FLOAT)bc6h_format.bz;

    if (bc6h_format.region    == BC6_ONE)
    {
        // One region: the first index is stored with 3 bits, the other 15 with 4 bits.
        int startbits = ONE_REGION_INDEX_OFFSET;
        bc6h_format.indices16[0] = (CGU_UINT8) header.getvalue(startbits,3);
        startbits+=3;
        for (int i=1; i<16; i++)
        {
            bc6h_format.indices16[i] = (CGU_UINT8)header.getvalue(startbits,4);
            startbits+=4;
        }
    }
    else
    {
        // Two regions: index 0 and the index at the shape's fix-up position
        // (g_indexfixups[d_shape_index]) use 2 bits, all others use 3 bits.
        int startbit = TWO_REGION_INDEX_OFFSET,
            nbits = 2;
        bc6h_format.indices16[0 ] = (CGU_UINT8)header.getvalue(startbit,2);
        for (int i= 1; i<16; i++)
        {
            startbit += nbits; // offset start bit for next index using prior nbits used
            nbits    = g_indexfixups[bc6h_format.d_shape_index] == i?2:3; // get new number of bit to save index with
            bc6h_format.indices16[i] = (CGU_UINT8)header.getvalue(startbit,nbits);
        }

    }

    return bc6h_format;
}

// Rebuild endpoint pair 0 (E[0].A / E[0].B) from its compressed form (EC[0]).
//
// Only index 0 of E/EC is read and written here. For every channel:
//   - signed format   : A is sign-extended from wBits.
//   - unsigned format : A is copied unchanged.
//   - transformed     : B is stored as a delta of tBits[channel] bits; it is
//                       sign-extended, added to the raw A value, wrapped to
//                       wBits, and (signed format only) sign-extended again.
//   - not transformed : B is sign-extended from tBits[channel] (signed format)
//                       or copied unchanged (unsigned format).
static void extract_compressed_endpoints(AMD_BC6H_Format& bc6h_format)
{
    for (int ch = 0; ch < NCHANNELS; ch++)
    {
        if (bc6h_format.issigned)
        {
            bc6h_format.E[0].A[ch] = (CGU_FLOAT)SIGN_EXTEND(bc6h_format.EC[0].A[ch],bc6h_format.wBits);

            if (bc6h_format.istransformed)
            {
                // B is a delta relative to the raw (not yet sign-extended) A value.
                int delta = SIGN_EXTEND(bc6h_format.EC[0].B[ch], bc6h_format.tBits[ch]);
                delta = int(delta + bc6h_format.EC[0].A[ch]) & MASK(bc6h_format.wBits);
                bc6h_format.E[0].B[ch] = (CGU_FLOAT)SIGN_EXTEND(delta,bc6h_format.wBits);
            }
            else
            {
                bc6h_format.E[0].B[ch] = (CGU_FLOAT)SIGN_EXTEND(bc6h_format.EC[0].B[ch],bc6h_format.tBits[ch]);
            }
        }
        else
        {
            bc6h_format.E[0].A[ch] = bc6h_format.EC[0].A[ch];

            if (bc6h_format.istransformed)
            {
                // Same delta decoding as above, but the wrapped sum is kept unsigned.
                int delta = SIGN_EXTEND(bc6h_format.EC[0].B[ch], bc6h_format.tBits[ch]);
                bc6h_format.E[0].B[ch] = CGU_FLOAT(CGU_INT(delta + bc6h_format.EC[0].A[ch]) & MASK(bc6h_format.wBits));
            }
            else
            {
                bc6h_format.E[0].B[ch] = bc6h_format.EC[0].B[ch];
            }
        }
    }
}

// NV code: used with modifications
//
// Expand a quantized endpoint value `q` of `prec` bits to the 16-bit working
// range selected by bc6h_format.format. The multiplication by 31 is deferred
// until after interpolation (see finish_unquantize).
//
// Returns 0 when the format is neither UNSIGNED_F16 nor SIGNED_F16.
static int unquantize(AMD_BC6H_Format& bc6h_format, int q, int prec)
{
    // Since we have 16 bits available, unquantize this to 16 bits unsigned;
    // thus the scale factor is [0-7c00)/[0-10000) = 31/64
    if (bc6h_format.format == UNSIGNED_F16)
    {
        if (prec >= 15)
            return q;
        if (q == 0)
            return 0;
        if (q == ((1<<prec)-1))
            return U16MAX;
        return (q * (U16MAX+1) + (U16MAX+1)/2) >> prec;
    }

    // Here, stick with S16 (no apparent quality benefit from going to S17);
    // range is (-7c00..7c00)/(-8000..8000) = 31/32
    if (bc6h_format.format == SIGNED_F16)
    {
        // Don't remove this test even though it appears equivalent to the code
        // below: it isn't -- the code below can overflow for prec = 16.
        if (prec >= 16)
            return q;

        // Work on the magnitude and re-apply the sign at the end.
        const bool negative  = (q < 0);
        const int  magnitude = negative ? -q : q;

        if (magnitude == 0)
            return 0;
        if (magnitude >= ((1<<(prec-1))-1))
            return negative ? -S16MAX : S16MAX;

        const int scaled = (magnitude * (S16MAX+1) + (S16MAX+1)/2) >> (prec-1);
        return negative ? -scaled : scaled;
    }

    return 0;
}

// Interpolates between endpoints a and b at palette position i.
//
//   a, b  : unquantized endpoint values
//   i     : palette index, 0..denom
//   denom : number of interpolation steps; must be 3, 7 or 15
//
// The weight table is chosen from denom: 7 uses g_aWeights3, 15 uses
// g_aWeights4, and 3 is rescaled to the 15-step table by multiplying both
// denom and i by 5. The weighted sum is divided by 64 and truncated to int.
// NOTE(review): no rounding term is added before the division; the original
// comment claims the division is exact -- confirm against the format spec.
static int lerp(int a, int b, int i, int denom)
{
    assert (denom == 3 || denom == 7 || denom == 15);
    assert (i >= 0 && i <= denom);

    const int shift = 6;
    int *weights = NULL;

    if (denom == 3)
    {
        // Map the 4-entry case onto the 16-entry weight table.
        denom *= 5;
        i *= 5;
        weights = g_aWeights4;
    }
    else if (denom == 15)
    {
        weights = g_aWeights4;
    }
    else if (denom == 7)
    {
        weights = g_aWeights3;
    }
    else
    {
        assert(0);
    }

    #pragma warning(disable:4244)
    const int weighted = a*weights[denom-i] + b*weights[i];
    return weighted / float(1 << shift);
}

// Applies the final format-dependent scale to an interpolated value.
//
//   bc6h_format : block description; only its `format` field is read
//   q           : interpolated value produced from unquantized endpoints
//
// Returns q scaled by 31/64 for UNSIGNED_F16, the magnitude of q scaled by
// 31/32 (sign preserved) for SIGNED_F16, and q unchanged for any other format.
static int finish_unquantize(AMD_BC6H_Format bc6h_format, int q)
{
    switch (bc6h_format.format)
    {
    case UNSIGNED_F16:
        return (q * 31) >> 6;

    case SIGNED_F16:
        // Scale the magnitude so negative values are not affected by the
        // behaviour of a right shift on a negative operand.
        if (q < 0)
            return -(((-q) * 31) >> 5);
        return (q * 31) >> 5;

    default:
        return q;
    }
}

// Fills bc6h_format.Palete[region][0..max-1] from the decoded endpoints of
// the given region.
//
//   max         : number of palette entries to generate; max-1 is passed to
//                 lerp as the interpolation denominator
//   bc6h_format : block description; E[region] and wBits are read,
//                 Palete[region] is written
//   region      : index of the endpoint pair / palette to process
//
// Each channel's A and B endpoints are unquantized once, then every palette
// entry is interpolated between them and passed through finish_unquantize.
static void generate_palette_quantized(int max, AMD_BC6H_Format& bc6h_format, int region)
{
    const int denom = max - 1;

    // Scale the endpoints of all three channels up front.
    const int xA = unquantize(bc6h_format, bc6h_format.E[region].A[0], bc6h_format.wBits);
    const int xB = unquantize(bc6h_format, bc6h_format.E[region].B[0], bc6h_format.wBits);
    const int yA = unquantize(bc6h_format, bc6h_format.E[region].A[1], bc6h_format.wBits);
    const int yB = unquantize(bc6h_format, bc6h_format.E[region].B[1], bc6h_format.wBits);
    const int zA = unquantize(bc6h_format, bc6h_format.E[region].A[2], bc6h_format.wBits);
    const int zB = unquantize(bc6h_format, bc6h_format.E[region].B[2], bc6h_format.wBits);

    // Interpolate every palette entry for x, y and z.
    for (int entry = 0; entry < max; ++entry)
    {
        const int px = finish_unquantize(bc6h_format, lerp(xA, xB, entry, denom));
        const int py = finish_unquantize(bc6h_format, lerp(yA, yB, entry, denom));
        const int pz = finish_unquantize(bc6h_format, lerp(zA, zB, entry, denom));

        bc6h_format.Palete[region][entry].x = px;
        bc6h_format.Palete[region][entry].y = py;
        bc6h_format.Palete[region][entry].z = pz;
    }
}

// NV code : used with modifications
// Converts the coded endpoints EC[0..1] of a two-region block into the
// endpoint values E[0..1], one channel at a time.
//
// Behaviour depends on two flags of bc6h_format:
//   issigned      : results are sign-extended
//   istransformed : EC[0].B, EC[1].A and EC[1].B are treated as
//                   tBits[channel]-bit signed offsets that are added to
//                   EC[0].A and masked to wBits
// EC[0].A is always the base value (sign-extended to wBits when signed).
static void extract_compressed_endpoints2(AMD_BC6H_Format& bc6h_format)
{
    for (int ch = 0; ch < NCHANNELS; ch++)
    {
        if (bc6h_format.issigned)
        {
            bc6h_format.E[0].A[ch] = SIGN_EXTEND(bc6h_format.EC[0].A[ch],bc6h_format.wBits);

            if (bc6h_format.istransformed)
            {
                int delta;

                // Region 0, endpoint B
                delta = SIGN_EXTEND(bc6h_format.EC[0].B[ch], bc6h_format.tBits[ch]);
                delta = int(delta + bc6h_format.EC[0].A[ch]) & MASK(bc6h_format.wBits);
                bc6h_format.E[0].B[ch] = SIGN_EXTEND(delta,bc6h_format.wBits);

                // Region 1, endpoint A
                delta = SIGN_EXTEND(bc6h_format.EC[1].A[ch], bc6h_format.tBits[ch]);
                delta = int(delta + bc6h_format.EC[0].A[ch]) & MASK(bc6h_format.wBits);
                bc6h_format.E[1].A[ch] = SIGN_EXTEND(delta,bc6h_format.wBits);

                // Region 1, endpoint B
                delta = SIGN_EXTEND(bc6h_format.EC[1].B[ch], bc6h_format.tBits[ch]);
                delta = int(delta + bc6h_format.EC[0].A[ch]) & MASK(bc6h_format.wBits);
                bc6h_format.E[1].B[ch] = SIGN_EXTEND(delta,bc6h_format.wBits);
            }
            else
            {
                // Untransformed: each remaining endpoint is only sign-extended.
                bc6h_format.E[0].B[ch] = SIGN_EXTEND(bc6h_format.EC[0].B[ch],bc6h_format.tBits[ch]);
                bc6h_format.E[1].A[ch] = SIGN_EXTEND(bc6h_format.EC[1].A[ch],bc6h_format.tBits[ch]);
                bc6h_format.E[1].B[ch] = SIGN_EXTEND(bc6h_format.EC[1].B[ch],bc6h_format.tBits[ch]);
            }
        }
        else
        {
            bc6h_format.E[0].A[ch] = bc6h_format.EC[0].A[ch];

            if (bc6h_format.istransformed)
            {
                int delta;

                // Region 0, endpoint B
                delta = SIGN_EXTEND(bc6h_format.EC[0].B[ch], bc6h_format.tBits[ch]);
                bc6h_format.E[0].B[ch] = int(delta + bc6h_format.EC[0].A[ch]) & MASK(bc6h_format.wBits);

                // Region 1, endpoint A
                delta = SIGN_EXTEND(bc6h_format.EC[1].A[ch], bc6h_format.tBits[ch]);
                bc6h_format.E[1].A[ch] = int(delta + bc6h_format.EC[0].A[ch]) & MASK(bc6h_format.wBits);

                // Region 1, endpoint B
                delta = SIGN_EXTEND(bc6h_format.EC[1].B[ch], bc6h_format.tBits[ch]);
                bc6h_format.E[1].B[ch] = int(delta + bc6h_format.EC[0].A[ch]) & MASK(bc6h_format.wBits);
            }
            else
            {
                // Unsigned and untransformed: coded values are used as-is.
                bc6h_format.E[0].B[ch] = bc6h_format.EC[0].B[ch];
                bc6h_format.E[1].A[ch] = bc6h_format.EC[1].A[ch];
                bc6h_format.E[1].B[ch] = bc6h_format.EC[1].B[ch];
            }
        }
    }
}

// Decodes one 16-byte BC6H block into 16 texels of three 16-bit channels.
//
//   rgbBlock        : output, 48 values written as x,y,z per texel in
//                     row-major order (left to right, then top to bottom)
//   compressedBlock : the 16-byte encoded block
//   BC6HEncode      : currently unused by this function
//
// The block is always decoded as UNSIGNED_F16 because the signed flag below
// is hard-coded to false.
void  DecompressBC6_Internal(CGU_UINT16 rgbBlock[48], const CGU_UINT8 compressedBlock[16], const BC6H_Encode *BC6HEncode)
{
    if (BC6HEncode) {}   // parameter intentionally unused
    const CGU_BOOL m_bc6signed = false;

    // Determine the mode type and extract the coded endpoint data.
    AMD_BC6H_Format bc6h_format = extract_format(compressedBlock);
    bc6h_format.format = m_bc6signed ? SIGNED_F16 : UNSIGNED_F16;

    // Build the palette(s): 16 entries for one region, 8 per region for two.
    const bool oneRegion = (bc6h_format.region == BC6_ONE);
    if (oneRegion)
    {
        extract_compressed_endpoints(bc6h_format);
        generate_palette_quantized(16,bc6h_format,0);
    }
    else
    {
        extract_compressed_endpoints2(bc6h_format);
        generate_palette_quantized(8,bc6h_format,0);
        generate_palette_quantized(8,bc6h_format,1);
    }

    // Texel order runs top-left to bottom-right, left to right then top to
    // bottom; the same linear position indexes the partition table.
    BC6H_Vec3 data;
    for (int texel = 0; texel < 16; texel++)
    {
        const int block_row = texel / 4;
        const int block_col = texel % 4;

        // One-region blocks always use palette 0, so the partition table is
        // consulted only for two-region blocks.
        const int region = oneRegion ? 0 : BC6_PARTITIONS[bc6h_format.d_shape_index][texel];
        const int paleteIndex = bc6h_format.indices[block_row][block_col];

        data = bc6h_format.Palete[region][paleteIndex];

        rgbBlock[texel * 3 + 0] = data.x;
        rgbBlock[texel * 3 + 1] = data.y;
        rgbBlock[texel * 3 + 2] = data.z;
    }
}

//======================= END OF DECOMPRESS CODE =========================================

// Allocates a BC6H_Encode options object, initializes it with
// SetDefaultBC6Options and stores it in *options.
//
//   options : address of the caller's pointer that receives the new object;
//             release it with DestroyOptionsBC6
//
// Returns CGU_CORE_OK on success, CGU_CORE_ERR_INVALIDPTR when `options`
// itself is null, and CGU_CORE_ERR_NEWMEM when the allocation yields null.
int CMP_CDECL CreateOptionsBC6(void **options)
{
    // Validate the out-parameter before writing through it.
    if (!options) return CGU_CORE_ERR_INVALIDPTR;

    BC6H_Encode *encodeOptions = new BC6H_Encode;
    (*options) = encodeOptions;

    // Check the allocation result, not the address of the caller's pointer.
    // Plain `new` normally throws on failure; this check covers builds where
    // it can return null instead.
    if (!encodeOptions) return CGU_CORE_ERR_NEWMEM;

    // Initialize the new object itself; casting `options` (a void**) here
    // would overwrite the caller's pointer variable instead.
    SetDefaultBC6Options(encodeOptions);
    return CGU_CORE_OK;
}

// Deletes an options object passed as an opaque pointer.
//
//   options : pointer treated as a BC6H_Encode*; must have been allocated
//             with `new BC6H_Encode`
//
// Returns CGU_CORE_ERR_INVALIDPTR for a null pointer, otherwise CGU_CORE_OK.
int CMP_CDECL DestroyOptionsBC6(void *options)
{
    if (options == NULL)
        return CGU_CORE_ERR_INVALIDPTR;

    delete static_cast<BC6H_Encode *>(options);
    return CGU_CORE_OK;
}

// Stores the encoder quality and derives the partition search size from it.
//
//   options  : pointer treated as a BC6H_Encode*
//   fquality : requested quality; values below 0 are stored as 0 and values
//              above 1 are stored as 1
//
// m_partitionSearchSize is set to (quality * 2) / qFAST_THRESHOLD and is
// never allowed to fall below 1/16.
// Returns CGU_CORE_ERR_INVALIDPTR for a null pointer, otherwise CGU_CORE_OK.
int CMP_CDECL SetQualityBC6(void *options, CGU_FLOAT fquality)
{
    if (!options) return CGU_CORE_ERR_INVALIDPTR;

    BC6H_Encode *encoder = (BC6H_Encode *)options;

    // Clamp the requested quality to [0, 1].
    encoder->m_quality = (fquality < 0.0f) ? 0.0f
                       : (fquality > 1.0f) ? 1.0f
                       : fquality;

    // Derive the search size and enforce its lower bound.
    encoder->m_partitionSearchSize = (encoder->m_quality*2.0F) / qFAST_THRESHOLD;
    if (encoder->m_partitionSearchSize < (1.0F / 16.0F))
    {
        encoder->m_partitionSearchSize = (1.0F / 16.0F);
    }

    return CGU_CORE_OK;
}

// Stores `mask` in the m_validModeMask field of the options object.
//
//   options : pointer treated as a BC6H_Encode*
//   mask    : value copied verbatim into m_validModeMask
//
// Returns CGU_CORE_ERR_INVALIDPTR for a null pointer, otherwise CGU_CORE_OK.
int CMP_CDECL SetMaskBC6(void *options, CGU_UINT32 mask)
{
    if (options == NULL)
        return CGU_CORE_ERR_INVALIDPTR;

    static_cast<BC6H_Encode *>(options)->m_validModeMask = mask;
    return CGU_CORE_OK;
}

// Encodes one 4x4 block of three-channel 16-bit texels into a BC6H block.
//
//   srcBlock          : source texels, three CGU_UINT16 values per texel;
//                       12 consecutive values are read from each of 4 rows
//   srcStrideInShorts : distance between the starts of consecutive rows,
//                       in CGU_UINT16 elements
//   cmpBlock          : destination for the encoded block
//   options           : optional BC6H_Encode settings; when null, a local
//                       object initialized by SetDefaultBC6Options is used
//
// Always returns CGU_CORE_OK.
int CMP_CDECL CompressBlockBC6(const CGU_UINT16 *srcBlock,
                               unsigned int srcStrideInShorts,
                               CMP_GLOBAL CGU_UINT8 cmpBlock[16],
                               const CMP_GLOBAL void *options = NULL)
{
    // Resolve the encoder settings, falling back to defaults.
    BC6H_Encode  defaultSettings;
    BC6H_Encode *settings = (BC6H_Encode *)options;
    if (settings == NULL)
    {
        settings = &defaultSettings;
        SetDefaultBC6Options(settings);
    }

    // Start from a zeroed working state.
    BC6H_Encode_local blockState;
    memset((CGU_UINT8 *)&blockState, 0, sizeof(BC6H_Encode_local));

    //----------------------------------
    // Fill the working block with source data, row by row
    //----------------------------------
    for (CGU_INT row = 0; row < 4; row++)
    {
        CGU_INT srcpos = row * srcStrideInShorts;
        for (CGU_INT col = 0; col < 4; col++)
        {
            const CGU_INT texel = row * 4 + col;
            blockState.din[texel][0] = CGU_UINT16(srcBlock[srcpos++]);  // R
            blockState.din[texel][1] = CGU_UINT16(srcBlock[srcpos++]);  // G
            blockState.din[texel][2] = CGU_UINT16(srcBlock[srcpos++]);  // B
            blockState.din[texel][3] = 0;                               // A
        }
    }

    CompressBlockBC6_Internal(cmpBlock, 0, &blockState, settings);

    return CGU_CORE_OK;
}

// Decodes one 16-byte BC6H block into 48 CGU_UINT16 values.
//
//   cmpBlock : the encoded block
//   srcBlock : receives the decoded values (despite its name, this is the
//              output buffer)
//   options  : optional BC6H_Encode settings; when null, a local object
//              initialized by SetDefaultBC6Options is passed on instead
//
// Always returns CGU_CORE_OK.
int  CMP_CDECL DecompressBlockBC6(const unsigned char cmpBlock[16],
                            CGU_UINT16 srcBlock[48],
                            const void *options = NULL) {
    BC6H_Encode  fallbackSettings;
    BC6H_Encode *settings = (BC6H_Encode *)options;

    if (!settings)
    {
        SetDefaultBC6Options(&fallbackSettings);
        settings = &fallbackSettings;
    }

    DecompressBC6_Internal(srcBlock, cmpBlock, settings);
    return CGU_CORE_OK;
}

#endif // !ASPM
#endif // !ASPM_GPU

//============================================== OpenCL USER INTERFACE ====================================================
#ifdef ASPM_OPENCL
// OpenCL kernel entry point: encodes the block addressed by this work-item's
// global ids (dimension 0 = x, dimension 1 = y).
//
//   p_source_pixels  : source data, read one CGU_UINT8 element at a time
//   p_encoded_blocks : destination buffer for encoded blocks
//   SourceInfo       : supplies m_src_width and m_src_height
//   BC6HEncode       : encoder settings; m_isSigned selects how small/NaN
//                      channel values are replaced
//
// NOTE(review): each channel is read as a single CGU_UINT8 and cast to
// CGU_UINT16 before the isnan / threshold tests -- confirm this matches the
// intended source layout.
CMP_STATIC CMP_KERNEL void CMP_GPUEncoder(
    CMP_GLOBAL  CGU_UINT8*          p_source_pixels,
    CMP_GLOBAL  CGU_UINT8*          p_encoded_blocks,
    CMP_GLOBAL  Source_Info*        SourceInfo,
    CMP_GLOBAL  BC6H_Encode *       BC6HEncode
)
{
    CGU_UINT32 x = get_global_id(0);
    CGU_UINT32 y = get_global_id(1);

    // Work-items outside the image do nothing.
    if (x >= (SourceInfo->m_src_width / BYTEPP)) return;
    if (y >= (SourceInfo->m_src_height / BYTEPP)) return;

    BC6H_Encode_local BC6HEncode_local;
    memset((CGU_UINT8 *)&BC6HEncode_local, 0, sizeof(BC6H_Encode_local));

    CGU_UINT32 stride = SourceInfo->m_src_width * BYTEPP;
    CGU_UINT32 srcOffset = (x*BlockX*BYTEPP) + (y*stride*BYTEPP);
    CGU_UINT32 destI = (x*COMPRESSED_BLOCK_SIZE) + (y*(SourceInfo->m_src_width / BlockX)*COMPRESSED_BLOCK_SIZE);

    for (CGU_INT row = 0; row < BlockX; row++)
    {
        CGU_UINT32 srcidx = row * stride;
        for (CGU_INT col = 0; col < BlockY; col++)
        {
            const CGU_INT texel = row*BlockX + col;

            // The same load-and-sanitize step is applied to R, G and B.
            for (CGU_INT ch = 0; ch < 3; ch++)
            {
                BC6HEncode_local.din[texel][ch] = (CGU_UINT16)(p_source_pixels[srcOffset + srcidx++]);

                if (BC6HEncode_local.din[texel][ch] < 0.00001 || isnan(BC6HEncode_local.din[texel][ch]))
                {
                    if (BC6HEncode->m_isSigned)
                    {
                        // NaN becomes the negative limit; anything else is
                        // negated and then clamped to that limit.
                        BC6HEncode_local.din[texel][ch] = (isnan(BC6HEncode_local.din[texel][ch])) ? F16NEGPREC_LIMIT_VAL : -BC6HEncode_local.din[texel][ch];
                        if (BC6HEncode_local.din[texel][ch] < F16NEGPREC_LIMIT_VAL) {
                            BC6HEncode_local.din[texel][ch] = F16NEGPREC_LIMIT_VAL;
                        }
                    }
                    else
                    {
                        BC6HEncode_local.din[texel][ch] = 0.0;
                    }
                }
            }

            // The fourth channel is unused and always zero.
            BC6HEncode_local.din[texel][3] = 0.0f;
        }
    }

    CompressBlockBC6_Internal(p_encoded_blocks, destI, &BC6HEncode_local, BC6HEncode);
}
#endif