From 9a16bebf8f87b8cbf19c0e6fec6faa73e108b09a Mon Sep 17 00:00:00 2001 From: Ignacio Date: Mon, 23 Mar 2020 10:07:38 -0700 Subject: [PATCH] Add external libs for comparisons and benchmarks. --- extern/CMP_Core/CMP_Core.def | 56 + extern/CMP_Core/CMakeLists.txt | 33 + extern/CMP_Core/shaders/BC1_Encode_kernel.cpp | 582 ++ extern/CMP_Core/shaders/BC1_Encode_kernel.h | 48 + extern/CMP_Core/shaders/BC2_Encode_kernel.cpp | 261 + extern/CMP_Core/shaders/BC2_Encode_kernel.h | 34 + extern/CMP_Core/shaders/BC3_Encode_kernel.cpp | 218 + extern/CMP_Core/shaders/BC3_Encode_kernel.h | 31 + extern/CMP_Core/shaders/BC4_Encode_kernel.cpp | 200 + extern/CMP_Core/shaders/BC4_Encode_kernel.h | 31 + extern/CMP_Core/shaders/BC5_Encode_kernel.cpp | 264 + extern/CMP_Core/shaders/BC5_Encode_kernel.h | 31 + extern/CMP_Core/shaders/BC6_Encode_kernel.cpp | 3990 ++++++++++++ extern/CMP_Core/shaders/BC6_Encode_kernel.h | 480 ++ extern/CMP_Core/shaders/BC7_Encode_Kernel.cpp | 5489 +++++++++++++++++ extern/CMP_Core/shaders/BC7_Encode_Kernel.h | 1580 +++++ extern/CMP_Core/shaders/BCn_Common_Kernel.h | 2360 +++++++ extern/CMP_Core/shaders/Common_Def.h | 300 + extern/CMP_Core/shaders/CopyFiles.bat | 50 + extern/CMP_Core/source/CMP_Core.h | 153 + extern/CMP_Core/source/cmp_math_vec4.h | 417 ++ extern/CMP_Core/test/BlockConstants.h | 228 + extern/CMP_Core/test/CMakeLists.txt | 13 + extern/CMP_Core/test/CompressonatorTests.cpp | 1143 ++++ extern/CMP_Core/test/CompressonatorTests.h | 6 + extern/CMP_Core/test/TestsMain.cpp | 10 + extern/CMakeLists.txt | 6 +- extern/libsquish-1.15/CMakeLists.txt | 117 + .../CMakeModules/FindlibSquish.cmake | 14 + extern/libsquish-1.15/ChangeLog.txt | 66 + extern/libsquish-1.15/Doxyfile | 214 + extern/libsquish-1.15/LICENSE.txt | 20 + extern/libsquish-1.15/Makefile | 65 + extern/libsquish-1.15/README.txt | 18 + extern/libsquish-1.15/alpha.cpp | 350 ++ extern/libsquish-1.15/alpha.h | 41 + extern/libsquish-1.15/clusterfit.cpp | 392 ++ extern/libsquish-1.15/clusterfit.h | 61 
+ extern/libsquish-1.15/colourblock.cpp | 214 + extern/libsquish-1.15/colourblock.h | 41 + extern/libsquish-1.15/colourfit.cpp | 54 + extern/libsquish-1.15/colourfit.h | 56 + extern/libsquish-1.15/colourset.cpp | 121 + extern/libsquish-1.15/colourset.h | 58 + extern/libsquish-1.15/config | 38 + extern/libsquish-1.15/config.h | 49 + extern/libsquish-1.15/extra/squishgen.cpp | 151 + extern/libsquish-1.15/extra/squishpng.cpp | 546 ++ extern/libsquish-1.15/extra/squishtest.cpp | 206 + extern/libsquish-1.15/libSquish.png | Bin 0 -> 17907 bytes extern/libsquish-1.15/libSquish.pri | 26 + extern/libsquish-1.15/libSquish.pro | 32 + extern/libsquish-1.15/libSquish.svg | 238 + extern/libsquish-1.15/libsquish.pc.in | 13 + extern/libsquish-1.15/maths.cpp | 259 + extern/libsquish-1.15/maths.h | 233 + extern/libsquish-1.15/rangefit.cpp | 201 + extern/libsquish-1.15/rangefit.h | 54 + extern/libsquish-1.15/simd.h | 40 + extern/libsquish-1.15/simd_float.h | 183 + extern/libsquish-1.15/simd_sse.h | 180 + extern/libsquish-1.15/simd_ve.h | 166 + extern/libsquish-1.15/singlecolourfit.cpp | 172 + extern/libsquish-1.15/singlecolourfit.h | 58 + extern/libsquish-1.15/singlecolourlookup.inl | 1064 ++++ extern/libsquish-1.15/squish.cpp | 403 ++ src/nvtt/tests/CMakeLists.txt | 3 + 67 files changed, 24230 insertions(+), 1 deletion(-) create mode 100644 extern/CMP_Core/CMP_Core.def create mode 100644 extern/CMP_Core/CMakeLists.txt create mode 100644 extern/CMP_Core/shaders/BC1_Encode_kernel.cpp create mode 100644 extern/CMP_Core/shaders/BC1_Encode_kernel.h create mode 100644 extern/CMP_Core/shaders/BC2_Encode_kernel.cpp create mode 100644 extern/CMP_Core/shaders/BC2_Encode_kernel.h create mode 100644 extern/CMP_Core/shaders/BC3_Encode_kernel.cpp create mode 100644 extern/CMP_Core/shaders/BC3_Encode_kernel.h create mode 100644 extern/CMP_Core/shaders/BC4_Encode_kernel.cpp create mode 100644 extern/CMP_Core/shaders/BC4_Encode_kernel.h create mode 100644 
extern/CMP_Core/shaders/BC5_Encode_kernel.cpp create mode 100644 extern/CMP_Core/shaders/BC5_Encode_kernel.h create mode 100644 extern/CMP_Core/shaders/BC6_Encode_kernel.cpp create mode 100644 extern/CMP_Core/shaders/BC6_Encode_kernel.h create mode 100644 extern/CMP_Core/shaders/BC7_Encode_Kernel.cpp create mode 100644 extern/CMP_Core/shaders/BC7_Encode_Kernel.h create mode 100644 extern/CMP_Core/shaders/BCn_Common_Kernel.h create mode 100644 extern/CMP_Core/shaders/Common_Def.h create mode 100644 extern/CMP_Core/shaders/CopyFiles.bat create mode 100644 extern/CMP_Core/source/CMP_Core.h create mode 100644 extern/CMP_Core/source/cmp_math_vec4.h create mode 100644 extern/CMP_Core/test/BlockConstants.h create mode 100644 extern/CMP_Core/test/CMakeLists.txt create mode 100644 extern/CMP_Core/test/CompressonatorTests.cpp create mode 100644 extern/CMP_Core/test/CompressonatorTests.h create mode 100644 extern/CMP_Core/test/TestsMain.cpp create mode 100644 extern/libsquish-1.15/CMakeLists.txt create mode 100644 extern/libsquish-1.15/CMakeModules/FindlibSquish.cmake create mode 100644 extern/libsquish-1.15/ChangeLog.txt create mode 100644 extern/libsquish-1.15/Doxyfile create mode 100644 extern/libsquish-1.15/LICENSE.txt create mode 100644 extern/libsquish-1.15/Makefile create mode 100644 extern/libsquish-1.15/README.txt create mode 100644 extern/libsquish-1.15/alpha.cpp create mode 100644 extern/libsquish-1.15/alpha.h create mode 100644 extern/libsquish-1.15/clusterfit.cpp create mode 100644 extern/libsquish-1.15/clusterfit.h create mode 100644 extern/libsquish-1.15/colourblock.cpp create mode 100644 extern/libsquish-1.15/colourblock.h create mode 100644 extern/libsquish-1.15/colourfit.cpp create mode 100644 extern/libsquish-1.15/colourfit.h create mode 100644 extern/libsquish-1.15/colourset.cpp create mode 100644 extern/libsquish-1.15/colourset.h create mode 100644 extern/libsquish-1.15/config create mode 100644 extern/libsquish-1.15/config.h create mode 100644 
extern/libsquish-1.15/extra/squishgen.cpp create mode 100644 extern/libsquish-1.15/extra/squishpng.cpp create mode 100644 extern/libsquish-1.15/extra/squishtest.cpp create mode 100644 extern/libsquish-1.15/libSquish.png create mode 100644 extern/libsquish-1.15/libSquish.pri create mode 100644 extern/libsquish-1.15/libSquish.pro create mode 100644 extern/libsquish-1.15/libSquish.svg create mode 100644 extern/libsquish-1.15/libsquish.pc.in create mode 100644 extern/libsquish-1.15/maths.cpp create mode 100644 extern/libsquish-1.15/maths.h create mode 100644 extern/libsquish-1.15/rangefit.cpp create mode 100644 extern/libsquish-1.15/rangefit.h create mode 100644 extern/libsquish-1.15/simd.h create mode 100644 extern/libsquish-1.15/simd_float.h create mode 100644 extern/libsquish-1.15/simd_sse.h create mode 100644 extern/libsquish-1.15/simd_ve.h create mode 100644 extern/libsquish-1.15/singlecolourfit.cpp create mode 100644 extern/libsquish-1.15/singlecolourfit.h create mode 100644 extern/libsquish-1.15/singlecolourlookup.inl create mode 100644 extern/libsquish-1.15/squish.cpp diff --git a/extern/CMP_Core/CMP_Core.def b/extern/CMP_Core/CMP_Core.def new file mode 100644 index 0000000..baa5bc1 --- /dev/null +++ b/extern/CMP_Core/CMP_Core.def @@ -0,0 +1,56 @@ +; Core def : Declares the module parameters for the DLL. 
+ +EXPORTS +CreateOptionsBC1 +CreateOptionsBC2 +CreateOptionsBC3 +CreateOptionsBC4 +CreateOptionsBC5 +CreateOptionsBC6 +CreateOptionsBC7 + +DestroyOptionsBC1 +DestroyOptionsBC2 +DestroyOptionsBC3 +DestroyOptionsBC4 +DestroyOptionsBC5 +DestroyOptionsBC6 +DestroyOptionsBC7 + +SetDecodeChannelMapping + +SetChannelWeightsBC1 +SetChannelWeightsBC2 +SetChannelWeightsBC3 + +SetQualityBC1 +SetQualityBC2 +SetQualityBC3 +SetQualityBC4 +SetQualityBC5 +SetQualityBC6 +SetQualityBC7 + +SetAlphaThresholdBC1 + +SetMaskBC6 +SetMaskBC7 + +SetErrorThresholdBC7 +SetAlphaOptionsBC7 + +CompressBlockBC1 +CompressBlockBC2 +CompressBlockBC3 +CompressBlockBC4 +CompressBlockBC5 +CompressBlockBC6 +CompressBlockBC7 + +DecompressBlockBC1 +DecompressBlockBC2 +DecompressBlockBC3 +DecompressBlockBC4 +DecompressBlockBC5 +DecompressBlockBC6 +DecompressBlockBC7 diff --git a/extern/CMP_Core/CMakeLists.txt b/extern/CMP_Core/CMakeLists.txt new file mode 100644 index 0000000..e89ea3d --- /dev/null +++ b/extern/CMP_Core/CMakeLists.txt @@ -0,0 +1,33 @@ +cmake_minimum_required(VERSION 3.10) + +add_library(CMP_Core STATIC "") + +target_sources(CMP_Core + PRIVATE + shaders/BC1_Encode_kernel.h + shaders/BC1_Encode_kernel.cpp + shaders/BC2_Encode_kernel.h + shaders/BC2_Encode_kernel.cpp + shaders/BC3_Encode_kernel.h + shaders/BC3_Encode_kernel.cpp + shaders/BC4_Encode_kernel.h + shaders/BC4_Encode_kernel.cpp + shaders/BC5_Encode_kernel.h + shaders/BC5_Encode_kernel.cpp + shaders/BC6_Encode_kernel.h + shaders/BC6_Encode_kernel.cpp + shaders/BC7_Encode_Kernel.h + shaders/BC7_Encode_Kernel.cpp + shaders/BCn_Common_Kernel.h + shaders/Common_Def.h + ) + +target_include_directories(CMP_Core + PRIVATE + shaders + source) +#add_subdirectory(test) + +if (UNIX) +target_compile_definitions(CMP_Core PRIVATE _LINUX ASPM_GPU) +endif() diff --git a/extern/CMP_Core/shaders/BC1_Encode_kernel.cpp b/extern/CMP_Core/shaders/BC1_Encode_kernel.cpp new file mode 100644 index 0000000..4c68e42 --- /dev/null +++ 
b/extern/CMP_Core/shaders/BC1_Encode_kernel.cpp @@ -0,0 +1,582 @@ +//===================================================================== +// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// +//===================================================================== +#include "BC1_Encode_kernel.h" + +//============================================== BC1 INTERFACES ======================================================= +void CompressBlockBC1_Fast( + CMP_Vec4uc srcBlockTemp[16], + CMP_GLOBAL CGU_UINT32 compressedBlock[2]) +{ + int i, k; + + CMP_Vec3f rgb; + CMP_Vec3f average_rgb; // The centrepoint of the axis + CMP_Vec3f v_rgb; // The axis + CMP_Vec3f uniques[16]; // The list of unique colours + int unique_pixels; // The number of unique pixels + CGU_FLOAT unique_recip; // Reciprocal of the above for fast multiplication + int index_map[16]; // The map of source pixels to unique indices + + CGU_FLOAT pos_on_axis[16]; // The distance each unique falls along the compression axis + CGU_FLOAT dist_from_axis[16]; // The distance each unique falls from the compression axis + CGU_FLOAT left = 0, right = 0, centre = 0; // The extremities and centre (average of left/right) of uniques along the compression axis + CGU_FLOAT axis_mapping_error = 0; // The total computed error in mapping pixels to the axis + + int swap; // Indicator if the RGB values need swapping to generate an opaque result + + // ------------------------------------------------------------------------------------- + // (3) Find the array of unique pixel values and sum them to find their average position + // ------------------------------------------------------------------------------------- + { + // Find the array of unique pixel values and sum them to find their average position + int current_pixel, firstdiff; + current_pixel = unique_pixels = 0; + average_rgb = 0.0f; + firstdiff = -1; + for (i = 0; i<16; i++) + { + for (k = 0; k 0) { rg_pos += rgb.y; rb_pos += rgb.z; } + if (rgb.z > 0) bg_pos += rgb.y; + } + v_rgb = v_rgb*unique_recip; + if (rg_pos < 0) v_rgb.x = -v_rgb.x; + if (bg_pos < 0) v_rgb.z = -v_rgb.z; + if ((rg_pos == bg_pos) && (rg_pos == 0)) + if (rb_pos < 0) v_rgb.z = -v_rgb.z; + 
} + + // ------------------------------------------------------------------------------------- + // (5) Axis projection and remapping + // ------------------------------------------------------------------------------------- + { + CGU_FLOAT v2_recip; + // Normalise the axis for simplicity of future calculation + v2_recip = (v_rgb.x*v_rgb.x + v_rgb.y*v_rgb.y + v_rgb.z*v_rgb.z); + if (v2_recip > 0) + v2_recip = 1.0f / (CGU_FLOAT)sqrt(v2_recip); + else + v2_recip = 1.0f; + v_rgb = v_rgb*v2_recip; + } + + // ------------------------------------------------------------------------------------- + // (6) Map the axis + // ------------------------------------------------------------------------------------- + // the line joining (and extended on either side of) average and axis + // defines the axis onto which the points will be projected + // Project all the points onto the axis, calculate the distance along + // the axis from the centre of the axis (average) + // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is + // P + ((R-P).v) / (v.v))v + // The distance along v is therefore (R-P).v / (v.v) + // (v.v) is 1 if v is a unit vector. 
+ // + // Calculate the extremities at the same time - these need to be reasonably accurately + // represented in all cases + // + // In this first calculation, also find the error of mapping the points to the axis - this + // is our major indicator of whether or not the block has compressed well - if the points + // map well onto the axis then most of the noise introduced is high-frequency noise + { + left = 10000.0f; + right = -10000.0f; + axis_mapping_error = 0; + for (i = 0; i < unique_pixels; i++) + { + // Compute the distance along the axis of the point of closest approach + CMP_Vec3f temp = (uniques[i] - average_rgb); + pos_on_axis[i] = (temp.x * v_rgb.x) + (temp.y * v_rgb.y) + (temp.z * v_rgb.z); + + // Compute the actual point and thence the mapping error + rgb = uniques[i] - (average_rgb + (v_rgb * pos_on_axis[i])); + dist_from_axis[i] = rgb.x*rgb.x + rgb.y*rgb.y + rgb.z*rgb.z; + axis_mapping_error += dist_from_axis[i]; + + // Work out the extremities + if (pos_on_axis[i] < left) + left = pos_on_axis[i]; + if (pos_on_axis[i] > right) + right = pos_on_axis[i]; + } + } + + // ------------------------------------------------------------------------------------- + // (7) Now we have a good axis and the basic information about how the points are mapped + // to it + // Our initial guess is to represent the endpoints accurately, by moving the average + // to the centre and recalculating the point positions along the line + // ------------------------------------------------------------------------------------- + { + centre = (left + right) / 2; + average_rgb = average_rgb + (v_rgb*centre); + for (i = 0; i> 3); + + rgb = average_rgb + (v_rgb * right); + rd = ( CGU_INT32)DCS_RED(rgb.x, rgb.y, rgb.z); + gd = ( CGU_INT32)DCS_GREEN(rgb.x, rgb.y, rgb.z); + bd = ( CGU_INT32)DCS_BLUE(rgb.x, rgb.y, rgb.z); + ROUND_AND_CLAMP(rd, 5); + ROUND_AND_CLAMP(gd, 6); + ROUND_AND_CLAMP(bd, 5); + c1 = (((rd & 0xf8) << 8) + ((gd & 0xfc) << 3) + ((bd & 0xf8) >> 3)); + + // Force to be 
a 4-colour opaque block - in which case, c0 is greater than c1 + // blocktype == 4 + { + if (c0 < c1) + { + t = c0; + c0 = c1; + c1 = t; + swap = 1; + } + else if (c0 == c1) + { + // This block will always be encoded in 3-colour mode + // Need to ensure that only one of the two points gets used, + // avoiding accidentally setting some transparent pixels into the block + for (i = 0; i average) are 0 and 1, while + // interpolants are 2 and 3 + if (fabs(rgb.z) >= division) + bit = 0; + else + bit = 2; + // Positive is in the latter half of the block + if (rgb.z >= centre) + bit += 1; + // Set the output, taking swapping into account + compressedBlock[1] |= ((bit^swap) << (2 * i)); + + // Average the X and Y locations for each cluster + cluster_x[bit] += (CGU_FLOAT)(i & 3); + cluster_y[bit] += (CGU_FLOAT)(i >> 2); + cluster_count[bit]++; + } + + for (i = 0; i<4; i++) + { + CGU_FLOAT cr; + if (cluster_count[i]) + { + cr = 1.0f / cluster_count[i]; + cluster_x[i] *= cr; + cluster_y[i] *= cr; + } + else + { + cluster_x[i] = cluster_y[i] = -1; + } + } + + // patterns in axis position detection + // (same algorithm as used in the SSE version) + if ((compressedBlock[0] & 0xffff) != (compressedBlock[0] >> 16)) + { + CGU_UINT32 i1, k1; + CGU_UINT32 x = 0, y = 0; + int xstep = 0, ystep = 0; + + // Find a corner to search from + for (k1 = 0; k1<4; k1++) + { + switch (k1) + { + case 0: + x = 0; y = 0; xstep = 1; ystep = 1; + break; + case 1: + x = 0; y = 3; xstep = 1; ystep = -1; + break; + case 2: + x = 3; y = 0; xstep = -1; ystep = 1; + break; + case 3: + x = 3; y = 3; xstep = -1; ystep = -1; + break; + } + + for (i1 = 0; i1<4; i1++) + { + if ((POS(x, y + ystep*i1) < POS(x + xstep, y + ystep*i1)) || + (POS(x + xstep, y + ystep*i1) < POS(x + 2 * xstep, y + ystep*i1)) || + (POS(x + 2 * xstep, y + ystep*i1) < POS(x + 3 * xstep, y + ystep*i1)) + ) + break; + if ((POS(x + xstep*i1, y) < POS(x + xstep*i1, y + ystep)) || + (POS(x + xstep*i1, y + ystep) < POS(x + xstep*i1, y + 2 * 
ystep)) || + (POS(x + xstep*i1, y + 2 * ystep) < POS(x + xstep*i1, y + 3 * ystep)) + ) + break; + } + if (i1 == 4) + break; + } + } + } + + } + // done +} + +INLINE void store_uint8(CMP_GLOBAL CGU_UINT8 u_dstptr[8], CGU_UINT32 data[2]) +{ + int shift = 0; + for (CGU_INT k=0; k<4; k++) + { + u_dstptr[k] = (data[0] >> shift)&0xFF; + shift += 8; + } + shift = 0; + for (CGU_INT k=4; k<8; k++) + { + u_dstptr[k] = (data[1] >> shift)&0xFF; + shift += 8; + } +} + +void CompressBlockBC1_Internal( + const CMP_Vec4uc srcBlockTemp[16], + CMP_GLOBAL CGU_UINT32 compressedBlock[2], + CMP_GLOBAL const CMP_BC15Options *BC15options) +{ + CGU_UINT8 blkindex = 0; + CGU_UINT8 srcindex = 0; + CGU_UINT8 rgbBlock[64]; + for ( CGU_INT32 j = 0; j < 4; j++) { + for ( CGU_INT32 i = 0; i < 4; i++) { + rgbBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].z; // B + rgbBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].y; // G + rgbBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].x; // R + rgbBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].w; // A + srcindex++; + } + } + + CMP_BC15Options internalOptions = *BC15options; + CalculateColourWeightings(rgbBlock, &internalOptions); + + CompressRGBBlock(rgbBlock, + compressedBlock, + &internalOptions, + TRUE, + FALSE, + internalOptions.m_nAlphaThreshold); +} + +//============================================== USER INTERFACES ======================================================== +#ifndef ASPM_GPU +int CMP_CDECL CreateOptionsBC1(void **options) +{ + CMP_BC15Options *BC15optionsDefault = new CMP_BC15Options; + if (BC15optionsDefault) { + SetDefaultBC15Options(BC15optionsDefault); + (*options) = BC15optionsDefault; + } + else { + (*options) = NULL; + return CGU_CORE_ERR_NEWMEM; + } + return CGU_CORE_OK; +} + +int CMP_CDECL DestroyOptionsBC1(void *options) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + CMP_BC15Options *BCOptions = reinterpret_cast (options); + delete BCOptions; + return CGU_CORE_OK; +} + +int CMP_CDECL 
SetQualityBC1(void *options, + CGU_FLOAT fquality) +{ + if (!options) return CGU_CORE_ERR_NEWMEM; + CMP_BC15Options *BC15optionsDefault = reinterpret_cast (options); + if (fquality < 0.0f) fquality = 0.0f; + else + if (fquality > 1.0f) fquality = 1.0f; + BC15optionsDefault->m_fquality = fquality; + return CGU_CORE_OK; +} + + +int CMP_CDECL SetAlphaThresholdBC1(void *options, + CGU_UINT8 alphaThreshold) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + CMP_BC15Options *BC15optionsDefault = reinterpret_cast (options); + BC15optionsDefault->m_nAlphaThreshold = alphaThreshold; + return CGU_CORE_OK; +} + +int CMP_CDECL SetDecodeChannelMapping(void *options, + CGU_BOOL mapRGBA) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + CMP_BC15Options *BC15optionsDefault = reinterpret_cast (options); + BC15optionsDefault->m_mapDecodeRGBA = mapRGBA; + return CGU_CORE_OK; +} + +int CMP_CDECL SetChannelWeightsBC1(void *options, + CGU_FLOAT WeightRed, + CGU_FLOAT WeightGreen, + CGU_FLOAT WeightBlue) { + if (!options) return CGU_CORE_ERR_INVALIDPTR; + CMP_BC15Options *BC15optionsDefault = (CMP_BC15Options *)options; + + if ((WeightRed < 0.0f) || (WeightRed > 1.0f)) return CGU_CORE_ERR_RANGERED; + if ((WeightGreen < 0.0f) || (WeightGreen > 1.0f)) return CGU_CORE_ERR_RANGEGREEN; + if ((WeightBlue < 0.0f) || (WeightBlue > 1.0f)) return CGU_CORE_ERR_RANGEBLUE; + + BC15optionsDefault->m_bUseChannelWeighting = true; + BC15optionsDefault->m_fChannelWeights[0] = WeightRed; + BC15optionsDefault->m_fChannelWeights[1] = WeightGreen; + BC15optionsDefault->m_fChannelWeights[2] = WeightBlue; + return CGU_CORE_OK; +} + +int CMP_CDECL CompressBlockBC1(const unsigned char *srcBlock, + unsigned int srcStrideInBytes, + CMP_GLOBAL unsigned char cmpBlock[8], + const void *options = NULL) { + CMP_Vec4uc inBlock[16]; + + //---------------------------------- + // Fill the inBlock with source data + //---------------------------------- + CGU_INT srcpos = 0; + CGU_INT dstptr = 0; + for (CGU_UINT8 row=0; 
row < 4; row++) + { + srcpos = row * srcStrideInBytes; + for (CGU_UINT8 col = 0; col < 4; col++) + { + inBlock[dstptr].x = CGU_UINT8(srcBlock[srcpos++]); + inBlock[dstptr].y = CGU_UINT8(srcBlock[srcpos++]); + inBlock[dstptr].z = CGU_UINT8(srcBlock[srcpos++]); + inBlock[dstptr].w = CGU_UINT8(srcBlock[srcpos++]); + dstptr++; + } + } + + CMP_BC15Options *BC15options = (CMP_BC15Options *)options; + CMP_BC15Options BC15optionsDefault; + if (BC15options == NULL) + { + BC15options = &BC15optionsDefault; + SetDefaultBC15Options(BC15options); + } + + CompressBlockBC1_Internal(inBlock, (CMP_GLOBAL CGU_UINT32 *)cmpBlock, BC15options); + return CGU_CORE_OK; +} + +int CMP_CDECL DecompressBlockBC1(const unsigned char cmpBlock[8], + CMP_GLOBAL unsigned char srcBlock[64], + const void *options = NULL) { + CMP_BC15Options *BC15options = (CMP_BC15Options *)options; + CMP_BC15Options BC15optionsDefault; + if (BC15options == NULL) + { + BC15options = &BC15optionsDefault; + SetDefaultBC15Options(BC15options); + } + DecompressDXTRGB_Internal(srcBlock, ( CGU_UINT32 *)cmpBlock, BC15options); + + + return CGU_CORE_OK; +} +#endif + +//============================================== OpenCL USER INTERFACE ======================================================== +#ifdef ASPM_GPU +CMP_STATIC CMP_KERNEL void CMP_GPUEncoder( + CMP_GLOBAL const CMP_Vec4uc* ImageSource, + CMP_GLOBAL CGU_UINT8* ImageDestination, + CMP_GLOBAL Source_Info* SourceInfo, + CMP_GLOBAL CMP_BC15Options* BC15options +) +{ + CGU_UINT32 xID; + CGU_UINT32 yID; + +//printf("SourceInfo: (H:%d,W:%d) Quality %1.2f \n", SourceInfo->m_src_height, SourceInfo->m_src_width, SourceInfo->m_fquality); +#ifdef ASPM_GPU + xID = get_global_id(0); + yID = get_global_id(1); +#else + xID = 0; + yID = 0; +#endif + + if (xID >= (SourceInfo->m_src_width / BlockX)) return; + if (yID >= (SourceInfo->m_src_height / BlockX)) return; + int srcWidth = SourceInfo->m_src_width; + + CGU_UINT32 destI = (xID*BC1CompBlockSize) + (yID*(srcWidth / 
BlockX)*BC1CompBlockSize); + int srcindex = 4 * (yID * srcWidth + xID); + int blkindex = 0; + CMP_Vec4uc srcData[16]; + srcWidth = srcWidth - 4; + + for ( CGU_INT32 j = 0; j < 4; j++) { + for ( CGU_INT32 i = 0; i < 4; i++) { + srcData[blkindex++] = ImageSource[srcindex++]; + } + srcindex += srcWidth; + } + + // fast low quality mode that matches v3.1 code + if (SourceInfo->m_fquality <= 0.04f) + CompressBlockBC1_Fast(srcData, (CMP_GLOBAL CGU_UINT32 *)&ImageDestination[destI]); + else + CompressBlockBC1_Internal(srcData, (CMP_GLOBAL CGU_UINT32 *)&ImageDestination[destI], BC15options); +} +#endif diff --git a/extern/CMP_Core/shaders/BC1_Encode_kernel.h b/extern/CMP_Core/shaders/BC1_Encode_kernel.h new file mode 100644 index 0000000..73a0acf --- /dev/null +++ b/extern/CMP_Core/shaders/BC1_Encode_kernel.h @@ -0,0 +1,48 @@ +//===================================================================== +// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +//===================================================================== +#ifndef BC1_ENCODE_KERNEL_H +#define BC1_ENCODE_KERNEL_H + +#include "Common_Def.h" +#include "BCn_Common_Kernel.h" + +#define CS_RED(r, g, b) (r) +#define CS_GREEN(r, g, b) (g) +#define CS_BLUE(r, g, b) ((b+g)*0.5f) +#define DCS_RED(r, g, b) (r) +#define DCS_GREEN(r, g, b) (g) +#define DCS_BLUE(r, g, b) ((2.0f*b)-g) +#define BYTEPP 4 +#define BC1CompBlockSize 8 + + +#define ROUND_AND_CLAMP(v, shift) \ +{\ + if (v < 0) v = 0;\ + else if (v > 255) v = 255;\ + else v += (0x80>>shift) - (v>>shift);\ +} + +#define POS(x,y) (pos_on_axis[(x)+(y)*4]) + +#endif \ No newline at end of file diff --git a/extern/CMP_Core/shaders/BC2_Encode_kernel.cpp b/extern/CMP_Core/shaders/BC2_Encode_kernel.cpp new file mode 100644 index 0000000..a8b355b --- /dev/null +++ b/extern/CMP_Core/shaders/BC2_Encode_kernel.cpp @@ -0,0 +1,261 @@ +//===================================================================== +// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// +//===================================================================== +#include "BC2_Encode_kernel.h" + +//============================================== BC2 INTERFACES ======================================================= + +void DXTCV11CompressExplicitAlphaBlock(const CGU_UINT8 block_8[16], CMP_GLOBAL CGU_UINT32 block_dxtc[2]) +{ + CGU_UINT8 i; + block_dxtc[0] = block_dxtc[1] = 0; + for (i = 0; i < 16; i++) + { + int v = block_8[i]; + v = (v + 7 - (v >> 4)); + v >>= 4; + if (v < 0) + v = 0; + if (v > 0xf) + v = 0xf; + if (i < 8) + block_dxtc[0] |= v << (4 * i); + else + block_dxtc[1] |= v << (4 * (i - 8)); + } +} + +#define EXPLICIT_ALPHA_PIXEL_MASK 0xf +#define EXPLICIT_ALPHA_PIXEL_BPP 4 + +CGU_INT CompressExplicitAlphaBlock(const CGU_UINT8 alphaBlock[BLOCK_SIZE_4X4], + CMP_GLOBAL CGU_UINT32 compressedBlock[2]) +{ + DXTCV11CompressExplicitAlphaBlock(alphaBlock, compressedBlock); + return CGU_CORE_OK; +} + +void CompressBlockBC2_Internal(const CMP_Vec4uc srcBlockTemp[16], + CMP_GLOBAL CGU_UINT32 compressedBlock[4], + CMP_GLOBAL const CMP_BC15Options *BC15options) +{ + CGU_UINT8 blkindex = 0; + CGU_UINT8 srcindex = 0; + CGU_UINT8 rgbaBlock[64]; + for (CGU_INT32 j = 0; j < 4; j++) { + for (CGU_INT32 i = 0; i < 4; i++) { + rgbaBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].z; // B + rgbaBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].y; // G + rgbaBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].x; // R + rgbaBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].w; // A + srcindex++; + } + } + + CGU_UINT8 alphaBlock[BLOCK_SIZE_4X4]; + for (CGU_INT32 i = 0; i < 16; i++) + alphaBlock[i] = (CGU_UINT8)(((CGU_INT32*)rgbaBlock)[i] >> RGBA8888_OFFSET_A); + + // Need a copy, as CalculateColourWeightings sets variables in the BC15options + CMP_BC15Options internalOptions = *BC15options; + CalculateColourWeightings(rgbaBlock, &internalOptions); + + CGU_INT err = CompressExplicitAlphaBlock(alphaBlock, &compressedBlock[DXTC_OFFSET_ALPHA]); + if (err 
!= 0) + return; + + CompressRGBBlock(rgbaBlock, &compressedBlock[DXTC_OFFSET_RGB], &internalOptions,FALSE,FALSE,0); +} + +//============================================== USER INTERFACES ======================================================== +#ifndef ASPM_GPU + +int CMP_CDECL CreateOptionsBC2(void **options) +{ + CMP_BC15Options *BC15optionsDefault = new CMP_BC15Options; + if (BC15optionsDefault) { + SetDefaultBC15Options(BC15optionsDefault); + (*options) = BC15optionsDefault; + } + else { + (*options) = NULL; + return CGU_CORE_ERR_NEWMEM; + } + return CGU_CORE_OK; +} + +int CMP_CDECL DestroyOptionsBC2(void *options) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + CMP_BC15Options *BCOptions = reinterpret_cast (options); + delete BCOptions; + return CGU_CORE_OK; +} + +int CMP_CDECL SetQualityBC2(void *options, + CGU_FLOAT fquality) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + CMP_BC15Options *BC15optionsDefault = reinterpret_cast (options); + if (fquality < 0.0f) fquality = 0.0f; + else + if (fquality > 1.0f) fquality = 1.0f; + BC15optionsDefault->m_fquality = fquality; + return CGU_CORE_OK; +} + +int CMP_CDECL SetChannelWeightsBC2(void *options, + CGU_FLOAT WeightRed, + CGU_FLOAT WeightGreen, + CGU_FLOAT WeightBlue) { + if (!options) return CGU_CORE_ERR_INVALIDPTR; + CMP_BC15Options *BC15optionsDefault = (CMP_BC15Options *)options; + + if ((WeightRed < 0.0f) || (WeightRed > 1.0f)) return CGU_CORE_ERR_RANGERED; + if ((WeightGreen < 0.0f) || (WeightGreen > 1.0f)) return CGU_CORE_ERR_RANGEGREEN; + if ((WeightBlue < 0.0f) || (WeightBlue > 1.0f)) return CGU_CORE_ERR_RANGEBLUE; + + BC15optionsDefault->m_bUseChannelWeighting = true; + BC15optionsDefault->m_fChannelWeights[0] = WeightRed; + BC15optionsDefault->m_fChannelWeights[1] = WeightGreen; + BC15optionsDefault->m_fChannelWeights[2] = WeightBlue; + return CGU_CORE_OK; +} + +// Decompresses an explicit alpha block (DXT3) +void DecompressExplicitAlphaBlock(CGU_UINT8 alphaBlock[BLOCK_SIZE_4X4], + const 
CGU_UINT32 compressedBlock[2]) +{ + for (int i = 0; i < 16; i++) + { + int nBlock = i < 8 ? 0 : 1; + CGU_UINT8 cAlpha = (CGU_UINT8)((compressedBlock[nBlock] >> ((i % 8) * EXPLICIT_ALPHA_PIXEL_BPP)) & EXPLICIT_ALPHA_PIXEL_MASK); + alphaBlock[i] = (CGU_UINT8)((cAlpha << EXPLICIT_ALPHA_PIXEL_BPP) | cAlpha); + } +} + +void DecompressBC2_Internal(CMP_GLOBAL CGU_UINT8 rgbaBlock[BLOCK_SIZE_4X4X4], + const CGU_UINT32 compressedBlock[4], + const CMP_BC15Options *BC15options) +{ + CGU_UINT8 alphaBlock[BLOCK_SIZE_4X4]; + + DecompressExplicitAlphaBlock(alphaBlock, &compressedBlock[DXTC_OFFSET_ALPHA]); + DecompressDXTRGB_Internal(rgbaBlock, &compressedBlock[DXTC_OFFSET_RGB],BC15options); + + for (CGU_UINT32 i = 0; i < 16; i++) + ((CMP_GLOBAL CGU_UINT32*)rgbaBlock)[i] = (alphaBlock[i] << RGBA8888_OFFSET_A) | (((CMP_GLOBAL CGU_UINT32*)rgbaBlock)[i] & ~(BYTE_MASK << RGBA8888_OFFSET_A)); +} + +int CMP_CDECL CompressBlockBC2(const unsigned char *srcBlock, + unsigned int srcStrideInBytes, + CMP_GLOBAL unsigned char cmpBlock[16], + CMP_GLOBAL const void *options = NULL) { + + CMP_Vec4uc inBlock[16]; + + //---------------------------------- + // Fill the inBlock with source data + //---------------------------------- + CGU_INT srcpos = 0; + CGU_INT dstptr = 0; + for (CGU_UINT8 row = 0; row < 4; row++) + { + srcpos = row * srcStrideInBytes; + for (CGU_UINT8 col = 0; col < 4; col++) + { + inBlock[dstptr].x = CGU_UINT8(srcBlock[srcpos++]); + inBlock[dstptr].y = CGU_UINT8(srcBlock[srcpos++]); + inBlock[dstptr].z = CGU_UINT8(srcBlock[srcpos++]); + inBlock[dstptr].w = CGU_UINT8(srcBlock[srcpos++]); + dstptr++; + } + } + + CMP_BC15Options *BC15options = (CMP_BC15Options *)options; + CMP_BC15Options BC15optionsDefault; + if (BC15options == NULL) + { + BC15options = &BC15optionsDefault; + SetDefaultBC15Options(BC15options); + } + CompressBlockBC2_Internal(inBlock, (CMP_GLOBAL CGU_UINT32 *)cmpBlock, BC15options); + return CGU_CORE_OK; +} + +int CMP_CDECL DecompressBlockBC2(const unsigned char 
cmpBlock[16], + CMP_GLOBAL unsigned char srcBlock[64], + const void *options = NULL) { + CMP_BC15Options *BC15options = (CMP_BC15Options *)options; + CMP_BC15Options BC15optionsDefault; + if (BC15options == NULL) + { + BC15options = &BC15optionsDefault; + SetDefaultBC15Options(BC15options); + } + DecompressBC2_Internal(srcBlock, (CGU_UINT32 *)cmpBlock,BC15options); + + return CGU_CORE_OK; +} +#endif + +//============================================== OpenCL USER INTERFACE ======================================================== +#ifdef ASPM_GPU +CMP_STATIC CMP_KERNEL void CMP_GPUEncoder( + CMP_GLOBAL const CMP_Vec4uc* ImageSource, + CMP_GLOBAL CGU_UINT8* ImageDestination, + CMP_GLOBAL Source_Info* SourceInfo, + CMP_GLOBAL CMP_BC15Options* BC15options +) +{ + CGU_UINT32 xID; + CGU_UINT32 yID; + +#ifdef ASPM_GPU + xID = get_global_id(0); + yID = get_global_id(1); +#else + xID = 0; + yID = 0; +#endif + + if (xID >= (SourceInfo->m_src_width / BlockX)) return; + if (yID >= (SourceInfo->m_src_height / BlockX)) return; + int srcWidth = SourceInfo->m_src_width; + + CGU_UINT32 destI = (xID*BC2CompBlockSize) + (yID*(srcWidth / BlockX)*BC2CompBlockSize); + int srcindex = 4 * (yID * srcWidth + xID); + int blkindex = 0; + CMP_Vec4uc srcData[16]; + srcWidth = srcWidth - 4; + + for ( CGU_INT32 j = 0; j < 4; j++) { + for ( CGU_INT32 i = 0; i < 4; i++) { + srcData[blkindex++] = ImageSource[srcindex++]; + } + srcindex += srcWidth; + } + + CompressBlockBC2_Internal(srcData,(CMP_GLOBAL CGU_UINT32 *)&ImageDestination[destI], BC15options); +} +#endif + diff --git a/extern/CMP_Core/shaders/BC2_Encode_kernel.h b/extern/CMP_Core/shaders/BC2_Encode_kernel.h new file mode 100644 index 0000000..a152751 --- /dev/null +++ b/extern/CMP_Core/shaders/BC2_Encode_kernel.h @@ -0,0 +1,34 @@ +//===================================================================== +// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +//===================================================================== +#ifndef BC2_ENCODE_KERNEL_H +#define BC2_ENCODE_KERNEL_H + +#include "Common_Def.h" +#include "BCn_Common_Kernel.h" + +#define BC2CompBlockSize 16 +#define NUM_CHANNELS 4 +#define NUM_ENDPOINTS 2 + + +#endif \ No newline at end of file diff --git a/extern/CMP_Core/shaders/BC3_Encode_kernel.cpp b/extern/CMP_Core/shaders/BC3_Encode_kernel.cpp new file mode 100644 index 0000000..8fc30e6 --- /dev/null +++ b/extern/CMP_Core/shaders/BC3_Encode_kernel.cpp @@ -0,0 +1,218 @@ +//===================================================================== +// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// +//===================================================================== +#include "BC3_Encode_kernel.h" + +//============================================== BC3 INTERFACES ======================================================= + +void CompressBlockBC3_Internal(const CMP_Vec4uc srcBlockTemp[16], + CMP_GLOBAL CGU_UINT32 compressedBlock[4], + CMP_GLOBAL const CMP_BC15Options *BC15options) { + CGU_UINT8 blkindex = 0; + CGU_UINT8 srcindex = 0; + CGU_UINT8 rgbaBlock[64]; + for (CGU_INT32 j = 0; j < 4; j++) { + for (CGU_INT32 i = 0; i < 4; i++) { + rgbaBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].z; // B + rgbaBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].y; // G + rgbaBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].x; // R + rgbaBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].w; // A + srcindex++; + } + } + + CMP_BC15Options internalOptions = *BC15options; + CalculateColourWeightings(rgbaBlock, &internalOptions); + + CGU_UINT8 alphaBlock[BLOCK_SIZE_4X4]; + for (CGU_INT32 i = 0; i < 16; i++) + alphaBlock[i] = + (CGU_UINT8)(((CGU_INT32 *)rgbaBlock)[i] >> RGBA8888_OFFSET_A); + + CGU_INT err = CompressAlphaBlock(alphaBlock, &compressedBlock[DXTC_OFFSET_ALPHA]); + if (err != 0) return; + + CompressRGBBlock(rgbaBlock, &compressedBlock[DXTC_OFFSET_RGB], &internalOptions, + FALSE, FALSE, 0); +} + +//============================================== USER INTERFACES ======================================================== +#ifndef ASPM_GPU + +int CMP_CDECL CreateOptionsBC3(void **options) +{ + CMP_BC15Options *BC15optionsDefault = new CMP_BC15Options; + if (BC15optionsDefault) { + SetDefaultBC15Options(BC15optionsDefault); + (*options) = BC15optionsDefault; + } + else { + (*options) = NULL; + return CGU_CORE_ERR_NEWMEM; + } + return CGU_CORE_OK; +} + + +int CMP_CDECL DestroyOptionsBC3(void *options) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + CMP_BC15Options *BCOptions = reinterpret_cast (options); + delete BCOptions; + return CGU_CORE_OK; 
+} + +int CMP_CDECL SetQualityBC3(void *options, + CGU_FLOAT fquality) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + CMP_BC15Options *BC15optionsDefault = reinterpret_cast (options); + if (fquality < 0.0f) fquality = 0.0f; + else + if (fquality > 1.0f) fquality = 1.0f; + BC15optionsDefault->m_fquality = fquality; + return CGU_CORE_OK; +} + +int CMP_CDECL SetChannelWeightsBC3(void *options, + CGU_FLOAT WeightRed, + CGU_FLOAT WeightGreen, + CGU_FLOAT WeightBlue) { + if (!options) return 1; + CMP_BC15Options *BC15optionsDefault = (CMP_BC15Options *)options; + + if ((WeightRed < 0.0f) || (WeightRed > 1.0f)) return CGU_CORE_ERR_RANGERED; + if ((WeightGreen < 0.0f) || (WeightGreen > 1.0f)) return CGU_CORE_ERR_RANGEGREEN; + if ((WeightBlue < 0.0f) || (WeightBlue > 1.0f)) return CGU_CORE_ERR_RANGEBLUE; + + BC15optionsDefault->m_bUseChannelWeighting = true; + BC15optionsDefault->m_fChannelWeights[0] = WeightRed; + BC15optionsDefault->m_fChannelWeights[1] = WeightGreen; + BC15optionsDefault->m_fChannelWeights[2] = WeightBlue; + return CGU_CORE_OK; +} + + +void DecompressBC3_Internal(CMP_GLOBAL CGU_UINT8 rgbaBlock[64], + const CGU_UINT32 compressedBlock[4], + const CMP_BC15Options *BC15options) { + CGU_UINT8 alphaBlock[BLOCK_SIZE_4X4]; + + DecompressAlphaBlock(alphaBlock, &compressedBlock[DXTC_OFFSET_ALPHA]); + DecompressDXTRGB_Internal(rgbaBlock, &compressedBlock[DXTC_OFFSET_RGB],BC15options); + + for (CGU_UINT32 i = 0; i < 16; i++) + ((CMP_GLOBAL CGU_UINT32 *)rgbaBlock)[i] = + (alphaBlock[i] << RGBA8888_OFFSET_A) | + (((CMP_GLOBAL CGU_UINT32 *)rgbaBlock)[i] & + ~(BYTE_MASK << RGBA8888_OFFSET_A)); +} + +int CMP_CDECL CompressBlockBC3( const unsigned char *srcBlock, + unsigned int srcStrideInBytes, + CMP_GLOBAL unsigned char cmpBlock[16], + const void *options = NULL) { + CMP_Vec4uc inBlock[16]; + + //---------------------------------- + // Fill the inBlock with source data + //---------------------------------- + CGU_INT srcpos = 0; + CGU_INT dstptr = 0; + for 
(CGU_UINT8 row = 0; row < 4; row++) + { + srcpos = row * srcStrideInBytes; + for (CGU_UINT8 col = 0; col < 4; col++) + { + inBlock[dstptr].x = CGU_UINT8(srcBlock[srcpos++]); + inBlock[dstptr].y = CGU_UINT8(srcBlock[srcpos++]); + inBlock[dstptr].z = CGU_UINT8(srcBlock[srcpos++]); + inBlock[dstptr].w = CGU_UINT8(srcBlock[srcpos++]); + dstptr++; + } + } + + CMP_BC15Options *BC15options = (CMP_BC15Options *)options; + CMP_BC15Options BC15optionsDefault; + if (BC15options == NULL) { + BC15options = &BC15optionsDefault; + SetDefaultBC15Options(BC15options); + } + + CompressBlockBC3_Internal(inBlock,(CMP_GLOBAL CGU_UINT32 *)cmpBlock, BC15options); + return CGU_CORE_OK; +} + +int CMP_CDECL DecompressBlockBC3(const unsigned char cmpBlock[16], + CMP_GLOBAL unsigned char srcBlock[64], + const void *options = NULL) { + CMP_BC15Options *BC15options = (CMP_BC15Options *)options; + CMP_BC15Options BC15optionsDefault; + if (BC15options == NULL) + { + BC15options = &BC15optionsDefault; + SetDefaultBC15Options(BC15options); + } + DecompressBC3_Internal(srcBlock, (CGU_UINT32 *)cmpBlock,BC15options); + return CGU_CORE_OK; +} +#endif + +//============================================== OpenCL USER INTERFACE ==================================================== +#ifdef ASPM_GPU +CMP_STATIC CMP_KERNEL void CMP_GPUEncoder( + CMP_GLOBAL const CMP_Vec4uc *ImageSource, + CMP_GLOBAL CGU_UINT8 *ImageDestination, CMP_GLOBAL Source_Info *SourceInfo, + CMP_GLOBAL CMP_BC15Options *BC15options) { + CGU_UINT32 xID; + CGU_UINT32 yID; + +#ifdef ASPM_GPU + xID = get_global_id(0); + yID = get_global_id(1); +#else + xID = 0; + yID = 0; +#endif + + if (xID >= (SourceInfo->m_src_width / BlockX)) return; + if (yID >= (SourceInfo->m_src_height / BlockX)) return; + int srcWidth = SourceInfo->m_src_width; + + CGU_UINT32 destI = + (xID * BC3CompBlockSize) + (yID * (srcWidth / BlockX) * BC3CompBlockSize); + int srcindex = 4 * (yID * srcWidth + xID); + int blkindex = 0; + CMP_Vec4uc srcData[16]; + srcWidth = 
srcWidth - 4; + + for (CGU_INT32 j = 0; j < 4; j++) { + for (CGU_INT32 i = 0; i < 4; i++) { + srcData[blkindex++] = ImageSource[srcindex++]; + } + srcindex += srcWidth; + } + + CompressBlockBC3_Internal( + srcData, (CMP_GLOBAL CGU_UINT32 *)&ImageDestination[destI], BC15options); +} +#endif diff --git a/extern/CMP_Core/shaders/BC3_Encode_kernel.h b/extern/CMP_Core/shaders/BC3_Encode_kernel.h new file mode 100644 index 0000000..9e97da1 --- /dev/null +++ b/extern/CMP_Core/shaders/BC3_Encode_kernel.h @@ -0,0 +1,31 @@ +//===================================================================== +// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// +//===================================================================== +#ifndef BC3_ENCODE_KERNEL_H +#define BC3_ENCODE_KERNEL_H + +#include "Common_Def.h" +#include "BCn_Common_Kernel.h" + +#define BC3CompBlockSize 16 + +#endif \ No newline at end of file diff --git a/extern/CMP_Core/shaders/BC4_Encode_kernel.cpp b/extern/CMP_Core/shaders/BC4_Encode_kernel.cpp new file mode 100644 index 0000000..6242cf8 --- /dev/null +++ b/extern/CMP_Core/shaders/BC4_Encode_kernel.cpp @@ -0,0 +1,200 @@ +//===================================================================== +// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// +//===================================================================== +#include "BC4_Encode_kernel.h" + +//============================================== BC4 INTERFACES ======================================================= + +void CompressBlockBC4_Internal(const CMP_Vec4uc srcBlockTemp[16], + CMP_GLOBAL CGU_UINT32 compressedBlock[2], + CMP_GLOBAL const CMP_BC15Options *BC15options) { + if (BC15options->m_fquality) { + // Reserved! + } + CGU_UINT8 blkindex = 0; + CGU_UINT8 srcindex = 0; + CGU_UINT8 alphaBlock[16]; + for (CGU_INT32 j = 0; j < 4; j++) { + for (CGU_INT32 i = 0; i < 4; i++) { + alphaBlock[blkindex++] = + (CGU_UINT8)srcBlockTemp[srcindex].x; // Red channel + srcindex++; + } + } + CompressAlphaBlock(alphaBlock, (CMP_GLOBAL CGU_UINT32 *)compressedBlock); +} + +void DecompressBC4_Internal(CMP_GLOBAL CGU_UINT8 rgbaBlock[64], + const CGU_UINT32 compressedBlock[2], + const CMP_BC15Options *BC15options) { + if (BC15options) {} + CGU_UINT8 alphaBlock[BLOCK_SIZE_4X4]; + DecompressAlphaBlock(alphaBlock, compressedBlock); + + CGU_UINT8 blkindex = 0; + CGU_UINT8 srcindex = 0; + for (CGU_INT32 j = 0; j < 4; j++) { + for (CGU_INT32 i = 0; i < 4; i++) { + rgbaBlock[blkindex++] = (CGU_UINT8)alphaBlock[srcindex]; // R + rgbaBlock[blkindex++] = (CGU_UINT8)alphaBlock[srcindex]; // G + rgbaBlock[blkindex++] = (CGU_UINT8)alphaBlock[srcindex]; // B + rgbaBlock[blkindex++] = (CGU_UINT8)alphaBlock[srcindex]; // A + srcindex++; + } + } +} + +void CompressBlockBC4_SingleChannel(const CGU_UINT8 srcBlockTemp[16], + CMP_GLOBAL CGU_UINT32 compressedBlock[2], + CMP_GLOBAL const CMP_BC15Options *BC15options) { + if (BC15options) {} + CompressAlphaBlock(srcBlockTemp, (CMP_GLOBAL CGU_UINT32 *)compressedBlock); +} + +void DecompressBlockBC4_SingleChannel(CGU_UINT8 srcBlockTemp[16], + const CGU_UINT32 compressedBlock[2], + const CMP_BC15Options *BC15options) { + if (BC15options) {} + DecompressAlphaBlock(srcBlockTemp, compressedBlock); +} + 
+//============================================== USER INTERFACES ======================================================== +#ifndef ASPM_GPU + +int CMP_CDECL CreateOptionsBC4(void **options) +{ + CMP_BC15Options *BC15optionsDefault = new CMP_BC15Options; + if (BC15optionsDefault) { + SetDefaultBC15Options(BC15optionsDefault); + (*options) = BC15optionsDefault; + } + else { + (*options) = NULL; + return CGU_CORE_ERR_NEWMEM; + } + return CGU_CORE_OK; +} + +int CMP_CDECL DestroyOptionsBC4(void *options) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + CMP_BC15Options *BCOptions = reinterpret_cast (options); + delete BCOptions; + return CGU_CORE_OK; +} + +int CMP_CDECL SetQualityBC4(void *options, + CGU_FLOAT fquality) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + CMP_BC15Options *BC15optionsDefault = reinterpret_cast (options); + if (fquality < 0.0f) fquality = 0.0f; + else + if (fquality > 1.0f) fquality = 1.0f; + BC15optionsDefault->m_fquality = fquality; + return CGU_CORE_OK; +} + +int CMP_CDECL CompressBlockBC4(const unsigned char *srcBlock, + unsigned int srcStrideInBytes, + CMP_GLOBAL unsigned char cmpBlock[8], + const void *options = NULL) { + + unsigned char inBlock[16]; + //---------------------------------- + // Fill the inBlock with source data + //---------------------------------- + CGU_INT srcpos = 0; + CGU_INT dstptr = 0; + for (CGU_UINT8 row = 0; row < 4; row++) + { + srcpos = row * srcStrideInBytes; + for (CGU_UINT8 col = 0; col < 4; col++) + { + inBlock[dstptr++] = CGU_UINT8(srcBlock[srcpos++]); + } + } + + CMP_BC15Options *BC15options = (CMP_BC15Options *)options; + if (BC15options == NULL) { + CMP_BC15Options BC15optionsDefault; + BC15options = &BC15optionsDefault; + SetDefaultBC15Options(BC15options); + } + + CompressBlockBC4_SingleChannel(inBlock,(CMP_GLOBAL CGU_UINT32 *)cmpBlock, BC15options); + return CGU_CORE_OK; +} + +int CMP_CDECL DecompressBlockBC4(const unsigned char cmpBlock[8], + CMP_GLOBAL unsigned char srcBlock[16], + const 
void *options = NULL) { + CMP_BC15Options *BC15options = (CMP_BC15Options *)options; + CMP_BC15Options BC15optionsDefault; + if (BC15options == NULL) + { + BC15options = &BC15optionsDefault; + SetDefaultBC15Options(BC15options); + } + DecompressBlockBC4_SingleChannel(srcBlock, (CGU_UINT32 *)cmpBlock,BC15options); + return CGU_CORE_OK; +} +#endif + +//============================================== OpenCL USER INTERFACE ==================================================== +#ifdef ASPM_GPU +CMP_STATIC CMP_KERNEL void CMP_GPUEncoder( + CMP_GLOBAL const CMP_Vec4uc *ImageSource, + CMP_GLOBAL CGU_UINT8 *ImageDestination, CMP_GLOBAL Source_Info *SourceInfo, + CMP_GLOBAL CMP_BC15Options *BC15options) { + CGU_UINT32 xID; + CGU_UINT32 yID; + +#ifdef ASPM_GPU + xID = get_global_id(0); + yID = get_global_id(1); +#else + xID = 0; + yID = 0; +#endif + + if (xID >= (SourceInfo->m_src_width / BlockX)) return; + if (yID >= (SourceInfo->m_src_height / BlockX)) return; + int srcWidth = SourceInfo->m_src_width; + + CGU_UINT32 destI = + (xID * BC4CompBlockSize) + (yID * (srcWidth / BlockX) * BC4CompBlockSize); + int srcindex = 4 * (yID * srcWidth + xID); + int blkindex = 0; + CMP_Vec4uc srcData[16]; + srcWidth = srcWidth - 4; + + for (CGU_INT32 j = 0; j < 4; j++) { + for (CGU_INT32 i = 0; i < 4; i++) { + srcData[blkindex++] = ImageSource[srcindex++]; + } + srcindex += srcWidth; + } + + CompressBlockBC4_Internal(srcData, (CMP_GLOBAL CGU_UINT32 *)&ImageDestination[destI], BC15options); +} +#endif diff --git a/extern/CMP_Core/shaders/BC4_Encode_kernel.h b/extern/CMP_Core/shaders/BC4_Encode_kernel.h new file mode 100644 index 0000000..65af4a7 --- /dev/null +++ b/extern/CMP_Core/shaders/BC4_Encode_kernel.h @@ -0,0 +1,31 @@ +//===================================================================== +// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +//===================================================================== +#ifndef BC4_ENCODE_KERNEL_H +#define BC4_ENCODE_KERNEL_H + +#include "Common_Def.h" +#include "BCn_Common_Kernel.h" + +#define BC4CompBlockSize 8 + +#endif \ No newline at end of file diff --git a/extern/CMP_Core/shaders/BC5_Encode_kernel.cpp b/extern/CMP_Core/shaders/BC5_Encode_kernel.cpp new file mode 100644 index 0000000..d4784dd --- /dev/null +++ b/extern/CMP_Core/shaders/BC5_Encode_kernel.cpp @@ -0,0 +1,264 @@ +//===================================================================== +// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// +//===================================================================== +#include "BC5_Encode_kernel.h" + +//============================================== BC5 INTERFACES ======================================================= + +void CompressBlockBC5_Internal(CMP_Vec4uc srcBlockTemp[16], + CMP_GLOBAL CGU_UINT32 compressedBlock[4], + CMP_GLOBAL CMP_BC15Options *BC15options) +{ + if (BC15options->m_fquality) { + // Resreved + } + CGU_UINT8 blkindex = 0; + CGU_UINT8 srcindex = 0; + CGU_UINT8 alphaBlock[16]; + for (CGU_INT32 j = 0; j < 4; j++) { + for (CGU_INT32 i = 0; i < 4; i++) { + alphaBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].x; // Red channel + srcindex++; + } + } + CompressAlphaBlock(alphaBlock,&compressedBlock[0]); + + blkindex = 0; + srcindex = 0; + for (CGU_INT32 j = 0; j < 4; j++) { + for (CGU_INT32 i = 0; i < 4; i++) { + alphaBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].y; // Green channel + srcindex++; + } + } + CompressAlphaBlock(alphaBlock,&compressedBlock[2]); + +} + +void DecompressBC5_Internal(CMP_GLOBAL CGU_UINT8 rgbaBlock[64], + CGU_UINT32 compressedBlock[4], + CMP_BC15Options *BC15options) +{ + CGU_UINT8 alphaBlockR[BLOCK_SIZE_4X4]; + CGU_UINT8 alphaBlockG[BLOCK_SIZE_4X4]; + + DecompressAlphaBlock(alphaBlockR, &compressedBlock[0]); + DecompressAlphaBlock(alphaBlockG, &compressedBlock[2]); + + CGU_UINT8 blkindex = 0; + CGU_UINT8 srcindex = 0; + + if (BC15options->m_mapDecodeRGBA) + { + for (CGU_INT32 j = 0; j < 4; j++) { + for (CGU_INT32 i = 0; i < 4; i++) { + rgbaBlock[blkindex++] = (CGU_UINT8)alphaBlockR[srcindex]; + rgbaBlock[blkindex++] = (CGU_UINT8)alphaBlockG[srcindex]; + rgbaBlock[blkindex++] = 0; + rgbaBlock[blkindex++] = 255; + srcindex++; + } + } + } + else + { + for (CGU_INT32 j = 0; j < 4; j++) { + for (CGU_INT32 i = 0; i < 4; i++) { + rgbaBlock[blkindex++] = 0; + rgbaBlock[blkindex++] = (CGU_UINT8)alphaBlockG[srcindex]; + rgbaBlock[blkindex++] = (CGU_UINT8)alphaBlockR[srcindex]; + rgbaBlock[blkindex++] = 255; 
+ srcindex++; + } + } + } + +} + + +void CompressBlockBC5_DualChannel_Internal(const CGU_UINT8 srcBlockR[16], + const CGU_UINT8 srcBlockG[16], + CMP_GLOBAL CGU_UINT32 compressedBlock[4], + CMP_GLOBAL const CMP_BC15Options *BC15options) +{ + if (BC15options) {} + CompressAlphaBlock(srcBlockR,&compressedBlock[0]); + CompressAlphaBlock(srcBlockG,&compressedBlock[2]); +} + +void DecompressBC5_DualChannel_Internal(CMP_GLOBAL CGU_UINT8 srcBlockR[16], + CMP_GLOBAL CGU_UINT8 srcBlockG[16], + const CGU_UINT32 compressedBlock[4], + const CMP_BC15Options *BC15options) +{ + if (BC15options) {} + DecompressAlphaBlock(srcBlockR, &compressedBlock[0]); + DecompressAlphaBlock(srcBlockG, &compressedBlock[2]); +} + + +//============================================== USER INTERFACES ======================================================== +#ifndef ASPM_GPU + +int CMP_CDECL CreateOptionsBC5(void **options) +{ + CMP_BC15Options *BC15optionsDefault = new CMP_BC15Options; + if (BC15optionsDefault) { + SetDefaultBC15Options(BC15optionsDefault); + (*options) = BC15optionsDefault; + } + else { + (*options) = NULL; + return CGU_CORE_ERR_NEWMEM; + } + return CGU_CORE_OK; +} + +int CMP_CDECL DestroyOptionsBC5(void *options) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + CMP_BC15Options *BCOptions = reinterpret_cast (options); + delete BCOptions; + return CGU_CORE_OK; +} + +int CMP_CDECL SetQualityBC5(void *options, + CGU_FLOAT fquality) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + CMP_BC15Options *BC15optionsDefault = reinterpret_cast (options); + if (fquality < 0.0f) fquality = 0.0f; + else + if (fquality > 1.0f) fquality = 1.0f; + BC15optionsDefault->m_fquality = fquality; + return CGU_CORE_OK; +} + + +int CMP_CDECL CompressBlockBC5(const CGU_UINT8 *srcBlockR, + unsigned int srcStrideInBytes1, + const CGU_UINT8 *srcBlockG, + unsigned int srcStrideInBytes2, + CMP_GLOBAL CGU_UINT8 cmpBlock[16], + const void *options = NULL) { + CGU_UINT8 inBlockR[16]; + + 
//---------------------------------- + // Fill the inBlock with source data + //---------------------------------- + CGU_INT srcpos = 0; + CGU_INT dstptr = 0; + for (CGU_UINT8 row = 0; row < 4; row++) + { + srcpos = row * srcStrideInBytes1; + for (CGU_UINT8 col = 0; col < 4; col++) + { + inBlockR[dstptr++] = CGU_UINT8(srcBlockR[srcpos++]); + } + } + + + CGU_UINT8 inBlockG[16]; + //---------------------------------- + // Fill the inBlock with source data + //---------------------------------- + srcpos = 0; + dstptr = 0; + for (CGU_UINT8 row = 0; row < 4; row++) + { + srcpos = row * srcStrideInBytes2; + for (CGU_UINT8 col = 0; col < 4; col++) + { + inBlockG[dstptr++] = CGU_UINT8(srcBlockG[srcpos++]); + } + } + + + CMP_BC15Options *BC15options = (CMP_BC15Options *)options; + CMP_BC15Options BC15optionsDefault; + if (BC15options == NULL) + { + BC15options = &BC15optionsDefault; + SetDefaultBC15Options(BC15options); + } + + CompressBlockBC5_DualChannel_Internal(inBlockR,inBlockG, (CMP_GLOBAL CGU_UINT32 *)cmpBlock, BC15options); + return CGU_CORE_OK; +} + +int CMP_CDECL DecompressBlockBC5(const CGU_UINT8 cmpBlock[16], + CMP_GLOBAL CGU_UINT8 srcBlockR[16], + CMP_GLOBAL CGU_UINT8 srcBlockG[16], + const void *options = NULL) { + CMP_BC15Options *BC15options = (CMP_BC15Options *)options; + CMP_BC15Options BC15optionsDefault; + if (BC15options == NULL) + { + BC15options = &BC15optionsDefault; + SetDefaultBC15Options(BC15options); + } + DecompressBC5_DualChannel_Internal(srcBlockR,srcBlockG,(CGU_UINT32 *)cmpBlock,BC15options); + + return CGU_CORE_OK; +} + +#endif + +//============================================== OpenCL USER INTERFACE ==================================================== +#ifdef ASPM_GPU +CMP_STATIC CMP_KERNEL void CMP_GPUEncoder(CMP_GLOBAL const CMP_Vec4uc* ImageSource, + CMP_GLOBAL CGU_UINT8* ImageDestination, + CMP_GLOBAL Source_Info* SourceInfo, + CMP_GLOBAL CMP_BC15Options* BC15options +) +{ + CGU_UINT32 xID; + CGU_UINT32 yID; + +#ifdef ASPM_GPU + xID = 
get_global_id(0); + yID = get_global_id(1); +#else + xID = 0; + yID = 0; +#endif + + if (xID >= (SourceInfo->m_src_width / BlockX)) return; + if (yID >= (SourceInfo->m_src_height / BlockX)) return; + int srcWidth = SourceInfo->m_src_width; + + CGU_UINT32 destI = (xID*BC5CompBlockSize) + (yID*(srcWidth / BlockX)*BC5CompBlockSize); + int srcindex = 4 * (yID * srcWidth + xID); + int blkindex = 0; + CMP_Vec4uc srcData[16]; + srcWidth = srcWidth - 4; + + for ( CGU_INT32 j = 0; j < 4; j++) { + for ( CGU_INT32 i = 0; i < 4; i++) { + srcData[blkindex++] = ImageSource[srcindex++]; + } + srcindex += srcWidth; + } + + CompressBlockBC5_Internal(srcData, (CMP_GLOBAL CGU_UINT32 *)&ImageDestination[destI], BC15options); +} +#endif diff --git a/extern/CMP_Core/shaders/BC5_Encode_kernel.h b/extern/CMP_Core/shaders/BC5_Encode_kernel.h new file mode 100644 index 0000000..89cffcc --- /dev/null +++ b/extern/CMP_Core/shaders/BC5_Encode_kernel.h @@ -0,0 +1,31 @@ +//===================================================================== +// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +//===================================================================== +#ifndef BC5_ENCODE_KERNEL_H +#define BC5_ENCODE_KERNEL_H + +#include "Common_Def.h" +#include "BCn_Common_Kernel.h" + +#define BC5CompBlockSize 16 + +#endif \ No newline at end of file diff --git a/extern/CMP_Core/shaders/BC6_Encode_kernel.cpp b/extern/CMP_Core/shaders/BC6_Encode_kernel.cpp new file mode 100644 index 0000000..f131583 --- /dev/null +++ b/extern/CMP_Core/shaders/BC6_Encode_kernel.cpp @@ -0,0 +1,3990 @@ +//===================================================================== +// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +//===================================================================== +#include "BC6_Encode_kernel.h" + +#ifdef ASPM_GPU +void memset(CGU_UINT8 *srcdata, CGU_UINT8 value, CGU_INT size) +{ + for (CGU_INT i = 0; i < size; i++) + *srcdata++ = value; +} + +void memcpy(CGU_UINT8 *srcdata, CGU_UINT8 *dstdata, CGU_INT size) +{ + for (CGU_INT i = 0; i < size; i++) + { + *srcdata = *dstdata; + srcdata++; + dstdata++; + } +} + +void swap(CGU_INT A, CGU_INT B) +{ + CGU_INT hold = A; + A = B; + B = hold; +} + +#define abs fabs +#define floorf floor +#define sqrtf sqrt +#define logf log +#define ceilf ceil + +#endif + +__constant CGU_UINT8 BC6_PARTITIONS[MAX_BC6H_PARTITIONS][MAX_SUBSET_SIZE] = { + { // 0 + 0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1 + }, + + { // 1 + 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1 + }, + + { // 2 + 0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1 + }, + + { // 3 + 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1 + }, + + { // 4 + 0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1 + }, + + { // 5 + 0,0,1,1,0,1,1,1, 0,1,1,1,1,1,1,1 + }, + + { // 6 + 0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1 + }, + + { // 7 + 0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1 + }, + + { // 8 + 0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1 + }, + + { // 9 + 0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1 + }, + + { // 10 + 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1 + }, + + { // 11 + 0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1 + }, + + { // 12 + 0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1 + }, + + { // 13 + 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1 + }, + + { // 14 + 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1 + }, + + { // 15 + 0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1 + }, + + 
{ // 16 + 0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1 + }, + + { // 17 + 0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0 + }, + + { // 18 + 0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0 + }, + + { // 19 + 0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0 + }, + + { // 20 + 0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0 + }, + + { // 21 + 0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0 + }, + + { // 22 + 0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0 + }, + + { // 23 + 0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1 + }, + + { // 24 + 0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0 + }, + + { // 25 + 0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0 + }, + + { // 26 + 0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0 + }, + + { // 27 + 0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0 + }, + + { // 28 + 0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0 + }, + + { // 29 + 0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0 + }, + + { // 30 + 0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0 + }, + + { // 31 + 0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0 + }, +}; + +CGU_DWORD get_partition_subset(CGU_INT subset, CGU_INT partI, CGU_INT index) +{ + if (subset) + return BC6_PARTITIONS[partI][index]; + else + return 0; +} + +void Partition(CGU_INT shape, + CGU_FLOAT in[][MAX_DIMENSION_BIG], + CGU_FLOAT subsets[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG], //[3][16][4] + CGU_INT count[MAX_SUBSETS], + CGU_INT8 ShapeTableToUse, + CGU_INT dimension) +{ + int i, j; + int insubset = -1, inpart = 0; + + // Dont use memset: this is better for now + for (i = 0; i < MAX_SUBSETS; i++) count[i] = 0; + + switch (ShapeTableToUse) + { + case 0: + case 1: + insubset = 0; + inpart = 0; + break; + case 2: + insubset = 1; + inpart = shape; + break; + default: + break; + } + + // Nothing to do!!: Must indicate an error to user + if (insubset == -1) return; // Nothing to do!! 
+ + for (i = 0; i < MAX_SUBSET_SIZE; i++) + { + int subset = get_partition_subset(insubset, inpart, i); + for (j = 0; j < dimension; j++) + { + subsets[subset][count[subset]][j] = in[i][j]; + } + if (dimension < MAX_DIMENSION_BIG) + { + subsets[subset][count[subset]][j] = 0.0; + } + count[subset]++; + } + +} + +void GetEndPoints(CGU_FLOAT EndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_FLOAT outB[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG], CGU_INT max_subsets, int entryCount[MAX_SUBSETS]) +{ + // Should have some sort of error notification! + if (max_subsets > MAX_SUBSETS) return; + + // Save Min and Max OutB points as EndPoints + for (int subset = 0; subset < max_subsets; subset++) + { + // We now have points on direction vector(s) + // find the min and max points + CGU_FLOAT min = CMP_HALF_MAX; + CGU_FLOAT max = 0; + CGU_FLOAT val; + int mini = 0; + int maxi = 0; + + + for (int i = 0; i < entryCount[subset]; i++) + { + val = outB[subset][i][0] + outB[subset][i][1] + outB[subset][i][2]; + if (val < min) + { + min = val; + mini = i; + } + if (val > max) + { + max = val; + maxi = i; + } + } + + // Is round best for this ! 
+ for (int c = 0; c < MAX_DIMENSION_BIG; c++) + { + EndPoints[subset][0][c] = outB[subset][mini][c]; + } + + for (int c = 0; c < MAX_DIMENSION_BIG; c++) + { + EndPoints[subset][1][c] = outB[subset][maxi][c]; + } + } +} + +void covariance_d(CGU_FLOAT data[][MAX_DIMENSION_BIG], CGU_INT numEntries, CGU_FLOAT cov[MAX_DIMENSION_BIG][MAX_DIMENSION_BIG], CGU_INT dimension) +{ +#ifdef USE_DBGTRACE + DbgTrace(()); +#endif + int i, j, k; + + for (i = 0; i < dimension; i++) + for (j = 0; j <= i; j++) + { + cov[i][j] = 0; + for (k = 0; k < numEntries; k++) + cov[i][j] += data[k][i] * data[k][j]; + } + + for (i = 0; i < dimension; i++) + for (j = i + 1; j < dimension; j++) + cov[i][j] = cov[j][i]; +} + +void centerInPlace_d(CGU_FLOAT data[][MAX_DIMENSION_BIG], int numEntries, CGU_FLOAT mean[MAX_DIMENSION_BIG], CGU_INT dimension) +{ +#ifdef USE_DBGTRACE + DbgTrace(()); +#endif + int i, k; + + for (i = 0; i < dimension; i++) + { + mean[i] = 0; + for (k = 0; k < numEntries; k++) + mean[i] += data[k][i]; + } + + if (!numEntries) + return; + + for (i = 0; i < dimension; i++) + { + mean[i] /= numEntries; + for (k = 0; k < numEntries; k++) + data[k][i] -= mean[i]; + } +} + +void eigenVector_d(CGU_FLOAT cov[MAX_DIMENSION_BIG][MAX_DIMENSION_BIG], CGU_FLOAT vector[MAX_DIMENSION_BIG], CGU_INT dimension) +{ +#ifdef USE_DBGTRACE + DbgTrace(()); +#endif + // calculate an eigenvecto corresponding to a biggest eigenvalue + // will work for non-zero non-negative matricies only + +#define EV_ITERATION_NUMBER 20 +#define EV_SLACK 2 /* additive for exp base 2)*/ + + + CGU_INT i, j, k, l, m, n, p, q; + CGU_FLOAT c[2][MAX_DIMENSION_BIG][MAX_DIMENSION_BIG]; + CGU_FLOAT maxDiag; + + for (i = 0; i < dimension; i++) + for (j = 0; j < dimension; j++) + c[0][i][j] = cov[i][j]; + + p = (int)floorf(log((FLT_MAX_EXP - EV_SLACK) / ceilf(logf((CGU_FLOAT)dimension) / logf(2.0f))) / logf(2.0f)); + + //assert(p>0); + + p = p > 0 ? 
p : 1; + + q = (EV_ITERATION_NUMBER + p - 1) / p; + + l = 0; + + for (n = 0; n < q; n++) + { + maxDiag = 0; + + for (i = 0; i < dimension; i++) + maxDiag = c[l][i][i] > maxDiag ? c[l][i][i] : maxDiag; + + if (maxDiag <= 0) + { + return; + } + + //assert(maxDiag >0); + + for (i = 0; i < dimension; i++) + for (j = 0; j < dimension; j++) + c[l][i][j] /= maxDiag; + + for (m = 0; m < p; m++) { + for (i = 0; i < dimension; i++) + for (j = 0; j < dimension; j++) { + CGU_FLOAT temp = 0; + for (k = 0; k < dimension; k++) + { + // Notes: + // This is the most consuming portion of the code and needs optimizing for perfromance + temp += c[l][i][k] * c[l][k][j]; + } + c[1 - l][i][j] = temp; + } + l = 1 - l; + } + } + + maxDiag = 0; + k = 0; + + for (i = 0; i < dimension; i++) + { + k = c[l][i][i] > maxDiag ? i : k; + maxDiag = c[l][i][i] > maxDiag ? c[l][i][i] : maxDiag; + } + CGU_FLOAT t; + t = 0; + for (i = 0; i < dimension; i++) + { + t += c[l][k][i] * c[l][k][i]; + vector[i] = c[l][k][i]; + } + // normalization is really optional + t = sqrtf(t); + //assert(t>0); + + if (t <= 0) + { + return; + } + for (i = 0; i < dimension; i++) + vector[i] /= t; +} + +void project_d(CGU_FLOAT data[][MAX_DIMENSION_BIG], CGU_INT numEntries, CGU_FLOAT vector[MAX_DIMENSION_BIG], CGU_FLOAT projection[MAX_ENTRIES], CGU_INT dimension) +{ +#ifdef USE_DBGTRACE + DbgTrace(()); +#endif + // assume that vector is normalized already + int i, k; + + for (k = 0; k < numEntries; k++) + { + projection[k] = 0; + for (i = 0; i < dimension; i++) + { + projection[k] += data[k][i] * vector[i]; + } + } +} + +typedef struct { + CGU_FLOAT d; + int i; +} a; + +inline CGU_INT a_compare(const void *arg1, const void *arg2) +{ + if (((a*)arg1)->d - ((a*)arg2)->d > 0) return 1; + if (((a*)arg1)->d - ((a*)arg2)->d < 0) return -1; + return 0; +}; + +void sortProjection(CGU_FLOAT projection[MAX_ENTRIES], CGU_INT order[MAX_ENTRIES], CGU_INT numEntries) +{ + int i; + a what[MAX_ENTRIES + MAX_PARTITIONS_TABLE]; + + for (i = 
0; i < numEntries; i++) + what[what[i].i = i].d = projection[i]; + +#ifdef USE_QSORT + qsort((void*)&what, numEntries, sizeof(a), a_compare); +#else + { + int j; + int tmp; + CGU_FLOAT tmp_d; + for (i = 1; i < numEntries; i++) + { + for (j = i; j > 0; j--) + { + if (what[j - 1].d > what[j].d) + { + tmp = what[j].i; + tmp_d = what[j].d; + what[j].i = what[j - 1].i; + what[j].d = what[j - 1].d; + what[j - 1].i = tmp; + what[j - 1].d = tmp_d; + } + } + } + } +#endif + + + for (i = 0; i < numEntries; i++) + order[i] = what[i].i; +}; + +CGU_FLOAT totalError_d(CGU_FLOAT data[MAX_ENTRIES][MAX_DIMENSION_BIG], CGU_FLOAT data2[MAX_ENTRIES][MAX_DIMENSION_BIG], CGU_INT numEntries, CGU_INT dimension) +{ + int i, j; + CGU_FLOAT t = 0; + for (i = 0; i < numEntries; i++) + for (j = 0; j < dimension; j++) + t += (data[i][j] - data2[i][j])*(data[i][j] - data2[i][j]); + + return t; +}; + +// input: +// +// v_ points, might be uncentered +// k - number of points in the ramp +// n - number of points in v_ +// +// output: +// index, uncentered, in the range 0..k-1 +// + +void quant_AnD_Shell(CGU_FLOAT* v_, CGU_INT k, CGU_INT n, CGU_INT *idx) +{ +#define MAX_BLOCK MAX_ENTRIES + CGU_INT i, j; + CGU_FLOAT v[MAX_BLOCK]; + CGU_FLOAT z[MAX_BLOCK]; + a d[MAX_BLOCK]; + CGU_FLOAT l; + CGU_FLOAT mm; + CGU_FLOAT r = 0; + CGU_INT mi; + + CGU_FLOAT m, M, s, dm = 0.; + m = M = v_[0]; + + for (i = 1; i < n; i++) { + m = m < v_[i] ? m : v_[i]; + M = M > v_[i] ? M : v_[i]; + } + if (M == m) { + for (i = 0; i < n; i++) + idx[i] = 0; + return; + } + + //assert(M - m >0); + s = (k - 1) / (M - m); + for (i = 0; i < n; i++) { + v[i] = v_[i] * s; + + idx[i] = (int)(z[i] = (v[i] + 0.5f /* stabilizer*/ - m * s)); //floorf(v[i] + 0.5f /* stabilizer*/ - m *s)); + + d[i].d = v[i] - z[i] - m * s; + d[i].i = i; + dm += d[i].d; + r += d[i].d*d[i].d; + } + if (n*r - dm * dm >= (CGU_FLOAT)(n - 1) / 4 /*slack*/ / 2) { + + dm /= (CGU_FLOAT)n; + + for (i = 0; i < n; i++) + d[i].d -= dm; + + + //!!! 
Need an OpenCL version of qsort +#ifdef USE_QSORT + qsort((void*)&d, n, sizeof(a), a_compare); +#else + { + CGU_INT tmp; + CGU_FLOAT tmp_d; + for (i = 1; i < n; i++) { + for (j = i; j > 0; j--) + { + if (d[j - 1].d > d[j].d) + { + tmp = d[j].i; + tmp_d = d[j].d; + d[j].i = d[j - 1].i; + d[j].d = d[j - 1].d; + d[j - 1].i = tmp; + d[j - 1].d = tmp_d; + } + } + } + } +#endif + // got into fundamental simplex + // move coordinate system origin to its center + for (i = 0; i < n; i++) + d[i].d -= (2.0f*(CGU_FLOAT)i + 1.0f - (CGU_FLOAT)n) / 2.0f / (CGU_FLOAT)n; + + mm = l = 0.; + j = -1; + for (i = 0; i < n; i++) { + l += d[i].d; + if (l < mm) { + mm = l; + j = i; + } + } + + // position which should be in 0 + j = j + 1; + j = j % n; + + for (i = j; i < n; i++) + idx[d[i].i]++; + } + // get rid of an offset in idx + mi = idx[0]; + for (i = 1; i < n; i++) + mi = mi < idx[i] ? mi : idx[i]; + + for (i = 0; i < n; i++) + idx[i] -= mi; +} + +CGU_FLOAT optQuantAnD_d(CGU_FLOAT data[MAX_ENTRIES][MAX_DIMENSION_BIG], + CGU_INT numEntries, + CGU_INT numClusters, + CGU_INT index[MAX_ENTRIES], + CGU_FLOAT out[MAX_ENTRIES][MAX_DIMENSION_BIG], + CGU_FLOAT direction[MAX_DIMENSION_BIG], CGU_FLOAT *step, + CGU_INT dimension, + CGU_FLOAT quality) +{ + CGU_INT index_[MAX_ENTRIES]; + + CGU_INT maxTry = (int)(MAX_TRY * quality); + CGU_INT try_two = 50; + + CGU_INT i, j, k; + CGU_FLOAT t, s; + + CGU_FLOAT centered[MAX_ENTRIES][MAX_DIMENSION_BIG]; + + CGU_FLOAT mean[MAX_DIMENSION_BIG]; + + CGU_FLOAT cov[MAX_DIMENSION_BIG][MAX_DIMENSION_BIG]; + + CGU_FLOAT projected[MAX_ENTRIES]; + + CGU_INT order_[MAX_ENTRIES]; + + + for (i = 0; i < numEntries; i++) + for (j = 0; j < dimension; j++) + centered[i][j] = data[i][j]; + + centerInPlace_d(centered, numEntries, mean, dimension); + covariance_d(centered, numEntries, cov, dimension); + + // check if they all are the same + + t = 0; + for (j = 0; j < dimension; j++) + t += cov[j][j]; + + if (numEntries == 0) { + for (i = 0; i < numEntries; i++) { + 
index[i] = 0; + for (j = 0; j < dimension; j++) + out[i][j] = mean[j]; + } + return 0.0f; + } + + eigenVector_d(cov, direction, dimension); + project_d(centered, numEntries, direction, projected, dimension); + + for (i = 0; i < maxTry; i++) + { + CGU_INT done = 0; + + if (i) + { + do + { + CGU_FLOAT q; + q = s = t = 0; + + for (k = 0; k < numEntries; k++) + { + s += index[k]; + t += index[k] * index[k]; + } + + for (j = 0; j < dimension; j++) + { + direction[j] = 0; + for (k = 0; k < numEntries; k++) + direction[j] += centered[k][j] * index[k]; + q += direction[j] * direction[j]; + + } + + s /= (CGU_FLOAT)numEntries; + t = t - s * s * (CGU_FLOAT)numEntries; + //assert(t != 0); + t = (t == 0.0f ? 0.0f : 1.0f / t); + // We need to requantize + + q = sqrtf(q); + t *= q; + + if (q != 0) + for (j = 0; j < dimension; j++) + direction[j] /= q; + + // direction normalized + + project_d(centered, numEntries, direction, projected, dimension); + sortProjection(projected, order_, numEntries); + + CGU_INT index__[MAX_ENTRIES]; + + // it's projected and centered; cluster centers are (index[i]-s)*t (*dir) + k = 0; + for (j = 0; j < numEntries; j++) + { + while (projected[order_[j]] > (k + 0.5 - s)*t && k < numClusters - 1) + k++; + index__[order_[j]] = k; + } + done = 1; + for (j = 0; j < numEntries; j++) + { + done = (done && (index__[j] == index[j])); + index[j] = index__[j]; + } + } while (!done && try_two--); + + if (i == 1) + for (j = 0; j < numEntries; j++) + index_[j] = index[j]; + else + { + done = 1; + for (j = 0; j < numEntries; j++) + { + done = (done && (index_[j] == index[j])); + index_[j] = index_[j]; + } + if (done) + break; + + } + } + + quant_AnD_Shell(projected, numClusters, numEntries, index); + } + s = t = 0; + + CGU_FLOAT q = 0; + + for (k = 0; k < numEntries; k++) + { + s += index[k]; + t += index[k] * index[k]; + } + + for (j = 0; j < dimension; j++) + { + direction[j] = 0; + for (k = 0; k < numEntries; k++) + direction[j] += centered[k][j] * index[k]; + q 
+= direction[j] * direction[j]; + } + + s /= (CGU_FLOAT)numEntries; + + t = t - s * s * (CGU_FLOAT)numEntries; + + //assert(t != 0); + + t = (t == 0.0 ? 0.0f : 1.0f / t); + + for (i = 0; i < numEntries; i++) + for (j = 0; j < dimension; j++) + out[i][j] = mean[j] + direction[j] * t*(index[i] - s); + + // normalize direction for output + + q = sqrtf(q); + *step = t * q; + for (j = 0; j < dimension; j++) + direction[j] /= q; + + return totalError_d(data, out, numEntries, dimension); +} + +void clampF16Max(CGU_FLOAT EndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_BOOL isSigned) +{ + for (CGU_INT region = 0; region < 2; region++) + for (CGU_INT ab = 0; ab < 2; ab++) + for (CGU_INT rgb = 0; rgb < 3; rgb++) + { + if (isSigned) + { + if (EndPoints[region][ab][rgb] < -FLT16_MAX) EndPoints[region][ab][rgb] = -FLT16_MAX; + else if (EndPoints[region][ab][rgb] > FLT16_MAX) EndPoints[region][ab][rgb] = FLT16_MAX; + } + else + { + if (EndPoints[region][ab][rgb] < 0.0) EndPoints[region][ab][rgb] = 0.0; + else if (EndPoints[region][ab][rgb] > FLT16_MAX) EndPoints[region][ab][rgb] = FLT16_MAX; + } + // Zero region + // if ((EndPoints[region][ab][rgb] > -0.01) && ((EndPoints[region][ab][rgb] < 0.01))) EndPoints[region][ab][rgb] = 0.0; + } +} + +//===================================================================================================================== +#define LOG_CL_BASE 2 +#define BIT_BASE 5 +#define LOG_CL_RANGE 5 +#define BIT_RANGE 9 +#define MAX_CLUSTERS_BIG 16 +#define BTT(bits) (bits-BIT_BASE) +#define CLT(cl) (cl-LOG_CL_BASE) + +#ifdef USE_BC6RAMPS + +int spidx(int in_data, int in_clogs, int in_bits, int in_p2, int in_o1, int in_o2, int in_i) +{ + // use BC7 sp_idx + return 0; +} + +float sperr(int in_data, int clogs, int bits, int p2, int o1, int o2) +{ + // use BC7 sp_err + return 0,0f; +} +#endif + +__constant CGU_FLOAT rampLerpWeightsBC6[5][16] = +{ + { 0.0 }, // 0 bit index + { 0.0, 1.0 }, // 1 bit index + { 0.0, 21.0 / 64.0, 43.0 / 64.0, 1.0 
}, // 2 bit index + { 0.0, 9.0 / 64.0, 18.0 / 64.0, 27.0 / 64.0, 37.0 / 64.0, 46.0 / 64.0, 55.0 / 64.0, 1.0 }, // 3 bit index + { 0.0, 4.0 / 64.0, 9.0 / 64.0, 13.0 / 64.0, 17.0 / 64.0, 21.0 / 64.0, 26.0 / 64.0, 30.0 / 64.0, + 34.0 / 64.0, 38.0 / 64.0, 43.0 / 64.0, 47.0 / 64.0, 51.0 / 64.0, 55.0 / 64.0, 60.0 / 64.0, 1.0 } // 4 bit index +}; + + +CGU_FLOAT rampf(CGU_INT clogs, CGU_FLOAT p1, CGU_FLOAT p2, CGU_INT indexPos) +{ + // (clogs+ LOG_CL_BASE) starts from 2 to 4 + return (CGU_FLOAT)p1 + rampLerpWeightsBC6[clogs + LOG_CL_BASE][indexPos] * (p2 - p1); +} + +CGU_INT all_same_d(CGU_FLOAT d[][MAX_DIMENSION_BIG], CGU_INT n, CGU_INT dimension) +{ + CGU_INT i, j; + CGU_INT same = 1; + for (i = 1; i < n; i++) + for (j = 0; j < dimension; j++) + same = same && (d[0][j] == d[i][j]); + + return(same); +} + +// return the max index from a set of indexes +CGU_INT max_index(CGU_INT a[], CGU_INT n) +{ + CGU_INT i, m = a[0]; + for (i = 0; i < n; i++) + m = m > a[i] ? m : a[i]; + return (m); +} + +CGU_INT cluster_mean_d_d(CGU_FLOAT d[MAX_ENTRIES][MAX_DIMENSION_BIG], CGU_FLOAT mean[MAX_ENTRIES][MAX_DIMENSION_BIG], CGU_INT index[], CGU_INT i_comp[], CGU_INT i_cnt[], CGU_INT n, CGU_INT dimension) +{ + // unused index values are underfined + CGU_INT i, j, k; + //assert(n!=0); + + for (i = 0; i < n; i++) + for (j = 0; j < dimension; j++) { + // assert(index[i] index[k] ? Mi : index[k]; + } + D = 1; + for (d = 2; d <= Mi - mi; d++) { + + for (k = 0; k < numEntries; k++) + if ((index[k] - mi) % d != 0) + break; + if (k >= numEntries) + D = d; + } + for (k = 0; k < numEntries; k++) + index[k] = (index[k] - mi) / D; +} + +CGU_INT max_int(CGU_INT a[], CGU_INT n) +{ + CGU_INT i, m = a[0]; + for (i = 0; i < n; i++) + m = m > a[i] ? 
m : a[i]; + return (m); +} + +__constant CGU_INT npv_nd[2][2 * MAX_DIMENSION_BIG] = +{ + { 1,2,4,8,16,32,0,0 }, //dimension = 3 + { 1,2,4,0,0,0,0,0 } //dimension = 4 +}; + +__constant short par_vectors_nd[2][8][128][2][MAX_DIMENSION_BIG] = +{ + { // Dimension = 3 + { + { { 0,0,0,0 },{ 0,0,0,0 } }, + { { 0,0,0,0 },{ 0,0,0,0 } } + }, + + // 3*n+1 BCC 3*n+1 Cartesian 3*n //same parity + { // SAME_PAR + { { 0,0,0 },{ 0,0,0 } }, + { { 1,1,1 },{ 1,1,1 } } + }, + // 3*n+2 BCC 3*n+1 BCC 3*n+1 + { // BCC + { { 0,0,0 },{ 0,0,0 } }, + { { 0,0,0 },{ 1,1,1 } }, + { { 1,1,1 },{ 0,0,0 } }, + { { 1,1,1 },{ 1,1,1 } } + }, + // 3*n+3 FCC ??? // ?????? + // BCC with FCC same or inverted, symmetric + { // BCC_SAME_FCC + { { 0,0,0 },{ 0,0,0 } }, + { { 1,1,0 },{ 1,1,0 } }, + { { 1,0,1 },{ 1,0,1 } }, + { { 0,1,1 },{ 0,1,1 } }, + + { { 0,0,0 },{ 1,1,1 } }, + { { 1,1,1 },{ 0,0,0 } }, + { { 0,1,0 },{ 0,1,0 } }, // ?? + { { 1,1,1 },{ 1,1,1 } }, + + }, + // 3*n+4 FCC 3*n+2 FCC 3*n+2 + { + + { { 0,0,0 },{ 0,0,0 } }, + { { 1,1,0 },{ 0,0,0 } }, + { { 1,0,1 },{ 0,0,0 } }, + { { 0,1,1 },{ 0,0,0 } }, + + { { 0,0,0 },{ 1,1,0 } }, + { { 1,1,0 },{ 1,1,0 } }, + { { 1,0,1 },{ 1,1,0 } }, + { { 0,1,1 },{ 1,1,0 } }, + + { { 0,0,0 },{ 1,0,1 } }, + { { 1,1,0 },{ 1,0,1 } }, + { { 1,0,1 },{ 1,0,1 } }, + { { 0,1,1 },{ 1,0,1 } }, + + { { 0,0,0 },{ 0,1,1 } }, + { { 1,1,0 },{ 0,1,1 } }, + { { 1,0,1 },{ 0,1,1 } }, + { { 0,1,1 },{ 0,1,1 } } + }, + + + // 3*n+5 Cartesian 3*n+3 FCC 3*n+2 //D^*[6] + { + + { { 0,0,0 },{ 0,0,0 } }, + { { 1,1,0 },{ 0,0,0 } }, + { { 1,0,1 },{ 0,0,0 } }, + { { 0,1,1 },{ 0,0,0 } }, + + { { 0,0,0 },{ 1,1,0 } }, + { { 1,1,0 },{ 1,1,0 } }, + { { 1,0,1 },{ 1,1,0 } }, + { { 0,1,1 },{ 1,1,0 } }, + + { { 0,0,0 },{ 1,0,1 } }, + { { 1,1,0 },{ 1,0,1 } }, + { { 1,0,1 },{ 1,0,1 } }, + { { 0,1,1 },{ 1,0,1 } }, + + { { 0,0,0 },{ 0,1,1 } }, + { { 1,1,0 },{ 0,1,1 } }, + { { 1,0,1 },{ 0,1,1 } }, + { { 0,1,1 },{ 0,1,1 } }, + + + { { 1,0,0 },{ 1,1,1 } }, + { { 0,1,0 },{ 1,1,1 } }, + { { 0,0,1 },{ 1,1,1 } }, + 
{ { 1,1,1 },{ 1,1,1 } }, + + { { 1,0,0 },{ 0,0,1 } }, + { { 0,1,0 },{ 0,0,1 } }, + { { 0,0,1 },{ 0,0,1 } }, + { { 1,1,1 },{ 0,0,1 } }, + + { { 1,0,0 },{ 1,0,0 } }, + { { 0,1,0 },{ 1,0,0 } }, + { { 0,0,1 },{ 1,0,0 } }, + { { 1,1,1 },{ 1,0,0 } }, + + { { 1,0,0 },{ 0,1,0 } }, + { { 0,1,0 },{ 0,1,0 } }, + { { 0,0,1 },{ 0,1,0 } }, + { { 1,1,1 },{ 0,1,0 } } + } + },// Dimension = 3 + { // Dimension = 4 + { + { { 0,0,0,0 },{ 0,0,0,0 } }, + { { 0,0,0,0 },{ 0,0,0,0 } } + }, + + // 3*n+1 BCC 3*n+1 Cartesian 3*n //same parity + { // SAME_PAR + { { 0,0,0,0 },{ 0,0,0,0 } }, + { { 1,1,1,1 },{ 1,1,1,1 } } + }, + // 3*n+2 BCC 3*n+1 BCC 3*n+1 + { // BCC + { { 0,0,0,0 },{ 0,0,0,0 } }, + { { 0,0,0,0 },{ 1,1,1,1 } }, + { { 1,1,1,1 },{ 0,0,0,0 } }, + { { 1,1,1,1 },{ 1,1,1,1 } } + }, + // 3 PBIT + { + { { 0,0,0,0 },{ 0,0,0,0 } }, + { { 0,0,0,0 },{ 0,1,1,1 } }, + { { 0,1,1,1 },{ 0,0,0,0 } }, + { { 0,1,1,1 },{ 0,1,1,1 } }, + + { { 1,0,0,0 },{ 1,0,0,0 } }, + { { 1,0,0,0 },{ 1,1,1,1 } }, + { { 1,1,1,1 },{ 1,0,0,0 } }, + { { 1,1,1,1 },{ 1,1,1,1 } } + }, + + // 4 PBIT + { + { { 0,0,0,0 },{ 0,0,0,0 } }, + { { 0,0,0,0 },{ 0,1,1,1 } }, + { { 0,1,1,1 },{ 0,0,0,0 } }, + { { 0,1,1,1 },{ 0,1,1,1 } }, + + { { 1,0,0,0 },{ 1,0,0,0 } }, + { { 1,0,0,0 },{ 1,1,1,1 } }, + { { 1,1,1,1 },{ 1,0,0,0 } }, + { { 1,1,1,1 },{ 1,1,1,1 } }, + + { { 0,0,0,0 },{ 0,0,0,0 } }, + { { 0,0,0,0 },{ 0,0,1,1 } }, + { { 0,0,1,1 },{ 0,0,0,0 } }, + { { 0,1,0,1 },{ 0,1,0,1 } }, + + { { 1,0,0,0 },{ 1,0,0,0 } }, + { { 1,0,0,0 },{ 1,0,1,1 } }, + { { 1,0,1,1 },{ 1,0,0,0 } }, + { { 1,1,0,1 },{ 1,1,0,1 } }, + + }, + + } // Dimension = 4 + +}; + +CGU_INT get_par_vector(CGU_INT dim1, CGU_INT dim2, CGU_INT dim3, CGU_INT dim4, CGU_INT dim5) +{ + return par_vectors_nd[dim1][dim2][dim3][dim4][dim5]; +} + +CGU_FLOAT quant_single_point_d(CGU_FLOAT data[MAX_ENTRIES][MAX_DIMENSION_BIG], + CGU_INT numEntries, CGU_INT index[MAX_ENTRIES], + CGU_FLOAT out[MAX_ENTRIES][MAX_DIMENSION_BIG], + CGU_INT epo_1[2][MAX_DIMENSION_BIG], + CGU_INT Mi_, // last 
cluster + CGU_INT type, + CGU_INT dimension) +{ + if (dimension < 3) return CMP_FLOAT_MAX; + + CGU_INT i, j; + + CGU_FLOAT err_0 = CMP_FLOAT_MAX; + CGU_FLOAT err_1 = CMP_FLOAT_MAX; + + CGU_INT idx = 0; + CGU_INT idx_1 = 0; + + CGU_INT epo_0[2][MAX_DIMENSION_BIG]; + + CGU_INT use_par = (type != 0); + + CGU_INT clogs = 0; + i = Mi_ + 1; + while (i >>= 1) + clogs++; + + // assert((1< sperr(tc, CLT(clogs), BTT(bits[j]), t1, t2, i)) + dr[j] = tc; + else if (sperr(tf, CLT(clogs), BTT(bits[j]), t1, t2, i) < sperr(tc, CLT(clogs), BTT(bits[j]), t1, t2, i)) + dr[j] = tf; + else +#endif + dr[j] = (int)floorf(data[0][j] + 0.5f); + +#ifdef USE_BC6RAMPS + tr = sperr(dr[j], CLT(clogs), BTT(bits[j]), t1, t2, i) + 2.0f * sqrtf(sperr(dr[j], CLT(clogs), BTT(bits[j]), t1, t2, i)) * fabsf((float)dr[j] - data[0][j]) + + (dr[j] - data[0][j])* (dr[j] - data[0][j]); + if (tr < t_) + { + t_ = tr; +#else + t_ = 0; +#endif + + t1o[j] = t1; + t2o[j] = t2; + dr_0[j] = dr[j]; +#ifdef USE_BC6RAMPS + if ((dr_0[j] < 0) || (dr_0[j] > 255)) + { + dr_0[j] = 0; // Error! + } + } +#endif + } // B + } //C + + t += t_; + } // D + + + if (t < err_0) + { + + idx = i; + + for (j = 0; j < dimension; j++) + { +#ifdef USE_BC6RAMPS + CGU_INT p1 = CLT(clogs); // < 3 + CGU_INT p2 = BTT(bits[j]); // < 4 + CGU_INT in_data = dr_0[j]; // < SP_ERRIDX_MAX + CGU_INT p4 = t1o[j]; // < 2 + CGU_INT p5 = t2o[j]; // < 2 + CGU_INT p6 = i; // < 16 + + // New spidx + epo_0[0][j] = spidx(in_data, p1, p2, p4, p5, p6, 0); + epo_0[1][j] = spidx(in_data, p1, p2, p4, p5, p6, 1); + + if (epo_0[1][j] >= SP_ERRIDX_MAX) + { + epo_0[1][j] = 0; // Error!! 
+ } +#else + epo_0[0][j] = 0; + epo_0[1][j] = 0; +#endif + } + err_0 = t; + } + if (err_0 == 0) + break; + } // E + + if (err_0 < err_1) + { + idx_1 = idx; + for (j = 0; j < dimension; j++) + { + epo_1[0][j] = epo_0[0][j]; + epo_1[1][j] = epo_0[1][j]; + } + err_1 = err_0; + } + + if (err_1 == 0) + break; + } //1 + +for (i = 0; i < numEntries; i++) +{ + index[i] = idx_1; + for (j = 0; j < dimension; j++) + { + CGU_INT p1 = CLT(clogs); // < 3 + CGU_INT p3 = epo_1[0][j]; // < SP_ERRIDX_MAX + CGU_INT p4 = epo_1[1][j]; // < SP_ERRIDX_MAX + CGU_INT p5 = idx_1; // < 16 +#pragma warning( push ) +#pragma warning(disable:4244) + out[i][j] = (int)rampf(p1, p3, p4, p5); +#pragma warning( pop ) + } +} +return err_1 * numEntries; +} + +//======================================================================================================================== + +CGU_FLOAT ep_shaker_HD(CGU_FLOAT data[MAX_ENTRIES][MAX_DIMENSION_BIG], + CGU_INT numEntries, + CGU_INT index_[MAX_ENTRIES], + CGU_FLOAT out[MAX_ENTRIES][MAX_DIMENSION_BIG], + CGU_INT epo_code_out[2][MAX_DIMENSION_BIG], + CGU_INT Mi_, // last cluster + CGU_INT bits[3], // including parity + CGU_INT channels3or4 +) +{ + CGU_INT i, j, k; + CGU_INT use_par = 0; + CGU_INT clogs = 0; + + i = Mi_ + 1; + while (i >>= 1) + clogs++; + + CGU_FLOAT mean[MAX_DIMENSION_BIG]; + CGU_INT index[MAX_ENTRIES]; + CGU_INT Mi; + + CGU_INT maxTry = 1; + + for (k = 0; k < numEntries; k++) + { + index[k] = index_[k]; + } + + CGU_INT done; + CGU_INT change; + + CGU_INT better; + + CGU_FLOAT err_o = CMP_FLOAT_MAX; + CGU_FLOAT out_2[MAX_ENTRIES][MAX_DIMENSION_BIG]; + CGU_INT idx_2[MAX_ENTRIES]; + CGU_INT epo_2[2][MAX_DIMENSION_BIG]; + + CGU_INT max_bits[MAX_DIMENSION_BIG]; + CGU_INT type = bits[0] % (2 * channels3or4); + + for (j = 0; j < channels3or4; j++) + max_bits[j] = (bits[0] + 2 * channels3or4 - 1) / (2 * channels3or4); + + + // handled below automatically + CGU_INT alls = all_same_d(data, numEntries, channels3or4); + + mean_d_d(data, mean, 
numEntries, channels3or4); + + do { + index_collapse_kernel(index, numEntries); + + Mi = max_index(index, numEntries); // index can be from requantizer + + CGU_INT p, q; + CGU_INT p0 = -1, q0 = -1; + + CGU_FLOAT err_2 = CMP_FLOAT_MAX; + + if (Mi == 0) { + CGU_FLOAT t; + CGU_INT epo_0[2][MAX_DIMENSION_BIG]; + // either sinle point from the beginning or collapsed index + if (alls) { + t = quant_single_point_d(data, numEntries, index, out_2, epo_0, Mi_, type, channels3or4); + } + else + { + quant_single_point_d(&mean, numEntries, index, out_2, epo_0, Mi_, type, channels3or4); + t = totalError_d(data, out_2, numEntries, channels3or4); + } + + if (t < err_o) { + for (k = 0; k < numEntries; k++) { + index_[k] = index[k]; + for (j = 0; j < channels3or4; j++) { + out[k][j] = out_2[k][j]; + epo_code_out[0][j] = epo_0[0][j]; + epo_code_out[1][j] = epo_0[1][j]; + } + }; + err_o = t; + } + return err_o; + } + + //=============================== + // We have ramp colors to process + //=============================== + + for (q = 1; Mi != 0 && q*Mi <= Mi_; q++) // does not work for single point collapsed index!!! 
+ { + for (p = 0; p <= Mi_ - q * Mi; p++) + { + + //------------------------------------- + // set a new index data to try + //------------------------------------- + CGU_INT cidx[MAX_ENTRIES]; + + for (k = 0; k < numEntries; k++) + { + cidx[k] = index[k] * q + p; + } + + CGU_FLOAT epa[2][MAX_DIMENSION_BIG]; + + // + // solve RMS problem for center + // + + CGU_FLOAT im[2][2] = { { 0,0 },{ 0,0 } }; // matrix /inverse matrix + CGU_FLOAT rp[2][MAX_DIMENSION_BIG]; // right part for RMS fit problem + + // get ideal clustr centers + CGU_FLOAT cc[MAX_CLUSTERS_BIG][MAX_DIMENSION_BIG]; + CGU_INT index_cnt[MAX_CLUSTERS_BIG]; // count of index entries + CGU_INT index_comp[MAX_CLUSTERS_BIG]; // compacted index + CGU_INT index_ncl; // number of unique indexes + + index_ncl = cluster_mean_d_d(data, cc, cidx, index_comp, index_cnt, numEntries, channels3or4); // unrounded + + for (i = 0; i < index_ncl; i++) + for (j = 0; j < channels3or4; j++) + cc[index_comp[i]][j] = (CGU_FLOAT)floorf(cc[index_comp[i]][j] + 0.5f); // more or less ideal location + + for (j = 0; j < channels3or4; j++) + { + rp[0][j] = rp[1][j] = 0; + } + + // weight with cnt if runnning on compacted index + for (k = 0; k < numEntries; k++) + { + im[0][0] += (Mi_ - cidx[k])* (Mi_ - cidx[k]); + im[0][1] += cidx[k] * (Mi_ - cidx[k]); // im is symmetric + im[1][1] += cidx[k] * cidx[k]; + + for (j = 0; j < channels3or4; j++) + { + rp[0][j] += (Mi_ - cidx[k]) * cc[cidx[k]][j]; + rp[1][j] += cidx[k] * cc[cidx[k]][j]; + } + } + + CGU_FLOAT dd = im[0][0] * im[1][1] - im[0][1] * im[0][1]; + + //assert(dd !=0); + + // dd=0 means that cidx[k] and (Mi_-cidx[k]) collinear which implies only one active index; + // taken care of separately + + im[1][0] = im[0][0]; + im[0][0] = im[1][1] / dd; + im[1][1] = im[1][0] / dd; + im[1][0] = im[0][1] = -im[0][1] / dd; + + for (j = 0; j < channels3or4; j++) { + epa[0][j] = (im[0][0] * rp[0][j] + im[0][1] * rp[1][j])*Mi_; + epa[1][j] = (im[1][0] * rp[0][j] + im[1][1] * rp[1][j])*Mi_; + } + + 
CGU_FLOAT err_1 = CMP_FLOAT_MAX; + CGU_FLOAT out_1[MAX_ENTRIES][MAX_DIMENSION_BIG]; + CGU_INT idx_1[MAX_ENTRIES]; + CGU_INT epo_1[2][MAX_DIMENSION_BIG]; + CGU_INT s1 = 0; + CGU_FLOAT epd[2][MAX_DIMENSION_BIG][2]; // first second, coord, begin range end range + + for (j = 0; j < channels3or4; j++) + { + for (i = 0; i < 2; i++) + { // set range + epd[i][j][0] = epd[i][j][1] = epa[i][j]; + epd[i][j][1] += ((1 << bits[j]) - 1 - (int)epd[i][j][1] < (1 << use_par) ? + (1 << bits[j]) - 1 - (int)epd[i][j][1] : (1 << use_par)) & (~use_par); + } + } + + CGU_FLOAT ce[MAX_ENTRIES][MAX_CLUSTERS_BIG][MAX_DIMENSION_BIG]; + CGU_FLOAT err_0 = 0; + CGU_FLOAT out_0[MAX_ENTRIES][MAX_DIMENSION_BIG]; + CGU_INT idx_0[MAX_ENTRIES]; + + for (i = 0; i < numEntries; i++) + { + CGU_FLOAT d[4]; + d[0] = data[i][0]; + d[1] = data[i][1]; + d[2] = data[i][2]; + d[3] = data[i][3]; + for (j = 0; j < (1 << clogs); j++) + for (k = 0; k < channels3or4; k++) + { + ce[i][j][k] = (rampf(CLT(clogs), epd[0][k][0], epd[1][k][0], j) - d[k])* + (rampf(CLT(clogs), epd[0][k][0], epd[1][k][0], j) - d[k]); + } + } + + CGU_INT s = 0, p1, g; + CGU_INT ei0 = 0, ei1 = 0; + + for (p1 = 0; p1 < 64; p1++) + { + CGU_INT j0 = 0; + + // Gray code increment + g = p1 & (-p1); + + err_0 = 0; + + for (j = 0; j < channels3or4; j++) + { + if (((g >> (2 * j)) & 0x3) != 0) + { + j0 = j; + // new cords + ei0 = (((s^g) >> (2 * j)) & 0x1); + ei1 = (((s^g) >> (2 * j + 1)) & 0x1); + } + } + s = s ^ g; + err_0 = 0; + + for (i = 0; i < numEntries; i++) + { + CGU_FLOAT d[4]; + d[0] = data[i][0]; + d[1] = data[i][1]; + d[2] = data[i][2]; + d[3] = data[i][3]; + CGU_INT ci = 0; + CGU_FLOAT cmin = CMP_FLOAT_MAX; + + for (j = 0; j < (1 << clogs); j++) + { + float t_ = 0.; + ce[i][j][j0] = (rampf(CLT(clogs), epd[0][j0][ei0], epd[1][j0][ei1], j) - d[j0])* + (rampf(CLT(clogs), epd[0][j0][ei0], epd[1][j0][ei1], j) - d[j0]); + for (k = 0; k < channels3or4; k++) + { + t_ += ce[i][j][k]; + } + + if (t_ < cmin) + { + cmin = t_; + ci = j; + } + } + + 
idx_0[i] = ci; + for (k = 0; k < channels3or4; k++) + { + out_0[i][k] = rampf(CLT(clogs), epd[0][k][ei0], epd[1][k][ei1], ci); + } + err_0 += cmin; + } + + if (err_0 < err_1) + { + // best in the curent ep cube run + for (i = 0; i < numEntries; i++) + { + idx_1[i] = idx_0[i]; + for (j = 0; j < channels3or4; j++) + out_1[i][j] = out_0[i][j]; + } + err_1 = err_0; + + s1 = s; // epo coding + } + } + + // reconstruct epo + for (j = 0; j < channels3or4; j++) + { + { + // new cords + ei0 = ((s1 >> (2 * j)) & 0x1); + ei1 = ((s1 >> (2 * j + 1)) & 0x1); + epo_1[0][j] = (int)epd[0][j][ei0]; + epo_1[1][j] = (int)epd[1][j][ei1]; + } + } + + if (err_1 < err_2) + { + // best in the curent ep cube run + for (i = 0; i < numEntries; i++) + { + idx_2[i] = idx_1[i]; + for (j = 0; j < channels3or4; j++) + out_2[i][j] = out_1[i][j]; + } + err_2 = err_1; + for (j = 0; j < channels3or4; j++) + { + epo_2[0][j] = epo_1[0][j]; + epo_2[1][j] = epo_1[1][j]; + } + p0 = p; + q0 = q; + } + } + } + + // change/better + change = 0; + for (k = 0; k < numEntries; k++) + change = change || (index[k] * q0 + p0 != idx_2[k]); + + better = err_2 < err_o; + + if (better) + { + for (k = 0; k < numEntries; k++) + { + index_[k] = index[k] = idx_2[k]; + for (j = 0; j < channels3or4; j++) + { + out[k][j] = out_2[k][j]; + epo_code_out[0][j] = epo_2[0][j]; + epo_code_out[1][j] = epo_2[1][j]; + } + } + err_o = err_2; + } + + done = !(change && better); + + if (maxTry > 0) maxTry--; + else maxTry = 0; + + } while (!done && maxTry); + + return err_o; +} + + +#ifndef ASPM_GPU +static CGU_INT g_aWeights3[] = { 0, 9, 18, 27, 37, 46, 55, 64 }; // 3 bit color Indices +static CGU_INT g_aWeights4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; // 4 bit color indices + +CGU_FLOAT lerpf(CGU_FLOAT a, CGU_FLOAT b, CGU_INT i, CGU_INT denom) +{ + assert(denom == 3 || denom == 7 || denom == 15); + assert(i >= 0 && i <= denom); + + CGU_INT *weights = NULL; + + switch (denom) + { + case 3: denom *= 5; i *= 5; 
// fall through to case 15 + case 7: weights = g_aWeights3; break; + case 15: weights = g_aWeights4; break; + default: assert(0); + } + return (a*weights[denom - i] + b * weights[i]) / 64.0f; +} +#else + +CGU_FLOAT lerpf(CGU_FLOAT a, CGU_FLOAT b, CGU_INT i, CGU_INT denom) +{ + CGU_INT g_aWeights3[] = { 0, 9, 18, 27, 37, 46, 55, 64 }; // 3 bit color Indices + CGU_INT g_aWeights4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; // 4 bit color indices + switch (denom) + { + case 7: return ((a*g_aWeights3[denom - i] + b * g_aWeights3[i]) / 64.0f); break; + case 15: return ((a*g_aWeights4[denom - i] + b * g_aWeights4[i]) / 64.0f); break; + default: + case 3:// fall through to case 15 + denom *= 5; + i *= 5; + return ((a*g_aWeights3[denom - i] + b * g_aWeights3[i]) / 64.0f); break; + } +} +#endif + +void palitizeEndPointsF(BC6H_Encode_local *BC6H_data, CGU_FLOAT fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG]) +{ + // scale endpoints + CGU_FLOAT Ar, Ag, Ab, Br, Bg, Bb; + + + // Compose index colors from end points + if (BC6H_data->region == 1) + { + Ar = fEndPoints[0][0][0]; + Ag = fEndPoints[0][0][1]; + Ab = fEndPoints[0][0][2]; + Br = fEndPoints[0][1][0]; + Bg = fEndPoints[0][1][1]; + Bb = fEndPoints[0][1][2]; + + for (CGU_INT i = 0; i < 16; i++) + { + + // Red + BC6H_data->Paletef[0][i].x = lerpf(Ar, Br, i, 15); + // Green + BC6H_data->Paletef[0][i].y = lerpf(Ag, Bg, i, 15); + // Blue + BC6H_data->Paletef[0][i].z = lerpf(Ab, Bb, i, 15); + } + + } + else //mode.type == BC6_TWO + { + for (CGU_INT region = 0; region < 2; region++) + { + Ar = fEndPoints[region][0][0]; + Ag = fEndPoints[region][0][1]; + Ab = fEndPoints[region][0][2]; + Br = fEndPoints[region][1][0]; + Bg = fEndPoints[region][1][1]; + Bb = fEndPoints[region][1][2]; + for (CGU_INT i = 0; i < 8; i++) + { + // Red + BC6H_data->Paletef[region][i].x = lerpf(Ar, Br, i, 7); + // Greed + BC6H_data->Paletef[region][i].y = lerpf(Ag, Bg, i, 7); + // Blue + BC6H_data->Paletef[region][i].z 
= lerpf(Ab, Bb, i, 7); + } + + } + } +} + +CGU_FLOAT CalcShapeError(BC6H_Encode_local *BC6H_data, CGU_FLOAT fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_BOOL SkipPallet) +{ + CGU_INT maxPallet; + CGU_INT subset = 0; + CGU_FLOAT totalError = 0.0f; + CGU_INT region = (BC6H_data->region - 1); + + if (region == 0) + maxPallet = 16; + else + maxPallet = 8; + + if (!SkipPallet) + palitizeEndPointsF(BC6H_data, fEndPoints); + + for (CGU_INT i = 0; i < MAX_SUBSET_SIZE; i++) + { + CGU_FLOAT error = 0.0f; + CGU_FLOAT bestError = 0.0f; + + if (region == 0) + { + subset = 0; + } + else + { + // get the shape subset 0 or 1 + subset = BC6_PARTITIONS[BC6H_data->d_shape_index][i]; + } + + // initialize bestError to the difference for first data + bestError = abs(BC6H_data->din[i][0] - BC6H_data->Paletef[subset][0].x) + + abs(BC6H_data->din[i][1] - BC6H_data->Paletef[subset][0].y) + + abs(BC6H_data->din[i][2] - BC6H_data->Paletef[subset][0].z); + + // loop through the rest of the data until find the best error + for (CGU_INT j = 1; j < maxPallet && bestError > 0; j++) + { + error = abs(BC6H_data->din[i][0] - BC6H_data->Paletef[subset][j].x) + + abs(BC6H_data->din[i][1] - BC6H_data->Paletef[subset][j].y) + + abs(BC6H_data->din[i][2] - BC6H_data->Paletef[subset][j].z); + + if (error <= bestError) + bestError = error; + else + break; + } + totalError += bestError; + } + + return totalError; +} + +CGU_FLOAT FindBestPattern(BC6H_Encode_local * BC6H_data, CGU_BOOL TwoRegionShapes, CGU_INT8 shape_pattern, CGU_FLOAT quality) +{ + // Index bit size for the patterns been used. + // All two zone shapes have 3 bits per color, max index value < 8 + // All one zone shapes gave 4 bits per color, max index value < 16 + CGU_INT8 Index_BitSize = TwoRegionShapes ? 8 : 16; + CGU_INT8 max_subsets = TwoRegionShapes ? 
2 : 1; + CGU_FLOAT direction[NCHANNELS]; + CGU_FLOAT step; + + BC6H_data->region = max_subsets; + BC6H_data->index = 0; + BC6H_data->d_shape_index = shape_pattern; + memset((CGU_UINT8 *)BC6H_data->partition, 0, sizeof(BC6H_data->partition)); + memset((CGU_UINT8 *)BC6H_data->shape_indices, 0, sizeof(BC6H_data->shape_indices)); + + // Get the pattern to encode with + Partition(shape_pattern, // Shape pattern we want to get + BC6H_data->din, // Input data + BC6H_data->partition, // Returns the patterned shape data + BC6H_data->entryCount, // counts the number of pixel used in each subset region num of 0's amd 1's + max_subsets, // Table Shapes to use eithe one regions 1 or two regions 2 + 3); // rgb no alpha always = 3 + + CGU_FLOAT error[MAX_SUBSETS] = { 0.0, CMP_FLOAT_MAX,CMP_FLOAT_MAX }; + CGU_INT BestOutB = 0; + CGU_FLOAT BestError; //the lowest error from vector direction quantization + CGU_FLOAT BestError_endpts; //the lowest error from endpoints extracted from the vector direction quantization + + CGU_FLOAT outB[2][2][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG]; + CGU_INT shape_indicesB[2][MAX_SUBSETS][MAX_SUBSET_SIZE]; + + for (CGU_INT subset = 0; subset < max_subsets; subset++) + { + error[0] += optQuantAnD_d( + BC6H_data->partition[subset], // input data + BC6H_data->entryCount[subset], // number of input points above (not clear about 1, better to avoid) + Index_BitSize, // number of clusters on the ramp, 8 or 16 + shape_indicesB[0][subset], // output index, if not all points of the ramp used, 0 may not be assigned + outB[0][subset], // resulting quantization + direction, // direction vector of the ramp (check normalization) + &step, // step size (check normalization) + 3, // number of channels (always 3 = RGB for BC6H) + quality // Quality set number of retry to get good end points + // Max retries = MAX_TRY = 4000 when Quality is 1.0 + // Min = 0 and default with quality 0.05 is 200 times + ); + } + + BestError = error[0]; + BestOutB = 0; + + // The following code 
is almost complete - runs very slow and not sure if % of improvement is justified.. +#ifdef USE_SHAKERHD + // Valid only for 2 region shapes + if ((max_subsets > 1) && (quality > 0.80)) + { + CGU_INT tempIndices[MAX_SUBSET_SIZE]; + // CGU_INT temp_epo_code[2][2][MAX_DIMENSION_BIG]; + CGU_INT bits[3] = { 8,8,8 }; // Channel index bit size + + // CGU_FLOAT epo[2][MAX_DIMENSION_BIG]; + CGU_INT epo_code[MAX_SUBSETS][2][MAX_DIMENSION_BIG]; + // CGU_INT shakeSize = 8; + + error[1] = 0.0; + for (CGU_INT subset = 0; subset < max_subsets; subset++) + { + for (CGU_INT k = 0; k < BC6H_data->entryCount[subset]; k++) + { + tempIndices[k] = shape_indicesB[0][subset][k]; + } + + error[1] += ep_shaker_HD( + BC6H_data->partition[subset], + BC6H_data->entryCount[subset], + tempIndices, // output index, if not all points of the ramp used, 0 may not be assigned + outB[1][subset], // resulting quantization + epo_code[subset], + BC6H_data->entryCount[subset] - 1, + bits, + 3 + ); + + // error[1] += ep_shaker_2_d( + // BC6H_data.partition[subset], + // BC6H_data.entryCount[subset], + // tempIndices, // output index, if not all points of the ramp used, 0 may not be assigned + // outB[1][subset], // resulting quantization + // epo_code[subset], + // shakeSize, + // BC6H_data.entryCount[subset] - 1, + // bits[0], + // 3, + // epo + // ); + + + for (CGU_INT k = 0; k < BC6H_data->entryCount[subset]; k++) + { + shape_indicesB[1][subset][k] = tempIndices[k]; + } + + } // subsets + + if (BestError > error[1]) + { + BestError = error[1]; + BestOutB = 1; + for (CGU_INT subset = 0; subset < max_subsets; subset++) + { + for (CGU_INT k = 0; k < MAX_DIMENSION_BIG; k++) + { + BC6H_data->fEndPoints[subset][0][k] = (CGU_FLOAT)epo_code[subset][0][k]; + BC6H_data->fEndPoints[subset][1][k] = (CGU_FLOAT)epo_code[subset][1][k]; + } + } + } + + } +#endif + + // Save the best for BC6H data processing later + if (BestOutB == 0) + GetEndPoints(BC6H_data->fEndPoints, outB[BestOutB], max_subsets, 
BC6H_data->entryCount); + + memcpy((CGU_UINT8 *)BC6H_data->shape_indices, (CGU_UINT8 *)shape_indicesB[BestOutB], sizeof(BC6H_data->shape_indices)); + clampF16Max(BC6H_data->fEndPoints, BC6H_data->issigned); + + BestError_endpts = CalcShapeError(BC6H_data, BC6H_data->fEndPoints, false); + return BestError_endpts; +} + +#ifndef ASPM_GPU +void SaveDataBlock(BC6H_Encode_local *bc6h_format, CMP_GLOBAL CGU_UINT8 cmpout[COMPRESSED_BLOCK_SIZE]) +{ + BitHeader header(NULL, COMPRESSED_BLOCK_SIZE); + + // Save the RGB end point values + switch (bc6h_format->m_mode) + { + case 1: //0x00 + header.setvalue(0, 2, 0x00); + header.setvalue(2, 1, bc6h_format->gy, 4); // gy[4] + header.setvalue(3, 1, bc6h_format->by, 4); // by[4] + header.setvalue(4, 1, bc6h_format->bz, 4); // bz[4] + header.setvalue(5, 10, bc6h_format->rw); // 10: rw[9:0] + header.setvalue(15, 10, bc6h_format->gw); // 10: gw[9:0] + header.setvalue(25, 10, bc6h_format->bw); // 10: bw[9:0] + header.setvalue(35, 5, bc6h_format->rx); // 5: rx[4:0] + header.setvalue(40, 1, bc6h_format->gz, 4); // gz[4] + header.setvalue(41, 4, bc6h_format->gy); // 5: gy[3:0] + header.setvalue(45, 5, bc6h_format->gx); // 5: gx[4:0] + header.setvalue(50, 1, bc6h_format->bz); // 5: bz[0] + header.setvalue(51, 4, bc6h_format->gz); // 5: gz[3:0] + header.setvalue(55, 5, bc6h_format->bx); // 5: bx[4:0] + header.setvalue(60, 1, bc6h_format->bz, 1); // bz[1] + header.setvalue(61, 4, bc6h_format->by); // 5: by[3:0] + header.setvalue(65, 5, bc6h_format->ry); // 5: ry[4:0] + header.setvalue(70, 1, bc6h_format->bz, 2); // bz[2] + header.setvalue(71, 5, bc6h_format->rz); // 5: rz[4:0] + header.setvalue(76, 1, bc6h_format->bz, 3); // bz[3] + break; + case 2: // 0x01 + header.setvalue(0, 2, 0x01); + header.setvalue(2, 1, bc6h_format->gy, 5); // gy[5] + header.setvalue(3, 1, bc6h_format->gz, 4); // gz[4] + header.setvalue(4, 1, bc6h_format->gz, 5); // gz[5] + header.setvalue(5, 7, bc6h_format->rw); // rw[6:0] + header.setvalue(12, 1, bc6h_format->bz); 
// bz[0] + header.setvalue(13, 1, bc6h_format->bz, 1); // bz[1] + header.setvalue(14, 1, bc6h_format->by, 4); // by[4] + header.setvalue(15, 7, bc6h_format->gw); // gw[6:0] + header.setvalue(22, 1, bc6h_format->by, 5); // by[5] + header.setvalue(23, 1, bc6h_format->bz, 2); // bz[2] + header.setvalue(24, 1, bc6h_format->gy, 4); // gy[4] + header.setvalue(25, 7, bc6h_format->bw); // 7: bw[6:0] + header.setvalue(32, 1, bc6h_format->bz, 3); // bz[3] + header.setvalue(33, 1, bc6h_format->bz, 5); // bz[5] + header.setvalue(34, 1, bc6h_format->bz, 4); // bz[4] + header.setvalue(35, 6, bc6h_format->rx); // 6: rx[5:0] + header.setvalue(41, 4, bc6h_format->gy); // 6: gy[3:0] + header.setvalue(45, 6, bc6h_format->gx); // 6: gx[5:0] + header.setvalue(51, 4, bc6h_format->gz); // 6: gz[3:0] + header.setvalue(55, 6, bc6h_format->bx); // 6: bx[5:0] + header.setvalue(61, 4, bc6h_format->by); // 6: by[3:0] + header.setvalue(65, 6, bc6h_format->ry); // 6: ry[5:0] + header.setvalue(71, 6, bc6h_format->rz); // 6: rz[5:0] + break; + case 3: // 0x02 + header.setvalue(0, 5, 0x02); + header.setvalue(5, 10, bc6h_format->rw); // 11: rw[9:0] + header.setvalue(15, 10, bc6h_format->gw); // 11: gw[9:0] + header.setvalue(25, 10, bc6h_format->bw); // 11: bw[9:0] + header.setvalue(35, 5, bc6h_format->rx); // 5: rx[4:0] + header.setvalue(40, 1, bc6h_format->rw, 10); // rw[10] + header.setvalue(41, 4, bc6h_format->gy); // 4: gy[3:0] + header.setvalue(45, 4, bc6h_format->gx); // 4: gx[3:0] + header.setvalue(49, 1, bc6h_format->gw, 10); // gw[10] + header.setvalue(50, 1, bc6h_format->bz); // 4: bz[0] + header.setvalue(51, 4, bc6h_format->gz); // 4: gz[3:0] + header.setvalue(55, 4, bc6h_format->bx); // 4: bx[3:0] + header.setvalue(59, 1, bc6h_format->bw, 10); // bw[10] + header.setvalue(60, 1, bc6h_format->bz, 1); // bz[1] + header.setvalue(61, 4, bc6h_format->by); // 4: by[3:0] + header.setvalue(65, 5, bc6h_format->ry); // 5: ry[4:0] + header.setvalue(70, 1, bc6h_format->bz, 2); // bz[2] + 
header.setvalue(71, 5, bc6h_format->rz); // 5: rz[4:0] + header.setvalue(76, 1, bc6h_format->bz, 3); // bz[3] + break; + case 4: // 0x06 + header.setvalue(0, 5, 0x06); + header.setvalue(5, 10, bc6h_format->rw); // 11: rw[9:0] + header.setvalue(15, 10, bc6h_format->gw); // 11: gw[9:0] + header.setvalue(25, 10, bc6h_format->bw); // 11: bw[9:0] + header.setvalue(35, 4, bc6h_format->rx); // rx[3:0] + header.setvalue(39, 1, bc6h_format->rw, 10); // rw[10] + header.setvalue(40, 1, bc6h_format->gz, 4); // gz[4] + header.setvalue(41, 4, bc6h_format->gy); // 5: gy[3:0] + header.setvalue(45, 5, bc6h_format->gx); // gx[4:0] + header.setvalue(50, 1, bc6h_format->gw, 10); // 5: gw[10] + header.setvalue(51, 4, bc6h_format->gz); // 5: gz[3:0] + header.setvalue(55, 4, bc6h_format->bx); // 4: bx[3:0] + header.setvalue(59, 1, bc6h_format->bw, 10); // bw[10] + header.setvalue(60, 1, bc6h_format->bz, 1); // bz[1] + header.setvalue(61, 4, bc6h_format->by); // 4: by[3:0] + header.setvalue(65, 4, bc6h_format->ry); // 4: ry[3:0] + header.setvalue(69, 1, bc6h_format->bz); // 4: bz[0] + header.setvalue(70, 1, bc6h_format->bz, 2); // bz[2] + header.setvalue(71, 4, bc6h_format->rz); // 4: rz[3:0] + header.setvalue(75, 1, bc6h_format->gy, 4); // gy[4] + header.setvalue(76, 1, bc6h_format->bz, 3); // bz[3] + break; + case 5: // 0x0A + header.setvalue(0, 5, 0x0A); + header.setvalue(5, 10, bc6h_format->rw); // 11: rw[9:0] + header.setvalue(15, 10, bc6h_format->gw); // 11: gw[9:0] + header.setvalue(25, 10, bc6h_format->bw); // 11: bw[9:0] + header.setvalue(35, 4, bc6h_format->rx); // 4: rx[3:0] + header.setvalue(39, 1, bc6h_format->rw, 10); // rw[10] + header.setvalue(40, 1, bc6h_format->by, 4); // by[4] + header.setvalue(41, 4, bc6h_format->gy); // 4: gy[3:0] + header.setvalue(45, 4, bc6h_format->gx); // 4: gx[3:0] + header.setvalue(49, 1, bc6h_format->gw, 10); // gw[10] + header.setvalue(50, 1, bc6h_format->bz); // 5: bz[0] + header.setvalue(51, 4, bc6h_format->gz); // 4: gz[3:0] + 
header.setvalue(55, 5, bc6h_format->bx); // 5: bx[4:0] + header.setvalue(60, 1, bc6h_format->bw, 10); // bw[10] + header.setvalue(61, 4, bc6h_format->by); // 5: by[3:0] + header.setvalue(65, 4, bc6h_format->ry); // 4: ry[3:0] + header.setvalue(69, 1, bc6h_format->bz, 1); // bz[1] + header.setvalue(70, 1, bc6h_format->bz, 2); // bz[2] + header.setvalue(71, 4, bc6h_format->rz); // 4: rz[3:0] + header.setvalue(75, 1, bc6h_format->bz, 4); // bz[4] + header.setvalue(76, 1, bc6h_format->bz, 3); // bz[3] + break; + case 6: // 0x0E + header.setvalue(0, 5, 0x0E); + header.setvalue(5, 9, bc6h_format->rw); // 9: rw[8:0] + header.setvalue(14, 1, bc6h_format->by, 4); // by[4] + header.setvalue(15, 9, bc6h_format->gw); // 9: gw[8:0] + header.setvalue(24, 1, bc6h_format->gy, 4); // gy[4] + header.setvalue(25, 9, bc6h_format->bw); // 9: bw[8:0] + header.setvalue(34, 1, bc6h_format->bz, 4); // bz[4] + header.setvalue(35, 5, bc6h_format->rx); // 5: rx[4:0] + header.setvalue(40, 1, bc6h_format->gz, 4); // gz[4] + header.setvalue(41, 4, bc6h_format->gy); // 5: gy[3:0] + header.setvalue(45, 5, bc6h_format->gx); // 5: gx[4:0] + header.setvalue(50, 1, bc6h_format->bz); // 5: bz[0] + header.setvalue(51, 4, bc6h_format->gz); // 5: gz[3:0] + header.setvalue(55, 5, bc6h_format->bx); // 5: bx[4:0] + header.setvalue(60, 1, bc6h_format->bz, 1); // bz[1] + header.setvalue(61, 4, bc6h_format->by); // 5: by[3:0] + header.setvalue(65, 5, bc6h_format->ry); // 5: ry[4:0] + header.setvalue(70, 1, bc6h_format->bz, 2); // bz[2] + header.setvalue(71, 5, bc6h_format->rz); // 5: rz[4:0] + header.setvalue(76, 1, bc6h_format->bz, 3); // bz[3] + break; + case 7: // 0x12 + header.setvalue(0, 5, 0x12); + header.setvalue(5, 8, bc6h_format->rw); // 8: rw[7:0] + header.setvalue(13, 1, bc6h_format->gz, 4); // gz[4] + header.setvalue(14, 1, bc6h_format->by, 4); // by[4] + header.setvalue(15, 8, bc6h_format->gw); // 8: gw[7:0] + header.setvalue(23, 1, bc6h_format->bz, 2); // bz[2] + header.setvalue(24, 1, 
bc6h_format->gy, 4); // gy[4] + header.setvalue(25, 8, bc6h_format->bw); // 8: bw[7:0] + header.setvalue(33, 1, bc6h_format->bz, 3); // bz[3] + header.setvalue(34, 1, bc6h_format->bz, 4); // bz[4] + header.setvalue(35, 6, bc6h_format->rx); // 6: rx[5:0] + header.setvalue(41, 4, bc6h_format->gy); // 5: gy[3:0] + header.setvalue(45, 5, bc6h_format->gx); // 5: gx[4:0] + header.setvalue(50, 1, bc6h_format->bz); // 5: bz[0] + header.setvalue(51, 4, bc6h_format->gz); // 5: gz[3:0] + header.setvalue(55, 5, bc6h_format->bx); // 5: bx[4:0] + header.setvalue(60, 1, bc6h_format->bz, 1); // bz[1] + header.setvalue(61, 4, bc6h_format->by); // 5: by[3:0] + header.setvalue(65, 6, bc6h_format->ry); // 6: ry[5:0] + header.setvalue(71, 6, bc6h_format->rz); // 6: rz[5:0] + break; + case 8: // 0x16 + header.setvalue(0, 5, 0x16); + header.setvalue(5, 8, bc6h_format->rw); // 8: rw[7:0] + header.setvalue(13, 1, bc6h_format->bz); // 5: bz[0] + header.setvalue(14, 1, bc6h_format->by, 4); // by[4] + header.setvalue(15, 8, bc6h_format->gw); // 8: gw[7:0] + header.setvalue(23, 1, bc6h_format->gy, 5); // gy[5] + header.setvalue(24, 1, bc6h_format->gy, 4); // gy[4] + header.setvalue(25, 8, bc6h_format->bw); // 8: bw[7:0] + header.setvalue(33, 1, bc6h_format->gz, 5); // gz[5] + header.setvalue(34, 1, bc6h_format->bz, 4); // bz[4] + header.setvalue(35, 5, bc6h_format->rx); // 5: rx[4:0] + header.setvalue(40, 1, bc6h_format->gz, 4); // gz[4] + header.setvalue(41, 4, bc6h_format->gy); // 6: gy[3:0] + header.setvalue(45, 6, bc6h_format->gx); // 6: gx[5:0] + header.setvalue(51, 4, bc6h_format->gz); // 6: gz[3:0] + header.setvalue(55, 5, bc6h_format->bx); // 5: bx[4:0] + header.setvalue(60, 1, bc6h_format->bz, 1); // bz[1] + header.setvalue(61, 4, bc6h_format->by); // 5: by[3:0] + header.setvalue(65, 5, bc6h_format->ry); // 5: ry[4:0] + header.setvalue(70, 1, bc6h_format->bz, 2); // bz[2] + header.setvalue(71, 5, bc6h_format->rz); // 5: rz[4:0] + header.setvalue(76, 1, bc6h_format->bz, 3); // bz[3] + 
break; + case 9: // 0x1A + header.setvalue(0, 5, 0x1A); + header.setvalue(5, 8, bc6h_format->rw); // 8: rw[7:0] + header.setvalue(13, 1, bc6h_format->bz, 1); // bz[1] + header.setvalue(14, 1, bc6h_format->by, 4); // by[4] + header.setvalue(15, 8, bc6h_format->gw); // 8: gw[7:0] + header.setvalue(23, 1, bc6h_format->by, 5); // by[5] + header.setvalue(24, 1, bc6h_format->gy, 4); // gy[4] + header.setvalue(25, 8, bc6h_format->bw); // 8: bw[7:0] + header.setvalue(33, 1, bc6h_format->bz, 5); // bz[5] + header.setvalue(34, 1, bc6h_format->bz, 4); // bz[4] + header.setvalue(35, 5, bc6h_format->rx); // 5: rx[4:0] + header.setvalue(40, 1, bc6h_format->gz, 4); // gz[4] + header.setvalue(41, 4, bc6h_format->gy); // 5: gy[3:0] + header.setvalue(45, 5, bc6h_format->gx); // 5: gx[4:0] + header.setvalue(50, 1, bc6h_format->bz); // 6: bz[0] + header.setvalue(51, 4, bc6h_format->gz); // 5: gz[3:0] + header.setvalue(55, 6, bc6h_format->bx); // 6: bx[5:0] + header.setvalue(61, 4, bc6h_format->by); // 6: by[3:0] + header.setvalue(65, 5, bc6h_format->ry); // 5: ry[4:0] + header.setvalue(70, 1, bc6h_format->bz, 2); // bz[2] + header.setvalue(71, 5, bc6h_format->rz); // 5: rz[4:0] + header.setvalue(76, 1, bc6h_format->bz, 3); // bz[3] + break; + case 10: // 0x1E + header.setvalue(0, 5, 0x1E); + header.setvalue(5, 6, bc6h_format->rw); // 6: rw[5:0] + header.setvalue(11, 1, bc6h_format->gz, 4); // gz[4] + header.setvalue(12, 1, bc6h_format->bz); // 6: bz[0] + header.setvalue(13, 1, bc6h_format->bz, 1); // bz[1] + header.setvalue(14, 1, bc6h_format->by, 4); // by[4] + header.setvalue(15, 6, bc6h_format->gw); // 6: gw[5:0] + header.setvalue(21, 1, bc6h_format->gy, 5); // gy[5] + header.setvalue(22, 1, bc6h_format->by, 5); // by[5] + header.setvalue(23, 1, bc6h_format->bz, 2); // bz[2] + header.setvalue(24, 1, bc6h_format->gy, 4); // gy[4] + header.setvalue(25, 6, bc6h_format->bw); // 6: bw[5:0] + header.setvalue(31, 1, bc6h_format->gz, 5); // gz[5] + header.setvalue(32, 1, bc6h_format->bz, 
3); // bz[3] + header.setvalue(33, 1, bc6h_format->bz, 5); // bz[5] + header.setvalue(34, 1, bc6h_format->bz, 4); // bz[4] + header.setvalue(35, 6, bc6h_format->rx); // 6: rx[5:0] + header.setvalue(41, 4, bc6h_format->gy); // 6: gy[3:0] + header.setvalue(45, 6, bc6h_format->gx); // 6: gx[5:0] + header.setvalue(51, 4, bc6h_format->gz); // 6: gz[3:0] + header.setvalue(55, 6, bc6h_format->bx); // 6: bx[5:0] + header.setvalue(61, 4, bc6h_format->by); // 6: by[3:0] + header.setvalue(65, 6, bc6h_format->ry); // 6: ry[5:0] + header.setvalue(71, 6, bc6h_format->rz); // 6: rz[5:0] + break; + + // Single regions Modes + case 11: // 0x03 + header.setvalue(0, 5, 0x03); + header.setvalue(5, 10, bc6h_format->rw); // 10: rw[9:0] + header.setvalue(15, 10, bc6h_format->gw); // 10: gw[9:0] + header.setvalue(25, 10, bc6h_format->bw); // 10: bw[9:0] + header.setvalue(35, 10, bc6h_format->rx); // 10: rx[9:0] + header.setvalue(45, 10, bc6h_format->gx); // 10: gx[9:0] + header.setvalue(55, 10, bc6h_format->bx); // 10: bx[9:0] + break; + case 12: // 0x07 + header.setvalue(0, 5, 0x07); + header.setvalue(5, 10, bc6h_format->rw); // 11: rw[9:0] + header.setvalue(15, 10, bc6h_format->gw); // 11: gw[9:0] + header.setvalue(25, 10, bc6h_format->bw); // 11: bw[9:0] + header.setvalue(35, 9, bc6h_format->rx); // 9: rx[8:0] + header.setvalue(44, 1, bc6h_format->rw, 10); // rw[10] + header.setvalue(45, 9, bc6h_format->gx); // 9: gx[8:0] + header.setvalue(54, 1, bc6h_format->gw, 10); // gw[10] + header.setvalue(55, 9, bc6h_format->bx); // 9: bx[8:0] + header.setvalue(64, 1, bc6h_format->bw, 10); // bw[10] + break; + case 13: // 0x0B + header.setvalue(0, 5, 0x0B); + header.setvalue(5, 10, bc6h_format->rw); // 12: rw[9:0] + header.setvalue(15, 10, bc6h_format->gw); // 12: gw[9:0] + header.setvalue(25, 10, bc6h_format->bw); // 12: bw[9:0] + header.setvalue(35, 8, bc6h_format->rx); // 8: rx[7:0] + header.setvalue(43, 1, bc6h_format->rw, 11); // rw[11] + header.setvalue(44, 1, bc6h_format->rw, 10); // 
rw[10] + header.setvalue(45, 8, bc6h_format->gx); // 8: gx[7:0] + header.setvalue(53, 1, bc6h_format->gw, 11); // gw[11] + header.setvalue(54, 1, bc6h_format->gw, 10); // gw[10] + header.setvalue(55, 8, bc6h_format->bx); // 8: bx[7:0] + header.setvalue(63, 1, bc6h_format->bw, 11); // bw[11] + header.setvalue(64, 1, bc6h_format->bw, 10); // bw[10] + break; + case 14: // 0x0F + header.setvalue(0, 5, 0x0F); + header.setvalue(5, 10, bc6h_format->rw); // 16: rw[9:0] + header.setvalue(15, 10, bc6h_format->gw); // 16: gw[9:0] + header.setvalue(25, 10, bc6h_format->bw); // 16: bw[9:0] + header.setvalue(35, 4, bc6h_format->rx); // 4: rx[3:0] + header.setvalue(39, 6, bc6h_format->rw, 10); // rw[15:10] + header.setvalue(45, 4, bc6h_format->gx); // 4: gx[3:0] + header.setvalue(49, 6, bc6h_format->gw, 10); // gw[15:10] + header.setvalue(55, 4, bc6h_format->bx); // 4: bx[3:0] + header.setvalue(59, 6, bc6h_format->bw, 10); // bw[15:10] + break; + default: // Need to indicate error! + return; + } + + // Each format in the mode table can be uniquely identified by the mode bits. + // The first ten modes are used for two-region tiles, and the mode bit field + // can be either two or five bits long. These blocks also have fields for + // the compressed color endpoints (72 or 75 bits), the partition (5 bits), + // and the partition indices (46 bits). 
+ + if (bc6h_format->m_mode >= MIN_MODE_FOR_ONE_REGION) + { + CGU_INT startbit = ONE_REGION_INDEX_OFFSET; + header.setvalue(startbit, 3, bc6h_format->indices16[0]); + startbit += 3; + for (CGU_INT i = 1; i < 16; i++) + { + header.setvalue(startbit, 4, bc6h_format->indices16[i]); + startbit += 4; + } + } + else + { + header.setvalue(77, 5, bc6h_format->d_shape_index); // Shape Index + CGU_INT startbit = TWO_REGION_INDEX_OFFSET, + nbits = 2; + header.setvalue(startbit, nbits, bc6h_format->indices16[0]); + for (CGU_INT i = 1; i < 16; i++) + { + startbit += nbits; // offset start bit for next index using prior nbits used + nbits = g_indexfixups[bc6h_format->d_shape_index] == i ? 2 : 3; // get new number of bit to save index with + header.setvalue(startbit, nbits, bc6h_format->indices16[i]); + } + } + + // save to output buffer our new bit values + // this can be optimized if header is part of bc6h_format struct + header.transferbits(cmpout, 16); +} +#else +void SaveDataBlock(BC6H_Encode_local *bc6h_format, CMP_GLOBAL CGU_UINT8 out[COMPRESSED_BLOCK_SIZE]) +{ + // ToDo +} +#endif + +void SwapIndices(CGU_INT32 iEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT32 iIndices[3][MAX_SUBSET_SIZE], CGU_INT entryCount[MAX_SUBSETS], CGU_INT max_subsets, CGU_INT mode, CGU_INT shape_pattern) +{ + + CGU_UINT32 uNumIndices = 1 << ModePartition[mode].IndexPrec; + CGU_UINT32 uHighIndexBit = uNumIndices >> 1; + + for (CGU_INT subset = 0; subset < max_subsets; ++subset) + { + // region 0 (subset = 0) The fix-up index for this subset is allways index 0 + // region 1 (subset = 1) The fix-up index for this subset varies based on the shape + size_t i = subset ? 
g_Region2FixUp[shape_pattern] : 0; + + if (iIndices[subset][i] & uHighIndexBit) + { + // high bit is set, swap the aEndPts and indices for this region + swap(iEndPoints[subset][0][0], iEndPoints[subset][1][0]); + swap(iEndPoints[subset][0][1], iEndPoints[subset][1][1]); + swap(iEndPoints[subset][0][2], iEndPoints[subset][1][2]); + + for (size_t j = 0; j < (size_t)entryCount[subset]; ++j) + { + iIndices[subset][j] = uNumIndices - 1 - iIndices[subset][j]; + } + } + + } +} + +// helper function to check transform overflow +// todo: check overflow by checking against sign +CGU_BOOL isOverflow(CGU_INT endpoint, CGU_INT nbit) +{ + CGU_INT maxRange = (int)pow(2.0f, (CGU_FLOAT)nbit - 1.0f) - 1; + CGU_INT minRange = (int)-(pow(2.0f, (CGU_FLOAT)nbit - 1.0f)); + + //no overflow + if ((endpoint >= minRange) && (endpoint <= maxRange)) + return false; + else //overflow + return true; +} + +CGU_BOOL TransformEndPoints(BC6H_Encode_local *BC6H_data, CGU_INT iEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT oEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT max_subsets, CGU_INT mode) +{ + CGU_INT Mask; + if (ModePartition[mode].transformed) + { + BC6H_data->istransformed = true; + for (CGU_INT i = 0; i < 3; ++i) + { + Mask = MASK(ModePartition[mode].nbits); + oEndPoints[0][0][i] = iEndPoints[0][0][i] & Mask; // [0][A] + + Mask = MASK(ModePartition[mode].prec[i]); + oEndPoints[0][1][i] = iEndPoints[0][1][i] - iEndPoints[0][0][i]; // [0][B] - [0][A] + + if (isOverflow(oEndPoints[0][1][i], ModePartition[mode].prec[i])) + return false; + + oEndPoints[0][1][i] = (oEndPoints[0][1][i] & Mask); + + //redo the check for sign overflow for one region case + if (max_subsets <= 1) + { + if (isOverflow(oEndPoints[0][1][i], ModePartition[mode].prec[i])) + return false; + } + + if (max_subsets > 1) + { + oEndPoints[1][0][i] = iEndPoints[1][0][i] - iEndPoints[0][0][i]; // [1][A] - [0][A] + if (isOverflow(oEndPoints[1][0][i], ModePartition[mode].prec[i])) + return 
false; + + oEndPoints[1][0][i] = (oEndPoints[1][0][i] & Mask); + + oEndPoints[1][1][i] = iEndPoints[1][1][i] - iEndPoints[0][0][i]; // [1][B] - [0][A] + if (isOverflow(oEndPoints[1][1][i], ModePartition[mode].prec[i])) + return false; + + oEndPoints[1][1][i] = (oEndPoints[1][1][i] & Mask); + } + } + } + else + { + BC6H_data->istransformed = false; + for (CGU_INT i = 0; i < 3; ++i) + { + Mask = MASK(ModePartition[mode].nbits); + oEndPoints[0][0][i] = iEndPoints[0][0][i] & Mask; + + Mask = MASK(ModePartition[mode].prec[i]); + oEndPoints[0][1][i] = iEndPoints[0][1][i] & Mask; + + if (max_subsets > 1) + { + oEndPoints[1][0][i] = iEndPoints[1][0][i] & Mask; + oEndPoints[1][1][i] = iEndPoints[1][1][i] & Mask; + } + } + } + + return true; +} + +void SaveCompressedBlockData(BC6H_Encode_local *BC6H_data, + CGU_INT oEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], + CGU_INT iIndices[2][MAX_SUBSET_SIZE], + CGU_INT8 max_subsets, + CGU_INT8 mode) +{ + BC6H_data->m_mode = mode; + BC6H_data->index++; + + // Save the data to output + BC6H_data->rw = oEndPoints[0][0][0]; // rw + BC6H_data->gw = oEndPoints[0][0][1]; // gw + BC6H_data->bw = oEndPoints[0][0][2]; // bw + BC6H_data->rx = oEndPoints[0][1][0]; // rx + BC6H_data->gx = oEndPoints[0][1][1]; // gx + BC6H_data->bx = oEndPoints[0][1][2]; // bx + + if (max_subsets > 1) + { + // Save the data to output + BC6H_data->ry = oEndPoints[1][0][0]; // ry + BC6H_data->gy = oEndPoints[1][0][1]; // gy + BC6H_data->by = oEndPoints[1][0][2]; // by + BC6H_data->rz = oEndPoints[1][1][0]; // rz + BC6H_data->gz = oEndPoints[1][1][1]; // gz + BC6H_data->bz = oEndPoints[1][1][2]; // bz + } + + // Map our two subset Indices for the shape to output 4x4 block + CGU_INT pos[2] = { 0,0 }; + CGU_INT asubset; + for (CGU_INT i = 0; i < MAX_SUBSET_SIZE; i++) + { + if (max_subsets > 1) + asubset = BC6_PARTITIONS[BC6H_data->d_shape_index][i]; // Two region shapes + else + asubset = 0; // One region shapes + BC6H_data->indices16[i] = 
// Accumulates an absolute-difference error between the block's source texels
// (din) and values reconstructed from the one-region endpoints, used to rank
// one-region modes against each other in EncodePattern.
CGU_FLOAT CalcOneRegionEndPtsError(BC6H_Encode_local *BC6H_data, CGU_FLOAT fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE])
{
    CGU_FLOAT error = 0;

    for (CGU_INT i = 0; i < MAX_SUBSET_SIZE; i++)
    {
        for (CGU_INT m = 0; m < MAX_END_POINTS; m++)
        {
            for (CGU_INT n = 0; n < NCHANNELS; n++)
            {
                // NOTE(review): abs(fEndPoints[0][m][n] - fEndPoints[0][m][n]) is
                // identically zero, so calencpts always equals the endpoint itself
                // and shape_indices never contributes; also shape_indices[0][i]/15
                // is integer division. Looks like one operand was meant to be the
                // opposite endpoint - confirm against upstream before changing,
                // since this value steers one-region mode selection.
                CGU_FLOAT calencpts = fEndPoints[0][m][n] + (abs(fEndPoints[0][m][n] - fEndPoints[0][m][n]) * (shape_indices[0][i] / 15));
                error += abs(BC6H_data->din[i][n] - calencpts);
            }
        }
    }

    return error;
}

// Recomputes the best palette index per texel against the current palettes
// (Paletef), writing compacted per-subset index lists into shape_indices.
// One-region blocks use a 16-entry palette; two-region blocks use 8 entries.
void ReIndexShapef(BC6H_Encode_local *BC6H_data, CGU_INT shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE])
{
    CGU_FLOAT error = 0;
    CGU_FLOAT bestError;
    CGU_INT bestIndex = 0;
    CGU_INT sub0index = 0;   // next write slot in shape_indices[0]
    CGU_INT sub1index = 0;   // next write slot in shape_indices[1]
    CGU_INT MaxPallet;
    CGU_INT region = (BC6H_data->region - 1); // 0 = one region, 1 = two regions

    if (region == 0)
        MaxPallet = 16;
    else
        MaxPallet = 8;

    CGU_UINT8 isSet = 0; // stays 0 for one-region blocks: every texel goes to subset 0
    for (CGU_INT i = 0; i < MAX_SUBSET_SIZE; i++)
    {
        // subset 0 or subset 1 ownership from the shape table (two regions only)
        if (region)
            isSet = BC6_PARTITIONS[BC6H_data->d_shape_index][i];

        if (isSet)
        {
            // NOTE(review): this branch seeds bestError with CMP_HALF_MAX while the
            // other branch uses CMP_FLOAT_MAX - possibly deliberate (half-float
            // palette range) but worth confirming.
            bestError = CMP_HALF_MAX;
            bestIndex = 0;

            // For two shape regions max Pallet is 8
            for (CGU_INT j = 0; j < MaxPallet; j++)
            {
                // Manhattan distance from the source texel to palette entry j of subset 1
                error = abs(BC6H_data->din[i][0] - BC6H_data->Paletef[1][j].x) +
                        abs(BC6H_data->din[i][1] - BC6H_data->Paletef[1][j].y) +
                        abs(BC6H_data->din[i][2] - BC6H_data->Paletef[1][j].z);
                if (error < bestError)
                {
                    bestError = error;
                    bestIndex = j;
                }
            }

            shape_indices[1][sub1index] = bestIndex;
            sub1index++;
        }
        else
        {
            // Shared for one- or two-region shapes; palette size 16 or 8 respectively.
            bestError = CMP_FLOAT_MAX;
            bestIndex = 0;

            for (CGU_INT j = 0; j < MaxPallet; j++)
            {
                // Manhattan distance from the source texel to palette entry j of subset 0
                error = abs(BC6H_data->din[i][0] - BC6H_data->Paletef[0][j].x) +
                        abs(BC6H_data->din[i][1] - BC6H_data->Paletef[0][j].y) +
                        abs(BC6H_data->din[i][2] - BC6H_data->Paletef[0][j].z);
                if (error < bestError)
                {
                    bestError = error;
                    bestIndex = j;
                }
            }

            shape_indices[0][sub0index] = bestIndex;
            sub0index++;
        }
    }
}
BC6H_data->Paletef[0][j].y) + + abs(BC6H_data->din[i][2] - BC6H_data->Paletef[0][j].z); + if (error < bestError) + { + bestError = error; + bestIndex = j; + } + } + + shape_indices[0][sub0index] = bestIndex; + sub0index++; + } + } + +} + +CGU_INT Unquantize(CGU_INT comp, unsigned char uBitsPerComp, CGU_BOOL bSigned) +{ + CGU_INT unq = 0, s = 0; + if (bSigned) + { + if (uBitsPerComp >= 16) + { + unq = comp; + } + else + { + if (comp < 0) + { + s = 1; + comp = -comp; + } + + if (comp == 0) unq = 0; + else if (comp >= ((1 << (uBitsPerComp - 1)) - 1)) unq = 0x7FFF; + else unq = ((comp << 15) + 0x4000) >> (uBitsPerComp - 1); + + if (s) unq = -unq; + } + } + else + { + if (uBitsPerComp >= 15) unq = comp; + else if (comp == 0) unq = 0; + else if (comp == ((1 << uBitsPerComp) - 1)) unq = 0xFFFF; + else unq = ((comp << 16) + 0x8000) >> uBitsPerComp; + } + + return unq; +} + +CGU_INT finish_unquantizeF16(CGU_INT q, CGU_BOOL isSigned) +{ + // Is it F16 Signed else F16 Unsigned + if (isSigned) + return (q < 0) ? 
// Decompresses one-region endpoints: sign-extend / undo the delta transform as
// required by the mode, then unquantize to the 16-bit working range and apply
// the final F16 scaling into outf.
void decompress_endpoints1(BC6H_Encode_local * bc6h_format, CGU_INT oEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_FLOAT outf[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT mode)
{
    CGU_INT i;
    CGU_INT t;
    CGU_FLOAT out[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG]; // intermediate unquantized values

    if (bc6h_format->issigned)
    {
        // NOTE(review): both signed paths below pass 'false' to Unquantize and
        // finish_unquantizeF16, while decompress_endpoints2's signed/transformed
        // path passes 'true'. This inconsistency looks suspicious - confirm
        // against the BC6H spec before changing, as it alters decode results.
        if (bc6h_format->istransformed)
        {
            for (i = 0; i < NCHANNELS; i++)
            {
                out[0][0][i] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[0][0][i], ModePartition[mode].nbits);

                // Undo the delta transform: B = (deltaB + A) masked to nbits, then sign-extend.
                t = SIGN_EXTEND_TYPELESS(oEndPoints[0][1][i], ModePartition[mode].prec[i]);
                t = (t + oEndPoints[0][0][i]) & MASK(ModePartition[mode].nbits);
                out[0][1][i] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(t, ModePartition[mode].nbits);

                // Unquantize all points to nbits
                out[0][0][i] = (CGU_FLOAT)Unquantize((int)out[0][0][i], (unsigned char)ModePartition[mode].nbits, false);
                out[0][1][i] = (CGU_FLOAT)Unquantize((int)out[0][1][i], (unsigned char)ModePartition[mode].nbits, false);

                // F16 format
                outf[0][0][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][0][i], false);
                outf[0][1][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][1][i], false);
            }
        }
        else
        {
            for (i = 0; i < NCHANNELS; i++)
            {
                out[0][0][i] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[0][0][i], ModePartition[mode].nbits);
                out[0][1][i] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[0][1][i], ModePartition[mode].prec[i]);

                // Unquantize all points to nbits
                out[0][0][i] = (CGU_FLOAT)Unquantize((int)out[0][0][i], (unsigned char)ModePartition[mode].nbits, false);
                out[0][1][i] = (CGU_FLOAT)Unquantize((int)out[0][1][i], (unsigned char)ModePartition[mode].nbits, false);

                // F16 format
                outf[0][0][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][0][i], false);
                outf[0][1][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][1][i], false);
            }
        }
    }
    else
    {
        if (bc6h_format->istransformed)
        {
            for (i = 0; i < NCHANNELS; i++)
            {
                out[0][0][i] = (CGU_FLOAT)oEndPoints[0][0][i];
                // Undo the delta transform for endpoint B (unsigned base, signed delta).
                t = SIGN_EXTEND_TYPELESS(oEndPoints[0][1][i], ModePartition[mode].prec[i]);
                out[0][1][i] = (CGU_FLOAT)((t + oEndPoints[0][0][i]) & MASK(ModePartition[mode].nbits));

                // Unquantize all points to nbits
                out[0][0][i] = (CGU_FLOAT)Unquantize((int)out[0][0][i], (unsigned char)ModePartition[mode].nbits, false);
                out[0][1][i] = (CGU_FLOAT)Unquantize((int)out[0][1][i], (unsigned char)ModePartition[mode].nbits, false);

                // F16 format
                outf[0][0][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][0][i], false);
                outf[0][1][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][1][i], false);
            }
        }
        else
        {
            for (i = 0; i < NCHANNELS; i++)
            {
                out[0][0][i] = (CGU_FLOAT)oEndPoints[0][0][i];
                out[0][1][i] = (CGU_FLOAT)oEndPoints[0][1][i];

                // Unquantize all points to nbits
                out[0][0][i] = (CGU_FLOAT)Unquantize((int)out[0][0][i], (unsigned char)ModePartition[mode].nbits, false);
                out[0][1][i] = (CGU_FLOAT)Unquantize((int)out[0][1][i], (unsigned char)ModePartition[mode].nbits, false);

                // F16 format
                outf[0][0][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][0][i], false);
                outf[0][1][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][1][i], false);
            }
        }
    }
}
// Decompresses two-region endpoints (four endpoints total): sign-extend /
// undo the delta transform per the mode, unquantize to the 16-bit working
// range, then apply the final F16 scaling into outf.
void decompress_endpoints2(BC6H_Encode_local * bc6h_format, CGU_INT oEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_FLOAT outf[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT mode)
{
    CGU_INT i;
    CGU_INT t;
    CGU_FLOAT out[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG]; // intermediate unquantized values

    if (bc6h_format->issigned)
    {
        if (bc6h_format->istransformed)
        {
            for (i = 0; i < NCHANNELS; i++)
            {
                // get the quantized values
                out[0][0][i] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[0][0][i], ModePartition[mode].nbits);

                // The three remaining endpoints are deltas from [0][A]:
                // sign-extend the delta, add the base, mask to nbits, re-extend.
                t = SIGN_EXTEND_TYPELESS(oEndPoints[0][1][i], ModePartition[mode].prec[i]);
                t = (t + oEndPoints[0][0][i]) & MASK(ModePartition[mode].nbits);
                out[0][1][i] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(t, ModePartition[mode].nbits);

                t = SIGN_EXTEND_TYPELESS(oEndPoints[1][0][i], ModePartition[mode].prec[i]);
                t = (t + oEndPoints[0][0][i]) & MASK(ModePartition[mode].nbits);
                out[1][0][i] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(t, ModePartition[mode].nbits);

                t = SIGN_EXTEND_TYPELESS(oEndPoints[1][1][i], ModePartition[mode].prec[i]);
                t = (t + oEndPoints[0][0][i]) & MASK(ModePartition[mode].nbits);
                out[1][1][i] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(t, ModePartition[mode].nbits);

                // Unquantize all points to nbits
                out[0][0][i] = (CGU_FLOAT)Unquantize((int)out[0][0][i], (unsigned char)ModePartition[mode].nbits, true);
                out[0][1][i] = (CGU_FLOAT)Unquantize((int)out[0][1][i], (unsigned char)ModePartition[mode].nbits, true);
                out[1][0][i] = (CGU_FLOAT)Unquantize((int)out[1][0][i], (unsigned char)ModePartition[mode].nbits, true);
                out[1][1][i] = (CGU_FLOAT)Unquantize((int)out[1][1][i], (unsigned char)ModePartition[mode].nbits, true);

                // F16 format
                outf[0][0][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][0][i], true);
                outf[0][1][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][1][i], true);
                outf[1][0][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[1][0][i], true);
                outf[1][1][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[1][1][i], true);

            }
        }
        else
        {
            // NOTE(review): this signed/non-transformed path passes 'false' to
            // Unquantize/finish_unquantizeF16 while the transformed path above
            // passes 'true' - inconsistent; confirm against the BC6H spec.
            for (i = 0; i < NCHANNELS; i++)
            {
                out[0][0][i] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[0][0][i], ModePartition[mode].nbits);
                out[0][1][i] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[0][1][i], ModePartition[mode].prec[i]);
                out[1][0][i] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[1][0][i], ModePartition[mode].prec[i]);
                out[1][1][i] = (CGU_FLOAT)SIGN_EXTEND_TYPELESS(oEndPoints[1][1][i], ModePartition[mode].prec[i]);

                // Unquantize all points to nbits
                out[0][0][i] = (CGU_FLOAT)Unquantize((int)out[0][0][i], (unsigned char)ModePartition[mode].nbits, false);
                out[0][1][i] = (CGU_FLOAT)Unquantize((int)out[0][1][i], (unsigned char)ModePartition[mode].nbits, false);
                out[1][0][i] = (CGU_FLOAT)Unquantize((int)out[1][0][i], (unsigned char)ModePartition[mode].nbits, false);
                out[1][1][i] = (CGU_FLOAT)Unquantize((int)out[1][1][i], (unsigned char)ModePartition[mode].nbits, false);

                // nbits to F16 format
                outf[0][0][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][0][i], false);
                outf[0][1][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][1][i], false);
                outf[1][0][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[1][0][i], false);
                outf[1][1][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[1][1][i], false);
            }
        }
    }
    else
    {
        if (bc6h_format->istransformed)
        {
            for (i = 0; i < NCHANNELS; i++)
            {
                out[0][0][i] = (CGU_FLOAT)oEndPoints[0][0][i];
                // Deltas are signed even when the base is unsigned.
                t = SIGN_EXTEND_TYPELESS(oEndPoints[0][1][i], ModePartition[mode].prec[i]);
                out[0][1][i] = (CGU_FLOAT)((t + oEndPoints[0][0][i]) & MASK(ModePartition[mode].nbits));

                t = SIGN_EXTEND_TYPELESS(oEndPoints[1][0][i], ModePartition[mode].prec[i]);
                out[1][0][i] = (CGU_FLOAT)((t + oEndPoints[0][0][i]) & MASK(ModePartition[mode].nbits));

                t = SIGN_EXTEND_TYPELESS(oEndPoints[1][1][i], ModePartition[mode].prec[i]);
                out[1][1][i] = (CGU_FLOAT)((t + oEndPoints[0][0][i]) & MASK(ModePartition[mode].nbits));

                // Unquantize all points to nbits
                out[0][0][i] = (CGU_FLOAT)Unquantize((int)out[0][0][i], (unsigned char)ModePartition[mode].nbits, false);
                out[0][1][i] = (CGU_FLOAT)Unquantize((int)out[0][1][i], (unsigned char)ModePartition[mode].nbits, false);
                out[1][0][i] = (CGU_FLOAT)Unquantize((int)out[1][0][i], (unsigned char)ModePartition[mode].nbits, false);
                out[1][1][i] = (CGU_FLOAT)Unquantize((int)out[1][1][i], (unsigned char)ModePartition[mode].nbits, false);

                // nbits to F16 format
                outf[0][0][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][0][i], false);
                outf[0][1][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][1][i], false);
                outf[1][0][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[1][0][i], false);
                outf[1][1][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[1][1][i], false);

            }
        }
        else
        {
            for (i = 0; i < NCHANNELS; i++)
            {
                out[0][0][i] = (CGU_FLOAT)oEndPoints[0][0][i];
                out[0][1][i] = (CGU_FLOAT)oEndPoints[0][1][i];
                out[1][0][i] = (CGU_FLOAT)oEndPoints[1][0][i];
                out[1][1][i] = (CGU_FLOAT)oEndPoints[1][1][i];

                // Unquantize all points to nbits
                out[0][0][i] = (CGU_FLOAT)Unquantize((int)out[0][0][i], (unsigned char)ModePartition[mode].nbits, false);
                out[0][1][i] = (CGU_FLOAT)Unquantize((int)out[0][1][i], (unsigned char)ModePartition[mode].nbits, false);
                out[1][0][i] = (CGU_FLOAT)Unquantize((int)out[1][0][i], (unsigned char)ModePartition[mode].nbits, false);
                out[1][1][i] = (CGU_FLOAT)Unquantize((int)out[1][1][i], (unsigned char)ModePartition[mode].nbits, false);

                // nbits to F16 format
                outf[0][0][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][0][i], false);
                outf[0][1][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[0][1][i], false);
                outf[1][0][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[1][0][i], false);
                outf[1][1][i] = (CGU_FLOAT)finish_unquantizeF16((int)out[1][1][i], false);
            }
        }
    }
}
// Integer-only endpoint decompression used by endpts_fit to verify that the
// quantized/transformed endpoints round-trip losslessly. R_0..R_3 are the
// project's accessor macros for the four endpoint slots at channel i.
static void decompress_endpts(const CGU_INT in[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT out[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], const CGU_INT mode, CGU_BOOL issigned)
{

    if (ModePartition[mode].transformed)
    {
        for (CGU_INT i = 0; i < 3; ++i)
        {
            // NOTE(review): the base endpoint is sign-extended here with
            // IndexPrec, while the non-transformed branch below uses nbits;
            // IndexPrec is the palette-index width, so this looks like a typo
            // (nbits expected) - confirm before changing, since endpts_fit's
            // accept/reject decisions depend on it.
            R_0(out) = issigned ? SIGN_EXTEND_TYPELESS(R_0(in), ModePartition[mode].IndexPrec) : R_0(in);
            CGU_INT t;
            // Each remaining endpoint is a signed delta from the base:
            // sign-extend the delta, add the base, mask to nbits.
            t = SIGN_EXTEND_TYPELESS(R_1(in), ModePartition[mode].prec[i]);
            t = (t + R_0(in)) & MASK(ModePartition[mode].nbits);
            R_1(out) = issigned ? SIGN_EXTEND_TYPELESS(t, ModePartition[mode].nbits) : t;

            t = SIGN_EXTEND_TYPELESS(R_2(in), ModePartition[mode].prec[i]);
            t = (t + R_0(in)) & MASK(ModePartition[mode].nbits);
            R_2(out) = issigned ? SIGN_EXTEND_TYPELESS(t, ModePartition[mode].nbits) : t;

            t = SIGN_EXTEND_TYPELESS(R_3(in), ModePartition[mode].prec[i]);
            t = (t + R_0(in)) & MASK(ModePartition[mode].nbits);
            R_3(out) = issigned ? SIGN_EXTEND_TYPELESS(t, ModePartition[mode].nbits) : t;
        }
    }
    else
    {
        for (CGU_INT i = 0; i < 3; ++i)
        {
            // Non-transformed: endpoints are absolute; only sign-extension differs.
            R_0(out) = issigned ? SIGN_EXTEND_TYPELESS(R_0(in), ModePartition[mode].nbits) : R_0(in);
            R_1(out) = issigned ? SIGN_EXTEND_TYPELESS(R_1(in), ModePartition[mode].prec[i]) : R_1(in);
            R_2(out) = issigned ? SIGN_EXTEND_TYPELESS(R_2(in), ModePartition[mode].prec[i]) : R_2(in);
            R_3(out) = issigned ? SIGN_EXTEND_TYPELESS(R_3(in), ModePartition[mode].prec[i]) : R_3(in);
        }
    }
}
// Endpoints "fit" a mode only if compressing then decompressing them is
// lossless: decompress the candidate and compare against the originals.
static CGU_BOOL endpts_fit(const CGU_INT orig[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], const CGU_INT compressed[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], const CGU_INT mode, CGU_INT max_subsets, CGU_BOOL issigned)
{
    CGU_INT uncompressed[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];

    decompress_endpts(compressed, uncompressed, mode, issigned);

    // Compare both endpoints of every active subset, all three channels.
    for (CGU_INT j = 0; j < max_subsets; ++j)
        for (CGU_INT i = 0; i < 3; ++i)
        {
            if (orig[j][0][i] != uncompressed[j][0][i]) return false;
            if (orig[j][1][i] != uncompressed[j][1][i]) return false;
        }

    return true;
}

// Quantizes a half-float bit pattern (passed as short) to 'prec' bits.
// signedfloat16 selects sign-magnitude handling; otherwise negatives clamp to 0.
//todo: check overflow
CGU_INT QuantizeToInt(short value, CGU_INT prec, CGU_BOOL signedfloat16)
{

    if (prec <= 1) return 0; // degenerate precision: nothing representable
    CGU_BOOL negvalue = false;

    // move data to use extra bits for processing
    // NOTE(review): ivalue snapshots 'value' BEFORE the negate/clamp below, so
    // the quantizer divides the original (possibly negative) value and the sign
    // is re-applied at return; the later edits to 'value' only affect 'prec'
    // selection. Looks intentional-but-fragile - confirm before touching.
    CGU_INT ivalue = value;

    if (signedfloat16)
    {
        if (value < 0)
        {
            negvalue = true;
            value = -value;
        }
        prec--; // one bit reserved for the sign
    }
    else
    {
        // clamp -ve
        if (value < 0)
            value = 0;
    }

    CGU_INT iQuantized;
    // Rounding bias: half-float style for prec in (10,16), special-cased at 16.
    CGU_INT bias = (prec > 10 && prec != 16) ? ((1 << (prec - 11)) - 1) : 0;
    bias = (prec == 16) ? 15 : bias;

    // Scale into the prec-bit range relative to the half-float maximum.
    iQuantized = ((ivalue << prec) + bias) / (FLT16_MAX + 1);

    return (negvalue ? -iQuantized : iQuantized);
}
// Quantizes all endpoints of every active subset to 'prec' bits via QuantizeToInt.
//todo: checkoverflow
void QuantizeEndPointToF16Prec(CGU_FLOAT EndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT iEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], CGU_INT max_subsets, CGU_INT prec, CGU_BOOL isSigned)
{

    for (CGU_INT subset = 0; subset < max_subsets; ++subset)
    {
        iEndPoints[subset][0][0] = QuantizeToInt((short)EndPoints[subset][0][0], prec, isSigned); // A.Red
        iEndPoints[subset][0][1] = QuantizeToInt((short)EndPoints[subset][0][1], prec, isSigned); // A.Green
        iEndPoints[subset][0][2] = QuantizeToInt((short)EndPoints[subset][0][2], prec, isSigned); // A.Blue
        iEndPoints[subset][1][0] = QuantizeToInt((short)EndPoints[subset][1][0], prec, isSigned); // B.Red
        iEndPoints[subset][1][1] = QuantizeToInt((short)EndPoints[subset][1][1], prec, isSigned); // B.Green
        iEndPoints[subset][1][2] = QuantizeToInt((short)EndPoints[subset][1][2], prec, isSigned); // B.Blue
    }
}

// Searches every BC6H mode compatible with the block's region count (modes
// 1..10 for two regions, 11..14 for one region, via ModeFitOrder), keeps the
// candidate whose re-decoded tile error is lowest, and stores the winning
// endpoints/indices into BC6H_data for SaveDataBlock. Returns the (possibly
// improved) error for the block.
CGU_FLOAT EncodePattern(BC6H_Encode_local *BC6H_data, CGU_FLOAT error)
{
    CGU_INT8 max_subsets = BC6H_data->region;

    // now we have input colors (in), output colors (outB) mapped to a line of ends (EndPoints)
    // and a set of colors on the line equally spaced (indexedcolors)
    // Lets assign indices

    // Quantize the EndPoints
    CGU_INT F16EndPoints[MAX_BC6H_MODES + 1][MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];   // temp endpoints used during calculations
    CGU_INT quantEndPoints[MAX_BC6H_MODES + 1][MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG]; // endpoints to save for a given mode

    // ModePartition[] starts from 1 to 14
    // If we have a shape pattern set the loop to check modes from 1 to 10 else from 11 to 14
    // of the ModePartition table
    CGU_INT min_mode = (BC6H_data->region == 2) ? 1 : 11;
    CGU_INT max_mode = (BC6H_data->region == 2) ? MAX_TWOREGION_MODES : MAX_BC6H_MODES;

    CGU_BOOL fits[15];
    memset((CGU_UINT8 *)fits, 0, sizeof(fits));

    CGU_INT bestFit = 0;
    CGU_INT bestEndpointMode = 0;
    CGU_FLOAT bestError = CMP_FLOAT_MAX;
    CGU_FLOAT bestEndpointsErr = CMP_FLOAT_MAX;
    CGU_FLOAT endPointErr = 0;

    // Try Optimization for the Mode
    CGU_FLOAT best_EndPoints[MAX_BC6H_MODES + 1][MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];
    CGU_INT best_Indices[MAX_BC6H_MODES + 1][MAX_SUBSETS][MAX_SUBSET_SIZE];
    CGU_FLOAT opt_toterr[MAX_BC6H_MODES + 1] = { 0 };

    memset((CGU_UINT8 *)opt_toterr, 0, sizeof(opt_toterr));

    CGU_INT numfits = 0;
    //
    // Notes; Only the endpoints are varying; the indices stay fixed in values!
    // so to optimize which mode we need only check the endpoints error against our original to pick the mode to save
    //
    for (CGU_INT modes = min_mode; modes <= max_mode; ++modes)
    {
        // Start each mode from the block's current best endpoints/indices.
        memcpy((CGU_UINT8 *)best_EndPoints[modes], (CGU_UINT8 *)BC6H_data->fEndPoints, sizeof(BC6H_data->fEndPoints));
        memcpy((CGU_UINT8 *)best_Indices[modes] , (CGU_UINT8 *)BC6H_data->shape_indices, sizeof(BC6H_data->shape_indices));

        {
            QuantizeEndPointToF16Prec(best_EndPoints[modes], F16EndPoints[modes], max_subsets, ModePartition[ModeFitOrder[modes]].nbits, BC6H_data->issigned);
        }

        // Indices data to save for given mode
        SwapIndices(F16EndPoints[modes], best_Indices[modes], BC6H_data->entryCount, max_subsets, ModeFitOrder[modes], BC6H_data->d_shape_index);
        CGU_BOOL transformfit = TransformEndPoints(BC6H_data, F16EndPoints[modes], quantEndPoints[modes], max_subsets, ModeFitOrder[modes]);
        fits[modes] = endpts_fit(F16EndPoints[modes], quantEndPoints[modes], ModeFitOrder[modes], max_subsets, BC6H_data->issigned);

        if (fits[modes] && transformfit)
        {
            numfits++;

            // The new compressed end points fit the mode
            // recalculate the error for this mode with a new set of indices
            // since we have shifted the end points from what we origially calc
            // from the find_bestpattern
            CGU_FLOAT uncompressed[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];
            if (BC6H_data->region == 1)
                decompress_endpoints1(BC6H_data, quantEndPoints[modes], uncompressed, ModeFitOrder[modes]);
            else
                decompress_endpoints2(BC6H_data, quantEndPoints[modes], uncompressed, ModeFitOrder[modes]);
            // Takes the end points and creates a pallet of colors
            // based on preset weights along a vector formed by the two end points
            palitizeEndPointsF(BC6H_data, uncompressed);

            // Once we have the pallet - recalculate the optimal indices using the pallet
            // and the original image data stored in BC6H_data.din[]
            if (!BC6H_data->issigned)
                ReIndexShapef(BC6H_data, best_Indices[modes]);

            // Calculate the error of the new tile vs the old tile data
            opt_toterr[modes] = CalcShapeError(BC6H_data, uncompressed, true);
            if (BC6H_data->region == 1)
            {
                // For one-region blocks also track which mode has the best
                // endpoint reconstruction error (used in bestFit selection below).
                endPointErr = CalcOneRegionEndPtsError(BC6H_data, uncompressed, best_Indices[modes]);
                if (endPointErr < bestEndpointsErr)
                {
                    bestEndpointsErr = endPointErr;
                    bestEndpointMode = modes;
                }
            }

            CGU_BOOL transformFit = true;
            // Save hold this mode fit data if its better than the last one checked.
            if (opt_toterr[modes] < bestError)
            {
                if (!BC6H_data->issigned)
                {
                    // Re-quantize with the re-indexed data before accepting.
                    QuantizeEndPointToF16Prec(uncompressed, F16EndPoints[modes], max_subsets, ModePartition[ModeFitOrder[modes]].nbits, BC6H_data->issigned);
                    SwapIndices(F16EndPoints[modes], best_Indices[modes], BC6H_data->entryCount, max_subsets, ModeFitOrder[modes], BC6H_data->d_shape_index);
                    transformFit = TransformEndPoints(BC6H_data, F16EndPoints[modes], quantEndPoints[modes], max_subsets, ModeFitOrder[modes]);
                }
                if (transformFit)
                {
                    if (BC6H_data->region == 1)
                    {
                        // NOTE(review): this ternary chain reduces to
                        // min(modes, bestEndpointMode) - kept as written.
                        bestFit = (modes == bestEndpointMode) ? modes : ((modes < bestEndpointMode) ? modes : bestEndpointMode);
                    }
                    else
                    {
                        bestFit = modes;
                    }
                    bestError = opt_toterr[bestFit];
                    error = bestError;
                }
            }

        }
    }

    if (numfits > 0)
    {
        // Commit the winning mode's endpoints and indices to the encode state.
        SaveCompressedBlockData(BC6H_data, quantEndPoints[bestFit], best_Indices[bestFit], max_subsets, ModeFitOrder[bestFit]);
        return error;
    }

    // Should not get here! (no mode could losslessly hold the endpoints)
    return error;
}
// Compresses one 4x4 tile (already loaded into BC6HEncode_local->din) to a
// 16-byte BC6H block at outdata[destIdx]. Tries the single-region pattern and
// all 32 two-region partitions via FindBestPattern, keeps the best, then runs
// EncodePattern and packs the result with SaveDataBlock.
void CompressBlockBC6_Internal(CMP_GLOBAL unsigned char*outdata,
                               CGU_UINT32 destIdx,
                               BC6H_Encode_local * BC6HEncode_local,
                               CMP_GLOBAL const BC6H_Encode *BC6HEncode)
{
    //printf("---SRC---\n");
    //CGU_UINT8 blkindex = 0;
    //CGU_UINT8 srcindex = 0;
    //for ( CGU_INT32 j = 0; j < 16; j++) {
    //    printf("%5.0f,",BC6HEncode_local->din[j][0]);// R
    //    printf("%5.0f,",BC6HEncode_local->din[j][1]);// G
    //    printf("%5.0f,",BC6HEncode_local->din[j][2]);// B
    //    printf("%5.0f\n,",BC6HEncode_local->din[j][3]);// No Alpha
    //}

    // Fallback block pattern (decodes to a solid debug color) used when no
    // mode was selected (m_mode == 0) below.
    CGU_UINT8 Cmp_Red_Block[16] = { 0xc2,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xe0,0x03,0x00,0x00,0x00,0x00,0x00 };

    CGU_FLOAT bestError = CMP_FLOAT_MAX;
    CGU_FLOAT error = CMP_FLOAT_MAX;
    CGU_INT8 bestShape = 0;
    CGU_FLOAT quality = BC6HEncode->m_quality;
    BC6HEncode_local->issigned = BC6HEncode->m_isSigned;

    // run through no partition first (bestShape == -1 marks "one region")
    error = FindBestPattern(BC6HEncode_local, false, 0, quality);
    if (error < bestError)
    {
        bestError = error;
        bestShape = -1;

        // Snapshot the winning state: FindBestPattern overwrites these fields
        // on every call, so the best candidate must be saved aside.
        memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_shape_indices,(CGU_UINT8 *) BC6HEncode_local->shape_indices, sizeof(BC6HEncode_local->shape_indices));
        memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_partition    ,(CGU_UINT8 *) BC6HEncode_local->partition, sizeof(BC6HEncode_local->partition));
        memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_fEndPoints   ,(CGU_UINT8 *) BC6HEncode_local->fEndPoints, sizeof(BC6HEncode_local->fEndPoints));
        memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_entryCount   ,(CGU_UINT8 *) BC6HEncode_local->entryCount, sizeof(BC6HEncode_local->entryCount));
        BC6HEncode_local->d_shape_index = bestShape;
    }


    // run through 32 possible partition set
    for (CGU_INT8 shape = 0; shape < MAX_BC6H_PARTITIONS; shape++)
    {
        error = FindBestPattern(BC6HEncode_local, true, shape, quality);
        if (error < bestError)
        {
            bestError = error;
            bestShape = shape;

            memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_shape_indices, (CGU_UINT8 *)BC6HEncode_local->shape_indices, sizeof(BC6HEncode_local->shape_indices));
            memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_partition    , (CGU_UINT8 *)BC6HEncode_local->partition, sizeof(BC6HEncode_local->partition));
            memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_fEndPoints   , (CGU_UINT8 *)BC6HEncode_local->fEndPoints, sizeof(BC6HEncode_local->fEndPoints));
            memcpy((CGU_UINT8 *)BC6HEncode_local->cur_best_entryCount   , (CGU_UINT8 *)BC6HEncode_local->entryCount, sizeof(BC6HEncode_local->entryCount));
            BC6HEncode_local->d_shape_index = bestShape;
        }
        else
        {
            // This shape lost: restore the saved best so the working fields
            // always hold the best candidate when the loop exits.
            if (bestShape != -1)
            {
                BC6HEncode_local->d_shape_index = bestShape;
                memcpy((CGU_UINT8 *)BC6HEncode_local->shape_indices, (CGU_UINT8 *)BC6HEncode_local->cur_best_shape_indices, sizeof(BC6HEncode_local->shape_indices));
                memcpy((CGU_UINT8 *)BC6HEncode_local->partition    , (CGU_UINT8 *)BC6HEncode_local->cur_best_partition, sizeof(BC6HEncode_local->partition));
                memcpy((CGU_UINT8 *)BC6HEncode_local->fEndPoints   , (CGU_UINT8 *)BC6HEncode_local->cur_best_fEndPoints, sizeof(BC6HEncode_local->fEndPoints));
                memcpy((CGU_UINT8 *)BC6HEncode_local->entryCount   , (CGU_UINT8 *)BC6HEncode_local->cur_best_entryCount, sizeof(BC6HEncode_local->entryCount));
            }
        }
    }

    bestError = EncodePattern(BC6HEncode_local, bestError);


    // used for debugging modes, set the value you want to debug with
    if (BC6HEncode_local->m_mode != 0)
    {
        // do final encoding and save to output block
        SaveDataBlock(BC6HEncode_local, &outdata[destIdx]);
    }
    else
    {
        // No mode selected: emit the debug fallback block.
        for (CGU_INT i = 0; i < 16; i++)
            outdata[destIdx + i] = Cmp_Red_Block[i];
    }
}
+//============================================== USER INTERFACES ======================================================== + +#ifndef ASPM_GPU +#ifndef ASPM +//======================= DECOMPRESS ========================================= +using namespace std; + +static AMD_BC6H_Format extract_format(const CGU_UINT8 in[COMPRESSED_BLOCK_SIZE]) +{ + AMD_BC6H_Format bc6h_format; + unsigned short decvalue; + CGU_UINT8 iData[COMPRESSED_BLOCK_SIZE]; + memcpy(iData,in,COMPRESSED_BLOCK_SIZE); + + memset(&bc6h_format,0,sizeof(AMD_BC6H_Format)); + + // 2 bit mode has Mode bit:2 = 0 and mode bits:1 = 0 or 1 + // 5 bit mode has Mode bit:2 = 1 + if ((in[0]&0x02) > 0) + { + decvalue = (in[0]&0x1F); // first five bits + } + else + { + decvalue = (in[0]&0x01); // first two bits + } + + BitHeader header(in,16); + + switch (decvalue) + { + case 0x00: + bc6h_format.m_mode = 1; // 10:5:5:5 + bc6h_format.wBits = 10; + bc6h_format.tBits[C_RED] = 5; + bc6h_format.tBits[C_GREEN] = 5; + bc6h_format.tBits[C_BLUE] = 5; + bc6h_format.rw = header.getvalue(5 ,10); // 10: rw[9:0] + bc6h_format.rx = header.getvalue(35,5); // 5: rx[4:0] + bc6h_format.ry = header.getvalue(65,5); // 5: ry[4:0] + bc6h_format.rz = header.getvalue(71,5); // 5: rz[4:0] + bc6h_format.gw = header.getvalue(15,10); // 10: gw[9:0] + bc6h_format.gx = header.getvalue(45,5); // 5: gx[4:0] + bc6h_format.gy = header.getvalue(41,4) | // 5: gy[3:0] + (header.getvalue(2,1) << 4); // gy[4] + bc6h_format.gz = header.getvalue(51,4) | // 5: gz[3:0] + (header.getvalue(40,1) << 4); // gz[4] + bc6h_format.bw = header.getvalue(25,10); // 10: bw[9:0] + bc6h_format.bx = header.getvalue(55,5); // 5: bx[4:0] + bc6h_format.by = header.getvalue(61,4) | // 5: by[3:0] + (header.getvalue(3,1) << 4); // by[4] + bc6h_format.bz = header.getvalue(50,1) | // 5: bz[0] + (header.getvalue(60,1) << 1) | // bz[1] + (header.getvalue(70,1) << 2) | // bz[2] + (header.getvalue(76,1) << 3) | // bz[3] + (header.getvalue(4 ,1) << 4); // bz[4] + break; + case 0x01: + 
bc6h_format.m_mode = 2; // 7:6:6:6 + bc6h_format.wBits = 7; + bc6h_format.tBits[C_RED] = 6; + bc6h_format.tBits[C_GREEN] = 6; + bc6h_format.tBits[C_BLUE] = 6; + bc6h_format.rw = header.getvalue(5,7); // 7: rw[6:0] + bc6h_format.rx = header.getvalue(35,6); // 6: rx[5:0] + bc6h_format.ry = header.getvalue(65,6); // 6: ry[5:0] + bc6h_format.rz = header.getvalue(71,6); // 6: rz[5:0] + bc6h_format.gw = header.getvalue(15,7); // 7: gw[6:0] + bc6h_format.gx = header.getvalue(45,6); // 6: gx[5:0] + bc6h_format.gy = header.getvalue(41,4) | // 6: gy[3:0] + (header.getvalue(24,1) << 4) | // gy[4] + (header.getvalue(2,1) << 5); // gy[5] + bc6h_format.gz = header.getvalue(51,4) | // 6: gz[3:0] + (header.getvalue(3,1) << 4) | // gz[4] + (header.getvalue(4,1) << 5); // gz[5] + bc6h_format.bw = header.getvalue(25,7); // 7: bw[6:0] + bc6h_format.bx = header.getvalue(55,6); // 6: bx[5:0] + bc6h_format.by = header.getvalue(61,4) | // 6: by[3:0] + (header.getvalue(14,1) << 4) | // by[4] + (header.getvalue(22,1) << 5); // by[5] + bc6h_format.bz = header.getvalue(12,1) | // 6: bz[0] + (header.getvalue(13,1) << 1) | // bz[1] + (header.getvalue(23,1) << 2) | // bz[2] + (header.getvalue(32,1) << 3) | // bz[3] + (header.getvalue(34,1) << 4) | // bz[4] + (header.getvalue(33,1) << 5); // bz[5] + break; + case 0x02: + bc6h_format.m_mode = 3; // 11:5:4:4 + bc6h_format.wBits = 11; + bc6h_format.tBits[C_RED] = 5; + bc6h_format.tBits[C_GREEN] = 4; + bc6h_format.tBits[C_BLUE] = 4; + bc6h_format.rw = header.getvalue(5,10) | //11: rw[9:0] + (header.getvalue(40,1) << 10); // rw[10] + bc6h_format.rx = header.getvalue(35,5); // 5: rx[4:0] + bc6h_format.ry = header.getvalue(65,5); // 5: ry[4:0] + bc6h_format.rz = header.getvalue(71,5); // 5: rz[4:0] + bc6h_format.gw = header.getvalue(15,10) | //11: gw[9:0] + (header.getvalue(49,1) << 10); // gw[10] + bc6h_format.gx = header.getvalue(45,4); //4: gx[3:0] + bc6h_format.gy = header.getvalue(41,4); //4: gy[3:0] + bc6h_format.gz = header.getvalue(51,4); //4: 
gz[3:0] + bc6h_format.bw = header.getvalue(25,10) | //11: bw[9:0] + (header.getvalue(59,1) << 10); // bw[10] + bc6h_format.bx = header.getvalue(55,4); //4: bx[3:0] + bc6h_format.by = header.getvalue(61,4); //4: by[3:0] + bc6h_format.bz = header.getvalue(50,1) | //4: bz[0] + (header.getvalue(60,1) << 1) | // bz[1] + (header.getvalue(70,1) << 2) | // bz[2] + (header.getvalue(76,1) << 3); // bz[3] + break; + case 0x06: + bc6h_format.m_mode = 4; // 11:4:5:4 + bc6h_format.wBits = 11; + bc6h_format.tBits[C_RED] = 4; + bc6h_format.tBits[C_GREEN] = 5; + bc6h_format.tBits[C_BLUE] = 4; + bc6h_format.rw = header.getvalue(5,10) | //11: rw[9:0] + (header.getvalue(39,1) << 10); // rw[10] + bc6h_format.rx = header.getvalue(35,4); //4: rx[3:0] + bc6h_format.ry = header.getvalue(65,4); //4: ry[3:0] + bc6h_format.rz = header.getvalue(71,4); //4: rz[3:0] + bc6h_format.gw = header.getvalue(15,10) | //11: gw[9:0] + (header.getvalue(50,1) << 10); // gw[10] + bc6h_format.gx = header.getvalue(45,5); //5: gx[4:0] + bc6h_format.gy = header.getvalue(41,4) | //5: gy[3:0] + (header.getvalue(75,1) << 4); // gy[4] + bc6h_format.gz = header.getvalue(51,4) | //5: gz[3:0] + (header.getvalue(40,1) << 4); // gz[4] + bc6h_format.bw = header.getvalue(25,10) | //11: bw[9:0] + (header.getvalue(59,1) << 10); // bw[10] + bc6h_format.bx = header.getvalue(55,4); //4: bx[3:0] + bc6h_format.by = header.getvalue(61,4); //4: by[3:0] + bc6h_format.bz = header.getvalue(69,1) | //4: bz[0] + (header.getvalue(60,1) << 1) | // bz[1] + (header.getvalue(70,1) << 2) | // bz[2] + (header.getvalue(76,1) << 3); // bz[3] + break; + case 0x0A: + bc6h_format.m_mode = 5; // 11:4:4:5 + bc6h_format.wBits = 11; + bc6h_format.tBits[C_RED] = 4; + bc6h_format.tBits[C_GREEN] = 4; + bc6h_format.tBits[C_BLUE] = 5; + bc6h_format.rw = header.getvalue(5,10) | //11: rw[9:0] + (header.getvalue(39,1) << 10); // rw[10] + bc6h_format.rx = header.getvalue(35,4); //4: rx[3:0] + bc6h_format.ry = header.getvalue(65,4); //4: ry[3:0] + bc6h_format.rz 
= header.getvalue(71,4); //4: rz[3:0] + bc6h_format.gw = header.getvalue(15,10) | //11: gw[9:0] + (header.getvalue(49,1) << 10); // gw[10] + bc6h_format.gx = header.getvalue(45,4); //4: gx[3:0] + bc6h_format.gy = header.getvalue(41,4); //4: gy[3:0] + bc6h_format.gz = header.getvalue(51,4); //4: gz[3:0] + bc6h_format.bw = header.getvalue(25,10) | //11: bw[9:0] + (header.getvalue(60,1) << 10); // bw[10] + bc6h_format.bx = header.getvalue(55,5); //5: bx[4:0] + bc6h_format.by = header.getvalue(61,4); //5: by[3:0] + (header.getvalue(40,1) << 4); // by[4] + bc6h_format.bz = header.getvalue(50,1) | //5: bz[0] + (header.getvalue(69,1) << 1) | // bz[1] + (header.getvalue(70,1) << 2) | // bz[2] + (header.getvalue(76,1) << 3) | // bz[3] + (header.getvalue(75,1) << 4); // bz[4] + break; + case 0x0E: + bc6h_format.m_mode = 6; // 9:5:5:5 + bc6h_format.wBits = 9; + bc6h_format.tBits[C_RED] = 5; + bc6h_format.tBits[C_GREEN] = 5; + bc6h_format.tBits[C_BLUE] = 5; + bc6h_format.rw = header.getvalue(5,9); //9: rw[8:0] + bc6h_format.gw = header.getvalue(15,9); //9: gw[8:0] + bc6h_format.bw = header.getvalue(25,9); //9: bw[8:0] + bc6h_format.rx = header.getvalue(35,5); //5: rx[4:0] + bc6h_format.gx = header.getvalue(45,5); //5: gx[4:0] + bc6h_format.bx = header.getvalue(55,5); //5: bx[4:0] + bc6h_format.ry = header.getvalue(65,5); //5: ry[4:0] + bc6h_format.gy = header.getvalue(41,4) | //5: gy[3:0] + (header.getvalue(24,1) << 4); // gy[4] + bc6h_format.by = header.getvalue(61,4) | //5: by[3:0] + (header.getvalue(14,1) << 4); // by[4] + bc6h_format.rz = header.getvalue(71,5); //5: rz[4:0] + bc6h_format.gz = header.getvalue(51,4) | //5: gz[3:0] + (header.getvalue(40,1) << 4); // gz[4] + bc6h_format.bz = header.getvalue(50,1) | //5: bz[0] + (header.getvalue(60,1) << 1) | // bz[1] + (header.getvalue(70,1) << 2) | // bz[2] + (header.getvalue(76,1) << 3) | // bz[3] + (header.getvalue(34,1) << 4); // bz[4] + break; + case 0x12: + bc6h_format.m_mode = 7; // 8:6:5:5 + bc6h_format.wBits = 8; + 
bc6h_format.tBits[C_RED] = 6; + bc6h_format.tBits[C_GREEN] = 5; + bc6h_format.tBits[C_BLUE] = 5; + bc6h_format.rw = header.getvalue(5,8); //8: rw[7:0] + bc6h_format.gw = header.getvalue(15,8); //8: gw[7:0] + bc6h_format.bw = header.getvalue(25,8); //8: bw[7:0] + bc6h_format.rx = header.getvalue(35,6); //6: rx[5:0] + bc6h_format.gx = header.getvalue(45,5); //5: gx[4:0] + bc6h_format.bx = header.getvalue(55,5); //5: bx[4:0] + bc6h_format.ry = header.getvalue(65,6); //6: ry[5:0] + bc6h_format.gy = header.getvalue(41,4) | //5: gy[3:0] + (header.getvalue(24,1) << 4); // gy[4] + bc6h_format.by = header.getvalue(61,4) | //5: by[3:0] + (header.getvalue(14,1) << 4); // by[4] + bc6h_format.rz = header.getvalue(71,6); //6: rz[5:0] + bc6h_format.gz = header.getvalue(51,4) | //5: gz[3:0] + (header.getvalue(13,1) << 4); // gz[4] + bc6h_format.bz = header.getvalue(50,1) | //5: bz[0] + (header.getvalue(60,1) << 1) | // bz[1] + (header.getvalue(23,1) << 2) | // bz[2] + (header.getvalue(33,1) << 3) | // bz[3] + (header.getvalue(34,1) << 4); // bz[4] + break; + case 0x16: + bc6h_format.m_mode = 8; // 8:5:6:5 + bc6h_format.wBits = 8; + bc6h_format.tBits[C_RED] = 5; + bc6h_format.tBits[C_GREEN] = 6; + bc6h_format.tBits[C_BLUE] = 5; + bc6h_format.rw = header.getvalue(5,8); //8: rw[7:0] + bc6h_format.gw = header.getvalue(15,8); //8: gw[7:0] + bc6h_format.bw = header.getvalue(25,8); //8: bw[7:0] + bc6h_format.rx = header.getvalue(35,5); //5: rx[4:0] + bc6h_format.gx = header.getvalue(45,6); //6: gx[5:0] + bc6h_format.bx = header.getvalue(55,5); //5: bx[4:0] + bc6h_format.ry = header.getvalue(65,5); //5: ry[4:0] + bc6h_format.gy = header.getvalue(41,4) | //6: gy[3:0] + (header.getvalue(24,1) << 4) | // gy[4] + (header.getvalue(23,1) << 5); // gy[5] + bc6h_format.by = header.getvalue(61,4) | //5: by[3:0] + (header.getvalue(14,1) << 4); // by[4] + bc6h_format.rz = header.getvalue(71,5); //5: rz[4:0] + bc6h_format.gz = header.getvalue(51,4) | //6: gz[3:0] + (header.getvalue(40,1) << 4) | // 
gz[4] + (header.getvalue(33,1) << 5); // gz[5] + bc6h_format.bz = header.getvalue(13,1) | //5: bz[0] + (header.getvalue(60,1) << 1) | // bz[1] + (header.getvalue(70,1) << 2) | // bz[2] + (header.getvalue(76,1) << 3) | // bz[3] + (header.getvalue(34,1) << 4); // bz[4] + break; + case 0x1A: + bc6h_format.m_mode = 9; // 8:5:5:6 + bc6h_format.wBits = 8; + bc6h_format.tBits[C_RED] = 5; + bc6h_format.tBits[C_GREEN] = 5; + bc6h_format.tBits[C_BLUE] = 6; + bc6h_format.rw = header.getvalue(5,8); //8: rw[7:0] + bc6h_format.gw = header.getvalue(15,8); //8: gw[7:0] + bc6h_format.bw = header.getvalue(25,8); //8: bw[7:0] + bc6h_format.rx = header.getvalue(35,5); //5: rx[4:0] + bc6h_format.gx = header.getvalue(45,5); //5: gx[4:0] + bc6h_format.bx = header.getvalue(55,6); //6: bx[5:0] + bc6h_format.ry = header.getvalue(65,5); //5: ry[4:0] + bc6h_format.gy = header.getvalue(41,4) | //5: gy[3:0] + (header.getvalue(24,1) << 4); // gy[4] + bc6h_format.by = header.getvalue(61,4) | //6: by[3:0] + (header.getvalue(14,1) << 4) | // by[4] + (header.getvalue(23,1) << 5); // by[5] + bc6h_format.rz = header.getvalue(71,5); //5: rz[4:0] + bc6h_format.gz = header.getvalue(51,4) | //5: gz[3:0] + (header.getvalue(40,1) << 4); // gz[4] + bc6h_format.bz = header.getvalue(50,1) | //6: bz[0] + (header.getvalue(13,1) << 1) | // bz[1] + (header.getvalue(70,1) << 2) | // bz[2] + (header.getvalue(76,1) << 3) | // bz[3] + (header.getvalue(34,1) << 4) | // bz[4] + (header.getvalue(33,1) << 5); // bz[5] + break; + case 0x1E: + bc6h_format.m_mode = 10; // 6:6:6:6 + bc6h_format.istransformed = FALSE; + bc6h_format.wBits = 6; + bc6h_format.tBits[C_RED] = 6; + bc6h_format.tBits[C_GREEN] = 6; + bc6h_format.tBits[C_BLUE] = 6; + bc6h_format.rw = header.getvalue(5,6); //6: rw[5:0] + bc6h_format.gw = header.getvalue(15,6); //6: gw[5:0] + bc6h_format.bw = header.getvalue(25,6); //6: bw[5:0] + bc6h_format.rx = header.getvalue(35,6); //6: rx[5:0] + bc6h_format.gx = header.getvalue(45,6); //6: gx[5:0] + bc6h_format.bx = 
header.getvalue(55,6); //6: bx[5:0] + bc6h_format.ry = header.getvalue(65,6); //6: ry[5:0] + bc6h_format.gy = header.getvalue(41,4) | //6: gy[3:0] + (header.getvalue(24,1) << 4) | // gy[4] + (header.getvalue(21,1) << 5); // gy[5] + bc6h_format.by = header.getvalue(61,4) | //6: by[3:0] + (header.getvalue(14,1) << 4) | // by[4] + (header.getvalue(22,1) << 5); // by[5] + bc6h_format.rz = header.getvalue(71,6); //6: rz[5:0] + bc6h_format.gz = header.getvalue(51,4) | //6: gz[3:0] + (header.getvalue(11,1) << 4) | // gz[4] + (header.getvalue(31,1) << 5); // gz[5] + bc6h_format.bz = header.getvalue(12,1) | //6: bz[0] + (header.getvalue(13,1) << 1) | // bz[1] + (header.getvalue(23,1) << 2) | // bz[2] + (header.getvalue(32,1) << 3) | // bz[3] + (header.getvalue(34,1) << 4) | // bz[4] + (header.getvalue(33,1) << 5); // bz[5] + break; + + // Single region modes + case 0x03: + bc6h_format.m_mode = 11; // 10:10 + bc6h_format.wBits = 10; + bc6h_format.tBits[C_RED] = 10; + bc6h_format.tBits[C_GREEN] = 10; + bc6h_format.tBits[C_BLUE] = 10; + bc6h_format.rw = header.getvalue(5,10); // 10: rw[9:0] + bc6h_format.gw = header.getvalue(15,10); // 10: gw[9:0] + bc6h_format.bw = header.getvalue(25,10); // 10: bw[9:0] + bc6h_format.rx = header.getvalue(35,10); // 10: rx[9:0] + bc6h_format.gx = header.getvalue(45,10); // 10: gx[9:0] + bc6h_format.bx = header.getvalue(55,10); // 10: bx[9:0] + break; + case 0x07: + bc6h_format.m_mode = 12; // 11:9 + bc6h_format.wBits = 11; + bc6h_format.tBits[C_RED] = 9; + bc6h_format.tBits[C_GREEN] = 9; + bc6h_format.tBits[C_BLUE] = 9; + bc6h_format.rw = header.getvalue(5,10) | // 10: rw[9:0] + (header.getvalue(44,1) << 10); // rw[10] + bc6h_format.gw = header.getvalue(15,10) | // 10: gw[9:0] + (header.getvalue(54,1) << 10); // gw[10] + bc6h_format.bw = header.getvalue(25,10) | // 10: bw[9:0] + (header.getvalue(64,1) << 10); // bw[10] + bc6h_format.rx = header.getvalue(35,9); // 9: rx[8:0] + bc6h_format.gx = header.getvalue(45,9); // 9: gx[8:0] + 
bc6h_format.bx = header.getvalue(55,9); // 9: bx[8:0] + break; + case 0x0B: + bc6h_format.m_mode = 13; // 12:8 + bc6h_format.wBits = 12; + bc6h_format.tBits[C_RED] = 8; + bc6h_format.tBits[C_GREEN] = 8; + bc6h_format.tBits[C_BLUE] = 8; + bc6h_format.rw = header.getvalue(5, 10) | // 12: rw[9:0] + (header.getvalue(43, 1) << 11) | // rw[11] + (header.getvalue(44, 1) << 10); // rw[10] + bc6h_format.gw = header.getvalue(15, 10) | // 12: gw[9:0] + (header.getvalue(53, 1) << 11) | // gw[11] + (header.getvalue(54, 1) << 10); // gw[10] + bc6h_format.bw = header.getvalue(25,10) | // 12: bw[9:0] + (header.getvalue(63, 1) << 11) | // bw[11] + (header.getvalue(64,1) << 10); // bw[10] + bc6h_format.rx = header.getvalue(35,8); // 8: rx[7:0] + bc6h_format.gx = header.getvalue(45,8); // 8: gx[7:0] + bc6h_format.bx = header.getvalue(55,8); // 8: bx[7:0] + break; + case 0x0F: + bc6h_format.m_mode = 14; // 16:4 + bc6h_format.wBits = 16; + bc6h_format.tBits[C_RED] = 4; + bc6h_format.tBits[C_GREEN] = 4; + bc6h_format.tBits[C_BLUE] = 4; + bc6h_format.rw = header.getvalue(5,10) | // 16: rw[9:0] + (header.getvalue(39, 1) << 15) | // rw[15] + (header.getvalue(40, 1) << 14) | // rw[14] + (header.getvalue(41, 1) << 13) | // rw[13] + (header.getvalue(42, 1) << 12) | // rw[12] + (header.getvalue(43, 1) << 11) | // rw[11] + (header.getvalue(44, 1) << 10); // rw[10] + bc6h_format.gw = header.getvalue(15,10) | // 16: gw[9:0] + (header.getvalue(49, 1) << 15) | // gw[15] + (header.getvalue(50, 1) << 14) | // gw[14] + (header.getvalue(51, 1) << 13) | // gw[13] + (header.getvalue(52, 1) << 12) | // gw[12] + (header.getvalue(53, 1) << 11) | // gw[11] + (header.getvalue(54, 1) << 10); // gw[10] + bc6h_format.bw = header.getvalue(25,10) | // 16: bw[9:0] + (header.getvalue(59, 1) << 15) | // bw[15] + (header.getvalue(60, 1) << 14) | // bw[14] + (header.getvalue(61, 1) << 13) | // bw[13] + (header.getvalue(62, 1) << 12) | // bw[12] + (header.getvalue(63, 1) << 11) | // bw[11] + (header.getvalue(64, 1) << 
10); // bw[10] + bc6h_format.rx = header.getvalue(35,4); // 4: rx[3:0] + bc6h_format.gx = header.getvalue(45,4); // 4: gx[3:0] + bc6h_format.bx = header.getvalue(55,4); // 4: bx[3:0] + break; + default: + bc6h_format.m_mode = 0; + return bc6h_format; + } + + // Each format in the mode table can be uniquely identified by the mode bits. + // The first ten modes are used for two-region tiles, and the mode bit field + // can be either two or five bits long. These blocks also have fields for + // the compressed color endpoints (72 or 75 bits), the partition (5 bits), + // and the partition indices (46 bits). + + if (bc6h_format.m_mode <= 10) + { + bc6h_format.region = BC6_TWO; + // Get the shape index bits 77 to 81 + bc6h_format.d_shape_index = (unsigned short) header.getvalue(77,5); + bc6h_format.istransformed = (bc6h_format.m_mode < 10) ? TRUE : FALSE; + } + else + { + bc6h_format.region = BC6_ONE; + bc6h_format.d_shape_index = 0; + bc6h_format.istransformed = (bc6h_format.m_mode > 11) ? TRUE : FALSE; + } + + // Save the points in a form easy to compute with + bc6h_format.EC[0].A[0] = (CGU_FLOAT)bc6h_format.rw; + bc6h_format.EC[0].B[0] = (CGU_FLOAT)bc6h_format.rx; + bc6h_format.EC[1].A[0] = (CGU_FLOAT)bc6h_format.ry; + bc6h_format.EC[1].B[0] = (CGU_FLOAT)bc6h_format.rz; + bc6h_format.EC[0].A[1] = (CGU_FLOAT)bc6h_format.gw; + bc6h_format.EC[0].B[1] = (CGU_FLOAT)bc6h_format.gx; + bc6h_format.EC[1].A[1] = (CGU_FLOAT)bc6h_format.gy; + bc6h_format.EC[1].B[1] = (CGU_FLOAT)bc6h_format.gz; + bc6h_format.EC[0].A[2] = (CGU_FLOAT)bc6h_format.bw; + bc6h_format.EC[0].B[2] = (CGU_FLOAT)bc6h_format.bx; + bc6h_format.EC[1].A[2] = (CGU_FLOAT)bc6h_format.by; + bc6h_format.EC[1].B[2] = (CGU_FLOAT)bc6h_format.bz; + + if (bc6h_format.region == BC6_ONE) + { + int startbits = ONE_REGION_INDEX_OFFSET; + bc6h_format.indices16[0] = (CGU_UINT8) header.getvalue(startbits,3); + startbits+=3; + for (int i=1; i<16; i++) + { + bc6h_format.indices16[i] = (CGU_UINT8)header.getvalue(startbits,4); + 
startbits+=4; + } + } + else + { + int startbit = TWO_REGION_INDEX_OFFSET, + nbits = 2; + bc6h_format.indices16[0 ] = (CGU_UINT8)header.getvalue(startbit,2); + for (int i= 1; i<16; i++) + { + startbit += nbits; // offset start bit for next index using prior nbits used + nbits = g_indexfixups[bc6h_format.d_shape_index] == i?2:3; // get new number of bit to save index with + bc6h_format.indices16[i] = (CGU_UINT8)header.getvalue(startbit,nbits); + } + + } + + return bc6h_format; +} + +static void extract_compressed_endpoints(AMD_BC6H_Format& bc6h_format) +{ + int i; + int t; + + if (bc6h_format.issigned) + { + if (bc6h_format.istransformed) + { + for (i=0; i= 15) + unq = q; + else if (q == 0) + unq = 0; + else if (q == ((1<> prec; + break; + + // here, let's stick with S16 (no apparent quality benefit from going to S17) + // range is (-7c00..7c00)/(-8000..8000) = 31/32 + case SIGNED_F16: + // don't remove this test even though it appears equivalent to the code below + // as it isn't -- the code below can overflow for prec = 16 + if (prec >= 16) + unq = q; + else + { + if (q < 0) { s = 1; q = -q; } else s = 0; + + if (q == 0) + unq = 0; + else if (q >= ((1<<(prec-1))-1)) + unq = s ? 
-S16MAX : S16MAX; + else + { + unq = (q * (S16MAX+1) + (S16MAX+1)/2) >> (prec-1); + if (s) + unq = -unq; + } + } + break; + } + return unq; +} + +static int lerp(int a, int b, int i, int denom) +{ + assert (denom == 3 || denom == 7 || denom == 15); + assert (i >= 0 && i <= denom); + + int shift = 6, *weights = NULL; + + switch(denom) + { + case 3: denom *= 5; i *= 5; // fall through to case 15 + case 15: weights = g_aWeights4; break; + case 7: weights = g_aWeights3; break; + default: assert(0); + } + + #pragma warning(disable:4244) + // no need to round these as this is an exact division + return (int)(a*weights[denom-i] +b*weights[i]) / float(1 << shift); +} + +static int finish_unquantize(AMD_BC6H_Format bc6h_format, int q) +{ + if (bc6h_format.format == UNSIGNED_F16) + return (q * 31) >> 6; // scale the magnitude by 31/64 + else if (bc6h_format.format == SIGNED_F16) + return (q < 0) ? -(((-q) * 31) >> 5) : (q * 31) >> 5; // scale the magnitude by 31/32 + else + return q; +} + +static void generate_palette_quantized(int max, AMD_BC6H_Format& bc6h_format, int region) +{ + // scale endpoints + int a, b, c; // really need a IntVec3... 
+ + a = unquantize(bc6h_format, bc6h_format.E[region].A[0], bc6h_format.wBits); + b = unquantize(bc6h_format, bc6h_format.E[region].B[0], bc6h_format.wBits); + + // interpolate : This part of code is used for debuging data + for (int i = 0; i < max; i++) + { + c = finish_unquantize(bc6h_format, lerp(a, b, i, max-1)); + bc6h_format.Palete[region][i].x = c; + } + + a = unquantize(bc6h_format, bc6h_format.E[region].A[1], bc6h_format.wBits); + b = unquantize(bc6h_format, bc6h_format.E[region].B[1], bc6h_format.wBits); + + // interpolate + for (int i = 0; i < max; i++) + bc6h_format.Palete[region][i].y = finish_unquantize(bc6h_format, lerp(a, b, i, max-1)); + + a = unquantize(bc6h_format,bc6h_format.E[region].A[2], bc6h_format.wBits); + b = unquantize(bc6h_format,bc6h_format.E[region].B[2], bc6h_format.wBits); + + // interpolate + for (int i = 0; i < max; i++) + bc6h_format.Palete[region][i].z = finish_unquantize(bc6h_format, lerp(a, b, i, max-1)); +} + +// NV code : used with modifications +static void extract_compressed_endpoints2(AMD_BC6H_Format& bc6h_format) +{ + int i; + int t; + + if (bc6h_format.issigned) + { + if (bc6h_format.istransformed) + { + for (i=0; i(options); + delete BCOptions; + return CGU_CORE_OK; +} + +int CMP_CDECL SetQualityBC6(void *options, CGU_FLOAT fquality) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + BC6H_Encode *BC6optionsDefault = (BC6H_Encode *)options; + if (fquality < 0.0f) fquality = 0.0f; + else + if (fquality > 1.0f) fquality = 1.0f; + BC6optionsDefault->m_quality = fquality; + BC6optionsDefault->m_partitionSearchSize = (BC6optionsDefault->m_quality*2.0F) / qFAST_THRESHOLD; + if (BC6optionsDefault->m_partitionSearchSize < (1.0F / 16.0F)) + BC6optionsDefault->m_partitionSearchSize = (1.0F / 16.0F); + return CGU_CORE_OK; +} + +int CMP_CDECL SetMaskBC6(void *options, CGU_UINT32 mask) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + BC6H_Encode *BC6options = (BC6H_Encode *)options; + BC6options->m_validModeMask = mask; + 
return CGU_CORE_OK; +} + +int CMP_CDECL CompressBlockBC6(const CGU_UINT16 *srcBlock, + unsigned int srcStrideInShorts, + CMP_GLOBAL CGU_UINT8 cmpBlock[16], + const CMP_GLOBAL void *options = NULL) +{ + + CGU_UINT16 inBlock[48]; + + //---------------------------------- + // Fill the inBlock with source data + //---------------------------------- + CGU_INT srcpos = 0; + CGU_INT dstptr = 0; + for (CGU_UINT8 row = 0; row < 4; row++) + { + srcpos = row * srcStrideInShorts; + for (CGU_UINT8 col = 0; col < 4; col++) + { + inBlock[dstptr++] = CGU_UINT16(srcBlock[srcpos++]); + inBlock[dstptr++] = CGU_UINT16(srcBlock[srcpos++]); + inBlock[dstptr++] = CGU_UINT16(srcBlock[srcpos++]); + } + } + + + BC6H_Encode *BC6HEncode = (BC6H_Encode *)options; + BC6H_Encode BC6HEncodeDefault; + + if (BC6HEncode == NULL) + { + BC6HEncode = &BC6HEncodeDefault; + SetDefaultBC6Options(BC6HEncode); + } + + BC6H_Encode_local BC6HEncode_local; + memset((CGU_UINT8 *)&BC6HEncode_local, 0, sizeof(BC6H_Encode_local)); + CGU_UINT8 blkindex = 0; + for ( CGU_INT32 j = 0; j < 16; j++) { + BC6HEncode_local.din[j][0] = inBlock[blkindex++]; // R + BC6HEncode_local.din[j][1] = inBlock[blkindex++]; // G + BC6HEncode_local.din[j][2] = inBlock[blkindex++]; // B + BC6HEncode_local.din[j][3] = 0; // A + } + + CompressBlockBC6_Internal(cmpBlock, 0, &BC6HEncode_local,BC6HEncode); + + return CGU_CORE_OK; +} + +int CMP_CDECL DecompressBlockBC6(const unsigned char cmpBlock[16], + CGU_UINT16 srcBlock[48], + const void *options = NULL) { + BC6H_Encode *BC6HEncode = (BC6H_Encode *)options; + BC6H_Encode BC6HEncodeDefault; + + if (BC6HEncode == NULL) + { + BC6HEncode = &BC6HEncodeDefault; + SetDefaultBC6Options(BC6HEncode); + } + DecompressBC6_Internal(srcBlock, cmpBlock,BC6HEncode); + + return CGU_CORE_OK; +} + +#endif // !ASPM +#endif // !ASPM_GPU + +//============================================== OpenCL USER INTERFACE ==================================================== +#ifdef ASPM_GPU +CMP_STATIC CMP_KERNEL void 
CMP_GPUEncoder( + CMP_GLOBAL CGU_UINT8* p_source_pixels, + CMP_GLOBAL CGU_UINT8* p_encoded_blocks, + CMP_GLOBAL Source_Info* SourceInfo, + CMP_GLOBAL BC6H_Encode * BC6HEncode +) +{ + CGU_UINT32 x = get_global_id(0); + CGU_UINT32 y = get_global_id(1); + + if (x >= (SourceInfo->m_src_width / BYTEPP)) return; + if (y >= (SourceInfo->m_src_height / BYTEPP)) return; + + BC6H_Encode_local BC6HEncode_local; + memset((CGU_UINT8 *)&BC6HEncode_local, 0, sizeof(BC6H_Encode_local)); + + + CGU_UINT32 stride = SourceInfo->m_src_width * BYTEPP; + CGU_UINT32 srcOffset = (x*BlockX*BYTEPP) + (y*stride*BYTEPP); + CGU_UINT32 destI = (x*COMPRESSED_BLOCK_SIZE) + (y*(SourceInfo->m_src_width / BlockX)*COMPRESSED_BLOCK_SIZE); + CGU_UINT32 srcidx; + + //CGU_FLOAT block4x4[16][4]; + + for (CGU_INT i = 0; i < BlockX; i++) + { + srcidx = i * stride; + for (CGU_INT j = 0; j < BlockY; j++) + { + BC6HEncode_local.din[i*BlockX + j][0] = (CGU_UINT16)(p_source_pixels[srcOffset + srcidx++]); + if (BC6HEncode_local.din[i*BlockX + j][0] < 0.00001 || isnan(BC6HEncode_local.din[i*BlockX + j][0])) + { + if (BC6HEncode->m_isSigned) + { + BC6HEncode_local.din[i*BlockX + j][0] = (isnan(BC6HEncode_local.din[i*BlockX + j][0])) ? F16NEGPREC_LIMIT_VAL : -BC6HEncode_local.din[i*BlockX + j][0]; + if (BC6HEncode_local.din[i*BlockX + j][0] < F16NEGPREC_LIMIT_VAL) { + BC6HEncode_local.din[i*BlockX + j][0] = F16NEGPREC_LIMIT_VAL; + } + } + else + BC6HEncode_local.din[i*BlockX + j][0] = 0.0; + } + + BC6HEncode_local.din[i*BlockX + j][1] = (CGU_UINT16)(p_source_pixels[srcOffset + srcidx++]); + + if (BC6HEncode_local.din[i*BlockX + j][1] < 0.00001 || isnan(BC6HEncode_local.din[i*BlockX + j][1])) + { + if (BC6HEncode->m_isSigned) + { + BC6HEncode_local.din[i*BlockX + j][1] = (isnan(BC6HEncode_local.din[i*BlockX + j][1])) ? 
F16NEGPREC_LIMIT_VAL : -BC6HEncode_local.din[i*BlockX + j][1]; + if (BC6HEncode_local.din[i*BlockX + j][1] < F16NEGPREC_LIMIT_VAL) { + BC6HEncode_local.din[i*BlockX + j][1] = F16NEGPREC_LIMIT_VAL; + } + } + else + BC6HEncode_local.din[i*BlockX + j][1] = 0.0; + } + + BC6HEncode_local.din[i*BlockX + j][2] = (CGU_UINT16)(p_source_pixels[srcOffset + srcidx++]); + if (BC6HEncode_local.din[i*BlockX + j][2] < 0.00001 || isnan(BC6HEncode_local.din[i*BlockX + j][2])) + { + if (BC6HEncode->m_isSigned) + { + BC6HEncode_local.din[i*BlockX + j][2] = (isnan(BC6HEncode_local.din[i*BlockX + j][2])) ? F16NEGPREC_LIMIT_VAL : -BC6HEncode_local.din[i*BlockX + j][2]; + if (BC6HEncode_local.din[i*BlockX + j][2] < F16NEGPREC_LIMIT_VAL) { + BC6HEncode_local.din[i*BlockX + j][2] = F16NEGPREC_LIMIT_VAL; + } + } + else + BC6HEncode_local.din[i*BlockX + j][2] = 0.0; + } + + BC6HEncode_local.din[i*BlockX + j][3] = 0.0f; + //printf("Ori---src image %d, --%02x", x, (p_source_pixels[srcOffset + srcidx++]) & 0x0000ff); //for debug + } + } + + // printf(" X %3d Y %3d Quality %2.2f", x, y, BC6HEncode->m_quality); + CompressBlockBC6_Internal(p_encoded_blocks, destI, &BC6HEncode_local, BC6HEncode); +} +#endif diff --git a/extern/CMP_Core/shaders/BC6_Encode_kernel.h b/extern/CMP_Core/shaders/BC6_Encode_kernel.h new file mode 100644 index 0000000..1a6c206 --- /dev/null +++ b/extern/CMP_Core/shaders/BC6_Encode_kernel.h @@ -0,0 +1,480 @@ +//===================================================================== +// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// +//===================================================================== +#ifndef BC6_ENCODE_KERNEL_H +#define BC6_ENCODE_KERNEL_H + +#include "Common_Def.h" + +#define MAX_TRACE 10 +#define MAX_ENTRIES_QUANT_TRACE 16 +#define BlockX 4 +#define BlockY 4 +#define BYTEPP 4 +#define COMPRESSED_BLOCK_SIZE 16 // Size of a compressed block in bytes +#define MAX_DIMENSION_BIG 4 +#define MAX_SUBSET_SIZE 16 // Largest possible size for an individual subset +#define NUM_BLOCK_TYPES 8 // Number of block types in the format +#define MAX_SUBSETS 3 // Maximum number of possible subsets +#define MAX_PARTITIONS 64 // Maximum number of partition types +#define MAX_ENTRIES 64 +#define MAX_TRY 20 + +#define MAX_PARTITIONS_TABLE (1+64+64) +#define DIMENSION 4 +#define MAX_CLUSTERS_BIG 16 +#define EPSILON 0.000001 +#define MAX_CLUSTERS_QUANT_TRACE 8 + +//# Image Quality will increase as this number gets larger and end-to-end performance time will reduce +#define MAX_INDEX_BITS 4 +#define HIGHQULITY_THRESHOLD 0.7F +#define qFAST_THRESHOLD 0.5F + +#define F16NEGPREC_LIMIT_VAL -2048.0f //f16 negative precision limit value + +#define LOG_CL_RANGE 5 +#define LOG_CL_BASE 2 +#define BIT_BASE 5 +#define BIT_RANGE 9 +#define MAX_CLUSTERS 8 +#define BTT(bits) (bits-BIT_BASE) +#define CLT(cl) (cl-LOG_CL_BASE) +#define MASK(n) ((1<<(n))-1) +#define SIGN_EXTEND_TYPELESS(x,nb) ((((x)&(1<<((nb)-1)))?((~0)<<(nb)):0)|(x)) +#define CMP_HALF_MAX 65504.0f // positive half max + +#ifndef ASPM_GPU +#include +#include +//typedef uint8_t byte; +#else +//typedef bitset uint8_t; +//typedef uint8 byte; +#endif + +#define BC6CompBlockSize 16 +#define BC6BlockX 4 +#define BC6BlockY 4 + +typedef struct +{ + CGU_INT k; + CGU_FLOAT d; +} BC6H_TRACE; + +#define NCHANNELS 3 +#define MAX_END_POINTS 2 +#define MAX_BC6H_MODES 14 +#define MAX_BC6H_PARTITIONS 32 +#define MAX_TWOREGION_MODES 10 +#define COMPRESSED_BLOCK_SIZE 16 // Size of a compressed block in bytes +#define ONE_REGION_INDEX_OFFSET 65 // bit location to 
start saving color index values for single region shape +#define TWO_REGION_INDEX_OFFSET 82 // bit location to start saving color index values for two region shapes +#define MIN_MODE_FOR_ONE_REGION 11 // Two regions shapes use modes 1..9 and single use 11..14 +#define R_0(ep) (ep)[0][0][i] +#define R_1(ep) (ep)[0][1][i] +#define R_2(ep) (ep)[1][0][i] +#define R_3(ep) (ep)[1][1][i] +#define FLT16_MAX 0x7bff + +#ifndef ASPM_GPU +#define USE_SHAKERHD +#endif + +#define USE_NEWRAMP + +typedef struct +{ + CGU_FLOAT A[NCHANNELS]; + CGU_FLOAT B[NCHANNELS]; +} END_Points; + +typedef struct +{ + CGU_FLOAT x, y, z; +} BC6H_Vec3f; + +typedef struct +{ + CGU_INT nbits; // Number of bits + CGU_INT prec[3]; // precission of the Qunatized RGB endpoints + CGU_INT transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + CGU_INT modebits; // number of mode bits + CGU_INT IndexPrec; // Index Precision + CGU_INT mode; // Mode value to save + CGU_INT lowestPrec; // Step size of each precesion incriment +} ModePartitions; + +__constant ModePartitions ModePartition[MAX_BC6H_MODES + 1] = +{ + 0, 0,0,0, 0, 0, 0, 0, 0, // Mode = Invaild + + // Two region Partition + 10, 5,5,5, 1, 2, 3, 0x00, 31, // Mode = 1 + 7, 6,6,6, 1, 2, 3, 0x01, 248, // Mode = 2 + 11, 5,4,4, 1, 5, 3, 0x02, 15, // Mode = 3 + 11, 4,5,4, 1, 5, 3, 0x06, 15, // Mode = 4 + 11, 4,4,5, 1, 5, 3, 0x0a, 15, // Mode = 5 + 9, 5,5,5, 1, 5, 3, 0x0e, 62, // Mode = 6 + 8, 6,5,5, 1, 5, 3, 0x12, 124, // Mode = 7 + 8, 5,6,5, 1, 5, 3, 0x16, 124, // Mode = 8 + 8, 5,5,6, 1, 5, 3, 0x1a, 124, // Mode = 9 + 6, 6,6,6, 0, 5, 3, 0x1e, 496, // Mode = 10 + + // One region Partition + 10, 10,10,10, 0, 5, 4, 0x03, 31, // Mode = 11 + 11, 9,9,9, 1, 5, 4, 0x07, 15, // Mode = 12 + 12, 8,8,8, 1, 5, 4, 0x0b, 7, // Mode = 13 + 16, 4,4,4, 1, 5, 4, 0x0f, 1, // Mode = 14 +}; + +//================================================ +// Mode Pathern order to try on endpoints +// The order can be rearranged to set which modes 
gets processed first +// for now it is set in order. +//================================================ +__constant CGU_INT8 ModeFitOrder[MAX_BC6H_MODES + 1] = +{ + 0, //0: N/A + // ---- 2 region lower bits --- + 1, // 10 5 5 5 + 2, // 7 6 6 6 + 3, // 11 5 4 5 + 4, // 11 4 5 4 + 5, // 11 4 4 5 + 6, // 9 5 5 5 + 7, // 8 6 5 5 + 8, // 8 5 6 5 + 9, // 8 5 5 6 + 10, // 6 6 6 6 + //------ 1 region high bits --- + 11, // 10 10 10 10 + 12, // 11 9 9 9 + 13, // 12 8 8 8 + 14 // 16 4 4 4 +}; + +// The Region2FixUps are for our index[subset = 2][16][3] locations +// indexed by shape region 2 +__constant CGU_INT g_Region2FixUp[32] = +{ + 7 , 3 , 11, 7, + 3 , 11, 9 , 5, + 2 , 12, 7 , 3, + 11, 7 , 11, 3, + 7 , 1 , 0 , 1, + 0 , 1 , 0 , 7, + 0 , 1 , 1 , 0, + 4 , 4 , 1 , 0, +}; + +// Indexed by all shape regions +// Partition Set Fixups for region 1 note region 0 is always at 0 +// that means normally we use 3 bits to define an index value +// if its at the fix up location then its one bit less +__constant CGU_INT g_indexfixups[32] = +{ + 15,15,15,15, + 15,15,15,15, + 15,15,15,15, + 15,15,15,15, + 15, 2, 8, 2, + 2, 8, 8,15, + 2, 8, 2, 2, + 8, 8, 2, 2, +}; + +typedef struct +{ + CGU_INT8 region; // one or two + CGU_INT8 m_mode; // m + CGU_INT8 d_shape_index; // d + CGU_INT rw; // endpt[0].A[0] + CGU_INT rx; // endpt[0].B[0] + CGU_INT ry; // endpt[1].A[0] + CGU_INT rz; // endpt[1].B[0] + CGU_INT gw; // endpt[0].A[1] + CGU_INT gx; // endpt[0].B[1] + CGU_INT gy; // endpt[1].A[1] + CGU_INT gz; // endpt[1].B[1] + CGU_INT bw; // endpt[0].A[2] + CGU_INT bx; // endpt[0].B[2] + CGU_INT by; // endpt[1].A[2] + CGU_INT bz; // endpt[1].B[2] + + union + { + CGU_UINT8 indices[4][4]; // Indices data after header block + CGU_UINT8 indices16[16]; + }; + + union + { + CGU_FLOAT din[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG]; // Original data input as floats + unsigned char cdin[256]; // as uchar to match float + }; + + END_Points EC[MAX_END_POINTS]; // compressed endpoints expressed as endpt[0].A[] and 
endpt[1].B[] + END_Points E[MAX_END_POINTS]; // decompressed endpoints + CGU_BOOL issigned; // Format is 16 bit signed floating point + CGU_BOOL istransformed; // region two: all modes = true except mode=10 + short wBits; // number of bits for the root endpoint + short tBits[NCHANNELS]; // number of bits used for the transformed endpoints + CGU_INT format; // floating point format are we using for decompression + BC6H_Vec3f Paletef[2][16]; + + CGU_INT index; // for debugging + CGU_FLOAT fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG]; + CGU_FLOAT cur_best_fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG]; + CGU_INT shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE]; + CGU_INT cur_best_shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE]; + CGU_INT entryCount[MAX_SUBSETS]; + CGU_INT cur_best_entryCount[MAX_SUBSETS]; + CGU_FLOAT partition[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG]; + CGU_FLOAT cur_best_partition[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG]; + CGU_BOOL optimized; // were end points optimized during final encoding + +} BC6H_Encode_local; + +#ifndef ASPM_GPU +using namespace std; +class BitHeader +{ +public: + BitHeader(const CGU_UINT8 in[], CGU_INT sizeinbytes) + { + m_bits.reset(); + m_sizeinbytes = sizeinbytes; + + if ((in != NULL) && (sizeinbytes <= 16)) + { + // Init bits set with given data + CGU_INT bitpos = 0; + for (CGU_INT i = 0; i < sizeinbytes; i++) + { + CGU_INT bit = 1; + for (CGU_INT j = 0; j < 8; j++) + { + m_bits[bitpos] = in[i] & bit ? 
1 : 0; + bit = bit << 1; + bitpos++; + } + } + } + } + + ~BitHeader() + { + } + + void transferbits(CGU_UINT8 in[], CGU_INT sizeinbytes) + { + if ((sizeinbytes <= m_sizeinbytes) && (in != NULL)) + { + // Init bits set with given data + memset(in, 0, sizeinbytes); + CGU_INT bitpos = 0; + for (CGU_INT i = 0; i < sizeinbytes; i++) + { + CGU_INT bit = 1; + for (CGU_INT j = 0; j < 8; j++) + { + if (m_bits[bitpos]) in[i] |= bit; + bit = bit << 1; + bitpos++; + } + } + } + } + + CGU_INT getvalue(CGU_INT start, CGU_INT bitsize) + { + CGU_INT value = 0; + CGU_INT end = start + bitsize - 1; + for (; end >= start; end--) + { + value |= m_bits[end] ? 1 : 0; + if (end > start) value <<= 1; + } + + return value; + } + + void setvalue(CGU_INT start, CGU_INT bitsize, CGU_INT value, CGU_INT maskshift = 0) + { + CGU_INT end = start + bitsize - 1; + CGU_INT mask = 0x1 << maskshift; + for (; start <= end; start++) + { + m_bits[start] = (value&mask) ? 1 : 0; + mask <<= 1; + } + } + + bitset<128> m_bits; // 16 bytes max + CGU_INT m_sizeinbytes; +}; + +//==================== DECODER CODE ====================== +#define MAXENDPOINTS 2 +#define U16MAX 0xffff +#define S16MAX 0x7fff +#define SIGN_EXTEND(w,tbits) ((((signed(w))&(1<<((tbits)-1)))?((~0)<<(tbits)):0)|(signed(w))) + +enum +{ + UNSIGNED_F16 = 1, + SIGNED_F16 = 2 +}; + +enum +{ + BC6_ONE = 0, + BC6_TWO +}; + +enum +{ + C_RED = 0, + C_GREEN, + C_BLUE +}; + +struct BC6H_Vec3 +{ + int x,y,z; +}; + +struct AMD_BC6H_Format +{ + unsigned short region; // one or two + unsigned short m_mode; // m + int d_shape_index; // d + int rw; // endpt[0].A[0] + int rx; // endpt[0].B[0] + int ry; // endpt[1].A[0] + int rz; // endpt[1].B[0] + int gw; // endpt[0].A[1] + int gx; // endpt[0].B[1] + int gy; // endpt[1].A[1] + int gz; // endpt[1].B[1] + int bw; // endpt[0].A[2] + int bx; // endpt[0].B[2] + int by; // endpt[1].A[2] + int bz; // endpt[1].B[2] + + union + { + CGU_UINT8 indices[4][4]; // Indices data after header block + CGU_UINT8 
indices16[16]; + }; + + float din[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG]; // Original data input + END_Points EC[MAXENDPOINTS]; // compressed endpoints expressed as endpt[0].A[] and endpt[1].B[] + END_Points E[MAXENDPOINTS]; // decompressed endpoints + bool issigned; // Format is 16 bit signed floating point + bool istransformed; // region two: all modes = true except mode=10 + short wBits; // number of bits for the root endpoint + short tBits[NCHANNELS]; // number of bits used for the transformed endpoints + int format; // floating point format are we using for decompression + BC6H_Vec3 Palete[2][16]; + BC6H_Vec3f Paletef[2][16]; + + int index; // for debugging + float fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG]; + float cur_best_fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG]; + int shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE]; + int cur_best_shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE]; + int entryCount[MAX_SUBSETS]; + int cur_best_entryCount[MAX_SUBSETS]; + float partition[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG]; + float cur_best_partition[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG]; + bool optimized; // were end points optimized during final encoding +}; + +// =================================== END OF DECODER CODE ======================================================== +#endif + +//------------------------------------------------- +// Set by Host : Read only in kernel +//------------------------------------------------- +typedef struct +{ + // Setup at initialization time + CGU_FLOAT m_quality; + CGU_FLOAT m_performance; + CGU_FLOAT m_errorThreshold; + CGU_DWORD m_validModeMask; + CGU_BOOL m_imageNeedsAlpha; + CGU_BOOL m_colourRestrict; + CGU_BOOL m_alphaRestrict; + CGU_BOOL m_isSigned; +} CMP_BC6HOptions; + +typedef struct +{ + // These are quality parameters used to select when to use the high precision quantizer + // and shaker paths + CGU_FLOAT m_quantizerRangeThreshold; + CGU_FLOAT m_shakerRangeThreshold; + CGU_FLOAT 
m_partitionSearchSize; + + // Setup at initialization time + CGU_FLOAT m_quality; + CGU_FLOAT m_performance; + CGU_FLOAT m_errorThreshold; + CGU_DWORD m_validModeMask; + CGU_BOOL m_imageNeedsAlpha; + CGU_BOOL m_colourRestrict; + CGU_BOOL m_alphaRestrict; + CGU_BOOL m_isSigned; + + // Source image info : must be set prior to use in kernel + CGU_UINT32 m_src_width; + CGU_UINT32 m_src_height; + CGU_UINT32 m_src_stride; + +} BC6H_Encode; + +CMP_STATIC void SetDefaultBC6Options(BC6H_Encode *BC6Encode) +{ + if (BC6Encode) + { + BC6Encode->m_quality = 1.0f; + BC6Encode->m_quantizerRangeThreshold = 0.0f; + BC6Encode->m_shakerRangeThreshold = 0.0f; + BC6Encode->m_partitionSearchSize = 0.20f; + BC6Encode->m_performance = 0.0f; + BC6Encode->m_errorThreshold = 0.0f; + BC6Encode->m_validModeMask = 0; + BC6Encode->m_imageNeedsAlpha = 0; + BC6Encode->m_colourRestrict = 0; + BC6Encode->m_alphaRestrict = 0; + BC6Encode->m_isSigned = 0; + BC6Encode->m_src_width = 4; + BC6Encode->m_src_height = 4; + BC6Encode->m_src_stride = 0; + } +} + +#endif \ No newline at end of file diff --git a/extern/CMP_Core/shaders/BC7_Encode_Kernel.cpp b/extern/CMP_Core/shaders/BC7_Encode_Kernel.cpp new file mode 100644 index 0000000..ef6b1cb --- /dev/null +++ b/extern/CMP_Core/shaders/BC7_Encode_Kernel.cpp @@ -0,0 +1,5489 @@ +//===================================================================== +// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +//===================================================================== +// Ref: GPUOpen-Tools/Compressonator + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016, Intel Corporation +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of +// the Software. 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +//-------------------------------------- +// Common BC7 Header +//-------------------------------------- +#include "BC7_Encode_Kernel.h" + +#ifndef ASPM +//--------------------------------------------- +// Predefinitions for GPU and CPU compiled code +//--------------------------------------------- +#define ENABLE_CODE + +#ifndef ASPM_GPU + // using code for CPU or hybrid (CPU/GPU) + //#include "BC7.h" +#endif + + +INLINE CGU_INT a_compare( const void *arg1, const void *arg2 ) +{ + if (((CMP_di* )arg1)->image-((CMP_di* )arg2)->image > 0 ) return 1; + if (((CMP_di* )arg1)->image-((CMP_di* )arg2)->image < 0 ) return -1; + return 0; +}; + +#endif + +#ifndef ASPM_GPU +CMP_GLOBAL BC7_EncodeRamps BC7EncodeRamps +#ifndef ASPM + = {0} +#endif +; + +//--------------------------------------------- +// CPU: Computes max of two float values +//--------------------------------------------- +float bc7_maxf(float l1, float r1) +{ + return (l1 > r1 ? l1 : r1); +} + +//--------------------------------------------- +// CPU: Computes max of two float values +//--------------------------------------------- +float bc7_minf(float l1, float r1) +{ + return (l1 < r1 ? 
l1 : r1); +} + +#endif + +INLINE CGV_EPOCODE shift_right_epocode(CGV_EPOCODE v, CGU_INT bits) +{ + return v>>bits; // (perf warning expected) +} + +INLINE CGV_EPOCODE expand_epocode(CGV_EPOCODE v, CGU_INT bits) +{ + CGV_EPOCODE vv = v<<(8-bits); + return vv + shift_right_epocode(vv, bits); +} + +// valid bit range is 0..8 +CGU_INT expandbits(CGU_INT bits, CGU_INT v) +{ + return ( v << (8-bits) | v >> (2* bits - 8)); +} + +CMP_EXPORT CGU_INT bc7_isa() { +#if defined(ISPC_TARGET_SSE2) + ASPM_PRINT(("SSE2")); + return 0; +#elif defined(ISPC_TARGET_SSE4) + ASPM_PRINT(("SSE4")); + return 1; +#elif defined(ISPC_TARGET_AVX) + ASPM_PRINT(("AVX")); + return 2; +#elif defined(ISPC_TARGET_AVX2) + ASPM_PRINT(("AVX2")); + return 3; +#else + ASPM_PRINT(("CPU")); + return -1; +#endif +} + +CMP_EXPORT void init_BC7ramps() +{ +#ifdef ASPM_GPU +#else + CMP_STATIC CGU_BOOL g_rampsInitialized = FALSE; + if (g_rampsInitialized == TRUE) return; + g_rampsInitialized = TRUE; + BC7EncodeRamps.ramp_init = TRUE; + + //bc7_isa(); ASPM_PRINT((" INIT Ramps\n")); + + CGU_INT bits; + CGU_INT p1; + CGU_INT p2; + CGU_INT clogBC7; + CGU_INT index; + CGU_INT j; + CGU_INT o1; + CGU_INT o2; + CGU_INT maxi = 0; + + + for (bits = BIT_BASE; bits maxi) maxi = index; + BC7EncodeRamps.ramp[(CLT(clogBC7)*4*256*256*16)+(BTT(bits)*256*256*16)+(p1*256*16)+(p2*16)+index] = + //floor((CGV_IMAGE)BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] * (CGV_IMAGE)((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1]))+ 0.5F); + floor(BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] *((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1]))+ 0.5F); + }//index<(1 << clogBC7) + }//p2<(1 << bits) + }//p1<(1 << bits) +#endif + +#ifdef USE_BC7_SP_ERR_IDX + for (j = 0; j<256; j++) + { + for (o1 = 0; o1<2; o1++) + { + for (o2 = 0; o2<2; o2++) + { + for (index = 0; index<16; index++) { + 
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] = 0; + BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1] = 255; + BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(j*2*2*16)+(o1*2*16)+(o2*16)+index] = 255; + } // i<16 + }//o2<2; + }//o1<2 + } //j<256 + + for (p1 = 0; p1<(1 << bits); p1++) + { + for (p2 = 0; p2<(1 << bits); p2++) + { + for (index = 0; index<(1 << clogBC7); index++) + { +#ifdef USE_BC7_RAMP + CGV_EPOCODE floatf = (CGV_EPOCODE)BC7EncodeRamps.ramp[(CLT(clogBC7)*4*256*256*16)+(BTT(bits)*256*256*16)+(p1*256*16)+(p2*16)+index]; +#else + CGV_EPOCODE floatf = floor((CGV_IMAGE)BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] * (CGV_IMAGE)((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1]))+ 0.5F); +#endif + BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(floatf*2*2*16*2)+((p1 & 0x1)*2*16*2)+((p2 & 0x1)*16*2)+(index*2)+0] = p1; + BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(floatf*2*2*16*2)+((p1 & 0x1)*2*16*2)+((p2 & 0x1)*16*2)+(index*2)+1] = p2; + BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(floatf*2*2*16)+((p1 & 0x1)*2*16)+(p2 & 0x1*16)+index] = 0; + } //i<(1 << clogBC7) + } //p2 + }//p1<(1 << bits) + + for (j = 0; j<256; j++) + { + for (o1 = 0; o1<2; o1++) + { + for (o2 = 0; o2<2; o2++) + { + for (index = 0; index<(1 << clogBC7); index++) + { + if ( // check for unitialized sp_idx + (BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] == 0) && + (BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1] == 255) + ) + + { + CGU_INT k; + CGU_INT tf; + CGU_INT tc; + + for (k = 1; k<256; k++) + { + tf = j - k; + tc = j + 
k; + if ((tf >= 0 && BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(tf*2*2*16)+(o1*2*16)+(o2*16)+index] == 0)) + { + BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] = BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(tf*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0]; + BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1] = BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(tf*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1]; + break; + } + else if ((tc < 256 && BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(tc*2*2*16)+(o1*2*16)+(o2*16)+index] == 0)) + { + BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] = BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(tc*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0]; + break; + } + } + + //BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(j*2*2*16)+(o1*2*16)+(o2*16)+index] = (CGV_ERROR) k; + BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(j*2*2*16)+(o1*2*16)+(o2*16)+index] = (CGU_UINT8)k; + + } //sp_idx < 0 + }//i<(1 << clogBC7) + }//o2 + }//o1 + }//j +#endif + + } //bits b) + return b; + return v; +} + +INLINE CGV_INDEX clampIndex(CGV_INDEX v, CGV_INDEX a, CGV_INDEX b) +{ + if (v < a) + return a; + else + if (v > b) + return b; + return v; +} + +INLINE CGV_SHIFT32 shift_right_uint32(CGV_SHIFT32 v, CGU_INT bits) +{ + return v>>bits; // (perf warning expected) +} + +INLINE CGV_BYTE shift_right_uint8(CGV_BYTE v, CGU_UINT8 bits) +{ + return v>>bits; // (perf warning expected) +} + +INLINE CGV_BYTE shift_right_uint8V(CGV_BYTE v, CGV_UINT8 bits) +{ + return v>>bits; // (perf warning expected) +} + +// valid bit range is 0..8 +INLINE 
CGV_EPOCODE expandEPObits(CGV_EPOCODE v, uniform CGV_EPOCODE bits) +{ + CGV_EPOCODE vv = v<<(8-bits); + return vv + shift_right_uint32(vv, bits); +} + +CGV_ERROR err_absf(CGV_ERROR a) { return a>0.0F?a:-a;} +CGV_IMAGE img_absf(CGV_IMAGE a) { return a>0.0F?a:-a;} + +CGU_UINT8 min8(CGU_UINT8 a, CGU_UINT8 b) { return ab?a:b;} + +void pack_index(CGV_INDEXPACKED packed_index[2], CGV_INDEX src_index[MAX_SUBSET_SIZE]) +{ + // Converts from unpacked index to packed index + packed_index[0] = 0x0000; + packed_index[1] = 0x0000; + CGV_BYTE shift = 0; // was CGV_UINT8 + for (CGU_INT k=0; k<16; k++) + { + packed_index[k/8] |= (CGV_UINT32)(src_index[k]&0x0F) << shift; + shift +=4; + } +} + +void unpack_index(CGV_INDEX unpacked_index[MAX_SUBSET_SIZE],CGV_INDEXPACKED src_packed[2]) +{ + // Converts from packed index to unpacked index + CGV_BYTE shift = 0; // was CGV_UINT8 + for (CGV_BYTE k=0; k<16; k++) + { + unpacked_index[k] = (CGV_BYTE)(src_packed[k/8] >> shift)&0xF; + if (k == 7) + shift = 0; + else + shift +=4; + } +} + +//====================================== CMP MATH UTILS ============================================ +CGV_ERROR err_Total( + CGV_IMAGE image_src1[SOURCE_BLOCK_SIZE*MAX_CHANNELS], + CGV_IMAGE image_src2[SOURCE_BLOCK_SIZE*MAX_CHANNELS], + CGV_ENTRIES numEntries, // < 16 + CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS) +{ + CGV_ERROR err_t=0.0F; + for (CGU_CHANNEL ch=0;ch 0) + { + for (CGV_ENTRIES k=0;k> 16; + CGV_UINT32 mask = 0x01 << index; + + return ((mask1 & mask)?2:0 + (mask0 & mask)?1:0); // This can be moved to caller, just return mask!! 
+} + +void GetPartitionSubSet_mode01237( + CGV_IMAGE subsets_out[MAX_SUBSETS][SOURCE_BLOCK_SIZE][MAX_CHANNELS], // OUT: Subset pattern mapped with image src colors + CGV_ENTRIES entryCount_out[MAX_SUBSETS], // OUT: Number of entries per subset + CGV_UINT8 partition, // Partition Shape 0..63 + CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS], // Image colors + CGU_INT blockMode, // [0,1,2,3 or 7] + CGU_CHANNEL channels3or4) // 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS) +{ + CGU_UINT8 maxSubsets = 2; if (blockMode == 0 || blockMode == 2) maxSubsets = 3; + + entryCount_out[0] = 0; + entryCount_out[1] = 0; + entryCount_out[2] = 0; + + for (CGV_INT i = 0; i < MAX_SUBSET_SIZE; i++) + { + CGV_UINT8 subset = get_partition_subset(partition,maxSubsets,i); + + for (CGU_INT ch = 0; ch<3; ch++) + subsets_out[subset][entryCount_out[subset]][ch] = image_src[i+(ch*SOURCE_BLOCK_SIZE)]; + //subsets_out[subset*64+(entryCount_out[subset]*MAX_CHANNELS+ch)] = image_src[i+(ch*SOURCE_BLOCK_SIZE)]; + + // if we have only 3 channels then set the alpha subset to 0 + if (channels3or4 == 3) + subsets_out[subset][entryCount_out[subset]][3] = 0.0F; + else + subsets_out[subset][entryCount_out[subset]][3] = image_src[i+(COMP_ALPHA*SOURCE_BLOCK_SIZE)]; + entryCount_out[subset]++; + } +} + +INLINE void GetClusterMean( + CGV_IMAGE cluster_mean_out[SOURCE_BLOCK_SIZE][MAX_CHANNELS], + CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS], + CGV_INDEX index_in[MAX_SUBSET_SIZE], + CGV_ENTRIES numEntries, // < 16 + CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS) +{ + // unused index values are underfined + CGV_INDEX i_cnt[MAX_SUBSET_SIZE]; + CGV_INDEX i_comp[MAX_SUBSET_SIZE]; + + + for (CGV_ENTRIES i=0;i< numEntries;i++) + for (CGU_CHANNEL ch=0; ch< channels3or4; ch++) + { + CGV_INDEX idx = index_in[i]&0x0F; + cluster_mean_out[idx][ch] = 0; + i_cnt[idx]=0; + } + + CGV_INDEX ic = 0; // was CGV_INT + for (CGV_ENTRIES i=0;i< numEntries;i++) + { + CGV_INDEX idx = index_in[i]&0x0F; + if 
(i_cnt[idx]==0) + i_comp[ic++]=idx; + i_cnt[idx]++; + + for (CGU_CHANNEL ch=0; ch< channels3or4; ch++) + { + cluster_mean_out[idx][ch] += image_src[i+(ch*SOURCE_BLOCK_SIZE)]; + } + } + + for (CGU_CHANNEL ch=0; ch< channels3or4; ch++) + for (CGU_INT i=0;i < ic;i++) + { + if (i_cnt[i_comp[i]] != 0) + { + CGV_INDEX icmp = i_comp[i]; + cluster_mean_out[icmp][ch] = (CGV_IMAGE) floor( (cluster_mean_out[icmp][ch] / (CGV_IMAGE) i_cnt[icmp]) +0.5F); + } + } + +} + +INLINE void GetImageMean( + CGV_IMAGE image_mean_out[SOURCE_BLOCK_SIZE*MAX_CHANNELS], + CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS], + CGV_ENTRIES numEntries, + CGU_CHANNEL channels) +{ + for (CGU_CHANNEL ch=0; ch< channels; ch++) + image_mean_out[ch] =0; + + for (CGV_ENTRIES i=0;i< numEntries;i++) + for (CGU_CHANNEL ch=0; ch< channels; ch++) + image_mean_out[ch] += image_src[i+ch*SOURCE_BLOCK_SIZE]; + + for (CGU_CHANNEL ch=0; ch< channels; ch++) + image_mean_out[ch] /=(CGV_IMAGE) numEntries; // Performance Warning: Conversion from unsigned int to float is slow. 
Use "int" if possible +} + +// calculate an eigen vector corresponding to a biggest eigen value +// will work for non-zero non-negative matricies only +void GetEigenVector( + CGV_IMAGE EigenVector_out[MAX_CHANNELS], // Normalized Eigen Vector output + CGV_IMAGE CovarianceVector[MAX_CHANNELS*MAX_CHANNELS], // Covariance Vector + CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA +{ + CGV_IMAGE vector_covIn[MAX_CHANNELS*MAX_CHANNELS]; + CGV_IMAGE vector_covOut[MAX_CHANNELS*MAX_CHANNELS]; + CGV_IMAGE vector_maxCovariance; + + for (CGU_CHANNEL ch1=0; ch1 vector_maxCovariance) + vector_maxCovariance = vector_covIn[ch+ch*4]; + } + + // Normalize Input Covariance Vector + for (CGU_CHANNEL ch1=0; ch1 0) + vector_covIn[ch1+ch2*4] = vector_covIn[ch1+ch2*4] / vector_maxCovariance; + } + + for (CGU_CHANNEL ch1=0; ch1 vector_maxCovariance) + { + maxCovariance_channel = ch; + vector_maxCovariance = vector_covOut[ch+ch*4]; + } + } + + CGV_IMAGE vector_t = 0; + + for (CGU_CHANNEL ch=0; ch 0) + EigenVector_out[ch] = EigenVector_out[ch] / vector_t; + } + +} + +CGV_INDEX index_collapse( + CGV_INDEX index[MAX_SUBSET_SIZE], + CGV_ENTRIES numEntries) +{ + CGV_INDEX minIndex=index[0]; + CGV_INDEX MaxIndex=index[0]; + + for (CGV_ENTRIES k=1;k MaxIndex) + MaxIndex = index[k]; + } + + CGV_INDEX D=1; + + for (CGV_INDEX d=2; d<= MaxIndex-minIndex; d++) + { + for (CGV_ENTRIES ent=0;ent=numEntries) + D =d; + break; + } + } + } + + for (CGV_ENTRIES k=0;k MaxIndex) + MaxIndex = index[k]; + } + + return (MaxIndex); + +} + +void sortProjected_indexs( + CGV_INDEX index_ordered[MAX_SUBSET_SIZE], + CGV_IMAGE projection[SOURCE_BLOCK_SIZE], + CGV_ENTRIES numEntries // max 16 + ) +{ + CMP_di what[SOURCE_BLOCK_SIZE]; + + for (CGV_INDEX i=0; i < numEntries;i++) + { + what[i].index = i; + what[i].image = projection[i]; + } + + CGV_INDEX tmp_index; + CGV_IMAGE tmp_image; + + for (CGV_ENTRIES i = 1; i < numEntries; i++) + { + for (CGV_ENTRIES j=i; j>0; j--) + { + if (what[j - 1].image > what[j].image) + { + 
tmp_index = what[j].index; + tmp_image = what[j].image; + what[j].index = what[j - 1].index; + what[j].image = what[j - 1].image; + what[j - 1].index = tmp_index; + what[j - 1].image = tmp_image; + } + } + } + + for (CGV_ENTRIES i=0; i < numEntries;i++) + index_ordered[i]=what[i].index; + +}; + +void sortPartitionProjection( + CGV_IMAGE projection[MAX_PARTITION_ENTRIES], + CGV_UINT8 order[MAX_PARTITION_ENTRIES], + CGU_UINT8 numPartitions // max 64 + ) +{ + CMP_du what[MAX_PARTITION_ENTRIES]; + + for (CGU_UINT8 Parti=0; Parti < numPartitions;Parti++) + { + what[Parti].index = Parti; + what[Parti].image = projection[Parti]; + } + + CGV_UINT8 index; + CGV_IMAGE data; + + for (CGU_UINT8 Parti = 1; Parti < numPartitions; Parti++) + { + for (CGU_UINT8 Partj=Parti; Partj>0; Partj--) + { + if (what[Partj - 1].image > what[Partj].image) + { + index = what[Partj].index; + data = what[Partj].image; + what[Partj].index = what[Partj - 1].index; + what[Partj].image = what[Partj - 1].image; + what[Partj - 1].index = index; + what[Partj - 1].image = data; + } + } + } + + for (CGU_UINT8 Parti=0; Parti < numPartitions;Parti++) + order[Parti]=what[Parti].index; + +}; + + +void cmp_Write8Bit( + CGV_CMPOUT base[], + CGU_INT* uniform offset, + CGU_INT bits, + CGV_BYTE bitVal) +{ + base[*offset/8] |= bitVal << (*offset%8); + if (*offset%8+bits>8) + { + base[*offset/8+1] |= shift_right_uint8(bitVal, 8-*offset%8); + } + *offset += bits; +} + +void cmp_Write8BitV( + CGV_CMPOUT base[], + CGV_INT offset, + CGU_INT bits, + CGV_BYTE bitVal) +{ + base[offset/8] |= bitVal << (offset%8); + if (offset%8+bits>8) + { + base[offset/8+1] |= shift_right_uint8V(bitVal, 8-offset%8); + } +} + +INLINE CGV_EPOCODE ep_find_floor( + CGV_IMAGE v, + CGU_UINT8 bits, + CGV_BYTE use_par, + CGV_BYTE odd) + { + CGV_EPOCODE i1=0; + CGV_EPOCODE i2=1<<(bits-use_par); + odd = use_par ? 
odd : 0; + while (i2-i1>1) + { + CGV_EPOCODE j = (i1+i2)/2; // Warning in ASMP code + CGV_EPOCODE ep_d = expandEPObits((j<= ep_d ) + i1=j; + else + i2=j; + } + + return (i1<>4; + fixup[2] = skip_packed&15; +} + +//===================================== COMPRESS CODE ============================================= +INLINE void SetDefaultIndex(CGV_INDEX index_io[MAX_SUBSET_SIZE]) +{ + // Use this a final call + for (CGU_INT i=0; i image_projected[i]) + image_max = image_projected[i]; + } + + CGV_IMAGE img_diff = image_max-image_min; + + if (img_diff == 0.0f) return; + if (isnan(img_diff)) return; + + image_s = (clusters-1)/img_diff; + + for (CGV_INDEX i=0; i < numEntries;i++) + { + + image_v[i] = image_projected[i]*image_s; + image_z[i] = floor(image_v[i] + 0.5F - image_min *image_s); + projected_index_out[i] = (CGV_INDEX)image_z[i]; + + what[i].image = image_v[i]-image_z[i]- image_min *image_s; + what[i].index = i; + image_dm+= what[i].image; + image_r += what[i].image*what[i].image; + } + + if (numEntries*image_r- image_dm*image_dm >= (CGV_IMAGE)(numEntries-1)/8) + { + + image_dm /= numEntries; + + for (CGV_INT i=0; i < numEntries;i++) + what[i].image -= image_dm; + + CGV_INDEX tmp_index; + CGV_IMAGE tmp_image; + for (CGV_ENTRIES i = 1; i < numEntries; i++) + { + for (CGV_ENTRIES j=i; j>0; j--) + { + if (what[j - 1].image > what[j].image) + { + tmp_index = what[j].index; + tmp_image = what[j].image; + what[j].index = what[j - 1].index; + what[j].image = what[j - 1].image; + what[j - 1].index = tmp_index; + what[j - 1].image = tmp_image; + } + } + } + + // got into fundamental simplex + // move coordinate system origin to its center + + // i=0 < numEntries avoids varying int division by 0 + for (CGV_ENTRIES i=0; i < numEntries;i++) + { + what[i].image = what[i].image - (CGV_IMAGE) (((2.0f*i+1)-numEntries)/(2.0f*numEntries)); + } + + image_mm=0.0F; + image_l=0.0F; + + CGV_INT j = -1; + for (CGV_ENTRIES i=0; i < numEntries;i++) + { + image_l += what[i].image; + if 
(image_l < image_mm) + { + image_mm = image_l; + j=i; + } + } + + + j = j + 1; + // avoid j = j%numEntries us this + while (j > numEntries) j = j - numEntries; + + for (CGV_ENTRIES i=j; i < numEntries;i++) + { + CGV_INDEX idx = what[i].index; + CGV_INDEX pidx = projected_index_out[idx] + 1; //gather_index(projected_index_out,idx)+1; + projected_index_out[idx] = pidx; // scatter_index(projected_index_out,idx,pidx); + } + } + + // get minimum index + CGV_INDEX index_min=projected_index_out[0]; + for (CGV_ENTRIES i=1; i < numEntries;i++) + { + if (projected_index_out[i] < index_min) + index_min = projected_index_out[i]; + } + + // reposition all index by min index (using min index as 0) + for (CGV_ENTRIES i=0; i < numEntries;i++) + { + projected_index_out[i] = clampIndex(projected_index_out[i] - index_min,0,15); + } + +} + +CGV_ERROR GetQuantizeIndex( + CGV_INDEXPACKED index_packed_out[2], + CGV_INDEX index_out[MAX_SUBSET_SIZE], // OUT: + CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS], + CGV_ENTRIES numEntries, //IN: range 0..15 (MAX_SUBSET_SIZE) + CGU_INT numClusters, + CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS) +{ + CGV_IMAGE image_centered[SOURCE_BLOCK_SIZE*MAX_CHANNELS]; + CGV_IMAGE image_mean[MAX_CHANNELS]; + CGV_IMAGE eigen_vector[MAX_CHANNELS]; + CGV_IMAGE covariance_vector[MAX_CHANNELS*MAX_CHANNELS]; + + GetImageCentered(image_centered,image_mean, image_src, numEntries, channels3or4); + GetCovarianceVector(covariance_vector, image_centered, numEntries, channels3or4); + + //----------------------------------------------------- + // check if all covariances are the same + // if so then set all index to same value 0 and return + // use EPSILON to set the limit for all same limit + //----------------------------------------------------- + + CGV_IMAGE image_covt=0.0F; + for (CGU_CHANNEL ch=0; ch>= 1) + clogBC7++; + + // init epo_0 + CGV_EPOCODE epo_0[2*MAX_CHANNELS]; + SetDefaultEPOCode(epo_0,0xFF,0,0,0); + + CGV_INDEX image_log = 
0; + CGV_INDEX image_idx = 0; + CGU_BOOL use_par = FALSE; + if (type != 0) + use_par = TRUE; + CGV_ERROR error_1 = CMP_FLOAT_MAX; + + for (CGU_INT pn = 0; pn err_tc) + image_tcr[ch1] = image_tc; + else if (err_tf < err_tc) + image_tcr[ch1] = image_tf; + else + image_tcr[ch1] = (CGV_EPOCODE)floor(image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)] + 0.5F); + + //image_tcr[ch1] = image_tf + (image_tc - image_tf)/2; + + //=============================== + // Refine this for better quality! + //=============================== + error_tr = get_sperr(clogBC7,bits[ch1],image_tcr[ch1],t1,t2,iclogBC7); + error_tr = (error_tr*error_tr) + + 2 * error_tr + * img_absf(image_tcr[ch1]- image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)]) + + (image_tcr[ch1] - image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)]) + * (image_tcr[ch1] - image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)]); + + if (error_tr < error_ta) + { + error_ta = error_tr; + t1o[ch1] = t1; + t2o[ch1] = t2; + epo_dr_0[ch1] = clampEPO(image_tcr[ch1],0,255); + } +#else + image_tcr[ch1] = floor(image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)] + 0.5F); + error_ta = 0; + t1o[ch1] = t1; + t2o[ch1] = t2; + epo_dr_0[ch1] = clampi(image_tcr[ch1],0,255); +#endif + } // B + } //C + + error_t += error_ta; + } // D + + if (error_t < error_0) + { + image_log = iclogBC7; + image_idx = image_log; + CGU_BOOL srcIsWhite = FALSE; + if ((image_src[0] == 255.0f)&&(image_src[1] == 255.0f)&&(image_src[2] == 255.0f)) srcIsWhite = TRUE; + + for (CGU_CHANNEL ch = 0; chsp_idx,index+0)&0xFF; + epo_0[4+ch] = BC7EncodeRamps.sp_idx[index+1]&0xFF;// gather_epocode(u_BC7Encode->sp_idx,index+1)&0xFF; + } + else { + epo_0[ch] = 0; + epo_0[4 + ch] = 0; + } +#else + epo_0[ ch] = 0; + epo_0[4+ch] = 0; +#endif +#endif + } + error_0 = error_t; + } + //if (error_0 == 0) + // break; + } // E + + if (error_0 < error_1) + { + + image_idx = image_log; + for (CGU_CHANNEL chE = 0; chE 0) + { + image_ramp = GetRamp(clogBC7,max_bits[ch],epo_p1,epo_p2,index_cidx[_mc-1]); + + image_square_diff += 
sq_image(image_ramp-image_src[(_mc-1)+(ch*SOURCE_BLOCK_SIZE)]); + _mc--; + } + if (image_square_diff < err_ed[(ppA*8)+(ppB*4)+ch]) + { + err_ed[(ppA*8)+(ppB*4)+ch] = image_square_diff; + epo_code_par[ppA][ppB][0][ch] = epo_p1; + epo_code_par[ppA][ppB][1][ch] = epo_p2; + } + } + } // pp1 + } // pp0 + } // j + + //--------------------------------------------------------- + for (CGU_INT pn=0; pn < npv_nd[channels3or4-3][type]; pn++) + { + CGV_ERROR err_2=0.0F; + CGU_INT d1; + CGU_INT d2; + + for (CGU_CHANNEL ch=0; ch>=1) + clogBC7++; + + CGU_INT clt_clogBC7 = CLT(clogBC7); + + if (clt_clogBC7 > 3) + { + ASPM_PRINT(("Err: optimize_IndexAndEndPoints, clt_clogBC7\n")); + return CMP_FLOAT_MAX; + } + + Mi_ = Mi_ - 1; + + CGV_INDEX MaxIndex; + CGV_INDEX index_tmp[MAX_SUBSET_SIZE]; + CGU_INT maxTry = MAX_TRY_SHAKER; + + CGV_INDEX index_best[MAX_SUBSET_SIZE]; + + for (CGV_ENTRIES k=0;kerrorThreshold) + { + break; + } + + CGV_TYPEINT done; + done = !(change && better); + if ((maxTry > 0)&&(!done)) + { + maxTry--; + MaxIndex = index_collapse(index_tmp, numEntries); + } + else + { + maxTry = 0; + } + + } while (maxTry); + + if (err_best == CMP_FLOAT_MAX) + { + ASPM_PRINT(("Err: requantized_image_err\n")); + } + + return err_best; +} + +CGU_UINT8 get_partitionsToTry(uniform CMP_GLOBAL BC7_Encode u_BC7Encode[],CGU_UINT8 maxPartitions) +{ + CGU_FLOAT u_minPartitionSearchSize = 0.30f; + if(u_BC7Encode->quality <= BC7_qFAST_THRESHOLD) // Using this to match performance and quality of CPU code + { + u_minPartitionSearchSize = u_minPartitionSearchSize + ( u_BC7Encode->quality*BC7_qFAST_THRESHOLD); + } + else + { + u_minPartitionSearchSize = u_BC7Encode->quality; + } + return (CGU_UINT8)(maxPartitions * u_minPartitionSearchSize); +} + +INLINE void cmp_encode_swap(CGV_EPOCODE endpoint[], CGU_INT channels, CGV_INDEX block_index[MAX_SUBSET_SIZE], CGU_INT bits) +{ + CGU_INT levels = 1 << bits; + if (block_index[0]>=levels/2) + { + cmp_swap_epo(&endpoint[0], &endpoint[channels], channels); + 
for (CGU_INT k=0; k0) q = (levels-1)-q; + + if (k1==0 && k2==0) cmp_Write8Bit(data, pPos, bits - 1, static_cast (q)); + else cmp_Write8Bit(data, pPos, bits, static_cast(q)); + qbits_shifted >>= 4; + flips_shifted >>= 1; + } + } +} + + +INLINE CGV_SHIFT32 pow32(CGV_SHIFT32 x) +{ + return 1<>= 1; + packedColours[1] >>= 1; + } + else + if(blockMode == 1) // ONE_PBIT + { + parityBits[subset][0] = packedColours[1] & 1; + parityBits[subset][1] = packedColours[1] & 1; + packedColours[0] >>= 1; + packedColours[1] >>= 1; + } + else + if(blockMode == 2) + { + parityBits[subset][0] = 0; + parityBits[subset][1] = 0; + } + + for (CGU_INT ch=0; ch>= componentBits; + packedColours[1] >>= componentBits; + } + } + + // Loop over component + for (CGU_INT ch=0; ch < channels; ch++) + { + // loop over subsets + for (CGU_INT subset=0; subset (params->rotated_channel)); + + // idxMode 1 bit + cmp_Write8Bit(cmp_out, &bitPosition, 1, static_cast (params->idxMode)); + + CGU_INT idxBits[2] = {2,3}; + + if(params->idxMode) + { + idxBits[0] = 3; + idxBits[1] = 2; + // Indicate if we need to fixup the index + cmp_swap_index(params->color_index,params->alpha_index,16); + cmp_encode_swap(params->alpha_qendpoint, 4, params->color_index,2); + cmp_encode_swap(params->color_qendpoint, 4, params->alpha_index,3); + } + else + { + cmp_encode_swap(params->color_qendpoint, 4, params->color_index,2); + cmp_encode_swap(params->alpha_qendpoint, 4, params->alpha_index,3); + } + + // color endpoints 5 bits each + // R0 : R1 + // G0 : G1 + // B0 : B1 + for (CGU_INT component=0; component < 3; component++) + { + cmp_Write8Bit(cmp_out, &bitPosition, 5, static_cast (params->color_qendpoint[component])); + cmp_Write8Bit(cmp_out, &bitPosition, 5, static_cast (params->color_qendpoint[4 + component])); + } + + // alpha endpoints (6 bits each) + // A0 : A1 + cmp_Write8Bit(cmp_out, &bitPosition, 6, static_cast (params->alpha_qendpoint[0])); + cmp_Write8Bit(cmp_out, &bitPosition, 6, static_cast 
(params->alpha_qendpoint[4])); + + // index 2 bits each (31 bits total) + cmp_encode_index(cmp_out, &bitPosition, params->color_index, 2); + // index 3 bits each (47 bits total) + cmp_encode_index(cmp_out, &bitPosition, params->alpha_index, 3); +} + +void Encode_mode5( CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE], + varying cmp_mode_parameters* uniform params) +{ + for (CGU_INT k=0; k (params->rotated_channel)); + + cmp_encode_swap(params->color_qendpoint, 4, params->color_index,2); + cmp_encode_swap(params->alpha_qendpoint, 4, params->alpha_index,2); + + // color endpoints (7 bits each) + // R0 : R1 + // G0 : G1 + // B0 : B1 + for (CGU_INT component=0; component < 3; component++) + { + cmp_Write8Bit(cmp_out, &bitPosition, 7, static_cast (params->color_qendpoint[component])); + cmp_Write8Bit(cmp_out, &bitPosition, 7, static_cast (params->color_qendpoint[4 + component])); + } + + // alpha endpoints (8 bits each) + // A0 : A1 + cmp_Write8Bit(cmp_out, &bitPosition, 8, static_cast (params->alpha_qendpoint[0])); + cmp_Write8Bit(cmp_out, &bitPosition, 8, static_cast (params->alpha_qendpoint[4])); + + + // color index 2 bits each (31 bits total) + // alpha index 2 bits each (31 bits total) + cmp_encode_index(cmp_out, &bitPosition, params->color_index, 2); + cmp_encode_index(cmp_out, &bitPosition, params->alpha_index, 2); +} + +void Encode_mode6( + CGV_INDEX index[MAX_SUBSET_SIZE], + CGV_EPOCODE epo_code[8], + CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE]) +{ + for (CGU_INT k=0; k (epo_code[0 + p] >> 1)); + cmp_Write8Bit(cmp_out, &bitPosition, 7, static_cast (epo_code[4 + p] >> 1)); + } + + // p bits + cmp_Write8Bit(cmp_out, &bitPosition, 1, epo_code[0]&1); + cmp_Write8Bit(cmp_out, &bitPosition, 1, epo_code[4]&1); + + // quantized values + cmp_encode_index(cmp_out, &bitPosition, index, 4); +} + + +void Compress_mode01237( + CGU_INT blockMode, + BC7_EncodeState EncodeState[], +uniform CMP_GLOBAL BC7_Encode u_BC7Encode[]) +{ + CGV_INDEX 
storedBestindex[MAX_PARTITIONS][MAX_SUBSETS][MAX_SUBSET_SIZE]; + CGV_ERROR storedError[MAX_PARTITIONS]; + CGV_UINT8 sortedPartition[MAX_PARTITIONS]; + + EncodeState->numPartitionModes = 64; + EncodeState->maxSubSets = 2; + + if (blockMode == 0) + { + EncodeState->numPartitionModes = 16; + EncodeState->channels3or4 = 3; + EncodeState->bits = 26; + EncodeState->clusters = 8; + EncodeState->componentBits = 4; + EncodeState->maxSubSets = 3; + } + else + if (blockMode == 2) + { + EncodeState->channels3or4 = 3; + EncodeState->bits = 30; + EncodeState->clusters = 4; + EncodeState->componentBits = 5; + EncodeState->maxSubSets = 3; + } + else + if (blockMode == 1) + { + + EncodeState->channels3or4 = 3; + EncodeState->bits = 37; + EncodeState->clusters = 8; + EncodeState->componentBits = 6; + } + else + if (blockMode == 3) + { + EncodeState->channels3or4 = 3; + EncodeState->bits = 44; + EncodeState->clusters = 4; + EncodeState->componentBits = 7; + } + else + if (blockMode == 7) + { + EncodeState->channels3or4 = 4; + EncodeState->bits = 42; // (2* (R 5 + G 5 + B 5 + A 5)) + 2 parity bits + EncodeState->clusters = 4; + EncodeState->componentBits = 5; // 5 bit components + } + + CGV_IMAGE image_subsets[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_CHANNELS]; + CGV_ENTRIES subset_entryCount[MAX_SUBSETS] = {0,0,0}; + + // Loop over the available partitions for the block mode and quantize them + // to figure out the best candidates for further refinement + CGU_UINT8 mode_partitionsToTry; + mode_partitionsToTry = get_partitionsToTry(u_BC7Encode,EncodeState->numPartitionModes); + + CGV_UINT8 bestPartition = 0; + + for (CGU_INT mode_blockPartition = 0; mode_blockPartition < mode_partitionsToTry; mode_blockPartition++) + { + + GetPartitionSubSet_mode01237( + image_subsets, + subset_entryCount, + static_cast(mode_blockPartition), + EncodeState->image_src, + blockMode, + EncodeState->channels3or4); + + CGV_IMAGE subset_image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS]; + CGV_INDEX 
index_out1[SOURCE_BLOCK_SIZE]; + CGV_ERROR err_quant = 0.0F; + + // Store the quntize error for this partition to be sorted and processed later + for (CGU_INT subset=0; subset < EncodeState->maxSubSets; subset++) + { + CGV_ENTRIES numEntries = subset_entryCount[subset]; + + for (CGU_INT ii=0; iiclusters, + EncodeState->channels3or4); + + for (CGV_INT idx=0; idx < numEntries; idx++) + { + storedBestindex[mode_blockPartition][subset][idx] = index_out1[idx]; + } + } + + storedError[mode_blockPartition] = err_quant; + } + + // Sort the results + sortPartitionProjection( storedError, + sortedPartition, + mode_partitionsToTry); + + CGV_EPOCODE epo_code[MAX_SUBSETS*2*MAX_CHANNELS]; + CGV_EPOCODE bestEndpoints[MAX_SUBSETS*2*MAX_CHANNELS]; + CGV_BYTE bestindex[MAX_SUBSETS*MAX_SUBSET_SIZE]; + CGV_ENTRIES bestEntryCount[MAX_SUBSETS]; + CGV_BYTE bestindex16[MAX_SUBSET_SIZE]; + + // Extensive shaking is most important when the ramp is short, and + // when we have less index. On a long ramp the quality of the + // initial quantizing is relatively more important + // We modulate the shake size according to the number of ramp index + // - the more index we have the less shaking should be required to find a near + // optimal match + + CGU_UINT8 numShakeAttempts = max8(1, min8((CGU_UINT8)floor(8 * u_BC7Encode->quality + 0.5), mode_partitionsToTry)); + CGV_ERROR err_best = CMP_FLOAT_MAX; + + // Now do the endpoint shaking + for (CGU_INT nSA =0; nSA < numShakeAttempts; nSA++) + { + + CGV_ERROR err_optimized = 0.0F; + CGV_UINT8 sortedBlockPartition; + sortedBlockPartition = sortedPartition[nSA]; + + //******************************************** + // Get the partition shape for the given mode + //******************************************** + GetPartitionSubSet_mode01237( + image_subsets, + subset_entryCount, + sortedBlockPartition, + EncodeState->image_src, + blockMode, + EncodeState->channels3or4); + + //***************************** + // Process the partition shape + 
//***************************** + for (CGU_INT subset=0; subset < EncodeState->maxSubSets; subset++) + { + CGV_ENTRIES numEntries = subset_entryCount[subset]; + CGV_IMAGE src_image_block[SOURCE_BLOCK_SIZE*MAX_CHANNELS]; + CGV_INDEX index_io[MAX_SUBSET_SIZE]; + CGV_EPOCODE tmp_epo_code[8]; + + for (CGU_INT k=0; k(EncodeState->clusters), // Mi_ + EncodeState->bits, + EncodeState->channels3or4, + u_BC7Encode); + + for (CGU_INT k=0; k < MAX_SUBSET_SIZE; k++) + { + storedBestindex[sortedBlockPartition][subset][k] = index_io[k]; + } + + for (CGU_INT ch=0; chmaxSubSets; subset++) + { + CGV_ENTRIES numEntries = subset_entryCount[subset]; + bestEntryCount[subset] = numEntries; + + if(numEntries) + { + for (CGU_INT ch=0; ch < EncodeState->channels3or4; ch++) + { + bestEndpoints[(subset*2+0)*4+ch] = epo_code[(subset*2+0)*4+ch]; + bestEndpoints[(subset*2+1)*4+ch] = epo_code[(subset*2+1)*4+ch]; + } + + for (CGV_ENTRIES k=0; k< numEntries; k++) + { + bestindex[subset*MAX_SUBSET_SIZE+k] = storedBestindex[sortedBlockPartition][subset][k]; + bestindex16[bestIndexCount++] = storedBestindex[sortedBlockPartition][subset][k]; + } + } + } + + err_best = err_optimized; + // Early out if we found we can compress with error below the quality threshold + if(err_best <= u_BC7Encode->errorThreshold) + { + break; + } + } + } + + + if (blockMode != 7) + err_best += EncodeState->opaque_err; + + if(err_best > EncodeState->best_err) + return; + + //************************** + // Save the encoded block + //************************** + EncodeState->best_err = err_best; + + + // Now we have all the data needed to encode the block + // We need to pack the endpoints prior to encoding + CGV_TYPEUINT32 packedEndpoints[MAX_SUBSETS*2] = {0,0,0,0,0,0}; + for (CGU_INT subset=0; subsetmaxSubSets; subset++) + { + packedEndpoints[(subset*2)+0] = 0; + packedEndpoints[(subset*2)+1] = 0; + + if(bestEntryCount[subset]) + { + CGU_UINT32 rightAlignment = 0; + + // Sort out parity bits + if(blockMode != 2) + { + // 
Sort out BCC parity bits + packedEndpoints[(subset*2)+0] = bestEndpoints[(subset*2+0)*4+0] & 1; + packedEndpoints[(subset*2)+1] = bestEndpoints[(subset*2+1)*4+0] & 1; + for (CGU_INT ch=0; chchannels3or4; ch++) + { + bestEndpoints[(subset*2+0)*4+ch] >>= 1; + bestEndpoints[(subset*2+1)*4+ch] >>= 1; + } + rightAlignment++; + } + + // Fixup endpoints + for (CGU_INT ch=0; chchannels3or4; ch++) + { + packedEndpoints[(subset*2)+0] |= bestEndpoints[((subset*2)+0)*4+ch] << rightAlignment; + packedEndpoints[(subset*2)+1] |= bestEndpoints[((subset*2)+1)*4+ch] << rightAlignment; + rightAlignment += EncodeState->componentBits; + } + } + } + + CGV_UINT8 idxCount[3] = {0, 0, 0}; + for (CGV_INT k=0; kmaxSubSets,k); + CGV_UINT8 idxC = idxCount[partsub]; + bestindex16[k] = bestindex[partsub*MAX_SUBSET_SIZE+idxC]; + idxCount[partsub] = idxC + 1; + } + + Encode_mode02137( + blockMode, + bestPartition, + packedEndpoints, + bestindex16, + EncodeState->cmp_out); +} + +void Compress_mode45( + CGU_INT blockMode, + BC7_EncodeState EncodeState[], +uniform CMP_GLOBAL BC7_Encode u_BC7Encode[]) +{ + + cmp_mode_parameters best_candidate; + EncodeState->channels3or4 = 4; + cmp_memsetBC7((CGV_BYTE *)&best_candidate, 0, sizeof(cmp_mode_parameters)); + + if (blockMode == 4) + { + EncodeState->max_idxMode = 2; + EncodeState->modeBits[0] = 30; // bits = 2 * (Red 5+ Grn 5+ blu 5) + EncodeState->modeBits[1] = 36; // bits = 2 * (Alpha 6+6+6) + EncodeState->numClusters0[0] = 4; + EncodeState->numClusters0[1] = 8; + EncodeState->numClusters1[0] = 8; + EncodeState->numClusters1[1] = 4; + } + else + { + EncodeState->max_idxMode = 1; + EncodeState->modeBits[0] = 42; // bits = 2 * (Red 7+ Grn 7+ blu 7) + EncodeState->modeBits[1] = 48; // bits = 2 * (Alpha 8+8+8) = 48 + EncodeState->numClusters0[0] = 4; + EncodeState->numClusters0[1] = 4; + EncodeState->numClusters1[0] = 4; + EncodeState->numClusters1[1] = 4; + } + + + CGV_IMAGE src_color_Block[SOURCE_BLOCK_SIZE*MAX_CHANNELS]; + CGV_IMAGE 
src_alpha_Block[SOURCE_BLOCK_SIZE*MAX_CHANNELS]; + + // Go through each possible rotation and selection of index rotationBits) + for (CGU_CHANNEL rotated_channel = 0; rotated_channel < EncodeState->channels3or4; rotated_channel++) + { // A + + for (CGU_INT k=0; kimage_src[k+componentRotations[rotated_channel][p+1]*SOURCE_BLOCK_SIZE]; + src_alpha_Block[k+p*SOURCE_BLOCK_SIZE] = EncodeState->image_src[k+componentRotations[rotated_channel][0]*SOURCE_BLOCK_SIZE]; + } + } + + CGV_ERROR err_quantizer; + CGV_ERROR err_bestQuantizer = CMP_FLOAT_MAX; + + for (CGU_INT idxMode = 0; idxMode < EncodeState->max_idxMode; idxMode++) + { // B + CGV_INDEXPACKED color_index2[2]; // reserved .. Not used! + + err_quantizer = GetQuantizeIndex( + color_index2, + best_candidate.color_index, + src_color_Block, + SOURCE_BLOCK_SIZE, + EncodeState->numClusters0[idxMode], + 3); + + err_quantizer += GetQuantizeIndex( + color_index2, + best_candidate.alpha_index, + src_alpha_Block, + SOURCE_BLOCK_SIZE, + EncodeState->numClusters1[idxMode], + 3) / 3.0F; + + // If quality is high then run the full shaking for this config and + // store the result if it beats the best overall error + // Otherwise only run the shaking if the error is better than the best + // quantizer error + if(err_quantizer <= err_bestQuantizer) + { + err_bestQuantizer = err_quantizer; + + // Shake size gives the size of the shake cube + CGV_ERROR err_overallError; + + err_overallError = optimize_IndexAndEndPoints( + best_candidate.color_index, + best_candidate.color_qendpoint, + src_color_Block, + SOURCE_BLOCK_SIZE, + EncodeState->numClusters0[idxMode], + static_cast(EncodeState->modeBits[0]), + 3, + u_BC7Encode); + + // Alpha scalar block + err_overallError += optimize_IndexAndEndPoints( + best_candidate.alpha_index, + best_candidate.alpha_qendpoint, + src_alpha_Block, + SOURCE_BLOCK_SIZE, + EncodeState->numClusters1[idxMode], + static_cast(EncodeState->modeBits[1]), + 3, + u_BC7Encode) / 3.0f; + + // If we beat the previous 
best then encode the block + if(err_overallError < EncodeState->best_err) + { + best_candidate.idxMode = idxMode; + best_candidate.rotated_channel = rotated_channel; + if (blockMode == 4) + Encode_mode4( EncodeState->cmp_out, &best_candidate); + else + Encode_mode5( EncodeState->cmp_out, &best_candidate); + EncodeState->best_err = err_overallError; + } + } + } // B + } // A +} + + +void Compress_mode6( BC7_EncodeState EncodeState[], +uniform CMP_GLOBAL BC7_Encode u_BC7Encode[]) +{ + CGV_ERROR err; + + CGV_EPOCODE epo_code_out[8] = {0}; + CGV_INDEX best_index_out[MAX_SUBSET_SIZE]; + CGV_INDEXPACKED best_packedindex_out[2]; + + + // CGV_IMAGE block_endpoints[8]; + // icmp_get_block_endpoints(block_endpoints, EncodeState->image_src, -1, 4); + // icmp_GetQuantizedEpoCode(epo_code_out, block_endpoints, 6,4); + // err = icmp_GetQuantizeIndex(best_packedindex_out, best_index_out, EncodeState->image_src, 4, block_endpoints, 0,4); + + err = GetQuantizeIndex( + best_packedindex_out, + best_index_out, + EncodeState->image_src, + 16, // numEntries + 16, // clusters + 4); // channels3or4 + + //***************************** + // Process the partition shape + //***************************** + err = optimize_IndexAndEndPoints( + best_index_out, + epo_code_out, + EncodeState->image_src, + 16, //numEntries + 16, // Mi_ = clusters + 58, // bits + 4, // channels3or4 + u_BC7Encode); + + //************************** + // Save the encoded block + //************************** + + if (err < EncodeState->best_err) + { + EncodeState->best_err = err; + Encode_mode6( + best_index_out, + epo_code_out, + EncodeState->cmp_out); + } +} + +void copy_BC7_Encode_settings(BC7_EncodeState EncodeState[], uniform CMP_GLOBAL BC7_Encode settings []) +{ + EncodeState->best_err = CMP_FLOAT_MAX; + EncodeState->validModeMask = settings->validModeMask; + #ifdef USE_ICMP + EncodeState->part_count = settings->part_count; + EncodeState->channels = settings->channels; +#endif +} + 
+//===================================== ICMP CODE ========================================================= +#ifdef USE_ICMP +//======================================== +// Modified Intel Texture Compression Code +//======================================== + +void icmp_Write32Bit(CGV_CMPOUTPACKED base[], CGU_INT* uniform offset, CGU_INT bits, CGV_CMPOUTPACKED bitVal) +{ + base[*offset / 32] |= ((CGV_CMPOUTPACKED)bitVal) << (*offset % 32); + if (*offset % 32 + bits > 32) + { + base[*offset / 32 + 1] |= shift_right_uint32(bitVal, 32 - *offset % 32); + } + *offset += bits; +} + +//================ 32 bit cmp_out mode encoders =============== + +INLINE void icmp_swap_epocode(CGV_EPOCODE u[], CGV_EPOCODE v[], CGU_INT n) +{ + for (CGU_INT i = 0; i < n; i++) + { + CGV_EPOCODE t = u[i]; + u[i] = v[i]; + v[i] = t; + } +} + +void icmp_encode_apply_swap(CGV_EPOCODE endpoint[], CGU_INT channel, CGV_INDEXPACKED block_index[2], CGU_INT bits) +{ + CGU_INT levels = 1 << bits; + if ((block_index[0] & 15) >= levels / 2) + { + icmp_swap_epocode(&endpoint[0], &endpoint[channel], channel); + + for (CGU_INT k = 0; k < 2; k++) + block_index[k] = (CGV_INDEXPACKED)(0x11111111 * (levels - 1)) - block_index[k]; + } +} + +void icmp_encode_index(CGV_CMPOUTPACKED data[5], CGU_INT* uniform pPos, CGV_INDEXPACKED block_index[2], CGU_INT bits, CGV_MASK flips) +{ + CGU_INT levels = 1 << bits; + CGV_MASK flips_shifted = flips; + for (CGU_INT k1 = 0; k1 < 2; k1++) + { + CGV_CMPOUTPACKED qbits_shifted = block_index[k1]; + for (CGU_INT k2 = 0; k2 < 8; k2++) + { + CGV_CMPOUTPACKED q = qbits_shifted & 15; + if ((flips_shifted & 1) > 0) q = (levels - 1) - q; + + if (k1 == 0 && k2 == 0) icmp_Write32Bit(data, pPos, bits - 1, q); + else icmp_Write32Bit(data, pPos, bits, q); + qbits_shifted >>= 4; + flips_shifted >>= 1; + } + } +} + +void icmp_bc7_encode_endpoint2(CGV_CMPOUTPACKED data[5], CGU_INT* uniform pPos, CGV_INDEXPACKED color_index[2], CGU_INT bits, CGV_MASK flips) +{ + CGU_INT levels = 1 << bits; + 
CGV_MASK flips_shifted = flips; + for (CGU_INT k1 = 0; k1 < 2; k1++) + { + CGV_INDEXPACKED qbits_shifted = color_index[k1]; + for (CGU_INT k2 = 0; k2 < 8; k2++) + { + CGV_INDEXPACKED q = qbits_shifted & 15; + if ((flips_shifted & 1) > 0) q = (levels - 1) - q; + + if (k1 == 0 && k2 == 0) icmp_Write32Bit(data, pPos, bits - 1, q); + else icmp_Write32Bit(data, pPos, bits, q); + qbits_shifted >>= 4; + flips_shifted >>= 1; + } + } +} + +INLINE CGV_CMPOUTPACKED icmp_pow2Packed(CGV_FIXUPINDEX x) +{ + return 1 << x; +} + +INLINE void icmp_encode_data_shl_1bit_from(CGV_CMPOUTPACKED data[5], CGV_FIXUPINDEX from) +{ + if (from < 96) + { + //assert(from > 64+10); + + CGV_CMPOUTPACKED shifted = (data[2] >> 1) | (data[3] << 31); + CGV_CMPOUTPACKED mask = (icmp_pow2Packed(from - 64) - 1) >> 1; + data[2] = (mask&data[2]) | (~mask&shifted); + data[3] = (data[3] >> 1) | (data[4] << 31); + data[4] = data[4] >> 1; + } + else if (from < 128) + { + CGV_CMPOUTPACKED shifted = (data[3] >> 1) | (data[4] << 31); + CGV_CMPOUTPACKED mask = (icmp_pow2Packed(from - 96) - 1) >> 1; + data[3] = (mask&data[3]) | (~mask&shifted); + data[4] = data[4] >> 1; + } +} + +INLINE void icmp_get_fixuptable(CGV_FIXUPINDEX fixup[3], CGV_PARTID part_id) +{ + // same as CMP SDK v3.1 BC7_FIXUPINDEX1 & BC7_FIXUPINDEX2 for each partition range 0..63 + // The data is saved as a packed INT = (BC7_FIXUPINDEX1 << 4 + BC7_FIXUPINDEX2) + CMP_STATIC uniform __constant CGV_FIXUPINDEX FIXUPINDEX[] = { + // 2 subset partitions 0..63 + 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, + 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0x20u, 0x20u, + 0xf0u, 0xf0u, 0x60u, 0x80u, 0x20u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0x60u, + 0x60u, 0x20u, 0x60u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u, + // 3 subset partitions 64..128 + 0x3fu, 
0x38u, 0xf8u, 0xf3u, 0x8fu, 0x3fu, 0xf3u, 0xf8u, 0x8fu, 0x8fu, 0x6fu, 0x6fu, 0x6fu, 0x5fu, 0x3fu, 0x38u, + 0x3fu, 0x38u, 0x8fu, 0xf3u, 0x3fu, 0x38u, 0x6fu, 0xa8u, 0x53u, 0x8fu, 0x86u, 0x6au, 0x8fu, 0x5fu, 0xfau, 0xf8u, + 0x8fu, 0xf3u, 0x3fu, 0x5au, 0x6au, 0xa8u, 0x89u, 0xfau, 0xf6u, 0x3fu, 0xf8u, 0x5fu, 0xf3u, 0xf6u, 0xf6u, 0xf8u, + 0x3fu, 0xf3u, 0x5fu, 0x5fu, 0x5fu, 0x8fu, 0x5fu, 0xafu, 0x5fu, 0xafu, 0x8fu, 0xdfu, 0xf3u, 0xcfu, 0x3fu, 0x38u + }; + + CGV_FIXUPINDEX skip_packed = FIXUPINDEX[part_id];// gather_int2(FIXUPINDEX, part_id); + fixup[0] = 0; + fixup[1] = skip_packed >> 4; + fixup[2] = skip_packed & 15; +} + +void icmp_bc7_encode_adjust_skip_mode01237_2(CGV_CMPOUTPACKED data[5], CGU_INT mode, CGV_PARTID part_id) +{ + CGU_INT bits = 2; if (mode == 0 || mode == 1) bits = 3; + CGU_INT maxSubSets = 2; if (mode == 0 || mode == 2) maxSubSets = 3; + + CGV_FIXUPINDEX fixup[3]; + icmp_get_fixuptable(fixup, part_id); + + if (maxSubSets > 2 && fixup[1] < fixup[2]) + { + CGV_FIXUPINDEX t = fixup[1]; fixup[1] = fixup[2]; fixup[2] = t; + } + + for (CGU_INT j = 1; j < maxSubSets; j++) + { + CGV_FIXUPINDEX k = fixup[j]; + icmp_encode_data_shl_1bit_from(data, 128 + (maxSubSets - 1) - (15 - k)*bits); + } +} + +INLINE CGV_UINT32 gather_uint32(__constant CGU_UINT32 * const uniform ptr, CGV_INT idx) +{ + return ptr[idx]; // (perf warning expected) +} + +INLINE CGV_MASK icmp_get_partition_mask(CGV_PARTID part_id, CGU_INT subset) +{ + CMP_STATIC uniform __constant CGV_SHIFT32 pattern_mask_table[] = { + // 2 subset partitions + 0xCCCC3333u, 0x88887777u, 0xEEEE1111u, 0xECC81337u, 0xC880377Fu, 0xFEEC0113u, 0xFEC80137u, 0xEC80137Fu, + 0xC80037FFu, 0xFFEC0013u, 0xFE80017Fu, 0xE80017FFu, 0xFFE80017u, 0xFF0000FFu, 0xFFF0000Fu, 0xF0000FFFu, + 0xF71008EFu, 0x008EFF71u, 0x71008EFFu, 0x08CEF731u, 0x008CFF73u, 0x73108CEFu, 0x3100CEFFu, 0x8CCE7331u, + 0x088CF773u, 0x3110CEEFu, 0x66669999u, 0x366CC993u, 0x17E8E817u, 0x0FF0F00Fu, 0x718E8E71u, 0x399CC663u, + 0xAAAA5555u, 0xF0F00F0Fu, 
0x5A5AA5A5u, 0x33CCCC33u, 0x3C3CC3C3u, 0x55AAAA55u, 0x96966969u, 0xA55A5AA5u, + 0x73CE8C31u, 0x13C8EC37u, 0x324CCDB3u, 0x3BDCC423u, 0x69969669u, 0xC33C3CC3u, 0x99666699u, 0x0660F99Fu, + 0x0272FD8Du, 0x04E4FB1Bu, 0x4E40B1BFu, 0x2720D8DFu, 0xC93636C9u, 0x936C6C93u, 0x39C6C639u, 0x639C9C63u, + 0x93366CC9u, 0x9CC66339u, 0x817E7E81u, 0xE71818E7u, 0xCCF0330Fu, 0x0FCCF033u, 0x774488BBu, 0xEE2211DDu, + + // 3 subset partitions + 0x08CC0133u, 0x8CC80037u, 0xCC80006Fu, 0xEC001331u, 0x330000FFu, 0x00CC3333u, 0xFF000033u, 0xCCCC0033u, + 0x0F0000FFu, 0x0FF0000Fu, 0x00F0000Fu, 0x44443333u, 0x66661111u, 0x22221111u, 0x136C0013u, 0x008C8C63u, + 0x36C80137u, 0x08CEC631u, 0x3330000Fu, 0xF0000333u, 0x00EE1111u, 0x88880077u, 0x22C0113Fu, 0x443088CFu, + 0x0C22F311u, 0x03440033u, 0x69969009u, 0x9960009Fu, 0x03303443u, 0x00660699u, 0xC22C3113u, 0x8C0000EFu, + 0x1300007Fu, 0xC4003331u, 0x004C1333u, 0x22229999u, 0x00F0F00Fu, 0x24929249u, 0x29429429u, 0xC30C30C3u, + 0xC03C3C03u, 0x00AA0055u, 0xAA0000FFu, 0x30300303u, 0xC0C03333u, 0x90900909u, 0xA00A5005u, 0xAAA0000Fu, + 0x0AAA0555u, 0xE0E01111u, 0x70700707u, 0x6660000Fu, 0x0EE01111u, 0x07707007u, 0x06660999u, 0x660000FFu, + 0x00660099u, 0x0CC03333u, 0x03303003u, 0x60000FFFu, 0x80807777u, 0x10100101u, 0x000A0005u, 0x08CE8421u + }; + + CGV_MASK mask_packed = gather_uint32(pattern_mask_table, part_id); + CGV_MASK mask0 = mask_packed & 0xFFFF; + CGV_MASK mask1 = mask_packed >> 16; + + CGV_MASK mask = (subset == 2) ? (~mask0)&(~mask1) : ((subset == 0) ? 
mask0 : mask1); + return mask; +} + +#ifdef USE_VARYING +#ifdef ASPM_GPU +INLINE CGV_INDEXPACKED gather_packedindex(CGV_INDEXPACKED* ptr, CGV_FIXUPINDEX idx) +{ + return ptr[idx]; +} +#else +INLINE CGV_INDEXPACKED gather_packedindex(CMP_CONSTANT varying CGV_INDEXPACKED* CMP_CONSTANT uniform ptr, CGV_FIXUPINDEX idx) +{ + return ptr[idx]; // (perf warning expected) +} +#endif +#endif + +CGV_MASK icmp_encode_apply_swap_mode01237(CGV_EPOCODE qep[], CGV_INDEXPACKED color_index[2], CGU_INT blockMode, CGV_PARTID part_id) +{ + CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3; + CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3; + + CGV_MASK flips = 0; + CGU_INT levels = 1 << bits; + CGV_FIXUPINDEX fixup[3]; + icmp_get_fixuptable(fixup, part_id); + + for (CGU_INT j = 0; j < maxSubSets; j++) + { + CGV_FIXUPINDEX k0 = fixup[j]; + +#ifdef USE_VARYING + CGV_INDEXPACKED q = ((gather_packedindex(color_index, k0 >> 3) << (28 - (k0 & 7) * 4)) >> 28); +#else + CGV_INDEXPACKED q = ((color_index[k0 >> 3] << (28 - (k0 & 7) * 4)) >> 28); +#endif + + if (q >= levels / 2) + { + icmp_swap_epocode(&qep[8 * j], &qep[8 * j + 4], 4); + CGV_MASK partition_mask = icmp_get_partition_mask(part_id, j); + flips |= partition_mask; + } + } + + return flips; +} + +void icmp_encode_mode01237(CGV_CMPOUTPACKED cmp_out[5], CGV_EPOCODE color_qendpoint[], CGV_INDEXPACKED color_index[2], CGV_PARTID part_id, CGU_INT blockMode) +{ + CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3; + CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3; + CGU_INT channels = 3; if (blockMode == 7) channels = 4; + + CGV_MASK flips = icmp_encode_apply_swap_mode01237(color_qendpoint, color_index, blockMode, part_id); + + for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0; + CGU_INT pos = 0; + + // mode 0-3, 7 + icmp_Write32Bit(cmp_out, &pos, blockMode + 1, 1 << blockMode); + + // partition + if (blockMode == 0) + { + icmp_Write32Bit(cmp_out, &pos, 4, 
part_id & 15); + } + else + { + icmp_Write32Bit(cmp_out, &pos, 6, part_id & 63); + } + + // endpoints + for (CGU_INT ch = 0; ch < channels; ch++) + for (CGU_INT j = 0; j < maxSubSets * 2; j++) + { + if (blockMode == 0) + { + icmp_Write32Bit(cmp_out, &pos, 4, color_qendpoint[j * 4 + 0 + ch] >> 1); + } + else if (blockMode == 1) + { + icmp_Write32Bit(cmp_out, &pos, 6, color_qendpoint[j * 4 + 0 + ch] >> 1); + } + else if (blockMode == 2) + { + icmp_Write32Bit(cmp_out, &pos, 5, color_qendpoint[j * 4 + 0 + ch]); + } + else if (blockMode == 3) + { + icmp_Write32Bit(cmp_out, &pos, 7, color_qendpoint[j * 4 + 0 + ch] >> 1); + } + else if (blockMode == 7) + { + icmp_Write32Bit(cmp_out, &pos, 5, color_qendpoint[j * 4 + 0 + ch] >> 1); + } + //else + //{ + // assert(false); + //} + } + + // p bits + if (blockMode == 1) + for (CGU_INT j = 0; j < 2; j++) + { + icmp_Write32Bit(cmp_out, &pos, 1, color_qendpoint[j * 8] & 1); + } + + if (blockMode == 0 || blockMode == 3 || blockMode == 7) + for (CGU_INT j = 0; j < maxSubSets * 2; j++) + { + icmp_Write32Bit(cmp_out, &pos, 1, color_qendpoint[j * 4] & 1); + } + + // quantized values + icmp_bc7_encode_endpoint2(cmp_out, &pos, color_index, bits, flips); + icmp_bc7_encode_adjust_skip_mode01237_2(cmp_out, blockMode, part_id); +} + +INLINE void icmp_swap_indexpacked(CGV_INDEXPACKED u[], CGV_INDEXPACKED v[], CGU_INT n) +{ + for (CGU_INT i = 0; i < n; i++) + { + CGV_INDEXPACKED t = u[i]; + u[i] = v[i]; + v[i] = t; + } +} + + +void icmp_encode_mode4(CGV_CMPOUTPACKED cmp_out[5], varying cmp_mode_parameters* uniform params) +{ + CGV_EPOCODE color_qendpoint[8]; + CGV_INDEXPACKED color_index[2]; + CGV_EPOCODE alpha_qendpoint[2]; + CGV_INDEXPACKED alpha_index[2]; + + CGV_CMPOUTPACKED rotated_channel = params->rotated_channel; + CGV_SHIFT32 idxMode = params->idxMode; + + icmp_swap_epocode(params->color_qendpoint, color_qendpoint, 8); + icmp_swap_indexpacked(params->best_color_index, color_index, 2); + icmp_swap_epocode(params->alpha_qendpoint, 
alpha_qendpoint, 2); + icmp_swap_indexpacked(params->best_alpha_index, alpha_index, 2); + + for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0; + CGU_INT pos = 0; + + // mode 4 (5 bits) 00001 + icmp_Write32Bit(cmp_out, &pos, 5, 16); + + // rotation channel 2 bits + icmp_Write32Bit(cmp_out, &pos, 2, (rotated_channel + 1) & 3); + + // idxMode 1 bit + icmp_Write32Bit(cmp_out, &pos, 1, idxMode); + + if (!idxMode) + { + icmp_encode_apply_swap(color_qendpoint, 4, color_index, 2); + icmp_encode_apply_swap(alpha_qendpoint, 1, alpha_index, 3); + } + else + { + icmp_swap_indexpacked(color_index, alpha_index, 2); + icmp_encode_apply_swap(alpha_qendpoint, 1, color_index, 2); + icmp_encode_apply_swap(color_qendpoint, 4, alpha_index, 3); + } + + // color endpoints 5 bits each + // R0 : R1 + // G0 : G1 + // B0 : B1 + for (CGU_INT p = 0; p < 3; p++) + { + CGV_EPOCODE c0 = color_qendpoint[0 + p]; + CGV_EPOCODE c1 = color_qendpoint[4 + p]; + icmp_Write32Bit(cmp_out, &pos, 5, c0); // 0 + icmp_Write32Bit(cmp_out, &pos, 5, c1); // 1 + } + + // alpha endpoints (6 bits each) + // A0 : A1 + icmp_Write32Bit(cmp_out, &pos, 6, alpha_qendpoint[0]); + icmp_Write32Bit(cmp_out, &pos, 6, alpha_qendpoint[1]); + + // index data (color index 2 bits each) 31 bits total + icmp_encode_index(cmp_out, &pos, color_index, 2, 0); + + // index data (alpha index 3 bits each) 47 bits total + icmp_encode_index(cmp_out, &pos, alpha_index, 3, 0); +} + +void icmp_Encode_mode5(CGV_CMPOUTPACKED cmp_out[5], varying cmp_mode_parameters* uniform params) +{ + + CGV_EPOCODE qep[8]; + CGV_INDEXPACKED color_index[2]; + CGV_EPOCODE alpha_qendpoint[2]; + CGV_INDEXPACKED alpha_index[2]; + + icmp_swap_epocode(params->color_qendpoint, qep, 8); + icmp_swap_indexpacked(params->best_color_index, color_index, 2); + icmp_swap_epocode(params->alpha_qendpoint, alpha_qendpoint, 2); + icmp_swap_indexpacked(params->best_alpha_index, alpha_index, 2); + + CGV_CMPOUTPACKED rotated_channel = params->rotated_channel; + + 
icmp_encode_apply_swap(qep, 4, color_index, 2); + icmp_encode_apply_swap(alpha_qendpoint, 1, alpha_index, 2); + + for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0; + CGU_INT pos = 0; + + // mode 5 + icmp_Write32Bit(cmp_out, &pos, 6, 1 << 5); + + // rotated channel + icmp_Write32Bit(cmp_out, &pos, 2, (rotated_channel + 1) & 3); + + // endpoints + for (CGU_INT p = 0; p < 3; p++) + { + icmp_Write32Bit(cmp_out, &pos, 7, qep[0 + p]); + icmp_Write32Bit(cmp_out, &pos, 7, qep[4 + p]); + } + + // alpha endpoints + icmp_Write32Bit(cmp_out, &pos, 8, alpha_qendpoint[0]); + icmp_Write32Bit(cmp_out, &pos, 8, alpha_qendpoint[1]); + + // quantized values + icmp_encode_index(cmp_out, &pos, color_index, 2, 0); + icmp_encode_index(cmp_out, &pos, alpha_index, 2, 0); + +} + +void icmp_encode_mode6(CGV_CMPOUTPACKED cmp_out[5], CGV_EPOCODE qep[8], CGV_INDEXPACKED color_index[2]) +{ + icmp_encode_apply_swap(qep, 4, color_index, 4); + + for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0; + CGU_INT pos = 0; + + // mode 6 + icmp_Write32Bit(cmp_out, &pos, 7, 64); + + // endpoints + for (CGU_INT p = 0; p < 4; p++) + { + icmp_Write32Bit(cmp_out, &pos, 7, qep[0 + p] >> 1); + icmp_Write32Bit(cmp_out, &pos, 7, qep[4 + p] >> 1); + } + + // p bits + icmp_Write32Bit(cmp_out, &pos, 1, qep[0] & 1); + icmp_Write32Bit(cmp_out, &pos, 1, qep[4] & 1); + + // quantized values + icmp_encode_index(cmp_out, &pos, color_index, 4, 0); +} + +/////////////////////////// +// PCA helpers + +INLINE void icmp_compute_stats_masked(CGV_IMAGE stats[15], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_CHANNEL channels) +{ + for (CGU_INT i = 0; i < 15; i++) stats[i] = 0; + + CGV_MASK mask_shifted = mask << 1; + for (CGU_INT k = 0; k < 16; k++) + { + mask_shifted >>= 1; + //if ((mask_shifted&1) == 0) continue; + CGV_MASK flag = (mask_shifted & 1); + + CGV_IMAGE rgba[4]; + for (CGU_CHANNEL ch = 0; ch < channels; ch++) rgba[ch] = image_src[k + ch * 16]; + + for (CGU_CHANNEL ch = 0; ch < channels; ch++) rgba[ch] *= flag; + stats[14] += 
flag; + + stats[10] += rgba[0]; + stats[11] += rgba[1]; + stats[12] += rgba[2]; + + stats[0] += rgba[0] * rgba[0]; + stats[1] += rgba[0] * rgba[1]; + stats[2] += rgba[0] * rgba[2]; + + stats[4] += rgba[1] * rgba[1]; + stats[5] += rgba[1] * rgba[2]; + + stats[7] += rgba[2] * rgba[2]; + + if (channels == 4) + { + stats[13] += rgba[3]; + + stats[3] += rgba[0] * rgba[3]; + stats[6] += rgba[1] * rgba[3]; + stats[8] += rgba[2] * rgba[3]; + stats[9] += rgba[3] * rgba[3]; + } + } +} + +INLINE void icmp_covar_from_stats(CGV_IMAGE covar[10], CGV_IMAGE stats[15], CGU_CHANNEL channels3or4) +{ + covar[0] = stats[0] - stats[10 + 0] * stats[10 + 0] / stats[14]; + covar[1] = stats[1] - stats[10 + 0] * stats[10 + 1] / stats[14]; + covar[2] = stats[2] - stats[10 + 0] * stats[10 + 2] / stats[14]; + + covar[4] = stats[4] - stats[10 + 1] * stats[10 + 1] / stats[14]; + covar[5] = stats[5] - stats[10 + 1] * stats[10 + 2] / stats[14]; + + covar[7] = stats[7] - stats[10 + 2] * stats[10 + 2] / stats[14]; + + if (channels3or4 == 4) + { + covar[3] = stats[3] - stats[10 + 0] * stats[10 + 3] / stats[14]; + covar[6] = stats[6] - stats[10 + 1] * stats[10 + 3] / stats[14]; + covar[8] = stats[8] - stats[10 + 2] * stats[10 + 3] / stats[14]; + covar[9] = stats[9] - stats[10 + 3] * stats[10 + 3] / stats[14]; + } +} + +INLINE void icmp_compute_covar_dc_masked(CGV_IMAGE covar[6], CGV_IMAGE dc[3], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_INT channels3or4) +{ + CGV_IMAGE stats[15]; + icmp_compute_stats_masked(stats, image_src, mask, channels3or4); + + icmp_covar_from_stats(covar, stats, channels3or4); + for (CGU_INT ch = 0; ch < channels3or4; ch++) dc[ch] = stats[10 + ch] / stats[14]; +} + +INLINE void icmp_ssymv3(CGV_IMAGE a[4], CGV_IMAGE covar[10], CGV_IMAGE b[4]) +{ + a[0] = covar[0] * b[0] + covar[1] * b[1] + covar[2] * b[2]; + a[1] = covar[1] * b[0] + covar[4] * b[1] + covar[5] * b[2]; + a[2] = covar[2] * b[0] + covar[5] * b[1] + covar[7] * b[2]; +} + +INLINE void icmp_ssymv4_2(CGV_IMAGE a[4], 
CGV_IMAGE covar[10], CGV_IMAGE b[4]) +{ + a[0] = covar[0] * b[0] + covar[1] * b[1] + covar[2] * b[2] + covar[3] * b[3]; + a[1] = covar[1] * b[0] + covar[4] * b[1] + covar[5] * b[2] + covar[6] * b[3]; + a[2] = covar[2] * b[0] + covar[5] * b[1] + covar[7] * b[2] + covar[8] * b[3]; + a[3] = covar[3] * b[0] + covar[6] * b[1] + covar[8] * b[2] + covar[9] * b[3]; +} + +#ifndef ASPM +// Computes inverse square root over an implementation-defined range. The maximum error is implementation-defined. +CGV_IMAGE Image_rsqrt(CGV_IMAGE f) +{ + CGV_IMAGE sf = sqrt(f); + if (sf != 0) + return 1 / sqrt(f); + else + return 0.0f; +} +#endif + +INLINE void icmp_compute_axis(CGV_IMAGE axis[4], + CGV_IMAGE covar[10], +#ifdef ASPM_GPU + CGV_ITTERATIONS powerIterations, +#else + uniform __constant CGV_ITTERATIONS powerIterations, +#endif + CGU_CHANNEL channels) +{ + CGV_IMAGE vec[4] = { 1,1,1,1 }; + + for (CGU_INT i = 0; i < powerIterations; i++) + { + if (channels == 3) icmp_ssymv3(axis, covar, vec); + if (channels == 4) icmp_ssymv4_2(axis, covar, vec); + + for (CGU_CHANNEL ch = 0; ch < channels; ch++) vec[ch] = axis[ch]; + + if (i % 2 == 1) // renormalize every other iteration + { + CGV_IMAGE norm_sq = 0; + for (CGU_CHANNEL ch = 0; ch < channels; ch++) + norm_sq += axis[ch] * axis[ch]; + +#ifndef ASPM + CGV_IMAGE rnorm = Image_rsqrt(norm_sq); +#else + CGV_IMAGE rnorm = rsqrt(norm_sq); +#endif + for (CGU_CHANNEL ch = 0; ch < channels; ch++) vec[ch] *= rnorm; + } + } + + for (CGU_CHANNEL ch = 0; ch < channels; ch++) axis[ch] = vec[ch]; +} + +void icmp_block_pca_axis(CGV_IMAGE axis[4], CGV_IMAGE dc[4], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_INT channels3or4) +{ + uniform __constant CGV_ITTERATIONS powerIterations = 8; // 4 not enough for HQ + + CGV_IMAGE covar[10]; + icmp_compute_covar_dc_masked(covar, dc, image_src, mask, channels3or4); + + CGV_IMAGE inv_var = 1.0 / (256 * 256); + for (CGU_INT k = 0; k < 10; k++) + { + covar[k] *= inv_var; + } + + CGV_IMAGE eps = sq_image(0.001F); + 
covar[0] += eps; + covar[4] += eps; + covar[7] += eps; + covar[9] += eps; + + icmp_compute_axis(axis, covar, powerIterations, channels3or4); +} + +CGV_IMAGE minImage(CGV_IMAGE a, CGV_IMAGE b) { return a < b ? a : b; } +CGV_IMAGE maxImage(CGV_IMAGE a, CGV_IMAGE b) { return a > b ? a : b; } + + +void icmp_block_segment_core(CGV_IMAGE epo_code[], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_INT channels3or4) +{ + CGV_IMAGE axis[4]; + CGV_IMAGE dc[4]; + icmp_block_pca_axis(axis, dc, image_src, mask, channels3or4); + + CGV_IMAGE ext[2]; + ext[0] = +1e32; + ext[1] = -1e32; + + // find min/max + CGV_MASK mask_shifted = mask << 1; + for (CGU_INT k = 0; k < 16; k++) + { + mask_shifted >>= 1; + if ((mask_shifted & 1) == 0) continue; + + CGV_IMAGE dot = 0; + for (CGU_INT ch = 0; ch < channels3or4; ch++) + dot += axis[ch] * (image_src[16 * ch + k] - dc[ch]); + + ext[0] = minImage(ext[0], dot); + ext[1] = maxImage(ext[1], dot); + } + + // create some distance if the endpoints collapse + if (ext[1] - ext[0] < 1.0f) + { + ext[0] -= 0.5f; + ext[1] += 0.5f; + } + + for (CGU_INT i = 0; i < 2; i++) + for (CGU_INT ch = 0; ch < channels3or4; ch++) + { + epo_code[4 * i + ch] = ext[i] * axis[ch] + dc[ch]; + } +} + +INLINE CGV_IMAGE clampf(CGV_IMAGE v, CGV_IMAGE a, CGV_IMAGE b) +{ + if (v < a) + return a; + else + if (v > b) + return b; + return v; +} + + +void icmp_get_block_endpoints(CGV_IMAGE block_endpoints[], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_CHANNEL channels3or4) +{ + icmp_block_segment_core(block_endpoints, image_src, mask, channels3or4); + + for (CGU_INT i = 0; i < 2; i++) + for (CGU_INT ch = 0; ch < channels3or4; ch++) + { + block_endpoints[4 * i + ch] = clampf(block_endpoints[4 * i + ch], 0.0f, 255.0f); + } +} + +void icmp_ep_quant0367_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT blockMode, CGU_INT channels) +{ + CGU_INT bits = 7; + if (blockMode == 0) bits = 4; + if (blockMode == 7) bits = 5; + + CGU_INT levels = 1 << bits; + CGU_INT levels2 = levels * 2 - 1; + + for 
(CGU_INT i = 0; i < 2; i++) + { + CGV_EPOCODE qep_b[8]; + + for (CGU_INT b = 0; b < 2; b++) + for (CGU_INT p = 0; p < 4; p++) + { + CGV_EPOCODE v = (CGV_TYPEINT)((ep[i * 4 + p] / 255.0f*levels2 - b) / 2.0f + 0.5f) * 2 + b; + qep_b[b * 4 + p] = clampEPO(v, b, levels2 - 1 + b); + } + + CGV_IMAGE ep_b[8]; + for (CGU_INT j = 0; j < 8; j++) + ep_b[j] = qep_b[j]; + + if (blockMode == 0) + for (CGU_INT j = 0; j < 8; j++) + ep_b[j] = expandEPObits(qep_b[j], 5); + + CGV_ERROR err0 = 0.0f; + CGV_ERROR err1 = 0.0f; + for (CGU_INT ch = 0; ch < channels; ch++) + { + err0 += sq_image(ep[i * 4 + ch] - ep_b[0 + ch]); + err1 += sq_image(ep[i * 4 + ch] - ep_b[4 + ch]); + } + + for (CGU_INT p = 0; p < 4; p++) + qep[i * 4 + p] = (err0 < err1) ? qep_b[0 + p] : qep_b[4 + p]; + } +} + +void icmp_ep_quant245_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT mode) +{ + CGU_INT bits = 5; + if (mode == 5) bits = 7; + CGU_INT levels = 1 << bits; + + for (CGU_INT i = 0; i < 8; i++) + { + CGV_EPOCODE v = ((CGV_TYPEINT)(ep[i] / 255.0f*(levels - 1) + 0.5)); + qep[i] = clampEPO(v, 0, levels - 1); + } +} + +void icmp_ep_quant1_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT mode) +{ + CGV_EPOCODE qep_b[16]; + + for (CGU_INT b = 0; b < 2; b++) + for (CGU_INT i = 0; i < 8; i++) + { + CGV_EPOCODE v = ((CGV_TYPEINT)((ep[i] / 255.0f*127.0f - b) / 2 + 0.5)) * 2 + b; + qep_b[b * 8 + i] = clampEPO(v, b, 126 + b); + } + + // dequant + CGV_IMAGE ep_b[16]; + for (CGU_INT k = 0; k < 16; k++) + ep_b[k] = expandEPObits(qep_b[k], 7); + + CGV_ERROR err0 = 0.0f; + CGV_ERROR err1 = 0.0f; + for (CGU_INT j = 0; j < 2; j++) + for (CGU_INT p = 0; p < 3; p++) + { + err0 += sq_image(ep[j * 4 + p] - ep_b[0 + j * 4 + p]); + err1 += sq_image(ep[j * 4 + p] - ep_b[8 + j * 4 + p]); + } + + for (CGU_INT i = 0; i < 8; i++) + qep[i] = (err0 < err1) ? 
qep_b[0 + i] : qep_b[8 + i]; + +} + +void icmp_ep_quant2_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT blockMode, CGU_INT channels3or4) +{ + //assert(mode <= 7); + CMP_STATIC uniform __constant CGV_SUBSETS SubSetTable[] = { 3,2,3,2,1,1,1,2 }; +#ifndef ASPM_GPU + uniform CMP_CONSTANT +#endif + CGV_SUBSETS maxSubSets = SubSetTable[blockMode]; + + if (blockMode == 0 || blockMode == 3 || blockMode == 6 || blockMode == 7) + { + for (CGU_INT i = 0; i < maxSubSets; i++) + icmp_ep_quant0367_2(&qep[i * 8], &ep[i * 8], blockMode, channels3or4); + } + else + if (blockMode == 1) + { + for (CGU_INT i = 0; i < maxSubSets; i++) + icmp_ep_quant1_2(&qep[i * 8], &ep[i * 8], blockMode); + } + else + if (blockMode == 2 || blockMode == 4 || blockMode == 5) + { + for (CGU_INT i = 0; i < maxSubSets; i++) + icmp_ep_quant245_2(&qep[i * 8], &ep[i * 8], blockMode); + } + // else + // assert(false); + +} + +void icmp_ep_dequant2(CGV_IMAGE ep[], CGV_EPOCODE qep[], CGU_INT blockMode) +{ + //assert(mode <= 7); + CMP_STATIC uniform __constant CGV_SUBSETS subSetTable[] = { 3,2,3,2,1,1,1,2 }; +#ifndef ASPM_GPU + uniform CMP_CONSTANT +#endif + CGV_SUBSETS maxSubSets = subSetTable[blockMode]; + + // mode 3, 6 are 8-bit + if (blockMode == 3 || blockMode == 6) + { + for (CGU_INT i = 0; i < 8 * maxSubSets; i++) + ep[i] = qep[i]; + } + else + if (blockMode == 1 || blockMode == 5) + { + for (CGU_INT i = 0; i < 8 * maxSubSets; i++) + ep[i] = expandEPObits(qep[i], 7); + } + else + if (blockMode == 0 || blockMode == 2 || blockMode == 4) + { + for (CGU_INT i = 0; i < 8 * maxSubSets; i++) + ep[i] = expandEPObits(qep[i], 5); + } + else + if (blockMode == 7) + { + for (CGU_INT i = 0; i < 8 * maxSubSets; i++) + ep[i] = expandEPObits(qep[i], 6); + } + //else + // assert(false); +} + +void icmp_GetQuantizedEpoCode(CGV_EPOCODE epo_code_out[], CGV_IMAGE block_endpoints[], CGU_INT blockMode, CGU_CHANNEL channels3or4) +{ + icmp_ep_quant2_2(epo_code_out, block_endpoints, blockMode, channels3or4); + 
icmp_ep_dequant2(block_endpoints, epo_code_out, blockMode); +} + +void icmp_ep_quant_dequant_mode4(CGV_EPOCODE qep[], CGV_IMAGE ep[]) +{ + icmp_ep_quant2_2(qep, ep, 4, 3); + icmp_ep_dequant2(ep, qep, 4); +} + +/////////////////////////// +// pixel quantization +//======================================== +// Modified Intel Texture Compression Code +//======================================== + +INLINE uniform __constant CGV_RAMP* uniform icmp_GetRamp(CGU_INT bits) +{ + //assert(bits>=2 && bits<=4); // invalid bit size + + CMP_STATIC uniform __constant CGV_RAMP unquant_table_2bits[] = { 0, 21, 43, 64 }; + CMP_STATIC uniform __constant CGV_RAMP unquant_table_3bits[] = { 0, 9, 18, 27, 37, 46, 55, 64 }; + CMP_STATIC uniform __constant CGV_RAMP unquant_table_4bits[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; + + uniform __constant CGV_RAMP* uniform unquant_tables[] = { unquant_table_2bits, unquant_table_3bits, unquant_table_4bits }; + + return unquant_tables[bits - 2]; +} + +#ifdef USE_VARYING +INLINE CGV_IMAGE gather_image(varying CGV_IMAGE* uniform ptr, CGV_SHIFT32 idx) +{ + return ptr[idx]; // (perf warning expected) +} +#endif + +INLINE CGV_RAMP gather_ramp( +#ifdef ASPM_GPU + CMP_CONSTANT CGV_RAMP* ptr, +#else + CMP_CONSTANT CGV_RAMP* CMP_CONSTANT uniform ptr, +#endif + CGV_INDEX idx) +{ + return ptr[idx]; // (perf warning expected) +} + +CGV_ERROR icmp_GetQuantizeIndex( + CGV_INDEXPACKED index_packed_out[2], + CGV_INDEX index_out[MAX_SUBSET_SIZE], + CGV_IMAGE image_src[64], + CGU_INT bits, + CGV_IMAGE image_block[], + CGV_SHIFT32 pattern, + CGU_CHANNEL channels3or4) +{ + CGV_ERROR total_err = 0; + uniform __constant CGV_RAMP* uniform Ramp = icmp_GetRamp(bits); + CGV_LEVELS levels = 1 << bits; + + // 64-bit color_qendpoint: 5% overhead in this function + for (CGU_INT k = 0; k < 2; k++) index_packed_out[k] = 0; + + CGV_SHIFT32 pattern_shifted = pattern; + for (CGU_INT k = 0; k < 16; k++) + { + CGV_SHIFT32 j = pattern_shifted & 3; + 
pattern_shifted >>= 2; + + CGV_IMAGE proj = 0; + CGV_IMAGE div = 0; + for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) + { +#ifdef USE_VARYING + CGV_IMAGE ep_a = gather_image(image_block, 8 * j + 0 + ch); + CGV_IMAGE ep_b = gather_image(image_block, 8 * j + 4 + ch); +#else + CGV_IMAGE ep_a = image_block[8 * j + 0 + ch]; + CGV_IMAGE ep_b = image_block[8 * j + 4 + ch]; +#endif + proj += (image_src[k + ch * 16] - ep_a)*(ep_b - ep_a); + div += sq_image(ep_b - ep_a); + } + + proj /= div; + + CGV_INDEX index_q1 = (CGV_INDEX)(proj*levels + 0.5); + index_q1 = clampIndex(index_q1, 1, levels - 1); + + CGV_ERROR err0 = 0; + CGV_ERROR err1 = 0; + CGV_RAMP ramp0 = gather_ramp(Ramp, index_q1 - 1); + CGV_RAMP ramp1 = gather_ramp(Ramp, index_q1); + + for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) + { +#ifdef USE_VARYING + CGV_IMAGE ep_a = gather_image(image_block, 8 * j + 0 + ch); + CGV_IMAGE ep_b = gather_image(image_block, 8 * j + 4 + ch); +#else + CGV_IMAGE ep_a = image_block[8 * j + 0 + ch]; + CGV_IMAGE ep_b = image_block[8 * j + 4 + ch]; +#endif + CGV_IMAGE dec_v0 = (CGV_TYPEINT)(((64 - ramp0)*ep_a + ramp0 * ep_b + 32) / 64); + CGV_IMAGE dec_v1 = (CGV_TYPEINT)(((64 - ramp1)*ep_a + ramp1 * ep_b + 32) / 64); + err0 += sq_image(dec_v0 - image_src[k + ch * 16]); + err1 += sq_image(dec_v1 - image_src[k + ch * 16]); + } + + CGV_ERROR best_err = err1; + CGV_INDEX best_index = index_q1; + if (err0 < err1) + { + best_err = err0; + best_index = index_q1 - 1; + } + + index_out[k] = best_index; + index_packed_out[k / 8] += ((CGV_INDEXPACKED)best_index) << 4 * (k % 8); + total_err += best_err; + } + + return total_err; +} + +/////////////////////////// +// LS endpoint refinement + +void icmp_opt_endpoints(CGV_IMAGE ep[], CGV_IMAGE image_src[64], CGU_INT bits, CGV_INDEXPACKED color_qendpoint[2], CGV_MASK mask, CGU_CHANNEL channels3or4) +{ + CGU_INT levels = 1 << bits; + + CGV_IMAGE Atb1[4] = { 0,0,0,0 }; + CGV_IMAGE sum_q = 0; + CGV_IMAGE sum_qq = 0; + CGV_IMAGE sum[5] = { 0,0,0,0,0 }; 
+ + CGV_MASK mask_shifted = mask << 1; + for (CGU_INT k1 = 0; k1 < 2; k1++) + { + CGV_INDEXPACKED qbits_shifted = color_qendpoint[k1]; + for (CGU_INT k2 = 0; k2 < 8; k2++) + { + CGU_INT k = k1 * 8 + k2; + CGV_IMAGE q = (CGV_TYPEINT)(qbits_shifted & 15); + + qbits_shifted >>= 4; + + mask_shifted >>= 1; + if ((mask_shifted & 1) == 0) continue; + + CGV_LEVELS x = (levels - 1) - q; + CGV_LEVELS y = q; + + sum_q += q; + sum_qq += q * q; + + sum[4] += 1; + for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) sum[ch] += image_src[k + ch * 16]; + for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) Atb1[ch] += x * image_src[k + ch * 16]; + } + } + + CGV_IMAGE Atb2[4]; + for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) + { + //sum[ch] = dc[ch]*16; + Atb2[ch] = (levels - 1)*sum[ch] - Atb1[ch]; + } + + CGV_IMAGE Cxx = sum[4] * sq_image(levels - 1) - 2 * (levels - 1)*sum_q + sum_qq; + CGV_IMAGE Cyy = sum_qq; + CGV_IMAGE Cxy = (levels - 1)*sum_q - sum_qq; + CGV_IMAGE scale = (levels - 1) / (Cxx*Cyy - Cxy * Cxy); + + for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) + { + ep[0 + ch] = (Atb1[ch] * Cyy - Atb2[ch] * Cxy)*scale; + ep[4 + ch] = (Atb2[ch] * Cxx - Atb1[ch] * Cxy)*scale; + + //ep[0+ch] = clamp(ep[0+ch], 0, 255); + //ep[4+ch] = clamp(ep[4+ch], 0, 255); + } + + if (img_absf(Cxx*Cyy - Cxy * Cxy) < 0.001f) + { + // flatten + for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) + { + ep[0 + ch] = sum[ch] / sum[4]; + ep[4 + ch] = ep[0 + ch]; + } + } +} + +////////////////////////// +// parameter estimation + +void icmp_channel_quant_dequant2(CGV_EPOCODE qep[2], CGV_IMAGE ep[2], CGU_INT epbits) +{ + CGV_LEVELS elevels = (1 << epbits); + + for (CGU_INT i = 0; i < 2; i++) + { + CGV_EPOCODE v = ((CGV_EPOCODE)(ep[i] / 255.0f*(elevels - 1) + 0.5f)); + qep[i] = clampEPO(v, 0, elevels - 1); + ep[i] = expandEPObits(qep[i], epbits); + } +} + +void icmp_refineEndpoints(CGV_IMAGE ep[2], CGV_IMAGE block[16], CGU_INT bits, CGV_INDEXPACKED color_index[2]) +{ + CGU_INT levels = 1 << bits; + + 
CGV_IMAGE Atb1 = 0; + CGV_IMAGE sum_q = 0; + CGV_IMAGE sum_qq = 0; + CGV_IMAGE sum = 0; + + for (CGU_INT k1 = 0; k1 < 2; k1++) + { + CGV_INDEXPACKED qbits_shifted = color_index[k1]; + for (CGU_INT k2 = 0; k2 < 8; k2++) + { + CGU_INT k = k1 * 8 + k2; + CGV_IMAGE q = (CGV_TYPEINT)(qbits_shifted & 15); + qbits_shifted >>= 4; + + CGV_TYPEINT x = (levels - 1) - q; + CGV_TYPEINT y = q; + + sum_q += q; + sum_qq += q * q; + + sum += block[k]; + Atb1 += x * block[k]; + } + } + + CGV_IMAGE Atb2 = (levels - 1)*sum - Atb1; + + CGV_IMAGE Cxx = 16 * sq_image(levels - 1) - 2 * (levels - 1)*sum_q + sum_qq; + CGV_IMAGE Cyy = sum_qq; + CGV_IMAGE Cxy = (levels - 1)*sum_q - sum_qq; + CGV_IMAGE scale = (levels - 1) / (Cxx*Cyy - Cxy * Cxy); + + ep[0] = (Atb1*Cyy - Atb2 * Cxy)*scale; + ep[1] = (Atb2*Cxx - Atb1 * Cxy)*scale; + + ep[0] = clampf(ep[0], 0.0f, 255.0f); + ep[1] = clampf(ep[1], 0.0f, 255.0f); + + if (img_absf(Cxx*Cyy - Cxy * Cxy) < 0.001) + { + ep[0] = sum / 16; + ep[1] = ep[0]; + } +} + +CGV_ERROR icmp_channelQuantizeIndex(CGV_INDEXPACKED color_index[2], CGV_INDEX index[MAX_SUBSET_SIZE], CGV_IMAGE block[16], CGU_INT bits, CGV_IMAGE ep[]) +{ + uniform __constant CGV_RAMP* uniform Ramp = icmp_GetRamp(bits); + CGV_LEVELS levels = (1 << bits); + + color_index[0] = 0; + color_index[1] = 0; + + CGV_ERROR total_err = 0; + + for (CGU_INT k = 0; k < 16; k++) + { + CGV_IMAGE proj = (block[k] - ep[0]) / (ep[1] - ep[0] + 0.001f); + + CGV_INDEX q1 = (CGV_TYPEINT)(proj*levels + 0.5); + q1 = clampEPO(q1, 1, levels - 1); + + CGV_ERROR err0 = 0; + CGV_ERROR err1 = 0; + CGV_RAMP ramp0 = gather_ramp(Ramp, q1 - 1); + CGV_RAMP ramp1 = gather_ramp(Ramp, q1); + + CGV_IMAGE dec_v0 = (CGV_TYPEINT)(((64 - ramp0)*ep[0] + ramp0 * ep[1] + 32) / 64); + CGV_IMAGE dec_v1 = (CGV_TYPEINT)(((64 - ramp1)*ep[0] + ramp1 * ep[1] + 32) / 64); + err0 += sq_image(dec_v0 - block[k]); + err1 += sq_image(dec_v1 - block[k]); + + CGV_TYPEINT best_err = err1; + CGV_INDEX best_q = q1; + if (err0 < err1) + { + best_err = 
err0; + best_q = q1 - 1; + } + + index[k] = best_q; + color_index[k / 8] += ((CGV_INDEXPACKED)best_q) << 4 * (k % 8); + total_err += best_err; + } + + return total_err; +} + +CGV_ERROR icmp_optQuantizeIndex(BC7_EncodeState EncodeState[], CGV_INDEXPACKED color_index[2], CGV_INDEX index[MAX_SUBSET_SIZE], CGV_EPOCODE qep[2], CGV_IMAGE block[16], CGU_INT bits, CGU_INT epbits) +{ + CGV_IMAGE ep[2] = { 255,0 }; + + for (CGU_INT k = 0; k < 16; k++) + { + ep[0] = minImage(ep[0], block[k]); + ep[1] = maxImage(ep[1], block[k]); + } + + icmp_channel_quant_dequant2(qep, ep, epbits); + CGV_ERROR err = icmp_channelQuantizeIndex(color_index, index, block, bits, ep); + + // refine +#ifndef ASPM_GPU + uniform CMP_CONSTANT +#endif + CGV_ITTERATIONS refineIterations = EncodeState->refineIterations; + for (CGU_INT i = 0; i < refineIterations; i++) + { + icmp_refineEndpoints(ep, block, bits, color_index); + icmp_channel_quant_dequant2(qep, ep, epbits); + err = icmp_channelQuantizeIndex(color_index, index, block, bits, ep); + } + + return err; +} + + +INLINE CGV_SHIFT32 icmp_get_pattern2(CGV_PARTID part_id) +{ + CMP_STATIC uniform __constant CGV_SHIFT32 pattern_table[] = { + 0x50505050u, 0x40404040u, 0x54545454u, 0x54505040u, 0x50404000u, 0x55545450u, 0x55545040u, 0x54504000u, + 0x50400000u, 0x55555450u, 0x55544000u, 0x54400000u, 0x55555440u, 0x55550000u, 0x55555500u, 0x55000000u, + 0x55150100u, 0x00004054u, 0x15010000u, 0x00405054u, 0x00004050u, 0x15050100u, 0x05010000u, 0x40505054u, + 0x00404050u, 0x05010100u, 0x14141414u, 0x05141450u, 0x01155440u, 0x00555500u, 0x15014054u, 0x05414150u, + 0x44444444u, 0x55005500u, 0x11441144u, 0x05055050u, 0x05500550u, 0x11114444u, 0x41144114u, 0x44111144u, + 0x15055054u, 0x01055040u, 0x05041050u, 0x05455150u, 0x14414114u, 0x50050550u, 0x41411414u, 0x00141400u, + 0x00041504u, 0x00105410u, 0x10541000u, 0x04150400u, 0x50410514u, 0x41051450u, 0x05415014u, 0x14054150u, + 0x41050514u, 0x41505014u, 0x40011554u, 0x54150140u, 0x50505500u, 0x00555050u, 
0x15151010u, 0x54540404u, + 0xAA685050u, 0x6A5A5040u, 0x5A5A4200u, 0x5450A0A8u, 0xA5A50000u, 0xA0A05050u, 0x5555A0A0u, 0x5A5A5050u, + 0xAA550000u, 0xAA555500u, 0xAAAA5500u, 0x90909090u, 0x94949494u, 0xA4A4A4A4u, 0xA9A59450u, 0x2A0A4250u, + 0xA5945040u, 0x0A425054u, 0xA5A5A500u, 0x55A0A0A0u, 0xA8A85454u, 0x6A6A4040u, 0xA4A45000u, 0x1A1A0500u, + 0x0050A4A4u, 0xAAA59090u, 0x14696914u, 0x69691400u, 0xA08585A0u, 0xAA821414u, 0x50A4A450u, 0x6A5A0200u, + 0xA9A58000u, 0x5090A0A8u, 0xA8A09050u, 0x24242424u, 0x00AA5500u, 0x24924924u, 0x24499224u, 0x50A50A50u, + 0x500AA550u, 0xAAAA4444u, 0x66660000u, 0xA5A0A5A0u, 0x50A050A0u, 0x69286928u, 0x44AAAA44u, 0x66666600u, + 0xAA444444u, 0x54A854A8u, 0x95809580u, 0x96969600u, 0xA85454A8u, 0x80959580u, 0xAA141414u, 0x96960000u, + 0xAAAA1414u, 0xA05050A0u, 0xA0A5A5A0u, 0x96000000u, 0x40804080u, 0xA9A8A9A8u, 0xAAAAAA44u, 0x2A4A5254u + }; + + return gather_uint32(pattern_table, part_id); +} + +CGV_IMAGE icmp_get_pca_bound(CGV_IMAGE covar[10], CGU_CHANNEL channels) +{ + uniform __constant CGV_TYPEINT powerIterations = 4; // quite approximative, but enough for bounding + + CGV_IMAGE inv_var = 1.0 / (256 * 256); + for (CGU_INT k = 0; k < 10; k++) + { + covar[k] *= inv_var; + } + + CGV_IMAGE eps = sq_image(0.001); + covar[0] += eps; + covar[4] += eps; + covar[7] += eps; + + CGV_IMAGE axis[4]; + icmp_compute_axis(axis, covar, powerIterations, channels); + + CGV_IMAGE vec[4]; + if (channels == 3) icmp_ssymv3(vec, covar, axis); + if (channels == 4) icmp_ssymv4_2(vec, covar, axis); + + CGV_IMAGE sq_sum = 0.0f; + for (CGU_INT p = 0; p < channels; p++) sq_sum += sq_image(vec[p]); + CGV_IMAGE lambda = sqrt(sq_sum); + + CGV_IMAGE bound = covar[0] + covar[4] + covar[7]; + if (channels == 4) bound += covar[9]; + bound -= lambda; + bound = maxImage(bound, 0.0f); + + return bound; +} + +CGV_IMAGE icmp_block_pca_bound_split2(CGV_IMAGE image_src[64], CGV_MASK mask, CGV_IMAGE full_stats[15], CGU_CHANNEL channels) +{ + CGV_IMAGE stats[15]; + 
icmp_compute_stats_masked(stats, image_src, mask, channels); + + CGV_IMAGE covar1[10]; + icmp_covar_from_stats(covar1, stats, channels); + + for (CGU_INT i = 0; i < 15; i++) + stats[i] = full_stats[i] - stats[i]; + + CGV_IMAGE covar2[10]; + icmp_covar_from_stats(covar2, stats, channels); + + CGV_IMAGE bound = 0.0f; + bound += icmp_get_pca_bound(covar1, channels); + bound += icmp_get_pca_bound(covar2, channels); + + return sqrt(bound) * 256; +} + + +#ifdef USE_VARYING +INLINE void scatter_partid(varying CGV_PARTID* uniform ptr, CGV_TYPEINT idx, CGV_PARTID value) +{ + ptr[idx] = value; // (perf warning expected) +} +#endif + +void icmp_sort_partlist(CGV_PARTID list[], CGU_INT length, CGU_INT partial_count) +{ + for (CGU_INT k = 0; k < partial_count; k++) + { + CGV_TYPEINT best_idx = k; + CGV_PARTID best_value = list[k]; + for (CGU_INT i = k + 1; i < length; i++) + { + if (best_value > list[i]) + { + best_value = list[i]; + best_idx = i; + } + } + + // swap +#ifdef USE_VARYING + scatter_partid(list, best_idx, list[k]); +#else + list[best_idx] = list[k]; +#endif + list[k] = best_value; + } +} + +INLINE void copy_epocode(CGV_EPOCODE u[], CGV_EPOCODE v[], CGU_INT n) +{ + for (CGU_INT i = 0; i < n; i++) + { + u[i] = v[i]; + } +} + + +INLINE void copy_indexpacked(CGV_INDEXPACKED u[], CGV_INDEXPACKED v[], CGU_INT n) +{ + for (CGU_INT i = 0; i < n; i++) + { + u[i] = v[i]; + } +} + + +void icmp_enc_mode4_candidate( + BC7_EncodeState EncodeState[], + cmp_mode_parameters best_candidate[], + CGV_ERROR best_err[], + CGU_INT rotated_channel, + CGU_INT idxMode) +{ + CGU_INT bits = 2; + CGU_INT abits = 3; + CGU_INT aepbits = 6; + + if (idxMode == 1) + { + bits = 3; + abits = 2; + } + + CGV_IMAGE src_block[48]; + for (CGU_INT k = 0; k < 16; k++) + { + for (CGU_INT p = 0; p < 3; p++) + src_block[k + p * 16] = EncodeState->image_src[k + p * 16]; + + if (rotated_channel < 3) + { + // apply channel rotation + if (EncodeState->channels == 4) src_block[k + rotated_channel * 16] = 
EncodeState->image_src[k + 3 * 16]; + if (EncodeState->channels == 3) src_block[k + rotated_channel * 16] = 255; + } + } + + CGV_IMAGE block_endpoints[8]; + CGV_INDEXPACKED color_index[2]; + CGV_INDEX c_index[MAX_SUBSET_SIZE]; + CGV_EPOCODE color_qendpoint[8]; + + icmp_get_block_endpoints(block_endpoints, src_block, -1, 3); + icmp_ep_quant_dequant_mode4(color_qendpoint, block_endpoints); + CGV_ERROR err = icmp_GetQuantizeIndex(color_index, c_index, src_block, bits, block_endpoints, 0, 3); + + // refine + CGU_INT refineIterations = EncodeState->refineIterations; + for (CGU_INT i = 0; i < refineIterations; i++) + { + icmp_opt_endpoints(block_endpoints, src_block, bits, color_index, -1, 3); + icmp_ep_quant_dequant_mode4(color_qendpoint, block_endpoints); + err = icmp_GetQuantizeIndex(color_index, c_index, src_block, bits, block_endpoints, 0, 3); + } + + // encoding selected channel + CGV_EPOCODE alpha_qendpoint[2]; + CGV_INDEXPACKED alpha_index[2]; + CGV_INDEX a_index[MAX_SUBSET_SIZE]; + err += icmp_optQuantizeIndex(EncodeState, alpha_index, a_index, alpha_qendpoint, &EncodeState->image_src[rotated_channel * 16], abits, aepbits); + + if (err < *best_err) + { + copy_epocode(best_candidate->color_qendpoint, color_qendpoint, 8); + copy_epocode(best_candidate->alpha_qendpoint, alpha_qendpoint, 2); + copy_indexpacked(best_candidate->best_color_index, color_index, 2); + copy_indexpacked(best_candidate->best_alpha_index, alpha_index, 2); + best_candidate->rotated_channel = rotated_channel; + best_candidate->idxMode = idxMode; + *best_err = err; + } +} + +void icmp_mode5_candidate( + BC7_EncodeState EncodeState[], + cmp_mode_parameters best_candidate[], + CGV_ERROR best_err[], + CGU_INT rotated_channel) +{ + CGU_INT bits = 2; + CGU_INT abits = 2; + CGU_INT aepbits = 8; + + CGV_IMAGE block[48]; + for (CGU_INT k = 0; k < 16; k++) + { + for (CGU_INT p = 0; p < 3; p++) + block[k + p * 16] = EncodeState->image_src[k + p * 16]; + + if (rotated_channel < 3) + { + // apply channel 
rotation + if (EncodeState->channels == 4) block[k + rotated_channel * 16] = EncodeState->image_src[k + 3 * 16]; + if (EncodeState->channels == 3) block[k + rotated_channel * 16] = 255; + } + } + + CGV_IMAGE block_endpoints[8]; + CGV_EPOCODE color_qendpoint[8]; + CGV_INDEXPACKED color_index[2]; + CGV_INDEX c_index[MAX_SUBSET_SIZE]; + + icmp_get_block_endpoints(block_endpoints, block, -1, 3); + icmp_GetQuantizedEpoCode(color_qendpoint, block_endpoints, 5, 3); + CGV_ERROR err = icmp_GetQuantizeIndex(color_index, c_index, block, bits, block_endpoints, 0, 3); + + // refine + CGU_INT refineIterations = EncodeState->refineIterations; + for (CGU_INT i = 0; i < refineIterations; i++) + { + icmp_opt_endpoints(block_endpoints, block, bits, color_index, -1, 3); + icmp_GetQuantizedEpoCode(color_qendpoint, block_endpoints, 5, 3); + err = icmp_GetQuantizeIndex(color_index, c_index, block, bits, block_endpoints, 0, 3); + } + + // encoding selected channel + CGV_EPOCODE alpha_qendpoint[2]; + CGV_INDEXPACKED alpha_index[2]; + CGV_INDEX a_index[MAX_SUBSET_SIZE]; + err += icmp_optQuantizeIndex(EncodeState, alpha_index, a_index, alpha_qendpoint, &EncodeState->image_src[rotated_channel * 16], abits, aepbits); + + if (err < *best_err) + { + + icmp_swap_epocode(best_candidate->color_qendpoint, color_qendpoint, 8); + icmp_swap_indexpacked(best_candidate->best_color_index, color_index, 2); + icmp_swap_epocode(best_candidate->alpha_qendpoint, alpha_qendpoint, 2); + icmp_swap_indexpacked(best_candidate->best_alpha_index, alpha_index, 2); + best_candidate->rotated_channel = rotated_channel; + *best_err = err; + } +} + + +// =============== Mode Compression + +CGV_ERROR icmp_enc_mode01237_part_fast( + CGV_EPOCODE qep[24], + CGV_INDEXPACKED color_index[2], + CGV_INDEX index[MAX_SUBSET_SIZE], + CGV_IMAGE image_src[64], + CGV_PARTID part_id, + CGU_INT blockMode) +{ + CGV_SHIFT32 pattern = icmp_get_pattern2(part_id); + CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3; + CGU_INT 
maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3; + CGU_CHANNEL channels = 3; if (blockMode == 7) channels = 4; + + CGV_IMAGE block_endpoints[24]; + for (CGU_INT subset = 0; subset < maxSubSets; subset++) + { + CGV_MASK partition_mask = icmp_get_partition_mask(part_id, subset); + icmp_get_block_endpoints(&block_endpoints[subset * 8], image_src, partition_mask, channels); + } + + icmp_GetQuantizedEpoCode(qep, block_endpoints, blockMode, channels); + CGV_ERROR total_err = icmp_GetQuantizeIndex(color_index, index, image_src, bits, block_endpoints, pattern, channels); + + return total_err; +} + +void icmp_enc_mode01237(BC7_EncodeState EncodeState[], CGU_INT blockMode, CGV_PARTID part_list[], CGU_INT part_count) +{ + if (part_count == 0) return; + CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3; + CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3; + CGU_CHANNEL channels = 3; if (blockMode == 7) channels = 4; + + CGV_EPOCODE best_qep[24]; + CGV_INDEXPACKED best_endpoint[2]; + CGV_PARTID best_part_id = -1; + CGV_ERROR best_err = 1e99; + + for (CGU_INT part = 0; part < part_count; part++) + { + CGV_PARTID part_id = part_list[part] & 63; + if (maxSubSets == 3) part_id += 64; + + CGV_EPOCODE qep[24]; + CGV_INDEXPACKED color_index[2]; + CGV_INDEX index[MAX_SUBSET_SIZE]; + CGV_ERROR err = icmp_enc_mode01237_part_fast(qep, color_index, index, EncodeState->image_src, part_id, blockMode); + + if (err < best_err) + { + for (CGU_INT subset = 0; subset < 8 * maxSubSets; subset++) best_qep[subset] = qep[subset]; + for (CGU_INT k = 0; k < 2; k++) best_endpoint[k] = color_index[k]; + best_part_id = part_id; + best_err = err; + } + } + + // refine + CGU_INT refineIterations = EncodeState->refineIterations; + for (CGU_INT _i = 0; _i < refineIterations; _i++) + { + CGV_IMAGE ep[24]; + for (CGU_INT subset = 0; subset < maxSubSets; subset++) + { + CGV_SHIFT32 partition_mask = icmp_get_partition_mask(best_part_id, subset); + 
icmp_opt_endpoints(&ep[subset * 8], EncodeState->image_src, bits, best_endpoint, partition_mask, channels); + } + + CGV_EPOCODE qep[24]; + CGV_INDEXPACKED color_index[2]; + CGV_INDEX index[MAX_SUBSET_SIZE]; + + icmp_GetQuantizedEpoCode(qep, ep, blockMode, channels); + + CGV_SHIFT32 pattern = icmp_get_pattern2(best_part_id); + CGV_ERROR err = icmp_GetQuantizeIndex(color_index, index, EncodeState->image_src, bits, ep, pattern, channels); + + if (err < best_err) + { + for (CGU_INT subset = 0; subset < 8 * maxSubSets; subset++) best_qep[subset] = qep[subset]; + for (CGU_INT k = 0; k < 2; k++) best_endpoint[k] = color_index[k]; + best_err = err; + } + } + + if (blockMode != 7) best_err += EncodeState->opaque_err; // take into account alpha channel + + if (best_err < EncodeState->best_err) + { + EncodeState->best_err = best_err; + icmp_encode_mode01237(EncodeState->best_cmp_out, best_qep, best_endpoint, best_part_id, blockMode); + } +} + +void icmp_mode5(BC7_EncodeState EncodeState[]) +{ + cmp_mode_parameters best_candidate; + CGV_ERROR best_err = EncodeState->best_err; + +#ifdef ASPM_GPU + cmp_memsetBC7((CGV_BYTE *)&best_candidate, 0, sizeof(cmp_mode_parameters)); +#else + memset(&best_candidate, 0, sizeof(cmp_mode_parameters)); +#endif + + for (CGU_CHANNEL ch = 0; ch < EncodeState->channels; ch++) + { + icmp_mode5_candidate(EncodeState, &best_candidate, &best_err, ch); + } + + if (best_err < EncodeState->best_err) + { + EncodeState->best_err = best_err; + EncodeState->cmp_isout16Bytes = FALSE; + icmp_Encode_mode5(EncodeState->best_cmp_out, &best_candidate); + } +} + +void icmp_mode6(BC7_EncodeState EncodeState[]) +{ + CGV_IMAGE block_endpoints[8]; + icmp_get_block_endpoints(block_endpoints, EncodeState->image_src, -1, 4); + + CGV_EPOCODE epo_code[8]; + icmp_GetQuantizedEpoCode(epo_code, block_endpoints, 6, 4); + + CGV_INDEXPACKED color_index[2]; + CGV_INDEX index[MAX_SUBSET_SIZE]; + CGV_ERROR err = icmp_GetQuantizeIndex(color_index, index, EncodeState->image_src, 4, 
block_endpoints, 0, 4); + + // refine + CGU_INT refineIterations = EncodeState->refineIterations; + for (CGU_INT i = 0; i < refineIterations; i++) + { + icmp_opt_endpoints(block_endpoints, EncodeState->image_src, 4, color_index, -1, 4); + icmp_GetQuantizedEpoCode(epo_code, block_endpoints, 6, EncodeState->channels); + err = icmp_GetQuantizeIndex(color_index, index, EncodeState->image_src, 4, block_endpoints, 0, 4); + } + + if (err < EncodeState->best_err) + { + EncodeState->best_err = err; + EncodeState->cmp_isout16Bytes = FALSE; + icmp_encode_mode6(EncodeState->best_cmp_out, epo_code, color_index); + } +} + +void icmp_mode02(BC7_EncodeState EncodeState[]) +{ + CGV_PARTID part_list[64]; + for (CGU_INT part = 0; part < 64; part++) + part_list[part] = part; + + if (EncodeState->validModeMask & 0x01) + icmp_enc_mode01237(EncodeState, 0, part_list, 16); + if (EncodeState->validModeMask & 0x04) + icmp_enc_mode01237(EncodeState, 2, part_list, 64); // usually not worth the time +} + +void icmp_mode7(BC7_EncodeState EncodeState[]) +{ + CGV_IMAGE full_stats[15]; + icmp_compute_stats_masked(full_stats, EncodeState->image_src, -1, EncodeState->channels); + + CGV_PARTID part_list[64]; + for (CGU_INT part = 0; part < 64; part++) + { + CGV_MASK partition_mask = icmp_get_partition_mask(part + 0, 0); + CGV_IMAGE bound12 = icmp_block_pca_bound_split2(EncodeState->image_src, partition_mask, full_stats, EncodeState->channels); + CGV_PARTID bound = (CGV_TYPEINT)(bound12); + part_list[part] = part + bound * 64; + } + + icmp_sort_partlist(part_list, 64, EncodeState->part_count); + icmp_enc_mode01237(EncodeState, 7, part_list, EncodeState->part_count); +} + +void icmp_mode13(BC7_EncodeState EncodeState[]) +{ + CGV_IMAGE full_stats[15]; + icmp_compute_stats_masked(full_stats, EncodeState->image_src, -1, 3); + + CGV_PARTID part_list[64]; + for (CGU_INT part = 0; part < 64; part++) + { + CGV_MASK partition_mask = icmp_get_partition_mask(part + 0, 0); + CGV_IMAGE bound12 = 
icmp_block_pca_bound_split2(EncodeState->image_src, partition_mask, full_stats, 3); + CGV_PARTID bound = (CGV_TYPEINT)(bound12); + part_list[part] = part + bound * 64; + } + + icmp_sort_partlist(part_list, 64, EncodeState->part_count); + + if (EncodeState->validModeMask & 0x02) + icmp_enc_mode01237(EncodeState, 1, part_list, EncodeState->part_count); + if (EncodeState->validModeMask & 0x08) + icmp_enc_mode01237(EncodeState, 3, part_list, EncodeState->part_count); +} + +void icmp_mode4(BC7_EncodeState EncodeState[]) +{ + cmp_mode_parameters best_candidate; + CGV_ERROR best_err = EncodeState->best_err; +#ifdef ASPM_GPU + cmp_memsetBC7((CGV_BYTE *)&best_candidate, 0, sizeof(cmp_mode_parameters)); +#else + memset(&best_candidate, 0, sizeof(cmp_mode_parameters)); +#endif + + for (CGU_CHANNEL rotated_channel = 0; rotated_channel < EncodeState->channels; rotated_channel++) + { + icmp_enc_mode4_candidate(EncodeState, &best_candidate, &best_err, rotated_channel, 0); + icmp_enc_mode4_candidate(EncodeState, &best_candidate, &best_err, rotated_channel, 1); + } + + // mode 4 + if (best_err < EncodeState->best_err) + { + EncodeState->best_err = best_err; + icmp_encode_mode4(EncodeState->best_cmp_out, &best_candidate); + } +} + +#endif +//===================================== COMPRESS CODE ============================================= + +bool notValidBlockForMode( + CGU_UINT32 blockMode, + CGU_BOOL blockNeedsAlpha, + CGU_BOOL blockAlphaZeroOne, + uniform CMP_GLOBAL BC7_Encode u_BC7Encode[]) +{ + // Do we need to skip alpha processing blocks + if((blockNeedsAlpha == FALSE) && (blockMode > 3)) + { + return TRUE; + } + + // Optional restriction for colour-only blocks so that they + // don't use modes that have combined colour+alpha - this + // avoids the possibility that the encoder might choose an + // alpha other than 1.0 (due to parity) and cause something to + // become accidentally slightly transparent (it's possible that + // when encoding 3-component texture applications will 
assume that + // the 4th component can safely be assumed to be 1.0 all the time) + if ((blockNeedsAlpha == FALSE) && + (u_BC7Encode->colourRestrict == TRUE) && + ((blockMode == 6)||(blockMode == 7))) // COMBINED_ALPHA + { + return TRUE; + } + + // Optional restriction for blocks with alpha to avoid issues with + // punch-through or thresholded alpha encoding + if((blockNeedsAlpha == TRUE) && + (u_BC7Encode->alphaRestrict == TRUE) && + (blockAlphaZeroOne == TRUE) && + ((blockMode == 6)||(blockMode == 7))) // COMBINED_ALPHA + { + return TRUE; + } + + return FALSE; +} + +void BC7_CompressBlock( + BC7_EncodeState EncodeState[], +uniform CMP_GLOBAL BC7_Encode u_BC7Encode[]) +{ + CGU_BOOL blockNeedsAlpha = FALSE; + CGU_BOOL blockAlphaZeroOne = FALSE; + + CGV_ERROR alpha_err = 0.0f; + CGV_IMAGE alpha_min = 255.0F; + + for (CGU_INT k=0; kimage_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] < alpha_min) + alpha_min = EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE]; + + alpha_err += sq_image( EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE]-255.0F); + + if (blockAlphaZeroOne == FALSE) + { + if(( EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] == 255.0F) || + ( EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] == 0.0F)) + { + blockAlphaZeroOne = TRUE; + } + } + } + + if (alpha_min != 255.0F) + { + blockNeedsAlpha = TRUE; + } + + EncodeState->best_err = CMP_FLOAT_MAX; + EncodeState->opaque_err = alpha_err; + +#ifdef USE_ICMP + EncodeState->refineIterations = 4; + EncodeState->fastSkipTreshold = 4; + EncodeState->channels = 4; + EncodeState->part_count = 64; + EncodeState->cmp_isout16Bytes = FALSE; +#else + EncodeState->cmp_isout16Bytes = TRUE; +#endif + + // We change the order in which we visit the block modes to try to maximize the chance + // that we manage to early out as quickly as possible. 
+ // This is a significant performance optimization for the lower quality modes where the + // exit threshold is higher, and also tends to improve quality (as the generally higher quality + // modes are now enumerated earlier, so the first encoding that passes the threshold will + // tend to pass by a greater margin than if we used a dumb ordering, and thus overall error will + // be improved) + CGU_INT blockModeOrder[NUM_BLOCK_TYPES] = {4, 6, 1, 3, 0, 2, 7, 5}; + + for (CGU_INT block=0; block < NUM_BLOCK_TYPES; block++) + { + CGU_INT blockMode = blockModeOrder[block]; + + if (u_BC7Encode->quality < BC7_qFAST_THRESHOLD) + { + if ( notValidBlockForMode(blockMode,blockNeedsAlpha,blockAlphaZeroOne,u_BC7Encode) ) + continue; + } + + CGU_INT Mode = 0x0001 << blockMode; + if (!(u_BC7Encode->validModeMask & Mode)) + continue; + switch (blockMode) + { + // image processing with no alpha + case 0: + #ifdef USE_ICMP + icmp_mode02(EncodeState); + #else + Compress_mode01237(blockMode, EncodeState, u_BC7Encode); + #endif + break; + case 1: + #ifdef USE_ICMP + icmp_mode13(EncodeState); + #else + Compress_mode01237(blockMode, EncodeState, u_BC7Encode); + #endif + break; + case 2: + #ifdef USE_ICMP + icmp_mode13(EncodeState); + #else + Compress_mode01237(blockMode, EncodeState, u_BC7Encode); + #endif + break; + case 3: + #ifdef USE_ICMP + icmp_mode13(EncodeState); + #else + Compress_mode01237(blockMode, EncodeState, u_BC7Encode); + #endif + break; + // image processing with alpha + case 4: + #ifdef USE_ICMP + icmp_mode4(EncodeState); + #else + Compress_mode45(blockMode, EncodeState, u_BC7Encode); + #endif + break; + case 5: + #ifdef USE_ICMP + icmp_mode5(EncodeState); + #else + Compress_mode45(blockMode, EncodeState, u_BC7Encode); + #endif + break; + case 6: + #ifdef USE_ICMP + icmp_mode6(EncodeState); + #else + Compress_mode6( EncodeState, u_BC7Encode); + #endif + break; + case 7: + #ifdef USE_ICMP + icmp_mode7(EncodeState); + #else + Compress_mode01237(blockMode, EncodeState, 
u_BC7Encode); + #endif + break; + } + + // Early out if we found we can compress with error below the quality threshold + if( EncodeState->best_err <= u_BC7Encode->errorThreshold) + { + break; + } + } + +} + +//====================================== BC7_ENCODECLASS END ============================================= + +#ifndef ASPM_GPU + +INLINE void load_block_interleaved_rgba2(CGV_IMAGE image_src[64], uniform texture_surface* uniform src, CGUV_BLOCKWIDTH block_xx, CGU_INT block_yy) +{ + for (CGU_INT y=0; y<4; y++) + for (CGU_INT x=0; x<4; x++) + { + CGU_UINT32 * uniform src_ptr = (CGV_SHIFT32*)&src->ptr[(block_yy*4+y)*src->stride]; +#ifdef USE_VARYING + CGV_SHIFT32 rgba = gather_partid(src_ptr, block_xx*4+x); + image_src[16*0+y*4+x] = (CGV_FLOAT)((rgba>> 0)&255); + image_src[16*1+y*4+x] = (CGV_FLOAT)((rgba>> 8)&255); + image_src[16*2+y*4+x] = (CGV_FLOAT)((rgba>>16)&255); + image_src[16*3+y*4+x] = (CGV_FLOAT)((rgba>>24)&255); +#else + CGV_SHIFT32 rgba = src_ptr[block_xx*4+x]; + image_src[16*0+y*4+x] = (CGU_FLOAT)((rgba>> 0)&255); + image_src[16*1+y*4+x] = (CGU_FLOAT)((rgba>> 8)&255); + image_src[16*2+y*4+x] = (CGU_FLOAT)((rgba>>16)&255); + image_src[16*3+y*4+x] = (CGU_FLOAT)((rgba>>24)&255); +#endif + } +} + + +#if defined(CMP_USE_FOREACH_ASPM) || defined(USE_VARYING) +INLINE void scatter_uint2(CGU_UINT32 * ptr, CGUV_BLOCKWIDTH idx, CGV_SHIFT32 value) +{ + ptr[idx] = value; // (perf warning expected) +} +#endif + +INLINE void store_data_uint32(CGU_UINT8 dst[], CGU_INT width, CGUV_BLOCKWIDTH v_xx, CGU_INT yy, CGV_SHIFT32 data[], CGU_INT data_size) +{ + for (CGU_INT k=0; kimage_src,u_srcptr, block_x, block_y); + + BC7_CompressBlock(state, u_settings); + + if (state->cmp_isout16Bytes) + store_data_uint8(u_dst, u_srcptr->width, block_x, block_y, state->cmp_out, 16); + else + store_data_uint32(u_dst, u_srcptr->width, block_x, block_y, state->best_cmp_out, 4); + +} + + CMP_EXPORT void CompressBlockBC7_encode( uniform texture_surface src[], CGU_UINT8 dst[], uniform 
BC7_Encode settings[]) +{ + // bc7_isa(); ASPM_PRINT(("ASPM encode [%d,%d]\n",bc7_isa(),src->width,src->height)); + + for (CGU_INT u_yy = 0; u_yyheight/4; u_yy++) + #ifdef CMP_USE_FOREACH_ASPM + foreach (v_xx = 0 ... src->width/4) + { + #else + for (CGUV_BLOCKWIDTH v_xx = 0; v_xxwidth/4; v_xx++) + { + #endif + CompressBlockBC7_XY(src, v_xx, u_yy, dst, settings); + } +} + +#endif + +#ifndef ASPM_GPU +#ifndef ASPM +//======================= DECOMPRESS ========================================= +#ifndef USE_HIGH_PRECISION_INTERPOLATION_BC7 +CGU_UINT16 aWeight2[] = { 0, 21, 43, 64 }; +CGU_UINT16 aWeight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 }; +CGU_UINT16 aWeight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; + +CGU_UINT8 interpolate(CGU_UINT8 e0, CGU_UINT8 e1, CGU_UINT8 index, CGU_UINT8 indexprecision) +{ + if (indexprecision == 2) + return (CGU_UINT8)(((64 - aWeight2[index])*CGU_UINT16(e0) + aWeight2[index] * CGU_UINT16(e1) + 32) >> 6); + else if (indexprecision == 3) + return (CGU_UINT8)(((64 - aWeight3[index])*CGU_UINT16(e0) + aWeight3[index] * CGU_UINT16(e1) + 32) >> 6); + else // indexprecision == 4 + return (CGU_UINT8)(((64 - aWeight4[index])*CGU_UINT16(e0) + aWeight4[index] * CGU_UINT16(e1) + 32) >> 6); +} +#endif + +void GetBC7Ramp(CGU_UINT32 endpoint[][MAX_DIMENSION_BIG], + CGU_FLOAT ramp[MAX_DIMENSION_BIG][(1<> componentBits[i]); + ep[1][i] += (CGU_UINT32)(ep[1][i] >> componentBits[i]); + + ep[0][i] = min8(255, max8(0, static_cast(ep[0][i]))); + ep[1][i] = min8(255, max8(0, static_cast(ep[1][i]))); + } + } + + // If this block type has no explicit alpha channel + // then make sure alpha is 1.0 for all points on the ramp + if(!componentBits[COMP_ALPHA]) + { + ep[0][COMP_ALPHA] = ep[1][COMP_ALPHA] = 255; + } + + CGU_UINT32 rampIndex = clusters[0]; + + rampIndex = (CGU_UINT32)(log((double)rampIndex) / log(2.0)); + + // Generate colours for the RGB ramp + for(i=0; i < clusters[0]; i++) + { +#ifdef USE_HIGH_PRECISION_INTERPOLATION_BC7 + 
ramp[COMP_RED][i] = (CGU_FLOAT)floor((ep[0][COMP_RED] * (1.0-rampLerpWeightsBC7[rampIndex][i])) + + (ep[1][COMP_RED] * rampLerpWeightsBC7[rampIndex][i]) + 0.5); + ramp[COMP_RED][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_RED][i])); + ramp[COMP_GREEN][i] = (CGU_FLOAT)floor((ep[0][COMP_GREEN] * (1.0-rampLerpWeightsBC7[rampIndex][i])) + + (ep[1][COMP_GREEN] * rampLerpWeightsBC7[rampIndex][i]) + 0.5); + ramp[COMP_GREEN][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_GREEN][i])); + ramp[COMP_BLUE][i] = (CGU_FLOAT)floor((ep[0][COMP_BLUE] * (1.0-rampLerpWeightsBC7[rampIndex][i])) + + (ep[1][COMP_BLUE] * rampLerpWeightsBC7[rampIndex][i]) + 0.5); + ramp[COMP_BLUE][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_BLUE][i])); +#else + ramp[COMP_RED][i] = interpolate(ep[0][COMP_RED], ep[1][COMP_RED], i, rampIndex); + ramp[COMP_GREEN][i] = interpolate(ep[0][COMP_GREEN], ep[1][COMP_GREEN], i, rampIndex); + ramp[COMP_BLUE][i] = interpolate(ep[0][COMP_BLUE], ep[1][COMP_BLUE], i, rampIndex); +#endif + } + + + rampIndex = clusters[1]; + rampIndex = (CGU_UINT32)(log((CGU_FLOAT)rampIndex) / log(2.0)); + + if(!componentBits[COMP_ALPHA]) + { + for(i=0; i < clusters[1]; i++) + { + ramp[COMP_ALPHA][i] = 255.; + } + } + else + { + + // Generate alphas + for(i=0; i < clusters[1]; i++) + { +#ifdef USE_HIGH_PRECISION_INTERPOLATION_BC7 + ramp[COMP_ALPHA][i] = (CGU_FLOAT)floor((ep[0][COMP_ALPHA] * (1.0-rampLerpWeightsBC7[rampIndex][i])) + + (ep[1][COMP_ALPHA] * rampLerpWeightsBC7[rampIndex][i]) + 0.5); + ramp[COMP_ALPHA][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_ALPHA][i])); +#else + ramp[COMP_ALPHA][i] = interpolate(ep[0][COMP_ALPHA], ep[1][COMP_ALPHA], i, rampIndex); +#endif + } + + } +} + +// +// Bit reader - reads one bit from a buffer at the current bit offset +// and increments the offset +// + +CGU_UINT32 ReadBit(const CGU_UINT8 base[],CGU_UINT32 &m_bitPosition) +{ + int byteLocation; + int remainder; + CGU_UINT32 bit = 0; + byteLocation = m_bitPosition/8; + remainder = m_bitPosition % 8; + + 
bit = base[byteLocation]; + bit >>= remainder; + bit &= 0x1; + // Increment bit position + m_bitPosition++; + return (bit); +} + +void DecompressDualIndexBlock( + CGU_UINT8 out[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG], + const CGU_UINT8 in[COMPRESSED_BLOCK_SIZE], + CGU_UINT32 endpoint[2][MAX_DIMENSION_BIG], + CGU_UINT32 &m_bitPosition, + CGU_UINT32 m_rotation, + CGU_UINT32 m_blockMode, + CGU_UINT32 m_indexSwap, + CGU_UINT32 m_componentBits[MAX_DIMENSION_BIG]) +{ + CGU_UINT32 i, j, k; + CGU_FLOAT ramp[MAX_DIMENSION_BIG][1<(bti[m_blockMode].indexBits[i] - 1); k++) + { + blockIndices[i][j] |= (CGU_UINT32)ReadBit(in,m_bitPosition) << k; + } + } + else + { + for(k=0;k 7) + { + // Something really bad happened... + return; + } + + for (i = 0; i < bti[m_blockMode].rotationBits; i++) + { + m_rotation |= ReadBit(in, m_bitPosition) << i; + } + for (i = 0; i < bti[m_blockMode].indexModeBits; i++) + { + m_indexSwap |= ReadBit(in, m_bitPosition) << i; + } + + for (i = 0; i < bti[m_blockMode].partitionBits; i++) + { + m_partition |= ReadBit(in, m_bitPosition) << i; + } + + + + if (bti[m_blockMode].encodingType == NO_ALPHA) + { + m_componentBits[COMP_ALPHA] = 0; + m_componentBits[COMP_RED] = + m_componentBits[COMP_GREEN] = + m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 3; + } + else if (bti[m_blockMode].encodingType == COMBINED_ALPHA) + { + m_componentBits[COMP_ALPHA] = + m_componentBits[COMP_RED] = + m_componentBits[COMP_GREEN] = + m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 4; + } + else if (bti[m_blockMode].encodingType == SEPARATE_ALPHA) + { + m_componentBits[COMP_ALPHA] = bti[m_blockMode].scalarBits; + m_componentBits[COMP_RED] = + m_componentBits[COMP_GREEN] = + m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 3; + } + + CGU_UINT32 subset, ep, component; + // Endpoints are stored in the following order RRRR GGGG BBBB (AAAA) (PPPP) + // i.e. 
components are packed together + // Loop over components + for (component = 0; component < MAX_DIMENSION_BIG; component++) + { + // loop over subsets + for (subset = 0; subset < (int)bti[m_blockMode].subsetCount; subset++) + { + // Loop over endpoints + for (ep = 0; ep < 2; ep++) + { + endpoint[subset][ep][component] = 0; + for (j = 0; j < m_componentBits[component]; j++) + { + endpoint[subset][ep][component] |= ReadBit(in, m_bitPosition) << j; + } + } + } + } + + + // Now get any parity bits + if (bti[m_blockMode].pBitType != NO_PBIT) + { + for (subset = 0; subset < (int)bti[m_blockMode].subsetCount; subset++) + { + CGU_UINT32 pBit[2]; + if (bti[m_blockMode].pBitType == ONE_PBIT) + { + pBit[0] = ReadBit(in, m_bitPosition); + pBit[1] = pBit[0]; + } + else if (bti[m_blockMode].pBitType == TWO_PBIT) + { + pBit[0] = ReadBit(in, m_bitPosition); + pBit[1] = ReadBit(in, m_bitPosition); + } + + for (component = 0; component < MAX_DIMENSION_BIG; component++) + { + if (m_componentBits[component]) + { + endpoint[subset][0][component] <<= 1; + endpoint[subset][1][component] <<= 1; + endpoint[subset][0][component] |= pBit[0]; + endpoint[subset][1][component] |= pBit[1]; + } + } + } + } + + if (bti[m_blockMode].pBitType != NO_PBIT) + { + // Now that we've unpacked the parity bits, update the component size information + // for the ramp generator + for (j = 0; j < MAX_DIMENSION_BIG; j++) + { + if (m_componentBits[j]) + { + m_componentBits[j] += 1; + } + } + } + + // If this block has two independent sets of indices then put it to that decoder + if (bti[m_blockMode].encodingType == SEPARATE_ALPHA) + { + DecompressDualIndexBlock(out, in, endpoint[0], m_bitPosition, m_rotation, m_blockMode, m_indexSwap, m_componentBits); + return; + } + + CGU_UINT32 fixup[MAX_SUBSETS] = { 0, 0, 0 }; + switch (bti[m_blockMode].subsetCount) + { + case 3: + fixup[1] = BC7_FIXUPINDICES_LOCAL[2][m_partition][1]; + fixup[2] = BC7_FIXUPINDICES_LOCAL[2][m_partition][2]; + break; + case 2: + fixup[1] = 
BC7_FIXUPINDICES_LOCAL[1][m_partition][1]; + break; + default: + break; + } + + //-------------------------------------------------------------------- + // New Code : Possible replacement for BC7_PARTITIONS for CPU code + //-------------------------------------------------------------------- + // Extract index bits + // for (i = 0; i < MAX_SUBSET_SIZE; i++) + // { + // CGV_UINT8 p = get_partition_subset(m_partition, bti[m_blockMode].subsetCount - 1, i); + // //CGU_UINT32 p = partitionTable[i]; + // blockIndices[i] = 0; + // CGU_UINT32 bitsToRead = bti[m_blockMode].indexBits[0]; + // + // // If this is a fixup index then set the implicit bit + // if (i == fixup[p]) + // { + // blockIndices[i] &= ~(1 << (bitsToRead - 1)); + // bitsToRead--; + // } + // + // for (j = 0; j < bitsToRead; j++) + // { + // blockIndices[i] |= ReadBit(in, m_bitPosition) << j; + // } + // } + CGU_BYTE *partitionTable = (CGU_BYTE*)BC7_PARTITIONS[bti[m_blockMode].subsetCount-1][m_partition]; + + // Extract index bits + for(i=0; i < MAX_SUBSET_SIZE; i++) + { + CGU_BYTE p = partitionTable[i]; + blockIndices[i] = 0; + CGU_BYTE bitsToRead = bti[m_blockMode].indexBits[0]; + + // If this is a fixup index then set the implicit bit + if(i==fixup[p]) + { + blockIndices[i] &= ~(1 << (bitsToRead-1)); + bitsToRead--; + } + + for(j=0;jimage_src[offsetR++] = (CGV_IMAGE)image_src[i][0]; + state->image_src[offsetG++] = (CGV_IMAGE)image_src[i][1]; + state->image_src[offsetB++] = (CGV_IMAGE)image_src[i][2]; + state->image_src[offsetA++] = (CGV_IMAGE)image_src[i][3]; + } + + BC7_CompressBlock(state, u_BC7Encode); + + if (state->cmp_isout16Bytes) + { + for (CGU_UINT8 i = 0; i < COMPRESSED_BLOCK_SIZE; i++) + { + cmp_out[i] = state->cmp_out[i]; + } + } + else + { +#ifdef ASPM_GPU + cmp_memcpy(cmp_out, (CGU_UINT8 *)state->best_cmp_out, 16); +#else + memcpy(cmp_out, state->best_cmp_out, 16); +#endif + } +} + +//======================= CPU USER INTERFACES ==================================== + +int CMP_CDECL 
CreateOptionsBC7(void **options) +{ + (*options) = new BC7_Encode; + if (!options) return CGU_CORE_ERR_NEWMEM; + init_BC7ramps(); + SetDefaultBC7Options((BC7_Encode *)(*options)); + return CGU_CORE_OK; +} + +int CMP_CDECL DestroyOptionsBC7(void *options) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + BC7_Encode *BCOptions = reinterpret_cast (options); + delete BCOptions; + return CGU_CORE_OK; +} + +int CMP_CDECL SetErrorThresholdBC7(void *options, CGU_FLOAT minThreshold, CGU_FLOAT maxThreshold) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + BC7_Encode *BC7optionsDefault = (BC7_Encode *)options; + + if (minThreshold < 0.0f) minThreshold = 0.0f; + if (maxThreshold < 0.0f) maxThreshold = 0.0f; + + BC7optionsDefault->minThreshold = minThreshold; + BC7optionsDefault->maxThreshold = maxThreshold; + return CGU_CORE_OK; +} + +int CMP_CDECL SetQualityBC7(void *options, CGU_FLOAT fquality) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + + BC7_Encode *BC7optionsDefault = (BC7_Encode *)options; + if (fquality < 0.0f) fquality = 0.0f; + else + if (fquality > 1.0f) fquality = 1.0f; + BC7optionsDefault->quality = fquality; + + // Set Error Thresholds + BC7optionsDefault->errorThreshold = BC7optionsDefault->maxThreshold * (1.0f - fquality); + if(fquality > BC7_qFAST_THRESHOLD) + BC7optionsDefault->errorThreshold += BC7optionsDefault->minThreshold; + + return CGU_CORE_OK; +} + +int CMP_CDECL SetMaskBC7(void *options, CGU_UINT8 mask) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + BC7_Encode *BC7options = (BC7_Encode *)options; + BC7options->validModeMask = mask; + return CGU_CORE_OK; +} + +int CMP_CDECL SetAlphaOptionsBC7(void *options, CGU_BOOL imageNeedsAlpha, CGU_BOOL colourRestrict, CGU_BOOL alphaRestrict) +{ + if (!options) return CGU_CORE_ERR_INVALIDPTR; + BC7_Encode *u_BC7Encode = (BC7_Encode *)options; + u_BC7Encode->imageNeedsAlpha = imageNeedsAlpha; + u_BC7Encode->colourRestrict = colourRestrict; + u_BC7Encode->alphaRestrict = alphaRestrict; + return 
CGU_CORE_OK; +} + +int CMP_CDECL CompressBlockBC7( const unsigned char *srcBlock, + unsigned int srcStrideInBytes, + CMP_GLOBAL unsigned char cmpBlock[16], + const void* options = NULL) +{ + CMP_Vec4uc inBlock[SOURCE_BLOCK_SIZE]; + + //---------------------------------- + // Fill the inBlock with source data + //---------------------------------- + CGU_INT srcpos = 0; + CGU_INT dstptr = 0; + for (CGU_UINT8 row = 0; row < 4; row++) + { + srcpos = row * srcStrideInBytes; + for (CGU_UINT8 col = 0; col < 4; col++) + { + inBlock[dstptr].x = CGU_UINT8(srcBlock[srcpos++]); + inBlock[dstptr].y = CGU_UINT8(srcBlock[srcpos++]); + inBlock[dstptr].z = CGU_UINT8(srcBlock[srcpos++]); + inBlock[dstptr].w = CGU_UINT8(srcBlock[srcpos++]); + dstptr++; + } + } + + + BC7_Encode *u_BC7Encode = (BC7_Encode *)options; + BC7_Encode BC7EncodeDefault = { 0 }; + if (u_BC7Encode == NULL) + { + u_BC7Encode = &BC7EncodeDefault; + SetDefaultBC7Options(u_BC7Encode); + init_BC7ramps(); + } + + BC7_EncodeState EncodeState +#ifndef ASPM + = { 0 } +#endif + ; + EncodeState.best_err = CMP_FLOAT_MAX; + EncodeState.validModeMask = u_BC7Encode->validModeMask; + EncodeState.part_count = u_BC7Encode->part_count; + EncodeState.channels = static_cast(u_BC7Encode->channels); + + CGU_UINT8 offsetR = 0; + CGU_UINT8 offsetG = 16; + CGU_UINT8 offsetB = 32; + CGU_UINT8 offsetA = 48; + CGU_UINT32 offsetSRC = 0; + for (CGU_UINT8 i = 0; i < SOURCE_BLOCK_SIZE; i++) + { + EncodeState.image_src[offsetR++] = (CGV_IMAGE)inBlock[offsetSRC].x; + EncodeState.image_src[offsetG++] = (CGV_IMAGE)inBlock[offsetSRC].y; + EncodeState.image_src[offsetB++] = (CGV_IMAGE)inBlock[offsetSRC].z; + EncodeState.image_src[offsetA++] = (CGV_IMAGE)inBlock[offsetSRC].w; + offsetSRC++; + } + + BC7_CompressBlock(&EncodeState, u_BC7Encode); + + if (EncodeState.cmp_isout16Bytes) + { + for (CGU_UINT8 i = 0; i < COMPRESSED_BLOCK_SIZE; i++) + { + cmpBlock[i] = EncodeState.cmp_out[i]; + } + } + else + { + memcpy(cmpBlock, EncodeState.best_cmp_out, 16); 
+ } + + return CGU_CORE_OK; +} + +int CMP_CDECL DecompressBlockBC7(const unsigned char cmpBlock[16], + unsigned char srcBlock[64], + const void *options = NULL) { + BC7_Encode *u_BC7Encode = (BC7_Encode *)options; + BC7_Encode BC7EncodeDefault = { 0 }; // for q = 0.05 + if (u_BC7Encode == NULL) + { + // set for q = 1.0 + u_BC7Encode = &BC7EncodeDefault; + SetDefaultBC7Options(u_BC7Encode); + init_BC7ramps(); + } + DecompressBC7_internal((CGU_UINT8(*)[4])srcBlock, (CGU_UINT8 *)cmpBlock,u_BC7Encode); + return CGU_CORE_OK; +} +#endif +#endif + +//============================================== OpenCL USER INTERFACE ==================================================== +#ifdef ASPM_GPU +CMP_STATIC CMP_KERNEL void CMP_GPUEncoder(uniform CMP_GLOBAL const CGU_Vec4uc ImageSource[], + CMP_GLOBAL CGV_CMPOUT ImageDestination[], + uniform CMP_GLOBAL Source_Info SourceInfo[], + uniform CMP_GLOBAL BC7_Encode BC7Encode[] ) +{ + CGU_INT xID=0; + CGU_INT yID=0; + + xID = get_global_id(0); // ToDo: Define a size_t 32 bit and 64 bit basd on clGetDeviceInfo + yID = get_global_id(1); + + CGU_INT srcWidth = SourceInfo->m_src_width; + CGU_INT srcHeight = SourceInfo->m_src_height; + if (xID >= (srcWidth / BlockX)) return; + if (yID >= (srcHeight / BlockY)) return; + + CGU_INT destI = (xID*COMPRESSED_BLOCK_SIZE) + (yID*(srcWidth / BlockX)*COMPRESSED_BLOCK_SIZE); + CGU_INT srcindex = 4 * (yID * srcWidth + xID); + CGU_INT blkindex = 0; + BC7_EncodeState EncodeState; + varying BC7_EncodeState* uniform state = &EncodeState; + + copy_BC7_Encode_settings(state, BC7Encode); + + //Check if it is a complete 4X4 block + if (((xID + 1)*BlockX <= srcWidth) && ((yID + 1)*BlockY <= srcHeight)) + { + srcWidth = srcWidth - 4; + for (CGU_INT j = 0; j < 4; j++) { + for (CGU_INT i = 0; i < 4; i++) { + state->image_src[blkindex+0*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].x; + state->image_src[blkindex+1*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].y; + state->image_src[blkindex+2*SOURCE_BLOCK_SIZE] = 
ImageSource[srcindex].z; + state->image_src[blkindex+3*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].w; + blkindex++; + srcindex++; + } + + srcindex += srcWidth; + } + + copy_BC7_Encode_settings(state, BC7Encode); + + BC7_CompressBlock(&EncodeState, BC7Encode); + + for (CGU_INT i=0; icmp_out[i]; + } + + } + else + { + ASPM_PRINT(("[ASPM_GPU] Unable to process, make sure image size is divisible by 4")); + } +} +#endif diff --git a/extern/CMP_Core/shaders/BC7_Encode_Kernel.h b/extern/CMP_Core/shaders/BC7_Encode_Kernel.h new file mode 100644 index 0000000..1a812b9 --- /dev/null +++ b/extern/CMP_Core/shaders/BC7_Encode_Kernel.h @@ -0,0 +1,1580 @@ +//===================================================================== +// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// +//===================================================================== +#ifndef BC7_ENCODE_KERNEL_H +#define BC7_ENCODE_KERNEL_H + +#if defined(ISPC)||defined(ASPM) +//#include "..\..\Common\Common_Def.h" +#include "Common_Def.h" +#else +#include "Common_Def.h" +#endif + +// cmp param uniform data tracking +typedef CGU_UINT8 CGU_CHANNEL; +typedef CGV_UINT32 CGV_SHIFT32; +typedef CGV_UINT8 CGV_BYTE; +typedef CGV_FLOAT CGV_ERROR; +typedef CGV_FLOAT CGV_IMAGE; +typedef CGV_INT CGV_EPOCODE; +typedef CGV_UINT8 CGV_CMPOUT; +typedef CGV_UINT8 CGV_INDEX; +typedef CGV_UINT32 CGV_INDEXPACKED; +typedef CGV_UINT32 CGV_CMPOUTPACKED; +typedef CGV_INT CGV_LEVELS; +typedef CGV_INT CGV_SUBSETS; +typedef CGV_INT CGV_MASK; +typedef CGV_INT CGV_ITTERATIONS; +typedef CGV_INT CGV_PARTID; +typedef CGV_INT CGV_FIXUPINDEX; +typedef CGV_INT CGV_RAMP; +typedef CGV_INT CGV_ENTRIES; +typedef CGV_INT CGV_TYPEINT; +typedef CGV_UINT32 CGV_TYPEUINT32; +typedef CGU_UINT8 CGU_BYTE; +typedef CGV_CMPOUT CGUV_CMPOUT; +typedef CGU_UINT8 CGUV_DSTPTR; + +#define USE_VARYING +#ifdef USE_VARYING +typedef CGV_INT CGUV_BLOCKWIDTH; +#else +typedef CGU_INT CGUV_BLOCKWIDTH; +#endif + +#ifndef ASPM_GPU + +struct cmp_bc7_state +{ + CGV_IMAGE block[16][4]; + CGV_SHIFT32 best_data[4]; +} ; + + + typedef enum + { + CGU_FORMAT_Unknown, // Undefined texture format. + + // Channel Component formats------------------------------------------------------------------------------- + CGU_FORMAT_RGBA_8888, // RGBA format with 8-bit fixed channels. + + // Formats supported by GPU + CGU_FORMAT_BC1, // A four component opaque (or 1-bit alpha) compressed texture format for Microsoft DirectX10. Identical to DXT1. Four bits per pixel. 
+ CGU_FORMAT_BC6H, // BC6H compressed texture format + CGU_FORMAT_BC7, // BC7 compressed texture format + + // Formats supported by CPU + CGU_FORMAT_GTC, // GTC Gradient Texture Compressor + CGU_FORMAT_MAX + } CGU_FORMAT; + + //------------------------------------ + // The structure describing a texture + //------------------------------------ + struct CGU_Texture_Type + { + // Optional Settings + CGU_FLOAT m_fquality; // Minimum resulting quality to maintain while processing the texture, default is 0.05 + CGU_INT8 m_nBlockHeight; // Size of the texture tiles (blocks) to use.during processing + CGU_INT8 m_nBlockWidth; // default = 4 + CGU_INT8 m_nBlockDepth; // default = 1 + + // Required settings + CGU_FORMAT m_format; // Texture format + CGU_UINT32 m_src_width; // Width of the texture. + CGU_UINT32 m_src_height; // Height of the texture. + CGU_UINT32 m_stride; // Number of bytes to start of next line + CGU_UINT32 m_dwDataSize; // Size of the allocated texture data. + CGU_UINT8* m_pData; // Pointer to the texture data + }; + +#endif // End of ASPM_CPU + +#define SOURCE_BLOCK_SIZE 16 // Size of a source block in pixels (each pixel has RGBA:8888 channels) +#define COMPRESSED_BLOCK_SIZE 16 // Size of a compressed block in bytes +#define MAX_CHANNELS 4 +#define MAX_SUBSETS 3 // Maximum number of possible subsets +#define MAX_SUBSET_SIZE 16 // Largest possible size for an individual subset +#define BC7_qFAST_THRESHOLD 0.50f +#define MAX_INDEX_BITS 4 // Maximum number of index bits + +typedef struct +{ + CGV_IMAGE image; + CGV_INDEX index; +} CMP_di; + +typedef struct +{ + CGV_IMAGE image; + CGV_UINT8 index; +} CMP_du; + +#define MAX_PARTITION_ENTRIES 64 + +#define MAX_PARTITIONS_TABLE 193 + +#define MAX_PARTITIONS 64 // Maximum number of partition types + +#define EPSILON 0.00390625f +#define DIMENSION 4 +#define BlockX 4 +#define BlockY 4 +#define QUANT_RT 250.0f // quality = 0.05f 
+//========================================================================================================== +#define LOG_CL_RANGE 5 +#define LOG_CL_BASE 2 +#define BIT_BASE 5 +#define BIT_RANGE 9 +#define MAX_CLUSTERS_BIG 16 +#define MAX_CLUSTERS 8 +#define BTT(bits) (bits-BIT_BASE) +#define CLT(cl) (cl-LOG_CL_BASE) + +#define MAX_TRY_QUANT_TRACE 2 // used in optQuantTrace_d : increasing this has no gain in quality!, keep it set at 2 +#define MAX_TRY_SHAKER 5 // used in ep_shaker_2_d if set at 4 PSNR drops by -0.1 SSIM stays the same +#define NUM_BLOCK_TYPES 8 // Number of block types in the format + +#define BC7_MAX_TRACE 25000 + +// If this is defined, ramp calculation is done via math floor and division. +// Otherwise, ramp calculation is done by bit shifting +#define USE_HIGH_PRECISION_INTERPOLATION_BC7 + +typedef struct +{ + CGU_INT32 k; + CGV_FLOAT d; +} TRACE; + + +typedef struct +#ifdef ASPM +BC7_EncodeState +#endif +{ + CGV_IMAGE image_src[64]; + CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE]; + + // Common + CGV_ERROR opaque_err; // error for coding alpha=255 + CGV_ERROR best_err; + + // set per mode + CGU_CHANNEL channels3or4; + CGU_UINT8 bits; + CGU_INT clusters; + CGU_BYTE componentBits; + CGU_UINT8 numPartitionModes; + CGU_INT maxSubSets; + CGU_UINT8 numClusters0[2]; + CGU_UINT8 numClusters1[2]; + CGU_UINT8 max_idxMode; + CGU_INT modeBits[2]; + CGU_BOOL optimizedQ; + + CGU_UINT32 validModeMask; + CGU_INT part_count; + CGU_CHANNEL channels; + + // use_icmp + CGV_CMPOUTPACKED best_cmp_out[5]; + CGV_BOOL cmp_isout16Bytes; + CGU_INT refineIterations; + CGU_INT fastSkipTreshold; +} +#ifndef ASPM +BC7_EncodeState +#endif +; + +typedef struct +#ifdef ASPM +cmp_mode_parameters +#endif +{ + CGV_EPOCODE color_qendpoint[8]; + CGV_EPOCODE alpha_qendpoint[8]; + CGV_INDEXPACKED best_color_index[2]; + CGV_INDEXPACKED best_alpha_index[2]; + CGV_INDEX color_index[SOURCE_BLOCK_SIZE]; + CGV_INDEX alpha_index[SOURCE_BLOCK_SIZE]; + CGV_SHIFT32 idxMode; + CGV_SHIFT32 
rotated_channel; +} +#ifndef ASPM +cmp_mode_parameters +#endif +; + + +typedef struct +#ifdef ASPM +BC7_Encode +#endif +{ + // Global data setup at initialization time + CGU_FLOAT quality; // range is 0 to 1 + CGU_FLOAT errorThreshold; // use 5 to 75 + CGU_UINT32 validModeMask; // bit for mode masks def to 0xFF + CGU_BOOL imageNeedsAlpha; // default: false + CGU_BOOL colourRestrict; // default: false + CGU_BOOL alphaRestrict; // default: false + + // Used to track errors in internal state code + CGV_ERROR opaque_err; + CGV_ERROR best_err; + + CGU_FLOAT minThreshold; + CGU_FLOAT maxThreshold;; + + // icmp code settings + CGU_INT refineIterations; + CGU_INT part_count; + CGU_INT channels; + +} +#ifndef ASPM +BC7_Encode +#endif +; + +CMP_CONSTANT CGU_FLOAT rampWeights[5][SOURCE_BLOCK_SIZE] = { +{ 0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f}, // 0 bit index +{ 0.000000f,1.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f}, // 1 bit index +{ 0.000000f,0.328125f,0.671875f,1.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f}, // 2 bit index +{ 0.000000f,0.140625f,0.281250f,0.421875f,0.578125f,0.718750f,0.859375f,1.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f}, // 3 bit index +{ 0.000000f,0.062500f,0.140625f,0.203125f,0.265625f,0.328125f,0.406250f,0.468750f,0.531250f,0.593750f,0.671875f,0.734375f,0.796875f,0.859375f,0.937500f,1.000000f} // 4 bit index +}; + +#ifndef ASPM_GPU +typedef struct +#ifdef ASPM +BC7_EncodeRamps +#endif +{ + CGU_INT ep_d[4][256]; +#ifdef USE_BC7_SP_ERR_IDX + CGU_UINT8 sp_err[3*4*256*2*2*16]; + CGU_INT sp_idx[3*4*256*2*2*16*2]; +#endif +#ifdef USE_BC7_RAMP + CGU_FLOAT ramp[3*4*256*256*16]; +#endif + CGU_BOOL ramp_init; 
+} +#ifndef ASPM +BC7_EncodeRamps +#endif +; +#endif + +CMP_CONSTANT CGU_UINT8 npv_nd[2][8] = { + {1,2,4,8,16,32,0,0}, // 3 + {1,2,4,0,0 ,0 ,0,0} // 4 +}; + +typedef enum +{ + NO_ALPHA, + COMBINED_ALPHA, + SEPARATE_ALPHA +} CMP_BCE; + +// Endpoint encoding type +typedef enum +{ + NO_PBIT, + ONE_PBIT, + TWO_PBIT, + THREE_PBIT, + FOUR_PBIT, + FIVE_PBIT +} CMP_PBIT; + +typedef struct +#ifdef ASPM +BC7_Encode_local +#endif +{ + // Data for compressing a particular block mode + CGV_INT clusters[2]; + CGV_BYTE parityBits; + CGV_BYTE componentBits[MAX_CHANNELS]; + + CMP_BCE encodingType; // Type of block + CGU_UINT8 partitionBits; // Number of bits for partition data + CGU_UINT8 rotationBits; // Number of bits for component rotation + CGU_UINT8 indexModeBits; // Number of bits for index selection + CMP_PBIT pBitType; // Type of P-bit encoding + CGU_UINT8 subsetCount; // Number of subsets + CGU_UINT8 indexBits[2]; // Number of bits per index in each index set + + // Bulky temporary data used during compression of a block + CGV_UINT8 storedindex[MAX_PARTITIONS][MAX_SUBSETS][MAX_SUBSET_SIZE]; + CGV_ERROR storedError[MAX_PARTITIONS]; + CGV_UINT8 sortedModes[MAX_PARTITIONS]; + + // This stores the min and max for the components of the block, and the ranges + CGV_IMAGE blockMin[MAX_CHANNELS]; + CGV_IMAGE blockMax[MAX_CHANNELS]; + CGV_IMAGE blockRange[MAX_CHANNELS]; + CGV_IMAGE blockMaxRange; +} +#ifndef ASPM +BC7_Encode_local +#endif +; + + + +typedef enum +{ + CART, + SAME_PAR, + BCC, + SAME_FCC, + FCC, + FCC_SAME_BCC, +} CMP_qt; + +// Block component encoding + + +// Descriptor structure for block encodings +typedef struct +{ + uniform CMP_BCE encodingType; // Type of block + CGU_UINT8 partitionBits; // Number of bits for partition data + CGU_UINT8 rotationBits; // Number of bits for component rotation + CGU_UINT8 indexModeBits; // Number of bits for index selection + CGU_UINT8 scalarBits; // Number of bits for one scalar endpoint + CGU_UINT8 vectorBits; // Number of bits for 
one vector endpoint(excluding P bits) + uniform CMP_PBIT pBitType; // Type of P-bit encoding + CGU_UINT8 subsetCount; // Number of subsets + CGU_UINT8 indexBits[2]; // Number of bits per index in each index set +} CMP_BTI; + +typedef enum + { + COMP_RED = 0, + COMP_GREEN = 1, + COMP_BLUE = 2, + COMP_ALPHA = 3 + } COMPONENT; + +CMP_CONSTANT CGU_UINT8 componentRotations[4][4] = { +{ COMP_ALPHA, COMP_RED, COMP_GREEN, COMP_BLUE }, +{ COMP_RED, COMP_ALPHA, COMP_GREEN, COMP_BLUE }, +{ COMP_GREEN, COMP_RED, COMP_ALPHA, COMP_BLUE }, +{ COMP_BLUE, COMP_RED, COMP_GREEN, COMP_ALPHA } +}; + +CMP_CONSTANT CMP_BTI bti[NUM_BLOCK_TYPES] = { +//encodingType,partitionBits,rotationBits,indexModeBits,scalarBits,vectorBits,pBitType, subsetCount,indexBits[0]&[1] +{ NO_ALPHA, 4, 0, 0, 0, 12, TWO_PBIT, 3, { 3, 0 } }, // Format Mode 0 +{ NO_ALPHA, 6, 0, 0, 0, 18, ONE_PBIT, 2, { 3, 0 } }, // Format Mode 1 +{ NO_ALPHA, 6, 0, 0, 0, 15, NO_PBIT, 3, { 2, 0 } }, // Format Mode 2 +{ NO_ALPHA, 6, 0, 0, 0, 21, TWO_PBIT, 2, { 2, 0 } }, // Format Mode 3 +{ SEPARATE_ALPHA, 0, 2, 1, 6, 15, NO_PBIT, 1, { 2, 3 } }, // Format Mode 4 +{ SEPARATE_ALPHA, 0, 2, 0, 8, 21, NO_PBIT, 1, { 2, 2 } }, // Format Mode 5 +{ COMBINED_ALPHA, 0, 0, 0, 0, 28, TWO_PBIT, 1, { 4, 0 } }, // Format Mode 6 +{ COMBINED_ALPHA, 6, 0, 0, 0, 20, TWO_PBIT, 2, { 2, 0 } } // Format Mode 7 +}; + +CMP_CONSTANT CGU_UINT8 par_vectors_nd[2][8][64][2][4] = { +{ // 3D +{ +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, 
+{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ +{{0,0,0,0},{0,0,0,0}},{{1,1,1,0},{1,1,1,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, 
+{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{1,1,1,0}},{{1,1,1,0},{0,0,0,0}},{{1,1,1,0},{1,1,1,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ +{{0,0,0,0},{0,0,0,0}},{{1,1,0,0},{1,1,0,0}},{{1,0,1,0},{1,0,1,0}},{{0,1,1,0},{0,1,1,0}},{{0,0,0,0},{1,1,1,0}},{{1,1,1,0},{0,0,0,0}},{{0,1,0,0},{0,1,0,0}},{{1,1,1,0},{1,1,1,0}}, 
+{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ +{{0,0,0,0},{0,0,0,0}},{{1,1,0,0},{0,0,0,0}},{{1,0,1,0},{0,0,0,0}},{{0,1,1,0},{0,0,0,0}},{{0,0,0,0},{1,1,0,0}},{{1,1,0,0},{1,1,0,0}},{{1,0,1,0},{1,1,0,0}},{{0,1,1,0},{1,1,0,0}}, +{{0,0,0,0},{1,0,1,0}},{{1,1,0,0},{1,0,1,0}},{{1,0,1,0},{1,0,1,0}},{{0,1,1,0},{1,0,1,0}},{{0,0,0,0},{0,1,1,0}},{{1,1,0,0},{0,1,1,0}},{{1,0,1,0},{0,1,1,0}},{{0,1,1,0},{0,1,1,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, 
+{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ +{{0,0,0,0},{0,0,0,0}},{{1,1,0,0},{0,0,0,0}},{{1,0,1,0},{0,0,0,0}},{{0,1,1,0},{0,0,0,0}},{{0,0,0,0},{1,1,0,0}},{{1,1,0,0},{1,1,0,0}},{{1,0,1,0},{1,1,0,0}},{{0,1,1,0},{1,1,0,0}}, +{{0,0,0,0},{1,0,1,0}},{{1,1,0,0},{1,0,1,0}},{{1,0,1,0},{1,0,1,0}},{{0,1,1,0},{1,0,1,0}},{{0,0,0,0},{0,1,1,0}},{{1,1,0,0},{0,1,1,0}},{{1,0,1,0},{0,1,1,0}},{{0,1,1,0},{0,1,1,0}}, +{{1,0,0,0},{1,1,1,0}},{{0,1,0,0},{1,1,1,0}},{{0,0,1,0},{1,1,1,0}},{{1,1,1,0},{1,1,1,0}},{{1,0,0,0},{0,0,1,0}},{{0,1,0,0},{0,0,1,0}},{{0,0,1,0},{0,0,1,0}},{{1,1,1,0},{0,0,1,0}}, +{{1,0,0,0},{1,0,0,0}},{{0,1,0,0},{1,0,0,0}},{{0,0,1,0},{1,0,0,0}},{{1,1,1,0},{1,0,0,0}},{{1,0,0,0},{0,1,0,0}},{{0,1,0,0},{0,1,0,0}},{{0,0,1,0},{0,1,0,0}},{{1,1,1,0},{0,1,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, 
+{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, 
+{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +}, +{ // 4D +{ +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, 
+{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ +{{0,0,0,0},{0,0,0,0}},{{1,1,1,1},{1,1,1,1}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ 
+{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{1,1,1,1}},{{1,1,1,1},{0,0,0,0}},{{1,1,1,1},{1,1,1,1}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,1,1,1}},{{0,1,1,1},{0,0,0,0}},{{0,1,1,1},{0,1,1,1}},{{1,0,0,0},{1,0,0,0}},{{1,0,0,0},{1,1,1,1}},{{1,1,1,1},{1,0,0,0}},{{1,1,1,1},{1,1,1,1}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, 
+{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,1,1,1}},{{0,1,1,1},{0,0,0,0}},{{0,1,1,1},{0,1,1,1}},{{1,0,0,0},{1,0,0,0}},{{1,0,0,0},{1,1,1,1}},{{1,1,1,1},{1,0,0,0}},{{1,1,1,1},{1,1,1,1}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,1,1}},{{0,0,1,1},{0,0,0,0}},{{0,1,0,1},{0,1,0,1}},{{1,0,0,0},{1,0,0,0}},{{1,0,0,0},{1,0,1,1}},{{1,0,1,1},{1,0,0,0}},{{1,1,0,1},{1,1,0,1}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, 
+{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, 
+{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +{ +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, 
+{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}}, +{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}},{{0,0,0,0},{0,0,0,0}} +}, +}, +}; + +#ifndef ASPM_GPU +// =============================== USED BY DECODER THIS CODE NEEDS TO BE UPDATED ========================================= +CMP_CONSTANT CGU_UINT32 BC7_FIXUPINDICES_LOCAL[MAX_SUBSETS][MAX_PARTITIONS][3] = +{ + // One subset + { + {0, 0, 0}, + }, + + { + {0, 15},{0, 15},{0, 15},{0, 15}, + {0, 15},{0, 15},{0, 15},{0, 15}, + {0, 15},{0, 15},{0, 15},{0, 15}, + {0, 15},{0, 15},{0, 15},{0, 15}, + {0, 15},{0, 2},{0, 8},{0, 2}, + {0, 2},{0, 8},{0, 8},{0, 15}, + {0, 2},{0, 8},{0, 2},{0, 2}, + {0, 8},{0, 8},{0, 2},{0, 2}, + {0, 15},{0, 15},{0, 6},{0, 8}, + {0, 2},{0, 8},{0, 15},{0, 15}, + {0, 2},{0, 8},{0, 2},{0, 2}, + {0, 2},{0, 15},{0, 15},{0, 6}, + {0, 6},{0, 2},{0, 6},{0, 8}, + {0, 15},{0, 15},{0, 2},{0, 2}, + {0, 15},{0, 15},{0, 15},{0, 15}, + {0, 15},{0, 2},{0, 2},{0, 15}, + }, + + // Three subsets + { + {0, 3,15}, {0, 3, 8}, {0,15, 8}, {0,15, 3}, + {0, 8,15}, {0, 3,15}, {0,15, 3}, {0,15, 8}, + {0, 8,15}, {0, 8,15}, {0, 6,15}, {0, 6,15}, + {0, 6,15}, {0, 5,15}, {0, 3,15}, {0, 3, 8}, + {0, 3,15}, {0, 3, 8}, {0, 8,15}, {0,15, 3}, + {0, 3,15}, {0, 3, 8}, {0, 6,15}, {0,10, 8}, + {0, 5, 3}, {0, 8,15}, {0, 8, 6}, {0, 6,10}, + {0, 8,15}, {0, 5,15}, {0,15,10}, {0,15, 8}, + {0, 8,15}, {0,15, 3}, {0, 3,15}, {0, 5,10}, + {0, 6,10}, {0,10, 8}, {0, 8, 
9}, {0,15,10}, + {0,15, 6}, {0, 3,15}, {0,15, 8}, {0, 5,15}, + {0,15, 3}, {0,15, 6}, {0,15, 6}, {0,15, 8}, + {0, 3,15}, {0,15, 3}, {0, 5,15}, {0, 5,15}, + {0, 5,15}, {0, 8,15}, {0, 5,15}, {0,10,15}, + {0, 5,15}, {0,10,15}, {0, 8,15}, {0,13,15}, + {0,15, 3}, {0,12,15}, {0, 3,15}, {0, 3, 8} + + }, + +}; + +CMP_STATIC void SetDefaultBC7Options(BC7_Encode *BC7Encode) +{ +if (BC7Encode) +{ + // Set for max quality + BC7Encode->quality = 1.0f; + BC7Encode->minThreshold = 5.0f; + BC7Encode->maxThreshold = 80.0f; + BC7Encode->errorThreshold = 5.0f; + BC7Encode->validModeMask = 0xFF; + + BC7Encode->imageNeedsAlpha = FALSE; + BC7Encode->colourRestrict = FALSE; + BC7Encode->alphaRestrict = FALSE; + + BC7Encode->channels = 4; + BC7Encode->part_count = 128; +} +} + +#ifndef ASPM +//===================== +// Used by Decoder +//===================== +__constant CGU_FLOAT rampLerpWeightsBC7[5][16] = +{ + { 0.0 }, // 0 bit index + { 0.0, 1.0 }, // 1 bit index + { 0.0, 21.0 / 64.0, 43.0 / 64.0, 1.0 }, // 2 bit index + { 0.0, 9.0 / 64.0, 18.0 / 64.0, 27.0 / 64.0, 37.0 / 64.0, 46.0 / 64.0, 55.0 / 64.0, 1.0 }, // 3 bit index + { 0.0, 4.0 / 64.0, 9.0 / 64.0, 13.0 / 64.0, 17.0 / 64.0, 21.0 / 64.0, 26.0 / 64.0, 30.0 / 64.0, + 34.0 / 64.0, 38.0 / 64.0, 43.0 / 64.0, 47.0 / 64.0, 51.0 / 64.0, 55.0 / 64.0, 60.0 / 64.0, 1.0 } // 4 bit index +}; + + +__constant CGU_UINT8 BC7_PARTITIONS[MAX_SUBSETS][MAX_PARTITIONS][MAX_SUBSET_SIZE] = +{ + // Single subset partitions for both BC6H abd BC7 + { + { + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + }, + }, + + { + { // 0 + 0,0,1,1, + 0,0,1,1, + 0,0,1,1, + 0,0,1,1 + }, + + { // 1 + 0,0,0,1, + 0,0,0,1, + 0,0,0,1, + 0,0,0,1 + }, + + { // 2 + 0,1,1,1, + 0,1,1,1, + 0,1,1,1, + 0,1,1,1 + }, + + { // 3 + 0,0,0,1, + 0,0,1,1, + 0,0,1,1, + 0,1,1,1 + }, + + { // 4 + 0,0,0,0, + 0,0,0,1, + 0,0,0,1, + 0,0,1,1 + }, + + { // 5 + 0,0,1,1, + 0,1,1,1, + 0,1,1,1, + 1,1,1,1 + }, + + { // 6 + 0,0,0,1, + 0,0,1,1, + 0,1,1,1, + 1,1,1,1 + }, + + { // 7 + 0,0,0,0, + 
0,0,0,1, + 0,0,1,1, + 0,1,1,1 + }, + + { // 8 + 0,0,0,0, + 0,0,0,0, + 0,0,0,1, + 0,0,1,1 + }, + + { // 9 + 0,0,1,1, + 0,1,1,1, + 1,1,1,1, + 1,1,1,1 + }, + + { // 10 + 0,0,0,0, + 0,0,0,1, + 0,1,1,1, + 1,1,1,1 + }, + + { // 11 + 0,0,0,0, + 0,0,0,0, + 0,0,0,1, + 0,1,1,1 + }, + + { // 12 + 0,0,0,1, + 0,1,1,1, + 1,1,1,1, + 1,1,1,1 + }, + + { // 13 + 0,0,0,0, + 0,0,0,0, + 1,1,1,1, + 1,1,1,1 + }, + + { // 14 + 0,0,0,0, + 1,1,1,1, + 1,1,1,1, + 1,1,1,1 + }, + + { // 15 + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,1,1 + }, + + { // 16 + 0,0,0,0, + 1,0,0,0, + 1,1,1,0, + 1,1,1,1 + }, + + { // 17 + 0,1,1,1, + 0,0,0,1, + 0,0,0,0, + 0,0,0,0 + }, + + { // 18 + 0,0,0,0, + 0,0,0,0, + 1,0,0,0, + 1,1,1,0 + }, + + { // 19 + 0,1,1,1, + 0,0,1,1, + 0,0,0,1, + 0,0,0,0 + }, + + { // 20 + 0,0,1,1, + 0,0,0,1, + 0,0,0,0, + 0,0,0,0 + }, + + { // 21 + 0,0,0,0, + 1,0,0,0, + 1,1,0,0, + 1,1,1,0 + }, + + { // 22 + 0,0,0,0, + 0,0,0,0, + 1,0,0,0, + 1,1,0,0 + }, + + { // 23 + 0,1,1,1, + 0,0,1,1, + 0,0,1,1, + 0,0,0,1 + }, + + { // 24 + 0,0,1,1, + 0,0,0,1, + 0,0,0,1, + 0,0,0,0 + }, + + { // 25 + 0,0,0,0, + 1,0,0,0, + 1,0,0,0, + 1,1,0,0 + }, + + { // 26 + 0,1,1,0, + 0,1,1,0, + 0,1,1,0, + 0,1,1,0 + }, + + { // 27 + 0,0,1,1, + 0,1,1,0, + 0,1,1,0, + 1,1,0,0 + }, + + { // 28 + 0,0,0,1, + 0,1,1,1, + 1,1,1,0, + 1,0,0,0 + }, + + { // 29 + 0,0,0,0, + 1,1,1,1, + 1,1,1,1, + 0,0,0,0 + }, + + { // 30 + 0,1,1,1, + 0,0,0,1, + 1,0,0,0, + 1,1,1,0 + }, + + { // 31 + 0,0,1,1, + 1,0,0,1, + 1,0,0,1, + 1,1,0,0 + }, + // ----------- BC7 only shapes from here on ------------- + { // 32 + 0,1,0,1, + 0,1,0,1, + 0,1,0,1, + 0,1,0,1 + }, + + { // 33 + 0,0,0,0, + 1,1,1,1, + 0,0,0,0, + 1,1,1,1 + }, + + { // 34 + 0,1,0,1, + 1,0,1,0, + 0,1,0,1, + 1,0,1,0 + }, + + { // 35 + 0,0,1,1, + 0,0,1,1, + 1,1,0,0, + 1,1,0,0 + }, + + { // 36 + 0,0,1,1, + 1,1,0,0, + 0,0,1,1, + 1,1,0,0 + }, + + { // 37 + 0,1,0,1, + 0,1,0,1, + 1,0,1,0, + 1,0,1,0 + }, + + { // 38 + 0,1,1,0, + 1,0,0,1, + 0,1,1,0, + 1,0,0,1 + }, + + { // 39 + 0,1,0,1, + 1,0,1,0, + 1,0,1,0, + 
0,1,0,1 + }, + + { // 40 + 0,1,1,1, + 0,0,1,1, + 1,1,0,0, + 1,1,1,0 + }, + + { // 41 + 0,0,0,1, + 0,0,1,1, + 1,1,0,0, + 1,0,0,0 + }, + + { // 42 + 0,0,1,1, + 0,0,1,0, + 0,1,0,0, + 1,1,0,0 + }, + + { // 43 + 0,0,1,1, + 1,0,1,1, + 1,1,0,1, + 1,1,0,0 + }, + + { // 44 + 0,1,1,0, + 1,0,0,1, + 1,0,0,1, + 0,1,1,0 + }, + + { // 45 + 0,0,1,1, + 1,1,0,0, + 1,1,0,0, + 0,0,1,1 + }, + + { // 46 + 0,1,1,0, + 0,1,1,0, + 1,0,0,1, + 1,0,0,1 + }, + + { // 47 + 0,0,0,0, + 0,1,1,0, + 0,1,1,0, + 0,0,0,0 + }, + + { // 48 + 0,1,0,0, + 1,1,1,0, + 0,1,0,0, + 0,0,0,0 + }, + + { // 49 + 0,0,1,0, + 0,1,1,1, + 0,0,1,0, + 0,0,0,0 + }, + + { // 50 + 0,0,0,0, + 0,0,1,0, + 0,1,1,1, + 0,0,1,0 + }, + + { // 51 + 0,0,0,0, + 0,1,0,0, + 1,1,1,0, + 0,1,0,0 + }, + + { // 52 + 0,1,1,0, + 1,1,0,0, + 1,0,0,1, + 0,0,1,1 + }, + + { // 53 + 0,0,1,1, + 0,1,1,0, + 1,1,0,0, + 1,0,0,1 + }, + + { // 54 + 0,1,1,0, + 0,0,1,1, + 1,0,0,1, + 1,1,0,0 + }, + + { // 55 + 0,0,1,1, + 1,0,0,1, + 1,1,0,0, + 0,1,1,0 + }, + + { // 56 + 0,1,1,0, + 1,1,0,0, + 1,1,0,0, + 1,0,0,1 + }, + + { // 57 + 0,1,1,0, + 0,0,1,1, + 0,0,1,1, + 1,0,0,1 + }, + + { // 58 + 0,1,1,1, + 1,1,1,0, + 1,0,0,0, + 0,0,0,1 + }, + + { // 59 + 0,0,0,1, + 1,0,0,0, + 1,1,1,0, + 0,1,1,1 + }, + + { // 60 + 0,0,0,0, + 1,1,1,1, + 0,0,1,1, + 0,0,1,1 + }, + + { // 61 + 0,0,1,1, + 0,0,1,1, + 1,1,1,1, + 0,0,0,0 + }, + + { // 62 + 0,0,1,0, + 0,0,1,0, + 1,1,1,0, + 1,1,1,0 + }, + + { // 63 + 0,1,0,0, + 0,1,0,0, + 0,1,1,1, + 0,1,1,1 + }, + }, + + + // Table.P3 - only for BC7 + + { + + { + 0,0,1,1, + 0,0,1,1, + 0,2,2,1, + 2,2,2,2 + }, + + { + 0,0,0,1, + 0,0,1,1, + 2,2,1,1, + 2,2,2,1 + }, + + { + 0,0,0,0, + 2,0,0,1, + 2,2,1,1, + 2,2,1,1 + }, + + { + 0,2,2,2, + 0,0,2,2, + 0,0,1,1, + 0,1,1,1 + }, + + { + 0,0,0,0, + 0,0,0,0, + 1,1,2,2, + 1,1,2,2 + }, + + { + 0,0,1,1, + 0,0,1,1, + 0,0,2,2, + 0,0,2,2 + }, + + { + 0,0,2,2, + 0,0,2,2, + 1,1,1,1, + 1,1,1,1 + }, + + { + 0,0,1,1, + 0,0,1,1, + 2,2,1,1, + 2,2,1,1 + }, + + { + 0,0,0,0, + 0,0,0,0, + 1,1,1,1, + 2,2,2,2 + }, + + { + 0,0,0,0, 
+ 1,1,1,1, + 1,1,1,1, + 2,2,2,2 + }, + + { + 0,0,0,0, + 1,1,1,1, + 2,2,2,2, + 2,2,2,2 + }, + + { + 0,0,1,2, + 0,0,1,2, + 0,0,1,2, + 0,0,1,2 + }, + + { + 0,1,1,2, + 0,1,1,2, + 0,1,1,2, + 0,1,1,2 + }, + + { + 0,1,2,2, + 0,1,2,2, + 0,1,2,2, + 0,1,2,2 + }, + + { + 0,0,1,1, + 0,1,1,2, + 1,1,2,2, + 1,2,2,2 + }, + + { + 0,0,1,1, + 2,0,0,1, + 2,2,0,0, + 2,2,2,0 + }, + + { + 0,0,0,1, + 0,0,1,1, + 0,1,1,2, + 1,1,2,2 + }, + + { + 0,1,1,1, + 0,0,1,1, + 2,0,0,1, + 2,2,0,0 + }, + + { + 0,0,0,0, + 1,1,2,2, + 1,1,2,2, + 1,1,2,2 + }, + + { + 0,0,2,2, + 0,0,2,2, + 0,0,2,2, + 1,1,1,1 + }, + + { + 0,1,1,1, + 0,1,1,1, + 0,2,2,2, + 0,2,2,2 + }, + + { + 0,0,0,1, + 0,0,0,1, + 2,2,2,1, + 2,2,2,1 + }, + + { + 0,0,0,0, + 0,0,1,1, + 0,1,2,2, + 0,1,2,2 + }, + + { + 0,0,0,0, + 1,1,0,0, + 2,2,1,0, + 2,2,1,0 + }, + + { + 0,1,2,2, + 0,1,2,2, + 0,0,1,1, + 0,0,0,0 + }, + + { + 0,0,1,2, + 0,0,1,2, + 1,1,2,2, + 2,2,2,2 + }, + + { + 0,1,1,0, + 1,2,2,1, + 1,2,2,1, + 0,1,1,0 + }, + + { + 0,0,0,0, + 0,1,1,0, + 1,2,2,1, + 1,2,2,1 + }, + + { + 0,0,2,2, + 1,1,0,2, + 1,1,0,2, + 0,0,2,2 + }, + + { + 0,1,1,0, + 0,1,1,0, + 2,0,0,2, + 2,2,2,2 + }, + + { + 0,0,1,1, + 0,1,2,2, + 0,1,2,2, + 0,0,1,1 + }, + + { + 0,0,0,0, + 2,0,0,0, + 2,2,1,1, + 2,2,2,1 + }, + + { + 0,0,0,0, + 0,0,0,2, + 1,1,2,2, + 1,2,2,2 + }, + + { + 0,2,2,2, + 0,0,2,2, + 0,0,1,2, + 0,0,1,1 + }, + + { + 0,0,1,1, + 0,0,1,2, + 0,0,2,2, + 0,2,2,2 + }, + + { + 0,1,2,0, + 0,1,2,0, + 0,1,2,0, + 0,1,2,0 + }, + + { + 0,0,0,0, + 1,1,1,1, + 2,2,2,2, + 0,0,0,0 + }, + + { + 0,1,2,0, + 1,2,0,1, + 2,0,1,2, + 0,1,2,0 + }, + + { + 0,1,2,0, + 2,0,1,2, + 1,2,0,1, + 0,1,2,0 + }, + + { + 0,0,1,1, + 2,2,0,0, + 1,1,2,2, + 0,0,1,1 + }, + + { + 0,0,1,1, + 1,1,2,2, + 2,2,0,0, + 0,0,1,1 + }, + + { + 0,1,0,1, + 0,1,0,1, + 2,2,2,2, + 2,2,2,2 + }, + + { + 0,0,0,0, + 0,0,0,0, + 2,1,2,1, + 2,1,2,1 + }, + + { + 0,0,2,2, + 1,1,2,2, + 0,0,2,2, + 1,1,2,2 + }, + + { + 0,0,2,2, + 0,0,1,1, + 0,0,2,2, + 0,0,1,1 + }, + + { + 0,2,2,0, + 1,2,2,1, + 0,2,2,0, + 1,2,2,1 + }, + + { + 0,1,0,1, + 
2,2,2,2, + 2,2,2,2, + 0,1,0,1 + }, + + { + 0,0,0,0, + 2,1,2,1, + 2,1,2,1, + 2,1,2,1 + }, + + { + 0,1,0,1, + 0,1,0,1, + 0,1,0,1, + 2,2,2,2 + }, + + { + 0,2,2,2, + 0,1,1,1, + 0,2,2,2, + 0,1,1,1 + }, + + { + 0,0,0,2, + 1,1,1,2, + 0,0,0,2, + 1,1,1,2 + }, + + { + 0,0,0,0, + 2,1,1,2, + 2,1,1,2, + 2,1,1,2 + }, + + { + 0,2,2,2, + 0,1,1,1, + 0,1,1,1, + 0,2,2,2 + }, + + { + 0,0,0,2, + 1,1,1,2, + 1,1,1,2, + 0,0,0,2 + }, + + { + 0,1,1,0, + 0,1,1,0, + 0,1,1,0, + 2,2,2,2 + }, + + { + 0,0,0,0, + 0,0,0,0, + 2,1,1,2, + 2,1,1,2 + }, + + { + 0,1,1,0, + 0,1,1,0, + 2,2,2,2, + 2,2,2,2 + }, + + { + 0,0,2,2, + 0,0,1,1, + 0,0,1,1, + 0,0,2,2 + }, + + { + 0,0,2,2, + 1,1,2,2, + 1,1,2,2, + 0,0,2,2 + }, + + { + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 2,1,1,2 + }, + + { + 0,0,0,2, + 0,0,0,1, + 0,0,0,2, + 0,0,0,1 + }, + + { + 0,2,2,2, + 1,2,2,2, + 0,2,2,2, + 1,2,2,2 + }, + + { + 0,1,0,1, + 2,2,2,2, + 2,2,2,2, + 2,2,2,2 + }, + + { + 0,1,1,1, + 2,0,1,1, + 2,2,0,1, + 2,2,2,0 + }, + }, +}; +#endif + +#endif // !ASPM_GPU + + + +#endif diff --git a/extern/CMP_Core/shaders/BCn_Common_Kernel.h b/extern/CMP_Core/shaders/BCn_Common_Kernel.h new file mode 100644 index 0000000..e9db4a3 --- /dev/null +++ b/extern/CMP_Core/shaders/BCn_Common_Kernel.h @@ -0,0 +1,2360 @@ +//===================================================================== +// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +//===================================================================== +#ifndef _BCn_Common_Kernel_H +#define _BCn_Common_Kernel_H + +#include "Common_Def.h" + +#ifndef ASPM_GPU +#if defined(WIN32) || defined(_WIN64) +#define ALIGN_16 __declspec(align(16)) +#else // !WIN32 && !_WIN64 +#define ALIGN_16 +#endif // !WIN32 && !_WIN64 +#else +#define ALIGN_16 +#endif + +#define DXTC_OFFSET_ALPHA 0 +#define DXTC_OFFSET_RGB 2 + +#define RC 2 +#define GC 1 +#define BC 0 +#define AC 3 + +/* +Channel Bits +*/ +#define RG 5 +#define GG 6 +#define BG 5 + +#define RGBA8888_CHANNEL_A 3 +#define RGBA8888_CHANNEL_R 2 +#define RGBA8888_CHANNEL_G 1 +#define RGBA8888_CHANNEL_B 0 +#define RGBA8888_OFFSET_A (RGBA8888_CHANNEL_A * 8) +#define RGBA8888_OFFSET_R (RGBA8888_CHANNEL_R * 8) +#define RGBA8888_OFFSET_G (RGBA8888_CHANNEL_G * 8) +#define RGBA8888_OFFSET_B (RGBA8888_CHANNEL_B * 8) + +#define MAX_BLOCK 64 +#define BLOCK_SIZE MAX_BLOCK + +#ifndef MAX_ERROR +#define MAX_ERROR 128000.f +#endif + +#define MAX_BLOCK 64 +#define MAX_POINTS 16 +#define BLOCK_SIZE MAX_BLOCK +#define NUM_CHANNELS 4 +#define NUM_ENDPOINTS 2 +#define BLOCK_SIZE_4X4 16 + +#define ConstructColour(r, g, b) (((r) << 11) | ((g) << 5) | (b)) + +// Find the first approximation of the line +// Assume there is a linear relation +// Z = a * X_In +// Z = b * Y_In +// Find a,b to minimize MSE between Z and Z_In +#define EPS (2.f / 255.f) * (2.f / 255.f) +#define EPS2 3.f * (2.f / 255.f) * (2.f / 255.f) + +// Grid precision +#define 
PIX_GRID 8 + +#define BYTE_MASK 0x00ff + +CMP_CONSTANT CGU_UINT8 nByteBitsMask[9] = {0x00, 0x80, 0xc0, 0xe0, 0xf0, + 0xf8, 0xfc, 0xfe, 0xff}; +CMP_CONSTANT CGU_DWORD dwRndAmount[9] = {0, 0, 0, 0, 1, 1, 2, 2, 3}; + +#define _INT_GRID (_bFixedRamp && _FracPrc == 0) +#define SCH_STPS 3 // number of search steps to make at each end of interval +static CMP_CONSTANT CGU_FLOAT sMvF[] = {0.f, -1.f, 1.f, -2.f, 2.f, -3.f, + 3.f, -4.f, 4.f, -5.f, 5.f, -6.f, + 6.f, -7.f, 7.f, -8.f, 8.f}; + +#ifndef GBL_SCH_STEP +#define GBL_SCH_STEP_MXS 0.018f +#define GBL_SCH_EXT_MXS 0.1f +#define LCL_SCH_STEP_MXS 0.6f +#define GBL_SCH_STEP_MXQ 0.0175f +#define GBL_SCH_EXT_MXQ 0.154f +#define LCL_SCH_STEP_MXQ 0.45f + +#define GBL_SCH_STEP GBL_SCH_STEP_MXS +#define GBL_SCH_EXT GBL_SCH_EXT_MXS +#define LCL_SCH_STEP LCL_SCH_STEP_MXS +#endif + +typedef struct { + CGU_UINT32 data; + CGU_UINT32 index; +} CMP_di; + +typedef struct { + CGU_FLOAT data; + CGU_UINT32 index; +} CMP_df; + +typedef struct { + // user setable + CGU_FLOAT m_fquality; + CGU_FLOAT m_fChannelWeights[3]; + CGU_BOOL m_bUseChannelWeighting; + CGU_BOOL m_bUseAdaptiveWeighting; + CGU_BOOL m_bUseFloat; + CGU_BOOL m_b3DRefinement; + CGU_UINT8 m_nRefinementSteps; + CGU_UINT8 m_nAlphaThreshold; + + CGU_BOOL m_mapDecodeRGBA; + + // ?? 
Remove this + CGU_UINT32 m_src_width; + CGU_UINT32 m_src_height; +} CMP_BC15Options; + +//---------------------------------------- Common Code ------------------------------------------------------- + +static void SetDefaultBC15Options(CMP_BC15Options *BC15Options) { + if (BC15Options) { + BC15Options->m_fquality = 1.0f; + BC15Options->m_bUseChannelWeighting = false; + BC15Options->m_bUseAdaptiveWeighting = false; + BC15Options->m_fChannelWeights[0] = 0.3086f; + BC15Options->m_fChannelWeights[1] = 0.6094f; + BC15Options->m_fChannelWeights[2] = 0.0820f; + BC15Options->m_nAlphaThreshold = 128; + BC15Options->m_bUseFloat = false; + BC15Options->m_b3DRefinement = false; + BC15Options->m_nRefinementSteps = 1; + BC15Options->m_src_width = 4; + BC15Options->m_src_height = 4; +#ifdef CMP_SET_BC13_DECODER_RGBA + BC15Options->m_mapDecodeRGBA = true; +#else + BC15Options->m_mapDecodeRGBA = false; +#endif + } +} + +inline CGU_UINT8 minb(CGU_UINT8 a, CGU_UINT8 b) { return a < b ? a : b; } +inline CGU_FLOAT minf(CGU_FLOAT a, CGU_FLOAT b) { return a < b ? a : b; } +inline CGU_FLOAT maxf(CGU_FLOAT a, CGU_FLOAT b) { return a > b ? 
a : b; } + +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) +static void CalculateColourWeightings(CGU_UINT8 block[BLOCK_SIZE_4X4X4], + CMP_GLOBAL CMP_BC15Options *BC15options) { + CGU_FLOAT fBaseChannelWeights[3] = {0.3086f, 0.6094f, 0.0820f}; + + if (!BC15options->m_bUseChannelWeighting) { + BC15options->m_fChannelWeights[0] = 1.0F; + BC15options->m_fChannelWeights[1] = 1.0F; + BC15options->m_fChannelWeights[2] = 1.0F; + return; + } + + if (BC15options->m_bUseAdaptiveWeighting) { + float medianR = 0.0f, medianG = 0.0f, medianB = 0.0f; + + for (CGU_UINT32 k = 0; k < BLOCK_SIZE_4X4; k++) { + CGU_DWORD R = (block[k] & 0xff0000) >> 16; + CGU_DWORD G = (block[k] & 0xff00) >> 8; + CGU_DWORD B = block[k] & 0xff; + + medianR += R; + medianG += G; + medianB += B; + } + + medianR /= BLOCK_SIZE_4X4; + medianG /= BLOCK_SIZE_4X4; + medianB /= BLOCK_SIZE_4X4; + + // Now skew the colour weightings based on the gravity center of the block + float largest = maxf(maxf(medianR, medianG), medianB); + + if (largest > 0) { + medianR /= largest; + medianG /= largest; + medianB /= largest; + } else + medianR = medianG = medianB = 1.0f; + + // Scale weightings back up to 1.0f + CGU_FLOAT fWeightScale = + 1.0f / (fBaseChannelWeights[0] + fBaseChannelWeights[1] + + fBaseChannelWeights[2]); + BC15options->m_fChannelWeights[0] = fBaseChannelWeights[0] * fWeightScale; + BC15options->m_fChannelWeights[1] = fBaseChannelWeights[1] * fWeightScale; + BC15options->m_fChannelWeights[2] = fBaseChannelWeights[2] * fWeightScale; + BC15options->m_fChannelWeights[0] = + ((BC15options->m_fChannelWeights[0] * 3 * medianR) + + BC15options->m_fChannelWeights[0]) * + 0.25f; + BC15options->m_fChannelWeights[1] = + ((BC15options->m_fChannelWeights[1] * 3 * medianG) + + BC15options->m_fChannelWeights[1]) * + 0.25f; + BC15options->m_fChannelWeights[2] = + ((BC15options->m_fChannelWeights[2] * 3 * medianB) + + BC15options->m_fChannelWeights[2]) * + 0.25f; + fWeightScale = 1.0f / 
(BC15options->m_fChannelWeights[0] + + BC15options->m_fChannelWeights[1] + + BC15options->m_fChannelWeights[2]); + BC15options->m_fChannelWeights[0] *= fWeightScale; + BC15options->m_fChannelWeights[1] *= fWeightScale; + BC15options->m_fChannelWeights[2] *= fWeightScale; + } else { + BC15options->m_fChannelWeights[0] = fBaseChannelWeights[0]; + BC15options->m_fChannelWeights[1] = fBaseChannelWeights[1]; + BC15options->m_fChannelWeights[2] = fBaseChannelWeights[2]; + } +} +#endif // !BC5 +#endif // !BC4 + +/*------------------------------------------------------------------------------------------------ +1 dim error +------------------------------------------------------------------------------------------------*/ +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) + +static CGU_FLOAT RampSrchW(CGU_FLOAT _Blck[MAX_BLOCK], + CGU_FLOAT _BlckErr[MAX_BLOCK], + CGU_FLOAT _Rpt[MAX_BLOCK], CGU_FLOAT _maxerror, + CGU_FLOAT _min_ex, CGU_FLOAT _max_ex, int _NmbClrs, + int _block) { + CGU_FLOAT error = 0; + CGU_FLOAT step = (_max_ex - _min_ex) / (_block - 1); + CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; + CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; + + for (CGU_INT32 i = 0; i < _NmbClrs; i++) { + CGU_FLOAT v; + // Work out which value in the block this select + CGU_FLOAT del; + + if ((del = _Blck[i] - _min_ex) <= 0) + v = _min_ex; + else if (_Blck[i] - _max_ex >= 0) + v = _max_ex; + else + v = floor((del + step_h) * rstep) * step + _min_ex; + + // And accumulate the error + CGU_FLOAT d = (_Blck[i] - v); + d *= d; + CGU_FLOAT err = _Rpt[i] * d + _BlckErr[i]; + error += err; + if (_maxerror < error) { + error = _maxerror; + break; + } + } + return error; +} +#endif // !BC5 +#endif // BC4 + +/*------------------------------------------------------------------------------------------------ +// this is how the end points is going to be rounded in compressed format +------------------------------------------------------------------------------------------------*/ +#if 
!defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) + +static void MkRmpOnGrid(CGU_FLOAT _RmpF[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _MnMx[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _Min, CGU_FLOAT _Max, CGU_UINT8 nRedBits, + CGU_UINT8 nGreenBits, CGU_UINT8 nBlueBits) { + CGU_FLOAT Fctrs0[3]; + CGU_FLOAT Fctrs1[3]; + + Fctrs1[RC] = (CGU_FLOAT)(1 << nRedBits); + Fctrs1[GC] = (CGU_FLOAT)(1 << nGreenBits); + Fctrs1[BC] = (CGU_FLOAT)(1 << nBlueBits); + Fctrs0[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits)); + Fctrs0[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits)); + Fctrs0[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits)); + + for (CGU_INT32 j = 0; j < 3; j++) { + for (CGU_INT32 k = 0; k < 2; k++) { + _RmpF[j][k] = floor(_MnMx[j][k]); + if (_RmpF[j][k] <= _Min) + _RmpF[j][k] = _Min; + else { + _RmpF[j][k] += + floor(128.f / Fctrs1[j]) - floor(_RmpF[j][k] / Fctrs1[j]); + _RmpF[j][k] = minf(_RmpF[j][k], _Max); + } + + _RmpF[j][k] = floor(_RmpF[j][k] / Fctrs0[j]) * Fctrs0[j]; + } + } +} +#endif // !BC5 +#endif // BC4 + +/*------------------------------------------------------------------------------------------------ +// this is how the end points is going to be look like when decompressed +------------------------------------------------------------------------------------------------*/ +inline void MkWkRmpPts(CGU_BOOL *_bEq, + CGU_FLOAT _OutRmpPts[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _InpRmpPts[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_UINT8 nRedBits, CGU_UINT8 nGreenBits, + CGU_UINT8 nBlueBits) { + CGU_FLOAT Fctrs[3]; + Fctrs[RC] = (CGU_FLOAT)(1 << nRedBits); + Fctrs[GC] = (CGU_FLOAT)(1 << nGreenBits); + Fctrs[BC] = (CGU_FLOAT)(1 << nBlueBits); + + *_bEq = TRUE; + // find whether input ramp is flat + for (CGU_INT32 j = 0; j < 3; j++) + *_bEq &= (_InpRmpPts[j][0] == _InpRmpPts[j][1]); + + // end points on the integer grid + for (CGU_INT32 j = 0; j < 3; j++) { + for (CGU_INT32 k = 0; k < 2; k++) { + // Apply the lower bit replication to give full dynamic range 
+ _OutRmpPts[j][k] = _InpRmpPts[j][k] + floor(_InpRmpPts[j][k] / Fctrs[j]); + _OutRmpPts[j][k] = maxf((CGU_FLOAT)_OutRmpPts[j][k], 0.f); + _OutRmpPts[j][k] = minf((CGU_FLOAT)_OutRmpPts[j][k], 255.f); + } + } +} + +/*------------------------------------------------------------------------------------------------ +1 DIM ramp +------------------------------------------------------------------------------------------------*/ + +inline void BldClrRmp(CGU_FLOAT _Rmp[MAX_POINTS], + CGU_FLOAT _InpRmp[NUM_ENDPOINTS], CGU_UINT8 dwNumPoints) { + // linear interpolate end points to get the ramp + _Rmp[0] = _InpRmp[0]; + _Rmp[dwNumPoints - 1] = _InpRmp[1]; + if (dwNumPoints % 2) + _Rmp[dwNumPoints] = + 1000000.f; // for 3 point ramp; not to select the 4th point as min + for (CGU_INT32 e = 1; e < dwNumPoints - 1; e++) + _Rmp[e] = floor((_Rmp[0] * (dwNumPoints - 1 - e) + + _Rmp[dwNumPoints - 1] * e + dwRndAmount[dwNumPoints]) / + (CGU_FLOAT)(dwNumPoints - 1)); +} + +/*------------------------------------------------------------------------------------------------ +// build 3D ramp +------------------------------------------------------------------------------------------------*/ +inline void BldRmp(CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], + CGU_FLOAT _InpRmp[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_UINT8 dwNumPoints) { + for (CGU_INT32 j = 0; j < 3; j++) BldClrRmp(_Rmp[j], _InpRmp[j], dwNumPoints); +} + +/*------------------------------------------------------------------------------------------------ +Compute cumulative error for the current cluster +------------------------------------------------------------------------------------------------*/ +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) + +static CGU_FLOAT ClstrErr(CGU_FLOAT _Blk[MAX_BLOCK][NUM_CHANNELS], + CGU_FLOAT _Rpt[MAX_BLOCK], + CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], + int _NmbClrs, int _blcktp, CGU_BOOL _ConstRamp, + CMP_GLOBAL const CMP_BC15Options *BC15options) { + CGU_FLOAT fError = 0.f; + 
int rmp_l = (_ConstRamp) ? 1 : _blcktp; + + // For each colour in the original block, find the closest cluster + // and compute the comulative error + for (CGU_INT32 i = 0; i < _NmbClrs; i++) { + CGU_FLOAT fShortest = 99999999999.f; + + if (BC15options->m_bUseChannelWeighting) + for (CGU_INT32 r = 0; r < rmp_l; r++) { + // calculate the distance for each component + CGU_FLOAT fDistance = + (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) * + BC15options->m_fChannelWeights[0] + + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) * + BC15options->m_fChannelWeights[1] + + (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]) * + BC15options->m_fChannelWeights[2]; + + if (fDistance < fShortest) fShortest = fDistance; + } + else + for (CGU_INT32 r = 0; r < rmp_l; r++) { + // calculate the distance for each component + CGU_FLOAT fDistance = + (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) + + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) + + (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]); + + if (fDistance < fShortest) fShortest = fDistance; + } + + // accumulate the error + fError += fShortest * _Rpt[i]; + } + + return fError; +} +#endif // !BC5 +#endif // !BC4 + +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) +static CGU_FLOAT Refine3D(CGU_FLOAT _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _Blk[MAX_BLOCK][NUM_CHANNELS], + CGU_FLOAT _Rpt[MAX_BLOCK], int _NmrClrs, + CGU_UINT8 dwNumPoints, + CMP_GLOBAL const CMP_BC15Options *BC15options, + CGU_UINT8 nRedBits, CGU_UINT8 nGreenBits, + CGU_UINT8 nBlueBits, CGU_UINT8 nRefineSteps) { + ALIGN_16 CGU_FLOAT Rmp[NUM_CHANNELS][MAX_POINTS]; + + CGU_FLOAT Blk[MAX_BLOCK][NUM_CHANNELS]; + for (CGU_INT32 i = 0; i < _NmrClrs; i++) + for (CGU_INT32 j = 0; j < 3; j++) Blk[i][j] = _Blk[i][j]; + + CGU_FLOAT fWeightRed = BC15options->m_fChannelWeights[0]; + CGU_FLOAT fWeightGreen = 
BC15options->m_fChannelWeights[1]; + CGU_FLOAT fWeightBlue = BC15options->m_fChannelWeights[2]; + + // here is our grid + CGU_FLOAT Fctrs[3]; + Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits)); + Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits)); + Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits)); + + CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS]; + CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; + for (CGU_INT32 k = 0; k < 2; k++) + for (CGU_INT32 j = 0; j < 3; j++) + InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k]; + + // make ramp endpoints the way they'll going to be decompressed + // plus check whether the ramp is flat + CGU_BOOL Eq; + CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS]; + MkWkRmpPts(&Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + + // build ramp for all 3 colors + BldRmp(Rmp, WkRmpPts, dwNumPoints); + + // clusterize for the current ramp + CGU_FLOAT bestE = + ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, BC15options); + if (bestE == 0.f || !nRefineSteps) // if exact, we've done + return bestE; + + // Jitter endpoints in each direction + int nRefineStart = 0 - (minb(nRefineSteps, (CGU_UINT8)8)); + int nRefineEnd = minb(nRefineSteps, (CGU_UINT8)8); + for (CGU_INT32 nJitterG0 = nRefineStart; nJitterG0 <= nRefineEnd; + nJitterG0++) { + InpRmp[GC][0] = + minf(maxf(InpRmp0[GC][0] + nJitterG0 * Fctrs[GC], 0.f), 255.f); + for (CGU_INT32 nJitterG1 = nRefineStart; nJitterG1 <= nRefineEnd; + nJitterG1++) { + InpRmp[GC][1] = + minf(maxf(InpRmp0[GC][1] + nJitterG1 * Fctrs[GC], 0.f), 255.f); + MkWkRmpPts(&Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints); + + CGU_FLOAT RmpErrG[MAX_POINTS][MAX_BLOCK]; + for (CGU_INT32 i = 0; i < _NmrClrs; i++) { + for (CGU_INT32 r = 0; r < dwNumPoints; r++) { + CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]); + RmpErrG[r][i] = DistG * DistG * fWeightGreen; + } + } + + for (CGU_INT32 nJitterB0 = nRefineStart; nJitterB0 <= 
nRefineEnd; + nJitterB0++) { + InpRmp[BC][0] = + minf(maxf(InpRmp0[BC][0] + nJitterB0 * Fctrs[BC], 0.f), 255.f); + for (CGU_INT32 nJitterB1 = nRefineStart; nJitterB1 <= nRefineEnd; + nJitterB1++) { + InpRmp[BC][1] = + minf(maxf(InpRmp0[BC][1] + nJitterB1 * Fctrs[BC], 0.f), 255.f); + MkWkRmpPts(&Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints); + + CGU_FLOAT RmpErr[MAX_POINTS][MAX_BLOCK]; + for (CGU_INT32 i = 0; i < _NmrClrs; i++) { + for (CGU_INT32 r = 0; r < dwNumPoints; r++) { + CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]); + RmpErr[r][i] = RmpErrG[r][i] + DistB * DistB * fWeightBlue; + } + } + + for (CGU_INT32 nJitterR0 = nRefineStart; nJitterR0 <= nRefineEnd; + nJitterR0++) { + InpRmp[RC][0] = + minf(maxf(InpRmp0[RC][0] + nJitterR0 * Fctrs[RC], 0.f), 255.f); + for (CGU_INT32 nJitterR1 = nRefineStart; nJitterR1 <= nRefineEnd; + nJitterR1++) { + InpRmp[RC][1] = minf( + maxf(InpRmp0[RC][1] + nJitterR1 * Fctrs[RC], 0.f), 255.f); + MkWkRmpPts(&Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, + nBlueBits); + BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints); + + // compute cumulative error + CGU_FLOAT mse = 0.f; + int rmp_l = (Eq) ? 
1 : dwNumPoints; + for (CGU_INT32 k = 0; k < _NmrClrs; k++) { + CGU_FLOAT MinErr = 10000000.f; + for (CGU_INT32 r = 0; r < rmp_l; r++) { + CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]); + CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightRed; + MinErr = minf(MinErr, Err); + } + mse += MinErr * _Rpt[k]; + } + + // save if we achieve better result + if (mse < bestE) { + bestE = mse; + for (CGU_INT32 k = 0; k < 2; k++) + for (CGU_INT32 j = 0; j < 3; j++) + _OutRmpPnts[j][k] = InpRmp[j][k]; + } + } + } + } + } + } + } + + return bestE; +} +#endif // !BC5 +#endif // BC4 + +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) + +static CGU_FLOAT Refine(CGU_FLOAT _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _Blk[MAX_BLOCK][NUM_CHANNELS], + CGU_FLOAT _Rpt[MAX_BLOCK], int _NmrClrs, + CGU_UINT8 dwNumPoints, + CMP_GLOBAL const CMP_BC15Options *BC15options, + CGU_UINT8 nRedBits, CGU_UINT8 nGreenBits, + CGU_UINT8 nBlueBits, CGU_UINT8 nRefineSteps) { + ALIGN_16 CGU_FLOAT Rmp[NUM_CHANNELS][MAX_POINTS]; + + CGU_FLOAT Blk[MAX_BLOCK][NUM_CHANNELS]; + for (CGU_INT32 i = 0; i < _NmrClrs; i++) + for (CGU_INT32 j = 0; j < 3; j++) Blk[i][j] = _Blk[i][j]; + + CGU_FLOAT fWeightRed = BC15options->m_fChannelWeights[0]; + CGU_FLOAT fWeightGreen = BC15options->m_fChannelWeights[1]; + CGU_FLOAT fWeightBlue = BC15options->m_fChannelWeights[2]; + + // here is our grid + CGU_FLOAT Fctrs[3]; + Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID - nRedBits)); + Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID - nGreenBits)); + Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID - nBlueBits)); + + CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS]; + CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; + for (CGU_INT32 k = 0; k < 2; k++) + for (CGU_INT32 j = 0; j < 3; j++) + InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k]; + + // make ramp endpoints the way they'll going to be decompressed + // plus check whether the ramp is flat + CGU_BOOL Eq; + 
CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS]; + MkWkRmpPts(&Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + + // build ramp for all 3 colors + BldRmp(Rmp, WkRmpPts, dwNumPoints); + + // clusterize for the current ramp + CGU_FLOAT bestE = + ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, BC15options); + if (bestE == 0.f || !nRefineSteps) // if exact, we've done + return bestE; + + // Tweak each component in isolation and get the best values + + // precompute ramp errors for Green and Blue + CGU_FLOAT RmpErr[MAX_POINTS][MAX_BLOCK]; + for (CGU_INT32 i = 0; i < _NmrClrs; i++) { + for (CGU_INT32 r = 0; r < dwNumPoints; r++) { + CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]); + CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]); + RmpErr[r][i] = DistG * DistG * fWeightGreen + DistB * DistB * fWeightBlue; + } + } + + // First Red + CGU_FLOAT bstC0 = InpRmp0[RC][0]; + CGU_FLOAT bstC1 = InpRmp0[RC][1]; + int nRefineStart = 0 - (minb(nRefineSteps, (CGU_UINT8)8)); + int nRefineEnd = minb(nRefineSteps, (CGU_UINT8)8); + for (CGU_INT32 i = nRefineStart; i <= nRefineEnd; i++) { + for (CGU_INT32 j = nRefineStart; j <= nRefineEnd; j++) { + // make a move; both sides of interval. + InpRmp[RC][0] = minf(maxf(InpRmp0[RC][0] + i * Fctrs[RC], 0.f), 255.f); + InpRmp[RC][1] = minf(maxf(InpRmp0[RC][1] + j * Fctrs[RC], 0.f), 255.f); + + // make ramp endpoints the way they'll going to be decompressed + // plus check whether the ramp is flat + MkWkRmpPts(&Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + + // build ramp only for red + BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints); + + // compute cumulative error + CGU_FLOAT mse = 0.f; + int rmp_l = (Eq) ? 
1 : dwNumPoints; + for (CGU_INT32 k = 0; k < _NmrClrs; k++) { + CGU_FLOAT MinErr = 10000000.f; + for (CGU_INT32 r = 0; r < rmp_l; r++) { + CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]); + CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightRed; + MinErr = minf(MinErr, Err); + } + mse += MinErr * _Rpt[k]; + } + + // save if we achieve better result + if (mse < bestE) { + bstC0 = InpRmp[RC][0]; + bstC1 = InpRmp[RC][1]; + bestE = mse; + } + } + } + + // our best REDs + InpRmp[RC][0] = bstC0; + InpRmp[RC][1] = bstC1; + + // make ramp endpoints the way they'll going to be decompressed + // plus check whether the ramp is flat + MkWkRmpPts(&Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + + // build ramp only for green + BldRmp(Rmp, WkRmpPts, dwNumPoints); + + // precompute ramp errors for Red and Blue + for (CGU_INT32 i = 0; i < _NmrClrs; i++) { + for (CGU_INT32 r = 0; r < dwNumPoints; r++) { + CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]); + CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]); + RmpErr[r][i] = DistR * DistR * fWeightRed + DistB * DistB * fWeightBlue; + } + } + + // Now green + bstC0 = InpRmp0[GC][0]; + bstC1 = InpRmp0[GC][1]; + for (CGU_INT32 i = nRefineStart; i <= nRefineEnd; i++) { + for (CGU_INT32 j = nRefineStart; j <= nRefineEnd; j++) { + InpRmp[GC][0] = minf(maxf(InpRmp0[GC][0] + i * Fctrs[GC], 0.f), 255.f); + InpRmp[GC][1] = minf(maxf(InpRmp0[GC][1] + j * Fctrs[GC], 0.f), 255.f); + + MkWkRmpPts(&Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints); + + CGU_FLOAT mse = 0.f; + int rmp_l = (Eq) ? 
1 : dwNumPoints; + for (CGU_INT32 k = 0; k < _NmrClrs; k++) { + CGU_FLOAT MinErr = 10000000.f; + for (CGU_INT32 r = 0; r < rmp_l; r++) { + CGU_FLOAT Dist = (Rmp[GC][r] - Blk[k][GC]); + CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightGreen; + MinErr = minf(MinErr, Err); + } + mse += MinErr * _Rpt[k]; + } + + if (mse < bestE) { + bstC0 = InpRmp[GC][0]; + bstC1 = InpRmp[GC][1]; + bestE = mse; + } + } + } + + // our best GREENs + InpRmp[GC][0] = bstC0; + InpRmp[GC][1] = bstC1; + + MkWkRmpPts(&Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + BldRmp(Rmp, WkRmpPts, dwNumPoints); + + // ramp err for Red and Green + for (CGU_INT32 i = 0; i < _NmrClrs; i++) { + for (CGU_INT32 r = 0; r < dwNumPoints; r++) { + CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]); + CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]); + RmpErr[r][i] = DistR * DistR * fWeightRed + DistG * DistG * fWeightGreen; + } + } + + bstC0 = InpRmp0[BC][0]; + bstC1 = InpRmp0[BC][1]; + // Now blue + for (CGU_INT32 i = nRefineStart; i <= nRefineEnd; i++) { + for (CGU_INT32 j = nRefineStart; j <= nRefineEnd; j++) { + InpRmp[BC][0] = minf(maxf(InpRmp0[BC][0] + i * Fctrs[BC], 0.f), 255.f); + InpRmp[BC][1] = minf(maxf(InpRmp0[BC][1] + j * Fctrs[BC], 0.f), 255.f); + + MkWkRmpPts(&Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits); + BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints); + + CGU_FLOAT mse = 0.f; + int rmp_l = (Eq) ? 
1 : dwNumPoints; + for (CGU_INT32 k = 0; k < _NmrClrs; k++) { + CGU_FLOAT MinErr = 10000000.f; + for (CGU_INT32 r = 0; r < rmp_l; r++) { + CGU_FLOAT Dist = (Rmp[BC][r] - Blk[k][BC]); + CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightBlue; + MinErr = minf(MinErr, Err); + } + mse += MinErr * _Rpt[k]; + } + + if (mse < bestE) { + bstC0 = InpRmp[BC][0]; + bstC1 = InpRmp[BC][1]; + bestE = mse; + } + } + } + + // our best BLUEs + InpRmp[BC][0] = bstC0; + InpRmp[BC][1] = bstC1; + + // return our best choice + for (CGU_INT32 j = 0; j < 3; j++) + for (CGU_INT32 k = 0; k < 2; k++) _OutRmpPnts[j][k] = InpRmp[j][k]; + + return bestE; +} +#endif // !BC5 +#endif //! BC4 + +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) + +static CGU_DWORD ConstructColor(CGU_UINT8 R, CGU_UINT8 nRedBits, CGU_UINT8 G, + CGU_UINT8 nGreenBits, CGU_UINT8 B, + CGU_UINT8 nBlueBits) { + return (((R & nByteBitsMask[nRedBits]) + << (nGreenBits + nBlueBits - (PIX_GRID - nRedBits))) | + ((G & nByteBitsMask[nGreenBits]) + << (nBlueBits - (PIX_GRID - nGreenBits))) | + ((B & nByteBitsMask[nBlueBits]) >> ((PIX_GRID - nBlueBits)))); +} +#endif // !BC5 +#endif // !BC4 + +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) +// Compute error and find DXTC indexes for the current cluster +static CGU_FLOAT ClstrIntnl(CGU_FLOAT _Blk[MAX_BLOCK][NUM_CHANNELS], + CGU_UINT8 *_Indxs, + CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], + int dwBlockSize, CGU_UINT8 dwNumPoints, + CGU_BOOL _ConstRamp, + CMP_GLOBAL const CMP_BC15Options *BC15options, + CGU_BOOL _bUseAlpha) { + CGU_FLOAT Err = 0.f; + CGU_UINT8 rmp_l = (_ConstRamp) ? 
1 : dwNumPoints; + + // For each colour in the original block assign it + // to the closest cluster and compute the cumulative error + for (CGU_INT32 i = 0; i < dwBlockSize; i++) { + if (_bUseAlpha && *((CGU_DWORD *)&_Blk[i][AC]) == 0) + _Indxs[i] = dwNumPoints; + else { + CGU_FLOAT shortest = 99999999999.f; + CGU_UINT8 shortestIndex = 0; + if (BC15options) + for (CGU_UINT8 r = 0; r < rmp_l; r++) { + // calculate the distance for each component + CGU_FLOAT distance = + (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) * + BC15options->m_fChannelWeights[0] + + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) * + BC15options->m_fChannelWeights[1] + + (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]) * + BC15options->m_fChannelWeights[2]; + + if (distance < shortest) { + shortest = distance; + shortestIndex = r; + } + } + else + for (CGU_UINT8 r = 0; r < rmp_l; r++) { + // calculate the distance for each component + CGU_FLOAT distance = + (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) + + (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) + + (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]); + + if (distance < shortest) { + shortest = distance; + shortestIndex = r; + } + } + + Err += shortest; + + // We have the index of the best cluster, so assign this in the block + // Reorder indices to match correct DXTC ordering + if (shortestIndex == dwNumPoints - 1) + shortestIndex = 1; + else if (shortestIndex) + shortestIndex++; + _Indxs[i] = shortestIndex; + } + } + + return Err; +} +#endif // !BC5 +#endif // !BC4 + +/*------------------------------------------------------------------------------------------------ +// input ramp is on the coarse grid +------------------------------------------------------------------------------------------------*/ +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) +static CGU_FLOAT ClstrBas(CGU_UINT8 *_Indxs, + CGU_FLOAT _Blk[MAX_BLOCK][NUM_CHANNELS], + CGU_FLOAT 
_InpRmp[NUM_CHANNELS][NUM_ENDPOINTS], + int dwBlockSize, CGU_UINT8 dwNumPoints, + CMP_GLOBAL const CMP_BC15Options *BC15options, + CGU_BOOL _bUseAlpha, CGU_UINT8 nRedBits, + CGU_UINT8 nGreenBits, CGU_UINT8 nBlueBits) { + // make ramp endpoints the way they'll going to be decompressed + CGU_BOOL Eq = TRUE; + CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; + MkWkRmpPts(&Eq, InpRmp, _InpRmp, nRedBits, nGreenBits, nBlueBits); + + // build ramp as it would be built by decompressor + CGU_FLOAT Rmp[NUM_CHANNELS][MAX_POINTS]; + BldRmp(Rmp, InpRmp, dwNumPoints); + + // clusterize and find a cumulative error + return ClstrIntnl(_Blk, _Indxs, Rmp, dwBlockSize, dwNumPoints, Eq, + BC15options, _bUseAlpha); +} +#endif // !BC5 +#endif // !BC4 + +/*------------------------------------------------------------------------------------------------ +Clusterization the way it looks from the DXTC decompressor +------------------------------------------------------------------------------------------------*/ +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) +static CGU_FLOAT Clstr(CGU_UINT32 block_32[MAX_BLOCK], CGU_UINT32 dwBlockSize, + CGU_UINT8 nEndpoints[3][NUM_ENDPOINTS], + CGU_UINT8 *pcIndices, CGU_UINT8 dwNumPoints, + CMP_GLOBAL const CMP_BC15Options *BC15options, + CGU_BOOL _bUseAlpha, CGU_UINT8 _nAlphaThreshold, + CGU_UINT8 nRedBits, CGU_UINT8 nGreenBits, + CGU_UINT8 nBlueBits) { + CGU_INT32 c0 = ConstructColor(nEndpoints[RC][0], nRedBits, nEndpoints[GC][0], + nGreenBits, nEndpoints[BC][0], nBlueBits); + CGU_INT32 c1 = ConstructColor(nEndpoints[RC][1], nRedBits, nEndpoints[GC][1], + nGreenBits, nEndpoints[BC][1], nBlueBits); + CGU_INT32 nEndpointIndex0 = 0; + CGU_INT32 nEndpointIndex1 = 1; + if ((!(dwNumPoints & 0x1) && c0 <= c1) || ((dwNumPoints & 0x1) && c0 > c1)) { + nEndpointIndex0 = 1; + nEndpointIndex1 = 0; + } + + CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS]; + InpRmp[RC][0] = (CGU_FLOAT)nEndpoints[RC][nEndpointIndex0]; + InpRmp[RC][1] = 
(CGU_FLOAT)nEndpoints[RC][nEndpointIndex1]; + InpRmp[GC][0] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex0]; + InpRmp[GC][1] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex1]; + InpRmp[BC][0] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex0]; + InpRmp[BC][1] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex1]; + + CGU_UINT32 dwAlphaThreshold = _nAlphaThreshold << 24; + CGU_FLOAT Blk[MAX_BLOCK][NUM_CHANNELS]; + for (CGU_UINT32 i = 0; i < dwBlockSize; i++) { + Blk[i][RC] = (CGU_FLOAT)((block_32[i] & 0xff0000) >> 16); + Blk[i][GC] = (CGU_FLOAT)((block_32[i] & 0xff00) >> 8); + Blk[i][BC] = (CGU_FLOAT)(block_32[i] & 0xff); + if (_bUseAlpha) + Blk[i][AC] = ((block_32[i] & 0xff000000) >= dwAlphaThreshold) ? 1.f : 0.f; + } + + return ClstrBas(pcIndices, Blk, InpRmp, dwBlockSize, dwNumPoints, BC15options, + _bUseAlpha, nRedBits, nGreenBits, nBlueBits); +} +#endif // !BC5 +#endif // !BC4 + +//---------------------------------------------------- +// This function decompresses a DXT colour block +// The block is decompressed to 8 bits per channel +// Result buffer is RGBA format +//---------------------------------------------------- +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) +#ifndef ASPM_GPU +static void DecompressDXTRGB_Internal(CGU_UINT8 rgbBlock[BLOCK_SIZE_4X4X4], + const CGU_UINT32 compressedBlock[2], + const CMP_BC15Options *BC15options) { + + CGU_BOOL bDXT1 = TRUE; + CGU_UINT32 n0 = compressedBlock[0] & 0xffff; + CGU_UINT32 n1 = compressedBlock[0] >> 16; + CGU_UINT32 r0; + CGU_UINT32 g0; + CGU_UINT32 b0; + CGU_UINT32 r1; + CGU_UINT32 g1; + CGU_UINT32 b1; + + r0 = ((n0 & 0xf800) >> 8); + g0 = ((n0 & 0x07e0) >> 3); + b0 = ((n0 & 0x001f) << 3); + + r1 = ((n1 & 0xf800) >> 8); + g1 = ((n1 & 0x07e0) >> 3); + b1 = ((n1 & 0x001f) << 3); + + // Apply the lower bit replication to give full dynamic range + r0 += (r0 >> 5); + r1 += (r1 >> 5); + g0 += (g0 >> 6); + g1 += (g1 >> 6); + b0 += (b0 >> 5); + b1 += (b1 >> 5); + +if (!BC15options->m_mapDecodeRGBA) +{ + 
//-------------------------------------------------------------- + // Channel mapping output as BGRA + //-------------------------------------------------------------- + CGU_UINT32 c0 = 0xff000000 | (r0<<16) | (g0<<8) | b0; + CGU_UINT32 c1 = 0xff000000 | (r1<<16) | (g1<<8) | b1; + + if(!bDXT1 || n0 > n1) + { + CGU_UINT32 c2 = 0xff000000 | (((2*r0+r1+1)/3)<<16) | (((2*g0+g1+1)/3)<<8) | (((2*b0+b1+1)/3)); + CGU_UINT32 c3 = 0xff000000 | (((2*r1+r0+1)/3)<<16) | (((2*g1+g0+1)/3)<<8) | (((2*b1+b0+1)/3)); + + for(int i=0; i<16; i++) + { + int index = (compressedBlock[1] >> (2 * i)) & 3; + + switch(index) + { + case 0: + ((CGU_UINT32*)rgbBlock)[i] = c0; + break; + case 1: + ((CGU_UINT32*)rgbBlock)[i] = c1; + break; + case 2: + ((CGU_UINT32*)rgbBlock)[i] = c2; + break; + case 3: + ((CGU_UINT32*)rgbBlock)[i] = c3; + break; + } + } + } + else + { + // Transparent decode + CGU_UINT32 c2 = 0xff000000 | (((r0+r1)/2)<<16) | (((g0+g1)/2)<<8) | (((b0+b1)/2)); + + for(int i=0; i<16; i++) + { + int index = (compressedBlock[1] >> (2 * i)) & 3; + + switch(index) + { + case 0: + ((CGU_UINT32*)rgbBlock)[i] = c0; + break; + case 1: + ((CGU_UINT32*)rgbBlock)[i] = c1; + break; + case 2: + ((CGU_UINT32*)rgbBlock)[i] = c2; + break; + case 3: + ((CGU_UINT32*)rgbBlock)[i] = 0x00000000; + break; + } + } + } +} +else { // MAP_BC15_TO_ABGR + //-------------------------------------------------------------- + // Channel mapping output as ARGB + //-------------------------------------------------------------- + + CGU_UINT32 c0 = 0xff000000 | (b0 << 16) | (g0 << 8) | r0; + CGU_UINT32 c1 = 0xff000000 | (b1 << 16) | (g1 << 8) | r1; + + if (!bDXT1 || n0 > n1) { + CGU_UINT32 c2 = 0xff000000 | (((2 * b0 + b1 + 1) / 3) << 16) | + (((2 * g0 + g1 + 1) / 3) << 8) | (((2 * r0 + r1 + 1) / 3)); + CGU_UINT32 c3 = 0xff000000 | (((2 * b1 + b0 + 1) / 3) << 16) | + (((2 * g1 + g0 + 1) / 3) << 8) | (((2 * r1 + r0 + 1) / 3)); + + for (int i = 0; i < 16; i++) { + int index = (compressedBlock[1] >> (2 * i)) & 3; + switch 
(index) { + case 0: + ((CMP_GLOBAL CGU_UINT32 *)rgbBlock)[i] = c0; + break; + case 1: + ((CMP_GLOBAL CGU_UINT32 *)rgbBlock)[i] = c1; + break; + case 2: + ((CMP_GLOBAL CGU_UINT32 *)rgbBlock)[i] = c2; + break; + case 3: + ((CMP_GLOBAL CGU_UINT32 *)rgbBlock)[i] = c3; + break; + } + } + } else { + // Transparent decode + CGU_UINT32 c2 = 0xff000000 | (((b0 + b1) / 2) << 16) | + (((g0 + g1) / 2) << 8) | (((r0 + r1) / 2)); + + for (int i = 0; i < 16; i++) { + int index = (compressedBlock[1] >> (2 * i)) & 3; + switch (index) { + case 0: + ((CMP_GLOBAL CGU_UINT32 *)rgbBlock)[i] = c0; + break; + case 1: + ((CMP_GLOBAL CGU_UINT32 *)rgbBlock)[i] = c1; + break; + case 2: + ((CMP_GLOBAL CGU_UINT32 *)rgbBlock)[i] = c2; + break; + case 3: + ((CMP_GLOBAL CGU_UINT32 *)rgbBlock)[i] = 0x00000000; + break; + } + } + } +} //MAP_ABGR +} +#endif // !ASPM_GPU +#endif // !BC5 +#endif // !BC4 + +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) +static int QSortIntCmp(const void *Elem1, const void *Elem2) { + return (*(CGU_INT32 *)Elem1 - *(CGU_INT32 *)Elem2); +} +#endif // !BC5 +#endif // !BC4 + +// Find the first approximation of the line +// Assume there is a linear relation +// Z = a * X_In +// Z = b * Y_In +// Find a,b to minimize MSE between Z and Z_In +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) + +static void FindAxis(CGU_FLOAT _outBlk[MAX_BLOCK][NUM_CHANNELS], + CGU_FLOAT fLineDirection[NUM_CHANNELS], + CGU_FLOAT fBlockCenter[NUM_CHANNELS], CGU_BOOL *_pbSmall, + CGU_FLOAT _inpBlk[MAX_BLOCK][NUM_CHANNELS], + CGU_FLOAT _inpRpt[MAX_BLOCK], int nDimensions, + int nNumColors) { + CGU_FLOAT Crrl[NUM_CHANNELS]; + CGU_FLOAT RGB2[NUM_CHANNELS]; + + fLineDirection[0] = fLineDirection[1] = fLineDirection[2] = RGB2[0] = + RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = fBlockCenter[0] = + fBlockCenter[1] = fBlockCenter[2] = 0.f; + + // sum position of all points + CGU_FLOAT fNumPoints = 0.f; + for (CGU_INT32 i = 0; i < nNumColors; i++) { + 
fBlockCenter[0] += _inpBlk[i][0] * _inpRpt[i]; + fBlockCenter[1] += _inpBlk[i][1] * _inpRpt[i]; + fBlockCenter[2] += _inpBlk[i][2] * _inpRpt[i]; + fNumPoints += _inpRpt[i]; + } + + // and then average to calculate center coordinate of block + fBlockCenter[0] /= fNumPoints; + fBlockCenter[1] /= fNumPoints; + fBlockCenter[2] /= fNumPoints; + + for (CGU_INT32 i = 0; i < nNumColors; i++) { + // calculate output block as offsets around block center + _outBlk[i][0] = _inpBlk[i][0] - fBlockCenter[0]; + _outBlk[i][1] = _inpBlk[i][1] - fBlockCenter[1]; + _outBlk[i][2] = _inpBlk[i][2] - fBlockCenter[2]; + + // compute correlation matrix + // RGB2 = sum of ((distance from point from center) squared) + // Crrl = ???????. Seems to be be some calculation based on distance from + // point center in two dimensions + for (CGU_INT32 j = 0; j < nDimensions; j++) { + RGB2[j] += _outBlk[i][j] * _outBlk[i][j] * _inpRpt[i]; + Crrl[j] += _outBlk[i][j] * _outBlk[i][(j + 1) % 3] * _inpRpt[i]; + } + } + + // if set's diameter is small + int i0 = 0, i1 = 1; + CGU_FLOAT mxRGB2 = 0.f; + int k = 0, j = 0; + CGU_FLOAT fEPS = fNumPoints * EPS; + for (k = 0, j = 0; j < 3; j++) { + if (RGB2[j] >= fEPS) + k++; + else + RGB2[j] = 0.f; + + if (mxRGB2 < RGB2[j]) { + mxRGB2 = RGB2[j]; + i0 = j; + } + } + + CGU_FLOAT fEPS2 = fNumPoints * EPS2; + *_pbSmall = TRUE; + for (j = 0; j < 3; j++) *_pbSmall &= (RGB2[j] < fEPS2); + + if (*_pbSmall) // all are very small to avoid division on the small + // determinant + return; + + if (k == 1) // really only 1 dimension + fLineDirection[i0] = 1.; + else if (k == 2) // really only 2 dimensions + { + i1 = (RGB2[(i0 + 1) % 3] > 0.f) ? (i0 + 1) % 3 : (i0 + 2) % 3; + CGU_FLOAT Crl = (i1 == (i0 + 1) % 3) ? 
Crrl[i0] : Crrl[(i0 + 2) % 3]; + fLineDirection[i1] = Crl / RGB2[i0]; + fLineDirection[i0] = 1.; + } else { + CGU_FLOAT maxDet = 100000.f; + CGU_FLOAT Cs[3]; + // select max det for precision + for (j = 0; j < nDimensions; j++) { + CGU_FLOAT Det = RGB2[j] * RGB2[(j + 1) % 3] - Crrl[j] * Crrl[j]; + Cs[j] = fabs(Crrl[j] / sqrt(RGB2[j] * RGB2[(j + 1) % 3])); + if (maxDet < Det) { + maxDet = Det; + i0 = j; + } + } + + // inverse correl matrix + // -- -- -- -- + // | A B | | C -B | + // | B C | => | -B A | + // -- -- -- -- + CGU_FLOAT mtrx1[2][2]; + CGU_FLOAT vc1[2]; + CGU_FLOAT vc[2]; + vc1[0] = Crrl[(i0 + 2) % 3]; + vc1[1] = Crrl[(i0 + 1) % 3]; + // C + mtrx1[0][0] = RGB2[(i0 + 1) % 3]; + // A + mtrx1[1][1] = RGB2[i0]; + // -B + mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0]; + // find a solution + vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1]; + vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1]; + // normalize + vc[0] /= maxDet; + vc[1] /= maxDet; + // find a line direction vector + fLineDirection[i0] = 1.; + fLineDirection[(i0 + 1) % 3] = 1.; + fLineDirection[(i0 + 2) % 3] = vc[0] + vc[1]; + } + + // normalize direction vector + CGU_FLOAT Len = fLineDirection[0] * fLineDirection[0] + + fLineDirection[1] * fLineDirection[1] + + fLineDirection[2] * fLineDirection[2]; + Len = sqrt(Len); + + for (j = 0; j < 3; j++) + fLineDirection[j] = (Len > 0.f) ? 
fLineDirection[j] / Len : 0.f; +} +#endif // !BC5 +#endif // !BC4 + +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) +static void CompressRGBBlockX( + CGU_FLOAT _RsltRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS], + CGU_FLOAT _BlkIn[MAX_BLOCK][NUM_CHANNELS], CGU_FLOAT _Rpt[MAX_BLOCK], + int _UniqClrs, CGU_UINT8 dwNumPoints, CGU_BOOL b3DRefinement, + CGU_UINT8 nRefinementSteps, CMP_GLOBAL const CMP_BC15Options *BC15options, + CGU_UINT8 nRedBits, CGU_UINT8 nGreenBits, CGU_UINT8 nBlueBits) { + ALIGN_16 CGU_FLOAT Prj0[MAX_BLOCK]; + ALIGN_16 CGU_FLOAT Prj[MAX_BLOCK]; + ALIGN_16 CGU_FLOAT PrjErr[MAX_BLOCK]; + ALIGN_16 CGU_FLOAT LineDir[NUM_CHANNELS]; + ALIGN_16 CGU_FLOAT RmpIndxs[MAX_BLOCK]; + + CGU_FLOAT LineDirG[NUM_CHANNELS]; + CGU_FLOAT PosG[NUM_ENDPOINTS]; + CGU_FLOAT Blk[MAX_BLOCK][NUM_CHANNELS]; + CGU_FLOAT BlkSh[MAX_BLOCK][NUM_CHANNELS]; + CGU_FLOAT LineDir0[NUM_CHANNELS]; + CGU_FLOAT Mdl[NUM_CHANNELS]; + + CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS]; + int i, j, k; + + // down to [0., 1.] + for (i = 0; i < _UniqClrs; i++) + for (j = 0; j < 3; j++) Blk[i][j] = _BlkIn[i][j] / 255.f; + + CGU_BOOL isDONE = FALSE; + + // as usual if not more then 2 different colors, we've done + if (_UniqClrs <= 2) { + for (j = 0; j < 3; j++) { + rsltC[j][0] = _BlkIn[0][j]; + rsltC[j][1] = _BlkIn[_UniqClrs - 1][j]; + } + isDONE = TRUE; + } + + if (!isDONE) { + // This is our first attempt to find an axis we will go along. + // The cumulation is done to find a line minimizing the MSE from the + // input 3D points. + CGU_BOOL bSmall = TRUE; + FindAxis(BlkSh, LineDir0, Mdl, &bSmall, Blk, _Rpt, 3, _UniqClrs); + + // While trying to find the axis we found that the diameter of the input + // set is quite small. Do not bother. + if (bSmall) { + for (j = 0; j < 3; j++) { + rsltC[j][0] = _BlkIn[0][j]; + rsltC[j][1] = _BlkIn[_UniqClrs - 1][j]; + } + isDONE = TRUE; + } + } + + // GCC is being an awful being when it comes to goto-jumps. + // So please bear with this. 
+ if (!isDONE) { + CGU_FLOAT ErrG = 10000000.f; + CGU_FLOAT PrjBnd[NUM_ENDPOINTS]; + ALIGN_16 CGU_FLOAT PreMRep[MAX_BLOCK]; + for (j = 0; j < 3; j++) LineDir[j] = LineDir0[j]; + + // Here is the main loop. + // 1. Project input set on the axis in consideration. + // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal + // pair of end points. + // 3. Compute the vector of indexes (or clusters) for the current + // approximate ramp. + // 4. Present our color channels as 3 16DIM vectors. + // 5. Find closest approximation of each of 16DIM color vector with the + // projection of the 16DIM index vector. + // 6. Plug the projections as a new directional vector for the axis. + // 7. Goto 1. + + // D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3, + // 2/3, 0, ...,}, but shifted and normalized). Ci - is a 16 dim vector of + // color i. for each Ci find a scalar Ai such that (Ai * D - Ci) (Ai * D + // - Ci) -> min , i.e distance between vector AiD and C is min. You can + // think of D as a unit interval(vector) "clusterizer", and Ai is a scale + // you need to apply to the clusterizer to approximate the Ci vector + // instead of the unit vector. + + // Solution is + + // Ai = (D . Ci) / (D . D); . - is a dot product. + + // in 3 dim space Ai(s) represent a line direction, along which + // we again try to find (sub)optimal quantizer. + + // That's what our for(;;) loop is about. + for (;;) { + // 1. Project input set on the axis in consideration. + // From Foley & Van Dam: Closest point of approach of a line (P + v) to a + // point (R) is + // P + ((R-P).v) / (v.v))v + // The distance along v is therefore (R-P).v / (v.v) + // (v.v) is 1 if v is a unit vector. 
+ // + PrjBnd[0] = 1000.; + PrjBnd[1] = -1000.; + for (i = 0; i < MAX_BLOCK; i++) + Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f; + + for (i = 0; i < _UniqClrs; i++) { + Prj0[i] = Prj[i] = BlkSh[i][0] * LineDir[0] + BlkSh[i][1] * LineDir[1] + + BlkSh[i][2] * LineDir[2]; + + PrjErr[i] = (BlkSh[i][0] - LineDir[0] * Prj[i]) * + (BlkSh[i][0] - LineDir[0] * Prj[i]) + + (BlkSh[i][1] - LineDir[1] * Prj[i]) * + (BlkSh[i][1] - LineDir[1] * Prj[i]) + + (BlkSh[i][2] - LineDir[2] * Prj[i]) * + (BlkSh[i][2] - LineDir[2] * Prj[i]); + + PrjBnd[0] = minf(PrjBnd[0], Prj[i]); + PrjBnd[1] = maxf(PrjBnd[1], Prj[i]); + } + + // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal + // pair of end points. + + // min and max of the search interval + CGU_FLOAT Scl[NUM_ENDPOINTS]; + Scl[0] = PrjBnd[0] - (PrjBnd[1] - PrjBnd[0]) * 0.125f; + ; + Scl[1] = PrjBnd[1] + (PrjBnd[1] - PrjBnd[0]) * 0.125f; + ; + + // compute scaling factor to scale down the search interval to [0.,1] + const CGU_FLOAT Scl2 = (Scl[1] - Scl[0]) * (Scl[1] - Scl[0]); + const CGU_FLOAT overScl = 1.f / (Scl[1] - Scl[0]); + + for (i = 0; i < _UniqClrs; i++) { + // scale them + Prj[i] = (Prj[i] - Scl[0]) * overScl; + // premultiply the scale squire to plug into error computation later + PreMRep[i] = _Rpt[i] * Scl2; + } + + // scale first approximation of end points + for (k = 0; k < 2; k++) PrjBnd[k] = (PrjBnd[k] - Scl[0]) * overScl; + + CGU_FLOAT Err = MAX_ERROR; + + // search step + CGU_FLOAT stp = 0.025f; + + // low Start/End; high Start/End + const CGU_FLOAT lS = + (PrjBnd[0] - 2.f * stp > 0.f) ? PrjBnd[0] - 2.f * stp : 0.f; + const CGU_FLOAT hE = + (PrjBnd[1] + 2.f * stp < 1.f) ? PrjBnd[1] + 2.f * stp : 1.f; + + // find the best endpoints + CGU_FLOAT Pos[NUM_ENDPOINTS]; + CGU_FLOAT lP, hP; + int l, h; + for (l = 0, lP = lS; l < 8; l++, lP += stp) { + for (h = 0, hP = hE; h < 8; h++, hP -= stp) { + CGU_FLOAT err = Err; + // compute an error for the current pair of end points. 
+ err = RampSrchW(Prj, PrjErr, PreMRep, err, lP, hP, _UniqClrs, + dwNumPoints); + + if (err < Err) { + // save better result + Err = err; + Pos[0] = lP; + Pos[1] = hP; + } + } + } + + // inverse the scaling + for (k = 0; k < 2; k++) Pos[k] = Pos[k] * (Scl[1] - Scl[0]) + Scl[0]; + + // did we find somthing better from the previous run? + if (Err + 0.001 < ErrG) { + // yes, remember it + ErrG = Err; + LineDirG[0] = LineDir[0]; + LineDirG[1] = LineDir[1]; + LineDirG[2] = LineDir[2]; + PosG[0] = Pos[0]; + PosG[1] = Pos[1]; + // 3. Compute the vector of indexes (or clusters) for the current + // approximate ramp. + // indexes + const CGU_FLOAT step = (Pos[1] - Pos[0]) / (CGU_FLOAT)(dwNumPoints - 1); + const CGU_FLOAT step_h = step * (CGU_FLOAT)0.5; + const CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step; + const CGU_FLOAT overBlkTp = 1.f / (CGU_FLOAT)(dwNumPoints - 1); + + // here the index vector is computed, + // shifted and normalized + CGU_FLOAT indxAvrg = (CGU_FLOAT)(dwNumPoints - 1) / 2.f; + + for (i = 0; i < _UniqClrs; i++) { + CGU_FLOAT del; + // CGU_UINT32 n = (CGU_UINT32)((b - _min_ex + (step*0.5f)) * rstep); + if ((del = Prj0[i] - Pos[0]) <= 0) + RmpIndxs[i] = 0.f; + else if (Prj0[i] - Pos[1] >= 0) + RmpIndxs[i] = (CGU_FLOAT)(dwNumPoints - 1); + else + RmpIndxs[i] = floor((del + step_h) * rstep); + // shift and normalization + RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp; + } + + // 4. Present our color channels as 3 16DIM vectors. + // 5. Find closest aproximation of each of 16DIM color vector with the + // pojection of the 16DIM index vector. + CGU_FLOAT Crs[3], Len, Len2; + for (i = 0, Crs[0] = Crs[1] = Crs[2] = Len = 0.f; i < _UniqClrs; i++) { + const CGU_FLOAT PreMlt = RmpIndxs[i] * _Rpt[i]; + Len += RmpIndxs[i] * PreMlt; + for (j = 0; j < 3; j++) Crs[j] += BlkSh[i][j] * PreMlt; + } + + LineDir[0] = LineDir[1] = LineDir[2] = 0.f; + if (Len > 0.f) { + LineDir[0] = Crs[0] / Len; + LineDir[1] = Crs[1] / Len; + LineDir[2] = Crs[2] / Len; + + // 6. 
Plug the projections as a new directional vector for the axis. + // 7. Goto 1. + Len2 = LineDir[0] * LineDir[0] + LineDir[1] * LineDir[1] + + LineDir[2] * LineDir[2]; + Len2 = sqrt(Len2); + + LineDir[0] /= Len2; + LineDir[1] /= Len2; + LineDir[2] /= Len2; + } + } else // We was not able to find anything better. Drop dead. + break; + } + + // inverse transform to find end-points of 3-color ramp + for (k = 0; k < 2; k++) + for (j = 0; j < 3; j++) + rsltC[j][k] = (PosG[k] * LineDirG[j] + Mdl[j]) * 255.f; + } + + // We've dealt with (almost) unrestricted full precision realm. + // Now back to the dirty digital world. + + // round the end points to make them look like compressed ones + CGU_FLOAT inpRmpEndPts[NUM_CHANNELS][NUM_ENDPOINTS]; + MkRmpOnGrid(inpRmpEndPts, rsltC, 0.f, 255.f, nRedBits, nGreenBits, nBlueBits); + + // This not a small procedure squeezes and stretches the ramp along each + // axis (R,G,B) separately while other 2 are fixed. It does it only over + // coarse grid - 565 that is. It tries to squeeze more precision for the + // real world ramp. 
+ if (b3DRefinement) + Refine3D(_RsltRmpPnts, inpRmpEndPts, _BlkIn, _Rpt, _UniqClrs, dwNumPoints, + BC15options, nRedBits, nGreenBits, nBlueBits, nRefinementSteps); + else + Refine(_RsltRmpPnts, inpRmpEndPts, _BlkIn, _Rpt, _UniqClrs, dwNumPoints, + BC15options, nRedBits, nGreenBits, nBlueBits, nRefinementSteps); +} +#endif // !BC5 +#endif // !BC4 + +#ifdef ASPM_GPU +void cmp_memsetfBCn(CGU_FLOAT ptr[], CGU_FLOAT value, CGU_UINT32 size) { + for (CGU_UINT32 i = 0; i < size; i++) { + ptr[i] = value; + } +} +#endif + +#ifdef ASPM_GPU +void memset(CGU_UINT8 *srcdata, CGU_UINT8 value, CGU_INT size) { + for (CGU_INT i = 0; i < size; i++) *srcdata++ = value; +} + +void memcpy(CGU_UINT8 *srcdata, CGU_UINT8 *dstdata, CGU_INT size) { + for (CGU_INT i = 0; i < size; i++) { + *srcdata = *dstdata; + srcdata++; + dstdata++; + } +} + +void cmp_memsetBC1(CGU_UINT8 ptr[], CGU_UINT8 value, CGU_UINT32 size) { + for (CGU_UINT32 i = 0; i < size; i++) { + ptr[i] = value; + } +} +#endif + +#ifdef ASPM_GPU +static void sortData_UINT32(CGU_UINT32 data_ordered[BLOCK_SIZE], + CGU_UINT32 projection[BLOCK_SIZE], + CGU_UINT32 numEntries // max 64 +) { + CMP_di what[BLOCK_SIZE]; + + for (CGU_UINT32 i = 0; i < numEntries; i++) { + what[i].index = i; + what[i].data = projection[i]; + } + + CGU_UINT32 tmp_index; + CGU_UINT32 tmp_data; + + for (CGU_UINT32 i = 1; i < numEntries; i++) { + for (CGU_UINT32 j = i; j > 0; j--) { + if (what[j - 1].data > what[j].data) { + tmp_index = what[j].index; + tmp_data = what[j].data; + what[j].index = what[j - 1].index; + what[j].data = what[j - 1].data; + what[j - 1].index = tmp_index; + what[j - 1].data = tmp_data; + } + } + } + + for (CGU_UINT32 i = 0; i < numEntries; i++) data_ordered[i] = what[i].data; +}; + +static void sortData_FLOAT(CGU_FLOAT data_ordered[BLOCK_SIZE], + CGU_FLOAT projection[BLOCK_SIZE], + CGU_UINT32 numEntries // max 64 +) { + CMP_df what[BLOCK_SIZE]; + + for (CGU_UINT32 i = 0; i < numEntries; i++) { + what[i].index = i; + what[i].data = 
projection[i]; + } + + CGU_UINT32 tmp_index; + CGU_FLOAT tmp_data; + + for (CGU_UINT32 i = 1; i < numEntries; i++) { + for (CGU_UINT32 j = i; j > 0; j--) { + if (what[j - 1].data > what[j].data) { + tmp_index = what[j].index; + tmp_data = what[j].data; + what[j].index = what[j - 1].index; + what[j].data = what[j - 1].data; + what[j - 1].index = tmp_index; + what[j - 1].data = tmp_data; + } + } + } + + for (CGU_UINT32 i = 0; i < numEntries; i++) data_ordered[i] = what[i].data; +}; +#endif + +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) +static CGU_FLOAT CompRGBBlock(CGU_UINT32 *block_32, CGU_UINT32 dwBlockSize, + CGU_UINT8 nRedBits, CGU_UINT8 nGreenBits, + CGU_UINT8 nBlueBits, + CGU_UINT8 nEndpoints[3][NUM_ENDPOINTS], + CGU_UINT8 *pcIndices, CGU_UINT8 dwNumPoints, + CGU_BOOL b3DRefinement, + CGU_UINT8 nRefinementSteps, + CMP_GLOBAL const CMP_BC15Options *BC15options, + CGU_BOOL _bUseAlpha, CGU_UINT8 _nAlphaThreshold) { + ALIGN_16 CGU_FLOAT Rpt[BLOCK_SIZE]; + ALIGN_16 CGU_FLOAT BlkIn[BLOCK_SIZE][NUM_CHANNELS]; +#ifndef ASPM_GPU + memset(Rpt, 0, sizeof(Rpt)); + memset(BlkIn, 0, sizeof(BlkIn)); +#else + cmp_memsetfBCn(&Rpt[0], 0, BLOCK_SIZE); + cmp_memsetfBCn(&BlkIn[0][0], 0, BLOCK_SIZE * NUM_CHANNELS); +#endif + + CGU_UINT32 dwAlphaThreshold = _nAlphaThreshold << 24; + CGU_UINT32 dwColors = 0; + CGU_UINT32 dwBlk[BLOCK_SIZE]; + for (CGU_UINT32 i = 0; i < dwBlockSize; i++) + if (!_bUseAlpha || (block_32[i] & 0xff000000) >= dwAlphaThreshold) + dwBlk[dwColors++] = block_32[i] | 0xff000000; + + // Do we have any colors ? + if (dwColors) { + CGU_BOOL bHasAlpha = (dwColors != dwBlockSize); + if (bHasAlpha && _bUseAlpha && !(dwNumPoints & 0x1)) return CMP_FLOAT_MAX; + + // CGU_UINT32 dwBlk_sorted[BLOCK_SIZE]; + // Here we are computing an unique number of colors. + // For each unique value we compute the number of it appearences. 
+#ifndef ASPM_GPU + qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp); +#else + sortData_UINT32(dwBlk, dwBlk, dwColors); +#endif + + CGU_UINT32 new_p; + CGU_UINT32 dwBlkU[BLOCK_SIZE]; + CGU_UINT32 dwUniqueColors = 0; + new_p = dwBlkU[0] = dwBlk[0]; + Rpt[dwUniqueColors] = 1.f; + for (CGU_UINT32 i = 1; i < dwColors; i++) { + if (new_p != dwBlk[i]) { + dwUniqueColors++; + new_p = dwBlkU[dwUniqueColors] = dwBlk[i]; + Rpt[dwUniqueColors] = 1.f; + } else + Rpt[dwUniqueColors] += 1.f; + } + dwUniqueColors++; + + // switch to float + for (CGU_UINT32 i = 0; i < dwUniqueColors; i++) { + BlkIn[i][RC] = (CGU_FLOAT)((dwBlkU[i] >> 16) & 0xff); // R + BlkIn[i][GC] = (CGU_FLOAT)((dwBlkU[i] >> 8) & 0xff); // G + BlkIn[i][BC] = (CGU_FLOAT)((dwBlkU[i] >> 0) & 0xff); // B + BlkIn[i][AC] = 255.f; // A + } + + CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS]; + CompressRGBBlockX(rsltC, BlkIn, Rpt, dwUniqueColors, dwNumPoints, + b3DRefinement, nRefinementSteps, BC15options, nRedBits, + nGreenBits, nBlueBits); + + // return to integer realm + for (CGU_INT32 i = 0; i < 3; i++) + for (CGU_INT32 j = 0; j < 2; j++) + nEndpoints[i][j] = (CGU_UINT8)rsltC[i][j]; + + return Clstr(block_32, dwBlockSize, nEndpoints, pcIndices, dwNumPoints, + BC15options, _bUseAlpha, _nAlphaThreshold, nRedBits, + nGreenBits, nBlueBits); + } else { + // All colors transparent + nEndpoints[0][0] = nEndpoints[1][0] = nEndpoints[2][0] = 0; + nEndpoints[0][1] = nEndpoints[1][1] = nEndpoints[2][1] = 0xff; +#ifndef ASPM_GPU + memset(pcIndices, 0xff, dwBlockSize); +#else + cmp_memsetBC1(pcIndices, 0xff, dwBlockSize); +#endif + return 0.0; + } +} +#endif // !BC5 +#endif // !BC4 + +#if !defined(BC4_ENCODE_KERNEL_H) +#if !defined(BC5_ENCODE_KERNEL_H) +static void CompressRGBBlock(const CGU_UINT8 rgbBlock[64], + CMP_GLOBAL CGU_UINT32 compressedBlock[2], + CMP_GLOBAL const CMP_BC15Options *BC15options, + CGU_BOOL bDXT1, CGU_BOOL bDXT1UseAlpha, + CGU_UINT8 nDXT1AlphaThreshold) { + CGU_BOOL m_b3DRefinement = 
FALSE; + CGU_UINT8 m_nRefinementSteps = 1; + + /* + ARGB Channel indexes + */ + if (bDXT1) { + CGU_UINT8 nEndpoints[2][3][2]; + CGU_UINT8 nIndices[2][16]; + + CGU_FLOAT fError3 = CompRGBBlock((CGU_UINT32 *)rgbBlock, BLOCK_SIZE_4X4, RG, GG, BG, nEndpoints[0], + nIndices[0], 3, m_b3DRefinement, m_nRefinementSteps, BC15options, + bDXT1UseAlpha, nDXT1AlphaThreshold); + CGU_FLOAT fError4 = (fError3 == 0.0) ? CMP_FLOAT_MAX : CompRGBBlock((CGU_UINT32 *)rgbBlock, BLOCK_SIZE_4X4, RG, GG, BG, + nEndpoints[1], nIndices[1], 4, m_b3DRefinement, + m_nRefinementSteps, BC15options, bDXT1UseAlpha, + nDXT1AlphaThreshold); + + CGU_INT32 nMethod = (fError3 <= fError4) ? 0 : 1; + CGU_INT32 c0 = ConstructColour((nEndpoints[nMethod][RC][0] >> (8 - RG)), + (nEndpoints[nMethod][GC][0] >> (8 - GG)), + (nEndpoints[nMethod][BC][0] >> (8 - BG))); + CGU_INT32 c1 = ConstructColour((nEndpoints[nMethod][RC][1] >> (8 - RG)), + (nEndpoints[nMethod][GC][1] >> (8 - GG)), + (nEndpoints[nMethod][BC][1] >> (8 - BG))); + CGU_BOOL m1 = (nMethod == 1 && c0 <= c1); + CGU_BOOL m2 = (nMethod == 0 && c0 > c1); + if (m1 || m2) + compressedBlock[0] = c1 | (c0 << 16); + else + compressedBlock[0] = c0 | (c1 << 16); + + compressedBlock[1] = 0; + for (CGU_INT32 i = 0; i < 16; i++) + compressedBlock[1] |= (nIndices[nMethod][i] << (2 * i)); + } else { + CGU_UINT8 nEndpoints[3][2]; + CGU_UINT8 nIndices[BLOCK_SIZE_4X4]; + + CompRGBBlock((CGU_UINT32 *)rgbBlock, BLOCK_SIZE_4X4, RG, GG, BG, nEndpoints, + nIndices, 4, m_b3DRefinement, m_nRefinementSteps, BC15options, + bDXT1UseAlpha, nDXT1AlphaThreshold); + + CGU_INT32 c0 = ConstructColour((nEndpoints[RC][0] >> (8 - RG)), + (nEndpoints[GC][0] >> (8 - GG)), + (nEndpoints[BC][0] >> (8 - BG))); + CGU_INT32 c1 = ConstructColour((nEndpoints[RC][1] >> (8 - RG)), + (nEndpoints[GC][1] >> (8 - GG)), + (nEndpoints[BC][1] >> (8 - BG))); + if (c0 <= c1) + compressedBlock[0] = c1 | (c0 << 16); + else + compressedBlock[0] = c0 | (c1 << 16); + + compressedBlock[1] = 0; + for (CGU_INT32 i = 
0; i < 16; i++) + compressedBlock[1] |= (nIndices[i] << (2 * i)); + } +} +#endif // BC5 + +#endif // BC4 + +#if !defined(BC1_ENCODE_KERNEL_H) +#if !defined(BC2_ENCODE_KERNEL_H) +static CGU_FLOAT RmpSrch1(CGU_FLOAT _Blk[MAX_BLOCK], CGU_FLOAT _Rpt[MAX_BLOCK], + CGU_FLOAT _maxerror, CGU_FLOAT _min_ex, + CGU_FLOAT _max_ex, CGU_INT _NmbrClrs, + CGU_UINT8 nNumPoints) { + CGU_FLOAT error = 0; + const CGU_FLOAT step = (_max_ex - _min_ex) / (CGU_FLOAT)(nNumPoints - 1); + const CGU_FLOAT step_h = step * 0.5f; + const CGU_FLOAT rstep = 1.0f / step; + + for (CGU_INT i = 0; i < _NmbrClrs; i++) { + CGU_FLOAT v; + // Work out which value in the block this select + CGU_FLOAT del; + + if ((del = _Blk[i] - _min_ex) <= 0) + v = _min_ex; + else if (_Blk[i] - _max_ex >= 0) + v = _max_ex; + else + v = (floor((del + step_h) * rstep) * step) + _min_ex; + + // And accumulate the error + CGU_FLOAT del2 = (_Blk[i] - v); + error += del2 * del2 * _Rpt[i]; + + // if we've already lost to the previous step bail out + if (_maxerror < error) { + error = _maxerror; + break; + } + } + return error; +} +#endif // !BC2 + +#if !defined(BC2_ENCODE_KERNEL_H) +static CGU_FLOAT BlockRefine1(CGU_FLOAT _Blk[MAX_BLOCK], + CGU_FLOAT _Rpt[MAX_BLOCK], CGU_FLOAT _MaxError, + CGU_FLOAT *_min_ex, CGU_FLOAT *_max_ex, + CGU_FLOAT _m_step, CGU_FLOAT _min_bnd, + CGU_FLOAT _max_bnd, CGU_INT _NmbrClrs, + CGU_UINT8 dwNumPoints) { + // Start out assuming our endpoints are the min and max values we've + // determined + + // Attempt a (simple) progressive refinement step to reduce noise in the + // output image by trying to find a better overall match for the endpoints. 
+ + CGU_FLOAT maxerror = _MaxError; + CGU_FLOAT min_ex = *_min_ex; + CGU_FLOAT max_ex = *_max_ex; + + int mode, bestmode; + do { + CGU_FLOAT cr_min0 = min_ex; + CGU_FLOAT cr_max0 = max_ex; + for (bestmode = -1, mode = 0; mode < SCH_STPS * SCH_STPS; mode++) { + // check each move (see sStep for direction) + CGU_FLOAT cr_min = min_ex + _m_step * sMvF[mode / SCH_STPS]; + CGU_FLOAT cr_max = max_ex + _m_step * sMvF[mode % SCH_STPS]; + + cr_min = maxf(cr_min, _min_bnd); + cr_max = minf(cr_max, _max_bnd); + + CGU_FLOAT error; + error = RmpSrch1(_Blk, _Rpt, maxerror, cr_min, cr_max, _NmbrClrs, + dwNumPoints); + + if (error < maxerror) { + maxerror = error; + bestmode = mode; + cr_min0 = cr_min; + cr_max0 = cr_max; + } + } + + if (bestmode != -1) { + // make move (see sStep for direction) + min_ex = cr_min0; + max_ex = cr_max0; + } + } while (bestmode != -1); + + *_min_ex = min_ex; + *_max_ex = max_ex; + + return maxerror; +} +#endif //! BC2 + +#if !defined(BC2_ENCODE_KERNEL_H) +static int QSortFCmp(const void *Elem1, const void *Elem2) { + int ret = 0; + + if (*(CGU_FLOAT *)Elem1 - *(CGU_FLOAT *)Elem2 < 0.) + ret = -1; + else if (*(CGU_FLOAT *)Elem1 - *(CGU_FLOAT *)Elem2 > 0.) + ret = 1; + return ret; +} +#endif // !BC2 + +#if !defined(BC2_ENCODE_KERNEL_H) +static CGU_FLOAT CompBlock1(CGU_FLOAT _RmpPnts[NUM_ENDPOINTS], + CGU_FLOAT _Blk[MAX_BLOCK], CGU_INT _Nmbr, + CGU_UINT8 dwNumPoints, CGU_BOOL bFixedRampPoints, + CGU_INT _IntPrc, CGU_INT _FracPrc, + CGU_BOOL _bFixedRamp) { + CGU_FLOAT fMaxError = 0.f; + + CGU_FLOAT Ramp[NUM_ENDPOINTS]; + + CGU_FLOAT IntFctr = (CGU_FLOAT)(1 << _IntPrc); + // CGU_FLOAT FracFctr = (CGU_FLOAT)(1 << _FracPrc); + + ALIGN_16 CGU_FLOAT afUniqueValues[MAX_BLOCK]; + ALIGN_16 CGU_FLOAT afValueRepeats[MAX_BLOCK]; + for (int i = 0; i < MAX_BLOCK; i++) + afUniqueValues[i] = afValueRepeats[i] = 0.f; + + // For each unique value we compute the number of it appearances. 
+ CGU_FLOAT fBlk[MAX_BLOCK]; +#ifdef ASPM_GPU + for (CGU_INT i = 0; i < _Nmbr; i++) { + fBlk[i] = _Blk[i]; + } +#else + memcpy(fBlk, _Blk, _Nmbr * sizeof(CGU_FLOAT)); +#endif + + // sort the input +#ifndef ASPM_GPU + qsort((void *)fBlk, (size_t)_Nmbr, sizeof(CGU_FLOAT), QSortFCmp); +#else + sortData_FLOAT(fBlk, fBlk, _Nmbr); +#endif + + CGU_FLOAT new_p = -2.; + + int N0s = 0, N1s = 0; + CGU_UINT32 dwUniqueValues = 0; + afUniqueValues[0] = 0.f; + + bool requiresCalculation = true; + + if (bFixedRampPoints) { + for (CGU_INT i = 0; i < _Nmbr; i++) { + if (new_p != fBlk[i]) { + new_p = fBlk[i]; + if (new_p <= 1.5 / 255.) + N0s++; + else if (new_p >= 253.5 / 255.) + N1s++; + else { + afUniqueValues[dwUniqueValues] = fBlk[i]; + afValueRepeats[dwUniqueValues] = 1.f; + dwUniqueValues++; + } + } else { + if (dwUniqueValues > 0) { + if (afUniqueValues[dwUniqueValues - 1] == new_p) + afValueRepeats[dwUniqueValues - 1] += 1.f; + } + } + } + + // if number of unique colors is less or eq 2 we've done either, but we know + // that we may have 0s and/or 1s as well. To avoid for the ramp to be + // considered flat we invented couple entries on the way. 
+ if (dwUniqueValues <= 2) { + if (dwUniqueValues == 2) // if 2, take them + { + Ramp[0] = floor(afUniqueValues[0] * (IntFctr - 1) + 0.5f); + Ramp[1] = floor(afUniqueValues[1] * (IntFctr - 1) + 0.5f); + } else if (dwUniqueValues == 1) // if 1, add another one + { + Ramp[0] = floor(afUniqueValues[0] * (IntFctr - 1) + 0.5f); + Ramp[1] = Ramp[0] + 1.f; + } else // if 0, invent them + { + Ramp[0] = 128.f; + Ramp[1] = Ramp[0] + 1.f; + } + + fMaxError = 0.f; + requiresCalculation = false; + } + } else { + for (CGU_INT i = 0; i < _Nmbr; i++) { + if (new_p != fBlk[i]) { + afUniqueValues[dwUniqueValues] = new_p = fBlk[i]; + afValueRepeats[dwUniqueValues] = 1.f; + dwUniqueValues++; + } else + afValueRepeats[dwUniqueValues - 1] += 1.f; + } + + // if number of unique colors is less or eq 2, we've done + if (dwUniqueValues <= 2) { + Ramp[0] = floor(afUniqueValues[0] * (IntFctr - 1) + 0.5f); + if (dwUniqueValues == 1) + Ramp[1] = Ramp[0] + 1.f; + else + Ramp[1] = floor(afUniqueValues[1] * (IntFctr - 1) + 0.5f); + fMaxError = 0.f; + requiresCalculation = false; + } + } + + if (requiresCalculation) { + CGU_FLOAT min_ex = afUniqueValues[0]; + CGU_FLOAT max_ex = afUniqueValues[dwUniqueValues - 1]; + CGU_FLOAT min_bnd = 0, max_bnd = 1.; + CGU_FLOAT min_r = min_ex, max_r = max_ex; + CGU_FLOAT gbl_l = 0, gbl_r = 0; + CGU_FLOAT cntr = (min_r + max_r) / 2; + + CGU_FLOAT gbl_err = MAX_ERROR; + // Trying to avoid unnecessary calculations. Heuristics: after some analisis + // it appears that in integer case, if the input interval not more then 48 + // we won't get much better + + bool wantsSearch = !(_INT_GRID && max_ex - min_ex <= 48.f / IntFctr); + + if (wantsSearch) { + // Search. + // 1. take the vicinities of both low and high bound of the input + // interval. + // 2. setup some search step + // 3. find the new low and high bound which provides an (sub) optimal + // (infinite precision) clusterization. + CGU_FLOAT gbl_llb = + (min_bnd > min_r - GBL_SCH_EXT) ? 
min_bnd : min_r - GBL_SCH_EXT; + CGU_FLOAT gbl_rrb = + (max_bnd < max_r + GBL_SCH_EXT) ? max_bnd : max_r + GBL_SCH_EXT; + CGU_FLOAT gbl_lrb = + (cntr < min_r + GBL_SCH_EXT) ? cntr : min_r + GBL_SCH_EXT; + CGU_FLOAT gbl_rlb = + (cntr > max_r - GBL_SCH_EXT) ? cntr : max_r - GBL_SCH_EXT; + for (CGU_FLOAT step_l = gbl_llb; step_l < gbl_lrb; + step_l += GBL_SCH_STEP) { + for (CGU_FLOAT step_r = gbl_rrb; gbl_rlb <= step_r; + step_r -= GBL_SCH_STEP) { + CGU_FLOAT sch_err; + // an sse version is avaiable + sch_err = RmpSrch1(afUniqueValues, afValueRepeats, gbl_err, step_l, + step_r, dwUniqueValues, dwNumPoints); + if (sch_err < gbl_err) { + gbl_err = sch_err; + gbl_l = step_l; + gbl_r = step_r; + } + } + } + + min_r = gbl_l; + max_r = gbl_r; + } + + // This is a refinement call. The function tries to make several small + // stretches or squashes to minimize quantization error. + CGU_FLOAT m_step = LCL_SCH_STEP / IntFctr; + fMaxError = + BlockRefine1(afUniqueValues, afValueRepeats, gbl_err, &min_r, &max_r, + m_step, min_bnd, max_bnd, dwUniqueValues, dwNumPoints); + + min_ex = min_r; + max_ex = max_r; + + max_ex *= (IntFctr - 1); + min_ex *= (IntFctr - 1); + /* + this one is tricky. for the float or high fractional precision ramp it tries + to avoid for the ramp to be collapsed into one integer number after + rounding. Notice the condition. There is a difference between max_ex and + min_ex but after rounding they may collapse into the same integer. + + So we try to run the same refinement procedure but with starting position on + the integer grid and step equal 1. + */ + if (!_INT_GRID && max_ex - min_ex > 0. 
&& + floor(min_ex + 0.5f) == floor(max_ex + 0.5f)) { + m_step = 1.; + gbl_err = MAX_ERROR; + for (CGU_UINT32 i = 0; i < dwUniqueValues; i++) + afUniqueValues[i] *= (IntFctr - 1); + + max_ex = min_ex = floor(min_ex + 0.5f); + + gbl_err = BlockRefine1(afUniqueValues, afValueRepeats, gbl_err, &min_ex, + &max_ex, m_step, 0.f, 255.f, dwUniqueValues, + dwNumPoints); + + fMaxError = gbl_err; + } + Ramp[1] = floor(max_ex + 0.5f); + Ramp[0] = floor(min_ex + 0.5f); + } + + // Ensure that the two endpoints are not the same + // This is legal but serves no need & can break some optimizations in the + // compressor + if (Ramp[0] == Ramp[1]) { + if (Ramp[1] < 255.f) + Ramp[1]++; + else + Ramp[1]--; + } + _RmpPnts[0] = Ramp[0]; + _RmpPnts[1] = Ramp[1]; + + return fMaxError; +} +#endif // !BC2 + +#if !defined(BC2_ENCODE_KERNEL_H) +static void BldRmp1(CGU_FLOAT _Rmp[MAX_POINTS], + CGU_FLOAT _InpRmp[NUM_ENDPOINTS], int nNumPoints) { + // for 3 point ramp; not to select the 4th point in min + for (int e = nNumPoints; e < MAX_POINTS; e++) _Rmp[e] = 100000.f; + + _Rmp[0] = _InpRmp[0]; + _Rmp[1] = _InpRmp[1]; + for (int e = 1; e < nNumPoints - 1; e++) + _Rmp[e + 1] = (_Rmp[0] * (nNumPoints - 1 - e) + _Rmp[1] * e) / + (CGU_FLOAT)(nNumPoints - 1); +} +#endif //! 
BC2 + +#if !defined(BC2_ENCODE_KERNEL_H) +static void GetRmp1(CGU_FLOAT _rampDat[MAX_POINTS], + CGU_FLOAT _ramp[NUM_ENDPOINTS], int nNumPoints, + CGU_BOOL bFixedRampPoints, CGU_INT _intPrec, + CGU_INT _fracPrec, CGU_BOOL _bFixedRamp) { + if (_ramp[0] == _ramp[1]) return; + + CGU_BOOL r0 = _ramp[0] <= _ramp[1]; + CGU_BOOL r1 = _ramp[0] > _ramp[1]; + if ((!bFixedRampPoints && r0) || (bFixedRampPoints && r1)) { + CGU_FLOAT t = _ramp[0]; + _ramp[0] = _ramp[1]; + _ramp[1] = t; + } + + _rampDat[0] = _ramp[0]; + _rampDat[1] = _ramp[1]; + + CGU_FLOAT IntFctr = (CGU_FLOAT)(1 << _intPrec); + CGU_FLOAT FracFctr = (CGU_FLOAT)(1 << _fracPrec); + + CGU_FLOAT ramp[NUM_ENDPOINTS]; + ramp[0] = _ramp[0] * FracFctr; + ramp[1] = _ramp[1] * FracFctr; + + BldRmp1(_rampDat, ramp, nNumPoints); + if (bFixedRampPoints) { + _rampDat[nNumPoints] = 0.; + _rampDat[nNumPoints + 1] = FracFctr * IntFctr - 1.f; + } + + if (_bFixedRamp) { + for (CGU_INT i = 0; i < nNumPoints; i++) { + _rampDat[i] = floor(_rampDat[i] + 0.5f); + _rampDat[i] /= FracFctr; + } + } +} +#endif + +#if !defined(BC2_ENCODE_KERNEL_H) +static CGU_FLOAT Clstr1(CGU_UINT8 *pcIndices, CGU_FLOAT _blockIn[MAX_BLOCK], + CGU_FLOAT _ramp[NUM_ENDPOINTS], CGU_INT _NmbrClrs, + CGU_INT nNumPoints, CGU_BOOL bFixedRampPoints, + CGU_INT _intPrec, CGU_INT _fracPrec, + CGU_BOOL _bFixedRamp) { + CGU_FLOAT Err = 0.f; + CGU_FLOAT alpha[MAX_POINTS]; + + for (CGU_INT i = 0; i < _NmbrClrs; i++) pcIndices[i] = 0; + + if (_ramp[0] == _ramp[1]) return Err; + + if (!_bFixedRamp) { + _intPrec = 8; + _fracPrec = 0; + } + + GetRmp1(alpha, _ramp, nNumPoints, bFixedRampPoints, _intPrec, _fracPrec, + _bFixedRamp); + + if (bFixedRampPoints) nNumPoints += 2; + + const CGU_FLOAT OverIntFctr = 1.f / ((CGU_FLOAT)(1 << _intPrec) - 1.f); + for (int i = 0; i < nNumPoints; i++) alpha[i] *= OverIntFctr; + + // For each colour in the original block, calculate its weighted + // distance from each point in the original and assign it + // to the closest cluster + for (int i 
= 0; i < _NmbrClrs; i++) { + CGU_FLOAT shortest = 10000000.f; + + // Get the original alpha + CGU_FLOAT acur = _blockIn[i]; + + for (CGU_UINT8 j = 0; j < nNumPoints; j++) { + CGU_FLOAT adist = (acur - alpha[j]); + adist *= adist; + + if (adist < shortest) { + shortest = adist; + pcIndices[i] = j; + } + } + + Err += shortest; + } + + return Err; +} +#endif // !BC2 + +#if !defined(BC2_ENCODE_KERNEL_H) +static CGU_FLOAT CompBlock1XF(CGU_FLOAT *_Blk, CGU_UINT32 dwBlockSize, + CGU_UINT8 nEndpoints[2], CGU_UINT8 *pcIndices, + CGU_UINT8 dwNumPoints, CGU_BOOL bFixedRampPoints, + CGU_INT _intPrec, CGU_INT _fracPrec, + CGU_BOOL _bFixedRamp) { + // just to make them initialized + if (!_bFixedRamp) { + _intPrec = 8; + _fracPrec = 0; + } + + // this one makes the bulk of the work + CGU_FLOAT Ramp[NUM_ENDPOINTS]; + CompBlock1(Ramp, _Blk, dwBlockSize, dwNumPoints, bFixedRampPoints, _intPrec, + _fracPrec, _bFixedRamp); + + // final clusterization applied + CGU_FLOAT fError = Clstr1(pcIndices, _Blk, Ramp, dwBlockSize, dwNumPoints, + bFixedRampPoints, _intPrec, _fracPrec, _bFixedRamp); + nEndpoints[0] = (CGU_UINT8)Ramp[0]; + nEndpoints[1] = (CGU_UINT8)Ramp[1]; + + return fError; +} +#endif //! BC2 +#endif //! BC1 + +#if !defined(BC1_ENCODE_KERNEL_H) +#if !defined(BC2_ENCODE_KERNEL_H) +static CGU_FLOAT CompBlock1X(const CGU_UINT8 *_Blk, CGU_UINT32 dwBlockSize, + CGU_UINT8 nEndpoints[2], CGU_UINT8 *pcIndices, + CGU_UINT8 dwNumPoints, CGU_BOOL bFixedRampPoints, + CGU_INT _intPrec, CGU_INT _fracPrec, + CGU_BOOL _bFixedRamp) { + // convert the input and call the float equivalent. 
+ CGU_FLOAT fBlk[MAX_BLOCK]; + for (CGU_UINT32 i = 0; i < dwBlockSize; i++) + fBlk[i] = (CGU_FLOAT)_Blk[i] / 255.f; + + return CompBlock1XF(fBlk, dwBlockSize, nEndpoints, pcIndices, dwNumPoints, + bFixedRampPoints, _intPrec, _fracPrec, _bFixedRamp); +} +#endif + +#if !defined(BC2_ENCODE_KERNEL_H) +static void EncodeAlphaBlock(CMP_GLOBAL CGU_UINT32 compressedBlock[2], + CGU_UINT8 nEndpoints[2], + CGU_UINT8 nIndices[BLOCK_SIZE_4X4]) { + compressedBlock[0] = + ((CGU_UINT32)nEndpoints[0]) | (((CGU_UINT32)nEndpoints[1]) << 8); + compressedBlock[1] = 0; + + for (CGU_INT i = 0; i < BLOCK_SIZE_4X4; i++) { + if (i < 5) + compressedBlock[0] |= (nIndices[i] & 0x7) << (16 + (i * 3)); + else if (i > 5) + compressedBlock[1] |= (nIndices[i] & 0x7) << (2 + (i - 6) * 3); + else { + compressedBlock[0] |= (nIndices[i] & 0x1) << 31; + compressedBlock[1] |= (nIndices[i] & 0x6) >> 1; + } + } +} +#endif + +#endif + +#if !defined(BC1_ENCODE_KERNEL_H) +#if !defined(BC2_ENCODE_KERNEL_H) +static CGU_INT32 CompressAlphaBlock(const CGU_UINT8 alphaBlock[BLOCK_SIZE_4X4], + CMP_GLOBAL CGU_UINT32 compressedBlock[2]) { + CGU_UINT8 nEndpoints[2][2]; + CGU_UINT8 nIndices[2][BLOCK_SIZE_4X4]; + CGU_FLOAT fError8 = CompBlock1X(alphaBlock, BLOCK_SIZE_4X4, nEndpoints[0], + nIndices[0], 8, false, 8, 0, true); + CGU_FLOAT fError6 = + (fError8 == 0.f) ? CMP_FLOAT_MAX + : CompBlock1X(alphaBlock, BLOCK_SIZE_4X4, nEndpoints[1], + nIndices[1], 6, true, 8, 0, true); + if (fError8 <= fError6) + EncodeAlphaBlock(compressedBlock, nEndpoints[0], nIndices[0]); + else + EncodeAlphaBlock(compressedBlock, nEndpoints[1], nIndices[1]); + return CGU_CORE_OK; +} +#endif + +#if !defined(BC2_ENCODE_KERNEL_H) +static void GetCompressedAlphaRamp(CGU_UINT8 alpha[8], + const CGU_UINT32 compressedBlock[2]) { + alpha[0] = (CGU_UINT8)(compressedBlock[0] & 0xff); + alpha[1] = (CGU_UINT8)((compressedBlock[0] >> 8) & 0xff); + + if (alpha[0] > alpha[1]) { + // 8-alpha block: derive the other six alphas. 
+ // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated. +#ifdef ASPM_GPU + alpha[2] = + (CGU_UINT8)((6 * alpha[0] + 1 * alpha[1] + 3) / 7); // bit code 010 + alpha[3] = + (CGU_UINT8)((5 * alpha[0] + 2 * alpha[1] + 3) / 7); // bit code 011 + alpha[4] = + (CGU_UINT8)((4 * alpha[0] + 3 * alpha[1] + 3) / 7); // bit code 100 + alpha[5] = + (CGU_UINT8)((3 * alpha[0] + 4 * alpha[1] + 3) / 7); // bit code 101 + alpha[6] = + (CGU_UINT8)((2 * alpha[0] + 5 * alpha[1] + 3) / 7); // bit code 110 + alpha[7] = + (CGU_UINT8)((1 * alpha[0] + 6 * alpha[1] + 3) / 7); // bit code 111 +#else + alpha[2] = static_cast((6 * alpha[0] + 1 * alpha[1] + 3) / + 7); // bit code 010 + alpha[3] = static_cast((5 * alpha[0] + 2 * alpha[1] + 3) / + 7); // bit code 011 + alpha[4] = static_cast((4 * alpha[0] + 3 * alpha[1] + 3) / + 7); // bit code 100 + alpha[5] = static_cast((3 * alpha[0] + 4 * alpha[1] + 3) / + 7); // bit code 101 + alpha[6] = static_cast((2 * alpha[0] + 5 * alpha[1] + 3) / + 7); // bit code 110 + alpha[7] = static_cast((1 * alpha[0] + 6 * alpha[1] + 3) / + 7); // bit code 111 +#endif + } else { + // 6-alpha block. + // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated. 
+#ifdef ASPM_GPU + alpha[2] = + (CGU_UINT8)((4 * alpha[0] + 1 * alpha[1] + 2) / 5); // Bit code 010 + alpha[3] = + (CGU_UINT8)((3 * alpha[0] + 2 * alpha[1] + 2) / 5); // Bit code 011 + alpha[4] = + (CGU_UINT8)((2 * alpha[0] + 3 * alpha[1] + 2) / 5); // Bit code 100 + alpha[5] = + (CGU_UINT8)((1 * alpha[0] + 4 * alpha[1] + 2) / 5); // Bit code 101 +#else + alpha[2] = static_cast((4 * alpha[0] + 1 * alpha[1] + 2) / + 5); // Bit code 010 + alpha[3] = static_cast((3 * alpha[0] + 2 * alpha[1] + 2) / + 5); // Bit code 011 + alpha[4] = static_cast((2 * alpha[0] + 3 * alpha[1] + 2) / + 5); // Bit code 100 + alpha[5] = static_cast((1 * alpha[0] + 4 * alpha[1] + 2) / + 5); // Bit code 101 +#endif + alpha[6] = 0; // Bit code 110 + alpha[7] = 255; // Bit code 111 + } +} +#endif // !BC2 + +#if !defined(BC2_ENCODE_KERNEL_H) +static void DecompressAlphaBlock(CGU_UINT8 alphaBlock[BLOCK_SIZE_4X4], + const CGU_UINT32 compressedBlock[2]) { + CGU_UINT8 alpha[8]; + GetCompressedAlphaRamp(alpha, compressedBlock); + + for (int i = 0; i < BLOCK_SIZE_4X4; i++) { + CGU_UINT32 index; + if (i < 5) + index = (compressedBlock[0] & (0x7 << (16 + (i * 3)))) >> (16 + (i * 3)); + else if (i > 5) + index = (compressedBlock[1] & (0x7 << (2 + (i - 6) * 3))) >> + (2 + (i - 6) * 3); + else { + index = (compressedBlock[0] & 0x80000000) >> 31; + index |= (compressedBlock[1] & 0x3) << 1; + } + + alphaBlock[i] = alpha[index]; + } +} +#endif // !BC2 +#endif // !BC1 + +#endif diff --git a/extern/CMP_Core/shaders/Common_Def.h b/extern/CMP_Core/shaders/Common_Def.h new file mode 100644 index 0000000..ed9e94a --- /dev/null +++ b/extern/CMP_Core/shaders/Common_Def.h @@ -0,0 +1,300 @@ +#ifndef _COMMON_DEFINITIONS_H +#define _COMMON_DEFINITIONS_H + +//=============================================================================== +// Copyright (c) 2007-2019 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2004-2006 ATI Technologies Inc. 
+//=============================================================================== +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +// +// File Name: Common_Def.h +// Description: common definitions used for CPU/HPC/GPU +// +////////////////////////////////////////////////////////////////////////////// + + +// Features +#ifdef _WIN32 +//#define USE_ASPM_CODE +#endif + +// Proxy ISPC compiler (Warning! Not all ASPM features will be available : expect build errors for specialized ASPM code! +#ifdef ISPC +#define ASPM +#endif + +// Using OpenCL Compiler +#ifdef __OPENCL_VERSION__ +#define ASPM_GPU +#endif + + +#ifdef _LINUX +#undef ASPM_GPU +#include +#include +#include +#include "cmp_math_vec4.h" +#endif + +#ifndef CMP_MAX +#define CMP_MAX(x, y) (((x) > (y)) ? (x) : (y)) +#endif + +#ifndef CMP_MIN +#define CMP_MIN(x, y) (((x) < (y)) ? 
(x) : (y)) +#endif + +#define CMP_SET_BC13_DECODER_RGBA // Sets mapping BC1, BC2 & BC3 to decode Red,Green,Blue and Alpha + // RGBA to channels [0,1,2,3] else BGRA maps to [0,1,2,3] + // BC4 alpha always maps as AAAA to channels [0,1,2,3] + // BC5 decoded (Red&Green) maps R,G,B=0,A=255 to [0,1,2,3] else maps [B=0,G,R,A=255] to [0,1,2,3] + +//#define USE_BLOCK_LINEAR + +#define CMP_FLOAT_MAX 3.402823466e+38F // max value used to detect an Error in processing +#define CMP_FLOAT_MAX_EXP 38 +#define USE_PROCESS_SEPERATE_ALPHA // Enable this to use higher quality code using CompressDualIndexBlock +#define COMPRESSED_BLOCK_SIZE 16 // Size of a compressed block in bytes +#define MAX_DIMENSION_BIG 4 // Max number of channels (RGBA) +#define MAX_SUBSETS 3 // Maximum number of possible subsets +#define MAX_SUBSET_SIZE 16 // Largest possible size for an individual subset +#define BLOCK_SIZE_4X4X4 64 +#define BLOCK_SIZE_4X4 16 +#define BlockX 4 +#define BlockY 4 +//#define USE_BLOCK_LINEAR // Source Data is organized in linear form for each block : Experimental Code not fully developed +//#define USE_DOUBLE // Default is to use float, enable to use double data types only for float definitions + +typedef enum { + CGU_CORE_OK = 0, // No errors, call was successfull + CGU_CORE_ERR_UNKOWN, // An unknown error occurred + CGU_CORE_ERR_NEWMEM, // New Memory Allocation Failed + CGU_CORE_ERR_INVALIDPTR, // The pointer value used is invalid or null + CGU_CORE_ERR_RANGERED, // values for Red Channel is out of range (too high or too low) + CGU_CORE_ERR_RANGEGREEN, // values for Green Channel is out of range (too high or too low) + CGU_CORE_ERR_RANGEBLUE, // values for Blue Channel is out of range (too high or too low) +} CGU_ERROR_CODES; + + +//--------------------------------------------- +// Predefinitions for GPU and CPU compiled code +//--------------------------------------------- + +#ifdef ASPM_GPU // GPU Based code + // ==== Vectors ==== + typedef float2 CGU_Vec2f; + typedef float2 
CGV_Vec2f; + typedef float3 CMP_Vec3f; + typedef float3 CGU_Vec3f; + typedef float3 CGV_Vec3f; + typedef uchar3 CGU_Vec3uc; + typedef uchar3 CGV_Vec3uc; + typedef uchar4 CMP_Vec4uc; + typedef uchar4 CGU_Vec4uc; + typedef uchar4 CGV_Vec4uc; + + #define USE_BC7_SP_ERR_IDX + #define ASPM_PRINT(args) printf args + #define BC7_ENCODECLASS + + #define CMP_EXPORT + #define INLINE + #define uniform + #define varying + #define CMP_GLOBAL __global + #define CMP_KERNEL __kernel + #define CMP_CONSTANT __constant + #define CMP_STATIC + + + typedef unsigned int CGU_DWORD; //32bits + typedef int CGU_INT; //32bits + typedef int CGU_BOOL; + typedef unsigned short CGU_SHORT; //16bits + typedef float CGU_FLOAT; + typedef unsigned int uint32; // need to remove this def + + typedef int CGV_INT; + typedef unsigned int CGU_UINT; + typedef int CGUV_INT; + typedef int CGV_BOOL; + + typedef char CGU_INT8; + typedef unsigned char CGU_UINT8; + typedef short CGU_INT16; + typedef unsigned short CGU_UINT16; + typedef int CGU_INT32; + typedef unsigned int CGU_UINT32; + typedef unsigned long CGU_UINT64; + + typedef char CGV_INT8; + typedef unsigned char CGV_UINT8; + typedef short CGV_INT16; + typedef unsigned short CGV_UINT16; + typedef int CGV_INT32; + typedef unsigned int CGV_UINT32; + typedef unsigned long CGV_UINT64; + + typedef float CGV_FLOAT; + + #define TRUE 1 + #define FALSE 0 + #define CMP_CDECL + +#else + // CPU & ASPM definitions + + #ifdef ASPM // SPMD ,SIMD CPU code + // using hybrid (CPU/GPU) aspm compiler + #define ASPM_PRINT(args) print args + #define CMP_USE_FOREACH_ASPM + #define __ASPM__ + #define BC7_ENCODECLASS + + #define USE_BC7_SP_ERR_IDX + //#define USE_BC7_RAMP + + #define CMP_EXPORT export + #define TRUE true + #define FALSE false + typedef uniform bool CGU_BOOL; + typedef bool CGV_BOOL; + + typedef unsigned int8 uint8; + typedef unsigned int16 uint16; + typedef unsigned int32 uint32; + typedef unsigned int64 uint64; + typedef uniform float CGU_FLOAT; + typedef varying 
float CGV_FLOAT; + typedef uniform uint8 CGU_UINT8; + typedef varying uint8 CGV_UINT8; + + + typedef CGV_UINT8<4> CGV_Vec4uc; + typedef CGU_UINT8<4> CGU_Vec4uc; + + typedef CGU_FLOAT<3> CGU_Vec3f; + typedef CGV_FLOAT<3> CGV_Vec3f; + + typedef CGU_FLOAT<2> CGU_Vec2f; + typedef CGV_FLOAT<2> CGV_Vec2f; + + #define CMP_CDECL + + #else // standard CPU code + #include + #include + #include "cmp_math_vec4.h" + + // using CPU compiler + #define ASPM_PRINT(args) printf args + #define USE_BC7_RAMP + #define USE_BC7_SP_ERR_IDX + + #define CMP_EXPORT + #define BC7_ENCODECLASS BC7_EncodeClass:: + #define TRUE 1 + #define FALSE 0 + #define uniform + #define varying + + typedef char int8; + typedef short int16; + typedef int int32; + typedef long int64; + typedef unsigned char uint8; + typedef unsigned short uint16; + typedef unsigned int uint32; + typedef unsigned long uint64; + + typedef int8 CGV_BOOL; + typedef int8 CGU_BOOL; + typedef int16 CGU_WORD; + typedef uint8 CGU_SHORT; + typedef int64 CGU_LONG; + typedef uint64 CGU_ULONG; + + typedef uniform float CGU_FLOAT; + typedef varying float CGV_FLOAT; + typedef uniform uint8 CGU_UINT8; + typedef varying uint8 CGV_UINT8; + #if defined(WIN32) || defined(_WIN64) + #define CMP_CDECL __cdecl + #else + #define CMP_CDECL + #endif + #endif + + // Common CPU & ASPM definitions + #define CMP_ASSERT(arg) + + #define CMP_GLOBAL + + #define CMP_KERNEL + #define __local const + #define __constant const + #define CMP_CONSTANT const + #define INLINE inline + #define CMP_STATIC static + + + typedef uniform int32 CGU_DWORD; + typedef uniform uint8 CGU_UBYTE; + typedef uniform int CGU_INT; + typedef uniform int8 CGU_INT8; + + typedef uniform int16 CGU_INT16; + typedef uniform uint16 CGU_UINT16; + typedef uniform int32 CGU_INT32; + typedef uniform uint32 CGU_UINT32; + typedef uniform uint64 CGU_UINT64; + + typedef int CGV_INT; + typedef int8 CGV_INT8; + typedef int16 CGV_INT16; + typedef int32 CGV_INT32; + typedef uint16 CGV_UINT16; + typedef 
uint32 CGV_UINT32; + typedef uint64 CGV_UINT64; +#endif // ASPM_GPU + + +typedef struct +{ + CGU_UINT32 m_src_width; + CGU_UINT32 m_src_height; + CGU_UINT32 m_width_in_blocks; + CGU_UINT32 m_height_in_blocks; + CGU_FLOAT m_fquality; +} Source_Info; + +// Ref Compute_CPU_HPC +struct texture_surface +{ + CGU_UINT8* ptr; + CGU_INT width, + height, + stride; + CGU_INT channels; +}; + +#endif diff --git a/extern/CMP_Core/shaders/CopyFiles.bat b/extern/CMP_Core/shaders/CopyFiles.bat new file mode 100644 index 0000000..fc125e9 --- /dev/null +++ b/extern/CMP_Core/shaders/CopyFiles.bat @@ -0,0 +1,50 @@ +REM ==================================== +REM Hybrid Codecs: Full support in v4.0 +REM ==================================== + +REM gets the output dir +set BUILD_OUTDIR=%1 + +REM get the batch files dir +SET mypath=%~dp0 +echo %mypath:~0,-1% + +IF NOT EXIST "%outpath%"\Plugins mkdir %BUILD_OUTDIR%Plugins +IF NOT EXIST "%outpath%"\Plugins\Compute mkdir %BUILD_OUTDIR%Plugins\Compute + +REM Build Vulkan Shader Binary +REM "%VULKAN_SDK%"\bin\glslangvalidator -V %mypath:~0,-1%\BC1.comp -o %BUILD_OUTDIR%\Plugins\Compute\BC1.spv +REM IF %ERRORLEVEL% GTR 0 exit 123 + +REM Enabled in v4.0 +REM +REM del %BUILD_OUTDIR%Plugins\Compute\BC1_Encode_Kernel.cpp.cmp +REM del %BUILD_OUTDIR%Plugins\Compute\BC2_Encode_Kernel.cpp.cmp +REM del %BUILD_OUTDIR%Plugins\Compute\BC3_Encode_Kernel.cpp.cmp +REM del %BUILD_OUTDIR%Plugins\Compute\BC4_Encode_Kernel.cpp.cmp +REM del %BUILD_OUTDIR%Plugins\Compute\BC5_Encode_Kernel.cpp.cmp +REM del %BUILD_OUTDIR%Plugins\Compute\BC6_Encode_Kernel.cpp.cmp +REM del %BUILD_OUTDIR%Plugins\Compute\BC7_Encode_Kernel.cpp.cmp + +XCopy /r /d /y "%mypath:~0,-1%\Common_Def.h" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BCn_Common_Kernel.h" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BC1_Encode_Kernel.h" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BC1_Encode_Kernel.cpp" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y 
"%mypath:~0,-1%\BC2_Encode_Kernel.h" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BC2_Encode_Kernel.cpp" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BC3_Encode_Kernel.h" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BC3_Encode_Kernel.cpp" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BC4_Encode_Kernel.h" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BC4_Encode_Kernel.cpp" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BC5_Encode_Kernel.h" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BC5_Encode_Kernel.cpp" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BC6_Encode_Kernel.h" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BC6_Encode_Kernel.cpp" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BC7_Encode_Kernel.h" %BUILD_OUTDIR%Plugins\Compute\ +XCopy /r /d /y "%mypath:~0,-1%\BC7_Encode_Kernel.cpp" %BUILD_OUTDIR%Plugins\Compute\ + +echo "Dependencies copied done" + + + + diff --git a/extern/CMP_Core/source/CMP_Core.h b/extern/CMP_Core/source/CMP_Core.h new file mode 100644 index 0000000..d54dc27 --- /dev/null +++ b/extern/CMP_Core/source/CMP_Core.h @@ -0,0 +1,153 @@ +//===================================================================== +// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +/// \file CMP_Core.h +// +//===================================================================== + +#ifndef CMP_CORE_H +#define CMP_CORE_H + +#include +#ifdef _WIN32 +#define CMP_CDECL __cdecl +#else +#define CMP_CDECL +#endif + +//==================================================================================== +// API Definitions for Core API +//------------------------------------------------------------------------------------ +// All API return 0 on success else error codes > 0 +// See Common_Def.h CGU_CORE_ values for the error codes +//===================================================================================== + +//====================================================================================================== +// Block level setting option: Create and Destroy Reference Pointers +//====================================================================================================== +// Context create and destroy to use for BCn codec settings, where n is the set [1,2,3,4,5,6,7] +// All codecs will use default max quality settings, users can create multiple contexts to +// set quality levels, masks , channel mapping, etc... 
+ +int CMP_CDECL CreateOptionsBC1(void **optionsBC1); +int CMP_CDECL CreateOptionsBC2(void **optionsBC2); +int CMP_CDECL CreateOptionsBC3(void **optionsBC3); +int CMP_CDECL CreateOptionsBC4(void **optionsBC4); +int CMP_CDECL CreateOptionsBC5(void **optionsBC5); +int CMP_CDECL CreateOptionsBC6(void **optionsBC6); +int CMP_CDECL CreateOptionsBC7(void **optionsBC7); + +int CMP_CDECL DestroyOptionsBC1(void *optionsBC1); +int CMP_CDECL DestroyOptionsBC2(void *optionsBC2); +int CMP_CDECL DestroyOptionsBC3(void *optionsBC3); +int CMP_CDECL DestroyOptionsBC4(void *optionsBC4); +int CMP_CDECL DestroyOptionsBC5(void *optionsBC5); +int CMP_CDECL DestroyOptionsBC6(void *optionsBC6); +int CMP_CDECL DestroyOptionsBC7(void *optionsBC7); + + +//====================================================================================================== +// Block level settings using the options Reference Pointers +//====================================================================================================== + +// Setting channel Weights : Applies to BC1, BC2 and BC3 valid ranges are [0..1.0f] Default is {1.0f, 1.0f , 1.0f} +// Use channel weightings. With swizzled formats the weighting applies to the data within the specified channel not the channel itself. +int CMP_CDECL SetChannelWeightsBC1(void *options, float WeightRed, float WeightGreen, float WeightBlue); +int CMP_CDECL SetChannelWeightsBC2(void *options, float WeightRed, float WeightGreen, float WeightBlue); +int CMP_CDECL SetChannelWeightsBC3(void *options, float WeightRed, float WeightGreen, float WeightBlue); + + +// True sets mapping CMP_Core BC1, BC2 & BC3 to decode Red,Green,Blue and Alpha as +// RGBA to channels [0,1,2,3] else BGRA maps to [0,1,2,3] +// Default is set to true. 
+int CMP_CDECL SetDecodeChannelMapping(void *options, bool mapRGBA); + +int CMP_CDECL SetQualityBC1(void *options, float fquality); +int CMP_CDECL SetQualityBC2(void *options, float fquality); +int CMP_CDECL SetQualityBC3(void *options, float fquality); +int CMP_CDECL SetQualityBC4(void *options, float fquality); +int CMP_CDECL SetQualityBC5(void *options, float fquality); +int CMP_CDECL SetQualityBC6(void *options, float fquality); +int CMP_CDECL SetQualityBC7(void *options, float fquality); + + +int CMP_CDECL SetAlphaThresholdBC1(void *options, unsigned char alphaThreshold); + +int CMP_CDECL SetMaskBC6(void *options, unsigned int mask); +int CMP_CDECL SetMaskBC7(void *options, unsigned char mask); + +int CMP_CDECL SetAlphaOptionsBC7(void *options, bool imageNeedsAlpha, bool colourRestrict, bool alphaRestrict); +int CMP_CDECL SetErrorThresholdBC7(void *options, float minThreshold, float maxThreshold); + +//====================================================================================================== +// (4x4) Block level 4 channel source CompressBlock and DecompressBlock API for BCn Codecs +//====================================================================================================== +// The options parameter for these API can be set to null in the calls if defaults settings is sufficient +// Example: CompressBlockBC1(srcBlock,16,cmpBlock,NULL); For "C" call +// CompressBlockBC1(srcBlock,16,cmpBlock); For "C++" calls +// +// To use this parameter first create the options context using the CreateOptions call +// then use the Set Options to set various codec settings and pass them to the appropriate +// Compress or Decompress API. 
+// The source (srcBlock) channel format is expected to be RGBA:8888 by default for LDR Codecs +// for BC6H the format is RGBA Half float (16 bits per channel) +//------------------------------------------------------------------------------------------------------ +#ifdef __cplusplus +#define CMP_DEFAULTNULL =NULL +#else +#define CMP_DEFAULTNULL +#endif + +//========================================================================================================= +// 4 channel Sources, default format RGBA:8888 is processed as a 4x4 block starting at srcBlock location +// where each row of the block is calculated from srcStride +//========================================================================================================= +int CMP_CDECL CompressBlockBC1(const unsigned char *srcBlock, unsigned int srcStrideInBytes, unsigned char cmpBlock[8 ], const void *options CMP_DEFAULTNULL); +int CMP_CDECL CompressBlockBC2(const unsigned char *srcBlock, unsigned int srcStrideInBytes, unsigned char cmpBlock[16], const void *options CMP_DEFAULTNULL); +int CMP_CDECL CompressBlockBC3(const unsigned char *srcBlock, unsigned int srcStrideInBytes, unsigned char cmpBlock[16], const void *options CMP_DEFAULTNULL); +int CMP_CDECL CompressBlockBC7(const unsigned char *srcBlock, unsigned int srcStrideInBytes, unsigned char cmpBlock[16], const void *options CMP_DEFAULTNULL); + +int CMP_CDECL DecompressBlockBC1(const unsigned char cmpBlock[8 ], unsigned char srcBlock[64], const void *options CMP_DEFAULTNULL); +int CMP_CDECL DecompressBlockBC2(const unsigned char cmpBlock[16], unsigned char srcBlock[64], const void *options CMP_DEFAULTNULL); +int CMP_CDECL DecompressBlockBC3(const unsigned char cmpBlock[16], unsigned char srcBlock[64], const void *options CMP_DEFAULTNULL); +int CMP_CDECL DecompressBlockBC7(const unsigned char cmpBlock[16], unsigned char srcBlock[64], const void *options CMP_DEFAULTNULL); + +//================================================ +// 1 channel Source 
4x4 8 bits per block +//================================================ +int CMP_CDECL CompressBlockBC4(const unsigned char *srcBlock, unsigned int srcStrideInBytes, unsigned char cmpBlock[8], const void *options CMP_DEFAULTNULL); +int CMP_CDECL DecompressBlockBC4(const unsigned char cmpBlock[8], unsigned char srcBlock[16], const void *options CMP_DEFAULTNULL); + +//================================================ +// 2 channel Source 2x(4x4 8 bits) +//================================================ +int CMP_CDECL CompressBlockBC5(const unsigned char *srcBlock1, unsigned int srcStrideInBytes1, + const unsigned char *srcBlock2, unsigned int srcStrideInBytes2, + unsigned char cmpBlock[16], const void *options CMP_DEFAULTNULL); +int CMP_CDECL DecompressBlockBC5(const unsigned char cmpBlock[16], unsigned char srcBlock1[16], unsigned char srcBlock2[16], const void *options CMP_DEFAULTNULL); + +//======================================================================================== +// For 3 channel Source RGB_16, Note srcStride is in unsigned short steps (2 bytes each) +//======================================================================================== +int CMP_CDECL CompressBlockBC6(const unsigned short *srcBlock, unsigned int srcStrideInShorts, unsigned char cmpBlock[16], const void *options CMP_DEFAULTNULL); +int CMP_CDECL DecompressBlockBC6(const unsigned char cmpBlock[16], unsigned short srcBlock[48], const void *options CMP_DEFAULTNULL); + +#endif // CMP_CORE diff --git a/extern/CMP_Core/source/cmp_math_vec4.h b/extern/CMP_Core/source/cmp_math_vec4.h new file mode 100644 index 0000000..d92080e --- /dev/null +++ b/extern/CMP_Core/source/cmp_math_vec4.h @@ -0,0 +1,417 @@ +//===================================================================== +// Copyright 2019 (c), Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +//===================================================================== +#ifndef CMP_MATH_VEC4_H +#define CMP_MATH_VEC4_H + +//==================================================== +// Vector Class definitions for CPU & Intrinsics +//==================================================== + +#if defined (_LINUX) || defined (_WIN32) + +//============================================= VEC2 ================================================== +template +class Vec2 +{ +public: + + T x; + T y; + + // ***************************************** + // Constructors + // ***************************************** + + /// Default constructor + Vec2() : x((T)0), y((T)0) {}; + + /// Value constructor + Vec2(const T& vx, const T& vy) : x(vx), y(vy) {}; + + /// Copy constructor + Vec2(const Vec2& val) : x(val.x), y(val.y) {}; + + /// Single value constructor. 
//============================================= VEC2 ==================================================
// 2-component vector template for the CPU code paths.
// Layout is exactly {x, y}; the T* conversion operators below rely on the
// two members being contiguous, so no virtuals or extra members may be added.
// NOTE(review): the pasted patch stripped every angle-bracketed token
// (template parameter lists and typedef arguments); they are restored here
// following the Vec3/Vec4 siblings.
template <class T>
class Vec2
{
public:

    T x;
    T y;

    // *****************************************
    //     Constructors
    // *****************************************

    /// Default constructor: zero-initializes both components.
    Vec2() : x((T)0), y((T)0) {};

    /// Value constructor.
    Vec2(const T& vx, const T& vy) : x(vx), y(vy) {};

    /// Copy constructor.
    Vec2(const Vec2& val) : x(val.x), y(val.y) {};

    /// Single value constructor. Sets all components to the given value.
    Vec2(const T& v) : x(v), y(v) {};

    /// Array constructor. Assumes a 2-component array.
    /// (Added for consistency: Vec3 and Vec4 both provide one.)
    Vec2(const T* v) : x(v[0]), y(v[1]) {};

    // *****************************************
    //     Conversions/Assignment/Indexing
    // *****************************************

    /// Cast to const T*.
    operator const T* () const { return (const T*)this; };

    /// Cast to T*.
    operator T* () { return (T*)this; };

    /// Indexing (0 = x, 1 = y; not range-checked).
    const T& operator[](int i) const { return ((const T*)this)[i]; };
    T& operator[](int i) { return ((T*)this)[i]; };

    /// Assignment.
    const Vec2& operator=(const Vec2& rhs) { x = rhs.x; y = rhs.y; return *this; };

    // *****************************************
    //     Comparison
    // *****************************************

    /// Equality comparison.
    bool operator==(const Vec2& rhs) const { return (x == rhs.x && y == rhs.y); };

    /// Inequality comparison.
    bool operator!=(const Vec2& rhs) const { return (x != rhs.x || y != rhs.y); };

    // *****************************************
    //     Arithmetic
    // *****************************************

    /// Addition.
    const Vec2 operator+(const Vec2& rhs) const { return Vec2(x + rhs.x, y + rhs.y); };

    /// Subtraction.
    const Vec2 operator-(const Vec2& rhs) const { return Vec2(x - rhs.x, y - rhs.y); };

    /// Multiply by scalar.
    const Vec2 operator*(const T& v) const { return Vec2(x * v, y * v); };

    /// Divide by scalar (no divide-by-zero check, matching Vec3/Vec4).
    const Vec2 operator/(const T& v) const { return Vec2(x / v, y / v); };

    /// Component-wise divide. (Added for consistency: Vec3/Vec4 have it.)
    const Vec2 operator/(const Vec2& rhs) const { return Vec2(x / rhs.x, y / rhs.y); };

    /// Addition in-place.
    Vec2& operator+= (const Vec2& rhs) { x += rhs.x; y += rhs.y; return *this; };

    /// Subtract in-place.
    Vec2& operator-= (const Vec2& rhs) { x -= rhs.x; y -= rhs.y; return *this; };

    /// Scalar multiply in-place.
    Vec2& operator*= (const T& v) { x *= v; y *= v; return *this; };

    /// Scalar divide in-place.
    Vec2& operator/= (const T& v) { x /= v; y /= v; return *this; };
};

// Typedef arguments restored from the project naming convention
// (f = float, d = double, i = int), matching the Vec3/Vec4 typedefs.
typedef Vec2<float>  CMP_Vec2f;
typedef Vec2<float>  CGU_Vec2f;
typedef Vec2<float>  CGV_Vec2f;
typedef Vec2<double> CMP_Vec2d;
typedef Vec2<int>    CMP_Vec2i;
//============================================= VEC3 ==================================================
// 3-component vector template. Layout is exactly {x, y, z}; the T*
// conversion operators rely on the members being contiguous.
// NOTE(review): template parameter lists and typedef arguments were stripped
// by the paste and are restored here.
template <class T>
class Vec3
{
public:

    T x;
    T y;
    T z;

    // *****************************************
    //     Constructors
    // *****************************************

    /// Default constructor: zero-initializes all components.
    Vec3() : x((T)0), y((T)0), z((T)0) {};

    /// Value constructor.
    Vec3(const T& vx, const T& vy, const T& vz) : x(vx), y(vy), z(vz) {};

    /// Copy constructor.
    Vec3(const Vec3& val) : x(val.x), y(val.y), z(val.z) {};

    /// Single value constructor. Sets all components to the given value.
    Vec3(const T& v) : x(v), y(v), z(v) {};

    /// Array constructor. Assumes a 3-component array.
    Vec3(const T* v) : x(v[0]), y(v[1]), z(v[2]) {};

    // *****************************************
    //     Conversions/Assignment/Indexing
    // *****************************************

    /// Cast to const T*.
    operator const T* () const { return (const T*)this; };

    /// Cast to T*.
    operator T* () { return (T*)this; };

    /// Assignment.
    const Vec3& operator=(const Vec3& rhs) { x = rhs.x; y = rhs.y; z = rhs.z; return *this; };

    // *****************************************
    //     Comparison
    // *****************************************

    /// Equality comparison.
    bool operator==(const Vec3& rhs) const { return (x == rhs.x && y == rhs.y && z == rhs.z); };

    /// Inequality comparison.
    bool operator!=(const Vec3& rhs) const { return (x != rhs.x || y != rhs.y || z != rhs.z); };

    // *****************************************
    //     Arithmetic
    // *****************************************

    /// Addition.
    const Vec3 operator+(const Vec3& rhs) const { return Vec3(x + rhs.x, y + rhs.y, z + rhs.z); };

    /// Subtraction.
    const Vec3 operator-(const Vec3& rhs) const { return Vec3(x - rhs.x, y - rhs.y, z - rhs.z); };

    /// Multiply by scalar.
    const Vec3 operator*(const T& v) const { return Vec3(x * v, y * v, z * v); };

    /// Divide by scalar (no divide-by-zero check).
    const Vec3 operator/(const T& v) const { return Vec3(x / v, y / v, z / v); };

    /// Component-wise divide.
    const Vec3 operator/(const Vec3& rhs) const { return Vec3(x / rhs.x, y / rhs.y, z / rhs.z); };

    /// Addition in-place.
    Vec3& operator+= (const Vec3& rhs) { x += rhs.x; y += rhs.y; z += rhs.z; return *this; };

    /// Subtract in-place.
    Vec3& operator-= (const Vec3& rhs) { x -= rhs.x; y -= rhs.y; z -= rhs.z; return *this; };

    /// Scalar multiply in-place.
    Vec3& operator*= (const T& v) { x *= v; y *= v; z *= v; return *this; };

    /// Scalar divide in-place.
    Vec3& operator/= (const T& v) { x /= v; y /= v; z /= v; return *this; };
};

typedef Vec3<float>         CGU_Vec3f;
typedef Vec3<float>         CGV_Vec3f;
typedef Vec3<unsigned char> CGU_Vec3uc;
typedef Vec3<unsigned char> CGV_Vec3uc;

typedef Vec3<float>         CMP_Vec3f;
typedef Vec3<double>        CMP_Vec3d;
typedef Vec3<int>           CMP_Vec3i;
typedef Vec3<unsigned char> CMP_Vec3uc;

//============================================= VEC4 ==================================================
// 4-component vector template. Layout is exactly {x, y, z, w}.
template <class T>
class Vec4
{
public:

    T x;
    T y;
    T z;
    T w;

    // *****************************************
    //     Constructors
    // *****************************************

    /// Default constructor: zero-initializes all components.
    Vec4() : x((T)0), y((T)0), z((T)0), w((T)0) {};

    /// Value constructor.
    Vec4(const T& vx, const T& vy, const T& vz, const T& vw) : x(vx), y(vy), z(vz), w(vw) {};

    /// Copy constructor.
    Vec4(const Vec4& val) : x(val.x), y(val.y), z(val.z), w(val.w) {};

    /// Single value constructor. Sets all components to the given value.
    Vec4(const T& v) : x(v), y(v), z(v), w(v) {};

    /// Array constructor. Assumes a 4-component array.
    Vec4(const T* v) : x(v[0]), y(v[1]), z(v[2]), w(v[3]) {};

    // *****************************************
    //     Conversions/Assignment/Indexing
    // *****************************************

    /// Cast to const T*.
    operator const T* () const { return (const T*)this; };

    /// Cast to T*.
    operator T* () { return (T*)this; };

    /// Assignment.
    const Vec4& operator=(const Vec4& rhs) { x = rhs.x; y = rhs.y; z = rhs.z; w = rhs.w; return *this; };

    // *****************************************
    //     Comparison
    // *****************************************

    /// Equality comparison.
    bool operator==(const Vec4& rhs) const { return (x == rhs.x && y == rhs.y && z == rhs.z && w == rhs.w); };

    /// Inequality comparison.
    bool operator!=(const Vec4& rhs) const { return (x != rhs.x || y != rhs.y || z != rhs.z || w != rhs.w); };

    // *****************************************
    //     Arithmetic
    // *****************************************

    /// Addition.
    const Vec4 operator+(const Vec4& rhs) const { return Vec4(x + rhs.x, y + rhs.y, z + rhs.z, w + rhs.w); };

    /// Subtraction.
    const Vec4 operator-(const Vec4& rhs) const { return Vec4(x - rhs.x, y - rhs.y, z - rhs.z, w - rhs.w); };

    /// Multiply by scalar.
    const Vec4 operator*(const T& v) const { return Vec4(x * v, y * v, z * v, w * v); };

    /// Divide by scalar (no divide-by-zero check).
    const Vec4 operator/(const T& v) const { return Vec4(x / v, y / v, z / v, w / v); };

    /// Component-wise divide.
    const Vec4 operator/(const Vec4& rhs) const { return Vec4(x / rhs.x, y / rhs.y, z / rhs.z, w / rhs.w); };

    /// Addition in-place.
    Vec4& operator+= (const Vec4& rhs) { x += rhs.x; y += rhs.y; z += rhs.z; w += rhs.w; return *this; };

    /// Subtract in-place.
    Vec4& operator-= (const Vec4& rhs) { x -= rhs.x; y -= rhs.y; z -= rhs.z; w -= rhs.w; return *this; };

    /// Scalar multiply in-place.
    Vec4& operator*= (const T& v) { x *= v; y *= v; z *= v; w *= v; return *this; };

    /// Scalar divide in-place.
    Vec4& operator/= (const T& v) { x /= v; y /= v; z /= v; w /= v; return *this; };
};

typedef Vec4<float>          CMP_Vec4f;
typedef Vec4<double>         CMP_Vec4d;
typedef Vec4<int>            CMP_Vec4i;
typedef Vec4<unsigned short> CMP_Vec4ui; // unsigned 16 bit x,y,z,w — TODO confirm: the pasted typedef lost its argument; "16 bit" per original comment
typedef Vec4<unsigned char>  CMP_Vec4uc; // unsigned 8 bit x,y,z,w
typedef Vec4<unsigned char>  CGU_Vec4uc; // unsigned 8 bit x,y,z,w
typedef Vec4<unsigned char>  CGV_Vec4uc; // unsigned 8 bit x,y,z,w
#include <xmmintrin.h>  // SSE intrinsics (_mm_*) used below.
// NOTE(review): the original #include targets were stripped by the paste;
// <xmmintrin.h> is the documented home of every _mm_* intrinsic used here.

// CMP_SSEVec4f: four packed floats held in one SSE register (__m128),
// with element-wise arithmetic, comparison (0 / 0xffffffff per lane) and
// bitwise operators. The default constructor leaves the register
// uninitialized, matching the original.
//
// Fix: the original keyed the two platform branches on the project macro
// _LINUX, so any non-MSVC build that forgot to define _LINUX fell into the
// MSVC-only branch (intrin.h, __declspec, .m128_f32) and failed to compile.
// Keying on the compiler (_MSC_VER) is always correct and preserves both
// original code paths.
#if defined(_MSC_VER)
#include <intrin.h>
class __declspec(align(16)) CMP_SSEVec4f
#else
class CMP_SSEVec4f
#endif
{
public:

    union
    {
        __m128 vec128;   // float vector, 128 bits total (16 bytes) = array of 4 floats
#if !defined(_MSC_VER)
        float f32[4];    // gcc/clang: __m128 has no named members, so alias it
#endif
    };

    // constructors
    inline CMP_SSEVec4f() {};
    inline CMP_SSEVec4f(float x, float y, float z, float w) : vec128(_mm_setr_ps(x, y, z, w)) {};
    inline CMP_SSEVec4f(__m128 vec) : vec128(vec) {}
    inline CMP_SSEVec4f(const float* data) : vec128(_mm_load_ps(data)) {};    // data must be 16-byte aligned
    inline CMP_SSEVec4f(float scalar) : vec128(_mm_load1_ps(&scalar)) {};     // broadcast scalar to all lanes

    // copy and assignment
    inline CMP_SSEVec4f(const CMP_SSEVec4f& init) : vec128(init.vec128) {};
    inline const CMP_SSEVec4f& operator=(const CMP_SSEVec4f& lhs) { vec128 = lhs.vec128; return *this; };

    // conversion to __m128 type for direct use in _mm intrinsics
    inline operator __m128() { return vec128; };
    inline operator const __m128() const { return vec128; };

    // indexing (0..3; not range-checked)
#if defined(_MSC_VER)
    inline const float& operator[](int i) const { return vec128.m128_f32[i]; };
    inline float& operator[](int i) { return vec128.m128_f32[i]; };
#else
    inline const float& operator[](int i) const { return f32[i]; };
    inline float& operator[](int i) { return f32[i]; };
#endif

    // addition
    inline CMP_SSEVec4f operator+(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_add_ps(vec128, rhs.vec128)); };
    inline CMP_SSEVec4f& operator+=(const CMP_SSEVec4f& rhs) { vec128 = _mm_add_ps(vec128, rhs.vec128); return *this; };

    // multiplication
    inline CMP_SSEVec4f operator*(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_mul_ps(vec128, rhs.vec128)); };
    inline CMP_SSEVec4f& operator*=(const CMP_SSEVec4f& rhs) { vec128 = _mm_mul_ps(vec128, rhs.vec128); return *this; };

    // scalar multiplication (disabled in the original; kept for reference —
    // the float constructor above already makes `v * 2.0f` work via conversion)
    //inline CMP_SSEVec4f operator*( float rhs ) const { return CMP_SSEVec4f( _mm_mul_ps(vec128, _mm_load1_ps(&rhs)) ); };
    //inline CMP_SSEVec4f& operator*=( float rhs ) { vec128 = _mm_mul_ps(vec128, _mm_load1_ps(&rhs)); return *this; };

    // subtraction
    inline CMP_SSEVec4f operator-(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_sub_ps(vec128, rhs.vec128)); };
    inline CMP_SSEVec4f& operator-= (const CMP_SSEVec4f& rhs) { vec128 = _mm_sub_ps(vec128, rhs.vec128); return *this; };

    // division
    inline CMP_SSEVec4f operator/(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_div_ps(vec128, rhs.vec128)); };
    inline CMP_SSEVec4f& operator/= (const CMP_SSEVec4f& rhs) { vec128 = _mm_div_ps(vec128, rhs.vec128); return *this; };

    // scalar division
    inline CMP_SSEVec4f operator/(float rhs) const { return CMP_SSEVec4f(_mm_div_ps(vec128, _mm_load1_ps(&rhs))); };
    inline CMP_SSEVec4f& operator/=(float rhs) { vec128 = _mm_div_ps(vec128, _mm_load1_ps(&rhs)); return *this; };

    // comparison
    // these return 0 or 0xffffffff in each component
    inline CMP_SSEVec4f operator< (const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_cmplt_ps(vec128, rhs.vec128)); };
    inline CMP_SSEVec4f operator> (const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_cmpgt_ps(vec128, rhs.vec128)); };
    inline CMP_SSEVec4f operator<=(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_cmple_ps(vec128, rhs.vec128)); };
    inline CMP_SSEVec4f operator>=(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_cmpge_ps(vec128, rhs.vec128)); };
    inline CMP_SSEVec4f operator==(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_cmpeq_ps(vec128, rhs.vec128)); };

    // bitwise operators
    inline CMP_SSEVec4f operator|(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_or_ps(vec128, rhs.vec128)); };
    inline CMP_SSEVec4f operator&(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_and_ps(vec128, rhs.vec128)); };
    inline CMP_SSEVec4f operator^(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_xor_ps(vec128, rhs.vec128)); };
    inline const CMP_SSEVec4f& operator|=(const CMP_SSEVec4f& rhs) { vec128 = _mm_or_ps(vec128, rhs.vec128); return *this; };
    inline const CMP_SSEVec4f& operator&=(const CMP_SSEVec4f& rhs) { vec128 = _mm_and_ps(vec128, rhs.vec128); return *this; };

    // SSE has no bitwise-not instruction, so fake it by XOR-ing with
    // all-ones. To get 0xffffffff in every lane we compare zero with
    // itself (0 == 0 is true in every lane).
    inline CMP_SSEVec4f operator~() const
    {
        __m128 zero = _mm_setzero_ps();
        __m128 is_true = _mm_cmpeq_ps(zero, zero);
        return _mm_xor_ps(is_true, vec128);
    };

};
#ifndef BLOCKCONSTANTS_H
#define BLOCKCONSTANTS_H

// Pre-compressed 4x4 BC1/BC2/BC3 test vectors for the CMP_Core unit tests.
// Each BCn_<color>_<alpha> array is one raw compressed block (8 bytes for
// BC1, 16 for BC2/BC3); the matching *_Block wrapper pairs it with the RGBA
// color the block is expected to decode to (Block::color starts null and is
// filled in at runtime by AssignExpectedColorsToBlocks in
// CompressonatorTests.cpp).
// NOTE(review): the original #include lines lost their targets in the
// paste; <unordered_map> and <string> are what the `blocks` map requires.
#include <unordered_map>
#include <string>

struct Block { const unsigned char* data; const unsigned char* color; };

// --- BC1: 8 bytes = two RGB565 endpoints + 2-bit indices ------------------
static const unsigned char BC1_Red_Ignore_Alpha       [] = {0x00, 0xf8, 0x00, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC1_Blue_Half_Alpha        [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_White_Half_Alpha       [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_Black_Half_Alpha       [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_Red_Blue_Half_Alpha    [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_Red_Green_Half_Alpha   [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_Green_Blue_Half_Alpha  [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_Red_Full_Alpha         [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_Green_Full_Alpha       [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_Blue_Full_Alpha        [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_White_Full_Alpha       [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_Green_Ignore_Alpha     [] = {0xe0, 0x07, 0xe0, 0x07, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC1_Black_Full_Alpha       [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_Red_Blue_Full_Alpha    [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_Red_Green_Full_Alpha   [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_Green_Blue_Full_Alpha  [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_Blue_Ignore_Alpha      [] = {0x1f, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC1_White_Ignore_Alpha     [] = {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC1_Black_Ignore_Alpha     [] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC1_Red_Blue_Ignore_Alpha  [] = {0x1f, 0xf8, 0x1f, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC1_Red_Green_Ignore_Alpha [] = {0xe0, 0xff, 0xe0, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC1_Green_Blue_Ignore_Alpha[] = {0xff, 0x07, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC1_Red_Half_Alpha         [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const unsigned char BC1_Green_Half_Alpha       [] = {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};

// --- BC2: 8 bytes explicit 4-bit alpha + 8-byte BC1 color part ------------
static const unsigned char BC2_Red_Ignore_Alpha       [] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0xf8, 0x00, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Blue_Half_Alpha        [] = {0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x1f, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_White_Half_Alpha       [] = {0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Black_Half_Alpha       [] = {0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Red_Blue_Half_Alpha    [] = {0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x1f, 0xf8, 0x1f, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Red_Green_Half_Alpha   [] = {0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0xe0, 0xff, 0xe0, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Green_Blue_Half_Alpha  [] = {0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0xff, 0x07, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Red_Full_Alpha         [] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x00, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Green_Full_Alpha       [] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0x07, 0xe0, 0x07, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Blue_Full_Alpha        [] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_White_Full_Alpha       [] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Green_Ignore_Alpha     [] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xe0, 0x07, 0xe0, 0x07, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Black_Full_Alpha       [] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Red_Blue_Full_Alpha    [] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xf8, 0x1f, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Red_Green_Full_Alpha   [] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xff, 0xe0, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Green_Blue_Full_Alpha  [] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x07, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Blue_Ignore_Alpha      [] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_White_Ignore_Alpha     [] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Black_Ignore_Alpha     [] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Red_Blue_Ignore_Alpha  [] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f, 0xf8, 0x1f, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Red_Green_Ignore_Alpha [] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xe0, 0xff, 0xe0, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Green_Blue_Ignore_Alpha[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x07, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Red_Half_Alpha         [] = {0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x00, 0xf8, 0x00, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC2_Green_Half_Alpha       [] = {0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0xe0, 0x07, 0xe0, 0x07, 0x00, 0x00, 0x00, 0x00};

// --- BC3: 8 bytes interpolated alpha + 8-byte BC1 color part --------------
static const unsigned char BC3_Red_Ignore_Alpha       [] = {0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x00, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Blue_Half_Alpha        [] = {0x7b, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_White_Half_Alpha       [] = {0x7b, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Black_Half_Alpha       [] = {0x7b, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Red_Blue_Half_Alpha    [] = {0x7b, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xf8, 0x1f, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Red_Green_Half_Alpha   [] = {0x7b, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xff, 0xe0, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Green_Blue_Half_Alpha  [] = {0x7b, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x07, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Red_Full_Alpha         [] = {0xff, 0x00, 0x49, 0x92, 0x24, 0x49, 0x92, 0x24, 0x00, 0xf8, 0x00, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Green_Full_Alpha       [] = {0xff, 0x00, 0x49, 0x92, 0x24, 0x49, 0x92, 0x24, 0xe0, 0x07, 0xe0, 0x07, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Blue_Full_Alpha        [] = {0xff, 0x00, 0x49, 0x92, 0x24, 0x49, 0x92, 0x24, 0x1f, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_White_Full_Alpha       [] = {0xff, 0x00, 0x49, 0x92, 0x24, 0x49, 0x92, 0x24, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Green_Ignore_Alpha     [] = {0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0x07, 0xe0, 0x07, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Black_Full_Alpha       [] = {0xff, 0x00, 0x49, 0x92, 0x24, 0x49, 0x92, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Red_Blue_Full_Alpha    [] = {0xff, 0x00, 0x49, 0x92, 0x24, 0x49, 0x92, 0x24, 0x1f, 0xf8, 0x1f, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Red_Green_Full_Alpha   [] = {0xff, 0x00, 0x49, 0x92, 0x24, 0x49, 0x92, 0x24, 0xe0, 0xff, 0xe0, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Green_Blue_Full_Alpha  [] = {0xff, 0x00, 0x49, 0x92, 0x24, 0x49, 0x92, 0x24, 0xff, 0x07, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Blue_Ignore_Alpha      [] = {0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_White_Ignore_Alpha     [] = {0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Black_Ignore_Alpha     [] = {0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Red_Blue_Ignore_Alpha  [] = {0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0xf8, 0x1f, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Red_Green_Ignore_Alpha [] = {0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xff, 0xe0, 0xff, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Green_Blue_Ignore_Alpha[] = {0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x07, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Red_Half_Alpha         [] = {0x7b, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x00, 0xf8, 0x00, 0x00, 0x00, 0x00};
static const unsigned char BC3_Green_Half_Alpha       [] = {0x7b, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0x07, 0xe0, 0x07, 0x00, 0x00, 0x00, 0x00};

// Fix: these wrappers were non-static globals in a header; including the
// header from more than one translation unit would break the one-definition
// rule and fail to link. `static` gives each TU its own copy.
static Block BC1_Red_Ignore_Alpha_Block        = {BC1_Red_Ignore_Alpha, nullptr};
static Block BC1_Blue_Half_Alpha_Block         = {BC1_Blue_Half_Alpha, nullptr};
static Block BC1_White_Half_Alpha_Block        = {BC1_White_Half_Alpha, nullptr};
static Block BC1_Black_Half_Alpha_Block        = {BC1_Black_Half_Alpha, nullptr};
static Block BC1_Red_Blue_Half_Alpha_Block     = {BC1_Red_Blue_Half_Alpha, nullptr};
static Block BC1_Red_Green_Half_Alpha_Block    = {BC1_Red_Green_Half_Alpha, nullptr};
static Block BC1_Green_Blue_Half_Alpha_Block   = {BC1_Green_Blue_Half_Alpha, nullptr};
static Block BC1_Red_Full_Alpha_Block          = {BC1_Red_Full_Alpha, nullptr};
static Block BC1_Green_Full_Alpha_Block        = {BC1_Green_Full_Alpha, nullptr};
static Block BC1_Blue_Full_Alpha_Block         = {BC1_Blue_Full_Alpha, nullptr};
static Block BC1_White_Full_Alpha_Block        = {BC1_White_Full_Alpha, nullptr};
static Block BC1_Green_Ignore_Alpha_Block      = {BC1_Green_Ignore_Alpha, nullptr};
static Block BC1_Black_Full_Alpha_Block        = {BC1_Black_Full_Alpha, nullptr};
static Block BC1_Red_Blue_Full_Alpha_Block     = {BC1_Red_Blue_Full_Alpha, nullptr};
static Block BC1_Red_Green_Full_Alpha_Block    = {BC1_Red_Green_Full_Alpha, nullptr};
static Block BC1_Green_Blue_Full_Alpha_Block   = {BC1_Green_Blue_Full_Alpha, nullptr};
static Block BC1_Blue_Ignore_Alpha_Block       = {BC1_Blue_Ignore_Alpha, nullptr};
static Block BC1_White_Ignore_Alpha_Block      = {BC1_White_Ignore_Alpha, nullptr};
static Block BC1_Black_Ignore_Alpha_Block      = {BC1_Black_Ignore_Alpha, nullptr};
static Block BC1_Red_Blue_Ignore_Alpha_Block   = {BC1_Red_Blue_Ignore_Alpha, nullptr};
static Block BC1_Red_Green_Ignore_Alpha_Block  = {BC1_Red_Green_Ignore_Alpha, nullptr};
static Block BC1_Green_Blue_Ignore_Alpha_Block = {BC1_Green_Blue_Ignore_Alpha, nullptr};
static Block BC1_Red_Half_Alpha_Block          = {BC1_Red_Half_Alpha, nullptr};
static Block BC1_Green_Half_Alpha_Block        = {BC1_Green_Half_Alpha, nullptr};
static Block BC2_Red_Ignore_Alpha_Block        = {BC2_Red_Ignore_Alpha, nullptr};
static Block BC2_Blue_Half_Alpha_Block         = {BC2_Blue_Half_Alpha, nullptr};
static Block BC2_White_Half_Alpha_Block        = {BC2_White_Half_Alpha, nullptr};
static Block BC2_Black_Half_Alpha_Block        = {BC2_Black_Half_Alpha, nullptr};
static Block BC2_Red_Blue_Half_Alpha_Block     = {BC2_Red_Blue_Half_Alpha, nullptr};
static Block BC2_Red_Green_Half_Alpha_Block    = {BC2_Red_Green_Half_Alpha, nullptr};
static Block BC2_Green_Blue_Half_Alpha_Block   = {BC2_Green_Blue_Half_Alpha, nullptr};
static Block BC2_Red_Full_Alpha_Block          = {BC2_Red_Full_Alpha, nullptr};
static Block BC2_Green_Full_Alpha_Block        = {BC2_Green_Full_Alpha, nullptr};
static Block BC2_Blue_Full_Alpha_Block         = {BC2_Blue_Full_Alpha, nullptr};
static Block BC2_White_Full_Alpha_Block        = {BC2_White_Full_Alpha, nullptr};
static Block BC2_Green_Ignore_Alpha_Block      = {BC2_Green_Ignore_Alpha, nullptr};
static Block BC2_Black_Full_Alpha_Block        = {BC2_Black_Full_Alpha, nullptr};
static Block BC2_Red_Blue_Full_Alpha_Block     = {BC2_Red_Blue_Full_Alpha, nullptr};
static Block BC2_Red_Green_Full_Alpha_Block    = {BC2_Red_Green_Full_Alpha, nullptr};
static Block BC2_Green_Blue_Full_Alpha_Block   = {BC2_Green_Blue_Full_Alpha, nullptr};
static Block BC2_Blue_Ignore_Alpha_Block       = {BC2_Blue_Ignore_Alpha, nullptr};
static Block BC2_White_Ignore_Alpha_Block      = {BC2_White_Ignore_Alpha, nullptr};
static Block BC2_Black_Ignore_Alpha_Block      = {BC2_Black_Ignore_Alpha, nullptr};
static Block BC2_Red_Blue_Ignore_Alpha_Block   = {BC2_Red_Blue_Ignore_Alpha, nullptr};
static Block BC2_Red_Green_Ignore_Alpha_Block  = {BC2_Red_Green_Ignore_Alpha, nullptr};
static Block BC2_Green_Blue_Ignore_Alpha_Block = {BC2_Green_Blue_Ignore_Alpha, nullptr};
static Block BC2_Red_Half_Alpha_Block          = {BC2_Red_Half_Alpha, nullptr};
static Block BC2_Green_Half_Alpha_Block        = {BC2_Green_Half_Alpha, nullptr};
static Block BC3_Red_Ignore_Alpha_Block        = {BC3_Red_Ignore_Alpha, nullptr};
static Block BC3_Blue_Half_Alpha_Block         = {BC3_Blue_Half_Alpha, nullptr};
static Block BC3_White_Half_Alpha_Block        = {BC3_White_Half_Alpha, nullptr};
static Block BC3_Black_Half_Alpha_Block        = {BC3_Black_Half_Alpha, nullptr};
static Block BC3_Red_Blue_Half_Alpha_Block     = {BC3_Red_Blue_Half_Alpha, nullptr};
static Block BC3_Red_Green_Half_Alpha_Block    = {BC3_Red_Green_Half_Alpha, nullptr};
static Block BC3_Green_Blue_Half_Alpha_Block   = {BC3_Green_Blue_Half_Alpha, nullptr};
static Block BC3_Red_Full_Alpha_Block          = {BC3_Red_Full_Alpha, nullptr};
static Block BC3_Green_Full_Alpha_Block        = {BC3_Green_Full_Alpha, nullptr};
static Block BC3_Blue_Full_Alpha_Block         = {BC3_Blue_Full_Alpha, nullptr};
static Block BC3_White_Full_Alpha_Block        = {BC3_White_Full_Alpha, nullptr};
static Block BC3_Green_Ignore_Alpha_Block      = {BC3_Green_Ignore_Alpha, nullptr};
static Block BC3_Black_Full_Alpha_Block        = {BC3_Black_Full_Alpha, nullptr};
static Block BC3_Red_Blue_Full_Alpha_Block     = {BC3_Red_Blue_Full_Alpha, nullptr};
static Block BC3_Red_Green_Full_Alpha_Block    = {BC3_Red_Green_Full_Alpha, nullptr};
static Block BC3_Green_Blue_Full_Alpha_Block   = {BC3_Green_Blue_Full_Alpha, nullptr};
static Block BC3_Blue_Ignore_Alpha_Block       = {BC3_Blue_Ignore_Alpha, nullptr};
static Block BC3_White_Ignore_Alpha_Block      = {BC3_White_Ignore_Alpha, nullptr};
static Block BC3_Black_Ignore_Alpha_Block      = {BC3_Black_Ignore_Alpha, nullptr};
static Block BC3_Red_Blue_Ignore_Alpha_Block   = {BC3_Red_Blue_Ignore_Alpha, nullptr};
static Block BC3_Red_Green_Ignore_Alpha_Block  = {BC3_Red_Green_Ignore_Alpha, nullptr};
static Block BC3_Green_Blue_Ignore_Alpha_Block = {BC3_Green_Blue_Ignore_Alpha, nullptr};
static Block BC3_Red_Half_Alpha_Block          = {BC3_Red_Half_Alpha, nullptr};
static Block BC3_Green_Half_Alpha_Block        = {BC3_Green_Half_Alpha, nullptr};

// Name -> block table (72 entries: 3 formats x 3 alpha levels x 8 colors).
// The map stores copies of the wrappers, so AssignExpectedColorsToBlocks
// mutates the map entries, not the globals above.
static std::unordered_map<std::string, Block> blocks {
    { "BC1_Red_Ignore_Alpha",        BC1_Red_Ignore_Alpha_Block},
    { "BC1_Blue_Half_Alpha",         BC1_Blue_Half_Alpha_Block},
    { "BC1_White_Half_Alpha",        BC1_White_Half_Alpha_Block},
    { "BC1_Black_Half_Alpha",        BC1_Black_Half_Alpha_Block},
    { "BC1_Red_Blue_Half_Alpha",     BC1_Red_Blue_Half_Alpha_Block},
    { "BC1_Red_Green_Half_Alpha",    BC1_Red_Green_Half_Alpha_Block},
    { "BC1_Green_Blue_Half_Alpha",   BC1_Green_Blue_Half_Alpha_Block},
    { "BC1_Red_Full_Alpha",          BC1_Red_Full_Alpha_Block},
    { "BC1_Green_Full_Alpha",        BC1_Green_Full_Alpha_Block},
    { "BC1_Blue_Full_Alpha",         BC1_Blue_Full_Alpha_Block},
    { "BC1_White_Full_Alpha",        BC1_White_Full_Alpha_Block},
    { "BC1_Green_Ignore_Alpha",      BC1_Green_Ignore_Alpha_Block},
    { "BC1_Black_Full_Alpha",        BC1_Black_Full_Alpha_Block},
    { "BC1_Red_Blue_Full_Alpha",     BC1_Red_Blue_Full_Alpha_Block},
    { "BC1_Red_Green_Full_Alpha",    BC1_Red_Green_Full_Alpha_Block},
    { "BC1_Green_Blue_Full_Alpha",   BC1_Green_Blue_Full_Alpha_Block},
    { "BC1_Blue_Ignore_Alpha",       BC1_Blue_Ignore_Alpha_Block},
    { "BC1_White_Ignore_Alpha",      BC1_White_Ignore_Alpha_Block},
    { "BC1_Black_Ignore_Alpha",      BC1_Black_Ignore_Alpha_Block},
    { "BC1_Red_Blue_Ignore_Alpha",   BC1_Red_Blue_Ignore_Alpha_Block},
    { "BC1_Red_Green_Ignore_Alpha",  BC1_Red_Green_Ignore_Alpha_Block},
    { "BC1_Green_Blue_Ignore_Alpha", BC1_Green_Blue_Ignore_Alpha_Block},
    { "BC1_Red_Half_Alpha",          BC1_Red_Half_Alpha_Block},
    { "BC1_Green_Half_Alpha",        BC1_Green_Half_Alpha_Block},
    { "BC2_Red_Ignore_Alpha",        BC2_Red_Ignore_Alpha_Block},
    { "BC2_Blue_Half_Alpha",         BC2_Blue_Half_Alpha_Block},
    { "BC2_White_Half_Alpha",        BC2_White_Half_Alpha_Block},
    { "BC2_Black_Half_Alpha",        BC2_Black_Half_Alpha_Block},
    { "BC2_Red_Blue_Half_Alpha",     BC2_Red_Blue_Half_Alpha_Block},
    { "BC2_Red_Green_Half_Alpha",    BC2_Red_Green_Half_Alpha_Block},
    { "BC2_Green_Blue_Half_Alpha",   BC2_Green_Blue_Half_Alpha_Block},
    { "BC2_Red_Full_Alpha",          BC2_Red_Full_Alpha_Block},
    { "BC2_Green_Full_Alpha",        BC2_Green_Full_Alpha_Block},
    { "BC2_Blue_Full_Alpha",         BC2_Blue_Full_Alpha_Block},
    { "BC2_White_Full_Alpha",        BC2_White_Full_Alpha_Block},
    { "BC2_Green_Ignore_Alpha",      BC2_Green_Ignore_Alpha_Block},
    { "BC2_Black_Full_Alpha",        BC2_Black_Full_Alpha_Block},
    { "BC2_Red_Blue_Full_Alpha",     BC2_Red_Blue_Full_Alpha_Block},
    { "BC2_Red_Green_Full_Alpha",    BC2_Red_Green_Full_Alpha_Block},
    { "BC2_Green_Blue_Full_Alpha",   BC2_Green_Blue_Full_Alpha_Block},
    { "BC2_Blue_Ignore_Alpha",       BC2_Blue_Ignore_Alpha_Block},
    { "BC2_White_Ignore_Alpha",      BC2_White_Ignore_Alpha_Block},
    { "BC2_Black_Ignore_Alpha",      BC2_Black_Ignore_Alpha_Block},
    { "BC2_Red_Blue_Ignore_Alpha",   BC2_Red_Blue_Ignore_Alpha_Block},
    { "BC2_Red_Green_Ignore_Alpha",  BC2_Red_Green_Ignore_Alpha_Block},
    { "BC2_Green_Blue_Ignore_Alpha", BC2_Green_Blue_Ignore_Alpha_Block},
    { "BC2_Red_Half_Alpha",          BC2_Red_Half_Alpha_Block},
    { "BC2_Green_Half_Alpha",        BC2_Green_Half_Alpha_Block},
    { "BC3_Red_Ignore_Alpha",        BC3_Red_Ignore_Alpha_Block},
    { "BC3_Blue_Half_Alpha",         BC3_Blue_Half_Alpha_Block},
    { "BC3_White_Half_Alpha",        BC3_White_Half_Alpha_Block},
    { "BC3_Black_Half_Alpha",        BC3_Black_Half_Alpha_Block},
    { "BC3_Red_Blue_Half_Alpha",     BC3_Red_Blue_Half_Alpha_Block},
    { "BC3_Red_Green_Half_Alpha",    BC3_Red_Green_Half_Alpha_Block},
    { "BC3_Green_Blue_Half_Alpha",   BC3_Green_Blue_Half_Alpha_Block},
    { "BC3_Red_Full_Alpha",          BC3_Red_Full_Alpha_Block},
    { "BC3_Green_Full_Alpha",        BC3_Green_Full_Alpha_Block},
    { "BC3_Blue_Full_Alpha",         BC3_Blue_Full_Alpha_Block},
    { "BC3_White_Full_Alpha",        BC3_White_Full_Alpha_Block},
    { "BC3_Green_Ignore_Alpha",      BC3_Green_Ignore_Alpha_Block},
    { "BC3_Black_Full_Alpha",        BC3_Black_Full_Alpha_Block},
    { "BC3_Red_Blue_Full_Alpha",     BC3_Red_Blue_Full_Alpha_Block},
    { "BC3_Red_Green_Full_Alpha",    BC3_Red_Green_Full_Alpha_Block},
    { "BC3_Green_Blue_Full_Alpha",   BC3_Green_Blue_Full_Alpha_Block},
    { "BC3_Blue_Ignore_Alpha",       BC3_Blue_Ignore_Alpha_Block},
    { "BC3_White_Ignore_Alpha",      BC3_White_Ignore_Alpha_Block},
    { "BC3_Black_Ignore_Alpha",      BC3_Black_Ignore_Alpha_Block},
    { "BC3_Red_Blue_Ignore_Alpha",   BC3_Red_Blue_Ignore_Alpha_Block},
    { "BC3_Red_Green_Ignore_Alpha",  BC3_Red_Green_Ignore_Alpha_Block},
    { "BC3_Green_Blue_Ignore_Alpha", BC3_Green_Blue_Ignore_Alpha_Block},
    { "BC3_Red_Half_Alpha",          BC3_Red_Half_Alpha_Block},
    { "BC3_Green_Half_Alpha",        BC3_Green_Half_Alpha_Block}
};

#endif // BLOCKCONSTANTS_H
+static const int BC3_BLOCK_SIZE = 16; +static const int DECOMPRESSED_BLOCK_SIZE = 64; +static const int STRIDE_DECOMPRESSED = 16; + +static const std::map> colorValues{ + { "Red_Ignore_Alpha", { 0xff, 0x0, 0x0, 0xff }}, + { "Green_Ignore_Alpha" , { 0x0, 0xff, 0x0, 0xff }}, + { "Blue_Ignore_Alpha" , { 0x0, 0x0, 0xff, 0xff }}, + { "White_Ignore_Alpha" , { 0xff, 0xff, 0xff, 0xff }}, + { "Black_Ignore_Alpha" , { 0x0, 0x0, 0x0, 0xff }}, + { "Red_Blue_Ignore_Alpha" , { 0xff, 0x0, 0xff, 0xff }}, + { "Red_Green_Ignore_Alpha" , { 0xff, 0xff, 0x0, 0xff }}, + { "Green_Blue_Ignore_Alpha", { 0x0, 0xff, 0xff, 0xff }}, + + { "Red_Half_Alpha" , { 0xff, 0x0, 0x0, 0x7b }}, + { "Green_Half_Alpha" , { 0x0, 0xff, 0x0, 0x7b }}, + { "Blue_Half_Alpha" , { 0x0, 0x0, 0xff, 0x7b }}, + { "White_Half_Alpha" , { 0xff, 0xff, 0xff, 0x7b }}, + { "Black_Half_Alpha" , { 0x0, 0x0, 0x0, 0x7b }}, + { "Red_Blue_Half_Alpha" , { 0xff, 0x0, 0xff, 0x7b }}, + { "Red_Green_Half_Alpha", { 0xff, 0xff, 0x0, 0x7b }}, + { "Green_Blue_Half_Alpha" , { 0x0, 0xff, 0xff, 0x7b }}, + + { "Red_Full_Alpha" , { 0xff, 0x0, 0x0, 0x0 }}, + { "Green_Full_Alpha" , { 0x0, 0xff, 0x0, 0x0 }}, + { "Blue_Full_Alpha" , { 0x0, 0x0, 0xff, 0x0 }}, + { "White_Full_Alpha" , { 0xff, 0xff, 0xff, 0x0 }}, + { "Black_Full_Alpha" , { 0x0, 0x0, 0x0, 0x0 }}, + { "Red_Blue_Full_Alpha" , { 0xff, 0x0, 0xff, 0x0 }}, + { "Red_Green_Full_Alpha", { 0xff, 0xff, 0x0, 0x0 }}, + { "Green_Blue_Full_Alpha" , { 0x0, 0xff, 0xff, 0x0 }} +}; + +//block storage format: [R, G, B, W, Black, RB, RG, GB]. 
Alpha: 100%, 50%, 0% +enum ColorEnum { + Red, Green, Blue, White, Black, Red_Blue, Red_Green, Green_Blue +}; +enum AlphaEnum { + Ignore_Alpha, Half_Alpha, Full_Alpha +}; +enum CompEnum { + BC1, BC2, BC3 +}; + +std::string BlockKeyName(CompEnum compression, ColorEnum color, AlphaEnum alpha) +{ + std::string result = ""; + switch (compression) { + case BC1: result += "BC1"; break; + case BC2: result += "BC2"; break; + case BC3: result += "BC3"; break; + } + switch (color) { + case Red: result += "_Red_"; break; + case Green: result += "_Green_"; break; + case Blue: result += "_Blue_"; break; + case White: result += "_White_"; break; + case Black: result += "_Black_"; break; + case Red_Blue: result += "_Red_Blue_"; break; + case Red_Green: result += "_Red_Green_"; break; + case Green_Blue: result += "_Green_Blue_"; break; + } + switch (alpha) { + case Ignore_Alpha: result += "Ignore_Alpha"; break; + case Half_Alpha: result += "Half_Alpha"; break; + case Full_Alpha: result += "Full_Alpha"; break; + } + return result; +} + +void AssignExpectedColorsToBlocks() +{ + ColorEnum color = Red; + CompEnum comp = BC1; + AlphaEnum alpha = Ignore_Alpha; + for (int i = 0; i < blocks.size(); ++i) { + if (i % 24 == 0 && i > 0) { + comp = static_cast(comp + 1); + } + if (i % 8 == 0 && i > 0) { + alpha = static_cast((alpha + 1) % 3); + } + const std::string keyBlocks = BlockKeyName(comp, color, alpha); + std::string keyColor = keyBlocks; + // string keyColor is in format BCn_color_alpha. To use it as key to access colorValues, delete the BCn_ part. + keyColor.erase(0, 4); + ((blocks.find(keyBlocks))->second).color = ((colorValues.find(keyColor))->second).data(); + color = static_cast((color + 1) % 8); + } +} + +bool ColorMatches(unsigned char* buffer, const unsigned char* expectedColor, bool ignoreAlpha) +{ + unsigned char expectedColorBuffer[64]; + // handle formats that do not support alpha. + if (ignoreAlpha) { + // if alpha is ignored, BC should set all values to 0. 
Except the alpha value which can be 0 or 0xff only. + if (buffer[3] != 0 && buffer[3] != 255) { + return false; + } + unsigned char expColorWithoutAlpha[4] = { 0 }; + // Only when the alpha value is 0xff colors are stored. Otherwise the RGB colors were set to 0 by during compression. + if (expectedColor[3] == 0xff) { + memcpy(expColorWithoutAlpha, expectedColor, 4); + } + // Set alpha value to the alpha value in the first pixel of the decompressed buffer. + // The buffer contains only one color, so all pixels should have the same values. + expColorWithoutAlpha[3] = buffer[3]; + + for (int idx = 0; idx < DECOMPRESSED_BLOCK_SIZE / 4; ++idx) { + memcpy(expectedColorBuffer + (idx * 4), expColorWithoutAlpha, 4); + } + return memcmp(&expectedColorBuffer, buffer, DECOMPRESSED_BLOCK_SIZE) == 0; + } + + for (int idx = 0; idx < DECOMPRESSED_BLOCK_SIZE / 4; ++idx) { + memcpy(expectedColorBuffer + (idx * 4), expectedColor, 4); + } + return memcmp(&expectedColorBuffer, buffer, DECOMPRESSED_BLOCK_SIZE) == 0; +} + +//*************************************************************************************** + +TEST_CASE("BC1_Red_Ignore_Alpha", "[BC1_Red_Ignore_Alpha]") +{ + const auto block = blocks.find("BC1_Red_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Blue_Half_Alpha", "[BC1_Blue_Half_Alpha]") +{ + const auto block = blocks.find("BC1_Blue_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + 
CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_White_Half_Alpha", "[BC1_White_Half_Alpha]") +{ + const auto block = blocks.find("BC1_White_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Black_Half_Alpha", "[BC1_Black_Half_Alpha]") +{ + const auto block = blocks.find("BC1_Black_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Red_Blue_Half_Alpha", "[BC1_Red_Blue_Half_Alpha]") +{ + const auto block = blocks.find("BC1_Red_Blue_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, 
decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Red_Green_Half_Alpha", "[BC1_Red_Green_Half_Alpha]") +{ + const auto block = blocks.find("BC1_Red_Green_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Green_Blue_Half_Alpha", "[BC1_Green_Blue_Half_Alpha]") +{ + const auto block = blocks.find("BC1_Green_Blue_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Red_Full_Alpha", "[BC1_Red_Full_Alpha]") +{ + const auto block = blocks.find("BC1_Red_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Green_Full_Alpha", "[BC1_Green_Full_Alpha]") +{ + const auto block = 
blocks.find("BC1_Green_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Blue_Full_Alpha", "[BC1_Blue_Full_Alpha]") +{ + const auto block = blocks.find("BC1_Blue_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_White_Full_Alpha", "[BC1_White_Full_Alpha]") +{ + const auto block = blocks.find("BC1_White_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Green_Ignore_Alpha", "[BC1_Green_Ignore_Alpha]") +{ + const auto block = blocks.find("BC1_Green_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + 
CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Black_Full_Alpha", "[BC1_Black_Full_Alpha]") +{ + const auto block = blocks.find("BC1_Black_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Red_Blue_Full_Alpha", "[BC1_Red_Blue_Full_Alpha]") +{ + const auto block = blocks.find("BC1_Red_Blue_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Red_Green_Full_Alpha", "[BC1_Red_Green_Full_Alpha]") +{ + const auto block = blocks.find("BC1_Red_Green_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + 
DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Green_Blue_Full_Alpha", "[BC1_Green_Blue_Full_Alpha]") +{ + const auto block = blocks.find("BC1_Green_Blue_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Blue_Ignore_Alpha", "[BC1_Blue_Ignore_Alpha]") +{ + const auto block = blocks.find("BC1_Blue_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_White_Ignore_Alpha", "[BC1_White_Ignore_Alpha]") +{ + const auto block = blocks.find("BC1_White_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Black_Ignore_Alpha", "[BC1_Black_Ignore_Alpha]") +{ + const auto 
block = blocks.find("BC1_Black_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Red_Blue_Ignore_Alpha", "[BC1_Red_Blue_Ignore_Alpha]") +{ + const auto block = blocks.find("BC1_Red_Blue_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Red_Green_Ignore_Alpha", "[BC1_Red_Green_Ignore_Alpha]") +{ + const auto block = blocks.find("BC1_Red_Green_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Green_Blue_Ignore_Alpha", "[BC1_Green_Blue_Ignore_Alpha]") +{ + const auto block = blocks.find("BC1_Green_Blue_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + 
DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Red_Half_Alpha", "[BC1_Red_Half_Alpha]") +{ + const auto block = blocks.find("BC1_Red_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC1_Green_Half_Alpha", "[BC1_Green_Half_Alpha]") +{ + const auto block = blocks.find("BC1_Green_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC1(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,true)); + unsigned char compBlock[8]; + unsigned char decompCompBlock [64]; + CompressBlockBC1(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC1(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,true)); +} +TEST_CASE("BC2_Red_Ignore_Alpha", "[BC2_Red_Ignore_Alpha]") +{ + const auto block = blocks.find("BC2_Red_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, 
nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Blue_Half_Alpha", "[BC2_Blue_Half_Alpha]") +{ + const auto block = blocks.find("BC2_Blue_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_White_Half_Alpha", "[BC2_White_Half_Alpha]") +{ + const auto block = blocks.find("BC2_White_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Black_Half_Alpha", "[BC2_Black_Half_Alpha]") +{ + const auto block = blocks.find("BC2_Black_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Red_Blue_Half_Alpha", "[BC2_Red_Blue_Half_Alpha]") +{ + const auto 
block = blocks.find("BC2_Red_Blue_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Red_Green_Half_Alpha", "[BC2_Red_Green_Half_Alpha]") +{ + const auto block = blocks.find("BC2_Red_Green_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Green_Blue_Half_Alpha", "[BC2_Green_Blue_Half_Alpha]") +{ + const auto block = blocks.find("BC2_Green_Blue_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Red_Full_Alpha", "[BC2_Red_Full_Alpha]") +{ + const auto block = blocks.find("BC2_Red_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + 
DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Green_Full_Alpha", "[BC2_Green_Full_Alpha]") +{ + const auto block = blocks.find("BC2_Green_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Blue_Full_Alpha", "[BC2_Blue_Full_Alpha]") +{ + const auto block = blocks.find("BC2_Blue_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_White_Full_Alpha", "[BC2_White_Full_Alpha]") +{ + const auto block = blocks.find("BC2_White_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, 
compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Green_Ignore_Alpha", "[BC2_Green_Ignore_Alpha]") +{ + const auto block = blocks.find("BC2_Green_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Black_Full_Alpha", "[BC2_Black_Full_Alpha]") +{ + const auto block = blocks.find("BC2_Black_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Red_Blue_Full_Alpha", "[BC2_Red_Blue_Full_Alpha]") +{ + const auto block = blocks.find("BC2_Red_Blue_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Red_Green_Full_Alpha", 
"[BC2_Red_Green_Full_Alpha]") +{ + const auto block = blocks.find("BC2_Red_Green_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Green_Blue_Full_Alpha", "[BC2_Green_Blue_Full_Alpha]") +{ + const auto block = blocks.find("BC2_Green_Blue_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Blue_Ignore_Alpha", "[BC2_Blue_Ignore_Alpha]") +{ + const auto block = blocks.find("BC2_Blue_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_White_Ignore_Alpha", "[BC2_White_Ignore_Alpha]") +{ + const auto block = blocks.find("BC2_White_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned 
char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Black_Ignore_Alpha", "[BC2_Black_Ignore_Alpha]") +{ + const auto block = blocks.find("BC2_Black_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Red_Blue_Ignore_Alpha", "[BC2_Red_Blue_Ignore_Alpha]") +{ + const auto block = blocks.find("BC2_Red_Blue_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Red_Green_Ignore_Alpha", "[BC2_Red_Green_Ignore_Alpha]") +{ + const auto block = blocks.find("BC2_Red_Green_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned 
char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Green_Blue_Ignore_Alpha", "[BC2_Green_Blue_Ignore_Alpha]") +{ + const auto block = blocks.find("BC2_Green_Blue_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Red_Half_Alpha", "[BC2_Red_Half_Alpha]") +{ + const auto block = blocks.find("BC2_Red_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC2_Green_Half_Alpha", "[BC2_Green_Half_Alpha]") +{ + const auto block = blocks.find("BC2_Green_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC2(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC2(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC2(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, 
blockColor,false)); +} +TEST_CASE("BC3_Red_Ignore_Alpha", "[BC3_Red_Ignore_Alpha]") +{ + const auto block = blocks.find("BC3_Red_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Blue_Half_Alpha", "[BC3_Blue_Half_Alpha]") +{ + const auto block = blocks.find("BC3_Blue_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_White_Half_Alpha", "[BC3_White_Half_Alpha]") +{ + const auto block = blocks.find("BC3_White_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Black_Half_Alpha", "[BC3_Black_Half_Alpha]") +{ + const auto block = blocks.find("BC3_Black_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = 
block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Red_Blue_Half_Alpha", "[BC3_Red_Blue_Half_Alpha]") +{ + const auto block = blocks.find("BC3_Red_Blue_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Red_Green_Half_Alpha", "[BC3_Red_Green_Half_Alpha]") +{ + const auto block = blocks.find("BC3_Red_Green_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Green_Blue_Half_Alpha", "[BC3_Green_Blue_Half_Alpha]") +{ + const auto block = blocks.find("BC3_Green_Blue_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char 
compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Red_Full_Alpha", "[BC3_Red_Full_Alpha]") +{ + const auto block = blocks.find("BC3_Red_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Green_Full_Alpha", "[BC3_Green_Full_Alpha]") +{ + const auto block = blocks.find("BC3_Green_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Blue_Full_Alpha", "[BC3_Blue_Full_Alpha]") +{ + const auto block = blocks.find("BC3_Blue_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, 
blockColor,false)); +} +TEST_CASE("BC3_White_Full_Alpha", "[BC3_White_Full_Alpha]") +{ + const auto block = blocks.find("BC3_White_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Green_Ignore_Alpha", "[BC3_Green_Ignore_Alpha]") +{ + const auto block = blocks.find("BC3_Green_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Black_Full_Alpha", "[BC3_Black_Full_Alpha]") +{ + const auto block = blocks.find("BC3_Black_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Red_Blue_Full_Alpha", "[BC3_Red_Blue_Full_Alpha]") +{ + const auto block = blocks.find("BC3_Red_Blue_Full_Alpha")->second; + const auto blockData = block.data; + const 
auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Red_Green_Full_Alpha", "[BC3_Red_Green_Full_Alpha]") +{ + const auto block = blocks.find("BC3_Red_Green_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Green_Blue_Full_Alpha", "[BC3_Green_Blue_Full_Alpha]") +{ + const auto block = blocks.find("BC3_Green_Blue_Full_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Blue_Ignore_Alpha", "[BC3_Blue_Ignore_Alpha]") +{ + const auto block = blocks.find("BC3_Blue_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned 
char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_White_Ignore_Alpha", "[BC3_White_Ignore_Alpha]") +{ + const auto block = blocks.find("BC3_White_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Black_Ignore_Alpha", "[BC3_Black_Ignore_Alpha]") +{ + const auto block = blocks.find("BC3_Black_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Red_Blue_Ignore_Alpha", "[BC3_Red_Blue_Ignore_Alpha]") +{ + const auto block = blocks.find("BC3_Red_Blue_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + 
CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Red_Green_Ignore_Alpha", "[BC3_Red_Green_Ignore_Alpha]") +{ + const auto block = blocks.find("BC3_Red_Green_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Green_Blue_Ignore_Alpha", "[BC3_Green_Blue_Ignore_Alpha]") +{ + const auto block = blocks.find("BC3_Green_Blue_Ignore_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Red_Half_Alpha", "[BC3_Red_Half_Alpha]") +{ + const auto block = blocks.find("BC3_Red_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} +TEST_CASE("BC3_Green_Half_Alpha", "[BC3_Green_Half_Alpha]") +{ + const auto block = 
blocks.find("BC3_Green_Half_Alpha")->second; + const auto blockData = block.data; + const auto blockColor = block.color; + unsigned char decompBlock [64]; + DecompressBlockBC3(blockData, decompBlock, nullptr); + CHECK(ColorMatches(decompBlock, blockColor,false)); + unsigned char compBlock[16]; + unsigned char decompCompBlock [64]; + CompressBlockBC3(decompBlock, 16, compBlock, nullptr); + DecompressBlockBC3(compBlock, decompCompBlock, nullptr); + CHECK(ColorMatches(decompCompBlock, blockColor,false)); +} + +//*************************************************************************************** \ No newline at end of file diff --git a/extern/CMP_Core/test/CompressonatorTests.h b/extern/CMP_Core/test/CompressonatorTests.h new file mode 100644 index 0000000..f070a4f --- /dev/null +++ b/extern/CMP_Core/test/CompressonatorTests.h @@ -0,0 +1,6 @@ +#ifndef COMPRESSONATOR_TESTS_H +#define COMPRESSONATOR_TESTS_H + +void AssignExpectedColorsToBlocks(); + +#endif \ No newline at end of file diff --git a/extern/CMP_Core/test/TestsMain.cpp b/extern/CMP_Core/test/TestsMain.cpp new file mode 100644 index 0000000..99f12a5 --- /dev/null +++ b/extern/CMP_Core/test/TestsMain.cpp @@ -0,0 +1,10 @@ +#define CATCH_CONFIG_RUNNER +#include "../../../Common/Lib/Ext/Catch2/catch.hpp" +#include "CompressonatorTests.h" + +int main(int argc, char* argv[]) { + AssignExpectedColorsToBlocks(); + int result = Catch::Session().run(argc, argv); + + return result; +} \ No newline at end of file diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 7e41986..4e812f4 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -9,5 +9,9 @@ ADD_SUBDIRECTORY(EtcLib) ADD_SUBDIRECTORY(rg_etc1_v104) #ADD_SUBDIRECTORY(etcpack) -ADD_SUBDIRECTORY(butteraugli) +#ADD_SUBDIRECTORY(butteraugli) + +ADD_SUBDIRECTORY(libsquish-1.15) + +ADD_SUBDIRECTORY(CMP_Core) diff --git a/extern/libsquish-1.15/CMakeLists.txt b/extern/libsquish-1.15/CMakeLists.txt new file mode 100644 index 0000000..a36e574 --- 
/dev/null +++ b/extern/libsquish-1.15/CMakeLists.txt @@ -0,0 +1,117 @@ +# cmake build file for squish +# by Stefan Roettger (snroettg@gmail.com) +# updated by Simon Brown (si@sjbrown.co.uk) + +# features: +# uses -fopenmp when available +# use BUILD_SQUISH_WITH_OPENMP to override +# Xcode: builds universal binaries, uses SSE2 on i386 and Altivec on ppc +# Unix and VS: SSE2 support is enabled by default +# use BUILD_SQUISH_WITH_SSE2 and BUILD_SQUISH_WITH_ALTIVEC to override + +PROJECT(squish) + +CMAKE_MINIMUM_REQUIRED(VERSION 2.8.3) + +OPTION(BUILD_SQUISH_WITH_OPENMP "Build with OpenMP." ON) + +OPTION(BUILD_SQUISH_WITH_SSE2 "Build with SSE2." ON) +OPTION(BUILD_SQUISH_WITH_ALTIVEC "Build with Altivec." OFF) + +OPTION(BUILD_SHARED_LIBS "Build shared libraries." OFF) + +OPTION(BUILD_SQUISH_EXTRA "Build extra source code." OFF) + +IF (BUILD_SQUISH_WITH_OPENMP) + FIND_PACKAGE(OpenMP) + IF (OPENMP_FOUND) + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + ADD_DEFINITIONS(-DSQUISH_USE_OPENMP) + ENDIF() +ENDIF() + +IF (CMAKE_GENERATOR STREQUAL "Xcode") + SET(CMAKE_OSX_ARCHITECTURES "i386;ppc") +ELSE (CMAKE_GENERATOR STREQUAL "Xcode") + IF (BUILD_SQUISH_WITH_SSE2 AND NOT WIN32) + ADD_DEFINITIONS(-DSQUISH_USE_SSE=2 -msse2) + ENDIF (BUILD_SQUISH_WITH_SSE2 AND NOT WIN32) + IF (BUILD_SQUISH_WITH_ALTIVEC AND NOT WIN32) + ADD_DEFINITIONS(-DSQUISH_USE_ALTIVEC=1 -maltivec) + ENDIF (BUILD_SQUISH_WITH_ALTIVEC AND NOT WIN32) +ENDIF (CMAKE_GENERATOR STREQUAL "Xcode") + +SET(SQUISH_HDRS + squish.h + ) + +SET(SQUISH_SRCS + alpha.cpp + alpha.h + clusterfit.cpp + clusterfit.h + colourblock.cpp + colourblock.h + colourfit.cpp + colourfit.h + colourset.cpp + colourset.h + maths.cpp + maths.h + rangefit.cpp + rangefit.h + simd.h + simd_float.h + simd_sse.h + simd_ve.h + singlecolourfit.cpp + singlecolourfit.h + singlecolourlookup.inl + squish.cpp + ) + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +ADD_LIBRARY(squish 
${SQUISH_SRCS} ${SQUISH_HDRS}) + +SET_TARGET_PROPERTIES( + squish PROPERTIES + PUBLIC_HEADER "${SQUISH_HDRS}" + VERSION 0.0 + SOVERSION 0.0 + DEBUG_POSTFIX "d" + XCODE_ATTRIBUTE_GCC_PREPROCESSOR_DEFINITIONS "$(SQUISH_CPP_$(CURRENT_ARCH))" + XCODE_ATTRIBUTE_OTHER_CFLAGS "$(SQUISH_CFLAGS_$(CURRENT_ARCH))" + XCODE_ATTRIBUTE_SQUISH_CPP_i386 "SQUISH_USE_SSE=2" + XCODE_ATTRIBUTE_SQUISH_CFLAGS_i386 "" + XCODE_ATTRIBUTE_SQUISH_CPP_ppc "SQUISH_USE_ALTIVEC=1" + XCODE_ATTRIBUTE_SQUISH_CFLAGS_ppc "-maltivec" + ) + +IF (BUILD_SQUISH_EXTRA) + SET(SQUISHTEST_SRCS extra/squishtest.cpp) + + ADD_EXECUTABLE(squishtest ${SQUISHTEST_SRCS}) + SET_TARGET_PROPERTIES(squishtest PROPERTIES DEBUG_POSTFIX "d") + TARGET_LINK_LIBRARIES(squishtest squish) + + SET(SQUISHPNG_SRCS extra/squishpng.cpp) + + FIND_PACKAGE(PNG) + + IF (PNG_FOUND) + SET(CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES) + INCLUDE_DIRECTORIES(${PNG_INCLUDE_DIR}) + ADD_EXECUTABLE(squishpng ${SQUISHPNG_SRCS}) + SET_TARGET_PROPERTIES(squishpng PROPERTIES DEBUG_POSTFIX "d") + TARGET_LINK_LIBRARIES(squishpng squish ${PNG_LIBRARIES}) + ENDIF (PNG_FOUND) +ENDIF (BUILD_SQUISH_EXTRA) + +INSTALL( + TARGETS squish + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + PUBLIC_HEADER DESTINATION include + ) diff --git a/extern/libsquish-1.15/CMakeModules/FindlibSquish.cmake b/extern/libsquish-1.15/CMakeModules/FindlibSquish.cmake new file mode 100644 index 0000000..a8d7cfe --- /dev/null +++ b/extern/libsquish-1.15/CMakeModules/FindlibSquish.cmake @@ -0,0 +1,14 @@ +# Defines +# LIBSQUISH_FOUND +# LIBSQUISH_INCLUDE_DIR +# LIBSQUISH_LIBRARIES + +FIND_PATH(LIBSQUISH_INCLUDE_DIR squish.h PATHS . squish .. ../squish DOC "Directory containing libSquish headers") +FIND_LIBRARY(LIBSQUISH_LIBRARY NAMES squish libsquish PATHS . squish .. 
../squish PATH_SUFFIXES lib lib64 release minsizerel relwithdebinfo DOC "Path to libSquish library") + +SET(LIBSQUISH_LIBRARIES ${LIBSQUISH_LIBRARY}) + +IF (LIBSQUISH_LIBRARY AND LIBSQUISH_INCLUDE_DIR) + SET(LIBSQUISH_FOUND TRUE) + MESSAGE(STATUS "Found libSquish: ${LIBSQUISH_LIBRARY}") +ENDIF (LIBSQUISH_LIBRARY AND LIBSQUISH_INCLUDE_DIR) diff --git a/extern/libsquish-1.15/ChangeLog.txt b/extern/libsquish-1.15/ChangeLog.txt new file mode 100644 index 0000000..f6c8c6d --- /dev/null +++ b/extern/libsquish-1.15/ChangeLog.txt @@ -0,0 +1,66 @@ +1.15 +* parallel compression using openmp with cmake (Marian Krivos / Stefan Roettger) +* parallel decompression using openmp with cmake (Stefan Roettger) + +1.14 +* backport BGRA support +* backport BC4 and BC5 support +* backport BlockMSE support + +1.11-1.13 +* added support for CMake and QMake (Stefan Roettger) +* misc. minor changes on the build system (Stefan Roettger) +* added svg icon (Stefan Roettger) + +1.10 +* Iterative cluster fit is now considered to be a new compression mode +* The core cluster fit is now 4x faster using contributions by Ignacio +Castano from NVIDIA +* The single colour lookup table has been halved by exploiting symmetry + +1.9 +* Added contributed SSE1 truncate implementation +* Changed use of SQUISH_USE_SSE to be 1 for SSE and 2 for SSE2 instructions +* Cluster fit is now iterative to further reduce image error + +1.8 +* Switched from using floor to trunc for much better SSE performance (again) +* Xcode build now expects libpng in /usr/local for extra/squishpng + +1.7 +* Fixed floating-point equality issue in clusterfit sort (x86 affected only) +* Implemented proper SSE(2) floor function for 50% speedup on SSE builds +* The range fit implementation now uses the correct colour metric + +1.6 +* Fixed bug in CompressImage where masked pixels were not skipped over +* DXT3 and DXT5 alpha compression now properly use the mask to ignore pixels +* Fixed major DXT1 bug that can generate unexpected 
transparent pixels + +1.5 +* Added CompressMasked function to handle incomplete DXT blocks more cleanly +* Added kWeightColourByAlpha flag for better quality images when alpha blending + +1.4 +* Fixed stack overflow in rangefit + +1.3 +* Worked around SSE floor implementation bug, proper fix needed! +* This release has visual studio and makefile builds that work + +1.2 +* Added provably optimal single colour compressor +* Added extra/squishgen.cpp that generates single colour lookup tables + +1.1 +* Fixed a DXT1 colour output bug +* Changed argument order for Decompress function to match Compress +* Added GetStorageRequirements function +* Added CompressImage function +* Added DecompressImage function +* Moved squishtool.cpp to extra/squishpng.cpp +* Added extra/squishtest.cpp + +1.0 +* Initial release + diff --git a/extern/libsquish-1.15/Doxyfile b/extern/libsquish-1.15/Doxyfile new file mode 100644 index 0000000..3c54d29 --- /dev/null +++ b/extern/libsquish-1.15/Doxyfile @@ -0,0 +1,214 @@ +# Doxyfile 1.4.6 + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- +PROJECT_NAME = squish +PROJECT_NUMBER = 1.14 +OUTPUT_DIRECTORY = docs +CREATE_SUBDIRS = NO +OUTPUT_LANGUAGE = English +BRIEF_MEMBER_DESC = YES +REPEAT_BRIEF = YES +ABBREVIATE_BRIEF = +ALWAYS_DETAILED_SEC = NO +INLINE_INHERITED_MEMB = NO +FULL_PATH_NAMES = YES +STRIP_FROM_PATH = +STRIP_FROM_INC_PATH = +SHORT_NAMES = NO +JAVADOC_AUTOBRIEF = NO +MULTILINE_CPP_IS_BRIEF = NO +INHERIT_DOCS = YES +SEPARATE_MEMBER_PAGES = NO +TAB_SIZE = 4 +ALIASES = +OPTIMIZE_OUTPUT_FOR_C = NO +OPTIMIZE_OUTPUT_JAVA = NO +BUILTIN_STL_SUPPORT = NO +DISTRIBUTE_GROUP_DOC = NO +SUBGROUPING = YES +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- 
+EXTRACT_ALL = YES +EXTRACT_PRIVATE = NO +EXTRACT_STATIC = NO +EXTRACT_LOCAL_CLASSES = YES +EXTRACT_LOCAL_METHODS = NO +HIDE_UNDOC_MEMBERS = NO +HIDE_UNDOC_CLASSES = NO +HIDE_FRIEND_COMPOUNDS = NO +HIDE_IN_BODY_DOCS = NO +INTERNAL_DOCS = NO +CASE_SENSE_NAMES = NO +HIDE_SCOPE_NAMES = NO +SHOW_INCLUDE_FILES = YES +INLINE_INFO = YES +SORT_MEMBER_DOCS = YES +SORT_BRIEF_DOCS = NO +SORT_BY_SCOPE_NAME = NO +GENERATE_TODOLIST = YES +GENERATE_TESTLIST = YES +GENERATE_BUGLIST = YES +GENERATE_DEPRECATEDLIST= YES +ENABLED_SECTIONS = +MAX_INITIALIZER_LINES = 30 +SHOW_USED_FILES = YES +FILE_VERSION_FILTER = +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- +QUIET = YES +WARNINGS = YES +WARN_IF_UNDOCUMENTED = YES +WARN_IF_DOC_ERROR = YES +WARN_NO_PARAMDOC = NO +WARN_FORMAT = "$file:$line: $text" +WARN_LOGFILE = +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- +INPUT = squish.h +FILE_PATTERNS = +RECURSIVE = NO +EXCLUDE = +EXCLUDE_SYMLINKS = NO +EXCLUDE_PATTERNS = +EXAMPLE_PATH = +EXAMPLE_PATTERNS = +EXAMPLE_RECURSIVE = NO +IMAGE_PATH = +INPUT_FILTER = +FILTER_PATTERNS = +FILTER_SOURCE_FILES = NO +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- +SOURCE_BROWSER = NO +INLINE_SOURCES = NO +STRIP_CODE_COMMENTS = YES +REFERENCED_BY_RELATION = YES +REFERENCES_RELATION = YES +USE_HTAGS = NO +VERBATIM_HEADERS = YES +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index 
+#--------------------------------------------------------------------------- +ALPHABETICAL_INDEX = NO +COLS_IN_ALPHA_INDEX = 5 +IGNORE_PREFIX = +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- +GENERATE_HTML = YES +HTML_OUTPUT = html +HTML_FILE_EXTENSION = .html +HTML_HEADER = +HTML_FOOTER = +HTML_STYLESHEET = +GENERATE_HTMLHELP = NO +CHM_FILE = +HHC_LOCATION = +GENERATE_CHI = NO +BINARY_TOC = NO +TOC_EXPAND = NO +DISABLE_INDEX = NO +ENUM_VALUES_PER_LINE = 4 +GENERATE_TREEVIEW = NO +TREEVIEW_WIDTH = 250 +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- +GENERATE_LATEX = NO +LATEX_OUTPUT = latex +LATEX_CMD_NAME = latex +MAKEINDEX_CMD_NAME = makeindex +COMPACT_LATEX = NO +PAPER_TYPE = a4wide +EXTRA_PACKAGES = +LATEX_HEADER = +PDF_HYPERLINKS = NO +USE_PDFLATEX = NO +LATEX_BATCHMODE = NO +LATEX_HIDE_INDICES = NO +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- +GENERATE_RTF = NO +RTF_OUTPUT = rtf +COMPACT_RTF = NO +RTF_HYPERLINKS = NO +RTF_STYLESHEET_FILE = +RTF_EXTENSIONS_FILE = +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- +GENERATE_MAN = NO +MAN_OUTPUT = man +MAN_EXTENSION = .3 +MAN_LINKS = NO +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- +GENERATE_XML = NO +XML_OUTPUT = xml 
+XML_PROGRAMLISTING = YES +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- +GENERATE_AUTOGEN_DEF = NO +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- +GENERATE_PERLMOD = NO +PERLMOD_LATEX = NO +PERLMOD_PRETTY = YES +PERLMOD_MAKEVAR_PREFIX = +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = NO +EXPAND_ONLY_PREDEF = NO +SEARCH_INCLUDES = YES +INCLUDE_PATH = +INCLUDE_FILE_PATTERNS = +PREDEFINED = +EXPAND_AS_DEFINED = +SKIP_FUNCTION_MACROS = YES +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- +TAGFILES = +GENERATE_TAGFILE = +ALLEXTERNALS = NO +EXTERNAL_GROUPS = YES +PERL_PATH = /usr/bin/perl +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- +CLASS_DIAGRAMS = YES +HIDE_UNDOC_RELATIONS = YES +HAVE_DOT = NO +CLASS_GRAPH = YES +COLLABORATION_GRAPH = YES +GROUP_GRAPHS = YES +UML_LOOK = NO +TEMPLATE_RELATIONS = NO +INCLUDE_GRAPH = YES +INCLUDED_BY_GRAPH = YES +CALL_GRAPH = NO +GRAPHICAL_HIERARCHY = YES +DIRECTORY_GRAPH = YES +DOT_IMAGE_FORMAT = png +DOTFILE_DIRS = +MAX_DOT_GRAPH_DEPTH = 0 +DOT_TRANSPARENT = NO +DOT_MULTI_TARGETS = NO +GENERATE_LEGEND = YES +DOT_CLEANUP = YES 
+#--------------------------------------------------------------------------- +# Configuration::additions related to the search engine +#--------------------------------------------------------------------------- +SEARCHENGINE = NO diff --git a/extern/libsquish-1.15/LICENSE.txt b/extern/libsquish-1.15/LICENSE.txt new file mode 100644 index 0000000..ed1c78d --- /dev/null +++ b/extern/libsquish-1.15/LICENSE.txt @@ -0,0 +1,20 @@ + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/extern/libsquish-1.15/Makefile b/extern/libsquish-1.15/Makefile new file mode 100644 index 0000000..fd7d6c8 --- /dev/null +++ b/extern/libsquish-1.15/Makefile @@ -0,0 +1,65 @@ +include config + +VER = 1.15 +SOVER = 0 + +SRC = alpha.cpp clusterfit.cpp colourblock.cpp colourfit.cpp colourset.cpp maths.cpp rangefit.cpp singlecolourfit.cpp squish.cpp + +HDR = alpha.h clusterfit.h colourblock.h colourfit.h colourset.h maths.h rangefit.h singlecolourfit.h squish.h +HDR += config.h simd.h simd_float.h simd_sse.h simd_ve.h singlecolourlookup.inl + +OBJ = $(SRC:%.cpp=%.o) + +SOLIB = libsquish.so.$(SOVER) +LIB = $(SOLIB).0 +CPPFLAGS += -fPIC +LIBA = libsquish.a + +.PHONY: all install uninstall docs tgz clean + +all: $(LIB) $(LIBA) docs libsquish.pc + +install: $(LIB) $(LIBA) libsquish.pc + $(INSTALL_DIRECTORY) $(INSTALL_DIR)/include $(INSTALL_DIR)/$(LIB_PATH) + $(INSTALL_FILE) squish.h $(INSTALL_DIR)/include + $(INSTALL_FILE) $(LIBA) $(INSTALL_DIR)/$(LIB_PATH) +ifneq ($(USE_SHARED),0) + $(INSTALL_FILE) $(LIB) $(INSTALL_DIR)/$(LIB_PATH) + ln -s $(LIB) $(INSTALL_DIR)/$(LIB_PATH)/$(SOLIB) + ln -s $(LIB) $(INSTALL_DIR)/$(LIB_PATH)/libsquish.so + $(INSTALL_DIRECTORY) $(INSTALL_DIR)/$(LIB_PATH)/pkgconfig + $(INSTALL_FILE) libsquish.pc $(INSTALL_DIR)/$(LIB_PATH)/pkgconfig +endif + +uninstall: + $(RM) $(INSTALL_DIR)/include/squish.h + $(RM) $(INSTALL_DIR)/$(LIB_PATH)/$(LIBA) + -$(RM) $(INSTALL_DIR)/$(LIB_PATH)/$(LIB) + -$(RM) $(INSTALL_DIR)/$(LIB_PATH)/$(SOLIB) + -$(RM) $(INSTALL_DIR)/$(LIB_PATH)/libsquish.so + -$(RM) $(INSTALL_DIR)/$(LIB_PATH)/pkgconfig/libsquish.pc + +$(LIB): $(OBJ) +ifneq ($(USE_SHARED),0) + $(CXX) $(LDFLAGS) -shared -Wl,-soname,$(SOLIB) -o $@ $(OBJ) +endif + +$(LIBA): $(OBJ) + $(AR) cr $@ $? 
+ @ranlib $@ + +docs: $(SRC) $(HDR) + @if [ -x "`command -v doxygen`" ]; then doxygen; fi + +libsquish.pc: libsquish.pc.in + @sed 's|@PREFIX@|$(PREFIX)|;s|@LIB_PATH@|$(LIB_PATH)|' $@.in > $@ + +tgz: clean + tar zcf libsquish-$(VER).tgz $(SRC) $(HDR) Makefile config CMakeLists.txt CMakeModules libSquish.* README.txt LICENSE.txt ChangeLog.txt Doxyfile libsquish.pc.in extra --exclude \*.svn\* + +%.o: %.cpp + $(CXX) $(CPPFLAGS) -I. $(CXXFLAGS) -o $@ -c $< + +clean: + $(RM) $(OBJ) $(LIB) $(LIBA) libsquish.pc + @-$(RM) -rf docs diff --git a/extern/libsquish-1.15/README.txt b/extern/libsquish-1.15/README.txt new file mode 100644 index 0000000..60380ee --- /dev/null +++ b/extern/libsquish-1.15/README.txt @@ -0,0 +1,18 @@ +LICENSE +------- + +The squish library is distributed under the terms and conditions of the MIT +license. This license is specified at the top of each source file and must be +preserved in its entirety. + +BUILDING AND INSTALLING THE LIBRARY +----------------------------------- + +The preferred way to install the library on Unix/Mac (and Windows) is via cmake: + cmake . 
&& make && sudo make install + +REPORTING BUGS OR FEATURE REQUESTS +---------------------------------- + +Feedback can be sent to Simon Brown (the developer) at si@sjbrown.co.uk +Feedback can also be sent to Stefan Roettger (the maintainer) at snroettg@gmail.com diff --git a/extern/libsquish-1.15/alpha.cpp b/extern/libsquish-1.15/alpha.cpp new file mode 100644 index 0000000..7039c1a --- /dev/null +++ b/extern/libsquish-1.15/alpha.cpp @@ -0,0 +1,350 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#include "alpha.h" + +#include +#include + +namespace squish { + +static int FloatToInt( float a, int limit ) +{ + // use ANSI round-to-zero behaviour to get round-to-nearest + int i = ( int )( a + 0.5f ); + + // clamp to the limit + if( i < 0 ) + i = 0; + else if( i > limit ) + i = limit; + + // done + return i; +} + +void CompressAlphaDxt3( u8 const* rgba, int mask, void* block ) +{ + u8* bytes = reinterpret_cast< u8* >( block ); + + // quantise and pack the alpha values pairwise + for( int i = 0; i < 8; ++i ) + { + // quantise down to 4 bits + float alpha1 = ( float )rgba[8*i + 3] * ( 15.0f/255.0f ); + float alpha2 = ( float )rgba[8*i + 7] * ( 15.0f/255.0f ); + int quant1 = FloatToInt( alpha1, 15 ); + int quant2 = FloatToInt( alpha2, 15 ); + + // set alpha to zero where masked + int bit1 = 1 << ( 2*i ); + int bit2 = 1 << ( 2*i + 1 ); + if( ( mask & bit1 ) == 0 ) + quant1 = 0; + if( ( mask & bit2 ) == 0 ) + quant2 = 0; + + // pack into the byte + bytes[i] = ( u8 )( quant1 | ( quant2 << 4 ) ); + } +} + +void DecompressAlphaDxt3( u8* rgba, void const* block ) +{ + u8 const* bytes = reinterpret_cast< u8 const* >( block ); + + // unpack the alpha values pairwise + for( int i = 0; i < 8; ++i ) + { + // quantise down to 4 bits + u8 quant = bytes[i]; + + // unpack the values + u8 lo = quant & 0x0f; + u8 hi = quant & 0xf0; + + // convert back up to bytes + rgba[8*i + 3] = lo | ( lo << 4 ); + rgba[8*i + 7] = hi | ( hi >> 4 ); + } +} + +static void FixRange( int& min, int& max, int steps ) +{ + if( max - min < steps ) + max = std::min( min + steps, 255 ); + if( max - min < steps ) + min = std::max( 0, max - steps ); +} + +static int FitCodes( u8 const* rgba, int mask, u8 const* codes, u8* indices ) +{ + // fit each alpha value to the codebook + int err = 0; + for( int i = 0; i < 16; ++i ) + { + // check this pixel is valid + int bit = 1 << i; + if( ( mask & bit ) == 0 ) + { + // use the 
first code + indices[i] = 0; + continue; + } + + // find the least error and corresponding index + int value = rgba[4*i + 3]; + int least = INT_MAX; + int index = 0; + for( int j = 0; j < 8; ++j ) + { + // get the squared error from this code + int dist = ( int )value - ( int )codes[j]; + dist *= dist; + + // compare with the best so far + if( dist < least ) + { + least = dist; + index = j; + } + } + + // save this index and accumulate the error + indices[i] = ( u8 )index; + err += least; + } + + // return the total error + return err; +} + +static void WriteAlphaBlock( int alpha0, int alpha1, u8 const* indices, void* block ) +{ + u8* bytes = reinterpret_cast< u8* >( block ); + + // write the first two bytes + bytes[0] = ( u8 )alpha0; + bytes[1] = ( u8 )alpha1; + + // pack the indices with 3 bits each + u8* dest = bytes + 2; + u8 const* src = indices; + for( int i = 0; i < 2; ++i ) + { + // pack 8 3-bit values + int value = 0; + for( int j = 0; j < 8; ++j ) + { + int index = *src++; + value |= ( index << 3*j ); + } + + // store in 3 bytes + for( int j = 0; j < 3; ++j ) + { + int byte = ( value >> 8*j ) & 0xff; + *dest++ = ( u8 )byte; + } + } +} + +static void WriteAlphaBlock5( int alpha0, int alpha1, u8 const* indices, void* block ) +{ + // check the relative values of the endpoints + if( alpha0 > alpha1 ) + { + // swap the indices + u8 swapped[16]; + for( int i = 0; i < 16; ++i ) + { + u8 index = indices[i]; + if( index == 0 ) + swapped[i] = 1; + else if( index == 1 ) + swapped[i] = 0; + else if( index <= 5 ) + swapped[i] = 7 - index; + else + swapped[i] = index; + } + + // write the block + WriteAlphaBlock( alpha1, alpha0, swapped, block ); + } + else + { + // write the block + WriteAlphaBlock( alpha0, alpha1, indices, block ); + } +} + +static void WriteAlphaBlock7( int alpha0, int alpha1, u8 const* indices, void* block ) +{ + // check the relative values of the endpoints + if( alpha0 < alpha1 ) + { + // swap the indices + u8 swapped[16]; + for( int i = 0; i < 
16; ++i ) + { + u8 index = indices[i]; + if( index == 0 ) + swapped[i] = 1; + else if( index == 1 ) + swapped[i] = 0; + else + swapped[i] = 9 - index; + } + + // write the block + WriteAlphaBlock( alpha1, alpha0, swapped, block ); + } + else + { + // write the block + WriteAlphaBlock( alpha0, alpha1, indices, block ); + } +} + +void CompressAlphaDxt5( u8 const* rgba, int mask, void* block ) +{ + // get the range for 5-alpha and 7-alpha interpolation + int min5 = 255; + int max5 = 0; + int min7 = 255; + int max7 = 0; + for( int i = 0; i < 16; ++i ) + { + // check this pixel is valid + int bit = 1 << i; + if( ( mask & bit ) == 0 ) + continue; + + // incorporate into the min/max + int value = rgba[4*i + 3]; + if( value < min7 ) + min7 = value; + if( value > max7 ) + max7 = value; + if( value != 0 && value < min5 ) + min5 = value; + if( value != 255 && value > max5 ) + max5 = value; + } + + // handle the case that no valid range was found + if( min5 > max5 ) + min5 = max5; + if( min7 > max7 ) + min7 = max7; + + // fix the range to be the minimum in each case + FixRange( min5, max5, 5 ); + FixRange( min7, max7, 7 ); + + // set up the 5-alpha code book + u8 codes5[8]; + codes5[0] = ( u8 )min5; + codes5[1] = ( u8 )max5; + for( int i = 1; i < 5; ++i ) + codes5[1 + i] = ( u8 )( ( ( 5 - i )*min5 + i*max5 )/5 ); + codes5[6] = 0; + codes5[7] = 255; + + // set up the 7-alpha code book + u8 codes7[8]; + codes7[0] = ( u8 )min7; + codes7[1] = ( u8 )max7; + for( int i = 1; i < 7; ++i ) + codes7[1 + i] = ( u8 )( ( ( 7 - i )*min7 + i*max7 )/7 ); + + // fit the data to both code books + u8 indices5[16]; + u8 indices7[16]; + int err5 = FitCodes( rgba, mask, codes5, indices5 ); + int err7 = FitCodes( rgba, mask, codes7, indices7 ); + + // save the block with least error + if( err5 <= err7 ) + WriteAlphaBlock5( min5, max5, indices5, block ); + else + WriteAlphaBlock7( min7, max7, indices7, block ); +} + +void DecompressAlphaDxt5( u8* rgba, void const* block ) +{ + // get the two alpha 
values + u8 const* bytes = reinterpret_cast< u8 const* >( block ); + int alpha0 = bytes[0]; + int alpha1 = bytes[1]; + + // compare the values to build the codebook + u8 codes[8]; + codes[0] = ( u8 )alpha0; + codes[1] = ( u8 )alpha1; + if( alpha0 <= alpha1 ) + { + // use 5-alpha codebook + for( int i = 1; i < 5; ++i ) + codes[1 + i] = ( u8 )( ( ( 5 - i )*alpha0 + i*alpha1 )/5 ); + codes[6] = 0; + codes[7] = 255; + } + else + { + // use 7-alpha codebook + for( int i = 1; i < 7; ++i ) + codes[1 + i] = ( u8 )( ( ( 7 - i )*alpha0 + i*alpha1 )/7 ); + } + + // decode the indices + u8 indices[16]; + u8 const* src = bytes + 2; + u8* dest = indices; + for( int i = 0; i < 2; ++i ) + { + // grab 3 bytes + int value = 0; + for( int j = 0; j < 3; ++j ) + { + int byte = *src++; + value |= ( byte << 8*j ); + } + + // unpack 8 3-bit values from it + for( int j = 0; j < 8; ++j ) + { + int index = ( value >> 3*j ) & 0x7; + *dest++ = ( u8 )index; + } + } + + // write out the indexed codebook values + for( int i = 0; i < 16; ++i ) + rgba[4*i + 3] = codes[indices[i]]; +} + +} // namespace squish diff --git a/extern/libsquish-1.15/alpha.h b/extern/libsquish-1.15/alpha.h new file mode 100644 index 0000000..a1fffd4 --- /dev/null +++ b/extern/libsquish-1.15/alpha.h @@ -0,0 +1,41 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_ALPHA_H +#define SQUISH_ALPHA_H + +#include "squish.h" + +namespace squish { + +void CompressAlphaDxt3( u8 const* rgba, int mask, void* block ); +void CompressAlphaDxt5( u8 const* rgba, int mask, void* block ); + +void DecompressAlphaDxt3( u8* rgba, void const* block ); +void DecompressAlphaDxt5( u8* rgba, void const* block ); + +} // namespace squish + +#endif // ndef SQUISH_ALPHA_H diff --git a/extern/libsquish-1.15/clusterfit.cpp b/extern/libsquish-1.15/clusterfit.cpp new file mode 100644 index 0000000..1610ecb --- /dev/null +++ b/extern/libsquish-1.15/clusterfit.cpp @@ -0,0 +1,392 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + Copyright (c) 2007 Ignacio Castano icastano@nvidia.com + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#include "clusterfit.h" +#include "colourset.h" +#include "colourblock.h" +#include + +namespace squish { + +ClusterFit::ClusterFit( ColourSet const* colours, int flags, float* metric ) + : ColourFit( colours, flags ) +{ + // set the iteration count + m_iterationCount = ( m_flags & kColourIterativeClusterFit ) ? kMaxIterations : 1; + + // initialise the metric (old perceptual = 0.2126f, 0.7152f, 0.0722f) + if( metric ) + m_metric = Vec4( metric[0], metric[1], metric[2], 1.0f ); + else + m_metric = VEC4_CONST( 1.0f ); + + // initialise the best error + m_besterror = VEC4_CONST( FLT_MAX ); + + // cache some values + int const count = m_colours->GetCount(); + Vec3 const* values = m_colours->GetPoints(); + + // get the covariance matrix + Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights() ); + + // compute the principle component + m_principle = ComputePrincipleComponent( covariance ); +} + +bool ClusterFit::ConstructOrdering( Vec3 const& axis, int iteration ) +{ + // cache some values + int const count = m_colours->GetCount(); + Vec3 const* values = m_colours->GetPoints(); + + // build the list of dot products + float dps[16]; + u8* order = ( u8* )m_order + 16*iteration; + for( int i = 0; i < count; ++i ) + { + dps[i] = Dot( values[i], axis ); + order[i] = ( u8 )i; + } + + // stable sort using them + for( int i = 0; i < count; ++i ) + { + for( int j = i; j > 0 && dps[j] < dps[j - 
1]; --j ) + { + std::swap( dps[j], dps[j - 1] ); + std::swap( order[j], order[j - 1] ); + } + } + + // check this ordering is unique + for( int it = 0; it < iteration; ++it ) + { + u8 const* prev = ( u8* )m_order + 16*it; + bool same = true; + for( int i = 0; i < count; ++i ) + { + if( order[i] != prev[i] ) + { + same = false; + break; + } + } + if( same ) + return false; + } + + // copy the ordering and weight all the points + Vec3 const* unweighted = m_colours->GetPoints(); + float const* weights = m_colours->GetWeights(); + m_xsum_wsum = VEC4_CONST( 0.0f ); + for( int i = 0; i < count; ++i ) + { + int j = order[i]; + Vec4 p( unweighted[j].X(), unweighted[j].Y(), unweighted[j].Z(), 1.0f ); + Vec4 w( weights[j] ); + Vec4 x = p*w; + m_points_weights[i] = x; + m_xsum_wsum += x; + } + return true; +} + +void ClusterFit::Compress3( void* block ) +{ + // declare variables + int const count = m_colours->GetCount(); + Vec4 const two = VEC4_CONST( 2.0 ); + Vec4 const one = VEC4_CONST( 1.0f ); + Vec4 const half_half2( 0.5f, 0.5f, 0.5f, 0.25f ); + Vec4 const zero = VEC4_CONST( 0.0f ); + Vec4 const half = VEC4_CONST( 0.5f ); + Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); + Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); + + // prepare an ordering using the principle axis + ConstructOrdering( m_principle, 0 ); + + // check all possible clusters and iterate on the total order + Vec4 beststart = VEC4_CONST( 0.0f ); + Vec4 bestend = VEC4_CONST( 0.0f ); + Vec4 besterror = m_besterror; + u8 bestindices[16]; + int bestiteration = 0; + int besti = 0, bestj = 0; + + // loop over iterations (we avoid the case that all points in first or last cluster) + for( int iterationIndex = 0;; ) + { + // first cluster [0,i) is at the start + Vec4 part0 = VEC4_CONST( 0.0f ); + for( int i = 0; i < count; ++i ) + { + // second cluster [i,j) is half along + Vec4 part1 = ( i == 0 ) ? m_points_weights[0] : VEC4_CONST( 0.0f ); + int jmin = ( i == 0 ) ? 
1 : i; + for( int j = jmin;; ) + { + // last cluster [j,count) is at the end + Vec4 part2 = m_xsum_wsum - part1 - part0; + + // compute least squares terms directly + Vec4 alphax_sum = MultiplyAdd( part1, half_half2, part0 ); + Vec4 alpha2_sum = alphax_sum.SplatW(); + + Vec4 betax_sum = MultiplyAdd( part1, half_half2, part2 ); + Vec4 beta2_sum = betax_sum.SplatW(); + + Vec4 alphabeta_sum = ( part1*half_half2 ).SplatW(); + + // compute the least-squares optimal points + Vec4 factor = Reciprocal( NegativeMultiplySubtract( alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum ) ); + Vec4 a = NegativeMultiplySubtract( betax_sum, alphabeta_sum, alphax_sum*beta2_sum )*factor; + Vec4 b = NegativeMultiplySubtract( alphax_sum, alphabeta_sum, betax_sum*alpha2_sum )*factor; + + // clamp to the grid + a = Min( one, Max( zero, a ) ); + b = Min( one, Max( zero, b ) ); + a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp; + b = Truncate( MultiplyAdd( grid, b, half ) )*gridrcp; + + // compute the error (we skip the constant xxsum) + Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); + Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum ); + Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 ); + Vec4 e4 = MultiplyAdd( two, e3, e1 ); + + // apply the metric to the error term + Vec4 e5 = e4*m_metric; + Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ(); + + // keep the solution if it wins + if( CompareAnyLessThan( error, besterror ) ) + { + beststart = a; + bestend = b; + besti = i; + bestj = j; + besterror = error; + bestiteration = iterationIndex; + } + + // advance + if( j == count ) + break; + part1 += m_points_weights[j]; + ++j; + } + + // advance + part0 += m_points_weights[i]; + } + + // stop if we didn't improve in this iteration + if( bestiteration != iterationIndex ) + break; + + // advance if possible + ++iterationIndex; + if( iterationIndex == m_iterationCount ) + break; + + // stop if a new iteration is an ordering that has already been tried + 
Vec3 axis = ( bestend - beststart ).GetVec3(); + if( !ConstructOrdering( axis, iterationIndex ) ) + break; + } + + // save the block if necessary + if( CompareAnyLessThan( besterror, m_besterror ) ) + { + // remap the indices + u8 const* order = ( u8* )m_order + 16*bestiteration; + + u8 unordered[16]; + for( int m = 0; m < besti; ++m ) + unordered[order[m]] = 0; + for( int m = besti; m < bestj; ++m ) + unordered[order[m]] = 2; + for( int m = bestj; m < count; ++m ) + unordered[order[m]] = 1; + + m_colours->RemapIndices( unordered, bestindices ); + + // save the block + WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block ); + + // save the error + m_besterror = besterror; + } +} + +void ClusterFit::Compress4( void* block ) +{ + // declare variables + int const count = m_colours->GetCount(); + Vec4 const two = VEC4_CONST( 2.0f ); + Vec4 const one = VEC4_CONST( 1.0f ); + Vec4 const onethird_onethird2( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f ); + Vec4 const twothirds_twothirds2( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f ); + Vec4 const twonineths = VEC4_CONST( 2.0f/9.0f ); + Vec4 const zero = VEC4_CONST( 0.0f ); + Vec4 const half = VEC4_CONST( 0.5f ); + Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); + Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); + + // prepare an ordering using the principle axis + ConstructOrdering( m_principle, 0 ); + + // check all possible clusters and iterate on the total order + Vec4 beststart = VEC4_CONST( 0.0f ); + Vec4 bestend = VEC4_CONST( 0.0f ); + Vec4 besterror = m_besterror; + u8 bestindices[16]; + int bestiteration = 0; + int besti = 0, bestj = 0, bestk = 0; + + // loop over iterations (we avoid the case that all points in first or last cluster) + for( int iterationIndex = 0;; ) + { + // first cluster [0,i) is at the start + Vec4 part0 = VEC4_CONST( 0.0f ); + for( int i = 0; i < count; ++i ) + { + // second cluster [i,j) is one third along + Vec4 part1 = VEC4_CONST( 0.0f ); + for( int j = 
i;; ) + { + // third cluster [j,k) is two thirds along + Vec4 part2 = ( j == 0 ) ? m_points_weights[0] : VEC4_CONST( 0.0f ); + int kmin = ( j == 0 ) ? 1 : j; + for( int k = kmin;; ) + { + // last cluster [k,count) is at the end + Vec4 part3 = m_xsum_wsum - part2 - part1 - part0; + + // compute least squares terms directly + Vec4 const alphax_sum = MultiplyAdd( part2, onethird_onethird2, MultiplyAdd( part1, twothirds_twothirds2, part0 ) ); + Vec4 const alpha2_sum = alphax_sum.SplatW(); + + Vec4 const betax_sum = MultiplyAdd( part1, onethird_onethird2, MultiplyAdd( part2, twothirds_twothirds2, part3 ) ); + Vec4 const beta2_sum = betax_sum.SplatW(); + + Vec4 const alphabeta_sum = twonineths*( part1 + part2 ).SplatW(); + + // compute the least-squares optimal points + Vec4 factor = Reciprocal( NegativeMultiplySubtract( alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum ) ); + Vec4 a = NegativeMultiplySubtract( betax_sum, alphabeta_sum, alphax_sum*beta2_sum )*factor; + Vec4 b = NegativeMultiplySubtract( alphax_sum, alphabeta_sum, betax_sum*alpha2_sum )*factor; + + // clamp to the grid + a = Min( one, Max( zero, a ) ); + b = Min( one, Max( zero, b ) ); + a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp; + b = Truncate( MultiplyAdd( grid, b, half ) )*gridrcp; + + // compute the error (we skip the constant xxsum) + Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); + Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum ); + Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 ); + Vec4 e4 = MultiplyAdd( two, e3, e1 ); + + // apply the metric to the error term + Vec4 e5 = e4*m_metric; + Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ(); + + // keep the solution if it wins + if( CompareAnyLessThan( error, besterror ) ) + { + beststart = a; + bestend = b; + besterror = error; + besti = i; + bestj = j; + bestk = k; + bestiteration = iterationIndex; + } + + // advance + if( k == count ) + break; + part2 += m_points_weights[k]; + ++k; + } + + // 
advance + if( j == count ) + break; + part1 += m_points_weights[j]; + ++j; + } + + // advance + part0 += m_points_weights[i]; + } + + // stop if we didn't improve in this iteration + if( bestiteration != iterationIndex ) + break; + + // advance if possible + ++iterationIndex; + if( iterationIndex == m_iterationCount ) + break; + + // stop if a new iteration is an ordering that has already been tried + Vec3 axis = ( bestend - beststart ).GetVec3(); + if( !ConstructOrdering( axis, iterationIndex ) ) + break; + } + + // save the block if necessary + if( CompareAnyLessThan( besterror, m_besterror ) ) + { + // remap the indices + u8 const* order = ( u8* )m_order + 16*bestiteration; + + u8 unordered[16]; + for( int m = 0; m < besti; ++m ) + unordered[order[m]] = 0; + for( int m = besti; m < bestj; ++m ) + unordered[order[m]] = 2; + for( int m = bestj; m < bestk; ++m ) + unordered[order[m]] = 3; + for( int m = bestk; m < count; ++m ) + unordered[order[m]] = 1; + + m_colours->RemapIndices( unordered, bestindices ); + + // save the block + WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block ); + + // save the error + m_besterror = besterror; + } +} + +} // namespace squish diff --git a/extern/libsquish-1.15/clusterfit.h b/extern/libsquish-1.15/clusterfit.h new file mode 100644 index 0000000..999396b --- /dev/null +++ b/extern/libsquish-1.15/clusterfit.h @@ -0,0 +1,61 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + Copyright (c) 2007 Ignacio Castano icastano@nvidia.com + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is 
furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_CLUSTERFIT_H +#define SQUISH_CLUSTERFIT_H + +#include "squish.h" +#include "maths.h" +#include "simd.h" +#include "colourfit.h" + +namespace squish { + +class ClusterFit : public ColourFit +{ +public: + ClusterFit( ColourSet const* colours, int flags, float* metric ); + +private: + bool ConstructOrdering( Vec3 const& axis, int iteration ); + + virtual void Compress3( void* block ); + virtual void Compress4( void* block ); + + enum { kMaxIterations = 8 }; + + int m_iterationCount; + Vec3 m_principle; + u8 m_order[16*kMaxIterations]; + Vec4 m_points_weights[16]; + Vec4 m_xsum_wsum; + Vec4 m_metric; + Vec4 m_besterror; +}; + +} // namespace squish + +#endif // ndef SQUISH_CLUSTERFIT_H diff --git a/extern/libsquish-1.15/colourblock.cpp b/extern/libsquish-1.15/colourblock.cpp new file mode 100644 index 0000000..af8b980 --- /dev/null +++ b/extern/libsquish-1.15/colourblock.cpp @@ -0,0 +1,214 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without 
restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#include "colourblock.h" + +namespace squish { + +static int FloatToInt( float a, int limit ) +{ + // use ANSI round-to-zero behaviour to get round-to-nearest + int i = ( int )( a + 0.5f ); + + // clamp to the limit + if( i < 0 ) + i = 0; + else if( i > limit ) + i = limit; + + // done + return i; +} + +static int FloatTo565( Vec3::Arg colour ) +{ + // get the components in the correct range + int r = FloatToInt( 31.0f*colour.X(), 31 ); + int g = FloatToInt( 63.0f*colour.Y(), 63 ); + int b = FloatToInt( 31.0f*colour.Z(), 31 ); + + // pack into a single value + return ( r << 11 ) | ( g << 5 ) | b; +} + +static void WriteColourBlock( int a, int b, u8* indices, void* block ) +{ + // get the block as bytes + u8* bytes = ( u8* )block; + + // write the endpoints + bytes[0] = ( u8 )( a & 0xff ); + bytes[1] = ( u8 )( a >> 8 ); + bytes[2] = ( u8 )( b & 0xff ); + bytes[3] = ( u8 )( b >> 8 ); + + // write the indices + for( int i = 0; i < 4; ++i ) + { + u8 const* ind = indices + 4*i; + bytes[4 + i] = ind[0] | ( ind[1] << 2 ) | ( ind[2] 
<< 4 ) | ( ind[3] << 6 ); + } +} + +void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block ) +{ + // get the packed values + int a = FloatTo565( start ); + int b = FloatTo565( end ); + + // remap the indices + u8 remapped[16]; + if( a <= b ) + { + // use the indices directly + for( int i = 0; i < 16; ++i ) + remapped[i] = indices[i]; + } + else + { + // swap a and b + std::swap( a, b ); + for( int i = 0; i < 16; ++i ) + { + if( indices[i] == 0 ) + remapped[i] = 1; + else if( indices[i] == 1 ) + remapped[i] = 0; + else + remapped[i] = indices[i]; + } + } + + // write the block + WriteColourBlock( a, b, remapped, block ); +} + +void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block ) +{ + // get the packed values + int a = FloatTo565( start ); + int b = FloatTo565( end ); + + // remap the indices + u8 remapped[16]; + if( a < b ) + { + // swap a and b + std::swap( a, b ); + for( int i = 0; i < 16; ++i ) + remapped[i] = ( indices[i] ^ 0x1 ) & 0x3; + } + else if( a == b ) + { + // use index 0 + for( int i = 0; i < 16; ++i ) + remapped[i] = 0; + } + else + { + // use the indices directly + for( int i = 0; i < 16; ++i ) + remapped[i] = indices[i]; + } + + // write the block + WriteColourBlock( a, b, remapped, block ); +} + +static int Unpack565( u8 const* packed, u8* colour ) +{ + // build the packed value + int value = ( int )packed[0] | ( ( int )packed[1] << 8 ); + + // get the components in the stored range + u8 red = ( u8 )( ( value >> 11 ) & 0x1f ); + u8 green = ( u8 )( ( value >> 5 ) & 0x3f ); + u8 blue = ( u8 )( value & 0x1f ); + + // scale up to 8 bits + colour[0] = ( red << 3 ) | ( red >> 2 ); + colour[1] = ( green << 2 ) | ( green >> 4 ); + colour[2] = ( blue << 3 ) | ( blue >> 2 ); + colour[3] = 255; + + // return the value + return value; +} + +void DecompressColour( u8* rgba, void const* block, bool isDxt1 ) +{ + // get the block bytes + u8 const* bytes = reinterpret_cast< u8 const* >( block ); + 
+ // unpack the endpoints + u8 codes[16]; + int a = Unpack565( bytes, codes ); + int b = Unpack565( bytes + 2, codes + 4 ); + + // generate the midpoints + for( int i = 0; i < 3; ++i ) + { + int c = codes[i]; + int d = codes[4 + i]; + + if( isDxt1 && a <= b ) + { + codes[8 + i] = ( u8 )( ( c + d )/2 ); + codes[12 + i] = 0; + } + else + { + codes[8 + i] = ( u8 )( ( 2*c + d )/3 ); + codes[12 + i] = ( u8 )( ( c + 2*d )/3 ); + } + } + + // fill in alpha for the intermediate values + codes[8 + 3] = 255; + codes[12 + 3] = ( isDxt1 && a <= b ) ? 0 : 255; + + // unpack the indices + u8 indices[16]; + for( int i = 0; i < 4; ++i ) + { + u8* ind = indices + 4*i; + u8 packed = bytes[4 + i]; + + ind[0] = packed & 0x3; + ind[1] = ( packed >> 2 ) & 0x3; + ind[2] = ( packed >> 4 ) & 0x3; + ind[3] = ( packed >> 6 ) & 0x3; + } + + // store out the colours + for( int i = 0; i < 16; ++i ) + { + u8 offset = 4*indices[i]; + for( int j = 0; j < 4; ++j ) + rgba[4*i + j] = codes[offset + j]; + } +} + +} // namespace squish diff --git a/extern/libsquish-1.15/colourblock.h b/extern/libsquish-1.15/colourblock.h new file mode 100644 index 0000000..fee2cd7 --- /dev/null +++ b/extern/libsquish-1.15/colourblock.h @@ -0,0 +1,41 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_COLOURBLOCK_H +#define SQUISH_COLOURBLOCK_H + +#include "squish.h" +#include "maths.h" + +namespace squish { + +void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block ); +void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block ); + +void DecompressColour( u8* rgba, void const* block, bool isDxt1 ); + +} // namespace squish + +#endif // ndef SQUISH_COLOURBLOCK_H diff --git a/extern/libsquish-1.15/colourfit.cpp b/extern/libsquish-1.15/colourfit.cpp new file mode 100644 index 0000000..e45b656 --- /dev/null +++ b/extern/libsquish-1.15/colourfit.cpp @@ -0,0 +1,54 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#include "colourfit.h" +#include "colourset.h" + +namespace squish { + +ColourFit::ColourFit( ColourSet const* colours, int flags ) + : m_colours( colours ), + m_flags( flags ) +{ +} + +ColourFit::~ColourFit() +{ +} + +void ColourFit::Compress( void* block ) +{ + bool isDxt1 = ( ( m_flags & kDxt1 ) != 0 ); + if( isDxt1 ) + { + Compress3( block ); + if( !m_colours->IsTransparent() ) + Compress4( block ); + } + else + Compress4( block ); +} + +} // namespace squish diff --git a/extern/libsquish-1.15/colourfit.h b/extern/libsquish-1.15/colourfit.h new file mode 100644 index 0000000..e73dceb --- /dev/null +++ b/extern/libsquish-1.15/colourfit.h @@ -0,0 +1,56 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_COLOURFIT_H +#define SQUISH_COLOURFIT_H + +#include "squish.h" +#include "maths.h" + +#include <climits> + +namespace squish { + +class ColourSet; + +class ColourFit +{ +public: + ColourFit( ColourSet const* colours, int flags ); + virtual ~ColourFit(); + + void Compress( void* block ); + +protected: + virtual void Compress3( void* block ) = 0; + virtual void Compress4( void* block ) = 0; + + ColourSet const* m_colours; + int m_flags; +}; + +} // namespace squish + +#endif // ndef SQUISH_COLOURFIT_H diff --git a/extern/libsquish-1.15/colourset.cpp b/extern/libsquish-1.15/colourset.cpp new file mode 100644 index 0000000..e900556 --- /dev/null +++ b/extern/libsquish-1.15/colourset.cpp @@ -0,0 +1,121 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#include "colourset.h" + +namespace squish { + +ColourSet::ColourSet( u8 const* rgba, int mask, int flags ) + : m_count( 0 ), + m_transparent( false ) +{ + // check the compression mode for dxt1 + bool isDxt1 = ( ( flags & kDxt1 ) != 0 ); + bool weightByAlpha = ( ( flags & kWeightColourByAlpha ) != 0 ); + + // create the minimal set + for( int i = 0; i < 16; ++i ) + { + // check this pixel is enabled + int bit = 1 << i; + if( ( mask & bit ) == 0 ) + { + m_remap[i] = -1; + continue; + } + + // check for transparent pixels when using dxt1 + if( isDxt1 && rgba[4*i + 3] < 128 ) + { + m_remap[i] = -1; + m_transparent = true; + continue; + } + + // loop over previous points for a match + for( int j = 0;; ++j ) + { + // allocate a new point + if( j == i ) + { + // normalise coordinates to [0,1] + float x = ( float )rgba[4*i] / 255.0f; + float y = ( float )rgba[4*i + 1] / 255.0f; + float z = ( float )rgba[4*i + 2] / 255.0f; + + // ensure there is always non-zero weight even for zero alpha + float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f; + + // add the point + m_points[m_count] = Vec3( x, y, z ); + m_weights[m_count] = ( weightByAlpha ? 
w : 1.0f ); + m_remap[i] = m_count; + + // advance + ++m_count; + break; + } + + // check for a match + int oldbit = 1 << j; + bool match = ( ( mask & oldbit ) != 0 ) + && ( rgba[4*i] == rgba[4*j] ) + && ( rgba[4*i + 1] == rgba[4*j + 1] ) + && ( rgba[4*i + 2] == rgba[4*j + 2] ) + && ( rgba[4*j + 3] >= 128 || !isDxt1 ); + if( match ) + { + // get the index of the match + int index = m_remap[j]; + + // ensure there is always non-zero weight even for zero alpha + float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f; + + // map to this point and increase the weight + m_weights[index] += ( weightByAlpha ? w : 1.0f ); + m_remap[i] = index; + break; + } + } + } + + // square root the weights + for( int i = 0; i < m_count; ++i ) + m_weights[i] = std::sqrt( m_weights[i] ); +} + +void ColourSet::RemapIndices( u8 const* source, u8* target ) const +{ + for( int i = 0; i < 16; ++i ) + { + int j = m_remap[i]; + if( j == -1 ) + target[i] = 3; + else + target[i] = source[j]; + } +} + +} // namespace squish diff --git a/extern/libsquish-1.15/colourset.h b/extern/libsquish-1.15/colourset.h new file mode 100644 index 0000000..e13bb6f --- /dev/null +++ b/extern/libsquish-1.15/colourset.h @@ -0,0 +1,58 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_COLOURSET_H +#define SQUISH_COLOURSET_H + +#include "squish.h" +#include "maths.h" + +namespace squish { + +/*! @brief Represents a set of block colours +*/ +class ColourSet +{ +public: + ColourSet( u8 const* rgba, int mask, int flags ); + + int GetCount() const { return m_count; } + Vec3 const* GetPoints() const { return m_points; } + float const* GetWeights() const { return m_weights; } + bool IsTransparent() const { return m_transparent; } + + void RemapIndices( u8 const* source, u8* target ) const; + +private: + int m_count; + Vec3 m_points[16]; + float m_weights[16]; + int m_remap[16]; + bool m_transparent; +}; + +} // namespace squish + +#endif // ndef SQUISH_COLOURSET_H diff --git a/extern/libsquish-1.15/config b/extern/libsquish-1.15/config new file mode 100644 index 0000000..da6de8d --- /dev/null +++ b/extern/libsquish-1.15/config @@ -0,0 +1,38 @@ +# config file for GNUmake + +# define to 1 to use OpenMP parallelization +USE_OPENMP ?= 0 + +# define to 1 to install shared library +USE_SHARED ?= 0 + +# define to 1 to use Altivec instructions +USE_ALTIVEC ?= 0 + +# define to 1 to use SSE2 instructions +USE_SSE ?= 0 + +# default flags +CXXFLAGS ?= -O2 -Wall +ifeq ($(USE_OPENMP),1) + CPPFLAGS += -DSQUISH_USE_OPENMP + CXXFLAGS += -fopenmp +endif +ifeq ($(USE_ALTIVEC),1) + CPPFLAGS += -DSQUISH_USE_ALTIVEC=1 + CXXFLAGS += -maltivec +endif +ifeq ($(USE_SSE),1) + CPPFLAGS += -DSQUISH_USE_SSE=2 + 
CXXFLAGS += -msse +endif + +# install options +INSTALL = install +INSTALL_FILE = $(INSTALL) -p -m 644 +INSTALL_PROGRAM = $(INSTALL) -p -m 755 +INSTALL_DIRECTORY = $(INSTALL) -d -m 755 + +# where should we install to +INSTALL_DIR ?= /usr/local +LIB_PATH ?= lib diff --git a/extern/libsquish-1.15/config.h b/extern/libsquish-1.15/config.h new file mode 100644 index 0000000..9f1f4b1 --- /dev/null +++ b/extern/libsquish-1.15/config.h @@ -0,0 +1,49 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_CONFIG_H +#define SQUISH_CONFIG_H + +// Set to 1 when building squish to use Altivec instructions. +#ifndef SQUISH_USE_ALTIVEC +#define SQUISH_USE_ALTIVEC 0 +#endif + +// Set to 1 or 2 when building squish to use SSE or SSE2 instructions. 
+#ifndef SQUISH_USE_SSE +#define SQUISH_USE_SSE 2 +#endif + +// Internally set SQUISH_USE_SIMD when either Altivec or SSE is available. +#if SQUISH_USE_ALTIVEC && SQUISH_USE_SSE +#error "Cannot enable both Altivec and SSE!" +#endif +#if SQUISH_USE_ALTIVEC || SQUISH_USE_SSE +#define SQUISH_USE_SIMD 1 +#else +#define SQUISH_USE_SIMD 0 +#endif + +#endif // ndef SQUISH_CONFIG_H diff --git a/extern/libsquish-1.15/extra/squishgen.cpp b/extern/libsquish-1.15/extra/squishgen.cpp new file mode 100644 index 0000000..1fcbd2a --- /dev/null +++ b/extern/libsquish-1.15/extra/squishgen.cpp @@ -0,0 +1,151 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#include <iostream> + +struct SourceBlock +{ + int start; + int end; + int error; +}; + +struct TargetValue +{ + SourceBlock sources[2]; +}; + +static void GenerateData( std::string const& name, int bits, int colours ) +{ + TargetValue values[256]; + + // initialise the data + for( int target = 0; target < 256; ++target ) + for( int index = 0; index < colours; ++index ) + values[target].sources[index].error = 255; + + // loop over all possible source points + int count = ( 1 << bits ); + for( int value1 = 0; value1 < count; ++value1 ) + { + for( int value2 = 0; value2 < count; ++value2 ) + { + // compute the 8-bit endpoints + int a = ( value1 << ( 8 - bits ) ) | ( value1 >> ( 2*bits - 8 ) ); + int b = ( value2 << ( 8 - bits ) ) | ( value2 >> ( 2*bits - 8 ) ); + + // fill in the codebook with the these and intermediates + int codes[2]; + codes[0] = a; + if( colours == 3 ) + codes[1] = ( a + b )/2; + else + codes[1] = ( 2*a + b )/3; + + // mark each target point with the endpoints and index needed for it + for( int index = 0; index < 2; ++index ) + { + int target = codes[index]; + + SourceBlock& block = values[target].sources[index]; + if( block.error != 0 ) + { + block.start = value1; + block.end = value2; + block.error = 0; + } + } + } + } + + // iteratively fill in the missing values + for( ;; ) + { + bool stable = true; + for( int index = 0; index < 2; ++index ) + { + for( int target = 0; target < 256; ++target ) + { + if( target != 255 ) + { + SourceBlock& current = values[target].sources[index]; + SourceBlock& next = values[target + 1].sources[index]; + if( current.error > next.error + 1 ) + { + current.start = next.start; + current.end = next.end; + current.error = next.error + 1; + stable = false; + } + } + if( target != 0 ) + { + SourceBlock& current = values[target].sources[index]; + SourceBlock& previous = values[target - 1].sources[index]; + if( current.error > previous.error + 1 ) 
+ { + current.start = previous.start; + current.end = previous.end; + current.error = previous.error + 1; + stable = false; + } + } + } + } + if( stable ) + break; + } + + // debug + std::cout << "\nstatic SingleColourLookup const " << name << "[] = \n{\n"; + for( int i = 0;; ) + { + std::cout << "\t{ { "; + for( int j = 0;; ) + { + SourceBlock const& block = values[i].sources[j]; + if( j < colours ) + std::cout << "{ " << block.start << ", " << block.end << ", " << block.error << " }"; + else + std::cout << "{ 0, 0, 0 }"; + if( ++j == 2 ) + break; + std::cout << ", "; + } + std::cout << " } }"; + if( ++i == 256 ) + break; + std::cout << ",\n"; + } + std::cout << "\n};\n"; +} + +int main() +{ + GenerateData( "lookup_5_3", 5, 3 ); + GenerateData( "lookup_6_3", 6, 3 ); + GenerateData( "lookup_5_4", 5, 4 ); + GenerateData( "lookup_6_4", 6, 4 ); +} diff --git a/extern/libsquish-1.15/extra/squishpng.cpp b/extern/libsquish-1.15/extra/squishpng.cpp new file mode 100644 index 0000000..5d45b0c --- /dev/null +++ b/extern/libsquish-1.15/extra/squishpng.cpp @@ -0,0 +1,546 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +/*! @file + + @brief Test program that compresses images loaded using the PNG format. + + This program requires libpng for PNG input and output, and is designed to + test the RMS error for DXT compression for a set of test images. + + This program uses the high-level image compression and decompression + functions that process an entire image at a time. +*/ + +#include <iostream> +#include <string> +#include <sstream> +#include <cmath> +#include <cstdio> +#include <squish.h> +#include <png.h> + +#ifdef _MSC_VER +#pragma warning( disable: 4511 4512 ) +#endif // def _MSC_VER + +using namespace squish; + +//! Simple exception class. +class Error : public std::exception +{ +public: + Error( std::string const& excuse ) : m_excuse( excuse ) {} + ~Error() throw() {} + + virtual char const* what() const throw() { return m_excuse.c_str(); } + +private: + std::string m_excuse; +}; + +//! Base class to make derived classes non-copyable +class NonCopyable +{ +public: + NonCopyable() {} + +private: + NonCopyable( NonCopyable const& ); + NonCopyable& operator=( NonCopyable const& ); +}; + +//! Memory object. +class Mem : NonCopyable +{ +public: + Mem() : m_p( 0 ) {} + explicit Mem( int size ) : m_p( new u8[size] ) {} + ~Mem() { delete[] m_p; } + + void Reset( int size ) + { + u8 *p = new u8[size]; + delete m_p; + m_p = p; + } + + u8* Get() const { return m_p; } + +private: + u8* m_p; +}; + +//! File object. 
+class File : NonCopyable +{ +public: + explicit File( FILE* fp ) : m_fp( fp ) {} + ~File() { if( m_fp ) fclose( m_fp ); } + + bool IsValid() const { return m_fp != 0; } + FILE* Get() const { return m_fp; } + +private: + FILE* m_fp; +}; + +//! PNG read object. +class PngReadStruct : NonCopyable +{ +public: + PngReadStruct() + : m_png( 0 ), + m_info( 0 ), + m_end( 0 ) + { + m_png = png_create_read_struct( PNG_LIBPNG_VER_STRING, 0, 0, 0 ); + if( !m_png ) + throw Error( "failed to create png read struct" ); + + m_info = png_create_info_struct( m_png ); + m_end = png_create_info_struct( m_png ); + if( !m_info || !m_end ) + { + png_infopp info = m_info ? &m_info : 0; + png_infopp end = m_end ? &m_end : 0; + png_destroy_read_struct( &m_png, info, end ); + throw Error( "failed to create png info structs" ); + } + } + + ~PngReadStruct() + { + png_destroy_read_struct( &m_png, &m_info, &m_end ); + } + + png_structp GetPng() const { return m_png; } + png_infop GetInfo() const { return m_info; } + +private: + png_structp m_png; + png_infop m_info, m_end; +}; + +//! PNG write object. +class PngWriteStruct : NonCopyable +{ +public: + PngWriteStruct() + : m_png( 0 ), + m_info( 0 ) + { + m_png = png_create_write_struct( PNG_LIBPNG_VER_STRING, 0, 0, 0 ); + if( !m_png ) + throw Error( "failed to create png read struct" ); + + m_info = png_create_info_struct( m_png ); + if( !m_info ) + { + png_infopp info = m_info ? &m_info : 0; + png_destroy_write_struct( &m_png, info ); + throw Error( "failed to create png info structs" ); + } + } + + ~PngWriteStruct() + { + png_destroy_write_struct( &m_png, &m_info ); + } + + png_structp GetPng() const { return m_png; } + png_infop GetInfo() const { return m_info; } + +private: + png_structp m_png; + png_infop m_info; +}; + +//! PNG rows object. 
+class PngRows : NonCopyable +{ +public: + PngRows( int pitch, int height ) : m_height( height ) + { + m_rows = new png_bytep[m_height]; + for( int i = 0; i < m_height; ++i ) + m_rows[i] = new png_byte[pitch]; + } + + ~PngRows() + { + for( int i = 0; i < m_height; ++i ) + delete[] m_rows[i]; + delete[] m_rows; + } + + png_bytep* Get() const { return m_rows; } + + png_bytep operator[](int y) const { return m_rows[y]; } + +private: + png_bytep* m_rows; + int m_height; +}; + +//! Represents a DXT compressed image in memory. +struct DxtData +{ + int width; + int height; + int format; //!< Either kDxt1, kDxt3 or kDxt5. + Mem data; + bool isColour; + bool isAlpha; +}; + +//! Represents an uncompressed RGBA image in memory. +class Image +{ +public: + Image(); + + void LoadPng( std::string const& fileName ); + void SavePng( std::string const& fileName ) const; + + void Decompress( DxtData const& dxt ); + void Compress( DxtData& dxt, int flags ) const; + + double GetRmsError( Image const& image ) const; + +private: + int m_width; + int m_height; + bool m_isColour; //!< Either colour or luminance. + bool m_isAlpha; //!< Either alpha or not. 
+ Mem m_pixels; +}; + +Image::Image() + : m_width( 0 ), + m_height( 0 ), + m_isColour( false ), + m_isAlpha( false ) +{ +} + +void Image::LoadPng( std::string const& fileName ) +{ + // open the source file + File file( fopen( fileName.c_str(), "rb" ) ); + if( !file.IsValid() ) + { + std::ostringstream oss; + oss << "failed to open \"" << fileName << "\" for reading"; + throw Error( oss.str() ); + } + + // check the signature bytes + png_byte header[8]; + size_t check = fread( header, 1, 8, file.Get() ); + if( check != 8 ) + throw Error( "file read error" ); + if( png_sig_cmp( header, 0, 8 ) ) + { + std::ostringstream oss; + oss << "\"" << fileName << "\" does not look like a png file"; + throw Error( oss.str() ); + } + + // read the image into memory + PngReadStruct png; + png_init_io( png.GetPng(), file.Get() ); + png_set_sig_bytes( png.GetPng(), 8 ); + png_read_png( png.GetPng(), png.GetInfo(), PNG_TRANSFORM_EXPAND, 0 ); + + // get the image info + png_uint_32 width; + png_uint_32 height; + int bitDepth; + int colourType; + png_get_IHDR( png.GetPng(), png.GetInfo(), &width, &height, &bitDepth, &colourType, 0, 0, 0 ); + + // check the image is 8 bit + if( bitDepth != 8 ) + { + std::ostringstream oss; + oss << "cannot process " << bitDepth << "-bit image (bit depth must be 8)"; + throw Error( oss.str() ); + } + + // copy the data into a contiguous array + m_width = width; + m_height = height; + m_isColour = ( ( colourType & PNG_COLOR_MASK_COLOR ) != 0 ); + m_isAlpha = ( ( colourType & PNG_COLOR_MASK_ALPHA ) != 0 ); + m_pixels.Reset(4*width*height); + + // get the image rows + png_bytep const *rows = png_get_rows( png.GetPng(), png.GetInfo() ); + if( !rows ) + throw Error( "failed to get image rows" ); + + // copy the pixels into the storage + u8 *dest = m_pixels.Get(); + for( int y = 0; y < m_height; ++y ) + { + u8 const *src = rows[y]; + for( int x = 0; x < m_width; ++x ) + { + if( m_isColour ) + { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + src 
+= 3; + } + else + { + u8 lum = *src++; + dest[0] = lum; + dest[1] = lum; + dest[2] = lum; + } + + if( m_isAlpha ) + dest[3] = *src++; + else + dest[3] = 255; + + dest += 4; + } + } +} + +void Image::SavePng( std::string const& fileName ) const +{ + // create the target rows + int const pixelSize = ( m_isColour ? 3 : 1 ) + ( m_isAlpha ? 1 : 0 ); + PngRows rows( m_width*pixelSize, m_height ); + + // fill the rows with pixel data + u8 const *src = m_pixels.Get(); + for( int y = 0; y < m_height; ++y ) + { + u8 *dest = rows[y]; + for( int x = 0; x < m_width; ++x ) + { + if( m_isColour ) + { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + dest += 3; + } + else + *dest++ = src[1]; + + if( m_isAlpha ) + *dest++ = src[3]; + + src += 4; + } + } + + // set up the image + PngWriteStruct png; + png_set_IHDR( + png.GetPng(), png.GetInfo(), m_width, m_height, + 8, ( m_isColour ? PNG_COLOR_MASK_COLOR : 0) | ( m_isAlpha ? PNG_COLOR_MASK_ALPHA : 0 ), + PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT + ); + + // open the target file + File file( fopen( fileName.c_str(), "wb" ) ); + if( !file.IsValid() ) + { + std::ostringstream oss; + oss << "failed to open \"" << fileName << "\" for writing"; + throw Error( oss.str() ); + } + + // write the image + png_set_rows( png.GetPng(), png.GetInfo(), rows.Get() ); + png_init_io( png.GetPng(), file.Get() ); + png_write_png( png.GetPng(), png.GetInfo(), PNG_TRANSFORM_IDENTITY, 0 ); +} + +void Image::Decompress( DxtData const& dxt ) +{ + // allocate storage + m_width = dxt.width; + m_height = dxt.height; + m_isColour = dxt.isColour; + m_isAlpha = dxt.isAlpha; + m_pixels.Reset( 4*m_width*m_height ); + + // use the whole image decompression function to do the work + DecompressImage( m_pixels.Get(), m_width, m_height, dxt.data.Get(), dxt.format ); +} + +void Image::Compress( DxtData& dxt, int flags ) const +{ + // work out how much memory we need + int storageSize = GetStorageRequirements( m_width, m_height, 
flags ); + + // set the structure fields and allocate it + dxt.width = m_width; + dxt.height = m_height; + dxt.format = flags & ( kDxt1 | kDxt3 | kDxt5 ); + dxt.isColour = m_isColour; + dxt.isAlpha = m_isAlpha; + dxt.data.Reset( storageSize ); + + // use the whole image compression function to do the work + CompressImage( m_pixels.Get(), m_width, m_height, dxt.data.Get(), flags ); +} + +double Image::GetRmsError( Image const& image ) const +{ + if( m_width != image.m_width || m_height != image.m_height ) + throw Error( "image dimensions mismatch when computing RMS error" ); + + // accumulate colour error + double difference = 0; + u8 const *a = m_pixels.Get(); + u8 const *b = image.m_pixels.Get(); + for( int y = 0; y < m_height; ++y ) + { + for( int x = 0; x < m_width; ++x ) + { + int d0 = ( int )a[0] - ( int )b[0]; + int d1 = ( int )a[1] - ( int )b[1]; + int d2 = ( int )a[2] - ( int )b[2]; + difference += ( double )( d0*d0 + d1*d1 + d2*d2 ); + a += 4; + b += 4; + } + } + return std::sqrt( difference/( double )( m_width*m_height ) ); +} + +int main( int argc, char* argv[] ) +{ + try + { + // parse the command-line + std::string sourceFileName; + std::string targetFileName; + int format = kDxt1; + int fit = kColourClusterFit; + int extra = 0; + bool help = false; + bool arguments = true; + bool error = false; + for( int i = 1; i < argc; ++i ) + { + // check for options + char const* word = argv[i]; + if( arguments && word[0] == '-' ) + { + for( int j = 1; word[j] != '\0'; ++j ) + { + switch( word[j] ) + { + case 'h': help = true; break; + case '1': format = kDxt1; break; + case '3': format = kDxt3; break; + case '5': format = kDxt5; break; + case 'r': fit = kColourRangeFit; break; + case 'i': fit = kColourIterativeClusterFit; break; + case 'w': extra = kWeightColourByAlpha; break; + case '-': arguments = false; break; + default: + std::cerr << "squishpng error: unknown option '" << word[j] << "'" << std::endl; + error = true; + } + } + } + else + { + if( 
sourceFileName.empty() ) + sourceFileName.assign( word ); + else if( targetFileName.empty() ) + targetFileName.assign( word ); + else + { + std::cerr << "squishpng error: unexpected argument \"" << word << "\"" << std::endl; + error = true; + } + } + } + + // check arguments + if( sourceFileName.empty() ) + { + std::cerr << "squishpng error: no source file given" << std::endl; + error = true; + } + if( help || error ) + { + std::cout + << "SYNTAX" << std::endl + << "\tsquishpng [-135riw] <source> [<target>]" << std::endl + << "OPTIONS" << std::endl + << "\t-h\tPrint this help message" << std::endl + << "\t-135\tSpecifies whether to use DXT1 (default), DXT3 or DXT5 compression" << std::endl + << "\t-r\tUse the fast but inferior range-based colour compressor" << std::endl + << "\t-i\tUse the very slow but slightly better iterative colour compressor" << std::endl + << "\t-w\tWeight colour values by alpha in the cluster colour compressor" << std::endl + ; + + return error ? -1 : 0; + } + + // load the source image + Image sourceImage; + sourceImage.LoadPng( sourceFileName ); + + // compress to DXT + DxtData dxt; + sourceImage.Compress( dxt, format | fit | extra ); + + // decompress back + Image targetImage; + targetImage.Decompress( dxt ); + + // compare the images + double rmsError = sourceImage.GetRmsError( targetImage ); + std::cout << sourceFileName << " " << rmsError << std::endl; + + // save the target image if necessary + if( !targetFileName.empty() ) + targetImage.SavePng( targetFileName ); + } + catch( std::exception& excuse ) + { + // complain + std::cerr << "squishpng error: " << excuse.what() << std::endl; + return -1; + } + + // done + return 0; +} diff --git a/extern/libsquish-1.15/extra/squishtest.cpp b/extern/libsquish-1.15/extra/squishtest.cpp new file mode 100644 index 0000000..e4362fe --- /dev/null +++ b/extern/libsquish-1.15/extra/squishtest.cpp @@ -0,0 +1,206 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 
Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +/*! @file + + @brief This program tests the error for 1 and 2-colour DXT compression. + + This tests the effectiveness of the DXT compression algorithm for all + possible 1 and 2-colour blocks of pixels. 
+*/ + +#include +#include +#include +#include +#include + +using namespace squish; + +double GetColourError( u8 const* a, u8 const* b ) +{ + double error = 0.0; + for( int i = 0; i < 16; ++i ) + { + for( int j = 0; j < 3; ++j ) + { + int index = 4*i + j; + int diff = ( int )a[index] - ( int )b[index]; + error += ( double )( diff*diff ); + } + } + return error / 16.0; +} + +void TestOneColour( int flags ) +{ + u8 input[4*16]; + u8 output[4*16]; + u8 block[16]; + + double avg = 0.0, min = DBL_MAX, max = -DBL_MAX; + int counter = 0; + + // test all single-channel colours + for( int i = 0; i < 16*4; ++i ) + input[i] = ( ( i % 4 ) == 3 ) ? 255 : 0; + for( int channel = 0; channel < 3; ++channel ) + { + for( int value = 0; value < 255; ++value ) + { + // set the channnel value + for( int i = 0; i < 16; ++i ) + input[4*i + channel] = ( u8 )value; + + // compress and decompress + Compress( input, block, flags ); + Decompress( output, block, flags ); + + // test the results + double rm = GetColourError( input, output ); + double rms = std::sqrt( rm ); + + // accumulate stats + min = std::min( min, rms ); + max = std::max( max, rms ); + avg += rm; + ++counter; + } + + // reset the channel value + for( int i = 0; i < 16; ++i ) + input[4*i + channel] = 0; + } + + // finish stats + avg = std::sqrt( avg/counter ); + + // show stats + std::cout << "one colour error (min, max, avg): " + << min << ", " << max << ", " << avg << std::endl; +} + +void TestOneColourRandom( int flags ) +{ + u8 input[4*16]; + u8 output[4*16]; + u8 block[16]; + + double avg = 0.0, min = DBL_MAX, max = -DBL_MAX; + int counter = 0; + + // test all single-channel colours + for( int test = 0; test < 1000; ++test ) + { + // set a constant random colour + for( int channel = 0; channel < 3; ++channel ) + { + u8 value = ( u8 )( rand() & 0xff ); + for( int i = 0; i < 16; ++i ) + input[4*i + channel] = value; + } + for( int i = 0; i < 16; ++i ) + input[4*i + 3] = 255; + + // compress and decompress + Compress( 
input, block, flags ); + Decompress( output, block, flags ); + + // test the results + double rm = GetColourError( input, output ); + double rms = std::sqrt( rm ); + + // accumulate stats + min = std::min( min, rms ); + max = std::max( max, rms ); + avg += rm; + ++counter; + } + + // finish stats + avg = std::sqrt( avg/counter ); + + // show stats + std::cout << "random one colour error (min, max, avg): " + << min << ", " << max << ", " << avg << std::endl; +} + +void TestTwoColour( int flags ) +{ + u8 input[4*16]; + u8 output[4*16]; + u8 block[16]; + + double avg = 0.0, min = DBL_MAX, max = -DBL_MAX; + int counter = 0; + + // test all single-channel colours + for( int i = 0; i < 16*4; ++i ) + input[i] = ( ( i % 4 ) == 3 ) ? 255 : 0; + for( int channel = 0; channel < 3; ++channel ) + { + for( int value1 = 0; value1 < 255; ++value1 ) + { + for( int value2 = value1 + 1; value2 < 255; ++value2 ) + { + // set the channnel value + for( int i = 0; i < 16; ++i ) + input[4*i + channel] = ( u8 )( ( i < 8 ) ? 
value1 : value2 ); + + // compress and decompress + Compress( input, block, flags ); + Decompress( output, block, flags ); + + // test the results + double rm = GetColourError( input, output ); + double rms = std::sqrt( rm ); + + // accumulate stats + min = std::min( min, rms ); + max = std::max( max, rms ); + avg += rm; + ++counter; + } + } + + // reset the channel value + for( int i = 0; i < 16; ++i ) + input[4*i + channel] = 0; + } + + // finish stats + avg = std::sqrt( avg/counter ); + + // show stats + std::cout << "two colour error (min, max, avg): " + << min << ", " << max << ", " << avg << std::endl; +} + +int main() +{ + TestOneColourRandom( kDxt1 | kColourRangeFit ); + TestOneColour( kDxt1 ); + TestTwoColour( kDxt1 ); +} diff --git a/extern/libsquish-1.15/libSquish.png b/extern/libsquish-1.15/libSquish.png new file mode 100644 index 0000000000000000000000000000000000000000..7f37a4e0589c6eb3ef300bb7bfee45b0013fbcd0 GIT binary patch literal 17907 zcmeHv_dnHt6#v_nO@*wi;zmd^Zbo!Pwo92IvR7922<1v$L`hbuODd7QX-FX>BCai2 z**oidZhgM%KluLey$^}|c;DCiHO_gS=Xsv<7HgoV!AQqNhad>!MNJGAK`1m4gzPje zHM~QgA+!kpQTnJ|G^T}D2<Y7Eshlo>g6hD4bpZ&dU%hMohb&bv8c<(3Ki+|S6ksr{aO{&p3ep*t8HuXzO+JV-D zXa=*}zkf@e`_cLVU1X%KdJ!UX=8PG_g9spO5uZKlD2DbI zvsC8oYQx*eHOZ!$Jwq472vsqY;TnWn0<#U}AO=-DxI#tJB zBIQ-sB+fC%9gl5v`R{gJzOZ8_SFIYd$VC`?0Ro! 
z`%lEi($kGI$WQOe_z)f1o9Fl=&hhh;-;9uCp`{JaY@b{q>!`GpNNbt+W%xFU% zVa{XVE1lXp<%?F5O=-E;Y2HgG@Nze7pSFJ3dURtuL+5zX^!&V@NYWP_=~RqhB1h^lh?O0cSqi)POR2R!l8%uTjDKrlu_>@U`>$?&h<* z$oik%y(?$AtqKWcH%_Q;lqco^E6G$>vb$_Y_AYrDy6iPlK(&vc6_8b zL+BfVx8@|Z^6vKSU)i!zC7FPCUQSMq7Q%?&+a|n=jqXyncNbT&f1aI9e&)=X7y0?>y1MuH z;8v4UozJ@9Eiz(R{`~y>LpRy#OIlmiHd31wh(3z&Sa)(=@ShC{fDIi|(?8(yIps+u z;y@Mk?xOjujjOAxOM9l$-e9%A$!}tLd+)%&-Cw^Po?~kkvhb}78_%k$xL|lrm9UWc zsb}%?k7S&&2DGg04#nYgF(34i&4Upp$yl?lz(G7!EXffyA3uKVg0zxLi9OKzs-4@l z68X+1ibb#+wyxBF!D|1_mFoA~>vJm`51+hwBhmFX4|B=H#BTZLmq%znm!hXn1p=&GtJJup0A9JG{DSKm{1_x4V5adFuuiRd$t;~b0oa^H_GhLde7 z1zkvOuHyf99K32qMoG*K*N|eeRxLOZ=lu9$@mF=#)n}{y7lu~xyl2y|RNv^URToXw z;nLHUkrL9t|K_!yd3?(5!b>>t6usPiPP$usWQ(qZZuYJtG2HZQq+^cA$vJ+&ChT~U zS@+;z=-XuP%hIVjb8x!Vq1)>nuqCfaQ(hRVJr?enz(W4e{$1Hz-C6|GfOt#f@InlH zwNFO1Pi}vHtXAvFv{pB_D_6VCQxU#k@x+n!howY}o9uRCd7m45s!qB}=Yj3~ZIb~&Bl_2Mnb zCJoJ8TwIEf^!s0~u@kU)apEnHOC8%YyIl)vFE6bVJ!#53hw;8>ceI1*^W+yVUYxVH zKX&582^OZ9?PgtYNQjL$x#j$!6+c(^h!o`? zc6XaX#Zy{TT|wP4VvZ$U z{AhdX4LNP6$*im;i810QyHG3e~;C$ z@AL3k->ZU&rY>eZC@{X+u-&!Z4;E!^2iyj$zJKp>&_-?3=`RppzC6+0(_{PdbK(nd zXwgHF$wi)7Luy?X(QZU#r|8KB!%T|dg_DdEKR-M(iV3Rs|Aj8Q*$Gq0(MgR|@cAVie|EmY7oc`=D@3J8e3n}) zZ$o!djQy{Vo^S?$6v^$HZUKxhRl|7|)bCY=ZB3+*=IZs8;H^e$49?R1!VnOk@zE(e z0~3>&nHiU8H38Fy-!~S5wDlLVvulOWwH9!PYRh}K^%p*O-Lw!)Jg3OFAZAo)`KCa} z&(BYQJ??7nn=3y`T{+Cn%{##5Vag^PYZUkRl(>8*+rFlIeL6B|J}bi0v=xb+IowrA z9rxG!wDgROx}x1=gSC$*IqL|sJx#F+Lak1!>?;(y6J_lj7k0x`F5r3kOd>zUAC31K z|MWB`hZ;YQAmS}*mo8lz9_BR-YxXkEHRd(Vx&K8NjYf9>F8JqJ9ZymX4nBKhVd&l* zv3%ojE@?h`^hX(}PlfO83cfZV zC2ROIgJw{{00SQ$e<%{~{{H>y`{)AzS!{Upj3{i~#si)vFM}&r?y(A8XaFk}{78}wh-Hktyu3Mf9~L&oqYvtJa9Cbf zwba_ekITPG#H91`@zGZYxw(PLuKgI5ou@P3O{{qzwj@WsTFN`kGjyawr?LL-c_S*O+8Ixa>ehGA}7NIIrskVYr>j0O%nipKPLe?UdZX1Zg-M;;M3noB1l>-Nl+x%IbCQU%?DGR&_QR`lRzq#?{%j;JpG-T4h zP5Ta0vew2Obd*kwPtFNYRCY^+`0BOtRj#S&?va&bcVY^Pp!n8S`Y*TzYJpAsFh1dz z%VN6~Cs{YO{kh>5Grqo}RB> zs1y4(y&9#jw3fX4GfCgPu`c>E1gob{$yR=S{cx&@hXQr%xL!K$86HhHdi~MHfy7Ay 
z7WI3${^&yO1|K*j>0A|8T2BYemG&lRGJHkiS;$)+p466=k=c3gmEMw!eIvpD?^J`} zRBv9B-l}bDYt!ZnntFOC9}e9ZbgkdtU5F{_b2H>ChP7!w6n__e;Glt-S?r%bLEYWm{4BJGCYEArFs}FT zc&6V2l?SVD%&CF(NQN!ZD{lWTR-cRfRGXjmyH3R_Wl6H zu&>Nvi&3g6LRsxGl|GZUU+y1!k(c-GcVF=-mu^ujw1&Mt?M-kOn~ylwM`v8w$Cl(j z-Q$jllc=H%-x)u;1iQ}^6d6-iCR?-mP5a$JJ^9gMWY5Gfl9>!o3MYxKKp!qW@0y0? z_-85oh~*@GJk6u(000qbKrsHe#yyKyu6h!BFcvO9XP+o^nDMW>R+?!)r4nI} z6G&+}?9Rv`_~)Ffs~8>c;CJ)$6#*Wh|4l~VW1X5Ob^C;O!1z76#kR8xhZGJQ2F5ko zY6`OjY|oX`_YcptFnB%oBys4I^~3eSQpNVWQyMDoKgJ#$m1jDxpK1B=8fJpz-1;iJ zGJGM6s@J4bBl)6){^qe%MdGd|@;nlozCJ+g249$!mDCq~}MKpV;1J zlgij=%+XH|Pg>B0iMJFbv`koTfB4Kw%89&B3VbAa94~U=&>zmZxw(qT%;4+qs5qYF z=Q9>Ne0^WxH9j(S@CWA2HweilR)JItb=wl)z5*qsrHnI6uw6HSo0n@Y-TB8{2Y&Yg zH+-m^#SCF`gyzdN>CI{V@H>-P;e3F#he>GS+?8{6>|<;?dHR&pptGy+g@P)vqXw>o?$C# zoTnWa+En&KNO%rjLV{$7o?nVE_L6+@cBBYh=FYu7KN#0^G@PfPcHP`iOIgClM_-yBNSFV2e{_aNHje#Pd*@`=hA82DfJUZ>jq^B@(2qK{X57V={ zI=<4<(n9f9rurH`&COk5C$199+<%mN9)(c62!Jp;kz)M`_~`AQ3L<*xkCL&cza<>t8&N1tD<&LLy5A}`~1f5c+QNr#&m_Qx5Zz4g&43X#kJL5J2yWPrZE9M(~v&5w=<~%`xp8-YyXiQ{vy2q-_d1}6Dnc191iYl@Pws`x znyi1fUE^sot9RRv=hNabV=G%yH-4LffjuQnFh}Pdj#Y?18pH&Q zx;hnP5-fDLM&y~`nD`rfzWWALS~6X^as|>|_;PY}JrAr8n|G+B2Y&rg=_3Niqf4S& zXK(zze(A^`srun~IFVOkoDBHOT8+As3Oswp53@YU9~qC-g)>J`V`?NMc z=-49Y$wK@%v)UQ&t6{8F+c#$G$pWymnxphDSQ5p4l78|-UI^b0?(6Klif+^R z>8&$Gd}ERs`CUe zmd?JuSa)Js=ubPW7O<%G!!N3;&^&(KhA-Z{xgf9PrC93_V{uNa>)~ncj~t1tL-P69}MsJ z_tsqoC)L&9f@GXJKR@5e-Cb#{o{^F~l*ygC_`|XpnXCHB> zHB7F&Ti6idWDw+JP}Axw*Z0gr7up;UVCoN19@f~K|6uWU6<_0&A73%ytpJK(6rG-I znmR~sCMFzU?2whi7u<8Weqpy}mHjz|wjqdh5QZTcV>+Q!>)AqBW%x))vFg8S))Pc9 zE}s_Hcn>UNB8RJ?{T8mR1O3en&A`ey@MmrI=QPW6Srj@%Q~kZamNKew7gHs9dvulWOXN{nUrX z7v&733^7r2)jiL6=1G3J6mLb~obpjQ^xo#U&v;Y(ut;>k58sD-5Z|)BjVVSXRtK^$ z7(U<04F3xNo^JvnC!DT~S6s%~q2o=~@aYVbwGuS=%b}YmBkHg|Ht!U!8^_exuTFJV z{F!-m5ZkPsjLqB>TplwLJYiB!$I5Gmrery{yi+zc2F`K{y$07$UY1t~-m5 z*3<(S1f_LQYq`M?JJv6~x~y~^;@$CD@q8q*uv=0JLFrO~d^um2EyvHpUwh?E&Gj#C$>#ipt!reg917E}R7$ zP>9CJ++{y$p8QxPd~X}uT>Jo=k+Z@;4ci?!n59F*FSuc#ReBNCUJJ)N4z`pt&hp^a 
zEqaa|h;OY|u+3DEjdTV6RTOGGHZRKw3+fwa)uISBzaZ1q1*mw8KX3dEXiM_%vBtQK zizs~~tLk<^;dpD9sei4n3cIprU6Mm z>zLtp`I9euHkK6Q#VyLj!K@Ut^5k?UQL-c~bX2qvJmIC42dVI+qJLHPsG09p{asz^gu;CWUQ%HG&~J2kqGcQ$P>wg|v(JTlT(?SH zdq24o>r_!|VqJl^*+>WQkfrkklEwG&{7#jr8Se9LGB!K`YgO*>^)cwv`P1^4OhZsV z{APO^-HBF&;%EG_s0*;Nv?|+ zjmwaB5=s~|X`h>Cf08JoV3214PN8Q%1==P@bg4tH@Y*=2I6yO2g zLp;GovQB3>RxEvwz%{EgG;kp4ul%CBf)BDH=d>zc*VXOm_gebs=1z-aJVrLQ-`Y0b z+5GX=?&x4m;EK(0wPkUVP52cc6$cxP^M(AKc}M+ZFx?uc2M@4TiXgViNz^R9Po7r9 z9N!924w*5^x}*O{YBc4c{h%8<9tCD?d}oMk(FDnH=2g&}{hU&Qmq}w%$IL%Zp1F_B z^qI-a^-vkVdpdyaR`Y!p^4=@{S8V%Bob@lF3NC9+=r~H>0(;TwltHX%7Asgf3usxD zdVKi>z2ykT)ydwYTa#JCik?F?MM_f7m;FFZ6ahJRwWKSfmf&cH|~vCZmQc}4Zz$#CQ5#y%vI zRWPWW#qcUbrOG7ijC)H_+m$jmGdGa+!TT92ecBXqw@bS8h{p+vMyLl;8BL6T<1_85 z&X2{yX@1O4E3G_^lPy)(0FYzf+BE?L$XwGqJ6F=GFqvL#p%qw#efiawuY5%9EeRqi z7u$2r|GbC@tq`vvxn#8Lo2GKKtf3cYKZiEbX~B(zZ9JikhfPYrK>>MUrWHLtX#WNw zt_1J8zB9;MYocNK9GP>TSE9YnW(C-a)=1f=M z07(>8)SbmqFRD<@5AkDsEl;hm7&?QgbWMWFvj6t&+f&C0GQ2TP4WL98f)!WTn{k{V zSiyOGC7s8i8W~(@F8fI;)(0hD)8xX-%F5P9&$#w{jC!Xwi4))ovunaP&U}A0j%^lu zfUTFyXqx-`u@WZR;Z72|9rFZNyZ1~Zey{W8rB`D&k;y8-jaN{uva_=r@JhGW$`jF< zTp}mAi~oZh3YNrjO_T|qTWS1ex!YEQ7+*@yzvY>W7ri~6A25$ z;hd0UthrX;I4BI^Br#GXM43-7aXb3M)qxvC_ay^S-=^D)+;yJl_>@JF$D|KLBx4m& z6W>WO)~w9!h&Eg0opJe!xiX?H)Co5Ryqb82r8AmbH@kE&gowB#md-*WEqg%J*H530 z=FJu~u_i?Q&t8D~IwRVvXf{^ymiLd(xY`O`xVzZ{LRS4XbUX)?O$~c5^#is?~hqJTadeFGNN$JY1k9a#B~Lc z^y}}_IIKgwntl0BNUG5efMOH-B}rtzJlQ)HtCiOF(;fikr7O&~kl zK{C3!4K^-ihZZQkz7}&Nq{*`DY&c- zND@)J1r2uzc8Z~eGk);Q9~p2XeT13Ny${|!dgrj%*uS{1_=2PYr%22diqGm0x`{a} z$a?<1d}7(C{-~~niTpT*_*`ymw4P}G(+-R%a3Fs ztPPds8aL9I&SNGnqSlO1_TZ^@H7{Q_`|>PD0M8{cWZ)%@T6vB@erQOsYUT#nCV9jc za}DKO2$FMbiH6*=Zku)E^46y19#;muc)GkkH zb1>o{((TM!sXhBStoSVsr(6gOXdCMHb(J^f`c0)c0i!IPpbYnSz_K}DMNM#b5zD^H zodM=2{l2tO_tk^ak0PSJkG8i2?WYP<$Z$+6K1|2%Wl~w4B@bg7LLt%M-Cd#+NRQmJvPTAhF&ik(#V&#fb|}Ur}2m z#L;nbaao2M=uywUqKTRqz97-oacR0+Y84<9+d+DmH4Gvn1ssA9iE>~t*KpVqB7~YM zfR8P)f|?>z6NLngzOui33sT|wX76iEunQoHt7_$yxD&O1wPjS>gW`_r3jBQ6Mhf@+ 
zl}Qr`sl5Ou2)@oWlYi@%tVeQ^XCO)PBT;a$Cc#+y($c;O1-o#&6T z&^qi(K%PJU4NZ?CggFw|*MOGkfxrP2K;3sJaMCY)>|j=azE)rv_D)xIHN(H5wc%o0 z;*IetzgadoiJWl*R=^$pP|(ijlCcuTT7Hk(&L91!&HuCo0}c4>m9F_W41s+*vFM?v zpZ{O!#rYT#4~D{sH@BR}j*GqhM0w9^vx88^hOx=;O6#VfK%ExaQ=VOc?Hv7EDrXIB z0K-~2)^es5eg9Q~Fnmvwn;|YtbZS73f7ryb(;m$nT0^SdZ7olnLQU8L zI=~utKl~e7Kr7FKSSAUY=={URI1Zm(o)q-Vi)V#FYcmze%hH5yBZw>e|4sWY^yLuO z&FI<1e^lDMi@XbWEdRf-R7qP6sL~EC1Vw?ai@L~okn1ao$yY}V&E`;}%)VA|V zV5FUp7nHjPKfun2z=kyZH*zx8B^Q?c28bgb@~p>rqjgXmt`ptM8_z%OaFlcVu~+~T zx%$ue6m?LgjYY40K;UP8RZt+uo9m{4k0PZfYEKPT@n`2Z@zaXN^8e-$Z=_Z0WY(iD zcKNmDU2eHH-}E9O!bf0ub`XsNY`|rJ{dbJ~bD0MSq1%&?tlj(CED6r%42(=kPWIZo z^u%Qgi|tz7G}+K0Sfb+Jp|jcEhVA~@Pr!y%0Yx%2Gjm4keOZA5U0q0E2vn8g-G*wm zx?tK~{|%l1Mf`EqCa-K@gI-NuI!!1uHCj@hc*^qSQeB5nb$(Rs-Je4~RGc&l`g@x|0M%uacL2 z3EiL)I8FUG@dNNHYU2H?H&rh~f3>FOj8_bSW89l9sISx23cQV-xdIml{C7O8gl-Fn zwZTxk`UHB1Zj)C&)Jq3^yL5T6%_|TAVbUM|n{+E+6r{*Cz=l5pn(H)qWq_{w+Vrj9 zal+1X> z(QAD5u(TK*HnrK>>p z!MJ+fK<>c`0fOhBtxA^%uKWrX6BFC_0?i=SM#>4@R!AT7fy8Uao*GQy8M$Q@-O$Xb z|MnSEvC?*SKL~2>@eZpS?4jtvbP?CRi^6djVYwm`Xr2A6ETf^0L_UODPFOl zh|oetDP(DePBiC%7-mG$M&u(JVghNzC{)MhXQSePz3OfJZw3W&}s&oy8CM z9zt!Vn-u5A|3#6Zb~z8eFrb<|BUi^_NZ$~Mydl4?9Dy#5y?uoyVxM|(0paUHe{+y$ zh9LvbLHpJB@88#mBtknMAHHMfjBYnr0#7V<&|JlCXr@KdT#u@s7YJT$rMqTGZTxWxv!HNoj+MI00F~j3-I<1LZJ4?hu{5Zj7ig zG$tTF?9eoo)~G|z<8)~j3lFGa93L~O!e(T4%**4Yc(YkRf-#?E$X&_V6TKBF)9#DJ zpB?l?V4US>Sz?(|;BQd^MmA0NahvMU$QRFLD>Y0LIj1K&rs<>2N0&dg`c5aZRii)H&Mqq*OnM*`OQv868$dU-^BU z-4911SUQKFE+#%nY;2W26pg!_?MOBzPa>(g9 zTW_!kWapAsh7k)p#2!29w z31mM*S~2bI`Z}usI^JH5w^*qbFF{Apirpz(IWCH$qSaAFOSm^Ua*HB1zu9F-ZR9p)Z<_k8=i z1!=@x4Wq(5vQUZ-xbgT>HgijXsQk}KN+p+Y4X1LMzHbU!f{D9t6^M`8sy0Vag7)w#u08L zEUW%^!snG}93RDqAHr=|H8Eu%LAe;o(%JrqNlgX}kBJnwIi;W|va+?F_HV`~b0}z8 zOA$qK8~n65-_ijj{+Eg;Ci57^S23cAOh+c8k%er8Od$J}4voN!WWJxpp7$C3x9%1+ zQSThO6h3p;SS8eChF44=HRK~_5f3scquI}*`9epE-xks-ZEzdxXHeMOgQPQRHQbAL=JFysxIT5!Ww^#m_h z1NRh4t@hTGU%9SWFs&o<=qfr|LszxkEj&JrNzFK-UYfVT-h*AR>K#QvpYbm!?^(s; 
z`R(BtcRxlNb9vA#X7;d3wzXlW{^)kwk(nlZPHO|z(EbA1BdU=zU8*=`J$@FaFDZ=r zyywyQBno=AwWQp-%IS_0e2oqI1AqE8bVSovIo=4TC($@OnN*)I6Nfo1yRdhC*>(8= z#(2`Fp1r$eo0ld3g)H?(_B6uR)P6LORsSwezYH|_|9Ps!!<3de;N0LwWFMg=joz>F z;Q6VjALR!c@U~Q^oP>ken@(a%Sq!yjHT_Rfurj_~qMG+V7~y#5$Tiyh4-Fzu@_6s? zW7K;d)?0ox5^q_K0^j*EOZCM^<;m*J9rDsBGtOmen6Bw-#T;7JjOh3DY#qgg61_M0 zC6AbmgoT3xTrf;xU=_%|wWHH>NQu^Psq~hbK-Up!kZ2Iogcn&gVX;@<`yyP2BvvhW zE`h+sXz-LOulpY0)r+J3D=Du6wWK&1K+YoFH+Fbxh+UTV38v8Les3Wy4_A0@B1g$Z zl#(A6R2rjL)I7?(XJQ;d&l>9|pxh@*R!JhQi)QZEio2;K^w19Tol zS!hMe1?S_nlGIcrt{ajerFO!9Pb4zHapL@TdE%tPkR?DK<|BHFs>^E@nA= zNj7b+#61|}m@9{dW`quN$mRJL7K2omcNiQ#Jf~}rMp~kAd)7#~D{r5j{6SX3FW=D1 zXSm%y&tGM9 zN)B}lDIxju=yYWAMkLsg3|m7u)D_*)v?PuZ*HiUXn+lax3XZgl$BJr#{f`)~A$-!j zk+|mjCJz0=+2>t-c$$1~hZVbXLuY9{X~hJCggTlAZjN_V?Y08hO{oyd>|-3gV-;H` zSIz`GQ(n}HJ;&c+i>3*T=@oH~=t-0X7p1eg1vuh&cC%6_d-S{cNSMzp9Q&EiESlpZ zrPJsVeDc_DGwoZrOndZ4{=0I#4K3t18>}Sn@hjky*0y(V)y*3mSq@QcuI6r-KwNZa zmY#4!l(;af`QDdB@C5fQJpZ6|_n{9M+!S9(q3oa$vUh$K*UQ{hq&{^@8>@GqXQ$=i1h98fCG?e zFmmS$okBbxw~eyNk!?O@F{}sDM9~3tCPkm`&}7SnaNHq;iRDF7BIX+ACHU9K^+Q1t zj+(DZ^uK?oP^1B zLKtK6NBGQ6LMoC&{(_Q4*~VvN?!76^lkA32jYFqtSq)V}D3pqwynjA#a)jMJrc!m` z=MDEVcX}AHrAVCY?>bIu(tJbTCtuVG6T7H}IZAQe@Mg8i;nVc2orzAof!4Sw=1!jl z(X|>TNt(q7Sj}z+BO`I-hWuZt+%CV9cXeRqn*4hJiv9Q#!|4cG+DX-0w&Idb_u zsePRTth>Wo#X}flVRH>{WbZ(O3WOvkuva~e_Mhm3hE^ggTu6-vnzA$TT5(^rJ8{(b zggv^ssmPtX&#kNrd=PryPr!;kAa7LlQD}M%Wm@h7Bc2H6w(JqZEkFY4h?L-caZLAx z_SR>Yrc&KY{C&1}nRBxX4IK-_xwhR&HA2p4NitJcbacvucjY5@qFq06Ftp7;E7nq? zsb$vb9AI-g71==iOxAQ^Sv$#q(UApw^~K+06MSiCs=C}NKauHKJ@uPr7mz_49NIiwG9O@+`0CSCjWA!uApWSLB6PX~$St z$%asxnEWj&(<}F=V(AQvgfzDruf|hd7mNtpnlly1K5VWbec?l5S_rRF+$B6ugK|0S z9wbz?=aWGz<`8KW&?gar_q6v({W0zM8~2YKGfara-R|(Q3-x40vo^R@>~r}^>wwe? 
zbw2WnV5IsMnH#2km()~g-?Xr?u+mHm!NV0Q1T~F5R2Okh$091_1Fkt8N7`Rc>AYF- zNA|eR%^~Y%Vwe&rm$eBrlhPA&cxua!M}vYTr#OTMbqsoO7PxcBmY}&og?iCqb>M6f zL>>g>2OhG#kbBVZ^_fY(Rn1izFZF({{4c-LkB&OK>NmS!iWQ$U9caDx58_tczq( zNd+O>uMY|Sd9b6Ff{~$B!%#(_rYikQ{LeBqn3UpolD@cMd^O!iugfzwe1Fx5#&?~J zo{}QX9-cTb@?m1^&7$JcGS^@OQXP5op0F7y#hK`ME_gy$4WltSBIqj~^oCl#o590s zM@ik7ssH_G;jBn4h_ud#xTXxh)lTCF&*d>~XI|7SgoBd^UBeHu-#$?535H028k36C z0#0~js&aV26d7k|63Bk%>dzzMi2j%;k58hC=XH*#31DWc{}LCUWd!hpCL1{0Fi4IF zo@a@9(al3O5>m*+`rwXilb5BEDh2XWn0GlHrE?UEVQpFrIET}F^2?Iwkfk`t`+uV!nda@seK|W zPGY*dHSGl=qkg2Qk95!}o!f$%*3BXADe7Bc%u87QWlDna{7~@1S9n~`MmFsX?_?o7 zK}I^ubD3hA<{Eo~m=daU`P1UZK3-D|qG&)-XARF<^`f(5rYnc245n~dh*Wlkm*uQ` z7ukU3tY!>Lrvvhgy5d#Ra5I(TN8$F*4#gSJj2Zj!dPXjHg$;Dh(X+wEc)mG%H&=&7 zRNw$2c_d`dkx|(n#aWg`^*1yz^{xbR*5@)W1a^^GO5Dy&x`4@qX5=ya_|OUm2=V5w z`N`f3EF6{hP3XVvpW=NIt@zwe*m-d)P)b4mLugTib%D_MsKMoGIybrmdcGkD&8mc*@g3k%n)n2jAxXJw%6zQ5a1yW zh>A?=yAONnv!Cm(hP7!Rzd3(RuHL69#s8M$eV<=~-yv&=%kil&7KvDZ1YL$j4Y%vT zr`pUR+{SQ`;@21OnlniJW2Vuoo`|Uj8Z%@v+NN$d`1)W&=!f0sfN044(fP?WoZf1U zL}wHC`hrFLrbxa}J_Hd)P%k}i_^95;s=0mz6ZzE_nVePWdZe|j;n$^V#4_9O^oJOavooP-7|1f!THK z-wgzg7f9ww5~C1i8Zky@-!-oNIZv@kgA|wI-A?iTI%lrIM_ovcNb3IyfV)A;Z>p*= z^7CVTTJwmsJK`a`b!O>_{v29ZuhBys5`o86#6IyC^a0M z8x*f3L}Y4*L1P9PP?r_N>J-nHdr$|Xj(|e{>rV7oGu7Erd+WWQ!WflQ-wXkZ@+3s-DD!G^X)mTfK8<5Iyom&gK<1;$){! 
z(YPh>cik9)69&QXG@UH(*!Abh(1WVDq($+o@|G>L#_VUNRi1N0Co#UqBh;r*y0yfB zsHNLRXM^S?X9gi#+^|YTU!K*Z54u3vOHYc$m2goP84uLPB6(NE70q2_B7`FxRu5&D zcdVS*A;6aFP@m<=7gC{F^byFmLo8`WWL85maws?*r7aQ(9mNLo79{D<8VO6ulDbXz z-0!D4l0wVM_-yjbwp++zu@!TJr^FS*x%2HCZS1Q$6#H4`8uACl9z>QIh$f$c6+j!e z6sMakz{4PUuFD^-Z^d2~#cFgi#$r7 zP(15AJ*p<6yxHgCwRQ3KE3!jDxg|?2?xlF)4j*faVf#BIS>nRjNUt0Byo$p8Pwxs^ zcqPKF)R9MsU$-vcsPQDLvKqhYVb{<{Gzy-;D_5 z`Ux}I6io9FbY?$i={QP(Tb@PWS;c3RB{8I2lBvZj-2>r;gl!|hZ#CNKhcoZ=g{xii z^s74ddKiqcR=*qZ#@vfH|wH;BzNQ&Ly1pO?k|9SG8TCP|i zAs_I_Hbt-lPLfBUie>^nGNI3Apg+`DPrX=6eyDeYx zJX!g_DbaOBXHJLEyrfp5=Q;>Emz6|H+@yw!Wz)Yr8PFU<^;vRx6B^NnK_qp7J{4Bh zR6tZ){P-ihm*zWYS_M`Jg+as7I)*0+jpy@s>PUiKPN|2_I4v|mDQC48Bx^lxQyYT% zmob07kSBS-^M=6!Ho+kCzoQ-LnE3>0BSs=cnr{o>l1BnfR|)ftn| z$Q$-Quk6I@<(|$9ZYu7dyJ383tz@3J_4rLn6XQ!8;W*sUtd*rw+~&xi9;D$Ode#?R zK3}hlN@$xMdgsC&fDdf@ypM%lmOP2<@5#*7o zz1dKz&hZ5+E2}WC>(@KN7DrSkza$0CI}E}D47Ihjxq+GCM2~V6@jGK1=C`F*DA^Fi z1pCn5QHCoNd+ECd)UI8WLN*Hny}VAxe1;C}MriT1adw_gxlz5VXLx9rd&}0!l>7h^ z;cEDB;?;-I#TD_Eq4|yNd%L^4>m@^?`ETBQf~WFq0s{j(TuSfYHgiPjRNN}1Z&D%+ zr6*dJA9OmcdSPn9eSHt@hJ}xAZpt4|dX+&tU2K72B;<6cu0V>FHHXPELB=xWT?3s?4nyn5j|F8`s#_ zIJFkkIlE>HCFAK&pFU+Q1?+^D6L+Nr*Ke_Q@h8AWRTYU_<8g8ll9JZkOflS-F6GN| z-J5{BehX0CbA)8CSiFUuo!$K21))#lOIh$hYq@w!82p)#rohboL#)Gx)kn!qbF}!$ z5QA4m;=!zM*;tcso28<3{*+j5SSyk+kQWSp+N8l<0B(n{{e6oTRuO@L&mh9Eg9tw> q;1?2mjvPTWN#}4*@c*Gpcj)E5g&)yRC5Xcuk&Ehjm^W%Rk^cjZhPf#K literal 0 HcmV?d00001 diff --git a/extern/libsquish-1.15/libSquish.pri b/extern/libsquish-1.15/libSquish.pri new file mode 100644 index 0000000..0313db0 --- /dev/null +++ b/extern/libsquish-1.15/libSquish.pri @@ -0,0 +1,26 @@ +HEADERS += \ + squish.h + +SOURCES += \ + alpha.cpp \ + alpha.h \ + clusterfit.cpp \ + clusterfit.h \ + colourblock.cpp \ + colourblock.h \ + colourfit.cpp \ + colourfit.h \ + colourset.cpp \ + colourset.h \ + maths.cpp \ + maths.h \ + rangefit.cpp \ + rangefit.h \ + simd.h \ + simd_float.h \ + simd_sse.h \ + simd_ve.h \ + singlecolourfit.cpp \ + singlecolourfit.h \ + 
singlecolourlookup.inl \ + squish.cpp diff --git a/extern/libsquish-1.15/libSquish.pro b/extern/libsquish-1.15/libSquish.pro new file mode 100644 index 0000000..054faa2 --- /dev/null +++ b/extern/libsquish-1.15/libSquish.pro @@ -0,0 +1,32 @@ +TARGET = squish +TEMPLATE = lib + +include(libSquish.pri) + +QT -= gui + +CONFIG += staticlib thread +CONFIG += debug_and_release + +CONFIG(debug, debug|release) { + unix:TARGET = $$join(TARGET,,,_debug) +} + +MOC_DIR = mocs +OBJECTS_DIR = objs +RCC_DIR = rccs +UI_DIR = uics + +CONFIG(debug, debug|release) { + unix:MOC_DIR = $$join(MOC_DIR,,,_debug) + unix:OBJECTS_DIR = $$join(OBJECTS_DIR,,,_debug) + unix:RCC_DIR = $$join(RCC_DIR,,,_debug) + unix:UI_DIR = $$join(UI_DIR,,,_debug) + win32:MOC_DIR = $$join(MOC_DIR,,,d) + win32:OBJECTS_DIR = $$join(OBJECTS_DIR,,,d) + win32:RCC_DIR = $$join(RCC_DIR,,,d) + win32:UI_DIR = $$join(UI_DIR,,,d) +} + +unix:QMAKE_CXXFLAGS += -DSQUISH_USE_OPENMP -fopenmp + diff --git a/extern/libsquish-1.15/libSquish.svg b/extern/libsquish-1.15/libSquish.svg new file mode 100644 index 0000000..efdcee7 --- /dev/null +++ b/extern/libsquish-1.15/libSquish.svg @@ -0,0 +1,238 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lib + + + + + Squish + + diff --git a/extern/libsquish-1.15/libsquish.pc.in b/extern/libsquish-1.15/libsquish.pc.in new file mode 100644 index 0000000..d3b95bd --- /dev/null +++ b/extern/libsquish-1.15/libsquish.pc.in @@ -0,0 +1,13 @@ +prefix=@PREFIX@ +exec_prefix=${prefix} +libdir=${prefix}/@LIB_PATH@ +sharedlibdir=${libdir} +includedir=${prefix}/include + +Name: libsquish +Description: squish DXT library +Version: 1.14 + +Requires: +Libs: -L${libdir} -L${sharedlibdir} -llibsquish +Cflags: -I${includedir} diff --git a/extern/libsquish-1.15/maths.cpp b/extern/libsquish-1.15/maths.cpp new file mode 100644 index 0000000..4fa0bcf --- /dev/null +++ b/extern/libsquish-1.15/maths.cpp @@ -0,0 +1,259 @@ +/* 
----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +/*! 
@file + + The symmetric eigensystem solver algorithm is from + http://www.geometrictools.com/Documentation/EigenSymmetric3x3.pdf +*/ + +#include "maths.h" +#include "simd.h" +#include + +namespace squish { + +Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights ) +{ + // compute the centroid + float total = 0.0f; + Vec3 centroid( 0.0f ); + for( int i = 0; i < n; ++i ) + { + total += weights[i]; + centroid += weights[i]*points[i]; + } + if( total > FLT_EPSILON ) + centroid /= total; + + // accumulate the covariance matrix + Sym3x3 covariance( 0.0f ); + for( int i = 0; i < n; ++i ) + { + Vec3 a = points[i] - centroid; + Vec3 b = weights[i]*a; + + covariance[0] += a.X()*b.X(); + covariance[1] += a.X()*b.Y(); + covariance[2] += a.X()*b.Z(); + covariance[3] += a.Y()*b.Y(); + covariance[4] += a.Y()*b.Z(); + covariance[5] += a.Z()*b.Z(); + } + + // return it + return covariance; +} + +#if 0 + +static Vec3 GetMultiplicity1Evector( Sym3x3 const& matrix, float evalue ) +{ + // compute M + Sym3x3 m; + m[0] = matrix[0] - evalue; + m[1] = matrix[1]; + m[2] = matrix[2]; + m[3] = matrix[3] - evalue; + m[4] = matrix[4]; + m[5] = matrix[5] - evalue; + + // compute U + Sym3x3 u; + u[0] = m[3]*m[5] - m[4]*m[4]; + u[1] = m[2]*m[4] - m[1]*m[5]; + u[2] = m[1]*m[4] - m[2]*m[3]; + u[3] = m[0]*m[5] - m[2]*m[2]; + u[4] = m[1]*m[2] - m[4]*m[0]; + u[5] = m[0]*m[3] - m[1]*m[1]; + + // find the largest component + float mc = std::fabs( u[0] ); + int mi = 0; + for( int i = 1; i < 6; ++i ) + { + float c = std::fabs( u[i] ); + if( c > mc ) + { + mc = c; + mi = i; + } + } + + // pick the column with this component + switch( mi ) + { + case 0: + return Vec3( u[0], u[1], u[2] ); + + case 1: + case 3: + return Vec3( u[1], u[3], u[4] ); + + default: + return Vec3( u[2], u[4], u[5] ); + } +} + +static Vec3 GetMultiplicity2Evector( Sym3x3 const& matrix, float evalue ) +{ + // compute M + Sym3x3 m; + m[0] = matrix[0] - evalue; + m[1] = matrix[1]; + m[2] = matrix[2]; + m[3] = 
matrix[3] - evalue; + m[4] = matrix[4]; + m[5] = matrix[5] - evalue; + + // find the largest component + float mc = std::fabs( m[0] ); + int mi = 0; + for( int i = 1; i < 6; ++i ) + { + float c = std::fabs( m[i] ); + if( c > mc ) + { + mc = c; + mi = i; + } + } + + // pick the first eigenvector based on this index + switch( mi ) + { + case 0: + case 1: + return Vec3( -m[1], m[0], 0.0f ); + + case 2: + return Vec3( m[2], 0.0f, -m[0] ); + + case 3: + case 4: + return Vec3( 0.0f, -m[4], m[3] ); + + default: + return Vec3( 0.0f, -m[5], m[4] ); + } +} + +Vec3 ComputePrincipleComponent( Sym3x3 const& matrix ) +{ + // compute the cubic coefficients + float c0 = matrix[0]*matrix[3]*matrix[5] + + 2.0f*matrix[1]*matrix[2]*matrix[4] + - matrix[0]*matrix[4]*matrix[4] + - matrix[3]*matrix[2]*matrix[2] + - matrix[5]*matrix[1]*matrix[1]; + float c1 = matrix[0]*matrix[3] + matrix[0]*matrix[5] + matrix[3]*matrix[5] + - matrix[1]*matrix[1] - matrix[2]*matrix[2] - matrix[4]*matrix[4]; + float c2 = matrix[0] + matrix[3] + matrix[5]; + + // compute the quadratic coefficients + float a = c1 - ( 1.0f/3.0f )*c2*c2; + float b = ( -2.0f/27.0f )*c2*c2*c2 + ( 1.0f/3.0f )*c1*c2 - c0; + + // compute the root count check + float Q = 0.25f*b*b + ( 1.0f/27.0f )*a*a*a; + + // test the multiplicity + if( FLT_EPSILON < Q ) + { + // only one root, which implies we have a multiple of the identity + return Vec3( 1.0f ); + } + else if( Q < -FLT_EPSILON ) + { + // three distinct roots + float theta = std::atan2( std::sqrt( -Q ), -0.5f*b ); + float rho = std::sqrt( 0.25f*b*b - Q ); + + float rt = std::pow( rho, 1.0f/3.0f ); + float ct = std::cos( theta/3.0f ); + float st = std::sin( theta/3.0f ); + + float l1 = ( 1.0f/3.0f )*c2 + 2.0f*rt*ct; + float l2 = ( 1.0f/3.0f )*c2 - rt*( ct + ( float )sqrt( 3.0f )*st ); + float l3 = ( 1.0f/3.0f )*c2 - rt*( ct - ( float )sqrt( 3.0f )*st ); + + // pick the larger + if( std::fabs( l2 ) > std::fabs( l1 ) ) + l1 = l2; + if( std::fabs( l3 ) > std::fabs( l1 ) ) + l1 = l3; 
+ + // get the eigenvector + return GetMultiplicity1Evector( matrix, l1 ); + } + else // if( -FLT_EPSILON <= Q && Q <= FLT_EPSILON ) + { + // two roots + float rt; + if( b < 0.0f ) + rt = -std::pow( -0.5f*b, 1.0f/3.0f ); + else + rt = std::pow( 0.5f*b, 1.0f/3.0f ); + + float l1 = ( 1.0f/3.0f )*c2 + rt; // repeated + float l2 = ( 1.0f/3.0f )*c2 - 2.0f*rt; + + // get the eigenvector + if( std::fabs( l1 ) > std::fabs( l2 ) ) + return GetMultiplicity2Evector( matrix, l1 ); + else + return GetMultiplicity1Evector( matrix, l2 ); + } +} + +#else + +#define POWER_ITERATION_COUNT 8 + +Vec3 ComputePrincipleComponent( Sym3x3 const& matrix ) +{ + Vec4 const row0( matrix[0], matrix[1], matrix[2], 0.0f ); + Vec4 const row1( matrix[1], matrix[3], matrix[4], 0.0f ); + Vec4 const row2( matrix[2], matrix[4], matrix[5], 0.0f ); + Vec4 v = VEC4_CONST( 1.0f ); + for( int i = 0; i < POWER_ITERATION_COUNT; ++i ) + { + // matrix multiply + Vec4 w = row0*v.SplatX(); + w = MultiplyAdd(row1, v.SplatY(), w); + w = MultiplyAdd(row2, v.SplatZ(), w); + + // get max component from xyz in all channels + Vec4 a = Max(w.SplatX(), Max(w.SplatY(), w.SplatZ())); + + // divide through and advance + v = w*Reciprocal(a); + } + return v.GetVec3(); +} + +#endif + +} // namespace squish diff --git a/extern/libsquish-1.15/maths.h b/extern/libsquish-1.15/maths.h new file mode 100644 index 0000000..59c3219 --- /dev/null +++ b/extern/libsquish-1.15/maths.h @@ -0,0 +1,233 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, 
subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_MATHS_H +#define SQUISH_MATHS_H + +#include +#include +#include "config.h" + +namespace squish { + +class Vec3 +{ +public: + typedef Vec3 const& Arg; + + Vec3() + { + } + + explicit Vec3( float s ) + { + m_x = s; + m_y = s; + m_z = s; + } + + Vec3( float x, float y, float z ) + { + m_x = x; + m_y = y; + m_z = z; + } + + float X() const { return m_x; } + float Y() const { return m_y; } + float Z() const { return m_z; } + + Vec3 operator-() const + { + return Vec3( -m_x, -m_y, -m_z ); + } + + Vec3& operator+=( Arg v ) + { + m_x += v.m_x; + m_y += v.m_y; + m_z += v.m_z; + return *this; + } + + Vec3& operator-=( Arg v ) + { + m_x -= v.m_x; + m_y -= v.m_y; + m_z -= v.m_z; + return *this; + } + + Vec3& operator*=( Arg v ) + { + m_x *= v.m_x; + m_y *= v.m_y; + m_z *= v.m_z; + return *this; + } + + Vec3& operator*=( float s ) + { + m_x *= s; + m_y *= s; + m_z *= s; + return *this; + } + + Vec3& operator/=( Arg v ) + { + m_x /= v.m_x; + m_y /= v.m_y; + m_z /= v.m_z; + return *this; + } + + Vec3& operator/=( float s ) + { + float t = 1.0f/s; + m_x *= t; + m_y *= t; + m_z *= t; + return *this; + } + + friend Vec3 operator+( Arg left, Arg right ) + { + Vec3 copy( left ); + return copy += right; + } + + friend Vec3 operator-( Arg left, Arg 
right ) + { + Vec3 copy( left ); + return copy -= right; + } + + friend Vec3 operator*( Arg left, Arg right ) + { + Vec3 copy( left ); + return copy *= right; + } + + friend Vec3 operator*( Arg left, float right ) + { + Vec3 copy( left ); + return copy *= right; + } + + friend Vec3 operator*( float left, Arg right ) + { + Vec3 copy( right ); + return copy *= left; + } + + friend Vec3 operator/( Arg left, Arg right ) + { + Vec3 copy( left ); + return copy /= right; + } + + friend Vec3 operator/( Arg left, float right ) + { + Vec3 copy( left ); + return copy /= right; + } + + friend float Dot( Arg left, Arg right ) + { + return left.m_x*right.m_x + left.m_y*right.m_y + left.m_z*right.m_z; + } + + friend Vec3 Min( Arg left, Arg right ) + { + return Vec3( + std::min( left.m_x, right.m_x ), + std::min( left.m_y, right.m_y ), + std::min( left.m_z, right.m_z ) + ); + } + + friend Vec3 Max( Arg left, Arg right ) + { + return Vec3( + std::max( left.m_x, right.m_x ), + std::max( left.m_y, right.m_y ), + std::max( left.m_z, right.m_z ) + ); + } + + friend Vec3 Truncate( Arg v ) + { + return Vec3( + v.m_x > 0.0f ? std::floor( v.m_x ) : std::ceil( v.m_x ), + v.m_y > 0.0f ? std::floor( v.m_y ) : std::ceil( v.m_y ), + v.m_z > 0.0f ? 
std::floor( v.m_z ) : std::ceil( v.m_z ) + ); + } + +private: + float m_x; + float m_y; + float m_z; +}; + +inline float LengthSquared( Vec3::Arg v ) +{ + return Dot( v, v ); +} + +class Sym3x3 +{ +public: + Sym3x3() + { + } + + Sym3x3( float s ) + { + for( int i = 0; i < 6; ++i ) + m_x[i] = s; + } + + float operator[]( int index ) const + { + return m_x[index]; + } + + float& operator[]( int index ) + { + return m_x[index]; + } + +private: + float m_x[6]; +}; + +Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights ); +Vec3 ComputePrincipleComponent( Sym3x3 const& matrix ); + +} // namespace squish + +#endif // ndef SQUISH_MATHS_H diff --git a/extern/libsquish-1.15/rangefit.cpp b/extern/libsquish-1.15/rangefit.cpp new file mode 100644 index 0000000..adc07ed --- /dev/null +++ b/extern/libsquish-1.15/rangefit.cpp @@ -0,0 +1,201 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#include "rangefit.h" +#include "colourset.h" +#include "colourblock.h" +#include + +namespace squish { + +RangeFit::RangeFit( ColourSet const* colours, int flags, float* metric ) + : ColourFit( colours, flags ) +{ + // initialise the metric (old perceptual = 0.2126f, 0.7152f, 0.0722f) + if( metric ) + m_metric = Vec3( metric[0], metric[1], metric[2] ); + else + m_metric = Vec3( 1.0f ); + + // initialise the best error + m_besterror = FLT_MAX; + + // cache some values + int const count = m_colours->GetCount(); + Vec3 const* values = m_colours->GetPoints(); + float const* weights = m_colours->GetWeights(); + + // get the covariance matrix + Sym3x3 covariance = ComputeWeightedCovariance( count, values, weights ); + + // compute the principle component + Vec3 principle = ComputePrincipleComponent( covariance ); + + // get the min and max range as the codebook endpoints + Vec3 start( 0.0f ); + Vec3 end( 0.0f ); + if( count > 0 ) + { + float min, max; + + // compute the range + start = end = values[0]; + min = max = Dot( values[0], principle ); + for( int i = 1; i < count; ++i ) + { + float val = Dot( values[i], principle ); + if( val < min ) + { + start = values[i]; + min = val; + } + else if( val > max ) + { + end = values[i]; + max = val; + } + } + } + + // clamp the output to [0, 1] + Vec3 const one( 1.0f ); + Vec3 const zero( 0.0f ); + start = Min( one, Max( zero, start ) ); + end = Min( one, Max( zero, end ) ); + + // clamp to the grid and save + Vec3 const grid( 31.0f, 63.0f, 31.0f ); + Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); + Vec3 const half( 0.5f ); + m_start = Truncate( grid*start + half 
)*gridrcp; + m_end = Truncate( grid*end + half )*gridrcp; +} + +void RangeFit::Compress3( void* block ) +{ + // cache some values + int const count = m_colours->GetCount(); + Vec3 const* values = m_colours->GetPoints(); + + // create a codebook + Vec3 codes[3]; + codes[0] = m_start; + codes[1] = m_end; + codes[2] = 0.5f*m_start + 0.5f*m_end; + + // match each point to the closest code + u8 closest[16]; + float error = 0.0f; + for( int i = 0; i < count; ++i ) + { + // find the closest code + float dist = FLT_MAX; + int idx = 0; + for( int j = 0; j < 3; ++j ) + { + float d = LengthSquared( m_metric*( values[i] - codes[j] ) ); + if( d < dist ) + { + dist = d; + idx = j; + } + } + + // save the index + closest[i] = ( u8 )idx; + + // accumulate the error + error += dist; + } + + // save this scheme if it wins + if( error < m_besterror ) + { + // remap the indices + u8 indices[16]; + m_colours->RemapIndices( closest, indices ); + + // save the block + WriteColourBlock3( m_start, m_end, indices, block ); + + // save the error + m_besterror = error; + } +} + +void RangeFit::Compress4( void* block ) +{ + // cache some values + int const count = m_colours->GetCount(); + Vec3 const* values = m_colours->GetPoints(); + + // create a codebook + Vec3 codes[4]; + codes[0] = m_start; + codes[1] = m_end; + codes[2] = ( 2.0f/3.0f )*m_start + ( 1.0f/3.0f )*m_end; + codes[3] = ( 1.0f/3.0f )*m_start + ( 2.0f/3.0f )*m_end; + + // match each point to the closest code + u8 closest[16]; + float error = 0.0f; + for( int i = 0; i < count; ++i ) + { + // find the closest code + float dist = FLT_MAX; + int idx = 0; + for( int j = 0; j < 4; ++j ) + { + float d = LengthSquared( m_metric*( values[i] - codes[j] ) ); + if( d < dist ) + { + dist = d; + idx = j; + } + } + + // save the index + closest[i] = ( u8 )idx; + + // accumulate the error + error += dist; + } + + // save this scheme if it wins + if( error < m_besterror ) + { + // remap the indices + u8 indices[16]; + m_colours->RemapIndices( 
closest, indices ); + + // save the block + WriteColourBlock4( m_start, m_end, indices, block ); + + // save the error + m_besterror = error; + } +} + +} // namespace squish diff --git a/extern/libsquish-1.15/rangefit.h b/extern/libsquish-1.15/rangefit.h new file mode 100644 index 0000000..bdb21a9 --- /dev/null +++ b/extern/libsquish-1.15/rangefit.h @@ -0,0 +1,54 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_RANGEFIT_H +#define SQUISH_RANGEFIT_H + +#include "squish.h" +#include "colourfit.h" +#include "maths.h" + +namespace squish { + +class ColourSet; + +class RangeFit : public ColourFit +{ +public: + RangeFit( ColourSet const* colours, int flags, float* metric ); + +private: + virtual void Compress3( void* block ); + virtual void Compress4( void* block ); + + Vec3 m_metric; + Vec3 m_start; + Vec3 m_end; + float m_besterror; +}; + +} // squish + +#endif // ndef SQUISH_RANGEFIT_H diff --git a/extern/libsquish-1.15/simd.h b/extern/libsquish-1.15/simd.h new file mode 100644 index 0000000..1e02fa1 --- /dev/null +++ b/extern/libsquish-1.15/simd.h @@ -0,0 +1,40 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_SIMD_H +#define SQUISH_SIMD_H + +#include "maths.h" + +#if SQUISH_USE_ALTIVEC +#include "simd_ve.h" +#elif SQUISH_USE_SSE +#include "simd_sse.h" +#else +#include "simd_float.h" +#endif + + +#endif // ndef SQUISH_SIMD_H diff --git a/extern/libsquish-1.15/simd_float.h b/extern/libsquish-1.15/simd_float.h new file mode 100644 index 0000000..030ea70 --- /dev/null +++ b/extern/libsquish-1.15/simd_float.h @@ -0,0 +1,183 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_SIMD_FLOAT_H +#define SQUISH_SIMD_FLOAT_H + +#include + +namespace squish { + +#define VEC4_CONST( X ) Vec4( X ) + +class Vec4 +{ +public: + typedef Vec4 const& Arg; + + Vec4() {} + + explicit Vec4( float s ) + : m_x( s ), + m_y( s ), + m_z( s ), + m_w( s ) + { + } + + Vec4( float x, float y, float z, float w ) + : m_x( x ), + m_y( y ), + m_z( z ), + m_w( w ) + { + } + + Vec3 GetVec3() const + { + return Vec3( m_x, m_y, m_z ); + } + + Vec4 SplatX() const { return Vec4( m_x ); } + Vec4 SplatY() const { return Vec4( m_y ); } + Vec4 SplatZ() const { return Vec4( m_z ); } + Vec4 SplatW() const { return Vec4( m_w ); } + + Vec4& operator+=( Arg v ) + { + m_x += v.m_x; + m_y += v.m_y; + m_z += v.m_z; + m_w += v.m_w; + return *this; + } + + Vec4& operator-=( Arg v ) + { + m_x -= v.m_x; + m_y -= v.m_y; + m_z -= v.m_z; + m_w -= v.m_w; + return *this; + } + + Vec4& operator*=( Arg v ) + { + m_x *= v.m_x; + m_y *= v.m_y; + m_z *= v.m_z; + m_w *= v.m_w; + return *this; + } + + friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right ) + { + Vec4 copy( left ); + return copy += right; + } + + friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right ) + { + Vec4 copy( left ); + return copy -= right; + } + + friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right ) + { + Vec4 copy( left ); + return copy *= right; + } + + //! Returns a*b + c + friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c ) + { + return a*b + c; + } + + //! 
Returns -( a*b - c ) + friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c ) + { + return c - a*b; + } + + friend Vec4 Reciprocal( Vec4::Arg v ) + { + return Vec4( + 1.0f/v.m_x, + 1.0f/v.m_y, + 1.0f/v.m_z, + 1.0f/v.m_w + ); + } + + friend Vec4 Min( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( + std::min( left.m_x, right.m_x ), + std::min( left.m_y, right.m_y ), + std::min( left.m_z, right.m_z ), + std::min( left.m_w, right.m_w ) + ); + } + + friend Vec4 Max( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( + std::max( left.m_x, right.m_x ), + std::max( left.m_y, right.m_y ), + std::max( left.m_z, right.m_z ), + std::max( left.m_w, right.m_w ) + ); + } + + friend Vec4 Truncate( Vec4::Arg v ) + { + return Vec4( + v.m_x > 0.0f ? std::floor( v.m_x ) : std::ceil( v.m_x ), + v.m_y > 0.0f ? std::floor( v.m_y ) : std::ceil( v.m_y ), + v.m_z > 0.0f ? std::floor( v.m_z ) : std::ceil( v.m_z ), + v.m_w > 0.0f ? std::floor( v.m_w ) : std::ceil( v.m_w ) + ); + } + + friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right ) + { + return left.m_x < right.m_x + || left.m_y < right.m_y + || left.m_z < right.m_z + || left.m_w < right.m_w; + } + +private: + float m_x; + float m_y; + float m_z; + float m_w; +}; + +} // namespace squish + +#endif // ndef SQUISH_SIMD_FLOAT_H + diff --git a/extern/libsquish-1.15/simd_sse.h b/extern/libsquish-1.15/simd_sse.h new file mode 100644 index 0000000..2e8be4c --- /dev/null +++ b/extern/libsquish-1.15/simd_sse.h @@ -0,0 +1,180 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + 
permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_SIMD_SSE_H +#define SQUISH_SIMD_SSE_H + +#include +#if ( SQUISH_USE_SSE > 1 ) +#include +#endif + +#define SQUISH_SSE_SPLAT( a ) \ + ( ( a ) | ( ( a ) << 2 ) | ( ( a ) << 4 ) | ( ( a ) << 6 ) ) + +#define SQUISH_SSE_SHUF( x, y, z, w ) \ + ( ( x ) | ( ( y ) << 2 ) | ( ( z ) << 4 ) | ( ( w ) << 6 ) ) + +namespace squish { + +#define VEC4_CONST( X ) Vec4( X ) + +class Vec4 +{ +public: + typedef Vec4 const& Arg; + + Vec4() {} + + explicit Vec4( __m128 v ) : m_v( v ) {} + + Vec4( Vec4 const& arg ) : m_v( arg.m_v ) {} + + Vec4& operator=( Vec4 const& arg ) + { + m_v = arg.m_v; + return *this; + } + + explicit Vec4( float s ) : m_v( _mm_set1_ps( s ) ) {} + + Vec4( float x, float y, float z, float w ) : m_v( _mm_setr_ps( x, y, z, w ) ) {} + + Vec3 GetVec3() const + { +#ifdef __GNUC__ + __attribute__ ((__aligned__ (16))) float c[4]; +#else + __declspec(align(16)) float c[4]; +#endif + _mm_store_ps( c, m_v ); + return Vec3( c[0], c[1], c[2] ); + } + + Vec4 SplatX() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 0 ) ) ); } + Vec4 SplatY() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 1 ) ) ); } + Vec4 SplatZ() const { return Vec4( _mm_shuffle_ps( 
m_v, m_v, SQUISH_SSE_SPLAT( 2 ) ) ); } + Vec4 SplatW() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 3 ) ) ); } + + Vec4& operator+=( Arg v ) + { + m_v = _mm_add_ps( m_v, v.m_v ); + return *this; + } + + Vec4& operator-=( Arg v ) + { + m_v = _mm_sub_ps( m_v, v.m_v ); + return *this; + } + + Vec4& operator*=( Arg v ) + { + m_v = _mm_mul_ps( m_v, v.m_v ); + return *this; + } + + friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( _mm_add_ps( left.m_v, right.m_v ) ); + } + + friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( _mm_sub_ps( left.m_v, right.m_v ) ); + } + + friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( _mm_mul_ps( left.m_v, right.m_v ) ); + } + + //! Returns a*b + c + friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c ) + { + return Vec4( _mm_add_ps( _mm_mul_ps( a.m_v, b.m_v ), c.m_v ) ); + } + + //! Returns -( a*b - c ) + friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c ) + { + return Vec4( _mm_sub_ps( c.m_v, _mm_mul_ps( a.m_v, b.m_v ) ) ); + } + + friend Vec4 Reciprocal( Vec4::Arg v ) + { + // get the reciprocal estimate + __m128 estimate = _mm_rcp_ps( v.m_v ); + + // one round of Newton-Rhaphson refinement + __m128 diff = _mm_sub_ps( _mm_set1_ps( 1.0f ), _mm_mul_ps( estimate, v.m_v ) ); + return Vec4( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) ); + } + + friend Vec4 Min( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( _mm_min_ps( left.m_v, right.m_v ) ); + } + + friend Vec4 Max( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( _mm_max_ps( left.m_v, right.m_v ) ); + } + + friend Vec4 Truncate( Vec4::Arg v ) + { +#if ( SQUISH_USE_SSE == 1 ) + // convert to ints + __m128 input = v.m_v; + __m64 lo = _mm_cvttps_pi32( input ); + __m64 hi = _mm_cvttps_pi32( _mm_movehl_ps( input, input ) ); + + // convert to floats + __m128 part = _mm_movelh_ps( input, _mm_cvtpi32_ps( input, hi ) ); + __m128 truncated = 
_mm_cvtpi32_ps( part, lo ); + + // clear out the MMX multimedia state to allow FP calls later + _mm_empty(); + return Vec4( truncated ); +#else + // use SSE2 instructions + return Vec4( _mm_cvtepi32_ps( _mm_cvttps_epi32( v.m_v ) ) ); +#endif + } + + friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right ) + { + __m128 bits = _mm_cmplt_ps( left.m_v, right.m_v ); + int value = _mm_movemask_ps( bits ); + return value != 0; + } + +private: + __m128 m_v; +}; + +} // namespace squish + +#endif // ndef SQUISH_SIMD_SSE_H diff --git a/extern/libsquish-1.15/simd_ve.h b/extern/libsquish-1.15/simd_ve.h new file mode 100644 index 0000000..08a1537 --- /dev/null +++ b/extern/libsquish-1.15/simd_ve.h @@ -0,0 +1,166 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_SIMD_VE_H +#define SQUISH_SIMD_VE_H + +#include +#undef bool + +namespace squish { + +#define VEC4_CONST( X ) Vec4( ( vector float ){ X } ) + +class Vec4 +{ +public: + typedef Vec4 Arg; + + Vec4() {} + + explicit Vec4( vector float v ) : m_v( v ) {} + + Vec4( Vec4 const& arg ) : m_v( arg.m_v ) {} + + Vec4& operator=( Vec4 const& arg ) + { + m_v = arg.m_v; + return *this; + } + + explicit Vec4( float s ) + { + union { vector float v; float c[4]; } u; + u.c[0] = s; + u.c[1] = s; + u.c[2] = s; + u.c[3] = s; + m_v = u.v; + } + + Vec4( float x, float y, float z, float w ) + { + union { vector float v; float c[4]; } u; + u.c[0] = x; + u.c[1] = y; + u.c[2] = z; + u.c[3] = w; + m_v = u.v; + } + + Vec3 GetVec3() const + { + union { vector float v; float c[4]; } u; + u.v = m_v; + return Vec3( u.c[0], u.c[1], u.c[2] ); + } + + Vec4 SplatX() const { return Vec4( vec_splat( m_v, 0 ) ); } + Vec4 SplatY() const { return Vec4( vec_splat( m_v, 1 ) ); } + Vec4 SplatZ() const { return Vec4( vec_splat( m_v, 2 ) ); } + Vec4 SplatW() const { return Vec4( vec_splat( m_v, 3 ) ); } + + Vec4& operator+=( Arg v ) + { + m_v = vec_add( m_v, v.m_v ); + return *this; + } + + Vec4& operator-=( Arg v ) + { + m_v = vec_sub( m_v, v.m_v ); + return *this; + } + + Vec4& operator*=( Arg v ) + { + m_v = vec_madd( m_v, v.m_v, ( vector float ){ -0.0f } ); + return *this; + } + + friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( vec_add( left.m_v, right.m_v ) ); + } + + friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( vec_sub( left.m_v, right.m_v ) ); + } + + friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( vec_madd( left.m_v, right.m_v, ( vector float ){ -0.0f } ) ); + } + + //! Returns a*b + c + friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c ) + { + return Vec4( vec_madd( a.m_v, b.m_v, c.m_v ) ); + } + + //! 
Returns -( a*b - c ) + friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c ) + { + return Vec4( vec_nmsub( a.m_v, b.m_v, c.m_v ) ); + } + + friend Vec4 Reciprocal( Vec4::Arg v ) + { + // get the reciprocal estimate + vector float estimate = vec_re( v.m_v ); + + // one round of Newton-Rhaphson refinement + vector float diff = vec_nmsub( estimate, v.m_v, ( vector float ){ 1.0f } ); + return Vec4( vec_madd( diff, estimate, estimate ) ); + } + + friend Vec4 Min( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( vec_min( left.m_v, right.m_v ) ); + } + + friend Vec4 Max( Vec4::Arg left, Vec4::Arg right ) + { + return Vec4( vec_max( left.m_v, right.m_v ) ); + } + + friend Vec4 Truncate( Vec4::Arg v ) + { + return Vec4( vec_trunc( v.m_v ) ); + } + + friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right ) + { + return vec_any_lt( left.m_v, right.m_v ) != 0; + } + +private: + vector float m_v; +}; + +} // namespace squish + +#endif // ndef SQUISH_SIMD_VE_H diff --git a/extern/libsquish-1.15/singlecolourfit.cpp b/extern/libsquish-1.15/singlecolourfit.cpp new file mode 100644 index 0000000..cef0ebc --- /dev/null +++ b/extern/libsquish-1.15/singlecolourfit.cpp @@ -0,0 +1,172 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#include "singlecolourfit.h" +#include "colourset.h" +#include "colourblock.h" + +namespace squish { + +struct SourceBlock +{ + u8 start; + u8 end; + u8 error; +}; + +struct SingleColourLookup +{ + SourceBlock sources[2]; +}; + +#include "singlecolourlookup.inl" + +static int FloatToInt( float a, int limit ) +{ + // use ANSI round-to-zero behaviour to get round-to-nearest + int i = ( int )( a + 0.5f ); + + // clamp to the limit + if( i < 0 ) + i = 0; + else if( i > limit ) + i = limit; + + // done + return i; +} + +SingleColourFit::SingleColourFit( ColourSet const* colours, int flags ) + : ColourFit( colours, flags ) +{ + // grab the single colour + Vec3 const* values = m_colours->GetPoints(); + m_colour[0] = ( u8 )FloatToInt( 255.0f*values->X(), 255 ); + m_colour[1] = ( u8 )FloatToInt( 255.0f*values->Y(), 255 ); + m_colour[2] = ( u8 )FloatToInt( 255.0f*values->Z(), 255 ); + + // initialise the best error + m_besterror = INT_MAX; +} + +void SingleColourFit::Compress3( void* block ) +{ + // build the table of lookups + SingleColourLookup const* const lookups[] = + { + lookup_5_3, + lookup_6_3, + lookup_5_3 + }; + + // find the best end-points and index + ComputeEndPoints( lookups ); + + // build the block if we win + if( m_error < m_besterror ) + { + // remap the indices + u8 indices[16]; + m_colours->RemapIndices( &m_index, indices ); + + // save the block + WriteColourBlock3( m_start, m_end, indices, block ); + + 
// save the error + m_besterror = m_error; + } +} + +void SingleColourFit::Compress4( void* block ) +{ + // build the table of lookups + SingleColourLookup const* const lookups[] = + { + lookup_5_4, + lookup_6_4, + lookup_5_4 + }; + + // find the best end-points and index + ComputeEndPoints( lookups ); + + // build the block if we win + if( m_error < m_besterror ) + { + // remap the indices + u8 indices[16]; + m_colours->RemapIndices( &m_index, indices ); + + // save the block + WriteColourBlock4( m_start, m_end, indices, block ); + + // save the error + m_besterror = m_error; + } +} + +void SingleColourFit::ComputeEndPoints( SingleColourLookup const* const* lookups ) +{ + // check each index combination (endpoint or intermediate) + m_error = INT_MAX; + for( int index = 0; index < 2; ++index ) + { + // check the error for this codebook index + SourceBlock const* sources[3]; + int error = 0; + for( int channel = 0; channel < 3; ++channel ) + { + // grab the lookup table and index for this channel + SingleColourLookup const* lookup = lookups[channel]; + int target = m_colour[channel]; + + // store a pointer to the source for this channel + sources[channel] = lookup[target].sources + index; + + // accumulate the error + int diff = sources[channel]->error; + error += diff*diff; + } + + // keep it if the error is lower + if( error < m_error ) + { + m_start = Vec3( + ( float )sources[0]->start/31.0f, + ( float )sources[1]->start/63.0f, + ( float )sources[2]->start/31.0f + ); + m_end = Vec3( + ( float )sources[0]->end/31.0f, + ( float )sources[1]->end/63.0f, + ( float )sources[2]->end/31.0f + ); + m_index = ( u8 )( 2*index ); + m_error = error; + } + } +} + +} // namespace squish diff --git a/extern/libsquish-1.15/singlecolourfit.h b/extern/libsquish-1.15/singlecolourfit.h new file mode 100644 index 0000000..974ce77 --- /dev/null +++ b/extern/libsquish-1.15/singlecolourfit.h @@ -0,0 +1,58 @@ +/* ----------------------------------------------------------------------------- 
+ + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_SINGLECOLOURFIT_H +#define SQUISH_SINGLECOLOURFIT_H + +#include "squish.h" +#include "colourfit.h" + +namespace squish { + +class ColourSet; +struct SingleColourLookup; + +class SingleColourFit : public ColourFit +{ +public: + SingleColourFit( ColourSet const* colours, int flags ); + +private: + virtual void Compress3( void* block ); + virtual void Compress4( void* block ); + + void ComputeEndPoints( SingleColourLookup const* const* lookups ); + + u8 m_colour[3]; + Vec3 m_start; + Vec3 m_end; + u8 m_index; + int m_error; + int m_besterror; +}; + +} // namespace squish + +#endif // ndef SQUISH_SINGLECOLOURFIT_H diff --git a/extern/libsquish-1.15/singlecolourlookup.inl b/extern/libsquish-1.15/singlecolourlookup.inl new file mode 100644 index 0000000..5b44a1e --- /dev/null +++ b/extern/libsquish-1.15/singlecolourlookup.inl @@ -0,0 +1,1064 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +static SingleColourLookup const lookup_5_3[] = +{ + { { { 0, 0, 0 }, { 0, 0, 0 } } }, + { { { 0, 0, 1 }, { 0, 0, 1 } } }, + { { { 0, 0, 2 }, { 0, 0, 2 } } }, + { { { 0, 0, 3 }, { 0, 1, 1 } } }, + { { { 0, 0, 4 }, { 0, 1, 0 } } }, + { { { 1, 0, 3 }, { 0, 1, 1 } } }, + { { { 1, 0, 2 }, { 0, 1, 2 } } }, + { { { 1, 0, 1 }, { 0, 2, 1 } } }, + { { { 1, 0, 0 }, { 0, 2, 0 } } }, + { { { 1, 0, 1 }, { 0, 2, 1 } } }, + { { { 1, 0, 2 }, { 0, 2, 2 } } }, + { { { 1, 0, 3 }, { 0, 3, 1 } } }, + { { { 1, 0, 4 }, { 0, 3, 0 } } }, + { { { 2, 0, 3 }, { 0, 3, 1 } } }, + { { { 2, 0, 2 }, { 0, 3, 2 } } }, + { { { 2, 0, 1 }, { 0, 4, 1 } } }, + { { { 2, 0, 0 }, { 0, 4, 0 } } }, + { { { 2, 0, 1 }, { 0, 4, 1 } } }, + { { { 2, 0, 2 }, { 0, 4, 2 } } }, + { { { 2, 0, 3 }, { 0, 5, 1 } } }, + { { { 2, 0, 4 }, { 0, 5, 0 } } }, + { { { 3, 0, 3 }, { 0, 5, 1 } } }, + { { { 3, 0, 2 }, { 0, 5, 2 } } }, + { { { 3, 0, 1 }, { 0, 6, 1 } } }, + { { { 3, 0, 0 }, { 0, 6, 0 } } }, + { { { 3, 0, 1 }, { 0, 6, 1 } } }, + { { { 3, 0, 2 }, { 0, 6, 2 } } }, + { { { 3, 0, 3 }, { 0, 7, 1 } } }, + { { { 3, 0, 4 }, { 0, 7, 0 } } }, + { { { 4, 0, 4 }, { 0, 7, 1 } } }, + { { { 4, 0, 3 }, { 0, 7, 2 } } }, + { { { 4, 0, 2 }, { 1, 7, 1 } } }, + { { { 4, 0, 1 }, { 1, 7, 0 } } }, + { { { 4, 0, 0 }, { 0, 8, 0 } } }, + { { { 4, 0, 1 }, { 0, 8, 1 } } }, + { { { 4, 0, 2 }, { 2, 7, 1 } } }, + { { { 4, 0, 3 }, { 2, 7, 0 } } }, + { { { 4, 0, 4 }, { 0, 9, 0 } } }, + { { { 5, 0, 3 }, { 0, 9, 1 } } }, + { { { 5, 0, 2 }, { 3, 7, 1 } } }, + { { { 5, 0, 1 }, { 3, 7, 0 } } }, + { { { 5, 0, 0 }, { 0, 10, 0 } } }, + { { { 5, 0, 1 }, { 0, 10, 1 } } }, + { { { 5, 0, 2 }, { 0, 10, 2 } } }, + { { { 5, 
0, 3 }, { 0, 11, 1 } } }, + { { { 5, 0, 4 }, { 0, 11, 0 } } }, + { { { 6, 0, 3 }, { 0, 11, 1 } } }, + { { { 6, 0, 2 }, { 0, 11, 2 } } }, + { { { 6, 0, 1 }, { 0, 12, 1 } } }, + { { { 6, 0, 0 }, { 0, 12, 0 } } }, + { { { 6, 0, 1 }, { 0, 12, 1 } } }, + { { { 6, 0, 2 }, { 0, 12, 2 } } }, + { { { 6, 0, 3 }, { 0, 13, 1 } } }, + { { { 6, 0, 4 }, { 0, 13, 0 } } }, + { { { 7, 0, 3 }, { 0, 13, 1 } } }, + { { { 7, 0, 2 }, { 0, 13, 2 } } }, + { { { 7, 0, 1 }, { 0, 14, 1 } } }, + { { { 7, 0, 0 }, { 0, 14, 0 } } }, + { { { 7, 0, 1 }, { 0, 14, 1 } } }, + { { { 7, 0, 2 }, { 0, 14, 2 } } }, + { { { 7, 0, 3 }, { 0, 15, 1 } } }, + { { { 7, 0, 4 }, { 0, 15, 0 } } }, + { { { 8, 0, 4 }, { 0, 15, 1 } } }, + { { { 8, 0, 3 }, { 0, 15, 2 } } }, + { { { 8, 0, 2 }, { 1, 15, 1 } } }, + { { { 8, 0, 1 }, { 1, 15, 0 } } }, + { { { 8, 0, 0 }, { 0, 16, 0 } } }, + { { { 8, 0, 1 }, { 0, 16, 1 } } }, + { { { 8, 0, 2 }, { 2, 15, 1 } } }, + { { { 8, 0, 3 }, { 2, 15, 0 } } }, + { { { 8, 0, 4 }, { 0, 17, 0 } } }, + { { { 9, 0, 3 }, { 0, 17, 1 } } }, + { { { 9, 0, 2 }, { 3, 15, 1 } } }, + { { { 9, 0, 1 }, { 3, 15, 0 } } }, + { { { 9, 0, 0 }, { 0, 18, 0 } } }, + { { { 9, 0, 1 }, { 0, 18, 1 } } }, + { { { 9, 0, 2 }, { 0, 18, 2 } } }, + { { { 9, 0, 3 }, { 0, 19, 1 } } }, + { { { 9, 0, 4 }, { 0, 19, 0 } } }, + { { { 10, 0, 3 }, { 0, 19, 1 } } }, + { { { 10, 0, 2 }, { 0, 19, 2 } } }, + { { { 10, 0, 1 }, { 0, 20, 1 } } }, + { { { 10, 0, 0 }, { 0, 20, 0 } } }, + { { { 10, 0, 1 }, { 0, 20, 1 } } }, + { { { 10, 0, 2 }, { 0, 20, 2 } } }, + { { { 10, 0, 3 }, { 0, 21, 1 } } }, + { { { 10, 0, 4 }, { 0, 21, 0 } } }, + { { { 11, 0, 3 }, { 0, 21, 1 } } }, + { { { 11, 0, 2 }, { 0, 21, 2 } } }, + { { { 11, 0, 1 }, { 0, 22, 1 } } }, + { { { 11, 0, 0 }, { 0, 22, 0 } } }, + { { { 11, 0, 1 }, { 0, 22, 1 } } }, + { { { 11, 0, 2 }, { 0, 22, 2 } } }, + { { { 11, 0, 3 }, { 0, 23, 1 } } }, + { { { 11, 0, 4 }, { 0, 23, 0 } } }, + { { { 12, 0, 4 }, { 0, 23, 1 } } }, + { { { 12, 0, 3 }, { 0, 23, 2 } } }, + { { { 12, 0, 2 }, { 1, 23, 1 
} } }, + { { { 12, 0, 1 }, { 1, 23, 0 } } }, + { { { 12, 0, 0 }, { 0, 24, 0 } } }, + { { { 12, 0, 1 }, { 0, 24, 1 } } }, + { { { 12, 0, 2 }, { 2, 23, 1 } } }, + { { { 12, 0, 3 }, { 2, 23, 0 } } }, + { { { 12, 0, 4 }, { 0, 25, 0 } } }, + { { { 13, 0, 3 }, { 0, 25, 1 } } }, + { { { 13, 0, 2 }, { 3, 23, 1 } } }, + { { { 13, 0, 1 }, { 3, 23, 0 } } }, + { { { 13, 0, 0 }, { 0, 26, 0 } } }, + { { { 13, 0, 1 }, { 0, 26, 1 } } }, + { { { 13, 0, 2 }, { 0, 26, 2 } } }, + { { { 13, 0, 3 }, { 0, 27, 1 } } }, + { { { 13, 0, 4 }, { 0, 27, 0 } } }, + { { { 14, 0, 3 }, { 0, 27, 1 } } }, + { { { 14, 0, 2 }, { 0, 27, 2 } } }, + { { { 14, 0, 1 }, { 0, 28, 1 } } }, + { { { 14, 0, 0 }, { 0, 28, 0 } } }, + { { { 14, 0, 1 }, { 0, 28, 1 } } }, + { { { 14, 0, 2 }, { 0, 28, 2 } } }, + { { { 14, 0, 3 }, { 0, 29, 1 } } }, + { { { 14, 0, 4 }, { 0, 29, 0 } } }, + { { { 15, 0, 3 }, { 0, 29, 1 } } }, + { { { 15, 0, 2 }, { 0, 29, 2 } } }, + { { { 15, 0, 1 }, { 0, 30, 1 } } }, + { { { 15, 0, 0 }, { 0, 30, 0 } } }, + { { { 15, 0, 1 }, { 0, 30, 1 } } }, + { { { 15, 0, 2 }, { 0, 30, 2 } } }, + { { { 15, 0, 3 }, { 0, 31, 1 } } }, + { { { 15, 0, 4 }, { 0, 31, 0 } } }, + { { { 16, 0, 4 }, { 0, 31, 1 } } }, + { { { 16, 0, 3 }, { 0, 31, 2 } } }, + { { { 16, 0, 2 }, { 1, 31, 1 } } }, + { { { 16, 0, 1 }, { 1, 31, 0 } } }, + { { { 16, 0, 0 }, { 4, 28, 0 } } }, + { { { 16, 0, 1 }, { 4, 28, 1 } } }, + { { { 16, 0, 2 }, { 2, 31, 1 } } }, + { { { 16, 0, 3 }, { 2, 31, 0 } } }, + { { { 16, 0, 4 }, { 4, 29, 0 } } }, + { { { 17, 0, 3 }, { 4, 29, 1 } } }, + { { { 17, 0, 2 }, { 3, 31, 1 } } }, + { { { 17, 0, 1 }, { 3, 31, 0 } } }, + { { { 17, 0, 0 }, { 4, 30, 0 } } }, + { { { 17, 0, 1 }, { 4, 30, 1 } } }, + { { { 17, 0, 2 }, { 4, 30, 2 } } }, + { { { 17, 0, 3 }, { 4, 31, 1 } } }, + { { { 17, 0, 4 }, { 4, 31, 0 } } }, + { { { 18, 0, 3 }, { 4, 31, 1 } } }, + { { { 18, 0, 2 }, { 4, 31, 2 } } }, + { { { 18, 0, 1 }, { 5, 31, 1 } } }, + { { { 18, 0, 0 }, { 5, 31, 0 } } }, + { { { 18, 0, 1 }, { 5, 31, 1 } } }, + { { { 18, 0, 2 
}, { 5, 31, 2 } } }, + { { { 18, 0, 3 }, { 6, 31, 1 } } }, + { { { 18, 0, 4 }, { 6, 31, 0 } } }, + { { { 19, 0, 3 }, { 6, 31, 1 } } }, + { { { 19, 0, 2 }, { 6, 31, 2 } } }, + { { { 19, 0, 1 }, { 7, 31, 1 } } }, + { { { 19, 0, 0 }, { 7, 31, 0 } } }, + { { { 19, 0, 1 }, { 7, 31, 1 } } }, + { { { 19, 0, 2 }, { 7, 31, 2 } } }, + { { { 19, 0, 3 }, { 8, 31, 1 } } }, + { { { 19, 0, 4 }, { 8, 31, 0 } } }, + { { { 20, 0, 4 }, { 8, 31, 1 } } }, + { { { 20, 0, 3 }, { 8, 31, 2 } } }, + { { { 20, 0, 2 }, { 9, 31, 1 } } }, + { { { 20, 0, 1 }, { 9, 31, 0 } } }, + { { { 20, 0, 0 }, { 12, 28, 0 } } }, + { { { 20, 0, 1 }, { 12, 28, 1 } } }, + { { { 20, 0, 2 }, { 10, 31, 1 } } }, + { { { 20, 0, 3 }, { 10, 31, 0 } } }, + { { { 20, 0, 4 }, { 12, 29, 0 } } }, + { { { 21, 0, 3 }, { 12, 29, 1 } } }, + { { { 21, 0, 2 }, { 11, 31, 1 } } }, + { { { 21, 0, 1 }, { 11, 31, 0 } } }, + { { { 21, 0, 0 }, { 12, 30, 0 } } }, + { { { 21, 0, 1 }, { 12, 30, 1 } } }, + { { { 21, 0, 2 }, { 12, 30, 2 } } }, + { { { 21, 0, 3 }, { 12, 31, 1 } } }, + { { { 21, 0, 4 }, { 12, 31, 0 } } }, + { { { 22, 0, 3 }, { 12, 31, 1 } } }, + { { { 22, 0, 2 }, { 12, 31, 2 } } }, + { { { 22, 0, 1 }, { 13, 31, 1 } } }, + { { { 22, 0, 0 }, { 13, 31, 0 } } }, + { { { 22, 0, 1 }, { 13, 31, 1 } } }, + { { { 22, 0, 2 }, { 13, 31, 2 } } }, + { { { 22, 0, 3 }, { 14, 31, 1 } } }, + { { { 22, 0, 4 }, { 14, 31, 0 } } }, + { { { 23, 0, 3 }, { 14, 31, 1 } } }, + { { { 23, 0, 2 }, { 14, 31, 2 } } }, + { { { 23, 0, 1 }, { 15, 31, 1 } } }, + { { { 23, 0, 0 }, { 15, 31, 0 } } }, + { { { 23, 0, 1 }, { 15, 31, 1 } } }, + { { { 23, 0, 2 }, { 15, 31, 2 } } }, + { { { 23, 0, 3 }, { 16, 31, 1 } } }, + { { { 23, 0, 4 }, { 16, 31, 0 } } }, + { { { 24, 0, 4 }, { 16, 31, 1 } } }, + { { { 24, 0, 3 }, { 16, 31, 2 } } }, + { { { 24, 0, 2 }, { 17, 31, 1 } } }, + { { { 24, 0, 1 }, { 17, 31, 0 } } }, + { { { 24, 0, 0 }, { 20, 28, 0 } } }, + { { { 24, 0, 1 }, { 20, 28, 1 } } }, + { { { 24, 0, 2 }, { 18, 31, 1 } } }, + { { { 24, 0, 3 }, { 18, 31, 0 } } }, + { 
{ { 24, 0, 4 }, { 20, 29, 0 } } }, + { { { 25, 0, 3 }, { 20, 29, 1 } } }, + { { { 25, 0, 2 }, { 19, 31, 1 } } }, + { { { 25, 0, 1 }, { 19, 31, 0 } } }, + { { { 25, 0, 0 }, { 20, 30, 0 } } }, + { { { 25, 0, 1 }, { 20, 30, 1 } } }, + { { { 25, 0, 2 }, { 20, 30, 2 } } }, + { { { 25, 0, 3 }, { 20, 31, 1 } } }, + { { { 25, 0, 4 }, { 20, 31, 0 } } }, + { { { 26, 0, 3 }, { 20, 31, 1 } } }, + { { { 26, 0, 2 }, { 20, 31, 2 } } }, + { { { 26, 0, 1 }, { 21, 31, 1 } } }, + { { { 26, 0, 0 }, { 21, 31, 0 } } }, + { { { 26, 0, 1 }, { 21, 31, 1 } } }, + { { { 26, 0, 2 }, { 21, 31, 2 } } }, + { { { 26, 0, 3 }, { 22, 31, 1 } } }, + { { { 26, 0, 4 }, { 22, 31, 0 } } }, + { { { 27, 0, 3 }, { 22, 31, 1 } } }, + { { { 27, 0, 2 }, { 22, 31, 2 } } }, + { { { 27, 0, 1 }, { 23, 31, 1 } } }, + { { { 27, 0, 0 }, { 23, 31, 0 } } }, + { { { 27, 0, 1 }, { 23, 31, 1 } } }, + { { { 27, 0, 2 }, { 23, 31, 2 } } }, + { { { 27, 0, 3 }, { 24, 31, 1 } } }, + { { { 27, 0, 4 }, { 24, 31, 0 } } }, + { { { 28, 0, 4 }, { 24, 31, 1 } } }, + { { { 28, 0, 3 }, { 24, 31, 2 } } }, + { { { 28, 0, 2 }, { 25, 31, 1 } } }, + { { { 28, 0, 1 }, { 25, 31, 0 } } }, + { { { 28, 0, 0 }, { 28, 28, 0 } } }, + { { { 28, 0, 1 }, { 28, 28, 1 } } }, + { { { 28, 0, 2 }, { 26, 31, 1 } } }, + { { { 28, 0, 3 }, { 26, 31, 0 } } }, + { { { 28, 0, 4 }, { 28, 29, 0 } } }, + { { { 29, 0, 3 }, { 28, 29, 1 } } }, + { { { 29, 0, 2 }, { 27, 31, 1 } } }, + { { { 29, 0, 1 }, { 27, 31, 0 } } }, + { { { 29, 0, 0 }, { 28, 30, 0 } } }, + { { { 29, 0, 1 }, { 28, 30, 1 } } }, + { { { 29, 0, 2 }, { 28, 30, 2 } } }, + { { { 29, 0, 3 }, { 28, 31, 1 } } }, + { { { 29, 0, 4 }, { 28, 31, 0 } } }, + { { { 30, 0, 3 }, { 28, 31, 1 } } }, + { { { 30, 0, 2 }, { 28, 31, 2 } } }, + { { { 30, 0, 1 }, { 29, 31, 1 } } }, + { { { 30, 0, 0 }, { 29, 31, 0 } } }, + { { { 30, 0, 1 }, { 29, 31, 1 } } }, + { { { 30, 0, 2 }, { 29, 31, 2 } } }, + { { { 30, 0, 3 }, { 30, 31, 1 } } }, + { { { 30, 0, 4 }, { 30, 31, 0 } } }, + { { { 31, 0, 3 }, { 30, 31, 1 } } }, + { { { 31, 0, 
2 }, { 30, 31, 2 } } }, + { { { 31, 0, 1 }, { 31, 31, 1 } } }, + { { { 31, 0, 0 }, { 31, 31, 0 } } } +}; + +static SingleColourLookup const lookup_6_3[] = +{ + { { { 0, 0, 0 }, { 0, 0, 0 } } }, + { { { 0, 0, 1 }, { 0, 1, 1 } } }, + { { { 0, 0, 2 }, { 0, 1, 0 } } }, + { { { 1, 0, 1 }, { 0, 2, 1 } } }, + { { { 1, 0, 0 }, { 0, 2, 0 } } }, + { { { 1, 0, 1 }, { 0, 3, 1 } } }, + { { { 1, 0, 2 }, { 0, 3, 0 } } }, + { { { 2, 0, 1 }, { 0, 4, 1 } } }, + { { { 2, 0, 0 }, { 0, 4, 0 } } }, + { { { 2, 0, 1 }, { 0, 5, 1 } } }, + { { { 2, 0, 2 }, { 0, 5, 0 } } }, + { { { 3, 0, 1 }, { 0, 6, 1 } } }, + { { { 3, 0, 0 }, { 0, 6, 0 } } }, + { { { 3, 0, 1 }, { 0, 7, 1 } } }, + { { { 3, 0, 2 }, { 0, 7, 0 } } }, + { { { 4, 0, 1 }, { 0, 8, 1 } } }, + { { { 4, 0, 0 }, { 0, 8, 0 } } }, + { { { 4, 0, 1 }, { 0, 9, 1 } } }, + { { { 4, 0, 2 }, { 0, 9, 0 } } }, + { { { 5, 0, 1 }, { 0, 10, 1 } } }, + { { { 5, 0, 0 }, { 0, 10, 0 } } }, + { { { 5, 0, 1 }, { 0, 11, 1 } } }, + { { { 5, 0, 2 }, { 0, 11, 0 } } }, + { { { 6, 0, 1 }, { 0, 12, 1 } } }, + { { { 6, 0, 0 }, { 0, 12, 0 } } }, + { { { 6, 0, 1 }, { 0, 13, 1 } } }, + { { { 6, 0, 2 }, { 0, 13, 0 } } }, + { { { 7, 0, 1 }, { 0, 14, 1 } } }, + { { { 7, 0, 0 }, { 0, 14, 0 } } }, + { { { 7, 0, 1 }, { 0, 15, 1 } } }, + { { { 7, 0, 2 }, { 0, 15, 0 } } }, + { { { 8, 0, 1 }, { 0, 16, 1 } } }, + { { { 8, 0, 0 }, { 0, 16, 0 } } }, + { { { 8, 0, 1 }, { 0, 17, 1 } } }, + { { { 8, 0, 2 }, { 0, 17, 0 } } }, + { { { 9, 0, 1 }, { 0, 18, 1 } } }, + { { { 9, 0, 0 }, { 0, 18, 0 } } }, + { { { 9, 0, 1 }, { 0, 19, 1 } } }, + { { { 9, 0, 2 }, { 0, 19, 0 } } }, + { { { 10, 0, 1 }, { 0, 20, 1 } } }, + { { { 10, 0, 0 }, { 0, 20, 0 } } }, + { { { 10, 0, 1 }, { 0, 21, 1 } } }, + { { { 10, 0, 2 }, { 0, 21, 0 } } }, + { { { 11, 0, 1 }, { 0, 22, 1 } } }, + { { { 11, 0, 0 }, { 0, 22, 0 } } }, + { { { 11, 0, 1 }, { 0, 23, 1 } } }, + { { { 11, 0, 2 }, { 0, 23, 0 } } }, + { { { 12, 0, 1 }, { 0, 24, 1 } } }, + { { { 12, 0, 0 }, { 0, 24, 0 } } }, + { { { 12, 0, 1 }, { 0, 25, 1 } } }, 
+ { { { 12, 0, 2 }, { 0, 25, 0 } } }, + { { { 13, 0, 1 }, { 0, 26, 1 } } }, + { { { 13, 0, 0 }, { 0, 26, 0 } } }, + { { { 13, 0, 1 }, { 0, 27, 1 } } }, + { { { 13, 0, 2 }, { 0, 27, 0 } } }, + { { { 14, 0, 1 }, { 0, 28, 1 } } }, + { { { 14, 0, 0 }, { 0, 28, 0 } } }, + { { { 14, 0, 1 }, { 0, 29, 1 } } }, + { { { 14, 0, 2 }, { 0, 29, 0 } } }, + { { { 15, 0, 1 }, { 0, 30, 1 } } }, + { { { 15, 0, 0 }, { 0, 30, 0 } } }, + { { { 15, 0, 1 }, { 0, 31, 1 } } }, + { { { 15, 0, 2 }, { 0, 31, 0 } } }, + { { { 16, 0, 2 }, { 1, 31, 1 } } }, + { { { 16, 0, 1 }, { 1, 31, 0 } } }, + { { { 16, 0, 0 }, { 0, 32, 0 } } }, + { { { 16, 0, 1 }, { 2, 31, 0 } } }, + { { { 16, 0, 2 }, { 0, 33, 0 } } }, + { { { 17, 0, 1 }, { 3, 31, 0 } } }, + { { { 17, 0, 0 }, { 0, 34, 0 } } }, + { { { 17, 0, 1 }, { 4, 31, 0 } } }, + { { { 17, 0, 2 }, { 0, 35, 0 } } }, + { { { 18, 0, 1 }, { 5, 31, 0 } } }, + { { { 18, 0, 0 }, { 0, 36, 0 } } }, + { { { 18, 0, 1 }, { 6, 31, 0 } } }, + { { { 18, 0, 2 }, { 0, 37, 0 } } }, + { { { 19, 0, 1 }, { 7, 31, 0 } } }, + { { { 19, 0, 0 }, { 0, 38, 0 } } }, + { { { 19, 0, 1 }, { 8, 31, 0 } } }, + { { { 19, 0, 2 }, { 0, 39, 0 } } }, + { { { 20, 0, 1 }, { 9, 31, 0 } } }, + { { { 20, 0, 0 }, { 0, 40, 0 } } }, + { { { 20, 0, 1 }, { 10, 31, 0 } } }, + { { { 20, 0, 2 }, { 0, 41, 0 } } }, + { { { 21, 0, 1 }, { 11, 31, 0 } } }, + { { { 21, 0, 0 }, { 0, 42, 0 } } }, + { { { 21, 0, 1 }, { 12, 31, 0 } } }, + { { { 21, 0, 2 }, { 0, 43, 0 } } }, + { { { 22, 0, 1 }, { 13, 31, 0 } } }, + { { { 22, 0, 0 }, { 0, 44, 0 } } }, + { { { 22, 0, 1 }, { 14, 31, 0 } } }, + { { { 22, 0, 2 }, { 0, 45, 0 } } }, + { { { 23, 0, 1 }, { 15, 31, 0 } } }, + { { { 23, 0, 0 }, { 0, 46, 0 } } }, + { { { 23, 0, 1 }, { 0, 47, 1 } } }, + { { { 23, 0, 2 }, { 0, 47, 0 } } }, + { { { 24, 0, 1 }, { 0, 48, 1 } } }, + { { { 24, 0, 0 }, { 0, 48, 0 } } }, + { { { 24, 0, 1 }, { 0, 49, 1 } } }, + { { { 24, 0, 2 }, { 0, 49, 0 } } }, + { { { 25, 0, 1 }, { 0, 50, 1 } } }, + { { { 25, 0, 0 }, { 0, 50, 0 } } }, + { { { 25, 0, 1 
}, { 0, 51, 1 } } }, + { { { 25, 0, 2 }, { 0, 51, 0 } } }, + { { { 26, 0, 1 }, { 0, 52, 1 } } }, + { { { 26, 0, 0 }, { 0, 52, 0 } } }, + { { { 26, 0, 1 }, { 0, 53, 1 } } }, + { { { 26, 0, 2 }, { 0, 53, 0 } } }, + { { { 27, 0, 1 }, { 0, 54, 1 } } }, + { { { 27, 0, 0 }, { 0, 54, 0 } } }, + { { { 27, 0, 1 }, { 0, 55, 1 } } }, + { { { 27, 0, 2 }, { 0, 55, 0 } } }, + { { { 28, 0, 1 }, { 0, 56, 1 } } }, + { { { 28, 0, 0 }, { 0, 56, 0 } } }, + { { { 28, 0, 1 }, { 0, 57, 1 } } }, + { { { 28, 0, 2 }, { 0, 57, 0 } } }, + { { { 29, 0, 1 }, { 0, 58, 1 } } }, + { { { 29, 0, 0 }, { 0, 58, 0 } } }, + { { { 29, 0, 1 }, { 0, 59, 1 } } }, + { { { 29, 0, 2 }, { 0, 59, 0 } } }, + { { { 30, 0, 1 }, { 0, 60, 1 } } }, + { { { 30, 0, 0 }, { 0, 60, 0 } } }, + { { { 30, 0, 1 }, { 0, 61, 1 } } }, + { { { 30, 0, 2 }, { 0, 61, 0 } } }, + { { { 31, 0, 1 }, { 0, 62, 1 } } }, + { { { 31, 0, 0 }, { 0, 62, 0 } } }, + { { { 31, 0, 1 }, { 0, 63, 1 } } }, + { { { 31, 0, 2 }, { 0, 63, 0 } } }, + { { { 32, 0, 2 }, { 1, 63, 1 } } }, + { { { 32, 0, 1 }, { 1, 63, 0 } } }, + { { { 32, 0, 0 }, { 16, 48, 0 } } }, + { { { 32, 0, 1 }, { 2, 63, 0 } } }, + { { { 32, 0, 2 }, { 16, 49, 0 } } }, + { { { 33, 0, 1 }, { 3, 63, 0 } } }, + { { { 33, 0, 0 }, { 16, 50, 0 } } }, + { { { 33, 0, 1 }, { 4, 63, 0 } } }, + { { { 33, 0, 2 }, { 16, 51, 0 } } }, + { { { 34, 0, 1 }, { 5, 63, 0 } } }, + { { { 34, 0, 0 }, { 16, 52, 0 } } }, + { { { 34, 0, 1 }, { 6, 63, 0 } } }, + { { { 34, 0, 2 }, { 16, 53, 0 } } }, + { { { 35, 0, 1 }, { 7, 63, 0 } } }, + { { { 35, 0, 0 }, { 16, 54, 0 } } }, + { { { 35, 0, 1 }, { 8, 63, 0 } } }, + { { { 35, 0, 2 }, { 16, 55, 0 } } }, + { { { 36, 0, 1 }, { 9, 63, 0 } } }, + { { { 36, 0, 0 }, { 16, 56, 0 } } }, + { { { 36, 0, 1 }, { 10, 63, 0 } } }, + { { { 36, 0, 2 }, { 16, 57, 0 } } }, + { { { 37, 0, 1 }, { 11, 63, 0 } } }, + { { { 37, 0, 0 }, { 16, 58, 0 } } }, + { { { 37, 0, 1 }, { 12, 63, 0 } } }, + { { { 37, 0, 2 }, { 16, 59, 0 } } }, + { { { 38, 0, 1 }, { 13, 63, 0 } } }, + { { { 38, 0, 0 }, { 
16, 60, 0 } } }, + { { { 38, 0, 1 }, { 14, 63, 0 } } }, + { { { 38, 0, 2 }, { 16, 61, 0 } } }, + { { { 39, 0, 1 }, { 15, 63, 0 } } }, + { { { 39, 0, 0 }, { 16, 62, 0 } } }, + { { { 39, 0, 1 }, { 16, 63, 1 } } }, + { { { 39, 0, 2 }, { 16, 63, 0 } } }, + { { { 40, 0, 1 }, { 17, 63, 1 } } }, + { { { 40, 0, 0 }, { 17, 63, 0 } } }, + { { { 40, 0, 1 }, { 18, 63, 1 } } }, + { { { 40, 0, 2 }, { 18, 63, 0 } } }, + { { { 41, 0, 1 }, { 19, 63, 1 } } }, + { { { 41, 0, 0 }, { 19, 63, 0 } } }, + { { { 41, 0, 1 }, { 20, 63, 1 } } }, + { { { 41, 0, 2 }, { 20, 63, 0 } } }, + { { { 42, 0, 1 }, { 21, 63, 1 } } }, + { { { 42, 0, 0 }, { 21, 63, 0 } } }, + { { { 42, 0, 1 }, { 22, 63, 1 } } }, + { { { 42, 0, 2 }, { 22, 63, 0 } } }, + { { { 43, 0, 1 }, { 23, 63, 1 } } }, + { { { 43, 0, 0 }, { 23, 63, 0 } } }, + { { { 43, 0, 1 }, { 24, 63, 1 } } }, + { { { 43, 0, 2 }, { 24, 63, 0 } } }, + { { { 44, 0, 1 }, { 25, 63, 1 } } }, + { { { 44, 0, 0 }, { 25, 63, 0 } } }, + { { { 44, 0, 1 }, { 26, 63, 1 } } }, + { { { 44, 0, 2 }, { 26, 63, 0 } } }, + { { { 45, 0, 1 }, { 27, 63, 1 } } }, + { { { 45, 0, 0 }, { 27, 63, 0 } } }, + { { { 45, 0, 1 }, { 28, 63, 1 } } }, + { { { 45, 0, 2 }, { 28, 63, 0 } } }, + { { { 46, 0, 1 }, { 29, 63, 1 } } }, + { { { 46, 0, 0 }, { 29, 63, 0 } } }, + { { { 46, 0, 1 }, { 30, 63, 1 } } }, + { { { 46, 0, 2 }, { 30, 63, 0 } } }, + { { { 47, 0, 1 }, { 31, 63, 1 } } }, + { { { 47, 0, 0 }, { 31, 63, 0 } } }, + { { { 47, 0, 1 }, { 32, 63, 1 } } }, + { { { 47, 0, 2 }, { 32, 63, 0 } } }, + { { { 48, 0, 2 }, { 33, 63, 1 } } }, + { { { 48, 0, 1 }, { 33, 63, 0 } } }, + { { { 48, 0, 0 }, { 48, 48, 0 } } }, + { { { 48, 0, 1 }, { 34, 63, 0 } } }, + { { { 48, 0, 2 }, { 48, 49, 0 } } }, + { { { 49, 0, 1 }, { 35, 63, 0 } } }, + { { { 49, 0, 0 }, { 48, 50, 0 } } }, + { { { 49, 0, 1 }, { 36, 63, 0 } } }, + { { { 49, 0, 2 }, { 48, 51, 0 } } }, + { { { 50, 0, 1 }, { 37, 63, 0 } } }, + { { { 50, 0, 0 }, { 48, 52, 0 } } }, + { { { 50, 0, 1 }, { 38, 63, 0 } } }, + { { { 50, 0, 2 }, { 48, 53, 0 
} } }, + { { { 51, 0, 1 }, { 39, 63, 0 } } }, + { { { 51, 0, 0 }, { 48, 54, 0 } } }, + { { { 51, 0, 1 }, { 40, 63, 0 } } }, + { { { 51, 0, 2 }, { 48, 55, 0 } } }, + { { { 52, 0, 1 }, { 41, 63, 0 } } }, + { { { 52, 0, 0 }, { 48, 56, 0 } } }, + { { { 52, 0, 1 }, { 42, 63, 0 } } }, + { { { 52, 0, 2 }, { 48, 57, 0 } } }, + { { { 53, 0, 1 }, { 43, 63, 0 } } }, + { { { 53, 0, 0 }, { 48, 58, 0 } } }, + { { { 53, 0, 1 }, { 44, 63, 0 } } }, + { { { 53, 0, 2 }, { 48, 59, 0 } } }, + { { { 54, 0, 1 }, { 45, 63, 0 } } }, + { { { 54, 0, 0 }, { 48, 60, 0 } } }, + { { { 54, 0, 1 }, { 46, 63, 0 } } }, + { { { 54, 0, 2 }, { 48, 61, 0 } } }, + { { { 55, 0, 1 }, { 47, 63, 0 } } }, + { { { 55, 0, 0 }, { 48, 62, 0 } } }, + { { { 55, 0, 1 }, { 48, 63, 1 } } }, + { { { 55, 0, 2 }, { 48, 63, 0 } } }, + { { { 56, 0, 1 }, { 49, 63, 1 } } }, + { { { 56, 0, 0 }, { 49, 63, 0 } } }, + { { { 56, 0, 1 }, { 50, 63, 1 } } }, + { { { 56, 0, 2 }, { 50, 63, 0 } } }, + { { { 57, 0, 1 }, { 51, 63, 1 } } }, + { { { 57, 0, 0 }, { 51, 63, 0 } } }, + { { { 57, 0, 1 }, { 52, 63, 1 } } }, + { { { 57, 0, 2 }, { 52, 63, 0 } } }, + { { { 58, 0, 1 }, { 53, 63, 1 } } }, + { { { 58, 0, 0 }, { 53, 63, 0 } } }, + { { { 58, 0, 1 }, { 54, 63, 1 } } }, + { { { 58, 0, 2 }, { 54, 63, 0 } } }, + { { { 59, 0, 1 }, { 55, 63, 1 } } }, + { { { 59, 0, 0 }, { 55, 63, 0 } } }, + { { { 59, 0, 1 }, { 56, 63, 1 } } }, + { { { 59, 0, 2 }, { 56, 63, 0 } } }, + { { { 60, 0, 1 }, { 57, 63, 1 } } }, + { { { 60, 0, 0 }, { 57, 63, 0 } } }, + { { { 60, 0, 1 }, { 58, 63, 1 } } }, + { { { 60, 0, 2 }, { 58, 63, 0 } } }, + { { { 61, 0, 1 }, { 59, 63, 1 } } }, + { { { 61, 0, 0 }, { 59, 63, 0 } } }, + { { { 61, 0, 1 }, { 60, 63, 1 } } }, + { { { 61, 0, 2 }, { 60, 63, 0 } } }, + { { { 62, 0, 1 }, { 61, 63, 1 } } }, + { { { 62, 0, 0 }, { 61, 63, 0 } } }, + { { { 62, 0, 1 }, { 62, 63, 1 } } }, + { { { 62, 0, 2 }, { 62, 63, 0 } } }, + { { { 63, 0, 1 }, { 63, 63, 1 } } }, + { { { 63, 0, 0 }, { 63, 63, 0 } } } +}; + +static SingleColourLookup const 
lookup_5_4[] = +{ + { { { 0, 0, 0 }, { 0, 0, 0 } } }, + { { { 0, 0, 1 }, { 0, 1, 1 } } }, + { { { 0, 0, 2 }, { 0, 1, 0 } } }, + { { { 0, 0, 3 }, { 0, 1, 1 } } }, + { { { 0, 0, 4 }, { 0, 2, 1 } } }, + { { { 1, 0, 3 }, { 0, 2, 0 } } }, + { { { 1, 0, 2 }, { 0, 2, 1 } } }, + { { { 1, 0, 1 }, { 0, 3, 1 } } }, + { { { 1, 0, 0 }, { 0, 3, 0 } } }, + { { { 1, 0, 1 }, { 1, 2, 1 } } }, + { { { 1, 0, 2 }, { 1, 2, 0 } } }, + { { { 1, 0, 3 }, { 0, 4, 0 } } }, + { { { 1, 0, 4 }, { 0, 5, 1 } } }, + { { { 2, 0, 3 }, { 0, 5, 0 } } }, + { { { 2, 0, 2 }, { 0, 5, 1 } } }, + { { { 2, 0, 1 }, { 0, 6, 1 } } }, + { { { 2, 0, 0 }, { 0, 6, 0 } } }, + { { { 2, 0, 1 }, { 2, 3, 1 } } }, + { { { 2, 0, 2 }, { 2, 3, 0 } } }, + { { { 2, 0, 3 }, { 0, 7, 0 } } }, + { { { 2, 0, 4 }, { 1, 6, 1 } } }, + { { { 3, 0, 3 }, { 1, 6, 0 } } }, + { { { 3, 0, 2 }, { 0, 8, 0 } } }, + { { { 3, 0, 1 }, { 0, 9, 1 } } }, + { { { 3, 0, 0 }, { 0, 9, 0 } } }, + { { { 3, 0, 1 }, { 0, 9, 1 } } }, + { { { 3, 0, 2 }, { 0, 10, 1 } } }, + { { { 3, 0, 3 }, { 0, 10, 0 } } }, + { { { 3, 0, 4 }, { 2, 7, 1 } } }, + { { { 4, 0, 4 }, { 2, 7, 0 } } }, + { { { 4, 0, 3 }, { 0, 11, 0 } } }, + { { { 4, 0, 2 }, { 1, 10, 1 } } }, + { { { 4, 0, 1 }, { 1, 10, 0 } } }, + { { { 4, 0, 0 }, { 0, 12, 0 } } }, + { { { 4, 0, 1 }, { 0, 13, 1 } } }, + { { { 4, 0, 2 }, { 0, 13, 0 } } }, + { { { 4, 0, 3 }, { 0, 13, 1 } } }, + { { { 4, 0, 4 }, { 0, 14, 1 } } }, + { { { 5, 0, 3 }, { 0, 14, 0 } } }, + { { { 5, 0, 2 }, { 2, 11, 1 } } }, + { { { 5, 0, 1 }, { 2, 11, 0 } } }, + { { { 5, 0, 0 }, { 0, 15, 0 } } }, + { { { 5, 0, 1 }, { 1, 14, 1 } } }, + { { { 5, 0, 2 }, { 1, 14, 0 } } }, + { { { 5, 0, 3 }, { 0, 16, 0 } } }, + { { { 5, 0, 4 }, { 0, 17, 1 } } }, + { { { 6, 0, 3 }, { 0, 17, 0 } } }, + { { { 6, 0, 2 }, { 0, 17, 1 } } }, + { { { 6, 0, 1 }, { 0, 18, 1 } } }, + { { { 6, 0, 0 }, { 0, 18, 0 } } }, + { { { 6, 0, 1 }, { 2, 15, 1 } } }, + { { { 6, 0, 2 }, { 2, 15, 0 } } }, + { { { 6, 0, 3 }, { 0, 19, 0 } } }, + { { { 6, 0, 4 }, { 1, 18, 1 } } }, + { { { 7, 
0, 3 }, { 1, 18, 0 } } }, + { { { 7, 0, 2 }, { 0, 20, 0 } } }, + { { { 7, 0, 1 }, { 0, 21, 1 } } }, + { { { 7, 0, 0 }, { 0, 21, 0 } } }, + { { { 7, 0, 1 }, { 0, 21, 1 } } }, + { { { 7, 0, 2 }, { 0, 22, 1 } } }, + { { { 7, 0, 3 }, { 0, 22, 0 } } }, + { { { 7, 0, 4 }, { 2, 19, 1 } } }, + { { { 8, 0, 4 }, { 2, 19, 0 } } }, + { { { 8, 0, 3 }, { 0, 23, 0 } } }, + { { { 8, 0, 2 }, { 1, 22, 1 } } }, + { { { 8, 0, 1 }, { 1, 22, 0 } } }, + { { { 8, 0, 0 }, { 0, 24, 0 } } }, + { { { 8, 0, 1 }, { 0, 25, 1 } } }, + { { { 8, 0, 2 }, { 0, 25, 0 } } }, + { { { 8, 0, 3 }, { 0, 25, 1 } } }, + { { { 8, 0, 4 }, { 0, 26, 1 } } }, + { { { 9, 0, 3 }, { 0, 26, 0 } } }, + { { { 9, 0, 2 }, { 2, 23, 1 } } }, + { { { 9, 0, 1 }, { 2, 23, 0 } } }, + { { { 9, 0, 0 }, { 0, 27, 0 } } }, + { { { 9, 0, 1 }, { 1, 26, 1 } } }, + { { { 9, 0, 2 }, { 1, 26, 0 } } }, + { { { 9, 0, 3 }, { 0, 28, 0 } } }, + { { { 9, 0, 4 }, { 0, 29, 1 } } }, + { { { 10, 0, 3 }, { 0, 29, 0 } } }, + { { { 10, 0, 2 }, { 0, 29, 1 } } }, + { { { 10, 0, 1 }, { 0, 30, 1 } } }, + { { { 10, 0, 0 }, { 0, 30, 0 } } }, + { { { 10, 0, 1 }, { 2, 27, 1 } } }, + { { { 10, 0, 2 }, { 2, 27, 0 } } }, + { { { 10, 0, 3 }, { 0, 31, 0 } } }, + { { { 10, 0, 4 }, { 1, 30, 1 } } }, + { { { 11, 0, 3 }, { 1, 30, 0 } } }, + { { { 11, 0, 2 }, { 4, 24, 0 } } }, + { { { 11, 0, 1 }, { 1, 31, 1 } } }, + { { { 11, 0, 0 }, { 1, 31, 0 } } }, + { { { 11, 0, 1 }, { 1, 31, 1 } } }, + { { { 11, 0, 2 }, { 2, 30, 1 } } }, + { { { 11, 0, 3 }, { 2, 30, 0 } } }, + { { { 11, 0, 4 }, { 2, 31, 1 } } }, + { { { 12, 0, 4 }, { 2, 31, 0 } } }, + { { { 12, 0, 3 }, { 4, 27, 0 } } }, + { { { 12, 0, 2 }, { 3, 30, 1 } } }, + { { { 12, 0, 1 }, { 3, 30, 0 } } }, + { { { 12, 0, 0 }, { 4, 28, 0 } } }, + { { { 12, 0, 1 }, { 3, 31, 1 } } }, + { { { 12, 0, 2 }, { 3, 31, 0 } } }, + { { { 12, 0, 3 }, { 3, 31, 1 } } }, + { { { 12, 0, 4 }, { 4, 30, 1 } } }, + { { { 13, 0, 3 }, { 4, 30, 0 } } }, + { { { 13, 0, 2 }, { 6, 27, 1 } } }, + { { { 13, 0, 1 }, { 6, 27, 0 } } }, + { { { 13, 0, 0 }, { 
4, 31, 0 } } }, + { { { 13, 0, 1 }, { 5, 30, 1 } } }, + { { { 13, 0, 2 }, { 5, 30, 0 } } }, + { { { 13, 0, 3 }, { 8, 24, 0 } } }, + { { { 13, 0, 4 }, { 5, 31, 1 } } }, + { { { 14, 0, 3 }, { 5, 31, 0 } } }, + { { { 14, 0, 2 }, { 5, 31, 1 } } }, + { { { 14, 0, 1 }, { 6, 30, 1 } } }, + { { { 14, 0, 0 }, { 6, 30, 0 } } }, + { { { 14, 0, 1 }, { 6, 31, 1 } } }, + { { { 14, 0, 2 }, { 6, 31, 0 } } }, + { { { 14, 0, 3 }, { 8, 27, 0 } } }, + { { { 14, 0, 4 }, { 7, 30, 1 } } }, + { { { 15, 0, 3 }, { 7, 30, 0 } } }, + { { { 15, 0, 2 }, { 8, 28, 0 } } }, + { { { 15, 0, 1 }, { 7, 31, 1 } } }, + { { { 15, 0, 0 }, { 7, 31, 0 } } }, + { { { 15, 0, 1 }, { 7, 31, 1 } } }, + { { { 15, 0, 2 }, { 8, 30, 1 } } }, + { { { 15, 0, 3 }, { 8, 30, 0 } } }, + { { { 15, 0, 4 }, { 10, 27, 1 } } }, + { { { 16, 0, 4 }, { 10, 27, 0 } } }, + { { { 16, 0, 3 }, { 8, 31, 0 } } }, + { { { 16, 0, 2 }, { 9, 30, 1 } } }, + { { { 16, 0, 1 }, { 9, 30, 0 } } }, + { { { 16, 0, 0 }, { 12, 24, 0 } } }, + { { { 16, 0, 1 }, { 9, 31, 1 } } }, + { { { 16, 0, 2 }, { 9, 31, 0 } } }, + { { { 16, 0, 3 }, { 9, 31, 1 } } }, + { { { 16, 0, 4 }, { 10, 30, 1 } } }, + { { { 17, 0, 3 }, { 10, 30, 0 } } }, + { { { 17, 0, 2 }, { 10, 31, 1 } } }, + { { { 17, 0, 1 }, { 10, 31, 0 } } }, + { { { 17, 0, 0 }, { 12, 27, 0 } } }, + { { { 17, 0, 1 }, { 11, 30, 1 } } }, + { { { 17, 0, 2 }, { 11, 30, 0 } } }, + { { { 17, 0, 3 }, { 12, 28, 0 } } }, + { { { 17, 0, 4 }, { 11, 31, 1 } } }, + { { { 18, 0, 3 }, { 11, 31, 0 } } }, + { { { 18, 0, 2 }, { 11, 31, 1 } } }, + { { { 18, 0, 1 }, { 12, 30, 1 } } }, + { { { 18, 0, 0 }, { 12, 30, 0 } } }, + { { { 18, 0, 1 }, { 14, 27, 1 } } }, + { { { 18, 0, 2 }, { 14, 27, 0 } } }, + { { { 18, 0, 3 }, { 12, 31, 0 } } }, + { { { 18, 0, 4 }, { 13, 30, 1 } } }, + { { { 19, 0, 3 }, { 13, 30, 0 } } }, + { { { 19, 0, 2 }, { 16, 24, 0 } } }, + { { { 19, 0, 1 }, { 13, 31, 1 } } }, + { { { 19, 0, 0 }, { 13, 31, 0 } } }, + { { { 19, 0, 1 }, { 13, 31, 1 } } }, + { { { 19, 0, 2 }, { 14, 30, 1 } } }, + { { { 19, 0, 3 }, 
{ 14, 30, 0 } } }, + { { { 19, 0, 4 }, { 14, 31, 1 } } }, + { { { 20, 0, 4 }, { 14, 31, 0 } } }, + { { { 20, 0, 3 }, { 16, 27, 0 } } }, + { { { 20, 0, 2 }, { 15, 30, 1 } } }, + { { { 20, 0, 1 }, { 15, 30, 0 } } }, + { { { 20, 0, 0 }, { 16, 28, 0 } } }, + { { { 20, 0, 1 }, { 15, 31, 1 } } }, + { { { 20, 0, 2 }, { 15, 31, 0 } } }, + { { { 20, 0, 3 }, { 15, 31, 1 } } }, + { { { 20, 0, 4 }, { 16, 30, 1 } } }, + { { { 21, 0, 3 }, { 16, 30, 0 } } }, + { { { 21, 0, 2 }, { 18, 27, 1 } } }, + { { { 21, 0, 1 }, { 18, 27, 0 } } }, + { { { 21, 0, 0 }, { 16, 31, 0 } } }, + { { { 21, 0, 1 }, { 17, 30, 1 } } }, + { { { 21, 0, 2 }, { 17, 30, 0 } } }, + { { { 21, 0, 3 }, { 20, 24, 0 } } }, + { { { 21, 0, 4 }, { 17, 31, 1 } } }, + { { { 22, 0, 3 }, { 17, 31, 0 } } }, + { { { 22, 0, 2 }, { 17, 31, 1 } } }, + { { { 22, 0, 1 }, { 18, 30, 1 } } }, + { { { 22, 0, 0 }, { 18, 30, 0 } } }, + { { { 22, 0, 1 }, { 18, 31, 1 } } }, + { { { 22, 0, 2 }, { 18, 31, 0 } } }, + { { { 22, 0, 3 }, { 20, 27, 0 } } }, + { { { 22, 0, 4 }, { 19, 30, 1 } } }, + { { { 23, 0, 3 }, { 19, 30, 0 } } }, + { { { 23, 0, 2 }, { 20, 28, 0 } } }, + { { { 23, 0, 1 }, { 19, 31, 1 } } }, + { { { 23, 0, 0 }, { 19, 31, 0 } } }, + { { { 23, 0, 1 }, { 19, 31, 1 } } }, + { { { 23, 0, 2 }, { 20, 30, 1 } } }, + { { { 23, 0, 3 }, { 20, 30, 0 } } }, + { { { 23, 0, 4 }, { 22, 27, 1 } } }, + { { { 24, 0, 4 }, { 22, 27, 0 } } }, + { { { 24, 0, 3 }, { 20, 31, 0 } } }, + { { { 24, 0, 2 }, { 21, 30, 1 } } }, + { { { 24, 0, 1 }, { 21, 30, 0 } } }, + { { { 24, 0, 0 }, { 24, 24, 0 } } }, + { { { 24, 0, 1 }, { 21, 31, 1 } } }, + { { { 24, 0, 2 }, { 21, 31, 0 } } }, + { { { 24, 0, 3 }, { 21, 31, 1 } } }, + { { { 24, 0, 4 }, { 22, 30, 1 } } }, + { { { 25, 0, 3 }, { 22, 30, 0 } } }, + { { { 25, 0, 2 }, { 22, 31, 1 } } }, + { { { 25, 0, 1 }, { 22, 31, 0 } } }, + { { { 25, 0, 0 }, { 24, 27, 0 } } }, + { { { 25, 0, 1 }, { 23, 30, 1 } } }, + { { { 25, 0, 2 }, { 23, 30, 0 } } }, + { { { 25, 0, 3 }, { 24, 28, 0 } } }, + { { { 25, 0, 4 }, { 23, 31, 
1 } } }, + { { { 26, 0, 3 }, { 23, 31, 0 } } }, + { { { 26, 0, 2 }, { 23, 31, 1 } } }, + { { { 26, 0, 1 }, { 24, 30, 1 } } }, + { { { 26, 0, 0 }, { 24, 30, 0 } } }, + { { { 26, 0, 1 }, { 26, 27, 1 } } }, + { { { 26, 0, 2 }, { 26, 27, 0 } } }, + { { { 26, 0, 3 }, { 24, 31, 0 } } }, + { { { 26, 0, 4 }, { 25, 30, 1 } } }, + { { { 27, 0, 3 }, { 25, 30, 0 } } }, + { { { 27, 0, 2 }, { 28, 24, 0 } } }, + { { { 27, 0, 1 }, { 25, 31, 1 } } }, + { { { 27, 0, 0 }, { 25, 31, 0 } } }, + { { { 27, 0, 1 }, { 25, 31, 1 } } }, + { { { 27, 0, 2 }, { 26, 30, 1 } } }, + { { { 27, 0, 3 }, { 26, 30, 0 } } }, + { { { 27, 0, 4 }, { 26, 31, 1 } } }, + { { { 28, 0, 4 }, { 26, 31, 0 } } }, + { { { 28, 0, 3 }, { 28, 27, 0 } } }, + { { { 28, 0, 2 }, { 27, 30, 1 } } }, + { { { 28, 0, 1 }, { 27, 30, 0 } } }, + { { { 28, 0, 0 }, { 28, 28, 0 } } }, + { { { 28, 0, 1 }, { 27, 31, 1 } } }, + { { { 28, 0, 2 }, { 27, 31, 0 } } }, + { { { 28, 0, 3 }, { 27, 31, 1 } } }, + { { { 28, 0, 4 }, { 28, 30, 1 } } }, + { { { 29, 0, 3 }, { 28, 30, 0 } } }, + { { { 29, 0, 2 }, { 30, 27, 1 } } }, + { { { 29, 0, 1 }, { 30, 27, 0 } } }, + { { { 29, 0, 0 }, { 28, 31, 0 } } }, + { { { 29, 0, 1 }, { 29, 30, 1 } } }, + { { { 29, 0, 2 }, { 29, 30, 0 } } }, + { { { 29, 0, 3 }, { 29, 30, 1 } } }, + { { { 29, 0, 4 }, { 29, 31, 1 } } }, + { { { 30, 0, 3 }, { 29, 31, 0 } } }, + { { { 30, 0, 2 }, { 29, 31, 1 } } }, + { { { 30, 0, 1 }, { 30, 30, 1 } } }, + { { { 30, 0, 0 }, { 30, 30, 0 } } }, + { { { 30, 0, 1 }, { 30, 31, 1 } } }, + { { { 30, 0, 2 }, { 30, 31, 0 } } }, + { { { 30, 0, 3 }, { 30, 31, 1 } } }, + { { { 30, 0, 4 }, { 31, 30, 1 } } }, + { { { 31, 0, 3 }, { 31, 30, 0 } } }, + { { { 31, 0, 2 }, { 31, 30, 1 } } }, + { { { 31, 0, 1 }, { 31, 31, 1 } } }, + { { { 31, 0, 0 }, { 31, 31, 0 } } } +}; + +static SingleColourLookup const lookup_6_4[] = +{ + { { { 0, 0, 0 }, { 0, 0, 0 } } }, + { { { 0, 0, 1 }, { 0, 1, 0 } } }, + { { { 0, 0, 2 }, { 0, 2, 0 } } }, + { { { 1, 0, 1 }, { 0, 3, 1 } } }, + { { { 1, 0, 0 }, { 0, 3, 0 } } }, 
+ { { { 1, 0, 1 }, { 0, 4, 0 } } }, + { { { 1, 0, 2 }, { 0, 5, 0 } } }, + { { { 2, 0, 1 }, { 0, 6, 1 } } }, + { { { 2, 0, 0 }, { 0, 6, 0 } } }, + { { { 2, 0, 1 }, { 0, 7, 0 } } }, + { { { 2, 0, 2 }, { 0, 8, 0 } } }, + { { { 3, 0, 1 }, { 0, 9, 1 } } }, + { { { 3, 0, 0 }, { 0, 9, 0 } } }, + { { { 3, 0, 1 }, { 0, 10, 0 } } }, + { { { 3, 0, 2 }, { 0, 11, 0 } } }, + { { { 4, 0, 1 }, { 0, 12, 1 } } }, + { { { 4, 0, 0 }, { 0, 12, 0 } } }, + { { { 4, 0, 1 }, { 0, 13, 0 } } }, + { { { 4, 0, 2 }, { 0, 14, 0 } } }, + { { { 5, 0, 1 }, { 0, 15, 1 } } }, + { { { 5, 0, 0 }, { 0, 15, 0 } } }, + { { { 5, 0, 1 }, { 0, 16, 0 } } }, + { { { 5, 0, 2 }, { 1, 15, 0 } } }, + { { { 6, 0, 1 }, { 0, 17, 0 } } }, + { { { 6, 0, 0 }, { 0, 18, 0 } } }, + { { { 6, 0, 1 }, { 0, 19, 0 } } }, + { { { 6, 0, 2 }, { 3, 14, 0 } } }, + { { { 7, 0, 1 }, { 0, 20, 0 } } }, + { { { 7, 0, 0 }, { 0, 21, 0 } } }, + { { { 7, 0, 1 }, { 0, 22, 0 } } }, + { { { 7, 0, 2 }, { 4, 15, 0 } } }, + { { { 8, 0, 1 }, { 0, 23, 0 } } }, + { { { 8, 0, 0 }, { 0, 24, 0 } } }, + { { { 8, 0, 1 }, { 0, 25, 0 } } }, + { { { 8, 0, 2 }, { 6, 14, 0 } } }, + { { { 9, 0, 1 }, { 0, 26, 0 } } }, + { { { 9, 0, 0 }, { 0, 27, 0 } } }, + { { { 9, 0, 1 }, { 0, 28, 0 } } }, + { { { 9, 0, 2 }, { 7, 15, 0 } } }, + { { { 10, 0, 1 }, { 0, 29, 0 } } }, + { { { 10, 0, 0 }, { 0, 30, 0 } } }, + { { { 10, 0, 1 }, { 0, 31, 0 } } }, + { { { 10, 0, 2 }, { 9, 14, 0 } } }, + { { { 11, 0, 1 }, { 0, 32, 0 } } }, + { { { 11, 0, 0 }, { 0, 33, 0 } } }, + { { { 11, 0, 1 }, { 2, 30, 0 } } }, + { { { 11, 0, 2 }, { 0, 34, 0 } } }, + { { { 12, 0, 1 }, { 0, 35, 0 } } }, + { { { 12, 0, 0 }, { 0, 36, 0 } } }, + { { { 12, 0, 1 }, { 3, 31, 0 } } }, + { { { 12, 0, 2 }, { 0, 37, 0 } } }, + { { { 13, 0, 1 }, { 0, 38, 0 } } }, + { { { 13, 0, 0 }, { 0, 39, 0 } } }, + { { { 13, 0, 1 }, { 5, 30, 0 } } }, + { { { 13, 0, 2 }, { 0, 40, 0 } } }, + { { { 14, 0, 1 }, { 0, 41, 0 } } }, + { { { 14, 0, 0 }, { 0, 42, 0 } } }, + { { { 14, 0, 1 }, { 6, 31, 0 } } }, + { { { 14, 0, 2 }, { 0, 
43, 0 } } }, + { { { 15, 0, 1 }, { 0, 44, 0 } } }, + { { { 15, 0, 0 }, { 0, 45, 0 } } }, + { { { 15, 0, 1 }, { 8, 30, 0 } } }, + { { { 15, 0, 2 }, { 0, 46, 0 } } }, + { { { 16, 0, 2 }, { 0, 47, 0 } } }, + { { { 16, 0, 1 }, { 1, 46, 0 } } }, + { { { 16, 0, 0 }, { 0, 48, 0 } } }, + { { { 16, 0, 1 }, { 0, 49, 0 } } }, + { { { 16, 0, 2 }, { 0, 50, 0 } } }, + { { { 17, 0, 1 }, { 2, 47, 0 } } }, + { { { 17, 0, 0 }, { 0, 51, 0 } } }, + { { { 17, 0, 1 }, { 0, 52, 0 } } }, + { { { 17, 0, 2 }, { 0, 53, 0 } } }, + { { { 18, 0, 1 }, { 4, 46, 0 } } }, + { { { 18, 0, 0 }, { 0, 54, 0 } } }, + { { { 18, 0, 1 }, { 0, 55, 0 } } }, + { { { 18, 0, 2 }, { 0, 56, 0 } } }, + { { { 19, 0, 1 }, { 5, 47, 0 } } }, + { { { 19, 0, 0 }, { 0, 57, 0 } } }, + { { { 19, 0, 1 }, { 0, 58, 0 } } }, + { { { 19, 0, 2 }, { 0, 59, 0 } } }, + { { { 20, 0, 1 }, { 7, 46, 0 } } }, + { { { 20, 0, 0 }, { 0, 60, 0 } } }, + { { { 20, 0, 1 }, { 0, 61, 0 } } }, + { { { 20, 0, 2 }, { 0, 62, 0 } } }, + { { { 21, 0, 1 }, { 8, 47, 0 } } }, + { { { 21, 0, 0 }, { 0, 63, 0 } } }, + { { { 21, 0, 1 }, { 1, 62, 0 } } }, + { { { 21, 0, 2 }, { 1, 63, 0 } } }, + { { { 22, 0, 1 }, { 10, 46, 0 } } }, + { { { 22, 0, 0 }, { 2, 62, 0 } } }, + { { { 22, 0, 1 }, { 2, 63, 0 } } }, + { { { 22, 0, 2 }, { 3, 62, 0 } } }, + { { { 23, 0, 1 }, { 11, 47, 0 } } }, + { { { 23, 0, 0 }, { 3, 63, 0 } } }, + { { { 23, 0, 1 }, { 4, 62, 0 } } }, + { { { 23, 0, 2 }, { 4, 63, 0 } } }, + { { { 24, 0, 1 }, { 13, 46, 0 } } }, + { { { 24, 0, 0 }, { 5, 62, 0 } } }, + { { { 24, 0, 1 }, { 5, 63, 0 } } }, + { { { 24, 0, 2 }, { 6, 62, 0 } } }, + { { { 25, 0, 1 }, { 14, 47, 0 } } }, + { { { 25, 0, 0 }, { 6, 63, 0 } } }, + { { { 25, 0, 1 }, { 7, 62, 0 } } }, + { { { 25, 0, 2 }, { 7, 63, 0 } } }, + { { { 26, 0, 1 }, { 16, 45, 0 } } }, + { { { 26, 0, 0 }, { 8, 62, 0 } } }, + { { { 26, 0, 1 }, { 8, 63, 0 } } }, + { { { 26, 0, 2 }, { 9, 62, 0 } } }, + { { { 27, 0, 1 }, { 16, 48, 0 } } }, + { { { 27, 0, 0 }, { 9, 63, 0 } } }, + { { { 27, 0, 1 }, { 10, 62, 0 } } }, + { 
{ { 27, 0, 2 }, { 10, 63, 0 } } }, + { { { 28, 0, 1 }, { 16, 51, 0 } } }, + { { { 28, 0, 0 }, { 11, 62, 0 } } }, + { { { 28, 0, 1 }, { 11, 63, 0 } } }, + { { { 28, 0, 2 }, { 12, 62, 0 } } }, + { { { 29, 0, 1 }, { 16, 54, 0 } } }, + { { { 29, 0, 0 }, { 12, 63, 0 } } }, + { { { 29, 0, 1 }, { 13, 62, 0 } } }, + { { { 29, 0, 2 }, { 13, 63, 0 } } }, + { { { 30, 0, 1 }, { 16, 57, 0 } } }, + { { { 30, 0, 0 }, { 14, 62, 0 } } }, + { { { 30, 0, 1 }, { 14, 63, 0 } } }, + { { { 30, 0, 2 }, { 15, 62, 0 } } }, + { { { 31, 0, 1 }, { 16, 60, 0 } } }, + { { { 31, 0, 0 }, { 15, 63, 0 } } }, + { { { 31, 0, 1 }, { 24, 46, 0 } } }, + { { { 31, 0, 2 }, { 16, 62, 0 } } }, + { { { 32, 0, 2 }, { 16, 63, 0 } } }, + { { { 32, 0, 1 }, { 17, 62, 0 } } }, + { { { 32, 0, 0 }, { 25, 47, 0 } } }, + { { { 32, 0, 1 }, { 17, 63, 0 } } }, + { { { 32, 0, 2 }, { 18, 62, 0 } } }, + { { { 33, 0, 1 }, { 18, 63, 0 } } }, + { { { 33, 0, 0 }, { 27, 46, 0 } } }, + { { { 33, 0, 1 }, { 19, 62, 0 } } }, + { { { 33, 0, 2 }, { 19, 63, 0 } } }, + { { { 34, 0, 1 }, { 20, 62, 0 } } }, + { { { 34, 0, 0 }, { 28, 47, 0 } } }, + { { { 34, 0, 1 }, { 20, 63, 0 } } }, + { { { 34, 0, 2 }, { 21, 62, 0 } } }, + { { { 35, 0, 1 }, { 21, 63, 0 } } }, + { { { 35, 0, 0 }, { 30, 46, 0 } } }, + { { { 35, 0, 1 }, { 22, 62, 0 } } }, + { { { 35, 0, 2 }, { 22, 63, 0 } } }, + { { { 36, 0, 1 }, { 23, 62, 0 } } }, + { { { 36, 0, 0 }, { 31, 47, 0 } } }, + { { { 36, 0, 1 }, { 23, 63, 0 } } }, + { { { 36, 0, 2 }, { 24, 62, 0 } } }, + { { { 37, 0, 1 }, { 24, 63, 0 } } }, + { { { 37, 0, 0 }, { 32, 47, 0 } } }, + { { { 37, 0, 1 }, { 25, 62, 0 } } }, + { { { 37, 0, 2 }, { 25, 63, 0 } } }, + { { { 38, 0, 1 }, { 26, 62, 0 } } }, + { { { 38, 0, 0 }, { 32, 50, 0 } } }, + { { { 38, 0, 1 }, { 26, 63, 0 } } }, + { { { 38, 0, 2 }, { 27, 62, 0 } } }, + { { { 39, 0, 1 }, { 27, 63, 0 } } }, + { { { 39, 0, 0 }, { 32, 53, 0 } } }, + { { { 39, 0, 1 }, { 28, 62, 0 } } }, + { { { 39, 0, 2 }, { 28, 63, 0 } } }, + { { { 40, 0, 1 }, { 29, 62, 0 } } }, + { { { 40, 0, 
0 }, { 32, 56, 0 } } }, + { { { 40, 0, 1 }, { 29, 63, 0 } } }, + { { { 40, 0, 2 }, { 30, 62, 0 } } }, + { { { 41, 0, 1 }, { 30, 63, 0 } } }, + { { { 41, 0, 0 }, { 32, 59, 0 } } }, + { { { 41, 0, 1 }, { 31, 62, 0 } } }, + { { { 41, 0, 2 }, { 31, 63, 0 } } }, + { { { 42, 0, 1 }, { 32, 61, 0 } } }, + { { { 42, 0, 0 }, { 32, 62, 0 } } }, + { { { 42, 0, 1 }, { 32, 63, 0 } } }, + { { { 42, 0, 2 }, { 41, 46, 0 } } }, + { { { 43, 0, 1 }, { 33, 62, 0 } } }, + { { { 43, 0, 0 }, { 33, 63, 0 } } }, + { { { 43, 0, 1 }, { 34, 62, 0 } } }, + { { { 43, 0, 2 }, { 42, 47, 0 } } }, + { { { 44, 0, 1 }, { 34, 63, 0 } } }, + { { { 44, 0, 0 }, { 35, 62, 0 } } }, + { { { 44, 0, 1 }, { 35, 63, 0 } } }, + { { { 44, 0, 2 }, { 44, 46, 0 } } }, + { { { 45, 0, 1 }, { 36, 62, 0 } } }, + { { { 45, 0, 0 }, { 36, 63, 0 } } }, + { { { 45, 0, 1 }, { 37, 62, 0 } } }, + { { { 45, 0, 2 }, { 45, 47, 0 } } }, + { { { 46, 0, 1 }, { 37, 63, 0 } } }, + { { { 46, 0, 0 }, { 38, 62, 0 } } }, + { { { 46, 0, 1 }, { 38, 63, 0 } } }, + { { { 46, 0, 2 }, { 47, 46, 0 } } }, + { { { 47, 0, 1 }, { 39, 62, 0 } } }, + { { { 47, 0, 0 }, { 39, 63, 0 } } }, + { { { 47, 0, 1 }, { 40, 62, 0 } } }, + { { { 47, 0, 2 }, { 48, 46, 0 } } }, + { { { 48, 0, 2 }, { 40, 63, 0 } } }, + { { { 48, 0, 1 }, { 41, 62, 0 } } }, + { { { 48, 0, 0 }, { 41, 63, 0 } } }, + { { { 48, 0, 1 }, { 48, 49, 0 } } }, + { { { 48, 0, 2 }, { 42, 62, 0 } } }, + { { { 49, 0, 1 }, { 42, 63, 0 } } }, + { { { 49, 0, 0 }, { 43, 62, 0 } } }, + { { { 49, 0, 1 }, { 48, 52, 0 } } }, + { { { 49, 0, 2 }, { 43, 63, 0 } } }, + { { { 50, 0, 1 }, { 44, 62, 0 } } }, + { { { 50, 0, 0 }, { 44, 63, 0 } } }, + { { { 50, 0, 1 }, { 48, 55, 0 } } }, + { { { 50, 0, 2 }, { 45, 62, 0 } } }, + { { { 51, 0, 1 }, { 45, 63, 0 } } }, + { { { 51, 0, 0 }, { 46, 62, 0 } } }, + { { { 51, 0, 1 }, { 48, 58, 0 } } }, + { { { 51, 0, 2 }, { 46, 63, 0 } } }, + { { { 52, 0, 1 }, { 47, 62, 0 } } }, + { { { 52, 0, 0 }, { 47, 63, 0 } } }, + { { { 52, 0, 1 }, { 48, 61, 0 } } }, + { { { 52, 0, 2 }, { 48, 
62, 0 } } }, + { { { 53, 0, 1 }, { 56, 47, 0 } } }, + { { { 53, 0, 0 }, { 48, 63, 0 } } }, + { { { 53, 0, 1 }, { 49, 62, 0 } } }, + { { { 53, 0, 2 }, { 49, 63, 0 } } }, + { { { 54, 0, 1 }, { 58, 46, 0 } } }, + { { { 54, 0, 0 }, { 50, 62, 0 } } }, + { { { 54, 0, 1 }, { 50, 63, 0 } } }, + { { { 54, 0, 2 }, { 51, 62, 0 } } }, + { { { 55, 0, 1 }, { 59, 47, 0 } } }, + { { { 55, 0, 0 }, { 51, 63, 0 } } }, + { { { 55, 0, 1 }, { 52, 62, 0 } } }, + { { { 55, 0, 2 }, { 52, 63, 0 } } }, + { { { 56, 0, 1 }, { 61, 46, 0 } } }, + { { { 56, 0, 0 }, { 53, 62, 0 } } }, + { { { 56, 0, 1 }, { 53, 63, 0 } } }, + { { { 56, 0, 2 }, { 54, 62, 0 } } }, + { { { 57, 0, 1 }, { 62, 47, 0 } } }, + { { { 57, 0, 0 }, { 54, 63, 0 } } }, + { { { 57, 0, 1 }, { 55, 62, 0 } } }, + { { { 57, 0, 2 }, { 55, 63, 0 } } }, + { { { 58, 0, 1 }, { 56, 62, 1 } } }, + { { { 58, 0, 0 }, { 56, 62, 0 } } }, + { { { 58, 0, 1 }, { 56, 63, 0 } } }, + { { { 58, 0, 2 }, { 57, 62, 0 } } }, + { { { 59, 0, 1 }, { 57, 63, 1 } } }, + { { { 59, 0, 0 }, { 57, 63, 0 } } }, + { { { 59, 0, 1 }, { 58, 62, 0 } } }, + { { { 59, 0, 2 }, { 58, 63, 0 } } }, + { { { 60, 0, 1 }, { 59, 62, 1 } } }, + { { { 60, 0, 0 }, { 59, 62, 0 } } }, + { { { 60, 0, 1 }, { 59, 63, 0 } } }, + { { { 60, 0, 2 }, { 60, 62, 0 } } }, + { { { 61, 0, 1 }, { 60, 63, 1 } } }, + { { { 61, 0, 0 }, { 60, 63, 0 } } }, + { { { 61, 0, 1 }, { 61, 62, 0 } } }, + { { { 61, 0, 2 }, { 61, 63, 0 } } }, + { { { 62, 0, 1 }, { 62, 62, 1 } } }, + { { { 62, 0, 0 }, { 62, 62, 0 } } }, + { { { 62, 0, 1 }, { 62, 63, 0 } } }, + { { { 62, 0, 2 }, { 63, 62, 0 } } }, + { { { 63, 0, 1 }, { 63, 63, 1 } } }, + { { { 63, 0, 0 }, { 63, 63, 0 } } } +}; diff --git a/extern/libsquish-1.15/squish.cpp b/extern/libsquish-1.15/squish.cpp new file mode 100644 index 0000000..1d22a64 --- /dev/null +++ b/extern/libsquish-1.15/squish.cpp @@ -0,0 +1,403 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission 
is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#include +#include "squish.h" +#include "colourset.h" +#include "maths.h" +#include "rangefit.h" +#include "clusterfit.h" +#include "colourblock.h" +#include "alpha.h" +#include "singlecolourfit.h" + +namespace squish { + +static int FixFlags( int flags ) +{ + // grab the flag bits + int method = flags & ( kDxt1 | kDxt3 | kDxt5 | kBc4 | kBc5 ); + int fit = flags & ( kColourIterativeClusterFit | kColourClusterFit | kColourRangeFit ); + int extra = flags & kWeightColourByAlpha; + + // set defaults + if ( method != kDxt3 + && method != kDxt5 + && method != kBc4 + && method != kBc5 ) + { + method = kDxt1; + } + if( fit != kColourRangeFit && fit != kColourIterativeClusterFit ) + fit = kColourClusterFit; + + // done + return method | fit | extra; +} + +void CompressMasked( u8 const* rgba, int mask, void* block, int flags, float* metric ) +{ + // fix any bad flags + flags = FixFlags( flags ); + + if ( ( flags & ( kBc4 | kBc5 ) ) != 0 ) + { + u8 alpha[16*4]; + for( int i = 0; i < 16; ++i ) + { + alpha[i*4 + 3] = rgba[i*4 + 0]; // copy R to A + } + + u8* rBlock = reinterpret_cast< u8* >( block ); + CompressAlphaDxt5( alpha, mask, rBlock ); + + if ( ( flags & ( kBc5 ) ) != 0 ) + { + for( int i = 0; i < 16; ++i ) + { + alpha[i*4 + 3] = rgba[i*4 + 1]; // copy G to A + } + + u8* gBlock = reinterpret_cast< u8* >( block ) + 8; + CompressAlphaDxt5( alpha, mask, gBlock ); + } + + return; + } + + // get the block locations + void* colourBlock = block; + void* alphaBlock = block; + if( ( flags & ( kDxt3 | kDxt5 ) ) != 0 ) + colourBlock = reinterpret_cast< u8* >( block ) + 8; + + // create the minimal point set + ColourSet colours( rgba, mask, flags ); + + // check the compression type and compress colour + if( colours.GetCount() == 1 ) + { + // always do a single colour fit + SingleColourFit fit( &colours, flags ); + fit.Compress( colourBlock ); + } + else if( ( flags & kColourRangeFit ) != 0 || 
colours.GetCount() == 0 ) + { + // do a range fit + RangeFit fit( &colours, flags, metric ); + fit.Compress( colourBlock ); + } + else + { + // default to a cluster fit (could be iterative or not) + ClusterFit fit( &colours, flags, metric ); + fit.Compress( colourBlock ); + } + + // compress alpha separately if necessary + if( ( flags & kDxt3 ) != 0 ) + CompressAlphaDxt3( rgba, mask, alphaBlock ); + else if( ( flags & kDxt5 ) != 0 ) + CompressAlphaDxt5( rgba, mask, alphaBlock ); +} + +void Decompress( u8* rgba, void const* block, int flags ) +{ + // fix any bad flags + flags = FixFlags( flags ); + + // get the block locations + void const* colourBlock = block; + void const* alphaBlock = block; + if( ( flags & ( kDxt3 | kDxt5 ) ) != 0 ) + colourBlock = reinterpret_cast< u8 const* >( block ) + 8; + + // decompress colour + DecompressColour( rgba, colourBlock, ( flags & kDxt1 ) != 0 ); + + // decompress alpha separately if necessary + if( ( flags & kDxt3 ) != 0 ) + DecompressAlphaDxt3( rgba, alphaBlock ); + else if( ( flags & kDxt5 ) != 0 ) + DecompressAlphaDxt5( rgba, alphaBlock ); +} + +int GetStorageRequirements( int width, int height, int flags ) +{ + // fix any bad flags + flags = FixFlags( flags ); + + // compute the storage requirements + int blockcount = ( ( width + 3 )/4 ) * ( ( height + 3 )/4 ); + int blocksize = ( ( flags & ( kDxt1 | kBc4 ) ) != 0 ) ? 
8 : 16; + return blockcount*blocksize; +} + +void CopyRGBA( u8 const* source, u8* dest, int flags ) +{ + if (flags & kSourceBGRA) + { + // convert from bgra to rgba + dest[0] = source[2]; + dest[1] = source[1]; + dest[2] = source[0]; + dest[3] = source[3]; + } + else + { + for( int i = 0; i < 4; ++i ) + *dest++ = *source++; + } +} + +void CompressImage( u8 const* rgba, int width, int height, int pitch, void* blocks, int flags, float* metric ) +{ + // fix any bad flags + flags = FixFlags( flags ); + + // loop over blocks +#ifdef SQUISH_USE_OPENMP +# pragma omp parallel for +#endif + for( int y = 0; y < height; y += 4 ) + { + // initialise the block output + u8* targetBlock = reinterpret_cast< u8* >( blocks ); + int bytesPerBlock = ( ( flags & ( kDxt1 | kBc4 ) ) != 0 ) ? 8 : 16; + targetBlock += ( (y / 4) * ( (width + 3) / 4) ) * bytesPerBlock; + + for( int x = 0; x < width; x += 4 ) + { + // build the 4x4 block of pixels + u8 sourceRgba[16*4]; + u8* targetPixel = sourceRgba; + int mask = 0; + for( int py = 0; py < 4; ++py ) + { + for( int px = 0; px < 4; ++px ) + { + // get the source pixel in the image + int sx = x + px; + int sy = y + py; + + // enable if we're in the image + if( sx < width && sy < height ) + { + // copy the rgba value + u8 const* sourcePixel = rgba + pitch*sy + 4*sx; + CopyRGBA(sourcePixel, targetPixel, flags); + // enable this pixel + mask |= ( 1 << ( 4*py + px ) ); + } + + // advance to the next pixel + targetPixel += 4; + } + } + + // compress it into the output + CompressMasked( sourceRgba, mask, targetBlock, flags, metric ); + + // advance + targetBlock += bytesPerBlock; + } + } +} + +void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags, float* metric ) +{ + CompressImage(rgba, width, height, width*4, blocks, flags, metric); +} + +void DecompressImage( u8* rgba, int width, int height, int pitch, void const* blocks, int flags ) +{ + // fix any bad flags + flags = FixFlags( flags ); + + // loop over blocks +#ifdef 
SQUISH_USE_OPENMP +# pragma omp parallel for +#endif + for( int y = 0; y < height; y += 4 ) + { + // initialise the block input + u8 const* sourceBlock = reinterpret_cast< u8 const* >( blocks ); + int bytesPerBlock = ( ( flags & ( kDxt1 | kBc4 ) ) != 0 ) ? 8 : 16; + sourceBlock += ( (y / 4) * ( (width + 3) / 4) ) * bytesPerBlock; + + for( int x = 0; x < width; x += 4 ) + { + // decompress the block + u8 targetRgba[4*16]; + Decompress( targetRgba, sourceBlock, flags ); + + // write the decompressed pixels to the correct image locations + u8 const* sourcePixel = targetRgba; + for( int py = 0; py < 4; ++py ) + { + for( int px = 0; px < 4; ++px ) + { + // get the target location + int sx = x + px; + int sy = y + py; + + // write if we're in the image + if( sx < width && sy < height ) + { + // copy the rgba value + u8* targetPixel = rgba + pitch*sy + 4*sx; + CopyRGBA(sourcePixel, targetPixel, flags); + } + + // advance to the next pixel + sourcePixel += 4; + } + } + + // advance + sourceBlock += bytesPerBlock; + } + } +} + +void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags ) +{ + DecompressImage( rgba, width, height, width*4, blocks, flags ); +} + +static double ErrorSq(double x, double y) +{ + return (x - y) * (x - y); +} + +static void ComputeBlockWMSE(u8 const *original, u8 const *compressed, unsigned int w, unsigned int h, double &cmse, double &amse) +{ + // Computes the MSE for the block and weights it by the variance of the original block. + // If the variance of the original block is less than 4 (i.e. a standard deviation of 1 per channel) + // then the block is close to being a single colour. Quantisation errors in single colour blocks + // are easier to see than similar errors in blocks that contain more colours, particularly when there + // are many such blocks in a large area (eg a blue sky background) as they cause banding. 
Given that + // banding is easier to see than small errors in "complex" blocks, we weight the errors by a factor + // of 5. This implies that images with large, single colour areas will have a higher potential WMSE + // than images with lots of detail. + + cmse = amse = 0; + unsigned int sum_p[4]; // per channel sum of pixels + unsigned int sum_p2[4]; // per channel sum of pixels squared + memset(sum_p, 0, sizeof(sum_p)); + memset(sum_p2, 0, sizeof(sum_p2)); + for( unsigned int py = 0; py < 4; ++py ) + { + for( unsigned int px = 0; px < 4; ++px ) + { + if( px < w && py < h ) + { + double pixelCMSE = 0; + for( int i = 0; i < 3; ++i ) + { + pixelCMSE += ErrorSq(original[i], compressed[i]); + sum_p[i] += original[i]; + sum_p2[i] += (unsigned int)original[i]*original[i]; + } + if( original[3] == 0 && compressed[3] == 0 ) + pixelCMSE = 0; // transparent in both, so colour is inconsequential + amse += ErrorSq(original[3], compressed[3]); + cmse += pixelCMSE; + sum_p[3] += original[3]; + sum_p2[3] += (unsigned int)original[3]*original[3]; + } + original += 4; + compressed += 4; + } + } + unsigned int variance = 0; + for( int i = 0; i < 4; ++i ) + variance += w*h*sum_p2[i] - sum_p[i]*sum_p[i]; + if( variance < 4 * w * w * h * h ) + { + amse *= 5; + cmse *= 5; + } +} + +void ComputeMSE( u8 const *rgba, int width, int height, int pitch, u8 const *dxt, int flags, double &colourMSE, double &alphaMSE ) +{ + // fix any bad flags + flags = FixFlags( flags ); + colourMSE = alphaMSE = 0; + + // initialise the block input + squish::u8 const* sourceBlock = dxt; + int bytesPerBlock = ( ( flags & squish::kDxt1 ) != 0 ) ? 
8 : 16; + + // loop over blocks + for( int y = 0; y < height; y += 4 ) + { + for( int x = 0; x < width; x += 4 ) + { + // decompress the block + u8 targetRgba[4*16]; + Decompress( targetRgba, sourceBlock, flags ); + u8 const* sourcePixel = targetRgba; + + // copy across to a similar pixel block + u8 originalRgba[4*16]; + u8* originalPixel = originalRgba; + + for( int py = 0; py < 4; ++py ) + { + for( int px = 0; px < 4; ++px ) + { + int sx = x + px; + int sy = y + py; + if( sx < width && sy < height ) + { + u8 const* targetPixel = rgba + pitch*sy + 4*sx; + CopyRGBA(targetPixel, originalPixel, flags); + } + sourcePixel += 4; + originalPixel += 4; + } + } + + // compute the weighted MSE of the block + double blockCMSE, blockAMSE; + ComputeBlockWMSE(originalRgba, targetRgba, std::min(4, width - x), std::min(4, height - y), blockCMSE, blockAMSE); + colourMSE += blockCMSE; + alphaMSE += blockAMSE; + // advance + sourceBlock += bytesPerBlock; + } + } + colourMSE /= (width * height * 3); + alphaMSE /= (width * height); +} + +void ComputeMSE( u8 const *rgba, int width, int height, u8 const *dxt, int flags, double &colourMSE, double &alphaMSE ) +{ + ComputeMSE(rgba, width, height, width*4, dxt, flags, colourMSE, alphaMSE); +} + +} // namespace squish diff --git a/src/nvtt/tests/CMakeLists.txt b/src/nvtt/tests/CMakeLists.txt index 4a3a6de..0cf1ab4 100644 --- a/src/nvtt/tests/CMakeLists.txt +++ b/src/nvtt/tests/CMakeLists.txt @@ -28,6 +28,9 @@ TARGET_LINK_LIBRARIES(cubemaptest nvcore nvmath nvimage nvtt) ADD_EXECUTABLE(nvhdrtest hdrtest.cpp) TARGET_LINK_LIBRARIES(nvhdrtest nvcore nvimage nvtt bc6h nvmath) +ADD_EXECUTABLE(bc1enc bc1enc.cpp) +TARGET_LINK_LIBRARIES(bc1enc nvcore nvimage nvmath nvtt squish CMP_Core) + INSTALL(TARGETS nvtestsuite nvhdrtest DESTINATION bin) #include_directories("/usr/include/ffmpeg/")