diff --git a/project/vc9/nvimage/nvimage.vcproj b/project/vc9/nvimage/nvimage.vcproj
index 93b3dae..f6a28e7 100644
--- a/project/vc9/nvimage/nvimage.vcproj
+++ b/project/vc9/nvimage/nvimage.vcproj
@@ -311,6 +311,14 @@
 			RelativePath="..\..\..\src\nvimage\DirectDrawSurface.h"
 			>
 		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\ErrorMetric.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\ErrorMetric.h"
+			>
+		</File>
 		<File
 			RelativePath="..\..\..\src\nvimage\Filter.cpp"
 			>
diff --git a/project/vc9/nvtt.sln b/project/vc9/nvtt.sln
index 6d2206b..c186d94 100644
--- a/project/vc9/nvtt.sln
+++ b/project/vc9/nvtt.sln
@@ -25,6 +25,11 @@ EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvmath", "nvmath\nvmath.vcproj", "{50C465FE-B308-42BC-894D-89484482AF06}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "squish", "squish\squish.vcproj", "{CE017322-01FC-4851-9C8B-64E9A8E26C38}"
+	ProjectSection(ProjectDependencies) = postProject
+		{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
+		{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
+		{50C465FE-B308-42BC-894D-89484482AF06} = {50C465FE-B308-42BC-894D-89484482AF06}
+	EndProjectSection
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvddsinfo", "nvddsinfo\nvddsinfo.vcproj", "{841B73C5-C679-4EEF-A50A-7D6106642B49}"
 	ProjectSection(ProjectDependencies) = postProject
diff --git a/project/vc9/nvtt/nvtt.vcproj b/project/vc9/nvtt/nvtt.vcproj
index cab8305..9a8d035 100644
--- a/project/vc9/nvtt/nvtt.vcproj
+++ b/project/vc9/nvtt/nvtt.vcproj
@@ -864,6 +864,14 @@
 				>
 			</File>
 		</Filter>
+		<File
+			RelativePath="..\..\..\src\nvtt\ClusterFit.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\ClusterFit.h"
+			>
+		</File>
 		<File
 			RelativePath="..\..\..\src\nvtt\CompressionOptions.cpp"
 			>
@@ -887,10 +895,138 @@
 		<File
 			RelativePath="..\..\..\src\nvtt\CompressorDX11.cpp"
 			>
+			<FileConfiguration
+				Name="Debug|Win32"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCLCompilerTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Debug|x64"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCLCompilerTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Release|Win32"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCLCompilerTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Release|x64"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCLCompilerTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Debug (no cuda)|Win32"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCLCompilerTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Debug (no cuda)|x64"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCLCompilerTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Release (no cuda)|Win32"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCLCompilerTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Release (no cuda)|x64"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCLCompilerTool"
+				/>
+			</FileConfiguration>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvtt\CompressorDX11.h"
 			>
+			<FileConfiguration
+				Name="Debug|Win32"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCustomBuildTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Debug|x64"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCustomBuildTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Release|Win32"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCustomBuildTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Release|x64"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCustomBuildTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Debug (no cuda)|Win32"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCustomBuildTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Debug (no cuda)|x64"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCustomBuildTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Release (no cuda)|Win32"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCustomBuildTool"
+				/>
+			</FileConfiguration>
+			<FileConfiguration
+				Name="Release (no cuda)|x64"
+				ExcludedFromBuild="true"
+				>
+				<Tool
+					Name="VCCustomBuildTool"
+				/>
+			</FileConfiguration>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvtt\CompressorDX9.cpp"
diff --git a/src/nvimage/ColorBlock.cpp b/src/nvimage/ColorBlock.cpp
index db77083..a829c47 100644
--- a/src/nvimage/ColorBlock.cpp
+++ b/src/nvimage/ColorBlock.cpp
@@ -6,6 +6,7 @@
 #include "nvmath/Box.h"
 #include "nvcore/Utils.h" // swap
 
+#include <string.h> // memcpy
 
 using namespace nv;
 
@@ -457,44 +458,176 @@ float ColorBlock::volume() const
     return bounds.volume();
 }*/
 
+#include "FloatImage.h"
 
-
-void ColorSet::init(const Image * img, uint x, uint y)
+void ColorSet::setColors(const float * data, uint img_w, uint img_h, uint img_x, uint img_y)
 {
-    w = min(4U, img->width() - x);
-    h = min(4U, img->height() - y);
+    nvDebugCheck(img_x < img_w && img_y < img_h);
+    // Clip the 4x4 block against the right/bottom image border.
+    w = min(4U, img_w - img_x);
+    h = min(4U, img_h - img_y);
     nvDebugCheck(w != 0 && h != 0);
 
-    // Blocks that are smaller than 4x4 are handled by repeating the pixels.
-    // @@ Thats only correct when block size is 1, 2 or 4, but not with 3. :(
-    // @@ Ideally we should zero the weights of the pixels out of range.
+    count = w * h;
 
-    for (uint i = 0; i < 4; i++)
+    const float * r = data + img_w * img_h * 0;
+    const float * g = data + img_w * img_h * 1;
+    const float * b = data + img_w * img_h * 2;
+    const float * a = data + img_w * img_h * 3;
+    // Copy the block row by row. Each channel is a separate img_w*img_h plane,
+    // so pixel (px, py) of a plane lives at index py * img_w + px.
+    for (uint y = 0, i = 0; y < h; y++)
     {
-        const uint by = i % h;
+        for (uint x = 0; x < w; x++, i++)
+        {
+            colors[i].x = r[(y + img_y) * img_w + (x + img_x)];
+            colors[i].y = g[(y + img_y) * img_w + (x + img_x)];
+            colors[i].z = b[(y + img_y) * img_w + (x + img_x)];
+            colors[i].w = a[(y + img_y) * img_w + (x + img_x)];
+        }
+    }
+}
 
-        for (uint e = 0; e < 4; e++)
+void ColorSet::setAlphaWeights()
+{
+    // Weight = alpha, floored at 0.001 to avoid division by zero.
+    for (uint k = 0; k != count; ++k) {
+        weights[k] = max(colors[k].w, 0.001f);
+    }
+}
+
+void ColorSet::setUniformWeights()
+{
+    // Give each of the `count` stored colors the same weight.
+    for (float * weight = weights; weight != weights + count; ++weight) {
+        *weight = 1.0f;
+    }
+}
+
+// Merge identical colors into one weighted entry; remap[y*4+x] indexes each pixel's entry.
+void ColorSet::createMinimalSet(bool ignoreTransparent)
+{
+    nvDebugCheck(count == w*h); // Do not call this method multiple times.
+
+    Vector4 C[16];
+    float W[16];
+    memcpy(C, colors, sizeof(Vector4)*count);
+    memcpy(W, weights, sizeof(float)*count);
+
+    uint n = 0;
+    for (uint y = 0, i = 0; y < h; y++)
+    {
+        for (uint x = 0; x < w; x++, i++)
         {
-            const uint bx = e % w;
-            Color32 c = img->pixel(x+bx, y+by);
-            Vector4 & v = color(e, i);
-            v.x = c.r / 255.0f;
-            v.y = c.g / 255.0f;
-            v.z = c.b / 255.0f;
-            v.w = c.a / 255.0f;
+            if (ignoreTransparent && C[i].w == 0) {
+                continue;
+            }
+
+            uint idx = y * 4 + x;
+
+            // loop over previous points for a match
+            for (uint j = 0; ; j++)
+            {
+                // allocate a new point
+                if (j == i)
+                {
+                    colors[n] = C[i];
+                    weights[n] = W[i];
+                    remap[idx] = n;
+                    n++;
+                    break;
+                }
+
+                // check for a match
+                bool colorMatch = (C[i].x == C[j].x) && (C[i].y == C[j].y) && (C[i].z == C[j].z) && (C[i].w == C[j].w);
+                // Alpha must match too, so a skipped transparent pixel (whose remap entry is unset) never matches.
+
+                if (colorMatch)
+                {
+                    // C is packed with stride w, remap is a 4x4 grid: convert j to its grid slot.
+                    int index = remap[(j / w) * 4 + (j % w)];
+
+                    // map to this point and increase the weight
+                    weights[index] += W[i];
+                    remap[idx] = index;
+                    break;
+                }
+            }
         }
     }
+
+    count = n;
+
+    // Avoid empty blocks.
+    if (count == 0) {
+        count = 1;
+        // Nothing was written above, so colors[0] and weights[0] still hold the
+        // first input pixel; only the index table needs to be cleared.
+        memset(remap, 0, sizeof(int)*16);
+    }
 }
 
-void ColorSet::init(const FloatImage * img, uint x, uint y)
+// Fill blocks that are smaller than (4,4) by wrapping indices: every slot of the
+// 4x4 remap grid outside the valid w*h area copies the entry at (x % w, y % h).
+void ColorSet::wrapIndices()
 {
+    for (uint y = 0; y < 4; y++)
+    {
+        uint base = (y % h) * 4; // remap rows have a stride of 4, not w.
+        for (uint x = 0; x < 4; x++)
+        {
+            if (x >= w || y >= h) remap[y*4+x] = remap[base + (x % w)];
+        }
+    }
 }
 
-void ColorSet::init(const uint * data, uint w, uint h, uint x, uint y)
+bool ColorSet::isSingleColor(bool ignoreAlpha) const
 {
+    // Reference color: the first entry, with alpha forced to 1 when it is ignored.
+    Vector4 first = colors[0];
+    if (ignoreAlpha) first.w = 1.0f;
+
+    for (uint k = 1; k < count; ++k)
+    {
+        Vector4 other = colors[k];
+        if (ignoreAlpha) other.w = 1.0f;
+
+        // Bail out on the first entry that differs from the reference.
+        if (first != other) return false;
+    }
+
+    return true;
+}
+
+
+// 0=r, 1=g, 2=b, 3=a, 4=0xFF (1.0 for these normalized float colors), 5=0
+static inline float component(Vector4::Arg c, uint i)
+{
+    if (i == 0) return c.x;
+    if (i == 1) return c.y;
+    if (i == 2) return c.z;
+    if (i == 3) return c.w;
+    if (i == 4) return 1.0f; // channels are floats in [0,1]; a literal 0xFF would yield 255.0
+    return 0;
 }
 
-void ColorSet::init(const float * data, uint w, uint h, uint x, uint y)
+void ColorSet::swizzle(uint x, uint y, uint z, uint w)
 {
+    // Note: the parameter w shadows the block-width member inside this function.
+    for (Vector4 * dst = colors; dst != colors + count; ++dst) {
+        const Vector4 src = *dst; // Snapshot, so every selector reads the old channels.
+        dst->x = component(src, x);
+        dst->y = component(src, y);
+        dst->z = component(src, z);
+        dst->w = component(src, w);
+    }
 }
 
+bool ColorSet::hasAlpha() const
+{
+    // True as soon as any stored color has a non-zero alpha channel.
+    // NOTE(review): an all-opaque set (w == 1) also returns true -- confirm intent.
+    uint k = 0;
+    while (k < count && colors[k].w == 0.0f) ++k;
+    return k < count;
+}
diff --git a/src/nvimage/ColorBlock.h b/src/nvimage/ColorBlock.h
index e87cc9f..572be5a 100644
--- a/src/nvimage/ColorBlock.h
+++ b/src/nvimage/ColorBlock.h
@@ -82,22 +82,33 @@ namespace nv
 
     struct ColorSet
     {
-        ColorSet() : w(4), h(4) {}
-        ColorSet(uint w, uint h) : w(w), h(h) {}
+        void setColors(const float * data, uint img_w, uint img_h, uint img_x, uint img_y);
 
-        void init(const Image * img, uint x, uint y);
-        void init(const FloatImage * img, uint x, uint y);
-        void init(const uint * data, uint w, uint h, uint x, uint y);
-        void init(const float * data, uint w, uint h, uint x, uint y);
+        void setAlphaWeights();
+        void setUniformWeights();
 
-        Vector4 color(uint x, uint y) const { nvDebugCheck(x < w && y < h); return colors[y * 4 + x]; }
-        Vector4 & color(uint x, uint y) { nvDebugCheck(x < w && y < h); return colors[y * 4 + x]; }
+        void createMinimalSet(bool ignoreTransparent);
+        void wrapIndices();
 
-        Vector4 color(uint i) const { nvDebugCheck(i < 16); return colors[i]; }
-        Vector4 & color(uint i) { nvDebugCheck(i < 16); return colors[i]; }
+        void swizzle(uint x, uint y, uint z, uint w); // 0=r, 1=g, 2=b, 3=a, 4=0xFF, 5=0
 
-        Vector4 colors[16];
+        bool isSingleColor(bool ignoreAlpha) const;
+        bool hasAlpha() const;
+
+        // These methods require indices to be set:
+        Vector4 color(uint x, uint y) const { nvDebugCheck(x < w && y < h); return colors[remap[y * 4 + x]]; }
+        Vector4 & color(uint x, uint y) { nvDebugCheck(x < w && y < h); return colors[remap[y * 4 + x]]; }
+
+        Vector4 color(uint i) const { nvDebugCheck(i < 16); return colors[remap[i]]; }
+        Vector4 & color(uint i) { nvDebugCheck(i < 16); return colors[remap[i]]; }
+
+
+        uint count;
         uint w, h;
+
+        Vector4 colors[16];
+        float weights[16];
+        int remap[16];
     };
 
 } // nv namespace
diff --git a/src/nvimage/ErrorMetric.cpp b/src/nvimage/ErrorMetric.cpp
index 0c4741e..c4db41d 100644
--- a/src/nvimage/ErrorMetric.cpp
+++ b/src/nvimage/ErrorMetric.cpp
@@ -146,7 +146,7 @@ static Vector3 xyzToCieLab(Vector3::Arg c)
     // Normalized white point.
     const float Xn = 0.950456f;
     const float Yn = 1.0f;
-    const float Zn = 1.088754;
+    const float Zn = 1.088754f;
 
     float Xr = c.x / Xn;
     float Yr = c.y / Yn;
@@ -159,6 +159,8 @@ static Vector3 xyzToCieLab(Vector3::Arg c)
     float L = 116 * fx - 16;
     float a = 500 * (fx - fy);
     float b = 200 * (fy - fz);
+
+    return Vector3(L, a, b);
 }
 
 static Vector3 rgbToCieLab(Vector3::Arg c)
@@ -222,6 +224,9 @@ float nv::cieLabError(const FloatImage * img0, const FloatImage * img1)
         Vector3 lab1 = rgbToCieLab(Vector3(r1[i], g1[i], b1[i]));
 
         // @@ Measure Delta E.
+        Vector3 delta = lab0 - lab1;
+        
+        error += length(delta);
     }
 
     return float(error / count);
diff --git a/src/nvmath/Fitting.cpp b/src/nvmath/Fitting.cpp
index 00adcb0..de387d5 100644
--- a/src/nvmath/Fitting.cpp
+++ b/src/nvmath/Fitting.cpp
@@ -30,7 +30,7 @@ static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matri
 {
     if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
     {
-        return Vector3(zero);
+        return Vector3(0.0f);
     }
 
     Vector3 v = estimatePrincipleComponent(matrix);
@@ -53,7 +53,7 @@ static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matri
 
 Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points)
 {
-    Vector3 centroid(zero);
+    Vector3 centroid(0.0f);
 
     for (int i = 0; i < n; i++)
     {
@@ -66,7 +66,7 @@ Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points)
 
 Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
 {
-    Vector3 centroid(zero);
+    Vector3 centroid(0.0f   );
     float total = 0.0f;
 
     for (int i = 0; i < n; i++)
@@ -210,7 +210,7 @@ int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float
     // Now we have to iteratively refine the clusters.
     while (true)
     {
-        Vector3 newCluster[4] = { Vector3(zero), Vector3(zero), Vector3(zero), Vector3(zero) };
+        Vector3 newCluster[4] = { Vector3(0.0f), Vector3(0.0f), Vector3(0.0f), Vector3(0.0f) };
         float total[4] = {0, 0, 0, 0};
 
         for (int i = 0; i < n; ++i)
diff --git a/src/nvmath/Matrix.h b/src/nvmath/Matrix.h
index 0607fdf..adbefe1 100644
--- a/src/nvmath/Matrix.h
+++ b/src/nvmath/Matrix.h
@@ -9,6 +9,8 @@
 
 namespace nv
 {
+    enum zero_t { zero };
+    enum identity_t { identity };
 
     class NVMATH_CLASS Matrix3
     {
diff --git a/src/nvmath/SimdVector_SSE.h b/src/nvmath/SimdVector_SSE.h
index 8677322..2b8271b 100644
--- a/src/nvmath/SimdVector_SSE.h
+++ b/src/nvmath/SimdVector_SSE.h
@@ -26,6 +26,8 @@
 #ifndef NV_SIMD_VECTOR_SSE_H
 #define NV_SIMD_VECTOR_SSE_H
 
+#include "nvcore/Memory.h"
+
 #include <xmmintrin.h>
 #if (NV_USE_SSE > 1)
 #include <emmintrin.h>
@@ -35,6 +37,7 @@ namespace nv {
 
     class SimdVector
     {
+    public:
         __m128 vec;
 
         typedef SimdVector const& Arg;
@@ -42,15 +45,13 @@ namespace nv {
         SimdVector() {}
         explicit SimdVector(float f) : vec(_mm_set1_ps(f)) {}
         explicit SimdVector(__m128 v) : vec(v) {}
-        SimdVector(const SimdVector & arg) : vec(arg.vec) {}
 
-        SimdVector & operator=(const SimdVector & arg)
+        explicit SimdVector(Vector4::Arg v)
         {
-            vec = arg.vec;
-            return *this;
+            vec = _mm_loadu_ps( v.component ); // Unaligned load: no 16-byte alignment guarantee is visible for Vector4.
         }
 
-        SimdVector(const float * v)
+        explicit SimdVector(const float * v)
         {
             vec = _mm_load_ps( v );
         }
@@ -60,6 +61,16 @@ namespace nv {
             vec = _mm_setr_ps( x, y, z, w );
         }
 
+        SimdVector(const SimdVector & arg) : vec(arg.vec) {}
+
+        SimdVector & operator=(const SimdVector & arg)
+        {
+            vec = arg.vec;
+            return *this;
+        }
+
+
+
         float toFloat() const 
         {
             NV_ALIGN_16 float f;
@@ -77,7 +88,7 @@ namespace nv {
         Vector4 toVector4() const
         {
             NV_ALIGN_16 float c[4];
-            _mm_store_ps( v.components, vec );
+            _mm_store_ps( c, vec );
             return Vector4( c[0], c[1], c[2], c[3] );
         }
 
@@ -108,34 +119,34 @@ namespace nv {
     };
 
 
-    SimdVector operator+( SimdVector::Arg left, SimdVector::Arg right  )
+    inline SimdVector operator+( SimdVector::Arg left, SimdVector::Arg right  )
     {
         return SimdVector( _mm_add_ps( left.vec, right.vec ) );
     }
 
-    SimdVector operator-( SimdVector::Arg left, SimdVector::Arg right  )
+    inline SimdVector operator-( SimdVector::Arg left, SimdVector::Arg right  )
     {
         return SimdVector( _mm_sub_ps( left.vec, right.vec ) );
     }
 
-    SimdVector operator*( SimdVector::Arg left, SimdVector::Arg right  )
+    inline SimdVector operator*( SimdVector::Arg left, SimdVector::Arg right  )
     {
         return SimdVector( _mm_mul_ps( left.vec, right.vec ) );
     }
 
     // Returns a*b + c
-    SimdVector multiplyAdd( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
+    inline SimdVector multiplyAdd( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
     {
         return SimdVector( _mm_add_ps( _mm_mul_ps( a.vec, b.vec ), c.vec ) );
     }
 
     // Returns -( a*b - c )
-    SimdVector negativeMultiplySubtract( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
+    inline SimdVector negativeMultiplySubtract( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
     {
         return SimdVector( _mm_sub_ps( c.vec, _mm_mul_ps( a.vec, b.vec ) ) );
     }
 
-    SimdVector reciprocal( SimdVector::Arg v )
+    inline SimdVector reciprocal( SimdVector::Arg v )
     {
         // get the reciprocal estimate
         __m128 estimate = _mm_rcp_ps( v.vec );
@@ -145,17 +156,17 @@ namespace nv {
         return SimdVector( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) );
     }
 
-    SimdVector min( SimdVector::Arg left, SimdVector::Arg right )
+    inline SimdVector min( SimdVector::Arg left, SimdVector::Arg right )
     {
         return SimdVector( _mm_min_ps( left.vec, right.vec ) );
     }
 
-    SimdVector max( SimdVector::Arg left, SimdVector::Arg right )
+    inline SimdVector max( SimdVector::Arg left, SimdVector::Arg right )
     {
         return SimdVector( _mm_max_ps( left.vec, right.vec ) );
     }
 
-    SimdVector truncate( SimdVector::Arg v )
+    inline SimdVector truncate( SimdVector::Arg v )
     {
 #if (NV_USE_SSE == 1)
         // convert to ints
@@ -176,12 +187,12 @@ namespace nv {
 #endif
     }
 
-    SimdVector compareEqual( SimdVector::Arg left, SimdVector::Arg right )
+    inline SimdVector compareEqual( SimdVector::Arg left, SimdVector::Arg right )
     {
         return SimdVector( _mm_cmpeq_ps( left.vec, right.vec ) );
     }
 
-    SimdVector select( SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits )
+    inline SimdVector select( SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits )
     {
         __m128 a = _mm_andnot_ps( bits.vec, off.vec );
         __m128 b = _mm_and_ps( bits.vec, on.vec );
@@ -189,7 +200,7 @@ namespace nv {
         return SimdVector( _mm_or_ps( a, b ) );
     }
 
-    bool compareAnyLessThan( SimdVector::Arg left, SimdVector::Arg right ) 
+    inline bool compareAnyLessThan( SimdVector::Arg left, SimdVector::Arg right ) 
     {
         __m128 bits = _mm_cmplt_ps( left.vec, right.vec );
         int value = _mm_movemask_ps( bits );
diff --git a/src/nvmath/Vector.h b/src/nvmath/Vector.h
index 9a3f093..7ab40e8 100644
--- a/src/nvmath/Vector.h
+++ b/src/nvmath/Vector.h
@@ -10,9 +10,6 @@
 namespace nv
 {
 
-    enum zero_t { zero };
-    enum identity_t { identity };
-
     // I should probably use templates.
     typedef float scalar;
 
@@ -22,7 +19,6 @@ namespace nv
         typedef Vector2 const & Arg;
 
         Vector2();
-        explicit Vector2(zero_t);
         explicit Vector2(scalar f);
         Vector2(scalar x, scalar y);
         Vector2(Vector2::Arg v);
@@ -57,7 +53,7 @@ namespace nv
         typedef Vector3 const & Arg;
 
         Vector3();
-        explicit Vector3(zero_t);
+        explicit Vector3(scalar x);
         Vector3(scalar x, scalar y, scalar z);
         Vector3(Vector2::Arg v, scalar z);
         Vector3(Vector3::Arg v);
@@ -99,7 +95,7 @@ namespace nv
         typedef Vector4 const & Arg;
 
         Vector4();
-        explicit Vector4(zero_t);
+        explicit Vector4(scalar x);
         Vector4(scalar x, scalar y, scalar z, scalar w);
         Vector4(Vector2::Arg v, scalar z, scalar w);
         Vector4(Vector3::Arg v, scalar w);
@@ -136,7 +132,6 @@ namespace nv
     // Vector2
 
     inline Vector2::Vector2() {}
-    inline Vector2::Vector2(zero_t) : x(0.0f), y(0.0f) {}
     inline Vector2::Vector2(scalar f) : x(f), y(f) {}
     inline Vector2::Vector2(scalar x, scalar y) : x(x), y(y) {}
     inline Vector2::Vector2(Vector2::Arg v) : x(v.x), y(v.y) {}
@@ -201,7 +196,7 @@ namespace nv
     // Vector3
 
     inline Vector3::Vector3() {}
-    inline Vector3::Vector3(zero_t) : x(0.0f), y(0.0f), z(0.0f) {}
+    inline Vector3::Vector3(scalar f) : x(f), y(f), z(f) {}
     inline Vector3::Vector3(scalar x, scalar y, scalar z) : x(x), y(y), z(z) {}
     inline Vector3::Vector3(Vector2::Arg v, scalar z) : x(v.x), y(v.y), z(z) {}
     inline Vector3::Vector3(Vector3::Arg v) : x(v.x), y(v.y), z(v.z) {}
@@ -286,7 +281,7 @@ namespace nv
     // Vector4
 
     inline Vector4::Vector4() {}
-    inline Vector4::Vector4(zero_t) : x(0.0f), y(0.0f), z(0.0f), w(0.0f) {}
+    inline Vector4::Vector4(scalar f) : x(f), y(f), z(f), w(f) {}
     inline Vector4::Vector4(scalar x, scalar y, scalar z, scalar w) : x(x), y(y), z(z), w(w) {}
     inline Vector4::Vector4(Vector2::Arg v, scalar z, scalar w) : x(v.x), y(v.y), z(z), w(w) {}
     inline Vector4::Vector4(Vector3::Arg v, scalar w) : x(v.x), y(v.y), z(v.z), w(w) {}
@@ -640,6 +635,15 @@ namespace nv
         return isFinite(v.x) && isFinite(v.y) && isFinite(v.z);
     }
 
+    inline Vector3 floor(Vector3::Arg v)
+    {
+        return Vector3(floorf(v.x), floorf(v.y), floorf(v.z));
+    }
+
+    inline Vector3 ceil(Vector3::Arg v)
+    {
+        return Vector3(ceilf(v.x), ceilf(v.y), ceilf(v.z));
+    }
 
     // Vector4
 
diff --git a/src/nvtt/CompressorDX11.cpp b/src/nvtt/CompressorDX11.cpp
index d698842..2b443e7 100644
--- a/src/nvtt/CompressorDX11.cpp
+++ b/src/nvtt/CompressorDX11.cpp
@@ -1,62 +1,62 @@
 // Copyright (c) 2009-2011 Ignacio Castano <castano@gmail.com>
 // Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#include "CompressorDX11.h"
-
-#include "nvtt.h"
-#include "CompressionOptions.h"
-
-#include "bc6h/zoh.h"
-#include "bc6h/utils.h"
-
-//#include "bc7/avpcl.h"
-//#include "bc7/utils.h"
-
-using namespace nv;
-using namespace nvtt;
-
-
-void CompressorBC6::compressBlock(Tile & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
-{
-    NV_UNUSED(alphaMode); // ZOH does not support alpha.
-
-    if (compressionOptions.pixelType == PixelType_UnsignedFloat ||
-        compressionOptions.pixelType == PixelType_UnsignedNorm ||
-        compressionOptions.pixelType == PixelType_UnsignedInt)
-    {
-        Utils::FORMAT = UNSIGNED_F16; // @@ Do not use globals.
-    }
-    else
-    {
-        Utils::FORMAT = SIGNED_F16;
-    }
-
-    ZOH::compress(tile, (char *)output);
-}
-
-
-void CompressorBC7::compressBlock(Tile & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
-{
-    // @@ TODO
-}
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include "CompressorDX11.h"
+
+#include "nvtt.h"
+#include "CompressionOptions.h"
+
+#include "bc6h/zoh.h"
+#include "bc6h/utils.h"
+
+//#include "bc7/avpcl.h"
+//#include "bc7/utils.h"
+
+using namespace nv;
+using namespace nvtt;
+
+
+void CompressorBC6::compressBlock(Tile & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
+{
+    NV_UNUSED(alphaMode); // ZOH does not support alpha.
+
+    if (compressionOptions.pixelType == PixelType_UnsignedFloat ||
+        compressionOptions.pixelType == PixelType_UnsignedNorm ||
+        compressionOptions.pixelType == PixelType_UnsignedInt)
+    {
+        Utils::FORMAT = UNSIGNED_F16; // @@ Do not use globals.
+    }
+    else
+    {
+        Utils::FORMAT = SIGNED_F16;
+    }
+
+    ZOH::compress(tile, (char *)output);
+}
+
+
+void CompressorBC7::compressBlock(Tile & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
+{
+    // @@ TODO
+}
diff --git a/src/nvtt/CompressorDX11.h b/src/nvtt/CompressorDX11.h
index 0022811..f665e3f 100644
--- a/src/nvtt/CompressorDX11.h
+++ b/src/nvtt/CompressorDX11.h
@@ -28,15 +28,15 @@
 
 namespace nv
 {
-    struct CompressorBC6 : public TileCompressor
+    struct CompressorBC6 : public ColorSetCompressor
     {
-        virtual void compressBlock(Tile & tile, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 16; }
     };
 
-    struct CompressorBC7 : public TileCompressor
+    struct CompressorBC7 : public ColorSetCompressor
     {
-        virtual void compressBlock(Tile & tile, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 16; }
     };
 	
diff --git a/src/nvtt/CompressorDX9.cpp b/src/nvtt/CompressorDX9.cpp
index cd6ba84..d8b529b 100644
--- a/src/nvtt/CompressorDX9.cpp
+++ b/src/nvtt/CompressorDX9.cpp
@@ -27,6 +27,7 @@
 #include "OptimalCompressDXT.h"
 #include "CompressionOptions.h"
 #include "OutputOptions.h"
+#include "ClusterFit.h"
 
 // squish
 #include "squish/colourset.h"
@@ -109,30 +110,36 @@ void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alpha
 }
 
 
-inline static Vector3 vec(nvsquish::Vec3 v) { return Vector3(v.X(), v.Y(), v.Z()); }
-
-void NormalCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+void NormalCompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
-	nvsquish::WeightedClusterFit fit;
-	fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
+    set.setUniformWeights();
+    set.createMinimalSet(false);
+
+	ClusterFit fit;
+	fit.setMetric(compressionOptions.colorWeight);
 
     BlockDXT1 * block = new(output) BlockDXT1;
-    if (rgba.isSingleColor())
+    
+    if (set.isSingleColor(true))
 	{
-		OptimalCompress::compressDXT1(rgba.color(0), block);
+        Color32 c;
+        c.r = uint8(clamp(set.colors[0].x, 0.0f, 1.0f) * 255);
+        c.g = uint8(clamp(set.colors[0].y, 0.0f, 1.0f) * 255);
+        c.b = uint8(clamp(set.colors[0].z, 0.0f, 1.0f) * 255);
+        c.a = 255;
+		OptimalCompress::compressDXT1(c, block);
 	}
 	else
 	{
-		nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0);
-		fit.SetColourSet(&colours, nvsquish::kDxt1);
+		fit.setColourSet(&set);
 		
-        nvsquish::Vec3 start, end;
+        Vector3 start, end;
         
-        fit.Compress4(&start, &end);
-        QuickCompress::outputBlock4(rgba, vec(start), vec(end), block);
+        fit.compress4(&start, &end);
+        QuickCompress::outputBlock4(set, start, end, block);
 
-        if (fit.Compress3(&start, &end)) {
-            QuickCompress::outputBlock3(rgba, vec(start), vec(end), block);
+        if (fit.compress3(&start, &end)) {
+            QuickCompress::outputBlock3(set, start, end, block);
         }
 	}
 }
@@ -140,8 +147,6 @@ void NormalCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alph
 
 void NormalCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
-#pragma NV_MESSAGE("NormalCompressorDXT1a - Not implemented!")
-    /*
     uint alphaMask = 0;
 	for (uint i = 0; i < 16; i++)
 	{
@@ -168,14 +173,11 @@ void NormalCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alp
 
 		fit.Compress(output);
 	}
-    */
 }
 
 
 void NormalCompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
-#pragma NV_MESSAGE("NormalCompressorDXT1a - Not implemented!")
-    /*
 	BlockDXT3 * block = new(output) BlockDXT3;
 
 	// Compress explicit alpha.
@@ -198,14 +200,11 @@ void NormalCompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alph
 		fit.SetColourSet(&colours, 0);
 		fit.Compress(&block->color);
 	}
-    */
 }
 
 
 void NormalCompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
-#pragma NV_MESSAGE("NormalCompressorDXT1a - Not implemented!")
-    /*
 	BlockDXT5 * block = new(output) BlockDXT5;
 
 	// Compress alpha.
@@ -235,14 +234,11 @@ void NormalCompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alph
 		fit.SetColourSet(&colours, 0);
 		fit.Compress(&block->color);
 	}
-    */
 }
 
 
 void NormalCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
-#pragma NV_MESSAGE("NormalCompressorDXT1a - Not implemented!")
-    /*
 	BlockDXT5 * block = new(output) BlockDXT5;
 
 	// Compress Y.
@@ -284,7 +280,6 @@ void NormalCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alp
 	{
 		QuickCompress::compressDXT5A(rgba, &block->alpha);
 	}
-    */
 }
 
 
diff --git a/src/nvtt/CompressorDX9.h b/src/nvtt/CompressorDX9.h
index 4e8730d..93e9102 100644
--- a/src/nvtt/CompressorDX9.h
+++ b/src/nvtt/CompressorDX9.h
@@ -64,9 +64,9 @@ namespace nv
 
 
     // Normal CPU compressors.
-    struct NormalCompressorDXT1 : public FixedBlockCompressor
+    struct NormalCompressorDXT1 : public ColorSetCompressor
     {
-        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 8; }
     };
 
diff --git a/src/nvtt/CompressorDXT.cpp b/src/nvtt/CompressorDXT.cpp
index 0bd57f8..4398f5c 100644
--- a/src/nvtt/CompressorDXT.cpp
+++ b/src/nvtt/CompressorDXT.cpp
@@ -111,33 +111,33 @@ void FixedBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, c
 }
 
 
-#include "bc6h/tile.h"
+//#include "bc6h/tile.h"
 
-void TileCompressor::compress(AlphaMode alphaMode, uint w, uint h, const float * data, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+// Compresses the image one row of 4x4 blocks at a time; each finished row is passed to the output handler, if any.
+void ColorSetCompressor::compress(AlphaMode alphaMode, uint w, uint h, const float * data, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
 {
     const uint bs = blockSize();
     const uint bw = (w + 3) / 4;
     const uint bh = (h + 3) / 4;
 
-    bool singleThreaded = true;
-
-    if (singleThreaded)
     {
-        nvDebugCheck(bs <= 16);
-        uint8 mem[16]; // @@ Output one row at a time!
-
-        for (uint y = 0; y < h; y += 4) {
-            for (uint x = 0; x < w; x += 4) {
+        // Scratch buffer sized for exactly one row of blocks (bw blocks of bs bytes), reused for every row.
+        uint8 * mem = malloc<uint8>(bs * bw);
 
-                Tile tile;
-                //tile.init((const float *)data, w, h, x, y);
+        ColorSet set;
 
-                compressBlock(tile, alphaMode, compressionOptions, mem);
+        for (uint y = 0; y < h; y += 4) {
+            // Rewind to the start of the row buffer; otherwise every row after the first writes past the end of mem.
+            uint8 * ptr = mem;
+            for (uint x = 0; x < w; x += 4, ptr += bs) {
+                set.setColors(data, w, h, x, y);
+                compressBlock(set, alphaMode, compressionOptions, ptr);
+            }
 
-                if (outputOptions.outputHandler != NULL) {
-                    outputOptions.outputHandler->writeData(mem, bs);
-                }
+            if (outputOptions.outputHandler != NULL) {
+                outputOptions.outputHandler->writeData(mem, bs * bw);
             }
         }
+        free(mem);
     }
 }
diff --git a/src/nvtt/CompressorDXT.h b/src/nvtt/CompressorDXT.h
index bdc9f95..8a5f1bb 100644
--- a/src/nvtt/CompressorDXT.h
+++ b/src/nvtt/CompressorDXT.h
@@ -27,26 +27,26 @@
 
 #include "Compressor.h"
 
-class Tile;
 
 namespace nv
 {
+    struct ColorSet;
     struct ColorBlock;
 
     struct FixedBlockCompressor : public CompressorInterface
     {
         virtual void compress(nvtt::AlphaMode alphaMode, uint w, uint h, const float * rgba, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 
-	virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
-	virtual uint blockSize() const = 0;
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+        virtual uint blockSize() const = 0;
     };
 
-    struct TileCompressor : public CompressorInterface
+    struct ColorSetCompressor : public CompressorInterface
     {
         virtual void compress(nvtt::AlphaMode alphaMode, uint w, uint h, const float * rgba, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 
-	virtual void compressBlock(Tile & tile, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
-	virtual uint blockSize() const = 0;
+        virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+        virtual uint blockSize() const = 0;
     };
 
 } // nv namespace
diff --git a/src/nvtt/QuickCompressDXT.cpp b/src/nvtt/QuickCompressDXT.cpp
index 369b0d5..e5a2190 100644
--- a/src/nvtt/QuickCompressDXT.cpp
+++ b/src/nvtt/QuickCompressDXT.cpp
@@ -83,7 +83,7 @@ inline static void selectDiagonal(const Vector3 * block, uint num, Vector3 * res
 {
 	Vector3 center = (*maxColor + *minColor) * 0.5;
 
-	Vector2 covariance = Vector2(zero);
+	Vector2 covariance = Vector2(0.0f);
 	for (uint i = 0; i < num; i++)
 	{
 		Vector3 t = block[i] - center;
@@ -166,6 +166,40 @@ inline static uint computeIndices4(const Vector3 block[16], Vector3::Arg maxColo
 	return indices;
 }
 
+// Classifies the 16 texels of 'set' against the 4-entry palette built from maxColor and minColor.
+// Returns the 2-bit palette indices packed into one uint, texel i occupying bits 2*i and 2*i+1.
+inline static uint computeIndices4(const ColorSet & set, Vector3::Arg maxColor, Vector3::Arg minColor)
+{
+	// Entries 0 and 1 are the two endpoints; entries 2 and 3 are lerps between them with factors 1/3 and 2/3.
+	Vector3 palette[4];
+	palette[0] = maxColor;
+	palette[1] = minColor;
+	palette[2] = lerp(palette[0], palette[1], 1.0f / 3.0f);
+	palette[3] = lerp(palette[0], palette[1], 2.0f / 3.0f);
+
+	uint packed = 0;
+	for (int texel = 0; texel < 16; texel++)
+	{
+		const Vector3 rgb = set.color(texel).xyz();
+		const float dist0 = colorDistance(palette[0], rgb);
+		const float dist1 = colorDistance(palette[1], rgb);
+		const float dist2 = colorDistance(palette[2], rgb);
+		const float dist3 = colorDistance(palette[3], rgb);
+
+		// Pairwise distance comparisons, combined below into the two index bits without branching.
+		const uint far0vs3 = dist0 > dist3;
+		const uint far1vs2 = dist1 > dist2;
+		const uint far0vs2 = dist0 > dist2;
+		const uint far1vs3 = dist1 > dist3;
+		const uint far2vs3 = dist2 > dist3;
+
+		const uint lowBit = far0vs3 & far2vs3;
+		const uint highBit = (far1vs2 & far0vs2) | (far0vs3 & far1vs3);
+		packed |= (lowBit | (highBit << 1)) << (2 * texel);
+	}
+	return packed;
+}
+
 inline static float evaluatePaletteError4(const Vector3 block[16], Vector3::Arg maxColor, Vector3::Arg minColor)
 {
 	Vector3 palette[4];
@@ -188,7 +222,7 @@ inline static float evaluatePaletteError4(const Vector3 block[16], Vector3::Arg
 	return total;
 }
 
-inline static uint computeIndices3(const ColorBlock & rgba, Vector3::Arg maxColor, Vector3::Arg minColor)
+inline static uint computeIndices3(const ColorSet & set, Vector3::Arg maxColor, Vector3::Arg minColor)
 {
 	Vector3 palette[4];
 	palette[0] = minColor;
@@ -198,15 +232,15 @@ inline static uint computeIndices3(const ColorBlock & rgba, Vector3::Arg maxColo
 	uint indices = 0;
 	for(int i = 0; i < 16; i++)
 	{
-		Color32 c = rgba.color(i);
-		Vector3 color = Vector3(c.r, c.g, c.b);
+        Vector3 color = set.color(i).xyz();
+		float alpha = set.color(i).w;
 		
 		float d0 = colorDistance(palette[0], color);
 		float d1 = colorDistance(palette[1], color);
 		float d2 = colorDistance(palette[2], color);
 		
 		uint index;
-		if (c.a < 128) index = 3;
+		if (alpha == 0) index = 3;
 		else if (d0 < d1 && d0 < d2) index = 0;
 		else if (d1 < d2) index = 1;
 		else index = 2;
@@ -250,8 +284,8 @@ static void optimizeEndPoints4(Vector3 block[16], BlockDXT1 * dxtBlock)
 	float alpha2_sum = 0.0f;
 	float beta2_sum = 0.0f;
 	float alphabeta_sum = 0.0f;
-	Vector3 alphax_sum(zero);
-	Vector3 betax_sum(zero);
+	Vector3 alphax_sum(0.0f);
+	Vector3 betax_sum(0.0f);
 	
 	for( int i = 0; i < 16; ++i )
 	{
@@ -298,8 +332,8 @@ static void optimizeEndPoints3(Vector3 block[16], BlockDXT1 * dxtBlock)
 	float alpha2_sum = 0.0f;
 	float beta2_sum = 0.0f;
 	float alphabeta_sum = 0.0f;
-	Vector3 alphax_sum(zero);
-	Vector3 betax_sum(zero);
+	Vector3 alphax_sum(0.0f);
+	Vector3 betax_sum(0.0f);
 	
 	for( int i = 0; i < 16; ++i )
 	{
@@ -664,11 +698,8 @@ void QuickCompress::compressDXT5(const ColorBlock & rgba, BlockDXT5 * dxtBlock,
 
 
 
-void QuickCompress::outputBlock4(const ColorBlock & rgba, const Vector3 & start, const Vector3 & end, BlockDXT1 * dxtBlock)
+void QuickCompress::outputBlock4(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block)
 {
-	Vector3 block[16];
-	extractColorBlockRGB(rgba, block);
-
     Vector3 maxColor = start * 255;
     Vector3 minColor = end * 255;
 	uint16 color0 = roundAndExpand(&maxColor);
@@ -680,18 +711,15 @@ void QuickCompress::outputBlock4(const ColorBlock & rgba, const Vector3 & start,
 		swap(color0, color1);
 	}
 
-	dxtBlock->col0 = Color16(color0);
-	dxtBlock->col1 = Color16(color1);
-	dxtBlock->indices = computeIndices4(block, maxColor, minColor);
+	block->col0 = Color16(color0);
+	block->col1 = Color16(color1);
+	block->indices = computeIndices4(set, maxColor, minColor);
 
-	optimizeEndPoints4(block, dxtBlock);
+	//optimizeEndPoints4(set, block);
 }
 
-void QuickCompress::outputBlock3(const ColorBlock & rgba, const Vector3 & start, const Vector3 & end, BlockDXT1 * dxtBlock)
+void QuickCompress::outputBlock3(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block)
 {
-	Vector3 block[16];
-	extractColorBlockRGB(rgba, block);
-
     Vector3 maxColor = start * 255;
     Vector3 minColor = end * 255;
 	uint16 color0 = roundAndExpand(&maxColor);
@@ -703,9 +731,9 @@ void QuickCompress::outputBlock3(const ColorBlock & rgba, const Vector3 & start,
 		swap(color0, color1);
 	}
 
-	dxtBlock->col0 = Color16(color0);
-	dxtBlock->col1 = Color16(color1);
-    dxtBlock->indices = computeIndices3(block, maxColor, minColor);
+	block->col0 = Color16(color0);
+	block->col1 = Color16(color1);
+    block->indices = computeIndices3(set, maxColor, minColor);
 
-	optimizeEndPoints3(block, dxtBlock);
+	//optimizeEndPoints3(set, block);
 }
\ No newline at end of file
diff --git a/src/nvtt/QuickCompressDXT.h b/src/nvtt/QuickCompressDXT.h
index f7140c0..56adf3a 100644
--- a/src/nvtt/QuickCompressDXT.h
+++ b/src/nvtt/QuickCompressDXT.h
@@ -30,6 +30,7 @@
 namespace nv
 {
 	struct ColorBlock;
+    struct ColorSet;
 	struct BlockDXT1;
 	struct BlockDXT3;
 	struct BlockDXT5;
@@ -47,8 +48,8 @@ namespace nv
 		void compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dxtBlock, int iterationCount=8);
 		void compressDXT5(const ColorBlock & rgba, BlockDXT5 * dxtBlock, int iterationCount=8);
 
-        void outputBlock4(const ColorBlock & rgba, const Vector3 & start, const Vector3 & end, BlockDXT1 * block);
-        void outputBlock3(const ColorBlock & rgba, const Vector3 & start, const Vector3 & end, BlockDXT1 * block);
+        void outputBlock4(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block);
+        void outputBlock3(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block);
 	}
 } // nv namespace
 
diff --git a/src/nvtt/TexImage.cpp b/src/nvtt/TexImage.cpp
index d608c98..fc19d30 100644
--- a/src/nvtt/TexImage.cpp
+++ b/src/nvtt/TexImage.cpp
@@ -35,6 +35,7 @@
 #include "nvimage/BlockDXT.h"
 #include "nvimage/ColorBlock.h"
 #include "nvimage/PixelFormat.h"
+#include "nvimage/ErrorMetric.h"
 
 #include <float.h>
 
@@ -332,7 +333,8 @@ void TexImage::range(int channel, float * rangeMin, float * rangeMax)
     for (uint p = 0; p < count; p++) {
         float f = c[p];
         if (f < range.x) range.x = f;
-        if (f > range.y) range.y = f;
+        if (f > range.y) 
+            range.y = f;
     }
 
     *rangeMin = range.x;
@@ -340,7 +342,7 @@ void TexImage::range(int channel, float * rangeMin, float * rangeMax)
 }
 
 
-bool TexImage::load(const char * fileName)
+bool TexImage::load(const char * fileName, bool * hasAlpha/*= NULL*/)
 {
     AutoPtr<FloatImage> img(ImageIO::loadFloat(fileName));
     if (img == NULL) {
@@ -349,6 +351,10 @@ bool TexImage::load(const char * fileName)
 
     detach();
 
+    if (hasAlpha != NULL) {
+        *hasAlpha = (img->componentNum() == 4);
+    }
+
     // @@ Have loadFloat allocate the image with the desired number of channels.
     img->resizeChannelCount(4);
 
@@ -388,52 +394,52 @@ bool TexImage::setImage2D(nvtt::InputFormat format, int w, int h, const void * d
     {
         const Color32 * src = (const Color32 *)data;
 
-	try {
-	    for (int i = 0; i < count; i++)
-	    {
-		rdst[i] = float(src[i].r) / 255.0f;
-		gdst[i] = float(src[i].g) / 255.0f;
-		bdst[i] = float(src[i].b) / 255.0f;
-		adst[i] = float(src[i].a) / 255.0f;
-	    }
-	}
-	catch(...) {
-	    return false;
-	}
+        try {
+            for (int i = 0; i < count; i++)
+            {
+                rdst[i] = float(src[i].r) / 255.0f;
+                gdst[i] = float(src[i].g) / 255.0f;
+                bdst[i] = float(src[i].b) / 255.0f;
+                adst[i] = float(src[i].a) / 255.0f;
+            }
+        }
+        catch(...) {
+            return false;
+        }
     }
     else if (format == InputFormat_RGBA_16F)
     {
         const uint16 * src = (const uint16 *)data;
 
-	try {
-	    for (int i = 0; i < count; i++)
-	    {
-		((uint32 *)rdst)[i] = half_to_float(src[4*i+0]);
-		((uint32 *)gdst)[i] = half_to_float(src[4*i+1]);
-		((uint32 *)bdst)[i] = half_to_float(src[4*i+2]);
-		((uint32 *)adst)[i] = half_to_float(src[4*i+3]);
-	    }
-	}
-	catch(...) {
-	    return false;
-	}
+        try {
+            for (int i = 0; i < count; i++)
+            {
+                ((uint32 *)rdst)[i] = half_to_float(src[4*i+0]);
+                ((uint32 *)gdst)[i] = half_to_float(src[4*i+1]);
+                ((uint32 *)bdst)[i] = half_to_float(src[4*i+2]);
+                ((uint32 *)adst)[i] = half_to_float(src[4*i+3]);
+            }
+        }
+        catch(...) {
+            return false;
+        }
     }
     else if (format == InputFormat_RGBA_32F)
     {
         const float * src = (const float *)data;
 
-	try {
-	    for (int i = 0; i < count; i++)
-	    {
-		rdst[i] = src[4 * i + 0];
-		gdst[i] = src[4 * i + 1];
-		bdst[i] = src[4 * i + 2];
-		adst[i] = src[4 * i + 3];
-	    }
-	}
-	catch(...) {
-	    return false;
-	}
+        try {
+            for (int i = 0; i < count; i++)
+            {
+                rdst[i] = src[4 * i + 0];
+                gdst[i] = src[4 * i + 1];
+                bdst[i] = src[4 * i + 2];
+                adst[i] = src[4 * i + 3];
+            }
+        }
+        catch(...) {
+            return false;
+        }
     }
 
     return true;
@@ -462,15 +468,15 @@ bool TexImage::setImage2D(InputFormat format, int w, int h, const void * r, cons
         const uint8 * bsrc = (const uint8 *)b;
         const uint8 * asrc = (const uint8 *)a;
 
-	try {
-	    for (int i = 0; i < count; i++) rdst[i] = float(rsrc[i]) / 255.0f;
-	    for (int i = 0; i < count; i++) gdst[i] = float(gsrc[i]) / 255.0f;
-	    for (int i = 0; i < count; i++) bdst[i] = float(bsrc[i]) / 255.0f;
-	    for (int i = 0; i < count; i++) adst[i] = float(asrc[i]) / 255.0f;
-	}
-	catch(...) {
-	    return false;
-	}
+        try {
+            for (int i = 0; i < count; i++) rdst[i] = float(rsrc[i]) / 255.0f;
+            for (int i = 0; i < count; i++) gdst[i] = float(gsrc[i]) / 255.0f;
+            for (int i = 0; i < count; i++) bdst[i] = float(bsrc[i]) / 255.0f;
+            for (int i = 0; i < count; i++) adst[i] = float(asrc[i]) / 255.0f;
+        }
+        catch(...) {
+            return false;
+        }
     }
     else if (format == InputFormat_RGBA_16F)
     {
@@ -479,15 +485,15 @@ bool TexImage::setImage2D(InputFormat format, int w, int h, const void * r, cons
         const uint16 * bsrc = (const uint16 *)b;
         const uint16 * asrc = (const uint16 *)a;
 
-	try {
-	    for (int i = 0; i < count; i++) ((uint32 *)rdst)[i] = half_to_float(rsrc[i]);
-	    for (int i = 0; i < count; i++) ((uint32 *)gdst)[i] = half_to_float(gsrc[i]);
-	    for (int i = 0; i < count; i++) ((uint32 *)bdst)[i] = half_to_float(bsrc[i]);
-	    for (int i = 0; i < count; i++) ((uint32 *)adst)[i] = half_to_float(asrc[i]);
-	}
-	catch(...) {
-	    return false;
-	}
+        try {
+            for (int i = 0; i < count; i++) ((uint32 *)rdst)[i] = half_to_float(rsrc[i]);
+            for (int i = 0; i < count; i++) ((uint32 *)gdst)[i] = half_to_float(gsrc[i]);
+            for (int i = 0; i < count; i++) ((uint32 *)bdst)[i] = half_to_float(bsrc[i]);
+            for (int i = 0; i < count; i++) ((uint32 *)adst)[i] = half_to_float(asrc[i]);
+        }
+        catch(...) {
+            return false;
+        }
     }
     else if (format == InputFormat_RGBA_32F)
     {
@@ -496,15 +502,15 @@ bool TexImage::setImage2D(InputFormat format, int w, int h, const void * r, cons
         const float * bsrc = (const float *)b;
         const float * asrc = (const float *)a;
 
-	try {
-	    memcpy(rdst, rsrc, count * sizeof(float));
-	    memcpy(gdst, gsrc, count * sizeof(float));
-	    memcpy(bdst, bsrc, count * sizeof(float));
-	    memcpy(adst, asrc, count * sizeof(float));
-	}
-	catch(...) {
-	    return false;
-	}
+        try {
+            memcpy(rdst, rsrc, count * sizeof(float));
+            memcpy(gdst, gsrc, count * sizeof(float));
+            memcpy(bdst, bsrc, count * sizeof(float));
+            memcpy(adst, asrc, count * sizeof(float));
+        }
+        catch(...) {
+            return false;
+        }
     }
 
     return true;
@@ -987,10 +993,10 @@ void TexImage::setBorder(float r, float g, float b, float a)
         img->pixel(0, i, 2) = b;
         img->pixel(0, i, 3) = a;
 
-	img->pixel(w-1, i, 0) = r;
-	img->pixel(w-1, i, 1) = g;
-	img->pixel(w-1, i, 2) = b;
-	img->pixel(w-1, i, 3) = a;
+        img->pixel(w-1, i, 0) = r;
+        img->pixel(w-1, i, 1) = g;
+        img->pixel(w-1, i, 2) = b;
+        img->pixel(w-1, i, 3) = a;
     }
 }
 
@@ -1164,11 +1170,14 @@ void TexImage::blockScaleCoCg(int bits/*= 5*/, float threshold/*= 0.0*/)
         for (uint bi = 0; bi < bw; bi++) {
 
             // Compute per block scale.
-            float m = 1.0f / 256.0f;
+            float m = 1.0f / 255.0f;
             for (uint j = 0; j < 4; j++) {
+                const uint y = bj*4 + j;
+                if (y >= h) continue;
+
                 for (uint i = 0; i < 4; i++) {
-                    uint x = min(bi*4 + i, w);
-                    uint y = min(bj*4 + j, h);
+                    const uint x = bi*4 + i;
+                    if (x >= w) continue;
 
                     float Co = img->pixel(x, y, 0);
                     float Cg = img->pixel(x, y, 1);
@@ -1219,7 +1228,7 @@ void TexImage::fromYCoCg()
     for (uint i = 0; i < count; i++) {
         float Co = r[i];
         float Cg = g[i];
-        float scale = b[i];
+        float scale = b[i] * 0.5f;
         float Y = a[i];
 
         Co *= scale;
@@ -1271,6 +1280,141 @@ void TexImage::fromLUVW(float range/*= 1.0f*/)
     fromRGBM(range * sqrtf(3));
 }
 
+// Replaces every value of the given channel with its absolute value; does nothing when no image is set.
+void TexImage::abs(int channel)
+{
+    if (m->image == NULL) return;
+
+    detach();
+
+    float * values = m->image->channel(channel);
+    const uint pixelCount = m->image->width() * m->image->height();
+
+    for (float * v = values; v != values + pixelCount; ++v) {
+        *v = fabsf(*v);
+    }
+}
+
+// For every 4x4 block, shrinks each pixel's RGB offset from the block centroid along the normalized (1,1,1) axis.
+// 'scale' is the fraction of that component that is removed: 0 removes nothing, 1 removes it entirely.
+void TexImage::blockLuminanceScale(float scale)
+{
+    if (m->image == NULL) return;
+
+    detach();
+
+    FloatImage * img = m->image;
+
+    // NOTE(review): bw/bh round down, so for sizes above 4 that are not multiples of 4 the trailing
+    // columns/rows are never visited by the loops below - confirm this is intended.
+    const uint w = img->width();
+    const uint h = img->height();
+    const uint bw = max(1U, w/4);
+    const uint bh = max(1U, h/4);
+
+    Vector3 L = normalize(Vector3(1, 1, 1));
+
+    for (uint bj = 0; bj < bh; bj++) {
+        for (uint bi = 0; bi < bw; bi++) {
+
+            // Compute block centroid.
+            Vector3 centroid(0.0f);
+            int count = 0;
+            for (uint j = 0; j < 4; j++) {
+                const uint y = bj*4 + j;
+                if (y >= h) continue;
+
+                for (uint i = 0; i < 4; i++) {
+                    const uint x = bi*4 + i;
+                    if (x >= w) continue;
+
+                    float r = img->pixel(x, y, 0);
+                    float g = img->pixel(x, y, 1);
+                    float b = img->pixel(x, y, 2);
+                    Vector3 rgb(r, g, b);
+
+                    centroid += rgb;
+                    count++;
+                }
+            }
+
+            centroid /= float(count);
+
+            // Project to luminance plane.
+            for (uint j = 0; j < 4; j++) {
+                const uint y = bj*4 + j;
+                if (y >= h) continue;
+
+                for (uint i = 0; i < 4; i++) {
+                    const uint x = bi*4 + i;
+                    if (x >= w) continue;
+
+                    float & r = img->pixel(x, y, 0);
+                    float & g = img->pixel(x, y, 1);
+                    float & b = img->pixel(x, y, 2);
+                    Vector3 rgb(r, g, b);
+
+                    Vector3 delta = rgb - centroid;
+
+                    // Subtract 'scale' times the component of delta that lies along L.
+                    delta -= scale * dot(delta, L) * L;
+
+                    r = centroid.x + delta.x;
+                    g = centroid.y + delta.y;
+                    b = centroid.z + delta.z;
+                }
+            }
+        }
+    }
+}
+
+// Clamps r, g and b to [0, 1], keeps green as-is and stores red and blue as their difference from green.
+void TexImage::toJPEGLS()
+{
+    if (m->image == NULL) return;
+
+    detach();
+
+    FloatImage * image = m->image;
+    const uint pixelCount = image->width() * image->height();
+
+    float * red = image->channel(0);
+    float * green = image->channel(1);
+    float * blue = image->channel(2);
+
+    for (uint p = 0; p < pixelCount; p++) {
+        // Green is clamped first because both difference channels are taken relative to it.
+        const float clampedGreen = nv::clamp(green[p], 0.0f, 1.0f);
+        red[p] = nv::clamp(red[p], 0.0f, 1.0f) - clampedGreen;
+        green[p] = clampedGreen;
+        blue[p] = nv::clamp(blue[p], 0.0f, 1.0f) - clampedGreen;
+    }
+}
+
+// Clamps r and b to [-1, 1] and g to [0, 1], then adds the clamped green back onto red and blue.
+void TexImage::fromJPEGLS()
+{
+    if (m->image == NULL) return;
+
+    detach();
+
+    FloatImage * image = m->image;
+    const uint pixelCount = image->width() * image->height();
+
+    float * red = image->channel(0);
+    float * green = image->channel(1);
+    float * blue = image->channel(2);
+
+    for (uint p = 0; p < pixelCount; p++) {
+        // Green is clamped before it is added back to the two difference channels.
+        const float clampedGreen = nv::clamp(green[p], 0.0f, 1.0f);
+        red[p] = nv::clamp(red[p], -1.0f, 1.0f) + clampedGreen;
+        green[p] = clampedGreen;
+        blue[p] = nv::clamp(blue[p], -1.0f, 1.0f) + clampedGreen;
+    }
+}
+
+
 
 void TexImage::binarize(int channel, float threshold, bool dither)
 {
@@ -1350,113 +1494,52 @@ bool TexImage::copyChannel(const TexImage & srcImage, int srcChannel, int dstCha
     return true;
 }
 
-
-
-
-float nvtt::rmsError(const TexImage & reference, const TexImage & image)
+// Adds srcImage's srcChannel times 'scale' into dstChannel; false on channel index outside 0..3, missing image or size mismatch.
+bool TexImage::addChannel(const TexImage & srcImage, int srcChannel, int dstChannel, float scale)
 {
-    double mse = 0;
+    if (srcChannel < 0 || srcChannel > 3 || dstChannel < 0 || dstChannel > 3) return false;
 
-    const FloatImage * ref = reference.m->image;
-    const FloatImage * img = image.m->image;
+    FloatImage * dst = m->image;
+    const FloatImage * src = srcImage.m->image;
 
-    if (img == NULL || ref == NULL || img->width() != ref->width() || img->height() != ref->height()) {
-        return FLT_MAX;
+    if (dst == NULL || src == NULL || dst->width() != src->width() || dst->height() != src->height()) {
+        return false;
     }
-    nvDebugCheck(img->componentNum() == 4);
-    nvDebugCheck(ref->componentNum() == 4);
+    nvDebugCheck(dst->componentNum() == 4 && src->componentNum() == 4);
 
-    const uint count = img->width() * img->height();
-    for (uint i = 0; i < count; i++)
-    {
-        float r0 = img->pixel(i + count * 0);
-        float g0 = img->pixel(i + count * 1);
-        float b0 = img->pixel(i + count * 2);
-        //float a0 = img->pixel(i + count * 3);
-        float r1 = ref->pixel(i + count * 0);
-        float g1 = ref->pixel(i + count * 1);
-        float b1 = ref->pixel(i + count * 2);
-        float a1 = ref->pixel(i + count * 3);
-
-        float r = r0 - r1;
-        float g = g0 - g1;
-        float b = b0 - b1;
-        //float a = a0 - a1;
+    detach();
 
-        if (reference.alphaMode() == nvtt::AlphaMode_Transparency)
-        {
-            mse += double(r * r) * a1;
-            mse += double(g * g) * a1;
-            mse += double(b * b) * a1;
-        }
-        else
-        {
-            mse += r * r;
-            mse += g * g;
-            mse += b * b;
-        }
+    const uint count = src->width() * src->height();
+    // Read the destination from m->image again: 'dst' was taken before detach(), which may have swapped in a private copy.
+    float * d = m->image->channel(dstChannel);
+    const float * s = src->channel(srcChannel);
+    for (uint i = 0; i < count; i++) {
+        d[i] += s[i] * scale;
     }
 
-    return float(sqrt(mse / count));
+    return true;
 }
 
 
-/*float rmsError(const Image * a, const Image * b)
-{
-    nvCheck(a != NULL);
-    nvCheck(b != NULL);
-    nvCheck(a->width() == b->width());
-    nvCheck(a->height() == b->height());
-
-    double mse = 0;
-
-    const uint count = a->width() * a->height();
-
-    for (uint i = 0; i < count; i++)
-    {
-        Color32 c0 = a->pixel(i);
-        Color32 c1 = b->pixel(i);
 
-        int r = c0.r - c1.r;
-        int g = c0.g - c1.g;
-        int b = c0.b - c1.b;
-        int a = c0.a - c1.a;
-
-        mse += double(r * r * c0.a) / 255;
-        mse += double(g * g * c0.a) / 255;
-        mse += double(b * b * c0.a) / 255;
-    }
-
-    return float(sqrt(mse / count));
-}*/
+float nvtt::rmsError(const TexImage & reference, const TexImage & image)
+{
+    return nv::rmsColorError(reference.m->image, image.m->image, reference.alphaMode() == nvtt::AlphaMode_Transparency);
+}
 
 
 float nvtt::rmsAlphaError(const TexImage & reference, const TexImage & image)
 {
-    double mse = 0;
-
-    const FloatImage * img = image.m->image;
-    const FloatImage * ref = reference.m->image;
-
-    if (img == NULL || ref == NULL || img->width() != ref->width() || img->height() != ref->height()) {
-        return FLT_MAX;
-    }
-    nvDebugCheck(img->componentNum() == 4 && ref->componentNum() == 4);
-
-    const uint count = img->width() * img->height();
-    for (uint i = 0; i < count; i++)
-    {
-        float a0 = img->pixel(i + count * 3);
-        float a1 = ref->pixel(i + count * 3);
-
-        float a = a0 - a1;
+    return nv::rmsAlphaError(reference.m->image, image.m->image);
+}
 
-        mse += double(a * a);
-    }
 
-    return float(sqrt(mse / count));
+float nvtt::cieLabError(const TexImage & reference, const TexImage & image)
+{
+    return nv::cieLabError(reference.m->image, image.m->image);
 }
 
+
 TexImage nvtt::diff(const TexImage & reference, const TexImage & image, float scale)
 {
     const FloatImage * ref = reference.m->image;
diff --git a/src/nvtt/TexImage.h b/src/nvtt/TexImage.h
index 5f95e56..3230157 100644
--- a/src/nvtt/TexImage.h
+++ b/src/nvtt/TexImage.h
@@ -49,7 +49,7 @@ namespace nvtt
             wrapMode = WrapMode_Mirror;
             alphaMode = AlphaMode_None;
             isNormalMap = false;
-
+            
             image = NULL;
         }
         Private(const Private & p) : RefCounted() // Copy ctor. inits refcount to 0.
diff --git a/src/nvtt/bc6h/zohtwo.cpp b/src/nvtt/bc6h/zohtwo.cpp
index 52fd03e..06e8436 100644
--- a/src/nvtt/bc6h/zohtwo.cpp
+++ b/src/nvtt/bc6h/zohtwo.cpp
@@ -445,7 +445,7 @@ void ZOH::decompresstwo(const char *block, Tile &t)
         // reserved mode, return all zeroes
         for (int y = 0; y < Tile::TILE_H; y++)
             for (int x = 0; x < Tile::TILE_W; x++)
-                t.data[y][x] = Vector3 (zero);
+                t.data[y][x] = Vector3(0.0f);
 
         return;
     }
diff --git a/src/nvtt/nvtt.h b/src/nvtt/nvtt.h
index b82becb..c17c359 100644
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@@ -399,7 +399,7 @@ namespace nvtt
         NVTT_API void range(int channel, float * rangeMin, float * rangeMax);
 
         // Texture data.
-        NVTT_API bool load(const char * fileName);
+        NVTT_API bool load(const char * fileName, bool * hasAlpha = 0);
         NVTT_API bool save(const char * fileName) const;
         NVTT_API bool setImage2D(InputFormat format, int w, int h, const void * data);
         NVTT_API bool setImage2D(InputFormat format, int w, int h, const void * r, const void * g, const void * b, const void * a);
@@ -436,6 +436,11 @@ namespace nvtt
         NVTT_API void fromYCoCg();
         NVTT_API void toLUVW(float range = 1.0f);
         NVTT_API void fromLUVW(float range = 1.0f);
+        NVTT_API void abs(int channel);
+        NVTT_API void toJPEGLS();
+        NVTT_API void fromJPEGLS();
+
+        NVTT_API void blockLuminanceScale(float scale);
 
         // Color quantization.
         NVTT_API void binarize(int channel, float threshold, bool dither);
@@ -452,9 +457,12 @@ namespace nvtt
         NVTT_API bool copyChannel(const TexImage & srcImage, int srcChannel);
         NVTT_API bool copyChannel(const TexImage & srcImage, int srcChannel, int dstChannel);
 
+        NVTT_API bool addChannel(const TexImage & img, int srcChannel, int dstChannel, float scale);
+
         // Error compare.
         NVTT_API friend float rmsError(const TexImage & reference, const TexImage & img);
         NVTT_API friend float rmsAlphaError(const TexImage & reference, const TexImage & img);
+        NVTT_API friend float cieLabError(const TexImage & reference, const TexImage & img);
         NVTT_API friend TexImage diff(const TexImage & reference, const TexImage & img, float scale);
 
     private:
@@ -473,8 +481,10 @@ namespace nvtt
 
     NVTT_API float rmsError(const TexImage & reference, const TexImage & img);
     NVTT_API float rmsAlphaError(const TexImage & reference, const TexImage & img);
+    NVTT_API float cieLabError(const TexImage & reference, const TexImage & img);
     NVTT_API TexImage diff(const TexImage & reference, const TexImage & img, float scale);
 
+
 } // nvtt namespace
 
 #endif // NVTT_H
diff --git a/src/nvtt/tests/testsuite.cpp b/src/nvtt/tests/testsuite.cpp
index 7b73ee8..cd22f78 100644
--- a/src/nvtt/tests/testsuite.cpp
+++ b/src/nvtt/tests/testsuite.cpp
@@ -147,6 +147,8 @@ static const char * s_witnessImageSet[] = {
 
 static const char * s_witnessLmapImageSet[] = {
     "specruin.dds",
+    "cottage.dds",
+    "tower.dds",
 };
 
 
@@ -158,56 +160,64 @@ enum Mode {
     Mode_BC3_YCoCg,
     Mode_BC3_RGBM,
     Mode_BC3_LUVW,
+    Mode_BC3_RGBS,
     Mode_BC1_Normal,
     Mode_BC3_Normal,
     Mode_BC5_Normal,
-    Mode_BC3_Lightmap_1,
-    Mode_BC3_Lightmap_2,
+    Mode_Count
 };
 static const char * s_modeNames[] = {
-    "BC1",
-    "BC1-Alpha",
-    "BC2-Alpha",
-    "BC3-Alpha",
-    "BC3-YCoCg",
-    "BC3-RGBM",
-    "BC3-LUVW",
-    "BC1-Normal",
-    "BC3-Normal",
-    "BC5-Normal",
-    "BC3-RGBM",
-    "BC3-LUVW",
+    "BC1",          // Mode_BC1,
+    "BC1-Alpha",    // Mode_BC1_Alpha,
+    "BC2-Alpha",    // Mode_BC2_Alpha,
+    "BC3-Alpha",    // Mode_BC3_Alpha,
+    "BC3-YCoCg",    // Mode_BC3_YCoCg,
+    "BC3-RGBM",     // Mode_BC3_RGBM,
+    "BC3-LUVW",     // Mode_BC3_LUVW,
+    "BC3-RGBS",     // Mode_BC3_RGBS,
+    "BC1-Normal",   // Mode_BC1_Normal,
+    "BC3-Normal",   // Mode_BC3_Normal,
+    "BC5-Normal",   // Mode_BC5_Normal,
 };
+nvStaticCheck(ARRAY_SIZE(s_modeNames) == Mode_Count);
 
 struct Test {
     const char * name;
     int count;
-    Mode modes[4];
+    Mode modes[6];
 };
 static Test s_imageTests[] = {
-    {"DXT Color", 1, {Mode_BC1, Mode_BC3_YCoCg, Mode_BC3_RGBM, Mode_BC3_LUVW}},
-    {"DXT Alpha", 3, {Mode_BC1_Alpha, Mode_BC2_Alpha, Mode_BC3_Alpha}},
-    {"DXT Normal", 3, {Mode_BC1_Normal, Mode_BC3_Normal, Mode_BC5_Normal}},
-    {"DXT Lightmap", 2, {Mode_BC3_Lightmap_1, Mode_BC3_Lightmap_2}},
+    {"Color", 3, {Mode_BC1, Mode_BC3_YCoCg, Mode_BC3_RGBM, Mode_BC3_LUVW}},
+    {"Alpha", 3, {Mode_BC1_Alpha, Mode_BC2_Alpha, Mode_BC3_Alpha}},
+    {"Normal", 3, {Mode_BC1_Normal, Mode_BC3_Normal, Mode_BC5_Normal}},
+    {"Lightmap", 4, {Mode_BC1, Mode_BC3_YCoCg, Mode_BC3_RGBM, Mode_BC3_RGBS}},
 };
 const int s_imageTestCount = ARRAY_SIZE(s_imageTests);
 
+enum ImageType {        // Kind of data held by an ImageSet; main() branches on it when preparing images.
+    ImageType_RGB,      // Color only: main() sets no alpha mode or normal-map flag for these sets.
+    ImageType_RGBA,     // Alpha is treated as transparency (AlphaMode_Transparency).
+    ImageType_Normal,   // Images are flagged as normal maps (setNormalMap(true)).
+    ImageType_HDR,      // Lightmaps: alpha is coverage; RGB is scaled by 1/4 and clamped before compression.
+};
+
 struct ImageSet
 {
     const char * name;
     const char * basePath;
     const char ** fileNames;
     int fileCount;
+    ImageType type;
 };
 static ImageSet s_imageSets[] = {
-    {"Kodak",       "kodak",        s_kodakImageSet,        ARRAY_SIZE(s_kodakImageSet)},       // 0
-    {"Waterloo",    "waterloo",     s_waterlooImageSet,     ARRAY_SIZE(s_waterlooImageSet)},    // 1
-    {"Epic",        "epic",         s_epicImageSet,         ARRAY_SIZE(s_epicImageSet)},        // 2
-    {"Farbraush",   "farbrausch",   s_farbrauschImageSet,   ARRAY_SIZE(s_farbrauschImageSet)},  // 3
-    {"Lugaru",      "lugaru",       s_lugaruImageSet,       ARRAY_SIZE(s_lugaruImageSet)},      // 4
-    {"Quake3",      "quake3",       s_quake3ImageSet,       ARRAY_SIZE(s_quake3ImageSet)},      // 5
-    {"Witness",     "witness",      s_witnessImageSet,      ARRAY_SIZE(s_witnessImageSet)},     // 6
-    {"Lightmap",    "lightmap",     s_witnessLmapImageSet,  ARRAY_SIZE(s_witnessLmapImageSet)}, // 7
+    {"Kodak",       "kodak",        s_kodakImageSet,        ARRAY_SIZE(s_kodakImageSet),        ImageType_RGB},     // 0
+    {"Waterloo",    "waterloo",     s_waterlooImageSet,     ARRAY_SIZE(s_waterlooImageSet),     ImageType_RGB},     // 1
+    {"Epic",        "epic",         s_epicImageSet,         ARRAY_SIZE(s_epicImageSet),         ImageType_RGB},     // 2
+    {"Farbraush",   "farbrausch",   s_farbrauschImageSet,   ARRAY_SIZE(s_farbrauschImageSet),   ImageType_RGB},     // 3
+    {"Lugaru",      "lugaru",       s_lugaruImageSet,       ARRAY_SIZE(s_lugaruImageSet),       ImageType_RGBA},    // 4
+    {"Quake3",      "quake3",       s_quake3ImageSet,       ARRAY_SIZE(s_quake3ImageSet),       ImageType_RGBA},    // 5
+    {"Witness",     "witness",      s_witnessImageSet,      ARRAY_SIZE(s_witnessImageSet),      ImageType_RGB},     // 6
+    {"Lightmap",    "lightmap",     s_witnessLmapImageSet,  ARRAY_SIZE(s_witnessLmapImageSet),  ImageType_HDR},     // 7
 };
 const int s_imageSetCount = sizeof(s_imageSets)/sizeof(s_imageSets[0]);
 
@@ -237,14 +247,8 @@ struct MyOutputHandler : public nvtt::OutputHandler
         return true;
     }
 
-    nvtt::TexImage decompress(Mode mode, nvtt::Decoder decoder)
+    nvtt::TexImage decompress(Mode mode, nvtt::Format format, nvtt::Decoder decoder)
     {
-        nvtt::Format format; 
-        if (mode == Mode_BC1 || mode == Mode_BC1_Alpha || mode == Mode_BC1_Normal) format = nvtt::Format_BC1;
-        else if (mode == Mode_BC2_Alpha) format = nvtt::Format_BC2;
-        else if (mode == Mode_BC5_Normal) format = nvtt::Format_BC5;
-        else format = nvtt::Format_BC3;
-
         nvtt::TexImage img;
         img.setImage2D(format, decoder, m_width, m_height, m_data);
 
@@ -427,7 +431,8 @@ int main(int argc, char *argv[])
     //TextWriter csvWriter(&csvStream);
 
     Path graphFileName;
-    graphFileName.format("%s/chart.txt", outPath/*, test.name*/);
+    graphFileName.format("%s/chart_%s_RMSE.txt", outPath, test.name); // The chart currently plots RMSE; switch names together with the "cielab" lines.
+    //graphFileName.format("%s/chart_%s_CIE-Lab.txt", outPath, test.name);
     StdOutputStream graphStream(graphFileName.str());
     TextWriter graphWriter(&graphStream);
 
@@ -446,7 +451,8 @@ int main(int argc, char *argv[])
     graphWriter << "&chxt=x,y&chxtc=0,-1000|1,-1000";
 
     // Labels.
-    graphWriter << "&chxr=0,1," << set.fileCount << ",1|1,0,0.05,0.01";
+    graphWriter << "&chxr=0,1," << set.fileCount << ",1|1,0,0.05,0.01"; // rmse
+    //graphWriter << "&chxr=0,1," << set.fileCount << ",1|1,4,22,1";  // cielab
     graphWriter << "&chdlp=b"; // Labels at the bottom.
 
     // Line colors.
@@ -473,7 +479,8 @@ int main(int argc, char *argv[])
     graphWriter << "&chds=";
     for (int t = 0; t < test.count; t++)
     {
-        graphWriter << "0,0.05";
+        graphWriter << "0,0.05"; // rmse
+        //graphWriter << "4,22";   // cielab
         if (t != test.count-1) graphWriter << ",";
     }
 
@@ -486,15 +493,14 @@ int main(int argc, char *argv[])
     }
 
     // Title
-    graphWriter << "&chtt=" << set.name << " - " << test.name;
-
-    float totalTime = 0;
-    float totalRMSE = 0;
-    //int failedTests = 0;
-    float totalDiff = 0;
+    graphWriter << "&chtt=" << set.name << "%20-%20" << test.name << "%20-%20RMSE";
+    //graphWriter << "&chtt=" << set.name << "%20-%20" << test.name << "%20-%20CIE-Lab";
+    
 
 
     Timer timer;
+    //int failedTests = 0;
+    //float totalDiff = 0;
 
     nvtt::TexImage img;
 
@@ -504,26 +510,42 @@ int main(int argc, char *argv[])
 
     for (int t = 0; t < test.count; t++)
     {
+        float totalTime = 0;
+        float totalRMSE = 0;
+        float totalDeltaE = 0;
+
         Mode mode = test.modes[t];
-        if (mode == Mode_BC1 || mode == Mode_BC1_Alpha || mode == Mode_BC1_Normal) {
-            compressionOptions.setFormat(nvtt::Format_BC1);
+
+        nvtt::Format format = nvtt::Format_BC2; // Default covers Mode_BC2_Alpha, the only mode the chain below does not match (was read uninitialized).
+        if (mode == Mode_BC1 || mode == Mode_BC1_Alpha || mode == Mode_BC1_Normal || mode == Mode_BC3_RGBS) {
+            format = nvtt::Format_BC1;
         }
-        else if (mode == Mode_BC3_Alpha || mode == Mode_BC3_YCoCg || mode == Mode_BC3_RGBM || mode == Mode_BC3_LUVW || mode == Mode_BC3_Lightmap_1 || mode == Mode_BC3_Lightmap_2) {
-            compressionOptions.setFormat(nvtt::Format_BC3);
+        else if (mode == Mode_BC3_Alpha || mode == Mode_BC3_YCoCg || mode == Mode_BC3_RGBM || mode == Mode_BC3_LUVW) {
+            format = nvtt::Format_BC3;
         }
         else if (mode == Mode_BC3_Normal) {
-            compressionOptions.setFormat(nvtt::Format_BC3n);
+            format = nvtt::Format_BC3n;
         }
         else if (mode == Mode_BC5_Normal) {
-            compressionOptions.setFormat(nvtt::Format_BC5);
+            format = nvtt::Format_BC5;
         }
+        
+        compressionOptions.setFormat(format);
 
-        if (mode == Mode_BC3_Alpha || mode == Mode_BC3_Lightmap_1 || mode == Mode_BC3_Lightmap_2) { // Lightmap's alpha channel is coverage.
+        if (set.type == ImageType_RGBA) {
             img.setAlphaMode(nvtt::AlphaMode_Transparency);
         }
-        if (mode == Mode_BC1_Normal || mode == Mode_BC3_Normal || mode == Mode_BC5_Normal) {
+        else if (set.type == ImageType_Normal) { 
             img.setNormalMap(true);
         }
+        else if (set.type == ImageType_HDR) { // Lightmap's alpha channel is coverage.
+            img.setAlphaMode(nvtt::AlphaMode_Transparency);
+        }
+
+        // Create output directory.
+        Path outputFilePath;
+        outputFilePath.format("%s/%s", outPath, s_modeNames[test.modes[t]]);
+        FileSystem::createDirectory(outputFilePath.str());
 
 
         printf("Processing Set: %s\n", set.name);
@@ -538,65 +560,72 @@ int main(int argc, char *argv[])
             if (img.isNormalMap()) {
                 img.normalizeNormalMap();
             }
+            if (set.type == ImageType_HDR) {
+                img.scaleBias(0, 1.0f/4.0f, 0.0f); img.clamp(0);
+                img.scaleBias(1, 1.0f/4.0f, 0.0f); img.clamp(1);
+                img.scaleBias(2, 1.0f/4.0f, 0.0f); img.clamp(2);
+                img.toGamma(2);
+            }
 
             nvtt::TexImage tmp = img;
+            if (mode == Mode_BC1) {
+                if (set.type == ImageType_HDR) {
+                    /*for (int i = 0; i < 3; i++) {
+                        tmp.scaleBias(i, 0.25f, 0);
+                        tmp.clamp(i);
+                    }*/
+                }
+            }
             if (mode == Mode_BC3_YCoCg) {
-                tmp.toYCoCg();
-                tmp.blockScaleCoCg();
-                tmp.scaleBias(0, 0.5, 0.5);
-                tmp.scaleBias(1, 0.5, 0.5);
+                tmp.setAlphaMode(nvtt::AlphaMode_None);
+                if (set.type == ImageType_HDR) {
+                    /*for (int i = 0; i < 3; i++) {
+                        tmp.scaleBias(i, 1.0f/4.0f, 0);
+                        tmp.clamp(i);
+                    }*/
+                }
+                tmp.toYCoCg();          // Y=3, Co=0, Cg=1
+                tmp.blockScaleCoCg();   // Co=0, Cg=1, Scale=2, ScaleBits = 5
+
+                tmp.scaleBias(0, 123.0f/255.0f, 123.0f/255.0f); tmp.clamp(0, 0, 246.0f/255.0f); // -1->0, 0->123, 1->246
+                tmp.scaleBias(1, 125.0f/255.0f, 125.0f/255.0f); tmp.clamp(1, 0, 250.0f/255.0f); // -1->0, 0->125, 1->250
+
+                //tmp.scaleBias(0, 0.5f, 0.5f); tmp.clamp(0);
+                //tmp.scaleBias(1, 0.5f, 0.5f); tmp.clamp(1);
+
+                tmp.clamp(2);
+                tmp.clamp(3);
             }
             else if (mode == Mode_BC3_RGBM) {
-                tmp.toRGBM();
+                tmp.setAlphaMode(nvtt::AlphaMode_None);
+                if (set.type == ImageType_HDR) {
+                    tmp.toRGBM(/*4*/);
+                }
+                else {
+                    tmp.toRGBM();
+                }
             }
             else if (mode == Mode_BC3_LUVW) {
-                tmp.toLUVW();
-            }
-            else if (mode == Mode_BC3_Lightmap_1) {
-                tmp.toRGBM(4);
-
-                /*float rmin, rmax;
-                tmp.range(0, &rmin, &rmax);
-
-                float gmin, gmax;
-                tmp.range(1, &gmin, &gmax);
-
-                float bmin, bmax;
-                tmp.range(2, &bmin, &bmax);
-
-                float lmin, lmax;
-                tmp.range(3, &lmin, &lmax);
-
-                printf("rmin: %.3f   rmax: %.3f\n", rmin, rmax);
-                printf("gmin: %.3f   gmax: %.3f\n", gmin, gmax);
-                printf("bmin: %.3f   bmax: %.3f\n", bmin, bmax);
-                printf("lmin: %.3f   lmax: %.3f\n", lmin, lmax);
-
-                const int N = 32;
-                int chistogram[N];
-                int lhistogram[N];
-                memset(chistogram, 0, sizeof(chistogram)); 
-                memset(lhistogram, 0, sizeof(lhistogram));
-
-                tmp.histogram(0, 0, 1, N, chistogram);
-                tmp.histogram(1, 0, 1, N, chistogram);
-                tmp.histogram(2, 0, 1, N, chistogram);
-                tmp.histogram(3, 0, 1, N, lhistogram);
-
-                printf("Color histogram:\n");
-                for (int i = 0; i < N; i++) {
-                    printf("%d, ", chistogram[i]);
+                tmp.setAlphaMode(nvtt::AlphaMode_None);
+                if (set.type == ImageType_HDR) {
+                    tmp.toLUVW(/*4*/);
                 }
-                printf("\n");
-
-                printf("Luminance histogram:\n");
-                for (int i = 0; i < N; i++) {
-                    printf("%d, ", lhistogram[i]);
+                else {
+                    tmp.toLUVW();
                 }
-                printf("\n");*/
             }
-            else if (mode == Mode_BC3_Lightmap_2) {
-                tmp.toLUVW(4);
+            else if (mode == Mode_BC3_RGBS) {
+                //tmp.toJPEGLS();
+                //tmp.scaleBias(0, 123.0f/255.0f, 123.0f/255.0f); tmp.clamp(0, 0, 246.0f/255.0f); // -1->0, 0->123, 1->246
+                //tmp.scaleBias(2, 123.0f/255.0f, 123.0f/255.0f); tmp.clamp(0, 0, 246.0f/255.0f); // -1->0, 0->123, 1->246
+
+                // Not helping...
+                //tmp.blockLuminanceScale(0.1f);
+                /*tmp.toYCoCg();
+                tmp.scaleBias(0, 0.5, 0.5);
+                tmp.scaleBias(1, 0.5, 0.5);
+                tmp.swizzle(0, 3, 1, 4); // Co Cg 1 Y -> Co Y Cg 1
+                tmp.copyChannel(img, 3); // Restore alpha channel for weighting.*/
             }
 
 
@@ -610,59 +639,152 @@ int main(int argc, char *argv[])
             printf("  Time: \t%.3f sec\n", timer.elapsed());
             totalTime += timer.elapsed();
 
-            nvtt::TexImage img_out = outputHandler.decompress(mode, decoder);
+            nvtt::TexImage img_out = outputHandler.decompress(mode, format, decoder);
             img_out.setAlphaMode(img.alphaMode());
             img_out.setNormalMap(img.isNormalMap());
 
-            if (mode == Mode_BC3_YCoCg) {
-                img_out.scaleBias(0, 1.0, -0.5);
-                img_out.scaleBias(1, 1.0, -0.5);
+            if (mode == Mode_BC1) {
+                if (set.type == ImageType_HDR) {
+                    /*for (int i = 0; i < 3; i++) {
+                        img_out.scaleBias(i, 4.0f, 0);
+                    }*/
+                }
+            }
+            else if (mode == Mode_BC3_YCoCg) {
+                img_out.scaleBias(0, 255.0f/123, -1.0f); // 0->-1, 123->0, 246->1
+                img_out.scaleBias(1, 255.0f/125, -1.0f); // 0->-1, 125->0, 250->1
+
+                //img_out.scaleBias(0, 2.0f, -1.0f);
+                //img_out.scaleBias(1, 2.0f, -1.0f);
+                
                 img_out.fromYCoCg();
+                img_out.clamp(0);
+                img_out.clamp(1);
+                img_out.clamp(2);
+                if (set.type == ImageType_HDR) {
+                    /*for (int i = 0; i < 3; i++) {
+                        img_out.scaleBias(i, 4.0f, 0);
+                    }*/
+                }
             }
             else if (mode == Mode_BC3_RGBM) {
-                img_out.fromRGBM();
+                if (set.type == ImageType_HDR) {
+                    img_out.fromRGBM(/*4*/);
+                }
+                else {
+                    img_out.fromRGBM();
+                }
             }
             else if (mode == Mode_BC3_LUVW) {
-                img_out.fromLUVW();
-            }
-            else if (mode == Mode_BC3_Lightmap_1) {
-                img_out.fromRGBM(4);
+                if (set.type == ImageType_HDR) {
+                    img_out.fromLUVW(/*4*/);
+                }
+                else {
+                    img_out.fromLUVW();
+                }
             }
-            else if (mode == Mode_BC3_Lightmap_2) {
-                img_out.fromLUVW(4);
+            else if (mode == Mode_BC3_RGBS) {
+                //img_out.scaleBias(0, 255.0f/123, -1.0f);
+                //img_out.scaleBias(2, 255.0f/123, -1.0f);
+                //img_out.fromJPEGLS();
+                /*img_out.swizzle(0, 2, 4, 1);    // Co Y Cg 1 - > Co Cg 1 Y
+                img_out.scaleBias(0, 1.0, -0.5);
+                img_out.scaleBias(1, 1.0, -0.5);
+                img_out.fromYCoCg();*/
             }
 
+            nvtt::TexImage diff = nvtt::diff(img, img_out, 1.0f);
+
+            //bool residualCompression = (set.type == ImageType_HDR);
+            bool residualCompression = (mode == Mode_BC3_RGBS);
+            if (residualCompression)
+            {
+                float residualScale = 8.0f;
+                nvtt::TexImage residual = diff;
+                for (int j = 0; j < 3; j++) {
+                    residual.scaleBias(j, residualScale, 0.5); // @@ The residual scale is fairly arbitrary.
+                    residual.clamp(j);
+                }
+                residual.toGreyScale(1, 1, 1, 0);
+
+                /*outputFileName.format("%s/%s", outputFilePath.str(), set.fileNames[i]);
+                outputFileName.stripExtension();
+                outputFileName.append("_residual.png");
+                residual.save(outputFileName.str());*/
+
+                nvtt::CompressionOptions residualCompressionOptions;
+                residualCompressionOptions.setFormat(nvtt::Format_BC4);
+                residualCompressionOptions.setQuality(nvtt::Quality_Production);
+                
+                context.compress(residual, 0, 0, compressionOptions, outputOptions); // NOTE(review): passes compressionOptions, not the residualCompressionOptions (BC4) configured just above; confirm intent.
+
+                nvtt::TexImage residual_out = outputHandler.decompress(mode, format, decoder);
 
-            Path outputFilePath;
-            outputFilePath.format("%s/%s", outPath, s_modeNames[test.modes[t]]);
-            FileSystem::createDirectory(outputFilePath.str());
+                /*outputFileName.format("%s/%s", outputFilePath.str(), set.fileNames[i]);
+                outputFileName.stripExtension();
+                outputFileName.append("_residual_out.png");
+                residual_out.save(outputFileName.str());*/
 
+                residual_out.scaleBias(0, 1.0f/residualScale, -0.5f/residualScale);
+                residual_out.scaleBias(1, 1.0f/residualScale, -0.5f/residualScale);
+                residual_out.scaleBias(2, 1.0f/residualScale, -0.5f/residualScale);
+
+                img_out.addChannel(residual_out, 0, 0, -1.0f); img_out.clamp(0);
+                img_out.addChannel(residual_out, 1, 1, -1.0f); img_out.clamp(1);
+                img_out.addChannel(residual_out, 2, 2, -1.0f); img_out.clamp(2);
+            }
+
+            if (set.type == ImageType_HDR)
+            {
+                Path outputFileName;
+                outputFileName.format("%s/%s", outPath, set.fileNames[i]);
+                outputFileName.stripExtension();
+                if (set.type == ImageType_HDR) outputFileName.append(".dds");
+                else outputFileName.append(".png");
+                if (!img.save(outputFileName.str()))
+                {
+                    printf("Error saving file '%s'.\n", outputFileName.str());
+                }
+            }
+
+            // Output compressed image.
             Path outputFileName;
             outputFileName.format("%s/%s", outputFilePath.str(), set.fileNames[i]);
             outputFileName.stripExtension();
-            if (mode == Mode_BC3_Lightmap_1 || mode == Mode_BC3_Lightmap_2) {
-                outputFileName.append(".dds");
-            }
-            else {
-                outputFileName.append(".png");
-            }
+            if (set.type == ImageType_HDR) outputFileName.append(".dds");
+            else outputFileName.append(".png");
             if (!img_out.save(outputFileName.str()))
             {
                 printf("Error saving file '%s'.\n", outputFileName.str());
             }
 
+            // Output RMSE.
             float rmse = nvtt::rmsError(img, img_out);
+            if (set.type == ImageType_HDR) rmse *= 4;
             totalRMSE += rmse;
+            printf("  RMSE:          \t%.4f\n", rmse);
+
+            float deltae = nvtt::cieLabError(img, img_out);
+            totalDeltaE += deltae;
+            printf("  CIE-Lab DeltaE:\t%.4f\n", deltae);
 
-            printf("  RMSE:  \t%.4f\n", rmse);
 
             graphWriter << rmse;
+            //graphWriter << deltae;
             if (i != set.fileCount-1) graphWriter << ",";
 
 
+            // Output diff.
+            for (int j = 0; j < 3; j++) {
+                diff.scaleBias(j, 4.0f, 0.0f); 
+                diff.abs(j);
+                diff.clamp(j);
+            }
+
+            outputFileName.format("%s/%s", outputFilePath.str(), set.fileNames[i]);
             outputFileName.stripExtension();
             outputFileName.append("_diff.png");
-            nvtt::diff(img, img_out, 4.0f).save(outputFileName.str());
+            diff.save(outputFileName.str());
 
 
             // Output csv file
@@ -701,11 +823,13 @@ int main(int argc, char *argv[])
         }
 
         totalRMSE /= set.fileCount;
-        totalDiff /= set.fileCount;
+        totalDeltaE /= set.fileCount;
+        //totalDiff /= set.fileCount;
 
         printf("Total Results:\n");
-        printf("  Total Time: \t%.3f sec\n", totalTime);
-        printf("  Average RMSE:\t%.4f\n", totalRMSE);
+        printf("  Total Time:            \t%.3f sec\n", totalTime);
+        printf("  Average RMSE:          \t%.4f\n", totalRMSE);
+        printf("  Average CIE-Lab DeltaE:\t%.4f\n", totalDeltaE);
 
         if (t != test.count-1) graphWriter << "|";
     }