diff --git a/src/nvtt/cuda/CompressKernel.cu b/src/nvtt/cuda/CompressKernel.cu
index 2859e60..e63ae61 100644
--- a/src/nvtt/cuda/CompressKernel.cu
+++ b/src/nvtt/cuda/CompressKernel.cu
@@ -29,12 +29,6 @@
 
 #define NUM_THREADS 64		// Number of threads per block.
 
-#if __DEVICE_EMULATION__
-#define __debugsync() __syncthreads()
-#else
-#define __debugsync()
-#endif
-
 typedef unsigned char uchar;
 typedef unsigned short ushort;
 typedef unsigned int uint;
@@ -42,9 +36,9 @@ typedef unsigned int uint;
 template <class T> 
 __device__ inline void swap(T & a, T & b)
 {
-	T tmp = a;
-	a = b;
-	b = tmp;
+    T tmp = a;
+    a = b;
+    b = tmp;
 }
 
 __constant__ uchar OMatch5[256][2];
@@ -85,9 +79,9 @@ __device__ inline float u8_to_float(uint value)
 __device__ float3 color32ToFloat3(uint c)
 {
     float3 color;
-	color.z = u8_to_float((c >> 0) & 0xFF);
-	color.y = u8_to_float((c >> 8) & 0xFF);
-	color.x = u8_to_float((c >> 16) & 0xFF);
+    color.z = u8_to_float((c >> 0) & 0xFF);
+    color.y = u8_to_float((c >> 8) & 0xFF);
+    color.x = u8_to_float((c >> 16) & 0xFF);
     return color;
 }
 
@@ -162,61 +156,61 @@ __device__ inline float u8_to_float(uint value)
 
 inline __device__ float3 roundAndExpand565(float3 v, ushort * w)
 {
-	uint x = float_to_u8(v.x) >> 3;
-	uint y = float_to_u8(v.y) >> 2;
-	uint z = float_to_u8(v.z) >> 3;
-	*w = (x << 11) | (y << 5) | z;
-	v.x = u8_to_float((x << 3) | (x >> 2));
-	v.y = u8_to_float((y << 2) | (y >> 4));
-	v.z = u8_to_float((z << 3) | (z >> 2));
-//	v.x = u8_to_float(x) * 255.0f / 31.0f;
-//	v.y = u8_to_float(y) * 255.0f / 63.0f;
-//	v.z = u8_to_float(z) * 255.0f / 31.0f;
-	return v;
+    uint x = float_to_u8(v.x) >> 3;
+    uint y = float_to_u8(v.y) >> 2;
+    uint z = float_to_u8(v.z) >> 3;
+    *w = (x << 11) | (y << 5) | z;
+    v.x = u8_to_float((x << 3) | (x >> 2));
+    v.y = u8_to_float((y << 2) | (y >> 4));
+    v.z = u8_to_float((z << 3) | (z >> 2));
+//    v.x = u8_to_float(x) * 255.0f / 31.0f;
+//    v.y = u8_to_float(y) * 255.0f / 63.0f;
+//    v.z = u8_to_float(z) * 255.0f / 31.0f;
+    return v;
 }
 #else
 
 inline __device__ float3 roundAndExpand565(float3 v, ushort * w)
 {
-	uint x = __float2uint_rn(__saturatef(v.x) * 31.0f);
-	uint y = __float2uint_rn(__saturatef(v.y) * 63.0f);
-	uint z = __float2uint_rn(__saturatef(v.z) * 31.0f);
+    uint x = __float2uint_rn(__saturatef(v.x) * 31.0f);
+    uint y = __float2uint_rn(__saturatef(v.y) * 63.0f);
+    uint z = __float2uint_rn(__saturatef(v.z) * 31.0f);
 
-	//uint x = float_to_u5(v.x);
+    //uint x = float_to_u5(v.x);
     //uint y = float_to_u6(v.y);
     //uint z = float_to_u5(v.z);
 
-	*w = (x << 11) | (y << 5) | z;
-	
+    *w = (x << 11) | (y << 5) | z;
+
     v.x = __uint2float_rn(x) * 1.0f / 31.0f;
-	v.y = __uint2float_rn(y) * 1.0f / 63.0f;
-	v.z = __uint2float_rn(z) * 1.0f / 31.0f;
+    v.y = __uint2float_rn(y) * 1.0f / 63.0f;
+    v.z = __uint2float_rn(z) * 1.0f / 31.0f;
 
-	//v.x = u8_to_float((x << 3) | (x >> 2));
-	//v.y = u8_to_float((y << 2) | (y >> 4));
-	//v.z = u8_to_float((z << 3) | (z >> 2));
+    //v.x = u8_to_float((x << 3) | (x >> 2));
+    //v.y = u8_to_float((y << 2) | (y >> 4));
+    //v.z = u8_to_float((z << 3) | (z >> 2));
 
     return v;
 }
 #endif
 inline __device__ float2 roundAndExpand56(float2 v, ushort * w)
 {
-	uint x = __float2uint_rn(__saturatef(v.x) * 31.0f);
-	uint y = __float2uint_rn(__saturatef(v.y) * 63.0f);
-	*w = (x << 11) | (y << 5);
-	v.x = __uint2float_rn(x) * 1.0f / 31.0f;
-	v.y = __uint2float_rn(y) * 1.0f / 63.0f;
-	return v;
+    uint x = __float2uint_rn(__saturatef(v.x) * 31.0f);
+    uint y = __float2uint_rn(__saturatef(v.y) * 63.0f);
+    *w = (x << 11) | (y << 5);
+    v.x = __uint2float_rn(x) * 1.0f / 31.0f;
+    v.y = __uint2float_rn(y) * 1.0f / 63.0f;
+    return v;
 }
 
 inline __device__ float2 roundAndExpand88(float2 v, ushort * w)
 {
-	uint x = __float2uint_rn(__saturatef(v.x) * 255.0f);
-	uint y = __float2uint_rn(__saturatef(v.y) * 255.0f);
-	*w = (x << 8) | y;
-	v.x = __uint2float_rn(x) * 1.0f / 255.0f;
-	v.y = __uint2float_rn(y) * 1.0f / 255.0f;
-	return v;
+    uint x = __float2uint_rn(__saturatef(v.x) * 255.0f);
+    uint y = __float2uint_rn(__saturatef(v.y) * 255.0f);
+    *w = (x << 8) | y;
+    v.x = __uint2float_rn(x) * 1.0f / 255.0f;
+    v.y = __uint2float_rn(y) * 1.0f / 255.0f;
+    return v;
 }
 
 
@@ -228,13 +222,13 @@ __device__ float3 blockError4(const float3 * colors, uint permutation, float3 a,
 {
     float3 error = make_float3(0.0f, 0.0f, 0.0f);
 
-	for (int i = 0; i < 16; i++)
-	{
-		const uint bits = permutation >> (2*i);
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
 
-		float beta = (bits & 1);
-		if (bits & 2) beta = (1 + beta) / 3.0f;
-		float alpha = 1.0f - beta;
+        float beta = (bits & 1);
+        if (bits & 2) beta = (1 + beta) / 3.0f;
+        float alpha = 1.0f - beta;
 
         float3 diff = colors[i] - (a*alpha + b*beta);
 
@@ -251,13 +245,13 @@ __device__ float3 blockError4(const float3 * colors, uint permutation, ushort c0
     int3 color0 = color16ToInt3(c0);
     int3 color1 = color16ToInt3(c1);
 
-	for (int i = 0; i < 16; i++)
-	{
-		const uint bits = permutation >> (2*i);
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
 
-		int beta = (bits & 1);
-		if (bits & 2) beta = (1 + beta);
-		float alpha = 3 - beta;
+        int beta = (bits & 1);
+        if (bits & 2) beta = (1 + beta);
+        float alpha = 3 - beta;
 
         int3 color;
         color.x = (color0.x * alpha + color1.x * beta) / 3;
@@ -277,13 +271,13 @@ __device__ float3 blockError3(const float3 * colors, uint permutation, float3 a,
 {
     float3 error = make_float3(0.0f, 0.0f, 0.0f);
 
-	for (int i = 0; i < 16; i++)
-	{
-		const uint bits = permutation >> (2*i);
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
 
-		float beta = (bits & 1);
-		if (bits & 2) beta = 0.5f;
-		float alpha = 1.0f - beta;
+        float beta = (bits & 1);
+        if (bits & 2) beta = 0.5f;
+        float alpha = 1.0f - beta;
 
         float3 diff = colors[i] - (a*alpha + b*beta);
 
@@ -308,67 +302,43 @@ __device__ int packColors(float3 * values, float * weights, int * ranks)
 {
     const int tid = threadIdx.x;
 
-	__shared__ int count;
-	count = 0;
+    __shared__ int count;
+    count = 0;
 
-	bool alive = true;
+    bool alive = true;
 
-	// Append this 
-	for (int i = 0; i < 16; i++)
-	{
-		// One thread leads on each iteration.
-		if (tid == i) {
-
-			// If thread alive, then append element.
-			if (alive) {
-				values[count] = values[i];
-				weights[count] = weights[i];
-				count++;
-			}
+    // Append each still-alive thread's element in turn, compacting values and weights to the front.
+    for (int i = 0; i < 16; i++)
+    {
+        // One thread leads on each iteration.
+        if (tid == i) {
+
+            // If thread alive, then append element.
+            if (alive) {
+                values[count] = values[i];
+                weights[count] = weights[i];
+                count++;
+            }
 
-			// Otherwise update weight.
-			else {
-				weights[ranks[i]] += weights[i];
-			}
-		}
+            // Otherwise update weight.
+            else {
+                weights[ranks[i]] += weights[i];
+            }
+        }
 
-		// Kill all threads that have the same element and record rank.
-		if (values[i] == values[tid]) {
-			alive = false;
-			ranks[tid] = count - 1;
-		}
-	}
+        // Kill all threads that have the same element and record rank.
+        if (values[i] == values[tid]) {
+            alive = false;
+            ranks[tid] = count - 1;
+        }
+    }
 
-	return count;
+    return count;
 }
 
 
 __device__ void sortColors(const float * values, int * ranks)
 {
-#if __DEVICE_EMULATION__
-    if (threadIdx.x == 0)
-    {
-        for (int tid = 0; tid < 16; tid++)
-        {
-            int rank = 0;
-            for (int i = 0; i < 16; i++)
-            {
-                rank += (values[i] < values[tid]);
-            }
-            
-            ranks[tid] = rank;
-        }
-
-        // Resolve elements with the same index.
-        for (int i = 0; i < 15; i++)
-        {
-            for (int tid = 0; tid < 16; tid++)
-            {
-                if (tid > i && ranks[tid] == ranks[i]) ++ranks[tid];
-            }
-        }
-    }
-#else
     const int tid = threadIdx.x;
 
     int rank = 0;
@@ -387,35 +357,10 @@ __device__ void sortColors(const float * values, int * ranks)
     {
         if ((tid > i) & (ranks[tid] == ranks[i])) ++ranks[tid];
     }
-#endif
 }
 
 __device__ void sortColors(const float * values, int * ranks, int count)
 {
-#if __DEVICE_EMULATION__
-    if (threadIdx.x == 0)
-    {
-        for (int tid = 0; tid < count; tid++)
-        {
-            int rank = 0;
-            for (int i = 0; i < count; i++)
-            {
-                rank += (values[i] < values[tid]);
-            }
-            
-            ranks[tid] = rank;
-        }
-
-        // Resolve elements with the same index.
-        for (int i = 0; i < count-1; i++)
-        {
-            for (int tid = 0; tid < count; tid++)
-            {
-                if (tid > i && ranks[tid] == ranks[i]) ++ranks[tid];
-            }
-        }
-    }
-#else
     const int tid = threadIdx.x;
 
     int rank = 0;
@@ -434,7 +379,6 @@ __device__ void sortColors(const float * values, int * ranks, int count)
     {
         if ((tid > i) & (ranks[tid] == ranks[i])) ++ranks[tid];
     }
-#endif
 }
 
 
@@ -443,95 +387,38 @@ __device__ void sortColors(const float * values, int * ranks, int count)
 // Load color block to shared mem
 ////////////////////////////////////////////////////////////////////////////////
 
-/*__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
+__device__ void loadColorBlockTex(uint firstBlock, uint blockWidth, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
 {
-	const int bid = blockIdx.x;
-	const int idx = threadIdx.x;
+    const int bid = blockIdx.x;
+    const int idx = threadIdx.x;
 
-	__shared__ float dps[16];
+    __shared__ float dps[16];
 
-	if (idx < 16)
-	{
-		// Read color and copy to shared mem.
-		uint c = image[(bid) * 16 + idx];
-		
-		colors[idx] = color32ToFloat3(c);
-		
-		// No need to synchronize, 16 < warp size.
-		__debugsync();
-		
-		// Sort colors along the best fit line.
-		colorSums(colors, sums);
-		float3 axis = bestFitLine(colors, sums[0], kColorMetric);
-		
-		*sameColor = (axis == make_float3(0, 0, 0));
-		
-		dps[idx] = dot(colors[idx], axis);
-		
-		__debugsync();
-		
-		sortColors(dps, xrefs);
-		
-		float3 tmp = colors[idx];
-		__debugsync();
-		colors[xrefs[idx]] = tmp;
-	}
-#if __DEVICE_EMULATION__
-	else
-	{
-		__debugsync();
-		__debugsync();
-		__debugsync();
-	}
-#endif
-}*/
+    if (idx < 16)
+    {
+        float x = 4 * ((firstBlock + bid) % blockWidth) + idx % 4; // @@ Avoid mod and div by using 2D grid?
+        float y = 4 * ((firstBlock + bid) / blockWidth) + idx / 4;
 
-__device__ void loadColorBlockTex(uint firstBlock, uint width, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
-{
-	const int bid = blockIdx.x;
-	const int idx = threadIdx.x;
+        // Read color and copy to shared mem.
+        float4 c = tex2D(tex, x, y);
 
-	__shared__ float dps[16];
+        colors[idx].x = c.z;
+        colors[idx].y = c.y;
+        colors[idx].z = c.x;
 
-	if (idx < 16)
-	{
-		float x = 4 * ((firstBlock + bid) % width) + idx % 4; // @@ Avoid mod and div by using 2D grid?
-		float y = 4 * ((firstBlock + bid) / width) + idx / 4;
+        // Sort colors along the best fit line.
+        colorSums(colors, sums);
+        float3 axis = bestFitLine(colors, sums[0], kColorMetric);
 
-		// Read color and copy to shared mem.
-		float4 c = tex2D(tex, x, y);
+        *sameColor = (axis == make_float3(0, 0, 0));
 
-		colors[idx].x = c.z;
-		colors[idx].y = c.y;
-		colors[idx].z = c.x;
+        dps[idx] = dot(colors[idx], axis);
 
-		// No need to synchronize, 16 < warp size.
-		__debugsync();
-		
-		// Sort colors along the best fit line.
-		colorSums(colors, sums);
-		float3 axis = bestFitLine(colors, sums[0], kColorMetric);
-		
-		*sameColor = (axis == make_float3(0, 0, 0));
-		
-		dps[idx] = dot(colors[idx], axis);
-		
-		__debugsync();
-		
-		sortColors(dps, xrefs);
-		
-		float3 tmp = colors[idx];
-		__debugsync();
-		colors[xrefs[idx]] = tmp;
-	}
-#if __DEVICE_EMULATION__
-	else
-	{
-		__debugsync();
-		__debugsync();
-		__debugsync();
-	}
-#endif
+        sortColors(dps, xrefs);
+
+        float3 tmp = colors[idx];
+        colors[xrefs[idx]] = tmp;
+    }
 }
 
 /*
@@ -577,159 +464,75 @@ __device__ void loadColorBlockTex(uint firstBlock, uint w, float3 colors[16], fl
 
 __device__ void loadColorBlockTex(uint firstBlock, uint width, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
 {
-	const int bid = blockIdx.x;
-	const int idx = threadIdx.x;
+    const int bid = blockIdx.x;
+    const int idx = threadIdx.x;
 
-	__shared__ float3 rawColors[16];
-	__shared__ float dps[16];
+    __shared__ float3 rawColors[16];
+    __shared__ float dps[16];
 
-	if (idx < 16)
-	{
-		float x = 4 * ((firstBlock + bid) % width) + idx % 4; // @@ Avoid mod and div by using 2D grid?
-		float y = 4 * ((firstBlock + bid) / width) + idx / 4;
+    if (idx < 16)
+    {
+        float x = 4 * ((firstBlock + bid) % width) + idx % 4; // @@ Avoid mod and div by using 2D grid?
+        float y = 4 * ((firstBlock + bid) / width) + idx / 4;
 
-		// Read color and copy to shared mem.
-		float4 c = tex2D(tex, x, y);
+        // Read color and copy to shared mem.
+        float4 c = tex2D(tex, x, y);
 
-		rawColors[idx].x = c.z;
-		rawColors[idx].y = c.y;
-		rawColors[idx].z = c.x;
-		weights[idx] = c.w;
+        rawColors[idx].x = c.z;
+        rawColors[idx].y = c.y;
+        rawColors[idx].z = c.x;
+        weights[idx] = c.w;
 
-		colors[idx] = rawColors[idx] * weights[idx];
+        colors[idx] = rawColors[idx] * weights[idx];
 
-		// No need to synchronize, 16 < warp size.
-		__debugsync();
-		
-		// Sort colors along the best fit line.
-		colorSums(colors, sums);
-		float3 axis = bestFitLine(colors, sums[0], kColorMetric);
-		
-		*sameColor = (axis == make_float3(0, 0, 0));
-		
-		// Single color compressor needs unweighted colors.
-		if (*sameColor) colors[idx] = rawColors[idx];
+        // Sort colors along the best fit line.
+        colorSums(colors, sums);
+        float3 axis = bestFitLine(colors, sums[0], kColorMetric);
 
-		dps[idx] = dot(colors[idx], axis);
-		
-		__debugsync();
-		
-		sortColors(dps, xrefs);
-		
-		float3 tmp = colors[idx];
-		float w = weights[idx];
-		__debugsync();
-		colors[xrefs[idx]] = tmp;
-		weights[xrefs[idx]] = w;
-	}
-#if __DEVICE_EMULATION__
-	else
-	{
-		__debugsync();
-		__debugsync();
-		__debugsync();
-	}
-#endif
-}
+        *sameColor = (axis == make_float3(0, 0, 0));
 
-/*
-__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
-{
-	const int bid = blockIdx.x;
-	const int idx = threadIdx.x;
+        // Single color compressor needs unweighted colors.
+        if (*sameColor) colors[idx] = rawColors[idx];
 
-	__shared__ float3 rawColors[16];
-	__shared__ float dps[16];
+        dps[idx] = dot(colors[idx], axis);
 
-	if (idx < 16)
-	{
-		// Read color and copy to shared mem.
-		uint c = image[(bid) * 16 + idx];
-		
-		rawColors[idx].z = ((c >> 0) & 0xFF) * (1.0f / 255.0f);
-		rawColors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
-		rawColors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
-		weights[idx] = (((c >> 24) & 0xFF) + 1) * (1.0f / 256.0f);
-		
-		colors[idx] = rawColors[idx] * weights[idx];
+        sortColors(dps, xrefs);
 
-		// No need to synchronize, 16 < warp size.
-		__debugsync();
+        float3 tmp = colors[idx];
+        float w = weights[idx];
+        colors[xrefs[idx]] = tmp;
+        weights[xrefs[idx]] = w;
+    }
+}
 
-		// Sort colors along the best fit line.
-		colorSums(colors, sums);
-		float3 axis = bestFitLine(colors, sums[0], kColorMetric);
+__device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16], int * sameColor)
+{
+    const int bid = blockIdx.x;
+    const int idx = threadIdx.x;
 
-		*sameColor = (axis == make_float3(0, 0, 0));
+    __shared__ float dps[16];
 
-		// Single color compressor needs unweighted colors.
-		if (*sameColor) colors[idx] = rawColors[idx];
+    if (idx < 16)
+    {
+        // Read color and copy to shared mem.
+        uint c = image[(bid) * 16 + idx];
 
-		dps[idx] = dot(rawColors[idx], axis);
-		
-		__debugsync();
-		
-		sortColors(dps, xrefs);
-		
-		float3 tmp = colors[idx];
-		float w = weights[idx];
-		__debugsync();
-		colors[xrefs[idx]] = tmp;
-		weights[xrefs[idx]] = w;
-	}
-#if __DEVICE_EMULATION__
-	else
-	{
-		__debugsync();
-		__debugsync();
-		__debugsync();
-	}
-#endif
-}
-*/
+        colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
+        colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
 
-__device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16], int * sameColor)
-{
-	const int bid = blockIdx.x;
-	const int idx = threadIdx.x;
+        // Sort colors along the best fit line.
+        colorSums(colors, sums);
+        float2 axis = bestFitLine(colors, sums[0]);
 
-	__shared__ float dps[16];
+        *sameColor = (axis == make_float2(0, 0));
 
-	if (idx < 16)
-	{
-		// Read color and copy to shared mem.
-		uint c = image[(bid) * 16 + idx];
-	
-		colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
-		colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
-		
-		// No need to synchronize, 16 < warp size.
-		__debugsync();
+        dps[idx] = dot(colors[idx], axis);
 
-		// Sort colors along the best fit line.
-		colorSums(colors, sums);
-		float2 axis = bestFitLine(colors, sums[0]);
-		
-		*sameColor = (axis == make_float2(0, 0));
+        sortColors(dps, xrefs);
 
-		dps[idx] = dot(colors[idx], axis);
-		
-		__debugsync();
-		
-		sortColors(dps, xrefs);
-		
-		float2 tmp = colors[idx];
-		__debugsync();
-		colors[xrefs[idx]] = tmp;
-	}
-#if __DEVICE_EMULATION__
-	else
-	{
-		__debugsync();
-		__debugsync();
-		__debugsync();
-	}
-#endif
+        float2 tmp = colors[idx];
+        colors[xrefs[idx]] = tmp;
+    }
 }
 
 
@@ -738,82 +541,82 @@ __device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sum
 ////////////////////////////////////////////////////////////////////////////////
 __device__ float evalPermutation4(const float3 * colors, uint permutation, ushort * start, ushort * end)
 {
-	// Compute endpoints using least squares.
-	float alpha2_sum = 0.0f;
-	float beta2_sum = 0.0f;
-	float alphabeta_sum = 0.0f;
-	float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
-	float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f);
+    // Compute endpoints using least squares.
+    float alpha2_sum = 0.0f;
+    float beta2_sum = 0.0f;
+    float alphabeta_sum = 0.0f;
+    float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
+    float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f);
 
-	// Compute alpha & beta for this permutation.
-	for (int i = 0; i < 16; i++)
-	{
-		const uint bits = permutation >> (2*i);
-		
-		float beta = (bits & 1);
-		if (bits & 2) beta = (1 + beta) / 3.0f;
-		float alpha = 1.0f - beta;
-		
-		alpha2_sum += alpha * alpha;
-		beta2_sum += beta * beta;
-		alphabeta_sum += alpha * beta;
-		alphax_sum += alpha * colors[i];
-		betax_sum += beta * colors[i];
-	}
+    // Compute alpha & beta for this permutation.
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
 
-	const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+        float beta = (bits & 1);
+        if (bits & 2) beta = (1 + beta) / 3.0f;
+        float alpha = 1.0f - beta;
 
-	float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
-	float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
-	
-	// Round a, b to the closest 5-6-5 color and expand...
-	a = roundAndExpand565(a, start);
-	b = roundAndExpand565(b, end);
+        alpha2_sum += alpha * alpha;
+        beta2_sum += beta * beta;
+        alphabeta_sum += alpha * beta;
+        alphax_sum += alpha * colors[i];
+        betax_sum += beta * colors[i];
+    }
 
-	// compute the error
-	float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
 
-	return dot(e, kColorMetricSqr);
+    float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
+    float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
+
+    // Round a, b to the closest 5-6-5 color and expand...
+    a = roundAndExpand565(a, start);
+    b = roundAndExpand565(b, end);
+
+    // compute the error
+    float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+
+    return dot(e, kColorMetricSqr);
 }
 
 __device__ float evalPermutation3(const float3 * colors, uint permutation, ushort * start, ushort * end)
 {
-	// Compute endpoints using least squares.
-	float alpha2_sum = 0.0f;
-	float beta2_sum = 0.0f;
-	float alphabeta_sum = 0.0f;
-	float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
-	float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f);
+    // Compute endpoints using least squares.
+    float alpha2_sum = 0.0f;
+    float beta2_sum = 0.0f;
+    float alphabeta_sum = 0.0f;
+    float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
+    float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f);
 
-	// Compute alpha & beta for this permutation.
-	for (int i = 0; i < 16; i++)
-	{
-		const uint bits = permutation >> (2*i);
+    // Compute alpha & beta for this permutation.
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
 
-		float beta = (bits & 1);
-		if (bits & 2) beta = 0.5f;
-		float alpha = 1.0f - beta;
-	
-		alpha2_sum += alpha * alpha;
-		beta2_sum += beta * beta;
-		alphabeta_sum += alpha * beta;
-		alphax_sum += alpha * colors[i];
-		betax_sum += beta * colors[i];
-	}
+        float beta = (bits & 1);
+        if (bits & 2) beta = 0.5f;
+        float alpha = 1.0f - beta;
 
-	const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+        alpha2_sum += alpha * alpha;
+        beta2_sum += beta * beta;
+        alphabeta_sum += alpha * beta;
+        alphax_sum += alpha * colors[i];
+        betax_sum += beta * colors[i];
+    }
 
-	float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
-	float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
-	
-	// Round a, b to the closest 5-6-5 color and expand...
-	a = roundAndExpand565(a, start);
-	b = roundAndExpand565(b, end);
+    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
 
-	// compute the error
-	float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+    float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
+    float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
 
-	return dot(e, kColorMetricSqr);
+    // Round a, b to the closest 5-6-5 color and expand...
+    a = roundAndExpand565(a, start);
+    b = roundAndExpand565(b, end);
+
+    // compute the error
+    float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+
+    return dot(e, kColorMetricSqr);
 }
 
 __constant__ const float alphaTable4[4] = { 9.0f, 0.0f, 6.0f, 3.0f };
@@ -823,268 +626,268 @@ __constant__ const uint prods3[4] = { 0x040000,0x000400,0x040101,0x010401 };
 
 __device__ float evalPermutation4(const float3 * colors, float3 color_sum, uint permutation, ushort * start, ushort * end)
 {
-	// Compute endpoints using least squares.
-	float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
-	uint akku = 0;
+    // Compute endpoints using least squares.
+    float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
+    uint akku = 0;
 
-	// Compute alpha & beta for this permutation.
-	#pragma unroll
-	for (int i = 0; i < 16; i++)
-	{
-		const uint bits = permutation >> (2*i);
-		
-		alphax_sum += alphaTable4[bits & 3] * colors[i];
-		akku += prods4[bits & 3];
-	}
+    // Compute alpha & beta for this permutation.
+    #pragma unroll
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
 
-	float alpha2_sum = float(akku >> 16);
-	float beta2_sum = float((akku >> 8) & 0xff);
-	float alphabeta_sum = float(akku & 0xff);
-	float3 betax_sum = 9.0f * color_sum - alphax_sum;
+        alphax_sum += alphaTable4[bits & 3] * colors[i];
+        akku += prods4[bits & 3];
+    }
 
-	const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+    float alpha2_sum = float(akku >> 16);
+    float beta2_sum = float((akku >> 8) & 0xff);
+    float alphabeta_sum = float(akku & 0xff);
+    float3 betax_sum = 9.0f * color_sum - alphax_sum;
 
-	float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
-	float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
-	
-	// Round a, b to the closest 5-6-5 color and expand...
-	a = roundAndExpand565(a, start);
-	b = roundAndExpand565(b, end);
+    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
 
-	// compute the error
-	float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+    float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
+    float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
+
+    // Round a, b to the closest 5-6-5 color and expand...
+    a = roundAndExpand565(a, start);
+    b = roundAndExpand565(b, end);
+
+    // compute the error
+    float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
 
     //float3 e = blockError4(colors, permutation, *start, *end);
 
-	return (1.0f / 9.0f) * dot(e, kColorMetricSqr);
+    return (1.0f / 9.0f) * dot(e, kColorMetricSqr);
 }
 
 __device__ float evalPermutation3(const float3 * colors, float3 color_sum, uint permutation, ushort * start, ushort * end)
 {
-	// Compute endpoints using least squares.
-	float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
-	uint akku = 0;
+    // Compute endpoints using least squares.
+    float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
+    uint akku = 0;
 
-	// Compute alpha & beta for this permutation.
-	#pragma unroll
-	for (int i = 0; i < 16; i++)
-	{
-		const uint bits = permutation >> (2*i);
+    // Compute alpha & beta for this permutation.
+    #pragma unroll
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
 
-		alphax_sum += alphaTable3[bits & 3] * colors[i];
-		akku += prods3[bits & 3];
-	}
+        alphax_sum += alphaTable3[bits & 3] * colors[i];
+        akku += prods3[bits & 3];
+    }
 
-	float alpha2_sum = float(akku >> 16);
-	float beta2_sum = float((akku >> 8) & 0xff);
-	float alphabeta_sum = float(akku & 0xff);
-	float3 betax_sum = 4.0f * color_sum - alphax_sum;
+    float alpha2_sum = float(akku >> 16);
+    float beta2_sum = float((akku >> 8) & 0xff);
+    float alphabeta_sum = float(akku & 0xff);
+    float3 betax_sum = 4.0f * color_sum - alphax_sum;
 
-	const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
 
-	float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
-	float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
-	
-	// Round a, b to the closest 5-6-5 color and expand...
-	a = roundAndExpand565(a, start);
-	b = roundAndExpand565(b, end);
+    float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
+    float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
 
-	// compute the error
-	float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+    // Round a, b to the closest 5-6-5 color and expand...
+    a = roundAndExpand565(a, start);
+    b = roundAndExpand565(b, end);
+
+    // compute the error
+    float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
 
     //float3 e = blockError3(colors, permutation, a, b);
 
-	return (1.0f / 4.0f) * dot(e, kColorMetricSqr);
+    return (1.0f / 4.0f) * dot(e, kColorMetricSqr);
 }
 
 __device__ float evalPermutation4(const float3 * colors, const float * weights, float3 color_sum, uint permutation, ushort * start, ushort * end)
 {
-	// Compute endpoints using least squares.
-	float alpha2_sum = 0.0f;
-	float beta2_sum = 0.0f;
-	float alphabeta_sum = 0.0f;
-	float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
+    // Compute endpoints using least squares.
+    float alpha2_sum = 0.0f;
+    float beta2_sum = 0.0f;
+    float alphabeta_sum = 0.0f;
+    float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
 
-	// Compute alpha & beta for this permutation.
-	for (int i = 0; i < 16; i++)
-	{
-		const uint bits = permutation >> (2*i);
-		
-		float beta = (bits & 1);
-		if (bits & 2) beta = (1 + beta) / 3.0f;
-		float alpha = 1.0f - beta;
-		
-		alpha2_sum += alpha * alpha * weights[i];
-		beta2_sum += beta * beta * weights[i];
-		alphabeta_sum += alpha * beta * weights[i];
-		alphax_sum += alpha * colors[i];
-	}
+    // Compute alpha & beta for this permutation.
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
 
-	float3 betax_sum = color_sum - alphax_sum;
+        float beta = (bits & 1);
+        if (bits & 2) beta = (1 + beta) / 3.0f;
+        float alpha = 1.0f - beta;
 
-	const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+        alpha2_sum += alpha * alpha * weights[i];
+        beta2_sum += beta * beta * weights[i];
+        alphabeta_sum += alpha * beta * weights[i];
+        alphax_sum += alpha * colors[i];
+    }
 
-	float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
-	float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
-	
-	// Round a, b to the closest 5-6-5 color and expand...
-	a = roundAndExpand565(a, start);
-	b = roundAndExpand565(b, end);
+    float3 betax_sum = color_sum - alphax_sum;
+
+    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
 
-	// compute the error
-	float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+    float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
+    float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
 
-	return dot(e, kColorMetricSqr);
+    // Round a, b to the closest 5-6-5 color and expand...
+    a = roundAndExpand565(a, start);
+    b = roundAndExpand565(b, end);
+
+    // compute the error
+    float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+
+    return dot(e, kColorMetricSqr);
 }
 
 /*
 __device__ float evalPermutation3(const float3 * colors, const float * weights, uint permutation, ushort * start, ushort * end)
 {
-	// Compute endpoints using least squares.
-	float alpha2_sum = 0.0f;
-	float beta2_sum = 0.0f;
-	float alphabeta_sum = 0.0f;
-	float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
+    // Compute endpoints using least squares.
+    float alpha2_sum = 0.0f;
+    float beta2_sum = 0.0f;
+    float alphabeta_sum = 0.0f;
+    float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
 
-	// Compute alpha & beta for this permutation.
-	for (int i = 0; i < 16; i++)
-	{
-		const uint bits = permutation >> (2*i);
+    // Compute alpha & beta for this permutation.
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
 
-		float beta = (bits & 1);
-		if (bits & 2) beta = 0.5f;
-		float alpha = 1.0f - beta;
+        float beta = (bits & 1);
+        if (bits & 2) beta = 0.5f;
+        float alpha = 1.0f - beta;
 
-		alpha2_sum += alpha * alpha * weights[i];
-		beta2_sum += beta * beta * weights[i];
-		alphabeta_sum += alpha * beta * weights[i];
-		alphax_sum += alpha * colors[i];
-	}
+        alpha2_sum += alpha * alpha * weights[i];
+        beta2_sum += beta * beta * weights[i];
+        alphabeta_sum += alpha * beta * weights[i];
+        alphax_sum += alpha * colors[i];
+    }
 
-	float3 betax_sum = color_sum - alphax_sum;
+    float3 betax_sum = color_sum - alphax_sum;
 
-	const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
 
-	float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
-	float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
+    float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
+    float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
 
-	// Round a, b to the closest 5-6-5 color and expand...
-	a = roundAndExpand565(a, start);
-	b = roundAndExpand565(b, end);
+    // Round a, b to the closest 5-6-5 color and expand...
+    a = roundAndExpand565(a, start);
+    b = roundAndExpand565(b, end);
 
-	// compute the error
-	float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+    // compute the error
+    float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
 
-	return dot(e, kColorMetricSqr);
+    return dot(e, kColorMetricSqr);
 }
 */
 
 __device__ float evalPermutation4(const float2 * colors, float2 color_sum, uint permutation, ushort * start, ushort * end)
 {
-	// Compute endpoints using least squares.
-	float2 alphax_sum = make_float2(0.0f, 0.0f);
-	uint akku = 0;
+    // Compute endpoints using least squares.
+    float2 alphax_sum = make_float2(0.0f, 0.0f);
+    uint akku = 0;
 
-	// Compute alpha & beta for this permutation.
-	#pragma unroll
-	for (int i = 0; i < 16; i++)
-	{
-		const uint bits = permutation >> (2*i);
-		
-		alphax_sum += alphaTable4[bits & 3] * colors[i];
-		akku += prods4[bits & 3];
-	}
+    // Compute alpha & beta for this permutation.
+    #pragma unroll
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
 
-	float alpha2_sum = float(akku >> 16);
-	float beta2_sum = float((akku >> 8) & 0xff);
-	float alphabeta_sum = float(akku & 0xff);
-	float2 betax_sum = 9.0f * color_sum - alphax_sum;
+        alphax_sum += alphaTable4[bits & 3] * colors[i];
+        akku += prods4[bits & 3];
+    }
 
-	const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+    float alpha2_sum = float(akku >> 16);
+    float beta2_sum = float((akku >> 8) & 0xff);
+    float alphabeta_sum = float(akku & 0xff);
+    float2 betax_sum = 9.0f * color_sum - alphax_sum;
 
-	float2 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
-	float2 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
-	
-	// Round a, b to the closest 5-6 color and expand...
-	a = roundAndExpand56(a, start);
-	b = roundAndExpand56(b, end);
+    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
 
-	// compute the error
-	float2 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+    float2 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
+    float2 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
 
-	return (1.0f / 9.0f) * (e.x + e.y);
+    // Round a, b to the closest 5-6 color and expand...
+    a = roundAndExpand56(a, start);
+    b = roundAndExpand56(b, end);
+
+    // compute the error
+    float2 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+
+    return (1.0f / 9.0f) * (e.x + e.y);
 }
 
 __device__ float evalPermutation3(const float2 * colors, float2 color_sum, uint permutation, ushort * start, ushort * end)
 {
-	// Compute endpoints using least squares.
-	float2 alphax_sum = make_float2(0.0f, 0.0f);
-	uint akku = 0;
+    // Compute endpoints using least squares.
+    float2 alphax_sum = make_float2(0.0f, 0.0f);
+    uint akku = 0;
 
-	// Compute alpha & beta for this permutation.
-	#pragma unroll
-	for (int i = 0; i < 16; i++)
-	{
-		const uint bits = permutation >> (2*i);
+    // Compute alpha & beta for this permutation.
+    #pragma unroll
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
 
-		alphax_sum += alphaTable3[bits & 3] * colors[i];
-		akku += prods3[bits & 3];
-	}
+        alphax_sum += alphaTable3[bits & 3] * colors[i];
+        akku += prods3[bits & 3];
+    }
 
-	float alpha2_sum = float(akku >> 16);
-	float beta2_sum = float((akku >> 8) & 0xff);
-	float alphabeta_sum = float(akku & 0xff);
-	float2 betax_sum = 4.0f * color_sum - alphax_sum;
+    float alpha2_sum = float(akku >> 16);
+    float beta2_sum = float((akku >> 8) & 0xff);
+    float alphabeta_sum = float(akku & 0xff);
+    float2 betax_sum = 4.0f * color_sum - alphax_sum;
 
-	const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
 
-	float2 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
-	float2 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
-	
-	// Round a, b to the closest 5-6 color and expand...
-	a = roundAndExpand56(a, start);
-	b = roundAndExpand56(b, end);
+    float2 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
+    float2 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
+
+    // Round a, b to the closest 5-6 color and expand...
+    a = roundAndExpand56(a, start);
+    b = roundAndExpand56(b, end);
 
-	// compute the error
-	float2 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+    // compute the error
+    float2 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
 
-	return (1.0f / 4.0f) * (e.x + e.y);
+    return (1.0f / 4.0f) * (e.x + e.y);
 }
 
 __device__ float evalPermutationCTX(const float2 * colors, float2 color_sum, uint permutation, ushort * start, ushort * end)
 {
-	// Compute endpoints using least squares.
-	float2 alphax_sum = make_float2(0.0f, 0.0f);
-	uint akku = 0;
+    // Compute endpoints using least squares.
+    float2 alphax_sum = make_float2(0.0f, 0.0f);
+    uint akku = 0;
 
-	// Compute alpha & beta for this permutation.
-	#pragma unroll
-	for (int i = 0; i < 16; i++)
-	{
-		const uint bits = permutation >> (2*i);
-		
-		alphax_sum += alphaTable4[bits & 3] * colors[i];
-		akku += prods4[bits & 3];
-	}
+    // Compute alpha & beta for this permutation.
+    #pragma unroll
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
+
+        alphax_sum += alphaTable4[bits & 3] * colors[i];
+        akku += prods4[bits & 3];
+    }
 
-	float alpha2_sum = float(akku >> 16);
-	float beta2_sum = float((akku >> 8) & 0xff);
-	float alphabeta_sum = float(akku & 0xff);
-	float2 betax_sum = 9.0f * color_sum - alphax_sum;
+    float alpha2_sum = float(akku >> 16);
+    float beta2_sum = float((akku >> 8) & 0xff);
+    float alphabeta_sum = float(akku & 0xff);
+    float2 betax_sum = 9.0f * color_sum - alphax_sum;
 
-	const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
 
-	float2 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
-	float2 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
-	
-	// Round a, b to the closest 8-8 color and expand...
-	a = roundAndExpand88(a, start);
-	b = roundAndExpand88(b, end);
+    float2 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
+    float2 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
+
+    // Round a, b to the closest 8-8 color and expand...
+    a = roundAndExpand88(a, start);
+    b = roundAndExpand88(b, end);
 
-	// compute the error
-	float2 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+    // compute the error
+    float2 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
 
-	return (1.0f / 9.0f) * (e.x + e.y);
+    return (1.0f / 9.0f) * (e.x + e.y);
 }
 
 
@@ -1093,63 +896,63 @@ __device__ float evalPermutationCTX(const float2 * colors, float2 color_sum, uin
 ////////////////////////////////////////////////////////////////////////////////
 __device__ void evalAllPermutations(const float3 * colors, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
 {
-	const int idx = threadIdx.x;
-	
-	float bestError = FLT_MAX;
-	
-	__shared__ uint s_permutations[160];
+    const int idx = threadIdx.x;
 
-	for(int i = 0; i < 16; i++)
-	{
-		int pidx = idx + NUM_THREADS * i;
-		if (pidx >= 992) break;
-		
-		ushort start, end;
-		uint permutation = permutations[pidx];
-		if (pidx < 160) s_permutations[pidx] = permutation;
-				
-		float error = evalPermutation4(colors, colorSum, permutation, &start, &end);
-		
-		if (error < bestError)
-		{
-			bestError = error;
-			bestPermutation = permutation;
-			bestStart = start;
-			bestEnd = end;
-		}
-	}
+    float bestError = FLT_MAX;
 
-	if (bestStart < bestEnd)
-	{
-		swap(bestEnd, bestStart);
-		bestPermutation ^= 0x55555555;	// Flip indices.
-	}
+    __shared__ uint s_permutations[160];
 
-	for(int i = 0; i < 3; i++)
-	{
-		int pidx = idx + NUM_THREADS * i;
-		if (pidx >= 160) break;
-		
-		ushort start, end;
-		uint permutation = s_permutations[pidx];
-		float error = evalPermutation3(colors, colorSum, permutation, &start, &end);
-		
-		if (error < bestError)
-		{
-			bestError = error;
-			bestPermutation = permutation;
-			bestStart = start;
-			bestEnd = end;
-			
-			if (bestStart > bestEnd)
-			{
-				swap(bestEnd, bestStart);
-				bestPermutation ^= (~bestPermutation >> 1) & 0x55555555;	// Flip indices.
-			}
-		}
-	}
+    for(int i = 0; i < 16; i++)
+    {
+        int pidx = idx + NUM_THREADS * i;
+        if (pidx >= 992) break;
 
-	errors[idx] = bestError;
+        ushort start, end;
+        uint permutation = permutations[pidx];
+        if (pidx < 160) s_permutations[pidx] = permutation;
+
+        float error = evalPermutation4(colors, colorSum, permutation, &start, &end);
+
+        if (error < bestError)
+        {
+            bestError = error;
+            bestPermutation = permutation;
+            bestStart = start;
+            bestEnd = end;
+        }
+    }
+
+    if (bestStart < bestEnd)
+    {
+        swap(bestEnd, bestStart);
+        bestPermutation ^= 0x55555555;	// Flip indices.
+    }
+
+    for(int i = 0; i < 3; i++)
+    {
+        int pidx = idx + NUM_THREADS * i;
+        if (pidx >= 160) break;
+
+        ushort start, end;
+        uint permutation = s_permutations[pidx];
+        float error = evalPermutation3(colors, colorSum, permutation, &start, &end);
+
+        if (error < bestError)
+        {
+            bestError = error;
+            bestPermutation = permutation;
+            bestStart = start;
+            bestEnd = end;
+
+            if (bestStart > bestEnd)
+            {
+                swap(bestEnd, bestStart);
+                bestPermutation ^= (~bestPermutation >> 1) & 0x55555555;	// Flip indices.
+            }
+        }
+    }
+
+    errors[idx] = bestError;
 }
 
 /*
@@ -1217,165 +1020,165 @@ __device__ void evalAllPermutations(const float3 * colors, const float * weights
 
 __device__ void evalAllPermutations(const float2 * colors, float2 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
 {
-	const int idx = threadIdx.x;
-	
-	float bestError = FLT_MAX;
-	
-	__shared__ uint s_permutations[160];
+    const int idx = threadIdx.x;
 
-	for(int i = 0; i < 16; i++)
-	{
-		int pidx = idx + NUM_THREADS * i;
-		if (pidx >= 992) break;
-		
-		ushort start, end;
-		uint permutation = permutations[pidx];
-		if (pidx < 160) s_permutations[pidx] = permutation;
-				
-		float error = evalPermutation4(colors, colorSum, permutation, &start, &end);
-		
-		if (error < bestError)
-		{
-			bestError = error;
-			bestPermutation = permutation;
-			bestStart = start;
-			bestEnd = end;
-		}
-	}
+    float bestError = FLT_MAX;
 
-	if (bestStart < bestEnd)
-	{
-		swap(bestEnd, bestStart);
-		bestPermutation ^= 0x55555555;	// Flip indices.
-	}
+    __shared__ uint s_permutations[160];
 
-	for(int i = 0; i < 3; i++)
-	{
-		int pidx = idx + NUM_THREADS * i;
-		if (pidx >= 160) break;
-		
-		ushort start, end;
-		uint permutation = s_permutations[pidx];
-		float error = evalPermutation3(colors, colorSum, permutation, &start, &end);
-		
-		if (error < bestError)
-		{
-			bestError = error;
-			bestPermutation = permutation;
-			bestStart = start;
-			bestEnd = end;
-			
-			if (bestStart > bestEnd)
-			{
-				swap(bestEnd, bestStart);
-				bestPermutation ^= (~bestPermutation >> 1) & 0x55555555;	// Flip indices.
-			}
-		}
-	}
+    for(int i = 0; i < 16; i++)
+    {
+        int pidx = idx + NUM_THREADS * i;
+        if (pidx >= 992) break;
 
-	errors[idx] = bestError;
+        ushort start, end;
+        uint permutation = permutations[pidx];
+        if (pidx < 160) s_permutations[pidx] = permutation;
+
+        float error = evalPermutation4(colors, colorSum, permutation, &start, &end);
+
+        if (error < bestError)
+        {
+            bestError = error;
+            bestPermutation = permutation;
+            bestStart = start;
+            bestEnd = end;
+        }
+    }
+
+    if (bestStart < bestEnd)
+    {
+        swap(bestEnd, bestStart);
+        bestPermutation ^= 0x55555555;	// Flip indices.
+    }
+
+    for(int i = 0; i < 3; i++)
+    {
+        int pidx = idx + NUM_THREADS * i;
+        if (pidx >= 160) break;
+
+        ushort start, end;
+        uint permutation = s_permutations[pidx];
+        float error = evalPermutation3(colors, colorSum, permutation, &start, &end);
+
+        if (error < bestError)
+        {
+            bestError = error;
+            bestPermutation = permutation;
+            bestStart = start;
+            bestEnd = end;
+
+            if (bestStart > bestEnd)
+            {
+                swap(bestEnd, bestStart);
+                bestPermutation ^= (~bestPermutation >> 1) & 0x55555555;	// Flip indices.
+            }
+        }
+    }
+
+    errors[idx] = bestError;
 }
 
 __device__ void evalLevel4Permutations(const float3 * colors, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
 {
-	const int idx = threadIdx.x;
-	
-	float bestError = FLT_MAX;
-	
-	for(int i = 0; i < 16; i++)
-	{
-		int pidx = idx + NUM_THREADS * i;
-		if (pidx >= 992) break;
-		
-		ushort start, end;
-		uint permutation = permutations[pidx];
+    const int idx = threadIdx.x;
 
-		float error = evalPermutation4(colors, colorSum, permutation, &start, &end);
-		
-		if (error < bestError)
-		{
-			bestError = error;
-			bestPermutation = permutation;
-			bestStart = start;
-			bestEnd = end;
-		}
-	}
+    float bestError = FLT_MAX;
 
-	if (bestStart < bestEnd)
-	{
-		swap(bestEnd, bestStart);
-		bestPermutation ^= 0x55555555;	// Flip indices.
-	}
+    for(int i = 0; i < 16; i++)
+    {
+        int pidx = idx + NUM_THREADS * i;
+        if (pidx >= 992) break;
 
-	errors[idx] = bestError;
+        ushort start, end;
+        uint permutation = permutations[pidx];
+
+        float error = evalPermutation4(colors, colorSum, permutation, &start, &end);
+
+        if (error < bestError)
+        {
+            bestError = error;
+            bestPermutation = permutation;
+            bestStart = start;
+            bestEnd = end;
+        }
+    }
+
+    if (bestStart < bestEnd)
+    {
+        swap(bestEnd, bestStart);
+        bestPermutation ^= 0x55555555;	// Flip indices.
+    }
+
+    errors[idx] = bestError;
 }
 
 __device__ void evalLevel4Permutations(const float3 * colors, const float * weights, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
 {
-	const int idx = threadIdx.x;
-	
-	float bestError = FLT_MAX;
-	
-	for(int i = 0; i < 16; i++)
-	{
-		int pidx = idx + NUM_THREADS * i;
-		if (pidx >= 992) break;
-		
-		ushort start, end;
-		uint permutation = permutations[pidx];
+    const int idx = threadIdx.x;
 
-		float error = evalPermutation4(colors, weights, colorSum, permutation, &start, &end);
-		
-		if (error < bestError)
-		{
-			bestError = error;
-			bestPermutation = permutation;
-			bestStart = start;
-			bestEnd = end;
-		}
-	}
+    float bestError = FLT_MAX;
 
-	if (bestStart < bestEnd)
-	{
-		swap(bestEnd, bestStart);
-		bestPermutation ^= 0x55555555;	// Flip indices.
-	}
+    for(int i = 0; i < 16; i++)
+    {
+        int pidx = idx + NUM_THREADS * i;
+        if (pidx >= 992) break;
 
-	errors[idx] = bestError;
-}
+        ushort start, end;
+        uint permutation = permutations[pidx];
 
-__device__ void evalAllPermutationsCTX(const float2 * colors, float2 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
-{
-	const int idx = threadIdx.x;
-	
-	float bestError = FLT_MAX;
-	
-	for(int i = 0; i < 16; i++)
-	{
-		int pidx = idx + NUM_THREADS * i;
-		if (pidx >= 704) break;
-		
-		ushort start, end;
-		uint permutation = permutations[pidx];
+        float error = evalPermutation4(colors, weights, colorSum, permutation, &start, &end);
+
+        if (error < bestError)
+        {
+            bestError = error;
+            bestPermutation = permutation;
+            bestStart = start;
+            bestEnd = end;
+        }
+    }
+
+    if (bestStart < bestEnd)
+    {
+        swap(bestEnd, bestStart);
+        bestPermutation ^= 0x55555555;	// Flip indices.
+    }
+
+    errors[idx] = bestError;
+}
 
-		float error = evalPermutationCTX(colors, colorSum, permutation, &start, &end);
-		
-		if (error < bestError)
-		{
-			bestError = error;
-			bestPermutation = permutation;
-			bestStart = start;
-			bestEnd = end;
-		}
-	}
+__device__ void evalAllPermutationsCTX(const float2 * colors, float2 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
+{
+    const int idx = threadIdx.x;
 
-	if (bestStart < bestEnd)
-	{
-		swap(bestEnd, bestStart);
-		bestPermutation ^= 0x55555555;	// Flip indices.
-	}
+    float bestError = FLT_MAX;
 
-	errors[idx] = bestError;
+    for(int i = 0; i < 16; i++)
+    {
+        int pidx = idx + NUM_THREADS * i;
+        if (pidx >= 704) break;
+
+        ushort start, end;
+        uint permutation = permutations[pidx];
+
+        float error = evalPermutationCTX(colors, colorSum, permutation, &start, &end);
+
+        if (error < bestError)
+        {
+            bestError = error;
+            bestPermutation = permutation;
+            bestStart = start;
+            bestEnd = end;
+        }
+    }
+
+    if (bestStart < bestEnd)
+    {
+        swap(bestEnd, bestStart);
+        bestPermutation ^= 0x55555555;	// Flip indices.
+    }
+
+    errors[idx] = bestError;
 }
 
 
@@ -1384,79 +1187,61 @@ __device__ void evalAllPermutationsCTX(const float2 * colors, float2 colorSum, c
 ////////////////////////////////////////////////////////////////////////////////
 __device__ int findMinError(float * errors)
 {
-	const int idx = threadIdx.x;
+    const int idx = threadIdx.x;
 
-	__shared__ int indices[NUM_THREADS];
-	indices[idx] = idx;
+    __shared__ int indices[NUM_THREADS];
+    indices[idx] = idx;
 
-#if __DEVICE_EMULATION__
-	for(int d = NUM_THREADS/2; d > 0; d >>= 1)
-	{
-		__syncthreads();
-		
-		if (idx < d)
-		{
-			float err0 = errors[idx];
-			float err1 = errors[idx + d];
-			
-			if (err1 < err0) {
-				errors[idx] = err1;
-				indices[idx] = indices[idx + d];
-			}
-		}
-	}
-#else
-	for(int d = NUM_THREADS/2; d > 32; d >>= 1)
-	{
-		__syncthreads();
-		
-		if (idx < d)
-		{
-			float err0 = errors[idx];
-			float err1 = errors[idx + d];
-			
-			if (err1 < err0) {
-				errors[idx] = err1;
-				indices[idx] = indices[idx + d];
-			}
-		}
-	}
+    for(int d = NUM_THREADS/2; d > 32; d >>= 1)
+    {
+        __syncthreads();
 
-	__syncthreads();
+        if (idx < d)
+        {
+            float err0 = errors[idx];
+            float err1 = errors[idx + d];
 
-	// unroll last 6 iterations
-	if (idx < 32)
-	{
-		if (errors[idx + 32] < errors[idx]) {
-			errors[idx] = errors[idx + 32];
-			indices[idx] = indices[idx + 32];
-		}
-		if (errors[idx + 16] < errors[idx]) {
-			errors[idx] = errors[idx + 16];
-			indices[idx] = indices[idx + 16];
-		}
-		if (errors[idx + 8] < errors[idx]) {
-			errors[idx] = errors[idx + 8];
-			indices[idx] = indices[idx + 8];
-		}
-		if (errors[idx + 4] < errors[idx]) {
-			errors[idx] = errors[idx + 4];
-			indices[idx] = indices[idx + 4];
-		}
-		if (errors[idx + 2] < errors[idx]) {
-			errors[idx] = errors[idx + 2];
-			indices[idx] = indices[idx + 2];
-		}
-		if (errors[idx + 1] < errors[idx]) {
-			errors[idx] = errors[idx + 1];
-			indices[idx] = indices[idx + 1];
-		}
-	}
-#endif
+            if (err1 < err0) {
+                errors[idx] = err1;
+                indices[idx] = indices[idx + d];
+            }
+        }
+    }
 
-	__syncthreads();
+    __syncthreads();
+
+    // unroll last 6 iterations
+    if (idx < 32)
+    {
+        if (errors[idx + 32] < errors[idx]) {
+            errors[idx] = errors[idx + 32];
+            indices[idx] = indices[idx + 32];
+        }
+        if (errors[idx + 16] < errors[idx]) {
+            errors[idx] = errors[idx + 16];
+            indices[idx] = indices[idx + 16];
+        }
+        if (errors[idx + 8] < errors[idx]) {
+            errors[idx] = errors[idx + 8];
+            indices[idx] = indices[idx + 8];
+        }
+        if (errors[idx + 4] < errors[idx]) {
+            errors[idx] = errors[idx + 4];
+            indices[idx] = indices[idx + 4];
+        }
+        if (errors[idx + 2] < errors[idx]) {
+            errors[idx] = errors[idx + 2];
+            indices[idx] = indices[idx + 2];
+        }
+        if (errors[idx + 1] < errors[idx]) {
+            errors[idx] = errors[idx + 1];
+            indices[idx] = indices[idx + 1];
+        }
+    }
+
+    __syncthreads();
 
-	return indices[0];
+    return indices[0];
 }
 
 
@@ -1465,32 +1250,32 @@ __device__ int findMinError(float * errors)
 ////////////////////////////////////////////////////////////////////////////////
 __device__ void saveBlockDXT1(ushort start, ushort end, uint permutation, int xrefs[16], uint2 * result)
 {
-	const int bid = blockIdx.x;
+    const int bid = blockIdx.x;
 
-	if (start == end)
-	{
-		permutation = 0;
-	}
-	
-	// Reorder permutation.
-	uint indices = 0;
-	for(int i = 0; i < 16; i++)
-	{
-		int ref = xrefs[i];
-		indices |= ((permutation >> (2 * ref)) & 3) << (2 * i);
-	}
-	
-	// Write endpoints.
-	result[bid].x = (end << 16) | start;
-	
-	// Write palette indices.
-	result[bid].y = indices;
+    if (start == end)
+    {
+        permutation = 0;
+    }
+
+    // Reorder permutation.
+    uint indices = 0;
+    for(int i = 0; i < 16; i++)
+    {
+        int ref = xrefs[i];
+        indices |= ((permutation >> (2 * ref)) & 3) << (2 * i);
+    }
+
+    // Write endpoints.
+    result[bid].x = (end << 16) | start;
+
+    // Write palette indices.
+    result[bid].y = indices;
 }
 
 __device__ void saveBlockDXT1_Parallel(uint endpoints, float3 colors[16], int xrefs[16], uint * result)
 {
     const int tid = threadIdx.x;
-	const int bid = blockIdx.x;
+    const int bid = blockIdx.x;
 
     if (tid < 16)
     {
@@ -1499,9 +1284,9 @@ __device__ void saveBlockDXT1_Parallel(uint endpoints, float3 colors[16], int xr
         ushort endpoint0 = endpoints & 0xFFFF;
         ushort endpoint1 = endpoints >> 16;
 
-	    int3 palette[4];
-	    palette[0] = color16ToInt3(endpoint0);
-	    palette[1] = color16ToInt3(endpoint1);
+        int3 palette[4];
+        palette[0] = color16ToInt3(endpoint0);
+        palette[1] = color16ToInt3(endpoint1);
 
         int d0 = colorDistance(palette[0], color);
         int d1 = colorDistance(palette[1], color);
@@ -1509,36 +1294,36 @@ __device__ void saveBlockDXT1_Parallel(uint endpoints, float3 colors[16], int xr
         uint index;
         if (endpoint0 > endpoint1) 
         {
-	        palette[2].x = (2 * palette[0].x + palette[1].x) / 3;
+            palette[2].x = (2 * palette[0].x + palette[1].x) / 3;
             palette[2].y = (2 * palette[0].y + palette[1].y) / 3;
             palette[2].z = (2 * palette[0].z + palette[1].z) / 3;
 
-	        palette[3].x = (2 * palette[1].x + palette[0].x) / 3;
+            palette[3].x = (2 * palette[1].x + palette[0].x) / 3;
             palette[3].y = (2 * palette[1].y + palette[0].y) / 3;
             palette[3].z = (2 * palette[1].z + palette[0].z) / 3;
             
-		    int d2 = colorDistance(palette[2], color);
-		    int d3 = colorDistance(palette[3], color);
+            int d2 = colorDistance(palette[2], color);
+            int d3 = colorDistance(palette[3], color);
 
             // Compute the index that best fit color.
-		    uint b0 = d0 > d3;
-		    uint b1 = d1 > d2;
-		    uint b2 = d0 > d2;
-		    uint b3 = d1 > d3;
-		    uint b4 = d2 > d3;
-    		
-		    uint x0 = b1 & b2;
-		    uint x1 = b0 & b3;
-		    uint x2 = b0 & b4;
-    		
+            uint b0 = d0 > d3;
+            uint b1 = d1 > d2;
+            uint b2 = d0 > d2;
+            uint b3 = d1 > d3;
+            uint b4 = d2 > d3;
+
+            uint x0 = b1 & b2;
+            uint x1 = b0 & b3;
+            uint x2 = b0 & b4;
+
             index = (x2 | ((x0 | x1) << 1));
         }
         else {
-	        palette[2].x = (palette[0].x + palette[1].x) / 2;
+            palette[2].x = (palette[0].x + palette[1].x) / 2;
             palette[2].y = (palette[0].y + palette[1].y) / 2;
             palette[2].z = (palette[0].z + palette[1].z) / 2;
 
-       		int d2 = colorDistance(palette[2], color);
+            int d2 = colorDistance(palette[2], color);
 
             index = 0;
             if (d1 < d0 && d1 < d2) index = 1;
@@ -1562,11 +1347,11 @@ __device__ void saveBlockDXT1_Parallel(uint endpoints, float3 colors[16], int xr
 __device__ void saveBlockDXT1_Parallel(uint endpoints, uint permutation, int xrefs[16], uint * result)
 {
     const int tid = threadIdx.x;    
-	const int bid = blockIdx.x;
+    const int bid = blockIdx.x;
 
     if (tid < 16)
     {
-	    // Reorder permutation.
+        // Reorder permutation.
         uint index = ((permutation >> (2 * xrefs[tid])) & 3) << (2 * tid);
         __shared__ uint indices[16];
 
@@ -1585,65 +1370,65 @@ __device__ void saveBlockDXT1_Parallel(uint endpoints, uint permutation, int xre
 
 __device__ void saveBlockCTX1(ushort start, ushort end, uint permutation, int xrefs[16], uint2 * result)
 {
-	saveBlockDXT1(start, end, permutation, xrefs, result);
+    saveBlockDXT1(start, end, permutation, xrefs, result);
 }
 
 __device__ void saveSingleColorBlockDXT1(float3 color, uint2 * result)
 {
-	const int bid = blockIdx.x;
+    const int bid = blockIdx.x;
 
-	int r = color.x * 255;
-	int g = color.y * 255;
-	int b = color.z * 255;
+    int r = color.x * 255;
+    int g = color.y * 255;
+    int b = color.z * 255;
 
-	ushort color0 = (OMatch5[r][0] << 11) | (OMatch6[g][0] << 5) | OMatch5[b][0];
-	ushort color1 = (OMatch5[r][1] << 11) | (OMatch6[g][1] << 5) | OMatch5[b][1];
+    ushort color0 = (OMatch5[r][0] << 11) | (OMatch6[g][0] << 5) | OMatch5[b][0];
+    ushort color1 = (OMatch5[r][1] << 11) | (OMatch6[g][1] << 5) | OMatch5[b][1];
 
-	if (color0 < color1)
-	{
-		result[bid].x = (color0 << 16) | color1;
-		result[bid].y = 0xffffffff;
-	}
-	else
-	{
-		result[bid].x = (color1 << 16) | color0;
-		result[bid].y = 0xaaaaaaaa;
-	}
+    if (color0 < color1)
+    {
+        result[bid].x = (color0 << 16) | color1;
+        result[bid].y = 0xffffffff;
+    }
+    else
+    {
+        result[bid].x = (color1 << 16) | color0;
+        result[bid].y = 0xaaaaaaaa;
+    }
 }
 
 __device__ void saveSingleColorBlockDXT1(float2 color, uint2 * result)
 {
-	const int bid = blockIdx.x;
+    const int bid = blockIdx.x;
 
-	int r = color.x * 255;
-	int g = color.y * 255;
+    int r = color.x * 255;
+    int g = color.y * 255;
 
-	ushort color0 = (OMatch5[r][0] << 11) | (OMatch6[g][0] << 5);
-	ushort color1 = (OMatch5[r][1] << 11) | (OMatch6[g][1] << 5);
+    ushort color0 = (OMatch5[r][0] << 11) | (OMatch6[g][0] << 5);
+    ushort color1 = (OMatch5[r][1] << 11) | (OMatch6[g][1] << 5);
 
-	if (color0 < color1)
-	{
-		result[bid].x = (color0 << 16) | color1;
-		result[bid].y = 0xffffffff;
-	}
-	else
-	{
-		result[bid].x = (color1 << 16) | color0;
-		result[bid].y = 0xaaaaaaaa;
-	}
+    if (color0 < color1)
+    {
+        result[bid].x = (color0 << 16) | color1;
+        result[bid].y = 0xffffffff;
+    }
+    else
+    {
+        result[bid].x = (color1 << 16) | color0;
+        result[bid].y = 0xaaaaaaaa;
+    }
 }
 
 __device__ void saveSingleColorBlockCTX1(float2 color, uint2 * result)
 {
-	const int bid = blockIdx.x;
+    const int bid = blockIdx.x;
 
-	int r = color.x * 255;
-	int g = color.y * 255;
+    int r = color.x * 255;
+    int g = color.y * 255;
 
-	ushort color0 = (r << 8) | (g);
-	
-	result[bid].x = (color0 << 16) | color0;
-	result[bid].y = 0x00000000;
+    ushort color0 = (r << 8) | (g);
+
+    result[bid].x = (color0 << 16) | color0;
+    result[bid].y = 0x00000000;
 }
 
 
@@ -1651,28 +1436,28 @@ __device__ void saveSingleColorBlockCTX1(float2 color, uint2 * result)
 // Compress color block
 ////////////////////////////////////////////////////////////////////////////////
 
-__global__ void compressDXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result)
+__global__ void compressDXT1(uint firstBlock, uint blockWidth, const uint * permutations, uint2 * result)
 {
-	__shared__ float3 colors[16];
-	__shared__ float3 sums[16];
-	__shared__ int xrefs[16];
-	__shared__ int sameColor;
-	
-	loadColorBlockTex(firstBlock, w, colors, sums, xrefs, &sameColor);
+    __shared__ float3 colors[16];
+    __shared__ float3 sums[16];
+    __shared__ int xrefs[16];
+    __shared__ int sameColor;
 
-	__syncthreads();
+    loadColorBlockTex(firstBlock, blockWidth, colors, sums, xrefs, &sameColor);
 
-	if (sameColor)
-	{
-		if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
-		return;
-	}
+    __syncthreads();
 
-	ushort bestStart, bestEnd;
-	uint bestPermutation;
+    if (sameColor)
+    {
+        if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
+        return;
+    }
+
+    ushort bestStart, bestEnd;
+    uint bestPermutation;
 
     __shared__ float errors[NUM_THREADS];
-   	evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
+    evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
     
     // Use a parallel reduction to find minimum error.
     const int minIdx = findMinError(errors);
@@ -1680,12 +1465,12 @@ __global__ void compressDXT1(uint firstBlock, uint w, const uint * permutations,
     __shared__ uint s_bestEndPoints;
     __shared__ uint s_bestPermutation;
 
-	// Only write the result of the winner thread.
-	if (threadIdx.x == minIdx)
-	{
-		s_bestEndPoints = (bestEnd << 16) | bestStart;
+    // Only write the result of the winner thread.
+    if (threadIdx.x == minIdx)
+    {
+        s_bestEndPoints = (bestEnd << 16) | bestStart;
         s_bestPermutation = (bestStart != bestEnd) ? bestPermutation : 0;
-	}
+    }
 
     __syncthreads();
 
@@ -1694,142 +1479,142 @@ __global__ void compressDXT1(uint firstBlock, uint w, const uint * permutations,
 }
 
 
-__global__ void compressLevel4DXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result)
+__global__ void compressLevel4DXT1(uint firstBlock, uint blockWidth, const uint * permutations, uint2 * result)
 {
-	__shared__ float3 colors[16];
-	__shared__ float3 sums[16];
-	__shared__ int xrefs[16];
-	__shared__ int sameColor;
-	
-	loadColorBlockTex(firstBlock, w, colors, sums, xrefs, &sameColor);
+    __shared__ float3 colors[16];
+    __shared__ float3 sums[16];
+    __shared__ int xrefs[16];
+    __shared__ int sameColor;
 
-	__syncthreads();
+    loadColorBlockTex(firstBlock, blockWidth, colors, sums, xrefs, &sameColor);
 
-	if (sameColor)
-	{
-		if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
-		return;
-	}
+    __syncthreads();
 
-	ushort bestStart, bestEnd;
-	uint bestPermutation;
+    if (sameColor)
+    {
+        if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
+        return;
+    }
 
-	__shared__ float errors[NUM_THREADS];
+    ushort bestStart, bestEnd;
+    uint bestPermutation;
 
-	evalLevel4Permutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
-	
-	// Use a parallel reduction to find minimum error.
-	const int minIdx = findMinError(errors);
-	
-	// Only write the result of the winner thread.
-	if (threadIdx.x == minIdx)
-	{
-		saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result);
-	}
+    __shared__ float errors[NUM_THREADS];
+
+    evalLevel4Permutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
+
+    // Use a parallel reduction to find minimum error.
+    const int minIdx = findMinError(errors);
+
+    // Only write the result of the winner thread.
+    if (threadIdx.x == minIdx)
+    {
+        saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result);
+    }
 }
 
-__global__ void compressWeightedDXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result)
+__global__ void compressWeightedDXT1(uint firstBlock, uint blockWidth, const uint * permutations, uint2 * result)
 {
-	__shared__ float3 colors[16];
-	__shared__ float3 sums[16];
-	__shared__ float weights[16];
-	__shared__ int xrefs[16];
-	__shared__ int sameColor;
-	
-	loadColorBlockTex(firstBlock, w, colors, sums, weights, xrefs, &sameColor);
-	
-	__syncthreads();
+    __shared__ float3 colors[16];
+    __shared__ float3 sums[16];
+    __shared__ float weights[16];
+    __shared__ int xrefs[16];
+    __shared__ int sameColor;
 
-	if (sameColor)
-	{
-		if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
-		return;
-	}
+    loadColorBlockTex(firstBlock, blockWidth, colors, sums, weights, xrefs, &sameColor);
 
-	ushort bestStart, bestEnd;
-	uint bestPermutation;
+    __syncthreads();
 
-	__shared__ float errors[NUM_THREADS];
-	
-	evalLevel4Permutations(colors, weights, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
-	
-	// Use a parallel reduction to find minimum error.
-	int minIdx = findMinError(errors);
-	
-	// Only write the result of the winner thread.
-	if (threadIdx.x == minIdx)
-	{
-		saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result);
-	}
+    if (sameColor)
+    {
+        if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
+        return;
+    }
+
+    ushort bestStart, bestEnd;
+    uint bestPermutation;
+
+    __shared__ float errors[NUM_THREADS];
+
+    evalLevel4Permutations(colors, weights, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
+
+    // Use a parallel reduction to find minimum error.
+    int minIdx = findMinError(errors);
+
+    // Only write the result of the winner thread.
+    if (threadIdx.x == minIdx)
+    {
+        saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result);
+    }
 }
 
 
 __global__ void compressNormalDXT1(const uint * permutations, const uint * image, uint2 * result)
 {
-	__shared__ float2 colors[16];
-	__shared__ float2 sums[16];
-	__shared__ int xrefs[16];
-	__shared__ int sameColor;
+    __shared__ float2 colors[16];
+    __shared__ float2 sums[16];
+    __shared__ int xrefs[16];
+    __shared__ int sameColor;
 
-	loadColorBlock(image, colors, sums, xrefs, &sameColor);
-	
-	__syncthreads();
+    loadColorBlock(image, colors, sums, xrefs, &sameColor);
 
-	if (sameColor)
-	{
-		if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
-		return;
-	}
+    __syncthreads();
 
-	ushort bestStart, bestEnd;
-	uint bestPermutation;
+    if (sameColor)
+    {
+        if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
+        return;
+    }
 
-	__shared__ float errors[NUM_THREADS];
+    ushort bestStart, bestEnd;
+    uint bestPermutation;
 
-	evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
-	
-	// Use a parallel reduction to find minimum error.
-	const int minIdx = findMinError(errors);
-	
-	// Only write the result of the winner thread.
-	if (threadIdx.x == minIdx)
-	{
-		saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result);
-	}
+    __shared__ float errors[NUM_THREADS];
+
+    evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
+
+    // Use a parallel reduction to find minimum error.
+    const int minIdx = findMinError(errors);
+
+    // Only write the result of the winner thread.
+    if (threadIdx.x == minIdx)
+    {
+        saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result);
+    }
 }
 
 __global__ void compressCTX1(const uint * permutations, const uint * image, uint2 * result)
 {
-	__shared__ float2 colors[16];
-	__shared__ float2 sums[16];
-	__shared__ int xrefs[16];
-	__shared__ int sameColor;
-	
-	loadColorBlock(image, colors, sums, xrefs, &sameColor);
-	
-	__syncthreads();
+    __shared__ float2 colors[16];
+    __shared__ float2 sums[16];
+    __shared__ int xrefs[16];
+    __shared__ int sameColor;
 
-	if (sameColor)
-	{
-		if (threadIdx.x == 0) saveSingleColorBlockCTX1(colors[0], result);
-		return;
-	}
+    loadColorBlock(image, colors, sums, xrefs, &sameColor);
 
-	ushort bestStart, bestEnd;
-	uint bestPermutation;
+    __syncthreads();
 
-	__shared__ float errors[NUM_THREADS];
+    if (sameColor)
+    {
+        if (threadIdx.x == 0) saveSingleColorBlockCTX1(colors[0], result);
+        return;
+    }
 
-	evalAllPermutationsCTX(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
-	
-	// Use a parallel reduction to find minimum error.
-	const int minIdx = findMinError(errors);
-	
-	// Only write the result of the winner thread.
-	if (threadIdx.x == minIdx)
-	{
-		saveBlockCTX1(bestStart, bestEnd, bestPermutation, xrefs, result);
-	}
+    ushort bestStart, bestEnd;
+    uint bestPermutation;
+
+    __shared__ float errors[NUM_THREADS];
+
+    evalAllPermutationsCTX(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
+
+    // Use a parallel reduction to find minimum error.
+    const int minIdx = findMinError(errors);
+
+    // Only write the result of the winner thread.
+    if (threadIdx.x == minIdx)
+    {
+        saveBlockCTX1(bestStart, bestEnd, bestPermutation, xrefs, result);
+    }
 }
 
 
@@ -2136,26 +1921,26 @@ __global__ void compressDXT5n(uint blockNum, uint2 * d_result)
 
 extern "C" void setupCompressKernel(const float weights[3])
 {
-	// Set constants.
-	cudaMemcpyToSymbol(kColorMetric, weights, sizeof(float) * 3, 0);
+    // Set constants.
+    cudaMemcpyToSymbol(kColorMetric, weights, sizeof(float) * 3, 0);
 
-	float weightsSqr[3];
-	weightsSqr[0] = weights[0] * weights[0];
-	weightsSqr[1] = weights[1] * weights[1];
-	weightsSqr[2] = weights[2] * weights[2];
+    float weightsSqr[3];
+    weightsSqr[0] = weights[0] * weights[0];
+    weightsSqr[1] = weights[1] * weights[1];
+    weightsSqr[2] = weights[2] * weights[2];
 
-	cudaMemcpyToSymbol(kColorMetricSqr, weightsSqr, sizeof(float) * 3, 0);
+    cudaMemcpyToSymbol(kColorMetricSqr, weightsSqr, sizeof(float) * 3, 0);
 }
 
 extern "C" void bindTextureToArray(cudaArray * d_data)
 {
-	// Setup texture
-	tex.normalized = false;
-	tex.filterMode = cudaFilterModePoint;
-	tex.addressMode[0] = cudaAddressModeClamp;
-	tex.addressMode[1] = cudaAddressModeClamp;
-    
-	cudaBindTextureToArray(tex, d_data);
+    // Setup texture
+    tex.normalized = false;
+    tex.filterMode = cudaFilterModePoint;
+    tex.addressMode[0] = cudaAddressModeClamp;
+    tex.addressMode[1] = cudaAddressModeClamp;
+
+    cudaBindTextureToArray(tex, d_data);
 }
 
 
@@ -2165,45 +1950,45 @@ extern "C" void bindTextureToArray(cudaArray * d_data)
 ////////////////////////////////////////////////////////////////////////////////
 
 // DXT1 compressors:
-extern "C" void compressKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+extern "C" void compressKernelDXT1(uint firstBlock, uint blockNum, uint blockWidth, uint * d_result, uint * d_bitmaps)
 {
-	compressDXT1<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+    compressDXT1<<<blockNum, NUM_THREADS>>>(firstBlock, blockWidth, d_bitmaps, (uint2 *)d_result);
 }
 
-extern "C" void compressKernelDXT1_Level4(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+extern "C" void compressKernelDXT1_Level4(uint firstBlock, uint blockNum, uint blockWidth, uint * d_result, uint * d_bitmaps)
 {
-	compressLevel4DXT1<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+    compressLevel4DXT1<<<blockNum, NUM_THREADS>>>(firstBlock, blockWidth, d_bitmaps, (uint2 *)d_result);
 }
 
-extern "C" void compressWeightedKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+extern "C" void compressWeightedKernelDXT1(uint firstBlock, uint blockNum, uint blockWidth, uint * d_result, uint * d_bitmaps)
 {
-	compressWeightedDXT1<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+    compressWeightedDXT1<<<blockNum, NUM_THREADS>>>(firstBlock, blockWidth, d_bitmaps, (uint2 *)d_result);
 }
 
 // @@ DXT1a compressors.
 
 
 // @@ DXT3 compressors:
-extern "C" void compressKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+extern "C" void compressKernelDXT3(uint firstBlock, uint blockNum, uint blockWidth, uint * d_result, uint * d_bitmaps)
 {
-	//compressDXT3<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+    //compressDXT3<<<blockNum, NUM_THREADS>>>(firstBlock, blockWidth, d_bitmaps, (uint2 *)d_result);
 }
 
-extern "C" void compressWeightedKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+extern "C" void compressWeightedKernelDXT3(uint firstBlock, uint blockNum, uint blockWidth, uint * d_result, uint * d_bitmaps)
 {
-	//compressWeightedDXT3<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+    //compressWeightedDXT3<<<blockNum, NUM_THREADS>>>(firstBlock, blockWidth, d_bitmaps, (uint2 *)d_result);
 }
 
 
 // @@ DXT5 compressors.
 extern "C" void compressKernelDXT5(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
 {
-	//compressDXT5<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+    //compressDXT5<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
 }
 
 extern "C" void compressWeightedKernelDXT5(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
 {
-	//compressWeightedDXT5<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+    //compressWeightedDXT5<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
 }
 
 
@@ -2213,17 +1998,17 @@ extern "C" void compressWeightedKernelDXT5(uint firstBlock, uint blockNum, uint
 /*
 extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
 {
-	compressNormalDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
+    compressNormalDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
 }
 
 extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
 {
-	compressCTX1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
+    compressCTX1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
 }
 */
 /*
 extern "C" void compressKernelDXT5n(uint blockNum, cudaArray * d_data, uint * d_result)
 {
-//	compressDXT5n<<<blockNum/128, 128>>>(blockNum, (uint2 *)d_result);
+//    compressDXT5n<<<blockNum/128, 128>>>(blockNum, (uint2 *)d_result);
 }
-*/
\ No newline at end of file
+*/
diff --git a/src/nvtt/cuda/CudaCompressorDXT.cpp b/src/nvtt/cuda/CudaCompressorDXT.cpp
index 251067f..4180d8a 100644
--- a/src/nvtt/cuda/CudaCompressorDXT.cpp
+++ b/src/nvtt/cuda/CudaCompressorDXT.cpp
@@ -34,10 +34,10 @@
 #include "nvtt/QuickCompressDXT.h"
 #include "nvtt/OptimalCompressDXT.h"
 
-
 #include <time.h>
 #include <stdio.h>
 
+
 #if defined HAVE_CUDA
 #include <cuda_runtime_api.h>
 
@@ -141,7 +141,7 @@ void CudaCompressor::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alp
     }
     else
     {
-#pragma message(NV_FILE_LINE "FIXME: Floating point textures not really supported by CUDA compressors.")
+#pragma message(NV_FILE_LINE "FIXME: Floating point textures not really supported by CUDA compressors.") // @@ What's missing???
         cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
         cudaMallocArray(&d_image, &channelDesc, w, h);
 
@@ -156,7 +156,7 @@ void CudaCompressor::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alp
     const uint blockNum = bw * bh;
     const uint compressedSize = blockNum * bs;
 
-    void * h_result = malloc(min(blockNum, MAX_BLOCKS) * bs);
+    void * h_result = ::malloc(min(blockNum, MAX_BLOCKS) * bs);
 
     setup(d_image, compressionOptions);
 
@@ -164,18 +164,18 @@ void CudaCompressor::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alp
     // timer.start();
 
     uint bn = 0;
-    while(bn != blockNum)
+    while (bn != blockNum)
     {
         uint count = min(blockNum - bn, MAX_BLOCKS);
 
-        compressBlocks(bn, count, w, h, alphaMode, compressionOptions, h_result);
+        compressBlocks(bn, count, bw, bh, alphaMode, compressionOptions, h_result);
 
         // Check for errors.
         cudaError_t err = cudaGetLastError();
         if (err != cudaSuccess)
         {
-                //nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
-                outputOptions.error(Error_CudaError);
+            //nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
+            outputOptions.error(Error_CudaError);
         }
 
         // Output result.
@@ -198,10 +198,10 @@ void CudaCompressorDXT1::setup(cudaArray * image, const nvtt::CompressionOptions
     bindTextureToArray(image);
 }
 
-void CudaCompressorDXT1::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+void CudaCompressorDXT1::compressBlocks(uint first, uint count, uint bw, uint bh, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
     // Launch kernel.
-    compressKernelDXT1(first, count, w, m_ctx.result, m_ctx.bitmapTable);
+    compressKernelDXT1(first, count, bw, m_ctx.result, m_ctx.bitmapTable);
 
     // Copy result to host.
     cudaMemcpy(output, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);
@@ -214,10 +214,10 @@ void CudaCompressorDXT3::setup(cudaArray * image, const nvtt::CompressionOptions
     bindTextureToArray(image);
 }
 
-void CudaCompressorDXT3::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+void CudaCompressorDXT3::compressBlocks(uint first, uint count, uint bw, uint bh, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
     // Launch kernel.
-    compressKernelDXT3(first, count, w, m_ctx.result, m_ctx.bitmapTable);
+    compressKernelDXT3(first, count, bw, m_ctx.result, m_ctx.bitmapTable);
 
     // Copy result to host.
     cudaMemcpy(output, m_ctx.result, count * 16, cudaMemcpyDeviceToHost);
@@ -230,10 +230,10 @@ void CudaCompressorDXT5::setup(cudaArray * image, const nvtt::CompressionOptions
     bindTextureToArray(image);
 }
 
-void CudaCompressorDXT5::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+void CudaCompressorDXT5::compressBlocks(uint first, uint count, uint bw, uint bh, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
     /*// Launch kernel.
-    compressKernelDXT5(first, count, w, m_ctx.result, m_ctx.bitmapTable);
+    compressKernelDXT5(first, count, bw, m_ctx.result, m_ctx.bitmapTable);
 
     // Copy result to host.
     cudaMemcpy(output, m_ctx.result, count * 16, cudaMemcpyDeviceToHost);*/
@@ -241,7 +241,7 @@ void CudaCompressorDXT5::compressBlocks(uint first, uint count, uint w, uint h,
     // Launch kernel.
     if (alphaMode == AlphaMode_Transparency)
     {
-    //	compressWeightedKernelDXT1(first, count, w, m_ctx.result, m_ctx.bitmapTable);
+    //	compressWeightedKernelDXT1(first, count, bw, m_ctx.result, m_ctx.bitmapTable);
     }
     else
     {
@@ -251,8 +251,8 @@ void CudaCompressorDXT5::compressBlocks(uint first, uint count, uint w, uint h,
     // Compress alpha in parallel with the GPU.
     for (uint i = 0; i < count; i++)
     {
-            //ColorBlock rgba(blockLinearImage + (first + i) * 16);
-            //OptimalCompress::compressDXT3A(rgba, alphaBlocks + i);
+        //ColorBlock rgba(blockLinearImage + (first + i) * 16);
+        //OptimalCompress::compressDXT3A(rgba, alphaBlocks + i);
     }
 
     // Copy result to host.
diff --git a/src/nvtt/cuda/CudaCompressorDXT.h b/src/nvtt/cuda/CudaCompressorDXT.h
index 1afaff0..78097fa 100644
--- a/src/nvtt/cuda/CudaCompressorDXT.h
+++ b/src/nvtt/cuda/CudaCompressorDXT.h
@@ -31,78 +31,78 @@ struct cudaArray;
 
 namespace nv
 {
-	class CudaContext
-	{
-	public:
-		CudaContext();
-		~CudaContext();
-
-		bool isValid() const;
-
-	public:
-		// Device pointers.
-		uint * bitmapTable;
-		uint * bitmapTableCTX;
-		uint * data;
-		uint * result;
-	};
+    class CudaContext
+    {
+    public:
+        CudaContext();
+        ~CudaContext();
+
+        bool isValid() const;
+
+    public:
+        // Device pointers.
+        uint * bitmapTable;
+        uint * bitmapTableCTX;
+        uint * data;
+        uint * result;
+    };
 
 #if defined HAVE_CUDA
 
-	struct CudaCompressor : public CompressorInterface
-	{
-		CudaCompressor(CudaContext & ctx);
-
-		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, const void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-
-		virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions) = 0;
-		virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
-		virtual uint blockSize() const = 0;
-
-	protected:
-		CudaContext & m_ctx;
-	};
-
-	struct CudaCompressorDXT1 : public CudaCompressor
-	{
-		CudaCompressorDXT1(CudaContext & ctx) : CudaCompressor(ctx) {}
-
-		virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
-		virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 8; };
-	};
-
-	/*struct CudaCompressorDXT1n : public CudaCompressor
-	{
-		virtual void setup(const CompressionOptions::Private & compressionOptions);
-		virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
-		virtual uint blockSize() const { return 8; };
-	};*/
-
-	struct CudaCompressorDXT3 : public CudaCompressor
-	{
-		CudaCompressorDXT3(CudaContext & ctx) : CudaCompressor(ctx) {}
-
-		virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
-		virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 16; };
-	};
-
-	struct CudaCompressorDXT5 : public CudaCompressor
-	{
-		CudaCompressorDXT5(CudaContext & ctx) : CudaCompressor(ctx) {}
-
-		virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
-		virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 16; };
-	};
-
-	/*struct CudaCompressorCXT1 : public CudaCompressor
-	{
-		virtual void setup(const CompressionOptions::Private & compressionOptions);
-		virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
-		virtual uint blockSize() const { return 8; };
-	};*/
+    struct CudaCompressor : public CompressorInterface
+    {
+        CudaCompressor(CudaContext & ctx);
+
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, const void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+
+        virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions) = 0;
+        virtual void compressBlocks(uint first, uint count, uint bw, uint bh, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0; // bw/bh: image size in blocks, not pixels (compress() passes bw, bh with blockNum = bw * bh).
+        virtual uint blockSize() const = 0;
+
+    protected:
+        CudaContext & m_ctx;
+    };
+
+    struct CudaCompressorDXT1 : public CudaCompressor
+    {
+        CudaCompressorDXT1(CudaContext & ctx) : CudaCompressor(ctx) {}
+
+        virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
+        virtual void compressBlocks(uint first, uint count, uint bw, uint bh, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 8; };
+    };
+
+    /*struct CudaCompressorDXT1n : public CudaCompressor
+    {
+        virtual void setup(const CompressionOptions::Private & compressionOptions);
+        virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+        virtual uint blockSize() const { return 8; };
+    };*/
+
+    struct CudaCompressorDXT3 : public CudaCompressor
+    {
+        CudaCompressorDXT3(CudaContext & ctx) : CudaCompressor(ctx) {}
+
+        virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
+        virtual void compressBlocks(uint first, uint count, uint bw, uint bh, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 16; };
+    };
+
+    struct CudaCompressorDXT5 : public CudaCompressor
+    {
+        CudaCompressorDXT5(CudaContext & ctx) : CudaCompressor(ctx) {}
+
+        virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
+        virtual void compressBlocks(uint first, uint count, uint bw, uint bh, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 16; };
+    };
+
+    /*struct CudaCompressorCXT1 : public CudaCompressor
+    {
+        virtual void setup(const CompressionOptions::Private & compressionOptions);
+        virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+        virtual uint blockSize() const { return 8; };
+    };*/
 
 #endif // defined HAVE_CUDA
 
diff --git a/src/nvtt/tools/CMakeLists.txt b/src/nvtt/tools/CMakeLists.txt
index 4272c53..20f80ee 100644
--- a/src/nvtt/tools/CMakeLists.txt
+++ b/src/nvtt/tools/CMakeLists.txt
@@ -48,14 +48,13 @@ IF(GCONFTOOL2)
 ENDIF(GCONFTOOL2)
 
 # UI tools
-IF(QT4_FOUND) # AND NOT MSVC)
+IF(QT4_FOUND)
 	SET(QT_USE_QTOPENGL TRUE)
 	INCLUDE_DIRECTORIES(${QT_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
 	
 	SET(SRCS
-		ui/main.cpp
-		ui/configdialog.h
-		ui/configdialog.cpp)
+                compressdialog.h
+                compressdialog.cpp)
 
 	SET(LIBS
 		nvtt
@@ -63,12 +62,12 @@ IF(QT4_FOUND) # AND NOT MSVC)
 		${QT_QTGUI_LIBRARY}
 		${QT_QTOPENGL_LIBRARY})
 
-	QT4_WRAP_UI(UICS ui/configdialog.ui)
-	QT4_WRAP_CPP(MOCS ui/configdialog.h)
+        QT4_WRAP_UI(UICS compressdialog.ui)
+        QT4_WRAP_CPP(MOCS compressdialog.h)
 	#QT4_ADD_RESOURCES(RCCS ui/configdialog.rc)
 
-    #ADD_EXECUTABLE(nvcompressui MACOSX_BUNDLE ${SRCS} ${UICS} ${MOCS})
-    #TARGET_LINK_LIBRARIES(nvcompressui ${LIBS})
+    ADD_EXECUTABLE(nvtt-diag MACOSX_BUNDLE ${SRCS} ${UICS} ${MOCS})
+    TARGET_LINK_LIBRARIES(nvtt-diag ${LIBS})
 
-ENDIF(QT4_FOUND) # AND NOT MSVC)
+ENDIF(QT4_FOUND)
 
diff --git a/src/nvtt/tools/compressdialog.cpp b/src/nvtt/tools/compressdialog.cpp
new file mode 100644
index 0000000..c3b1d11
--- /dev/null
+++ b/src/nvtt/tools/compressdialog.cpp
@@ -0,0 +1,124 @@
+#include "compressdialog.h"
+#include "ui_compressdialog.h"
+
+#include <QtGui/QApplication>
+
+// Program entry point: runs CompressDialog via exec() and returns the dialog's result code.
+int main(int argc, char *argv[])
+{
+    QApplication app(argc, argv);
+    CompressDialog dialog("");  // constructed with an empty file name
+    const int result = dialog.exec();
+    return result;
+}
+
+
+
+CompressDialog::CompressDialog(const QString & fileName, QWidget *parent) : QDialog(parent)
+{
+    Q_UNUSED(fileName); // accepted for callers, but nothing consumes the file name yet
+    ui.setupUi(this);
+
+    //connect(ui.openButton, SIGNAL(clicked()), this, SLOT(openClicked()));
+    connect(ui.generateMipmapsCheckBox, SIGNAL(stateChanged(int)), this, SLOT(generateMipmapsChanged(int)));
+    // The max-level widgets depend on this checkbox as well, so it reuses the mipmap refresh slot.
+    connect(ui.limitMipmapsCheckBox, SIGNAL(stateChanged(int)), this, SLOT(generateMipmapsChanged(int)));
+    connect(ui.mipmapFilterComboBox, SIGNAL(activated(QString)), this, SLOT(mipmapFilterChanged(QString)));
+    //connect(ui.mipmapFilterSettings, SIGNAL(clicked()), this, SLOT(mipmapFilterSettingsShow()));
+    connect(ui.formatComboBox, SIGNAL(activated(QString)), this, SLOT(formatChanged(QString)));
+
+    connect(ui.redSpinBox, SIGNAL(valueChanged(double)), this, SLOT(colorWeightChanged()));
+    connect(ui.greenSpinBox, SIGNAL(valueChanged(double)), this, SLOT(colorWeightChanged()));
+    connect(ui.blueSpinBox, SIGNAL(valueChanged(double)), this, SLOT(colorWeightChanged()));
+    connect(ui.uniformButton, SIGNAL(toggled(bool)), this, SLOT(uniformWeightToggled(bool)));
+    connect(ui.luminanceButton, SIGNAL(toggled(bool)), this, SLOT(luminanceWeightToggled(bool)));
+    //connect(ui.rgbMapRadioButton, SIGNAL(toggled(bool)), this, SLOT(colorModeChanged()));
+    //connect(ui.normalMapRadioButton, SIGNAL(toggled(bool)), this, SLOT(normalMapModeChanged(bool)));
+}
+
+// Empty destructor: this class performs no cleanup of its own.
+CompressDialog::~CompressDialog()
+{}
+
+
+void CompressDialog::openClicked()
+{
+    // @@ Placeholder: the .ui defines no openButton, so the connect() for this slot stays commented out.
+}
+
+
+// Refreshes the enabled state of all mipmap-related widgets from the current checkbox/combo values.
+void CompressDialog::generateMipmapsChanged(int state)
+{
+    Q_UNUSED(state); // the checkbox is queried directly instead
+    const bool mipmapsOn = ui.generateMipmapsCheckBox->isChecked();
+    const bool kaiserSelected = (ui.mipmapFilterComboBox->currentText() == "Kaiser");
+    const bool levelLimited = ui.limitMipmapsCheckBox->isChecked();
+
+    // The basic mipmap controls simply follow the "Generate mipmaps" checkbox.
+    ui.mipmapFilterLabel->setEnabled(mipmapsOn);
+    ui.mipmapFilterComboBox->setEnabled(mipmapsOn);
+    ui.limitMipmapsCheckBox->setEnabled(mipmapsOn);
+    // Dependent controls additionally require their own precondition.
+    ui.mipmapFilterSettings->setEnabled(mipmapsOn && kaiserSelected);
+    ui.maxLevelLabel->setEnabled(mipmapsOn && levelLimited);
+    ui.maxLevelSpinBox->setEnabled(mipmapsOn && levelLimited);
+}
+
+// The filter settings button is enabled only while the "Kaiser" filter is selected.
+void CompressDialog::mipmapFilterChanged(QString name)
+{
+    ui.mipmapFilterSettings->setEnabled(name == "Kaiser");
+}
+
+// Switches the format options stack: page 1 for "Uncompressed", page 0 for every other format.
+// (In compressdialog.ui page 0 is page_colorweights and page 1 is page_pixelformat.)
+void CompressDialog::formatChanged(QString format)
+{
+    const bool uncompressed = (format == "Uncompressed");
+    const int pageIndex = uncompressed ? 1 : 0;
+
+    ui.formatOptions->setCurrentIndex(pageIndex);
+}
+
+// Syncs the Uniform/Luminance preset buttons with the spin boxes. Weights are compared with
+// qFuzzyCompare because exact == on doubles is fragile for values such as 0.3 or 0.59.
+void CompressDialog::colorWeightChanged()
+{
+    const double r = ui.redSpinBox->value();
+    const double g = ui.greenSpinBox->value();
+    const double b = ui.blueSpinBox->value();
+
+    // A preset button stays checked only while all three weights match that preset.
+    ui.uniformButton->setChecked(qFuzzyCompare(r, 1.0) && qFuzzyCompare(g, 1.0) && qFuzzyCompare(b, 1.0));
+    ui.luminanceButton->setChecked(qFuzzyCompare(r, 0.3) && qFuzzyCompare(g, 0.59) && qFuzzyCompare(b, 0.11));
+}
+
+// Checking the Uniform button sets all three color weights to 1.0; unchecking changes nothing.
+void CompressDialog::uniformWeightToggled(bool checked)
+{
+    if (!checked) return;
+
+    ui.redSpinBox->setValue(1.0);
+    ui.greenSpinBox->setValue(1.0);
+    ui.blueSpinBox->setValue(1.0);
+}
+
+// Checking the Luminance button sets the weights to 0.3 / 0.59 / 0.11; unchecking changes nothing.
+void CompressDialog::luminanceWeightToggled(bool checked)
+{
+    if (!checked) return;
+
+    ui.redSpinBox->setValue(0.3);
+    ui.greenSpinBox->setValue(0.59);
+    ui.blueSpinBox->setValue(0.11);
+}
+
+void CompressDialog::normalMapModeChanged(bool checked) // No-op stub: its connect() is commented out and the widgets below are not in the .ui.
+{
+    //ui.alphaModeGroupBox->setEnabled(!checked);
+    //ui.inputGammaSpinBox->setEnabled(!checked);
+    //ui.inputGammaLabel->setEnabled(!checked);
+    //ui.outputGammaSpinBox->setEnabled(!checked);
+    //ui.outputGammaLabel->setEnabled(!checked);
+}
diff --git a/src/nvtt/tools/compressdialog.h b/src/nvtt/tools/compressdialog.h
new file mode 100644
index 0000000..e17b948
--- /dev/null
+++ b/src/nvtt/tools/compressdialog.h
@@ -0,0 +1,34 @@
+#ifndef COMPRESSDIALOG_H
+#define COMPRESSDIALOG_H
+
+#include <QtGui/QDialog>
+
+#include "ui_compressdialog.h"
+
+
+// Dialog built from compressdialog.ui; its slots keep dependent option widgets in sync.
+class CompressDialog : public QDialog
+{
+    Q_OBJECT
+public:
+    explicit CompressDialog(const QString & fileName, QWidget *parent = 0);
+    ~CompressDialog();
+
+protected slots:
+    // Mipmap and format option handlers (openClicked is currently an empty placeholder).
+    void openClicked();
+    void generateMipmapsChanged(int state);
+    void mipmapFilterChanged(QString name);
+    void formatChanged(QString format);
+
+    // Color weight spin boxes and their Uniform/Luminance preset buttons.
+    void colorWeightChanged();
+    void uniformWeightToggled(bool checked);
+    void luminanceWeightToggled(bool checked);
+
+    void normalMapModeChanged(bool checked);
+private:
+    Ui::CompressDialog ui;
+};
+
+#endif // COMPRESSDIALOG_H
diff --git a/src/nvtt/tools/compressdialog.ui b/src/nvtt/tools/compressdialog.ui
new file mode 100644
index 0000000..a850a29
--- /dev/null
+++ b/src/nvtt/tools/compressdialog.ui
@@ -0,0 +1,768 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>CompressDialog</class>
+ <widget class="QDialog" name="CompressDialog">
+  <property name="minimumSize">
+   <size>
+    <width>280</width>
+    <height>540</height>
+   </size>
+  </property>
+  <property name="windowTitle">
+   <string>Dialog</string>
+  </property>
+  <layout class="QVBoxLayout" name="verticalLayout_2">
+   <property name="spacing">
+    <number>8</number>
+   </property>
+   <property name="margin">
+    <number>10</number>
+   </property>
+   <item>
+    <layout class="QHBoxLayout" name="horizontalLayout">
+     <item>
+      <widget class="QLineEdit" name="lineEdit"/>
+     </item>
+     <item>
+      <widget class="QToolButton" name="toolButton">
+       <property name="maximumSize">
+        <size>
+         <width>16777215</width>
+         <height>24</height>
+        </size>
+       </property>
+       <property name="text">
+        <string>...</string>
+       </property>
+      </widget>
+     </item>
+    </layout>
+   </item>
+   <item>
+    <widget class="QToolBox" name="toolBox">
+     <property name="sizePolicy">
+      <sizepolicy hsizetype="Preferred" vsizetype="Expanding">
+       <horstretch>0</horstretch>
+       <verstretch>0</verstretch>
+      </sizepolicy>
+     </property>
+     <property name="frameShape">
+      <enum>QFrame::Box</enum>
+     </property>
+     <property name="frameShadow">
+      <enum>QFrame::Sunken</enum>
+     </property>
+     <property name="currentIndex">
+      <number>3</number>
+     </property>
+     <widget class="QWidget" name="page_info">
+      <property name="geometry">
+       <rect>
+        <x>0</x>
+        <y>0</y>
+        <width>249</width>
+        <height>367</height>
+       </rect>
+      </property>
+      <attribute name="label">
+       <string>Image Info</string>
+      </attribute>
+     </widget>
+     <widget class="QWidget" name="page_compression">
+      <property name="geometry">
+       <rect>
+        <x>0</x>
+        <y>0</y>
+        <width>249</width>
+        <height>367</height>
+       </rect>
+      </property>
+      <attribute name="label">
+       <string>Compression Options</string>
+      </attribute>
+      <layout class="QVBoxLayout" name="verticalLayout">
+       <property name="spacing">
+        <number>2</number>
+       </property>
+       <property name="margin">
+        <number>6</number>
+       </property>
+       <item>
+        <layout class="QHBoxLayout">
+         <item>
+          <widget class="QLabel" name="label_3">
+           <property name="sizePolicy">
+            <sizepolicy hsizetype="Expanding" vsizetype="Preferred">
+             <horstretch>0</horstretch>
+             <verstretch>0</verstretch>
+            </sizepolicy>
+           </property>
+           <property name="text">
+            <string>Format:</string>
+           </property>
+           <property name="textFormat">
+            <enum>Qt::PlainText</enum>
+           </property>
+           <property name="textInteractionFlags">
+            <set>Qt::NoTextInteraction</set>
+           </property>
+          </widget>
+         </item>
+         <item>
+          <widget class="QComboBox" name="formatComboBox">
+           <item>
+            <property name="text">
+             <string>Uncompressed</string>
+            </property>
+           </item>
+           <item>
+            <property name="text">
+             <string>BC1 (DXT1)</string>
+            </property>
+           </item>
+           <item>
+            <property name="text">
+             <string>BC1a (DXT1a)</string>
+            </property>
+           </item>
+           <item>
+            <property name="text">
+             <string>BC2 (DXT3)</string>
+            </property>
+           </item>
+           <item>
+            <property name="text">
+             <string>BC3 (DXT5)</string>
+            </property>
+           </item>
+           <item>
+            <property name="text">
+             <string>BC4</string>
+            </property>
+           </item>
+           <item>
+            <property name="text">
+             <string>BC5</string>
+            </property>
+           </item>
+          </widget>
+         </item>
+        </layout>
+       </item>
+       <item>
+        <widget class="QStackedWidget" name="formatOptions">
+         <property name="sizePolicy">
+          <sizepolicy hsizetype="Preferred" vsizetype="Expanding">
+           <horstretch>0</horstretch>
+           <verstretch>0</verstretch>
+          </sizepolicy>
+         </property>
+         <property name="currentIndex">
+          <number>1</number>
+         </property>
+         <widget class="QWidget" name="page_colorweights">
+          <layout class="QVBoxLayout" name="verticalLayout_3">
+           <property name="leftMargin">
+            <number>0</number>
+           </property>
+           <property name="topMargin">
+            <number>8</number>
+           </property>
+           <property name="rightMargin">
+            <number>0</number>
+           </property>
+           <property name="bottomMargin">
+            <number>0</number>
+           </property>
+           <item>
+            <widget class="QGroupBox" name="groupBox_6">
+             <property name="sizePolicy">
+              <sizepolicy hsizetype="MinimumExpanding" vsizetype="Fixed">
+               <horstretch>0</horstretch>
+               <verstretch>0</verstretch>
+              </sizepolicy>
+             </property>
+             <property name="title">
+              <string>Color Weights</string>
+             </property>
+             <property name="alignment">
+              <set>Qt::AlignHCenter</set>
+             </property>
+             <layout class="QVBoxLayout" name="_2">
+              <property name="topMargin">
+               <number>12</number>
+              </property>
+              <item>
+               <layout class="QHBoxLayout" name="_3">
+                <item>
+                 <widget class="QLabel" name="label_10">
+                  <property name="text">
+                   <string>Red</string>
+                  </property>
+                  <property name="buddy">
+                   <cstring>redSpinBox</cstring>
+                  </property>
+                 </widget>
+                </item>
+                <item>
+                 <widget class="QDoubleSpinBox" name="redSpinBox">
+                  <property name="maximum">
+                   <double>1.000000000000000</double>
+                  </property>
+                  <property name="singleStep">
+                   <double>0.050000000000000</double>
+                  </property>
+                  <property name="value">
+                   <double>1.000000000000000</double>
+                  </property>
+                 </widget>
+                </item>
+               </layout>
+              </item>
+              <item>
+               <layout class="QHBoxLayout" name="_4">
+                <item>
+                 <widget class="QLabel" name="label_11">
+                  <property name="text">
+                   <string>Green</string>
+                  </property>
+                  <property name="buddy">
+                   <cstring>greenSpinBox</cstring>
+                  </property>
+                 </widget>
+                </item>
+                <item>
+                 <widget class="QDoubleSpinBox" name="greenSpinBox">
+                  <property name="maximum">
+                   <double>1.000000000000000</double>
+                  </property>
+                  <property name="singleStep">
+                   <double>0.050000000000000</double>
+                  </property>
+                  <property name="value">
+                   <double>1.000000000000000</double>
+                  </property>
+                 </widget>
+                </item>
+               </layout>
+              </item>
+              <item>
+               <layout class="QHBoxLayout" name="_5">
+                <item>
+                 <widget class="QLabel" name="label_12">
+                  <property name="text">
+                   <string>Blue</string>
+                  </property>
+                  <property name="buddy">
+                   <cstring>blueSpinBox</cstring>
+                  </property>
+                 </widget>
+                </item>
+                <item>
+                 <widget class="QDoubleSpinBox" name="blueSpinBox">
+                  <property name="maximum">
+                   <double>1.000000000000000</double>
+                  </property>
+                  <property name="singleStep">
+                   <double>0.050000000000000</double>
+                  </property>
+                  <property name="value">
+                   <double>1.000000000000000</double>
+                  </property>
+                 </widget>
+                </item>
+               </layout>
+              </item>
+              <item>
+               <layout class="QHBoxLayout" name="_6">
+                <item>
+                 <widget class="QToolButton" name="uniformButton">
+                  <property name="text">
+                   <string>Uniform</string>
+                  </property>
+                  <property name="checkable">
+                   <bool>true</bool>
+                  </property>
+                  <property name="checked">
+                   <bool>true</bool>
+                  </property>
+                 </widget>
+                </item>
+                <item>
+                 <widget class="QToolButton" name="luminanceButton">
+                  <property name="text">
+                   <string>Luminance</string>
+                  </property>
+                  <property name="checkable">
+                   <bool>true</bool>
+                  </property>
+                 </widget>
+                </item>
+               </layout>
+              </item>
+             </layout>
+            </widget>
+           </item>
+          </layout>
+         </widget>
+         <widget class="QWidget" name="page_pixelformat">
+          <layout class="QVBoxLayout" name="verticalLayout_4">
+           <property name="spacing">
+            <number>1</number>
+           </property>
+           <property name="margin">
+            <number>0</number>
+           </property>
+           <item>
+            <layout class="QHBoxLayout" name="horizontalLayout_4">
+             <item>
+              <widget class="QLabel" name="label_4">
+               <property name="sizePolicy">
+                <sizepolicy hsizetype="Expanding" vsizetype="Preferred">
+                 <horstretch>0</horstretch>
+                 <verstretch>0</verstretch>
+                </sizepolicy>
+               </property>
+               <property name="maximumSize">
+                <size>
+                 <width>16777215</width>
+                 <height>24</height>
+                </size>
+               </property>
+               <property name="text">
+                <string>Pixel Format:</string>
+               </property>
+               <property name="textFormat">
+                <enum>Qt::PlainText</enum>
+               </property>
+               <property name="textInteractionFlags">
+                <set>Qt::NoTextInteraction</set>
+               </property>
+              </widget>
+             </item>
+             <item>
+              <widget class="QComboBox" name="pixelformatComboBox">
+               <property name="currentIndex">
+                <number>1</number>
+               </property>
+               <item>
+                <property name="text">
+                 <string>Custom</string>
+                </property>
+               </item>
+               <item>
+                <property name="text">
+                 <string>R8G8B8A8</string>
+                </property>
+               </item>
+               <item>
+                <property name="text">
+                 <string>R5G6B5</string>
+                </property>
+               </item>
+               <item>
+                <property name="text">
+                 <string>A1</string>
+                </property>
+               </item>
+              </widget>
+             </item>
+            </layout>
+           </item>
+           <item>
+            <layout class="QHBoxLayout" name="horizontalLayout_9">
+             <item>
+              <widget class="QLabel" name="label_7">
+               <property name="text">
+                <string>Color Type:</string>
+               </property>
+              </widget>
+             </item>
+             <item>
+              <widget class="QComboBox" name="comboBox">
+               <item>
+                <property name="text">
+                 <string>Fixed</string>
+                </property>
+               </item>
+               <item>
+                <property name="text">
+                 <string>Float</string>
+                </property>
+               </item>
+              </widget>
+             </item>
+            </layout>
+           </item>
+           <item>
+            <layout class="QHBoxLayout" name="horizontalLayout_5">
+             <item>
+              <widget class="QLabel" name="label">
+               <property name="text">
+                <string>Red Bits:</string>
+               </property>
+              </widget>
+             </item>
+             <item>
+              <widget class="QSpinBox" name="redbitSpinBox">
+               <property name="maximum">
+                <number>32</number>
+               </property>
+               <property name="value">
+                <number>8</number>
+               </property>
+              </widget>
+             </item>
+            </layout>
+           </item>
+           <item>
+            <layout class="QHBoxLayout" name="horizontalLayout_8">
+             <item>
+              <widget class="QLabel" name="label_2">
+               <property name="text">
+                <string>Green Bits:</string>
+               </property>
+              </widget>
+             </item>
+             <item>
+              <widget class="QSpinBox" name="greenbitSpinBox">
+               <property name="maximum">
+                <number>32</number>
+               </property>
+               <property name="value">
+                <number>8</number>
+               </property>
+              </widget>
+             </item>
+            </layout>
+           </item>
+           <item>
+            <layout class="QHBoxLayout" name="horizontalLayout_6">
+             <item>
+              <widget class="QLabel" name="label_5">
+               <property name="text">
+                <string>Blue Bits:</string>
+               </property>
+              </widget>
+             </item>
+             <item>
+              <widget class="QSpinBox" name="bluebitSpinBox">
+               <property name="maximum">
+                <number>32</number>
+               </property>
+               <property name="value">
+                <number>8</number>
+               </property>
+              </widget>
+             </item>
+            </layout>
+           </item>
+           <item>
+            <layout class="QHBoxLayout" name="horizontalLayout_7">
+             <item>
+              <widget class="QLabel" name="label_6">
+               <property name="text">
+                <string>Alpha Bits:</string>
+               </property>
+              </widget>
+             </item>
+             <item>
+              <widget class="QSpinBox" name="alphabitSpinBox">
+               <property name="maximum">
+                <number>32</number>
+               </property>
+               <property name="value">
+                <number>8</number>
+               </property>
+              </widget>
+             </item>
+            </layout>
+           </item>
+          </layout>
+         </widget>
+        </widget>
+       </item>
+       <item>
+        <widget class="QCheckBox" name="ditherColorCheckBox">
+         <property name="text">
+          <string>Dither Color</string>
+         </property>
+        </widget>
+       </item>
+       <item>
+        <widget class="QCheckBox" name="checkBox">
+         <property name="text">
+          <string>Dither Alpha</string>
+         </property>
+        </widget>
+       </item>
+       <item>
+        <spacer name="verticalSpacer">
+         <property name="orientation">
+          <enum>Qt::Vertical</enum>
+         </property>
+         <property name="sizeHint" stdset="0">
+          <size>
+           <width>20</width>
+           <height>40</height>
+          </size>
+         </property>
+        </spacer>
+       </item>
+      </layout>
+     </widget>
+     <widget class="QWidget" name="page_resize">
+      <property name="geometry">
+       <rect>
+        <x>0</x>
+        <y>0</y>
+        <width>249</width>
+        <height>367</height>
+       </rect>
+      </property>
+      <attribute name="label">
+       <string>Resize Options</string>
+      </attribute>
+     </widget>
+     <widget class="QWidget" name="page_mipmap">
+      <property name="geometry">
+       <rect>
+        <x>0</x>
+        <y>0</y>
+        <width>241</width>
+        <height>357</height>
+       </rect>
+      </property>
+      <attribute name="label">
+       <string>Mipmap Options</string>
+      </attribute>
+      <layout class="QVBoxLayout" name="verticalLayout_5">
+       <property name="spacing">
+        <number>2</number>
+       </property>
+       <property name="margin">
+        <number>6</number>
+       </property>
+       <item>
+        <widget class="QCheckBox" name="generateMipmapsCheckBox">
+         <property name="sizePolicy">
+          <sizepolicy hsizetype="Expanding" vsizetype="Fixed">
+           <horstretch>0</horstretch>
+           <verstretch>0</verstretch>
+          </sizepolicy>
+         </property>
+         <property name="text">
+          <string>Generate mipmaps</string>
+         </property>
+         <property name="checked">
+          <bool>true</bool>
+         </property>
+        </widget>
+       </item>
+       <item>
+        <layout class="QHBoxLayout" name="_8">
+         <property name="spacing">
+          <number>1</number>
+         </property>
+         <item>
+          <widget class="QLabel" name="mipmapFilterLabel">
+           <property name="sizePolicy">
+            <sizepolicy hsizetype="Expanding" vsizetype="Preferred">
+             <horstretch>0</horstretch>
+             <verstretch>0</verstretch>
+            </sizepolicy>
+           </property>
+           <property name="maximumSize">
+            <size>
+             <width>16777215</width>
+             <height>26</height>
+            </size>
+           </property>
+           <property name="text">
+            <string>Filter:</string>
+           </property>
+           <property name="buddy">
+            <cstring>mipmapFilterComboBox</cstring>
+           </property>
+          </widget>
+         </item>
+         <item>
+          <widget class="QComboBox" name="mipmapFilterComboBox">
+           <property name="maximumSize">
+            <size>
+             <width>16777215</width>
+             <height>26</height>
+            </size>
+           </property>
+           <item>
+            <property name="text">
+             <string>Box</string>
+            </property>
+           </item>
+           <item>
+            <property name="text">
+             <string>Triangle</string>
+            </property>
+           </item>
+           <item>
+            <property name="text">
+             <string>Kaiser</string>
+            </property>
+           </item>
+          </widget>
+         </item>
+         <item>
+          <widget class="QToolButton" name="mipmapFilterSettings">
+           <property name="enabled">
+            <bool>false</bool>
+           </property>
+           <property name="autoFillBackground">
+            <bool>false</bool>
+           </property>
+           <property name="text">
+            <string>...</string>
+           </property>
+           <property name="toolButtonStyle">
+            <enum>Qt::ToolButtonTextOnly</enum>
+           </property>
+          </widget>
+         </item>
+        </layout>
+       </item>
+       <item>
+        <layout class="QHBoxLayout" name="_9">
+         <item>
+          <widget class="QCheckBox" name="limitMipmapsCheckBox">
+           <property name="sizePolicy">
+            <sizepolicy hsizetype="Expanding" vsizetype="Fixed">
+             <horstretch>0</horstretch>
+             <verstretch>0</verstretch>
+            </sizepolicy>
+           </property>
+           <property name="text">
+            <string>Limit Mipmaps</string>
+           </property>
+          </widget>
+         </item>
+        </layout>
+       </item>
+       <item>
+        <layout class="QHBoxLayout" name="horizontalLayout_10">
+         <item>
+          <spacer name="horizontalSpacer">
+           <property name="orientation">
+            <enum>Qt::Horizontal</enum>
+           </property>
+           <property name="sizeHint" stdset="0">
+            <size>
+             <width>40</width>
+             <height>20</height>
+            </size>
+           </property>
+          </spacer>
+         </item>
+         <item>
+          <widget class="QLabel" name="maxLevelLabel">
+           <property name="enabled">
+            <bool>false</bool>
+           </property>
+           <property name="sizePolicy">
+            <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
+             <horstretch>0</horstretch>
+             <verstretch>0</verstretch>
+            </sizepolicy>
+           </property>
+           <property name="text">
+            <string>&amp;Max Level:</string>
+           </property>
+           <property name="buddy">
+            <cstring>maxLevelSpinBox</cstring>
+           </property>
+          </widget>
+         </item>
+         <item>
+          <widget class="QSpinBox" name="maxLevelSpinBox">
+           <property name="enabled">
+            <bool>false</bool>
+           </property>
+           <property name="sizePolicy">
+            <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
+             <horstretch>0</horstretch>
+             <verstretch>0</verstretch>
+            </sizepolicy>
+           </property>
+           <property name="maximumSize">
+            <size>
+             <width>80</width>
+             <height>16777215</height>
+            </size>
+           </property>
+          </widget>
+         </item>
+        </layout>
+       </item>
+       <item>
+        <spacer name="verticalSpacer_3">
+         <property name="orientation">
+          <enum>Qt::Vertical</enum>
+         </property>
+         <property name="sizeHint" stdset="0">
+          <size>
+           <width>20</width>
+           <height>40</height>
+          </size>
+         </property>
+        </spacer>
+       </item>
+      </layout>
+     </widget>
+    </widget>
+   </item>
+   <item>
+    <widget class="QDialogButtonBox" name="buttonBox">
+     <property name="orientation">
+      <enum>Qt::Horizontal</enum>
+     </property>
+     <property name="standardButtons">
+      <set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set>
+     </property>
+    </widget>
+   </item>
+  </layout>
+ </widget>
+ <resources/>
+ <connections>
+  <connection>
+   <sender>buttonBox</sender>
+   <signal>accepted()</signal>
+   <receiver>CompressDialog</receiver>
+   <slot>accept()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>248</x>
+     <y>254</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>157</x>
+     <y>274</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>buttonBox</sender>
+   <signal>rejected()</signal>
+   <receiver>CompressDialog</receiver>
+   <slot>reject()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>316</x>
+     <y>260</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>286</x>
+     <y>274</y>
+    </hint>
+   </hints>
+  </connection>
+ </connections>
+</ui>