From 2f6e885cedd5c3e2adf7b0efc9f1a084fb5859d5 Mon Sep 17 00:00:00 2001
From: castano
Date: Wed, 1 Oct 2008 22:24:53 +0000
Subject: [PATCH] Add DXT1 compressor that uses texture to avoid CPU swizzling.
 Fix errors under emulation. Experiment with DXT5 compressor.

---
 src/nvtt/cuda/CompressKernel.cu   | 379 ++++++++++++++++++++++++------
 src/nvtt/cuda/CudaCompressDXT.cpp |  91 ++++++-
 src/nvtt/cuda/CudaCompressDXT.h   |   2 +
 src/nvtt/cuda/CudaMath.h          |  25 +-
 4 files changed, 410 insertions(+), 87 deletions(-)

diff --git a/src/nvtt/cuda/CompressKernel.cu b/src/nvtt/cuda/CompressKernel.cu
index 60ec132..9711014 100644
--- a/src/nvtt/cuda/CompressKernel.cu
+++ b/src/nvtt/cuda/CompressKernel.cu
@@ -21,10 +21,8 @@
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 // OTHER DEALINGS IN THE SOFTWARE.
 
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <float.h> // FLT_MAX
 
 #include "CudaMath.h"
 
@@ -53,65 +51,57 @@ __device__ inline void swap(T & a, T & b)
 __constant__ float3 kColorMetric = { 1.0f, 1.0f, 1.0f };
 __constant__ float3 kColorMetricSqr = { 1.0f, 1.0f, 1.0f };
 
+// Some kernels read the input through texture.
+texture<uchar4, 2, cudaReadModeNormalizedFloat> tex;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Sort colors
 ////////////////////////////////////////////////////////////////////////////////
-__device__ void sortColors(const float * values, int * cmp)
+__device__ void sortColors(const float * values, int * ranks)
 {
-	int tid = threadIdx.x;
+#if __DEVICE_EMULATION__
+	if (threadIdx.x == 0)
+	{
+		for (int tid = 0; tid < 16; tid++)
+		{
+			int rank = 0;
+			for (int i = 0; i < 16; i++)
+			{
+				rank += (values[i] < values[tid]);
+			}
+
+			ranks[tid] = rank;
+		}
 
-#if 1
-	cmp[tid] = (values[0] < values[tid]);
-	cmp[tid] += (values[1] < values[tid]);
-	cmp[tid] += (values[2] < values[tid]);
-	cmp[tid] += (values[3] < values[tid]);
-	cmp[tid] += (values[4] < values[tid]);
-	cmp[tid] += (values[5] < values[tid]);
-	cmp[tid] += (values[6] < values[tid]);
-	cmp[tid] += (values[7] < values[tid]);
-	cmp[tid] += (values[8] < values[tid]);
-	cmp[tid] += (values[9] < values[tid]);
-	cmp[tid] += (values[10] < values[tid]);
-	cmp[tid] += (values[11] < values[tid]);
-	cmp[tid] += (values[12] < values[tid]);
-	cmp[tid] += (values[13] < values[tid]);
-	cmp[tid] += (values[14] < values[tid]);
-	cmp[tid] += (values[15] < values[tid]);
-
-	// Resolve elements with the same index.
-	if (tid > 0 && cmp[tid] == cmp[0]) ++cmp[tid];
-	if (tid > 1 && cmp[tid] == cmp[1]) ++cmp[tid];
-	if (tid > 2 && cmp[tid] == cmp[2]) ++cmp[tid];
-	if (tid > 3 && cmp[tid] == cmp[3]) ++cmp[tid];
-	if (tid > 4 && cmp[tid] == cmp[4]) ++cmp[tid];
-	if (tid > 5 && cmp[tid] == cmp[5]) ++cmp[tid];
-	if (tid > 6 && cmp[tid] == cmp[6]) ++cmp[tid];
-	if (tid > 7 && cmp[tid] == cmp[7]) ++cmp[tid];
-	if (tid > 8 && cmp[tid] == cmp[8]) ++cmp[tid];
-	if (tid > 9 && cmp[tid] == cmp[9]) ++cmp[tid];
-	if (tid > 10 && cmp[tid] == cmp[10]) ++cmp[tid];
-	if (tid > 11 && cmp[tid] == cmp[11]) ++cmp[tid];
-	if (tid > 12 && cmp[tid] == cmp[12]) ++cmp[tid];
-	if (tid > 13 && cmp[tid] == cmp[13]) ++cmp[tid];
-	if (tid > 14 && cmp[tid] == cmp[14]) ++cmp[tid];
+		// Resolve elements with the same index.
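+		// Equal values would otherwise share a rank; bumping the later element
+		// turns the ranks into a permutation of 0..15, so the scatter in the
+		// load functions writes each color to a unique slot.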
+ for (int i = 0; i < 15; i++) + { + for (int tid = 0; tid < 16; tid++) + { + if (tid > i && ranks[tid] == ranks[i]) ++ranks[tid]; + } + } + } #else + const int tid = threadIdx.x; - cmp[tid] = 0; + int rank = 0; - #pragma unroll - for (int i = 0; i < 16; i++) - { - cmp[tid] += (values[i] < values[tid]); - } + #pragma unroll + for (int i = 0; i < 16; i++) + { + rank += (values[i] < values[tid]); + } + + ranks[tid] = rank; - // Resolve elements with the same index. - #pragma unroll - for (int i = 0; i < 15; i++) - { - if (tid > 0 && cmp[tid] == cmp[i]) ++cmp[tid]; - } + // Resolve elements with the same index. + #pragma unroll + for (int i = 0; i < 15; i++) + { + if (tid > i && ranks[tid] == ranks[i]) ++ranks[tid]; + } #endif } @@ -136,9 +126,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f); // No need to synchronize, 16 < warp size. -#if __DEVICE_EMULATION__ - } __debugsync(); if (idx < 16) { -#endif + __debugsync(); // Sort colors along the best fit line. colorSums(colors, sums); @@ -148,17 +136,74 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum dps[idx] = dot(colors[idx], axis); -#if __DEVICE_EMULATION__ - } __debugsync(); if (idx < 16) { -#endif + __debugsync(); sortColors(dps, xrefs); float3 tmp = colors[idx]; + __debugsync(); colors[xrefs[idx]] = tmp; } +#if __DEVICE_EMULATION__ + else + { + __debugsync(); + __debugsync(); + __debugsync(); + } +#endif } +__device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor) +{ + const int bid = blockIdx.x; + const int idx = threadIdx.x; + + __shared__ float dps[16]; + + if (idx < 16) + { + float x = 4 * ((bn + bid) % w) + idx % 4; + float y = 4 * ((bn + bid) / w) + idx / 4; + + // Read color and copy to shared mem. + float4 c = tex2D(tex, x, y); + + colors[idx].x = c.z; + colors[idx].y = c.y; + colors[idx].z = c.x; + + // No need to synchronize, 16 < warp size. + __debugsync(); + + // Sort colors along the best fit line. + colorSums(colors, sums); + float3 axis = bestFitLine(colors, sums[0], kColorMetric); + + *sameColor = (axis == make_float3(0, 0, 0)); + + dps[idx] = dot(colors[idx], axis); + + __debugsync(); + + sortColors(dps, xrefs); + + float3 tmp = colors[idx]; + __debugsync(); + colors[xrefs[idx]] = tmp; + } +#if __DEVICE_EMULATION__ + else + { + __debugsync(); + __debugsync(); + __debugsync(); + } +#endif + +} + + __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor) { const int bid = blockIdx.x; @@ -179,11 +224,8 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum colors[idx] = rawColors[idx] * weights[idx]; - // No need to synchronize, 16 < warp size. -#if __DEVICE_EMULATION__ - } __debugsync(); if (idx < 16) { -#endif + __debugsync(); // Sort colors along the best fit line. 
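+		// The axis below is the principal direction of the color cloud (the
+		// first eigenvector of its covariance); projecting onto it gives the
+		// 1D ordering used by sortColors.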
 		colorSums(colors, sums);
@@ -193,18 +235,24 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 
 		dps[idx] = dot(rawColors[idx], axis);
 
-#if __DEVICE_EMULATION__
-	} __debugsync(); if (idx < 16) {
-#endif
+		__debugsync();
 
 		sortColors(dps, xrefs);
 
 		float3 tmp = colors[idx];
-		colors[xrefs[idx]] = tmp;
-
 		float w = weights[idx];
+		__debugsync();
+		colors[xrefs[idx]] = tmp;
 		weights[xrefs[idx]] = w;
 	}
+#if __DEVICE_EMULATION__
+	else
+	{
+		__debugsync();
+		__debugsync();
+		__debugsync();
+	}
+#endif
 }
 
 __device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16], int * sameColor)
 {
@@ -223,9 +271,7 @@ __device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sum
 		colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
 
 		// No need to synchronize, 16 < warp size.
-#if __DEVICE_EMULATION__
-	} __debugsync(); if (idx < 16) {
-#endif
+		__debugsync();
 
 		// Sort colors along the best fit line.
 		colorSums(colors, sums);
@@ -235,15 +281,22 @@ __device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sum
 
 		dps[idx] = dot(colors[idx], axis);
 
-#if __DEVICE_EMULATION__
-	} __debugsync(); if (idx < 16) {
-#endif
+		__debugsync();
 
 		sortColors(dps, xrefs);
 
 		float2 tmp = colors[idx];
+		__debugsync();
 		colors[xrefs[idx]] = tmp;
 	}
+#if __DEVICE_EMULATION__
+	else
+	{
+		__debugsync();
+		__debugsync();
+		__debugsync();
+	}
+#endif
 }
 
@@ -951,7 +1004,6 @@ __device__ int findMinError(float * errors)
 			}
 		}
 	}
-
 #else
 	for(int d = NUM_THREADS/2; d > 32; d >>= 1)
 	{
@@ -1135,6 +1187,41 @@ __global__ void compressDXT1(const uint * permutations, const uint * image, uint
 	}
 }
 
+__global__ void compressDXT1_Tex(uint bn, uint w, const uint * permutations, uint2 * result)
+{
+	__shared__ float3 colors[16];
+	__shared__ float3 sums[16];
+	__shared__ int xrefs[16];
+	__shared__ int sameColor;
+
+	loadColorBlockTex(bn, w, colors, sums, xrefs, &sameColor);
+
+	__syncthreads();
+
+	if (sameColor)
+	{
+		if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
+		return;
+	}
+
+	ushort bestStart, bestEnd;
+	uint bestPermutation;
+
+	__shared__ float errors[NUM_THREADS];
+
+	evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
+
+	// Use a parallel reduction to find minimum error.
+	const int minIdx = findMinError(errors);
+
+	// Only write the result of the winner thread.
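+	// findMinError reduces the shared error array and returns the index of
+	// the thread that holds the smallest error, so exactly one thread writes.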
+	if (threadIdx.x == minIdx)
+	{
+		saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result);
+	}
+}
+
+
 __global__ void compressLevel4DXT1(const uint * permutations, const uint * image, uint2 * result)
 {
 	__shared__ float3 colors[16];
@@ -1452,6 +1539,125 @@ __global__ void compressDXT5(const uint * permutations, const uint * image, uint
 }
 */
 
+/*__device__ void evaluatePalette(uint alpha0, uint alpha1, uint alpha[8])
+{
+	alpha[0] = alpha0;
+	alpha[1] = alpha1;
+	alpha[2] = (6 * alpha[0] + 1 * alpha[1]) / 7;	// bit code 010
+	alpha[3] = (5 * alpha[0] + 2 * alpha[1]) / 7;	// bit code 011
+	alpha[4] = (4 * alpha[0] + 3 * alpha[1]) / 7;	// bit code 100
+	alpha[5] = (3 * alpha[0] + 4 * alpha[1]) / 7;	// bit code 101
+	alpha[6] = (2 * alpha[0] + 5 * alpha[1]) / 7;	// bit code 110
+	alpha[7] = (1 * alpha[0] + 6 * alpha[1]) / 7;	// bit code 111
+}
+
+__device__ uint computeAlphaError(const uint block[16], uint alpha0, uint alpha1, int bestError = INT_MAX)
+{
+	uint alphas[8];
+	evaluatePalette(alpha0, alpha1, alphas);
+
+	int totalError = 0;
+
+	for (uint i = 0; i < 16; i++)
+	{
+		uint alpha = block[i];
+
+		// @@ It should be possible to do this much faster.
+		int minDist = INT_MAX;
+		for (uint p = 0; p < 8; p++)
+		{
+			int dist = alphaDistance(alpha, alphas[p]);
+			minDist = min(dist, minDist);
+		}
+
+		totalError += minDist;
+
+		if (totalError > bestError)
+		{
+			// early out
+			return totalError;
+		}
+	}
+
+	return totalError;
+}
+
+
+void compressDXT5A(uint alpha[16])
+{
+	// @@ Sketch adapted from the CPU compressor: mina/maxa are declared here,
+	// but dxtBlock and rgba still refer to locals of that code.
+	uint mina = 255, maxa = 0;
+
+	// Get min/max alpha.
+	for (uint i = 0; i < 16; i++)
+	{
+		mina = min(mina, alpha[i]);
+		maxa = max(maxa, alpha[i]);
+	}
+
+	dxtBlock->alpha0 = maxa;
+	dxtBlock->alpha1 = mina;
+
+	if (maxa - mina > 8)
+	{
+		int besterror = computeAlphaError(rgba, dxtBlock);
+		int besta0 = maxa;
+		int besta1 = mina;
+
+		// Expand search space a bit.
+		const int alphaExpand = 8;
+		mina = (mina <= alphaExpand) ? 0 : mina - alphaExpand;
+		maxa = (maxa >= 255-alphaExpand) ? 255 : maxa + alphaExpand;
+
+		for (int a0 = mina+9; a0 < maxa; a0++)
+		{
+			for (int a1 = mina; a1 < a0-8; a1++)
+			{
+				nvDebugCheck(a0 - a1 > 8);
+
+				dxtBlock->alpha0 = a0;
+				dxtBlock->alpha1 = a1;
+				int error = computeAlphaError(rgba, dxtBlock, besterror);
+
+				if (error < besterror)
+				{
+					besterror = error;
+					besta0 = a0;
+					besta1 = a1;
+				}
+			}
+		}
+
+		dxtBlock->alpha0 = besta0;
+		dxtBlock->alpha1 = besta1;
+	}
+}
+
+__global__ void compressDXT5n(uint blockNum, uint2 * d_result)
+{
+	uint idx = blockIdx.x * 128 + threadIdx.x;
+
+	if (idx >= blockNum)
+	{
+		return;
+	}
+
+	// @@ Ideally we would load the data to shared mem to achieve coalesced global mem access.
+	// @@ Blocks would require too much shared memory (8k) and limit occupancy.
+
+	// @@ Ideally we should use SIMD processing, multiple threads (4-8) processing the same block.
+	// That simplifies coalescing, and reduces divergence.
+
+	// @@ Experiment with texture. That's probably the most simple approach.
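+
+	// @@ x/y are meant to hold the per-texel components of the two-channel
+	// normal block; the gather and the endpoint search are not wired up yet.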
+	uint x[16];
+	uint y[16];
+
+
+}
+*/
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // Setup kernel
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -1479,6 +1685,20 @@ extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result
 	compressDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
 }
 
+extern "C" void compressKernelDXT1_Tex(uint bn, uint blockNum, uint w, cudaArray * d_data, uint * d_result, uint * d_bitmaps)
+{
+	// Setup texture
+	tex.normalized = false;
+	tex.filterMode = cudaFilterModePoint;
+	tex.addressMode[0] = cudaAddressModeClamp;
+	tex.addressMode[1] = cudaAddressModeClamp;
+
+	cudaBindTextureToArray(tex, d_data);
+
+	compressDXT1_Tex<<<blockNum, NUM_THREADS>>>(bn, w, d_bitmaps, (uint2 *)d_result);
+}
+
+
 extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
 {
 	compressLevel4DXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
 }
 
@@ -1498,3 +1718,16 @@ extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result
 {
 	compressCTX1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
 }
+
+extern "C" void compressKernelDXT5n(uint blockNum, cudaArray * d_data, uint * d_result)
+{
+	// Setup texture
+	tex.normalized = false;
+	tex.filterMode = cudaFilterModePoint;
+	tex.addressMode[0] = cudaAddressModeClamp;
+	tex.addressMode[1] = cudaAddressModeClamp;
+
+	cudaBindTextureToArray(tex, d_data);
+
+//	compressDXT5n<<<blockNum/128, 128>>>(blockNum, (uint2 *)d_result);
+}
diff --git a/src/nvtt/cuda/CudaCompressDXT.cpp b/src/nvtt/cuda/CudaCompressDXT.cpp
index dfd9084..bad6213 100644
--- a/src/nvtt/cuda/CudaCompressDXT.cpp
+++ b/src/nvtt/cuda/CudaCompressDXT.cpp
@@ -53,6 +53,7 @@ using namespace nvtt;
 
 extern "C" void setupCompressKernel(const float weights[3]);
 extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
+extern "C" void compressKernelDXT1_Tex(uint bn, uint blockNum, uint w, cudaArray * d_data, uint * d_result, uint * d_bitmaps);
 extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
 extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
 extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
@@ -286,7 +287,7 @@ void CudaCompressor::compressDXT1(const CompressionOptions::Private & compressio
 	}
 
 	clock_t end = clock();
-	//printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
+	printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
 
 	free(blockLinearImage);
 
@@ -298,6 +299,77 @@ void CudaCompressor::compressDXT1(const CompressionOptions::Private & compressio
 #endif
 }
 
+void CudaCompressor::compressDXT1_Tex(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+{
+	nvDebugCheck(cuda::isHardwarePresent());
+#if defined HAVE_CUDA
+
+	// Allocate image as a cuda array.
+	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
+
+	cudaArray * d_image;
+	const int imageSize = m_image->width() * m_image->height() * sizeof(uint);
+	cudaMallocArray(&d_image, &channelDesc, m_image->width(), m_image->height());
+	cudaMemcpyToArray(d_image, 0, 0, m_image->pixels(), imageSize, cudaMemcpyHostToDevice);
+
+	// Image size in blocks.
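+	// Round up so that partially covered blocks along the right and bottom
+	// edges are still counted and compressed.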
+	const uint w = (m_image->width() + 3) / 4;
+	const uint h = (m_image->height() + 3) / 4;
+	const uint blockNum = w * h;
+	const uint compressedSize = blockNum * 8;
+
+	void * h_result = malloc(MAX_BLOCKS * 8);
+
+	clock_t start = clock();
+
+	setupCompressKernel(compressionOptions.colorWeight.ptr());
+
+	uint bn = 0;
+	while(bn != blockNum)
+	{
+		uint count = min(blockNum - bn, MAX_BLOCKS);
+
+		// Launch kernel.
+		compressKernelDXT1_Tex(bn, count, w, d_image, m_result, m_bitmapTable);
+
+		// Check for errors.
+		cudaError_t err = cudaGetLastError();
+		if (err != cudaSuccess)
+		{
+			nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
+
+			if (outputOptions.errorHandler != NULL)
+			{
+				outputOptions.errorHandler->error(Error_CudaError);
+			}
+		}
+
+		// Copy result to host, overwrite swizzled image.
+		cudaMemcpy(h_result, m_result, count * 8, cudaMemcpyDeviceToHost);
+
+		// Output result.
+		if (outputOptions.outputHandler != NULL)
+		{
+			outputOptions.outputHandler->writeData(h_result, count * 8);
+		}
+
+		bn += count;
+	}
+
+	clock_t end = clock();
+	printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
+
+	free(h_result);
+	cudaFreeArray(d_image);
+
+#else
+	if (outputOptions.errorHandler != NULL)
+	{
+		outputOptions.errorHandler->error(Error_CudaError);
+	}
+#endif
+}
+
+
 /// Compress image using CUDA.
 void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
@@ -618,3 +690,20 @@ void CudaCompressor::compressCTX1(const nvtt::CompressionOptions::Private & comp
 
 #endif
 }
+
+void CudaCompressor::compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+	nvDebugCheck(cuda::isHardwarePresent());
+#if defined HAVE_CUDA
+
+	// @@ TODO
+
+#else
+	if (outputOptions.errorHandler != NULL)
+	{
+		outputOptions.errorHandler->error(Error_CudaError);
+	}
+#endif
+}
+
+
diff --git a/src/nvtt/cuda/CudaCompressDXT.h b/src/nvtt/cuda/CudaCompressDXT.h
index 7567943..0dbca39 100644
--- a/src/nvtt/cuda/CudaCompressDXT.h
+++ b/src/nvtt/cuda/CudaCompressDXT.h
@@ -42,10 +42,12 @@ namespace nv
 		void setImage(const Image * image, nvtt::AlphaMode alphaMode);
 
 		void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+		void compressDXT1_Tex(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 		void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 		void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 		void compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 		void compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+		void compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 
 	private:
 
diff --git a/src/nvtt/cuda/CudaMath.h b/src/nvtt/cuda/CudaMath.h
index 88cd233..17ed6ff 100644
--- a/src/nvtt/cuda/CudaMath.h
+++ b/src/nvtt/cuda/CudaMath.h
@@ -26,7 +26,6 @@
 #ifndef CUDAMATH_H
 #define CUDAMATH_H
 
-#include <float.h>
 
 inline __device__ __host__ float3 operator *(float3 a, float3 b)
 {
@@ -211,7 +210,7 @@ inline __device__ bool singleColor(const float3 * colors)
-	bool sameColor = false;
+	bool sameColor = true;
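+	// The loop below ANDs 16 comparisons into sameColor, so it must start
+	// true and only be cleared when a color differs from colors[0].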
for (int i = 0; i < 16; i++) { - sameColor &= (colors[idx] == colors[0]); + sameColor &= (colors[i] == colors[0]); } return sameColor; #else @@ -232,16 +231,16 @@ inline __device__ bool singleColor(const float3 * colors) inline __device__ void colorSums(const float3 * colors, float3 * sums) { #if __DEVICE_EMULATION__ - float3 color_sum = make_float3(0.0f, 0.0f, 0.0f); - for (int i = 0; i < 16; i++) - { - color_sum += colors[i]; - } + float3 color_sum = make_float3(0.0f, 0.0f, 0.0f); + for (int i = 0; i < 16; i++) + { + color_sum += colors[i]; + } - for (int i = 0; i < 16; i++) - { - sums[i] = color_sum; - } + for (int i = 0; i < 16; i++) + { + sums[i] = color_sum; + } #else const int idx = threadIdx.x; @@ -327,7 +326,7 @@ inline __device__ __host__ float2 firstEigenVector2D( float matrix[3] ) inline __device__ void colorSums(const float2 * colors, float2 * sums) { #if __DEVICE_EMULATION__ - float2 color_sum = make_float2(0.0f, 0.0f, 0.0f); + float2 color_sum = make_float2(0.0f, 0.0f); for (int i = 0; i < 16; i++) { color_sum += colors[i]; @@ -360,7 +359,7 @@ inline __device__ float2 bestFitLine(const float2 * colors, float2 color_sum) float2 a = (colors[i] - color_sum * (1.0f / 16.0f)); covariance[0] += a.x * a.x; covariance[1] += a.x * a.y; - covariance[3] += a.y * a.y; + covariance[2] += a.y * a.y; } #else
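+	// covariance = { sum(a.x*a.x), sum(a.x*a.y), sum(a.y*a.y) }: the three
+	// unique entries of the symmetric 2x2 matrix passed to firstEigenVector2D.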