Add DXT1n and CTX1 CUDA compressors.

2008-02-15 08:58:02 +00:00
parent c7fcc3ef4b
commit 5dbfb20b60
9 changed files with 2458 additions and 1695 deletions
--- a/src/nvtt/Compressor.cpp
+++ b/src/nvtt/Compressor.cpp
@ -56,7 +56,7 @@ namespace
 	
 	static int blockSize(Format format)
 	{
-		if (format == Format_DXT1 || format == Format_DXT1a) {
+		if (format == Format_DXT1 || format == Format_DXT1a || format == Format_DXT1n) {
 			return 8;
 		}
 		else if (format == Format_DXT3) {
@ -71,6 +71,9 @@ namespace
 		else if (format == Format_BC5) {
 			return 16;
 		}
+		else if (format == Format_CTX1) {
+			return 8;
+		}
 		return 0;
 	}

@ -333,7 +336,7 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
 	{
 		header.setLinearSize(computeImageSize(inputOptions.targetWidth, inputOptions.targetHeight, inputOptions.targetDepth, compressionOptions.bitcount, compressionOptions.format));
 		
-		if (compressionOptions.format == Format_DXT1 || compressionOptions.format == Format_DXT1a) {
+		if (compressionOptions.format == Format_DXT1 || compressionOptions.format == Format_DXT1a || compressionOptions.format == Format_DXT1n) {
 			header.setFourCC('D', 'X', 'T', '1');
 			if (inputOptions.isNormalMap) header.setNormalFlag(true);
 		}
@ -354,6 +357,10 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
 			header.setFourCC('A', 'T', 'I', '2');
 			if (inputOptions.isNormalMap) header.setNormalFlag(true);
 		}
+		else if (compressionOptions.format == Format_CTX1) {
+			header.setFourCC('C', 'T', 'X', '1');
+			if (inputOptions.isNormalMap) header.setNormalFlag(true);
+		}
 	}
 	
 	// Swap bytes if necessary.
@ -705,6 +712,18 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
 			}
 		}
 	}
+	else if (compressionOptions.format == Format_DXT1n)
+	{
+		if (cudaEnabled)
+		{
+			nvDebugCheck(cudaSupported);
+			cuda->compressDXT1n(image, outputOptions, compressionOptions);
+		}
+		else
+		{
+			if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature);
+		}
+	}
 	else if (compressionOptions.format == Format_DXT3)
 	{
 		if (compressionOptions.quality == Quality_Fastest)
@ -762,6 +781,18 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
 	{
 		compressBC5(image, outputOptions, compressionOptions);
 	}
+	else if (compressionOptions.format == Format_CTX1)
+	{
+		if (cudaEnabled)
+		{
+			nvDebugCheck(cudaSupported);
+			cuda->compressCTX1(image, outputOptions, compressionOptions);
+		}
+		else
+		{
+			if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature);
+		}
+	}

 	return true;
 }
--- a/src/nvtt/cuda/Bitmaps.h
+++ b/src/nvtt/cuda/Bitmaps.h
@ -122,7 +122,7 @@ static void doPrecomputation()
 */


-const static uint bitmaps[992] =
+const static uint s_bitmapTable[992] =
 {
 	0x80000000,
 	0x40000000,
--- a/src/nvtt/cuda/CompressKernel.cu
+++ b/src/nvtt/cuda/CompressKernel.cu
--- a/src/nvtt/cuda/CudaCompressDXT.cpp
+++ b/src/nvtt/cuda/CudaCompressDXT.cpp
--- a/src/nvtt/cuda/CudaCompressDXT.h
+++ b/src/nvtt/cuda/CudaCompressDXT.h
@ -40,6 +40,8 @@ namespace nv
 		void compressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
 		void compressDXT3(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
 		void compressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+		void compressDXT1n(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+		void compressCTX1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);

 	private:

--- a/src/nvtt/cuda/CudaMath.h
+++ b/src/nvtt/cuda/CudaMath.h
@ -1,221 +1,363 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-// Math functions and operators to be used with vector types.
-
-#ifndef CUDAMATH_H
-#define CUDAMATH_H
-
-#include <float.h>
-
-
-inline __device__ __host__ float3 operator *(float3 a, float3 b)
-{
-    return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
-}
-
-inline __device__ __host__ float3 operator *(float f, float3 v)
-{
-    return make_float3(v.x*f, v.y*f, v.z*f);
-}
-
-inline __device__ __host__ float3 operator *(float3 v, float f)
-{
-    return make_float3(v.x*f, v.y*f, v.z*f);
-}
-
-inline __device__ __host__ float3 operator +(float3 a, float3 b)
-{
-    return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
-}
-
-inline __device__ __host__ void operator +=(float3 & b, float3 a)
-{
-    b.x += a.x;
-    b.y += a.y;
-    b.z += a.z;
-}
-
-inline __device__ __host__ float3 operator -(float3 a, float3 b)
-{
-    return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
-}
-
-inline __device__ __host__ void operator -=(float3 & b, float3 a)
-{
-    b.x -= a.x;
-    b.y -= a.y;
-    b.z -= a.z;
-}
-
-inline __device__ __host__ float3 operator /(float3 v, float f)
-{
-    float inv = 1.0f / f;
-    return v * inv;
-}
-
-inline __device__ __host__ void operator /=(float3 & b, float f)
-{
-    float inv = 1.0f / f;
-    b.x *= inv;
-    b.y *= inv;
-    b.z *= inv;
-}
-
-
-inline __device__ __host__ float dot(float3 a, float3 b)
-{
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-
-inline __device__ __host__ float dot(float4 a, float4 b)
-{
-    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
-}
-
-inline __device__ __host__ float clamp(float f, float a, float b)
-{
-    return max(a, min(f, b));
-}
-
-inline __device__ __host__ float3 clamp(float3 v, float a, float b)
-{
-    return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
-}
-
-inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
-{
-    return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
-}
-
-
-inline __device__ __host__ float3 normalize(float3 v)
-{
-    float len = 1.0f / sqrtf(dot(v, v));
-    return make_float3(v.x * len, v.y * len, v.z * len);
-}
-
-
-
-
-// Use power method to find the first eigenvector.
-// http://www.miislita.com/information-retrieval-tutorial/matrix-tutorial-3-eigenvalues-eigenvectors.html
-inline __device__ __host__ float3 firstEigenVector( float matrix[6] )
-{
-	// 8 iterations seems to be more than enough.
-
-	float3 v = make_float3(1.0f, 1.0f, 1.0f);
-	for(int i = 0; i < 8; i++) {
-		float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
-		float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
-		float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
-		float m = max(max(x, y), z);        
-		float iv = 1.0f / m;
-		#if __DEVICE_EMULATION__
-		if (m == 0.0f) iv = 0.0f;
-		#endif
-		v = make_float3(x*iv, y*iv, z*iv);
-	}
-
-	return v;
-}
-
-inline __device__ void colorSums(const float3 * colors, float3 * sums)
-{
-#if __DEVICE_EMULATION__
-	float3 color_sum = make_float3(0.0f, 0.0f, 0.0f);
-	for (int i = 0; i < 16; i++)
-	{
-		color_sum += colors[i];
-	}
-
-	for (int i = 0; i < 16; i++)
-	{
-		sums[i] = color_sum;
-	}
-#else
-
-	const int idx = threadIdx.x;
-
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+// Math functions and operators to be used with vector types.
+
+#ifndef CUDAMATH_H
+#define CUDAMATH_H
+
+#include <float.h>
+
+
+inline __device__ __host__ float3 operator *(float3 a, float3 b)
+{
+    return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+}
+
+inline __device__ __host__ float3 operator *(float f, float3 v)
+{
+    return make_float3(v.x*f, v.y*f, v.z*f);
+}
+
+inline __device__ __host__ float3 operator *(float3 v, float f)
+{
+    return make_float3(v.x*f, v.y*f, v.z*f);
+}
+
+inline __device__ __host__ float3 operator +(float3 a, float3 b)
+{
+    return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+}
+
+inline __device__ __host__ void operator +=(float3 & b, float3 a)
+{
+    b.x += a.x;
+    b.y += a.y;
+    b.z += a.z;
+}
+
+inline __device__ __host__ float3 operator -(float3 a, float3 b)
+{
+    return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+}
+
+inline __device__ __host__ void operator -=(float3 & b, float3 a)
+{
+    b.x -= a.x;
+    b.y -= a.y;
+    b.z -= a.z;
+}
+
+inline __device__ __host__ float3 operator /(float3 v, float f)
+{
+    float inv = 1.0f / f;
+    return v * inv;
+}
+
+inline __device__ __host__ void operator /=(float3 & b, float f)
+{
+    float inv = 1.0f / f;
+    b.x *= inv;
+    b.y *= inv;
+    b.z *= inv;
+}
+
+// float2 operators
+inline __device__ __host__ float2 operator *(float2 a, float2 b)
+{
+    return make_float2(a.x*b.x, a.y*b.y);
+}
+
+inline __device__ __host__ float2 operator *(float f, float2 v)
+{
+    return make_float2(v.x*f, v.y*f);
+}
+
+inline __device__ __host__ float2 operator *(float2 v, float f)
+{
+    return make_float2(v.x*f, v.y*f);
+}
+
+inline __device__ __host__ float2 operator +(float2 a, float2 b)
+{
+    return make_float2(a.x+b.x, a.y+b.y);
+}
+
+inline __device__ __host__ void operator +=(float2 & b, float2 a)
+{
+    b.x += a.x;
+    b.y += a.y;
+}
+
+inline __device__ __host__ float2 operator -(float2 a, float2 b)
+{
+    return make_float2(a.x-b.x, a.y-b.y);
+}
+
+inline __device__ __host__ void operator -=(float2 & b, float2 a)
+{
+    b.x -= a.x;
+    b.y -= a.y;
+}
+
+inline __device__ __host__ float2 operator /(float2 v, float f)
+{
+    float inv = 1.0f / f;
+    return v * inv;
+}
+
+inline __device__ __host__ void operator /=(float2 & b, float f)
+{   // Divides both components of b by f, in place.
+    float inv = 1.0f / f;   // one reciprocal, then a multiply per component
+    b.x *= inv;
+    b.y *= inv;
+}
+
+
+inline __device__ __host__ float dot(float2 a, float2 b)
+{
+    return a.x * b.x + a.y * b.y;
+}
+
+inline __device__ __host__ float dot(float3 a, float3 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+inline __device__ __host__ float dot(float4 a, float4 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+inline __device__ __host__ float clamp(float f, float a, float b)
+{
+    return max(a, min(f, b));
+}
+
+inline __device__ __host__ float3 clamp(float3 v, float a, float b)
+{
+    return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+
+inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
+{
+    return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+
+
+inline __device__ __host__ float3 normalize(float3 v)
+{
+    float len = 1.0f / sqrtf(dot(v, v));
+    return make_float3(v.x * len, v.y * len, v.z * len);
+}
+
+
+
+
+// Use power method to find the first eigenvector.
+// http://www.miislita.com/information-retrieval-tutorial/matrix-tutorial-3-eigenvalues-eigenvectors.html
+inline __device__ __host__ float3 firstEigenVector( float matrix[6] )
+{
+	// 8 iterations seems to be more than enough.
+
+	float3 v = make_float3(1.0f, 1.0f, 1.0f);
+	for(int i = 0; i < 8; i++) {
+		float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
+		float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
+		float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
+		float m = max(max(x, y), z);        
+		float iv = 1.0f / m;
+		#if __DEVICE_EMULATION__
+		if (m == 0.0f) iv = 0.0f;
+		#endif
+		v = make_float3(x*iv, y*iv, z*iv);
+	}
+
+	return v;
+}
+
+inline __device__ void colorSums(const float3 * colors, float3 * sums)
+{
+#if __DEVICE_EMULATION__
+	float3 color_sum = make_float3(0.0f, 0.0f, 0.0f);
+	for (int i = 0; i < 16; i++)
+	{
+		color_sum += colors[i];
+	}
+
+	for (int i = 0; i < 16; i++)
+	{
+		sums[i] = color_sum;
+	}
+#else
+
+	const int idx = threadIdx.x;
+
 	sums[idx] = colors[idx];
 	sums[idx] += sums[idx^8];
 	sums[idx] += sums[idx^4];
 	sums[idx] += sums[idx^2];
 	sums[idx] += sums[idx^1];
-
-#endif
-}
-
-inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum, float3 colorMetric)
-{
-	// Compute covariance matrix of the given colors.
-#if __DEVICE_EMULATION__
-	float covariance[6] = {0, 0, 0, 0, 0, 0};
-	for (int i = 0; i < 16; i++)
-	{
-		float3 a = (colors[i] - color_sum * (1.0f / 16.0f)) * colorMetric;
-		covariance[0] += a.x * a.x;
-		covariance[1] += a.x * a.y;
-		covariance[2] += a.x * a.z;
-		covariance[3] += a.y * a.y;
-		covariance[4] += a.y * a.z;
-		covariance[5] += a.z * a.z;
-	}
-#else
-
-	const int idx = threadIdx.x;
-
-	float3 diff = (colors[idx] - color_sum * (1.0f / 16.0f)) * colorMetric;
-
-	// @@ Eliminate two-way bank conflicts here.
-	// @@ It seems that doing that and unrolling the reduction doesn't help...
-	__shared__ float covariance[16*6];
-
-	covariance[6 * idx + 0] = diff.x * diff.x;    // 0, 6, 12, 2, 8, 14, 4, 10, 0
-	covariance[6 * idx + 1] = diff.x * diff.y;
-	covariance[6 * idx + 2] = diff.x * diff.z;
-	covariance[6 * idx + 3] = diff.y * diff.y;
-	covariance[6 * idx + 4] = diff.y * diff.z;
-	covariance[6 * idx + 5] = diff.z * diff.z;
-
-	for(int d = 8; d > 0; d >>= 1)
-	{
-		if (idx < d)
-		{
-			covariance[6 * idx + 0] += covariance[6 * (idx+d) + 0];
-			covariance[6 * idx + 1] += covariance[6 * (idx+d) + 1];
-			covariance[6 * idx + 2] += covariance[6 * (idx+d) + 2];
-			covariance[6 * idx + 3] += covariance[6 * (idx+d) + 3];
-			covariance[6 * idx + 4] += covariance[6 * (idx+d) + 4];
-			covariance[6 * idx + 5] += covariance[6 * (idx+d) + 5];
-		}
-	}
-
-#endif
-
-	// Compute first eigen vector.
-	return firstEigenVector(covariance);
-}
-
-
-#endif // CUDAMATH_H
+
+#endif
+}
+
+inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum, float3 colorMetric)
+{
+	// Compute covariance matrix of the given colors.
+#if __DEVICE_EMULATION__
+	float covariance[6] = {0, 0, 0, 0, 0, 0};
+	for (int i = 0; i < 16; i++)
+	{
+		float3 a = (colors[i] - color_sum * (1.0f / 16.0f)) * colorMetric;
+		covariance[0] += a.x * a.x;
+		covariance[1] += a.x * a.y;
+		covariance[2] += a.x * a.z;
+		covariance[3] += a.y * a.y;
+		covariance[4] += a.y * a.z;
+		covariance[5] += a.z * a.z;
+	}
+#else
+
+	const int idx = threadIdx.x;
+
+	float3 diff = (colors[idx] - color_sum * (1.0f / 16.0f)) * colorMetric;
+
+	// @@ Eliminate two-way bank conflicts here.
+	// @@ It seems that doing that and unrolling the reduction doesn't help...
+	__shared__ float covariance[16*6];
+
+	covariance[6 * idx + 0] = diff.x * diff.x;    // 0, 6, 12, 2, 8, 14, 4, 10, 0
+	covariance[6 * idx + 1] = diff.x * diff.y;
+	covariance[6 * idx + 2] = diff.x * diff.z;
+	covariance[6 * idx + 3] = diff.y * diff.y;
+	covariance[6 * idx + 4] = diff.y * diff.z;
+	covariance[6 * idx + 5] = diff.z * diff.z;
+
+	for(int d = 8; d > 0; d >>= 1)
+	{
+		if (idx < d)
+		{
+			covariance[6 * idx + 0] += covariance[6 * (idx+d) + 0];
+			covariance[6 * idx + 1] += covariance[6 * (idx+d) + 1];
+			covariance[6 * idx + 2] += covariance[6 * (idx+d) + 2];
+			covariance[6 * idx + 3] += covariance[6 * (idx+d) + 3];
+			covariance[6 * idx + 4] += covariance[6 * (idx+d) + 4];
+			covariance[6 * idx + 5] += covariance[6 * (idx+d) + 5];
+		}
+	}
+
+#endif
+
+	// Compute first eigen vector.
+	return firstEigenVector(covariance);
+}
+
+// @@ For 2D this may not be the most efficient method. It's a quadratic equation, right?
+inline __device__ __host__ float2 firstEigenVector2D( float matrix[3] )
+{
+	// @@ 8 iterations is probably more than enough.
+
+	float2 v = make_float2(1.0f, 1.0f);
+	for(int i = 0; i < 8; i++) {
+		float x = v.x * matrix[0] + v.y * matrix[1];
+		float y = v.x * matrix[1] + v.y * matrix[2];
+		float m = max(x, y);        
+		float iv = 1.0f / m;
+		#if __DEVICE_EMULATION__
+		if (m == 0.0f) iv = 0.0f;
+		#endif
+		v = make_float2(x*iv, y*iv);
+	}
+
+	return v;
+}
+
+inline __device__ void colorSums(const float2 * colors, float2 * sums)
+{	// Two-component variant: sums[] receives the total of the 16 entries of colors[].
+#if __DEVICE_EMULATION__
+	float2 color_sum = make_float2(0.0f, 0.0f);	// float2 has two components only
+	for (int i = 0; i < 16; i++)
+	{
+		color_sum += colors[i];
+	}
+	// Every one of the 16 slots gets the same total.
+	for (int i = 0; i < 16; i++)
+	{
+		sums[i] = color_sum;
+	}
+#else
+	// Each thread works on the slot selected by threadIdx.x (values 0..15 assumed).
+	const int idx = threadIdx.x;
+	// XOR butterfly with partners 8, 4, 2, 1. NOTE(review): no barrier between steps; presumably relies on the 16 threads running in lockstep -- confirm.
+	sums[idx] = colors[idx];
+	sums[idx] += sums[idx^8];
+	sums[idx] += sums[idx^4];
+	sums[idx] += sums[idx^2];
+	sums[idx] += sums[idx^1];
+
+#endif
+}
+
+inline __device__ float2 bestFitLine(const float2 * colors, float2 color_sum)
+{	// Returns the dominant axis of 16 two-component colors; color_sum is their precomputed sum.
+	// Compute the symmetric 2x2 covariance matrix of the given colors, stored as {xx, xy, yy}.
+#if __DEVICE_EMULATION__
+	float covariance[3] = {0, 0, 0};
+	for (int i = 0; i < 16; i++)
+	{
+		float2 a = (colors[i] - color_sum * (1.0f / 16.0f));	// offset from the mean
+		covariance[0] += a.x * a.x;
+		covariance[1] += a.x * a.y;
+		covariance[2] += a.y * a.y;	// yy term lives in slot 2; index 3 is past the end of the array
+	}
+#else
+	// One thread per color: threadIdx.x selects the color (values 0..15 assumed).
+	const int idx = threadIdx.x;
+	// Offset of this thread's color from the mean.
+	float2 diff = (colors[idx] - color_sum * (1.0f / 16.0f));
+	// Three partial terms per thread, laid out as covariance[3 * thread + term].
+	__shared__ float covariance[16*3];
+
+	covariance[3 * idx + 0] = diff.x * diff.x;
+	covariance[3 * idx + 1] = diff.x * diff.y;
+	covariance[3 * idx + 2] = diff.y * diff.y;
+	// Halving reduction: the totals end up in covariance[0..2]. NOTE(review): no barrier between steps -- confirm this is safe for the launch configuration.
+	for(int d = 8; d > 0; d >>= 1)
+	{
+		if (idx < d)
+		{
+			covariance[3 * idx + 0] += covariance[3 * (idx+d) + 0];
+			covariance[3 * idx + 1] += covariance[3 * (idx+d) + 1];
+			covariance[3 * idx + 2] += covariance[3 * (idx+d) + 2];
+		}
+	}
+
+#endif
+
+	// Compute first eigen vector.
+	return firstEigenVector2D(covariance);
+}
+
+
+#endif // CUDAMATH_H
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@ -75,6 +75,9 @@ namespace nvtt
 		Format_BC3n = Format_DXT5n,
 		Format_BC4,     // ATI1
 		Format_BC5,     // 3DC, ATI2
+
+		Format_DXT1n,
+		Format_CTX1,
 	};
 	
 	/// Quality modes.
--- a/src/nvtt/tests/stress.cpp
+++ b/src/nvtt/tests/stress.cpp
@ -83,9 +83,102 @@ struct MyOutputHandler : public nvtt::OutputHandler

 };

+void precomp()
+{	// Prints a table of 2-bit-per-texel index bitmaps for every ordered split of 16 texels into four runs.
+	unsigned int bitmaps[1024];
+
+	int num = 0;
+
+	printf("{\n");
+	printf("\t%8X,\n", 0);
+
+	bitmaps[0] = 0;
+	// a <= b <= c are cumulative run boundaries: 680 combinations plus the leading zero entry.
+	num = 1;
+	for (int a = 1; a <= 15; a++)
+	{
+		  for (int b = a; b <= 15; b++)
+		  {
+				for (int c = b; c <= 15; c++)
+				{
+					int indices[16];
+
+					int i = 0;
+					for(; i < a; i++) {
+						indices[i] = 0;
+					}
+					for(; i < b; i++) {	// run [a, b); bounding by a+b would run past indices[15]
+						indices[i] = 2;
+					}
+					for(; i < c; i++) {	// run [b, c); bounding by a+b+c would run past indices[15]
+						indices[i] = 3;
+					}
+					for(; i < 16; i++) {
+						indices[i] = 1;
+					}
+
+					unsigned int bm = 0;
+					for(i = 0; i < 16; i++) {
+						bm |= (unsigned int)indices[i] << (i * 2);	// shift as unsigned: texel 15 occupies bits 30-31
+					}
+
+					printf("\t0x%8X, // %d %d %d %d\n", bm, a-0, b-a, c-b, 16-c);
+
+					bitmaps[num] = bm;
+					num++;
+				}
+		  }
+	}
+
+	printf("}\n");
+
+	printf("// num = %d\n", num);
+
+/*
+	for( int i = imax; i >= 0; --i )
+	{
+		// second cluster [i,j) is one third along
+		for( int m = i; m < 16; ++m )
+		{
+			indices[m] = 2;
+		}
+		const int jmax = ( i == 0 ) ? 15 : 16;
+		for( int j = jmax; j >= i; --j )
+		{
+			// third cluster [j,k) is two thirds along
+			for( int m = j; m < 16; ++m )
+			{
+				indices[m] = 3;
+			}
+			
+			int kmax = ( j == 0 ) ? 15 : 16;
+			for( int k = kmax; k >= j; --k )
+			{
+				// last cluster [k,n) is at the end
+				if( k < 16 )
+				{
+					indices[k] = 1;
+				}
+				
+				uint bitmap = 0;
+				
+				bool hasThree = false;
+				for(int p = 0; p < 16; p++) {
+					bitmap |= indices[p] << (p * 2);
+				}
+				
+				bitmaps[num] = bitmap;
+				num++;
+			}
+		}
+	}
+*/
+}

 int main(int argc, char *argv[])
 {
+	precomp();
+
 	nvtt::InputOptions inputOptions;
 	inputOptions.setTextureLayout(nvtt::TextureType_2D, 1024, 1024);

@ -98,6 +191,9 @@ int main(int argc, char *argv[])
 	inputOptions.setMipmapGeneration(false);

 	nvtt::CompressionOptions compressionOptions;
+//	compressionOptions.setFormat(nvtt::Format_DXT1);
+//	compressionOptions.setFormat(nvtt::Format_DXT1n);
+	compressionOptions.setFormat(nvtt::Format_CTX1);
 	
 	nvtt::OutputOptions outputOptions;
 	outputOptions.setOutputHeader(false);
--- a/src/nvtt/tools/imgdiff.cpp
+++ b/src/nvtt/tools/imgdiff.cpp
@ -130,10 +130,13 @@ struct NormalError

 	void done()
 	{
-		ade /= samples;
-		mse /= samples * 3;
-		rmse = sqrt(mse);
-		psnr = (rmse == 0) ? 999.0f : 20.0f * log10(255.0f / rmse);
+		if (samples)
+		{
+			ade /= samples;
+			mse /= samples * 3;
+			rmse = sqrt(mse);
+			psnr = (rmse == 0) ? 999.0f : 20.0f * log10(255.0f / rmse);
+		}
 	}

 	void print()