Precompute fast cluster fit factors and store them as static const.

nvtt is completely reentrant now. Fixes issue 37.
Clean up the interface of the CUDA compressors.
pull/216/head
castano 16 years ago
parent 91eb30667f
commit 1df69495fc
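
The interface cleanup mentioned in the commit message splits each CUDA compression call into two steps: callers bind the input once with setImage, and the format-specific compressors read it from member state. A minimal sketch of the new calling sequence, using the names from the hunks below (surrounding setup assumed):

    // Bind the input image and alpha mode once; the compressor keeps them
    // as member state (m_image, m_alphaMode).
    cuda->setImage(image, inputOptions.alphaMode);

    // Format-specific entry points no longer take the image or input options.
    cuda->compressDXT1(compressionOptions, outputOptions);
    cuda->compressDXT5(compressionOptions, outputOptions);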

@@ -199,18 +199,6 @@ void nv::fastCompressBC5(const Image * image, const nvtt::OutputOptions::Private
}
void nv::doPrecomputation()
{
static bool done = false; // @@ Stop using statics for reentrancy. Although the worst that could happen is that this stuff is precomputed multiple times.
if (!done)
{
done = true;
squish::FastClusterFit::DoPrecomputation();
}
}
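
The lazily initialized flag above is exactly the hazard the TODO comment calls out: two threads entering doPrecomputation together can both observe done == false and initialize the shared tables concurrently. For illustration only, a thread-safe variant of the runtime approach (std::call_once is C++11 and postdates this code):

    #include <mutex>

    void nv::doPrecomputation()
    {
        static std::once_flag s_once;
        // call_once guarantees exactly-once initialization even under
        // concurrent first calls.
        std::call_once(s_once, &squish::FastClusterFit::DoPrecomputation);
    }

The commit takes the stronger route: it deletes the runtime step entirely and bakes the tables in as compile-time constants (see the generator and fastclusterlookup.inl further down), leaving nothing to race on.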
void nv::compressDXT1(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
{
const uint w = image->width();
@@ -219,8 +207,6 @@ void nv::compressDXT1(const Image * image, const OutputOptions::Private & output
ColorBlock rgba;
BlockDXT1 block;
doPrecomputation();
//squish::WeightedClusterFit fit;
//squish::ClusterFit fit;
squish::FastClusterFit fit;
@@ -363,8 +349,6 @@ void nv::compressDXT5n(const Image * image, const OutputOptions::Private & outpu
ColorBlock rgba;
BlockDXT5 block;
doPrecomputation();
for (uint y = 0; y < h; y += 4) {
for (uint x = 0; x < w; x += 4) {

@@ -725,7 +725,8 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
if (cudaEnabled)
{
nvDebugCheck(cudaSupported);
cuda->compressDXT1(image, compressionOptions, outputOptions);
cuda->setImage(image, inputOptions.alphaMode);
cuda->compressDXT1(compressionOptions, outputOptions);
}
else
{
@@ -757,7 +758,8 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
if (cudaEnabled)
{
nvDebugCheck(cudaSupported);
cuda->compressDXT1n(image, compressionOptions, outputOptions);
cuda->setImage(image, inputOptions.alphaMode);
cuda->compressDXT1n(compressionOptions, outputOptions);
}
else
{
@@ -775,7 +777,8 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
if (cudaEnabled)
{
nvDebugCheck(cudaSupported);
cuda->compressDXT3(image, inputOptions, compressionOptions, outputOptions);
cuda->setImage(image, inputOptions.alphaMode);
cuda->compressDXT3(compressionOptions, outputOptions);
}
else
{
@@ -794,7 +797,8 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
if (cudaEnabled)
{
nvDebugCheck(cudaSupported);
cuda->compressDXT5(image, inputOptions, compressionOptions, outputOptions);
cuda->setImage(image, inputOptions.alphaMode);
cuda->compressDXT5(compressionOptions, outputOptions);
}
else
{
@@ -826,7 +830,8 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
if (cudaEnabled)
{
nvDebugCheck(cudaSupported);
cuda->compressCTX1(image, compressionOptions, outputOptions);
cuda->setImage(image, inputOptions.alphaMode);
cuda->compressCTX1(compressionOptions, outputOptions);
}
else
{

@@ -213,21 +213,27 @@ void CudaCompressor::compressKernel(CudaCompressionKernel * kernel)
#endif // 0
void CudaCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode)
{
m_image = image;
m_alphaMode = alphaMode;
}
/// Compress image using CUDA.
void CudaCompressor::compressDXT1(const Image * image, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
void CudaCompressor::compressDXT1(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
{
nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA
// Image size in blocks.
const uint w = (image->width() + 3) / 4;
const uint h = (image->height() + 3) / 4;
const uint w = (m_image->width() + 3) / 4;
const uint h = (m_image->height() + 3) / 4;
uint imageSize = w * h * 16 * sizeof(Color32);
uint * blockLinearImage = (uint *) malloc(imageSize);
convertToBlockLinear(image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU!
convertToBlockLinear(m_image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU!
const uint blockNum = w * h;
const uint compressedSize = blockNum * 8;
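
The buffer sizes above follow directly from the block layout: each 4x4 block stages 16 Color32 values (64 bytes) in block-linear order and compresses to an 8-byte DXT1 block. As a sanity check for a hypothetical 256x256 input (illustrative numbers, not from the diff):

    const uint w = (256 + 3) / 4;                 // 64 blocks across
    const uint h = (256 + 3) / 4;                 // 64 blocks down
    const uint blockNum = w * h;                  // 4096 blocks
    const uint imageSize = blockNum * 16 * sizeof(Color32); // 262144 bytes staged
    const uint compressedSize = blockNum * 8;     // 32768 bytes out, 8:1 vs. RGBA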
@@ -286,18 +292,18 @@ void CudaCompressor::compressDXT1(const Image * image, const CompressionOptions:
/// Compress image using CUDA.
void CudaCompressor::compressDXT3(const Image * image, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
{
nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA
// Image size in blocks.
const uint w = (image->width() + 3) / 4;
const uint h = (image->height() + 3) / 4;
const uint w = (m_image->width() + 3) / 4;
const uint h = (m_image->height() + 3) / 4;
uint imageSize = w * h * 16 * sizeof(Color32);
uint * blockLinearImage = (uint *) malloc(imageSize);
convertToBlockLinear(image, blockLinearImage);
convertToBlockLinear(m_image, blockLinearImage);
const uint blockNum = w * h;
const uint compressedSize = blockNum * 8;
@@ -370,18 +376,18 @@ void CudaCompressor::compressDXT3(const Image * image, const InputOptions::Priva
/// Compress image using CUDA.
void CudaCompressor::compressDXT5(const Image * image, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
void CudaCompressor::compressDXT5(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
{
nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA
// Image size in blocks.
const uint w = (image->width() + 3) / 4;
const uint h = (image->height() + 3) / 4;
const uint w = (m_image->width() + 3) / 4;
const uint h = (m_image->height() + 3) / 4;
uint imageSize = w * h * 16 * sizeof(Color32);
uint * blockLinearImage = (uint *) malloc(imageSize);
convertToBlockLinear(image, blockLinearImage);
convertToBlockLinear(m_image, blockLinearImage);
const uint blockNum = w * h;
const uint compressedSize = blockNum * 8;
@@ -453,18 +459,18 @@ void CudaCompressor::compressDXT5(const Image * image, const InputOptions::Priva
}
void CudaCompressor::compressDXT1n(const Image * image, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
void CudaCompressor::compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
{
nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA
// Image size in blocks.
const uint w = (image->width() + 3) / 4;
const uint h = (image->height() + 3) / 4;
const uint w = (m_image->width() + 3) / 4;
const uint h = (m_image->height() + 3) / 4;
uint imageSize = w * h * 16 * sizeof(Color32);
uint * blockLinearImage = (uint *) malloc(imageSize);
convertToBlockLinear(image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU!
convertToBlockLinear(m_image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU!
const uint blockNum = w * h;
const uint compressedSize = blockNum * 8;
@@ -522,18 +528,18 @@ void CudaCompressor::compressDXT1n(const Image * image, const nvtt::CompressionO
}
void CudaCompressor::compressCTX1(const Image * image, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
void CudaCompressor::compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
{
nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA
// Image size in blocks.
const uint w = (image->width() + 3) / 4;
const uint h = (image->height() + 3) / 4;
const uint w = (m_image->width() + 3) / 4;
const uint h = (m_image->height() + 3) / 4;
uint imageSize = w * h * 16 * sizeof(Color32);
uint * blockLinearImage = (uint *) malloc(imageSize);
convertToBlockLinear(image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU!
convertToBlockLinear(m_image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU!
const uint blockNum = w * h;
const uint compressedSize = blockNum * 8;
@@ -590,186 +596,3 @@ void CudaCompressor::compressCTX1(const Image * image, const nvtt::CompressionOp
#endif
}
#if 0
class Task
{
public:
explicit Task(uint numBlocks) : blockMaxCount(numBlocks), blockCount(0)
{
// System memory allocations.
blockLinearImage = new uint[blockMaxCount * 16];
xrefs = new uint[blockMaxCount * 16];
// Device memory allocations.
cudaMalloc((void**) &d_blockLinearImage, blockMaxCount * 16 * sizeof(uint));
cudaMalloc((void**) &d_compressedImage, blockMaxCount * 8U);
// @@ Check for allocation errors.
}
~Task()
{
delete [] blockLinearImage;
delete [] xrefs;
cudaFree(d_blockLinearImage);
cudaFree(d_compressedImage);
}
void addColorBlock(const ColorBlock & rgba)
{
nvDebugCheck(!isFull());
// @@ Count unique colors?
/*
// Convert colors to vectors.
Array<Vector3> pointArray(16);
for(int i = 0; i < 16; i++) {
const Color32 color = rgba.color(i);
pointArray.append(Vector3(color.r, color.g, color.b));
}
// Find best fit line.
const Vector3 axis = Fit::bestLine(pointArray).direction();
// Project points to axis.
float dps[16];
uint * order = &xrefs[blockCount * 16];
for (uint i = 0; i < 16; ++i)
{
dps[i] = dot(pointArray[i], axis);
order[i] = i;
}
// Sort them.
for (uint i = 0; i < 16; ++i)
{
for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j)
{
swap(dps[j], dps[j - 1]);
swap(order[j], order[j - 1]);
}
}
*/
// Write sorted colors to blockLinearImage.
for(uint i = 0; i < 16; ++i)
{
// blockLinearImage[blockCount * 16 + i] = rgba.color(order[i]);
blockLinearImage[blockCount * 16 + i] = rgba.color(i);
}
++blockCount;
}
bool isFull()
{
nvDebugCheck(blockCount <= blockMaxCount);
return blockCount == blockMaxCount;
}
void flush(const OutputOptions::Private & outputOptions)
{
if (blockCount == 0)
{
// Nothing to do.
return;
}
// Copy input color blocks.
cudaMemcpy(d_blockLinearImage, blockLinearImage, blockCount * 64, cudaMemcpyHostToDevice);
// Launch kernel.
compressKernelDXT1(blockCount, d_blockLinearImage, d_compressedImage, d_bitmaps);
// Check for errors.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
if (outputOptions.errorHandler != NULL)
{
outputOptions.errorHandler->error(Error_CudaError);
}
}
// Copy result to host, overwrite swizzled image.
uint * compressedImage = blockLinearImage;
cudaMemcpy(compressedImage, d_compressedImage, blockCount * 8, cudaMemcpyDeviceToHost);
// @@ Sort block indices.
// Output result.
if (outputOptions.outputHandler != NULL)
{
// outputOptions.outputHandler->writeData(compressedImage, blockCount * 8);
}
blockCount = 0;
}
private:
const uint blockMaxCount;
uint blockCount;
uint * blockLinearImage;
uint * xrefs;
uint * d_blockLinearImage;
uint * d_compressedImage;
};
void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
{
#if defined HAVE_CUDA
const uint w = image->width();
const uint h = image->height();
const uint blockNum = ((w + 3) / 4) * ((h + 3) / 4);
const uint blockMax = 32768; // 49152, 65535
setupCompressKernelDXT1(compressionOptions.colorWeight.ptr());
ColorBlock rgba;
Task task(min(blockNum, blockMax));
clock_t start = clock();
for (uint y = 0; y < h; y += 4) {
for (uint x = 0; x < w; x += 4) {
rgba.init(image, x, y);
task.addColorBlock(rgba);
if (task.isFull())
{
task.flush(outputOptions);
}
}
}
task.flush(outputOptions);
clock_t end = clock();
printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
#else
if (outputOptions.errorHandler != NULL)
{
outputOptions.errorHandler->error(Error_CudaError);
}
#endif
}
#endif // 0
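
The disabled Task path above batches at most blockMax = 32768 blocks per kernel launch, which bounds device memory at 32768 * 64 bytes = 2 MB of staged colors plus 256 KB of compressed output regardless of image size; larger images simply flush the batch repeatedly. The commented-out insertion sort in addColorBlock sketches a further optimization: pre-ordering each block's colors along its best-fit axis on the CPU so the kernel would not have to.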

@@ -39,17 +39,22 @@ namespace nv
bool isValid() const;
void compressDXT1(const Image * image, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
void compressDXT3(const Image * image, const nvtt::InputOptions::Private & inputOptions, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
void compressDXT5(const Image * image, const nvtt::InputOptions::Private & inputOptions, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
void compressDXT1n(const Image * image, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
void compressCTX1(const Image * image, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
void setImage(const Image * image, nvtt::AlphaMode alphaMode);
void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
void compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
void compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
private:
uint * m_bitmapTable;
uint * m_data;
uint * m_result;
const Image * m_image;
nvtt::AlphaMode m_alphaMode;
};
} // nv namespace

@@ -0,0 +1,113 @@
/* -----------------------------------------------------------------------------
Copyright (c) 2006 Simon Brown si@sjbrown.co.uk
Copyright (c) 2008 Ignacio Castano castano@gmail.com
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------- */
#include <stdio.h>
#include <float.h>
#include <math.h>
struct Precomp {
float alpha2_sum;
float beta2_sum;
float alphabeta_sum;
float factor;
};
int main()
{
int i = 0;
printf("struct Precomp {\n");
printf("\tfloat alpha2_sum;\n");
printf("\tfloat beta2_sum;\n");
printf("\tfloat alphabeta_sum;\n");
printf("\tfloat factor;\n");
printf("};\n\n");
printf("static const SQUISH_ALIGN_16 Precomp s_threeElement[153] = {\n");
// Three element clusters:
for( int c0 = 0; c0 <= 16; c0++) // At least two clusters.
{
for( int c1 = 0; c1 <= 16-c0; c1++)
{
int c2 = 16 - c0 - c1;
Precomp p;
p.alpha2_sum = c0 + c1 * 0.25f;
p.beta2_sum = c2 + c1 * 0.25f;
p.alphabeta_sum = c1 * 0.25f;
p.factor = 1.0f / (p.alpha2_sum * p.beta2_sum - p.alphabeta_sum * p.alphabeta_sum);
if (isfinite(p.factor))
{
printf("\t{ %f, %f, %f, %f }, // %d (%d %d %d)\n", p.alpha2_sum, p.beta2_sum, p.alphabeta_sum, p.factor, i, c0, c1, c2);
}
else
{
printf("\t{ %f, %f, %f, INFINITY }, // %d (%d %d %d)\n", p.alpha2_sum, p.beta2_sum, p.alphabeta_sum, i, c0, c1, c2);
}
i++;
}
}
printf("}; // %d three cluster elements\n\n", i);
printf("static const SQUISH_ALIGN_16 Precomp s_fourElement[969] = {\n");
// Four element clusters:
i = 0;
for( int c0 = 0; c0 <= 16; c0++)
{
for( int c1 = 0; c1 <= 16-c0; c1++)
{
for( int c2 = 0; c2 <= 16-c0-c1; c2++)
{
int c3 = 16 - c0 - c1 - c2;
Precomp p;
p.alpha2_sum = c0 + c1 * (4.0f/9.0f) + c2 * (1.0f/9.0f);
p.beta2_sum = c3 + c2 * (4.0f/9.0f) + c1 * (1.0f/9.0f);
p.alphabeta_sum = (c1 + c2) * (2.0f/9.0f);
p.factor = 1.0f / (p.alpha2_sum * p.beta2_sum - p.alphabeta_sum * p.alphabeta_sum);
if (isfinite(p.factor))
{
printf("\t{ %f, %f, %f, %f }, // %d (%d %d %d %d)\n", p.alpha2_sum, p.beta2_sum, p.alphabeta_sum, p.factor, i, c0, c1, c2, c3);
}
else
{
printf("\t{ %f, %f, %f, INFINITY }, // %d (%d %d %d %d)\n", p.alpha2_sum, p.beta2_sum, p.alphabeta_sum, i, c0, c1, c2, c3);
}
i++;
}
}
}
printf("}; // %d four cluster elements\n\n", i);
return 0;
}
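
The factor column being tabulated is the inverse determinant of the 2x2 normal equations that cluster fit solves for the two endpoints. Sketching the standard least-squares derivation with the generator's naming: each of the 16 pixels x_i gets interpolation weights (alpha_i, beta_i) with alpha_i + beta_i = 1 ({1, 1/2, 0} in three-color mode, {1, 2/3, 1/3, 0} in four-color mode, which is where the 0.25 and the 4/9, 1/9, 2/9 terms come from), and minimizing sum_i |alpha_i * a + beta_i * b - x_i|^2 over the endpoints a, b gives

    [ alpha2_sum     alphabeta_sum ] [ a ]   [ sum_i alpha_i * x_i ]
    [ alphabeta_sum  beta2_sum     ] [ b ] = [ sum_i beta_i  * x_i ]

    factor = 1 / (alpha2_sum * beta2_sum - alphabeta_sum^2)

The determinant vanishes exactly when every pixel lands in a single endpoint cluster, hence the isfinite check and the INFINITY sentinel. The table sizes are stars-and-bars counts of the cluster assignments: C(18,2) = 153 ways to split 16 pixels over three clusters, and C(19,3) = 969 over four.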

@@ -29,6 +29,8 @@
#include "colourblock.h"
#include <cfloat>
#include "fastclusterlookup.inl"
namespace squish {
FastClusterFit::FastClusterFit()
@@ -97,91 +99,6 @@ void FastClusterFit::SetColourSet( ColourSet const* colours, int flags )
}
struct Precomp {
float alpha2_sum;
float beta2_sum;
float alphabeta_sum;
float factor;
};
static SQUISH_ALIGN_16 Precomp s_threeElement[153];
static SQUISH_ALIGN_16 Precomp s_fourElement[969];
void FastClusterFit::DoPrecomputation()
{
int i = 0;
// Three element clusters:
for( int c0 = 0; c0 <= 16; c0++) // At least two clusters.
{
for( int c1 = 0; c1 <= 16-c0; c1++)
{
int c2 = 16 - c0 - c1;
/*if (c2 == 16) {
// a = b = x2 / 16
s_threeElement[i].alpha2_sum = 0;
s_threeElement[i].beta2_sum = 16;
s_threeElement[i].alphabeta_sum = -16;
s_threeElement[i].factor = 1.0f / 256.0f;
}
else if (c0 == 16) {
// a = b = x0 / 16
s_threeElement[i].alpha2_sum = 16;
s_threeElement[i].beta2_sum = 0;
s_threeElement[i].alphabeta_sum = -16;
s_threeElement[i].factor = 1.0f / 256.0f;
}
else*/ {
s_threeElement[i].alpha2_sum = c0 + c1 * 0.25f;
s_threeElement[i].beta2_sum = c2 + c1 * 0.25f;
s_threeElement[i].alphabeta_sum = c1 * 0.25f;
s_threeElement[i].factor = 1.0f / (s_threeElement[i].alpha2_sum * s_threeElement[i].beta2_sum - s_threeElement[i].alphabeta_sum * s_threeElement[i].alphabeta_sum);
}
i++;
}
}
//printf("%d three cluster elements\n", i);
// Four element clusters:
i = 0;
for( int c0 = 0; c0 <= 16; c0++)
{
for( int c1 = 0; c1 <= 16-c0; c1++)
{
for( int c2 = 0; c2 <= 16-c0-c1; c2++)
{
int c3 = 16 - c0 - c1 - c2;
/*if (c3 == 16) {
// a = b = x3 / 16
s_fourElement[i].alpha2_sum = 16.0f;
s_fourElement[i].beta2_sum = 0.0f;
s_fourElement[i].alphabeta_sum = -16.0f;
s_fourElement[i].factor = 1.0f / 256.0f;
}
else if (c0 == 16) {
// a = b = x0 / 16
s_fourElement[i].alpha2_sum = 0.0f;
s_fourElement[i].beta2_sum = 16.0f;
s_fourElement[i].alphabeta_sum = -16.0f;
s_fourElement[i].factor = 1.0f / 256.0f;
}
else*/ {
s_fourElement[i].alpha2_sum = c0 + c1 * (4.0f/9.0f) + c2 * (1.0f/9.0f);
s_fourElement[i].beta2_sum = c3 + c2 * (4.0f/9.0f) + c1 * (1.0f/9.0f);
s_fourElement[i].alphabeta_sum = (c1 + c2) * (2.0f/9.0f);
s_fourElement[i].factor = 1.0f / (s_fourElement[i].alpha2_sum * s_fourElement[i].beta2_sum - s_fourElement[i].alphabeta_sum * s_fourElement[i].alphabeta_sum);
}
i++;
}
}
}
//printf("%d four cluster elements\n", i);
}
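
The removed loop above and the new generator compute the same values; the difference is when they run. The checked-in fastclusterlookup.inl presumably begins along these lines (entries derived from the formulas above; illustrative only, the generated file is authoritative):

    static const SQUISH_ALIGN_16 Precomp s_threeElement[153] = {
        { 0.000000, 16.000000, 0.000000, INFINITY }, // 0 (0 0 16)
        { 0.250000, 15.250000, 0.250000, 0.266667 }, // 1 (0 1 15)
        { 0.500000, 14.500000, 0.500000, 0.142857 }, // 2 (0 2 14)
        // ... 150 more entries, followed by s_fourElement[969] ...
    };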
void FastClusterFit::SetMetric(float r, float g, float b)
{
#if SQUISH_USE_SIMD

@@ -44,8 +44,6 @@ public:
void SetMetric(float r, float g, float b);
float GetBestError() const;
static void DoPrecomputation();
// Make them public
virtual void Compress3( void* block );
virtual void Compress4( void* block );

File diff suppressed because it is too large.