Precompute fast cluster fit factors, and store as static const.

nvtt is completely reentrant now. Fixes issue 37. Clean up the interface of the CUDA compressors.
2008-04-18 08:49:32 +00:00
parent 91eb30667f
commit 1df69495fc
8 changed files with 1296 additions and 316 deletions
--- a/src/nvtt/CompressDXT.cpp
+++ b/src/nvtt/CompressDXT.cpp
@ -199,18 +199,6 @@ void nv::fastCompressBC5(const Image * image, const nvtt::OutputOptions::Private
 }
 /// Runs squish::FastClusterFit::DoPrecomputation() the first time it is called.
 /// Later calls return immediately once the flag below has been raised.
 void nv::doPrecomputation()
 {
 	// @@ Stop using statics for reentrancy. This flag is not synchronised in any way.
 	static bool done = false;
 	if (!done)
 	{
 		squish::FastClusterFit::DoPrecomputation();
 		// Raise the flag only after the tables have been filled. Setting it first
 		// would let another caller skip the precomputation and read tables that are
 		// still being written; with this order the worst case is that the tables are
 		// computed more than once.
 		done = true;
 	}
 }
 void nv::compressDXT1(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
 {
 	const uint w = image->width();
@ -219,8 +207,6 @@ void nv::compressDXT1(const Image * image, const OutputOptions::Private & output
 	ColorBlock rgba;
 	BlockDXT1 block;
 	doPrecomputation();
 	//squish::WeightedClusterFit fit;
 	//squish::ClusterFit fit;
 	squish::FastClusterFit fit;
@ -363,8 +349,6 @@ void nv::compressDXT5n(const Image * image, const OutputOptions::Private & outpu
 	ColorBlock rgba;
 	BlockDXT5 block;
 	doPrecomputation();
 	for (uint y = 0; y < h; y += 4) {
 		for (uint x = 0; x < w; x += 4) {
--- a/src/nvtt/Compressor.cpp
+++ b/src/nvtt/Compressor.cpp
@ -725,7 +725,8 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
 			if (cudaEnabled)
 			{
 				nvDebugCheck(cudaSupported);
-				cuda->compressDXT1(image, compressionOptions, outputOptions);
+				cuda->setImage(image, inputOptions.alphaMode);
 				cuda->compressDXT1(compressionOptions, outputOptions);
 			}
 			else
 			{
@ -757,7 +758,8 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
 		if (cudaEnabled)
 		{
 			nvDebugCheck(cudaSupported);
-			cuda->compressDXT1n(image, compressionOptions, outputOptions);
+			cuda->setImage(image, inputOptions.alphaMode);	
 			cuda->compressDXT1n(compressionOptions, outputOptions);
 		}
 		else
 		{
@ -775,7 +777,8 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
 			if (cudaEnabled)
 			{
 				nvDebugCheck(cudaSupported);
-				cuda->compressDXT3(image, inputOptions, compressionOptions, outputOptions);
+				cuda->setImage(image, inputOptions.alphaMode);
 				cuda->compressDXT3(compressionOptions, outputOptions);
 			}
 			else
 			{
@ -794,7 +797,8 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
 			if (cudaEnabled)
 			{
 				nvDebugCheck(cudaSupported);
-				cuda->compressDXT5(image, inputOptions, compressionOptions, outputOptions);
+				cuda->setImage(image, inputOptions.alphaMode);
 				cuda->compressDXT5(compressionOptions, outputOptions);
 			}
 			else
 			{
@ -826,7 +830,8 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
 		if (cudaEnabled)
 		{
 			nvDebugCheck(cudaSupported);
-			cuda->compressCTX1(image, compressionOptions, outputOptions);
+			cuda->setImage(image, inputOptions.alphaMode);
 			cuda->compressCTX1(compressionOptions, outputOptions);
 		}
 		else
 		{
--- a/src/nvtt/cuda/CudaCompressDXT.cpp
+++ b/src/nvtt/cuda/CudaCompressDXT.cpp
@ -213,21 +213,27 @@ void CudaCompressor::compressKernel(CudaCompressionKernel * kernel)
 #endif // 0
 void CudaCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode)
 {
 	m_image = image;
 	m_alphaMode = alphaMode;
 }
 /// Compress image using CUDA.
-void CudaCompressor::compressDXT1(const Image * image, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+void CudaCompressor::compressDXT1(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
 {
 	nvDebugCheck(cuda::isHardwarePresent());
 #if defined HAVE_CUDA
 	// Image size in blocks.
-	const uint w = (image->width() + 3) / 4;
+	const uint w = (m_image->width() + 3) / 4;
-	const uint h = (image->height() + 3) / 4;
+	const uint h = (m_image->height() + 3) / 4;
 	uint imageSize = w * h * 16 * sizeof(Color32);
    uint * blockLinearImage = (uint *) malloc(imageSize);
-	convertToBlockLinear(image, blockLinearImage);	// @@ Do this in parallel with the GPU, or in the GPU!
+	convertToBlockLinear(m_image, blockLinearImage);	// @@ Do this in parallel with the GPU, or in the GPU!
 	const uint blockNum = w * h;
 	const uint compressedSize = blockNum * 8;
@ -286,18 +292,18 @@ void CudaCompressor::compressDXT1(const Image * image, const CompressionOptions:
 /// Compress image using CUDA.
-void CudaCompressor::compressDXT3(const Image * image, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
 {
 	nvDebugCheck(cuda::isHardwarePresent());
 #if defined HAVE_CUDA
 	// Image size in blocks.
-	const uint w = (image->width() + 3) / 4;
+	const uint w = (m_image->width() + 3) / 4;
-	const uint h = (image->height() + 3) / 4;
+	const uint h = (m_image->height() + 3) / 4;
 	uint imageSize = w * h * 16 * sizeof(Color32);
    uint * blockLinearImage = (uint *) malloc(imageSize);
-	convertToBlockLinear(image, blockLinearImage);
+	convertToBlockLinear(m_image, blockLinearImage);
 	const uint blockNum = w * h;
 	const uint compressedSize = blockNum * 8;
@ -370,18 +376,18 @@ void CudaCompressor::compressDXT3(const Image * image, const InputOptions::Priva
 /// Compress image using CUDA.
-void CudaCompressor::compressDXT5(const Image * image, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+void CudaCompressor::compressDXT5(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
 {
 	nvDebugCheck(cuda::isHardwarePresent());
 #if defined HAVE_CUDA
 	// Image size in blocks.
-	const uint w = (image->width() + 3) / 4;
+	const uint w = (m_image->width() + 3) / 4;
-	const uint h = (image->height() + 3) / 4;
+	const uint h = (m_image->height() + 3) / 4;
 	uint imageSize = w * h * 16 * sizeof(Color32);
    uint * blockLinearImage = (uint *) malloc(imageSize);
-	convertToBlockLinear(image, blockLinearImage);
+	convertToBlockLinear(m_image, blockLinearImage);
 	const uint blockNum = w * h;
 	const uint compressedSize = blockNum * 8;
@ -453,18 +459,18 @@ void CudaCompressor::compressDXT5(const Image * image, const InputOptions::Priva
 }
-void CudaCompressor::compressDXT1n(const Image * image, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+void CudaCompressor::compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
 {
 	nvDebugCheck(cuda::isHardwarePresent());
 #if defined HAVE_CUDA
 	// Image size in blocks.
-	const uint w = (image->width() + 3) / 4;
+	const uint w = (m_image->width() + 3) / 4;
-	const uint h = (image->height() + 3) / 4;
+	const uint h = (m_image->height() + 3) / 4;
 	uint imageSize = w * h * 16 * sizeof(Color32);
    uint * blockLinearImage = (uint *) malloc(imageSize);
-	convertToBlockLinear(image, blockLinearImage);	// @@ Do this in parallel with the GPU, or in the GPU!
+	convertToBlockLinear(m_image, blockLinearImage);	// @@ Do this in parallel with the GPU, or in the GPU!
 	const uint blockNum = w * h;
 	const uint compressedSize = blockNum * 8;
@ -522,18 +528,18 @@ void CudaCompressor::compressDXT1n(const Image * image, const nvtt::CompressionO
 }
-void CudaCompressor::compressCTX1(const Image * image, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+void CudaCompressor::compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
 {
 	nvDebugCheck(cuda::isHardwarePresent());
 #if defined HAVE_CUDA
 	// Image size in blocks.
-	const uint w = (image->width() + 3) / 4;
+	const uint w = (m_image->width() + 3) / 4;
-	const uint h = (image->height() + 3) / 4;
+	const uint h = (m_image->height() + 3) / 4;
 	uint imageSize = w * h * 16 * sizeof(Color32);
    uint * blockLinearImage = (uint *) malloc(imageSize);
-	convertToBlockLinear(image, blockLinearImage);	// @@ Do this in parallel with the GPU, or in the GPU!
+	convertToBlockLinear(m_image, blockLinearImage);	// @@ Do this in parallel with the GPU, or in the GPU!
 	const uint blockNum = w * h;
 	const uint compressedSize = blockNum * 8;
@ -590,186 +596,3 @@ void CudaCompressor::compressCTX1(const Image * image, const nvtt::CompressionOp
 #endif
 }
 #if 0
 class Task
 {
 public:
 	explicit Task(uint numBlocks) : blockMaxCount(numBlocks), blockCount(0)
 	{
 		// System memory allocations.
 		blockLinearImage = new uint[blockMaxCount * 16];
 		xrefs = new uint[blockMaxCount * 16];
 		// Device memory allocations.
 		cudaMalloc((void**) &d_blockLinearImage, blockMaxCount * 16 * sizeof(uint));
 		cudaMalloc((void**) &d_compressedImage, blockMaxCount * 8U);
 		// @@ Check for allocation errors.
 	}
 	~Task()
 	{
 		delete [] blockLinearImage;
 		delete [] xrefs;
 		cudaFree(d_blockLinearImage);
 		cudaFree(d_compressedImage);
 	}
 	void addColorBlock(const ColorBlock & rgba)
 	{
 		nvDebugCheck(!isFull());
 		// @@ Count unique colors?
 		/*
 		// Convert colors to vectors.
 		Array<Vector3> pointArray(16);
 		for(int i = 0; i < 16; i++) {
 			const Color32 color = rgba.color(i);
 			pointArray.append(Vector3(color.r, color.g, color.b));
 		}
 		// Find best fit line.
 		const Vector3 axis = Fit::bestLine(pointArray).direction();
 		// Project points to axis.
 		float dps[16];
 		uint * order = &xrefs[blockCount * 16];
 		for (uint i = 0; i < 16; ++i)
 		{
 			dps[i] = dot(pointArray[i], axis);
 			order[i] = i;
 		}
 		// Sort them.
 		for (uint i = 0; i < 16; ++i)
 		{
 			for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j)
 			{
 				swap(dps[j], dps[j - 1]);
 				swap(order[j], order[j - 1]);
 			}
 		}
 		*/
 		// Write sorted colors to blockLinearImage.
 		for(uint i = 0; i < 16; ++i)
 		{
 		//	blockLinearImage[blockCount * 16 + i] = rgba.color(order[i]);
 			blockLinearImage[blockCount * 16 + i] = rgba.color(i);
 		}
 		++blockCount;
 	}
 	bool isFull()
 	{
 		nvDebugCheck(blockCount <= blockMaxCount);
 		return blockCount == blockMaxCount;
 	}
 	void flush(const OutputOptions::Private & outputOptions)
 	{
 		if (blockCount == 0)
 		{
 			// Nothing to do.
 			return;
 		}
 		// Copy input color blocks.
 		cudaMemcpy(d_blockLinearImage, blockLinearImage, blockCount * 64, cudaMemcpyHostToDevice);
 		// Launch kernel.
 		compressKernelDXT1(blockCount, d_blockLinearImage, d_compressedImage, d_bitmaps);
 		// Check for errors.
 		cudaError_t err = cudaGetLastError();
 		if (err != cudaSuccess)
 		{
 			nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
 			if (outputOptions.errorHandler != NULL)
 			{
 				outputOptions.errorHandler->error(Error_CudaError);
 			}
 		}
 		// Copy result to host, overwrite swizzled image.
 		uint * compressedImage = blockLinearImage;
 		cudaMemcpy(compressedImage, d_compressedImage, blockCount * 8, cudaMemcpyDeviceToHost);
 		// @@ Sort block indices.
 		// Output result.
 		if (outputOptions.outputHandler != NULL)
 		{
 		//	outputOptions.outputHandler->writeData(compressedImage, blockCount * 8);
 		}
 		blockCount = 0;
 	}
 private:
 	const uint blockMaxCount;
 	uint blockCount;
 	uint * blockLinearImage;
 	uint * xrefs;
 	uint * d_blockLinearImage;
 	uint * d_compressedImage;
 };
 /// Batched DXT1 compression: walks the image in 4x4 blocks, feeds them to a Task and
 /// flushes the Task whenever it fills up. Prints the elapsed time when done.
 /// Without HAVE_CUDA it only reports Error_CudaError through the error handler.
 void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
 {
 #if defined HAVE_CUDA
 	const uint imageWidth = image->width();
 	const uint imageHeight = image->height();
 	// Block counts per axis, rounding partial blocks up.
 	const uint blocksAcross = (imageWidth + 3) / 4;
 	const uint blocksDown = (imageHeight + 3) / 4;
 	const uint totalBlocks = blocksAcross * blocksDown;
 	const uint batchLimit = 32768; // 49152, 65535
 	setupCompressKernelDXT1(compressionOptions.colorWeight.ptr());
 	ColorBlock rgba;
 	Task task(min(totalBlocks, batchLimit));
 	const clock_t startTicks = clock();
 	// Visit the blocks row by row, left to right.
 	for (uint blockIndex = 0; blockIndex < totalBlocks; ++blockIndex)
 	{
 		const uint x = (blockIndex % blocksAcross) * 4;
 		const uint y = (blockIndex / blocksAcross) * 4;
 		rgba.init(image, x, y);
 		task.addColorBlock(rgba);
 		if (task.isFull())
 		{
 			task.flush(outputOptions);
 		}
 	}
 	// Compress whatever is left in a partially filled batch.
 	task.flush(outputOptions);
 	const clock_t endTicks = clock();
 	printf("\rCUDA time taken: %.3f seconds\n", float(endTicks-startTicks) / CLOCKS_PER_SEC);
 #else
 	if (outputOptions.errorHandler != NULL)
 	{
 		outputOptions.errorHandler->error(Error_CudaError);
 	}
 #endif
 }
 #endif // 0
--- a/src/nvtt/cuda/CudaCompressDXT.h
+++ b/src/nvtt/cuda/CudaCompressDXT.h
@ -39,17 +39,22 @@ namespace nv
 		bool isValid() const;
-		void compressDXT1(const Image * image, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+		void setImage(const Image * image, nvtt::AlphaMode alphaMode);
-		void compressDXT3(const Image * image, const nvtt::InputOptions::Private & inputOptions, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+
-		void compressDXT5(const Image * image, const nvtt::InputOptions::Private & inputOptions, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+		void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT1n(const Image * image, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+		void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressCTX1(const Image * image, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+		void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 		void compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 		void compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 	private:
 		uint * m_bitmapTable;
 		uint * m_data;
 		uint * m_result;
 		const Image * m_image;
 		nvtt::AlphaMode m_alphaMode;
 	};
 } // nv namespace
--- a/src/nvtt/squish/extra/squishgen2.cpp
+++ b/src/nvtt/squish/extra/squishgen2.cpp
@ -0,0 +1,113 @@
 /* -----------------------------------------------------------------------------
 	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
 	Copyright (c) 2008 Ignacio Castano                      castano@gmail.com
 	Permission is hereby granted, free of charge, to any person obtaining
 	a copy of this software and associated documentation files (the 
 	"Software"), to	deal in the Software without restriction, including
 	without limitation the rights to use, copy, modify, merge, publish,
 	distribute, sublicense, and/or sell copies of the Software, and to 
 	permit persons to whom the Software is furnished to do so, subject to 
 	the following conditions:
 	The above copyright notice and this permission notice shall be included
 	in all copies or substantial portions of the Software.
 	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
 	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   -------------------------------------------------------------------------- */
 #include <stdio.h>
 #include <float.h>
 #include <math.h>
 // One row of the generated lookup tables. main() prints a struct definition with
 // the same four fields, in the same order, ahead of the table data.
 struct Precomp {
 	// Weighted count sum for the first endpoint (e.g. c0 + c1 * 0.25f in the three-element case).
 	float alpha2_sum;
 	// Weighted count sum for the second endpoint (e.g. c2 + c1 * 0.25f in the three-element case).
 	float beta2_sum;
 	// Cross term (e.g. c1 * 0.25f in the three-element case).
 	float alphabeta_sum;
 	// 1 / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); not finite when that difference is zero.
 	float factor;
 };
 int main()
 {
 	int i = 0;
 	printf("struct Precomp {\n");
 	printf("\tfloat alpha2_sum;\n");
 	printf("\tfloat beta2_sum;\n");
 	printf("\tfloat alphabeta_sum;\n");
 	printf("\tfloat factor;\n");
 	printf("};\n\n");
 	printf("static const SQUISH_ALIGN_16 Precomp s_threeElement[153] = {\n");
 	// Three element clusters:
 	for( int c0 = 0; c0 <= 16; c0++)	// At least two clusters.
 	{
 		for( int c1 = 0; c1 <=  16-c0; c1++)
 		{
 			int c2 = 16 - c0 - c1;
 			Precomp p;
 			p.alpha2_sum = c0 + c1 * 0.25f;
 			p.beta2_sum = c2 + c1 * 0.25f;
 			p.alphabeta_sum = c1 * 0.25f;
 			p.factor = 1.0f / (p.alpha2_sum * p.beta2_sum - p.alphabeta_sum * p.alphabeta_sum);
 			if (isfinite(p.factor))
 			{
 				printf("\t{ %f, %f, %f, %f }, // %d (%d %d %d)\n", p.alpha2_sum, p.beta2_sum, p.alphabeta_sum, p.factor, i, c0, c1, c2);
 			}
 			else
 			{
 				printf("\t{ %f, %f, %f, INFINITY }, // %d (%d %d %d)\n", p.alpha2_sum, p.beta2_sum, p.alphabeta_sum, i, c0, c1, c2);
 			}
 			i++;
 		}
 	}
 	printf("}; // %d three cluster elements\n\n", i);
 	printf("static const SQUISH_ALIGN_16 Precomp s_fourElement[969] = {\n");
 	// Four element clusters:
 	i = 0;
 	for( int c0 = 0; c0 <= 16; c0++)
 	{
 		for( int c1 = 0; c1 <=  16-c0; c1++)
 		{
 			for( int c2 = 0; c2 <=  16-c0-c1; c2++)
 			{
 				int c3 = 16 - c0 - c1 - c2;
 				Precomp p;			
 				p.alpha2_sum = c0 + c1 * (4.0f/9.0f) + c2 * (1.0f/9.0f);
 				p.beta2_sum = c3 + c2 * (4.0f/9.0f) + c1 * (1.0f/9.0f);
 				p.alphabeta_sum = (c1 + c2) * (2.0f/9.0f);
 				p.factor = 1.0f / (p.alpha2_sum * p.beta2_sum - p.alphabeta_sum * p.alphabeta_sum);
 				if (isfinite(p.factor))
 				{
 					printf("\t{ %f, %f, %f, %f }, // %d (%d %d %d %d)\n", p.alpha2_sum, p.beta2_sum, p.alphabeta_sum, p.factor, i, c0, c1, c2, c3);
 				}
 				else
 				{
 					printf("\t{ %f, %f, %f, INFINITY }, // %d (%d %d %d %d)\n", p.alpha2_sum, p.beta2_sum, p.alphabeta_sum, i, c0, c1, c2, c3);
 				}
 				i++;
 			}
 		}
 	}
 	printf("}; // %d four cluster elements\n\n", i);
 	return 0;
 }
--- a/src/nvtt/squish/fastclusterfit.cpp
+++ b/src/nvtt/squish/fastclusterfit.cpp
@ -29,6 +29,8 @@
 #include "colourblock.h"
 #include <cfloat>
 #include "fastclusterlookup.inl"
 namespace squish {
 FastClusterFit::FastClusterFit()
@ -97,91 +99,6 @@ void FastClusterFit::SetColourSet( ColourSet const* colours, int flags )
 }
 // One entry of the cluster-fit lookup tables below; filled in by
 // FastClusterFit::DoPrecomputation() from the per-cluster point counts.
 struct Precomp {
 	// Weighted count sum for the first endpoint (c0 + c1 * 0.25f in the three-element case).
 	float alpha2_sum;
 	// Weighted count sum for the second endpoint (c2 + c1 * 0.25f in the three-element case).
 	float beta2_sum;
 	// Cross term (c1 * 0.25f in the three-element case).
 	float alphabeta_sum;
 	// 1 / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum).
 	float factor;
 };
 // One entry per (c0, c1) pair with c0 + c1 <= 16: 153 combinations.
 static SQUISH_ALIGN_16 Precomp s_threeElement[153];
 // One entry per (c0, c1, c2) triple with c0 + c1 + c2 <= 16: 969 combinations.
 static SQUISH_ALIGN_16 Precomp s_fourElement[969];
 void FastClusterFit::DoPrecomputation()
 {
 	int i = 0;
 	// Three element clusters:
 	for( int c0 = 0; c0 <= 16; c0++)	// At least two clusters.
 	{
 		for( int c1 = 0; c1 <=  16-c0; c1++)
 		{
 			int c2 = 16 - c0 - c1;
 			/*if (c2 == 16) {
 				// a = b = x2 / 16
 				s_threeElement[i].alpha2_sum = 0;
 				s_threeElement[i].beta2_sum = 16;
 				s_threeElement[i].alphabeta_sum = -16;
 				s_threeElement[i].factor = 1.0f / 256.0f;
 			}
 			else if (c0 == 16) {
 				// a = b = x0 / 16
 				s_threeElement[i].alpha2_sum = 16;
 				s_threeElement[i].beta2_sum = 0;
 				s_threeElement[i].alphabeta_sum = -16;
 				s_threeElement[i].factor = 1.0f / 256.0f;
 			}
 			else*/ {
 				s_threeElement[i].alpha2_sum = c0 + c1 * 0.25f;
 				s_threeElement[i].beta2_sum = c2 + c1 * 0.25f;
 				s_threeElement[i].alphabeta_sum = c1 * 0.25f;
 				s_threeElement[i].factor = 1.0f / (s_threeElement[i].alpha2_sum * s_threeElement[i].beta2_sum - s_threeElement[i].alphabeta_sum * s_threeElement[i].alphabeta_sum);
 			}
 			i++;
 		}
 	}
 	//printf("%d three cluster elements\n", i);
 	// Four element clusters:
 	i = 0;
 	for( int c0 = 0; c0 <= 16; c0++)
 	{
 		for( int c1 = 0; c1 <=  16-c0; c1++)
 		{
 			for( int c2 = 0; c2 <=  16-c0-c1; c2++)
 			{
 				int c3 = 16 - c0 - c1 - c2;
 				/*if (c3 == 16) {
 					// a = b = x3 / 16
 					s_fourElement[i].alpha2_sum = 16.0f;
 					s_fourElement[i].beta2_sum = 0.0f;
 					s_fourElement[i].alphabeta_sum = -16.0f;
 					s_fourElement[i].factor = 1.0f / 256.0f;					
 				}
 				else if (c0 == 16) {
 					// a = b = x0 / 16
 					s_fourElement[i].alpha2_sum = 0.0f;
 					s_fourElement[i].beta2_sum = 16.0f;
 					s_fourElement[i].alphabeta_sum = -16.0f;
 					s_fourElement[i].factor = 1.0f / 256.0f;					
 				}
 				else*/ {
 					s_fourElement[i].alpha2_sum = c0 + c1 * (4.0f/9.0f) + c2 * (1.0f/9.0f);
 					s_fourElement[i].beta2_sum = c3 + c2 * (4.0f/9.0f) + c1 * (1.0f/9.0f);
 					s_fourElement[i].alphabeta_sum = (c1 + c2) * (2.0f/9.0f);
 					s_fourElement[i].factor = 1.0f / (s_fourElement[i].alpha2_sum * s_fourElement[i].beta2_sum - s_fourElement[i].alphabeta_sum * s_fourElement[i].alphabeta_sum);
 				}
 				i++;
 			}
 		}
 	}
 	//printf("%d four cluster elements\n", i);
 }
 void FastClusterFit::SetMetric(float r, float g, float b)
 {
 #if SQUISH_USE_SIMD
--- a/src/nvtt/squish/fastclusterfit.h
+++ b/src/nvtt/squish/fastclusterfit.h
@ -44,8 +44,6 @@ public:
 	void SetMetric(float r, float g, float b);
 	float GetBestError() const;
 	static void DoPrecomputation();
 	// Make them public
 	virtual void Compress3( void* block );
 	virtual void Compress4( void* block );
--- a/src/nvtt/squish/fastclusterlookup.inl
+++ b/src/nvtt/squish/fastclusterlookup.inl