diff --git a/src/nvtt/squish/weightedclusterfit.cpp b/src/nvtt/squish/weightedclusterfit.cpp
index 6f2318a..d200f15 100644
--- a/src/nvtt/squish/weightedclusterfit.cpp
+++ b/src/nvtt/squish/weightedclusterfit.cpp
@@ -1,28 +1,28 @@
 /* -----------------------------------------------------------------------------
 
-	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
-	Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
-
-	Permission is hereby granted, free of charge, to any person obtaining
-	a copy of this software and associated documentation files (the 
-	"Software"), to	deal in the Software without restriction, including
-	without limitation the rights to use, copy, modify, merge, publish,
-	distribute, sublicense, and/or sell copies of the Software, and to 
-	permit persons to whom the Software is furnished to do so, subject to 
-	the following conditions:
-
-	The above copyright notice and this permission notice shall be included
-	in all copies or substantial portions of the Software.
-
-	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
-	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
-	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
-	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
-	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-	
-   -------------------------------------------------------------------------- */
+Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the 
+"Software"), to	deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to 
+permit persons to whom the Software is furnished to do so, subject to 
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------- */
 
 #include "weightedclusterfit.h"
 #include "colourset.h"
@@ -32,286 +32,166 @@
 
 namespace squish {
 
-WeightedClusterFit::WeightedClusterFit()
-{
-}
+	WeightedClusterFit::WeightedClusterFit()
+	{
+	}
 
-void WeightedClusterFit::SetColourSet( ColourSet const* colours, int flags )
-{
-	ColourFit::SetColourSet( colours, flags );
+	void WeightedClusterFit::SetColourSet( ColourSet const* colours, int flags )
+	{
+		ColourFit::SetColourSet( colours, flags );
 
-	// initialise the best error
+		// initialise the best error
 #if SQUISH_USE_SIMD
-	m_besterror = VEC4_CONST( FLT_MAX );
-	Vec3 metric = m_metric.GetVec3();
+		m_besterror = VEC4_CONST( FLT_MAX );
+		Vec3 metric = m_metric.GetVec3();
 #else
-	m_besterror = FLT_MAX;
-	Vec3 metric = m_metric;
+		m_besterror = FLT_MAX;
+		Vec3 metric = m_metric;
 #endif
 
-	// cache some values
-	int const count = m_colours->GetCount();
-	Vec3 const* values = m_colours->GetPoints();
-	
-	// get the covariance matrix
-	Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights(), metric );
-	
-	// compute the principle component
-	Vec3 principle = ComputePrincipleComponent( covariance );
-
-	// build the list of values
-	float dps[16];
-	for( int i = 0; i < count; ++i )
-	{
-		dps[i] = Dot( values[i], principle );
-		m_order[i] = i;
-	}
-	
-	// stable sort
-	for( int i = 0; i < count; ++i )
-	{
-		for( int j = i; j > 0 && dps[j] < dps[j - 1]; --j )
+		// cache some values
+		int const count = m_colours->GetCount();
+		Vec3 const* values = m_colours->GetPoints();
+
+		// get the covariance matrix
+		Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights(), metric );
+
+		// compute the principal component
+		Vec3 principle = ComputePrincipleComponent( covariance );
+
+		// build the list of values
+		float dps[16];
+		for( int i = 0; i < count; ++i )
 		{
-			std::swap( dps[j], dps[j - 1] );
-			std::swap( m_order[j], m_order[j - 1] );
+			dps[i] = Dot( values[i], principle );
+			m_order[i] = i;
 		}
-	}
-	
-	// weight all the points
+
+		// stable sort
+		for( int i = 0; i < count; ++i )
+		{
+			for( int j = i; j > 0 && dps[j] < dps[j - 1]; --j )
+			{
+				std::swap( dps[j], dps[j - 1] );
+				std::swap( m_order[j], m_order[j - 1] );
+			}
+		}
+
+		// weight all the points
 #if SQUISH_USE_SIMD
-	Vec4 const* unweighted = m_colours->GetPointsSimd();
-	Vec4 const* weights = m_colours->GetWeightsSimd();
-	m_xxsum = VEC4_CONST( 0.0f );
-	m_xsum = VEC4_CONST( 0.0f );
+		Vec4 const* unweighted = m_colours->GetPointsSimd();
+		Vec4 const* weights = m_colours->GetWeightsSimd();
+		m_xxsum = VEC4_CONST( 0.0f );
+		m_xsum = VEC4_CONST( 0.0f );
 #else
-	Vec3 const* unweighted = m_colours->GetPoints();
-	float const* weights = m_colours->GetWeights();
-	m_xxsum = Vec3( 0.0f );
-	m_xsum = Vec3( 0.0f );
-	m_wsum = 0.0f;	
+		Vec3 const* unweighted = m_colours->GetPoints();
+		float const* weights = m_colours->GetWeights();
+		m_xxsum = Vec3( 0.0f );
+		m_xsum = Vec3( 0.0f );
+		m_wsum = 0.0f;	
 #endif
-	
-	for( int i = 0; i < count; ++i )
-	{
-		int p = m_order[i];
-		m_weighted[i] = weights[p] * unweighted[p];
-		m_xxsum += m_weighted[i] * m_weighted[i];
-		m_xsum += m_weighted[i];
+
+		for( int i = 0; i < count; ++i )
+		{
+			int p = m_order[i];
+			m_weighted[i] = weights[p] * unweighted[p];
+			m_xxsum += m_weighted[i] * m_weighted[i];
+			m_xsum += m_weighted[i];
 #if !SQUISH_USE_SIMD		
-		m_weights[i] = weights[p];
-		m_wsum += m_weights[i];
+			m_weights[i] = weights[p];
+			m_wsum += m_weights[i];
 #endif
+		}
 	}
-}
 
 
-void WeightedClusterFit::SetMetric(float r, float g, float b)
-{
+	void WeightedClusterFit::SetMetric(float r, float g, float b)
+	{
 #if SQUISH_USE_SIMD
-	m_metric = Vec4(r, g, b, 0);
+		m_metric = Vec4(r, g, b, 0);
 #else
-	m_metric = Vec3(r, g, b);
+		m_metric = Vec3(r, g, b);
 #endif
-	m_metricSqr = m_metric * m_metric;
-}
+		m_metricSqr = m_metric * m_metric;
+	}
 
-float WeightedClusterFit::GetBestError() const
-{
+	float WeightedClusterFit::GetBestError() const
+	{
 #if SQUISH_USE_SIMD
-	Vec4 x = m_xxsum * m_metricSqr;
-	Vec4 error = m_besterror + x.SplatX() + x.SplatY() + x.SplatZ();
-	return error.GetVec3().X();
+		Vec4 x = m_xxsum * m_metricSqr;
+		Vec4 error = m_besterror + x.SplatX() + x.SplatY() + x.SplatZ();
+		return error.GetVec3().X();
 #else
-	return m_besterror + Dot(m_xxsum, m_metricSqr);
+		return m_besterror + Dot(m_xxsum, m_metricSqr);
 #endif
 
-}
+	}
 
 #if SQUISH_USE_SIMD
 
-void WeightedClusterFit::Compress3( void* block )
-{
-	int const count = m_colours->GetCount();
-	Vec4 const one = VEC4_CONST(1.0f);
-	Vec4 const zero = VEC4_CONST(0.0f);
-	Vec4 const half(0.5f, 0.5f, 0.5f, 0.25f);
-	Vec4 const two = VEC4_CONST(2.0);
-	 
-	// declare variables
-	Vec4 beststart = VEC4_CONST( 0.0f );
-	Vec4 bestend = VEC4_CONST( 0.0f );
-	Vec4 besterror = VEC4_CONST( FLT_MAX );
-
-	Vec4 x0 = zero;
-	
-	int b0 = 0, b1 = 0;
-
-	// check all possible clusters for this total order
-	for( int c0 = 0; c0 <= count; c0++)
-	{	
-		Vec4 x1 = zero;
-		
-		for( int c1 = 0; c1 <= count-c0; c1++)
-		{
-			Vec4 const x2 = m_xsum - x1 - x0;
-			
-			//Vec3 const alphax_sum = x0 + x1 * 0.5f;
-			//float const alpha2_sum = w0 + w1 * 0.25f;
-			Vec4 const alphax_sum = MultiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum
-			Vec4 const alpha2_sum = alphax_sum.SplatW();
-			
-			//Vec3 const betax_sum = x2 + x1 * 0.5f;
-			//float const beta2_sum = w2 + w1 * 0.25f;
-			Vec4 const betax_sum = MultiplyAdd(x1, half, x2); // betax_sum, beta2_sum
-			Vec4 const beta2_sum = betax_sum.SplatW();
-			
-			//float const alphabeta_sum = w1 * 0.25f;
-			Vec4 const alphabeta_sum = (x1 * half).SplatW(); // alphabeta_sum
-			
-			// float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-			Vec4 const factor = Reciprocal( NegativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
-			
-			Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
-			Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
-			
-			// clamp the output to [0, 1]
-			a = Min( one, Max( zero, a ) );
-			b = Min( one, Max( zero, b ) );
-			
-			// clamp to the grid
-			Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
-			Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f );
-			a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp;
-			b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp;
-			
-			// compute the error
-			Vec4 e1 = MultiplyAdd( a, alphax_sum, b*betax_sum );
-			Vec4 e2 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
-			Vec4 e3 = MultiplyAdd( a*b*alphabeta_sum - e1, two, e2 );
-			
-			// apply the metric to the error term
-			Vec4 e4 = e3 * m_metricSqr;
-			Vec4 error = e4.SplatX() + e4.SplatY() + e4.SplatZ();
-			
-			// keep the solution if it wins
-			if( CompareAnyLessThan( error, besterror ) )
-			{
-				besterror = error;
-				beststart = a;
-				bestend = b;
-				b0 = c0;
-				b1 = c1;
-			}
-			
-			x1 += m_weighted[c0+c1];
-		}
-		
-		x0 += m_weighted[c0];
-	}
-
-	// save the block if necessary
-	if( CompareAnyLessThan( besterror, m_besterror ) )
+	void WeightedClusterFit::Compress3( void* block )
 	{
-		// compute indices from cluster sizes.
-		u8 bestindices[16];
-		{
-			int i = 0;
-			for(; i < b0; i++) {
-				bestindices[i] = 0;
-			}
-			for(; i < b0+b1; i++) {
-				bestindices[i] = 2;
-			}
-			for(; i < 16; i++) {
-				bestindices[i] = 1;
-			}
-		}
-		
-		// remap the indices
-		u8 ordered[16];
-		for( int i = 0; i < count; ++i )
-			ordered[m_order[i]] = bestindices[i];
-		
-		m_colours->RemapIndices( ordered, bestindices ); // Set alpha indices.
+		int const count = m_colours->GetCount();
+		Vec4 const one = VEC4_CONST(1.0f);
+		Vec4 const zero = VEC4_CONST(0.0f);
+		Vec4 const half(0.5f, 0.5f, 0.5f, 0.25f);
+		Vec4 const two = VEC4_CONST(2.0);
+		Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+		Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
 
+		// declare variables
+		Vec4 beststart = VEC4_CONST( 0.0f );
+		Vec4 bestend = VEC4_CONST( 0.0f );
+		Vec4 besterror = VEC4_CONST( FLT_MAX );
 
-		// save the block
-		WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
-		
-		// save the error
-		m_besterror = besterror;
-	}
-}
-
-void WeightedClusterFit::Compress4( void* block )
-{
-	int const count = m_colours->GetCount();
-	Vec4 const one = VEC4_CONST(1.0f);
-	Vec4 const zero = VEC4_CONST(0.0f);
-	Vec4 const half = VEC4_CONST(0.5f);
-	Vec4 const two = VEC4_CONST(2.0);
-	Vec4 const onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f );
-	Vec4 const twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f );
-	
-	// declare variables
-	Vec4 beststart = VEC4_CONST( 0.0f );
-	Vec4 bestend = VEC4_CONST( 0.0f );
-	Vec4 besterror = VEC4_CONST( FLT_MAX );
-
-	Vec4 x0 = zero;
-	int b0 = 0, b1 = 0, b2 = 0;
-
-	// check all possible clusters for this total order
-	for( int c0 = 0; c0 <= count; c0++)
-	{	
-		Vec4 x1 = zero;
-		
-		for( int c1 = 0; c1 <= count-c0; c1++)
+		Vec4 x0 = zero;
+
+		int b0 = 0, b1 = 0;
+
+		// check all possible clusters for this total order
+		for( int c0 = 0; c0 <= count; c0++)
 		{	
-			Vec4 x2 = zero;
-			
-			for( int c2 = 0; c2 <= count-c0-c1; c2++)
+			Vec4 x1 = zero;
+
+			for( int c1 = 0; c1 <= count-c0; c1++)
 			{
-				Vec4 const x3 = m_xsum - x2 - x1 - x0;
-				
-				//Vec3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
-				//float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
-				Vec4 const alphax_sum = x0 + MultiplyAdd(x1, twothirds, x2 * onethird); // alphax_sum, alpha2_sum
+				Vec4 const x2 = m_xsum - x1 - x0;
+
+				//Vec3 const alphax_sum = x0 + x1 * 0.5f;
+				//float const alpha2_sum = w0 + w1 * 0.25f;
+				Vec4 const alphax_sum = MultiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum
 				Vec4 const alpha2_sum = alphax_sum.SplatW();
-				
-				//Vec3 const betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f);
-				//float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
-				Vec4 const betax_sum = x3 + MultiplyAdd(x2, twothirds, x1 * onethird); // betax_sum, beta2_sum
+
+				//Vec3 const betax_sum = x2 + x1 * 0.5f;
+				//float const beta2_sum = w2 + w1 * 0.25f;
+				Vec4 const betax_sum = MultiplyAdd(x1, half, x2); // betax_sum, beta2_sum
 				Vec4 const beta2_sum = betax_sum.SplatW();
-				
-				//float const alphabeta_sum = w1 * (2.0f/9.0f) + w2 * (2.0f/9.0f);
-				Vec4 const alphabeta_sum = two * (x1 * onethird + x2 * onethird).SplatW(); // alphabeta_sum
-				
+
+				//float const alphabeta_sum = w1 * 0.25f;
+				Vec4 const alphabeta_sum = (x1 * half).SplatW(); // alphabeta_sum
+
 				// float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
 				Vec4 const factor = Reciprocal( NegativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
-				
+
 				Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
 				Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
-				
-				// clamp the output to [0, 1]
+
+				// clamp to [0, 1], then round to the 31/63/31 grid
 				a = Min( one, Max( zero, a ) );
 				b = Min( one, Max( zero, b ) );
-				
-				// clamp to the grid
-				Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
-				Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f );
 				a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp;
 				b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp;
-				
-				// compute the error
-				Vec4 e1 = MultiplyAdd( a, alphax_sum, b*betax_sum );
-				Vec4 e2 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
-				Vec4 e3 = MultiplyAdd( a*b*alphabeta_sum - e1, two, e2 );
-				
+
+				// compute the error (we skip the constant xxsum)
+				Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+				Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+				Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 );
+				Vec4 e4 = MultiplyAdd( two, e3, e1 );
+
 				// apply the metric to the error term
-				Vec4 e4 = e3 * m_metricSqr;
-				Vec4 error = e4.SplatX() + e4.SplatY() + e4.SplatZ();
-				
+				Vec4 e5 = e4 * m_metricSqr;
+				Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();
+
 				// keep the solution if it wins
 				if( CompareAnyLessThan( error, besterror ) )
 				{
@@ -320,212 +200,228 @@ void WeightedClusterFit::Compress4( void* block )
 					bestend = b;
 					b0 = c0;
 					b1 = c1;
-					b2 = c2;
 				}
-				
-				x2 += m_weighted[c0+c1+c2];
+
+				x1 += m_weighted[c0+c1];
 			}
-			
-			x1 += m_weighted[c0+c1];
+
+			x0 += m_weighted[c0];
 		}
-		
-		x0 += m_weighted[c0];
-	}
 
-	// save the block if necessary
-	if( CompareAnyLessThan( besterror, m_besterror ) )
-	{
-		// compute indices from cluster sizes.
-		u8 bestindices[16];
+		// save the block if necessary
+		if( CompareAnyLessThan( besterror, m_besterror ) )
 		{
-			int i = 0;
-			for(; i < b0; i++) {
-				bestindices[i] = 0;
-			}
-			for(; i < b0+b1; i++) {
-				bestindices[i] = 2;
-			}
-			for(; i < b0+b1+b2; i++) {
-				bestindices[i] = 3;
-			}
-			for(; i < 16; i++) {
-				bestindices[i] = 1;
+			// compute indices from cluster sizes.
+			u8 bestindices[16];
+			{
+				int i = 0;
+				for(; i < b0; i++) {
+					bestindices[i] = 0;
+				}
+				for(; i < b0+b1; i++) {
+					bestindices[i] = 2;
+				}
+				for(; i < count; i++) {
+					bestindices[i] = 1;
+				}
 			}
+
+			// remap the indices
+			u8 ordered[16];
+			for( int i = 0; i < count; ++i )
+				ordered[m_order[i]] = bestindices[i];
+
+			m_colours->RemapIndices( ordered, bestindices );
+
+
+			// save the block
+			WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+
+			// save the error
+			m_besterror = besterror;
 		}
-		
-		// remap the indices
-		u8 ordered[16];
-		for( int i = 0; i < count; ++i )
-			ordered[m_order[i]] = bestindices[i];
-		
-		// save the block
-		WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), ordered, block );
-		
-		// save the error
-		m_besterror = besterror;
 	}
-}
-
-#else
 
-void WeightedClusterFit::Compress3( void* block )
-{
-	// declare variables
-	Vec3 beststart( 0.0f );
-	Vec3 bestend( 0.0f );
-	float besterror = FLT_MAX;
-
-	Vec3 x0(0.0f);
-	float w0 = 0.0f;
-	
-	int b0 = 0, b1 = 0;
-
-	// check all possible clusters for this total order
-	for( int c0 = 0; c0 <= 16; c0++)
-	{	
-		Vec3 x1(0.0f);
-		float w1 = 0.0f;
-		
-		for( int c1 = 0; c1 <= 16-c0; c1++)
+	void WeightedClusterFit::Compress4( void* block )
+	{
+		int const count = m_colours->GetCount();
+		Vec4 const one = VEC4_CONST(1.0f);
+		Vec4 const zero = VEC4_CONST(0.0f);
+		Vec4 const half = VEC4_CONST(0.5f);
+		Vec4 const two = VEC4_CONST(2.0);
+		Vec4 const onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f );
+		Vec4 const twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f );
+		Vec4 const twonineths = VEC4_CONST( 2.0f/9.0f );
+		Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+		Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
+
+		// declare variables
+		Vec4 beststart = VEC4_CONST( 0.0f );
+		Vec4 bestend = VEC4_CONST( 0.0f );
+		Vec4 besterror = VEC4_CONST( FLT_MAX );
+
+		Vec4 x0 = zero;
+		int b0 = 0, b1 = 0, b2 = 0;
+
+		// check all possible clusters for this total order
+		for( int c0 = 0; c0 <= count; c0++)
 		{	
-			float w2 = m_wsum - w0 - w1;
-			
-			// These factors could be entirely precomputed.
-			float const alpha2_sum = w0 + w1 * 0.25f;
-			float const beta2_sum = w2 + w1 * 0.25f;
-			float const alphabeta_sum = w1 * 0.25f;
-			float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-			
-			Vec3 const alphax_sum = x0 + x1 * 0.5f;
-			Vec3 const betax_sum = m_xsum - alphax_sum;
-			
-			Vec3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor;
-			Vec3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor;
-			
-			// clamp the output to [0, 1]
-			Vec3 const one( 1.0f );
-			Vec3 const zero( 0.0f );
-			a = Min( one, Max( zero, a ) );
-			b = Min( one, Max( zero, b ) );
-			
-			// clamp to the grid
-			Vec3 const grid( 31.0f, 63.0f, 31.0f );
-			Vec3 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f );
-			Vec3 const half( 0.5f );
-			a = Floor( grid*a + half )*gridrcp;
-			b = Floor( grid*b + half )*gridrcp;
-			
-			// compute the error
-			Vec3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
-			
-			// apply the metric to the error term
-			float error = Dot( e1, m_metricSqr );
-			
-			// keep the solution if it wins
-			if( error < besterror )
-			{
-				besterror = error;
-				beststart = a;
-				bestend = b;
-				b0 = c0;
-				b1 = c1;
+			Vec4 x1 = zero;
+
+			for( int c1 = 0; c1 <= count-c0; c1++)
+			{	
+				Vec4 x2 = zero;
+
+				for( int c2 = 0; c2 <= count-c0-c1; c2++)
+				{
+					Vec4 const x3 = m_xsum - x2 - x1 - x0;
+
+					//Vec3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
+					//float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
+					Vec4 const alphax_sum = MultiplyAdd(x2, onethird, MultiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum
+					Vec4 const alpha2_sum = alphax_sum.SplatW();
+
+					//Vec3 const betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f);
+					//float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
+					Vec4 const betax_sum = MultiplyAdd(x2, twothirds, MultiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum
+					Vec4 const beta2_sum = betax_sum.SplatW();
+
+					//float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
+					Vec4 const alphabeta_sum = twonineths*( x1 + x2 ).SplatW(); // alphabeta_sum
+
+					// float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+					Vec4 const factor = Reciprocal( NegativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
+
+					Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+					Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+
+					// clamp to [0, 1], then round to the 31/63/31 grid
+					a = Min( one, Max( zero, a ) );
+					b = Min( one, Max( zero, b ) );
+					a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp;
+					b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp;
+
+					// compute the error (we skip the constant xxsum)
+					Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+					Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+					Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 );
+					Vec4 e4 = MultiplyAdd( two, e3, e1 );
+
+					// apply the metric to the error term
+					Vec4 e5 = e4 * m_metricSqr;
+					Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();
+
+					// keep the solution if it wins
+					if( CompareAnyLessThan( error, besterror ) )
+					{
+						besterror = error;
+						beststart = a;
+						bestend = b;
+						b0 = c0;
+						b1 = c1;
+						b2 = c2;
+					}
+
+					x2 += m_weighted[c0+c1+c2];
+				}
+
+				x1 += m_weighted[c0+c1];
 			}
-			
-			x1 += m_weighted[c0+c1];
-			w1 += m_weights[c0+c1];
+
+			x0 += m_weighted[c0];
 		}
-		
-		x0 += m_weighted[c0];
-		w0 += m_weights[c0];
-	}
 
-	// save the block if necessary
-	if( besterror < m_besterror )
-	{
-		// compute indices from cluster sizes.
-		u8 bestindices[16];
+		// save the block if necessary
+		if( CompareAnyLessThan( besterror, m_besterror ) )
 		{
-			int i = 0;
-			for(; i < b0; i++) {
-				bestindices[i] = 0;
-			}
-			for(; i < b0+b1; i++) {
-				bestindices[i] = 2;
-			}
-			for(; i < 16; i++) {
-				bestindices[i] = 1;
+			// compute indices from cluster sizes.
+			u8 bestindices[16];
+			{
+				int i = 0;
+				for(; i < b0; i++) {
+					bestindices[i] = 0;
+				}
+				for(; i < b0+b1; i++) {
+					bestindices[i] = 2;
+				}
+				for(; i < b0+b1+b2; i++) {
+					bestindices[i] = 3;
+				}
+				for(; i < count; i++) {
+					bestindices[i] = 1;
+				}
 			}
+
+			// remap the indices
+			u8 ordered[16];
+			for( int i = 0; i < count; ++i )
+				ordered[m_order[i]] = bestindices[i];
+
+			m_colours->RemapIndices( ordered, bestindices );
+
+			// save the block
+			WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+
+			// save the error
+			m_besterror = besterror;
 		}
-		
-		// remap the indices
-		u8 ordered[16];
-		for( int i = 0; i < 16; ++i )
-			ordered[m_order[i]] = bestindices[i];
-		
-		// save the block
-		WriteColourBlock3( beststart, bestend, ordered, block );
-		
-		// save the error
-		m_besterror = besterror;
 	}
-}
-
-void WeightedClusterFit::Compress4( void* block )
-{
-	// declare variables
-	Vec3 beststart( 0.0f );
-	Vec3 bestend( 0.0f );
-	float besterror = FLT_MAX;
-
-	Vec3 x0(0.0f);
-	float w0 = 0.0f;
-	int b0 = 0, b1 = 0, b2 = 0;
-
-	// check all possible clusters for this total order
-	for( int c0 = 0; c0 <= 16; c0++)
-	{	
-		Vec3 x1(0.0f);
-		float w1 = 0.0f;
-		
-		for( int c1 = 0; c1 <= 16-c0; c1++)
+
+#else
+
+	void WeightedClusterFit::Compress3( void* block )
+	{
+		int const count = m_colours->GetCount();
+		Vec3 const one( 1.0f );
+		Vec3 const zero( 0.0f );
+		Vec3 const half( 0.5f );
+		Vec3 const grid( 31.0f, 63.0f, 31.0f );
+		Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
+
+		// declare variables
+		Vec3 beststart( 0.0f );
+		Vec3 bestend( 0.0f );
+		float besterror = FLT_MAX;
+
+		Vec3 x0(0.0f);
+		float w0 = 0.0f;
+
+		int b0 = 0, b1 = 0;
+
+		// check all possible clusters for this total order
+		for( int c0 = 0; c0 <= count; c0++)
 		{	
-			Vec3 x2(0.0f);
-			float w2 = 0.0f;
-			
-			for( int c2 = 0; c2 <= 16-c0-c1; c2++)
-			{
-				float w3 = m_wsum - w0 - w1 - w2;
-				
-				float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
-				float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
-				float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
+			Vec3 x1(0.0f);
+			float w1 = 0.0f;
+
+			for( int c1 = 0; c1 <= count-c0; c1++)
+			{	
+				float w2 = m_wsum - w0 - w1;
+
+				// These factors could be entirely precomputed.
+				float const alpha2_sum = w0 + w1 * 0.25f;
+				float const beta2_sum = w2 + w1 * 0.25f;
+				float const alphabeta_sum = w1 * 0.25f;
 				float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-				
-				Vec3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
+
+				Vec3 const alphax_sum = x0 + x1 * 0.5f;
 				Vec3 const betax_sum = m_xsum - alphax_sum;
-				
-				Vec3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor;
-				Vec3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
-				
-				// clamp the output to [0, 1]
-				Vec3 const one( 1.0f );
-				Vec3 const zero( 0.0f );
+
+				Vec3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor;
+				Vec3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor;
+
+				// clamp to [0, 1], then round to the 31/63/31 grid
 				a = Min( one, Max( zero, a ) );
 				b = Min( one, Max( zero, b ) );
-				
-				// clamp to the grid
-				Vec3 const grid( 31.0f, 63.0f, 31.0f );
-				Vec3 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f );
-				Vec3 const half( 0.5f );
 				a = Floor( grid*a + half )*gridrcp;
 				b = Floor( grid*b + half )*gridrcp;
-				
+
 				// compute the error
 				Vec3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
-				
+
 				// apply the metric to the error term
 				float error = Dot( e1, m_metricSqr );
-				
+
 				// keep the solution if it wins
 				if( error < besterror )
 				{
@@ -534,54 +430,163 @@ void WeightedClusterFit::Compress4( void* block )
 					bestend = b;
 					b0 = c0;
 					b1 = c1;
-					b2 = c2;
 				}
-				
-				x2 += m_weighted[c0+c1+c2];
-				w2 += m_weights[c0+c1+c2];
+
+				x1 += m_weighted[c0+c1];
+				w1 += m_weights[c0+c1];
 			}
-			
-			x1 += m_weighted[c0+c1];
-			w1 += m_weights[c0+c1];
+
+			x0 += m_weighted[c0];
+			w0 += m_weights[c0];
 		}
-		
-		x0 += m_weighted[c0];
-		w0 += m_weights[c0];
-	}
 
-	// save the block if necessary
-	if( besterror < m_besterror )
-	{
-		// compute indices from cluster sizes.
-		u8 bestindices[16];
+		// save the block if necessary
+		if( besterror < m_besterror )
 		{
-			int i = 0;
-			for(; i < b0; i++) {
-				bestindices[i] = 0;
-			}
-			for(; i < b0+b1; i++) {
-				bestindices[i] = 2;
+			// compute indices from cluster sizes.
+			u8 bestindices[16];
+			{
+				int i = 0;
+				for(; i < b0; i++) {
+					bestindices[i] = 0;
+				}
+				for(; i < b0+b1; i++) {
+					bestindices[i] = 2;
+				}
+				for(; i < count; i++) {
+					bestindices[i] = 1;
+				}
 			}
-			for(; i < b0+b1+b2; i++) {
-				bestindices[i] = 3;
+
+			// remap the indices
+			u8 ordered[16];
+			for( int i = 0; i < count; ++i )
+				ordered[m_order[i]] = bestindices[i];
+
+			m_colours->RemapIndices( ordered, bestindices );
+
+			// save the block
+			WriteColourBlock3( beststart, bestend, bestindices, block );
+
+			// save the error
+			m_besterror = besterror;
+		}
+	}
+
+	void WeightedClusterFit::Compress4( void* block )
+	{
+		int const count = m_colours->GetCount();
+		Vec3 const one( 1.0f );
+		Vec3 const zero( 0.0f );
+		Vec3 const half( 0.5f );
+		Vec3 const grid( 31.0f, 63.0f, 31.0f );
+		Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
+
+		// declare variables
+		Vec3 beststart( 0.0f );
+		Vec3 bestend( 0.0f );
+		float besterror = FLT_MAX;
+
+		Vec3 x0(0.0f);
+		float w0 = 0.0f;
+		int b0 = 0, b1 = 0, b2 = 0;
+
+		// check all possible clusters for this total order
+		for( int c0 = 0; c0 <= count; c0++)
+		{	
+			Vec3 x1(0.0f);
+			float w1 = 0.0f;
+
+			for( int c1 = 0; c1 <= count-c0; c1++)
+			{	
+				Vec3 x2(0.0f);
+				float w2 = 0.0f;
+
+				for( int c2 = 0; c2 <= count-c0-c1; c2++)
+				{
+					float w3 = m_wsum - w0 - w1 - w2;
+
+					float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
+					float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
+					float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
+					float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+
+					Vec3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
+					Vec3 const betax_sum = m_xsum - alphax_sum;
+
+					Vec3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor;
+					Vec3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
+
+					// clamp to [0, 1], then round to the 31/63/31 grid
+					a = Min( one, Max( zero, a ) );
+					b = Min( one, Max( zero, b ) );
+					a = Floor( grid*a + half )*gridrcp;
+					b = Floor( grid*b + half )*gridrcp;
+
+					// compute the error
+					Vec3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+
+					// apply the metric to the error term
+					float error = Dot( e1, m_metricSqr );
+
+					// keep the solution if it wins
+					if( error < besterror )
+					{
+						besterror = error;
+						beststart = a;
+						bestend = b;
+						b0 = c0;
+						b1 = c1;
+						b2 = c2;
+					}
+
+					x2 += m_weighted[c0+c1+c2];
+					w2 += m_weights[c0+c1+c2];
+				}
+
+				x1 += m_weighted[c0+c1];
+				w1 += m_weights[c0+c1];
 			}
-			for(; i < 16; i++) {
-				bestindices[i] = 1;
+
+			x0 += m_weighted[c0];
+			w0 += m_weights[c0];
+		}
+
+		// save the block if necessary
+		if( besterror < m_besterror )
+		{
+			// compute indices from cluster sizes.
+			u8 bestindices[16];
+			{
+				int i = 0;
+				for(; i < b0; i++) {
+					bestindices[i] = 0;
+				}
+				for(; i < b0+b1; i++) {
+					bestindices[i] = 2;
+				}
+				for(; i < b0+b1+b2; i++) {
+					bestindices[i] = 3;
+				}
+				for(; i < count; i++) {
+					bestindices[i] = 1;
+				}
 			}
+
+			// remap the indices
+			u8 ordered[16];
+			for( int i = 0; i < count; ++i )
+				ordered[m_order[i]] = bestindices[i];
+
+			m_colours->RemapIndices( ordered, bestindices );
+
+			// save the block
+			WriteColourBlock4( beststart, bestend, bestindices, block );
+
+			// save the error
+			m_besterror = besterror;
 		}
-		
-		// remap the indices
-		u8 ordered[16];
-		for( int i = 0; i < 16; ++i )
-			ordered[m_order[i]] = bestindices[i];
-		
-		// save the block
-		WriteColourBlock4( beststart, bestend, ordered, block );
-
-		// save the error
-		m_besterror = besterror;
 	}
-}
 
 #endif