diff --git a/src/nvtt/ClusterFit.cpp b/src/nvtt/ClusterFit.cpp
index 20c59ca..0170985 100644
--- a/src/nvtt/ClusterFit.cpp
+++ b/src/nvtt/ClusterFit.cpp
@@ -1,27 +1,27 @@
 /* -----------------------------------------------------------------------------
 
-	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
-	Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
+    Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+    Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
 
-	Permission is hereby granted, free of charge, to any person obtaining
-	a copy of this software and associated documentation files (the 
-	"Software"), to	deal in the Software without restriction, including
-	without limitation the rights to use, copy, modify, merge, publish,
-	distribute, sublicense, and/or sell copies of the Software, and to 
-	permit persons to whom the Software is furnished to do so, subject to 
-	the following conditions:
+    Permission is hereby granted, free of charge, to any person obtaining
+    a copy of this software and associated documentation files (the
+    "Software"), to	deal in the Software without restriction, including
+    without limitation the rights to use, copy, modify, merge, publish,
+    distribute, sublicense, and/or sell copies of the Software, and to
+    permit persons to whom the Software is furnished to do so, subject to
+    the following conditions:
 
-	The above copyright notice and this permission notice shall be included
-	in all copies or substantial portions of the Software.
+    The above copyright notice and this permission notice shall be included
+    in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
-	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
-	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
-	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
-	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-	
    -------------------------------------------------------------------------- */
 
 #include "ClusterFit.h"
@@ -38,91 +38,90 @@ ClusterFit::ClusterFit()
 
 void ClusterFit::setColourSet(const ColorSet * set)
 {
-	// initialise the best error
+    // initialise the best error
 #if NVTT_USE_SIMD
-	m_besterror = SimdVector( FLT_MAX );
-	Vector3 metric = m_metric.toVector3();
+    m_besterror = SimdVector( FLT_MAX );
+    Vector3 metric = m_metric.toVector3();
 #else
-	m_besterror = FLT_MAX;
-	Vector3 metric = m_metric;
+    m_besterror = FLT_MAX;
+    Vector3 metric = m_metric;
 #endif
 
-	// cache some values
-	count = set->count;
+    // cache some values
+    m_count = set->count;
 
     Vector3 values[16];
-    for (uint i = 0; i < count; i++)
+    for (uint i = 0; i < m_count; i++)
     {
         values[i] = set->colors[i].xyz();
     }
-	
-    Vector3 principle = Fit::computePrincipalComponent(count, values, set->weights, metric);
 
+    Vector3 principle = Fit::computePrincipalComponent(m_count, values, set->weights, metric);
 
-	// build the list of values
-	float dps[16];
-	for (uint i = 0; i < count; ++i)
-	{
-		dps[i] = dot(values[i], principle);
-		m_order[i] = i;
-	}
-	
-	// stable sort
-	for (uint i = 0; i < count; ++i)
-	{
-		for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j)
-		{
-			swap( dps[j], dps[j - 1] );
-			swap( m_order[j], m_order[j - 1] );
-		}
-	}
-	
-	// weight all the points
+    // build the list of values
+    int order[16];
+    float dps[16];
+    for (uint i = 0; i < m_count; ++i)
+    {
+        dps[i] = dot(values[i], principle);
+        order[i] = i;
+    }
+
+    // stable sort
+    for (uint i = 0; i < m_count; ++i)
+    {
+        for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j)
+        {
+            swap(dps[j], dps[j - 1]);
+            swap(order[j], order[j - 1]);
+        }
+    }
+
+    // weight all the points
 #if NVTT_USE_SIMD
-	SimdVector const* unweighted = set->GetPointsSimd();
-	SimdVector const* weights = set->GetWeightsSimd();
-	m_xxsum = SimdVector( 0.0f );
-	m_xsum = SimdVector( 0.0f );
+    m_xxsum = SimdVector( 0.0f );
+    m_xsum = SimdVector( 0.0f );
 #else
-	Vector3 const* unweighted = values;
-	float const* weights = set->weights;
-	m_xxsum = Vector3(0.0f);
-	m_xsum = Vector3(0.0f);
-	m_wsum = 0.0f;	
+    m_xsum = Vector3(0.0f);
+    m_wsum = 0.0f;
 #endif
 	
-	for (uint i = 0; i < count; ++i)
-	{
-		int p = m_order[i];
-		m_weighted[i] = weights[p] * unweighted[p];
-		m_xxsum += m_weighted[i] * m_weighted[i];
-		m_xsum += m_weighted[i];
-#if !NVTT_USE_SIMD		
-		m_weights[i] = weights[p];
-		m_wsum += m_weights[i];
+    for (uint i = 0; i < m_count; ++i)
+    {
+        int p = order[i];
+#if NVTT_USE_SIMD
+        m_weighted[i] = SimdVector(Vector4(set->weights[p] * values[p], set->weights[p]));
+        m_xxsum += m_weighted[i] * m_weighted[i];
+        m_xsum += m_weighted[i];
+#else
+        m_weighted[i] = values[p];
+        m_xxsum += m_weighted[i] * m_weighted[i];
+        m_xsum += m_weighted[i];
+        m_weights[i] = set->weights[p];
+        m_wsum += m_weights[i];
 #endif
-	}
+    }
 }
 
 
 void ClusterFit::setMetric(Vector4::Arg w)
 {
 #if NVTT_USE_SIMD
-	m_metric = SimdVector(w);
+    m_metric = SimdVector(Vector4(w.xyz(), 1));
 #else
-	m_metric = w.xyz();
+    m_metric = w.xyz();
 #endif
-	m_metricSqr = m_metric * m_metric;
+    m_metricSqr = m_metric * m_metric;
 }
 
 float ClusterFit::bestError() const
 {
 #if NVTT_USE_SIMD
-	SimdVector x = m_xxsum * m_metricSqr;
-	SimdVector error = m_besterror + x.splatX() + x.splatY() + x.splatZ();
-	return error.toFloat();
+    SimdVector x = m_xxsum * m_metricSqr;
+    SimdVector error = m_besterror + x.splatX() + x.splatY() + x.splatZ();
+    return error.toFloat();
 #else
-	return m_besterror + dot(m_xxsum, m_metricSqr);
+    return m_besterror + dot(m_xxsum, m_metricSqr);
 #endif
 
 }
@@ -131,251 +130,199 @@ float ClusterFit::bestError() const
 
 bool ClusterFit::compress3( Vector3 * start, Vector3 * end )
 {
-    int const count = m_colours->count;
-	SimdVector const one = SimdVector(1.0f);
-	SimdVector const zero = SimdVector(0.0f);
-	SimdVector const half(0.5f, 0.5f, 0.5f, 0.25f);
-	SimdVector const two = SimdVector(2.0);
-	SimdVector const grid( 31.0f, 63.0f, 31.0f, 0.0f );
-	SimdVector const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
-	 
-	// declare variables
-	SimdVector beststart = SimdVector( 0.0f );
-	SimdVector bestend = SimdVector( 0.0f );
-	SimdVector besterror = SimdVector( FLT_MAX );
+    int const count = m_count;
+    SimdVector const one = SimdVector(1.0f);
+    SimdVector const zero = SimdVector(0.0f);
+    SimdVector const half(0.5f, 0.5f, 0.5f, 0.25f);
+    SimdVector const two = SimdVector(2.0);
+    SimdVector const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+    SimdVector const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
 
-	SimdVector x0 = zero;
+    // declare variables
+    SimdVector beststart = SimdVector( 0.0f );
+    SimdVector bestend = SimdVector( 0.0f );
+    SimdVector besterror = SimdVector( FLT_MAX );
+
+    SimdVector x0 = zero;
 	
-	int b0 = 0, b1 = 0;
+    int b0 = 0, b1 = 0;
 
-	// check all possible clusters for this total order
-	for( int c0 = 0; c0 <= count; c0++)
-	{	
-		SimdVector x1 = zero;
-		
-		for( int c1 = 0; c1 <= count-c0; c1++)
-		{
-			SimdVector const x2 = m_xsum - x1 - x0;
-			
-			//Vector3 const alphax_sum = x0 + x1 * 0.5f;
-			//float const alpha2_sum = w0 + w1 * 0.25f;
-			SimdVector const alphax_sum = multiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum
-			SimdVector const alpha2_sum = alphax_sum.splatW();
-			
-			//Vector3 const betax_sum = x2 + x1 * 0.5f;
-			//float const beta2_sum = w2 + w1 * 0.25f;
-			SimdVector const betax_sum = multiplyAdd(x1, half, x2); // betax_sum, beta2_sum
-			SimdVector const beta2_sum = betax_sum.splatW();
-			
-			//float const alphabeta_sum = w1 * 0.25f;
-			SimdVector const alphabeta_sum = (x1 * half).splatW(); // alphabeta_sum
-			
-			// float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-			SimdVector const factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
-			
-			SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
-			SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
-			
-			// clamp to the grid
-			a = min( one, max( zero, a ) );
-			b = min( one, max( zero, b ) );
-			a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp;
-			b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp;
-			
-			// compute the error (we skip the constant xxsum)
-			SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
-			SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
-			SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 );
-			SimdVector e4 = multiplyAdd( two, e3, e1 );
+    // check all possible clusters for this total order
+    for( int c0 = 0; c0 <= count; c0++)
+    {
+        SimdVector x1 = zero;
 
-			// apply the metric to the error term
-			SimdVector e5 = e4 * m_metricSqr;
-			SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
-			
-			// keep the solution if it wins
-			if( compareAnyLessThan( error, besterror ) )
-			{
-				besterror = error;
-				beststart = a;
-				bestend = b;
-				b0 = c0;
-				b1 = c1;
-			}
-			
-			x1 += m_weighted[c0+c1];
-		}
-		
-		x0 += m_weighted[c0];
+        for( int c1 = 0; c1 <= count-c0; c1++)
+        {
+            SimdVector const x2 = m_xsum - x1 - x0;
+
+            //Vector3 const alphax_sum = x0 + x1 * 0.5f;
+            //float const alpha2_sum = w0 + w1 * 0.25f;
+            SimdVector const alphax_sum = multiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum
+            SimdVector const alpha2_sum = alphax_sum.splatW();
+
+            //Vector3 const betax_sum = x2 + x1 * 0.5f;
+            //float const beta2_sum = w2 + w1 * 0.25f;
+            SimdVector const betax_sum = multiplyAdd(x1, half, x2); // betax_sum, beta2_sum
+            SimdVector const beta2_sum = betax_sum.splatW();
+
+            //float const alphabeta_sum = w1 * 0.25f;
+            SimdVector const alphabeta_sum = (x1 * half).splatW(); // alphabeta_sum
+
+            // float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+            SimdVector const factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
+
+            SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+            SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+
+            // clamp to the grid
+            a = min( one, max( zero, a ) );
+            b = min( one, max( zero, b ) );
+            a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp;
+            b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp;
+
+            // compute the error (we skip the constant xxsum)
+            SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+            SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+            SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 );
+            SimdVector e4 = multiplyAdd( two, e3, e1 );
+
+	    // apply the metric to the error term
+	    SimdVector e5 = e4 * m_metricSqr;
+	    SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
+
+	    // keep the solution if it wins
+	    if( compareAnyLessThan( error, besterror ) )
+	    {
+		besterror = error;
+		beststart = a;
+		bestend = b;
+		b0 = c0;
+		b1 = c1;
+	    }
+
+	    x1 += m_weighted[c0+c1];
 	}
 
-	// save the block if necessary
-	if( compareAnyLessThan( besterror, m_besterror ) )
-	{
-		// compute indices from cluster sizes.
-		/*u8 bestindices[16];
-		{
-			int i = 0;
-			for(; i < b0; i++) {
-				bestindices[i] = 0;
-			}
-			for(; i < b0+b1; i++) {
-				bestindices[i] = 2;
-			}
-			for(; i < count; i++) {
-				bestindices[i] = 1;
-			}
-		}
-		
-		// remap the indices
-		u8 ordered[16];
-		for( int i = 0; i < count; ++i )
-			ordered[m_order[i]] = bestindices[i];
-		
-		m_colours->RemapIndices( ordered, bestindices );
+        x0 += m_weighted[c0];
+    }
 
-		// save the block
-		WriteColourBlock3( beststart.toVector3(), bestend.toVector3(), bestindices, block );*/
+    // save the block if necessary
+    if( compareAnyLessThan( besterror, m_besterror ) )
+    {
 
         *start = beststart.toVector3();
         *end = bestend.toVector3();
 
-		// save the error
-		m_besterror = besterror;
+	// save the error
+	m_besterror = besterror;
 
         return true;
-	}
+    }
 
     return false;
 }
 
 bool ClusterFit::compress4( Vector3 * start, Vector3 * end )
 {
-    int const count = m_colours->count;
-	SimdVector const one = SimdVector(1.0f);
-	SimdVector const zero = SimdVector(0.0f);
-	SimdVector const half = SimdVector(0.5f);
-	SimdVector const two = SimdVector(2.0);
-	SimdVector const onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f );
-	SimdVector const twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f );
+    int const count = m_count;
+    SimdVector const one = SimdVector(1.0f);
+    SimdVector const zero = SimdVector(0.0f);
+    SimdVector const half = SimdVector(0.5f);
+    SimdVector const two = SimdVector(2.0);
+    SimdVector const onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f );
+    SimdVector const twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f );
     SimdVector const twonineths = SimdVector( 2.0f/9.0f );
-	SimdVector const grid( 31.0f, 63.0f, 31.0f, 0.0f );
-	SimdVector const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
-	
-	// declare variables
-	SimdVector beststart = SimdVector( 0.0f );
-	SimdVector bestend = SimdVector( 0.0f );
-	SimdVector besterror = SimdVector( FLT_MAX );
+    SimdVector const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+    SimdVector const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
 
-	SimdVector x0 = zero;
-	int b0 = 0, b1 = 0, b2 = 0;
+    // declare variables
+    SimdVector beststart = SimdVector( 0.0f );
+    SimdVector bestend = SimdVector( 0.0f );
+    SimdVector besterror = SimdVector( FLT_MAX );
 
-	// check all possible clusters for this total order
-	for( int c0 = 0; c0 <= count; c0++)
-	{	
-		SimdVector x1 = zero;
-		
-		for( int c1 = 0; c1 <= count-c0; c1++)
-		{	
-			SimdVector x2 = zero;
-			
-			for( int c2 = 0; c2 <= count-c0-c1; c2++)
-			{
-				SimdVector const x3 = m_xsum - x2 - x1 - x0;
-				
-				//Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
-				//float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
+    SimdVector x0 = zero;
+    int b0 = 0, b1 = 0, b2 = 0;
+
+    // check all possible clusters for this total order
+    for( int c0 = 0; c0 <= count; c0++)
+    {
+        SimdVector x1 = zero;
+
+        for( int c1 = 0; c1 <= count-c0; c1++)
+        {
+            SimdVector x2 = zero;
+
+            for( int c2 = 0; c2 <= count-c0-c1; c2++)
+            {
+                SimdVector const x3 = m_xsum - x2 - x1 - x0;
+
+                //Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
+                //float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
                 SimdVector const alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum
-				SimdVector const alpha2_sum = alphax_sum.splatW();
-				
-				//Vector3 const betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f);
-				//float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
-				SimdVector const betax_sum = multiplyAdd(x2, twothirds, multiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum
-				SimdVector const beta2_sum = betax_sum.splatW();
-				
-				//float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
-                SimdVector const alphabeta_sum = twonineths*( x1 + x2 ).splatW(); // alphabeta_sum
-				
-				// float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-				SimdVector const factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
-				
-				SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
-				SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
-				
-				// clamp to the grid
-				a = min( one, max( zero, a ) );
-				b = min( one, max( zero, b ) );
-				a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp;
-				b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp;
-				
-				// compute the error (we skip the constant xxsum)
-				SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
-				SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
-				SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 );
-				SimdVector e4 = multiplyAdd( two, e3, e1 );
+                SimdVector const alpha2_sum = alphax_sum.splatW();
 
-				// apply the metric to the error term
-				SimdVector e5 = e4 * m_metricSqr;
-				SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
-				
-				// keep the solution if it wins
-				if( compareAnyLessThan( error, besterror ) )
-				{
-					besterror = error;
-					beststart = a;
-					bestend = b;
-					b0 = c0;
-					b1 = c1;
-					b2 = c2;
-				}
-				
-				x2 += m_weighted[c0+c1+c2];
-			}
-			
-			x1 += m_weighted[c0+c1];
+                //Vector3 const betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f);
+                //float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
+                SimdVector const betax_sum = multiplyAdd(x2, twothirds, multiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum
+                SimdVector const beta2_sum = betax_sum.splatW();
+
+                //float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
+                SimdVector const alphabeta_sum = twonineths*( x1 + x2 ).splatW(); // alphabeta_sum
+
+                // float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+                SimdVector const factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
+
+                SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+                SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+
+                // clamp to the grid
+                a = min( one, max( zero, a ) );
+                b = min( one, max( zero, b ) );
+                a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp;
+                b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp;
+
+                // compute the error (we skip the constant xxsum)
+                SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+                SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+                SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 );
+                SimdVector e4 = multiplyAdd( two, e3, e1 );
+
+		// apply the metric to the error term
+		SimdVector e5 = e4 * m_metricSqr;
+		SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
+
+		// keep the solution if it wins
+		if( compareAnyLessThan( error, besterror ) )
+		{
+		    besterror = error;
+		    beststart = a;
+		    bestend = b;
+		    b0 = c0;
+		    b1 = c1;
+		    b2 = c2;
 		}
-		
-		x0 += m_weighted[c0];
+
+		x2 += m_weighted[c0+c1+c2];
+	    }
+
+	    x1 += m_weighted[c0+c1];
 	}
 
-	// save the block if necessary
-	if( compareAnyLessThan( besterror, m_besterror ) )
-	{
-		/*// compute indices from cluster sizes.
-		u8 bestindices[16];
-		{
-			int i = 0;
-			for(; i < b0; i++) {
-				bestindices[i] = 0;
-			}
-			for(; i < b0+b1; i++) {
-				bestindices[i] = 2;
-			}
-			for(; i < b0+b1+b2; i++) {
-				bestindices[i] = 3;
-			}
-			for(; i < count; i++) {
-				bestindices[i] = 1;
-			}
-		}
-		
-		// remap the indices
-		u8 ordered[16];
-		for( int i = 0; i < count; ++i )
-			ordered[m_order[i]] = bestindices[i];
-		
-        m_colours->RemapIndices( ordered, bestindices );
-
-		// save the block
-		WriteColourBlock4( beststart.toVector3(), bestend.toVector3(), bestindices, block );*/
+        x0 += m_weighted[c0];
+    }
 
+    // save the block if necessary
+    if( compareAnyLessThan( besterror, m_besterror ) )
+    {
         *start = beststart.toVector3();
         *end = bestend.toVector3();
 		
-		// save the error
-		m_besterror = besterror;
+	// save the error
+	m_besterror = besterror;
 
         return true;
-	}
+    }
 
     return false;
 }
@@ -384,231 +331,181 @@ bool ClusterFit::compress4( Vector3 * start, Vector3 * end )
 
 bool ClusterFit::compress3(Vector3 * start, Vector3 * end)
 {
-	const Vector3 one( 1.0f );
-	const Vector3 zero( 0.0f );
-	const Vector3 half( 0.5f );
+    int const count = m_count;
+    const Vector3 one( 1.0f );
+    const Vector3 zero( 0.0f );
+    const Vector3 half( 0.5f );
     const Vector3 grid( 31.0f, 63.0f, 31.0f );
     const Vector3 gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
 
-	// declare variables
-	Vector3 beststart( 0.0f );
-	Vector3 bestend( 0.0f );
-	float besterror = FLT_MAX;
+    // declare variables
+    Vector3 beststart( 0.0f );
+    Vector3 bestend( 0.0f );
+    float besterror = FLT_MAX;
 
-	Vector3 x0(0.0f);
-	float w0 = 0.0f;
-	
-	int b0 = 0, b1 = 0;
+    Vector3 x0(0.0f);
+    float w0 = 0.0f;
 
-	// check all possible clusters for this total order
-	for (uint c0 = 0; c0 <= count; c0++)
-	{	
-		Vector3 x1(0.0f);
-		float w1 = 0.0f;
-		
-		for (uint c1 = 0; c1 <= count-c0; c1++)
-		{	
-			float w2 = m_wsum - w0 - w1;
-			
-			// These factors could be entirely precomputed.
-			float const alpha2_sum = w0 + w1 * 0.25f;
-			float const beta2_sum = w2 + w1 * 0.25f;
-			float const alphabeta_sum = w1 * 0.25f;
-			float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-			
-			Vector3 const alphax_sum = x0 + x1 * 0.5f;
-			Vector3 const betax_sum = m_xsum - alphax_sum;
-			
-			Vector3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor;
-			Vector3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor;
-			
-			// clamp to the grid
-			a = min(one, max(zero, a));
-			b = min(one, max(zero, b));
-			a = floor(grid * a + half) * gridrcp;
-			b = floor(grid * b + half) * gridrcp;
-			
-			// compute the error
-			Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
-			
-			// apply the metric to the error term
-			float error = dot(e1, m_metricSqr);
-			
-			// keep the solution if it wins
-			if (error < besterror)
-			{
-				besterror = error;
-				beststart = a;
-				bestend = b;
-				b0 = c0;
-				b1 = c1;
-			}
-			
-			x1 += m_weighted[c0+c1];
-			w1 += m_weights[c0+c1];
-		}
-		
-		x0 += m_weighted[c0];
-		w0 += m_weights[c0];
-	}
+    int b0 = 0, b1 = 0;
 
-	// save the block if necessary
-	if( besterror < m_besterror )
-	{
-		/*// compute indices from cluster sizes.
-		u8 bestindices[16];
-		{
-			int i = 0;
-			for(; i < b0; i++) {
-				bestindices[i] = 0;
-			}
-			for(; i < b0+b1; i++) {
-				bestindices[i] = 2;
-			}
-			for(; i < count; i++) {
-				bestindices[i] = 1;
-			}
-		}
-		
-		// remap the indices
-		u8 ordered[16];
-		for( int i = 0; i < count; ++i )
-			ordered[m_order[i]] = bestindices[i];
-		
-        m_colours->RemapIndices( ordered, bestindices );
+    // check all possible clusters for this total order
+    for (uint c0 = 0; c0 <= count; c0++)
+    {
+        Vector3 x1(0.0f);
+        float w1 = 0.0f;
 
-		// save the block
-		WriteColourBlock3( beststart, bestend, bestindices, block );*/
+        for (uint c1 = 0; c1 <= count-c0; c1++)
+        {
+            float w2 = m_wsum - w0 - w1;
+
+            // These factors could be entirely precomputed.
+            float const alpha2_sum = w0 + w1 * 0.25f;
+            float const beta2_sum = w2 + w1 * 0.25f;
+            float const alphabeta_sum = w1 * 0.25f;
+            float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+
+            Vector3 const alphax_sum = x0 + x1 * 0.5f;
+            Vector3 const betax_sum = m_xsum - alphax_sum;
+
+            Vector3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor;
+            Vector3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor;
+
+            // clamp to the grid
+            a = min(one, max(zero, a));
+            b = min(one, max(zero, b));
+            a = floor(grid * a + half) * gridrcp;
+            b = floor(grid * b + half) * gridrcp;
+
+            // compute the error
+            Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+
+            // apply the metric to the error term
+            float error = dot(e1, m_metricSqr);
+
+            // keep the solution if it wins
+            if (error < besterror)
+            {
+                besterror = error;
+                beststart = a;
+                bestend = b;
+                b0 = c0;
+                b1 = c1;
+            }
+
+            x1 += m_weighted[c0+c1];
+            w1 += m_weights[c0+c1];
+        }
+
+        x0 += m_weighted[c0];
+        w0 += m_weights[c0];
+    }
+
+    // save the block if necessary
+    if( besterror < m_besterror )
+    {
 		
         *start = beststart;
         *end = bestend;
 
-		// save the error
-		m_besterror = besterror;
+	// save the error
+	m_besterror = besterror;
 
         return true;
-	}
+    }
 
     return false;
 }
 
 bool ClusterFit::compress4(Vector3 * start, Vector3 * end)
 {
-	Vector3 const one( 1.0f );
-	Vector3 const zero( 0.0f );
-	Vector3 const half( 0.5f );
-	Vector3 const grid( 31.0f, 63.0f, 31.0f );
-	Vector3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
+    int const count = m_count;
+    Vector3 const one( 1.0f );
+    Vector3 const zero( 0.0f );
+    Vector3 const half( 0.5f );
+    Vector3 const grid( 31.0f, 63.0f, 31.0f );
+    Vector3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
 
-	// declare variables
-	Vector3 beststart( 0.0f );
-	Vector3 bestend( 0.0f );
-	float besterror = FLT_MAX;
+    // declare variables
+    Vector3 beststart( 0.0f );
+    Vector3 bestend( 0.0f );
+    float besterror = FLT_MAX;
 
-	Vector3 x0(0.0f);
-	float w0 = 0.0f;
-	int b0 = 0, b1 = 0, b2 = 0;
+    Vector3 x0(0.0f);
+    float w0 = 0.0f;
+    int b0 = 0, b1 = 0, b2 = 0;
 
-	// check all possible clusters for this total order
-	for (uint c0 = 0; c0 <= count; c0++)
-	{	
-		Vector3 x1(0.0f);
-		float w1 = 0.0f;
-		
-		for (uint c1 = 0; c1 <= count-c0; c1++)
-		{	
-			Vector3 x2(0.0f);
-			float w2 = 0.0f;
-			
-			for (uint c2 = 0; c2 <= count-c0-c1; c2++)
-			{
-				float w3 = m_wsum - w0 - w1 - w2;
-				
-				float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
-				float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
-				float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
-				float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-				
-				Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
-				Vector3 const betax_sum = m_xsum - alphax_sum;
-				
-				Vector3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor;
-				Vector3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
-				
-				// clamp to the grid
-				a = min( one, max( zero, a ) );
-				b = min( one, max( zero, b ) );
-				a = floor( grid*a + half )*gridrcp;
-				b = floor( grid*b + half )*gridrcp;
-				
-				// compute the error
-				Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
-				
-				// apply the metric to the error term
-				float error = dot( e1, m_metricSqr );
-				
-				// keep the solution if it wins
-				if( error < besterror )
-				{
-					besterror = error;
-					beststart = a;
-					bestend = b;
-					b0 = c0;
-					b1 = c1;
-					b2 = c2;
-				}
-				
-				x2 += m_weighted[c0+c1+c2];
-				w2 += m_weights[c0+c1+c2];
-			}
-			
-			x1 += m_weighted[c0+c1];
-			w1 += m_weights[c0+c1];
-		}
-		
-		x0 += m_weighted[c0];
-		w0 += m_weights[c0];
-	}
+    // check all possible clusters for this total order
+    for (uint c0 = 0; c0 <= count; c0++)
+    {
+        Vector3 x1(0.0f);
+        float w1 = 0.0f;
 
-	// save the block if necessary
-	if( besterror < m_besterror )
-	{
-		/*// compute indices from cluster sizes.
-		u8 bestindices[16];
-		{
-			int i = 0;
-			for(; i < b0; i++) {
-				bestindices[i] = 0;
-			}
-			for(; i < b0+b1; i++) {
-				bestindices[i] = 2;
-			}
-			for(; i < b0+b1+b2; i++) {
-				bestindices[i] = 3;
-			}
-			for(; i < count; i++) {
-				bestindices[i] = 1;
-			}
-		}
-		
-		// remap the indices
-		u8 ordered[16];
-		for( int i = 0; i < count; ++i )
-			ordered[m_order[i]] = bestindices[i];
+        for (uint c1 = 0; c1 <= count-c0; c1++)
+        {
+            Vector3 x2(0.0f);
+            float w2 = 0.0f;
 
-        m_colours->RemapIndices( ordered, bestindices );
-		
-		// save the block
-		WriteColourBlock4( beststart, bestend, bestindices, block );*/
+            for (uint c2 = 0; c2 <= count-c0-c1; c2++)
+            {
+                float w3 = m_wsum - w0 - w1 - w2;
 
+                float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
+                float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
+                float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
+                float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+
+                Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
+                Vector3 const betax_sum = m_xsum - alphax_sum;
+
+                Vector3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor;
+                Vector3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
+
+                // clamp to the grid
+                a = min( one, max( zero, a ) );
+                b = min( one, max( zero, b ) );
+                a = floor( grid*a + half )*gridrcp;
+                b = floor( grid*b + half )*gridrcp;
+
+                // compute the error
+                Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+
+                // apply the metric to the error term
+                float error = dot( e1, m_metricSqr );
+
+                // keep the solution if it wins
+                if( error < besterror )
+                {
+                    besterror = error;
+                    beststart = a;
+                    bestend = b;
+                    b0 = c0;
+                    b1 = c1;
+                    b2 = c2;
+                }
+
+                x2 += m_weighted[c0+c1+c2];
+                w2 += m_weights[c0+c1+c2];
+            }
+
+            x1 += m_weighted[c0+c1];
+            w1 += m_weights[c0+c1];
+        }
+
+        x0 += m_weighted[c0];
+        w0 += m_weights[c0];
+    }
+
+    // save the block if necessary
+    if( besterror < m_besterror )
+    {
         *start = beststart;
         *end = bestend;
 
-		// save the error
-		m_besterror = besterror;
+	// save the error
+	m_besterror = besterror;
 
         return true;
-	}
+    }
 
     return false;
 }
diff --git a/src/nvtt/ClusterFit.h b/src/nvtt/ClusterFit.h
index af3f84d..e023c66 100644
--- a/src/nvtt/ClusterFit.h
+++ b/src/nvtt/ClusterFit.h
@@ -1,26 +1,26 @@
 /* -----------------------------------------------------------------------------
 
-	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
-	Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
+    Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+    Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
 
-	Permission is hereby granted, free of charge, to any person obtaining
-	a copy of this software and associated documentation files (the 
-	"Software"), to	deal in the Software without restriction, including
-	without limitation the rights to use, copy, modify, merge, publish,
-	distribute, sublicense, and/or sell copies of the Software, and to 
-	permit persons to whom the Software is furnished to do so, subject to 
-	the following conditions:
+    Permission is hereby granted, free of charge, to any person obtaining
+    a copy of this software and associated documentation files (the
+    "Software"), to	deal in the Software without restriction, including
+    without limitation the rights to use, copy, modify, merge, publish,
+    distribute, sublicense, and/or sell copies of the Software, and to
+    permit persons to whom the Software is furnished to do so, subject to
+    the following conditions:
 
-	The above copyright notice and this permission notice shall be included
-	in all copies or substantial portions of the Software.
+    The above copyright notice and this permission notice shall be included
+    in all copies or substantial portions of the Software.
 
-	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
-	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
-	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
-	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
-	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 	
    -------------------------------------------------------------------------- */
    
@@ -39,42 +39,37 @@ namespace nv {
     class ClusterFit
     {
     public:
-	    ClusterFit();
+        ClusterFit();
+
+        void setColourSet(const ColorSet * set);
 
-	    void setColourSet(const ColorSet * set);
-    	
         void setMetric(Vector4::Arg w);
-	    float bestError() const;
+        float bestError() const;
+
+        bool compress3(Vector3 * start, Vector3 * end);
+        bool compress4(Vector3 * start, Vector3 * end);
 
-	    bool compress3(Vector3 * start, Vector3 * end);
-	    bool compress4(Vector3 * start, Vector3 * end);
-    	
     private:
 
-        uint count;
-	    //ColorSet const* m_colours;
-
-        Vector3 m_principle;
+        uint m_count;
 
     #if NVTT_USE_SIMD
-        SimdVector m_weighted[16];
-	    SimdVector m_metric;
-	    SimdVector m_metricSqr;
-	    SimdVector m_xxsum;
-	    SimdVector m_xsum;
-	    SimdVector m_besterror;
+        SimdVector m_weighted[16];  // color | weight
+        SimdVector m_metric;        // vec3
+        SimdVector m_metricSqr;     // vec3
+        SimdVector m_xxsum;         // color | weight
+        SimdVector m_xsum;          // color | weight (wsum)
+        SimdVector m_besterror;     // scalar
     #else
-	    Vector3 m_weighted[16];
-	    float m_weights[16];
-	    Vector3 m_metric;
-	    Vector3 m_metricSqr;
-	    Vector3 m_xxsum;
-	    Vector3 m_xsum;
-	    float m_wsum;
-	    float m_besterror;
+        Vector3 m_weighted[16];
+        float m_weights[16];
+        Vector3 m_metric;
+        Vector3 m_metricSqr;
+        Vector3 m_xxsum;
+        Vector3 m_xsum;
+        float m_wsum;
+        float m_besterror;
     #endif
-
-	    int m_order[16];
     };
 
 } // nv namespace