Cleanup clusterfit.

2011-01-19 07:27:04 +00:00
parent 4f098c4ff9
commit 95b5e1decd
2 changed files with 418 additions and 526 deletions
--- a/src/nvtt/ClusterFit.cpp
+++ b/src/nvtt/ClusterFit.cpp
@@ -1,26 +1,26 @@
 /* -----------------------------------------------------------------------------
-	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+    Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
-	Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
+    Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
-	Permission is hereby granted, free of charge, to any person obtaining
+    Permission is hereby granted, free of charge, to any person obtaining
-	a copy of this software and associated documentation files (the 
+    a copy of this software and associated documentation files (the
-	"Software"), to	deal in the Software without restriction, including
+    "Software"), to	deal in the Software without restriction, including
-	without limitation the rights to use, copy, modify, merge, publish,
+    without limitation the rights to use, copy, modify, merge, publish,
-	distribute, sublicense, and/or sell copies of the Software, and to 
+    distribute, sublicense, and/or sell copies of the Software, and to
-	permit persons to whom the Software is furnished to do so, subject to 
+    permit persons to whom the Software is furnished to do so, subject to
-	the following conditions:
+    the following conditions:
-	The above copyright notice and this permission notice shall be included
+    The above copyright notice and this permission notice shall be included
-	in all copies or substantial portions of the Software.
+    in all copies or substantial portions of the Software.
-	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   -------------------------------------------------------------------------- */
@@ -38,91 +38,90 @@ ClusterFit::ClusterFit()
 void ClusterFit::setColourSet(const ColorSet * set)
 {
-	// initialise the best error
+    // initialise the best error
 #if NVTT_USE_SIMD
-	m_besterror = SimdVector( FLT_MAX );
+    m_besterror = SimdVector( FLT_MAX );
-	Vector3 metric = m_metric.toVector3();
+    Vector3 metric = m_metric.toVector3();
 #else
-	m_besterror = FLT_MAX;
+    m_besterror = FLT_MAX;
-	Vector3 metric = m_metric;
+    Vector3 metric = m_metric;
 #endif
-	// cache some values
+    // cache some values
-	count = set->count;
+    m_count = set->count;
    Vector3 values[16];
-    for (uint i = 0; i < count; i++)
+    for (uint i = 0; i < m_count; i++)
    {
        values[i] = set->colors[i].xyz();
    }
-    Vector3 principle = Fit::computePrincipalComponent(count, values, set->weights, metric);
+    Vector3 principle = Fit::computePrincipalComponent(m_count, values, set->weights, metric);
    // build the list of values
    int order[16];
    float dps[16];
    for (uint i = 0; i < m_count; ++i)
    {
        dps[i] = dot(values[i], principle);
        order[i] = i;
    }
-	// build the list of values
+    // stable sort
-	float dps[16];
+    for (uint i = 0; i < m_count; ++i)
-	for (uint i = 0; i < count; ++i)
+    {
-	{
+        for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j)
-		dps[i] = dot(values[i], principle);
+        {
-		m_order[i] = i;
+            swap(dps[j], dps[j - 1]);
-	}
+            swap(order[j], order[j - 1]);
        }
    }
-	// stable sort
+    // weight all the points
 	for (uint i = 0; i < count; ++i)
 	{
 		for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j)
 		{
 			swap( dps[j], dps[j - 1] );
 			swap( m_order[j], m_order[j - 1] );
 		}
 	}
 	// weight all the points
 #if NVTT_USE_SIMD
-	SimdVector const* unweighted = set->GetPointsSimd();
+    m_xxsum = SimdVector( 0.0f );
-	SimdVector const* weights = set->GetWeightsSimd();
+    m_xsum = SimdVector( 0.0f );
 	m_xxsum = SimdVector( 0.0f );
 	m_xsum = SimdVector( 0.0f );
 #else
-	Vector3 const* unweighted = values;
+    m_xsum = Vector3(0.0f);
-	float const* weights = set->weights;
+    m_wsum = 0.0f;
 	m_xxsum = Vector3(0.0f);
 	m_xsum = Vector3(0.0f);
 	m_wsum = 0.0f;	
 #endif
-	for (uint i = 0; i < count; ++i)
+    for (uint i = 0; i < m_count; ++i)
-	{
+    {
-		int p = m_order[i];
+        int p = order[i];
-		m_weighted[i] = weights[p] * unweighted[p];
+#if NVTT_USE_SIMD
-		m_xxsum += m_weighted[i] * m_weighted[i];
+        m_weighted[i] = SimdVector(Vector4(set->weights[p] * values[p], set->weights[p]));
-		m_xsum += m_weighted[i];
+        m_xxsum += m_weighted[i] * m_weighted[i];
-#if !NVTT_USE_SIMD		
+        m_xsum += m_weighted[i];
-		m_weights[i] = weights[p];
+#else
-		m_wsum += m_weights[i];
+        m_weighted[i] = values[p];
        m_xxsum += m_weighted[i] * m_weighted[i];
        m_xsum += m_weighted[i];
        m_weights[i] = set->weights[p];
        m_wsum += m_weights[i];
 #endif
-	}
+    }
 }
 void ClusterFit::setMetric(Vector4::Arg w)
 {
 #if NVTT_USE_SIMD
-	m_metric = SimdVector(w);
+    m_metric = SimdVector(Vector4(w.xyz(), 1));
 #else
-	m_metric = w.xyz();
+    m_metric = w.xyz();
 #endif
-	m_metricSqr = m_metric * m_metric;
+    m_metricSqr = m_metric * m_metric;
 }
 float ClusterFit::bestError() const
 {
 #if NVTT_USE_SIMD
-	SimdVector x = m_xxsum * m_metricSqr;
+    SimdVector x = m_xxsum * m_metricSqr;
-	SimdVector error = m_besterror + x.splatX() + x.splatY() + x.splatZ();
+    SimdVector error = m_besterror + x.splatX() + x.splatY() + x.splatZ();
-	return error.toFloat();
+    return error.toFloat();
 #else
-	return m_besterror + dot(m_xxsum, m_metricSqr);
+    return m_besterror + dot(m_xxsum, m_metricSqr);
 #endif
 }
@@ -131,251 +130,199 @@ float ClusterFit::bestError() const
 bool ClusterFit::compress3( Vector3 * start, Vector3 * end )
 {
-    int const count = m_colours->count;
+    int const count = m_count;
-	SimdVector const one = SimdVector(1.0f);
+    SimdVector const one = SimdVector(1.0f);
-	SimdVector const zero = SimdVector(0.0f);
+    SimdVector const zero = SimdVector(0.0f);
-	SimdVector const half(0.5f, 0.5f, 0.5f, 0.25f);
+    SimdVector const half(0.5f, 0.5f, 0.5f, 0.25f);
-	SimdVector const two = SimdVector(2.0);
+    SimdVector const two = SimdVector(2.0);
-	SimdVector const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+    SimdVector const grid( 31.0f, 63.0f, 31.0f, 0.0f );
-	SimdVector const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
+    SimdVector const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
-	// declare variables
+    // declare variables
-	SimdVector beststart = SimdVector( 0.0f );
+    SimdVector beststart = SimdVector( 0.0f );
-	SimdVector bestend = SimdVector( 0.0f );
+    SimdVector bestend = SimdVector( 0.0f );
-	SimdVector besterror = SimdVector( FLT_MAX );
+    SimdVector besterror = SimdVector( FLT_MAX );
-	SimdVector x0 = zero;
+    SimdVector x0 = zero;
-	int b0 = 0, b1 = 0;
+    int b0 = 0, b1 = 0;
-	// check all possible clusters for this total order
+    // check all possible clusters for this total order
-	for( int c0 = 0; c0 <= count; c0++)
+    for( int c0 = 0; c0 <= count; c0++)
-	{	
+    {
-		SimdVector x1 = zero;
+        SimdVector x1 = zero;
-		for( int c1 = 0; c1 <= count-c0; c1++)
+        for( int c1 = 0; c1 <= count-c0; c1++)
-		{
+        {
-			SimdVector const x2 = m_xsum - x1 - x0;
+            SimdVector const x2 = m_xsum - x1 - x0;
-			//Vector3 const alphax_sum = x0 + x1 * 0.5f;
+            //Vector3 const alphax_sum = x0 + x1 * 0.5f;
-			//float const alpha2_sum = w0 + w1 * 0.25f;
+            //float const alpha2_sum = w0 + w1 * 0.25f;
-			SimdVector const alphax_sum = multiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum
+            SimdVector const alphax_sum = multiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum
-			SimdVector const alpha2_sum = alphax_sum.splatW();
+            SimdVector const alpha2_sum = alphax_sum.splatW();
-			//Vector3 const betax_sum = x2 + x1 * 0.5f;
+            //Vector3 const betax_sum = x2 + x1 * 0.5f;
-			//float const beta2_sum = w2 + w1 * 0.25f;
+            //float const beta2_sum = w2 + w1 * 0.25f;
-			SimdVector const betax_sum = multiplyAdd(x1, half, x2); // betax_sum, beta2_sum
+            SimdVector const betax_sum = multiplyAdd(x1, half, x2); // betax_sum, beta2_sum
-			SimdVector const beta2_sum = betax_sum.splatW();
+            SimdVector const beta2_sum = betax_sum.splatW();
-			//float const alphabeta_sum = w1 * 0.25f;
+            //float const alphabeta_sum = w1 * 0.25f;
-			SimdVector const alphabeta_sum = (x1 * half).splatW(); // alphabeta_sum
+            SimdVector const alphabeta_sum = (x1 * half).splatW(); // alphabeta_sum
-			// float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+            // float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-			SimdVector const factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
+            SimdVector const factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
-			SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+            SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
-			SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+            SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
-			// clamp to the grid
+            // clamp to the grid
-			a = min( one, max( zero, a ) );
+            a = min( one, max( zero, a ) );
-			b = min( one, max( zero, b ) );
+            b = min( one, max( zero, b ) );
-			a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp;
+            a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp;
-			b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp;
+            b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp;
-			// compute the error (we skip the constant xxsum)
+            // compute the error (we skip the constant xxsum)
-			SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+            SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
-			SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+            SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
-			SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 );
+            SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 );
-			SimdVector e4 = multiplyAdd( two, e3, e1 );
+            SimdVector e4 = multiplyAdd( two, e3, e1 );
-			// apply the metric to the error term
+	    // apply the metric to the error term
-			SimdVector e5 = e4 * m_metricSqr;
+	    SimdVector e5 = e4 * m_metricSqr;
-			SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
+	    SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
-			// keep the solution if it wins
+	    // keep the solution if it wins
-			if( compareAnyLessThan( error, besterror ) )
+	    if( compareAnyLessThan( error, besterror ) )
-			{
+	    {
-				besterror = error;
+		besterror = error;
-				beststart = a;
+		beststart = a;
-				bestend = b;
+		bestend = b;
-				b0 = c0;
+		b0 = c0;
-				b1 = c1;
+		b1 = c1;
-			}
+	    }
-			x1 += m_weighted[c0+c1];
+	    x1 += m_weighted[c0+c1];
 		}
 		x0 += m_weighted[c0];
 	}
-	// save the block if necessary
+        x0 += m_weighted[c0];
-	if( compareAnyLessThan( besterror, m_besterror ) )
+    }
 	{
 		// compute indices from cluster sizes.
 		/*u8 bestindices[16];
 		{
 			int i = 0;
 			for(; i < b0; i++) {
 				bestindices[i] = 0;
 			}
 			for(; i < b0+b1; i++) {
 				bestindices[i] = 2;
 			}
 			for(; i < count; i++) {
 				bestindices[i] = 1;
 			}
 		}
-		// remap the indices
+    // save the block if necessary
-		u8 ordered[16];
+    if( compareAnyLessThan( besterror, m_besterror ) )
-		for( int i = 0; i < count; ++i )
+    {
 			ordered[m_order[i]] = bestindices[i];
 		m_colours->RemapIndices( ordered, bestindices );
 		// save the block
 		WriteColourBlock3( beststart.toVector3(), bestend.toVector3(), bestindices, block );*/
        *start = beststart.toVector3();
        *end = bestend.toVector3();
-		// save the error
+	// save the error
-		m_besterror = besterror;
+	m_besterror = besterror;
        return true;
-	}
+    }
    return false;
 }
 bool ClusterFit::compress4( Vector3 * start, Vector3 * end )
 {
-    int const count = m_colours->count;
+    int const count = m_count;
-	SimdVector const one = SimdVector(1.0f);
+    SimdVector const one = SimdVector(1.0f);
-	SimdVector const zero = SimdVector(0.0f);
+    SimdVector const zero = SimdVector(0.0f);
-	SimdVector const half = SimdVector(0.5f);
+    SimdVector const half = SimdVector(0.5f);
-	SimdVector const two = SimdVector(2.0);
+    SimdVector const two = SimdVector(2.0);
-	SimdVector const onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f );
+    SimdVector const onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f );
-	SimdVector const twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f );
+    SimdVector const twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f );
    SimdVector const twonineths = SimdVector( 2.0f/9.0f );
-	SimdVector const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+    SimdVector const grid( 31.0f, 63.0f, 31.0f, 0.0f );
-	SimdVector const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
+    SimdVector const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
-	// declare variables
+    // declare variables
-	SimdVector beststart = SimdVector( 0.0f );
+    SimdVector beststart = SimdVector( 0.0f );
-	SimdVector bestend = SimdVector( 0.0f );
+    SimdVector bestend = SimdVector( 0.0f );
-	SimdVector besterror = SimdVector( FLT_MAX );
+    SimdVector besterror = SimdVector( FLT_MAX );
-	SimdVector x0 = zero;
+    SimdVector x0 = zero;
-	int b0 = 0, b1 = 0, b2 = 0;
+    int b0 = 0, b1 = 0, b2 = 0;
-	// check all possible clusters for this total order
+    // check all possible clusters for this total order
-	for( int c0 = 0; c0 <= count; c0++)
+    for( int c0 = 0; c0 <= count; c0++)
-	{	
+    {
-		SimdVector x1 = zero;
+        SimdVector x1 = zero;
-		for( int c1 = 0; c1 <= count-c0; c1++)
+        for( int c1 = 0; c1 <= count-c0; c1++)
-		{	
+        {
-			SimdVector x2 = zero;
+            SimdVector x2 = zero;
-			for( int c2 = 0; c2 <= count-c0-c1; c2++)
+            for( int c2 = 0; c2 <= count-c0-c1; c2++)
-			{
+            {
-				SimdVector const x3 = m_xsum - x2 - x1 - x0;
+                SimdVector const x3 = m_xsum - x2 - x1 - x0;
-				//Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
+                //Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
-				//float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
+                //float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
                SimdVector const alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum
-				SimdVector const alpha2_sum = alphax_sum.splatW();
+                SimdVector const alpha2_sum = alphax_sum.splatW();
-				//Vector3 const betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f);
+                //Vector3 const betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f);
-				//float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
+                //float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
-				SimdVector const betax_sum = multiplyAdd(x2, twothirds, multiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum
+                SimdVector const betax_sum = multiplyAdd(x2, twothirds, multiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum
-				SimdVector const beta2_sum = betax_sum.splatW();
+                SimdVector const beta2_sum = betax_sum.splatW();
-				//float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
+                //float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
                SimdVector const alphabeta_sum = twonineths*( x1 + x2 ).splatW(); // alphabeta_sum
-				// float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+                // float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-				SimdVector const factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
+                SimdVector const factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
-				SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+                SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
-				SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+                SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
-				// clamp to the grid
+                // clamp to the grid
-				a = min( one, max( zero, a ) );
+                a = min( one, max( zero, a ) );
-				b = min( one, max( zero, b ) );
+                b = min( one, max( zero, b ) );
-				a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp;
+                a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp;
-				b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp;
+                b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp;
-				// compute the error (we skip the constant xxsum)
+                // compute the error (we skip the constant xxsum)
-				SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+                SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
-				SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+                SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
-				SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 );
+                SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 );
-				SimdVector e4 = multiplyAdd( two, e3, e1 );
+                SimdVector e4 = multiplyAdd( two, e3, e1 );
-				// apply the metric to the error term
+		// apply the metric to the error term
-				SimdVector e5 = e4 * m_metricSqr;
+		SimdVector e5 = e4 * m_metricSqr;
-				SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
+		SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
-				// keep the solution if it wins
+		// keep the solution if it wins
-				if( compareAnyLessThan( error, besterror ) )
+		if( compareAnyLessThan( error, besterror ) )
-				{
+		{
-					besterror = error;
+		    besterror = error;
-					beststart = a;
+		    beststart = a;
-					bestend = b;
+		    bestend = b;
-					b0 = c0;
+		    b0 = c0;
-					b1 = c1;
+		    b1 = c1;
-					b2 = c2;
+		    b2 = c2;
 				}
 				x2 += m_weighted[c0+c1+c2];
 			}
 			x1 += m_weighted[c0+c1];
 		}
-		x0 += m_weighted[c0];
+		x2 += m_weighted[c0+c1+c2];
 	    }
 	    x1 += m_weighted[c0+c1];
 	}
-	// save the block if necessary
+        x0 += m_weighted[c0];
-	if( compareAnyLessThan( besterror, m_besterror ) )
+    }
 	{
 		/*// compute indices from cluster sizes.
 		u8 bestindices[16];
 		{
 			int i = 0;
 			for(; i < b0; i++) {
 				bestindices[i] = 0;
 			}
 			for(; i < b0+b1; i++) {
 				bestindices[i] = 2;
 			}
 			for(; i < b0+b1+b2; i++) {
 				bestindices[i] = 3;
 			}
 			for(; i < count; i++) {
 				bestindices[i] = 1;
 			}
 		}
 		// remap the indices
 		u8 ordered[16];
 		for( int i = 0; i < count; ++i )
 			ordered[m_order[i]] = bestindices[i];
        m_colours->RemapIndices( ordered, bestindices );
 		// save the block
 		WriteColourBlock4( beststart.toVector3(), bestend.toVector3(), bestindices, block );*/
    // save the block if necessary
    if( compareAnyLessThan( besterror, m_besterror ) )
    {
        *start = beststart.toVector3();
        *end = bestend.toVector3();
-		// save the error
+	// save the error
-		m_besterror = besterror;
+	m_besterror = besterror;
        return true;
-	}
+    }
    return false;
 }
@@ -384,231 +331,181 @@ bool ClusterFit::compress4( Vector3 * start, Vector3 * end )
 bool ClusterFit::compress3(Vector3 * start, Vector3 * end)
 {
-	const Vector3 one( 1.0f );
+    int const count = m_count;
-	const Vector3 zero( 0.0f );
+    const Vector3 one( 1.0f );
-	const Vector3 half( 0.5f );
+    const Vector3 zero( 0.0f );
    const Vector3 half( 0.5f );
    const Vector3 grid( 31.0f, 63.0f, 31.0f );
    const Vector3 gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
-	// declare variables
+    // declare variables
-	Vector3 beststart( 0.0f );
+    Vector3 beststart( 0.0f );
-	Vector3 bestend( 0.0f );
+    Vector3 bestend( 0.0f );
-	float besterror = FLT_MAX;
+    float besterror = FLT_MAX;
-	Vector3 x0(0.0f);
+    Vector3 x0(0.0f);
-	float w0 = 0.0f;
+    float w0 = 0.0f;
-	int b0 = 0, b1 = 0;
+    int b0 = 0, b1 = 0;
-	// check all possible clusters for this total order
+    // check all possible clusters for this total order
-	for (uint c0 = 0; c0 <= count; c0++)
+    for (uint c0 = 0; c0 <= count; c0++)
-	{	
+    {
-		Vector3 x1(0.0f);
+        Vector3 x1(0.0f);
-		float w1 = 0.0f;
+        float w1 = 0.0f;
-		for (uint c1 = 0; c1 <= count-c0; c1++)
+        for (uint c1 = 0; c1 <= count-c0; c1++)
-		{	
+        {
-			float w2 = m_wsum - w0 - w1;
+            float w2 = m_wsum - w0 - w1;
-			// These factors could be entirely precomputed.
+            // These factors could be entirely precomputed.
-			float const alpha2_sum = w0 + w1 * 0.25f;
+            float const alpha2_sum = w0 + w1 * 0.25f;
-			float const beta2_sum = w2 + w1 * 0.25f;
+            float const beta2_sum = w2 + w1 * 0.25f;
-			float const alphabeta_sum = w1 * 0.25f;
+            float const alphabeta_sum = w1 * 0.25f;
-			float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+            float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-			Vector3 const alphax_sum = x0 + x1 * 0.5f;
+            Vector3 const alphax_sum = x0 + x1 * 0.5f;
-			Vector3 const betax_sum = m_xsum - alphax_sum;
+            Vector3 const betax_sum = m_xsum - alphax_sum;
-			Vector3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor;
+            Vector3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor;
-			Vector3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor;
+            Vector3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor;
-			// clamp to the grid
+            // clamp to the grid
-			a = min(one, max(zero, a));
+            a = min(one, max(zero, a));
-			b = min(one, max(zero, b));
+            b = min(one, max(zero, b));
-			a = floor(grid * a + half) * gridrcp;
+            a = floor(grid * a + half) * gridrcp;
-			b = floor(grid * b + half) * gridrcp;
+            b = floor(grid * b + half) * gridrcp;
-			// compute the error
+            // compute the error
-			Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+            Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
-			// apply the metric to the error term
+            // apply the metric to the error term
-			float error = dot(e1, m_metricSqr);
+            float error = dot(e1, m_metricSqr);
-			// keep the solution if it wins
+            // keep the solution if it wins
-			if (error < besterror)
+            if (error < besterror)
-			{
+            {
-				besterror = error;
+                besterror = error;
-				beststart = a;
+                beststart = a;
-				bestend = b;
+                bestend = b;
-				b0 = c0;
+                b0 = c0;
-				b1 = c1;
+                b1 = c1;
-			}
+            }
-			x1 += m_weighted[c0+c1];
+            x1 += m_weighted[c0+c1];
-			w1 += m_weights[c0+c1];
+            w1 += m_weights[c0+c1];
-		}
+        }
-		x0 += m_weighted[c0];
+        x0 += m_weighted[c0];
-		w0 += m_weights[c0];
+        w0 += m_weights[c0];
-	}
+    }
-	// save the block if necessary
+    // save the block if necessary
-	if( besterror < m_besterror )
+    if( besterror < m_besterror )
-	{
+    {
 		/*// compute indices from cluster sizes.
 		u8 bestindices[16];
 		{
 			int i = 0;
 			for(; i < b0; i++) {
 				bestindices[i] = 0;
 			}
 			for(; i < b0+b1; i++) {
 				bestindices[i] = 2;
 			}
 			for(; i < count; i++) {
 				bestindices[i] = 1;
 			}
 		}
 		// remap the indices
 		u8 ordered[16];
 		for( int i = 0; i < count; ++i )
 			ordered[m_order[i]] = bestindices[i];
        m_colours->RemapIndices( ordered, bestindices );
 		// save the block
 		WriteColourBlock3( beststart, bestend, bestindices, block );*/
        *start = beststart;
        *end = bestend;
-		// save the error
+	// save the error
-		m_besterror = besterror;
+	m_besterror = besterror;
        return true;
-	}
+    }
    return false;
 }
 bool ClusterFit::compress4(Vector3 * start, Vector3 * end)
 {
-	Vector3 const one( 1.0f );
+    int const count = m_count;
-	Vector3 const zero( 0.0f );
+    Vector3 const one( 1.0f );
-	Vector3 const half( 0.5f );
+    Vector3 const zero( 0.0f );
-	Vector3 const grid( 31.0f, 63.0f, 31.0f );
+    Vector3 const half( 0.5f );
-	Vector3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
+    Vector3 const grid( 31.0f, 63.0f, 31.0f );
    Vector3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
-	// declare variables
+    // declare variables
-	Vector3 beststart( 0.0f );
+    Vector3 beststart( 0.0f );
-	Vector3 bestend( 0.0f );
+    Vector3 bestend( 0.0f );
-	float besterror = FLT_MAX;
+    float besterror = FLT_MAX;
-	Vector3 x0(0.0f);
+    Vector3 x0(0.0f);
-	float w0 = 0.0f;
+    float w0 = 0.0f;
-	int b0 = 0, b1 = 0, b2 = 0;
+    int b0 = 0, b1 = 0, b2 = 0;
-	// check all possible clusters for this total order
+    // check all possible clusters for this total order
-	for (uint c0 = 0; c0 <= count; c0++)
+    for (uint c0 = 0; c0 <= count; c0++)
-	{	
+    {
-		Vector3 x1(0.0f);
+        Vector3 x1(0.0f);
-		float w1 = 0.0f;
+        float w1 = 0.0f;
-		for (uint c1 = 0; c1 <= count-c0; c1++)
+        for (uint c1 = 0; c1 <= count-c0; c1++)
-		{	
+        {
-			Vector3 x2(0.0f);
+            Vector3 x2(0.0f);
-			float w2 = 0.0f;
+            float w2 = 0.0f;
-			for (uint c2 = 0; c2 <= count-c0-c1; c2++)
+            for (uint c2 = 0; c2 <= count-c0-c1; c2++)
-			{
+            {
-				float w3 = m_wsum - w0 - w1 - w2;
+                float w3 = m_wsum - w0 - w1 - w2;
-				float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
+                float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
-				float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
+                float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
-				float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
+                float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
-				float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+                float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-				Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
+                Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
-				Vector3 const betax_sum = m_xsum - alphax_sum;
+                Vector3 const betax_sum = m_xsum - alphax_sum;
-				Vector3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor;
+                Vector3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor;
-				Vector3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
+                Vector3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
-				// clamp to the grid
+                // clamp to the grid
-				a = min( one, max( zero, a ) );
+                a = min( one, max( zero, a ) );
-				b = min( one, max( zero, b ) );
+                b = min( one, max( zero, b ) );
-				a = floor( grid*a + half )*gridrcp;
+                a = floor( grid*a + half )*gridrcp;
-				b = floor( grid*b + half )*gridrcp;
+                b = floor( grid*b + half )*gridrcp;
-				// compute the error
+                // compute the error
-				Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+                Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
-				// apply the metric to the error term
+                // apply the metric to the error term
-				float error = dot( e1, m_metricSqr );
+                float error = dot( e1, m_metricSqr );
-				// keep the solution if it wins
+                // keep the solution if it wins
-				if( error < besterror )
+                if( error < besterror )
-				{
+                {
-					besterror = error;
+                    besterror = error;
-					beststart = a;
+                    beststart = a;
-					bestend = b;
+                    bestend = b;
-					b0 = c0;
+                    b0 = c0;
-					b1 = c1;
+                    b1 = c1;
-					b2 = c2;
+                    b2 = c2;
-				}
+                }
-				x2 += m_weighted[c0+c1+c2];
+                x2 += m_weighted[c0+c1+c2];
-				w2 += m_weights[c0+c1+c2];
+                w2 += m_weights[c0+c1+c2];
-			}
+            }
-			x1 += m_weighted[c0+c1];
+            x1 += m_weighted[c0+c1];
-			w1 += m_weights[c0+c1];
+            w1 += m_weights[c0+c1];
-		}
+        }
-		x0 += m_weighted[c0];
+        x0 += m_weighted[c0];
-		w0 += m_weights[c0];
+        w0 += m_weights[c0];
-	}
+    }
 	// save the block if necessary
 	if( besterror < m_besterror )
 	{
 		/*// compute indices from cluster sizes.
 		u8 bestindices[16];
 		{
 			int i = 0;
 			for(; i < b0; i++) {
 				bestindices[i] = 0;
 			}
 			for(; i < b0+b1; i++) {
 				bestindices[i] = 2;
 			}
 			for(; i < b0+b1+b2; i++) {
 				bestindices[i] = 3;
 			}
 			for(; i < count; i++) {
 				bestindices[i] = 1;
 			}
 		}
 		// remap the indices
 		u8 ordered[16];
 		for( int i = 0; i < count; ++i )
 			ordered[m_order[i]] = bestindices[i];
        m_colours->RemapIndices( ordered, bestindices );
 		// save the block
 		WriteColourBlock4( beststart, bestend, bestindices, block );*/
    // save the block if necessary
    if( besterror < m_besterror )
    {
        *start = beststart;
        *end = bestend;
-		// save the error
+	// save the error
-		m_besterror = besterror;
+	m_besterror = besterror;
        return true;
-	}
+    }
    return false;
 }
--- a/src/nvtt/ClusterFit.h
+++ b/src/nvtt/ClusterFit.h
@ -1,26 +1,26 @@
 /* -----------------------------------------------------------------------------
-	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+    Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
-	Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
+    Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
-	Permission is hereby granted, free of charge, to any person obtaining
+    Permission is hereby granted, free of charge, to any person obtaining
-	a copy of this software and associated documentation files (the 
+    a copy of this software and associated documentation files (the
-	"Software"), to	deal in the Software without restriction, including
+    "Software"), to	deal in the Software without restriction, including
-	without limitation the rights to use, copy, modify, merge, publish,
+    without limitation the rights to use, copy, modify, merge, publish,
-	distribute, sublicense, and/or sell copies of the Software, and to 
+    distribute, sublicense, and/or sell copies of the Software, and to
-	permit persons to whom the Software is furnished to do so, subject to 
+    permit persons to whom the Software is furnished to do so, subject to
-	the following conditions:
+    the following conditions:
-	The above copyright notice and this permission notice shall be included
+    The above copyright notice and this permission notice shall be included
-	in all copies or substantial portions of the Software.
+    in all copies or substantial portions of the Software.
-	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   -------------------------------------------------------------------------- */
@ -39,42 +39,37 @@ namespace nv {
    class ClusterFit
    {
    public:
-	    ClusterFit();
+        ClusterFit();
-	    void setColourSet(const ColorSet * set);
+        void setColourSet(const ColorSet * set);
        void setMetric(Vector4::Arg w);
-	    float bestError() const;
+        float bestError() const;
-	    bool compress3(Vector3 * start, Vector3 * end);
+        bool compress3(Vector3 * start, Vector3 * end);
-	    bool compress4(Vector3 * start, Vector3 * end);
+        bool compress4(Vector3 * start, Vector3 * end);
    private:
-        uint count;
+        uint m_count;
 	    //ColorSet const* m_colours;
        Vector3 m_principle;
    #if NVTT_USE_SIMD
-        SimdVector m_weighted[16];
+        SimdVector m_weighted[16];  // color | weight
-	    SimdVector m_metric;
+        SimdVector m_metric;        // vec3
-	    SimdVector m_metricSqr;
+        SimdVector m_metricSqr;     // vec3
-	    SimdVector m_xxsum;
+        SimdVector m_xxsum;         // color | weight
-	    SimdVector m_xsum;
+        SimdVector m_xsum;          // color | weight (wsum)
-	    SimdVector m_besterror;
+        SimdVector m_besterror;     // scalar
    #else
-	    Vector3 m_weighted[16];
+        Vector3 m_weighted[16];
-	    float m_weights[16];
+        float m_weights[16];
-	    Vector3 m_metric;
+        Vector3 m_metric;
-	    Vector3 m_metricSqr;
+        Vector3 m_metricSqr;
-	    Vector3 m_xxsum;
+        Vector3 m_xxsum;
-	    Vector3 m_xsum;
+        Vector3 m_xsum;
-	    float m_wsum;
+        float m_wsum;
-	    float m_besterror;
+        float m_besterror;
    #endif
 	    int m_order[16];
    };
 } // nv namespace