diff --git a/src/nvtt/squish/fastclusterfit.cpp b/src/nvtt/squish/fastclusterfit.cpp
index 8ae8ab5..3bb9fbe 100644
--- a/src/nvtt/squish/fastclusterfit.cpp
+++ b/src/nvtt/squish/fastclusterfit.cpp
@@ -129,6 +129,8 @@ void FastClusterFit::Compress3( void* block )
 	Vec4 const zero = VEC4_CONST(0.0f);
 	Vec4 const half = VEC4_CONST(0.5f);
 	Vec4 const two = VEC4_CONST(2.0);
+	Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+	Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
 	 
 	// declare variables
 	Vec4 beststart = VEC4_CONST( 0.0f );
@@ -160,25 +162,22 @@ void FastClusterFit::Compress3( void* block )
 			Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
 			Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
 			
-			// clamp the output to [0, 1]
+			// clamp to the grid
 			a = Min( one, Max( zero, a ) );
 			b = Min( one, Max( zero, b ) );
-			
-			// clamp to the grid
-			Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
-			Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f );
 			a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp;
 			b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp;
 			
-			// compute the error
-			Vec4 e1 = MultiplyAdd( a, alphax_sum, b*betax_sum );
-			Vec4 e2 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
-			Vec4 e3 = MultiplyAdd( a*b*alphabeta_sum - e1, two, e2 );
-			
+			// compute the error (we skip the constant xxsum)
+			Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+			Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+			Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 );
+			Vec4 e4 = MultiplyAdd( two, e3, e1 );
+
 			// apply the metric to the error term
-			Vec4 e4 = e3 * m_metricSqr;
-			Vec4 error = e4.SplatX() + e4.SplatY() + e4.SplatZ();
-			
+			Vec4 e5 = e4 * m_metricSqr;
+			Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();
+
 			// keep the solution if it wins
 			if( CompareAnyLessThan( error, besterror ) )
 			{
@@ -274,7 +273,7 @@ void FastClusterFit::Compress4( void* block )
 				Vec4 const factor = constants.SplatW();
 				i++;
 				
-				Vec4 const alphax_sum = x0 + MultiplyAdd(x1, twothirds, x2 * onethird);
+				Vec4 const alphax_sum = MultiplyAdd(x2, onethird, MultiplyAdd(x1, twothirds, x0));
 				Vec4 const betax_sum = m_xsum - alphax_sum;
 				
 				Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
@@ -286,18 +285,19 @@ void FastClusterFit::Compress4( void* block )
 				
 				// clamp to the grid
 				Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
-				Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f );
+				Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
 				a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp;
 				b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp;
 				
-				// compute the error
-				Vec4 e1 = MultiplyAdd( a, alphax_sum, b*betax_sum );
-				Vec4 e2 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
-				Vec4 e3 = MultiplyAdd( a*b*alphabeta_sum - e1, two, e2 );
-				
+				// compute the error (we skip the constant xxsum)
+				Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+				Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+				Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 );
+				Vec4 e4 = MultiplyAdd( two, e3, e1 );
+
 				// apply the metric to the error term
-				Vec4 e4 = e3 * m_metricSqr;
-				Vec4 error = e4.SplatX() + e4.SplatY() + e4.SplatZ();
+				Vec4 e5 = e4 * m_metricSqr;
+				Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();
 				
 				// keep the solution if it wins
 				if( CompareAnyLessThan( error, besterror ) )
@@ -370,6 +370,12 @@ void FastClusterFit::Compress4( void* block )
 
 void FastClusterFit::Compress3( void* block )
 {
+	Vec3 const one( 1.0f );
+	Vec3 const zero( 0.0f );
+	Vec3 const half( 0.5f );
+	Vec3 const grid( 31.0f, 63.0f, 31.0f );
+	Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
+
 	// declare variables
 	Vec3 beststart( 0.0f );
 	Vec3 bestend( 0.0f );
@@ -399,16 +405,9 @@ void FastClusterFit::Compress3( void* block )
 			Vec3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor;
 			Vec3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor;
 			
-			// clamp the output to [0, 1]
-			Vec3 const one( 1.0f );
-			Vec3 const zero( 0.0f );
+			// clamp to the grid
 			a = Min( one, Max( zero, a ) );
 			b = Min( one, Max( zero, b ) );
-			
-			// clamp to the grid
-			Vec3 const grid( 31.0f, 63.0f, 31.0f );
-			Vec3 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f );
-			Vec3 const half( 0.5f );
 			a = Floor( grid*a + half )*gridrcp;
 			b = Floor( grid*b + half )*gridrcp;
 			
@@ -477,6 +476,12 @@ void FastClusterFit::Compress3( void* block )
 
 void FastClusterFit::Compress4( void* block )
 {
+	Vec3 const one( 1.0f );
+	Vec3 const zero( 0.0f );
+	Vec3 const half( 0.5f );
+	Vec3 const grid( 31.0f, 63.0f, 31.0f );
+	Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
+
 	// declare variables
 	Vec3 beststart( 0.0f );
 	Vec3 bestend( 0.0f );
@@ -511,16 +516,9 @@ void FastClusterFit::Compress4( void* block )
 				Vec3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor;
 				Vec3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
 				
-				// clamp the output to [0, 1]
-				Vec3 const one( 1.0f );
-				Vec3 const zero( 0.0f );
+				// clamp to the grid
 				a = Min( one, Max( zero, a ) );
 				b = Min( one, Max( zero, b ) );
-				
-				// clamp to the grid
-				Vec3 const grid( 31.0f, 63.0f, 31.0f );
-				Vec3 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f );
-				Vec3 const half( 0.5f );
 				a = Floor( grid*a + half )*gridrcp;
 				b = Floor( grid*b + half )*gridrcp;
 				
diff --git a/src/nvtt/squish/maths.cpp b/src/nvtt/squish/maths.cpp
index 87b4cd9..829b45d 100644
--- a/src/nvtt/squish/maths.cpp
+++ b/src/nvtt/squish/maths.cpp
@@ -59,28 +59,199 @@ Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weight
 	return covariance;
 }
 
+#if 1
 
+// Approximates the principal (dominant) eigenvector of the symmetric matrix
+// with NUM power-iteration steps starting from (1, 1, 1). After each step the
+// vector is rescaled by its largest component. Returns Vec3(0.0f) when that
+// scale factor is exactly zero, e.g. for an all-zero matrix.
 Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
 {
 	const int NUM = 8;
 
 	Vec3 v(1, 1, 1);
-	for(int i = 0; i < NUM; i++) {
+	for (int i = 0; i < NUM; i++)
+	{
 		float x = v.X() * matrix[0] + v.Y() * matrix[1] + v.Z() * matrix[2];
 		float y = v.X() * matrix[1] + v.Y() * matrix[3] + v.Z() * matrix[4];
 		float z = v.X() * matrix[2] + v.Y() * matrix[4] + v.Z() * matrix[5];
 		
 		float norm = std::max(std::max(x, y), z);
+
+		// A zero scale factor would make the normalisation below produce
+		// Inf/NaN components, so bail out before dividing by it.
+		if (norm == 0.0f) {
+			return Vec3(0.0f);
+		}
+
 		float iv = 1.0f / norm;
-		if (norm == 0.0f) {		// @@ I think this is not necessary in this case!!
-			return Vec3(0.0f);
-		}
-		
 		v = Vec3(x*iv, y*iv, z*iv);
 	}
 
 	return v;
 }
 
+#else
+// Picks the dominant column of the cofactor matrix of (matrix - evalue*I) as the eigenvector for evalue.
+static Vec3 GetMultiplicity1Evector( Sym3x3 const& matrix, float evalue )
+{
+        // compute M = matrix - evalue*I (entries 0, 3 and 5 are the diagonal)
+        Sym3x3 m;
+        m[0] = matrix[0] - evalue;
+        m[1] = matrix[1];
+        m[2] = matrix[2];
+        m[3] = matrix[3] - evalue;
+        m[4] = matrix[4];
+        m[5] = matrix[5] - evalue;
+
+        // compute U, the cofactors of M (symmetric, so six entries suffice)
+        Sym3x3 u;
+        u[0] = m[3]*m[5] - m[4]*m[4];
+        u[1] = m[2]*m[4] - m[1]*m[5];
+        u[2] = m[1]*m[4] - m[2]*m[3];
+        u[3] = m[0]*m[5] - m[2]*m[2];
+        u[4] = m[1]*m[2] - m[4]*m[0];
+        u[5] = m[0]*m[3] - m[1]*m[1];
+
+        // find the largest-magnitude component of U
+        float mc = std::fabs( u[0] );
+        int mi = 0;
+        for( int i = 1; i < 6; ++i )
+        {
+                float c = std::fabs( u[i] );
+                if( c > mc )
+                {
+                        mc = c;
+                        mi = i;
+                }
+        }
+
+        // pick a column of U containing this component (result is not normalised)
+        switch( mi )
+        {
+        case 0:
+                return Vec3( u[0], u[1], u[2] );
+
+        case 1:
+        case 3:
+                return Vec3( u[1], u[3], u[4] );
+
+        default:
+                return Vec3( u[2], u[4], u[5] );
+        }
+}
+// Returns a vector orthogonal to the dominant row of (matrix - evalue*I); the caller uses it for a repeated evalue.
+static Vec3 GetMultiplicity2Evector( Sym3x3 const& matrix, float evalue )
+{
+        // compute M = matrix - evalue*I (entries 0, 3 and 5 are the diagonal)
+        Sym3x3 m;
+        m[0] = matrix[0] - evalue;
+        m[1] = matrix[1];
+        m[2] = matrix[2];
+        m[3] = matrix[3] - evalue;
+        m[4] = matrix[4];
+        m[5] = matrix[5] - evalue;
+
+        // find the largest-magnitude component of M
+        float mc = std::fabs( m[0] );
+        int mi = 0;
+        for( int i = 1; i < 6; ++i )
+        {
+                float c = std::fabs( m[i] );
+                if( c > mc )
+                {
+                        mc = c;
+                        mi = i;
+                }
+        }
+
+        // pick the first eigenvector: a vector orthogonal to the row of M holding this component
+        switch( mi )
+        {
+        case 0:
+        case 1:
+                return Vec3( -m[1], m[0], 0.0f );
+
+        case 2:
+                return Vec3( m[2], 0.0f, -m[0] );
+
+        case 3:
+        case 4:
+                return Vec3( 0.0f, -m[4], m[3] );
+
+        default:
+                return Vec3( 0.0f, -m[5], m[4] );
+        }
+}
+// Analytic variant (disabled by the #if 1 above): solves the characteristic cubic and returns an eigenvector for its largest-magnitude root.
+Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
+{
+        // compute the cubic coefficients: c0 = determinant, c1 = sum of principal 2x2 minors, c2 = trace
+        float c0 = matrix[0]*matrix[3]*matrix[5] 
+                + 2.0f*matrix[1]*matrix[2]*matrix[4] 
+                - matrix[0]*matrix[4]*matrix[4] 
+                - matrix[3]*matrix[2]*matrix[2] 
+                - matrix[5]*matrix[1]*matrix[1];
+        float c1 = matrix[0]*matrix[3] + matrix[0]*matrix[5] + matrix[3]*matrix[5]
+                - matrix[1]*matrix[1] - matrix[2]*matrix[2] - matrix[4]*matrix[4];
+        float c2 = matrix[0] + matrix[3] + matrix[5];
+
+        // compute the coefficients of the depressed cubic t^3 + a*t + b = 0, where lambda = t + c2/3
+        float a = c1 - ( 1.0f/3.0f )*c2*c2;
+        float b = ( -2.0f/27.0f )*c2*c2*c2 + ( 1.0f/3.0f )*c1*c2 - c0;
+
+        // compute the root count check: the sign of Q selects one of the three cases below
+        float Q = 0.25f*b*b + ( 1.0f/27.0f )*a*a*a;
+
+        // test the multiplicity (Q within FLT_EPSILON of zero is treated as a repeated root)
+        if( FLT_EPSILON < Q )
+        {
+                // only one root, which implies we have a multiple of the identity
+        return Vec3( 1.0f );
+        }
+        else if( Q < -FLT_EPSILON )
+        {
+                // three distinct roots
+                float theta = std::atan2( std::sqrt( -Q ), -0.5f*b );
+                float rho = std::sqrt( 0.25f*b*b - Q );
+
+                float rt = std::pow( rho, 1.0f/3.0f );
+                float ct = std::cos( theta/3.0f );
+                float st = std::sin( theta/3.0f );
+
+                float l1 = ( 1.0f/3.0f )*c2 + 2.0f*rt*ct;
+                float l2 = ( 1.0f/3.0f )*c2 - rt*( ct + ( float )sqrt( 3.0f )*st );
+                float l3 = ( 1.0f/3.0f )*c2 - rt*( ct - ( float )sqrt( 3.0f )*st );
+
+                // pick the root with the largest magnitude
+                if( std::fabs( l2 ) > std::fabs( l1 ) )
+                        l1 = l2;
+                if( std::fabs( l3 ) > std::fabs( l1 ) )
+                        l1 = l3;
+
+                // get the eigenvector
+                return GetMultiplicity1Evector( matrix, l1 );
+        }
+        else // if( -FLT_EPSILON <= Q && Q <= FLT_EPSILON )
+        {
+                // two roots (one of them repeated)
+                float rt;
+                if( b < 0.0f )
+                        rt = -std::pow( -0.5f*b, 1.0f/3.0f );
+                else
+                        rt = std::pow( 0.5f*b, 1.0f/3.0f );
+                
+                float l1 = ( 1.0f/3.0f )*c2 + rt;               // repeated
+                float l2 = ( 1.0f/3.0f )*c2 - 2.0f*rt;
+                
+                // get the eigenvector for whichever root has the larger magnitude
+                if( std::fabs( l1 ) > std::fabs( l2 ) )
+                        return GetMultiplicity2Evector( matrix, l1 );
+                else
+                        return GetMultiplicity1Evector( matrix, l2 );
+        }
+}
+#endif
+
 
 } // namespace squish