diff --git a/src/nvtt/CompressionOptions.cpp b/src/nvtt/CompressionOptions.cpp
index 0cf2113..8fb251c 100644
--- a/src/nvtt/CompressionOptions.cpp
+++ b/src/nvtt/CompressionOptions.cpp
@@ -76,13 +76,11 @@ void CompressionOptions::setQuality(Quality quality, float errorThreshold /*= 0.
 /// The choice for these values is subjective. In many case uniform color weights 
 /// (1.0, 1.0, 1.0) work very well. A popular choice is to use the NTSC luma encoding 
 /// weights (0.2126, 0.7152, 0.0722), but I think that blue contributes to our 
-/// perception more than a 7%. A better choice in my opinion is (3, 4, 2). Ideally
-/// the compressor should use a non linear colour metric as described here:
-/// http://www.compuphase.com/cmetric.htm
+/// perception more than 7%. A better choice in my opinion is (3, 4, 2).
 void CompressionOptions::setColorWeights(float red, float green, float blue)
 {
 	float total = red + green + blue;
-	float x = blue / total;
+	float x = red / total;
 	float y = green / total;
 
 	m.colorWeight.set(x, y, 1.0f - x - y);
diff --git a/src/nvtt/cuda/CompressKernel.cu b/src/nvtt/cuda/CompressKernel.cu
index 4db1141..9d805e6 100644
--- a/src/nvtt/cuda/CompressKernel.cu
+++ b/src/nvtt/cuda/CompressKernel.cu
@@ -49,6 +49,7 @@ __device__ inline void swap(T & a, T & b)
 }
 
 __constant__ float3 kColorMetric = { 1.0f, 1.0f, 1.0f };
+__constant__ float3 kColorMetricSqr = { 1.0f, 1.0f, 1.0f };
 
 
 
@@ -121,7 +122,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 
 		// Sort colors along the best fit line.
 		colorSums(colors, sums);
-		float3 axis = bestFitLine(colors, sums[0]);
+		float3 axis = bestFitLine(colors, sums[0], kColorMetric);
 		
 		dps[idx] = dot(colors[idx], axis);
 		
@@ -164,7 +165,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 
 		// Sort colors along the best fit line.
 		colorSums(colors, sums);
-		float3 axis = bestFitLine(colors, sums[0]);
+		float3 axis = bestFitLine(colors, sums[0], kColorMetric);
 
 		dps[idx] = dot(rawColors[idx], axis);
 		
@@ -239,7 +240,7 @@ __device__ float evalPermutation4(const float3 * colors, uint permutation, ushor
     // compute the error
     float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
 
-    return dot(e, kColorMetric);
+    return dot(e, kColorMetricSqr);
 }
 
 __device__ float evalPermutation3(const float3 * colors, uint permutation, ushort * start, ushort * end)
@@ -279,7 +280,7 @@ __device__ float evalPermutation3(const float3 * colors, uint permutation, ushor
     // compute the error
     float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
 
-    return dot(e, kColorMetric);
+    return dot(e, kColorMetricSqr);
 }
 
 __constant__ float alphaTable4[4] = { 9.0f, 0.0f, 6.0f, 3.0f };
@@ -320,7 +321,7 @@ __device__ float evalPermutation4(const float3 * colors, float3 color_sum, uint
     // compute the error
     float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
 
-    return (1.0f / 9.0f) * dot(e, kColorMetric);
+    return (1.0f / 9.0f) * dot(e, kColorMetricSqr);
 }
 
 __device__ float evalPermutation3(const float3 * colors, float3 color_sum, uint permutation, ushort * start, ushort * end)
@@ -356,7 +357,7 @@ __device__ float evalPermutation3(const float3 * colors, float3 color_sum, uint
     // compute the error
     float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
 
-    return (1.0f / 4.0f) * dot(e, kColorMetric);
+    return (1.0f / 4.0f) * dot(e, kColorMetricSqr);
 }
 
 __device__ float evalPermutation4(const float3 * colors, const float * weights, float3 color_sum, uint permutation, ushort * start, ushort * end)
@@ -396,7 +397,7 @@ __device__ float evalPermutation4(const float3 * colors, const float * weights,
     // compute the error
     float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
 
-    return dot(e, kColorMetric);
+    return dot(e, kColorMetricSqr);
 }
 
 /*
@@ -437,7 +438,7 @@ __device__ float evalPermutation3(const float3 * colors, const float * weights,
 	// compute the error
 	float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
 
-	return dot(e, kColorMetric);
+	return dot(e, kColorMetricSqr);
 }
 */
 
@@ -963,6 +964,15 @@ extern "C" void setupCompressKernel(const float weights[3])
 {
 	// Set constants.
 	cudaMemcpyToSymbol(kColorMetric, weights, sizeof(float) * 3, 0);
+
+	// The evalPermutation* error terms are per-channel squared differences, so
+	// they are weighted by the squared metric: upload weightsSqr, not weights.
+	float weightsSqr[3];
+	weightsSqr[0] = weights[0] * weights[0];
+	weightsSqr[1] = weights[1] * weights[1];
+	weightsSqr[2] = weights[2] * weights[2];
+
+	cudaMemcpyToSymbol(kColorMetricSqr, weightsSqr, sizeof(float) * 3, 0);
 }
 
 
diff --git a/src/nvtt/cuda/CudaMath.h b/src/nvtt/cuda/CudaMath.h
index 363b7b5..ecb8f4c 100644
--- a/src/nvtt/cuda/CudaMath.h
+++ b/src/nvtt/cuda/CudaMath.h
@@ -166,14 +166,14 @@ inline __device__ void colorSums(const float3 * colors, float3 * sums)
 #endif
 }
 
-inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum)
+inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum, float3 colorMetric)
 {
 	// Compute covariance matrix of the given colors.
 #if __DEVICE_EMULATION__
 	float covariance[6] = {0, 0, 0, 0, 0, 0};
 	for (int i = 0; i < 16; i++)
 	{
-		float3 a = colors[i] - color_sum * (1.0f / 16.0f);
+		float3 a = (colors[i] - color_sum * (1.0f / 16.0f)) * colorMetric;
 		covariance[0] += a.x * a.x;
 		covariance[1] += a.x * a.y;
 		covariance[2] += a.x * a.z;
@@ -185,7 +185,7 @@ inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum)
 
 	const int idx = threadIdx.x;
 
-	float3 diff = colors[idx] - color_sum * (1.0f / 16.0f);
+	float3 diff = (colors[idx] - color_sum * (1.0f / 16.0f)) * colorMetric;
 
 	// @@ Eliminate two-way bank conflicts here.
 	// @@ It seems that doing that and unrolling the reduction doesn't help...
diff --git a/src/nvtt/squish/clusterfit.cpp b/src/nvtt/squish/clusterfit.cpp
index 9f0b51d..3f4fad1 100644
--- a/src/nvtt/squish/clusterfit.cpp
+++ b/src/nvtt/squish/clusterfit.cpp
@@ -36,30 +36,18 @@ ClusterFit::ClusterFit( ColourSet const* colours, int flags )
 	// initialise the best error
 #if SQUISH_USE_SIMD
 	m_besterror = VEC4_CONST( FLT_MAX );
+	Vec3 metric = m_metric.GetVec3();
 #else
 	m_besterror = FLT_MAX;
+	Vec3 metric = m_metric;
 #endif
 
-/*	// initialise the metric
-	bool perceptual = ( ( m_flags & kColourMetricPerceptual ) != 0 );
-#if SQUISH_USE_SIMD
-	if( perceptual )
-		m_metric = Vec4( 0.2126f, 0.7152f, 0.0722f, 0.0f );
-	else
-		m_metric = VEC4_CONST( 1.0f );	
-#else
-	if( perceptual )
-		m_metric = Vec3( 0.2126f, 0.7152f, 0.0722f );
-	else
-		m_metric = Vec3( 1.0f );
-#endif
-*/
 	// cache some values
 	int const count = m_colours->GetCount();
 	Vec3 const* values = m_colours->GetPoints();
 	
 	// get the covariance matrix
-	Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights() );
+	Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights(), metric );
 	
 	// compute the principle component
 	Vec3 principle = ComputePrincipleComponent( covariance );
diff --git a/src/nvtt/squish/fastclusterfit.cpp b/src/nvtt/squish/fastclusterfit.cpp
index 60c5e7d..c2e3103 100644
--- a/src/nvtt/squish/fastclusterfit.cpp
+++ b/src/nvtt/squish/fastclusterfit.cpp
@@ -37,8 +37,10 @@ FastClusterFit::FastClusterFit( ColourSet const* colours, int flags ) :
 	// initialise the best error
 #if SQUISH_USE_SIMD
 	m_besterror = VEC4_CONST( FLT_MAX );
+	Vec3 metric = m_metric.GetVec3();
 #else
 	m_besterror = FLT_MAX;
+	Vec3 metric = m_metric;
 #endif
 
 	// cache some values
@@ -46,7 +48,7 @@ FastClusterFit::FastClusterFit( ColourSet const* colours, int flags ) :
 	Vec3 const* values = m_colours->GetPoints();
 	
 	// get the covariance matrix
-	Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights() );
+	Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights(), metric );
 	
 	// compute the principle component
 	Vec3 principle = ComputePrincipleComponent( covariance );
diff --git a/src/nvtt/squish/maths.cpp b/src/nvtt/squish/maths.cpp
index d1a0051..87b4cd9 100644
--- a/src/nvtt/squish/maths.cpp
+++ b/src/nvtt/squish/maths.cpp
@@ -23,18 +23,12 @@
 	
    -------------------------------------------------------------------------- */
    
-/*! @file
-
-	The symmetric eigensystem solver algorithm is from 
-	http://www.geometrictools.com/Documentation/EigenSymmetric3x3.pdf
-*/
-
 #include "maths.h"
 #include <cfloat>
 
 namespace squish {
 
-Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights )
+Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights, Vec3::Arg metric )
 {
 	// compute the centroid
 	float total = 0.0f;
@@ -50,7 +44,7 @@ Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weight
 	Sym3x3 covariance( 0.0f );
 	for( int i = 0; i < n; ++i )
 	{
-		Vec3 a = points[i] - centroid;
+		Vec3 a = (points[i] - centroid) * metric;
 		Vec3 b = weights[i]*a;
 		
 		covariance[0] += a.X()*b.X();
@@ -65,166 +59,6 @@ Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weight
 	return covariance;
 }
 
-/*
-static Vec3 GetMultiplicity1Evector( Sym3x3 const& matrix, float evalue )
-{
-	// compute M
-	Sym3x3 m;
-	m[0] = matrix[0] - evalue;
-	m[1] = matrix[1];
-	m[2] = matrix[2];
-	m[3] = matrix[3] - evalue;
-	m[4] = matrix[4];
-	m[5] = matrix[5] - evalue;
-
-	// compute U
-	Sym3x3 u;
-	u[0] = m[3]*m[5] - m[4]*m[4];
-	u[1] = m[2]*m[4] - m[1]*m[5];
-	u[2] = m[1]*m[4] - m[2]*m[3];
-	u[3] = m[0]*m[5] - m[2]*m[2];
-	u[4] = m[1]*m[2] - m[4]*m[0];
-	u[5] = m[0]*m[3] - m[1]*m[1];
-
-	// find the largest component
-	float mc = std::fabs( u[0] );
-	int mi = 0;
-	for( int i = 1; i < 6; ++i )
-	{
-		float c = std::fabs( u[i] );
-		if( c > mc )
-		{
-			mc = c;
-			mi = i;
-		}
-	}
-
-	// pick the column with this component
-	switch( mi )
-	{
-	case 0:
-		return Vec3( u[0], u[1], u[2] );
-
-	case 1:
-	case 3:
-		return Vec3( u[1], u[3], u[4] );
-
-	default:
-		return Vec3( u[2], u[4], u[5] );
-	}
-}
-
-static Vec3 GetMultiplicity2Evector( Sym3x3 const& matrix, float evalue )
-{
-	// compute M
-	Sym3x3 m;
-	m[0] = matrix[0] - evalue;
-	m[1] = matrix[1];
-	m[2] = matrix[2];
-	m[3] = matrix[3] - evalue;
-	m[4] = matrix[4];
-	m[5] = matrix[5] - evalue;
-
-	// find the largest component
-	float mc = std::fabs( m[0] );
-	int mi = 0;
-	for( int i = 1; i < 6; ++i )
-	{
-		float c = std::fabs( m[i] );
-		if( c > mc )
-		{
-			mc = c;
-			mi = i;
-		}
-	}
-
-	// pick the first eigenvector based on this index
-	switch( mi )
-	{
-	case 0:
-	case 1:
-		return Vec3( -m[1], m[0], 0.0f );
-
-	case 2:
-		return Vec3( m[2], 0.0f, -m[0] );
-
-	case 3:
-	case 4:
-		return Vec3( 0.0f, -m[4], m[3] );
-
-	default:
-		return Vec3( 0.0f, -m[5], m[4] );
-	}
-}
-
-Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
-{
-	// compute the cubic coefficients
-	float c0 = matrix[0]*matrix[3]*matrix[5] 
-		+ 2.0f*matrix[1]*matrix[2]*matrix[4] 
-		- matrix[0]*matrix[4]*matrix[4] 
-		- matrix[3]*matrix[2]*matrix[2] 
-		- matrix[5]*matrix[1]*matrix[1];
-	float c1 = matrix[0]*matrix[3] + matrix[0]*matrix[5] + matrix[3]*matrix[5]
-		- matrix[1]*matrix[1] - matrix[2]*matrix[2] - matrix[4]*matrix[4];
-	float c2 = matrix[0] + matrix[3] + matrix[5];
-
-	// compute the quadratic coefficients
-	float a = c1 - ( 1.0f/3.0f )*c2*c2;
-	float b = ( -2.0f/27.0f )*c2*c2*c2 + ( 1.0f/3.0f )*c1*c2 - c0;
-
-	// compute the root count check
-	float Q = 0.25f*b*b + ( 1.0f/27.0f )*a*a*a;
-
-	// test the multiplicity
-	if( FLT_EPSILON < Q )
-	{
-		// only one root, which implies we have a multiple of the identity
-        return Vec3( 1.0f );
-	}
-	else if( Q < -FLT_EPSILON )
-	{
-		// three distinct roots
-		float theta = std::atan2( std::sqrt( -Q ), -0.5f*b );
-		float rho = std::sqrt( 0.25f*b*b - Q );
-
-		float rt = std::pow( rho, 1.0f/3.0f );
-		float ct = std::cos( theta/3.0f );
-		float st = std::sin( theta/3.0f );
-
-		float l1 = ( 1.0f/3.0f )*c2 + 2.0f*rt*ct;
-		float l2 = ( 1.0f/3.0f )*c2 - rt*( ct + ( float )sqrt( 3.0f )*st );
-		float l3 = ( 1.0f/3.0f )*c2 - rt*( ct - ( float )sqrt( 3.0f )*st );
-
-		// pick the larger
-		if( std::fabs( l2 ) > std::fabs( l1 ) )
-			l1 = l2;
-		if( std::fabs( l3 ) > std::fabs( l1 ) )
-			l1 = l3;
-
-		// get the eigenvector
-		return GetMultiplicity1Evector( matrix, l1 );
-	}
-	else // if( -FLT_EPSILON <= Q && Q <= FLT_EPSILON )
-	{
-		// two roots
-		float rt;
-		if( b < 0.0f )
-			rt = -std::pow( -0.5f*b, 1.0f/3.0f );
-		else
-			rt = std::pow( 0.5f*b, 1.0f/3.0f );
-		
-		float l1 = ( 1.0f/3.0f )*c2 + rt;		// repeated
-		float l2 = ( 1.0f/3.0f )*c2 - 2.0f*rt;
-		
-		// get the eigenvector
-		if( std::fabs( l1 ) > std::fabs( l2 ) )
-			return GetMultiplicity2Evector( matrix, l1 );
-		else
-			return GetMultiplicity1Evector( matrix, l2 );
-	}
-}
-*/
 
 Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
 {
diff --git a/src/nvtt/squish/maths.h b/src/nvtt/squish/maths.h
index 357d62f..087a889 100644
--- a/src/nvtt/squish/maths.h
+++ b/src/nvtt/squish/maths.h
@@ -231,7 +231,7 @@ private:
 	float m_x[6];
 };
 
-Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights );
+Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights, Vec3::Arg metric );
 Vec3 ComputePrincipleComponent( Sym3x3 const& matrix );
 
 } // namespace squish
diff --git a/src/nvtt/squish/weightedclusterfit.cpp b/src/nvtt/squish/weightedclusterfit.cpp
index 5b4a560..3ad504e 100644
--- a/src/nvtt/squish/weightedclusterfit.cpp
+++ b/src/nvtt/squish/weightedclusterfit.cpp
@@ -38,8 +38,10 @@ WeightedClusterFit::WeightedClusterFit( ColourSet const* colours, int flags ) :
 	// initialise the best error
 #if SQUISH_USE_SIMD
 	m_besterror = VEC4_CONST( FLT_MAX );
+	Vec3 metric = m_metric.GetVec3();
 #else
 	m_besterror = FLT_MAX;
+	Vec3 metric = m_metric;
 #endif
 
 	// cache some values
@@ -47,7 +49,7 @@ WeightedClusterFit::WeightedClusterFit( ColourSet const* colours, int flags ) :
 	Vec3 const* values = m_colours->GetPoints();
 	
 	// get the covariance matrix
-	Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights() );
+	Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights(), metric );
 	
 	// compute the principle component
 	Vec3 principle = ComputePrincipleComponent( covariance );
diff --git a/src/nvtt/tools/compress.cpp b/src/nvtt/tools/compress.cpp
index e5659c8..597049b 100644
--- a/src/nvtt/tools/compress.cpp
+++ b/src/nvtt/tools/compress.cpp
@@ -386,7 +386,15 @@ int main(int argc, char *argv[])
 		//compressionOptions.setQuality(nvtt::Quality_Highest);
 	}
 	compressionOptions.enableHardwareCompression(!nocuda);
-	compressionOptions.setColorWeights(1, 1, 1);
+
+	if (normal)
+	{
+		compressionOptions.setColorWeights(4, 4, 2);
+	}
+	else
+	{
+		compressionOptions.setColorWeights(1, 1, 1);
+	}
 
 	if (externalCompressor != NULL)
 	{