From 23bfc1b514dce8bad56cb3eac8fc965abba5c61f Mon Sep 17 00:00:00 2001
From: "castano@gmail.com"
 <castano@gmail.com@95f4ed2b-212e-0410-8b90-d31948207fce>
Date: Sun, 1 Jan 2012 21:29:27 +0000
Subject: [PATCH] Fix errors and enable new cluster fit compressor.

---
 project/vc9/nvconfig.h             |  1 +
 project/vc9/nvimage/nvimage.vcproj |  8 ++--
 project/vc9/squish/squish.vcproj   |  4 ++
 src/nvmath/Fitting.cpp             |  2 +-
 src/nvmath/SimdVector.h            | 13 +++++-
 src/nvmath/SimdVector_SSE.h        |  6 +--
 src/nvtt/ClusterFit.cpp            | 71 ++++++++++++++----------------
 src/nvtt/ClusterFit.h              |  6 ++-
 src/nvtt/CompressorDX9.cpp         |  2 +-
 src/nvtt/CompressorDX9.h           |  2 +-
 src/nvtt/QuickCompressDXT.cpp      | 19 ++++----
 11 files changed, 76 insertions(+), 58 deletions(-)
diff --git a/project/vc9/nvconfig.h b/project/vc9/nvconfig.h
index 0702786..b18fcc8 100644
--- a/project/vc9/nvconfig.h
+++ b/project/vc9/nvconfig.h
@@ -17,5 +17,6 @@
 #define HAVE_JPEG
 #define HAVE_TIFF
 #endif*/
+#define HAVE_STBIMAGE
 
 #endif // NV_CONFIG
diff --git a/project/vc9/nvimage/nvimage.vcproj b/project/vc9/nvimage/nvimage.vcproj
index 314783f..1676b05 100644
--- a/project/vc9/nvimage/nvimage.vcproj
+++ b/project/vc9/nvimage/nvimage.vcproj
@@ -45,7 +45,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="$(GnuWinDir)/include; $(FreeImageDir)"
+				AdditionalIncludeDirectories="..\..\..\extern\stb"
 				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;__SSE2__;__SSE__;__MMX__"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -109,7 +109,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories=""
+				AdditionalIncludeDirectories="..\..\..\extern\stb"
 				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;__SSE2__;__SSE__;__MMX__"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -177,7 +177,7 @@
 				EnableIntrinsicFunctions="true"
 				FavorSizeOrSpeed="0"
 				OmitFramePointers="true"
-				AdditionalIncludeDirectories="$(GnuWinDir)/include; $(FreeImageDir)"
+				AdditionalIncludeDirectories="..\..\..\extern\stb"
 				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;__SSE2__;__SSE__;__MMX__"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -246,7 +246,7 @@
 				EnableIntrinsicFunctions="true"
 				OmitFramePointers="true"
 				WholeProgramOptimization="true"
-				AdditionalIncludeDirectories=""
+				AdditionalIncludeDirectories="..\..\..\extern\stb"
 				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;__SSE2__;__SSE__;__MMX__"
 				StringPooling="true"
 				RuntimeLibrary="2"
diff --git a/project/vc9/squish/squish.vcproj b/project/vc9/squish/squish.vcproj
index 4770aca..7034e8e 100644
--- a/project/vc9/squish/squish.vcproj
+++ b/project/vc9/squish/squish.vcproj
@@ -312,6 +312,10 @@
 			RelativePath="..\..\..\src\nvtt\squish\colourset.h"
 			>
 		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\config.h"
+			>
+		</File>
 		<File
 			RelativePath="..\..\..\src\nvtt\squish\maths.cpp"
 			>
diff --git a/src/nvmath/Fitting.cpp b/src/nvmath/Fitting.cpp
index 98172b4..57c755a 100644
--- a/src/nvmath/Fitting.cpp
+++ b/src/nvmath/Fitting.cpp
@@ -50,7 +50,7 @@ static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matri
         v = Vector3(x, y, z) / norm;
     }
 
-    return v;	
+    return v;
 }
 
 
diff --git a/src/nvmath/SimdVector.h b/src/nvmath/SimdVector.h
index fa164f3..42274c0 100644
--- a/src/nvmath/SimdVector.h
+++ b/src/nvmath/SimdVector.h
@@ -2,18 +2,29 @@
 
 #include "Vector.h" // Vector3, Vector4
 
-
 // Set some reasonable defaults.
 #ifndef NV_USE_ALTIVEC
 #   define NV_USE_ALTIVEC NV_CPU_PPC
+//#   define NV_USE_ALTIVEC defined(__VEC__)
 #endif
 
 #ifndef NV_USE_SSE
 #   if NV_CPU_X86 || NV_CPU_X86_64
 #       define NV_USE_SSE 2
 #   endif
+//#   if defined(__SSE2__)
+//#       define NV_USE_SSE 2
+//#   elif defined(__SSE__)
+//#       define NV_USE_SSE 1
+//#   else
+//#       define NV_USE_SSE 0
+//#   endif
 #endif
 
+// Altivec and SSE are mutually exclusive: fail the build if both are enabled.
+#if NV_USE_ALTIVEC && NV_USE_SSE
+#	error "Cannot enable both altivec and sse!"
+#endif
 
 #if NV_USE_ALTIVEC
 #   include "SimdVector_VE.h"
diff --git a/src/nvmath/SimdVector_SSE.h b/src/nvmath/SimdVector_SSE.h
index 2b8271b..495d4ae 100644
--- a/src/nvmath/SimdVector_SSE.h
+++ b/src/nvmath/SimdVector_SSE.h
@@ -46,10 +46,10 @@ namespace nv {
         explicit SimdVector(float f) : vec(_mm_set1_ps(f)) {}
         explicit SimdVector(__m128 v) : vec(v) {}
 
-        explicit SimdVector(Vector4::Arg v)
+        /*explicit SimdVector(const Vector4 & v)
         {
-            vec = _mm_load_ps( v.component );
-        }
+            vec = _mm_load_ps( v.components );
+        }*/
 
         explicit SimdVector(const float * v)
         {
diff --git a/src/nvtt/ClusterFit.cpp b/src/nvtt/ClusterFit.cpp
index d148fb1..c191c35 100644
--- a/src/nvtt/ClusterFit.cpp
+++ b/src/nvtt/ClusterFit.cpp
@@ -83,6 +83,7 @@ void ClusterFit::setColourSet(const ColorSet * set)
     m_xxsum = SimdVector( 0.0f );
     m_xsum = SimdVector( 0.0f );
 #else
+    m_xxsum = Vector3(0.0f);
     m_xsum = Vector3(0.0f);
     m_wsum = 0.0f;
 #endif
@@ -91,11 +92,12 @@ void ClusterFit::setColourSet(const ColorSet * set)
     {
         int p = order[i];
 #if NVTT_USE_SIMD
-        m_weighted[i] = SimdVector(Vector4(set->weights[p] * values[p], set->weights[p]));
+        Vector4 tmp(values[p] * set->weights[p], set->weights[p]);
+        m_weighted[i] = SimdVector(tmp.component);
         m_xxsum += m_weighted[i] * m_weighted[i];
         m_xsum += m_weighted[i];
 #else
-        m_weighted[i] = values[p];
+        m_weighted[i] = values[p] * set->weights[p];
         m_xxsum += m_weighted[i] * m_weighted[i];
         m_xsum += m_weighted[i];
         m_weights[i] = set->weights[p];
@@ -108,7 +110,8 @@ void ClusterFit::setColourSet(const ColorSet * set)
 void ClusterFit::setMetric(Vector4::Arg w)
 {
 #if NVTT_USE_SIMD
-    m_metric = SimdVector(Vector4(w.xyz(), 1));
+    Vector4 tmp(w.xyz(), 1);
+    m_metric = SimdVector(tmp.component);
 #else
     m_metric = w.xyz();
 #endif
@@ -289,22 +292,22 @@ bool ClusterFit::compress4( Vector3 * start, Vector3 * end )
                 SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 );
                 SimdVector e4 = multiplyAdd( two, e3, e1 );
 
-		// apply the metric to the error term
-		SimdVector e5 = e4 * m_metricSqr;
-		SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
-
-		// keep the solution if it wins
-		if( compareAnyLessThan( error, besterror ) )
-		{
-		    besterror = error;
-		    beststart = a;
-		    bestend = b;
-		    b0 = c0;
-		    b1 = c1;
-		    b2 = c2;
-		}
-
-		x2 += m_weighted[c0+c1+c2];
+                // apply the metric to the error term
+                SimdVector e5 = e4 * m_metricSqr;
+                SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
+
+                // keep the solution if it wins
+                if( compareAnyLessThan( error, besterror ) )
+                {
+                    besterror = error;
+                    beststart = a;
+                    bestend = b;
+                    b0 = c0;
+                    b1 = c1;
+                    b2 = c2;
+                }
+
+                x2 += m_weighted[c0+c1+c2];
 	    }
 
 	    x1 += m_weighted[c0+c1];
@@ -333,9 +336,6 @@ bool ClusterFit::compress4( Vector3 * start, Vector3 * end )
 bool ClusterFit::compress3(Vector3 * start, Vector3 * end)
 {
     const uint count = m_count;
-    const Vector3 one( 1.0f );
-    const Vector3 zero( 0.0f );
-    const Vector3 half( 0.5f );
     const Vector3 grid( 31.0f, 63.0f, 31.0f );
     const Vector3 gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
 
@@ -372,10 +372,10 @@ bool ClusterFit::compress3(Vector3 * start, Vector3 * end)
             Vector3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor;
 
             // clamp to the grid
-            a = min(one, max(zero, a));
-            b = min(one, max(zero, b));
-            a = floor(grid * a + half) * gridrcp;
-            b = floor(grid * b + half) * gridrcp;
+            a = clamp(a, 0, 1);
+            b = clamp(b, 0, 1);
+            a = floor(grid * a + 0.5f) * gridrcp;
+            b = floor(grid * b + 0.5f) * gridrcp;
 
             // compute the error
             Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
@@ -420,9 +420,6 @@ bool ClusterFit::compress3(Vector3 * start, Vector3 * end)
 bool ClusterFit::compress4(Vector3 * start, Vector3 * end)
 {
     const uint count = m_count;
-    Vector3 const one( 1.0f );
-    Vector3 const zero( 0.0f );
-    Vector3 const half( 0.5f );
     Vector3 const grid( 31.0f, 63.0f, 31.0f );
     Vector3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
 
@@ -462,10 +459,10 @@ bool ClusterFit::compress4(Vector3 * start, Vector3 * end)
                 Vector3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
 
                 // clamp to the grid
-                a = min( one, max( zero, a ) );
-                b = min( one, max( zero, b ) );
-                a = floor( grid*a + half )*gridrcp;
-                b = floor( grid*b + half )*gridrcp;
+                a = clamp(a, 0, 1);
+                b = clamp(b, 0, 1);
+                a = floor(a * grid + 0.5f) * gridrcp;
+                b = floor(b * grid + 0.5f) * gridrcp;
 
                 // compute the error
                 Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
@@ -474,7 +471,7 @@ bool ClusterFit::compress4(Vector3 * start, Vector3 * end)
                 float error = dot( e1, m_metricSqr );
 
                 // keep the solution if it wins
-                if( error < besterror )
+                if (error < besterror)
                 {
                     besterror = error;
                     beststart = a;
@@ -497,13 +494,13 @@ bool ClusterFit::compress4(Vector3 * start, Vector3 * end)
     }
 
     // save the block if necessary
-    if( besterror < m_besterror )
+    if (besterror < m_besterror)
     {
         *start = beststart;
         *end = bestend;
 
-	// save the error
-	m_besterror = besterror;
+        // save the error
+        m_besterror = besterror;
 
         return true;
     }
diff --git a/src/nvtt/ClusterFit.h b/src/nvtt/ClusterFit.h
index e023c66..6e85217 100644
--- a/src/nvtt/ClusterFit.h
+++ b/src/nvtt/ClusterFit.h
@@ -27,11 +27,13 @@
 #ifndef NVTT_CLUSTERFIT_H
 #define NVTT_CLUSTERFIT_H
 
-#define NVTT_USE_SIMD 0
-
 #include "nvmath/SimdVector.h"
 #include "nvmath/Vector.h"
 
+// Intended to use the SIMD version if altivec or SSE are available; currently forced off.
+//#define NVTT_USE_SIMD (NV_USE_ALTIVEC || NV_USE_SSE)
+#define NVTT_USE_SIMD 0
+
 namespace nv {
 
     struct ColorSet;
diff --git a/src/nvtt/CompressorDX9.cpp b/src/nvtt/CompressorDX9.cpp
index 10c74d9..60e8611 100644
--- a/src/nvtt/CompressorDX9.cpp
+++ b/src/nvtt/CompressorDX9.cpp
@@ -109,7 +109,7 @@ void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alpha
 	QuickCompress::compressDXT5(rgba, block);
 }
 
-#if 0
+#if 1
 void NormalCompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
     set.setUniformWeights();
diff --git a/src/nvtt/CompressorDX9.h b/src/nvtt/CompressorDX9.h
index 9f81e14..4ff8c10 100644
--- a/src/nvtt/CompressorDX9.h
+++ b/src/nvtt/CompressorDX9.h
@@ -64,7 +64,7 @@ namespace nv
 
 
     // Normal CPU compressors.
-#if 0
+#if 1
     struct NormalCompressorDXT1 : public ColorSetCompressor
     {
         virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
diff --git a/src/nvtt/QuickCompressDXT.cpp b/src/nvtt/QuickCompressDXT.cpp
index 5659cef..b6e788a 100644
--- a/src/nvtt/QuickCompressDXT.cpp
+++ b/src/nvtt/QuickCompressDXT.cpp
@@ -115,6 +115,7 @@ inline static void insetBBox(Vector3 * restrict maxColor, Vector3 * restrict min
 	*minColor = clamp(*minColor + inset, 0.0f, 255.0f);
 }
 
+// Takes a color in [0, 255] range and returns it rounded and packed as a 16-bit color.
 inline static uint16 roundAndExpand(Vector3 * restrict v)
 {
 	uint r = uint(clamp(v->x * (31.0f / 255.0f), 0.0f, 31.0f) + 0.5f);
@@ -168,6 +169,7 @@ inline static uint computeIndices4(const Vector3 block[16], Vector3::Arg maxColo
 	return indices;
 }
 
+// maxColor and minColor are expected to be in the same range as the color set.
 inline static uint computeIndices4(const ColorSet & set, Vector3::Arg maxColor, Vector3::Arg minColor)
 {
 	Vector3 palette[4];
@@ -224,6 +226,7 @@ inline static float evaluatePaletteError4(const Vector3 block[16], Vector3::Arg
 	return total;
 }
 
+// maxColor and minColor are expected to be in the same range as the color set.
 inline static uint computeIndices3(const ColorSet & set, Vector3::Arg maxColor, Vector3::Arg minColor)
 {
 	Vector3 palette[4];
@@ -702,8 +705,8 @@ void QuickCompress::compressDXT5(const ColorBlock & rgba, BlockDXT5 * dxtBlock,
 
 void QuickCompress::outputBlock4(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block)
 {
-    Vector3 maxColor = start * 255;
-    Vector3 minColor = end * 255;
+    Vector3 minColor = start * 255;
+    Vector3 maxColor = end * 255;
 	uint16 color0 = roundAndExpand(&maxColor);
 	uint16 color1 = roundAndExpand(&minColor);
 
@@ -715,17 +718,17 @@ void QuickCompress::outputBlock4(const ColorSet & set, const Vector3 & start, co
 
 	block->col0 = Color16(color0);
 	block->col1 = Color16(color1);
-	block->indices = computeIndices4(set, maxColor, minColor);
+	block->indices = computeIndices4(set, maxColor / 255, minColor / 255);
 
 	//optimizeEndPoints4(set, block);
 }
 
 void QuickCompress::outputBlock3(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block)
 {
-    Vector3 maxColor = start * 255;
-    Vector3 minColor = end * 255;
-	uint16 color0 = roundAndExpand(&maxColor);
-	uint16 color1 = roundAndExpand(&minColor);
+    Vector3 minColor = start * 255;
+    Vector3 maxColor = end * 255;
+	uint16 color0 = roundAndExpand(&minColor);
+	uint16 color1 = roundAndExpand(&maxColor);
 
 	if (color0 > color1)
 	{
@@ -735,7 +738,7 @@ void QuickCompress::outputBlock3(const ColorSet & set, const Vector3 & start, co
 
 	block->col0 = Color16(color0);
 	block->col1 = Color16(color1);
-    block->indices = computeIndices3(set, maxColor, minColor);
+    block->indices = computeIndices3(set, maxColor / 255, minColor / 255);
 
 	//optimizeEndPoints3(set, block);
 }
\ No newline at end of file