From 49482d1441d34b1607908ef0f4d3e0b8e4155e42 Mon Sep 17 00:00:00 2001
From: castano <castano@95f4ed2b-212e-0410-8b90-d31948207fce>
Date: Tue, 9 Nov 2010 03:38:03 +0000
Subject: [PATCH] Work in progress. Merging squish into nvtt. Using squish only
 to find endpoints, do discrete refinement afterwards.

---
 project/vc9/nvconfig.h                 |    8 +-
 project/vc9/nvcore/nvcore.vcproj       |    8 -
 project/vc9/nvimage/nvimage.vcproj     |  204 ++---
 project/vc9/nvmath/nvmath.vcproj       |  108 +--
 project/vc9/nvtt.sln                   |    3 +-
 project/vc9/nvtt/nvtt.vcproj           |  310 ++++---
 src/nvcore/Memory.h                    |    8 +
 src/nvimage/ColorBlock.cpp             |    9 +-
 src/nvimage/ColorBlock.h               |   23 +-
 src/nvimage/FloatImage.h               |   21 +
 src/nvimage/ImageIO.cpp                |   46 +
 src/nvimage/PixelFormat.h              |    2 +-
 src/nvmath/SimdVector.h                |   24 +
 src/nvmath/SimdVector_SSE.h            |  201 +++++
 src/nvmath/SimdVector_VE.h             |  187 ++++
 src/nvtt/CompressorDX9.cpp             | 1126 ++++++++++++------------
 src/nvtt/QuickCompressDXT.cpp          |   83 +-
 src/nvtt/QuickCompressDXT.h            |    4 +
 src/nvtt/TexImage.cpp                  |  124 ++-
 src/nvtt/nvtt.h                        |   12 +-
 src/nvtt/squish/colourfit.cpp          |    8 +-
 src/nvtt/squish/colourfit.h            |    6 +-
 src/nvtt/squish/weightedclusterfit.cpp |   29 +-
 src/nvtt/squish/weightedclusterfit.h   |    4 +-
 src/nvtt/tests/testsuite.cpp           |  172 +++-
 25 files changed, 1724 insertions(+), 1006 deletions(-)
 create mode 100755 src/nvmath/SimdVector.h
 create mode 100755 src/nvmath/SimdVector_SSE.h
 create mode 100755 src/nvmath/SimdVector_VE.h
diff --git a/project/vc9/nvconfig.h b/project/vc9/nvconfig.h
index 486d087..2997693 100644
--- a/project/vc9/nvconfig.h
+++ b/project/vc9/nvconfig.h
@@ -12,10 +12,10 @@
 #endif
 
 #if !defined(_M_X64)
-#define HAVE_FREEIMAGE
-//#define HAVE_PNG
-//#define HAVE_JPEG
-//#define HAVE_TIFF
+//#define HAVE_FREEIMAGE
+#define HAVE_PNG
+#define HAVE_JPEG
+#define HAVE_TIFF
 #endif
 
 #endif // NV_CONFIG
diff --git a/project/vc9/nvcore/nvcore.vcproj b/project/vc9/nvcore/nvcore.vcproj
index ca5ae81..2f6b760 100644
--- a/project/vc9/nvcore/nvcore.vcproj
+++ b/project/vc9/nvcore/nvcore.vcproj
@@ -288,18 +288,10 @@
 	<References>
 	</References>
 	<Files>
-		<File
-			RelativePath="..\..\..\src\nvcore\Algorithms.h"
-			>
-		</File>
 		<File
 			RelativePath="..\..\..\src\nvcore\Array.h"
 			>
 		</File>
-		<File
-			RelativePath="..\..\..\src\nvcore\Containers.h"
-			>
-		</File>
 		<File
 			RelativePath="..\..\..\src\nvcore\Debug.cpp"
 			>
diff --git a/project/vc9/nvimage/nvimage.vcproj b/project/vc9/nvimage/nvimage.vcproj
index 11d6be8..93b3dae 100644
--- a/project/vc9/nvimage/nvimage.vcproj
+++ b/project/vc9/nvimage/nvimage.vcproj
@@ -287,118 +287,102 @@
 	<References>
 	</References>
 	<Files>
-		<Filter
-			Name="Source Files"
-			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
-			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+		<File
+			RelativePath="..\..\..\src\nvimage\BlockDXT.cpp"
 			>
-			<File
-				RelativePath="..\..\..\src\nvimage\BlockDXT.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\ColorBlock.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\ColorSpace.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\DirectDrawSurface.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\Filter.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\FloatImage.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvmath\Half.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\Image.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\ImageIO.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\NormalMap.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\Quantize.cpp"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="Header Files"
-			Filter="h;hpp;hxx;hm;inl;inc;xsd"
-			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\BlockDXT.h"
 			>
-			<File
-				RelativePath="..\..\..\src\nvimage\BlockDXT.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\ColorBlock.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\ColorSpace.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\DirectDrawSurface.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\Filter.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\FloatImage.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvmath\Half.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\Image.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\ImageIO.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\NormalMap.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\nvimage.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\PsdFile.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\Quantize.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvimage\TgaFile.h"
-				>
-			</File>
-		</Filter>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\ColorBlock.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\ColorBlock.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\DirectDrawSurface.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\DirectDrawSurface.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\Filter.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\Filter.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\FloatImage.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\FloatImage.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\Half.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\Half.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\Image.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\Image.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\ImageIO.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\ImageIO.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\NormalMap.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\NormalMap.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\nvimage.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\PixelFormat.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\PsdFile.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\Quantize.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\Quantize.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvimage\TgaFile.h"
+			>
+		</File>
 	</Files>
 	<Globals>
 	</Globals>
diff --git a/project/vc9/nvmath/nvmath.vcproj b/project/vc9/nvmath/nvmath.vcproj
index 2c95ba6..fe8a049 100644
--- a/project/vc9/nvmath/nvmath.vcproj
+++ b/project/vc9/nvmath/nvmath.vcproj
@@ -288,62 +288,62 @@
 	<References>
 	</References>
 	<Files>
-		<Filter
-			Name="Source Files"
-			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
-			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+		<File
+			RelativePath="..\..\..\src\nvmath\Box.h"
 			>
-			<File
-				RelativePath="..\..\..\src\nvmath\Fitting.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvmath\Half.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvmath\Plane.cpp"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="Header Files"
-			Filter="h;hpp;hxx;hm;inl;inc;xsd"
-			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\Color.h"
 			>
-			<File
-				RelativePath="..\..\..\src\nvmath\Box.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvmath\Color.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvmath\Fitting.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvmath\Half.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvmath\Matrix.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvmath\nvmath.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvmath\Plane.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvmath\Vector.h"
-				>
-			</File>
-		</Filter>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\Fitting.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\Fitting.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\Half.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\Half.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\Matrix.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\nvmath.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\Plane.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\Plane.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\SimdVector.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\SimdVector_SSE.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\SimdVector_VE.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\Vector.h"
+			>
+		</File>
 	</Files>
 	<Globals>
 	</Globals>
diff --git a/project/vc9/nvtt.sln b/project/vc9/nvtt.sln
index c6fc6cd..6d2206b 100644
--- a/project/vc9/nvtt.sln
+++ b/project/vc9/nvtt.sln
@@ -393,7 +393,8 @@ Global
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|Mixed Platforms.Build.0 = Debug|x64
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|Win32.ActiveCfg = Debug|Win32
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|Win32.Build.0 = Debug|Win32
-		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|x64.ActiveCfg = Debug|Win32
+		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|x64.ActiveCfg = Debug|x64
+		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|x64.Build.0 = Debug|x64
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release (no cuda)|Mixed Platforms.ActiveCfg = Release|x64
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release (no cuda)|Mixed Platforms.Build.0 = Release|x64
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release (no cuda)|Win32.ActiveCfg = Release|Win32
diff --git a/project/vc9/nvtt/nvtt.vcproj b/project/vc9/nvtt/nvtt.vcproj
index 802d4ab..cab8305 100644
--- a/project/vc9/nvtt/nvtt.vcproj
+++ b/project/vc9/nvtt/nvtt.vcproj
@@ -45,7 +45,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..;..\..\..\src;..\..\..\extern\gnuwin32\include;$(CUDA_INC_PATH)"
+				AdditionalIncludeDirectories="..;..\..\..\src;..\..\..\extern\gnuwin32\include;$(CUDA_PATH)\include"
 				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;NVTT_EXPORTS;NVTT_SHARED;HAVE_CUDA;__SSE2__;__SSE__;__MMX__"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -68,7 +68,7 @@
 				AdditionalDependencies="libpng.lib jpeg.lib tiff.lib FreeImage.lib cudart.lib"
 				OutputFile="$(SolutionDir)\$(ConfigurationName).$(PlatformName)\bin\$(ProjectName).dll"
 				LinkIncremental="2"
-				AdditionalLibraryDirectories="$(GnuWinDir)\lib; $(FreeImageDir); &quot;$(CUDA_LIB_PATH)\..\lib&quot;"
+				AdditionalLibraryDirectories="$(GnuWinDir)\lib; $(FreeImageDir); &quot;$(CUDA_PATH)\lib\Win32&quot;"
 				GenerateDebugInformation="true"
 				SubSystem="2"
 				RandomizedBaseAddress="1"
@@ -127,7 +127,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..;..\..\..\src;..\..\..\extern\gnuwin32\include;$(CUDA_INC_PATH)"
+				AdditionalIncludeDirectories="..;..\..\..\src;..\..\..\extern\gnuwin32\include;$(CUDA_PATH)\include"
 				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;NVTT_EXPORTS;NVTT_SHARED;HAVE_CUDA;__SSE2__;__SSE__;__MMX__"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -150,7 +150,7 @@
 				AdditionalDependencies="cudart.lib"
 				OutputFile="$(SolutionDir)\$(ConfigurationName).$(PlatformName)\bin\$(ProjectName).dll"
 				LinkIncremental="2"
-				AdditionalLibraryDirectories="&quot;$(CUDA_LIB_PATH)\..\lib64&quot;"
+				AdditionalLibraryDirectories="&quot;$(CUDA_PATH)\lib\x64&quot;"
 				GenerateDebugInformation="true"
 				SubSystem="2"
 				RandomizedBaseAddress="1"
@@ -212,7 +212,7 @@
 				FavorSizeOrSpeed="0"
 				OmitFramePointers="true"
 				EnableFiberSafeOptimizations="true"
-				AdditionalIncludeDirectories="..;..\..\..\src;..\..\..\extern\gnuwin32\include;$(CUDA_INC_PATH)"
+				AdditionalIncludeDirectories="..;..\..\..\src;..\..\..\extern\gnuwin32\include;$(CUDA_PATH)\include"
 				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;NVTT_EXPORTS;NVTT_SHARED;HAVE_CUDA;__SSE2__;__SSE__;__MMX__"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -220,7 +220,7 @@
 				EnableEnhancedInstructionSet="2"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
-				DebugInformationFormat="0"
+				DebugInformationFormat="3"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
@@ -236,7 +236,7 @@
 				AdditionalDependencies="libpng.lib jpeg.lib tiff.lib FreeImage.lib cudart.lib"
 				OutputFile="$(SolutionDir)\$(ConfigurationName).$(PlatformName)\bin\$(ProjectName).dll"
 				LinkIncremental="1"
-				AdditionalLibraryDirectories="$(GnuWinDir)\lib; $(FreeImageDir); &quot;$(CUDA_LIB_PATH)\..\lib&quot;"
+				AdditionalLibraryDirectories="$(GnuWinDir)\lib; $(FreeImageDir); &quot;$(CUDA_PATH)\lib\Win32&quot;"
 				SubSystem="2"
 				OptimizeReferences="2"
 				EnableCOMDATFolding="2"
@@ -300,14 +300,14 @@
 				EnableIntrinsicFunctions="true"
 				OmitFramePointers="true"
 				WholeProgramOptimization="true"
-				AdditionalIncludeDirectories="..;..\..\..\src;..\..\..\extern\gnuwin32\include;$(CUDA_INC_PATH)"
+				AdditionalIncludeDirectories="..;..\..\..\src;..\..\..\extern\gnuwin32\include;$(CUDA_PATH)\include"
 				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;NVTT_EXPORTS;NVTT_SHARED;HAVE_CUDA;__SSE2__;__SSE__;__MMX__"
 				StringPooling="true"
 				RuntimeLibrary="2"
 				EnableFunctionLevelLinking="true"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
-				DebugInformationFormat="0"
+				DebugInformationFormat="3"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
@@ -323,7 +323,7 @@
 				AdditionalDependencies="cudart.lib"
 				OutputFile="$(SolutionDir)\$(ConfigurationName).$(PlatformName)\bin\$(ProjectName).dll"
 				LinkIncremental="1"
-				AdditionalLibraryDirectories="&quot;$(CUDA_LIB_PATH)\..\lib64&quot;"
+				AdditionalLibraryDirectories="&quot;$(CUDA_PATH)\lib\x64&quot;"
 				SubSystem="2"
 				OptimizeReferences="2"
 				EnableCOMDATFolding="2"
@@ -677,14 +677,22 @@
 	</References>
 	<Files>
 		<Filter
-			Name="Source Files"
-			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
-			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
 			>
 			<File
-				RelativePath="..\..\..\src\nvtt\CompressionOptions.cpp"
+				RelativePath=".\nvtt.rc"
 				>
 			</File>
+			<File
+				RelativePath=".\resource.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="cuda"
+			>
 			<File
 				RelativePath="..\..\..\src\nvtt\cuda\CompressKernel.cu"
 				>
@@ -767,34 +775,6 @@
 					/>
 				</FileConfiguration>
 			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\CompressorDX10.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\CompressorDX11.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\CompressorDX9.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\CompressorDXT.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\CompressorRGB.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\CompressorRGBE.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\Context.cpp"
-				>
-			</File>
 			<File
 				RelativePath="..\..\..\src\nvtt\cuda\ConvolveKernel.cu"
 				>
@@ -867,84 +847,6 @@
 				RelativePath="..\..\..\src\nvtt\cuda\CudaCompressorDXT.cpp"
 				>
 			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\cuda\CudaUtils.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\InputOptions.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\nvtt.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\nvtt_wrapper.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\OptimalCompressDXT.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\OutputOptions.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\QuickCompressDXT.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\SingleColorLookup.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\TexImage.cpp"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="Header Files"
-			Filter="h;hpp;hxx;hm;inl;inc;xsd"
-			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
-			>
-			<File
-				RelativePath="..\..\..\src\nvtt\CompressionOptions.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\Compressor.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\CompressorDX10.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\CompressorDX11.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\CompressorDX9.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\CompressorDXT.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\CompressorRGB.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\CompressorRGBE.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\Context.h"
-				>
-			</File>
 			<File
 				RelativePath="..\..\..\src\nvtt\cuda\CudaCompressorDXT.h"
 				>
@@ -954,52 +856,142 @@
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\src\nvtt\cuda\CudaUtils.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\InputOptions.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\nvtt.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\nvtt_wrapper.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\OptimalCompressDXT.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\OutputOptions.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\QuickCompressDXT.h"
-				>
-			</File>
-			<File
-				RelativePath=".\resource.h"
+				RelativePath="..\..\..\src\nvtt\cuda\CudaUtils.cpp"
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\src\nvtt\TexImage.h"
+				RelativePath="..\..\..\src\nvtt\cuda\CudaUtils.h"
 				>
 			</File>
 		</Filter>
-		<Filter
-			Name="Resource Files"
-			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
-			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressionOptions.cpp"
 			>
-			<File
-				RelativePath=".\nvtt.rc"
-				>
-			</File>
-		</Filter>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressionOptions.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\Compressor.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorDX10.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorDX10.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorDX11.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorDX11.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorDX9.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorDX9.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorDXT.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorDXT.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorRGB.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorRGB.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorRGBE.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorRGBE.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\Context.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\Context.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\InputOptions.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\InputOptions.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\nvtt.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\nvtt.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\nvtt_wrapper.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\nvtt_wrapper.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\OptimalCompressDXT.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\OptimalCompressDXT.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\OutputOptions.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\OutputOptions.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\QuickCompressDXT.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\QuickCompressDXT.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\SingleColorLookup.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\TexImage.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\TexImage.h"
+			>
+		</File>
 	</Files>
 	<Globals>
 	</Globals>
diff --git a/src/nvcore/Memory.h b/src/nvcore/Memory.h
index 67d37c7..27d7e3d 100644
--- a/src/nvcore/Memory.h
+++ b/src/nvcore/Memory.h
@@ -11,6 +11,14 @@
 
 #include <new>	// new and delete
 
+
+#if NV_CC_GNUC // NV_ALIGN_16 requests 16-byte alignment; GNU-style attribute spelling.
+#   define NV_ALIGN_16 __attribute__ ((__aligned__ (16)))
+#else // Every other compiler gets the __declspec(align) spelling.
+#   define NV_ALIGN_16 __declspec(align(16))
+#endif // NV_CC_GNUC
+
+
 #define NV_OVERRIDE_ALLOC 0
 
 #if NV_OVERRIDE_ALLOC
diff --git a/src/nvimage/ColorBlock.cpp b/src/nvimage/ColorBlock.cpp
index 511a6b9..db77083 100644
--- a/src/nvimage/ColorBlock.cpp
+++ b/src/nvimage/ColorBlock.cpp
@@ -458,7 +458,8 @@ float ColorBlock::volume() const
 }*/
 
 
-void FloatColorBlock::init(const Image * img, uint x, uint y)
+
+void ColorSet::init(const Image * img, uint x, uint y)
 {
     w = min(4U, img->width() - x);
     h = min(4U, img->height() - y);
@@ -485,15 +486,15 @@ void FloatColorBlock::init(const Image * img, uint x, uint y)
     }
 }
 
-void FloatColorBlock::init(const FloatImage * img, uint x, uint y)
+void ColorSet::init(const FloatImage * img, uint x, uint y)
 {
 }
 
-void FloatColorBlock::init(const uint * data, uint w, uint h, uint x, uint y)
+void ColorSet::init(const uint * data, uint w, uint h, uint x, uint y)
 {
 }
 
-void FloatColorBlock::init(const float * data, uint w, uint h, uint x, uint y)
+void ColorSet::init(const float * data, uint w, uint h, uint x, uint y)
 {
 }
 
diff --git a/src/nvimage/ColorBlock.h b/src/nvimage/ColorBlock.h
index 09ce254..e87cc9f 100644
--- a/src/nvimage/ColorBlock.h
+++ b/src/nvimage/ColorBlock.h
@@ -26,21 +26,8 @@ namespace nv
         void swizzle(uint x, uint y, uint z, uint w); // 0=r, 1=g, 2=b, 3=a, 4=0xFF, 5=0
 
         bool isSingleColor(Color32 mask = Color32(0xFF, 0xFF, 0xFF, 0x00)) const;
-        //uint countUniqueColors() const;
-        //Color32 averageColor() const;
         bool hasAlpha() const;
 
-        //void diameterRange(Color32 * start, Color32 * end) const;
-        //void luminanceRange(Color32 * start, Color32 * end) const;
-        //void boundsRange(Color32 * start, Color32 * end) const;
-        //void boundsRangeAlpha(Color32 * start, Color32 * end) const;
-
-        //void sortColorsByAbsoluteValue();
-
-        //void computeRange(const Vector3 & axis, Color32 * start, Color32 * end) const;
-        //void sortColors(const Vector3 & axis);
-
-        //float volume() const;
 
         // Accessors
         const Color32 * colors() const;
@@ -93,19 +80,21 @@ namespace nv
     }
 
 
-    struct FloatColorBlock
+    struct ColorSet
     {
-        FloatColorBlock() : w(4), h(4) {}
-        FloatColorBlock(uint w, uint h) : w(w), h(h) {}
+        ColorSet() : w(4), h(4) {}
+        ColorSet(uint w, uint h) : w(w), h(h) {}
 
         void init(const Image * img, uint x, uint y);
         void init(const FloatImage * img, uint x, uint y);
         void init(const uint * data, uint w, uint h, uint x, uint y);
         void init(const float * data, uint w, uint h, uint x, uint y);
 
-        Vector4 color(uint x, uint y) const { 	nvDebugCheck(x < w && y < h); return colors[y * 4 + x]; }
+        Vector4 color(uint x, uint y) const { nvDebugCheck(x < w && y < h); return colors[y * 4 + x]; }
         Vector4 & color(uint x, uint y) { nvDebugCheck(x < w && y < h); return colors[y * 4 + x]; }
 
+        Vector4 color(uint i) const { nvDebugCheck(i < 16); return colors[i]; }
+        Vector4 & color(uint i) { nvDebugCheck(i < 16); return colors[i]; }
 
         Vector4 colors[16];
         uint w, h;
diff --git a/src/nvimage/FloatImage.h b/src/nvimage/FloatImage.h
index e8f273d..9e8d7b6 100644
--- a/src/nvimage/FloatImage.h
+++ b/src/nvimage/FloatImage.h
@@ -111,6 +111,9 @@ namespace nv
         float pixel(uint x, uint y, uint c) const;
         float & pixel(uint x, uint y, uint c);
 
+        float pixel(uint idx, uint c) const;
+        float & pixel(uint idx, uint c);
+
         float pixel(uint idx) const;
         float & pixel(uint idx);
 
@@ -197,6 +200,24 @@ namespace nv
         return m_mem[(c * m_height + y) * m_width + x];
     }
 
+    /// Get component c of the pixel at linear index idx (read-only overload).
+    inline float FloatImage::pixel(uint idx, uint c) const
+    {
+        nvDebugCheck(m_mem != NULL);
+        nvDebugCheck(idx < uint(m_width*m_height)); // idx addresses a pixel within one channel plane.
+        nvDebugCheck(c < m_componentNum);
+        return m_mem[c * m_height * m_width + idx]; // Channel c starts c * m_width * m_height floats into m_mem.
+    }
+
+    /// Get a writable reference to component c of the pixel at linear index idx.
+    inline float & FloatImage::pixel(uint idx, uint c)
+    {
+        nvDebugCheck(m_mem != NULL);
+        nvDebugCheck(idx < uint(m_width*m_height)); // idx addresses a pixel within one channel plane.
+        nvDebugCheck(c < m_componentNum);
+        return m_mem[c * m_height * m_width + idx]; // Channel c starts c * m_width * m_height floats into m_mem.
+    }
+
     /// Get pixel component.
     inline float FloatImage::pixel(uint idx) const
     {
diff --git a/src/nvimage/ImageIO.cpp b/src/nvimage/ImageIO.cpp
index d28b270..8b5ecae 100644
--- a/src/nvimage/ImageIO.cpp
+++ b/src/nvimage/ImageIO.cpp
@@ -97,6 +97,7 @@ namespace nv
 	#endif // defined(HAVE_FREEIMAGE)
 
         static FloatImage * loadFloatDDS(Stream & s);
+        static bool saveFloatDDS(const char * fileName, Stream & s, const FloatImage * img, uint base_component, uint num_components);
 
 	} // ImageIO namespace
 } // nv namespace
@@ -264,6 +265,12 @@ bool nv::ImageIO::saveFloat(const char * fileName, Stream & s, const FloatImage
 		return false;
 	}
 
+    const char * extension = Path::extension(fileName);
+
+	if (strCaseCmp(extension, ".dds") == 0) {
+		return saveFloatDDS(fileName, s, fimage, baseComponent, componentCount);
+	}
+
 #if defined(HAVE_FREEIMAGE)
 	FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName);
 	if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsWriting(fif)) {
@@ -1792,3 +1799,42 @@ FloatImage * nv::ImageIO::loadFloatDDS(Stream & s)
  
     return NULL;
 }
+/// Write four consecutive channels of img, starting at base_component, to s as a DDS header plus interleaved 16-bit R,G,B,A values. Returns false unless num_components is 4; fileName is not used in this body.
+bool nv::ImageIO::saveFloatDDS(const char * fileName, Stream & s, const FloatImage * img, uint base_component, uint num_components)
+{
+    nvCheck(s.isSaving());
+	nvCheck(!s.isError());
+
+    if (num_components != 4) return false; // Only the 4-component case is handled here.
+
+    static const uint D3DFMT_A16B16G16R16F = 113; // Format code stored in the header below.
+
+    DDSHeader header;
+    header.setTexture2D();
+    header.setWidth(img->width());
+    header.setHeight(img->height());
+    header.setFormatCode(D3DFMT_A16B16G16R16F);
+    // NOTE(review): placeholder -- presumably more header fields remain to be filled in; confirm.
+
+    s << header;
+
+    uint32 * r = (uint32 *)img->channel(base_component + 0); // Channel data is read as raw 32-bit words for half_from_float.
+    uint32 * g = (uint32 *)img->channel(base_component + 1);
+    uint32 * b = (uint32 *)img->channel(base_component + 2);
+    uint32 * a = (uint32 *)img->channel(base_component + 3);
+
+    const uint size = img->width() * img->height(); // Number of pixels to write.
+    for (uint i = 0; i < size; i++) {
+        uint16 R = half_from_float( *r++ );
+        uint16 G = half_from_float( *g++ );
+        uint16 B = half_from_float( *b++ );
+        uint16 A = half_from_float( *a++ );
+
+        s.serialize(&R, sizeof(uint16)); // Per pixel, the four values are emitted in R, G, B, A order.
+        s.serialize(&G, sizeof(uint16));
+        s.serialize(&B, sizeof(uint16));
+        s.serialize(&A, sizeof(uint16));
+    }
+
+    return true;
+}
diff --git a/src/nvimage/PixelFormat.h b/src/nvimage/PixelFormat.h
index 9a702e7..8ccf2c1 100644
--- a/src/nvimage/PixelFormat.h
+++ b/src/nvimage/PixelFormat.h
@@ -85,7 +85,7 @@ namespace nv
             float result;
             int offset = 0;
             do {
-                uint i = offset + f * (float(1 << inbits) - 1);
+                uint i = offset + uint(f * (float(1 << inbits) - 1));
                 i = convert(i, inbits, outbits);
                 result = float(i) / (float(1 << outbits) - 1);
                 offset++;
diff --git a/src/nvmath/SimdVector.h b/src/nvmath/SimdVector.h
new file mode 100755
index 0000000..b84ea6f
--- /dev/null
+++ b/src/nvmath/SimdVector.h
@@ -0,0 +1,24 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "Vector.h" // Vector3, Vector4
+
+
+// Pick a SIMD backend unless the build has already defined NV_USE_ALTIVEC / NV_USE_SSE.
+#ifndef NV_USE_ALTIVEC
+#   define NV_USE_ALTIVEC POSH_CPU_PPC
+#endif // NV_USE_ALTIVEC
+
+#ifndef NV_USE_SSE
+#   if NV_CPU_X86 || NV_CPU_X86_64
+#       define NV_USE_SSE 2
+#   endif
+#endif // NV_USE_SSE stays undefined on other CPUs, so the #if below evaluates it as 0.
+
+
+#if NV_USE_ALTIVEC
+#   include "SimdVector_VE.h"
+#endif // NV_USE_ALTIVEC
+
+#if NV_USE_SSE
+#   include "SimdVector_SSE.h"
+#endif // NV_USE_SSE
diff --git a/src/nvmath/SimdVector_SSE.h b/src/nvmath/SimdVector_SSE.h
new file mode 100755
index 0000000..8677322
--- /dev/null
+++ b/src/nvmath/SimdVector_SSE.h
@@ -0,0 +1,201 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef NV_SIMD_VECTOR_SSE_H
+#define NV_SIMD_VECTOR_SSE_H
+
+#include <xmmintrin.h>
+#if (NV_USE_SSE > 1)
+#include <emmintrin.h>
+#endif
+
+namespace nv {
+
+    // Four floats packed in one SSE register (__m128), with component-wise arithmetic.
+    class SimdVector
+    {
+    public: // The free functions after this class read `vec` and name `Arg` directly.
+        __m128 vec;
+
+        // How SimdVector arguments are passed: by const reference.
+        typedef SimdVector const& Arg;
+
+        // Leaves the register uninitialized.
+        SimdVector() {}
+        // Broadcasts f to all four components.
+        explicit SimdVector(float f) : vec(_mm_set1_ps(f)) {}
+        // Wraps an existing register.
+        explicit SimdVector(__m128 v) : vec(v) {}
+        SimdVector(const SimdVector & arg) : vec(arg.vec) {}
+
+        SimdVector & operator=(const SimdVector & arg)
+        {
+            vec = arg.vec;
+            return *this;
+        }
+
+        // Loads v[0..3]; _mm_load_ps requires v to be 16-byte aligned.
+        SimdVector(const float * v)
+        {
+            vec = _mm_load_ps( v );
+        }
+
+        // Components are stored in argument order: x in the lowest lane, w in the highest.
+        SimdVector(float x, float y, float z, float w)
+        {
+            vec = _mm_setr_ps( x, y, z, w );
+        }
+
+        // Returns the lowest component (x).
+        float toFloat() const
+        {
+            NV_ALIGN_16 float f;
+            _mm_store_ss(&f, vec);
+            return f;
+        }
+
+        // Returns the first three components; the fourth is dropped.
+        Vector3 toVector3() const
+        {
+            NV_ALIGN_16 float c[4];
+            _mm_store_ps( c, vec );
+            return Vector3( c[0], c[1], c[2] );
+        }
+
+        // Returns all four components.
+        Vector4 toVector4() const
+        {
+            NV_ALIGN_16 float c[4];
+            _mm_store_ps( c, vec ); // store into the aligned local buffer read back below
+            return Vector4( c[0], c[1], c[2], c[3] );
+        }
+
+        // splatN() broadcasts component N to all four lanes; SSE_SPLAT repeats the 2-bit lane
+        // index in every field of the shuffle immediate.
+#define SSE_SPLAT( a ) ((a) | ((a) << 2) | ((a) << 4) | ((a) << 6))
+        SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); }
+        SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); }
+        SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); }
+        SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); }
+#undef SSE_SPLAT
+
+        // Component-wise compound assignment.
+        SimdVector& operator+=( Arg v ) { vec = _mm_add_ps( vec, v.vec ); return *this; }
+        SimdVector& operator-=( Arg v ) { vec = _mm_sub_ps( vec, v.vec ); return *this; }
+        SimdVector& operator*=( Arg v ) { vec = _mm_mul_ps( vec, v.vec ); return *this; }
+    };
+
+
+    inline SimdVector operator+( SimdVector::Arg left, SimdVector::Arg right ) // lane-wise left + right
+    {
+        return SimdVector( _mm_add_ps( left.vec, right.vec ) );
+    }
+
+    inline SimdVector operator-( SimdVector::Arg left, SimdVector::Arg right ) // lane-wise left - right
+    {
+        return SimdVector( _mm_sub_ps( left.vec, right.vec ) );
+    }
+
+    inline SimdVector operator*( SimdVector::Arg left, SimdVector::Arg right ) // lane-wise left * right
+    {
+        return SimdVector( _mm_mul_ps( left.vec, right.vec ) );
+    }
+
+    // Returns a*b + c for each lane, computed as a separate multiply and add.
+    inline SimdVector multiplyAdd( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
+    {
+        return SimdVector( _mm_add_ps( _mm_mul_ps( a.vec, b.vec ), c.vec ) );
+    }
+
+    // Returns c - a*b for each lane, i.e. -( a*b - c ).
+    inline SimdVector negativeMultiplySubtract( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
+    {
+        return SimdVector( _mm_sub_ps( c.vec, _mm_mul_ps( a.vec, b.vec ) ) );
+    }
+
+    inline SimdVector reciprocal( SimdVector::Arg v ) // approximate 1/v per lane
+    {
+        // get the reciprocal estimate
+        __m128 estimate = _mm_rcp_ps( v.vec );
+
+        // one round of Newton-Raphson refinement: estimate + estimate * (1 - estimate * v)
+        __m128 diff = _mm_sub_ps( _mm_set1_ps( 1.0f ), _mm_mul_ps( estimate, v.vec ) );
+        return SimdVector( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) );
+    }
+
+    inline SimdVector min( SimdVector::Arg left, SimdVector::Arg right ) // lane-wise minimum
+    {
+        return SimdVector( _mm_min_ps( left.vec, right.vec ) );
+    }
+
+    inline SimdVector max( SimdVector::Arg left, SimdVector::Arg right ) // lane-wise maximum
+    {
+        return SimdVector( _mm_max_ps( left.vec, right.vec ) );
+    }
+
+    inline SimdVector truncate( SimdVector::Arg v ) // rounds each lane toward zero
+    {
+#if (NV_USE_SSE == 1)
+        // convert to ints, two lanes at a time through MMX registers
+        __m128 input = v.vec;
+        __m64 lo = _mm_cvttps_pi32( input );
+        __m64 hi = _mm_cvttps_pi32( _mm_movehl_ps( input, input ) );
+
+        // convert back to floats: the upper pair first, then the lower pair
+        __m128 part = _mm_movelh_ps( input, _mm_cvtpi32_ps( input, hi ) );
+        __m128 truncated = _mm_cvtpi32_ps( part, lo );
+
+        // clear out the MMX multimedia state to allow FP calls later
+        _mm_empty();
+        return SimdVector( truncated );
+#else
+        // use SSE2 instructions: float -> int32 with truncation, then back to float
+        return SimdVector( _mm_cvtepi32_ps( _mm_cvttps_epi32( v.vec ) ) );
+#endif
+    }
+
+    inline SimdVector compareEqual( SimdVector::Arg left, SimdVector::Arg right ) // per-lane mask of left == right
+    {
+        return SimdVector( _mm_cmpeq_ps( left.vec, right.vec ) );
+    }
+
+    inline SimdVector select( SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits ) // bitwise blend
+    {
+        __m128 a = _mm_andnot_ps( bits.vec, off.vec ); // bits of `off` where the mask is clear
+        __m128 b = _mm_and_ps( bits.vec, on.vec );     // bits of `on` where the mask is set
+
+        return SimdVector( _mm_or_ps( a, b ) );
+    }
+
+    inline bool compareAnyLessThan( SimdVector::Arg left, SimdVector::Arg right ) // true if any lane has left < right
+    {
+        __m128 bits = _mm_cmplt_ps( left.vec, right.vec );
+        int value = _mm_movemask_ps( bits ); // one sign bit per lane
+        return value != 0;
+    }
+
+} // namespace nv
+
+#endif // NV_SIMD_VECTOR_SSE_H
diff --git a/src/nvmath/SimdVector_VE.h b/src/nvmath/SimdVector_VE.h
new file mode 100755
index 0000000..dabb525
--- /dev/null
+++ b/src/nvmath/SimdVector_VE.h
@@ -0,0 +1,187 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef NV_SIMD_VECTOR_VE_H
+#define NV_SIMD_VECTOR_VE_H
+
+#ifndef __APPLE_ALTIVEC__
+#include <altivec.h>
+#undef bool
+#endif
+
+namespace nv {
+
+    // Four floats packed in one AltiVec register (vector float), with component-wise arithmetic.
+    class SimdVector
+    {
+    public: // The free functions after this class read `vec` and name `Arg` directly.
+        vector float vec;
+
+        // How SimdVector arguments are passed: by value in this implementation.
+        typedef SimdVector Arg;
+
+        // Leaves the register uninitialized.
+        SimdVector() {}
+        // Broadcasts v to all four components, reusing the four-float constructor.
+        explicit SimdVector(float v) { *this = SimdVector( v, v, v, v ); }
+        // Wraps an existing register.
+        explicit SimdVector(vector float v) : vec(v) {}
+        SimdVector(const SimdVector & arg) : vec(arg.vec) {}
+
+        SimdVector& operator=(const SimdVector & arg)
+        {
+            vec = arg.vec;
+            return *this;
+        }
+
+        // Copies v[0..3] element by element through a union.
+        SimdVector(const float * v)
+        {
+            union { vector float v; float c[4]; } u;
+            u.c[0] = v[0];
+            u.c[1] = v[1];
+            u.c[2] = v[2];
+            u.c[3] = v[3];
+            vec = u.v;
+        }
+
+        // Components are stored in argument order: x first, w last.
+        SimdVector(float x, float y, float z, float w)
+        {
+            union { vector float v; float c[4]; } u;
+            u.c[0] = x;
+            u.c[1] = y;
+            u.c[2] = z;
+            u.c[3] = w;
+            vec = u.v;
+        }
+
+        // Returns the first component (x).
+        float toFloat() const
+        {
+            union { vector float v; float c[4]; } u;
+            u.v = vec;
+            return u.c[0];
+        }
+
+        // Returns the first three components; the fourth is dropped.
+        Vector3 toVector3() const
+        {
+            union { vector float v; float c[4]; } u;
+            u.v = vec;
+            return Vector3( u.c[0], u.c[1], u.c[2] );
+        }
+
+        // Returns all four components.
+        Vector4 toVector4() const
+        {
+            union { vector float v; float c[4]; } u;
+            u.v = vec;
+            return Vector4( u.c[0], u.c[1], u.c[2], u.c[3] );
+        }
+
+        // splatN() broadcasts component N to all four lanes.
+        SimdVector splatX() const { return SimdVector( vec_splat( vec, 0 ) ); }
+        SimdVector splatY() const { return SimdVector( vec_splat( vec, 1 ) ); }
+        SimdVector splatZ() const { return SimdVector( vec_splat( vec, 2 ) ); }
+        SimdVector splatW() const { return SimdVector( vec_splat( vec, 3 ) ); }
+
+        // Component-wise compound assignment. The product is formed with vec_madd and a
+        // -0.0f addend, which adds nothing to the result.
+        SimdVector& operator+=( Arg v ) { vec = vec_add( vec, v.vec ); return *this; }
+        SimdVector& operator-=( Arg v ) { vec = vec_sub( vec, v.vec ); return *this; }
+        SimdVector& operator*=( Arg v ) { vec = vec_madd( vec, v.vec, ( vector float )( -0.0f ) ); return *this; }
+    };
+
+    inline SimdVector operator+( SimdVector::Arg left, SimdVector::Arg right ) // lane-wise left + right
+    {
+        return SimdVector( vec_add( left.vec, right.vec ) );
+    }
+
+    inline SimdVector operator-( SimdVector::Arg left, SimdVector::Arg right ) // lane-wise left - right
+    {
+        return SimdVector( vec_sub( left.vec, right.vec ) );
+    }
+
+    inline SimdVector operator*( SimdVector::Arg left, SimdVector::Arg right ) // lane-wise product via multiply-add with -0.0f
+    {
+        return SimdVector( vec_madd( left.vec, right.vec, ( vector float )( -0.0f ) ) );
+    }
+
+    // Returns a*b + c for each lane, using a single vec_madd.
+    inline SimdVector multiplyAdd( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
+    {
+        return SimdVector( vec_madd( a.vec, b.vec, c.vec ) );
+    }
+
+    // Returns c - a*b for each lane, i.e. -( a*b - c ).
+    inline SimdVector negativeMultiplySubtract( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
+    {
+        return SimdVector( vec_nmsub( a.vec, b.vec, c.vec ) );
+    }
+
+    inline SimdVector reciprocal( SimdVector::Arg v ) // approximate 1/v per lane
+    {
+        // get the reciprocal estimate
+        vector float estimate = vec_re( v.vec );
+
+        // one round of Newton-Raphson refinement: estimate + estimate * (1 - estimate * v)
+        vector float diff = vec_nmsub( estimate, v.vec, ( vector float )( 1.0f ) );
+        return SimdVector( vec_madd( diff, estimate, estimate ) );
+    }
+
+    inline SimdVector min( SimdVector::Arg left, SimdVector::Arg right ) // lane-wise minimum
+    {
+        return SimdVector( vec_min( left.vec, right.vec ) );
+    }
+
+    inline SimdVector max( SimdVector::Arg left, SimdVector::Arg right ) // lane-wise maximum
+    {
+        return SimdVector( vec_max( left.vec, right.vec ) );
+    }
+
+    inline SimdVector truncate( SimdVector::Arg v ) // rounds each lane toward zero
+    {
+        return SimdVector( vec_trunc( v.vec ) );
+    }
+
+    inline SimdVector compareEqual( SimdVector::Arg left, SimdVector::Arg right ) // per-lane mask of left == right
+    {
+        return SimdVector( ( vector float )vec_cmpeq( left.vec, right.vec ) );
+    }
+
+    inline SimdVector select( SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits ) // `on` where bits are set, else `off`
+    {
+        return SimdVector( vec_sel( off.vec, on.vec, ( vector unsigned int )bits.vec ) );
+    }
+
+    inline bool compareAnyLessThan( SimdVector::Arg left, SimdVector::Arg right ) // true if any lane has left < right
+    {
+        return vec_any_lt( left.vec, right.vec ) != 0;
+    }
+
+} // namespace nv
+
+#endif // NV_SIMD_VECTOR_VE_H
diff --git a/src/nvtt/CompressorDX9.cpp b/src/nvtt/CompressorDX9.cpp
index 5a4eae4..cd6ba84 100644
--- a/src/nvtt/CompressorDX9.cpp
+++ b/src/nvtt/CompressorDX9.cpp
@@ -1,554 +1,576 @@
 // Copyright (c) 2009-2011 Ignacio Castano <castano@gmail.com>
 // Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#include "CompressorDX9.h"
-#include "QuickCompressDXT.h"
-#include "OptimalCompressDXT.h"
-#include "CompressionOptions.h"
-#include "OutputOptions.h"
-
-// squish
-#include "squish/colourset.h"
-#include "squish/weightedclusterfit.h"
-
-#include "nvtt.h"
-
-#include "nvcore/Memory.h"
-
-#include "nvimage/Image.h"
-#include "nvimage/ColorBlock.h"
-#include "nvimage/BlockDXT.h"
-
-#include <new> // placement new
-
-// s3_quant
-#if defined(HAVE_S3QUANT)
-#include "s3tc/s3_quant.h"
-#endif
-
-// ati tc
-#if defined(HAVE_ATITC)
-typedef int BOOL;
-typedef _W64 unsigned long ULONG_PTR;
-typedef ULONG_PTR DWORD_PTR;
-#include "atitc/ATI_Compress.h"
-#endif
-
-// squish
-#if defined(HAVE_SQUISH)
-//#include "squish/squish.h"
-#include "squish-1.10/squish.h"
-#endif
-
-// d3dx
-#if defined(HAVE_D3DX)
-#include <d3dx9.h>
-#endif
-
-// stb
-#if defined(HAVE_STB)
-#define STB_DEFINE
-#include "stb/stb_dxt.h"
-#endif
-
-using namespace nv;
-using namespace nvtt;
-
-
-void FastCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockDXT1 * block = new(output) BlockDXT1;
-	QuickCompress::compressDXT1(rgba, block);
-}
-
-void FastCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockDXT1 * block = new(output) BlockDXT1;
-	QuickCompress::compressDXT1a(rgba, block);
-}
-
-void FastCompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockDXT3 * block = new(output) BlockDXT3;
-	QuickCompress::compressDXT3(rgba, block);
-}
-
-void FastCompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockDXT5 * block = new(output) BlockDXT5;
-	QuickCompress::compressDXT5(rgba, block);
-}
-
-void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	rgba.swizzle(4, 1, 5, 0); // 0xFF, G, 0, R
-
-	BlockDXT5 * block = new(output) BlockDXT5;
-	QuickCompress::compressDXT5(rgba, block);
-}
-
-
-void NormalCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	nvsquish::WeightedClusterFit fit;
-	fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
-
-	if (rgba.isSingleColor())
-	{
-		BlockDXT1 * block = new(output) BlockDXT1;
-		OptimalCompress::compressDXT1(rgba.color(0), block);
-	}
-	else
-	{
-		nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0);
-		fit.SetColourSet(&colours, nvsquish::kDxt1);
-		fit.Compress(output);
-	}
-}
-
-
-void NormalCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-    uint alphaMask = 0;
-	for (uint i = 0; i < 16; i++)
-	{
-		if (rgba.color(i).a == 0) alphaMask |= (3 << (i * 2)); // Set two bits for each color.
-	}
-
-	const bool isSingleColor = rgba.isSingleColor();
-		
-	if (isSingleColor)
-	{
-		BlockDXT1 * block = new(output) BlockDXT1;
-        OptimalCompress::compressDXT1a(rgba.color(0), alphaMask, block);
-	}
-	else
-	{
-		nvsquish::WeightedClusterFit fit;
-		fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
-
-		int flags = nvsquish::kDxt1;
-		if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
-
-		nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-		fit.SetColourSet(&colours, nvsquish::kDxt1);
-
-		fit.Compress(output);
-	}
-}
-
-
-void NormalCompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockDXT3 * block = new(output) BlockDXT3;
-
-	// Compress explicit alpha.
-	OptimalCompress::compressDXT3A(rgba, &block->alpha);
-
-	// Compress color.
-	if (rgba.isSingleColor())
-	{
-		OptimalCompress::compressDXT1(rgba.color(0), &block->color);
-	}
-	else
-	{
-		nvsquish::WeightedClusterFit fit;
-		fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
-
-		int flags = 0;
-		if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
-
-		nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-		fit.SetColourSet(&colours, 0);
-		fit.Compress(&block->color);
-	}
-}
-
-
-void NormalCompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockDXT5 * block = new(output) BlockDXT5;
-
-	// Compress alpha.
-	if (compressionOptions.quality == Quality_Highest)
-	{
-		OptimalCompress::compressDXT5A(rgba, &block->alpha);
-	}
-	else
-	{
-		QuickCompress::compressDXT5A(rgba, &block->alpha);
-	}
-
-	// Compress color.
-	if (rgba.isSingleColor())
-	{
-		OptimalCompress::compressDXT1(rgba.color(0), &block->color);
-	}
-	else
-	{
-		nvsquish::WeightedClusterFit fit;
-		fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
-
-		int flags = 0;
-		if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
-
-		nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-		fit.SetColourSet(&colours, 0);
-		fit.Compress(&block->color);
-	}
-}
-
-
-void NormalCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockDXT5 * block = new(output) BlockDXT5;
-
-	// Compress Y.
-	if (compressionOptions.quality == Quality_Highest)
-	{
-		OptimalCompress::compressDXT1G(rgba, &block->color);
-	}
-	else
-	{
-		if (rgba.isSingleColor(Color32(0, 0xFF, 0, 0))) // Mask all but green channel.
-		{
-			OptimalCompress::compressDXT1G(rgba.color(0).g, &block->color);
-		}
-		else
-		{
-            ColorBlock tile = rgba;
-            tile.swizzle(4, 1, 5, 3); // leave alpha in alpha channel.
-
-			nvsquish::WeightedClusterFit fit;
-			fit.SetMetric(0, 1, 0);
-
-			int flags = 0;
-			if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
-
-			nvsquish::ColourSet colours((uint8 *)tile.colors(), flags);
-			fit.SetColourSet(&colours, 0);
-			fit.Compress(&block->color);
-		}
-	}
-
-	rgba.swizzle(4, 1, 5, 0); // 1, G, 0, R
-
-	// Compress X.
-	if (compressionOptions.quality == Quality_Highest)
-	{
-		OptimalCompress::compressDXT5A(rgba, &block->alpha);
-	}
-	else
-	{
-		QuickCompress::compressDXT5A(rgba, &block->alpha);
-	}
-}
-
-
-#if defined(HAVE_S3QUANT)
-
-void S3CompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-	float error = 0.0f;
-
-	BlockDXT1 dxtBlock3;
-	BlockDXT1 dxtBlock4;
-	ColorBlock block;
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			block.init(inputFormat, w, h, data, x, y);
-
-			// Init rgb block.
-			RGBBlock rgbBlock;
-			rgbBlock.n = 16;
-			for (uint i = 0; i < 16; i++) {
-				rgbBlock.colorChannel[i][0] = clamp(float(block.color(i).r) / 255.0f, 0.0f, 1.0f);
-				rgbBlock.colorChannel[i][1] = clamp(float(block.color(i).g) / 255.0f, 0.0f, 1.0f);
-				rgbBlock.colorChannel[i][2] = clamp(float(block.color(i).b) / 255.0f, 0.0f, 1.0f);
-			}
-			rgbBlock.weight[0] = 1.0f;
-			rgbBlock.weight[1] = 1.0f;
-			rgbBlock.weight[2] = 1.0f;
-
-			rgbBlock.inLevel = 4;
-			CodeRGBBlock(&rgbBlock);
-
-			// Copy results to DXT block.
-			dxtBlock4.col0.r = rgbBlock.endPoint[0][0];
-			dxtBlock4.col0.g = rgbBlock.endPoint[0][1];
-			dxtBlock4.col0.b = rgbBlock.endPoint[0][2];
-
-			dxtBlock4.col1.r = rgbBlock.endPoint[1][0];
-			dxtBlock4.col1.g = rgbBlock.endPoint[1][1];
-			dxtBlock4.col1.b = rgbBlock.endPoint[1][2];
-
-			dxtBlock4.setIndices(rgbBlock.index);
-
-			if (dxtBlock4.col0.u < dxtBlock4.col1.u) {
-				swap(dxtBlock4.col0.u, dxtBlock4.col1.u);
-				dxtBlock4.indices ^= 0x55555555;
-			}
-
-			uint error4 = blockError(block, dxtBlock4);
-
-			rgbBlock.inLevel = 3;
-
-			CodeRGBBlock(&rgbBlock);
-
-			// Copy results to DXT block.
-			dxtBlock3.col0.r = rgbBlock.endPoint[0][0];
-			dxtBlock3.col0.g = rgbBlock.endPoint[0][1];
-			dxtBlock3.col0.b = rgbBlock.endPoint[0][2];
-
-			dxtBlock3.col1.r = rgbBlock.endPoint[1][0];
-			dxtBlock3.col1.g = rgbBlock.endPoint[1][1];
-			dxtBlock3.col1.b = rgbBlock.endPoint[1][2];
-
-			dxtBlock3.setIndices(rgbBlock.index);
-
-			if (dxtBlock3.col0.u > dxtBlock3.col1.u) {
-				swap(dxtBlock3.col0.u, dxtBlock3.col1.u);
-				dxtBlock3.indices ^= (~dxtBlock3.indices  >> 1) & 0x55555555;
-			}
-
-			uint error3 = blockError(block, dxtBlock3);
-
-			if (error3 < error4) {
-				error += error3;
-
-				if (outputOptions.outputHandler != NULL) {
-					outputOptions.outputHandler->writeData(&dxtBlock3, sizeof(dxtBlock3));
-				}
-			}
-			else {
-				error += error4;
-
-				if (outputOptions.outputHandler != NULL) {
-					outputOptions.outputHandler->writeData(&dxtBlock4, sizeof(dxtBlock4));
-				}
-			}
-		}
-	}
-}
-
-#endif // defined(HAVE_S3QUANT)
-
-
-#if defined(HAVE_ATITC)
-
-void AtiCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-	// Init source texture
-	ATI_TC_Texture srcTexture;
-	srcTexture.dwSize = sizeof(srcTexture);
-	srcTexture.dwWidth = w;
-	srcTexture.dwHeight = h;
-	if (inputFormat == nvtt::InputFormat_BGRA_8UB)
-	{
-		srcTexture.dwPitch = w * 4;
-		srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
-	}
-	else
-	{
-		srcTexture.dwPitch = w * 16;
-		srcTexture.format = ATI_TC_FORMAT_ARGB_32F;
-	}
-	srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
-	srcTexture.pData = (ATI_TC_BYTE*) data;
-
-	// Init dest texture
-	ATI_TC_Texture destTexture;
-	destTexture.dwSize = sizeof(destTexture);
-	destTexture.dwWidth = w;
-	destTexture.dwHeight = h;
-	destTexture.dwPitch = 0;
-	destTexture.format = ATI_TC_FORMAT_DXT1;
-	destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
-	destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);
-
-	ATI_TC_CompressOptions options;
-	options.dwSize = sizeof(options);
-	options.bUseChannelWeighting = false;
-	options.bUseAdaptiveWeighting = false;
-	options.bDXT1UseAlpha = false;
-	options.nCompressionSpeed = ATI_TC_Speed_Normal;
-	options.bDisableMultiThreading = false;
-	//options.bDisableMultiThreading = true;
-
-	// Compress
-	ATI_TC_ConvertTexture(&srcTexture, &destTexture, &options, NULL, NULL, NULL);
-
-	if (outputOptions.outputHandler != NULL) {
-		outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
-	}
-
-	mem::free(destTexture.pData);
-}
-
-void AtiCompressorDXT5::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-	// Init source texture
-	ATI_TC_Texture srcTexture;
-	srcTexture.dwSize = sizeof(srcTexture);
-	srcTexture.dwWidth = w;
-	srcTexture.dwHeight = h;
-	if (inputFormat == nvtt::InputFormat_BGRA_8UB)
-	{
-		srcTexture.dwPitch = w * 4;
-		srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
-	}
-	else
-	{
-		srcTexture.dwPitch = w * 16;
-		srcTexture.format = ATI_TC_FORMAT_ARGB_32F;
-	}
-	srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
-	srcTexture.pData = (ATI_TC_BYTE*) data;
-
-	// Init dest texture
-	ATI_TC_Texture destTexture;
-	destTexture.dwSize = sizeof(destTexture);
-	destTexture.dwWidth = w;
-	destTexture.dwHeight = h;
-	destTexture.dwPitch = 0;
-	destTexture.format = ATI_TC_FORMAT_DXT5;
-	destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
-	destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);
-
-	// Compress
-	ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL);
-
-	if (outputOptions.outputHandler != NULL) {
-		outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
-	}
-
-	mem::free(destTexture.pData);
-}
-
-#endif // defined(HAVE_ATITC)
-
-#if defined(HAVE_SQUISH)
-
-void SquishCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-#pragma message(NV_FILE_LINE "TODO: Convert input to fixed point ABGR format instead of ARGB")
-	/*
-	Image img(*image);
-	int count = img.width() * img.height();
-	for (int i = 0; i < count; i++)
-	{
-		Color32 c = img.pixel(i);
-		img.pixel(i) = Color32(c.b, c.g, c.r, c.a);
-	}
-
-	int size = squish::GetStorageRequirements(img.width(), img.height(), squish::kDxt1);
-	void * blocks = mem::malloc(size);
-
-	squish::CompressImage((const squish::u8 *)img.pixels(), img.width(), img.height(), blocks, squish::kDxt1 | squish::kColourClusterFit);
-
-	if (outputOptions.outputHandler != NULL) {
-		outputOptions.outputHandler->writeData(blocks, size);
-	}
-
-	mem::free(blocks);
-	*/
-}
-
-#endif // defined(HAVE_SQUISH)
-
-
-#if defined(HAVE_D3DX)
-
-void D3DXCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-	IDirect3D9 * d3d = Direct3DCreate9(D3D_SDK_VERSION);
-
-	D3DPRESENT_PARAMETERS presentParams;
-	ZeroMemory(&presentParams, sizeof(presentParams));
-	presentParams.Windowed = TRUE;
-	presentParams.SwapEffect = D3DSWAPEFFECT_COPY;
-	presentParams.BackBufferWidth = 8;
-	presentParams.BackBufferHeight = 8;
-	presentParams.BackBufferFormat = D3DFMT_UNKNOWN;
-
-	HRESULT err;
-
-	IDirect3DDevice9 * device = NULL;
-	err = d3d->CreateDevice(D3DADAPTER_DEFAULT, D3DDEVTYPE_REF, GetDesktopWindow(), D3DCREATE_SOFTWARE_VERTEXPROCESSING, &presentParams, &device);
-
-	IDirect3DTexture9 * texture = NULL;
-	err = D3DXCreateTexture(device, w, h, 1, 0, D3DFMT_DXT1, D3DPOOL_SYSTEMMEM, &texture);
-	
-	IDirect3DSurface9 * surface = NULL;
-	err = texture->GetSurfaceLevel(0, &surface);
-
-	RECT rect;
-	rect.left = 0; 
-	rect.top = 0; 
-	rect.bottom = h;
-	rect.right = w;
-
-	if (inputFormat == nvtt::InputFormat_BGRA_8UB)
-	{
-		err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A8R8G8B8, w * 4, NULL, &rect, D3DX_DEFAULT, 0);
-	}
-	else
-	{
-		err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A32B32G32R32F, w * 16, NULL, &rect, D3DX_DEFAULT, 0);
-	}
-
-	if (err != D3DERR_INVALIDCALL && err != D3DXERR_INVALIDDATA)
-	{
-		D3DLOCKED_RECT rect;
-		ZeroMemory(&rect, sizeof(rect));
-
-		err = surface->LockRect(&rect, NULL, D3DLOCK_READONLY);
-
-		if (outputOptions.outputHandler != NULL) {
-			int size = rect.Pitch * ((h + 3) / 4);
-			outputOptions.outputHandler->writeData(rect.pBits, size);
-		}
-
-		err = surface->UnlockRect();
-	}
-
-	surface->Release();
-	device->Release();
-	d3d->Release();
-}
-
-#endif // defined(HAVE_D3DX)
-
-
-#if defined(HAVE_STB)
-
-void StbCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	rgba.swizzle(2, 1, 0, 3); // Swap R and B
-	stb_compress_dxt_block((unsigned char *)output, (unsigned char *)rgba.colors(), 0, 0);
-}
-
-
-#endif // defined(HAVE_STB)
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include "CompressorDX9.h"
+#include "QuickCompressDXT.h"
+#include "OptimalCompressDXT.h"
+#include "CompressionOptions.h"
+#include "OutputOptions.h"
+
+// squish
+#include "squish/colourset.h"
+#include "squish/weightedclusterfit.h"
+
+#include "nvtt.h"
+
+#include "nvcore/Memory.h"
+
+#include "nvimage/Image.h"
+#include "nvimage/ColorBlock.h"
+#include "nvimage/BlockDXT.h"
+
+#include <new> // placement new
+
+// s3_quant
+#if defined(HAVE_S3QUANT)
+#include "s3tc/s3_quant.h"
+#endif
+
+// ati tc
+#if defined(HAVE_ATITC)
+typedef int BOOL;
+typedef _W64 unsigned long ULONG_PTR;
+typedef ULONG_PTR DWORD_PTR;
+#include "atitc/ATI_Compress.h"
+#endif
+
+// squish
+#if defined(HAVE_SQUISH)
+//#include "squish/squish.h"
+#include "squish-1.10/squish.h"
+#endif
+
+// d3dx
+#if defined(HAVE_D3DX)
+#include <d3dx9.h>
+#endif
+
+// stb
+#if defined(HAVE_STB)
+#define STB_DEFINE
+#include "stb/stb_dxt.h"
+#endif
+
+using namespace nv;
+using namespace nvtt;
+
+
+void FastCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockDXT1 * block = new(output) BlockDXT1; // Construct the DXT1 block in place in the caller's output buffer.
+	QuickCompress::compressDXT1(rgba, block); // alphaMode and compressionOptions are not consulted here.
+}
+
+void FastCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockDXT1 * block = new(output) BlockDXT1; // Construct the DXT1 block in place in the caller's output buffer.
+	QuickCompress::compressDXT1a(rgba, block); // alphaMode and compressionOptions are not consulted here.
+}
+
+void FastCompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockDXT3 * block = new(output) BlockDXT3; // Construct the DXT3 block in place in the caller's output buffer.
+	QuickCompress::compressDXT3(rgba, block); // alphaMode and compressionOptions are not consulted here.
+}
+
+void FastCompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockDXT5 * block = new(output) BlockDXT5; // Construct the DXT5 block in place in the caller's output buffer.
+	QuickCompress::compressDXT5(rgba, block); // alphaMode and compressionOptions are not consulted here.
+}
+
+void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	rgba.swizzle(4, 1, 5, 0); // 0xFF, G, 0, R
+	// The swizzle rewrites the caller's block in place; the rearranged block is then compressed as regular DXT5.
+	BlockDXT5 * block = new(output) BlockDXT5;
+	QuickCompress::compressDXT5(rgba, block);
+}
+
+
+inline static Vector3 vec(nvsquish::Vec3 v) { return Vector3(v.X(), v.Y(), v.Z()); } // Copies a squish Vec3 component-wise into a Vector3.
+
+void NormalCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	nvsquish::WeightedClusterFit fit;
+	fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
+	// Squish only supplies the endpoints; QuickCompress::outputBlock3/4 build the indices and the final block.
+    BlockDXT1 * block = new(output) BlockDXT1;
+    if (rgba.isSingleColor())
+	{
+		OptimalCompress::compressDXT1(rgba.color(0), block);
+	}
+	else
+	{
+		nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0);
+		fit.SetColourSet(&colours, nvsquish::kDxt1);
+		
+        nvsquish::Vec3 start, end;
+        
+        fit.Compress4(&start, &end); // NOTE(review): result ignored; start/end stay unset if Compress4 reports no improvement.
+        QuickCompress::outputBlock4(rgba, vec(start), vec(end), block);
+        // The 3-color result overwrites the 4-color block only when Compress3 returns true (presumably: lower error - confirm).
+        if (fit.Compress3(&start, &end)) {
+            QuickCompress::outputBlock3(rgba, vec(start), vec(end), block);
+        }
+	}
+}
+
+
+void NormalCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+#pragma NV_MESSAGE("NormalCompressorDXT1a - Not implemented!")
+    /* Stub: the previous squish-based implementation is kept below for reference only; nothing is written to output.
+    uint alphaMask = 0;
+	for (uint i = 0; i < 16; i++)
+	{
+		if (rgba.color(i).a == 0) alphaMask |= (3 << (i * 2)); // Set two bits for each color.
+	}
+
+	const bool isSingleColor = rgba.isSingleColor();
+	
+	if (isSingleColor)
+	{
+		BlockDXT1 * block = new(output) BlockDXT1;
+        OptimalCompress::compressDXT1a(rgba.color(0), alphaMask, block);
+	}
+	else
+	{
+		nvsquish::WeightedClusterFit fit;
+		fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
+
+		int flags = nvsquish::kDxt1;
+		if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
+
+		nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
+		fit.SetColourSet(&colours, nvsquish::kDxt1);
+
+		fit.Compress(output);
+	}
+    */
+}
+
+
+void NormalCompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+#pragma NV_MESSAGE("NormalCompressorDXT3 - Not implemented!")
+    /* Stub: the previous squish-based implementation is kept below for reference only; nothing is written to output.
+	BlockDXT3 * block = new(output) BlockDXT3;
+
+	// Compress explicit alpha.
+	OptimalCompress::compressDXT3A(rgba, &block->alpha);
+
+	// Compress color.
+	if (rgba.isSingleColor())
+	{
+		OptimalCompress::compressDXT1(rgba.color(0), &block->color);
+	}
+	else
+	{
+		nvsquish::WeightedClusterFit fit;
+		fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
+
+		int flags = 0;
+		if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
+
+		nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
+		fit.SetColourSet(&colours, 0);
+		fit.Compress(&block->color);
+	}
+    */
+}
+
+
+void NormalCompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+#pragma NV_MESSAGE("NormalCompressorDXT5 - Not implemented!")
+    /* Stub: the previous squish-based implementation is kept below for reference only; nothing is written to output.
+	BlockDXT5 * block = new(output) BlockDXT5;
+
+	// Compress alpha.
+	if (compressionOptions.quality == Quality_Highest)
+	{
+		OptimalCompress::compressDXT5A(rgba, &block->alpha);
+	}
+	else
+	{
+		QuickCompress::compressDXT5A(rgba, &block->alpha);
+	}
+
+	// Compress color.
+	if (rgba.isSingleColor())
+	{
+		OptimalCompress::compressDXT1(rgba.color(0), &block->color);
+	}
+	else
+	{
+		nvsquish::WeightedClusterFit fit;
+		fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
+
+		int flags = 0;
+		if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
+
+		nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
+		fit.SetColourSet(&colours, 0);
+		fit.Compress(&block->color);
+	}
+    */
+}
+
+
+void NormalCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+#pragma NV_MESSAGE("NormalCompressorDXT5n - Not implemented!")
+    /* Stub: the previous squish-based implementation is kept below for reference only; nothing is written to output.
+	BlockDXT5 * block = new(output) BlockDXT5;
+
+	// Compress Y.
+	if (compressionOptions.quality == Quality_Highest)
+	{
+		OptimalCompress::compressDXT1G(rgba, &block->color);
+	}
+	else
+	{
+		if (rgba.isSingleColor(Color32(0, 0xFF, 0, 0))) // Mask all but green channel.
+		{
+			OptimalCompress::compressDXT1G(rgba.color(0).g, &block->color);
+		}
+		else
+		{
+            ColorBlock tile = rgba;
+            tile.swizzle(4, 1, 5, 3); // leave alpha in alpha channel.
+
+			nvsquish::WeightedClusterFit fit;
+			fit.SetMetric(0, 1, 0);
+
+			int flags = 0;
+			if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
+
+			nvsquish::ColourSet colours((uint8 *)tile.colors(), flags);
+			fit.SetColourSet(&colours, 0);
+			fit.Compress(&block->color);
+		}
+	}
+
+	rgba.swizzle(4, 1, 5, 0); // 1, G, 0, R
+
+	// Compress X.
+	if (compressionOptions.quality == Quality_Highest)
+	{
+		OptimalCompress::compressDXT5A(rgba, &block->alpha);
+	}
+	else
+	{
+		QuickCompress::compressDXT5A(rgba, &block->alpha);
+	}
+    */
+}
+
+
+#if defined(HAVE_S3QUANT)
+
+void S3CompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+	float error = 0.0f; // Sum of the errors of the emitted blocks. NOTE(review): accumulated but never read or returned.
+
+	BlockDXT1 dxtBlock3;
+	BlockDXT1 dxtBlock4;
+	ColorBlock block;
+	// Walk the image in 4x4 tiles; each tile is coded twice (inLevel 4 and inLevel 3) and the lower-error block is emitted.
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			block.init(inputFormat, w, h, data, x, y);
+
+			// Init rgb block.
+			RGBBlock rgbBlock;
+			rgbBlock.n = 16;
+			for (uint i = 0; i < 16; i++) {
+				rgbBlock.colorChannel[i][0] = clamp(float(block.color(i).r) / 255.0f, 0.0f, 1.0f);
+				rgbBlock.colorChannel[i][1] = clamp(float(block.color(i).g) / 255.0f, 0.0f, 1.0f);
+				rgbBlock.colorChannel[i][2] = clamp(float(block.color(i).b) / 255.0f, 0.0f, 1.0f);
+			}
+			rgbBlock.weight[0] = 1.0f;
+			rgbBlock.weight[1] = 1.0f;
+			rgbBlock.weight[2] = 1.0f;
+
+			rgbBlock.inLevel = 4; // First pass; its result is stored in dxtBlock4.
+			CodeRGBBlock(&rgbBlock);
+
+			// Copy results to DXT block.
+			dxtBlock4.col0.r = rgbBlock.endPoint[0][0];
+			dxtBlock4.col0.g = rgbBlock.endPoint[0][1];
+			dxtBlock4.col0.b = rgbBlock.endPoint[0][2];
+
+			dxtBlock4.col1.r = rgbBlock.endPoint[1][0];
+			dxtBlock4.col1.g = rgbBlock.endPoint[1][1];
+			dxtBlock4.col1.b = rgbBlock.endPoint[1][2];
+
+			dxtBlock4.setIndices(rgbBlock.index);
+			// Keep col0 >= col1 for this block; swapping the endpoints requires swapping the indices that refer to them.
+			if (dxtBlock4.col0.u < dxtBlock4.col1.u) {
+				swap(dxtBlock4.col0.u, dxtBlock4.col1.u);
+				dxtBlock4.indices ^= 0x55555555; // Flips the low bit of every 2-bit index (0<->1, 2<->3).
+			}
+
+			uint error4 = blockError(block, dxtBlock4);
+
+			rgbBlock.inLevel = 3; // Second pass on the same input; its result is stored in dxtBlock3.
+
+			CodeRGBBlock(&rgbBlock);
+
+			// Copy results to DXT block.
+			dxtBlock3.col0.r = rgbBlock.endPoint[0][0];
+			dxtBlock3.col0.g = rgbBlock.endPoint[0][1];
+			dxtBlock3.col0.b = rgbBlock.endPoint[0][2];
+
+			dxtBlock3.col1.r = rgbBlock.endPoint[1][0];
+			dxtBlock3.col1.g = rgbBlock.endPoint[1][1];
+			dxtBlock3.col1.b = rgbBlock.endPoint[1][2];
+
+			dxtBlock3.setIndices(rgbBlock.index);
+			// Keep col0 <= col1 for this block (the opposite ordering from dxtBlock4).
+			if (dxtBlock3.col0.u > dxtBlock3.col1.u) {
+				swap(dxtBlock3.col0.u, dxtBlock3.col1.u);
+				dxtBlock3.indices ^= (~dxtBlock3.indices  >> 1) & 0x55555555; // Flips the low bit only where the high bit is clear (0<->1; 2 and 3 unchanged).
+			}
+
+			uint error3 = blockError(block, dxtBlock3);
+			// Emit whichever block has the lower error; ties go to dxtBlock4.
+			if (error3 < error4) {
+				error += error3;
+
+				if (outputOptions.outputHandler != NULL) {
+					outputOptions.outputHandler->writeData(&dxtBlock3, sizeof(dxtBlock3));
+				}
+			}
+			else {
+				error += error4;
+
+				if (outputOptions.outputHandler != NULL) {
+					outputOptions.outputHandler->writeData(&dxtBlock4, sizeof(dxtBlock4));
+				}
+			}
+		}
+	}
+}
+
+#endif // defined(HAVE_S3QUANT)
+
+
+#if defined(HAVE_ATITC)
+
+void AtiCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+	// Describe the caller's pixels as the source texture; the data pointer is referenced, not copied.
+	ATI_TC_Texture srcTexture;
+	srcTexture.dwSize = sizeof(srcTexture);
+	srcTexture.dwWidth = w;
+	srcTexture.dwHeight = h;
+	if (inputFormat == nvtt::InputFormat_BGRA_8UB)
+	{
+		srcTexture.dwPitch = w * 4; // 4 bytes per pixel.
+		srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
+	}
+	else
+	{
+		srcTexture.dwPitch = w * 16; // Every other input format is treated as 16 bytes per pixel (ARGB_32F).
+		srcTexture.format = ATI_TC_FORMAT_ARGB_32F;
+	}
+	srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
+	srcTexture.pData = (ATI_TC_BYTE*) data;
+
+	// Destination texture: a temporary DXT1 buffer sized by the library and freed at the end of this function.
+	ATI_TC_Texture destTexture;
+	destTexture.dwSize = sizeof(destTexture);
+	destTexture.dwWidth = w;
+	destTexture.dwHeight = h;
+	destTexture.dwPitch = 0;
+	destTexture.format = ATI_TC_FORMAT_DXT1;
+	destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
+	destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize); // NOTE(review): allocation result is not checked.
+
+	ATI_TC_CompressOptions options;
+	options.dwSize = sizeof(options);
+	options.bUseChannelWeighting = false;
+	options.bUseAdaptiveWeighting = false;
+	options.bDXT1UseAlpha = false;
+	options.nCompressionSpeed = ATI_TC_Speed_Normal;
+	options.bDisableMultiThreading = false;
+	//options.bDisableMultiThreading = true;
+
+	// Compress. NOTE(review): the return value of ATI_TC_ConvertTexture is ignored.
+	ATI_TC_ConvertTexture(&srcTexture, &destTexture, &options, NULL, NULL, NULL);
+
+	if (outputOptions.outputHandler != NULL) {
+		outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
+	}
+
+	mem::free(destTexture.pData);
+}
+
+void AtiCompressorDXT5::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+	// Describe the caller's pixels as the source texture; the data pointer is referenced, not copied.
+	ATI_TC_Texture srcTexture;
+	srcTexture.dwSize = sizeof(srcTexture);
+	srcTexture.dwWidth = w;
+	srcTexture.dwHeight = h;
+	if (inputFormat == nvtt::InputFormat_BGRA_8UB)
+	{
+		srcTexture.dwPitch = w * 4; // 4 bytes per pixel.
+		srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
+	}
+	else
+	{
+		srcTexture.dwPitch = w * 16; // Every other input format is treated as 16 bytes per pixel (ARGB_32F).
+		srcTexture.format = ATI_TC_FORMAT_ARGB_32F;
+	}
+	srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
+	srcTexture.pData = (ATI_TC_BYTE*) data;
+
+	// Destination texture: a temporary DXT5 buffer sized by the library and freed at the end of this function.
+	ATI_TC_Texture destTexture;
+	destTexture.dwSize = sizeof(destTexture);
+	destTexture.dwWidth = w;
+	destTexture.dwHeight = h;
+	destTexture.dwPitch = 0;
+	destTexture.format = ATI_TC_FORMAT_DXT5;
+	destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
+	destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize); // NOTE(review): allocation result is not checked.
+
+	// Compress. No options struct is passed here; the return value is ignored.
+	ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL);
+
+	if (outputOptions.outputHandler != NULL) {
+		outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
+	}
+
+	mem::free(destTexture.pData);
+}
+
+#endif // defined(HAVE_ATITC)
+
+#if defined(HAVE_SQUISH)
+
+void SquishCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+#pragma message(NV_FILE_LINE "TODO: Convert input to fixed point ABGR format instead of ARGB")
+	/* Stub: the implementation below is disabled (it refers to an 'image' that is not a parameter here); nothing is written to the output handler.
+	Image img(*image);
+	int count = img.width() * img.height();
+	for (int i = 0; i < count; i++)
+	{
+		Color32 c = img.pixel(i);
+		img.pixel(i) = Color32(c.b, c.g, c.r, c.a);
+	}
+
+	int size = squish::GetStorageRequirements(img.width(), img.height(), squish::kDxt1);
+	void * blocks = mem::malloc(size);
+
+	squish::CompressImage((const squish::u8 *)img.pixels(), img.width(), img.height(), blocks, squish::kDxt1 | squish::kColourClusterFit);
+
+	if (outputOptions.outputHandler != NULL) {
+		outputOptions.outputHandler->writeData(blocks, size);
+	}
+
+	mem::free(blocks);
+	*/
+}
+
+#endif // defined(HAVE_SQUISH)
+
+
+#if defined(HAVE_D3DX)
+
+void D3DXCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+	// Compresses the image with D3DX on the reference device and passes the locked DXT1 surface to the output handler.
+	IDirect3D9 * d3d = Direct3DCreate9(D3D_SDK_VERSION);
+	if (d3d == NULL) return; // Direct3D could not be initialized; nothing to clean up.
+
+	D3DPRESENT_PARAMETERS presentParams;
+	ZeroMemory(&presentParams, sizeof(presentParams));
+	presentParams.Windowed = TRUE;
+	presentParams.SwapEffect = D3DSWAPEFFECT_COPY;
+	presentParams.BackBufferWidth = 8;
+	presentParams.BackBufferHeight = 8;
+	presentParams.BackBufferFormat = D3DFMT_UNKNOWN;
+
+	// Each step runs only if the previous one succeeded, so a failed call is never followed by a NULL dereference.
+	IDirect3DDevice9 * device = NULL;
+	HRESULT err = d3d->CreateDevice(D3DADAPTER_DEFAULT, D3DDEVTYPE_REF, GetDesktopWindow(), D3DCREATE_SOFTWARE_VERTEXPROCESSING, &presentParams, &device);
+	IDirect3DTexture9 * texture = NULL;
+	if (SUCCEEDED(err)) err = D3DXCreateTexture(device, w, h, 1, 0, D3DFMT_DXT1, D3DPOOL_SYSTEMMEM, &texture);
+	IDirect3DSurface9 * surface = NULL;
+	if (SUCCEEDED(err)) err = texture->GetSurfaceLevel(0, &surface);
+
+	if (SUCCEEDED(err))
+	{
+		RECT rect;
+		rect.left = 0;
+		rect.top = 0;
+		rect.bottom = h;
+		rect.right = w;
+
+		// 8-bit BGRA input has a 4 byte/pixel pitch; any other input is loaded as 4 x 32-bit float (16 bytes/pixel).
+		if (inputFormat == nvtt::InputFormat_BGRA_8UB) err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A8R8G8B8, w * 4, NULL, &rect, D3DX_DEFAULT, 0);
+		else err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A32B32G32R32F, w * 16, NULL, &rect, D3DX_DEFAULT, 0);
+	}
+
+	if (SUCCEEDED(err))
+	{
+		D3DLOCKED_RECT lockedRect; // Distinct name: no longer shadows the source RECT used for loading.
+		ZeroMemory(&lockedRect, sizeof(lockedRect));
+
+		err = surface->LockRect(&lockedRect, NULL, D3DLOCK_READONLY);
+		if (SUCCEEDED(err))
+		{
+			if (outputOptions.outputHandler != NULL) {
+				int size = lockedRect.Pitch * ((h + 3) / 4); // Pitch times the number of 4-pixel-high block rows.
+				outputOptions.outputHandler->writeData(lockedRect.pBits, size);
+			}
+			surface->UnlockRect();
+		}
+	}
+
+	// Release everything that was acquired; the texture holds its own reference and must be released as well.
+	if (surface != NULL) surface->Release();
+	if (texture != NULL) texture->Release();
+	if (device != NULL) device->Release();
+	d3d->Release();
+}
+
+#endif // defined(HAVE_D3DX)
+
+
+#if defined(HAVE_STB)
+
+void StbCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	rgba.swizzle(2, 1, 0, 3); // Swap R and B (modifies the caller's block in place).
+	stb_compress_dxt_block((unsigned char *)output, (unsigned char *)rgba.colors(), 0, 0); // NOTE(review): trailing 0, 0 are presumably stb's alpha and mode arguments - confirm against stb_dxt.h.
+}
+
+
+#endif // defined(HAVE_STB)
diff --git a/src/nvtt/QuickCompressDXT.cpp b/src/nvtt/QuickCompressDXT.cpp
index 5f8e4b8..369b0d5 100644
--- a/src/nvtt/QuickCompressDXT.cpp
+++ b/src/nvtt/QuickCompressDXT.cpp
@@ -217,6 +217,33 @@ inline static uint computeIndices3(const ColorBlock & rgba, Vector3::Arg maxColo
 	return indices;
 }
 
+inline static uint computeIndices3(const Vector3 block[16], Vector3::Arg maxColor, Vector3::Arg minColor)
+{
+	Vector3 palette[4]; // Only slots 0..2 are used: minColor, maxColor and their midpoint.
+	palette[0] = minColor;
+	palette[1] = maxColor;
+	palette[2] = (palette[0] + palette[1]) * 0.5f;
+	// Pack the nearest palette slot of each of the 16 colors into 2 bits per color; index 3 is never produced here.
+	uint indices = 0;
+	for(int i = 0; i < 16; i++)
+	{
+		float d0 = colorDistance(palette[0], block[i]);
+		float d1 = colorDistance(palette[1], block[i]);
+		float d2 = colorDistance(palette[2], block[i]);
+		
+		uint index;
+		if (d0 < d1 && d0 < d2) index = 0; // Strict comparisons: ties fall through to the later slots.
+		else if (d1 < d2) index = 1;
+		else index = 2;
+		
+		indices |= index << (2 * i);
+	}
+
+	return indices;
+}
+
+
+
 
 static void optimizeEndPoints4(Vector3 block[16], BlockDXT1 * dxtBlock)
 {
@@ -266,7 +293,7 @@ static void optimizeEndPoints4(Vector3 block[16], BlockDXT1 * dxtBlock)
 	dxtBlock->indices = computeIndices4(block, a, b);
 }
 
-/*static void optimizeEndPoints3(Vector3 block[16], BlockDXT1 * dxtBlock)
+static void optimizeEndPoints3(Vector3 block[16], BlockDXT1 * dxtBlock)
 {
 	float alpha2_sum = 0.0f;
 	float beta2_sum = 0.0f;
@@ -278,7 +305,7 @@ static void optimizeEndPoints4(Vector3 block[16], BlockDXT1 * dxtBlock)
 	{
 		const uint bits = dxtBlock->indices >> (2 * i);
 
-		float beta = (bits & 1);
+		float beta = float(bits & 1);
 		if (bits & 2) beta = 0.5f;
 		float alpha = 1.0f - beta;
 
@@ -312,7 +339,7 @@ static void optimizeEndPoints4(Vector3 block[16], BlockDXT1 * dxtBlock)
 	dxtBlock->col0 = Color16(color1);
 	dxtBlock->col1 = Color16(color0);
 	dxtBlock->indices = computeIndices3(block, a, b);
-}*/
+}
 
 namespace
 {
@@ -571,7 +598,7 @@ void QuickCompress::compressDXT1a(const ColorBlock & rgba, BlockDXT1 * dxtBlock)
 		
 		dxtBlock->col0 = Color16(color1);
 		dxtBlock->col1 = Color16(color0);
-		dxtBlock->indices = computeIndices3(rgba, maxColor, minColor);
+		dxtBlock->indices = computeIndices3(block, maxColor, minColor);
 		
 		//	optimizeEndPoints(block, dxtBlock);
 	}
@@ -634,3 +661,51 @@ void QuickCompress::compressDXT5(const ColorBlock & rgba, BlockDXT5 * dxtBlock,
 	compressDXT1(rgba, &dxtBlock->color);
 	compressDXT5A(rgba, &dxtBlock->alpha, iterationCount);
 }
+
+
+
+void QuickCompress::outputBlock4(const ColorBlock & rgba, const Vector3 & start, const Vector3 & end, BlockDXT1 * dxtBlock)
+{
+	Vector3 block[16];
+	extractColorBlockRGB(rgba, block);
+	// Builds a block with col0 >= col1 from the given endpoints (presumably in [0,1], hence the scale by 255 - confirm).
+    Vector3 maxColor = start * 255;
+    Vector3 minColor = end * 255;
+	uint16 color0 = roundAndExpand(&maxColor);
+	uint16 color1 = roundAndExpand(&minColor);
+
+	if (color0 < color1)
+	{
+		swap(maxColor, minColor);
+		swap(color0, color1);
+	}
+
+	dxtBlock->col0 = Color16(color0);
+	dxtBlock->col1 = Color16(color1);
+	dxtBlock->indices = computeIndices4(block, maxColor, minColor);
+	// Refine the endpoints (and recompute the indices) starting from the assignment above.
+	optimizeEndPoints4(block, dxtBlock);
+}
+
+void QuickCompress::outputBlock3(const ColorBlock & rgba, const Vector3 & start, const Vector3 & end, BlockDXT1 * dxtBlock)
+{
+	Vector3 block[16];
+	extractColorBlockRGB(rgba, block);
+	// Builds a block with col0 <= col1; minColor always travels with color0/col0 and maxColor with color1/col1.
+    Vector3 minColor = start * 255;
+    Vector3 maxColor = end * 255;
+	uint16 color0 = roundAndExpand(&minColor);
+	uint16 color1 = roundAndExpand(&maxColor);
+
+	if (color0 > color1)
+	{
+		swap(maxColor, minColor);
+		swap(color0, color1);
+	}
+	// computeIndices3 puts its last argument in palette slot 0, so index 0 must be the color stored in col0.
+	dxtBlock->col0 = Color16(color0);
+	dxtBlock->col1 = Color16(color1);
+    dxtBlock->indices = computeIndices3(block, maxColor, minColor);
+	// Refine the endpoints (and recompute the indices) starting from the assignment above.
+	optimizeEndPoints3(block, dxtBlock);
+}
\ No newline at end of file
diff --git a/src/nvtt/QuickCompressDXT.h b/src/nvtt/QuickCompressDXT.h
index 43d48cb..f7140c0 100644
--- a/src/nvtt/QuickCompressDXT.h
+++ b/src/nvtt/QuickCompressDXT.h
@@ -35,6 +35,7 @@ namespace nv
 	struct BlockDXT5;
 	struct AlphaBlockDXT3;
 	struct AlphaBlockDXT5;
+    class Vector3;
 
 	namespace QuickCompress
 	{
@@ -45,6 +46,9 @@ namespace nv
 		
 		void compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dxtBlock, int iterationCount=8);
 		void compressDXT5(const ColorBlock & rgba, BlockDXT5 * dxtBlock, int iterationCount=8);
+
+        void outputBlock4(const ColorBlock & rgba, const Vector3 & start, const Vector3 & end, BlockDXT1 * block);
+        void outputBlock3(const ColorBlock & rgba, const Vector3 & start, const Vector3 & end, BlockDXT1 * block);
 	}
 } // nv namespace
 
diff --git a/src/nvtt/TexImage.cpp b/src/nvtt/TexImage.cpp
index ea719f3..d608c98 100644
--- a/src/nvtt/TexImage.cpp
+++ b/src/nvtt/TexImage.cpp
@@ -299,6 +299,46 @@ const float * TexImage::data() const
     return m->image->channel(0);
 }
 
+void TexImage::histogram(int channel, float rangeMin, float rangeMax, int binCount, int * binPtr) const
+{
+    // Adds the channel's values to binPtr[0..binCount-1] without clearing it, so several histograms can be accumulated.
+    // Values outside [rangeMin, rangeMax] are clamped into the first/last bin.
+    if (m->image == NULL || binPtr == NULL || binCount <= 0) return;
+    if (!(rangeMax > rangeMin)) return; // Empty or inverted range: there is no bin width to map values into.
+
+    const float * c = m->image->channel(channel);
+
+    // Map [rangeMin, rangeMax] onto [0, binCount]: the divisor is the range extent, not rangeMax.
+    const float scale = float(binCount) / (rangeMax - rangeMin);
+    const float bias = - scale * rangeMin;
+    const uint count = m->image->width() * m->image->height();
+    for (uint i = 0; i < count; i++) {
+        float f = c[i] * scale + bias;
+        int idx = ifloor(f);
+        if (idx < 0) idx = 0;
+        if (idx > binCount-1) idx = binCount-1;
+        binPtr[idx]++;
+    }
+}
+
+void TexImage::range(int channel, float * rangeMin, float * rangeMax)
+{
+    Vector2 range(FLT_MAX, -FLT_MAX);
+
+    // Without an image there is nothing to scan: report the empty (inverted) range instead of dereferencing NULL.
+    FloatImage * img = m->image;
+    const uint count = (img != NULL) ? img->width() * img->height() : 0;
+    const float * c = (img != NULL) ? img->channel(channel) : NULL;
+    for (uint p = 0; p < count; p++) {
+        float f = c[p];
+        if (f < range.x) range.x = f;
+        if (f > range.y) range.y = f;
+    }
+
+    *rangeMin = range.x;
+    *rangeMax = range.y;
+}
+
 
 bool TexImage::load(const char * fileName)
 {
@@ -320,8 +360,6 @@ bool TexImage::load(const char * fileName)
 
 bool TexImage::save(const char * fileName) const
 {
-#pragma NV_MESSAGE("TODO: Add support for DDS textures in TexImage::save")
-
     if (m->image != NULL)
     {
         return ImageIO::saveFloat(fileName, m->image, 0, 4);
@@ -989,33 +1027,19 @@ void TexImage::scaleAlphaToCoverage(float coverage, float alphaRef/*= 0.5f*/)
     m->image->scaleAlphaToCoverage(coverage, alphaRef, 3);
 }
 
-bool TexImage::normalizeRange(float * rangeMin, float * rangeMax)
+/*bool TexImage::normalizeRange(float * rangeMin, float * rangeMax)
 {
     if (m->image == NULL) return false;
 
-    Vector2 range(FLT_MAX, -FLT_MAX);
-
-    // Compute range.
-    FloatImage * img = m->image;
+    range(0, rangeMin, rangeMax);
 
-    const uint count = img->count();
-    for (uint p = 0; p < count; p++) {
-        float c = img->pixel(p);
-
-        if (c < range.x) range.x = c;
-        if (c > range.y) range.y = c;
-    }
-
-    if (range.x == range.y) {
+    if (*rangeMin == *rangeMax) {
         // Single color image.
         return false;
     }
 
-    *rangeMin = range.x;
-    *rangeMax = range.y;
-
-    const float scale = 1.0f / (range.y - range.x);
-    const float bias = range.x * scale;
+    const float scale = 1.0f / (*rangeMax - *rangeMin);
+    const float bias = *rangeMin * scale;
 
     if (range.x == 0.0f && range.y == 1.0f) {
         // Already normalized.
@@ -1029,7 +1053,7 @@ bool TexImage::normalizeRange(float * rangeMin, float * rangeMax)
     //img->clamp(0, 4, 0.0f, 1.0f);
 
     return true;
-}
+}*/
 
 // Ideally you should compress/quantize the RGB and M portions independently.
 // Once you have M quantized, you would compute the corresponding RGB and quantize that.
@@ -1054,7 +1078,6 @@ void TexImage::toRGBM(float range/*= 1*/, float threshold/*= 0.25*/)
         float B = nv::clamp(b[i] * irange, 0.0f, 1.0f);
 
         float M = max(max(R, G), max(B, 1e-6f)); // Avoid division by zero.
-        //m = quantizeCeil(m, 8);
 
         r[i] = R / M;
         g[i] = G / M;
@@ -1233,20 +1256,19 @@ void TexImage::toLUVW(float range/*= 1.0f*/)
         float G = nv::clamp(g[i] * irange, 0.0f, 1.0f);
         float B = nv::clamp(b[i] * irange, 0.0f, 1.0f);
 
-        float L = max(sqrtf(R*R + G*G + B*B), 1e-6f)); // Avoid division by zero.
-        //m = quantizeCeil(m, 8);
+        float L = max(sqrtf(R*R + G*G + B*B), 1e-6f); // Avoid division by zero.
 
         r[i] = R / L;
         g[i] = G / L;
         b[i] = B / L;
-        a[i] = L;
+        a[i] = L / sqrtf(3);
     }
 }
 
 void TexImage::fromLUVW(float range/*= 1.0f*/)
 {
     // Decompression is the same as in RGBM.
-    fromRGBM(range);
+    fromRGBM(range * sqrtf(3)); // Compensates for the division by sqrtf(3) that toLUVW applies to the stored L.
 }
 
 
@@ -1435,10 +1457,52 @@ float nvtt::rmsAlphaError(const TexImage & reference, const TexImage & image)
     return float(sqrt(mse / count));
 }
 
-TexImage nvtt::diff(const TexImage & reference, const TexImage & image)
+TexImage nvtt::diff(const TexImage & reference, const TexImage & image, float scale)
 {
-    // @@ TODO.
-    return TexImage();
+    const FloatImage * ref = reference.m->image;
+    const FloatImage * img = image.m->image;
+    // An empty TexImage is returned when either input has no image or their width/height differ.
+    if (img == NULL || ref == NULL || img->width() != ref->width() || img->height() != ref->height()) {
+        return TexImage();
+    }
+    nvDebugCheck(img->componentNum() == 4);
+    nvDebugCheck(ref->componentNum() == 4);
+
+    nvtt::TexImage diffImage;
+    FloatImage * diff = diffImage.m->image = new FloatImage;
+    diff->allocate(4, img->width(), img->height());
+
+    const uint count = img->width() * img->height();
+    for (uint i = 0; i < count; i++)
+    {
+        float r0 = img->pixel(i, 0);
+        float g0 = img->pixel(i, 1);
+        float b0 = img->pixel(i, 2);
+        //float a0 = img->pixel(i, 3);
+        float r1 = ref->pixel(i, 0);
+        float g1 = ref->pixel(i, 1);
+        float b1 = ref->pixel(i, 2);
+        float a1 = ref->pixel(i, 3);
+        // Signed per-channel difference (image - reference); alpha is not differenced.
+        float dr = r0 - r1;
+        float dg = g0 - g1;
+        float db = b0 - b1;
+        //float da = a0 - a1;
+        // With AlphaMode_Transparency the color differences are weighted by the reference alpha.
+        if (reference.alphaMode() == nvtt::AlphaMode_Transparency)
+        {
+            dr *= a1;
+            dg *= a1;
+            db *= a1;
+        }
+        // RGB receives the scaled difference; the alpha channel carries the reference alpha unchanged.
+        diff->pixel(i, 0) = dr * scale;
+        diff->pixel(i, 1) = dg * scale;
+        diff->pixel(i, 2) = db * scale;
+        diff->pixel(i, 3) = a1;
+    }
+
+    return diffImage;
 }
 
 
diff --git a/src/nvtt/nvtt.h b/src/nvtt/nvtt.h
index 43e9a98..b82becb 100644
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@@ -395,6 +395,8 @@ namespace nvtt
         NVTT_API float alphaTestCoverage(float alphaRef = 0.5) const;
         NVTT_API float average(int channel) const;
         NVTT_API const float * data() const;
+        NVTT_API void histogram(int channel, float rangeMin, float rangeMax, int binCount, int * binPtr) const;
+        NVTT_API void range(int channel, float * rangeMin, float * rangeMax);
 
         // Texture data.
         NVTT_API bool load(const char * fileName);
@@ -426,7 +428,7 @@ namespace nvtt
         NVTT_API void setBorder(float r, float g, float b, float a);
         NVTT_API void fill(float r, float g, float b, float a);
         NVTT_API void scaleAlphaToCoverage(float coverage, float alphaRef = 0.5f);
-        NVTT_API bool normalizeRange(float * rangeMin, float * rangeMax);
+        //NVTT_API bool normalizeRange(float * rangeMin, float * rangeMax);
         NVTT_API void toRGBM(float range = 1.0f, float threshold = 0.0f);
         NVTT_API void fromRGBM(float range = 1.0f);
         NVTT_API void toYCoCg();
@@ -451,9 +453,9 @@ namespace nvtt
         NVTT_API bool copyChannel(const TexImage & srcImage, int srcChannel, int dstChannel);
 
         // Error compare.
-        friend float rmsError(const TexImage & reference, const TexImage & img);
-        friend float rmsAlphaError(const TexImage & reference, const TexImage & img);
-        friend TexImage diff(const TexImage & reference, const TexImage & img);
+        NVTT_API friend float rmsError(const TexImage & reference, const TexImage & img);
+        NVTT_API friend float rmsAlphaError(const TexImage & reference, const TexImage & img);
+        NVTT_API friend TexImage diff(const TexImage & reference, const TexImage & img, float scale);
 
     private:
         void detach();
@@ -471,7 +473,7 @@ namespace nvtt
 
     NVTT_API float rmsError(const TexImage & reference, const TexImage & img);
     NVTT_API float rmsAlphaError(const TexImage & reference, const TexImage & img);
-    NVTT_API TexImage diff(const TexImage & reference, const TexImage & img);
+    NVTT_API TexImage diff(const TexImage & reference, const TexImage & img, float scale);
 
 } // nvtt namespace
 
diff --git a/src/nvtt/squish/colourfit.cpp b/src/nvtt/squish/colourfit.cpp
index 7df7047..f67a67f 100644
--- a/src/nvtt/squish/colourfit.cpp
+++ b/src/nvtt/squish/colourfit.cpp
@@ -38,21 +38,21 @@ void ColourFit::SetColourSet( ColourSet const* colours, int flags )
 	m_flags = flags;
 }
 
-void ColourFit::Compress( void* block )
+void ColourFit::Compress( Vec3 * start, Vec3 * end )
 {
 	bool isDxt1 = ( ( m_flags & kDxt1 ) != 0 );
 	if( isDxt1 )
 	{
-		Compress3( block );
+		Compress3( start, end ); // NOTE(review): bool result ignored, so the caller cannot tell whether start/end come from the 3- or 4-colour fit.
 	
 		if( !m_colours->IsTransparent() )
 		{		
-			Compress4( block );
+			Compress4( start, end ); // May overwrite the endpoints written by Compress3 above.
 		}
 	}
 	else
 	{
-		Compress4( block );
+		Compress4( start, end ); // Non-DXT1 flags: only the 4-colour fit is run.
 	}
 }
 
diff --git a/src/nvtt/squish/colourfit.h b/src/nvtt/squish/colourfit.h
index 4c4dc6b..9e6281b 100644
--- a/src/nvtt/squish/colourfit.h
+++ b/src/nvtt/squish/colourfit.h
@@ -40,11 +40,11 @@ public:
 
 	void SetColourSet( ColourSet const* colours, int flags );
 
-	void Compress( void* block );
+	void Compress( Vec3 * start, Vec3 * end );
 
 protected:
-	virtual void Compress3( void* block ) = 0;
-	virtual void Compress4( void* block ) = 0;
+	virtual bool Compress3( Vec3 * start, Vec3 * end ) = 0;
+	virtual bool Compress4( Vec3 * start, Vec3 * end ) = 0;
 
 	ColourSet const* m_colours;
 	int m_flags;
diff --git a/src/nvtt/squish/weightedclusterfit.cpp b/src/nvtt/squish/weightedclusterfit.cpp
index 9181249..23e4fa6 100644
--- a/src/nvtt/squish/weightedclusterfit.cpp
+++ b/src/nvtt/squish/weightedclusterfit.cpp
@@ -129,7 +129,7 @@ float WeightedClusterFit::GetBestError() const
 
 #if SQUISH_USE_SIMD
 
-void WeightedClusterFit::Compress3( void* block )
+bool WeightedClusterFit::Compress3( Vec3 * start, Vec3 * end )
 {
     int const count = m_colours->GetCount();
 	Vec4 const one = VEC4_CONST(1.0f);
@@ -212,7 +212,7 @@ void WeightedClusterFit::Compress3( void* block )
 	if( CompareAnyLessThan( besterror, m_besterror ) )
 	{
 		// compute indices from cluster sizes.
-		u8 bestindices[16];
+		/*u8 bestindices[16];
 		{
 			int i = 0;
 			for(; i < b0; i++) {
@@ -233,16 +233,22 @@ void WeightedClusterFit::Compress3( void* block )
 		
 		m_colours->RemapIndices( ordered, bestindices );
 
-
 		// save the block
-		WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
-		
+		WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );*/
+
+        *start = beststart.GetVec3();
+        *end = bestend.GetVec3();
+
 		// save the error
 		m_besterror = besterror;
+
+        return true;
 	}
+
+    return false;
 }
 
-void WeightedClusterFit::Compress4( void* block )
+bool WeightedClusterFit::Compress4( Vec3 * start, Vec3 * end )
 {
     int const count = m_colours->GetCount();
 	Vec4 const one = VEC4_CONST(1.0f);
@@ -334,7 +340,7 @@ void WeightedClusterFit::Compress4( void* block )
 	// save the block if necessary
 	if( CompareAnyLessThan( besterror, m_besterror ) )
 	{
-		// compute indices from cluster sizes.
+		/*// compute indices from cluster sizes.
 		u8 bestindices[16];
 		{
 			int i = 0;
@@ -360,11 +366,18 @@ void WeightedClusterFit::Compress4( void* block )
         m_colours->RemapIndices( ordered, bestindices );
 
 		// save the block
-		WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+		WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );*/
+
+        *start = beststart.GetVec3();
+        *end = bestend.GetVec3();
 		
 		// save the error
 		m_besterror = besterror;
+
+        return true;
 	}
+
+    return false;
 }
 
 #else
diff --git a/src/nvtt/squish/weightedclusterfit.h b/src/nvtt/squish/weightedclusterfit.h
index a0a45fb..66983ba 100644
--- a/src/nvtt/squish/weightedclusterfit.h
+++ b/src/nvtt/squish/weightedclusterfit.h
@@ -45,8 +45,8 @@ public:
 	float GetBestError() const;
 
 	// Make them public
-	virtual void Compress3( void* block );
-	virtual void Compress4( void* block );
+	bool Compress3( Vec3 * start, Vec3 * end );
+	bool Compress4( Vec3 * start, Vec3 * end );
 	
 private:
 
diff --git a/src/nvtt/tests/testsuite.cpp b/src/nvtt/tests/testsuite.cpp
index 48ec61a..7b73ee8 100644
--- a/src/nvtt/tests/testsuite.cpp
+++ b/src/nvtt/tests/testsuite.cpp
@@ -145,6 +145,11 @@ static const char * s_witnessImageSet[] = {
     "specRuin-puzzle.tga"
 };
 
+static const char * s_witnessLmapImageSet[] = { // File names of the "Lightmap" entry in s_imageSets.
+    "specruin.dds",
+};
+
+
 enum Mode {
     Mode_BC1,
     Mode_BC1_Alpha,
@@ -152,9 +157,12 @@ enum Mode {
     Mode_BC3_Alpha,
     Mode_BC3_YCoCg,
     Mode_BC3_RGBM,
+    Mode_BC3_LUVW,
     Mode_BC1_Normal,
     Mode_BC3_Normal,
     Mode_BC5_Normal,
+    Mode_BC3_Lightmap_1,
+    Mode_BC3_Lightmap_2,
 };
 static const char * s_modeNames[] = {
     "BC1",
@@ -167,6 +175,8 @@ static const char * s_modeNames[] = {
     "BC1-Normal",
     "BC3-Normal",
     "BC5-Normal",
+    "BC3-RGBM",
+    "BC3-LUVW",
 };
 
 struct Test {
@@ -175,26 +185,29 @@ struct Test {
     Mode modes[4];
 };
 static Test s_imageTests[] = {
-    {"DXT Color", 3, {Mode_BC1, Mode_BC3_YCoCg, Mode_BC3_RGBM}},
+    {"DXT Color", 1, {Mode_BC1, Mode_BC3_YCoCg, Mode_BC3_RGBM, Mode_BC3_LUVW}}, // NOTE(review): count is 1 although four modes are listed; the per-mode loop in main() stops at count, so presumably only Mode_BC1 runs -- confirm this is intentional.
     {"DXT Alpha", 3, {Mode_BC1_Alpha, Mode_BC2_Alpha, Mode_BC3_Alpha}},
     {"DXT Normal", 3, {Mode_BC1_Normal, Mode_BC3_Normal, Mode_BC5_Normal}},
+    {"DXT Lightmap", 2, {Mode_BC3_Lightmap_1, Mode_BC3_Lightmap_2}}, // Both lightmap modes are compressed as BC3 with the alpha mode set to transparency.
 };
-const int s_testCount = ARRAY_SIZE(s_imageTests);
+const int s_imageTestCount = ARRAY_SIZE(s_imageTests);
 
 struct ImageSet
 {
     const char * name;
+    const char * basePath;  // Appended to main()'s base path before the working directory is changed to it.
     const char ** fileNames;
     int fileCount;
 };
 static ImageSet s_imageSets[] = {
-    {"Kodak",       s_kodakImageSet,        ARRAY_SIZE(s_kodakImageSet)},       // 0
-    {"Waterloo",    s_waterlooImageSet,     ARRAY_SIZE(s_waterlooImageSet)},    // 1
-    {"Epic",        s_epicImageSet,         ARRAY_SIZE(s_epicImageSet)},        // 2
-    {"Farbraush",   s_farbrauschImageSet,   ARRAY_SIZE(s_farbrauschImageSet)},  // 3
-    {"Lugaru",      s_lugaruImageSet,       ARRAY_SIZE(s_lugaruImageSet)},      // 4
-    {"Quake3",      s_quake3ImageSet,       ARRAY_SIZE(s_quake3ImageSet)},      // 5
-    {"Witness",     s_witnessImageSet,       ARRAY_SIZE(s_witnessImageSet)}     // 6
+    {"Kodak",       "kodak",        s_kodakImageSet,        ARRAY_SIZE(s_kodakImageSet)},       // 0 -- columns: name (shown in help, matched by -set), sub-directory, file list, file count
+    {"Waterloo",    "waterloo",     s_waterlooImageSet,     ARRAY_SIZE(s_waterlooImageSet)},    // 1
+    {"Epic",        "epic",         s_epicImageSet,         ARRAY_SIZE(s_epicImageSet)},        // 2
+    {"Farbrausch",  "farbrausch",   s_farbrauschImageSet,   ARRAY_SIZE(s_farbrauschImageSet)},  // 3 -- name spelled like the directory so that "-set farbrausch" matches
+    {"Lugaru",      "lugaru",       s_lugaruImageSet,       ARRAY_SIZE(s_lugaruImageSet)},      // 4
+    {"Quake3",      "quake3",       s_quake3ImageSet,       ARRAY_SIZE(s_quake3ImageSet)},      // 5
+    {"Witness",     "witness",      s_witnessImageSet,      ARRAY_SIZE(s_witnessImageSet)},     // 6
+    {"Lightmap",    "lightmap",     s_witnessLmapImageSet,  ARRAY_SIZE(s_witnessLmapImageSet)}, // 7
 };
 const int s_imageSetCount = sizeof(s_imageSets)/sizeof(s_imageSets[0]);
 
@@ -227,9 +240,10 @@ struct MyOutputHandler : public nvtt::OutputHandler
     nvtt::TexImage decompress(Mode mode, nvtt::Decoder decoder)
     {
         nvtt::Format format; 
-        if (mode == Mode_BC1) format = nvtt::Format_BC1;
+        if (mode == Mode_BC1 || mode == Mode_BC1_Alpha || mode == Mode_BC1_Normal) format = nvtt::Format_BC1;
+        else if (mode == Mode_BC2_Alpha) format = nvtt::Format_BC2;
         else if (mode == Mode_BC5_Normal) format = nvtt::Format_BC5;
-        else  format = nvtt::Format_BC3;
+        else format = nvtt::Format_BC3;
 
         nvtt::TexImage img;
         img.setImage2D(format, decoder, m_width, m_height, m_data);
@@ -263,7 +277,7 @@ int main(int argc, char *argv[])
     bool nocuda = false;
     bool showHelp = false;
     nvtt::Decoder decoder = nvtt::Decoder_Reference;
-    const char * basePath = "";
+    Path basePath = "";
     const char * outPath = "output";
     const char * regressPath = NULL;
 
@@ -274,6 +288,14 @@ int main(int argc, char *argv[])
         {
             if (i+1 < argc && argv[i+1][0] != '-') {
                 setIndex = atoi(argv[i+1]);
+
+                for (int j = 0; j < s_imageSetCount; j++) {
+                    if (strCaseCmp(s_imageSets[j].name, argv[i+1]) == 0) {
+                        setIndex = j;
+                        break;
+                    }
+                }
+
                 i++;
             }
         }
@@ -327,7 +349,7 @@ int main(int argc, char *argv[])
     }
 
     // Validate inputs.
-    if (testIndex >= s_testCount) {
+    if (testIndex >= s_imageTestCount) {
         printf("Invalid test %d\n", testIndex);
         return 0;
     }
@@ -343,17 +365,14 @@ int main(int argc, char *argv[])
         printf("Input options:\n");
         printf("  -path <path>   \tInput image path.\n");
         printf("  -regress <path>\tRegression directory.\n");
-        printf("  -set [0:5]     \tImage set.\n");
-        printf("    0:           \tKodak.\n");
-        printf("    1:           \tWaterloo.\n");
-        printf("    2:           \tEpic.\n");
-        printf("    3:           \tFarbrausch.\n");
-        printf("    4:           \tLugaru.\n");
-        printf("    5:           \tQuake 3.\n");
-        printf("  -test [0:2]    \tCompression tests to run.");
-        printf("    0:           \tDXT Color.\n");
-        printf("    1:           \tDXT Alpha.\n");
-        printf("    2:           \tDXT Normal.\n");
+        printf("  -set [0:%d]     \tImage set.\n", s_imageSetCount-1); // Upper bound is the last valid set index.
+        for (int i = 0; i < s_imageSetCount; i++) {
+            printf("    %i:           \t%s.\n", i, s_imageSets[i].name);
+        }
+        printf("  -test [0:%d]    \tCompression tests to run.\n", s_imageTestCount-1); // Last valid test index; the newline keeps the first entry below on its own line.
+        for (int i = 0; i < s_imageTestCount; i++) {
+            printf("    %i:           \t%s.\n", i, s_imageTests[i].name);
+        }
         printf("  -dec x         \tDecompressor.\n");
         printf("    0:           \tReference.\n");
         printf("    1:           \tNVIDIA.\n");
@@ -397,7 +416,9 @@ int main(int argc, char *argv[])
     nvtt::Context context;
     context.enableCudaAcceleration(!nocuda);
 
-    FileSystem::changeDirectory(basePath);
+    basePath.append(set.basePath);
+
+    FileSystem::changeDirectory(basePath.str());
     FileSystem::createDirectory(outPath);
 
     //Path csvFileName;
@@ -406,7 +427,7 @@ int main(int argc, char *argv[])
     //TextWriter csvWriter(&csvStream);
 
     Path graphFileName;
-    graphFileName.format("%s/result-%d.txt", outPath, setIndex);
+    graphFileName.format("%s/chart.txt", outPath/*, test.name*/);
     StdOutputStream graphStream(graphFileName.str());
     TextWriter graphWriter(&graphStream);
 
@@ -434,7 +455,7 @@ int main(int argc, char *argv[])
     {
         const char * colors[] = {
             "3D7930", "952826", "3D1FC1",
-            "3D7930", "952826", "3D1FC1", // pick other colors...
+            "FF9900", "999999", "999999", // pick other colors...
         };
         graphWriter << colors[t];
         if (t != test.count-1) graphWriter << ",";
@@ -484,10 +505,10 @@ int main(int argc, char *argv[])
     for (int t = 0; t < test.count; t++)
     {
         Mode mode = test.modes[t];
-        if (mode == Mode_BC1) {
+        if (mode == Mode_BC1 || mode == Mode_BC1_Alpha || mode == Mode_BC1_Normal) {
             compressionOptions.setFormat(nvtt::Format_BC1);
         }
-        else if (mode == Mode_BC3_Alpha || mode == Mode_BC3_YCoCg || mode == Mode_BC3_RGBM) {
+        else if (mode == Mode_BC3_Alpha || mode == Mode_BC3_YCoCg || mode == Mode_BC3_RGBM || mode == Mode_BC3_LUVW || mode == Mode_BC3_Lightmap_1 || mode == Mode_BC3_Lightmap_2) {
             compressionOptions.setFormat(nvtt::Format_BC3);
         }
         else if (mode == Mode_BC3_Normal) {
@@ -497,10 +518,10 @@ int main(int argc, char *argv[])
             compressionOptions.setFormat(nvtt::Format_BC5);
         }
 
-        if (mode == Mode_BC3_Alpha) {
+        if (mode == Mode_BC3_Alpha || mode == Mode_BC3_Lightmap_1 || mode == Mode_BC3_Lightmap_2) { // Lightmap's alpha channel is coverage.
             img.setAlphaMode(nvtt::AlphaMode_Transparency);
         }
-        if (mode == Mode_BC3_Normal || mode == Mode_BC5_Normal) {
+        if (mode == Mode_BC1_Normal || mode == Mode_BC3_Normal || mode == Mode_BC5_Normal) {
             img.setNormalMap(true);
         }
 
@@ -528,6 +549,56 @@ int main(int argc, char *argv[])
             else if (mode == Mode_BC3_RGBM) {
                 tmp.toRGBM();
             }
+            else if (mode == Mode_BC3_LUVW) {
+                tmp.toLUVW();
+            }
+            else if (mode == Mode_BC3_Lightmap_1) {
+                tmp.toRGBM(4);
+
+                /*float rmin, rmax;
+                tmp.range(0, &rmin, &rmax);
+
+                float gmin, gmax;
+                tmp.range(1, &gmin, &gmax);
+
+                float bmin, bmax;
+                tmp.range(2, &bmin, &bmax);
+
+                float lmin, lmax;
+                tmp.range(3, &lmin, &lmax);
+
+                printf("rmin: %.3f   rmax: %.3f\n", rmin, rmax);
+                printf("gmin: %.3f   gmax: %.3f\n", gmin, gmax);
+                printf("bmin: %.3f   bmax: %.3f\n", bmin, bmax);
+                printf("lmin: %.3f   lmax: %.3f\n", lmin, lmax);
+
+                const int N = 32;
+                int chistogram[N];
+                int lhistogram[N];
+                memset(chistogram, 0, sizeof(chistogram)); 
+                memset(lhistogram, 0, sizeof(lhistogram));
+
+                tmp.histogram(0, 0, 1, N, chistogram);
+                tmp.histogram(1, 0, 1, N, chistogram);
+                tmp.histogram(2, 0, 1, N, chistogram);
+                tmp.histogram(3, 0, 1, N, lhistogram);
+
+                printf("Color histogram:\n");
+                for (int i = 0; i < N; i++) {
+                    printf("%d, ", chistogram[i]);
+                }
+                printf("\n");
+
+                printf("Luminance histogram:\n");
+                for (int i = 0; i < N; i++) {
+                    printf("%d, ", lhistogram[i]);
+                }
+                printf("\n");*/
+            }
+            else if (mode == Mode_BC3_Lightmap_2) {
+                tmp.toLUVW(4);
+            }
+
 
             printf("Compressing: \t'%s'\n", set.fileNames[i]);
 
@@ -540,12 +611,8 @@ int main(int argc, char *argv[])
             totalTime += timer.elapsed();
 
             nvtt::TexImage img_out = outputHandler.decompress(mode, decoder);
-            if (mode == Mode_BC3_Alpha) {
-                img_out.setAlphaMode(nvtt::AlphaMode_Transparency);
-            }
-            if (mode == Mode_BC3_Normal || mode == Mode_BC5_Normal) {
-                img_out.setNormalMap(true);
-            }
+            img_out.setAlphaMode(img.alphaMode());
+            img_out.setNormalMap(img.isNormalMap());
 
             if (mode == Mode_BC3_YCoCg) {
                 img_out.scaleBias(0, 1.0, -0.5);
@@ -555,11 +622,30 @@ int main(int argc, char *argv[])
             else if (mode == Mode_BC3_RGBM) {
                 img_out.fromRGBM();
             }
+            else if (mode == Mode_BC3_LUVW) {
+                img_out.fromLUVW();
+            }
+            else if (mode == Mode_BC3_Lightmap_1) {
+                img_out.fromRGBM(4);
+            }
+            else if (mode == Mode_BC3_Lightmap_2) {
+                img_out.fromLUVW(4);
+            }
+
+
+            Path outputFilePath;
+            outputFilePath.format("%s/%s", outPath, s_modeNames[test.modes[t]]);
+            FileSystem::createDirectory(outputFilePath.str());
 
             Path outputFileName;
-            outputFileName.format("%s/%s", outPath, set.fileNames[i]);
+            outputFileName.format("%s/%s", outputFilePath.str(), set.fileNames[i]);
             outputFileName.stripExtension();
-            outputFileName.append(".png");
+            if (mode == Mode_BC3_Lightmap_1 || mode == Mode_BC3_Lightmap_2) {
+                outputFileName.append(".dds");
+            }
+            else {
+                outputFileName.append(".png");
+            }
             if (!img_out.save(outputFileName.str()))
             {
                 printf("Error saving file '%s'.\n", outputFileName.str());
@@ -573,6 +659,12 @@ int main(int argc, char *argv[])
             graphWriter << rmse;
             if (i != set.fileCount-1) graphWriter << ",";
 
+
+            outputFileName.stripExtension();
+            outputFileName.append("_diff.png");
+            nvtt::diff(img, img_out, 4.0f).save(outputFileName.str());
+
+
             // Output csv file
             //csvWriter << "\"" << fileNames[i] << "\"," << rmse << "\n";
 
@@ -615,7 +707,7 @@ int main(int argc, char *argv[])
         printf("  Total Time: \t%.3f sec\n", totalTime);
         printf("  Average RMSE:\t%.4f\n", totalRMSE);
 
-        if (t != s_testCount-1) graphWriter << "|";
+        if (t != test.count-1) graphWriter << "|";
     }
 
     /*if (regressPath != NULL)