From a08333747362b5ac1518581063b48a6a7b90188e Mon Sep 17 00:00:00 2001
From: Ignacio <castano@gmail.com>
Date: Tue, 24 Mar 2015 12:14:49 -0700
Subject: [PATCH] Merge changes from The Witness.

---
 project/vc9/bc6h/bc6h.vcproj           |  18 +-
 project/vc9/bc7/bc7.vcproj             |  34 +-
 project/vc9/nvmath/nvmath.vcproj       |   8 +
 project/vc9/nvtt.sln                   |   2 +
 project/vc9/nvtt/nvtt.vcproj           |  16 +
 project/vc9/testsuite/testsuite.vcproj |  24 +-
 src/CMakeLists.txt                     |  31 +-
 src/nvcore/nvcore.h                    |   6 +-
 src/nvimage/BlockDXT.cpp               |  15 +-
 src/nvimage/BlockDXT.h                 |   3 +-
 src/nvimage/ColorBlock.cpp             |  11 +-
 src/nvimage/ColorBlock.h               |   9 +-
 src/nvimage/DirectDrawSurface.cpp      |  15 +-
 src/nvimage/ErrorMetric.cpp            |  32 +-
 src/nvimage/ErrorMetric.h              |  14 +-
 src/nvmath/PackedFloat.cpp             |  61 ++++
 src/nvmath/PackedFloat.h               |  79 +++++
 src/nvmath/SimdVector.h                |  23 --
 src/nvmath/nvmath.h                    |  43 +++
 src/nvthread/Atomic.h                  |  24 +-
 src/nvthread/Event.cpp                 |   4 +-
 src/nvthread/Mutex.cpp                 |  46 ++-
 src/nvthread/Mutex.h                   |   2 +-
 src/nvthread/ParallelFor.cpp           |   6 +-
 src/nvthread/ParallelFor.h             |   2 +-
 src/nvthread/Thread.cpp                |  60 +++-
 src/nvthread/Thread.h                  |   1 +
 src/nvthread/ThreadPool.cpp            |   2 +-
 src/nvthread/ThreadPool.h              |   2 +-
 src/nvthread/nvthread.cpp              |   7 +
 src/nvthread/nvthread.h                |   2 +
 src/nvtt/BlockCompressor.cpp           | 128 ++++++--
 src/nvtt/BlockCompressor.h             |   6 +-
 src/nvtt/CMakeLists.txt                |   1 +
 src/nvtt/ClusterFit.cpp                |   3 +-
 src/nvtt/ClusterFit.h                  |   2 +-
 src/nvtt/CompressionOptions.cpp        |   2 +-
 src/nvtt/CompressorDX10.cpp            |   2 +
 src/nvtt/CompressorDX10.h              |   4 +-
 src/nvtt/CompressorDX11.cpp            |  38 +--
 src/nvtt/CompressorDX11.h              |   8 +-
 src/nvtt/CompressorDX9.cpp             | 399 +----------------------
 src/nvtt/CompressorDX9.h               |   8 +-
 src/nvtt/CompressorDXT1.cpp            |  64 ++--
 src/nvtt/CompressorDXT1.h              |   9 +-
 src/nvtt/CompressorDXT5_RGBM.cpp       | 423 +++++++++++++++++++++++++
 src/nvtt/CompressorDXT5_RGBM.h         |   9 +
 src/nvtt/CompressorRGB.cpp             | 173 ++++++++++
 src/nvtt/Context.cpp                   |  53 ++--
 src/nvtt/OptimalCompressDXT.cpp        |   3 +-
 src/nvtt/QuickCompressDXT.cpp          |  11 +-
 src/nvtt/Surface.cpp                   | 287 +++++++++++++++--
 src/nvtt/nvtt.cpp                      |   9 +-
 src/nvtt/nvtt.h                        |  14 +-
 src/nvtt/tools/compress.cpp            | 384 ++++++++++++++--------
 src/nvtt/tools/decompress.cpp          | 205 +++++++-----
 src/nvtt/tools/imgdiff.cpp             | 120 ++++++-
 57 files changed, 2056 insertions(+), 911 deletions(-)
 create mode 100755 src/nvmath/PackedFloat.cpp
 create mode 100755 src/nvmath/PackedFloat.h
 create mode 100755 src/nvtt/CompressorDXT5_RGBM.cpp
 create mode 100755 src/nvtt/CompressorDXT5_RGBM.h
diff --git a/project/vc9/bc6h/bc6h.vcproj b/project/vc9/bc6h/bc6h.vcproj
index c056d14..ba159ed 100755
--- a/project/vc9/bc6h/bc6h.vcproj
+++ b/project/vc9/bc6h/bc6h.vcproj
@@ -267,39 +267,39 @@
 	</References>
 	<Files>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc6h\bits.h"
+			RelativePath="..\..\..\src\bc6h\bits.h"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc6h\shapes_two.h"
+			RelativePath="..\..\..\src\bc6h\shapes_two.h"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc6h\tile.h"
+			RelativePath="..\..\..\src\bc6h\tile.h"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc6h\utils.cpp"
+			RelativePath="..\..\..\src\bc6h\zoh.cpp"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc6h\utils.h"
+			RelativePath="..\..\..\src\bc6h\zoh.h"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc6h\zoh.cpp"
+			RelativePath="..\..\..\src\bc6h\zoh_utils.cpp"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc6h\zoh.h"
+			RelativePath="..\..\..\src\bc6h\zoh_utils.h"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc6h\zohone.cpp"
+			RelativePath="..\..\..\src\bc6h\zohone.cpp"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc6h\zohtwo.cpp"
+			RelativePath="..\..\..\src\bc6h\zohtwo.cpp"
 			>
 		</File>
 	</Files>
diff --git a/project/vc9/bc7/bc7.vcproj b/project/vc9/bc7/bc7.vcproj
index 38e0d1f..4dfdfa7 100644
--- a/project/vc9/bc7/bc7.vcproj
+++ b/project/vc9/bc7/bc7.vcproj
@@ -267,71 +267,71 @@
 	</References>
 	<Files>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\avpcl.cpp"
+			RelativePath="..\..\..\src\bc7\avpcl.cpp"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\avpcl.h"
+			RelativePath="..\..\..\src\bc7\avpcl.h"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\avpcl_mode0.cpp"
+			RelativePath="..\..\..\src\bc7\avpcl_mode0.cpp"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\avpcl_mode1.cpp"
+			RelativePath="..\..\..\src\bc7\avpcl_mode1.cpp"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\avpcl_mode2.cpp"
+			RelativePath="..\..\..\src\bc7\avpcl_mode2.cpp"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\avpcl_mode3.cpp"
+			RelativePath="..\..\..\src\bc7\avpcl_mode3.cpp"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\avpcl_mode4.cpp"
+			RelativePath="..\..\..\src\bc7\avpcl_mode4.cpp"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\avpcl_mode5.cpp"
+			RelativePath="..\..\..\src\bc7\avpcl_mode5.cpp"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\avpcl_mode6.cpp"
+			RelativePath="..\..\..\src\bc7\avpcl_mode6.cpp"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\avpcl_mode7.cpp"
+			RelativePath="..\..\..\src\bc7\avpcl_mode7.cpp"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\bits.h"
+			RelativePath="..\..\..\src\bc7\avpcl_utils.cpp"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\endpts.h"
+			RelativePath="..\..\..\src\bc7\avpcl_utils.h"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\shapes_three.h"
+			RelativePath="..\..\..\src\bc7\bits.h"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\shapes_two.h"
+			RelativePath="..\..\..\src\bc7\endpts.h"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\tile.h"
+			RelativePath="..\..\..\src\bc7\shapes_three.h"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\utils.cpp"
+			RelativePath="..\..\..\src\bc7\shapes_two.h"
 			>
 		</File>
 		<File
-			RelativePath="..\..\..\src\nvtt\bc7\utils.h"
+			RelativePath="..\..\..\src\bc7\tile.h"
 			>
 		</File>
 	</Files>
diff --git a/project/vc9/nvmath/nvmath.vcproj b/project/vc9/nvmath/nvmath.vcproj
index 16c8b7d..7ab7db8 100644
--- a/project/vc9/nvmath/nvmath.vcproj
+++ b/project/vc9/nvmath/nvmath.vcproj
@@ -334,6 +334,14 @@
 			RelativePath="..\..\..\src\nvmath\nvmath.h"
 			>
 		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\PackedFloat.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvmath\PackedFloat.h"
+			>
+		</File>
 		<File
 			RelativePath="..\..\..\src\nvmath\Plane.cpp"
 			>
diff --git a/project/vc9/nvtt.sln b/project/vc9/nvtt.sln
index 8d41bc3..76457fb 100644
--- a/project/vc9/nvtt.sln
+++ b/project/vc9/nvtt.sln
@@ -48,6 +48,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvdecompress", "nvdecompres
 	ProjectSection(ProjectDependencies) = postProject
 		{F974F34B-AF02-4C88-8E1E-85475094EA78} = {F974F34B-AF02-4C88-8E1E-85475094EA78}
 		{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
+		{1AEB7681-57D8-48EE-813D-5C41CC38B647} = {1AEB7681-57D8-48EE-813D-5C41CC38B647}
 		{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
 		{C33787E3-5564-4834-9FE3-A9020455A669} = {C33787E3-5564-4834-9FE3-A9020455A669}
 		{50C465FE-B308-42BC-894D-89484482AF06} = {50C465FE-B308-42BC-894D-89484482AF06}
@@ -57,6 +58,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvimgdiff", "nvimgdiff\nvim
 	ProjectSection(ProjectDependencies) = postProject
 		{F974F34B-AF02-4C88-8E1E-85475094EA78} = {F974F34B-AF02-4C88-8E1E-85475094EA78}
 		{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
+		{1AEB7681-57D8-48EE-813D-5C41CC38B647} = {1AEB7681-57D8-48EE-813D-5C41CC38B647}
 		{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
 		{C33787E3-5564-4834-9FE3-A9020455A669} = {C33787E3-5564-4834-9FE3-A9020455A669}
 		{50C465FE-B308-42BC-894D-89484482AF06} = {50C465FE-B308-42BC-894D-89484482AF06}
diff --git a/project/vc9/nvtt/nvtt.vcproj b/project/vc9/nvtt/nvtt.vcproj
index 3343500..9f27172 100644
--- a/project/vc9/nvtt/nvtt.vcproj
+++ b/project/vc9/nvtt/nvtt.vcproj
@@ -938,6 +938,22 @@
 			RelativePath="..\..\..\src\nvtt\CompressorDX9.h"
 			>
 		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorDXT1.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorDXT1.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorDXT5_RGBM.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\CompressorDXT5_RGBM.h"
+			>
+		</File>
 		<File
 			RelativePath="..\..\..\src\nvtt\CompressorRGB.cpp"
 			>
diff --git a/project/vc9/testsuite/testsuite.vcproj b/project/vc9/testsuite/testsuite.vcproj
index 01fd7a8..47110c8 100644
--- a/project/vc9/testsuite/testsuite.vcproj
+++ b/project/vc9/testsuite/testsuite.vcproj
@@ -99,8 +99,8 @@
 		</Configuration>
 		<Configuration
 			Name="Debug|x64"
-			OutputDirectory="$(PlatformName)\$(ConfigurationName)"
-			IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
 			ConfigurationType="1"
 			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
 			CharacterSet="1"
@@ -257,8 +257,8 @@
 		</Configuration>
 		<Configuration
 			Name="Release|x64"
-			OutputDirectory="$(PlatformName)\$(ConfigurationName)"
-			IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
 			ConfigurationType="1"
 			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
 			CharacterSet="1"
@@ -337,8 +337,8 @@
 		</Configuration>
 		<Configuration
 			Name="Debug-CUDA|Win32"
-			OutputDirectory="$(ConfigurationName)"
-			IntermediateDirectory="$(ConfigurationName)"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
 			ConfigurationType="1"
 			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
 			CharacterSet="1"
@@ -415,8 +415,8 @@
 		</Configuration>
 		<Configuration
 			Name="Debug-CUDA|x64"
-			OutputDirectory="$(PlatformName)\$(ConfigurationName)"
-			IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
 			ConfigurationType="1"
 			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
 			CharacterSet="1"
@@ -494,8 +494,8 @@
 		</Configuration>
 		<Configuration
 			Name="Release-CUDA|Win32"
-			OutputDirectory="$(ConfigurationName)"
-			IntermediateDirectory="$(ConfigurationName)"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
 			ConfigurationType="1"
 			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
 			CharacterSet="1"
@@ -573,8 +573,8 @@
 		</Configuration>
 		<Configuration
 			Name="Release-CUDA|x64"
-			OutputDirectory="$(PlatformName)\$(ConfigurationName)"
-			IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
 			ConfigurationType="1"
 			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
 			CharacterSet="1"
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d7f3bd4..35d5a31 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -44,15 +44,16 @@ ELSE(GLEW_FOUND)
 ENDIF(GLEW_FOUND)
 
 # Cg
-INCLUDE(${NV_CMAKE_DIR}/FindCg.cmake)
-IF(CG_FOUND)
-	MESSAGE(STATUS "Looking for Cg - found")
-ELSE(CG_FOUND)
-	MESSAGE(STATUS "Looking for Cg - not found")
-ENDIF(CG_FOUND)
+#INCLUDE(${NV_CMAKE_DIR}/FindCg.cmake)
+#IF(CG_FOUND)
+#	MESSAGE(STATUS "Looking for Cg - found")
+#ELSE(CG_FOUND)
+#	MESSAGE(STATUS "Looking for Cg - not found")
+#ENDIF(CG_FOUND)
 
 # CUDA
-#FIND_PACKAGE(CUDA)
+#INCLUDE(${NV_CMAKE_DIR}/FindCUDA.cmake)
+INCLUDE(CUDA)
 IF(CUDA_FOUND)
     IF(MINGW)
         MESSAGE(STATUS "Looking for CUDA - not supported on MinGW")
@@ -67,13 +68,13 @@ ELSE(CUDA_FOUND)
 ENDIF(CUDA_FOUND)
 
 # Maya
-INCLUDE(${NV_CMAKE_DIR}/FindMaya.cmake)
-IF(MAYA_FOUND)
-	SET(HAVE_MAYA ${MAYA_FOUND} CACHE BOOL "Set to TRUE if Maya is found, FALSE otherwise")
-	MESSAGE(STATUS "Looking for Maya - found")
-ELSE(MAYA_FOUND)
-	MESSAGE(STATUS "Looking for Maya - not found")
-ENDIF(MAYA_FOUND)
+#INCLUDE(${NV_CMAKE_DIR}/FindMaya.cmake)
+#IF(MAYA_FOUND)
+#	SET(HAVE_MAYA ${MAYA_FOUND} CACHE BOOL "Set to TRUE if Maya is found, FALSE otherwise")
+#	MESSAGE(STATUS "Looking for Maya - found")
+#ELSE(MAYA_FOUND)
+#	MESSAGE(STATUS "Looking for Maya - not found")
+#ENDIF(MAYA_FOUND)
 
 # FreeImage
 #INCLUDE(${NV_CMAKE_DIR}/FindFreeImage.cmake)
@@ -152,3 +153,5 @@ CHECK_INCLUDE_FILES("dispatch/dispatch.h" HAVE_DISPATCH_H)
 
 CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/nvconfig.h.in ${CMAKE_CURRENT_BINARY_DIR}/nvconfig.h)
 
+#INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/nvconfig.h DESTINATION include)
+
diff --git a/src/nvcore/nvcore.h b/src/nvcore/nvcore.h
index b903f6f..1a3bf9a 100644
--- a/src/nvcore/nvcore.h
+++ b/src/nvcore/nvcore.h
@@ -61,10 +61,11 @@
 #   define NV_OS_IOS 1
 #elif defined POSH_OS_UNIX
 #   define NV_OS_UNIX 1
-#elif defined POSH_OS_WIN32
-#   define NV_OS_WIN32 1
 #elif defined POSH_OS_WIN64
+#   define NV_OS_WIN32 1
 #   define NV_OS_WIN64 1
+#elif defined POSH_OS_WIN32
+#   define NV_OS_WIN32 1
 #elif defined POSH_OS_XBOX
 #   define NV_OS_XBOX 1
 #else
@@ -96,6 +97,7 @@
 #define NV_CPU_STRING   POSH_CPU_STRING
 
 #if defined POSH_CPU_X86_64
+//#   define NV_CPU_X86 1
 #   define NV_CPU_X86_64 1
 #elif defined POSH_CPU_X86
 #   define NV_CPU_X86 1
diff --git a/src/nvimage/BlockDXT.cpp b/src/nvimage/BlockDXT.cpp
index 17f1ef9..9d334c4 100644
--- a/src/nvimage/BlockDXT.cpp
+++ b/src/nvimage/BlockDXT.cpp
@@ -632,13 +632,12 @@ void BlockCTX1::setIndices(int * idx)
 
 
 /// Decode BC6 block.
-void BlockBC6::decodeBlock(ColorSet * set) const
+void BlockBC6::decodeBlock(Vector3 colors[16]) const
 {
 	ZOH::Tile tile(4, 4);
 	ZOH::decompress((const char *)data, tile);
 
-	// Convert ZOH's tile struct back to NVTT's, and convert half to float.
-	set->allocate(4, 4);
+	// Convert ZOH's tile struct to Vector3, and convert half to float.
 	for (uint y = 0; y < 4; ++y)
 	{
 		for (uint x = 0; x < 4; ++x)
@@ -646,13 +645,9 @@ void BlockBC6::decodeBlock(ColorSet * set) const
 			uint16 rHalf = ZOH::Tile::float2half(tile.data[y][x].x);
 			uint16 gHalf = ZOH::Tile::float2half(tile.data[y][x].y);
 			uint16 bHalf = ZOH::Tile::float2half(tile.data[y][x].z);
-			set->colors[y * 4 + x].x = to_float(rHalf);
-			set->colors[y * 4 + x].y = to_float(gHalf);
-			set->colors[y * 4 + x].z = to_float(bHalf);
-			set->colors[y * 4 + x].w = 1.0f;
-
-			// Set indices in case someone uses them
-			set->indices[y * 4 + x] = y * 4 + x;
+			colors[y * 4 + x].x = to_float(rHalf);
+			colors[y * 4 + x].y = to_float(gHalf);
+			colors[y * 4 + x].z = to_float(bHalf);
 		}
 	}
 }
diff --git a/src/nvimage/BlockDXT.h b/src/nvimage/BlockDXT.h
index e03cff7..40f615f 100644
--- a/src/nvimage/BlockDXT.h
+++ b/src/nvimage/BlockDXT.h
@@ -35,6 +35,7 @@ namespace nv
     struct ColorSet;
     struct AlphaBlock4x4;
     class Stream;
+    class Vector3;
 
 
     /// DXT1 block.
@@ -219,7 +220,7 @@ namespace nv
 	struct BlockBC6
 	{
 		uint8 data[16];		// Not even going to try to write a union for this thing.
-		void decodeBlock(ColorSet * set) const;
+		void decodeBlock(Vector3 colors[16]) const;
 	};
 
 	/// BC7 block.
diff --git a/src/nvimage/ColorBlock.cpp b/src/nvimage/ColorBlock.cpp
index 026bb36..ad8f2b7 100644
--- a/src/nvimage/ColorBlock.cpp
+++ b/src/nvimage/ColorBlock.cpp
@@ -462,7 +462,7 @@ float ColorBlock::volume() const
     return bounds.volume();
 }*/
 
-
+#if 0
 void ColorSet::allocate(uint w, uint h)
 {
     nvDebugCheck(w <= 4 && h <= 4);
@@ -680,6 +680,7 @@ bool ColorSet::hasAlpha() const
     }
     return false;
 }
+#endif // 0
 
 
 void AlphaBlock4x4::init(uint8 a)
@@ -707,7 +708,7 @@ void AlphaBlock4x4::init(const ColorBlock & src, uint channel)
 
 
 
-void AlphaBlock4x4::init(const ColorSet & src, uint channel)
+/*void AlphaBlock4x4::init(const ColorSet & src, uint channel)
 {
     nvCheck(channel >= 0 && channel < 4);
 
@@ -727,12 +728,12 @@ void AlphaBlock4x4::initMaxRGB(const ColorSet & src, float threshold)
         alpha[i] = unitFloatToFixed8(max(max(x, y), max(z, threshold)));
         weights[i] = 1.0f;
     }
-}
+}*/
 
-void AlphaBlock4x4::initWeights(const ColorSet & src)
+/*void AlphaBlock4x4::initWeights(const ColorSet & src)
 {
     for (int i = 0; i < 16; i++) {
         weights[i] = src.weight(i);
     }
-}
+}*/
 
diff --git a/src/nvimage/ColorBlock.h b/src/nvimage/ColorBlock.h
index fe78a47..f87cb6d 100644
--- a/src/nvimage/ColorBlock.h
+++ b/src/nvimage/ColorBlock.h
@@ -81,7 +81,7 @@ namespace nv
         return m_color[y * 4 + x];
     }
 
-
+    /*
     struct ColorSet
     {
         ColorSet() : colorCount(0), indexCount(0), w(0), h(0) {}
@@ -124,6 +124,7 @@ namespace nv
         float weights[16];  // @@ Add mask to indicate what color components are weighted?
         int indices[16];
     };
+    */
 
 
     /// Uncompressed 4x4 alpha block.
@@ -131,10 +132,10 @@ namespace nv
     {
         void init(uint8 value);
         void init(const ColorBlock & src, uint channel);
-        void init(const ColorSet & src, uint channel);
+        //void init(const ColorSet & src, uint channel);
 
-        void initMaxRGB(const ColorSet & src, float threshold);
-        void initWeights(const ColorSet & src);
+        //void initMaxRGB(const ColorSet & src, float threshold);
+        //void initWeights(const ColorSet & src);
 
         uint8 alpha[4*4];
         float weights[16];
diff --git a/src/nvimage/DirectDrawSurface.cpp b/src/nvimage/DirectDrawSurface.cpp
index dff9255..a6bbdad 100644
--- a/src/nvimage/DirectDrawSurface.cpp
+++ b/src/nvimage/DirectDrawSurface.cpp
@@ -31,6 +31,7 @@
 #include "nvcore/Utils.h" // max
 #include "nvcore/StdStream.h"
 #include "nvmath/Vector.inl"
+#include "nvmath/ftoi.h"
 
 #include <string.h> // memset
 
@@ -1395,20 +1396,20 @@ void DirectDrawSurface::readBlock(ColorBlock * rgba)
     {
         BlockBC6 block;
         *stream << block;
-        ColorSet set;
-        block.decodeBlock(&set);
+        Vector3 colors[16];
+        block.decodeBlock(colors);
 
         // Clamp to [0, 1] and round to 8-bit
         for (int y = 0; y < 4; ++y)
         {
             for (int x = 0; x < 4; ++x)
             {
-                Vector4 px = set.colors[y*4 + x];
+                Vector3 px = colors[y*4 + x];
                 rgba->color(x, y).setRGBA(
-                                    uint8(clamp(px.x, 0.0f, 1.0f) * 255.0f + 0.5f),
-                                    uint8(clamp(px.y, 0.0f, 1.0f) * 255.0f + 0.5f),
-                                    uint8(clamp(px.z, 0.0f, 1.0f) * 255.0f + 0.5f),
-                                    uint8(clamp(px.w, 0.0f, 1.0f) * 255.0f + 0.5f));
+                                    ftoi_round(clamp(px.x, 0.0f, 1.0f) * 255.0f),
+                                    ftoi_round(clamp(px.y, 0.0f, 1.0f) * 255.0f),
+                                    ftoi_round(clamp(px.z, 0.0f, 1.0f) * 255.0f),
+                                    0xFF);
             }
         }
     }
diff --git a/src/nvimage/ErrorMetric.cpp b/src/nvimage/ErrorMetric.cpp
index 7a4970b..3f10a72 100644
--- a/src/nvimage/ErrorMetric.cpp
+++ b/src/nvimage/ErrorMetric.cpp
@@ -10,7 +10,7 @@
 
 using namespace nv;
 
-float nv::rmsColorError(const FloatImage * img, const FloatImage * ref, bool alphaWeight)
+float nv::rmsColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight)
 {
     if (!sameLayout(img, ref)) {
         return FLT_MAX;
@@ -23,31 +23,31 @@ float nv::rmsColorError(const FloatImage * img, const FloatImage * ref, bool alp
     const uint count = img->pixelCount();
     for (uint i = 0; i < count; i++)
     {
-        float r0 = img->pixel(i + count * 0);
-        float g0 = img->pixel(i + count * 1);
-        float b0 = img->pixel(i + count * 2);
-        //float a0 = img->pixel(i + count * 3);
-        float r1 = ref->pixel(i + count * 0);
-        float g1 = ref->pixel(i + count * 1);
-        float b1 = ref->pixel(i + count * 2);
-        float a1 = ref->pixel(i + count * 3);
+        float r0 = ref->pixel(i + count * 0);
+        float g0 = ref->pixel(i + count * 1);
+        float b0 = ref->pixel(i + count * 2);
+        float a0 = ref->pixel(i + count * 3);
+        float r1 = img->pixel(i + count * 0);
+        float g1 = img->pixel(i + count * 1);
+        float b1 = img->pixel(i + count * 2);
+        //float a1 = img->pixel(i + count * 3);
 
         float r = r0 - r1;
         float g = g0 - g1;
         float b = b0 - b1;
 
         float a = 1;
-        if (alphaWeight) a = a1;
+        if (alphaWeight) a = a0 * a0; // @@ a0*a1 or a0*a0 ?
 
-        mse += r * r * a;
-        mse += g * g * a;
-        mse += b * b * a;
+        mse += (r * r) * a;
+        mse += (g * g) * a;
+        mse += (b * b) * a;
     }
 
     return float(sqrt(mse / count));
 }
 
-float nv::rmsAlphaError(const FloatImage * img, const FloatImage * ref)
+float nv::rmsAlphaError(const FloatImage * ref, const FloatImage * img)
 {
     if (!sameLayout(img, ref)) {
         return FLT_MAX;
@@ -71,7 +71,7 @@ float nv::rmsAlphaError(const FloatImage * img, const FloatImage * ref)
 }
 
 
-float nv::averageColorError(const FloatImage * img, const FloatImage * ref, bool alphaWeight)
+float nv::averageColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight)
 {
     if (!sameLayout(img, ref)) {
         return FLT_MAX;
@@ -108,7 +108,7 @@ float nv::averageColorError(const FloatImage * img, const FloatImage * ref, bool
     return float(mae / count);
 }
 
-float nv::averageAlphaError(const FloatImage * img, const FloatImage * ref)
+float nv::averageAlphaError(const FloatImage * ref, const FloatImage * img)
 {
     if (img == NULL || ref == NULL || img->width() != ref->width() || img->height() != ref->height()) {
         return FLT_MAX;
diff --git a/src/nvimage/ErrorMetric.h b/src/nvimage/ErrorMetric.h
index 158dacf..b875802 100644
--- a/src/nvimage/ErrorMetric.h
+++ b/src/nvimage/ErrorMetric.h
@@ -6,15 +6,15 @@ namespace nv
 {
     class FloatImage;
 
-    float rmsColorError(const FloatImage * img, const FloatImage * ref, bool alphaWeight);
-    float rmsAlphaError(const FloatImage * img, const FloatImage * ref);
+    float rmsColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight);
+    float rmsAlphaError(const FloatImage * ref, const FloatImage * img);
 
-    float cieLabError(const FloatImage * img, const FloatImage * ref);
-    float cieLab94Error(const FloatImage * img, const FloatImage * ref);
-    float spatialCieLabError(const FloatImage * img, const FloatImage * ref);
+    float cieLabError(const FloatImage * ref, const FloatImage * img);
+    float cieLab94Error(const FloatImage * ref, const FloatImage * img);
+    float spatialCieLabError(const FloatImage * ref, const FloatImage * img);
 
-    float averageColorError(const FloatImage * img, const FloatImage * ref, bool alphaWeight);
-    float averageAlphaError(const FloatImage * img, const FloatImage * ref);
+    float averageColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight);
+    float averageAlphaError(const FloatImage * ref, const FloatImage * img);
 
     float averageAngularError(const FloatImage * img0, const FloatImage * img1);
     float rmsAngularError(const FloatImage * img0, const FloatImage * img1);
diff --git a/src/nvmath/PackedFloat.cpp b/src/nvmath/PackedFloat.cpp
new file mode 100755
index 0000000..3327d20
--- /dev/null
+++ b/src/nvmath/PackedFloat.cpp
@@ -0,0 +1,61 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "PackedFloat.h"
+#include "Vector.inl"
+#include "ftoi.h"
+
+using namespace nv;
+
+// Decodes a shared-exponent RGB9E5 value: each channel is mantissa * 2^(e - 15 - 9) (exponent bias 15, 9 mantissa bits, no implicit one).
+Vector3 nv::rgb9e5_to_vector3(FloatRGB9E5 v) { return ldexpf(1.0f, int(v.e) - (15 + 9)) * Vector3(float(v.xm), float(v.ym), float(v.zm)); }
+
+// @@ Not implemented yet. Returns an all-zero encoding so that control never falls off the end of a non-void function (undefined behavior).
+FloatRGB9E5 nv::vector3_to_rgb9e5(const Vector3 & v) { FloatRGB9E5 packed; packed.v = 0; return packed; }
+
+
+// @@ Not implemented yet. Returns zero so that control never falls off the end of a non-void function (undefined behavior).
+float nv::float11_to_float32(uint v) { return 0.0f; }
+
+// @@ Not implemented yet. Returns zero so that control never falls off the end of a non-void function (undefined behavior).
+float nv::float10_to_float32(uint v) { return 0.0f; }
+
+// @@ Not implemented yet. Returns zero so that control never falls off the end of a non-void function (undefined behavior).
+Vector3 nv::r11g11b10_to_vector3(FloatR11G11B10 v) { return Vector3(0); }
+
+// @@ Not implemented yet. Returns an all-zero encoding so that control never falls off the end of a non-void function (undefined behavior).
+FloatR11G11B10 nv::vector3_to_r11g11b10(const Vector3 & v) { FloatR11G11B10 packed; packed.v = 0; return packed; }
+
+// These are based on: 
+// http://www.graphics.cornell.edu/~bjw/rgbe/rgbe.c
+// While this may not be the best way to encode/decode RGBE8, I'm not making any changes to maintain compatibility.
+FloatRGBE8 nv::vector3_to_rgbe8(const Vector3 & v) {
+    // The largest component selects the shared exponent; values below 1e-32 encode as all zeros.
+    float m = max3(v.x, v.y, v.z);
+    FloatRGBE8 rgbe;
+    if (m < 1e-32) {
+        rgbe.v = 0;
+    }
+    else {
+        int e;
+        float scale = frexpf(m, &e) * 256.0f / m;
+        // frexpf() returns a fraction in [0.5, 1), so the largest scaled component lies in [128, 256) and can
+        // round up to 256; negative components round below 0. Clamp so every mantissa fits in a byte.
+        rgbe.r = U8(clamp(ftoi_round(v.x * scale), 0, 255));
+        rgbe.g = U8(clamp(ftoi_round(v.y * scale), 0, 255));
+        rgbe.b = U8(clamp(ftoi_round(v.z * scale), 0, 255));
+        rgbe.e = U8(e + 128);
+    }
+
+    return rgbe;
+}
+
+
+Vector3 nv::rgbe8_to_vector3(FloatRGBE8 v) {
+    // A zero exponent byte decodes to the zero vector.
+    if (v.e == 0) return Vector3(0);
+
+    // Exponent is offset by 128; the extra 8 divides the byte mantissas by 256. @@ Shouldn't we divide by 255 instead?
+    const float exponentScale = ldexpf(1.0f, v.e-(int)(128+8));
+    return exponentScale * Vector3(float(v.r), float(v.g), float(v.b));
+}
+
diff --git a/src/nvmath/PackedFloat.h b/src/nvmath/PackedFloat.h
new file mode 100755
index 0000000..bf84b85
--- /dev/null
+++ b/src/nvmath/PackedFloat.h
@@ -0,0 +1,79 @@
+// This code is in the public domain -- castano@gmail.com
+
+#pragma once
+#ifndef NV_MATH_PACKEDFLOAT_H
+#define NV_MATH_PACKEDFLOAT_H
+
+#include "nvmath.h"
+#include "Vector.h"
+
+namespace nv
+{
+
+    union FloatRGB9E5 {         // 32-bit packed value: three 9-bit fields plus one 5-bit field.
+        uint32 v;               // The whole word, overlaying the bit-fields below.
+        struct {
+        #if NV_BIG_ENDIAN       // Declaration order is reversed between the two branches.
+            uint32 e : 5;
+            uint32 zm : 9;
+            uint32 ym : 9;
+            uint32 xm : 9;
+        #else
+            uint32 xm : 9;      // xm/ym/zm: 9 bits per channel (presumably the mantissas -- confirm).
+            uint32 ym : 9;
+            uint32 zm : 9;
+            uint32 e : 5;       // 5 bits (presumably the exponent shared by all three channels -- confirm).
+        #endif
+        };
+    };
+
+    union FloatR11G11B10 {      // 32-bit packed value: x and y use 6+5 bits each, z uses 5+5 bits.
+        uint32 v;               // The whole word, overlaying the bit-fields below.
+        struct {
+        #if NV_BIG_ENDIAN       // Declaration order is reversed between the two branches.
+            uint32 ze : 5;
+            uint32 zm : 5;
+            uint32 ye : 5;
+            uint32 ym : 6;
+            uint32 xe : 5;
+            uint32 xm : 6;
+        #else
+            uint32 xm : 6;      // *m fields: presumably per-channel mantissas -- confirm.
+            uint32 xe : 5;      // *e fields: presumably per-channel exponents -- confirm.
+            uint32 ym : 6;
+            uint32 ye : 5;
+            uint32 zm : 5;      // z has one bit less than x and y (5 instead of 6).
+            uint32 ze : 5;
+        #endif
+        };
+    };
+
+    union FloatRGBE8 {          // 32-bit packed value: four bytes r, g, b and e.
+        uint32 v;               // The whole word, overlaying the four bytes below.
+        struct {
+        #if NV_LITTLE_ENDIAN    // Note: tests NV_LITTLE_ENDIAN, whereas the other unions in this header test NV_BIG_ENDIAN.
+            uint8 r, g, b, e;
+        #else
+            uint8 e: 8;         // Same four bytes declared in the opposite order.
+            uint8 b: 8;
+            uint8 g: 8;
+            uint8 r: 8;
+        #endif
+        };
+    };
+
+    NVMATH_API Vector3 rgb9e5_to_vector3(FloatRGB9E5 v);
+    NVMATH_API FloatRGB9E5 vector3_to_rgb9e5(const Vector3 & v);
+
+    NVMATH_API float float11_to_float32(uint v);
+    NVMATH_API float float10_to_float32(uint v);
+
+    NVMATH_API Vector3 r11g11b10_to_vector3(FloatR11G11B10 v);
+    NVMATH_API FloatR11G11B10 vector3_to_r11g11b10(const Vector3 & v);
+
+    NVMATH_API Vector3 rgbe8_to_vector3(FloatRGBE8 v);
+    NVMATH_API FloatRGBE8 vector3_to_rgbe8(const Vector3 & v);
+
+} // nv
+
+#endif // NV_MATH_PACKEDFLOAT_H
diff --git a/src/nvmath/SimdVector.h b/src/nvmath/SimdVector.h
index b6b7298..94e5186 100644
--- a/src/nvmath/SimdVector.h
+++ b/src/nvmath/SimdVector.h
@@ -2,29 +2,6 @@
 
 #include "Vector.h" // Vector3, Vector4
 
-// Set some reasonable defaults.
-#ifndef NV_USE_ALTIVEC
-#   define NV_USE_ALTIVEC NV_CPU_PPC
-//#   define NV_USE_ALTIVEC defined(__VEC__)
-#endif
-
-#ifndef NV_USE_SSE
-#   if NV_CPU_X86 || NV_CPU_X86_64
-#       define NV_USE_SSE 2
-#   endif
-#   if defined(__SSE2__)
-#       define NV_USE_SSE 2
-#   elif defined(__SSE__)
-#       define NV_USE_SSE 1
-#   else
-#       define NV_USE_SSE 0
-#   endif
-#endif
-
-// Internally set NV_USE_SIMD when either altivec or sse is available.
-#if NV_USE_ALTIVEC && NV_USE_SSE
-#	error "Cannot enable both altivec and sse!"
-#endif
 
 #if NV_USE_ALTIVEC
 #   include "SimdVector_VE.h"
diff --git a/src/nvmath/nvmath.h b/src/nvmath/nvmath.h
index 9626431..baeb02a 100644
--- a/src/nvmath/nvmath.h
+++ b/src/nvmath/nvmath.h
@@ -283,6 +283,49 @@ namespace nv
         f.value = x;
         return (f.field.biasedexponent - 127);
     }
+
+
+    // Same field layout as FloatRGB9E5 in nvmath/PackedFloat.h: three 9-bit fields plus one 5-bit field in 32 bits.
+    union Float3SE {
+        uint32 v;               // The whole word, overlaying the bit-fields below.
+        struct {
+        #if NV_BIG_ENDIAN       // Declaration order is reversed between the two branches.
+            uint32 e : 5;
+            uint32 zm : 9;
+            uint32 ym : 9;
+            uint32 xm : 9;
+        #else
+            uint32 xm : 9;
+            uint32 ym : 9;
+            uint32 zm : 9;
+            uint32 e : 5;
+        #endif
+        };
+    };
+
+    // Same field layout as FloatR11G11B10 in nvmath/PackedFloat.h: x and y use 6+5 bits each, z uses 5+5 bits.
+    union Float3PK {
+        uint32 v;               // The whole word, overlaying the bit-fields below.
+        struct {
+        #if NV_BIG_ENDIAN       // Declaration order is reversed between the two branches.
+            uint32 ze : 5;
+            uint32 zm : 5;
+            uint32 ye : 5;
+            uint32 ym : 6;
+            uint32 xe : 5;
+            uint32 xm : 6;
+        #else
+            uint32 xm : 6;
+            uint32 xe : 5;
+            uint32 ym : 6;
+            uint32 ye : 5;
+            uint32 zm : 5;
+            uint32 ze : 5;
+        #endif
+        };
+    };
+
+
 } // nv
 
 #endif // NV_MATH_H
diff --git a/src/nvthread/Atomic.h b/src/nvthread/Atomic.h
index ece44b5..6c2e0fa 100644
--- a/src/nvthread/Atomic.h
+++ b/src/nvthread/Atomic.h
@@ -14,6 +14,7 @@
 
 #pragma intrinsic(_InterlockedIncrement, _InterlockedDecrement)
 #pragma intrinsic(_InterlockedCompareExchange, _InterlockedExchange)
+//#pragma intrinsic(_InterlockedExchangeAdd64)
 
 /*
 extern "C"
@@ -147,6 +148,11 @@ namespace nv {
         return (uint32)_InterlockedExchange((long *)value, (long)desired);
     }
 
+    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {    // Atomically adds value_to_add to *value and returns the updated value.
+        nvDebugCheck((intptr_t(value) & 3) == 0);    // value must be 4-byte aligned.
+        return (uint32)_InterlockedExchangeAdd((long*)value, (long)value_to_add) + value_to_add;    // The intrinsic returns the value before the addition; add the operand to match the __sync_add_and_fetch implementations.
+    }
+
 #elif NV_CC_CLANG && (NV_OS_IOS || NV_OS_DARWIN)
     NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long));
 
@@ -177,14 +183,14 @@ namespace nv {
     inline uint32 atomicIncrement(uint32 * value)
     {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-
+        
         return __sync_add_and_fetch(value, 1);
     }
-
+    
     inline uint32 atomicDecrement(uint32 * value)
     {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-
+        
         return __sync_sub_and_fetch(value, 1);
     }
     
@@ -204,6 +210,12 @@ namespace nv {
         return __sync_lock_test_and_set(value, desired);
     }
 
+    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {  // Atomically adds value_to_add to *value; returns the value before the addition.
+        nvDebugCheck((intptr_t(value) & 3) == 0);   // 'value' must be 4-byte aligned.
+        return __sync_fetch_and_add(value, value_to_add);   // fetch-then-add, so the result matches _InterlockedExchangeAdd on the Win32 path.
+    }
+
+
 
 #elif NV_CC_CLANG && POSH_CPU_STRONGARM
     NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long));
@@ -288,6 +300,12 @@ namespace nv {
         // this is confusingly named, it doesn't actually do a test but always sets
         return __sync_lock_test_and_set(value, desired);
     }
+
+    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {  // Atomically adds value_to_add to *value; returns the value before the addition.
+        nvDebugCheck((intptr_t(value) & 3) == 0);   // 'value' must be 4-byte aligned.
+        return __sync_fetch_and_add(value, value_to_add);   // fetch-then-add, so the result matches _InterlockedExchangeAdd on the Win32 path.
+    }
+
     
 #else
 #error "Atomics not implemented."
diff --git a/src/nvthread/Event.cpp b/src/nvthread/Event.cpp
index 98a4bcc..92903a8 100644
--- a/src/nvthread/Event.cpp
+++ b/src/nvthread/Event.cpp
@@ -60,7 +60,7 @@ void Event::post() {
     
     //ACS: move this after the unlock?
     if(m->wait_count>0) {
-    pthread_cond_signal(&m->pt_cond);
+        pthread_cond_signal(&m->pt_cond);
     }
     
     pthread_mutex_unlock(&m->pt_mutex);
@@ -71,7 +71,7 @@ void Event::wait() {
     
     while(m->count==0) {
         m->wait_count++;
-    pthread_cond_wait(&m->pt_cond, &m->pt_mutex);
+        pthread_cond_wait(&m->pt_cond, &m->pt_mutex);
         m->wait_count--;
     }
     m->count--;
diff --git a/src/nvthread/Mutex.cpp b/src/nvthread/Mutex.cpp
index b657c2e..9d4aa66 100644
--- a/src/nvthread/Mutex.cpp
+++ b/src/nvthread/Mutex.cpp
@@ -13,6 +13,11 @@
 
 #endif // NV_OS
 
+#if NV_USE_TELEMETRY
+#include <telemetry.h>
+extern HTELEMETRY tmContext;
+#endif
+
 using namespace nv;
 
 
@@ -20,12 +25,17 @@ using namespace nv;
 
 struct Mutex::Private {
     CRITICAL_SECTION mutex;
+    const char * name;  // Label given to the constructor; only the pointer is kept, the string is not copied.
 };
 
 
-Mutex::Mutex () : m(new Private)
+Mutex::Mutex (const char * name) : m(new Private)
 {
     InitializeCriticalSection(&m->mutex);
+    m->name = name;
+#if NV_USE_TELEMETRY
+    tmLockName(tmContext, this, name);
+#endif
 }
 
 Mutex::~Mutex ()
@@ -35,16 +45,44 @@ Mutex::~Mutex ()
 
 void Mutex::lock()
 {
+#if NV_USE_TELEMETRY
+    TmU64 matcher;  // Filled by tmTryLockEx and handed back to tmEndTryLockEx below.
+    tmTryLockEx(tmContext, &matcher, 100/*0.1 ms*/, __FILE__, __LINE__, this, "blocked");   // NOTE(review): presumably marks the start of the wait; confirm against the Telemetry docs.
+#endif
+    
     EnterCriticalSection(&m->mutex);
+
+#if NV_USE_TELEMETRY
+    tmEndTryLockEx(tmContext, matcher, __FILE__, __LINE__, this, TMLR_SUCCESS);  // The blocking acquire above always ends holding the lock.
+    tmSetLockState(tmContext, this, TMLS_LOCKED, "acquired");
+#endif
 }
 
 bool Mutex::tryLock()
 {
+#if NV_USE_TELEMETRY
+    TmU64 matcher;  // Filled by tmTryLockEx and handed back to tmEndTryLockEx in either branch.
+    tmTryLockEx(tmContext, &matcher, 100/*0.1 ms*/, __FILE__, __LINE__, this, "blocked");
+    if (TryEnterCriticalSection(&m->mutex) != 0) {  // Non-zero means the critical section was entered.
+        tmEndTryLockEx(tmContext, matcher, __FILE__, __LINE__, this, TMLR_SUCCESS);
+        tmSetLockState(tmContext, this, TMLS_LOCKED, "acquired");
+        return true;
+    }
+    else {
+        tmEndTryLockEx(tmContext, matcher, __FILE__, __LINE__, this, TMLR_FAILED);  // Lock not taken; no lock-state change is reported.
+        return false;
+    }
+#else
     return TryEnterCriticalSection(&m->mutex) != 0;
+#endif
 }
 
 void Mutex::unlock()
 {
+#if NV_USE_TELEMETRY
+    tmSetLockState(tmContext, this, TMLS_RELEASED, "released");  // Reported before the critical section is actually left.
+#endif
+
     LeaveCriticalSection(&m->mutex);
 }
 
@@ -52,12 +90,14 @@ void Mutex::unlock()
 
 struct Mutex::Private {
     pthread_mutex_t mutex;
+    const char * name;  // Label given to the constructor; only the pointer is kept, the string is not copied.
 };
 
 
-Mutex::Mutex () : m(new Private)
+Mutex::Mutex (const char * name) : m(new Private)
 {
-    int result = pthread_mutex_init(&m->mutex , NULL);
+    int result = pthread_mutex_init(&m->mutex, NULL);
+    m->name = name;
     nvDebugCheck(result == 0);
 }
 
diff --git a/src/nvthread/Mutex.h b/src/nvthread/Mutex.h
index 53aeb60..3259b9a 100644
--- a/src/nvthread/Mutex.h
+++ b/src/nvthread/Mutex.h
@@ -15,7 +15,7 @@ namespace nv
     {
         NV_FORBID_COPY(Mutex);
     public:
-        Mutex ();
+        Mutex (const char * name);
         ~Mutex ();
 
         void lock();
diff --git a/src/nvthread/ParallelFor.cpp b/src/nvthread/ParallelFor.cpp
index 216c6d2..c8e901e 100644
--- a/src/nvthread/ParallelFor.cpp
+++ b/src/nvthread/ParallelFor.cpp
@@ -38,7 +38,7 @@ ParallelFor::~ParallelFor() {
 #endif
 }
 
-void ParallelFor::run(uint count) {
+void ParallelFor::run(uint count, bool calling_thread_process_work /*=false*/) {
 #if ENABLE_PARALLEL_FOR
     storeRelease(&this->count, count);
 
@@ -48,6 +48,10 @@ void ParallelFor::run(uint count) {
     // Start threads.
     pool->start(worker, this);
 
+    if (calling_thread_process_work) {
+        worker(this);
+    }
+
     // Wait for all threads to complete.
     pool->wait();
 
diff --git a/src/nvthread/ParallelFor.h b/src/nvthread/ParallelFor.h
index e3e0fb8..b442dc6 100644
--- a/src/nvthread/ParallelFor.h
+++ b/src/nvthread/ParallelFor.h
@@ -18,7 +18,7 @@ namespace nv
         ParallelFor(ForTask * task, void * context);
         ~ParallelFor();
 
-        void run(uint count);
+        void run(uint count, bool calling_thread_process_work = false);
 
         // Invariant:
         ForTask * task;
diff --git a/src/nvthread/Thread.cpp b/src/nvthread/Thread.cpp
index 6c16ad8..b9e3bc3 100644
--- a/src/nvthread/Thread.cpp
+++ b/src/nvthread/Thread.cpp
@@ -9,6 +9,12 @@
     #include <unistd.h> // usleep
 #endif
 
+#if NV_USE_TELEMETRY
+#include <telemetry.h>
+extern HTELEMETRY tmContext;
+#endif
+
+
 using namespace nv;
 
 struct Thread::Private
@@ -21,6 +27,7 @@ struct Thread::Private
 
     ThreadFunc * func;
     void * arg;
+    const char * name;
 };
 
 
@@ -32,6 +39,39 @@ unsigned long __stdcall threadFunc(void * arg) {
     return 0;
 }
 
+// SetThreadName implementation from msdn:
+// http://msdn.microsoft.com/en-us/library/xcb2z8hs.aspx
+
+const DWORD MS_VC_EXCEPTION=0x406D1388;  // Exception code raised by setThreadName below (value taken from the MSDN recipe).
+
+#pragma pack(push,8)
+typedef struct tagTHREADNAME_INFO
+{
+    DWORD dwType; // Must be 0x1000.
+    LPCSTR szName; // Pointer to name (in user addr space).
+    DWORD dwThreadID; // Thread ID (-1=caller thread).
+    DWORD dwFlags; // Reserved for future use, must be zero.
+} THREADNAME_INFO;
+#pragma pack(pop)
+
+static void setThreadName(DWORD dwThreadID, const char* threadName)  // Fills a THREADNAME_INFO and raises MS_VC_EXCEPTION with it.
+{
+    THREADNAME_INFO info;
+    info.dwType = 0x1000;
+    info.szName = threadName;    // Passed through as-is; no NULL check is done here.
+    info.dwThreadID = dwThreadID;
+    info.dwFlags = 0;
+
+    __try
+    {
+        RaiseException( MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info );  // The struct is passed as an array of ULONG_PTR arguments.
+    }
+    __except(EXCEPTION_EXECUTE_HANDLER)
+    {
+    }   // Empty handler: the exception is swallowed and execution continues normally.
+}
+
+
 #elif NV_OS_USE_PTHREAD
 
 extern "C" void * threadFunc(void * arg) {
@@ -46,6 +86,13 @@ extern "C" void * threadFunc(void * arg) {
 Thread::Thread() : p(new Private)
 {
     p->thread = 0;
+    p->name = NULL;   // Default-constructed threads have no name.
+}
+
+Thread::Thread(const char * const name) : p(new Private)   // Named variant; 'name' is kept by pointer, not copied.
+{
+    p->thread = 0;    // No thread handle until start() creates one.
+    p->name = name;
 }
 
 Thread::~Thread()
@@ -59,9 +106,20 @@ void Thread::start(ThreadFunc * func, void * arg)
     p->arg = arg;
 
 #if NV_OS_WIN32
-    p->thread = CreateThread(NULL, 0, threadFunc, p.ptr(), 0, NULL);
+    DWORD threadId;
+    p->thread = CreateThread(NULL, 0, threadFunc, p.ptr(), 0, &threadId);
     //p->thread = (HANDLE)_beginthreadex (0, 0, threadFunc, p.ptr(), 0, NULL);     // @@ So that we can call CRT functions...
     nvDebugCheck(p->thread != NULL);
+    setThreadName(threadId, p->name);
+#if NV_USE_TELEMETRY
+    tmThreadName(tmContext, threadId, p->name);
+#endif
+#elif NV_OS_ORBIS
+    int ret = scePthreadCreate(&p->thread, NULL, threadFunc, p.ptr(), p->name ? p->name : "nv::Thread");
+    nvDebugCheck(ret == 0);
+	// use any non-system core
+	scePthreadSetaffinity(p->thread, 0x3F);
+    scePthreadSetprio(p->thread, (SCE_KERNEL_PRIO_FIFO_DEFAULT + SCE_KERNEL_PRIO_FIFO_HIGHEST) / 2);
 #elif NV_OS_USE_PTHREAD
     int result = pthread_create(&p->thread, NULL, threadFunc, p.ptr());
     nvDebugCheck(result == 0);
diff --git a/src/nvthread/Thread.h b/src/nvthread/Thread.h
index 48fe800..5e0f0e2 100644
--- a/src/nvthread/Thread.h
+++ b/src/nvthread/Thread.h
@@ -17,6 +17,7 @@ namespace nv
         NV_FORBID_COPY(Thread);
     public:
         Thread();
+        Thread(const char * const name);
         ~Thread();
 
         void start(ThreadFunc * func, void * arg);
diff --git a/src/nvthread/ThreadPool.cpp b/src/nvthread/ThreadPool.cpp
index 8364c62..53667ae 100644
--- a/src/nvthread/ThreadPool.cpp
+++ b/src/nvthread/ThreadPool.cpp
@@ -14,7 +14,7 @@
 using namespace nv;
 
 #if PROTECT_THREAD_POOL 
-Mutex s_pool_mutex;
+Mutex s_pool_mutex("thread pool");
 #endif
 
 AutoPtr<ThreadPool> s_pool;
diff --git a/src/nvthread/ThreadPool.h b/src/nvthread/ThreadPool.h
index f1bd620..fb75b6d 100644
--- a/src/nvthread/ThreadPool.h
+++ b/src/nvthread/ThreadPool.h
@@ -12,7 +12,7 @@
 // The thread pool creates one worker thread for each physical core. 
 // The threads are idle waiting for their start events so that they do not consume any resources while inactive. 
 // The thread pool runs the same function in all worker threads, the idea is to use this as the foundation of a custom task scheduler.
-// When the thread pool starts, the main thread continues running, but the common use case is to inmmediately wait of the termination events of the worker threads.
+// When the thread pool starts, the main thread continues running, but the common use case is to immediately wait for the termination events of the worker threads.
 // @@ The start and wait methods could probably be merged.
 
 namespace nv {
diff --git a/src/nvthread/nvthread.cpp b/src/nvthread/nvthread.cpp
index 9de9a81..fe20592 100644
--- a/src/nvthread/nvthread.cpp
+++ b/src/nvthread/nvthread.cpp
@@ -72,3 +72,10 @@ uint nv::hardwareThreadCount() {
 #endif
 }
 
+uint nv::threadId() {   // Identifier of the calling thread (Win32 only, see the #else branch).
+#if NV_OS_WIN32
+    return GetCurrentThreadId();
+#else
+    return 0;   // @@ Not implemented on other platforms: every thread reports 0.
+#endif
+}
\ No newline at end of file
diff --git a/src/nvthread/nvthread.h b/src/nvthread/nvthread.h
index aa236d3..c246b57 100644
--- a/src/nvthread/nvthread.h
+++ b/src/nvthread/nvthread.h
@@ -90,6 +90,8 @@ namespace nv
     void shutWorkers();
     void setWorkerFunction(void * func);
 
+    uint threadId();
+
 } // nv namespace
 
 
diff --git a/src/nvtt/BlockCompressor.cpp b/src/nvtt/BlockCompressor.cpp
index 6998e1b..6b39636 100644
--- a/src/nvtt/BlockCompressor.cpp
+++ b/src/nvtt/BlockCompressor.cpp
@@ -113,22 +113,23 @@ void ColorBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, c
 */
 
 
-struct ColorBlockCompressorContext
+struct CompressorContext
 {
     nvtt::AlphaMode alphaMode;
-    uint w, h;
+    uint w, h, d;
     const float * data;
     const nvtt::CompressionOptions::Private * compressionOptions;
 
     uint bw, bh, bs;
     uint8 * mem;
-    ColorBlockCompressor * compressor;
+    CompressorInterface * compressor;
 };
 
+
 // Each task compresses one block.
 void ColorBlockCompressorTask(void * data, int i)
 {
-    ColorBlockCompressorContext * d = (ColorBlockCompressorContext *) data;
+    CompressorContext * d = (CompressorContext *) data;
 
     uint x = i % d->bw;
     uint y = i / d->bw;
@@ -139,7 +140,7 @@ void ColorBlockCompressorTask(void * data, int i)
         rgba.init(d->w, d->h, d->data, 4*x, 4*y);
 
         uint8 * ptr = d->mem + (y * d->bw + x) * d->bs;
-        d->compressor->compressBlock(rgba, d->alphaMode, *d->compressionOptions, ptr);
+        ((ColorBlockCompressor *) d->compressor)->compressBlock(rgba, d->alphaMode, *d->compressionOptions, ptr);
     }
 }
 
@@ -147,10 +148,11 @@ void ColorBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, u
 {
     nvDebugCheck(d == 1);
 
-    ColorBlockCompressorContext context;
+    CompressorContext context;
     context.alphaMode = alphaMode;
     context.w = w;
     context.h = h;
+    context.d = d;
     context.data = data;
     context.compressionOptions = &compressionOptions;
 
@@ -181,23 +183,11 @@ void ColorBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, u
 }
 
 
-struct ColorSetCompressorContext
-{
-    nvtt::AlphaMode alphaMode;
-    uint w, h;
-    const float * data;
-    const nvtt::CompressionOptions::Private * compressionOptions;
-
-    uint bw, bh, bs;
-    uint8 * mem;
-    ColorSetCompressor * compressor;
-};
-
-
+#if 0
 // Each task compresses one block.
 void ColorSetCompressorTask(void * data, int i)
 {
-    ColorSetCompressorContext * d = (ColorSetCompressorContext *) data;
+    CompressorContext * d = (CompressorContext *) data;
 
     uint x = i % d->bw;
     uint y = i / d->bw;
@@ -208,7 +198,7 @@ void ColorSetCompressorTask(void * data, int i)
         set.setColors(d->data, d->w, d->h, x * 4, y * 4);
 
         uint8 * ptr = d->mem + (y * d->bw + x) * d->bs;
-        d->compressor->compressBlock(set, d->alphaMode, *d->compressionOptions, ptr);
+        ((ColorSetCompressor *)d->compressor)->compressBlock(set, d->alphaMode, *d->compressionOptions, ptr);
     }
 }
 
@@ -217,7 +207,7 @@ void ColorSetCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d, c
 {
     nvDebugCheck(d == 1);
 
-    ColorSetCompressorContext context;
+    CompressorContext context;
     context.alphaMode = alphaMode;
     context.w = w;
     context.h = h;
@@ -249,3 +239,97 @@ void ColorSetCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d, c
 
     delete [] context.mem;
 }
+#endif // 0
+
+
+// Each task compresses one block: gathers the 4x4 texels of block 'i' from the planar float image and hands them to the compressor.
+void FloatColorCompressorTask(void * data, int i)
+{
+    CompressorContext * d = (CompressorContext *) data;
+
+    // Copy image to block. Blocks are numbered row by row, d->bw blocks per row.
+    const uint block_x = (i % d->bw);
+    const uint block_y = (i / d->bw);
+
+    const uint src_x_offset = block_x * 4;   // Top-left texel of this block in the source image.
+    const uint src_y_offset = block_y * 4;
+
+    const float * r = (const float *)d->data + d->w * d->h * d->d * 0;   // Channels are read as four consecutive planes of w*h*d floats.
+    const float * g = (const float *)d->data + d->w * d->h * d->d * 1;
+    const float * b = (const float *)d->data + d->w * d->h * d->d * 2;
+    const float * a = (const float *)d->data + d->w * d->h * d->d * 3;
+
+    Vector4 colors[16];
+    float weights[16];
+    // Clamp to the texels that remain from the block origin, so edge blocks of images whose size is not a multiple of 4 never read outside the image.
+    const uint block_w = min(d->w - src_x_offset, 4U);
+    const uint block_h = min(d->h - src_y_offset, 4U);
+
+    uint x, y;
+    for (y = 0; y < block_h; y++) {
+        for (x = 0; x < block_w; x++) {
+            uint dst_idx = 4 * y + x;
+            uint src_idx = (y + src_y_offset) * d->w + (x + src_x_offset);
+            colors[dst_idx].x = r[src_idx];
+            colors[dst_idx].y = g[src_idx];
+            colors[dst_idx].z = b[src_idx];
+            colors[dst_idx].w = a[src_idx];
+            weights[dst_idx] = (d->alphaMode == nvtt::AlphaMode_Transparency) ? a[src_idx] : 1.0f;   // Alpha is the weight only in transparency mode.
+        }
+        for (; x < 4; x++) {   // Pad the columns that fall outside the image with zero color and zero weight.
+            uint dst_idx = 4 * y + x;
+            colors[dst_idx] = Vector4(0);
+            weights[dst_idx] = 0.0f;
+        }
+    }
+    for (; y < 4; y++) {   // Pad the rows that fall outside the image the same way.
+        for (x = 0; x < 4; x++) {
+            uint dst_idx = 4 * y + x;
+            colors[dst_idx] = Vector4(0);
+            weights[dst_idx] = 0.0f;
+        }
+    }
+
+    // Compress block. Each block writes d->bs bytes at its own slot in the shared output buffer.
+    uint8 * output = d->mem + (block_y * d->bw + block_x) * d->bs;
+    ((FloatColorCompressor *)d->compressor)->compressBlock(colors, weights, *d->compressionOptions, output);
+}
+
+// Compresses a w x h float image one 4x4 block per task into a temporary buffer, then writes that buffer through outputOptions.
+void FloatColorCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+{
+    nvDebugCheck(d == 1);   // @@ Add support for compressed 3D textures.
+
+    CompressorContext context;   // Shared, read-mostly state handed to every FloatColorCompressorTask.
+    context.alphaMode = alphaMode;
+    context.w = w;
+    context.h = h;
+    context.d = d;
+    context.data = data;
+    context.compressionOptions = &compressionOptions;
+
+    context.bs = blockSize();    // Bytes per compressed block, supplied by the concrete compressor.
+    context.bw = (w + 3) / 4;    // Block counts, rounded up so partial edge blocks are included.
+    context.bh = (h + 3) / 4;
+
+    context.compressor = this;
+
+    SequentialTaskDispatcher sequential;
+
+    // Use a single thread to compress small textures.
+    if (context.bh < 4) dispatcher = &sequential;
+
+#if _DEBUG
+    dispatcher = &sequential;    // Debug builds always run the tasks sequentially.
+#endif
+
+    const uint count = context.bw * context.bh;
+    const uint size = context.bs * count;
+    context.mem = new uint8[size];   // Output for all blocks; released at the end of this function.
+
+    dispatcher->dispatch(FloatColorCompressorTask, &context, count);
+
+    outputOptions.writeData(context.mem, size);
+
+    delete [] context.mem;
+}
diff --git a/src/nvtt/BlockCompressor.h b/src/nvtt/BlockCompressor.h
index cc829ce..7514bde 100644
--- a/src/nvtt/BlockCompressor.h
+++ b/src/nvtt/BlockCompressor.h
@@ -30,8 +30,8 @@
 
 namespace nv
 {
-    struct ColorSet;
     struct ColorBlock;
+    class Vector4;
 
     struct ColorBlockCompressor : public CompressorInterface
     {
@@ -41,11 +41,11 @@ namespace nv
         virtual uint blockSize() const = 0;
     };
 
-    struct ColorSetCompressor : public CompressorInterface
+    struct FloatColorCompressor : public CompressorInterface
     {
         virtual void compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * rgba, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 
-        virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+        virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
         virtual uint blockSize() const = 0;
     };
 
diff --git a/src/nvtt/CMakeLists.txt b/src/nvtt/CMakeLists.txt
index 932eaf2..2cc78bc 100644
--- a/src/nvtt/CMakeLists.txt
+++ b/src/nvtt/CMakeLists.txt
@@ -12,6 +12,7 @@ SET(NVTT_SRCS
     CompressorDX10.h CompressorDX10.cpp
     CompressorDX11.h CompressorDX11.cpp
     CompressorDXT1.h CompressorDXT1.cpp
+    CompressorDXT5_RGBM.h CompressorDXT5_RGBM.cpp
     CompressorRGB.h CompressorRGB.cpp
     Context.h Context.cpp
     QuickCompressDXT.h QuickCompressDXT.cpp
diff --git a/src/nvtt/ClusterFit.cpp b/src/nvtt/ClusterFit.cpp
index 7b91e2b..b3b2f1a 100644
--- a/src/nvtt/ClusterFit.cpp
+++ b/src/nvtt/ClusterFit.cpp
@@ -38,7 +38,7 @@ ClusterFit::ClusterFit()
 {
 }
 
-// @@ Deprecate. Do not use color set directly.
+#if 0 // @@ Deprecate. Do not use color set directly.
 void ClusterFit::setColorSet(const ColorSet * set) 
 {
     // initialise the best error
@@ -108,6 +108,7 @@ void ClusterFit::setColorSet(const ColorSet * set)
 #endif
     }
 }
+#endif // 0
 
 
 void ClusterFit::setColorSet(const Vector3 * colors, const float * weights, int count)
diff --git a/src/nvtt/ClusterFit.h b/src/nvtt/ClusterFit.h
index 72c9ef9..4f29680 100644
--- a/src/nvtt/ClusterFit.h
+++ b/src/nvtt/ClusterFit.h
@@ -43,7 +43,7 @@ namespace nv {
     public:
         ClusterFit();
 
-        void setColorSet(const ColorSet * set);
+        //void setColorSet(const ColorSet * set);
         void setColorSet(const Vector3 * colors, const float * weights, int count);
 
         void setColorWeights(const Vector4 & w);
diff --git a/src/nvtt/CompressionOptions.cpp b/src/nvtt/CompressionOptions.cpp
index 3951c32..a899a67 100644
--- a/src/nvtt/CompressionOptions.cpp
+++ b/src/nvtt/CompressionOptions.cpp
@@ -248,7 +248,7 @@ unsigned int CompressionOptions::d3d9Format() const
 		    0,              // Format_CTX1
             MAKEFOURCC('B', 'C', '6', 'H'),     // Format_BC6
             MAKEFOURCC('B', 'C', '7', 'L'),     // Format_BC7
-            FOURCC_ATI2,    // Format_BC5_Luma
+            //FOURCC_ATI2,    // Format_BC5_Luma
             FOURCC_DXT5,    // Format_BC3_RGBM
         };
 
diff --git a/src/nvtt/CompressorDX10.cpp b/src/nvtt/CompressorDX10.cpp
index d823db8..7a7842d 100644
--- a/src/nvtt/CompressorDX10.cpp
+++ b/src/nvtt/CompressorDX10.cpp
@@ -85,6 +85,7 @@ void ProductionCompressorBC5::compressBlock(ColorBlock & src, nvtt::AlphaMode al
 }
 
 
+#if 0
 void ProductionCompressorBC5_Luma::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
     BlockATI2 * block = new(output) BlockATI2;
@@ -118,3 +119,4 @@ void ProductionCompressorBC5_Luma::compressBlock(ColorSet & set, nvtt::AlphaMode
     OptimalCompress::compressDXT5A(tmp, &block->y);
    
 }
+#endif // 0
\ No newline at end of file
diff --git a/src/nvtt/CompressorDX10.h b/src/nvtt/CompressorDX10.h
index 0ea16c3..67addd3 100644
--- a/src/nvtt/CompressorDX10.h
+++ b/src/nvtt/CompressorDX10.h
@@ -58,11 +58,11 @@ namespace nv
 		virtual uint blockSize() const { return 16; }
 	};
 
-    struct ProductionCompressorBC5_Luma : public ColorSetCompressor
+    /*struct ProductionCompressorBC5_Luma : public ColorSetCompressor
 	{
 		virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
 		virtual uint blockSize() const { return 16; }
-	};
+	};*/
 
 
 } // nv namespace
diff --git a/src/nvtt/CompressorDX11.cpp b/src/nvtt/CompressorDX11.cpp
index cf83a69..a349ffa 100644
--- a/src/nvtt/CompressorDX11.cpp
+++ b/src/nvtt/CompressorDX11.cpp
@@ -39,7 +39,7 @@ using namespace nv;
 using namespace nvtt;
 
 
-void CompressorBC6::compressBlock(ColorSet & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
+void CompressorBC6::compressBlock(const Vector4 colors[16], const float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
 {
     // !!!UNDONE: support channel weights
     // !!!UNDONE: set flags once, not per block (this is especially sketchy since block compression is multithreaded...)
@@ -56,57 +56,45 @@ void CompressorBC6::compressBlock(ColorSet & tile, AlphaMode alphaMode, const Co
     }
 
     // Convert NVTT's tile struct to ZOH's, and convert float to half.
-    ZOH::Tile zohTile(tile.w, tile.h);
+    ZOH::Tile zohTile(4, 4);
     memset(zohTile.data, 0, sizeof(zohTile.data));
     memset(zohTile.importance_map, 0, sizeof(zohTile.importance_map));
-    for (uint y = 0; y < tile.h; ++y)
+    for (uint y = 0; y < 4; ++y)
     {
-        for (uint x = 0; x < tile.w; ++x)
+        for (uint x = 0; x < 4; ++x)
         {
-            Vector4 color = tile.color(x, y);
+            Vector4 color = colors[4*y+x];
             uint16 rHalf = to_half(color.x);
             uint16 gHalf = to_half(color.y);
             uint16 bHalf = to_half(color.z);
             zohTile.data[y][x].x = ZOH::Tile::half2float(rHalf);
             zohTile.data[y][x].y = ZOH::Tile::half2float(gHalf);
             zohTile.data[y][x].z = ZOH::Tile::half2float(bHalf);
-
-            if (alphaMode == AlphaMode_Transparency) {
-                zohTile.importance_map[y][x] = color.w;
-            }
-            else {
-                zohTile.importance_map[y][x] = 1.0f;
-            }
+            zohTile.importance_map[y][x] = weights[4*y+x];
         }
     }
 
     ZOH::compress(zohTile, (char *)output);
 }
 
-void CompressorBC7::compressBlock(ColorSet & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
+void CompressorBC7::compressBlock(const Vector4 colors[16], const float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
 {
     // !!!UNDONE: support channel weights
     // !!!UNDONE: set flags once, not per block (this is especially sketchy since block compression is multithreaded...)
 
     AVPCL::mode_rgb = false;
-    AVPCL::flag_premult = (alphaMode == AlphaMode_Premultiplied);
+    AVPCL::flag_premult = false; //(alphaMode == AlphaMode_Premultiplied);
     AVPCL::flag_nonuniform = false;
     AVPCL::flag_nonuniform_ati = false;
     
     // Convert NVTT's tile struct to AVPCL's.
-    AVPCL::Tile avpclTile(tile.w, tile.h);
+    AVPCL::Tile avpclTile(4, 4);
     memset(avpclTile.data, 0, sizeof(avpclTile.data));
-    for (uint y = 0; y < tile.h; ++y) {
-        for (uint x = 0; x < tile.w; ++x) {
-            Vector4 color = tile.color(x, y);
+    for (uint y = 0; y < 4; ++y) {
+        for (uint x = 0; x < 4; ++x) {
+            Vector4 color = colors[4*y+x];
             avpclTile.data[y][x] = color * 255.0f;
-            
-            /*if (alphaMode == AlphaMode_Transparency) {
-                avpclTile.importance_map[y][x] = color.w;
-            }
-            else*/ {
-                avpclTile.importance_map[y][x] = 1.0f;
-            }
+            avpclTile.importance_map[y][x] = 1.0f; //weights[4*y+x];
         }
     }
 
diff --git a/src/nvtt/CompressorDX11.h b/src/nvtt/CompressorDX11.h
index 3dda9ea..7afaacb 100644
--- a/src/nvtt/CompressorDX11.h
+++ b/src/nvtt/CompressorDX11.h
@@ -28,15 +28,15 @@
 
 namespace nv
 {
-    struct CompressorBC6 : public ColorSetCompressor
+    struct CompressorBC6 : public FloatColorCompressor
     {
-        virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 16; }
     };
 
-    struct CompressorBC7 : public ColorSetCompressor
+    struct CompressorBC7 : public FloatColorCompressor
     {
-        virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 16; }
     };
 	
diff --git a/src/nvtt/CompressorDX9.cpp b/src/nvtt/CompressorDX9.cpp
index aaef88d..9cfd7da 100644
--- a/src/nvtt/CompressorDX9.cpp
+++ b/src/nvtt/CompressorDX9.cpp
@@ -28,6 +28,8 @@
 #include "CompressionOptions.h"
 #include "OutputOptions.h"
 #include "ClusterFit.h"
+#include "CompressorDXT1.h"
+#include "CompressorDXT5_RGBM.h"
 
 // squish
 #include "squish/colourset.h"
@@ -113,102 +115,13 @@ void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alpha
 }
 
 
-namespace nv {
-    float compress_dxt1(const Vector3 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output);
-}
-
-#if 1
-void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
 #if 1
-    // @@ This setup is the same for all compressors.
-    Vector3 input_colors[16];
-    float input_weights[16];
-
-    uint x, y;
-    for (y = 0; y < set.h; y++) {
-        for (x = 0; x < set.w; x++) {
-            input_colors[4*y+x] = set.color(x, y).xyz();
-            input_weights[4*y+x] = 1.0f;
-            if (alphaMode == nvtt::AlphaMode_Transparency) input_weights[4*y+x] = set.color(x, y).z;
-        }
-        for (; x < 4; x++) {
-            input_colors[4*y+x] = Vector3(0);
-            input_weights[4*y+x] = 0.0f;
-        }
-    }
-    for (; y < 4; y++) {
-        for (x = 0; x < 4; x++) {
-            input_colors[4*y+x] = Vector3(0);
-            input_weights[4*y+x] = 0.0f;
-        }
-    }
-
-    compress_dxt1(input_colors, input_weights, compressionOptions.colorWeight.xyz(), (BlockDXT1 *)output);
-
-#else
-    set.setUniformWeights();
-    set.createMinimalSet(/*ignoreTransparent*/false);
-
-    BlockDXT1 * block = new(output) BlockDXT1;
-    
-    if (set.isSingleColor(/*ignoreAlpha*/true))
-    {
-        Color32 c = toColor32(set.colors[0]);
-        OptimalCompress::compressDXT1(c, block);
-    }
-    /*else if (set.colorCount == 2) {
-        QuickCompress::compressDXT1(..., block);
-    }*/
-    else
-    {
-        ClusterFit fit;
-        fit.setColorWeights(compressionOptions.colorWeight);
-        fit.setColorSet(&set);
-
-        Vector3 start, end;
-        fit.compress4(&start, &end);
-
-        if (fit.compress3(&start, &end)) {
-            QuickCompress::outputBlock3(set, start, end, block);
-        }
-        else {
-            QuickCompress::outputBlock4(set, start, end, block);        
-        }
-    }
-#endif
-}
-#elif 0
-
-
-extern void compress_dxt1_bounding_box_exhaustive(const ColorBlock & input, BlockDXT1 * output);
 
-
-void CompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+void CompressorDXT1::compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
-    BlockDXT1 * block = new(output) BlockDXT1;
-
-    if (rgba.isSingleColor())
-    {
-        OptimalCompress::compressDXT1(rgba.color(0), block);
-        //compress_dxt1_single_color_optimal(rgba.color(0), block);
-    }
-    else
-    {
-        // Do an exhaustive search inside the bounding box.
-        compress_dxt1_bounding_box_exhaustive(rgba, block);
-    }
-
-    /*else
-    {
-        nvsquish::WeightedClusterFit fit;
-        fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
-
-        nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0);
-        fit.SetColourSet(&colours, nvsquish::kDxt1);
-        fit.Compress(output);
-    }*/
+    compress_dxt1(colors, weights, compressionOptions.colorWeight.xyz(), /*three_color_mode*/true, (BlockDXT1 *)output);
 }
+
 #else
 void CompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
@@ -371,309 +284,13 @@ void CompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode
 }
 
 
-
-
-
-void CompressorBC3_RGBM::compressBlock(ColorSet & src, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+void CompressorBC3_RGBM::compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
-    BlockDXT5 * block = new(output)BlockDXT5;
-
-    if (alphaMode == AlphaMode_Transparency) {
-        src.setAlphaWeights();
-    }
-    else {
-        src.setUniformWeights();
-    }
-
-    // Decompress the color block and find the M values that reproduce the input most closely. This should compensate for some of the DXT errors.
-
-    // Compress the resulting M values optimally.
-
-    // Repeat this several times until compression error does not improve?
-
-    //Vector3 rgb_block[16];
-    //float m_block[16];
-
-    
-    // Init RGB/M block.
-    const float threshold = 0.15f; // @@ Use compression options.
-#if 0
-    nvsquish::WeightedClusterFit fit;
-
-    ColorBlock rgba;
-    for (int i = 0; i < 16; i++) {
-        const Vector4 & c = src.color(i);
-        float R = saturate(c.x);
-        float G = saturate(c.y);
-        float B = saturate(c.z);
-
-        float M = max(max(R, G), max(B, threshold));
-        float r = R / M;
-        float g = G / M;
-        float b = B / M;
-        float a = c.w;
-
-        rgba.color(i) = toColor32(Vector4(r, g, b, a));
-    }
-
-    if (rgba.isSingleColor())
-    {
-        OptimalCompress::compressDXT1(rgba.color(0), &block->color);
-    }
-    else
-    {
-        nvsquish::WeightedClusterFit fit;
-        fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
-
-        int flags = 0;
-        if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
-
-        nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-        fit.SetColourSet(&colours, 0);
-        fit.Compress(&block->color);
-    }
-#endif
-#if 1
-    ColorSet rgb;
-    rgb.allocate(src.w, src.h);     // @@ Handle smaller blocks.
-
-    if (src.colorCount != 16) {
-        nvDebugBreak();
-    }
-
-    for (uint i = 0; i < src.colorCount; i++) {
-        const Vector4 & c = src.color(i);
-
-        float R = saturate(c.x);
-        float G = saturate(c.y);
-        float B = saturate(c.z);
-
-        float M = max(max(R, G), max(B, threshold));
-        float r = R / M;
-        float g = G / M;
-        float b = B / M;
-        float a = c.w;
-
-        rgb.colors[i] = Vector4(r, g, b, a);
-        rgb.indices[i] = i;
-        rgb.weights[i] = max(c.w, 0.001f);// src.weights[i];   // IC: For some reason 0 weights are causing problems, even if we eliminate the corresponding colors from the set.
-    }
-
-    rgb.createMinimalSet(/*ignoreTransparent=*/true);
-
-    if (rgb.isSingleColor(/*ignoreAlpha=*/true)) {
-        OptimalCompress::compressDXT1(toColor32(rgb.color(0)), &block->color);
-    }
-    else {
-        ClusterFit fit;
-        fit.setColorWeights(compressionOptions.colorWeight);
-        fit.setColorSet(&rgb);
-
-        Vector3 start, end;
-        fit.compress4(&start, &end);
-
-        QuickCompress::outputBlock4(rgb, start, end, &block->color);
-    }
-#endif
-
-    // Decompress RGB/M block.
-    nv::ColorBlock RGB;
-    block->color.decodeBlock(&RGB);
-    
-#if 1
-    AlphaBlock4x4 M;
-    for (int i = 0; i < 16; i++) {
-        const Vector4 & c = src.color(i);
-        float R = saturate(c.x);
-        float G = saturate(c.y);
-        float B = saturate(c.z);
-
-        float r = RGB.color(i).r / 255.0f;
-        float g = RGB.color(i).g / 255.0f;
-        float b = RGB.color(i).b / 255.0f;
-
-        float m = (R / r + G / g + B / b) / 3.0f;
-        //float m = max((R / r + G / g + B / b) / 3.0f, threshold);
-        //float m = max(max(R / r, G / g), max(B / b, threshold));
-        //float m = max(max(R, G), max(B, threshold));
-
-        m = (m - threshold) / (1 - threshold);
-
-        M.alpha[i] = U8(ftoi_round(saturate(m) * 255.0f));
-        M.weights[i] = src.weights[i];
-    }
-
-    // Compress M.
-    if (compressionOptions.quality == Quality_Fastest) {
-        QuickCompress::compressDXT5A(M, &block->alpha);
-    }
-    else {
-        OptimalCompress::compressDXT5A(M, &block->alpha);
-    }
-#else
-    OptimalCompress::compressDXT5A_RGBM(src, RGB, &block->alpha);
-#endif
-
-#if 0
-    // Decompress M.
-    block->alpha.decodeBlock(&M);
-
-    rgb.allocate(src.w, src.h);     // @@ Handle smaller blocks.
-
-    for (uint i = 0; i < src.colorCount; i++) {
-        const Vector4 & c = src.color(i);
-
-        float R = saturate(c.x);
-        float G = saturate(c.y);
-        float B = saturate(c.z);
-
-        //float m = max(max(R, G), max(B, threshold));
-        float m = float(M.alpha[i]) / 255.0f * (1 - threshold) + threshold;
-        float r = R / m;
-        float g = G / m;
-        float b = B / m;
-        float a = c.w;
-
-        rgb.colors[i] = Vector4(r, g, b, a);
-        rgb.indices[i] = i;
-        rgb.weights[i] = max(c.w, 0.001f);// src.weights[i];   // IC: For some reason 0 weights are causing problems, even if we eliminate the corresponding colors from the set.
-    }
-
-    rgb.createMinimalSet(/*ignoreTransparent=*/true);
-
-    if (rgb.isSingleColor(/*ignoreAlpha=*/true)) {
-        OptimalCompress::compressDXT1(toColor32(rgb.color(0)), &block->color);
-    }
-    else {
-        ClusterFit fit;
-        fit.setMetric(compressionOptions.colorWeight);
-        fit.setColourSet(&rgb);
-
-        Vector3 start, end;
-        fit.compress4(&start, &end);
-
-        QuickCompress::outputBlock4(rgb, start, end, &block->color);
-    }
-#endif
-
-#if 0
-    block->color.decodeBlock(&RGB);
-
-    //AlphaBlock4x4 M;
-    //M.initWeights(src);
-    
-    for (int i = 0; i < 16; i++) {
-        const Vector4 & c = src.color(i);
-        float R = saturate(c.x);
-        float G = saturate(c.y);
-        float B = saturate(c.z);
-
-        float r = RGB.color(i).r / 255.0f;
-        float g = RGB.color(i).g / 255.0f;
-        float b = RGB.color(i).b / 255.0f;
-
-        float m = (R / r + G / g + B / b) / 3.0f;
-        //float m = max((R / r + G / g + B / b) / 3.0f, threshold);
-        //float m = max(max(R / r, G / g), max(B / b, threshold));
-        //float m = max(max(R, G), max(B, threshold));
-
-        m = (m - threshold) / (1 - threshold);
-
-        M.alpha[i] = U8(ftoi_round(saturate(m) * 255.0f));
-        M.weights[i] = src.weights[i];
-    }
-
-    // Compress M.
-    if (compressionOptions.quality == Quality_Fastest) {
-        QuickCompress::compressDXT5A(M, &block->alpha);
-    }
-    else {
-        OptimalCompress::compressDXT5A(M, &block->alpha);
-    }
-#endif
-
-
-
-#if 0
-    src.fromRGBM(M, threshold);
-
-    src.createMinimalSet(/*ignoreTransparent=*/true);
-
-    if (src.isSingleColor(/*ignoreAlpha=*/true)) {
-        OptimalCompress::compressDXT1(src.color(0), &block->color);
-    }
-    else {
-        // @@ Use our improved compressor.
-        ClusterFit fit;
-        fit.setMetric(compressionOptions.colorWeight);
-        fit.setColourSet(&src);
-
-        Vector3 start, end;
-        fit.compress4(&start, &end);
-
-        if (fit.compress3(&start, &end)) {
-            QuickCompress::outputBlock3(src, start, end, block->color);
-        }
-        else {
-            QuickCompress::outputBlock4(src, start, end, block->color);
-        }
-    }
-#endif // 0
-
-    // @@ Decompress color and compute M that best approximates src with these colors? Then compress M again?
-
-
-
-    // RGBM encoding.
-    // Maximize precision.
-    // - Number of possible grey levels:
-    //   - Naive:  2^3 = 8
-    //   - Better: 2^3 + 2^2 = 12
-    //   - How to choose threshold? 
-    //     - Ideal = Adaptive per block, don't know where to store.
-    //     - Adaptive per lightmap. How to compute optimal?
-    //     - Fixed: 0.25 in our case. Lightmaps scaled to a fixed [0, 1] range.
-
-    // - Optimal compressor: Interpolation artifacts.
-
-    // - Color transform. 
-    //    - Measure error in post-tone-mapping color space. 
-    //    - Assume a simple tone mapping operator. We know minimum and maximum exposure, but don't know exact exposure in game.
-    //    - Guess based on average lighmap color? Use fixed exposure, in scaled lightmap space.
-
-    // - Enhanced DXT compressor.
-    //    - Typical RGBM encoding as follows:
-    //      rgb -> M = max(rgb), RGB=rgb/M -> RGBM
-    //    - If we add a compression step (M' = M) and M' < M, then rgb may be greater than 1.
-    //      - We could ensure that M' >= M during compression.
-    //      - We could clamp RGB anyway.
-    //      - We could add a fixed scale value to take into account compression errors and avoid clamping.
-
-
-    
-
-
-    // Compress color.
-    /*if (rgba.isSingleColor())
-    {
-        OptimalCompress::compressDXT1(rgba.color(0), &block->color);
-    }
-    else
-    {
-        nvsquish::WeightedClusterFit fit;
-        fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
-
-        int flags = 0;
-        if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
-
-        nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-        fit.SetColourSet(&colours, 0);
-        fit.Compress(&block->color);
-    }*/
+    float min_m = 0.25f; // @@ Get from compression options.
+    compress_dxt5_rgbm(colors, weights, min_m, (BlockDXT5 *)output);
 }
 
 
-
 #if defined(HAVE_ATITC)
 
 void AtiCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
diff --git a/src/nvtt/CompressorDX9.h b/src/nvtt/CompressorDX9.h
index 33c1112..8a298c6 100644
--- a/src/nvtt/CompressorDX9.h
+++ b/src/nvtt/CompressorDX9.h
@@ -65,9 +65,9 @@ namespace nv
 
     // Normal CPU compressors.
 #if 1
-    struct CompressorDXT1 : public ColorSetCompressor
+    struct CompressorDXT1 : public FloatColorCompressor
     {
-        virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 8; }
     };
 #else
@@ -108,9 +108,9 @@ namespace nv
         virtual uint blockSize() const { return 16; }
     };
 
-    struct CompressorBC3_RGBM : public ColorSetCompressor
+    struct CompressorBC3_RGBM : public FloatColorCompressor
     {
-        virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 16; }
     };
 
diff --git a/src/nvtt/CompressorDXT1.cpp b/src/nvtt/CompressorDXT1.cpp
index b5c4b54..08134f8 100644
--- a/src/nvtt/CompressorDXT1.cpp
+++ b/src/nvtt/CompressorDXT1.cpp
@@ -2,7 +2,6 @@
 #include "CompressorDXT1.h"
 #include "SingleColorLookup.h"
 #include "ClusterFit.h"
-#include "QuickCompressDXT.h"  // Deprecate.
 
 #include "nvimage/ColorBlock.h"
 #include "nvimage/BlockDXT.h"
@@ -162,12 +161,12 @@ static bool is_single_color_rgb(const Vector3 * colors, const float * weights, i
 }
 
 // Find similar colors and combine them together.
-static int reduce_colors(const Vector3 * input_colors, const float * input_weights, Vector3 * colors, float * weights)
+static int reduce_colors(const Vector4 * input_colors, const float * input_weights, Vector3 * colors, float * weights)
 {
     int n = 0;
     for (int i = 0; i < 16; i++)
     {
-        Vector3 ci = input_colors[i];
+        Vector3 ci = input_colors[i].xyz();
         float wi = input_weights[i];
 
         if (wi > 0) {
@@ -276,7 +275,7 @@ static float evaluate_mse(const BlockDXT1 * output, const Vector3 colors[16]) {
 }
 #endif
 
-static float evaluate_mse(const Vector3 colors[16], const float weights[16], const Vector3 & color_weights, const BlockDXT1 * output) {
+static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const BlockDXT1 * output) {
     Color32 palette[4];
     output->evaluatePalette(palette, /*d3d9=*/false);
 
@@ -290,7 +289,7 @@ static float evaluate_mse(const Vector3 colors[16], const float weights[16], con
     float error = 0.0f;
     for (int i = 0; i < 16; i++) {
         int index = (output->indices >> (2 * i)) & 3;
-        error += weights[i] * evaluate_mse(vector_palette[index], colors[i], color_weights);
+        error += input_weights[i] * evaluate_mse(vector_palette[index], input_colors[i].xyz(), color_weights);
     }
     return error;
 }
@@ -353,14 +352,14 @@ static void evaluate_palette3(Color16 c0, Color16 c1, Vector3 palette[4]) {
 
 
 
-static uint compute_indices4(const Vector3 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
+static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
     
     uint indices = 0;
 	for (int i = 0; i < 16; i++) {
-		float d0 = evaluate_mse(palette[0], input_colors[i], color_weights);
-		float d1 = evaluate_mse(palette[1], input_colors[i], color_weights);
-		float d2 = evaluate_mse(palette[2], input_colors[i], color_weights);
-		float d3 = evaluate_mse(palette[3], input_colors[i], color_weights);
+		float d0 = evaluate_mse(palette[0], input_colors[i].xyz(), color_weights);
+		float d1 = evaluate_mse(palette[1], input_colors[i].xyz(), color_weights);
+		float d2 = evaluate_mse(palette[2], input_colors[i].xyz(), color_weights);
+		float d3 = evaluate_mse(palette[3], input_colors[i].xyz(), color_weights);
 		
 		uint b0 = d0 > d3;
 		uint b1 = d1 > d2;
@@ -379,14 +378,14 @@ static uint compute_indices4(const Vector3 input_colors[16], const Vector3 & col
 }
 
 
-static uint compute_indices(const Vector3 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
+static uint compute_indices(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
     
     uint indices = 0;
 	for (int i = 0; i < 16; i++) {
-		float d0 = evaluate_mse(palette[0], input_colors[i], color_weights);
-		float d1 = evaluate_mse(palette[1], input_colors[i], color_weights);
-		float d2 = evaluate_mse(palette[2], input_colors[i], color_weights);
-		float d3 = evaluate_mse(palette[3], input_colors[i], color_weights);
+		float d0 = evaluate_mse(palette[0], input_colors[i].xyz(), color_weights);
+		float d1 = evaluate_mse(palette[1], input_colors[i].xyz(), color_weights);
+		float d2 = evaluate_mse(palette[2], input_colors[i].xyz(), color_weights);
+		float d3 = evaluate_mse(palette[3], input_colors[i].xyz(), color_weights);
 		
         uint index;
         if (d0 < d1 && d0 < d2 && d0 < d3) index = 0;
@@ -401,7 +400,7 @@ static uint compute_indices(const Vector3 input_colors[16], const Vector3 & colo
 }
 
 
-static void output_block3(const Vector3 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
+static void output_block3(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
 {
     Color16 color0 = vector3_to_color16(v0);
     Color16 color1 = vector3_to_color16(v1);
@@ -418,7 +417,7 @@ static void output_block3(const Vector3 input_colors[16], const Vector3 & color_
     block->indices = compute_indices(input_colors, color_weights, palette);
 }
 
-static void output_block4(const Vector3 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
+static void output_block4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
 {
     Color16 color0 = vector3_to_color16(v0);
     Color16 color1 = vector3_to_color16(v1);
@@ -515,7 +514,7 @@ float nv::compress_dxt1_least_squares_fit(const Vector3 * input_colors, const Ve
 }*/
 
 
-float nv::compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, int max_volume, BlockDXT1 * output)
+float nv::compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, int max_volume, BlockDXT1 * output)
 {
     // Compute bounding box.
     Vector3 min_color(1.0f);
@@ -586,13 +585,14 @@ float nv::compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16],
                 evaluate_palette4(palette);
             }
             else {
-    #if 1
-                // Evaluate error in 3 color mode.
-                evaluate_palette3(palette);
-    #else
-                // Skip 3 color mode.
-                continue;
-    #endif
+                if (three_color_mode) {
+                    // Evaluate error in 3 color mode.
+                    evaluate_palette3(palette);
+                }
+                else {
+                    // Skip 3 color mode.
+                    continue;
+                }
             }
 
             float error = evaluate_palette_error(palette, colors32, weights, count);
@@ -608,10 +608,6 @@ float nv::compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16],
     output->col0 = best0;
     output->col1 = best1;
 
-    if (output->col0.u < output->col1.u) {
-        int k = 1;
-    }
-
     Vector3 vector_palette[4];
     evaluate_palette(output->col0, output->col1, vector_palette);
 
@@ -621,7 +617,7 @@ float nv::compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16],
 }
 
 
-void nv::compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output)
+void nv::compress_dxt1_cluster_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output)
 {
     ClusterFit fit;
     fit.setColorWeights(Vector4(color_weights, 1));
@@ -631,7 +627,7 @@ void nv::compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3
     Vector3 start, end;
     fit.compress4(&start, &end);
 
-    if (fit.compress3(&start, &end)) {
+    if (three_color_mode && fit.compress3(&start, &end)) {
         output_block3(input_colors, color_weights, start, end, output);
     }
     else {
@@ -642,7 +638,7 @@ void nv::compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3
 
 
 
-float nv::compress_dxt1(const Vector3 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output)
+float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output)
 {
     Vector3 colors[16];
     float weights[16];
@@ -674,7 +670,7 @@ float nv::compress_dxt1(const Vector3 input_colors[16], const float input_weight
     // If high quality:
     if (0) {
         BlockDXT1 exhaustive_output;
-        float exhaustive_error = compress_dxt1_bounding_box_exhaustive(input_colors, colors, weights, count, color_weights, 400, &exhaustive_output);
+        float exhaustive_error = compress_dxt1_bounding_box_exhaustive(input_colors, colors, weights, count, color_weights, three_color_mode, 1400, &exhaustive_output);
 
         if (exhaustive_error != FLT_MAX) {
             float exhaustive_error2 = evaluate_mse(input_colors, input_weights, color_weights, &exhaustive_output);
@@ -700,7 +696,7 @@ float nv::compress_dxt1(const Vector3 input_colors[16], const float input_weight
 
     if (count > 1) {
         BlockDXT1 cluster_fit_output;
-        compress_dxt1_cluster_fit(input_colors, colors, weights, count, color_weights, &cluster_fit_output);
+        compress_dxt1_cluster_fit(input_colors, colors, weights, count, color_weights, three_color_mode, &cluster_fit_output);
 
         float cluster_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &cluster_fit_output);
         
diff --git a/src/nvtt/CompressorDXT1.h b/src/nvtt/CompressorDXT1.h
index daf99ac..c7e51d7 100644
--- a/src/nvtt/CompressorDXT1.h
+++ b/src/nvtt/CompressorDXT1.h
@@ -5,6 +5,7 @@ namespace nv {
     struct ColorBlock;
     struct BlockDXT1;
     class Vector3;
+    class Vector4;
 
     // All these functions return MSE.
 
@@ -12,11 +13,11 @@ namespace nv {
     float compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output);
 
     float compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output);
-    float compress_dxt1_least_squares_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output);
-    float compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, int search_limit, BlockDXT1 * output);
-    void compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output);
+    float compress_dxt1_least_squares_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output);
+    float compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, int search_limit, BlockDXT1 * output);
+    void compress_dxt1_cluster_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output);
 
 
-    float compress_dxt1(const Vector3 colors[16], const float weights[16], const Vector3 & color_weights, BlockDXT1 * output);
+    float compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output);
 
 }
diff --git a/src/nvtt/CompressorDXT5_RGBM.cpp b/src/nvtt/CompressorDXT5_RGBM.cpp
new file mode 100755
index 0000000..21d4b06
--- /dev/null
+++ b/src/nvtt/CompressorDXT5_RGBM.cpp
@@ -0,0 +1,423 @@
+#include "CompressorDXT5_RGBM.h"
+#include "CompressorDXT1.h"
+
+#include "OptimalCompressDXT.h"
+#include "QuickCompressDXT.h"
+
+#include "nvimage/ColorBlock.h"
+#include "nvimage/BlockDXT.h"
+
+#include "nvmath/Color.inl"
+#include "nvmath/Vector.inl"
+#include "nvmath/Fitting.h"
+#include "nvmath/ftoi.h"
+
+#include "nvthread/Atomic.h"
+#include <stdio.h>
+
+using namespace nv;
+
+#if 0   // Debug only: counter for the disabled range report in compress_dxt5_rgbm.
+static uint atomic_counter = 0;
+#endif
+
+// Encodes 16 texels as a DXT5 block in RGBM form: the color block stores rgb / M and the
+// alpha block stores the multiplier M remapped from [min_m, 1] to [0, 1].
+//  input_colors  - source texels; only x, y, z are read, saturated to [0, 1].
+//  input_weights - per-texel weights; if they sum to zero the color fit uses uniform weights.
+//  min_m         - smallest multiplier that can be encoded; must be < 1 (1 - min_m is a divisor).
+//  output        - receives the compressed color and alpha blocks.
+// Returns 0 for now; the compression error is not computed yet.
+float nv::compress_dxt5_rgbm(const Vector4 input_colors[16], const float input_weights[16], float min_m, BlockDXT5 * output) {
+    // Convert to RGBM.
+    Vector4 input_colors_rgbm[16]; // @@ Write over input_colors?
+    float rgb_weights[16];
+    float weight_sum = 0;
+
+    for (uint i = 0; i < 16; i++) {
+        const Vector4 & c = input_colors[i];
+        float R = saturate(c.x);
+        float G = saturate(c.y);
+        float B = saturate(c.z);
+
+        float M = max(max(R, G), max(B, min_m));
+        float r = R / M;
+        float g = G / M;
+        float b = B / M;
+        float a = (M - min_m) / (1 - min_m);
+
+        input_colors_rgbm[i] = Vector4(r, g, b, a);
+        rgb_weights[i] = input_weights[i] * M;
+        weight_sum += input_weights[i];
+    }
+
+    if (weight_sum == 0) {
+        for (uint i = 0; i < 16; i++) rgb_weights[i] = 1;
+    }
+
+    // Compress RGB. Three-color mode is off: DXT5 color blocks always decode as four colors.
+    compress_dxt1(input_colors_rgbm, rgb_weights, Vector3(1), /*three_color_mode=*/false, &output->color);
+
+    // Decompress RGB/M block.
+    nv::ColorBlock RGB;
+    output->color.decodeBlock(&RGB);
+
+    // Compute M values to compensate for RGB's error.
+    AlphaBlock4x4 M;
+    for (int i = 0; i < 16; i++) {
+        const Vector4 & c = input_colors[i];
+        float R = saturate(c.x);
+        float G = saturate(c.y);
+        float B = saturate(c.z);
+
+        float rm = RGB.color(i).r / 255.0f;
+        float gm = RGB.color(i).g / 255.0f;
+        float bm = RGB.color(i).b / 255.0f;
+
+        // Find m such that m * (rm, gm, bm) reproduces the source color (R, G, B).
+        // Three equations, one unknown:
+        //  m * rm == R
+        //  m * gm == G
+        //  m * bm == B
+        // Solve in the least squares sense:
+        //  m (rm gm bm) (rm gm bm)^T == (rm gm bm) (R G B)^T
+        //  m == dot(rgb, RGB) / dot(rgb, rgb)
+        const Vector3 decoded(rm, gm, bm);
+        const float decoded_sq = dot(decoded, decoded);
+
+        // A black decoded texel stays black for any m; use the smallest one instead of 0/0.
+        float m = (decoded_sq > 0.0f) ? dot(decoded, Vector3(R, G, B)) / decoded_sq : min_m;
+
+        m = (m - min_m) / (1 - min_m);
+
+#if 0   // Debug only: reports texels whose fitted multiplier exceeds the encodable range.
+        // Those values are clamped by saturate() below. Disabled to keep stdout clean.
+        if (m > 1.0f) {
+            uint counter = atomicIncrement(&atomic_counter);
+            printf("It happens %u times!", counter);
+        }
+#endif
+
+        M.alpha[i] = U8(ftoi_round(saturate(m) * 255.0f));
+        M.weights[i] = input_weights[i];
+    }
+
+    // Compress M. @@ Use QuickCompress::compressDXT5A for Quality_Fastest once options are passed in.
+    OptimalCompress::compressDXT5A(M, &output->alpha);
+
+#if 0   // Multiple iterations do not seem to help.
+    // Decompress M.
+    output->alpha.decodeBlock(&M);
+
+    // Feed it back to the input RGB block.
+    for (uint i = 0; i < 16; i++) {
+        const Vector4 & c = input_colors[i];
+
+        float R = saturate(c.x);
+        float G = saturate(c.y);
+        float B = saturate(c.z);
+
+        float m = float(M.alpha[i]) / 255.0f * (1 - min_m) + min_m;
+
+        float r = R / m;
+        float g = G / m;
+        float b = B / m;
+        float a = float(M.alpha[i]) / 255.0f;
+
+        input_colors_rgbm[i] = Vector4(r, g, b, a);
+        rgb_weights[i] = input_weights[i] * m;
+    }
+#endif
+
+    return 0; // @@ Compute and return the error.
+}
+
+
+
+
+#if 0
+
+    BlockDXT5 * block = new(output)BlockDXT5;
+
+    // Decompress the color block and find the M values that reproduce the input most closely. This should compensate for some of the DXT errors.
+
+    // Compress the resulting M values optimally.
+
+    // Repeat this several times until compression error does not improve?
+
+    //Vector3 rgb_block[16];
+    //float m_block[16];
+
+
+    // Init RGB/M block.
+#if 0
+    nvsquish::WeightedClusterFit fit;
+
+    ColorBlock rgba;
+    for (int i = 0; i < 16; i++) {
+        const Vector4 & c = src.color(i);
+        float R = saturate(c.x);
+        float G = saturate(c.y);
+        float B = saturate(c.z);
+
+        float M = max(max(R, G), max(B, min_m));
+        float r = R / M;
+        float g = G / M;
+        float b = B / M;
+        float a = c.w;
+
+        rgba.color(i) = toColor32(Vector4(r, g, b, a));
+    }
+
+    if (rgba.isSingleColor())
+    {
+        OptimalCompress::compressDXT1(rgba.color(0), &block->color);
+    }
+    else
+    {
+        nvsquish::WeightedClusterFit fit;
+        fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
+
+        int flags = 0;
+        if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
+
+        nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
+        fit.SetColourSet(&colours, 0);
+        fit.Compress(&block->color);
+    }
+#endif
+#if 1
+    ColorSet rgb;
+    rgb.allocate(4, 4);
+
+    for (uint i = 0; i < 16; i++) {
+        const Vector4 & c = colors[i];
+
+        float R = saturate(c.x);
+        float G = saturate(c.y);
+        float B = saturate(c.z);
+
+        float M = max(max(R, G), max(B, min_m));
+        float r = R / M;
+        float g = G / M;
+        float b = B / M;
+        float a = c.w;
+
+        rgb.colors[i] = Vector4(r, g, b, a);
+        rgb.indices[i] = i;
+        rgb.weights[i] = max(weights[i], 0.001f);// weights[i];   // IC: For some reason 0 weights are causing problems, even if we eliminate the corresponding colors from the set.
+    }
+
+    rgb.createMinimalSet(/*ignoreTransparent=*/true);
+
+    if (rgb.isSingleColor(/*ignoreAlpha=*/true)) {
+        OptimalCompress::compressDXT1(toColor32(rgb.color(0)), &block->color);
+    }
+    else {
+        ClusterFit fit;
+        fit.setColorWeights(compressionOptions.colorWeight);
+        fit.setColorSet(&rgb);
+
+        Vector3 start, end;
+        fit.compress4(&start, &end);
+
+        QuickCompress::outputBlock4(rgb, start, end, &block->color);
+    }
+#endif
+
+    // Decompress RGB/M block.
+    nv::ColorBlock RGB;
+    block->color.decodeBlock(&RGB);
+    
+#if 1
+    AlphaBlock4x4 M;
+    for (int i = 0; i < 16; i++) {
+        const Vector4 & c = colors[i];
+        float R = saturate(c.x);
+        float G = saturate(c.y);
+        float B = saturate(c.z);
+
+        float r = RGB.color(i).r / 255.0f;
+        float g = RGB.color(i).g / 255.0f;
+        float b = RGB.color(i).b / 255.0f;
+
+        float m = (R / r + G / g + B / b) / 3.0f;
+        //float m = max((R / r + G / g + B / b) / 3.0f, min_m);
+        //float m = max(max(R / r, G / g), max(B / b, min_m));
+        //float m = max(max(R, G), max(B, min_m));
+
+        m = (m - min_m) / (1 - min_m);
+
+        M.alpha[i] = U8(ftoi_round(saturate(m) * 255.0f));
+        M.weights[i] = weights[i];
+    }
+
+    // Compress M.
+    if (compressionOptions.quality == Quality_Fastest) {
+        QuickCompress::compressDXT5A(M, &block->alpha);
+    }
+    else {
+        OptimalCompress::compressDXT5A(M, &block->alpha);
+    }
+#else
+    OptimalCompress::compressDXT5A_RGBM(src, RGB, &block->alpha);
+#endif
+
+#if 0
+    // Decompress M.
+    block->alpha.decodeBlock(&M);
+
+    rgb.allocate(src.w, src.h);     // @@ Handle smaller blocks.
+
+    for (uint i = 0; i < src.colorCount; i++) {
+        const Vector4 & c = src.color(i);
+
+        float R = saturate(c.x);
+        float G = saturate(c.y);
+        float B = saturate(c.z);
+
+        //float m = max(max(R, G), max(B, min_m));
+        float m = float(M.alpha[i]) / 255.0f * (1 - min_m) + min_m;
+        float r = R / m;
+        float g = G / m;
+        float b = B / m;
+        float a = c.w;
+
+        rgb.colors[i] = Vector4(r, g, b, a);
+        rgb.indices[i] = i;
+        rgb.weights[i] = max(c.w, 0.001f);// src.weights[i];   // IC: For some reason 0 weights are causing problems, even if we eliminate the corresponding colors from the set.
+    }
+
+    rgb.createMinimalSet(/*ignoreTransparent=*/true);
+
+    if (rgb.isSingleColor(/*ignoreAlpha=*/true)) {
+        OptimalCompress::compressDXT1(toColor32(rgb.color(0)), &block->color);
+    }
+    else {
+        ClusterFit fit;
+        fit.setMetric(compressionOptions.colorWeight);
+        fit.setColourSet(&rgb);
+
+        Vector3 start, end;
+        fit.compress4(&start, &end);
+
+        QuickCompress::outputBlock4(rgb, start, end, &block->color);
+    }
+#endif
+
+#if 0
+    block->color.decodeBlock(&RGB);
+
+    //AlphaBlock4x4 M;
+    //M.initWeights(src);
+    
+    for (int i = 0; i < 16; i++) {
+        const Vector4 & c = src.color(i);
+        float R = saturate(c.x);
+        float G = saturate(c.y);
+        float B = saturate(c.z);
+
+        float r = RGB.color(i).r / 255.0f;
+        float g = RGB.color(i).g / 255.0f;
+        float b = RGB.color(i).b / 255.0f;
+
+        float m = (R / r + G / g + B / b) / 3.0f;
+        //float m = max((R / r + G / g + B / b) / 3.0f, min_m);
+        //float m = max(max(R / r, G / g), max(B / b, min_m));
+        //float m = max(max(R, G), max(B, min_m));
+
+        m = (m - min_m) / (1 - min_m);
+
+        M.alpha[i] = U8(ftoi_round(saturate(m) * 255.0f));
+        M.weights[i] = src.weights[i];
+    }
+
+    // Compress M.
+    if (compressionOptions.quality == Quality_Fastest) {
+        QuickCompress::compressDXT5A(M, &block->alpha);
+    }
+    else {
+        OptimalCompress::compressDXT5A(M, &block->alpha);
+    }
+#endif
+
+
+
+#if 0
+    src.fromRGBM(M, min_m);
+
+    src.createMinimalSet(/*ignoreTransparent=*/true);
+
+    if (src.isSingleColor(/*ignoreAlpha=*/true)) {
+        OptimalCompress::compressDXT1(src.color(0), &block->color);
+    }
+    else {
+        // @@ Use our improved compressor.
+        ClusterFit fit;
+        fit.setMetric(compressionOptions.colorWeight);
+        fit.setColourSet(&src);
+
+        Vector3 start, end;
+        fit.compress4(&start, &end);
+
+        if (fit.compress3(&start, &end)) {
+            QuickCompress::outputBlock3(src, start, end, block->color);
+        }
+        else {
+            QuickCompress::outputBlock4(src, start, end, block->color);
+        }
+    }
+#endif // 0
+
+    // @@ Decompress color and compute M that best approximates src with these colors? Then compress M again?
+
+
+
+    // RGBM encoding.
+    // Maximize precision.
+    // - Number of possible grey levels:
+    //   - Naive:  2^3 = 8
+    //   - Better: 2^3 + 2^2 = 12
+    //   - How to choose min_m? 
+    //     - Ideal = Adaptive per block, don't know where to store.
+    //     - Adaptive per lightmap. How to compute optimal?
+    //     - Fixed: 0.25 in our case. Lightmaps scaled to a fixed [0, 1] range.
+
+    // - Optimal compressor: Interpolation artifacts.
+
+    // - Color transform. 
+    //    - Measure error in post-tone-mapping color space. 
+    //    - Assume a simple tone mapping operator. We know minimum and maximum exposure, but don't know exact exposure in game.
+    //    - Guess based on average lightmap color? Use fixed exposure, in scaled lightmap space.
+
+    // - Enhanced DXT compressor.
+    //    - Typical RGBM encoding as follows:
+    //      rgb -> M = max(rgb), RGB=rgb/M -> RGBM
+    //    - If we add a compression step (M' = M) and M' < M, then rgb may be greater than 1.
+    //      - We could ensure that M' >= M during compression.
+    //      - We could clamp RGB anyway.
+    //      - We could add a fixed scale value to take into account compression errors and avoid clamping.
+
+
+    
+
+
+    // Compress color.
+    /*if (rgba.isSingleColor())
+    {
+        OptimalCompress::compressDXT1(rgba.color(0), &block->color);
+    }
+    else
+    {
+        nvsquish::WeightedClusterFit fit;
+        fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
+
+        int flags = 0;
+        if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
+
+        nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
+        fit.SetColourSet(&colours, 0);
+        fit.Compress(&block->color);
+    }*/
+
+#endif // 0
\ No newline at end of file
diff --git a/src/nvtt/CompressorDXT5_RGBM.h b/src/nvtt/CompressorDXT5_RGBM.h
new file mode 100755
index 0000000..88cf646
--- /dev/null
+++ b/src/nvtt/CompressorDXT5_RGBM.h
@@ -0,0 +1,9 @@
+#pragma once
+namespace nv {
+
+    struct BlockDXT5;
+    class Vector4;
+
+    // Compresses 16 weighted colors into a DXT5 block. NOTE(review): min_m presumably is the lower bound of the RGBM multiplier and the result an error measure; confirm in the .cpp.
+    float compress_dxt5_rgbm(const Vector4 input_colors[16], const float input_weights[16], float min_m, BlockDXT5 * output);
+}
diff --git a/src/nvtt/CompressorRGB.cpp b/src/nvtt/CompressorRGB.cpp
index 442c251..1ef7327 100644
--- a/src/nvtt/CompressorRGB.cpp
+++ b/src/nvtt/CompressorRGB.cpp
@@ -33,6 +33,7 @@
 #include "nvmath/Color.h"
 #include "nvmath/Half.h"
 #include "nvmath/ftoi.h"
+#include "nvmath/Vector.inl"
 
 #include "nvcore/Debug.h"
 
@@ -159,6 +160,164 @@ namespace
     }
 
 
+    // IC: Inf/NaN and denormal handling based on DirectXMath.
+    // Decodes an unsigned 11 bit float (no sign bit) into a 32 bit float.
+    static float fromFloat11(uint u) {
+        // 5 bit exponent (bias 15)
+        // 6 bit mantissa
+        uint E = (u >> 6) & 0x1F;
+        uint M = u & 0x3F;
+
+        Float754 F;
+        F.field.negative = 0;
+
+        if (E == 0x1f) { // INF or NAN, told apart by the mantissa that is copied below.
+            F.field.biasedexponent = 0xFF;
+        }
+        else if (E != 0) { // The value is normalized, only rebias the exponent.
+            F.field.biasedexponent = E + 127 - 15;
+        }
+        else if (M != 0) { // The value is denormalized.
+            // Normalize it: shift the mantissa until its leading one lands on the implicit bit.
+            int e = 1;
+            do {
+                e--;
+                M <<= 1;
+            } while ((M & 0x40) == 0);
+
+            M &= 0x3F;
+            F.field.biasedexponent = uint(e + 127 - 15);
+        }
+        else { // The value is zero.
+            F.field.biasedexponent = 0;
+        }
+
+        F.field.mantissa = M << (23 - 6);
+        return F.value;
+#if 0
+        // X Channel (6-bit mantissa)
+        Mantissa = pSource->xm;
+
+        if ( pSource->xe == 0x1f ) // INF or NAN
+        {
+            Result[0] = 0x7f800000 | (pSource->xm << 17);
+        }
+        else
+        {
+            if ( pSource->xe != 0 ) // The value is normalized
+            {
+                Exponent = pSource->xe;
+            }
+            else if (Mantissa != 0) // The value is denormalized
+            {
+                // Normalize the value in the resulting float
+                Exponent = 1;
+        
+                do
+                {
+                    Exponent--;
+                    Mantissa <<= 1;
+                } while ((Mantissa & 0x40) == 0);
+        
+                Mantissa &= 0x3F;
+            }
+            else // The value is zero
+            {
+                Exponent = (uint32_t)-112;
+            }
+    
+            Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17);
+        }
+    }
+#endif
+    
+    }
+
+    // https://www.opengl.org/registry/specs/EXT/texture_shared_exponent.txt
+    // Packs an RGB color as three 9 bit mantissas that share a single 5 bit exponent.
+    Float3SE toFloat3SE(float r, float g, float b)
+    {
+        const int N = 9;                    // Mantissa bits.
+        const int E = 5;                    // Exponent bits.
+        const int Emax = (1 << E) - 1;      // 31
+        const int B = (1 << (E-1)) - 1;     // 15
+        const float sharedexp_max = float((1 << N) - 1) / (1 << N) * (1 << (Emax-B));   // 65408
+
+        // Clamp color components.
+        r = max(0.0f, min(sharedexp_max, r));
+        g = max(0.0f, min(sharedexp_max, g));
+        b = max(0.0f, min(sharedexp_max, b));
+
+        // Get max component.
+        float max_c = max3(r, g, b);
+
+        // Compute shared exponent. log2f(0) is -inf and cannot be converted to an integer, so black
+        // takes the lower bound of the exponent directly.
+        int exp_shared_p = 1 + B + ((max_c > 0.0f) ? max(-B-1, ftoi_floor(log2f(max_c))) : (-B-1));
+
+        // The scale 2^(exp - B - N) is below one for most inputs; an integer shift by a negative
+        // count is undefined, so divide by it with ldexpf instead.
+        int max_s = ftoi_round(ldexpf(max_c, B + N - exp_shared_p));
+        int exp_shared = exp_shared_p;
+        if (max_s == (1 << N)) exp_shared++;
+
+        Float3SE v;
+        v.e = exp_shared;
+        v.xm = ftoi_round(ldexpf(r, B + N - exp_shared));
+        v.ym = ftoi_round(ldexpf(g, B + N - exp_shared));
+        v.zm = ftoi_round(ldexpf(b, B + N - exp_shared));
+        return v;
+    }
+
+    // Expands a shared exponent color by scaling every mantissa with the same power of two.
+    Vector3 fromFloat3SE(Float3SE v) {
+        Float754 scale;
+        scale.raw = (v.e << 23) + 0x33800000;   // 0x33800000 is the float 2^-24 (biased exponent 103), so this is 2^(e - 24).
+        return scale.value * Vector3(float(v.xm), float(v.ym), float(v.zm));
+    }
+
+    // These are based on: http://www.graphics.cornell.edu/~bjw/rgbe/rgbe.c
+    // Packs an RGB color into 8 bit mantissas (bits 0-23) and a shared 8 bit exponent (bits 24-31).
+    uint toRGBE(float r, float g, float b)
+    {
+        float v = max3(r, g, b);
+
+        // Colors whose largest component is below 1e-32 encode as zero.
+        if (v < 1e-32) {
+            return 0;
+        }
+
+        int e;
+        float scale = frexpf(v, &e) * 256.0f / v;
+        // The largest mantissa can round up to 256 and components can be negative; clamp to the
+        // byte range so that U8() does not wrap around.
+        int ir = max(0, min(255, ftoi_round(r * scale)));
+        int ig = max(0, min(255, ftoi_round(g * scale)));
+        int ib = max(0, min(255, ftoi_round(b * scale)));
+
+        uint rgbe = 0;  // Must start cleared, the fields below are or'ed in.
+        rgbe |= uint(U8(ir)) << 0;
+        rgbe |= uint(U8(ig)) << 8;
+        rgbe |= uint(U8(ib)) << 16;
+        rgbe |= uint(U8(e + 128)) << 24;    // Shift as unsigned, the exponent byte reaches the top bit.
+        return rgbe;
+    }
+
+    // Unpacks a color stored as 8 bit mantissas plus a shared exponent byte. A zero exponent byte decodes to black.
+    Vector3 fromRGBE(uint rgbe) {
+        const int e = int(rgbe >> 24);
+        if (e == 0) return Vector3(0);
+
+        const uint r = (rgbe >> 0) & 0xFF;
+        const uint g = (rgbe >> 8) & 0xFF;
+        const uint b = (rgbe >> 16) & 0xFF;
+
+        // Subtract as signed integers; the unsigned difference would wrap around for exponents below 136.
+        const float scale = ldexpf(1.0f, e - (128 + 8));          // +8 to divide by 256. @@ Shouldn't we divide by 255 instead?
+        return scale * Vector3(float(r), float(g), float(b));
+    }
+
+
     struct BitStream
     {
         BitStream(uint8 * ptr) : ptr(ptr), buffer(0), bits(0) {
@@ -348,6 +507,20 @@ void PixelFormatConverter::compress(nvtt::AlphaMode /*alphaMode*/, uint w, uint
                     else if (asize == 10) stream.putFloat10(a);
                     else stream.putBits(0, asize);
                 }
+                else if (compressionOptions.pixelType == nvtt::PixelType_SharedExp)
+                {
+                    if (rsize == 9 && gsize == 9 && bsize == 9 && asize == 5) {
+                        Float3SE v = toFloat3SE(r, g, b);
+                        stream.putBits(v.v, 32);
+                    }
+                    else if (rsize == 8 && gsize == 8 && bsize == 8 && asize == 8) {
+                        // @@ 
+                    }
+                    else {
+                        // @@ Not supported. Filling with zeros.
+                        stream.putBits(0, bitCount);
+                    }
+                }
                 else
                 {
                     // We first convert to 16 bits, then to the target size. @@ If greater than 16 bits, this will truncate and bitexpand.
diff --git a/src/nvtt/Context.cpp b/src/nvtt/Context.cpp
index fea0017..b1b6e6f 100644
--- a/src/nvtt/Context.cpp
+++ b/src/nvtt/Context.cpp
@@ -447,29 +447,38 @@ bool Compressor::Private::outputHeader(nvtt::TextureType textureType, int w, int
             {
                 const uint bitcount = compressionOptions.getBitCount();
 
-                if (bitcount == 16)
-                {
-                    if (compressionOptions.rsize == 16)
-                    {
-                        header.setDX10Format(56); // R16_UNORM
+                if (compressionOptions.pixelType == PixelType_Float) {
+                    if (compressionOptions.rsize == 16 && compressionOptions.gsize == 16 && compressionOptions.bsize == 16 && compressionOptions.asize == 16) {
+                        header.setDX10Format(DXGI_FORMAT_R16G16B16A16_FLOAT);
                     }
-                    else
-                    {
-                        // B5G6R5_UNORM
-                        // B5G5R5A1_UNORM
+                    else if (compressionOptions.rsize == 11 && compressionOptions.gsize == 11 && compressionOptions.bsize == 10 && compressionOptions.asize == 0) {
+                        header.setDX10Format(DXGI_FORMAT_R11G11B10_FLOAT);
+                    }
+                    else {
                         supported = false;
                     }
                 }
-                else if (bitcount == 32)
-                {
-                    // B8G8R8A8_UNORM
-                    // B8G8R8X8_UNORM
-                    // R8G8B8A8_UNORM
-                    // R10G10B10A2_UNORM
-                    supported = false;
-                }
                 else {
-                    supported = false;
+                    if (bitcount == 16) {
+                        if (compressionOptions.rsize == 16) {
+                            header.setDX10Format(DXGI_FORMAT_R16_UNORM);
+                        }
+                        else {
+                            // B5G6R5_UNORM
+                            // B5G5R5A1_UNORM
+                            supported = false;
+                        }
+                    }
+                    else if (bitcount == 32) {
+                        // B8G8R8A8_UNORM
+                        // B8G8R8X8_UNORM
+                        // R8G8B8A8_UNORM
+                        // R10G10B10A2_UNORM
+                        supported = false;
+                    }
+                    else {
+                        supported = false;
+                    }
                 }
             }
             else
@@ -492,7 +501,7 @@ bool Compressor::Private::outputHeader(nvtt::TextureType textureType, int w, int
                 else if (compressionOptions.format == Format_BC4) {
                     header.setDX10Format(DXGI_FORMAT_BC4_UNORM); // DXGI_FORMAT_BC4_SNORM ?
                 }
-                else if (compressionOptions.format == Format_BC5 || compressionOptions.format == Format_BC5_Luma) {
+                else if (compressionOptions.format == Format_BC5 /*|| compressionOptions.format == Format_BC5_Luma*/) {
                     header.setDX10Format(DXGI_FORMAT_BC5_UNORM); // DXGI_FORMAT_BC5_SNORM ?
                     if (isNormalMap) header.setNormalFlag(true);
                 }
@@ -605,7 +614,7 @@ bool Compressor::Private::outputHeader(nvtt::TextureType textureType, int w, int
                 else if (compressionOptions.format == Format_BC4) {
                     header.setFourCC('A', 'T', 'I', '1');
                 }
-                else if (compressionOptions.format == Format_BC5 || compressionOptions.format == Format_BC5_Luma) {
+                else if (compressionOptions.format == Format_BC5 /*|| compressionOptions.format == Format_BC5_Luma*/) {
                     header.setFourCC('A', 'T', 'I', '2');
                     if (isNormalMap) {
                         header.setNormalFlag(true);
@@ -773,10 +782,10 @@ CompressorInterface * Compressor::Private::chooseCpuCompressor(const Compression
     {
         return new CompressorBC7;
     }
-    else if (compressionOptions.format == Format_BC5_Luma)
+    /*else if (compressionOptions.format == Format_BC5_Luma)
     {
         return new ProductionCompressorBC5_Luma;
-    }
+    }*/
     else if (compressionOptions.format == Format_BC3_RGBM)
     {
         return new CompressorBC3_RGBM;
diff --git a/src/nvtt/OptimalCompressDXT.cpp b/src/nvtt/OptimalCompressDXT.cpp
index 602b6af..4c3731e 100644
--- a/src/nvtt/OptimalCompressDXT.cpp
+++ b/src/nvtt/OptimalCompressDXT.cpp
@@ -614,7 +614,7 @@ void OptimalCompress::compressDXT5A(const ColorBlock & src, AlphaBlockDXT5 * dst
     compressDXT5A(tmp, dst);
 }
 
-
+#if 0
 #include "nvmath/Vector.inl"
 #include "nvmath/ftoi.h"
 const float threshold = 0.15f;
@@ -809,3 +809,4 @@ void OptimalCompress::compressDXT5A_RGBM(const ColorSet & src, const ColorBlock
 
     computeAlphaIndices_RGBM(src, RGB, dst);
 }
+#endif // 0
\ No newline at end of file
diff --git a/src/nvtt/QuickCompressDXT.cpp b/src/nvtt/QuickCompressDXT.cpp
index 8390610..4676fee 100644
--- a/src/nvtt/QuickCompressDXT.cpp
+++ b/src/nvtt/QuickCompressDXT.cpp
@@ -227,6 +227,7 @@ inline static uint computeIndices4(const Vector3 block[16], Vector3::Arg maxColo
 }
 
 // maxColor and minColor are expected to be in the same range as the color set.
+/*
 inline static uint computeIndices4(const ColorSet & set, Vector3::Arg maxColor, Vector3::Arg minColor)
 {
 	Vector3 palette[4];
@@ -290,7 +291,7 @@ inline static uint computeIndices4(const ColorSet & set, Vector3::Arg maxColor,
 	}
 
 	return indices;
-}
+}*/
 
 inline static float evaluatePaletteError4(const Vector3 block[16], Vector3::Arg maxColor, Vector3::Arg minColor)
 {
@@ -341,7 +342,7 @@ inline static float evaluatePaletteError3(const Vector3 block[16], Vector3::Arg
 
 
 // maxColor and minColor are expected to be in the same range as the color set.
-inline static uint computeIndices3(const ColorSet & set, Vector3::Arg maxColor, Vector3::Arg minColor)
+/*inline static uint computeIndices3(const ColorSet & set, Vector3::Arg maxColor, Vector3::Arg minColor)
 {
 	Vector3 palette[4];
 	palette[0] = minColor;
@@ -372,7 +373,7 @@ inline static uint computeIndices3(const ColorSet & set, Vector3::Arg maxColor,
 	}
 
 	return indices;
-}
+}*/
 
 inline static uint computeIndices3(const Vector3 block[16], Vector3::Arg maxColor, Vector3::Arg minColor)
 {
@@ -827,7 +828,7 @@ void QuickCompress::compressDXT5(const ColorBlock & rgba, BlockDXT5 * dxtBlock,
 
 
 
-void QuickCompress::outputBlock4(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block)
+/*void QuickCompress::outputBlock4(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block)
 {
     Vector3 minColor = start * 255.0f;
     Vector3 maxColor = end * 255.0f;
@@ -866,4 +867,4 @@ void QuickCompress::outputBlock3(const ColorSet & set, const Vector3 & start, co
 
     //optimizeEndPoints3(set, block);
 }
-
+*/
diff --git a/src/nvtt/Surface.cpp b/src/nvtt/Surface.cpp
index 51d1521..aa612ee 100644
--- a/src/nvtt/Surface.cpp
+++ b/src/nvtt/Surface.cpp
@@ -37,6 +37,7 @@
 #include "nvimage/ColorBlock.h"
 #include "nvimage/PixelFormat.h"
 #include "nvimage/ErrorMetric.h"
+#include "nvimage/DirectDrawSurface.h"
 
 #include <float.h>
 #include <string.h> // memset, memcpy
@@ -85,7 +86,7 @@ namespace
         else if (format == Format_BC4) {
             return 8;
         }
-        else if (format == Format_BC5 || format == Format_BC5_Luma) {
+        else if (format == Format_BC5 /*|| format == Format_BC5_Luma*/) {
             return 16;
         }
         else if (format == Format_CTX1) {
@@ -469,11 +470,66 @@ void Surface::range(int channel, float * rangeMin, float * rangeMax, int alpha_c
     *rangeMax = range.y;
 }
 
-
 bool Surface::load(const char * fileName, bool * hasAlpha/*= NULL*/)
 {
     AutoPtr<FloatImage> img(ImageIO::loadFloat(fileName));
     if (img == NULL) {
+        // Try loading as DDS.
+        if (nv::strEqual(nv::Path::extension(fileName), ".dds")) {
+            nv::DirectDrawSurface dds;
+            if (dds.load(fileName)) {
+                if (dds.header.isBlockFormat()) {
+                    int w = dds.surfaceWidth(0);
+                    int h = dds.surfaceHeight(0);
+                    uint size = dds.surfaceSize(0);
+
+                    void * data = malloc(size);
+                    dds.readSurface(0, 0, data, size);
+
+                    // @@ Handle all formats! @@ Get nvtt format from dds.surfaceFormat() ?
+
+                    if (dds.header.hasDX10Header()) {
+                        if (dds.header.header10.dxgiFormat == DXGI_FORMAT_BC6H_UF16) {
+                            this->setImage2D(nvtt::Format_BC6, nvtt::Decoder_D3D10, w, h, data);
+                        }
+                        else {
+                            // @@
+                            nvCheck(false);
+                        }
+                    }
+                    else {
+                        uint fourcc = dds.header.pf.fourcc;
+                        if (fourcc == FOURCC_DXT1) {
+                            this->setImage2D(nvtt::Format_BC1, nvtt::Decoder_D3D10, w, h, data);
+                        }
+                        else if (fourcc == FOURCC_DXT5) {
+                            this->setImage2D(nvtt::Format_BC3, nvtt::Decoder_D3D10, w, h, data);
+                        }
+                        else {
+                            // @@ 
+                            nvCheck(false);
+                        }
+                    }
+
+                    free(data);
+                }
+                else {
+                    Image img;
+                    dds.mipmap(&img, /*face=*/0, /*mipmap=*/0);
+
+                    int w = img.width();
+                    int h = img.height();
+                    int d = img.depth();
+
+                    // @@ Add support for all pixel formats.
+
+                    this->setImage(nvtt::InputFormat_BGRA_8UB, w, h, d, img.pixels());
+                }
+
+                return true;
+            }
+        }
+
         return false;
     }
 
@@ -768,22 +824,22 @@ bool Surface::setImage2D(Format format, Decoder decoder, int w, int h, const voi
 			{
 				for (int x = 0; x < bw; x++)
 				{
-					ColorSet colors;
-					const BlockBC6 * block = (const BlockBC6 *)ptr;
-					block->decodeBlock(&colors);
+                    Vector3 colors[16];
+                    const BlockBC6 * block = (const BlockBC6 *)ptr;
+					block->decodeBlock(colors);
 
 					for (int yy = 0; yy < 4; yy++)
 					{
 						for (int xx = 0; xx < 4; xx++)
 						{
-							Vector4 rgba = colors.colors[yy*4 + xx];
+							Vector3 rgb = colors[yy*4 + xx];
 
 							if (x * 4 + xx < w && y * 4 + yy < h)
 							{
-								m->image->pixel(0, x*4 + xx, y*4 + yy, 0) = rgba.x;
-								m->image->pixel(1, x*4 + xx, y*4 + yy, 0) = rgba.y;
-								m->image->pixel(2, x*4 + xx, y*4 + yy, 0) = rgba.z;
-								m->image->pixel(3, x*4 + xx, y*4 + yy, 0) = rgba.w;
+								m->image->pixel(0, x*4 + xx, y*4 + yy, 0) = rgb.x;
+								m->image->pixel(1, x*4 + xx, y*4 + yy, 0) = rgb.y;
+								m->image->pixel(2, x*4 + xx, y*4 + yy, 0) = rgb.z;
+								m->image->pixel(3, x*4 + xx, y*4 + yy, 0) = 1.0f;
 							}
 						}
 					}
@@ -1579,25 +1635,32 @@ void Surface::toRGBM(float range/*= 1*/, float threshold/*= 0.25*/)
         float bestM;
         float bestError = FLT_MAX;
 
+        //float range = 15;  // 4 bit quantization.
+        //int irange = 16;
+        float range = 255;  // 8 bit quantization.
+        int irange = 256;
+
+
         float M = max(max(R, G), max(B, threshold));
-        int iM = ftoi_ceil((M - threshold) / (1 - threshold) * 255.0f);
+        int iM = ftoi_ceil((M - threshold) / (1 - threshold) * range);
 
         //for (int m = 0; m < 256; m++) {                           // If we use the entire search space, interpolation errors are very likely to occur.
-        for (int m = max(iM-16, 0); m < min(iM+16, 256); m++) {     // If we constrain the search space, these errors disappear.
-            float fm = float(m) / 255.0f;
+        for (int m = max(iM-16, 0); m < min(iM+16, irange); m++) {     // If we constrain the search space, these errors disappear.
+        //for (int m = max(iM-4, 0); m < min(iM+4, irange); m++) {     // If we constrain the search space, these errors disappear.
+            float fm = float(m) / range;
 
             // Decode M
             float M = fm * (1 - threshold) + threshold;
 
             // Encode.
-            int ir = ftoi_round(255.0f * nv::saturate(R / M));
-            int ig = ftoi_round(255.0f * nv::saturate(G / M));
-            int ib = ftoi_round(255.0f * nv::saturate(B / M));
+            int ir = ftoi_round(range * nv::saturate(R / M));
+            int ig = ftoi_round(range * nv::saturate(G / M));
+            int ib = ftoi_round(range * nv::saturate(B / M));
 
             // Decode.
-            float fr = (float(ir) / 255.0f) * M;
-            float fg = (float(ig) / 255.0f) * M;
-            float fb = (float(ib) / 255.0f) * M;
+            float fr = (float(ir) / range) * M;
+            float fg = (float(ig) / range) * M;
+            float fb = (float(ib) / range) * M;
 
             // Measure error.
             float error = square(R-fr) + square(G-fg) + square(B-fb);
@@ -2961,3 +3024,189 @@ float nvtt::rmsToneMappedError(const Surface & reference, const Surface & img, f
     return nv::rmsColorError(r.m->image, i.m->image, reference.alphaMode() == nvtt::AlphaMode_Transparency);
 }
 
+
+// Measures the value range of the three color channels and forwards to the ranged overload; the upper bound is capped at 16.
+Surface nvtt::histogram(const Surface & img, int width, int height)
+{
+    float lo[3], hi[3];
+    for (int c = 0; c < 3; c++) {
+        img.range(c, &lo[c], &hi[c]);
+    }
+
+    float minRange = nv::min3(lo[0], lo[1], lo[2]);     // Not forwarded yet, the range starts at zero.
+    float maxRange = nv::max3(hi[0], hi[1], hi[2]);
+    if (maxRange > 16) maxRange = 16;
+
+    return histogram(img, /*minRange*/0, maxRange, width, height);
+}
+
+#include "nvcore/Array.inl"
+#include "nvmath/PackedFloat.h"
+#include <stdio.h>
+
+nvtt::Surface nvtt::histogram(const Surface & img, float minRange, float maxRange, int width, int height)
+{
+    nv::Array<Vector3> buckets;
+    buckets.resize(width, Vector3(0));
+    // Only the disabled per-pixel branch reads the image; the enabled branch ignores img, minRange and maxRange.
+    int w = img.width();
+    int h = img.height();
+    int d = img.depth();
+
+    const float * r = img.channel(0);
+    const float * g = img.channel(1);
+    const float * b = img.channel(2);
+    const float * a = img.channel(3);
+
+#if 0
+    for (int z = 0; z < d; z++)
+    for (int y = 0; y < h; y++)
+    for (int x = 0; x < w; x++)
+    {
+        int i = x + y * w + z * w * h;  // A slice holds w * h pixels.
+
+        float fr = (r[i] - minRange) / (maxRange - minRange);
+        float fg = (g[i] - minRange) / (maxRange - minRange);
+        float fb = (b[i] - minRange) / (maxRange - minRange);
+
+        int R = ftoi_round(fr * (width - 1));
+        int G = ftoi_round(fg * (width - 1));
+        int B = ftoi_round(fb * (width - 1));
+
+        R = nv::clamp(R, 0, width-1);
+        G = nv::clamp(G, 0, width-1);
+        B = nv::clamp(B, 0, width-1);
+
+        // Alpha weighted histogram?
+        float A = nv::saturate(a[i]);
+
+        buckets[R].x += A;
+        buckets[G].y += A;
+        buckets[B].z += A;
+    }
+
+#elif 1
+    // Histogram of the tone mapped values of every normalized float with E exponent bits and M mantissa bits.
+    float exposure = 0.22f;
+
+    //int E = 8, M = 23;    // float
+    int E = 5, M = 10;    // half
+    //int E = 5, M = 9;     // rgb9e5
+    //int E = 5, M = 6;     // r11g11b10
+
+    for (int e = 0; e < (1 << E); e++)
+    {
+        /*if (e == 0x1f) {    // Skip NaN and inf.
+            continue;
+        }*/
+        if (e == 0) {       // Skip denormals.
+            continue;
+        }
+
+        for (int m = 0; m < (1 << M); m++)
+        {
+            Float754 F;
+            F.field.negative = 0;
+            F.field.biasedexponent = e + 127 - ((1 << (E - 1)) - 1);    // Rebias: E=5 -> e + 127 - 15
+            F.field.mantissa = m << (23 - M);
+
+            // value = (1 + mantissa) * 2^(e-15)
+
+            // @@ Handle denormals.
+
+            float fc = F.value;
+
+            // Tone mapping:
+            fc /= exposure;
+            //fc /= (fc + 1);             // Reinhard tone mapping.
+            fc = 1 - exp2f(-fc);        // Halo2 tone mapping.
+
+            // Gamma space conversion:
+            //fc = sqrtf(fc);
+            fc = powf(fc, 1.0f/2.2f);
+            //fc = toSrgb(fc);
+
+            //fc = (fc - 0.5f) * 8; // zoom in
+            //if (fc < 0 || fc > 1) continue;
+
+            //printf("%f\n", fc);
+
+            int c = ftoi_round(fc * (width - 1) / 1);
+            c = clamp(c, 0, width - 1);
+
+            buckets[c] += Vector3(1);
+        }
+    }
+
+#else
+    // Histogram of the tone mapped products of an R bit color and an M bit multiplier (presumably an RGBM encoding).
+    float exposure = 0.22f;
+
+    int R = 8, M = 8;
+    //int R = 6, M = 8;
+    //int R = 9, M = 5;
+
+    float threshold = 1.0f / (1 << M);
+    //float threshold = 0.25f;
+
+    for (int r = 0; r < (1 << R); r++)
+    {
+        float fr = float(r) / ((1 << R) - 1);
+
+        for (int m = 0; m < (1 << M); m++)
+        {
+            float fm = float(m) / ((1 << M) - 1);
+            float M = fm * (1 - threshold) + threshold;
+
+            float fc = fr * M;
+
+            fc /= exposure;
+
+            //fc /= (fc + 1);             // Reinhard tone mapping.
+            fc = 1 - exp2f(-fc);        // Halo2 tone mapping.
+
+            // Gamma space conversion:
+            //fc = sqrtf(fc);
+            fc = powf(fc, 1.0f/2.2f);
+            //fc = toSrgb(fc);
+
+            //fc = (fc - 0.5f) * 8; // zoom in
+            //if (fc < 0 || fc > 1) continue;
+
+            int c = ftoi_round(fc * (width - 1));
+            c = clamp(c, 0, width - 1);
+
+            buckets[c] += Vector3(1);
+        }
+    }
+
+    //buckets[0] = Vector3(1);    // Hack, for prettier histograms.
+
+#endif
+
+
+    // Compute largest height.
+    float maxh = 0;
+    for (int i = 0; i < width; i++) {
+        maxh = nv::max(maxh, nv::max3(buckets[i].x, buckets[i].y, buckets[i].z));
+    }
+
+    printf("maxh = %f\n", maxh);
+    //maxh = 80;
+    maxh = 256;     // Fixed vertical scale; overrides the maximum computed above.
+
+    // Draw histogram.
+    nvtt::Surface hist;
+    hist.setImage(width, height, 1);
+    // A pixel is set to 1 when its height lies below the scaled bucket count, otherwise to 0.
+    for (int y = 0; y < height; y++) {
+        float fy = 1.0f - float(y) / (height - 1);      // 1 at the top row, 0 at the bottom row.
+        for (int x = 0; x < width; x++) {
+            hist.m->image->pixel(0, x, y, /*z=*/0) = fy < (buckets[x].x / maxh);
+            hist.m->image->pixel(1, x, y, /*z=*/0) = fy < (buckets[x].y / maxh);
+            hist.m->image->pixel(2, x, y, /*z=*/0) = fy < (buckets[x].z / maxh);
+        }
+    }
+
+    return hist;
+}
diff --git a/src/nvtt/nvtt.cpp b/src/nvtt/nvtt.cpp
index 83b9aac..b85d52e 100644
--- a/src/nvtt/nvtt.cpp
+++ b/src/nvtt/nvtt.cpp
@@ -23,12 +23,14 @@
 // OTHER DEALINGS IN THE SOFTWARE.
 
 #include "nvtt.h"
+#include "nvcore/nvcore.h"
 
 using namespace nvtt;
 
-/// Return a string for the given error.
+// Return a string for the given error.
 const char * nvtt::errorString(Error e)
 {
+    NV_COMPILER_CHECK(Error_Count == 7);
     switch(e)
     {
         case Error_Unknown:
@@ -50,11 +52,8 @@ const char * nvtt::errorString(Error e)
     return "Invalid error";
 }
 
-/// Return NVTT version.
+// Return NVTT version.
 unsigned int nvtt::version()
 {
     return NVTT_VERSION;
 }
-
-
-
diff --git a/src/nvtt/nvtt.h b/src/nvtt/nvtt.h
index 0195362..8724d42 100644
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@@ -71,7 +71,7 @@ namespace nvtt
     struct CubeSurface;
 
 
-    // Supported compression formats.
+    // Supported block-compression formats.
     // @@ I wish I had distinguished between "formats" and compressors.
     // That is:
     // - 'DXT1' is a format 'DXT1a' and 'DXT1n' are DXT1 compressors.
@@ -79,7 +79,7 @@ namespace nvtt
     // Having multiple enums for the same ids only creates confusion. Clean this up.
     enum Format
     {
-        // No compression.
+        // No block-compression (linear).
         Format_RGB,
         Format_RGBA = Format_RGB,
 
@@ -105,7 +105,7 @@ namespace nvtt
         Format_BC6,     // Not supported yet.
         Format_BC7,     // Not supported yet.
 
-        Format_BC5_Luma,    // Two DXT alpha blocks encoding a single float.
+        //Format_BC5_Luma,    // Two DXT alpha blocks encoding a single float.
         Format_BC3_RGBM,    // 
 
         Format_Count
@@ -120,6 +120,7 @@ namespace nvtt
         PixelType_SignedInt = 3,    // Not supported yet.
         PixelType_Float = 4,
         PixelType_UnsignedFloat = 5,
+        PixelType_SharedExp = 6,    // Shared exponent.
     };
 
     // Quality modes.
@@ -309,7 +310,7 @@ namespace nvtt
         // Output data. Compressed data is output as soon as it's generated to minimize memory allocations.
         virtual bool writeData(const void * data, int size) = 0;
 
-        // Indicate the end of a the compressed image. (New in NVTT 2.1)
+        // Indicate the end of the compressed image. (New in NVTT 2.1)
         virtual void endImage() = 0;
     };
 
@@ -323,6 +324,7 @@ namespace nvtt
         Error_FileOpen,
         Error_FileWrite,
         Error_UnsupportedOutputFormat,
+        Error_Count
     };
 
     // Error handler.
@@ -660,6 +662,10 @@ namespace nvtt
 
     NVTT_API float rmsToneMappedError(const Surface & reference, const Surface & img, float exposure);
 
+
+    NVTT_API Surface histogram(const Surface & img, int width, int height);
+    NVTT_API Surface histogram(const Surface & img, float minRange, float maxRange, int width, int height);
+
 } // nvtt namespace
 
 #endif // NVTT_H
diff --git a/src/nvtt/tools/compress.cpp b/src/nvtt/tools/compress.cpp
index c7d662d..3a3ce7c 100644
--- a/src/nvtt/tools/compress.cpp
+++ b/src/nvtt/tools/compress.cpp
@@ -152,6 +152,8 @@ int main(int argc, char *argv[])
     bool premultiplyAlpha = false;
     nvtt::MipmapFilter mipmapFilter = nvtt::MipmapFilter_Box;
     bool loadAsFloat = false;
+    bool rgbm = false;
+    bool rangescale = false;
 
     const char * externalCompressor = NULL;
 
@@ -209,6 +211,15 @@ int main(int argc, char *argv[])
         {
             loadAsFloat = true;
         }
+        else if (strcmp("-rgbm", argv[i]) == 0)
+        {
+            rgbm = true;
+        }
+        else if (strcmp("-rangescale", argv[i]) == 0)
+        {
+            rangescale = true;
+        }
+
 
         // Compression options.
         else if (strcmp("-fast", argv[i]) == 0)
@@ -269,6 +280,11 @@ int main(int argc, char *argv[])
         {
             format = nvtt::Format_BC7;
         }
+        else if (strcmp("-bc3_rgbm", argv[i]) == 0)
+        {
+            format = nvtt::Format_BC3_RGBM;
+            rgbm = true;
+        }
 
         // Undocumented option. Mainly used for testing.
         else if (strcmp("-ext", argv[i]) == 0)
@@ -332,32 +348,35 @@ int main(int argc, char *argv[])
         printf("usage: nvcompress [options] infile [outfile.dds]\n\n");
 
         printf("Input options:\n");
-        printf("  -color     \tThe input image is a color map (default).\n");
-        printf("  -alpha     \tThe input image has an alpha channel used for transparency.\n");
-        printf("  -normal    \tThe input image is a normal map.\n");
-        printf("  -tonormal  \tConvert input to normal map.\n");
-        printf("  -clamp     \tClamp wrapping mode (default).\n");
-        printf("  -repeat    \tRepeat wrapping mode.\n");
-        printf("  -nomips    \tDisable mipmap generation.\n");
-        printf("  -premula   \tPremultiply alpha into color channel.\n");
-        printf("  -mipfilter \tMipmap filter. One of the following: box, triangle, kaiser.\n");
-        printf("  -float     \tLoad as floating point image.\n\n");
+        printf("  -color        The input image is a color map (default).\n");
+        printf("  -alpha        The input image has an alpha channel used for transparency.\n");
+        printf("  -normal       The input image is a normal map.\n");
+        printf("  -tonormal     Convert input to normal map.\n");
+        printf("  -clamp        Clamp wrapping mode (default).\n");
+        printf("  -repeat       Repeat wrapping mode.\n");
+        printf("  -nomips       Disable mipmap generation.\n");
+        printf("  -premula      Premultiply alpha into color channel.\n");
+        printf("  -mipfilter    Mipmap filter. One of the following: box, triangle, kaiser.\n");
+        printf("  -float        Load as floating point image.\n");
+        printf("  -rgbm         Transform input to RGBM.\n");
+        printf("  -rangescale   Scale image to use entire color range.\n\n");  // Blank line only after the last input option.
 
         printf("Compression options:\n");
-        printf("  -fast    \tFast compression.\n");
-        printf("  -nocuda  \tDo not use cuda compressor.\n");
-        printf("  -rgb     \tRGBA format\n");
-        printf("  -lumi    \tLUMINANCE format\n");
-        printf("  -bc1     \tBC1 format (DXT1)\n");
-        printf("  -bc1n    \tBC1 normal map format (DXT1nm)\n");
-        printf("  -bc1a    \tBC1 format with binary alpha (DXT1a)\n");
-        printf("  -bc2     \tBC2 format (DXT3)\n");
-        printf("  -bc3     \tBC3 format (DXT5)\n");
-        printf("  -bc3n    \tBC3 normal map format (DXT5nm)\n");
-        printf("  -bc4     \tBC4 format (ATI1)\n");
-        printf("  -bc5     \tBC5 format (3Dc/ATI2)\n");
-        printf("  -bc6     \tBC6 format\n");
-        printf("  -bc7     \tBC7 format\n\n");
+        printf("  -fast         Fast compression.\n");
+        printf("  -nocuda       Do not use cuda compressor.\n");
+        printf("  -rgb          RGBA format\n");
+        printf("  -lumi         LUMINANCE format\n");
+        printf("  -bc1          BC1 format (DXT1)\n");
+        printf("  -bc1n         BC1 normal map format (DXT1nm)\n");
+        printf("  -bc1a         BC1 format with binary alpha (DXT1a)\n");
+        printf("  -bc2          BC2 format (DXT3)\n");
+        printf("  -bc3          BC3 format (DXT5)\n");
+        printf("  -bc3n         BC3 normal map format (DXT5nm)\n");
+        printf("  -bc4          BC4 format (ATI1)\n");
+        printf("  -bc5          BC5 format (3Dc/ATI2)\n");
+        printf("  -bc6          BC6 format\n");
+        printf("  -bc7          BC7 format\n");
+        printf("  -bc3_rgbm     BC3-rgbm format\n\n");  // Blank line only after the last compression option.
 
         printf("Output options:\n");
         printf("  -silent  \tDo not output progress messages\n");
@@ -376,145 +395,211 @@ int main(int argc, char *argv[])
     // Set input options.
     nvtt::InputOptions inputOptions;
 
-    if (nv::strCaseDiff(input.extension(), ".dds") == 0)
-    {
-        // Load surface.
-        nv::DirectDrawSurface dds(input.str());
-        if (!dds.isValid())
-        {
-            fprintf(stderr, "The file '%s' is not a valid DDS file.\n", input.str());
-            return EXIT_FAILURE;
-        }
+    bool useSurface = false;    // @@ use Surface API in all cases!
+    nvtt::Surface image;
 
-        if (!dds.isSupported())
-        {
-            fprintf(stderr, "The file '%s' is not a supported DDS file.\n", input.str());
+    if (format == nvtt::Format_BC3_RGBM || rgbm) {    // RGBM path only; BC6 and all other formats use the branches below.
+        useSurface = true;
+
+        if (!image.load(input.str())) {
+            fprintf(stderr, "Error opening input file '%s'.\n", input.str());
             return EXIT_FAILURE;
         }
 
-        uint faceCount;
-        if (dds.isTexture2D())
-        {
-            inputOptions.setTextureLayout(nvtt::TextureType_2D, dds.width(), dds.height());
-            faceCount = 1;
-        }
-        else if (dds.isTexture3D())
-        {
-            inputOptions.setTextureLayout(nvtt::TextureType_3D, dds.width(), dds.height(), dds.depth());
-            faceCount = 1;
+        if (rangescale) {
+            // get color range
+            float min_color[3], max_color[3];
+            image.range(0, &min_color[0], &max_color[0]);
+            image.range(1, &min_color[1], &max_color[1]);
+            image.range(2, &min_color[2], &max_color[2]);
 
-            nvDebugBreak();
-        }
-        else 
-        {
-            nvDebugCheck(dds.isTextureCube());
-            inputOptions.setTextureLayout(nvtt::TextureType_Cube, dds.width(), dds.height());
-            faceCount = 6;
-        }
-
-        uint mipmapCount = dds.mipmapCount();
+            //printf("Color range = %.2f %.2f %.2f\n", max_color[0], max_color[1], max_color[2]);
 
-        nv::Image mipmap;
+            float color_range = nv::max3(max_color[0], max_color[1], max_color[2]);
+            const float max_color_range = 16.0f;
 
-        for (uint f = 0; f < faceCount; f++)
-        {
-            for (uint m = 0; m < mipmapCount; m++)
-            {
-                dds.mipmap(&mipmap, f, m); // @@ Load as float.
+            if (color_range > max_color_range) {
+                //Log::print("Clamping color range %f to %f\n", color_range, max_color_range);
+                color_range = max_color_range;
+            }
+            //color_range = max_color_range;  // Use a fixed color range for now.
 
-                inputOptions.setMipmapData(mipmap.pixels(), mipmap.width(), mipmap.height(), mipmap.depth(), f, m);
+            for (int i = 0; i < 3; i++) {
+                image.scaleBias(i, (color_range > 0.0f) ? 1.0f / color_range : 1.0f, 0.0f);  // Leave the channel unscaled when the range is not positive (e.g. all-black image) instead of dividing by zero.
+            }
+            image.toneMap(nvtt::ToneMapper_Linear, /*parameters=*/NULL); // Clamp without changing the hue.
+
+            // Clamp alpha.
+            image.clamp(3);
+        }
+
+        if (alpha) {
+            image.setAlphaMode(nvtt::AlphaMode_Transparency);
+        }
+
+        // To gamma.
+        image.toGamma(2);
+
+        if (format != nvtt::Format_BC3_RGBM) {
+            image.setAlphaMode(nvtt::AlphaMode_None);
+            image.toRGBM(1, 0.15f);
         }
     }
-    else
-    {
-        if (nv::strCaseDiff(input.extension(), ".exr") == 0 || nv::strCaseDiff(input.extension(), ".hdr") == 0)
-        {
-            loadAsFloat = true;
+    else if (format == nvtt::Format_BC6) {
+        //format = nvtt::Format_BC1;
+        //fprintf(stderr, "BLABLABLA.\n");
+        useSurface = true;
+
+        if (!image.load(input.str())) {
+            fprintf(stderr, "Error opening input file '%s'.\n", input.str());
+            return EXIT_FAILURE;
         }
 
-        if (loadAsFloat)
+        image.setAlphaMode(nvtt::AlphaMode_Transparency);
+    }
+    else {
+        if (nv::strCaseDiff(input.extension(), ".dds") == 0)
         {
-            nv::AutoPtr<nv::FloatImage> image(nv::ImageIO::loadFloat(input.str()));
+            // Load surface.
+            nv::DirectDrawSurface dds(input.str());
+            if (!dds.isValid())
+            {
+                fprintf(stderr, "The file '%s' is not a valid DDS file.\n", input.str());
+                return EXIT_FAILURE;
+            }
 
-            if (image == NULL)
+            if (!dds.isSupported())
             {
-                fprintf(stderr, "The file '%s' is not a supported image type.\n", input.str());
+                fprintf(stderr, "The file '%s' is not a supported DDS file.\n", input.str());
                 return EXIT_FAILURE;
             }
 
-            inputOptions.setFormat(nvtt::InputFormat_RGBA_32F);
-            inputOptions.setTextureLayout(nvtt::TextureType_2D, image->width(), image->height());
+            uint faceCount;
+            if (dds.isTexture2D())
+            {
+                inputOptions.setTextureLayout(nvtt::TextureType_2D, dds.width(), dds.height());
+                faceCount = 1;
+            }
+            else if (dds.isTexture3D())
+            {
+                inputOptions.setTextureLayout(nvtt::TextureType_3D, dds.width(), dds.height(), dds.depth());
+                faceCount = 1;
+
+                nvDebugBreak();
+            }
+            else 
+            {
+                nvDebugCheck(dds.isTextureCube());
+                inputOptions.setTextureLayout(nvtt::TextureType_Cube, dds.width(), dds.height());
+                faceCount = 6;
+            }
+
+            uint mipmapCount = dds.mipmapCount();
+
+            nv::Image mipmap;
 
-            /*for (uint i = 0; i < image->componentNum(); i++)
+            for (uint f = 0; f < faceCount; f++)
             {
-                inputOptions.setMipmapChannelData(image->channel(i), i, image->width(), image->height());
-            }*/
+                for (uint m = 0; m < mipmapCount; m++)
+                {
+                    dds.mipmap(&mipmap, f, m); // @@ Load as float.
+
+                    inputOptions.setMipmapData(mipmap.pixels(), mipmap.width(), mipmap.height(), mipmap.depth(), f, m);
+                }
+            }
         }
         else
         {
-            // Regular image.
-            nv::Image image;
-            if (!image.load(input.str()))
+            if (nv::strCaseDiff(input.extension(), ".exr") == 0 || nv::strCaseDiff(input.extension(), ".hdr") == 0)
             {
-                fprintf(stderr, "The file '%s' is not a supported image type.\n", input.str());
-                return 1;
+                loadAsFloat = true;
             }
 
-            inputOptions.setTextureLayout(nvtt::TextureType_2D, image.width(), image.height());
-            inputOptions.setMipmapData(image.pixels(), image.width(), image.height());
+            if (loadAsFloat)
+            {
+                nv::AutoPtr<nv::FloatImage> image(nv::ImageIO::loadFloat(input.str()));
+
+                if (image == NULL)
+                {
+                    fprintf(stderr, "The file '%s' is not a supported image type.\n", input.str());
+                    return EXIT_FAILURE;
+                }
+
+                inputOptions.setFormat(nvtt::InputFormat_RGBA_32F);
+                inputOptions.setTextureLayout(nvtt::TextureType_2D, image->width(), image->height());
+
+                /*for (uint i = 0; i < image->componentNum(); i++)
+                {
+                    inputOptions.setMipmapChannelData(image->channel(i), i, image->width(), image->height());
+                }*/
+            }
+            else
+            {
+                // Regular image.
+                nv::Image image;
+                if (!image.load(input.str()))
+                {
+                    fprintf(stderr, "The file '%s' is not a supported image type.\n", input.str());
+                    return 1;
+                }
+
+                inputOptions.setTextureLayout(nvtt::TextureType_2D, image.width(), image.height());
+                inputOptions.setMipmapData(image.pixels(), image.width(), image.height());
+            }
         }
-    }
 
-    if (wrapRepeat)
-    {
-        inputOptions.setWrapMode(nvtt::WrapMode_Repeat);
-    }
-    else
-    {
-        inputOptions.setWrapMode(nvtt::WrapMode_Clamp);
-    }
+        if (wrapRepeat)
+        {
+            inputOptions.setWrapMode(nvtt::WrapMode_Repeat);
+        }
+        else
+        {
+            inputOptions.setWrapMode(nvtt::WrapMode_Clamp);
+        }
 
-    if (alpha)
-    {
-        inputOptions.setAlphaMode(nvtt::AlphaMode_Transparency);
-    }
-    else
-    {
-        inputOptions.setAlphaMode(nvtt::AlphaMode_None);
-    }
+        if (alpha)
+        {
+            inputOptions.setAlphaMode(nvtt::AlphaMode_Transparency);
+        }
+        else
+        {
+            inputOptions.setAlphaMode(nvtt::AlphaMode_None);
+        }
 
-    // Block compressed textures with mipmaps must be powers of two.
-    if (!noMipmaps && format != nvtt::Format_RGB)
-    {
-        inputOptions.setRoundMode(nvtt::RoundMode_ToPreviousPowerOfTwo);
-    }
+        // Block compressed textures with mipmaps must be powers of two.
+        if (!noMipmaps && format != nvtt::Format_RGB)
+        {
+            inputOptions.setRoundMode(nvtt::RoundMode_ToPreviousPowerOfTwo);
+        }
 
-    if (normal)
-    {
-        setNormalMap(inputOptions);
-    }
-    else if (color2normal)
-    {
-        setColorToNormalMap(inputOptions);
-    }
-    else
-    {
-        setColorMap(inputOptions);
-    }
+        if (normal)
+        {
+            setNormalMap(inputOptions);
+        }
+        else if (color2normal)
+        {
+            setColorToNormalMap(inputOptions);
+        }
+        else
+        {
+            setColorMap(inputOptions);
+        }
 
-    if (noMipmaps)
-    {
-        inputOptions.setMipmapGeneration(false);
+        if (noMipmaps)
+        {
+            inputOptions.setMipmapGeneration(false);
+        }
+
+        /*if (premultiplyAlpha)
+        {
+            inputOptions.setPremultiplyAlpha(true);
+            inputOptions.setAlphaMode(nvtt::AlphaMode_Premultiplied);
+        }*/
+
+        inputOptions.setMipmapFilter(mipmapFilter);
     }
 
-    /*if (premultiplyAlpha)
-    {
-        inputOptions.setPremultiplyAlpha(true);
-        inputOptions.setAlphaMode(nvtt::AlphaMode_Premultiplied);
-    }*/
 
-    inputOptions.setMipmapFilter(mipmapFilter);
 
     nvtt::CompressionOptions compressionOptions;
     compressionOptions.setFormat(format);
@@ -545,8 +630,25 @@ int main(int argc, char *argv[])
             //compressionOptions.setQuantization(/*color dithering*/true, /*alpha dithering*/false, /*binary alpha*/false);
             //compressionOptions.setPixelType(nvtt::PixelType_UnsignedNorm);
             //compressionOptions.setPixelFormat(5, 6, 5, 0);
+            //compressionOptions.setPixelFormat(8, 8, 8, 8);
+
+            // A4R4G4B4
+            //compressionOptions.setPixelFormat(16, 0xF00, 0xF0, 0xF, 0xF000);
+
+            //compressionOptions.setPixelFormat(32, 0xFF0000, 0xFF00, 0xFF, 0xFF000000);
+
+            // R10G10B10A2
+            //compressionOptions.setPixelFormat(10, 10, 10, 2);
+
+            // DXGI_FORMAT_R11G11B10_FLOAT
+            compressionOptions.setPixelType(nvtt::PixelType_Float);
+            compressionOptions.setPixelFormat(11, 11, 10, 0);
         }
     }
+    else if (format == nvtt::Format_BC6)
+    {
+        compressionOptions.setPixelType(nvtt::PixelType_UnsignedFloat);
+    }
 
     if (fast)
     {
@@ -599,7 +701,15 @@ int main(int argc, char *argv[])
         }
     }
 
-    outputHandler.setTotal(context.estimateSize(inputOptions, compressionOptions));
+    int outputSize = 0;
+    if (useSurface) {
+        outputSize = context.estimateSize(image, 1, compressionOptions);
+    }
+    else {
+        outputSize = context.estimateSize(inputOptions, compressionOptions);
+    }
+
+    outputHandler.setTotal(outputSize);
     outputHandler.setDisplayProgress(!silent);
 
     nvtt::OutputOptions outputOptions;
@@ -625,10 +735,22 @@ int main(int argc, char *argv[])
     nv::Timer timer;
     timer.start();
 
-    if (!context.process(inputOptions, compressionOptions, outputOptions))
-    {
-        return EXIT_FAILURE;
+    if (useSurface) {
+        if (!context.outputHeader(image, 1, compressionOptions, outputOptions)) {
+            fprintf(stderr, "Error writing file header.\n");
+            return EXIT_FAILURE;
+        }
+        if (!context.compress(image, 0, 0, compressionOptions, outputOptions)) {
+            fprintf(stderr, "Error compressing file.\n");
+            return EXIT_FAILURE;
+        } 
     }
+    else {
+        if (!context.process(inputOptions, compressionOptions, outputOptions)) {
+            return EXIT_FAILURE;
+        }
+    }
+
     timer.stop();
 
     if (!silent) {
diff --git a/src/nvtt/tools/decompress.cpp b/src/nvtt/tools/decompress.cpp
index 0163cca..21a70b4 100644
--- a/src/nvtt/tools/decompress.cpp
+++ b/src/nvtt/tools/decompress.cpp
@@ -29,6 +29,8 @@
 
 #include <nvimage/ImageIO.h>
 
+#include <nvtt/nvtt.h>
+
 #include "cmdline.h"
 
 #include <time.h> // clock
@@ -42,6 +44,8 @@ int main(int argc, char *argv[])
 	bool mipmaps = false;
 	bool faces = false;
 	bool savePNG = false;
+    bool rgbm = false;
+    bool histogram = false;    // Off by default; enabled with -histogram so the regular decompression path stays reachable.
 
 	nv::Path input;
 	nv::Path output;
@@ -57,10 +61,18 @@ int main(int argc, char *argv[])
 		{
 			mipmaps = true;
 		}
+		else if (strcmp("-rgbm", argv[i]) == 0)
+		{
+			rgbm = true;
+		}
 		else if (strcmp("-faces", argv[i]) == 0)
 		{
 			faces = true;
 		}
+		else if (strcmp("-histogram", argv[i]) == 0)
+		{
+            histogram = true;
+        }
 		else if (strcmp("-format", argv[i]) == 0)
 		{
 			if (i+1 == argc) break;
@@ -109,90 +121,125 @@ int main(int argc, char *argv[])
 		printf("Note: the .tga or .png extension is forced on outfile\n\n");
 
 		printf("Input options:\n");
-		printf("  -forcenormal    \tThe input image is a normal map.\n");
-		printf("  -mipmaps        \tDecompress all mipmaps.\n");
-		printf("  -faces          \tDecompress all faces.\n");
-		printf("  -format <format>\tOutput format ('tga' or 'png').\n");
+		printf("  -forcenormal      The input image is a normal map.\n");
+		printf("  -mipmaps          Decompress all mipmaps.\n");
+		printf("  -faces            Decompress all faces.\n");
+        printf("  -histogram        Output histogram.\n");
+		printf("  -format <format>  Output format ('tga' or 'png').\n");
 
  		return 1;
  	}
 
-	// Load surface.
-	// !!! DirectDrawSurface API doesn't support float images, so BC6 will be converted to 8-bit on load.
-	// Should use nvtt::Surface instead.
-	nv::DirectDrawSurface dds(input.str());
-	if (!dds.isValid())
-	{
-		fprintf(stderr, "The file '%s' is not a valid DDS file.\n", input.str());
-		return 1;
-	}
-
-	if (!dds.isSupported() || dds.isTexture3D())
-	{
-		fprintf(stderr, "The file '%s' is not a supported DDS file.\n", input.str());
-		return 1;
-	}
-	
-	uint faceCount;
-	if (dds.isTexture2D())
-	{
-		faceCount = 1;
-	}
-	else
-	{
-		nvCheck(dds.isTextureCube());
-		faceCount = 6;
-	}
-	
-	uint mipmapCount = dds.mipmapCount();
-	
-	clock_t start = clock();
- 
-	// apply arguments
-	if (forcenormal)
-	{
-		dds.setNormalFlag(true);
-	}
-	if (!faces)
-	{
-		faceCount = 1;
-	}
-	if (!mipmaps)
-	{
-		mipmapCount = 1;
-	}
-
-	nv::Image mipmap;	
-	nv::Path name;
-
-	// strip extension, we force the tga extension
-	output.stripExtension();
-
-	// extract faces and mipmaps
-	for (uint f = 0; f < faceCount; f++)
-	{
-		for (uint m = 0; m < mipmapCount; m++)
-		{
-			dds.mipmap(&mipmap, f, m);
-	
-			// set output filename, if we are doing faces and/or mipmaps
-			name.copy(output);
-			if (faces) name.appendFormat("_face%d", f);
-			if (mipmaps) name.appendFormat("_mipmap%d", m);
-			name.append(savePNG ? ".png" : ".tga");
-			
-			nv::StdOutputStream stream(name.str());
-			if (stream.isError()) {
-				fprintf(stderr, "Error opening '%s' for writting\n", name.str());
-				return 1;
-			}
-			
-			nv::ImageIO::save(name.str(), stream, &mipmap);
-		}
-	}
 
-	clock_t end = clock();
-	printf("\rtime taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
+    if (histogram) {    // Histogram mode: write a histogram image of the input instead of extracting faces/mipmaps.
+        nvtt::Surface img;
+        if (!img.load(input.str())) {
+		    fprintf(stderr, "The file '%s' is not a valid DDS file.\n", input.str());
+		    return 1;
+        }
+        // Scale the three color channels by 1/exposure (zero bias) before applying gamma.
+        float exposure = 2.2f;
+        float scale = 1.0f / exposure;
+        img.scaleBias(0, scale, 0);
+        img.scaleBias(1, scale, 0);
+        img.scaleBias(2, scale, 0);
+
+        //img.toneMap(nvtt::ToneMapper_Reindhart, NULL);
+        //img.toSrgb();
+        img.toGamma(2.2f);
+
+        nvtt::Surface hist = nvtt::histogram(img, 3*512, 128);    // Requested three times wider than the final 512x128 image.
+
+        // Resize for prettier histograms.
+        hist.resize(512, 128, 1, nvtt::ResizeFilter_Box);
+        // Output name: <output without extension>.histogram.png or .histogram.tga, depending on savePNG.
+        nv::Path name;
+        name.copy(output);
+        name.stripExtension();
+        name.append(".histogram");
+        name.append(savePNG ? ".png" : ".tga");
+
+        hist.save(name.str());
+    }
+    else {
+
+	    // Load surface.
+	    // !!! DirectDrawSurface API doesn't support float images, so BC6 will be converted to 8-bit on load.
+	    // Should use nvtt::Surface instead.
+	    nv::DirectDrawSurface dds(input.str());
+	    if (!dds.isValid())
+	    {
+		    fprintf(stderr, "The file '%s' is not a valid DDS file.\n", input.str());
+		    return 1;
+	    }
+
+	    if (!dds.isSupported() || dds.isTexture3D())
+	    {
+		    fprintf(stderr, "The file '%s' is not a supported DDS file.\n", input.str());
+		    return 1;
+	    }
+    	
+	    uint faceCount;
+	    if (dds.isTexture2D())
+	    {
+		    faceCount = 1;
+	    }
+	    else
+	    {
+		    nvCheck(dds.isTextureCube());
+		    faceCount = 6;
+	    }
+    	
+	    uint mipmapCount = dds.mipmapCount();
+    	
+	    clock_t start = clock();
+     
+	    // apply arguments
+	    if (forcenormal)
+	    {
+		    dds.setNormalFlag(true);
+	    }
+	    if (!faces)
+	    {
+		    faceCount = 1;
+	    }
+	    if (!mipmaps)
+	    {
+		    mipmapCount = 1;
+	    }
+
+	    nv::Image mipmap;	
+	    nv::Path name;
+
+	    // strip extension, we force the tga extension
+	    output.stripExtension();
+
+	    // extract faces and mipmaps
+	    for (uint f = 0; f < faceCount; f++)
+	    {
+		    for (uint m = 0; m < mipmapCount; m++)
+		    {
+			    dds.mipmap(&mipmap, f, m);
+    	
+			    // set output filename, if we are doing faces and/or mipmaps
+			    name.copy(output);
+			    if (faces) name.appendFormat("_face%d", f);
+			    if (mipmaps) name.appendFormat("_mipmap%d", m);
+			    name.append(savePNG ? ".png" : ".tga");
+    			
+			    nv::StdOutputStream stream(name.str());
+			    if (stream.isError()) {
+				    fprintf(stderr, "Error opening '%s' for writting\n", name.str());
+				    return 1;
+			    }
+    			
+			    nv::ImageIO::save(name.str(), stream, &mipmap);
+		    }
+	    }
+
+	    clock_t end = clock();
+	    printf("\rtime taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
+    }
 	
 	return 0;
 }
diff --git a/src/nvtt/tools/imgdiff.cpp b/src/nvtt/tools/imgdiff.cpp
index 9809cf4..3dd5eca 100644
--- a/src/nvtt/tools/imgdiff.cpp
+++ b/src/nvtt/tools/imgdiff.cpp
@@ -23,18 +23,15 @@
 
 #include "cmdline.h"
 
-#include "nvmath/Color.h"
-#include "nvmath/Vector.inl"
-
-#include "nvimage/Image.h"
-#include "nvimage/DirectDrawSurface.h"
+#include "nvtt/nvtt.h"
 
 #include "nvcore/StrLib.h"
-#include "nvcore/StdStream.h"
+#include "nvmath/nvmath.h"
 
+#include <string.h> // strstr
 #include <math.h>
 
-
+/*
 static bool loadImage(nv::Image & image, const char * fileName)
 {
 	if (nv::strCaseDiff(nv::Path::extension(fileName), ".dds") == 0)
@@ -160,7 +157,7 @@ static float luma(const nv::Color32 & c) {
     //return 0.333f * float(c.r) + 0.334f * float(c.g) + 0.333f * float(c.b);
     //return 0.1f * float(c.r) + 0.8f * float(c.g) + 0.1f * float(c.g);
 }
-
+*/
 
 int main(int argc, char *argv[])
 {
@@ -169,6 +166,7 @@ int main(int argc, char *argv[])
 
 	bool compareNormal = false;
 	bool compareAlpha = false;
+    bool rangescale = false;
 
 	nv::Path input0;
 	nv::Path input1;
@@ -178,14 +176,18 @@ int main(int argc, char *argv[])
 	for (int i = 1; i < argc; i++)
 	{
 		// Input options.
-		if (strcmp("-normal", argv[i]) == 0)
+        if (nv::strEqual("-normal", argv[i]))
 		{
 			compareNormal = true;
 		}
-		else if (strcmp("-alpha", argv[i]) == 0)
+		else if (nv::strEqual("-alpha", argv[i]))
 		{
 			compareAlpha = true;
 		}
+		else if (nv::strEqual("-rangescale", argv[i]))
+		{
+			rangescale = true;
+		}
 		else if (argv[i][0] != '-')
 		{
 			input0 = argv[i];
@@ -209,12 +211,105 @@ int main(int argc, char *argv[])
 		printf("usage: nvimgdiff [options] original_file updated_file [output]\n\n");
 		
 		printf("Diff options:\n");
-		printf("  -normal \tCompare images as if they were normal maps.\n");
-		printf("  -alpha  \tCompare alpha weighted images.\n");
+		printf("  -normal       Compare images as if they were normal maps.\n");
+		printf("  -alpha        Compare alpha weighted images.\n");
+        printf("  -rangescale   Scale second image based on range of first one.\n");
 
 		return 1;
 	}
 
+    nvtt::Surface image0, image1;
+
+    if (!image0.load(input0.str())) {
+        fprintf(stderr, "Error loading %s.\n", input0.str());    // Errors go to stderr, newline-terminated, like the other tools.
+        return 1;
+    }
+    if (!image1.load(input1.str())) {
+        fprintf(stderr, "Error loading %s.\n", input1.str());
+        return 1;
+    }
+
+    if (compareNormal) {
+        image0.setNormalMap(true);
+        image1.setNormalMap(true);
+    }
+    if (compareAlpha) {
+        image0.setAlphaMode(nvtt::AlphaMode_Transparency);
+    }
+
+    // Do some transforms based on the naming convention of the file.
+    if (strstr(input1.str(), "rgbm")) {
+
+        //image0.toGamma(2);
+
+        image1.fromRGBM(1.0f, 0.25f);
+        image1.toLinear(2);
+
+        image1.copyChannel(image0, 3);          // Copy alpha channel from source.
+        image1.setAlphaMode(nvtt::AlphaMode_Transparency);
+
+        rangescale = true;
+    }
+
+    if (strstr(input1.str(), "bc6")) {
+        // @@ Do any transform that we may have done before compression.
+
+        image1.copyChannel(image0, 3);          // Copy alpha channel from source.
+        image1.setAlphaMode(nvtt::AlphaMode_Transparency);
+    }
+
+
+    // Scale second image to range of the first one.
+    if (rangescale) {
+        float min_color[3], max_color[3];
+        image0.range(0, &min_color[0], &max_color[0]);
+        image0.range(1, &min_color[1], &max_color[1]);
+        image0.range(2, &min_color[2], &max_color[2]);
+        float color_range = nv::max3(max_color[0], max_color[1], max_color[2]);
+
+        const float max_color_range = 16.0f;
+        if (color_range > max_color_range) color_range = max_color_range;
+
+#if 0
+        for (int i = 0; i < 3; i++) {
+            image0.scaleBias(i, 1.0f / color_range, 0.0f);
+        }
+        image0.toneMap(nvtt::ToneMapper_Linear, NULL); // Clamp without changing the hue.
+#else
+        for (int i = 0; i < 3; i++) {
+            image1.scaleBias(i, color_range, 0.0f);
+        }
+#endif
+    }
+    
+    float rmse = nvtt::rmsError(image0, image1);
+    //float rmsa = nvtt::rmsAlphaError(image0, image1);
+
+    // In The Witness:
+    // exposure = key_value / luminance
+    // key_value = 0.22
+    // min_luminance = 0.1 -> exposure = 2.2
+    // max_luminance = 1.0 -> exposure = 0.22
+
+    float rmse0 = nvtt::rmsToneMappedError(image0, image1, 2.2f);
+    float rmse1 = nvtt::rmsToneMappedError(image0, image1, 1.0f);
+    float rmse2 = nvtt::rmsToneMappedError(image0, image1, 0.22f);
+
+    printf("RMSE = %.5f %.5f %.5f -> %.5f | %.5f\n", rmse0, rmse1, rmse2, (rmse0 + rmse1 + rmse2)/3, rmse);
+
+
+    //printf("MSE = %f\n", rmse * rmse);
+    //printf("RMSE = %f\n", rmse);
+    //printf("PSNR = %f\n", (rmse == 0) ? 999.0 : 20.0 * log10(255.0 / rmse));
+
+    if (compareNormal) {
+        // @@ Does this assume normal maps are packed or unpacked?
+        float ae = nvtt::angularError(image0, image1);
+        printf("AE = %f\n", ae);
+    }
+
+
+#if 0
 	nv::Image image0, image1;
 	if (!loadImage(image0, input0.str())) return 0;
 	if (!loadImage(image1, input1.str())) return 0;
@@ -304,6 +399,7 @@ int main(int argc, char *argv[])
 		error_a.print();
 	}
 
+#endif
 	// @@ Write image difference.
 	
 	return 0;