From 94d063528542cd87d36989e6a918589c997b69e1 Mon Sep 17 00:00:00 2001
From: "castano@gmail.com"
 <castano@gmail.com@95f4ed2b-212e-0410-8b90-d31948207fce>
Date: Fri, 7 Jun 2013 17:53:55 +0000
Subject: [PATCH] Merge changes from the Witness.

---
 project/vc9/hdrtest/hdrtest.vcproj            | 330 +++++++++++
 project/vc9/nvtt.sln                          |  41 +-
 project/vc9/nvtt/nvtt.vcproj                  |  16 +-
 project/vc9/testsuite/testsuite.vcproj        | 162 ++++++
 src/nvcore/Array.h                            |   4 +-
 src/nvcore/Array.inl                          |   5 +-
 src/nvcore/Debug.cpp                          | 153 ++++-
 src/nvcore/Debug.h                            |  10 +-
 src/nvcore/DefsGnucDarwin.h                   |   3 +
 src/nvcore/DefsGnucLinux.h                    |   3 +-
 src/nvcore/StdStream.h                        |  17 +-
 src/nvcore/StrLib.cpp                         |   9 +-
 src/nvcore/StrLib.h                           |   4 +-
 src/nvcore/Utils.h                            |   2 +-
 src/nvcore/nvcore.h                           |  26 +-
 src/nvimage/DirectDrawSurface.cpp             |   6 +-
 src/nvimage/ErrorMetric.cpp                   |   2 -
 src/nvimage/FloatImage.cpp                    |  20 +-
 src/nvimage/NormalMap.cpp                     |   4 -
 src/nvmath/Color.h                            |  26 +
 src/nvmath/Color.inl                          | 114 +++-
 src/nvmath/Fitting.cpp                        |   2 +-
 src/nvmath/Fitting.h                          |   3 -
 src/nvmath/Half.cpp                           |  23 +-
 src/nvmath/Half.h                             |  48 +-
 src/nvmath/Vector.h                           |  22 +-
 src/nvmath/Vector.inl                         |  89 +++
 src/nvmath/nvmath.h                           |  27 +-
 src/nvthread/Atomic.h                         | 109 +++-
 src/nvthread/Event.cpp                        |  31 +-
 src/nvthread/Mutex.cpp                        |   4 +-
 src/nvthread/ParallelFor.cpp                  |   5 -
 src/nvthread/Thread.cpp                       |  16 +-
 src/nvthread/nvthread.cpp                     |  28 +-
 ...{CompressorDXT.cpp => BlockCompressor.cpp} |  18 +-
 .../{CompressorDXT.h => BlockCompressor.h}    |   8 +-
 src/nvtt/CompressorDX10.h                     |  10 +-
 src/nvtt/CompressorDX11.h                     |   2 +-
 src/nvtt/CompressorDX9.cpp                    |   7 +-
 src/nvtt/CompressorDX9.h                      |  26 +-
 src/nvtt/CompressorRGB.cpp                    |  25 +-
 src/nvtt/CubeSurface.cpp                      | 391 +++++++------
 src/nvtt/CubeSurface.h                        |   3 +
 src/nvtt/InputOptions.h                       | 164 +++---
 src/nvtt/Surface.cpp                          | 541 ++++++++++++------
 src/nvtt/Surface.h                            |   2 +
 src/nvtt/nvtt.h                               |  34 +-
 src/nvtt/tests/testsuite.cpp                  |   2 +-
 src/nvtt/tools/cmdline.h                      |   2 +-
 49 files changed, 1974 insertions(+), 625 deletions(-)
 create mode 100755 project/vc9/hdrtest/hdrtest.vcproj
 rename src/nvtt/{CompressorDXT.cpp => BlockCompressor.cpp} (90%)
 rename src/nvtt/{CompressorDXT.h => BlockCompressor.h} (91%)
diff --git a/project/vc9/hdrtest/hdrtest.vcproj b/project/vc9/hdrtest/hdrtest.vcproj
new file mode 100755
index 0000000..3299430
--- /dev/null
+++ b/project/vc9/hdrtest/hdrtest.vcproj
@@ -0,0 +1,330 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="hdrtest"
+	ProjectGUID="{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}"
+	RootNamespace="hdrtest"
+	TargetFrameworkVersion="196613"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+		<Platform
+			Name="x64"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
+			ConfigurationType="1"
+			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="libpng.lib jpeg.lib tiff.lib FreeImage.lib"
+				OutputFile="$(SolutionDir)\$(ConfigurationName).$(PlatformName)\bin\$(ProjectName).exe"
+				AdditionalLibraryDirectories="$(GnuWinDir)\lib; $(FreeImageDir)"
+				GenerateDebugInformation="true"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Debug|x64"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
+			ConfigurationType="1"
+			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="FreeImage.lib jpeg.lib libpng.lib tiff.lib zlib.lib"
+				OutputFile="$(SolutionDir)\$(ConfigurationName).$(PlatformName)\bin\$(ProjectName).exe"
+				AdditionalLibraryDirectories="$(GnuWinDir)\lib; $(FreeImageDir)"
+				GenerateDebugInformation="true"
+				TargetMachine="17"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
+			ConfigurationType="1"
+			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
+			CharacterSet="2"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="3"
+				InlineFunctionExpansion="0"
+				EnableIntrinsicFunctions="true"
+				FavorSizeOrSpeed="0"
+				OmitFramePointers="true"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="false"
+				EnableEnhancedInstructionSet="2"
+				WarningLevel="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="libpng.lib jpeg.lib tiff.lib FreeImage.lib"
+				OutputFile="$(SolutionDir)\$(ConfigurationName).$(PlatformName)\bin\$(ProjectName).exe"
+				AdditionalLibraryDirectories="$(GnuWinDir)\lib; $(FreeImageDir)"
+				GenerateDebugInformation="true"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|x64"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
+			ConfigurationType="1"
+			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
+			CharacterSet="2"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="3"
+				EnableIntrinsicFunctions="true"
+				OmitFramePointers="true"
+				WholeProgramOptimization="true"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="false"
+				WarningLevel="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="FreeImage.lib jpeg.lib libpng.lib tiff.lib zlib.lib"
+				OutputFile="$(SolutionDir)\$(ConfigurationName).$(PlatformName)\bin\$(ProjectName).exe"
+				AdditionalLibraryDirectories="$(GnuWinDir)\lib; $(FreeImageDir)"
+				GenerateDebugInformation="true"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="17"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<File
+			RelativePath="..\..\..\src\nvtt\tests\hdrtest.cpp"
+			>
+		</File>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
diff --git a/project/vc9/nvtt.sln b/project/vc9/nvtt.sln
index 04e494c..256cacb 100644
--- a/project/vc9/nvtt.sln
+++ b/project/vc9/nvtt.sln
@@ -97,6 +97,13 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cubemaptest", "cubemaptest\
 		{1AEB7681-57D8-48EE-813D-5C41CC38B647} = {1AEB7681-57D8-48EE-813D-5C41CC38B647}
 	EndProjectSection
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "hdrtest", "hdrtest\hdrtest.vcproj", "{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}"
+	ProjectSection(ProjectDependencies) = postProject
+		{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
+		{1AEB7681-57D8-48EE-813D-5C41CC38B647} = {1AEB7681-57D8-48EE-813D-5C41CC38B647}
+		{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
+	EndProjectSection
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Mixed Platforms = Debug|Mixed Platforms
@@ -402,22 +409,26 @@ Global
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|Mixed Platforms.Build.0 = Debug|Win32
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|Win32.ActiveCfg = Debug|Win32
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|Win32.Build.0 = Debug|Win32
-		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|x64.ActiveCfg = Debug|Win32
+		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|x64.ActiveCfg = Debug|x64
+		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|x64.Build.0 = Debug|x64
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug-CUDA|Mixed Platforms.ActiveCfg = Debug|Win32
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug-CUDA|Mixed Platforms.Build.0 = Debug|Win32
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug-CUDA|Win32.ActiveCfg = Debug|Win32
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug-CUDA|Win32.Build.0 = Debug|Win32
-		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug-CUDA|x64.ActiveCfg = Debug|Win32
+		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug-CUDA|x64.ActiveCfg = Debug|x64
+		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug-CUDA|x64.Build.0 = Debug|x64
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release|Mixed Platforms.ActiveCfg = Release|Win32
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release|Mixed Platforms.Build.0 = Release|Win32
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release|Win32.ActiveCfg = Release|Win32
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release|Win32.Build.0 = Release|Win32
-		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release|x64.ActiveCfg = Release|Win32
+		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release|x64.ActiveCfg = Release|x64
+		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release|x64.Build.0 = Release|x64
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release-CUDA|Mixed Platforms.ActiveCfg = Release|Win32
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release-CUDA|Mixed Platforms.Build.0 = Release|Win32
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release-CUDA|Win32.ActiveCfg = Release|Win32
 		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release-CUDA|Win32.Build.0 = Release|Win32
-		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release-CUDA|x64.ActiveCfg = Release|Win32
+		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release-CUDA|x64.ActiveCfg = Release|x64
+		{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release-CUDA|x64.Build.0 = Release|x64
 		{7DCF280E-702B-49F3-84A7-AE7E146384D6}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
 		{7DCF280E-702B-49F3-84A7-AE7E146384D6}.Debug|Mixed Platforms.Build.0 = Debug|Win32
 		{7DCF280E-702B-49F3-84A7-AE7E146384D6}.Debug|Win32.ActiveCfg = Debug|Win32
@@ -514,6 +525,28 @@ Global
 		{CFB3FEAC-5720-4B16-9D7E-039DB180B641}.Release-CUDA|Win32.Build.0 = Release|Win32
 		{CFB3FEAC-5720-4B16-9D7E-039DB180B641}.Release-CUDA|x64.ActiveCfg = Release|x64
 		{CFB3FEAC-5720-4B16-9D7E-039DB180B641}.Release-CUDA|x64.Build.0 = Release|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Debug|Mixed Platforms.Build.0 = Debug|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Debug|Win32.Build.0 = Debug|Win32
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Debug|x64.ActiveCfg = Debug|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Debug|x64.Build.0 = Debug|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Debug-CUDA|Mixed Platforms.ActiveCfg = Debug|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Debug-CUDA|Mixed Platforms.Build.0 = Debug|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Debug-CUDA|Win32.ActiveCfg = Debug|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Debug-CUDA|x64.ActiveCfg = Debug|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Debug-CUDA|x64.Build.0 = Debug|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Release|Mixed Platforms.ActiveCfg = Release|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Release|Mixed Platforms.Build.0 = Release|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Release|Win32.ActiveCfg = Release|Win32
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Release|Win32.Build.0 = Release|Win32
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Release|x64.ActiveCfg = Release|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Release|x64.Build.0 = Release|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Release-CUDA|Mixed Platforms.ActiveCfg = Release|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Release-CUDA|Mixed Platforms.Build.0 = Release|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Release-CUDA|Win32.ActiveCfg = Release|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Release-CUDA|x64.ActiveCfg = Release|x64
+		{E493E368-A4CF-4A8D-99DD-E128CC3A27EF}.Release-CUDA|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
diff --git a/project/vc9/nvtt/nvtt.vcproj b/project/vc9/nvtt/nvtt.vcproj
index d5b9407..88dce20 100644
--- a/project/vc9/nvtt/nvtt.vcproj
+++ b/project/vc9/nvtt/nvtt.vcproj
@@ -877,6 +877,14 @@
 				>
 			</File>
 		</Filter>
+		<File
+			RelativePath="..\..\..\src\nvtt\BlockCompressor.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\BlockCompressor.h"
+			>
+		</File>
 		<File
 			RelativePath="..\..\..\src\nvtt\ClusterFit.cpp"
 			>
@@ -1049,14 +1057,6 @@
 			RelativePath="..\..\..\src\nvtt\CompressorDX9.h"
 			>
 		</File>
-		<File
-			RelativePath="..\..\..\src\nvtt\CompressorDXT.cpp"
-			>
-		</File>
-		<File
-			RelativePath="..\..\..\src\nvtt\CompressorDXT.h"
-			>
-		</File>
 		<File
 			RelativePath="..\..\..\src\nvtt\CompressorRGB.cpp"
 			>
diff --git a/project/vc9/testsuite/testsuite.vcproj b/project/vc9/testsuite/testsuite.vcproj
index 294eac8..907bef0 100644
--- a/project/vc9/testsuite/testsuite.vcproj
+++ b/project/vc9/testsuite/testsuite.vcproj
@@ -12,6 +12,9 @@
 		<Platform
 			Name="Win32"
 		/>
+		<Platform
+			Name="x64"
+		/>
 	</Platforms>
 	<ToolFiles>
 	</ToolFiles>
@@ -173,6 +176,165 @@
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
+		<Configuration
+			Name="Debug|x64"
+			OutputDirectory="$(PlatformName)\$(ConfigurationName)"
+			IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
+			ConfigurationType="1"
+			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..;..\..\..\src"
+				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="FreeImage.lib jpeg.lib libpng.lib tiff.lib zlib.lib"
+				OutputFile="$(SolutionDir)\$(ConfigurationName).$(PlatformName)\bin\$(ProjectName).exe"
+				LinkIncremental="2"
+				AdditionalLibraryDirectories="$(GnuWinDir)\lib;$(FreeImageDir)"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				RandomizedBaseAddress="1"
+				DataExecutionPrevention="0"
+				TargetMachine="17"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|x64"
+			OutputDirectory="$(PlatformName)\$(ConfigurationName)"
+			IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
+			ConfigurationType="1"
+			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				AdditionalIncludeDirectories="..;..\..\..\src"
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableEnhancedInstructionSet="2"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="FreeImage.lib jpeg.lib libpng.lib tiff.lib zlib.lib"
+				OutputFile="$(SolutionDir)\$(ConfigurationName).$(PlatformName)\bin\$(ProjectName).exe"
+				LinkIncremental="1"
+				AdditionalLibraryDirectories="$(GnuWinDir)\lib;$(FreeImageDir)"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				RandomizedBaseAddress="1"
+				DataExecutionPrevention="0"
+				TargetMachine="17"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
 	</Configurations>
 	<References>
 	</References>
diff --git a/src/nvcore/Array.h b/src/nvcore/Array.h
index 1a309d6..984aa90 100644
--- a/src/nvcore/Array.h
+++ b/src/nvcore/Array.h
@@ -108,7 +108,7 @@ namespace nv
 
         void push_back( const T & val );
         void pushBack( const T & val );
-        void append( const T & val );
+        Array<T> & append( const T & val );
         Array<T> & operator<< ( T & t );
         void pop_back();
         void popBack();
@@ -160,7 +160,7 @@ namespace nv
         friend void swap(Array<Typ> & a, Array<Typ> & b);
 
 
-protected:
+    protected:
 
         void setArraySize(uint new_size);
         void setArrayCapacity(uint new_capacity);
diff --git a/src/nvcore/Array.inl b/src/nvcore/Array.inl
index a59dd89..f7369bc 100755
--- a/src/nvcore/Array.inl
+++ b/src/nvcore/Array.inl
@@ -22,7 +22,7 @@ namespace nv
     NV_FORCEINLINE void Array<T>::push_back( const T & val )
     {
 #if 1
-        nvDebugCheck(&val < m_buffer || &val > m_buffer+m_size);
+        nvDebugCheck(&val < m_buffer || &val >= m_buffer+m_size);
 
         uint old_size = m_size;
         uint new_size = m_size + 1;
@@ -57,9 +57,10 @@ namespace nv
         push_back(val);
     }
     template <typename T>
-    NV_FORCEINLINE void Array<T>::append( const T & val )
+    NV_FORCEINLINE Array<T> & Array<T>::append( const T & val )
     {
         push_back(val);
+        return *this;   // Return the array itself so that append() calls can be chained.
     }
 
     // Qt like push operator.
diff --git a/src/nvcore/Debug.cpp b/src/nvcore/Debug.cpp
index 25c93a8..ebf77b3 100644
--- a/src/nvcore/Debug.cpp
+++ b/src/nvcore/Debug.cpp
@@ -66,7 +66,7 @@
 #   endif
 #endif
 
-#define USE_SEPARATE_THREAD 1
+#define NV_USE_SEPARATE_THREAD 1
 
 
 using namespace nv;
@@ -101,7 +101,7 @@ namespace
     // We should try to simplify the top level filter as much as possible.
     // http://www.nynaeve.net/?p=128
 
-#if USE_SEPARATE_THREAD
+#if NV_USE_SEPARATE_THREAD
 
     // The critical section enforcing the requirement that only one exception be
     // handled by a handler at a time.
@@ -121,7 +121,7 @@ namespace
     static DWORD s_requesting_thread_id = 0;
     static EXCEPTION_POINTERS * s_exception_info = NULL;
 
-#endif // USE_SEPARATE_THREAD
+#endif // NV_USE_SEPARATE_THREAD
 
 
     struct MinidumpCallbackContext {
@@ -236,7 +236,7 @@ namespace
         return true;
     }
 
-#if USE_SEPARATE_THREAD
+#if NV_USE_SEPARATE_THREAD
 
     static DWORD WINAPI ExceptionHandlerThreadMain(void* lpParameter) {
         nvDebugCheck(s_handler_start_semaphore != NULL);
@@ -256,7 +256,7 @@ namespace
         return 0;
     }
 
-#endif // USE_SEPARATE_THREAD
+#endif // NV_USE_SEPARATE_THREAD
 
     static bool hasStackTrace() {
         return true;
@@ -387,7 +387,9 @@ namespace
 			    DWORD dwDisplacement;
 			    if (!SymGetLineFromAddr64(hProcess, ip, &dwDisplacement, &theLine))
 			    {
-                    builder.format("unknown(%08X) : %s\n", (uint32)ip, pFunc);
+                    // Do not print unknown symbols anymore.
+                    break;
+                    //builder.format("unknown(%08X) : %s\n", (uint32)ip, pFunc);
 			    }
 			    else
 			    {
@@ -404,6 +406,10 @@ namespace
 			    }
 
                 lines.append(builder.release());
+
+                if (pFunc != NULL && strcmp(pFunc, "WinMain") == 0) {
+                    break;
+                }
 		    }
 	    }
     }
@@ -413,7 +419,7 @@ namespace
     static LONG WINAPI handleException(EXCEPTION_POINTERS * pExceptionInfo)
     {
         EnterCriticalSection(&s_handler_critical_section);
-#if USE_SEPARATE_THREAD
+#if NV_USE_SEPARATE_THREAD
         s_requesting_thread_id = GetCurrentThreadId();
         s_exception_info = pExceptionInfo;
 
@@ -474,6 +480,36 @@ namespace
     }
 
     static void handleInvalidParameter(const wchar_t * expresion, const wchar_t * function, const wchar_t * file, unsigned int line, uintptr_t reserved) {
+        // Log whatever details were supplied (converted from wide to narrow strings), then break and terminate below.
+        size_t convertedCharCount = 0;   // Output of wcstombs_s; not inspected afterwards.
+        StringBuilder tmp;               // Scratch buffer reused for every conversion.
+
+        if (expresion != NULL) {
+            uint size = toU32(wcslen(expresion) + 1);   // Wide-character count plus the terminating null.
+            tmp.reserve(size);
+            wcstombs_s(&convertedCharCount, tmp.str(), size, expresion, _TRUNCATE);   // _TRUNCATE: convert as much as fits in 'size' bytes.
+
+            nvDebug("*** Invalid parameter: %s\n", tmp.str());
+
+            if (file != NULL) {   // File, function and line are only reported when an expression was given.
+                size = toU32(wcslen(file) + 1);
+                tmp.reserve(size);
+                wcstombs_s(&convertedCharCount, tmp.str(), size, file, _TRUNCATE);
+
+                nvDebug("    On file: %s\n", tmp.str());
+
+                if (function != NULL) {   // The function name is only reported when the file name is known.
+                    size = toU32(wcslen(function) + 1);
+                    tmp.reserve(size);
+                    wcstombs_s(&convertedCharCount, tmp.str(), size, function, _TRUNCATE);
+
+                    nvDebug("    On function: %s\n", tmp.str());
+                }
+
+                nvDebug("    On line: %u\n", line);   // Reported whenever a file name is available.
+            }
+        }
+
         nvDebugBreak();
         TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 8);
     }
@@ -706,16 +742,22 @@ namespace
         }
 
         // Assert handler method.
-        virtual int assertion( const char * exp, const char * file, int line, const char * func/*=NULL*/ )
+        virtual int assertion(const char * exp, const char * file, int line, const char * func, const char * msg, va_list arg)
         {
             int ret = NV_ABORT_EXIT;
 
             StringBuilder error_string;
-            if( func != NULL ) {
-                error_string.format( "*** Assertion failed: %s\n    On file: %s\n    On function: %s\n    On line: %d\n ", exp, file, func, line );
+            error_string.format("*** Assertion failed: %s\n    On file: %s\n    On line: %d\n", exp, file, line );
+            if (func != NULL) {
+                error_string.appendFormat("    On function: %s\n", func);
             }
-            else {
-                error_string.format( "*** Assertion failed: %s\n    On file: %s\n    On line: %d\n ", exp, file, line );
+            if (msg != NULL) {
+                error_string.append("    Message: ");
+                va_list tmp;
+                va_copy(tmp, arg);
+                error_string.appendFormatList(msg, tmp);
+                va_end(tmp);
+                error_string.append("\n");
             }
             nvDebug( error_string.str() );
 
@@ -760,7 +802,7 @@ namespace
     struct Xbox360AssertHandler : public AssertHandler 
     {
         // Assert handler method.
-        virtual int assertion( const char * exp, const char * file, int line, const char * func/*=NULL*/ )
+        virtual int assertion(const char * exp, const char * file, int line, const char * func, const char * msg, va_list arg)
         {
             int ret = NV_ABORT_EXIT;
 
@@ -786,14 +828,47 @@ namespace
             return ret;
         }
     };
+#elif NV_OS_ORBIS
+
+    /** Orbis assert handler. Logs the failed assertion through nvDebug, calls nvDebugBreak() and returns NV_ABORT_DEBUG. */
+    struct OrbisAssertHandler : public AssertHandler
+    {
+        // Assert handler method. Note: 'msg' and 'arg' are accepted but not used by this handler.
+        virtual int assertion(const char * exp, const char * file, int line, const char * func, const char * msg, va_list arg)
+        {
+            if( func != NULL ) {
+                nvDebug( "*** Assertion failed: %s\n    On file: %s\n    On function: %s\n    On line: %d\n ", exp, file, func, line );
+            }
+            else {
+                nvDebug( "*** Assertion failed: %s\n    On file: %s\n    On line: %d\n ", exp, file, line );
+            }
+
+            //SBtodoORBIS print stack trace
+            /*if (hasStackTrace())
+            {
+                void * trace[64];
+                int size = backtrace(trace, 64);
+                printStackTrace(trace, size, 2);
+            }*/
+            
+            //SBtodoORBIS check for debugger present
+            //if (debug::isDebuggerPresent())
+                nvDebugBreak();   // Always executed: the debugger-present check above is commented out.
+
+            return NV_ABORT_DEBUG;
+        }
+    };
+
 #else
 
     /** Unix assert handler. */
     struct UnixAssertHandler : public AssertHandler
     {
         // Assert handler method.
-        virtual int assertion(const char * exp, const char * file, int line, const char * func)
+        virtual int assertion(const char * exp, const char * file, int line, const char * func, const char * msg, va_list arg)
         {
+            int ret = NV_ABORT_EXIT;            
+            
             if( func != NULL ) {
                 nvDebug( "*** Assertion failed: %s\n    On file: %s\n    On function: %s\n    On line: %d\n ", exp, file, func, line );
             }
@@ -816,9 +891,13 @@ namespace
             }
 #endif
 
+            if( ret == NV_ABORT_EXIT ) {
             // Exit cleanly.
             exit(EXIT_FAILURE + 1);
         }
+            
+            return ret;
+        }
     };
 
 #endif
@@ -827,22 +906,27 @@ namespace
 
 
 /// Handle assertion through the assert handler.
-int nvAbort(const char * exp, const char * file, int line, const char * func/*=NULL*/)
+int nvAbort(const char * exp, const char * file, int line, const char * func/*=NULL*/, const char * msg/*= NULL*/, ...)
 {
 #if NV_OS_WIN32 //&& NV_CC_MSVC
     static Win32AssertHandler s_default_assert_handler;
 #elif NV_OS_XBOX
     static Xbox360AssertHandler s_default_assert_handler;
+#elif NV_OS_ORBIS
+    static OrbisAssertHandler s_default_assert_handler;
 #else
     static UnixAssertHandler s_default_assert_handler;
 #endif
 
-    if (s_assert_handler != NULL) {
-        return s_assert_handler->assertion( exp, file, line, func );
-    }
-    else {
-        return s_default_assert_handler.assertion( exp, file, line, func );
-    }
+    va_list arg;
+    va_start(arg,msg);   // The variadic arguments follow 'msg' and are forwarded to the handler as a va_list.
+    // Use the installed assert handler when there is one, otherwise the platform default declared above.
+    AssertHandler * handler = s_assert_handler != NULL ? s_assert_handler : &s_default_assert_handler;
+    int result = handler->assertion(exp, file, line, func, msg, arg);
+
+    va_end(arg);
+
+    return result;
 }
 
 // Abnormal termination. Create mini dump and output call stack.
@@ -914,6 +998,26 @@ void debug::dumpInfo()
 #endif
 }
 
+/// Dump callstack using the specified handler.
+void debug::dumpCallstack(MessageHandler *messageHandler, int callstackLevelsToSkip /*= 0*/)
+{
+#if (NV_OS_WIN32 && NV_CC_MSVC) || (defined(HAVE_SIGNAL_H) && defined(HAVE_EXECINFO_H))
+    if (hasStackTrace())
+    {
+        void * trace[64];
+        int size = backtrace(trace, 64);
+
+        Array<const char *> lines;
+        writeStackTrace(trace, size, callstackLevelsToSkip + 1, lines);     // + 1 to skip the call to dumpCallstack
+
+        for (uint i = 0; i < lines.count(); i++) {
+            messageHandler->log(lines[i], NULL);
+            delete lines[i];
+        }
+    }
+#endif
+}
+
 
 /// Set the debug message handler.
 void debug::setMessageHandler(MessageHandler * message_handler)
@@ -939,9 +1043,8 @@ void debug::resetAssertHandler()
     s_assert_handler = NULL;
 }
 
-
 #if NV_OS_WIN32
-#if USE_SEPARATE_THREAD
+#if NV_USE_SEPARATE_THREAD
 
 static void initHandlerThread()
 {
@@ -984,7 +1087,7 @@ static void shutHandlerThread() {
     // @@ Free stuff. Terminate thread.
 }
 
-#endif // USE_SEPARATE_THREAD
+#endif // NV_USE_SEPARATE_THREAD
 #endif // NV_OS_WIN32
 
 
@@ -1009,7 +1112,7 @@ void debug::enableSigHandler(bool interactive)
     }
 
 
-#if USE_SEPARATE_THREAD
+#if NV_USE_SEPARATE_THREAD
     initHandlerThread();
 #endif
 
diff --git a/src/nvcore/Debug.h b/src/nvcore/Debug.h
index 5ba868c..c987e10 100644
--- a/src/nvcore/Debug.h
+++ b/src/nvcore/Debug.h
@@ -70,11 +70,12 @@
     } \
     NV_MULTI_LINE_MACRO_END
 
-#define nvAssertMacroWithIgnoreAll(exp) \
+// GCC, LLVM need "##" before the __VA_ARGS__, MSVC doesn't care
+#define nvAssertMacroWithIgnoreAll(exp,...) \
     NV_MULTI_LINE_MACRO_BEGIN \
         static bool ignoreAll = false; \
         if (!ignoreAll && !(exp)) { \
-            int result = nvAbort(#exp, __FILE__, __LINE__, __FUNC__); \
+            int result = nvAbort(#exp, __FILE__, __LINE__, __FUNC__, ##__VA_ARGS__); \
             if (result == NV_ABORT_DEBUG) { \
                 nvDebugBreak(); \
             } else if (result == NV_ABORT_IGNORE) { \
@@ -157,7 +158,7 @@
 #endif
 
 
-NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL);
+NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL, const char * msg = NULL, ...);
 NVCORE_API void NV_CDECL nvDebugPrint( const char *msg, ... ) __attribute__((format (printf, 1, 2)));
 
 namespace nv
@@ -184,7 +185,7 @@ namespace nv
 
     // Assert handler interface.
     struct AssertHandler {
-        virtual int assertion(const char *exp, const char *file, int line, const char *func = NULL) = 0;
+        virtual int assertion(const char *exp, const char *file, int line, const char *func, const char *msg, va_list arg) = 0;
         virtual ~AssertHandler() {}
     };
 
@@ -192,6 +193,7 @@ namespace nv
     namespace debug
     {
         NVCORE_API void dumpInfo();
+        NVCORE_API void dumpCallstack( MessageHandler *messageHandler, int callstackLevelsToSkip = 0 );
 
         NVCORE_API void setMessageHandler( MessageHandler * messageHandler );
         NVCORE_API void resetMessageHandler();
diff --git a/src/nvcore/DefsGnucDarwin.h b/src/nvcore/DefsGnucDarwin.h
index cc1de77..6a3a52b 100644
--- a/src/nvcore/DefsGnucDarwin.h
+++ b/src/nvcore/DefsGnucDarwin.h
@@ -27,6 +27,7 @@
 #define NV_FASTCALL		__attribute__((fastcall))
 #define NV_FORCEINLINE	__attribute__((always_inline)) inline
 #define NV_DEPRECATED   __attribute__((deprecated))
+#define NV_THREAD_LOCAL //ACS: there's no "__thread" or equivalent on iOS/OSX
 
 #if __GNUC__ > 2
 #define NV_PURE     __attribute__((pure))
@@ -38,6 +39,8 @@
 
 #define NV_NOINLINE __attribute__((noinline))
 
+
+
 // Define __FUNC__ properly.
 #if __STDC_VERSION__ < 199901L
 #	if __GNUC__ >= 2
diff --git a/src/nvcore/DefsGnucLinux.h b/src/nvcore/DefsGnucLinux.h
index 6ecd123..f8e6f80 100644
--- a/src/nvcore/DefsGnucLinux.h
+++ b/src/nvcore/DefsGnucLinux.h
@@ -25,8 +25,9 @@
 #endif
 
 #define NV_FASTCALL     __attribute__((fastcall))
-#define NV_FORCEINLINE  inline __attribute__((always_inline))
+#define NV_FORCEINLINE  inline __attribute__((always_inline))
 #define NV_DEPRECATED   __attribute__((deprecated))
+#define NV_THREAD_LOCAL __thread 
 
 
 #if __GNUC__ > 2
diff --git a/src/nvcore/StdStream.h b/src/nvcore/StdStream.h
index 7c0e438..08f399d 100644
--- a/src/nvcore/StdStream.h
+++ b/src/nvcore/StdStream.h
@@ -103,10 +103,25 @@ namespace nv
             clearerr(m_fp);
         }
 
+        // @@ The original implementation used feof, which only returns true after an attempt to read *past* the end of the stream.
+        // That is, after reading the last byte of a file, isAtEnd would still return false even though the stream pointer is at the end of the file. That is not the intent, and it was inconsistent with the implementation of MemoryStream.
+        // A better implementation uses ftell and fseek to determine our location within the file.
         virtual bool isAtEnd() const
         {
             nvDebugCheck(m_fp != NULL);
-            return feof( m_fp ) != 0;
+            //return feof( m_fp ) != 0;
+#if NV_OS_WIN32
+            long pos = _ftell_nolock(m_fp);         // Keep ftell's native type; narrowing to uint wraps past 4GB.
+            _fseek_nolock(m_fp, 0, SEEK_END);
+            long end = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, pos, SEEK_SET);     // Restore the original read position.
+#else
+            long pos = ftell(m_fp);                 // Keep ftell's native type; narrowing to uint wraps past 4GB.
+            fseek(m_fp, 0, SEEK_END);
+            long end = ftell(m_fp);
+            fseek(m_fp, pos, SEEK_SET);             // Restore the original read position.
+#endif
+            return pos == end;
         }
 
         /// Always true.
diff --git a/src/nvcore/StrLib.cpp b/src/nvcore/StrLib.cpp
index df30095..01c498c 100644
--- a/src/nvcore/StrLib.cpp
+++ b/src/nvcore/StrLib.cpp
@@ -101,6 +101,13 @@ bool nv::strEqual(const char * s1, const char * s2)
     return strCmp(s1, s2) == 0;
 }
 
+bool nv::strCaseEqual(const char * s1, const char * s2)
+{
+    if (s1 == s2) return true;
+    if (s1 == NULL || s2 == NULL) return false;
+    return strCaseCmp(s1, s2) == 0;
+}
+
 bool nv::strBeginsWith(const char * str, const char * prefix)
 {
     //return strstr(str, prefix) == dst;
@@ -326,7 +333,7 @@ StringBuilder & StringBuilder::append( const char * s )
     if (m_str == NULL) {
         m_size = slen + 1;
         m_str = strAlloc(m_size);
-        memcpy(m_str, s, m_size + 1);
+        memcpy(m_str, s, m_size);
     }
     else {
         const uint len = uint(strlen( m_str ));
diff --git a/src/nvcore/StrLib.h b/src/nvcore/StrLib.h
index a132325..e3c2e9a 100644
--- a/src/nvcore/StrLib.h
+++ b/src/nvcore/StrLib.h
@@ -35,12 +35,12 @@ namespace nv
         uint operator()(const char * str) const { return strHash(str); }
     };
 
-
     NVCORE_API uint strLen(const char * str) NV_PURE;
 
-    NVCORE_API int strCaseCmp(const char * s1, const char * s2) NV_PURE;
     NVCORE_API int strCmp(const char * s1, const char * s2) NV_PURE;
+    NVCORE_API int strCaseCmp(const char * s1, const char * s2) NV_PURE;
     NVCORE_API bool strEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings.
+    NVCORE_API bool strCaseEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings.
 
     template <> struct Equal<const char *> {
         bool operator()(const char * a, const char * b) const { return strEqual(a, b); }
diff --git a/src/nvcore/Utils.h b/src/nvcore/Utils.h
index cef87ed..29ae96b 100644
--- a/src/nvcore/Utils.h
+++ b/src/nvcore/Utils.h
@@ -23,7 +23,7 @@
 #define NV_INT32_MAX    2147483647
 #define NV_UINT32_MAX   0xffffffff
 #define NV_INT64_MAX    POSH_I64(9223372036854775807)
-#define NV_INT64_MIN    (-POSH_I64(9223372036854775808))
+#define NV_INT64_MIN    (-POSH_I64(9223372036854775807)-1)
 #define NV_UINT64_MAX   POSH_U64(0xffffffffffffffff)
 
 #define NV_HALF_MAX     65504.0F
diff --git a/src/nvcore/nvcore.h b/src/nvcore/nvcore.h
index 1681550..b02d5c4 100644
--- a/src/nvcore/nvcore.h
+++ b/src/nvcore/nvcore.h
@@ -31,12 +31,16 @@
 // NV_OS_UNIX
 // NV_OS_DARWIN
 // NV_OS_XBOX
+// NV_OS_ORBIS
+// NV_OS_IOS
 
 #define NV_OS_STRING POSH_OS_STRING
 
 #if defined POSH_OS_LINUX
 #   define NV_OS_LINUX 1
 #   define NV_OS_UNIX 1
+#elif defined POSH_OS_ORBIS
+#   define NV_OS_ORBIS 1
 #elif defined POSH_OS_FREEBSD
 #   define NV_OS_FREEBSD 1
 #   define NV_OS_UNIX 1
@@ -51,6 +55,10 @@
 #elif defined POSH_OS_OSX
 #   define NV_OS_DARWIN 1
 #   define NV_OS_UNIX 1
+#elif defined POSH_OS_IOS
+#   define NV_OS_DARWIN 1 //ACS should we keep this on IOS?
+#   define NV_OS_UNIX 1
+#   define NV_OS_IOS 1
 #elif defined POSH_OS_UNIX
 #   define NV_OS_UNIX 1
 #elif defined POSH_OS_WIN32
@@ -63,6 +71,22 @@
 #   error "Unsupported OS"
 #endif
 
+
+// Threading:
+// some platforms don't implement __thread or similar for thread-local-storage
+#if NV_OS_UNIX || NV_OS_ORBIS || NV_OS_IOS //ACStodoIOS darwin instead of ios?
+#   define NV_OS_USE_PTHREAD 1
+#   if NV_OS_DARWIN || NV_OS_IOS
+#       define NV_OS_HAS_TLS_QUALIFIER 0
+#   else
+#       define NV_OS_HAS_TLS_QUALIFIER 1
+#   endif
+#else
+#   define NV_OS_USE_PTHREAD 0
+#   define NV_OS_HAS_TLS_QUALIFIER 1
+#endif
+
+
 // CPUs:
 // NV_CPU_X86
 // NV_CPU_X86_64
@@ -182,7 +206,7 @@ typedef uint32      uint;
 #endif
 
 #if __cplusplus > 199711L
-#define nvStaticCheck(x) static_assert(x)
+#define nvStaticCheck(x) static_assert(x, "Static assert "#x" failed")
 #else
 #define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x)]
 #endif
diff --git a/src/nvimage/DirectDrawSurface.cpp b/src/nvimage/DirectDrawSurface.cpp
index 0ca3d8a..002fd98 100644
--- a/src/nvimage/DirectDrawSurface.cpp
+++ b/src/nvimage/DirectDrawSurface.cpp
@@ -343,6 +343,7 @@ namespace
             case DXGI_FORMAT_B8G8R8X8_TYPELESS:
             case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
                 return 8*4;
+                
             default:
                 return 0;
         }
@@ -452,8 +453,9 @@ namespace
         { D3DFMT_A2R10G10B10,    32, 0x3FF00000, 0xFFC00,    0x3FF,      0xC0000000 },
         { D3DFMT_A2B10G10R10,    32, 0x3FF,      0xFFC00,    0x3FF00000, 0xC0000000 },
 
-        { D3DFMT_L8,             8,  8,          0,          0,          0 },           // DXGI_FORMAT_R8_UNORM 
-        { D3DFMT_L16,            16, 16,         0,          0,          0 },           // DXGI_FORMAT_R16_UNORM
+        { D3DFMT_L8,             8,  0xFF,       0,          0,          0 },           // DXGI_FORMAT_R8_UNORM 
+        { D3DFMT_L16,            16, 0xFFFF,     0,          0,          0 },           // DXGI_FORMAT_R16_UNORM
+        { D3DFMT_A8L8,           16, 0xFF,       0,          0,     0xFF00 },           // DXGI_FORMAT_R8G8_UNORM?
     };
 
     static const uint s_d3d9FormatCount = NV_ARRAY_SIZE(s_d3d9Formats);
diff --git a/src/nvimage/ErrorMetric.cpp b/src/nvimage/ErrorMetric.cpp
index 8c14575..6d14b7a 100644
--- a/src/nvimage/ErrorMetric.cpp
+++ b/src/nvimage/ErrorMetric.cpp
@@ -186,8 +186,6 @@ static float f(float t)
 
 static float finv(float t)
 {
-    const float epsilon = powf(6.0f/29.0f, 3);
-
     if (t > 6.0f / 29.0f) {
         return powf(t, 3.0f);
     }
diff --git a/src/nvimage/FloatImage.cpp b/src/nvimage/FloatImage.cpp
index 7f591e6..51acc98 100644
--- a/src/nvimage/FloatImage.cpp
+++ b/src/nvimage/FloatImage.cpp
@@ -344,15 +344,15 @@ float FloatImage::sampleLinear(uint c, float x, float y, float z, WrapMode wm) c
 
 float FloatImage::sampleNearestClamp(uint c, float x, float y) const
 {
-    int ix = ::clamp(iround(x * m_width), 0, m_width-1);
-    int iy = ::clamp(iround(y * m_height), 0, m_height-1);
+    int ix = wrapClamp(iround(x * m_width), m_width);
+    int iy = wrapClamp(iround(y * m_height), m_height);
     return pixel(c, ix, iy, 0);
 }
 
 float FloatImage::sampleNearestRepeat(uint c, float x, float y) const
 {
-    int ix = iround(frac(x) * m_width);
-    int iy = iround(frac(y) * m_height);
+    int ix = wrapRepeat(iround(x * m_width), m_width);
+    int iy = wrapRepeat(iround(y * m_height), m_height);
     return pixel(c, ix, iy, 0);
 }
 
@@ -373,9 +373,9 @@ float FloatImage::sampleNearestClamp(uint c, float x, float y, float z) const
 
 float FloatImage::sampleNearestRepeat(uint c, float x, float y, float z) const
 {
-    int ix = iround(frac(x) * m_width);     // wrapRepeat(iround(x * m_width), m_width)
-    int iy = iround(frac(y) * m_height);    // wrapRepeat(iround(y * m_height), m_height)
-    int iz = iround(frac(z) * m_depth);     // wrapRepeat(iround(z * m_depth), m_depth)
+    int ix = wrapRepeat(iround(x * m_width), m_width);
+    int iy = wrapRepeat(iround(y * m_height), m_height);
+    int iz = wrapRepeat(iround(z * m_depth), m_depth);
     return pixel(c, ix, iy, iz);
 }
 
@@ -1326,7 +1326,7 @@ void FloatImage::flipZ()
     const uint d2 = d / 2;
 
     for (uint c = 0; c < m_componentCount; c++) {
-        for (uint z = 0; z < d/2; z++) {
+        for (uint z = 0; z < d2; z++) {
             float * src = plane(c, z);
             float * dst = plane(c, d - 1 - z);
             for (uint i = 0; i < w*h; i++) {
@@ -1345,9 +1345,9 @@ float FloatImage::alphaTestCoverage(float alphaRef, int alphaChannel, float alph
 
     float coverage = 0.0f;
 
+#if 0
     const float * alpha = channel(alphaChannel);
 
-#if 0
     const uint count = m_pixelCount;
     for (uint i = 0; i < count; i++) {
         if (alpha[i] > alphaRef) coverage += 1.0f; // @@ gt or lt?
@@ -1435,7 +1435,7 @@ void FloatImage::scaleAlphaToCoverage(float desiredCoverage, float alphaRef, int
     clamp(alphaChannel, 1, 0.0f, 1.0f); 
 #endif
 #if _DEBUG
-    float newCoverage = alphaTestCoverage(alphaRef, alphaChannel);
+    alphaTestCoverage(alphaRef, alphaChannel);
 #endif
 }
 
diff --git a/src/nvimage/NormalMap.cpp b/src/nvimage/NormalMap.cpp
index 404186d..2db99a7 100644
--- a/src/nvimage/NormalMap.cpp
+++ b/src/nvimage/NormalMap.cpp
@@ -199,10 +199,6 @@ void nv::normalizeNormalMap(FloatImage * img)
 {
     nvDebugCheck(img != NULL);
 
-#pragma NV_MESSAGE("TODO: Pack and expand normals explicitly?")
-
-    img->expandNormals(0);
     img->normalize(0);
-    img->packNormals(0);
 }
 
diff --git a/src/nvmath/Color.h b/src/nvmath/Color.h
index 7324723..055395b 100644
--- a/src/nvmath/Color.h
+++ b/src/nvmath/Color.h
@@ -118,6 +118,32 @@ namespace nv
         };
     };
 
+    /// 16 bit 4444 BGRA color.
+    class NVMATH_CLASS Color16_4444
+    {
+    public:
+        Color16_4444() { }
+        Color16_4444(const Color16_4444 & c) : u(c.u) { }
+        explicit Color16_4444(uint16 U) : u(U) { }
+
+        union {
+            struct {
+#if NV_LITTLE_ENDIAN
+                uint16 b : 4;
+                uint16 g : 4;
+                uint16 r : 4;
+                uint16 a : 4;
+#else
+                uint16 a : 4;
+                uint16 r : 4;
+                uint16 g : 4;
+                uint16 b : 4;
+#endif
+            };
+            uint16 u;
+        };
+    };
+
 } // nv namespace
 
 #endif // NV_MATH_COLOR_H
diff --git a/src/nvmath/Color.inl b/src/nvmath/Color.inl
index a5dfcb2..84ddc59 100644
--- a/src/nvmath/Color.inl
+++ b/src/nvmath/Color.inl
@@ -10,6 +10,12 @@
 
 namespace nv
 {
+    // for Color16 & Color16_4444 bitfields
+    NV_FORCEINLINE uint32 U32round(float f) { return uint32(floorf(f + 0.5f)); }
+    NV_FORCEINLINE uint16 U16round(float f) { return uint16(floorf(f + 0.5f)); }
+    NV_FORCEINLINE uint16 toU4_in_U16(int x) { nvDebugCheck(x >= 0 && x <= 15u); return (uint16)x; }
+    NV_FORCEINLINE uint16 toU5_in_U16(int x) { nvDebugCheck(x >= 0 && x <= 31u); return (uint16)x; }
+    NV_FORCEINLINE uint16 toU6_in_U16(int x) { nvDebugCheck(x >= 0 && x <= 63u); return (uint16)x; }
 
     // Clamp color components.
     inline Vector3 colorClamp(Vector3::Arg c)
@@ -27,6 +33,16 @@ namespace nv
         return c / scale;
     }
 
+    // Convert Color16 from float components
+    inline Color16 toColor16(float r, float g, float b)
+    {
+        Color16 color; // 5,6,5
+        color.r = toU5_in_U16(nv::U16round(saturate(r) * 31u));
+        color.g = toU6_in_U16(nv::U16round(saturate(g) * 63u));
+        color.b = toU5_in_U16(nv::U16round(saturate(b) * 31u));
+        return color;
+    }
+
     // Convert Color32 to Color16.
     inline Color16 toColor16(Color32 c)
     {
@@ -43,6 +59,49 @@ namespace nv
         return color; 
     }
 
+    // Convert Color32 to Color16_4444.
+    inline Color16_4444 toColor16_4444(Color32 c)
+    {
+        Color16_4444 color;
+        color.a = c.a >> 4;
+        color.r = c.r >> 4;
+        color.g = c.g >> 4;
+        color.b = c.b >> 4;
+        return color; 
+    }
+
+    // Convert four float components (r, g, b, a) to Color16_4444.
+    inline Color16_4444 toColor16_4444(float r, float g, float b, float a)
+    {
+        Color16_4444 color;
+        color.a = toU4_in_U16(nv::U16round(saturate(a) * 15u));
+        color.r = toU4_in_U16(nv::U16round(saturate(r) * 15u));
+        color.g = toU4_in_U16(nv::U16round(saturate(g) * 15u));
+        color.b = toU4_in_U16(nv::U16round(saturate(b) * 15u));
+        return color;
+    }
+
+    // Convert float[4] in ARGB order to Color16_4444.
+    inline Color16_4444 toColor16_4444_from_argb(float * fc)
+    {
+        Color16_4444 color;
+        color.a = toU4_in_U16(nv::U16round(saturate(fc[0]) * 15u));
+        color.r = toU4_in_U16(nv::U16round(saturate(fc[1]) * 15u));
+        color.g = toU4_in_U16(nv::U16round(saturate(fc[2]) * 15u));
+        color.b = toU4_in_U16(nv::U16round(saturate(fc[3]) * 15u));
+        return color;
+    }
+
+    // Convert float[4] in BGRA order to Color16_4444.
+    inline Color16_4444 toColor16_4444_from_bgra(float * fc)
+    {
+        Color16_4444 color;
+        color.b = toU4_in_U16(nv::U16round(saturate(fc[0]) * 15u));
+        color.g = toU4_in_U16(nv::U16round(saturate(fc[1]) * 15u));
+        color.r = toU4_in_U16(nv::U16round(saturate(fc[2]) * 15u));
+        color.a = toU4_in_U16(nv::U16round(saturate(fc[3]) * 15u));
+        return color;
+    }
 
     // Promote 16 bit color to 32 bit using regular bit expansion.
     inline Color32 toColor32(Color16 c)
@@ -60,13 +119,34 @@ namespace nv
         return color;
     }
 
-    inline Color32 toColor32(Vector4::Arg v)
+    // @@ Quantize with exact endpoints or with uniform bins?
+    inline Color32 toColor32(const Vector4 & v)
     {
         Color32 color;
-        color.r = uint8(saturate(v.x) * 255);
-        color.g = uint8(saturate(v.y) * 255);
-        color.b = uint8(saturate(v.z) * 255);
-        color.a = uint8(saturate(v.w) * 255);
+        color.r = toU8(nv::iround(saturate(v.x) * 255));
+        color.g = toU8(nv::iround(saturate(v.y) * 255));
+        color.b = toU8(nv::iround(saturate(v.z) * 255));
+        color.a = toU8(nv::iround(saturate(v.w) * 255));
+        return color;
+    }
+
+    inline Color32 toColor32_from_bgra(const Vector4 & v)
+    {
+        Color32 color;
+        color.b = toU8(nv::iround(saturate(v.x) * 255));
+        color.g = toU8(nv::iround(saturate(v.y) * 255));
+        color.r = toU8(nv::iround(saturate(v.z) * 255));
+        color.a = toU8(nv::iround(saturate(v.w) * 255));
+        return color;
+    }
+
+    inline Color32 toColor32_from_argb(const Vector4 & v)
+    {
+        Color32 color;
+        color.a = toU8(nv::iround(saturate(v.x) * 255));
+        color.r = toU8(nv::iround(saturate(v.y) * 255));
+        color.g = toU8(nv::iround(saturate(v.z) * 255));
+        color.b = toU8(nv::iround(saturate(v.w) * 255));
         return color;
     }
 
@@ -92,6 +172,30 @@ namespace nv
         return h;
     }
 
+    inline float toSrgb(float f) {
+        if (nv::isNan(f))           f = 0.0f;
+        else if (f <= 0.0f)         f = 0.0f;
+        else if (f <= 0.0031308f)   f = 12.92f * f;
+        else if (f <= 1.0f)         f = (powf(f, 0.41666f) * 1.055f) - 0.055f;
+        else                        f = 1.0f;
+        return f;
+    }
+
+    inline float fromSrgb(float f) {
+        if (f < 0.0f)           f = 0.0f;
+        else if (f < 0.04045f)  f = f / 12.92f;
+        else if (f <= 1.0f)     f = powf((f + 0.055f) / 1.055f, 2.4f);
+        else                    f = 1.0f;
+        return f;
+    }
+
+    inline Vector3 toSrgb(const Vector3 & v) {
+        return Vector3(toSrgb(v.x), toSrgb(v.y), toSrgb(v.z));
+    }
+
+    inline Vector3 fromSrgb(const Vector3 & v) {
+        return Vector3(fromSrgb(v.x), fromSrgb(v.y), fromSrgb(v.z));
+    }
 
 } // nv namespace
 
diff --git a/src/nvmath/Fitting.cpp b/src/nvmath/Fitting.cpp
index ff170d4..5b43ede 100644
--- a/src/nvmath/Fitting.cpp
+++ b/src/nvmath/Fitting.cpp
@@ -179,7 +179,7 @@ bool nv::Fit::isPlanar(int n, const Vector3 * points, float epsilon/*=NV_EPSILON
 {
     // compute the centroid and covariance
     float matrix[6];
-    Vector3 centroid = computeCovariance(n, points, matrix);
+    computeCovariance(n, points, matrix);
 
     float eigenValues[3];
     Vector3 eigenVectors[3];
diff --git a/src/nvmath/Fitting.h b/src/nvmath/Fitting.h
index cf7bcdc..a99c4ac 100644
--- a/src/nvmath/Fitting.h
+++ b/src/nvmath/Fitting.h
@@ -9,9 +9,6 @@
 
 namespace nv
 {
-    class Vector3;
-    class Plane;
-
     namespace Fit
     {
         Vector3 computeCentroid(int n, const Vector3 * points);
diff --git a/src/nvmath/Half.cpp b/src/nvmath/Half.cpp
index 52c4bf4..512b5d3 100644
--- a/src/nvmath/Half.cpp
+++ b/src/nvmath/Half.cpp
@@ -74,14 +74,8 @@
 //
 
 #include "Half.h"
-
-#include "nvcore/Memory.h"
-
 #include <stdio.h>
 
-#if NV_CC_GNUC
-#include <xmmintrin.h>
-#endif
 
 // Load immediate
 static inline uint32 _uint32_li( uint32 a )
@@ -495,9 +489,20 @@ nv::half_to_float( uint16 h )
 }
 
 
+#if !NV_OS_IOS //ACStodoIOS some better define to choose this?
+
+#if NV_CC_GNUC
+#if defined(__i386__) || defined(__x86_64__)
+#include <xmmintrin.h>
+#endif
+#endif
+
+#include "nvcore/Memory.h" // NV_ALIGN_16
+
 static __m128 half_to_float4_SSE2(__m128i h)
 {
 #define SSE_CONST4(name, val) static const NV_ALIGN_16 uint name[4] = { (val), (val), (val), (val) }
+    
 #define CONST(name) *(const __m128i *)&name
 
     SSE_CONST4(mask_nosign,         0x7fff);
@@ -541,7 +546,7 @@ static __m128 half_to_float4_SSE2(__m128i h)
 }
 
 
-void nv::half_to_float_array(const uint16 * vin, float * vout, int count) {
+void nv::half_to_float_array_SSE2(const uint16 * vin, float * vout, int count) {
     nvDebugCheck((intptr_t(vin) & 15) == 0);
     nvDebugCheck((intptr_t(vout) & 15) == 0);
     nvDebugCheck((count & 7) == 0);
@@ -562,7 +567,7 @@ void nv::half_to_float_array(const uint16 * vin, float * vout, int count) {
     }
 }
 
-
+#endif 
 
 
 // @@ These tables could be smaller.
@@ -769,4 +774,4 @@ static inline uint16_t float_to_half_nobranch(uint32_t x)
     bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
     return bits;
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/src/nvmath/Half.h b/src/nvmath/Half.h
index 962767a..9027618 100644
--- a/src/nvmath/Half.h
+++ b/src/nvmath/Half.h
@@ -10,7 +10,8 @@ namespace nv {
     uint16 half_from_float( uint32 f );
 
     // vin,vout must be 16 byte aligned. count must be a multiple of 8.
-    void half_to_float_array(const uint16 * vin, float * vout, int count);
+    // Implement a non-SSE version if we need it. For now, this naming makes it clear that this is only available when SSE2 is supported.
+    void half_to_float_array_SSE2(const uint16 * vin, float * vout, int count);
 
     void half_init_tables();
 
@@ -40,6 +41,51 @@ namespace nv {
         return f.f;
     }
 
+
+    union Half {
+        uint16 raw;
+        struct {
+        #if NV_BIG_ENDIAN
+            uint negative:1;
+            uint biasedexponent:5;
+            uint mantissa:10;
+        #else
+            uint mantissa:10;
+            uint biasedexponent:5;
+            uint negative:1;
+        #endif
+        } field;
+    };
+
+
+    inline float TestHalfPrecisionAwayFromZero(float input)
+    {
+        Half h;
+        h.raw = to_half(input);
+        h.raw += 1;
+
+        float f = to_float(h.raw);
+        
+        // Subtract the initial value to find our precision
+        float delta = f - input;
+
+        return delta;
+    }
+     
+    inline float TestHalfPrecisionTowardsZero(float input)
+    {
+        Half h;
+        h.raw = to_half(input);
+        h.raw -= 1;
+
+        float f = to_float(h.raw);
+
+        // Subtract the initial value to find our precision
+        float delta = f - input;
+
+        return -delta;
+    }
+
 } // nv namespace
 
 #endif // NV_MATH_HALF_H
diff --git a/src/nvmath/Vector.h b/src/nvmath/Vector.h
index ef09b86..231d3b9 100644
--- a/src/nvmath/Vector.h
+++ b/src/nvmath/Vector.h
@@ -18,7 +18,8 @@ namespace nv
         Vector2(float x, float y);
         Vector2(Vector2::Arg v);
 
-        template <typename T> operator T() const { return T(x, y); }
+        //template <typename T> explicit Vector2(const T & v) : x(v.x), y(v.y) {}
+        //template <typename T> operator T() const { return T(x, y); }
 
         const Vector2 & operator=(Vector2::Arg v);
 
@@ -50,11 +51,13 @@ namespace nv
 
         Vector3();
         explicit Vector3(float x);
+        //explicit Vector3(int x) : x(float(x)), y(float(x)), z(float(x)) {}
         Vector3(float x, float y, float z);
         Vector3(Vector2::Arg v, float z);
         Vector3(Vector3::Arg v);
 
-        template <typename T> operator T() const { return T(x, y, z); }
+        //template <typename T> explicit Vector3(const T & v) : x(v.x), y(v.y), z(v.z) {}
+        //template <typename T> operator T() const { return T(x, y, z); }
 
         const Vector3 & operator=(Vector3::Arg v);
 
@@ -96,7 +99,8 @@ namespace nv
         Vector4(Vector4::Arg v);
         //	Vector4(const Quaternion & v);
 
-        template <typename T> operator T() const { return T(x, y, z, w); }
+        //template <typename T> explicit Vector4(const T & v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
+        //template <typename T> operator T() const { return T(x, y, z, w); }
 
         const Vector4 & operator=(Vector4::Arg v);
 
@@ -127,4 +131,16 @@ namespace nv
 
 } // nv namespace
 
+// If we had these functions, they would be ambiguous, the compiler would not know which one to pick:
+//template <typename T> Vector2 to(const T & v) { return Vector2(v.x, v.y); }
+//template <typename T> Vector3 to(const T & v) { return Vector3(v.x, v.y, v.z); }
+//template <typename T> Vector4 to(const T & v) { return Vector4(v.x, v.y, v.z, v.w); }
+
+// We could use a cast operator so that we could infer the expected type, but that doesn't work the same way in all compilers and produces horrible error messages.
+
+// Instead we simply have explicit casts:
+template <typename T> T to(const nv::Vector2 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector2)); return T(v.x, v.y); }
+template <typename T> T to(const nv::Vector3 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector3)); return T(v.x, v.y, v.z); }
+template <typename T> T to(const nv::Vector4 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4)); return T(v.x, v.y, v.z, v.w); }
+
 #endif // NV_MATH_VECTOR_H
diff --git a/src/nvmath/Vector.inl b/src/nvmath/Vector.inl
index 996f764..6f26262 100644
--- a/src/nvmath/Vector.inl
+++ b/src/nvmath/Vector.inl
@@ -336,6 +336,11 @@ namespace nv
         return sqrtf(lengthSquared(v));
     }
 
+    inline float distance(Vector2::Arg a, Vector2::Arg b)
+    {
+        return length(a - b);
+    }
+
     inline float inverseLength(Vector2::Arg v)
     {
         return 1.0f / sqrtf(lengthSquared(v));
@@ -784,6 +789,90 @@ namespace nv
         return sdbmFloatHash(v.component, 4, h);
     }
 
+
+#if NV_OS_IOS // LLVM is not happy with implicit conversion of immediate constants to float
+
+    //int:
+
+    inline Vector2 scale(Vector2::Arg v, int s)
+    {
+        return Vector2(v.x * s, v.y * s);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator*(int s, Vector2::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator/(Vector2::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, int s)
+    {
+        return Vector3(v.x * s, v.y * s, v.z * s);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(int s, Vector3::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, int s)
+    {
+        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(int s, Vector4::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator/(Vector4::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    //double:
+
+    inline Vector3 operator*(Vector3::Arg v, double s)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator*(double s, Vector3::Arg v)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, double s)
+    {
+        return scale(v, 1.f/((float)s));
+    }    
+        
+#endif //NV_OS_IOS
+
 } // nv namespace
 
 #endif // NV_MATH_VECTOR_INL
diff --git a/src/nvmath/nvmath.h b/src/nvmath/nvmath.h
index 91d9c43..4c455e8 100644
--- a/src/nvmath/nvmath.h
+++ b/src/nvmath/nvmath.h
@@ -132,7 +132,7 @@ namespace nv
     {
 #if NV_OS_WIN32 || NV_OS_XBOX
         return _finite(f) != 0;
-#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD
+#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD || NV_OS_ORBIS
         return isfinite(f);
 #elif NV_OS_LINUX
         return finitef(f);
@@ -147,7 +147,7 @@ namespace nv
     {
 #if NV_OS_WIN32 || NV_OS_XBOX
         return _isnan(f) != 0;
-#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD
+#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD || NV_OS_ORBIS
         return isnan(f);
 #elif NV_OS_LINUX
         return isnanf(f);
@@ -242,21 +242,18 @@ namespace nv
 
     // I'm always confused about which quantizer to use. I think we should choose a quantizer based on how the values are expanded later and this is generally using the 'exact endpoints' rule.
 
-    // Quantize a [0, 1] full precision float, using exact endpoints.
-    inline float quantizeFloat(float f, uint bits) {
+    // Quantize a float in the [0,1] range, using exact end points or uniform bins.
+    inline float quantizeFloat(float x, uint bits, bool exactEndPoints = true) {
         nvDebugCheck(bits <= 16);
-        float scale = float((1 << bits) - 1);
-        float offset = 0.0f;
-        return floor(saturate(f) * scale + offset) / scale;
-    }
 
-    // Quantize a [0, 1] full precision float, using uniform bins.
-    /*inline float quantizeFloat(float f, uint bits) {
-        nvDebugCheck(bits <= 16);
-        float scale = float(1 << bits);
-        float offset = 0.5f;
-        return floor(saturate(f) * scale + offset) / scale;
-    }*/
+        float range = float(1 << bits);
+        if (exactEndPoints) {
+            return floorf(x * (range-1) + 0.5f) / (range-1);
+        }
+        else {
+            return (floorf(x * range) + 0.5f) / range;
+        }
+    }
 
     union Float754 {
         unsigned int raw;
diff --git a/src/nvthread/Atomic.h b/src/nvthread/Atomic.h
index 0c16f8c..ece44b5 100644
--- a/src/nvthread/Atomic.h
+++ b/src/nvthread/Atomic.h
@@ -33,9 +33,15 @@ extern "C"
 #if NV_CC_CLANG && POSH_CPU_STRONGARM
 // LLVM/Clang do not yet have functioning atomics as of 2.1
 // #include <atomic>
-
 #endif
 
+//ACS: need this if we want to use Apple's atomics.
+/*
+#if NV_OS_IOS || NV_OS_DARWIN
+// for iOS & OSX we use apple's atomics
+#include "libkern/OSAtomic.h"
+#endif
+*/
 
 namespace nv {
 
@@ -72,8 +78,9 @@ namespace nv {
         nvDebugCheck((intptr_t(&value) & 3) == 0);
 
 #if POSH_CPU_X86 || POSH_CPU_X86_64
-        *ptr = value;   // on x86, stores are Release
         nvCompilerWriteBarrier();
+        *ptr = value;   // on x86, stores are Release
+        //nvCompilerWriteBarrier(); // @@ IC: Where does this barrier go? In nvtt it was after, in Witness before. Not sure which one is right.
 #elif POSH_CPU_STRONGARM
         // this is the easiest but slowest way to do this
         nvCompilerReadWriteBarrier();
@@ -114,17 +121,90 @@ namespace nv {
     inline uint32 atomicIncrement(uint32 * value)
     {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-
         return (uint32)_InterlockedIncrement((long *)value);
     }
 
     inline uint32 atomicDecrement(uint32 * value)
     {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-
         return (uint32)_InterlockedDecrement((long *)value);
     }
+
+    // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
+    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
+    // @@ Is this strong or weak? Does InterlockedCompareExchange have spurious failures?
+    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        long result = _InterlockedCompareExchange((long *)value, (long)desired, (long)expected);
+        return result == (long)expected;
+    }
+
+
+    inline uint32 atomicSwap(uint32 * value, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return (uint32)_InterlockedExchange((long *)value, (long)desired);
+    }
+
+#elif NV_CC_CLANG && (NV_OS_IOS || NV_OS_DARWIN)
+    NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long));
+
+    //ACS: Use Apple's atomics instead? I don't know if these are better in any way; there are non-barrier versions too. There's no OSAtomicSwap32 tho'
+    /*
+    inline uint32 atomicIncrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return (uint32)OSAtomicIncrement32Barrier((int32_t *)value);
+    }
     
+    inline uint32 atomicDecrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return (uint32)OSAtomicDecrement32Barrier((int32_t *)value);
+    }
+
+    // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
+    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
+    // @@ Is this strong or weak?
+    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return OSAtomicCompareAndSwap32Barrier((int32_t)expected, (int32_t)desired, (int32_t *)value);
+    }
+    */
+
+    inline uint32 atomicIncrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        return __sync_add_and_fetch(value, 1);
+    }
+
+    inline uint32 atomicDecrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        return __sync_sub_and_fetch(value, 1);
+    }
+    
+    // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
+    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
+    // @@ Is this strong or weak?
+    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return __sync_bool_compare_and_swap(value, expected, desired);
+    }
+    
+    inline uint32 atomicSwap(uint32 * value, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        // this is confusingly named: it performs no test, it always stores 'desired' and returns the previous value
+        return __sync_lock_test_and_set(value, desired);
+    }
+
+
 #elif NV_CC_CLANG && POSH_CPU_STRONGARM
     NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long));
     
@@ -183,15 +263,32 @@ namespace nv {
     {
         nvDebugCheck((intptr_t(value) & 3) == 0);
 
-        return __sync_fetch_and_add(value, 1);
+        return __sync_add_and_fetch(value, 1);
     }
 
     inline uint32 atomicDecrement(uint32 * value)
     {
         nvDebugCheck((intptr_t(value) & 3) == 0);
 
-        return __sync_fetch_and_sub(value, 1);
+        return __sync_sub_and_fetch(value, 1);
     }
+    
+    // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
+    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
+    // @@ Is this strong or weak?
+    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return __sync_bool_compare_and_swap(value, expected, desired);
+    }
+    
+    inline uint32 atomicSwap(uint32 * value, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        // this is confusingly named: it performs no test, it always stores 'desired' and returns the previous value
+        return __sync_lock_test_and_set(value, desired);
+    }
+    
 #else
 #error "Atomics not implemented."
 
diff --git a/src/nvthread/Event.cpp b/src/nvthread/Event.cpp
index d0c03b9..98a4bcc 100644
--- a/src/nvthread/Event.cpp
+++ b/src/nvthread/Event.cpp
@@ -4,7 +4,7 @@
 
 #if NV_OS_WIN32
 #include "Win32.h"
-#elif NV_OS_UNIX
+#elif NV_OS_USE_PTHREAD
 #include <pthread.h>
 #endif
 
@@ -32,19 +32,20 @@ void Event::wait() {
     WaitForSingleObject(m->handle, INFINITE);
 }
 
-#elif NV_OS_UNIX
-
-#pragma NV_MESSAGE("Implement event using pthreads!")
+#elif NV_OS_USE_PTHREAD
 
 struct Event::Private {
     pthread_cond_t pt_cond;
     pthread_mutex_t pt_mutex;
+    int count;
+    int wait_count;
 };
 
 Event::Event() : m(new Private) {
-    // pthread equivalent of auto-reset event
-    pthread_cond_init(&m->pt_cond, NULL);
+    m->count=0;
+    m->wait_count=0;
     pthread_mutex_init(&m->pt_mutex, NULL);
+    pthread_cond_init(&m->pt_cond, NULL);
 }
 
 Event::~Event() {
@@ -53,11 +54,29 @@ Event::~Event() {
 }
 
 void Event::post() {
+    pthread_mutex_lock(&m->pt_mutex);
+
+    m->count++;
+    
+    //ACS: move this after the unlock?
+    if(m->wait_count>0) {
     pthread_cond_signal(&m->pt_cond);
+    }
+    
+    pthread_mutex_unlock(&m->pt_mutex);
 }
 
 void Event::wait() {
+    pthread_mutex_lock(&m->pt_mutex);
+    
+    while(m->count==0) {
+        m->wait_count++;
     pthread_cond_wait(&m->pt_cond, &m->pt_mutex);
+        m->wait_count--;
+    }
+    m->count--;
+    
+    pthread_mutex_unlock(&m->pt_mutex);
 }
 
 #endif // NV_OS_UNIX
diff --git a/src/nvthread/Mutex.cpp b/src/nvthread/Mutex.cpp
index cb6ebfc..b657c2e 100644
--- a/src/nvthread/Mutex.cpp
+++ b/src/nvthread/Mutex.cpp
@@ -6,7 +6,7 @@
 
 #include "Win32.h"
 
-#elif NV_OS_UNIX
+#elif NV_OS_USE_PTHREAD
 
 #include <pthread.h>
 #include <errno.h> // EBUSY
@@ -48,7 +48,7 @@ void Mutex::unlock()
     LeaveCriticalSection(&m->mutex);
 }
 
-#elif NV_OS_UNIX
+#elif NV_OS_USE_PTHREAD
 
 struct Mutex::Private {
     pthread_mutex_t mutex;
diff --git a/src/nvthread/ParallelFor.cpp b/src/nvthread/ParallelFor.cpp
index 9632414..216c6d2 100644
--- a/src/nvthread/ParallelFor.cpp
+++ b/src/nvthread/ParallelFor.cpp
@@ -9,12 +9,7 @@
 
 using namespace nv;
 
-// @@ nvthread is only fully implemented in win32.
-#if NV_OS_WIN32
 #define ENABLE_PARALLEL_FOR 1
-#else
-#define ENABLE_PARALLEL_FOR 0
-#endif
 
 static void worker(void * arg) {
     ParallelFor * owner = (ParallelFor *)arg;
diff --git a/src/nvthread/Thread.cpp b/src/nvthread/Thread.cpp
index 441d208..6c16ad8 100644
--- a/src/nvthread/Thread.cpp
+++ b/src/nvthread/Thread.cpp
@@ -4,7 +4,7 @@
 
 #if NV_OS_WIN32
     #include "Win32.h"
-#elif NV_OS_UNIX
+#elif NV_OS_USE_PTHREAD
     #include <pthread.h>
     #include <unistd.h> // usleep
 #endif
@@ -15,7 +15,7 @@ struct Thread::Private
 {
 #if NV_OS_WIN32
     HANDLE thread;
-#elif NV_OS_UNIX
+#elif NV_OS_USE_PTHREAD
     pthread_t thread;
 #endif
 
@@ -32,7 +32,7 @@ unsigned long __stdcall threadFunc(void * arg) {
     return 0;
 }
 
-#elif NV_OS_UNIX
+#elif NV_OS_USE_PTHREAD
 
 extern "C" void * threadFunc(void * arg) {
     Thread::Private * thread = (Thread::Private *)arg;
@@ -62,7 +62,7 @@ void Thread::start(ThreadFunc * func, void * arg)
     p->thread = CreateThread(NULL, 0, threadFunc, p.ptr(), 0, NULL);
     //p->thread = (HANDLE)_beginthreadex (0, 0, threadFunc, p.ptr(), 0, NULL);     // @@ So that we can call CRT functions...
     nvDebugCheck(p->thread != NULL);
-#elif NV_OS_UNIX
+#elif NV_OS_USE_PTHREAD
     int result = pthread_create(&p->thread, NULL, threadFunc, p.ptr());
     nvDebugCheck(result == 0);
 #endif
@@ -76,7 +76,7 @@ void Thread::wait()
     BOOL ok = CloseHandle (p->thread);
     p->thread = NULL;
     nvCheck (ok);
-#elif NV_OS_UNIX
+#elif NV_OS_USE_PTHREAD
     int result = pthread_join(p->thread, NULL);
     p->thread = 0;
     nvDebugCheck(result == 0);
@@ -87,7 +87,7 @@ bool Thread::isRunning () const
 {
 #if NV_OS_WIN32
     return p->thread != NULL;
-#elif NV_OS_UNIX
+#elif NV_OS_USE_PTHREAD
     return p->thread != 0;
 #endif
 }
@@ -101,7 +101,7 @@ bool Thread::isRunning () const
 {
 #if NV_OS_WIN32
     SwitchToThread();
-#elif NV_OS_UNIX
+#elif NV_OS_USE_PTHREAD
     int result = sched_yield();
     nvDebugCheck(result == 0);
 #endif
@@ -111,7 +111,7 @@ bool Thread::isRunning () const
 {
 #if NV_OS_WIN32
     Sleep(ms);
-#elif NV_OS_UNIX
+#elif NV_OS_USE_PTHREAD
     usleep(1000 * ms);
 #endif
 }
diff --git a/src/nvthread/nvthread.cpp b/src/nvthread/nvthread.cpp
index 987e791..9de9a81 100644
--- a/src/nvthread/nvthread.cpp
+++ b/src/nvthread/nvthread.cpp
@@ -5,24 +5,24 @@
 #include "Thread.h"
 
 #if NV_OS_WIN32
-#  include "Win32.h"
+#include "Win32.h"
 #elif NV_OS_UNIX
-#  include <sys/types.h>
-#  include <sys/sysctl.h>
-#  include <unistd.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
 #elif NV_OS_DARWIN
-#  import <stdio.h>
-#  import <string.h>
-#  import <mach/mach_host.h>
-#  import <sys/sysctl.h>
+#import <stdio.h>
+#import <string.h>
+#import <mach/mach_host.h>
+#import <sys/sysctl.h>
 
-#  include <CoreFoundation/CoreFoundation.h>
+//#include <CoreFoundation/CoreFoundation.h>
 
-#  include <assert.h>
-#  include <errno.h>
-#  include <stdlib.h>
-#  include <string.h>
-#  include <syslog.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
 #endif
 
 using namespace nv;
diff --git a/src/nvtt/CompressorDXT.cpp b/src/nvtt/BlockCompressor.cpp
similarity index 90%
rename from src/nvtt/CompressorDXT.cpp
rename to src/nvtt/BlockCompressor.cpp
index 7faeb49..88ebebb 100644
--- a/src/nvtt/CompressorDXT.cpp
+++ b/src/nvtt/BlockCompressor.cpp
@@ -22,7 +22,7 @@
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 // OTHER DEALINGS IN THE SOFTWARE.
 
-#include "CompressorDXT.h"
+#include "BlockCompressor.h"
 #include "OutputOptions.h"
 #include "TaskDispatcher.h"
 
@@ -46,7 +46,7 @@ using namespace nvtt;
 #include <omp.h>
 #endif
 
-void FixedBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, const float * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+void ColorBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, const float * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
 {
     const uint bs = blockSize();
     const uint bw = (w + 3) / 4;
@@ -113,7 +113,7 @@ void FixedBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, c
 */
 
 
-struct FixedBlockCompressorContext
+struct ColorBlockCompressorContext
 {
     nvtt::AlphaMode alphaMode;
     uint w, h;
@@ -122,13 +122,13 @@ struct FixedBlockCompressorContext
 
     uint bw, bh, bs;
     uint8 * mem;
-    FixedBlockCompressor * compressor;
+    ColorBlockCompressor * compressor;
 };
 
 // Each task compresses one block.
-void FixedBlockCompressorTask(void * data, int i)
+void ColorBlockCompressorTask(void * data, int i)
 {
-    FixedBlockCompressorContext * d = (FixedBlockCompressorContext *) data;
+    ColorBlockCompressorContext * d = (ColorBlockCompressorContext *) data;
 
     uint x = i % d->bw;
     uint y = i / d->bw;
@@ -143,11 +143,11 @@ void FixedBlockCompressorTask(void * data, int i)
     }
 }
 
-void FixedBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+void ColorBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
 {
     nvDebugCheck(d == 1);
 
-    FixedBlockCompressorContext context;
+    ColorBlockCompressorContext context;
     context.alphaMode = alphaMode;
     context.w = w;
     context.h = h;
@@ -169,7 +169,7 @@ void FixedBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, u
     const uint size = context.bs * count;
     context.mem = new uint8[size];
 
-    dispatcher->dispatch(FixedBlockCompressorTask, &context, count);
+    dispatcher->dispatch(ColorBlockCompressorTask, &context, count);
 
     outputOptions.writeData(context.mem, size);
 
diff --git a/src/nvtt/CompressorDXT.h b/src/nvtt/BlockCompressor.h
similarity index 91%
rename from src/nvtt/CompressorDXT.h
rename to src/nvtt/BlockCompressor.h
index 871eb94..cc829ce 100644
--- a/src/nvtt/CompressorDXT.h
+++ b/src/nvtt/BlockCompressor.h
@@ -22,8 +22,8 @@
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 // OTHER DEALINGS IN THE SOFTWARE.
 
-#ifndef NVTT_COMPRESSORDXT_H
-#define NVTT_COMPRESSORDXT_H
+#ifndef NVTT_BLOCKCOMPRESSOR_H
+#define NVTT_BLOCKCOMPRESSOR_H
 
 #include "Compressor.h"
 
@@ -33,7 +33,7 @@ namespace nv
     struct ColorSet;
     struct ColorBlock;
 
-    struct FixedBlockCompressor : public CompressorInterface
+    struct ColorBlockCompressor : public CompressorInterface
     {
         virtual void compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * rgba, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 
@@ -52,4 +52,4 @@ namespace nv
 } // nv namespace
 
 
-#endif // NVTT_COMPRESSORDXT_H
+#endif // NVTT_BLOCKCOMPRESSOR_H
diff --git a/src/nvtt/CompressorDX10.h b/src/nvtt/CompressorDX10.h
index 355e642..5be6361 100644
--- a/src/nvtt/CompressorDX10.h
+++ b/src/nvtt/CompressorDX10.h
@@ -25,20 +25,20 @@
 #ifndef NVTT_COMPRESSORDX10_H
 #define NVTT_COMPRESSORDX10_H
 
-#include "CompressorDXT.h"
+#include "BlockCompressor.h"
 
 namespace nv
 {
 	struct ColorBlock;
 
 	// Fast CPU compressors.
-	struct FastCompressorBC4 : public FixedBlockCompressor
+	struct FastCompressorBC4 : public ColorBlockCompressor
 	{
 		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
 		virtual uint blockSize() const { return 8; }
 	};
 
-	struct FastCompressorBC5 : public FixedBlockCompressor
+	struct FastCompressorBC5 : public ColorBlockCompressor
 	{
 		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
 		virtual uint blockSize() const { return 16; }
@@ -46,13 +46,13 @@ namespace nv
 
 
 	// Production CPU compressors.
-	struct ProductionCompressorBC4 : public FixedBlockCompressor
+	struct ProductionCompressorBC4 : public ColorBlockCompressor
 	{
 		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
 		virtual uint blockSize() const { return 8; }
 	};
 
-	struct ProductionCompressorBC5 : public FixedBlockCompressor
+	struct ProductionCompressorBC5 : public ColorBlockCompressor
 	{
 		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
 		virtual uint blockSize() const { return 16; }
diff --git a/src/nvtt/CompressorDX11.h b/src/nvtt/CompressorDX11.h
index f665e3f..3dda9ea 100644
--- a/src/nvtt/CompressorDX11.h
+++ b/src/nvtt/CompressorDX11.h
@@ -24,7 +24,7 @@
 #ifndef NVTT_COMPRESSORDX11_H
 #define NVTT_COMPRESSORDX11_H
 
-#include "CompressorDXT.h"
+#include "BlockCompressor.h"
 
 namespace nv
 {
diff --git a/src/nvtt/CompressorDX9.cpp b/src/nvtt/CompressorDX9.cpp
index f5446a0..c3bee15 100644
--- a/src/nvtt/CompressorDX9.cpp
+++ b/src/nvtt/CompressorDX9.cpp
@@ -122,11 +122,7 @@ void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, co
     
     if (set.isSingleColor(/*ignoreAlpha*/true))
     {
-        Color32 c;
-        c.r = uint8(clamp(set.colors[0].x, 0.0f, 1.0f) * 255);
-        c.g = uint8(clamp(set.colors[0].y, 0.0f, 1.0f) * 255);
-        c.b = uint8(clamp(set.colors[0].z, 0.0f, 1.0f) * 255);
-        c.a = 255;
+        Color32 c = toColor32(set.colors[0]);
         OptimalCompress::compressDXT1(c, block);
     }
     else
@@ -202,7 +198,6 @@ void CompressorDXT1_Luma::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alpha
     OptimalCompress::compressDXT1_Luma(rgba, block);
 }
 
-
 void CompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
     BlockDXT3 * block = new(output) BlockDXT3;
diff --git a/src/nvtt/CompressorDX9.h b/src/nvtt/CompressorDX9.h
index 2a6e6fe..e3e830b 100644
--- a/src/nvtt/CompressorDX9.h
+++ b/src/nvtt/CompressorDX9.h
@@ -25,38 +25,38 @@
 #ifndef NVTT_COMPRESSORDX9_H
 #define NVTT_COMPRESSORDX9_H
 
-#include "CompressorDXT.h"
+#include "BlockCompressor.h"
 
 namespace nv
 {
     struct ColorBlock;
 
     // Fast CPU compressors.
-    struct FastCompressorDXT1 : public FixedBlockCompressor
+    struct FastCompressorDXT1 : public ColorBlockCompressor
     {
         virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 8; }
     };
 
-    struct FastCompressorDXT1a : public FixedBlockCompressor
+    struct FastCompressorDXT1a : public ColorBlockCompressor
     {
         virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 8; }
     };
 
-    struct FastCompressorDXT3 : public FixedBlockCompressor
+    struct FastCompressorDXT3 : public ColorBlockCompressor
     {
         virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 16; }
     };
 
-    struct FastCompressorDXT5 : public FixedBlockCompressor
+    struct FastCompressorDXT5 : public ColorBlockCompressor
     {
         virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 16; }
     };
 
-    struct FastCompressorDXT5n : public FixedBlockCompressor
+    struct FastCompressorDXT5n : public ColorBlockCompressor
     {
         virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 16; }
@@ -71,38 +71,38 @@ namespace nv
         virtual uint blockSize() const { return 8; }
     };
 #else
-    struct CompressorDXT1 : public FixedBlockCompressor
+    struct CompressorDXT1 : public ColorBlockCompressor
     {
         virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 8; }
     };
 #endif
 
-    struct CompressorDXT1a : public FixedBlockCompressor
+    struct CompressorDXT1a : public ColorBlockCompressor
     {
         virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 8; }
     };
 
-    struct CompressorDXT1_Luma : public FixedBlockCompressor
+    struct CompressorDXT1_Luma : public ColorBlockCompressor
     {
         virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 8; }
     };
 
-    struct CompressorDXT3 : public FixedBlockCompressor
+    struct CompressorDXT3 : public ColorBlockCompressor
     {
         virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 16; }
     };
 
-    struct CompressorDXT5 : public FixedBlockCompressor
+    struct CompressorDXT5 : public ColorBlockCompressor
     {
         virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 16; }
     };
 
-    struct CompressorDXT5n : public FixedBlockCompressor
+    struct CompressorDXT5n : public ColorBlockCompressor
     {
         virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 16; }
@@ -137,7 +137,7 @@ namespace nv
 #endif
 
 #if defined(HAVE_STB)
-    struct StbCompressorDXT1 : public FixedBlockCompressor
+    struct StbCompressorDXT1 : public ColorBlockCompressor
     {
         virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
         virtual uint blockSize() const { return 8; }
diff --git a/src/nvtt/CompressorRGB.cpp b/src/nvtt/CompressorRGB.cpp
index 9a52d16..f2b91b6 100644
--- a/src/nvtt/CompressorRGB.cpp
+++ b/src/nvtt/CompressorRGB.cpp
@@ -349,20 +349,23 @@ void PixelFormatConverter::compress(nvtt::AlphaMode /*alphaMode*/, uint w, uint
                 }
                 else
                 {
-                    Color32 c;
-                    if (compressionOptions.pixelType == nvtt::PixelType_UnsignedNorm) {
-                        c.r = uint8(clamp(r * 255, 0.0f, 255.0f));
-                        c.g = uint8(clamp(g * 255, 0.0f, 255.0f));
-                        c.b = uint8(clamp(b * 255, 0.0f, 255.0f));
-                        c.a = uint8(clamp(a * 255, 0.0f, 255.0f));
-                    }
+                    // We first convert to 16 bits, then to the target size. @@ If greater than 16 bits, this will truncate and bitexpand.
+                    
                     // @@ Add support for nvtt::PixelType_SignedInt, nvtt::PixelType_SignedNorm, nvtt::PixelType_UnsignedInt
 
+                    int ir, ig, ib, ia;
+                    if (compressionOptions.pixelType == nvtt::PixelType_UnsignedNorm) {
+                        ir = iround(clamp(r * 65535.0f, 0.0f, 65535.0f));
+                        ig = iround(clamp(g * 65535.0f, 0.0f, 65535.0f));
+                        ib = iround(clamp(b * 65535.0f, 0.0f, 65535.0f));
+                        ia = iround(clamp(a * 65535.0f, 0.0f, 65535.0f));
+                    }
+
                     uint p = 0;
-                    p |= PixelFormat::convert(c.r, 8, rsize) << rshift;
-                    p |= PixelFormat::convert(c.g, 8, gsize) << gshift;
-                    p |= PixelFormat::convert(c.b, 8, bsize) << bshift;
-                    p |= PixelFormat::convert(c.a, 8, asize) << ashift;
+                    p |= PixelFormat::convert(ir, 16, rsize) << rshift;
+                    p |= PixelFormat::convert(ig, 16, gsize) << gshift;
+                    p |= PixelFormat::convert(ib, 16, bsize) << bshift;
+                    p |= PixelFormat::convert(ia, 16, asize) << ashift;
 
                     stream.putBits(p, bitCount);
                 }
diff --git a/src/nvtt/CubeSurface.cpp b/src/nvtt/CubeSurface.cpp
index f906472..cb68113 100644
--- a/src/nvtt/CubeSurface.cpp
+++ b/src/nvtt/CubeSurface.cpp
@@ -429,6 +429,12 @@ void CubeSurface::range(int channel, float * minimum_ptr, float * maximum_ptr) c
     *maximum_ptr = maximum;
 }
 
+void CubeSurface::clamp(int channel, float low/*= 0.0f*/, float high/*= 1.0f*/) {
+    for (int f = 0; f < 6; f++) {
+        m->face[f].clamp(channel, low, high);
+    }
+}
+
 
 
 #include "nvmath/SphericalHarmonic.h"
@@ -470,13 +476,114 @@ CubeSurface CubeSurface::irradianceFilter(int size, EdgeFixup fixupMethod) const
 }
 
 
-// Warp uv coordinate from [-1, 1] to
-/*float warp(float u, int size) {
-
-}*/
 
 
+// Convolve filter against this cube.
+Vector3 CubeSurface::Private::applyAngularFilter(const Vector3 & filterDir, float coneAngle, float * filterTable, int tableSize)
+{
+    const float cosineConeAngle = cos(coneAngle);
+    nvDebugCheck(cosineConeAngle >= 0);
 
+    Vector3 color(0);
+    float sum = 0;
+
+    // Things I have tried to speed this up:
+    // - Compute accurate bounds assuming cone axis aligned to plane, result was too small elsewhere.
+    // - Compute ellipse that results in the cone/plane intersection and compute its bounds. Sometimes intersection is a parabola, hard to handle that case.
+    // - Compute the 6 axis aligned planes that bound the cone, clip faces against planes. Resulting plane equations are way too complex.
+
+    // What AMD CubeMapGen does:
+    // - Compute conservative bounds on the primary face, wrap around the adjacent faces.
+
+
+    // For each texel of the input cube.
+    for (uint f = 0; f < 6; f++) {
+
+        // Test face cone against filter cone.
+        float cosineFaceAngle = dot(filterDir, faceNormals[f]);
+        float faceAngle = acosf(cosineFaceAngle);
+
+        if (faceAngle > coneAngle + atanf(sqrtf(2))) {
+            // Skip face.
+            continue;
+        }
+
+        const int L = toI32(edgeLength-1);
+        int x0 = 0, x1 = L;
+        int y0 = 0, y1 = L;
+
+#if 0
+        float u0 = -1;
+        float u1 = 1;
+        float v0 = -1;
+        float v1 = 1;
+
+        // @@ Compute uvs.
+
+        // Expand uv coordinates from [-1,1] to [0, edgeLength)
+        u0 = (u0 + 1) * edgeLength * 0.5f - 0.5f;
+        v0 = (v0 + 1) * edgeLength * 0.5f - 0.5f;
+        u1 = (u1 + 1) * edgeLength * 0.5f - 0.5f;
+        v1 = (v1 + 1) * edgeLength * 0.5f - 0.5f;
+        nvDebugCheck(u0 >= -0.5f && u0 <= edgeLength - 0.5f);
+        nvDebugCheck(v0 >= -0.5f && v0 <= edgeLength - 0.5f);
+        nvDebugCheck(u1 >= -0.5f && u1 <= edgeLength - 0.5f);
+        nvDebugCheck(v1 >= -0.5f && v1 <= edgeLength - 0.5f);
+
+        x0 = clamp(ifloor(u0), 0, L);
+        y0 = clamp(ifloor(v0), 0, L);
+        x1 = clamp(iceil(u1), 0, L);
+        y1 = clamp(iceil(v1), 0, L);
+#endif
+
+        nvDebugCheck(x1 >= x0);
+        nvDebugCheck(y1 >= y0);
+
+        if (x1 == x0 || y1 == y0) {
+            // Skip this face.
+            continue;
+        }
+
+
+        const Surface & inputFace = face[f];
+        const FloatImage * inputImage = inputFace.m->image;
+
+        for (int y = y0; y <= y1; y++) {
+            bool inside = false;
+            for (int x = x0; x <= x1; x++) {
+
+                Vector3 dir = texelTable->direction(f, x, y);
+                float cosineAngle = dot(dir, filterDir);
+
+                if (cosineAngle > cosineConeAngle) {
+                    float solidAngle = texelTable->solidAngle(f, x, y);
+                    //float scale = powf(saturate(cosineAngle), cosinePower);
+                    
+                    int idx = int(saturate(cosineAngle) * (tableSize - 1));
+                    float scale = filterTable[idx]; // @@ Do bilinear interpolation?
+
+                    float contribution = solidAngle * scale;
+
+                    sum += contribution;
+                    color.x += contribution * inputImage->pixel(0, x, y, 0);
+                    color.y += contribution * inputImage->pixel(1, x, y, 0);
+                    color.z += contribution * inputImage->pixel(2, x, y, 0);
+
+                    inside = true;
+                }
+                else if (inside) {
+                    // Filter scale is monotonic, if we have been inside once and we just exit, then we can skip the rest of the row.
+                    // We could do the same thing for the columns and skip entire rows.
+                    break;
+                }
+            }
+        }
+    }
+
+    color *= (1.0f / sum);
+
+    return color;
+}
 
 // We want to find the alpha such that:
 // cos(alpha)^cosinePower = epsilon
@@ -491,6 +598,7 @@ CubeSurface CubeSurface::irradianceFilter(int size, EdgeFixup fixupMethod) const
 // - parallelize. Done.
 // - use ISPC?
 
+
 // Convolve filter against this cube.
 Vector3 CubeSurface::Private::applyCosinePowerFilter(const Vector3 & filterDir, float coneAngle, float cosinePower)
 {
@@ -500,6 +608,15 @@ Vector3 CubeSurface::Private::applyCosinePowerFilter(const Vector3 & filterDir,
     Vector3 color(0);
     float sum = 0;
 
+    // Things I have tried to speed this up:
+    // - Compute accurate bounds assuming cone axis aligned to plane, result was too small elsewhere.
+    // - Compute ellipse that results in the cone/plane intersection and compute its bounds. Sometimes intersection is a parabola, hard to handle that case.
+    // - Compute the 6 axis aligned planes that bound the cone, clip faces against planes. Resulting plane equations are way too complex.
+
+    // What AMD CubeMapGen does:
+    // - Compute conservative bounds on the primary face, wrap around the adjacent faces.
+
+
     // For each texel of the input cube.
     for (uint f = 0; f < 6; f++) {
 
@@ -512,163 +629,36 @@ Vector3 CubeSurface::Private::applyCosinePowerFilter(const Vector3 & filterDir,
             continue;
         }
 
-        // @@ We could do a less conservative test and test the face frustum against the cone...
-        // Or maybe easier: the face quad against the cone.
-
-        // Compute bounding box of cone intersection against face.
-        // The intersection of the cone with the face is an elipse, we want the extents of that elipse.
-        // @@ Hmm... we could even rasterize an elipse! Sounds like FUN!
-
         const int L = toI32(edgeLength-1);
         int x0 = 0, x1 = L;
         int y0 = 0, y1 = L;
 
-        if (false) {
-            // New approach?
+#if 0
+        float u0 = -1;
+        float u1 = 1;
+        float v0 = -1;
+        float v1 = 1;
 
-            // For each face, we are looking for 4 planes that bound the cone.
+        // @@ Compute uvs.
 
-            // All planes go through the origin.
-            // Plane fully determined by its normal.
-            // We only care about planes aligned to one axis. So, for the XY face, we have 4 planes:
+        // Expand uv coordinates from [-1,1] to [0, edgeLength)
+        u0 = (u0 + 1) * edgeLength * 0.5f - 0.5f;
+        v0 = (v0 + 1) * edgeLength * 0.5f - 0.5f;
+        u1 = (u1 + 1) * edgeLength * 0.5f - 0.5f;
+        v1 = (v1 + 1) * edgeLength * 0.5f - 0.5f;
+        nvDebugCheck(u0 >= -0.5f && u0 <= edgeLength - 0.5f);
+        nvDebugCheck(v0 >= -0.5f && v0 <= edgeLength - 0.5f);
+        nvDebugCheck(u1 >= -0.5f && u1 <= edgeLength - 0.5f);
+        nvDebugCheck(v1 >= -0.5f && v1 <= edgeLength - 0.5f);
 
-            // Plane goes through origin.
-            // Plane normal is unit length.
-
-            // Plane must be tangent to cone ->
-            //  angle between plane normal and cone axis is 90 - cone angle & 90 + cone angle
-            //  dot(N, D) == cos(90 - cone angle)
-            //  dot(N, D) == cos(90 + cone angle)
-
-            // Plane must contain face UV axis
-
-            // Find the 4 planes and how they intersect the unit face, which gives us (u0,v0, u1,v1).
-
-            // Expand uv coordinates, clamp to
-        }
-
-        // @@ Ugh. This is wrong, or only right when filterDir is aligned to one axis.
-        if (false) {
-            // uv coordinates corresponding to filterDir.
-            //float u = dot(filterDir, faceU[f]) / cosineFaceAngle;
-            //float v = dot(filterDir, faceV[f]) / cosineFaceAngle;
-
-            // Angular coordinates corresponding to filterDir with respect to faceNormal.
-            float atu = atan2(dot(filterDir, faceU[f]), cosineFaceAngle);
-            float atv = atan2(dot(filterDir, faceV[f]), cosineFaceAngle);
-
-            // Expand angles and project back to the face plane.
-            float u0 = tan(clamp(atu - coneAngle, -PI/4, PI/4));
-            float v0 = tan(clamp(atv - coneAngle, -PI/4, PI/4));
-            float u1 = tan(clamp(atu + coneAngle, -PI/4, PI/4));
-            float v1 = tan(clamp(atv + coneAngle, -PI/4, PI/4));
-            nvDebugCheck(u0 >= -1 && u0 <= 1);
-            nvDebugCheck(v0 >= -1 && v0 <= 1);
-            nvDebugCheck(u1 >= -1 && u1 <= 1);
-            nvDebugCheck(v1 >= -1 && v1 <= 1);
-
-            // Expand uv coordinates from [-1,1] to [0, edgeLength)
-            u0 = (u0 + 1) * edgeLength * 0.5f - 0.5f;
-            v0 = (v0 + 1) * edgeLength * 0.5f - 0.5f;
-            u1 = (u1 + 1) * edgeLength * 0.5f - 0.5f;
-            v1 = (v1 + 1) * edgeLength * 0.5f - 0.5f;
-            nvDebugCheck(u0 >= -0.5f && u0 <= edgeLength - 0.5f);
-            nvDebugCheck(v0 >= -0.5f && v0 <= edgeLength - 0.5f);
-            nvDebugCheck(u1 >= -0.5f && u1 <= edgeLength - 0.5f);
-            nvDebugCheck(v1 >= -0.5f && v1 <= edgeLength - 0.5f);
-
-            x0 = clamp(ifloor(u0), 0, L);
-            y0 = clamp(ifloor(v0), 0, L);
-            x1 = clamp(iceil(u1), 0, L);
-            y1 = clamp(iceil(v1), 0, L);
-
-            nvDebugCheck(x1 >= x0);
-            nvDebugCheck(y1 >= y0);
-        }
-
-        // This is elegant and all that, but the problem is that the projection is not always an ellipse, but often a parabola.
-        // A parabola has infinite bounds, so this approach is not very practical. Ugh.
-        if (false) {
-            //nvCheck(cosineFaceAngle >= 0.0f); @@ Not true for wide angles.
-
-            // Focal point in cartessian coordinates:
-            Vector3 F = Vector3(dot(faceU[f], filterDir), dot(faceV[f], filterDir), cosineFaceAngle);
-
-            // Focal point in polar coordinates:
-            Vector2 Fp = toPolar(F);
-            nvCheck(Fp.y >= 0.0f);  // top
-            //nvCheck(Fp.y <= PI/2);  // horizon
-
-            // If this is an ellipse:
-            if (Fp.y + coneAngle < PI/2) {
-                nvCheck(Fp.y - coneAngle > -PI/2);
-
-                // Major axis endpoints:
-                Vector2 Fa1 = toPlane(Fp.x, Fp.y - cosineFaceAngle);  // near endpoint.
-                Vector2 Fa2 = toPlane(Fp.x, Fp.y + cosineFaceAngle);  // far endpoint.
-                nvCheck(length(Fa1) <= length(Fa2));
-
-                // Ellipse center:
-                Vector2 Fc = (Fa1 + Fa2) * 0.5f;
-
-                // Major radius:
-                float a = 0.5f * length(Fa1 - Fa2);
-
-                // Focal point:
-                Vector2 F1 = toPlane(Fp.x, Fp.y);
-
-                // If we project Fa1, Fa2, Fc, F1 onto the filter direction, then:
-                float da1 = dot(Fa1, F.xy()) / fabs(cosineFaceAngle);
-                float d1 = dot(F1, F.xy()) / fabs(cosineFaceAngle);
-                float dc = dot(Fc, F.xy()) / fabs(cosineFaceAngle);
-                float da2 = dot(Fa2, F.xy()) / fabs(cosineFaceAngle);
-                //nvDebug("%f <= %f <= %f <= %f   (%d: %f %f | %f %f)\n", da1, d1, dc, da2, f, F.x, F.y, Fp.y - coneAngle, Fp.y + coneAngle);
-                //nvCheck(da1 <= d1 && d1 <= dc && dc <= da2);
-
-                // Translate focal point relative to center:
-                F1 -= Fc;
-
-                // Focal distance:
-                //float f = length(F1);  // @@ Overriding f!
-
-                // Minor radius:
-                //float b = sqrtf(a*a - f*f);
-
-                // Second order quadric coefficients:
-                float A = a*a - F1.x * F1.x;
-                nvCheck(A >= 0);
-
-                float B = a*a - F1.y * F1.y;
-                nvCheck(B >= 0);
-
-                // Floating point bounds:
-                float u0 = clamp(Fc.x - sqrtf(B), -1.0f, 1.0f);
-                float u1 = clamp(Fc.x + sqrtf(B), -1.0f, 1.0f);
-                float v0 = clamp(Fc.y - sqrtf(A), -1.0f, 1.0f);
-                float v1 = clamp(Fc.y + sqrtf(A), -1.0f, 1.0f);
-
-                // Expand uv coordinates from [-1,1] to [0, edgeLength)
-                u0 = (u0 + 1) * edgeLength * 0.5f - 0.5f;
-                v0 = (v0 + 1) * edgeLength * 0.5f - 0.5f;
-                u1 = (u1 + 1) * edgeLength * 0.5f - 0.5f;
-                v1 = (v1 + 1) * edgeLength * 0.5f - 0.5f;
-                //nvDebugCheck(u0 >= -0.5f && u0 <= edgeLength - 0.5f);
-                //nvDebugCheck(v0 >= -0.5f && v0 <= edgeLength - 0.5f);
-                //nvDebugCheck(u1 >= -0.5f && u1 <= edgeLength - 0.5f);
-                //nvDebugCheck(v1 >= -0.5f && v1 <= edgeLength - 0.5f);
-
-                x0 = clamp(ifloor(u0), 0, L);
-                y0 = clamp(ifloor(v0), 0, L);
-                x1 = clamp(iceil(u1), 0, L);
-                y1 = clamp(iceil(v1), 0, L);
-
-                nvDebugCheck(x1 >= x0);
-                nvDebugCheck(y1 >= y0);
-            }
-
-            // @@ What to do with parabolas?
-        }
+        x0 = clamp(ifloor(u0), 0, L);
+        y0 = clamp(ifloor(v0), 0, L);
+        x1 = clamp(iceil(u1), 0, L);
+        y1 = clamp(iceil(v1), 0, L);
+#endif
 
+        nvDebugCheck(x1 >= x0);
+        nvDebugCheck(y1 >= y0);
 
         if (x1 == x0 || y1 == y0) {
             // Skip this face.
@@ -714,17 +704,18 @@ Vector3 CubeSurface::Private::applyCosinePowerFilter(const Vector3 & filterDir,
 
 #include "nvthread/ParallelFor.h"
 
-struct ApplyCosinePowerFilterContext {
+struct ApplyAngularFilterContext {
     CubeSurface::Private * inputCube;
     CubeSurface::Private * filteredCube;
     float coneAngle;
-    float cosinePower;
+    float * filterTable;
+    int tableSize;
     EdgeFixup fixupMethod;
 };
 
-void ApplyCosinePowerFilterTask(void * context, int id)
+void ApplyAngularFilterTask(void * context, int id)
 {
-    ApplyCosinePowerFilterContext * ctx = (ApplyCosinePowerFilterContext *)context;
+    ApplyAngularFilterContext * ctx = (ApplyAngularFilterContext *)context;
 
     int size = ctx->filteredCube->edgeLength;
 
@@ -739,7 +730,7 @@ void ApplyCosinePowerFilterTask(void * context, int id)
     const Vector3 filterDir = texelDirection(f, x, y, size, ctx->fixupMethod);
 
     // Convolve filter against cube.
-    Vector3 color = ctx->inputCube->applyCosinePowerFilter(filterDir, ctx->coneAngle, ctx->cosinePower);
+    Vector3 color = ctx->inputCube->applyAngularFilter(filterDir, ctx->coneAngle, ctx->filterTable, ctx->tableSize);
 
     filteredImage->pixel(0, idx) = color.x;
     filteredImage->pixel(1, idx) = color.y;
@@ -749,8 +740,6 @@ void ApplyCosinePowerFilterTask(void * context, int id)
 
 CubeSurface CubeSurface::cosinePowerFilter(int size, float cosinePower, EdgeFixup fixupMethod) const
 {
-    const uint edgeLength = m->edgeLength;
-
     // Allocate output cube.
     CubeSurface filteredCube;
     filteredCube.m->allocate(size);
@@ -782,14 +771,24 @@ CubeSurface CubeSurface::cosinePowerFilter(int size, float cosinePower, EdgeFixu
         }
     }*/
 
-    ApplyCosinePowerFilterContext context;
+    ApplyAngularFilterContext context;
     context.inputCube = m;
     context.filteredCube = filteredCube.m;
     context.coneAngle = coneAngle;
-    context.cosinePower = cosinePower;
     context.fixupMethod = fixupMethod;
 
-    nv::ParallelFor parallelFor(ApplyCosinePowerFilterTask, &context);
+    context.tableSize = 512;
+    context.filterTable = new float[context.tableSize];
+
+    // @@ Instead of looking up table between [0 - 1] we should probably use [cos(coneAngle), 1]
+
+    for (int i = 0; i < context.tableSize; i++) {
+        float f = float(i) / (context.tableSize - 1);
+        context.filterTable[i] = powf(f, cosinePower);
+    }
+    
+
+    nv::ParallelFor parallelFor(ApplyAngularFilterTask, &context);
     parallelFor.run(6 * size * size);
 
     // @@ Implement edge averaging.
@@ -816,6 +815,72 @@ CubeSurface CubeSurface::cosinePowerFilter(int size, float cosinePower, EdgeFixu
 }
 
 
+// Sample the cubemap in direction dir: pick a face from the dominant axis, then read channels 0-2 with sampleLinearClamp.
+Vector3 CubeSurface::Private::sample(const Vector3 & dir)
+{
+    int f = -1; // Face index; every branch below assigns it.
+    if (fabs(dir.x) > fabs(dir.y) && fabs(dir.x) > fabs(dir.z)) { // |x| strictly largest: face 0 if x > 0, else 1.
+        if (dir.x > 0) f = 0;
+        else f = 1;
+    }
+    else if (fabs(dir.y) > fabs(dir.z)) { // |y| strictly larger than |z|: face 2 if y > 0, else 3.
+        if (dir.y > 0) f = 2;
+        else f = 3;
+    }
+    else { // All remaining cases (including an all-zero dir): face 4 if z > 0, else 5.
+        if (dir.z > 0) f = 4;
+        else f = 5;
+    }
+    nvDebugCheck(f != -1);
+
+    // Components of dir along the U and V axes of the selected face.
+    float u = dot(dir, faceU[f]); // NOTE(review): not divided by the dominant component of dir - confirm this is the intended face coordinate.
+    float v = dot(dir, faceV[f]); // NOTE(review): same remark as for u.
+
+    FloatImage * img = face[f].m->image;
+
+    Vector3 color;
+    color.x = img->sampleLinearClamp(0, u, v); // NOTE(review): confirm the coordinate range sampleLinearClamp expects; u and v may be negative.
+    color.y = img->sampleLinearClamp(1, u, v);
+    color.z = img->sampleLinearClamp(2, u, v);
+
+    return color;
+}
+
+// @@ Not tested!
+CubeSurface CubeSurface::fastResample(int size, EdgeFixup fixupMethod) const
+{
+    // Output cube allocated with the requested size; each texel below is filled by one sample() lookup into this cube.
+    CubeSurface result;
+    result.m->allocate(size);
+
+    const uint edge = uint(size);
+    for (uint faceIdx = 0; faceIdx < 6; faceIdx++) {
+        nvtt::Surface dstFace = result.m->face[faceIdx];
+        FloatImage * dstImage = dstFace.m->image;
+
+        // Walk the destination face row by row.
+        for (uint row = 0; row < edge; row++) {
+            for (uint col = 0; col < edge; col++) {
+                // Direction that texelDirection associates with this output texel for the given fixup method.
+                const Vector3 texelDir = texelDirection(faceIdx, col, row, size, fixupMethod);
+                const Vector3 rgb = m->sample(texelDir);
+
+                dstImage->pixel(0, col, row, 0) = rgb.x;
+                dstImage->pixel(1, col, row, 0) = rgb.y;
+                dstImage->pixel(2, col, row, 0) = rgb.z;
+            }
+        }
+    }
+
+    // @@ Implement edge averaging. Share this code with cosinePowerFilter
+    if (fixupMethod == EdgeFixup_Average) {
+    }
+
+    return result;
+}
+
+
 void CubeSurface::toLinear(float gamma)
 {
     if (isNull()) return;
diff --git a/src/nvtt/CubeSurface.h b/src/nvtt/CubeSurface.h
index 84df471..b5e3757 100644
--- a/src/nvtt/CubeSurface.h
+++ b/src/nvtt/CubeSurface.h
@@ -94,8 +94,11 @@ namespace nvtt
         }
 
         // Filtering helpers:
+        nv::Vector3 applyAngularFilter(const nv::Vector3 & dir, float coneAngle, float * filterTable, int tableSize);
         nv::Vector3 applyCosinePowerFilter(const nv::Vector3 & dir, float coneAngle, float cosinePower);
 
+        nv::Vector3 sample(const nv::Vector3 & dir);
+
         uint edgeLength;
         Surface face[6];
         TexelTable * texelTable;
diff --git a/src/nvtt/InputOptions.h b/src/nvtt/InputOptions.h
index 953957c..b28c8fb 100644
--- a/src/nvtt/InputOptions.h
+++ b/src/nvtt/InputOptions.h
@@ -1,82 +1,82 @@
-// Copyright (c) 2009-2011 Ignacio Castano <castano@gmail.com>
-// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#ifndef NVTT_INPUTOPTIONS_H
-#define NVTT_INPUTOPTIONS_H
-
-#include "nvtt.h"
-
-#include "nvmath/Vector.h"
-
-
-namespace nvtt
-{
-
-    struct InputOptions::Private
-    {
-        Private() : images(NULL) {}
-
-        WrapMode wrapMode;
-        TextureType textureType;
-        InputFormat inputFormat;
-        AlphaMode alphaMode;
-
-        uint width;
-        uint height;
-        uint depth;
-        uint faceCount;
-        uint mipmapCount;
-        uint imageCount;
-
-        void ** images;
-
-        // Gamma conversion.
-        float inputGamma;
-        float outputGamma;
-
-        // Mipmap generation options.
-        bool generateMipmaps;
-        int maxLevel;
-        MipmapFilter mipmapFilter;
-
-        // Kaiser filter parameters.
-        float kaiserWidth;
-        float kaiserAlpha;
-        float kaiserStretch;
-
-        // Normal map options.
-        bool isNormalMap;
-        bool normalizeMipmaps;
-        bool convertToNormalMap;
-        nv::Vector4 heightFactors;
-        nv::Vector4 bumpFrequencyScale;
-
-        // Adjust extents.
-        uint maxExtent;
-        RoundMode roundMode;
-    };
-
-} // nvtt namespace
-
-#endif // NVTT_INPUTOPTIONS_H
+// Copyright (c) 2009-2011 Ignacio Castano <castano@gmail.com>
+// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NVTT_INPUTOPTIONS_H
+#define NVTT_INPUTOPTIONS_H
+
+#include "nvtt.h"
+
+#include "nvmath/Vector.h"
+
+
+namespace nvtt
+{
+
+    struct InputOptions::Private
+    {
+        Private() : images(NULL) {}
+
+        WrapMode wrapMode;
+        TextureType textureType;
+        InputFormat inputFormat;
+        AlphaMode alphaMode;
+
+        uint width;
+        uint height;
+        uint depth;
+        uint faceCount;
+        uint mipmapCount;
+        uint imageCount;
+
+        void ** images;
+
+        // Gamma conversion.
+        float inputGamma;
+        float outputGamma;
+
+        // Mipmap generation options.
+        bool generateMipmaps;
+        int maxLevel;
+        MipmapFilter mipmapFilter;
+
+        // Kaiser filter parameters.
+        float kaiserWidth;
+        float kaiserAlpha;
+        float kaiserStretch;
+
+        // Normal map options.
+        bool isNormalMap;
+        bool normalizeMipmaps;
+        bool convertToNormalMap;
+        nv::Vector4 heightFactors;
+        nv::Vector4 bumpFrequencyScale;
+
+        // Adjust extents.
+        uint maxExtent;
+        RoundMode roundMode;
+    };
+
+} // nvtt namespace
+
+#endif // NVTT_INPUTOPTIONS_H
diff --git a/src/nvtt/Surface.cpp b/src/nvtt/Surface.cpp
index bc16696..a0534fa 100644
--- a/src/nvtt/Surface.cpp
+++ b/src/nvtt/Surface.cpp
@@ -40,6 +40,10 @@
 #include <float.h>
 #include <string.h> // memset, memcpy
 
+#if NV_CC_GNUC
+#include <math.h> // exp2f and log2f
+#endif
+
 using namespace nv;
 using namespace nvtt;
 
@@ -101,6 +105,20 @@ namespace
     }*/
 }
 
+bool nv::canMakeNextMipmap(uint w, uint h, uint d, uint min_size)
+{
+    // Reports whether a w x h x d level may be followed by a smaller mip level.
+    // min_size <= 1 (0 is treated like 1, so loops on this predicate terminate): stop only at 1x1x1.
+    if (min_size <= 1u) {
+        if (w == 1u && h == 1u && d == 1u) return false;
+    }
+    // Larger min_size: stop once width or height is at or below it; this rule only applies when depth is 1.
+    else if ((w <= min_size || h <= min_size) && d == 1u) {
+        return false;
+    }
+    return true;
+}
+
 uint nv::countMipmaps(uint w)
 {
     uint mipmap = 0;
@@ -127,6 +145,21 @@ uint nv::countMipmaps(uint w, uint h, uint d)
     return mipmap + 1;
 }
 
+uint nv::countMipmapsWithMinSize(uint w, uint h, uint d, uint min_size)
+{
+    // The base level always counts; add one level per halving step that canMakeNextMipmap allows.
+    uint levels = 1;
+    for (; canMakeNextMipmap(w, h, d, min_size); levels++) {
+        // Halve every extent, never going below one texel.
+        w = max(1U, w / 2);
+        h = max(1U, h / 2);
+        d = max(1U, d / 2);
+    }
+
+    return levels;
+}
+
+
 uint nv::computeImageSize(uint w, uint h, uint d, uint bitCount, uint pitchAlignmentInBytes, Format format)
 {
     if (format == Format_RGBA) {
@@ -308,10 +341,18 @@ int Surface::countMipmaps() const
     return ::countMipmaps(m->image->width(), m->image->height(), 1);
 }
 
+int Surface::countMipmaps(int min_size) const
+{
+    // A surface without an image has no levels; otherwise count levels for its width x height with depth 1.
+    return (m->image == NULL) ? 0 : ::countMipmapsWithMinSize(m->image->width(), m->image->height(), 1, min_size);
+}
+
 float Surface::alphaTestCoverage(float alphaRef/*= 0.5*/) const
 {
     if (m->image == NULL) return 0.0f;
 
+    alphaRef = nv::clamp(alphaRef, 1.0f/256, 255.0f/256);
+
     return m->image->alphaTestCoverage(alphaRef, 3);
 }
 
@@ -348,7 +389,7 @@ float Surface::average(int channel, int alpha_channel/*= -1*/, float gamma /*= 2
     // Avoid division by zero.
     if (denom == 0.0f) return 0.0f;
 
-    return sum / denom;
+    return powf(sum / denom, 1.0f/gamma);
 }
 
 const float * Surface::data() const
@@ -356,6 +397,13 @@ const float * Surface::data() const
     return m->image->channel(0);
 }
 
+const float * Surface::channel(int i) const
+{
+    // Returns channel i (0..3) of the image, or NULL when the index is out of range or the surface has no image.
+    if (m->image == NULL || i < 0 || i > 3) return NULL;
+    return m->image->channel(i);
+}
+
 void Surface::histogram(int channel, float rangeMin, float rangeMax, int binCount, int * binPtr) const
 {
     // We assume it's clear in case we want to accumulate multiple histograms.
@@ -378,12 +426,14 @@ void Surface::histogram(int channel, float rangeMin, float rangeMax, int binCoun
     }
 }
 
-void Surface::range(int channel, float * rangeMin, float * rangeMax) const
+void Surface::range(int channel, float * rangeMin, float * rangeMax, int alpha_channel/*= -1*/, float alpha_ref/*= 0.f*/) const
 {
     Vector2 range(FLT_MAX, -FLT_MAX);
 
     FloatImage * img = m->image;
 
+    if (alpha_channel == -1) { // no alpha channel; just like the original range function
+
     if (m->image != NULL)
     {
         float * c = img->channel(channel);
@@ -395,6 +445,25 @@ void Surface::range(int channel, float * rangeMin, float * rangeMax) const
             if (f > range.y) range.y = f;
         }
     }
+    }
+    else { // use alpha test to ignore some pixels
+        //note, it's quite possible to get FLT_MAX,-FLT_MAX back if all pixels fail the test
+
+        if (m->image != NULL)
+        {
+            const float * c = img->channel(channel);
+            const float * a = img->channel(alpha_channel);
+
+            const uint count = img->pixelCount();
+            for (uint p = 0; p < count; p++) {
+                if(a[p]>alpha_ref) {
+                    float f = c[p];
+                    if (f < range.x) range.x = f;
+                    if (f > range.y) range.y = f;
+                }
+            }
+        }
+    }
 
     *rangeMin = range.x;
     *rangeMax = range.y;
@@ -423,16 +492,44 @@ bool Surface::load(const char * fileName, bool * hasAlpha/*= NULL*/)
     return true;
 }
 
-bool Surface::save(const char * fileName) const
+bool Surface::save(const char * fileName, bool hasAlpha/*=0*/, bool hdr/*=0*/) const
 {
-    if (m->image != NULL)
-    {
-        return ImageIO::saveFloat(fileName, m->image, 0, 4);
+    if (m->image == NULL) {
+        return false;
     }
 
-    return false;
+    if (hdr) {
+        return ImageIO::saveFloat(fileName, m->image, 0, 4);
+    }
+    else {
+        AutoPtr<Image> image(m->image->createImage(0, 4));
+        nvCheck(image != NULL);
+
+        if (hasAlpha) {
+            image->setFormat(Image::Format_ARGB);
+        }
+
+        return ImageIO::save(fileName, image.ptr());
+    }
 }
 
+
+bool Surface::setImage(int w, int h, int d)
+{
+    // detach() runs before any data is touched (NOTE(review): presumably un-shares the data from other copies - confirm).
+    detach();
+
+    // Create the image on first use, then allocate four channels of w x h x d texels and clear them.
+    if (m->image == NULL) m->image = new FloatImage();
+    m->image->allocate(4, w, h, d);
+    m->image->clear();
+
+    // A depth of one is tagged as a 2D texture; any other depth as 3D. This function always reports success.
+    m->type = (d == 1) ? TextureType_2D : TextureType_3D;
+    return true;
+}
+
+
 #if 0 //NV_OS_WIN32
 
 #include <windows.h>
@@ -449,13 +546,14 @@ static int filter(unsigned int code, struct _EXCEPTION_POINTERS *ep) {
 }
 
 #define TRY __try
-    
 #define CATCH __except (filter(GetExceptionCode(), GetExceptionInformation()))
-#else
+
+#else // 0
+
 #define TRY if (true)
 #define CATCH else
-#endif
 
+#endif
 
 bool Surface::setImage(nvtt::InputFormat format, int w, int h, int d, const void * data)
 {
@@ -553,13 +651,13 @@ bool Surface::setImage(InputFormat format, int w, int h, int d, const void * r,
         const uint8 * bsrc = (const uint8 *)b;
         const uint8 * asrc = (const uint8 *)a;
 
-        try {
+        TRY {
             for (int i = 0; i < count; i++) rdst[i] = float(rsrc[i]) / 255.0f;
             for (int i = 0; i < count; i++) gdst[i] = float(gsrc[i]) / 255.0f;
             for (int i = 0; i < count; i++) bdst[i] = float(bsrc[i]) / 255.0f;
             for (int i = 0; i < count; i++) adst[i] = float(asrc[i]) / 255.0f;
         }
-        catch(...) {
+        CATCH {
             return false;
         }
     }
@@ -570,13 +668,13 @@ bool Surface::setImage(InputFormat format, int w, int h, int d, const void * r,
         const uint16 * bsrc = (const uint16 *)b;
         const uint16 * asrc = (const uint16 *)a;
 
-        try {
+        TRY {
             for (int i = 0; i < count; i++) ((uint32 *)rdst)[i] = half_to_float(rsrc[i]);
             for (int i = 0; i < count; i++) ((uint32 *)gdst)[i] = half_to_float(gsrc[i]);
             for (int i = 0; i < count; i++) ((uint32 *)bdst)[i] = half_to_float(bsrc[i]);
             for (int i = 0; i < count; i++) ((uint32 *)adst)[i] = half_to_float(asrc[i]);
         }
-        catch(...) {
+        CATCH {
             return false;
         }
     }
@@ -587,13 +685,13 @@ bool Surface::setImage(InputFormat format, int w, int h, int d, const void * r,
         const float * bsrc = (const float *)b;
         const float * asrc = (const float *)a;
 
-        try {
+        TRY {
             memcpy(rdst, rsrc, count * sizeof(float));
             memcpy(gdst, gsrc, count * sizeof(float));
             memcpy(bdst, bsrc, count * sizeof(float));
             memcpy(adst, asrc, count * sizeof(float));
         }
-        catch(...) {
+        CATCH {
             return false;
         }
     }
@@ -624,87 +722,87 @@ bool Surface::setImage2D(Format format, Decoder decoder, int w, int h, const voi
 
     const uint8 * ptr = (const uint8 *)data;
 
-    try {
+    TRY {
         for (int y = 0; y < bh; y++)
         {
             for (int x = 0; x < bw; x++)
             {
                 ColorBlock colors;
 
-		if (format == nvtt::Format_BC1)
-		{
-		    const BlockDXT1 * block = (const BlockDXT1 *)ptr;
+                if (format == nvtt::Format_BC1)
+                {
+                    const BlockDXT1 * block = (const BlockDXT1 *)ptr;
 
-		    if (decoder == Decoder_D3D10) {
-			    block->decodeBlock(&colors, false);
-		    }
-		    else if (decoder == Decoder_D3D9) {
-			    block->decodeBlock(&colors, false);
-		    }
-		    else if (decoder == Decoder_NV5x) {
-			    block->decodeBlockNV5x(&colors);
-		    }
-		}
-		else if (format == nvtt::Format_BC2)
-		{
-		    const BlockDXT3 * block = (const BlockDXT3 *)ptr;
+                    if (decoder == Decoder_D3D10) {
+	                    block->decodeBlock(&colors, false);
+                    }
+                    else if (decoder == Decoder_D3D9) {
+	                    block->decodeBlock(&colors, false);
+                    }
+                    else if (decoder == Decoder_NV5x) {
+	                    block->decodeBlockNV5x(&colors);
+                    }
+                }
+                else if (format == nvtt::Format_BC2)
+                {
+                    const BlockDXT3 * block = (const BlockDXT3 *)ptr;
 
-		    if (decoder == Decoder_D3D10) {
-			    block->decodeBlock(&colors, false);
-		    }
-		    else if (decoder == Decoder_D3D9) {
-			    block->decodeBlock(&colors, false);
-		    }
-		    else if (decoder == Decoder_NV5x) {
-			    block->decodeBlockNV5x(&colors);
-		    }
-		}
-		else if (format == nvtt::Format_BC3)
-		{
-		    const BlockDXT5 * block = (const BlockDXT5 *)ptr;
+                    if (decoder == Decoder_D3D10) {
+	                    block->decodeBlock(&colors, false);
+                    }
+                    else if (decoder == Decoder_D3D9) {
+	                    block->decodeBlock(&colors, false);
+                    }
+                    else if (decoder == Decoder_NV5x) {
+	                    block->decodeBlockNV5x(&colors);
+                    }
+                }
+                else if (format == nvtt::Format_BC3)
+                {
+                    const BlockDXT5 * block = (const BlockDXT5 *)ptr;
 
-		    if (decoder == Decoder_D3D10) {
-			    block->decodeBlock(&colors, false);
-		    }
-		    else if (decoder == Decoder_D3D9) {
-			    block->decodeBlock(&colors, false);
-		    }
-		    else if (decoder == Decoder_NV5x) {
-			    block->decodeBlockNV5x(&colors);
-		    }
-		}
-		else if (format == nvtt::Format_BC4)
-		{
-            const BlockATI1 * block = (const BlockATI1 *)ptr;
-            block->decodeBlock(&colors, decoder == Decoder_D3D9);
+                    if (decoder == Decoder_D3D10) {
+	                    block->decodeBlock(&colors, false);
+                    }
+                    else if (decoder == Decoder_D3D9) {
+	                    block->decodeBlock(&colors, false);
+                    }
+                    else if (decoder == Decoder_NV5x) {
+	                    block->decodeBlockNV5x(&colors);
+                    }
+                }
+                else if (format == nvtt::Format_BC4)
+                {
+                    const BlockATI1 * block = (const BlockATI1 *)ptr;
+                    block->decodeBlock(&colors, decoder == Decoder_D3D9);
+                }
+                else if (format == nvtt::Format_BC5)
+                {
+                    const BlockATI2 * block = (const BlockATI2 *)ptr;
+                    block->decodeBlock(&colors, decoder == Decoder_D3D9);
+                }
+
+                for (int yy = 0; yy < 4; yy++)
+                {
+                    for (int xx = 0; xx < 4; xx++)
+                    {
+                        Color32 c = colors.color(xx, yy);
+
+                        if (x * 4 + xx < w && y * 4 + yy < h)
+                        {
+                            m->image->pixel(0, x*4 + xx, y*4 + yy, 0) = float(c.r) * 1.0f/255.0f;
+                            m->image->pixel(1, x*4 + xx, y*4 + yy, 0) = float(c.g) * 1.0f/255.0f;
+                            m->image->pixel(2, x*4 + xx, y*4 + yy, 0) = float(c.b) * 1.0f/255.0f;
+                            m->image->pixel(3, x*4 + xx, y*4 + yy, 0) = float(c.a) * 1.0f/255.0f;
+                        }
+                    }
+                }
+
+                ptr += bs;
+            }
         }
-        else if (format == nvtt::Format_BC5)
-        {
-            const BlockATI2 * block = (const BlockATI2 *)ptr;
-            block->decodeBlock(&colors, decoder == Decoder_D3D9);
-        }
-
-		for (int yy = 0; yy < 4; yy++)
-		{
-		    for (int xx = 0; xx < 4; xx++)
-		    {
-			Color32 c = colors.color(xx, yy);
-
-			if (x * 4 + xx < w && y * 4 + yy < h)
-			{
-			    m->image->pixel(0, x*4 + xx, y*4 + yy, 0) = float(c.r) * 1.0f/255.0f;
-			    m->image->pixel(1, x*4 + xx, y*4 + yy, 0) = float(c.g) * 1.0f/255.0f;
-			    m->image->pixel(2, x*4 + xx, y*4 + yy, 0) = float(c.b) * 1.0f/255.0f;
-			    m->image->pixel(3, x*4 + xx, y*4 + yy, 0) = float(c.a) * 1.0f/255.0f;
-			}
-		    }
-		}
-
-		ptr += bs;
-	    }
-	}
     }
-    catch(...) {
+    CATCH {
         return false;
     }
 
@@ -812,6 +910,43 @@ void Surface::resize(int w, int h, int d, ResizeFilter filter, float filterWidth
     m->image = img;
 }
 
+// Resizes to the extent chosen by getTargetExtent() for maxExtent/roundMode, then shrinks
+// every dimension to the smallest one so 2D surfaces end up square and 3D ones cubic.
+void Surface::resize_make_square(int maxExtent, RoundMode roundMode, ResizeFilter filter)
+{
+    if (isNull()) return;
+
+    int w = m->image->width();
+    int h = m->image->height();
+    int d = m->image->depth();
+    getTargetExtent(&w, &h, &d, maxExtent, roundMode, m->type);
+
+    switch (m->type)
+    {
+        case TextureType_2D:
+            nvDebugCheck(d==1);
+            w = h = nv::min(w,h);
+            break;
+        case TextureType_Cube:
+            // Cube faces are expected to be square already; nothing to adjust.
+            nvDebugCheck(d==1);
+            nvDebugCheck(w==h);
+            break;
+        case TextureType_3D:
+            w = h = d = nv::min(nv::min(w,h),d);
+            break;
+        default:
+            break;
+    }
+
+    // Use the default width and parameters for the requested filter.
+    float filterWidth;
+    float params[2];
+    getDefaultFilterWidthAndParams(filter, &filterWidth, params);
+
+    resize(w, h, d, filter, filterWidth, params);
+}
+
 void Surface::resize(int maxExtent, RoundMode roundMode, ResizeFilter filter)
 {
     float filterWidth;
@@ -834,18 +969,26 @@ void Surface::resize(int maxExtent, RoundMode roundMode, ResizeFilter filter, fl
     resize(w, h, d, filter, filterWidth, params);
 }
 
-bool Surface::buildNextMipmap(MipmapFilter filter)
+// True when the surface holds an image and nv::canMakeNextMipmap() accepts its extents for min_size.
+bool Surface::canMakeNextMipmap(int min_size /*= 1*/)
+{
+    const bool hasImage = !isNull();
+    return hasImage && nv::canMakeNextMipmap(width(), height(), depth(), min_size);
+}
+
+
+bool Surface::buildNextMipmap(MipmapFilter filter, int min_size /*= 1*/)
 {
     float filterWidth;
     float params[2];
     getDefaultFilterWidthAndParams(filter, &filterWidth, params);
 
-    return buildNextMipmap(filter, filterWidth, params);
+    return buildNextMipmap(filter, filterWidth, params, min_size);
 }
 
-bool Surface::buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params)
+bool Surface::buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params, int min_size /*= 1*/)
 {
-    if (isNull() || (width() == 1 && height() == 1 && depth() == 1)) {
+    if (!canMakeNextMipmap(min_size)) {
         return false;
     }
 
@@ -907,6 +1050,30 @@ bool Surface::buildNextMipmap(MipmapFilter filter, float filterWidth, const floa
     return true;
 }
 
+// Replaces the image with its next mip (half width/height, at least 1) filled with a solid color:
+// channel c is cleared to color_components[c]. Returns false for null or 1x1x1 surfaces.
+bool Surface::buildNextMipmapSolidColor(const float * const color_components)
+{
+    if (isNull()) return false;
+    if (width() == 1 && height() == 1 && depth() == 1) return false;
+
+    detach();
+
+    const uint mipWidth = max(1, m->image->m_width / 2);
+    const uint mipHeight = max(1, m->image->m_height / 2);
+
+    FloatImage * mip = new FloatImage();
+    mip->allocate(m->image->m_componentCount, mipWidth, mipHeight);
+
+    for (uint c = 0; c < mip->m_componentCount; c++) mip->clear(c, color_components[c]);
+
+    // Swap the previous image for the freshly filled mip.
+    delete m->image;
+    m->image = mip;
+
+    return true;
+}
+
 void Surface::canvasSize(int w, int h, int d)
 {
     nvDebugCheck(w > 0 && h > 0 && d > 0);
@@ -1083,6 +1250,7 @@ void Surface::transform(const float w0[4], const float w1[4], const float w2[4],
     m->image->transform(0, xform, voffset);
 }
 
+// R, G, B, A, 1, 0, -1
 void Surface::swizzle(int r, int g, int b, int a)
 {
     if (isNull()) return;
@@ -1113,52 +1281,6 @@ void Surface::clamp(int channel, float low, float high)
     m->image->clamp(channel, 1, low, high);
 }
 
-void Surface::packNormal()
-{
-    if (isNull()) return;
-
-    detach();
-
-    m->image->scaleBias(0, 3, 0.5f, 0.5f);
-}
-
-void Surface::expandNormal()
-{
-    if (isNull()) return;
-
-    detach();
-
-    m->image->scaleBias(0, 3, 2.0f, -1.0f);
-}
-
-// Create a Toksvig map for this normal map.
-// http://blog.selfshadow.com/2011/07/22/specular-showdown/
-// @@ Assumes this is a normal map expanded in the [-1, 1] range.
-Surface Surface::createToksvigMap(float power) const
-{
-    if (isNull()) return Surface();
-
-    // @@ TODO
-
-    return Surface();
-}
-
-// @@ Should I add support for LEAN maps? That requires 5 terms, which would have to be encoded in two textures.
-// There's nothing stopping us from having 5 channels in a surface, and then, let the user swizzle them as they wish.
-// CLEAN maps are probably more practical, though.
-// http://www.cs.umbc.edu/~olano/papers/lean/
-// http://gaim.umbc.edu/2011/07/24/shiny-and-clean/
-// http://gaim.umbc.edu/2011/07/26/on-error/
-NVTT_API Surface Surface::createCleanMap() const
-{
-    if (isNull()) return Surface();
-
-    // @@ TODO
-
-    return Surface();
-}
-
-
 void Surface::blend(float red, float green, float blue, float alpha, float t)
 {
     if (isNull()) return;
@@ -1285,13 +1407,10 @@ void Surface::fill(float red, float green, float blue, float alpha)
     float * a = img->channel(3);
 
     const uint count = img->pixelCount();
-    for (uint i = 0; i < count; i++)
-    {
-        r[i] = red;
-        g[i] = green;
-        b[i] = blue;
-        a[i] = alpha;
-    }
+    for (uint i = 0; i < count; i++) r[i] = red;
+    for (uint i = 0; i < count; i++) g[i] = green;
+    for (uint i = 0; i < count; i++) b[i] = blue;
+    for (uint i = 0; i < count; i++) a[i] = alpha;
 }
 
 
@@ -1301,6 +1420,8 @@ void Surface::scaleAlphaToCoverage(float coverage, float alphaRef/*= 0.5f*/)
 
     detach();
 
+    alphaRef = nv::clamp(alphaRef, 1.0f/256, 255.0f/256);
+
     m->image->scaleAlphaToCoverage(coverage, alphaRef, 3);
 }
 
@@ -1341,7 +1462,6 @@ void Surface::toRGBM(float range/*= 1*/, float threshold/*= 0.25*/)
     detach();
 
     threshold = ::clamp(threshold, 1e-6f, 1.0f);
-    float irange = 1.0f / range;
 
     FloatImage * img = m->image;
     float * r = img->channel(0);
@@ -1360,6 +1480,7 @@ void Surface::toRGBM(float range/*= 1*/, float threshold/*= 0.25*/)
         r[i] = R / M;
         g[i] = G / M;
         b[i] = B / M;
+
         a[i] = (M - threshold) / (1 - threshold);
 
 #else
@@ -1402,6 +1523,7 @@ void Surface::toRGBM(float range/*= 1*/, float threshold/*= 0.25*/)
     }
 }
 
+
 void Surface::fromRGBM(float range/*= 1*/)
 {
     if (isNull()) return;
@@ -1425,6 +1547,37 @@ void Surface::fromRGBM(float range/*= 1*/)
     }
 }
 
+// Luminance-only encoding in an RGBM-like layout: r, g and b all receive the average of the
+// clamped channels divided by M, and alpha the remapped M. The range argument is not used.
+void Surface::toLM(float range/*= 1*/, float threshold/*= 0.0f*/)
+{
+    if (isNull()) return;
+
+    detach();
+
+    threshold = ::clamp(threshold, 1e-6f, 1.0f);
+
+    FloatImage * img = m->image;
+    float * red = img->channel(0);
+    float * green = img->channel(1);
+    float * blue = img->channel(2);
+    float * alpha = img->channel(3);
+
+    const uint pixelCount = img->pixelCount();
+    for (uint p = 0; p < pixelCount; p++) {
+        const float R = nv::clamp(red[p], 0.0f, 1.0f);
+        const float G = nv::clamp(green[p], 0.0f, 1.0f);
+        const float B = nv::clamp(blue[p], 0.0f, 1.0f);
+
+        // M is the largest channel, but never below threshold.
+        const float M = max(max(R, G), max(B, threshold));
+        const float L = (R + G + B) / 3;
+        const float scaledLuminance = L / M;
+        red[p] = green[p] = blue[p] = scaledLuminance;
+        alpha[p] = (M - threshold) / (1 - threshold);
+    }
+}
+
 
 static Color32 toRgbe8(float r, float g, float b)
 {
@@ -2147,21 +2300,25 @@ void Surface::quantize(int channel, int bits, bool exactEndPoints, bool dither)
 
     FloatImage * img = m->image;
 
-    float scale, offset;
+    float scale, offset0, offset1;
     if (exactEndPoints) {
+        // floor(x*(range-1) + 0.5) / (range-1)
         scale = float((1 << bits) - 1);
-        offset = 0.0f;
+        offset0 = 0.5f;
+        offset1 = 0.0f;
     }
     else {
+        // (floor(x*range) + 0.5) / range
         scale = float(1 << bits);
-        offset = 0.5f;
+        offset0 = 0.0f;
+        offset1 = 0.5f;
     }
 
     if (!dither) {
         float * c = img->channel(channel);
         const uint count = img->pixelCount();
         for (uint i = 0; i < count; i++) {
-            c[i] = floorf(c[i] * scale + offset) / scale;
+            c[i] = saturate((floorf(c[i] * scale + offset0) + offset1) / scale);
         }
     }
     else {
@@ -2182,7 +2339,7 @@ void Surface::quantize(int channel, int bits, bool exactEndPoints, bool dither)
                     float & f = img->pixel(channel, x, y, 0);
 
                     // Add error and quantize.
-                    float qf = floorf((f + row0[1+x]) * scale + offset) / scale;
+                    float qf = saturate((floorf((f + row0[1+x]) * scale + offset0) + offset1) / scale);
 
                     // Compute new error:
                     float diff = f - qf;
@@ -2221,9 +2378,6 @@ void Surface::toNormalMap(float sm, float medium, float big, float large)
     const FloatImage * img = m->image;
     m->image = nv::createNormalMap(img, (FloatImage::WrapMode)m->wrapMode, filterWeights);
 
-#pragma NV_MESSAGE("TODO: Pack and expand normals explicitly?")
-    m->image->packNormals(0);
-
     delete img;
 
     m->isNormalMap = true;
@@ -2246,7 +2400,6 @@ void Surface::transformNormals(NormalTransform xform)
     detach();
 
     FloatImage * img = m->image;
-    img->expandNormals(0);
 
     const uint count = img->pixelCount();
     for (uint i = 0; i < count; i++) {
@@ -2308,8 +2461,6 @@ void Surface::transformNormals(NormalTransform xform)
         y = n.y;
         z = n.z;
     }
-
-    img->packNormals(0);
 }
 
 void Surface::reconstructNormals(NormalTransform xform)
@@ -2319,7 +2470,6 @@ void Surface::reconstructNormals(NormalTransform xform)
     detach();
 
     FloatImage * img = m->image;
-    img->expandNormals(0);
 
     const uint count = img->pixelCount();
     for (uint i = 0; i < count; i++) {
@@ -2357,8 +2507,6 @@ void Surface::reconstructNormals(NormalTransform xform)
         y = n.y;
         z = n.z;
     }
-
-    img->packNormals(0);
 }
 
 void Surface::toCleanNormalMap()
@@ -2367,8 +2515,6 @@ void Surface::toCleanNormalMap()
 
     detach();
 
-    m->image->expandNormals(0);
-
     const uint count = m->image->pixelCount();
     for (uint i = 0; i < count; i++) {
         float x = m->image->pixel(0, i);
@@ -2376,22 +2522,48 @@ void Surface::toCleanNormalMap()
 
         m->image->pixel(2, i) = x*x + y*y;
     }
-
-    m->image->packNormals(0);
 }
 
 // [-1,1] -> [ 0,1]
-void Surface::packNormals() {
+void Surface::packNormals(float scale/*= 0.5f*/, float bias/*= 0.5f*/) {
     if (isNull()) return;
     detach();
-    m->image->packNormals(0);
+    m->image->scaleBias(0, 3, scale, bias);
 }
 
 // [ 0,1] -> [-1,1]
-void Surface::expandNormals() {
+void Surface::expandNormals(float scale/*= 2.0f*/, float bias/*= -1.0f*/) {
     if (isNull()) return;
     detach();
-    m->image->expandNormals(0);
+    m->image->scaleBias(0, 3, scale, bias);
+}
+
+
+// Create a Toksvig map for this normal map.
+// http://blog.selfshadow.com/2011/07/22/specular-showdown/
+// @@ Assumes this is a normal map expanded in the [-1, 1] range.
+Surface Surface::createToksvigMap(float power) const
+{
+    if (isNull()) return Surface();
+
+    // @@ TODO: not implemented yet; 'power' is unused and a default-constructed Surface is always returned.
+
+    return Surface();
+}
+
+// @@ Should I add support for LEAN maps? That requires 5 terms, which would have to be encoded in two textures.
+// There's nothing stopping us from having 5 channels in a surface, and then, let the user swizzle them as they wish.
+// CLEAN maps are probably more practical, though.
+// http://www.cs.umbc.edu/~olano/papers/lean/
+// http://gaim.umbc.edu/2011/07/24/shiny-and-clean/
+// http://gaim.umbc.edu/2011/07/26/on-error/
+NVTT_API Surface Surface::createCleanMap() const
+{
+    if (isNull()) return Surface();
+
+    // @@ TODO: not implemented yet; a default-constructed Surface is always returned.
+
+    return Surface();
 }
 
 
@@ -2422,7 +2594,7 @@ void Surface::flipZ()
     m->image->flipZ();
 }
 
-Surface Surface::subImage(int x0, int x1, int y0, int y1, int z0, int z1) const
+Surface Surface::createSubImage(int x0, int x1, int y0, int y1, int z0, int z1) const
 {
     Surface s;
 
@@ -2495,9 +2667,6 @@ bool Surface::addChannel(const Surface & srcImage, int srcChannel, int dstChanne
 
     dst = m->image;
 
-    const uint w = src->width();
-    const uint h = src->height();
-
     float * d = dst->channel(dstChannel);
     const float * s = src->channel(srcChannel);
 
@@ -2510,6 +2679,38 @@ bool Surface::addChannel(const Surface & srcImage, int srcChannel, int dstChanne
 }
 
 
+// Copies an xsize*ysize*zsize box of all 4 channels from srcImage into this surface; false if it does not fit.
+bool Surface::copy(const Surface & srcImage, int xsrc, int ysrc, int zsrc, int xsize, int ysize, int zsize, int xdst, int ydst, int zdst)
+{
+    if (isNull() || srcImage.isNull()) return false;
+    if (xsrc < 0 || ysrc < 0 || zsrc < 0) return false;
+    if (xdst < 0 || ydst < 0 || zdst < 0) return false;
+    if (xsize < 0 || ysize < 0 || zsize < 0) return false;
+
+    const FloatImage * src = srcImage.m->image;
+    if (toU32(xsrc + xsize) > src->width() || toU32(ysrc + ysize) > src->height() || toU32(zsrc + zsize) > src->depth()) return false;
+    if (toU32(xdst + xsize) > m->image->width() || toU32(ydst + ysize) > m->image->height() || toU32(zdst + zsize) > m->image->depth()) return false;
+
+    // Fetch the destination only after detach(), like the other mutators in this file do.
+    detach();
+    FloatImage * dst = m->image;
+
+    for (int i = 0; i < 4; i++) {
+        float * d = dst->channel(i);
+        const float * s = src->channel(i);
+        for (int z = 0; z < zsize; z++) {
+            for (int y = 0; y < ysize; y++) {
+                for (int x = 0; x < xsize; x++) {
+                    d[dst->index(xdst + x, ydst + y, zdst + z)] = s[src->index(xsrc + x, ysrc + y, zsrc + z)];
+                }
+            }
+        }
+    }
+
+    return true;
+}
+
+
 
 float nvtt::rmsError(const Surface & reference, const Surface & image)
 {
diff --git a/src/nvtt/Surface.h b/src/nvtt/Surface.h
index 6013995..419a0e1 100644
--- a/src/nvtt/Surface.h
+++ b/src/nvtt/Surface.h
@@ -78,8 +78,10 @@ namespace nvtt
 } // nvtt namespace
 
 namespace nv {
+    bool canMakeNextMipmap(uint w, uint h, uint d, uint min_size);
     uint countMipmaps(uint w);
     uint countMipmaps(uint w, uint h, uint d);
+    uint countMipmapsWithMinSize(uint w, uint h, uint d, uint min_size);
     uint computeImageSize(uint w, uint h, uint d, uint bitCount, uint alignmentInBytes, nvtt::Format format);
     void getTargetExtent(int * w, int * h, int * d, int maxExtent, nvtt::RoundMode roundMode, nvtt::TextureType textureType);
 }
diff --git a/src/nvtt/nvtt.h b/src/nvtt/nvtt.h
index ce5cb1f..c6c6aec 100644
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@@ -454,15 +454,18 @@ namespace nvtt
         NVTT_API AlphaMode alphaMode() const;
         NVTT_API bool isNormalMap() const;
         NVTT_API int countMipmaps() const;
+        NVTT_API int countMipmaps(int min_size) const;
         NVTT_API float alphaTestCoverage(float alphaRef = 0.5) const;
         NVTT_API float average(int channel, int alpha_channel = -1, float gamma = 2.2f) const;
         NVTT_API const float * data() const;
+        NVTT_API const float * channel(int i) const;
         NVTT_API void histogram(int channel, float rangeMin, float rangeMax, int binCount, int * binPtr) const;
-        NVTT_API void range(int channel, float * rangeMin, float * rangeMax) const;
+        NVTT_API void range(int channel, float * rangeMin, float * rangeMax, int alpha_channel = -1, float alpha_ref = 0.f) const;
 
         // Texture data.
         NVTT_API bool load(const char * fileName, bool * hasAlpha = 0);
-        NVTT_API bool save(const char * fileName) const;
+        NVTT_API bool save(const char * fileName, bool hasAlpha = 0, bool hdr = 0) const;
+        NVTT_API bool setImage(int w, int h, int d);
         NVTT_API bool setImage(InputFormat format, int w, int h, int d, const void * data);
         NVTT_API bool setImage(InputFormat format, int w, int h, int d, const void * r, const void * g, const void * b, const void * a);
         NVTT_API bool setImage2D(Format format, Decoder decoder, int w, int h, const void * data);
@@ -472,9 +475,14 @@ namespace nvtt
         NVTT_API void resize(int w, int h, int d, ResizeFilter filter, float filterWidth, const float * params = 0);
         NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter);
         NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter, float filterWidth, const float * params = 0);
-        NVTT_API bool buildNextMipmap(MipmapFilter filter);
-        NVTT_API bool buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params = 0);
+        NVTT_API void resize_make_square(int maxExtent, RoundMode roundMode, ResizeFilter filter);
+
+        NVTT_API bool buildNextMipmap(MipmapFilter filter, int min_size = 1);
+        NVTT_API bool buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params = 0, int min_size = 1);
+        NVTT_API bool buildNextMipmapSolidColor(const float * const color_components);
         NVTT_API void canvasSize(int w, int h, int d);
+        // associated to resizing:
+        NVTT_API bool canMakeNextMipmap(int min_size = 1);
 
         // Color transforms.
         NVTT_API void toLinear(float gamma);
@@ -488,17 +496,15 @@ namespace nvtt
         NVTT_API void swizzle(int r, int g, int b, int a);
         NVTT_API void scaleBias(int channel, float scale, float bias);
         NVTT_API void clamp(int channel, float low = 0.0f, float high = 1.0f);
-        NVTT_API void packNormal();
-        NVTT_API void expandNormal();
         NVTT_API void blend(float r, float g, float b, float a, float t);
         NVTT_API void premultiplyAlpha();
         NVTT_API void toGreyScale(float redScale, float greenScale, float blueScale, float alphaScale);
         NVTT_API void setBorder(float r, float g, float b, float a);
         NVTT_API void fill(float r, float g, float b, float a);
         NVTT_API void scaleAlphaToCoverage(float coverage, float alphaRef = 0.5f);
-        //NVTT_API bool normalizeRange(float * rangeMin, float * rangeMax);
         NVTT_API void toRGBM(float range = 1.0f, float threshold = 0.0f);
         NVTT_API void fromRGBM(float range = 1.0f);
+        NVTT_API void toLM(float range = 1.0f, float threshold = 0.0f);
         NVTT_API void toRGBE(int mantissaBits, int exponentBits);
         NVTT_API void fromRGBE(int mantissaBits, int exponentBits);
         NVTT_API void toYCoCg();
@@ -519,14 +525,14 @@ namespace nvtt
         NVTT_API void binarize(int channel, float threshold, bool dither);
         NVTT_API void quantize(int channel, int bits, bool exactEndPoints, bool dither);
 
-        // Normal map transforms. @@ All these methods assume packed normals.
+        // Normal map transforms.
         NVTT_API void toNormalMap(float sm, float medium, float big, float large);
         NVTT_API void normalizeNormalMap();
         NVTT_API void transformNormals(NormalTransform xform);
         NVTT_API void reconstructNormals(NormalTransform xform);
         NVTT_API void toCleanNormalMap();
-        NVTT_API void packNormals();   // [-1,1] -> [ 0,1]
-        NVTT_API void expandNormals(); // [ 0,1] -> [-1,1]
+        NVTT_API void packNormals(float scale = 0.5f, float bias = 0.5f);       // [-1,1] -> [ 0,1]
+        NVTT_API void expandNormals(float scale = 2.0f, float bias = -1.0f);    // [ 0,1] -> [-1,1]
         NVTT_API Surface createToksvigMap(float power) const;
         NVTT_API Surface createCleanMap() const;
 
@@ -534,7 +540,7 @@ namespace nvtt
         NVTT_API void flipX();
         NVTT_API void flipY();
         NVTT_API void flipZ();
-        NVTT_API Surface subImage(int x0, int x1, int y0, int y1, int z0, int z1) const;
+        NVTT_API Surface createSubImage(int x0, int x1, int y0, int y1, int z0, int z1) const;
 
         // Copy image data.
         NVTT_API bool copyChannel(const Surface & srcImage, int srcChannel);
@@ -542,6 +548,9 @@ namespace nvtt
 
         NVTT_API bool addChannel(const Surface & img, int srcChannel, int dstChannel, float scale);
 
+        NVTT_API bool copy(const Surface & src, int xsrc, int ysrc, int zsrc, int xsize, int ysize, int zsize, int xdst, int ydst, int zdst);
+
+
     //private:
         void detach();
 
@@ -599,12 +608,15 @@ namespace nvtt
 
         NVTT_API float average(int channel) const;
         NVTT_API void range(int channel, float * minimum_ptr, float * maximum_ptr) const;
+        NVTT_API void clamp(int channel, float low = 0.0f, float high = 1.0f);
 
 
         // Filtering.
         NVTT_API CubeSurface irradianceFilter(int size, EdgeFixup fixupMethod) const;
         NVTT_API CubeSurface cosinePowerFilter(int size, float cosinePower, EdgeFixup fixupMethod) const;
 
+        NVTT_API CubeSurface fastResample(int size, EdgeFixup fixupMethod) const;
+
 
         /*
         NVTT_API void resize(int w, int h, ResizeFilter filter);
diff --git a/src/nvtt/tests/testsuite.cpp b/src/nvtt/tests/testsuite.cpp
index dfd8e3b..184e5cb 100644
--- a/src/nvtt/tests/testsuite.cpp
+++ b/src/nvtt/tests/testsuite.cpp
@@ -856,7 +856,7 @@ int main(int argc, char *argv[])
             outputFileName.stripExtension();
             if (set.type == ImageType_HDR) outputFileName.append(".dds");
             else outputFileName.append(".tga");
-            if (!img_out.save(outputFileName.str()))
+            if (!img_out.save(outputFileName.str(), set.type == ImageType_RGBA, set.type == ImageType_HDR))
             {
                 printf("Error saving file '%s'.\n", outputFileName.str());
             }
diff --git a/src/nvtt/tools/cmdline.h b/src/nvtt/tools/cmdline.h
index 14878ef..7617ae7 100644
--- a/src/nvtt/tools/cmdline.h
+++ b/src/nvtt/tools/cmdline.h
@@ -58,7 +58,7 @@ struct MyAssertHandler : public nv::AssertHandler {
     }
 
     // Handler method, note that func might be NULL!
-    virtual int assertion( const char *exp, const char *file, int line, const char *func ) {
+    virtual int assertion( const char *exp, const char *file, int line, const char *func, const char *msg, va_list arg ) {
         fprintf(stderr, "Assertion failed: %s\nIn %s:%d\n", exp, file, line);
         nv::debug::dumpInfo();
         exit(1);