Merge changes from the witness.

2011-09-27 17:48:46 +00:00
parent 9c0658edca
commit 3c0ab2d3f3
47 changed files with 1811 additions and 186 deletions
--- a/project/vc9/nvthread/nvthread.vcproj
+++ b/project/vc9/nvthread/nvthread.vcproj
@ -0,0 +1,346 @@
 <?xml version="1.0" encoding="Windows-1252"?>
 <VisualStudioProject
 	ProjectType="Visual C++"
 	Version="9.00"
 	Name="nvthread"
 	ProjectGUID="{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}"
 	RootNamespace="nvthread"
 	Keyword="Win32Proj"
 	TargetFrameworkVersion="131072"
 	>
 	<Platforms>
 		<Platform
 			Name="Win32"
 		/>
 		<Platform
 			Name="x64"
 		/>
 	</Platforms>
 	<ToolFiles>
 	</ToolFiles>
 	<Configurations>
 		<Configuration
 			Name="Debug|Win32"
 			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
 			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
 			ConfigurationType="4"
 			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
 			CharacterSet="2"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
 			/>
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
 			<Tool
 				Name="VCWebServiceProxyGeneratorTool"
 			/>
 			<Tool
 				Name="VCMIDLTool"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
 				AdditionalIncludeDirectories=""
 				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;__SSE2__;__SSE__;__MMX__"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="3"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
 				DebugInformationFormat="4"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCPreLinkEventTool"
 			/>
 			<Tool
 				Name="VCLibrarianTool"
 			/>
 			<Tool
 				Name="VCALinkTool"
 			/>
 			<Tool
 				Name="VCXDCMakeTool"
 			/>
 			<Tool
 				Name="VCBscMakeTool"
 			/>
 			<Tool
 				Name="VCFxCopTool"
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
 		<Configuration
 			Name="Debug|x64"
 			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
 			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
 			ConfigurationType="4"
 			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
 			CharacterSet="0"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
 			/>
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
 			<Tool
 				Name="VCWebServiceProxyGeneratorTool"
 			/>
 			<Tool
 				Name="VCMIDLTool"
 				TargetEnvironment="3"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
 				AdditionalIncludeDirectories=""
 				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;__SSE2__;__SSE__;__MMX__"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="3"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
 				DebugInformationFormat="3"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCPreLinkEventTool"
 			/>
 			<Tool
 				Name="VCLibrarianTool"
 			/>
 			<Tool
 				Name="VCALinkTool"
 			/>
 			<Tool
 				Name="VCXDCMakeTool"
 			/>
 			<Tool
 				Name="VCBscMakeTool"
 			/>
 			<Tool
 				Name="VCFxCopTool"
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
 		<Configuration
 			Name="Release|Win32"
 			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
 			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
 			ConfigurationType="4"
 			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
 			CharacterSet="2"
 			WholeProgramOptimization="1"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
 			/>
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
 			<Tool
 				Name="VCWebServiceProxyGeneratorTool"
 			/>
 			<Tool
 				Name="VCMIDLTool"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="3"
 				InlineFunctionExpansion="0"
 				EnableIntrinsicFunctions="true"
 				FavorSizeOrSpeed="0"
 				OmitFramePointers="true"
 				EnableFiberSafeOptimizations="true"
 				AdditionalIncludeDirectories=""
 				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;__SSE2__;__SSE__;__MMX__"
 				StringPooling="true"
 				RuntimeLibrary="2"
 				EnableFunctionLevelLinking="false"
 				EnableEnhancedInstructionSet="2"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
 				DebugInformationFormat="3"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCPreLinkEventTool"
 			/>
 			<Tool
 				Name="VCLibrarianTool"
 			/>
 			<Tool
 				Name="VCALinkTool"
 			/>
 			<Tool
 				Name="VCXDCMakeTool"
 			/>
 			<Tool
 				Name="VCBscMakeTool"
 			/>
 			<Tool
 				Name="VCFxCopTool"
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
 		<Configuration
 			Name="Release|x64"
 			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
 			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
 			ConfigurationType="4"
 			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
 			CharacterSet="0"
 			WholeProgramOptimization="1"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
 			/>
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
 			<Tool
 				Name="VCWebServiceProxyGeneratorTool"
 			/>
 			<Tool
 				Name="VCMIDLTool"
 				TargetEnvironment="3"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="3"
 				EnableIntrinsicFunctions="true"
 				OmitFramePointers="true"
 				WholeProgramOptimization="true"
 				AdditionalIncludeDirectories=""
 				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;__SSE2__;__SSE__;__MMX__"
 				StringPooling="true"
 				RuntimeLibrary="2"
 				EnableFunctionLevelLinking="false"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
 				DebugInformationFormat="3"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCPreLinkEventTool"
 			/>
 			<Tool
 				Name="VCLibrarianTool"
 			/>
 			<Tool
 				Name="VCALinkTool"
 			/>
 			<Tool
 				Name="VCXDCMakeTool"
 			/>
 			<Tool
 				Name="VCBscMakeTool"
 			/>
 			<Tool
 				Name="VCFxCopTool"
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
 	</Configurations>
 	<References>
 	</References>
 	<Files>
 		<File
 			RelativePath="..\..\..\src\nvthread\Atomic.h"
 			>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvthread\Event.cpp"
 			>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvthread\Event.h"
 			>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvthread\Mutex.cpp"
 			>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvthread\Mutex.h"
 			>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvthread\nvthread.cpp"
 			>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvthread\nvthread.h"
 			>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvthread\ParallelFor.cpp"
 			>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvthread\ParallelFor.h"
 			>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvthread\Thread.cpp"
 			>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvthread\Thread.h"
 			>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvthread\ThreadPool.cpp"
 			>
 		</File>
 		<File
 			RelativePath="..\..\..\src\nvthread\ThreadPool.h"
 			>
 		</File>
 	</Files>
 	<Globals>
 	</Globals>
 </VisualStudioProject>
--- a/project/vc9/nvtt.sln
+++ b/project/vc9/nvtt.sln
@ -4,6 +4,7 @@ Microsoft Visual Studio Solution File, Format Version 10.00
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvtt", "nvtt\nvtt.vcproj", "{1AEB7681-57D8-48EE-813D-5C41CC38B647}"
 	ProjectSection(ProjectDependencies) = postProject
 		{CE017322-01FC-4851-9C8B-64E9A8E26C38} = {CE017322-01FC-4851-9C8B-64E9A8E26C38}
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB} = {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}
 		{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
 		{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
 		{C33787E3-5564-4834-9FE3-A9020455A669} = {C33787E3-5564-4834-9FE3-A9020455A669}
@ -88,6 +89,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "imperativeapi", "imperative
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bc6h", "bc6h\bc6h.vcproj", "{C33787E3-5564-4834-9FE3-A9020455A669}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvthread", "nvthread\nvthread.vcproj", "{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug (no cuda)|Mixed Platforms = Debug (no cuda)|Mixed Platforms
@ -457,6 +460,28 @@ Global
 		{C33787E3-5564-4834-9FE3-A9020455A669}.Release|Win32.Build.0 = Release|Win32
 		{C33787E3-5564-4834-9FE3-A9020455A669}.Release|x64.ActiveCfg = Release|x64
 		{C33787E3-5564-4834-9FE3-A9020455A669}.Release|x64.Build.0 = Release|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|Mixed Platforms.ActiveCfg = Debug|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|Mixed Platforms.Build.0 = Debug|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|Win32.ActiveCfg = Debug|Win32
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|Win32.Build.0 = Debug|Win32
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|x64.ActiveCfg = Debug|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|x64.Build.0 = Debug|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|Mixed Platforms.Build.0 = Debug|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|Win32.ActiveCfg = Debug|Win32
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|Win32.Build.0 = Debug|Win32
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|x64.ActiveCfg = Debug|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|x64.Build.0 = Debug|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|Mixed Platforms.ActiveCfg = Release|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|Mixed Platforms.Build.0 = Release|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|Win32.ActiveCfg = Release|Win32
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|Win32.Build.0 = Release|Win32
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|x64.ActiveCfg = Release|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|x64.Build.0 = Release|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|Mixed Platforms.ActiveCfg = Release|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|Mixed Platforms.Build.0 = Release|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|Win32.ActiveCfg = Release|Win32
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|Win32.Build.0 = Release|Win32
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|x64.ActiveCfg = Release|x64
 		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -6,6 +6,7 @@ INCLUDE_DIRECTORIES(${NV_SOURCE_DIR}/extern/stb)
 SUBDIRS(nvcore)
 SUBDIRS(nvmath)
 SUBDIRS(nvimage)
 SUBDIRS(nvthread)
 SUBDIRS(nvtt)
 # OpenGL
--- a/src/nvcore/Array.h
+++ b/src/nvcore/Array.h
@ -78,8 +78,8 @@ namespace nv
    }
    template <typename T>
-    bool find(const T & element, const T * restrict ptr, uint count, uint * index) {
+    bool find(const T & element, const T * restrict ptr, uint begin, uint end, uint * index) {
-        for (uint i = 0; i < count; i++) {
+        for (uint i = begin; i < end; i++) {
            if (ptr[i] == element) {
                if (index != NULL) *index = i;
                return true;
@ -257,15 +257,15 @@ namespace nv
        }
        /// Return true if element found anywhere in the array.
        /// Forwards to the range overload with the bounds 0 and m_size and returns its result;
        /// indexPtr is passed through unchanged.
-        NV_FORCEINLINE bool find(const T & element, uint * index) const
+        NV_FORCEINLINE bool find(const T & element, uint * indexPtr) const
        {
-            return find(element, 0, m_size, index);
+            return find(element, 0, m_size, indexPtr);
        }
        /// Return true if element found within the given range.
        /// Delegates to ::nv::find over m_buffer, passing begin and end as the index bounds of the search.
        /// indexPtr is forwarded as-is to receive the position of the match.
-        NV_FORCEINLINE bool find(const T & element, uint first, uint count, uint * index) const
+        NV_FORCEINLINE bool find(const T & element, uint begin, uint end, uint * indexPtr) const
        {
-            return ::nv::find(element, m_buffer + first, count, index);
+            return ::nv::find(element, m_buffer, begin, end, indexPtr);
        }
        /// Remove the element at the given index. This is an expensive operation!
--- a/src/nvcore/Debug.cpp
+++ b/src/nvcore/Debug.cpp
@ -448,19 +448,6 @@ namespace
    /** Win32 assert handler. */
    struct Win32AssertHandler : public AssertHandler 
    {
        // Code from Daniel Vogel.
        static bool isDebuggerPresent()
        {
            HINSTANCE kernel32 = GetModuleHandle("kernel32.dll");
            if (kernel32) {
                FARPROC IsDebuggerPresent = GetProcAddress(kernel32, "IsDebuggerPresent");
                if (IsDebuggerPresent != NULL && IsDebuggerPresent()) {
                    return true;
                }
            }
            return false;
        }
        // Flush the message queue. This is necessary for the message box to show up.
        static void flushMessageQueue()
        {
@ -487,7 +474,7 @@ namespace
                nvDebug( error_string.str() );
            }
-            if (isDebuggerPresent()) {
+            if (debug::isDebuggerPresent()) {
                return NV_ABORT_DEBUG;
            }
@ -522,15 +509,6 @@ namespace
    /** Xbox360 assert handler. */
    struct Xbox360AssertHandler : public AssertHandler 
    {
        static bool isDebuggerPresent()
        {
 #ifdef _DEBUG
            return DmIsDebuggerPresent() == TRUE;
 #else
            return false;
 #endif
        }
        // Assert handler method.
        virtual int assertion( const char * exp, const char * file, int line, const char * func/*=NULL*/ )
        {
@ -546,7 +524,7 @@ namespace
                nvDebug( error_string.str() );
            }
-            if (isDebuggerPresent()) {
+            if (debug::isDebuggerPresent()) {
                return NV_ABORT_DEBUG;
            }
@ -563,26 +541,6 @@ namespace
    /** Unix assert handler. */
    struct UnixAssertHandler : public AssertHandler
    {
        bool isDebuggerPresent()
        {
 #if NV_OS_DARWIN
            int mib[4];
            struct kinfo_proc info;
            size_t size;
            mib[0] = CTL_KERN;
            mib[1] = KERN_PROC;
            mib[2] = KERN_PROC_PID;
            mib[3] = getpid();
            size = sizeof(info);
            info.kp_proc.p_flag = 0;
            sysctl(mib,4,&info,&size,NULL,0);
            return ((info.kp_proc.p_flag & P_TRACED) == P_TRACED);
 #else
            // if ppid != sid, some process spawned our app, probably a debugger. 
            return getsid(getpid()) != getppid();
 #endif
        }
        // Assert handler method.
        virtual int assertion(const char * exp, const char * file, int line, const char * func)
        {
@ -594,7 +552,7 @@ namespace
            }
 #if _DEBUG
-            if (isDebuggerPresent()) {
+            if (debug::isDebuggerPresent()) {
                return NV_ABORT_DEBUG;
            }
 #endif
@ -702,7 +660,10 @@ void debug::enableSigHandler()
    // SYMOPT_DEFERRED_LOADS make us not take a ton of time unless we actual log traces
    SymSetOptions(SYMOPT_DEFERRED_LOADS|SYMOPT_FAIL_CRITICAL_ERRORS|SYMOPT_LOAD_LINES|SYMOPT_UNDNAME);
-    SymInitialize(GetCurrentProcess(), NULL, TRUE);
+    if (!SymInitialize(GetCurrentProcess(), NULL, TRUE)) {
        DWORD error = GetLastError();
        nvDebug("SymInitialize returned error : %d\n", error);
    }
 #elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H)
@ -743,3 +704,38 @@ void debug::disableSigHandler()
 #endif
 }
 /// Report whether a debugger appears to be attached to the current process.
 /// The detection strategy is selected per platform at compile time.
 bool debug::isDebuggerPresent()
 {
 #if NV_OS_WIN32
    // Resolve the query function at run time from kernel32 rather than calling it directly.
    HINSTANCE kernelModule = GetModuleHandle("kernel32.dll");
    if (!kernelModule) {
        return false;
    }
    FARPROC queryDebugger = GetProcAddress(kernelModule, "IsDebuggerPresent");
    if (queryDebugger == NULL) {
        return false;
    }
    return queryDebugger() != 0;
 #elif NV_OS_XBOX
 #ifdef _DEBUG
    return TRUE == DmIsDebuggerPresent();
 #else
    // The debug-monitor query is only used in _DEBUG builds.
    return false;
 #endif
 #elif NV_OS_DARWIN
    // Ask the kernel for this process' info record and test its P_TRACED flag.
    int query[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, getpid() };
    struct kinfo_proc procInfo;
    size_t procInfoSize = sizeof(procInfo);
    // The flag word is cleared up front; the result of sysctl itself is not checked.
    procInfo.kp_proc.p_flag = 0;
    sysctl(query, 4, &procInfo, &procInfoSize, NULL, 0);
    const bool traced = (procInfo.kp_proc.p_flag & P_TRACED) == P_TRACED;
    return traced;
 #else
    // if ppid != sid, some process spawned our app, probably a debugger.
    return getppid() != getsid(getpid());
 #endif
 }
--- a/src/nvcore/Debug.h
+++ b/src/nvcore/Debug.h
@ -10,6 +10,9 @@
 #   include <stdarg.h> // va_list
 #endif
 // Make sure we are using our assert.
 #undef assert
 #define NV_ABORT_DEBUG      1
 #define NV_ABORT_IGNORE     2
 #define NV_ABORT_EXIT       3
@ -116,12 +119,6 @@
 #endif
 #if __cplusplus > 199711L
 #define nvStaticCheck(x) static_assert(x)
 #else
 #define nvStaticCheck(x) typedef char NV_DO_STRING_JOIN2(__static_assert_,__LINE__)[(x)]
 #endif
 NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL);
 NVCORE_API void NV_CDECL nvDebugPrint( const char *msg, ... ) __attribute__((format (printf, 1, 2)));
@ -166,6 +163,8 @@ namespace nv
        NVCORE_API void enableSigHandler();
        NVCORE_API void disableSigHandler();
        NVCORE_API bool isDebuggerPresent();
    }
 } // nv namespace
--- a/src/nvcore/DefsGnucDarwin.h
+++ b/src/nvcore/DefsGnucDarwin.h
@ -2,7 +2,7 @@
 #error "Do not include this file directly."
 #endif
-//#include <stdint.h> // uint8_t, int8_t, ...
+#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
 #include <cstddef> // operator new, size_t, NULL
 // Function linkage
@ -67,4 +67,4 @@ typedef int64_t     int64;
 // Aliases
 typedef uint32      uint;
-*/
+*/
--- a/src/nvcore/Memory.h
+++ b/src/nvcore/Memory.h
@ -12,10 +12,10 @@
 #include <new>	// new and delete
-#if NV_CC_GNUC
+#if NV_CC_GNUC
-#   define NV_ALIGN_16 __attribute__ ((__aligned__ (16)))
+#   define NV_ALIGN_16 __attribute__ ((__aligned__ (16)))
-#else
+#else
-#   define NV_ALIGN_16 __declspec(align(16))
+#   define NV_ALIGN_16 __declspec(align(16))
 #endif
@ -43,15 +43,15 @@ extern "C" {
 namespace nv {
    // C++ helpers.
    // Typed wrapper around ::malloc: requests sizeof(T) * count bytes and casts the result to T *.
    // NOTE(review): the size multiplication is not checked for overflow.
-    template <typename T> T * malloc(size_t count) {
+    template <typename T> NV_FORCEINLINE T * malloc(size_t count) {
        return (T *)::malloc(sizeof(T) * count);
    }
    // Typed wrapper around ::realloc: resizes ptr to sizeof(T) * count bytes and casts the result to T *.
    // NOTE(review): the size multiplication is not checked for overflow.
-    template <typename T> T * realloc(T * ptr, size_t count) {
+    template <typename T> NV_FORCEINLINE T * realloc(T * ptr, size_t count) {
        return (T *)::realloc(ptr, sizeof(T) * count);
    }
    // Typed wrapper around ::free. The pointer is taken as const T * and the qualifier is
    // cast away, so pointers to const objects can be released without a cast at the call site.
-    template <typename T> void free(const T * ptr) {
+    template <typename T> NV_FORCEINLINE void free(const T * ptr) {
        ::free((void *)ptr);
    }
--- a/src/nvcore/StdStream.h
+++ b/src/nvcore/StdStream.h
@ -72,7 +72,7 @@ namespace nv
 #if NV_OS_WIN32
            return _ftell_nolock(m_fp);
 #else
-            return ftell(m_fp);
+            return (uint)ftell(m_fp);
 #endif
        }
@ -85,9 +85,9 @@ namespace nv
            uint end = _ftell_nolock(m_fp);
            _fseek_nolock(m_fp, pos, SEEK_SET);
 #else
-            uint pos = ftell(m_fp);
+            uint pos = (uint)ftell(m_fp);
            fseek(m_fp, 0, SEEK_END);
-            uint end = ftell(m_fp);
+            uint end = (uint)ftell(m_fp);
            fseek(m_fp, pos, SEEK_SET);            
 #endif
            return end;
--- a/src/nvcore/StrLib.cpp
+++ b/src/nvcore/StrLib.cpp
@ -189,7 +189,7 @@ StringBuilder::StringBuilder() : m_size(0), m_str(NULL)
 }
 /** Preallocate space. */
-StringBuilder::StringBuilder( int size_hint ) : m_size(size_hint)
+StringBuilder::StringBuilder( uint size_hint ) : m_size(size_hint)
 {
    nvDebugCheck(m_size > 0);
    m_str = strAlloc(m_size);
@ -203,9 +203,15 @@ StringBuilder::StringBuilder( const StringBuilder & s ) : m_size(0), m_str(NULL)
 }
 /** Construct from a C string: starts empty (m_size 0, m_str NULL) and delegates the work to copy(). */
-StringBuilder::StringBuilder( const char * s, int extra_size_hint/*=0*/ ) : m_size(0), m_str(NULL)
+StringBuilder::StringBuilder(const char * s) : m_size(0), m_str(NULL)
 {
-    copy(s, extra_size_hint);
+    copy(s);
 }
 /** Construct from a character pointer plus a length: starts empty, then delegates to copy(s, len). */
 StringBuilder::StringBuilder(const char * s, uint len)
    : m_size(0), m_str(NULL)
 {
    this->copy(s, len);
 }
 /** Delete the string. */
@ -396,15 +402,25 @@ StringBuilder & StringBuilder::reserve( uint size_hint )
 /**
  * Copy a NUL-terminated string into this builder.
  * s must not be NULL (enforced with nvCheck). Room for strlen(s) + 1 bytes is
  * reserved and the text is copied together with its terminator.
  * Returns *this so calls can be chained.
  */
-StringBuilder & StringBuilder::copy( const char * s, int extra_size/*=0*/ )
+StringBuilder & StringBuilder::copy(const char * s)
 {
    nvCheck( s != NULL );
    // +1 so the terminating NUL is part of the copied range.
    const uint str_size = uint(strlen( s )) + 1;
-    reserve(str_size + extra_size);
+    reserve(str_size);
    memcpy(m_str, s, str_size);
    return *this;
 }
 /** Copy a string given an explicit length; storage for len + 1 characters is reserved first. Returns *this. */
 StringBuilder & StringBuilder::copy(const char * s, uint len)
 {
    nvCheck( s != NULL );
    // One slot more than len (presumably for the terminator - confirm against strCpy).
    const uint capacity = len + 1;
    this->reserve(capacity);
    strCpy(m_str, capacity, s, len);
    return *this;
 }
 /** Copy an StringBuilder. */
 StringBuilder & StringBuilder::copy( const StringBuilder & s )
--- a/src/nvcore/StrLib.h
+++ b/src/nvcore/StrLib.h
@ -59,9 +59,10 @@ namespace nv
    public:
        StringBuilder();
-        explicit StringBuilder( int size_hint );
+        explicit StringBuilder( uint size_hint );
-        StringBuilder( const char * str, int extra_size_hint = 0);
+        StringBuilder(const char * str);
-        StringBuilder( const StringBuilder & );
+        StringBuilder(const char * str, uint len);
        StringBuilder(const StringBuilder & other);
        ~StringBuilder();
@ -75,9 +76,10 @@ namespace nv
        StringBuilder & number( int i, int base = 10 );
        StringBuilder & number( uint i, int base = 10 );
-        StringBuilder & reserve( uint size_hint );
+        StringBuilder & reserve(uint size_hint);
-        StringBuilder & copy( const char * str, int extra_size/*=0*/ );
+        StringBuilder & copy(const char * str);
-        StringBuilder & copy( const StringBuilder & str );
+        StringBuilder & copy(const char * str, uint len);
        StringBuilder & copy(const StringBuilder & str);
        StringBuilder & toLower();
        StringBuilder & toUpper();
@ -145,7 +147,7 @@ namespace nv
    public:
        Path() : StringBuilder() {}
        explicit Path(int size_hint) : StringBuilder(size_hint) {}
-        Path(const char * str, int extra_size_hint = 0) : StringBuilder(str, extra_size_hint) {}
+        Path(const char * str) : StringBuilder(str) {}
        Path(const Path & path) : StringBuilder(path) {}
        const char * fileName() const;
--- a/src/nvcore/Utils.h
+++ b/src/nvcore/Utils.h
@ -7,9 +7,76 @@
 #include "nvcore.h"
 #include "Debug.h" // nvDebugCheck
 // Just in case. Grrr.
 #undef min
 #undef max
 namespace nv
 {
    // Convert an unsigned integer to the signed type of the same width.
    // Less error prone than casting. From CB:
    // http://cbloomrants.blogspot.com/2011/06/06-17-11-c-casting-is-devil.html
    inline int8 asSigned(uint8 x)
    {
        return static_cast<int8>(x);
    }
    inline int16 asSigned(uint16 x)
    {
        return static_cast<int16>(x);
    }
    inline int32 asSigned(uint32 x)
    {
        return static_cast<int32>(x);
    }
    inline int64 asSigned(uint64 x)
    {
        return static_cast<int64>(x);
    }
    // Convert a signed integer to the unsigned type of the same width.
    inline uint8 asUnsigned(int8 x)
    {
        return static_cast<uint8>(x);
    }
    inline uint16 asUnsigned(int16 x)
    {
        return static_cast<uint16>(x);
    }
    inline uint32 asUnsigned(int32 x)
    {
        return static_cast<uint32>(x);
    }
    inline uint64 asUnsigned(int64 x)
    {
        return static_cast<uint64>(x);
    }
    /*
    template <typename T> inline int8 toI8(T x) { 
        nvDebugCheck(x <= INT8_MAX);
        nvDebugCheck(x >= INT8_MIN);
        int8 y = (int8) x;
        nvDebugCheck(x == (T)y);
        return y;
    }
    template <typename T> inline uint8 toU8(T x) { 
        nvDebugCheck(x <= UINT8_MAX);
        nvDebugCheck(x >= 0);
        return (uint8) x;
    }
    template <typename T> inline int16 toI16(T x) { 
        nvDebugCheck(x <= INT16_MAX);
        nvDebugCheck(x >= INT16_MIN);
        return (int16) x;
    }
    template <typename T> inline uint16 toU16(T x) { 
        nvDebugCheck(x <= UINT16_MAX);
        nvDebugCheck(x >= 0);
        return (uint16) x;
    }
    template <typename T> inline int32 toI32(T x) { 
        nvDebugCheck(x <= INT32_MAX);
        nvDebugCheck(x >= INT32_MIN);
        return (int32) x;
    }
    template <typename T> inline uint32 toU32(T x) { 
        nvDebugCheck(x <= UINT32_MAX);
        nvDebugCheck(x >= 0);
        return (uint32) x;
    }
    template <typename T> inline int64 toI64(T x) { 
        nvDebugCheck(x <= INT64_MAX);
        nvDebugCheck(x >= INT64_MIN);
        return (int64) x;
    }
    template <typename T> inline uint64 toU64(T x) { 
        nvDebugCheck(x <= UINT64_MAX);
        nvDebugCheck(x >= 0);
        return (uint64) x;
    }
    */
    /// Swap two values.
    template <typename T> 
    inline void swap(T & a, T & b)
--- a/src/nvcore/nvcore.h
+++ b/src/nvcore/nvcore.h
@ -4,9 +4,6 @@
 #ifndef NV_CORE_H
 #define NV_CORE_H
 // cmake config
 #include <nvconfig.h>
 // Function linkage
 #if NVCORE_SHARED
 #ifdef NVCORE_EXPORTS
@ -91,7 +88,11 @@
 // @@ NV_CC_MSVC7
 // @@ NV_CC_MSVC8
-#if defined POSH_COMPILER_GCC
+#if defined POSH_COMPILER_CLANG
 #   define NV_CC_CLANG  1
 #   define NV_CC_GNUC   1    // Clang is compatible with GCC; NV_CC_GNUC is the macro the GNU code paths test.
 #   define NV_CC_GCC    1    // Kept so anything already checking this spelling keeps working.
 #   define NV_CC_STRING "clang"
 #elif defined POSH_COMPILER_GCC
 #   define NV_CC_GNUC   1
 #   define NV_CC_STRING "gcc"
 #elif defined POSH_COMPILER_MSVC
@ -108,6 +109,18 @@
 #define NV_ENDIAN_STRING    POSH_ENDIAN_STRING
 // Define the right printf prefix for size_t arguments:
 // - with 64-bit pointers, reuse posh's 64-bit integer printf prefix;
 // - otherwise the macro expands to nothing, so no length modifier is added.
 // NOTE(review): this assumes size_t is as wide as a pointer - confirm for every supported target.
 #if POSH_64BIT_POINTER
 #  define NV_SIZET_PRINTF_PREFIX POSH_I64_PRINTF_PREFIX
 #else
 #  define NV_SIZET_PRINTF_PREFIX
 #endif
 // cmake config
 #include "nvconfig.h"
 // Type definitions:
 typedef posh_u8_t   uint8;
 typedef posh_i8_t   int8;
@ -144,6 +157,8 @@ typedef uint32      uint;
    private: \
    void *operator new(size_t size); \
    void *operator new[](size_t size);
    //static void *operator new(size_t size); \
    //static void *operator new[](size_t size);
 // String concatenation macros.
 #define NV_STRING_JOIN2(arg1, arg2) NV_DO_STRING_JOIN2(arg1, arg2)
@ -153,6 +168,25 @@ typedef uint32      uint;
 #define NV_STRING2(x) #x
 #define NV_STRING(x) NV_STRING2(x)
 // Compile-time assertion: nvStaticCheck(x) fails the build when x is false.
 // static_assert without a message is only valid from C++17 on, so the stringized
 // condition is always passed as the message to stay valid in C++11 and C++14 too.
 #if __cplusplus > 199711L
 #define nvStaticCheck(x) static_assert((x), #x)
 #else
 // Pre-C++11 fallback: a false condition produces a negative array size, which is always
 // an error (a size of 0 would be accepted by some compilers as an extension).
 #define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x) ? 1 : -1]
 #endif
 #define NV_COMPILER_CHECK(x) nvStaticCheck(x)   // I like this name best.
 // Make sure type definitions are fine: every fixed-width alias must have exactly the advertised size.
 NV_COMPILER_CHECK(sizeof(int8) == 1);
 NV_COMPILER_CHECK(sizeof(uint8) == 1);
 NV_COMPILER_CHECK(sizeof(int16) == 2);
 NV_COMPILER_CHECK(sizeof(uint16) == 2);
 NV_COMPILER_CHECK(sizeof(int32) == 4);
 NV_COMPILER_CHECK(sizeof(uint32) == 4);
 NV_COMPILER_CHECK(sizeof(int64) == 8);
 NV_COMPILER_CHECK(sizeof(uint64) == 8);
 #define NV_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
 #if 1
@ -180,6 +214,7 @@ typedef uint32      uint;
 // Null index. @@ Move this somewhere else... it's only used by nvmesh.
 //const unsigned int NIL = unsigned int(~0);
 //#define NIL uint(~0)
 // Null pointer.
 #ifndef NULL
--- a/src/nvimage/DirectDrawSurface.cpp
+++ b/src/nvimage/DirectDrawSurface.cpp
@ -1418,7 +1418,7 @@ uint DirectDrawSurface::mipmapSize(uint mipmap) const
    {
        nvDebugCheck((header.pf.flags & DDPF_RGB) || (header.pf.flags & DDPF_LUMINANCE));
-        uint pitch = computeBytePitch(w, header.pf.bitcount, 8); // Asuming 8 bit alignment, which is the same D3DX expects.
+        uint pitch = computeBytePitch(w, header.pf.bitcount, 1); // Asuming 1 byte alignment, which is the same D3DX expects.
        return pitch * h * d;
    }
--- a/src/nvimage/FloatImage.cpp
+++ b/src/nvimage/FloatImage.cpp
@ -181,7 +181,7 @@ void FloatImage::normalize(uint baseComponent)
    for (uint i = 0; i < count; i++) {
        Vector3 normal(xChannel[i], yChannel[i], zChannel[i]);
-        normal = normalizeSafe(normal, Vector3(zero), 0.0f);
+        normal = normalizeSafe(normal, Vector3(0), 0.0f);
        xChannel[i] = normal.x;
        yChannel[i] = normal.y;
--- a/src/nvimage/FloatImage.h
+++ b/src/nvimage/FloatImage.h
@ -56,6 +56,7 @@ namespace nv
        //@{
        NVIMAGE_API void clear(float f = 0.0f);
        NVIMAGE_API void clear(uint component, float f = 0.0f);
        NVIMAGE_API void copyChannel(uint src, uint dst);
        NVIMAGE_API void normalize(uint base_component);
@ -113,8 +114,6 @@ namespace nv
        uint pixelCount() const { return m_pixelCount; }
        // @@ It would make sense to swap the order of the arguments so that 'c' is always first.
        /** @name Pixel access. */
        //@{
        const float * channel(uint c) const;
--- a/src/nvimage/Image.h
+++ b/src/nvimage/Image.h
@ -70,14 +70,14 @@ namespace nv
    /// Read-only access to the pixel at column x, row y.
    /// The coordinates are debug-checked against the image dimensions, then mapped to the
    /// linear index y * width + x and forwarded to the single-index pixel() overload.
    inline const Color32 & Image::pixel(uint x, uint y) const
    {
-        nvDebugCheck(x < width() && y < height());
+        nvDebugCheck(x < m_width && y < m_height);
-        return pixel(y * width() + x);
+        return pixel(y * m_width + x);
    }
    /// Mutable access to the pixel at column x, row y.
    /// The coordinates are debug-checked against the image dimensions, then mapped to the
    /// linear index y * width + x and forwarded to the single-index pixel() overload.
    inline Color32 & Image::pixel(uint x, uint y)
    {
-        nvDebugCheck(x < width() && y < height());
+        nvDebugCheck(x < m_width && y < m_height);
-        return pixel(y * width() + x);
+        return pixel(y * m_width + x);
    }
 } // nv namespace
--- a/src/nvimage/ImageIO.cpp
+++ b/src/nvimage/ImageIO.cpp
@ -215,7 +215,7 @@ FloatImage * nv::ImageIO::loadFloat(const char * fileName)
    StdInputStream stream(fileName);
    if (stream.isError()) {
-        return false;
+        return NULL;
    }
    return loadFloat(fileName, stream);
@ -324,9 +324,9 @@ bool nv::ImageIO::saveFloat(const char * fileName, Stream & s, const FloatImage
 bool nv::ImageIO::saveFloat(const char * fileName, const FloatImage * fimage, uint baseComponent, uint componentCount)
 {
 #if !defined(HAVE_FREEIMAGE)
    const char * extension = Path::extension(fileName);
 #if !defined(HAVE_FREEIMAGE)
 #if defined(HAVE_OPENEXR)
    if (strCaseCmp(extension, ".exr") == 0) {
        return saveFloatEXR(fileName, fimage, baseComponent, componentCount);
@ -711,7 +711,7 @@ Image * nv::ImageIO::loadTGA(Stream & s)
        case TGA_TYPE_INDEXED:
            if( tga.colormap_type!=1 || tga.colormap_size!=24 || tga.colormap_length>256 ) {
                nvDebug( "*** loadTGA: Error, only 24bit paletted images are supported.\n" );
-                return false;
+                return NULL;
            }
            pal = true;
            break;
@ -732,7 +732,7 @@ Image * nv::ImageIO::loadTGA(Stream & s)
 	default:
 	    nvDebug( "*** loadTGA: Error, unsupported image type.\n" );
-	    return false;
+	    return NULL;
    }
    const uint pixel_size = (tga.pixel_size/8);
@ -1369,7 +1369,7 @@ Image * nv::ImageIO::loadJPG(Stream & s)
    // Read the entire file.
    Array<uint8> byte_array;
    byte_array.resize(s.size());
-    s.serialize(byte_array.mutableBuffer(), s.size());
+    s.serialize(byte_array.buffer(), s.size());
    jpeg_decompress_struct cinfo;
    jpeg_error_mgr jerr;
--- a/src/nvmath/Half.cpp
+++ b/src/nvmath/Half.cpp
@ -487,46 +487,126 @@ nv::half_to_float( uint16 h )
    return (f_result);
 }
 uint32 
 nv::fast_half_to_float( uint16 h )
 {
    const uint32 h_e_mask              = _uint32_li( 0x00007c00 );
    const uint32 h_m_mask              = _uint32_li( 0x000003ff );
    const uint32 h_s_mask              = _uint32_li( 0x00008000 );
    const uint32 h_f_s_pos_offset      = _uint32_li( 0x00000010 );
    const uint32 h_f_e_pos_offset      = _uint32_li( 0x0000000d );
    const uint32 h_f_bias_offset       = _uint32_li( 0x0001c000 );
    const uint32 f_e_mask              = _uint32_li( 0x7f800000 );
    const uint32 f_m_mask              = _uint32_li( 0x007fffff );
    const uint32 h_f_e_denorm_bias     = _uint32_li( 0x0000007e );
    const uint32 h_f_m_denorm_sa_bias  = _uint32_li( 0x00000008 );
    const uint32 f_e_pos               = _uint32_li( 0x00000017 );
    const uint32 h_e_mask_minus_one    = _uint32_li( 0x00007bff );
    const uint32 h_e                   = _uint32_and( h, h_e_mask );
    const uint32 h_m                   = _uint32_and( h, h_m_mask );
    const uint32 h_s                   = _uint32_and( h, h_s_mask );
    const uint32 h_e_f_bias            = _uint32_add( h_e, h_f_bias_offset );
    const uint32 h_m_nlz               = _uint32_cntlz( h_m );
    const uint32 f_s                   = _uint32_sll( h_s,        h_f_s_pos_offset );
    const uint32 f_e                   = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
    const uint32 f_m                   = _uint32_sll( h_m,        h_f_e_pos_offset );
    const uint32 f_em                  = _uint32_or(  f_e,        f_m              );
    const uint32 h_f_m_sa              = _uint32_sub( h_m_nlz,             h_f_m_denorm_sa_bias );
    const uint32 f_e_denorm_unpacked   = _uint32_sub( h_f_e_denorm_bias,   h_f_m_sa             );
    const uint32 h_f_m                 = _uint32_sll( h_m,                 h_f_m_sa             );
    const uint32 f_m_denorm            = _uint32_and( h_f_m,               f_m_mask             );
    const uint32 f_e_denorm            = _uint32_sll( f_e_denorm_unpacked, f_e_pos              );
    const uint32 f_em_denorm           = _uint32_or(  f_e_denorm,          f_m_denorm           );
    const uint32 f_em_nan              = _uint32_or(  f_e_mask,            f_m                  );
    const uint32 is_e_eqz_msb          = _uint32_dec(  h_e );
    const uint32 is_m_nez_msb          = _uint32_neg(  h_m );
    const uint32 is_e_flagged_msb      = _uint32_sub(  h_e_mask_minus_one, h_e );
    const uint32 is_zero_msb           = _uint32_andc( is_e_eqz_msb,       is_m_nez_msb );
    const uint32 is_denorm_msb         = _uint32_and(  is_m_nez_msb,       is_e_eqz_msb );
    const uint32 is_zero               = _uint32_ext(  is_zero_msb );
    const uint32 f_zero_result         = _uint32_andc( f_em, is_zero );
    const uint32 f_denorm_result       = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result );
    const uint32 f_result              = _uint32_or( f_s, f_denorm_result );
-    return (f_result);
+// @@ These tables could be smaller.
 static uint32 mantissa_table[2048];
 static uint32 exponent_table[64];
 static uint32 offset_table[64];
 void nv::half_init_tables()
 {
    // Init mantissa table.
 	mantissa_table[0] = 0;
 	for (int i = 1; i < 1024; i++) {
 		uint m = i << 13;
 		uint e = 0;
 		while ((m & 0x00800000) == 0) {
 			e -= 0x00800000;
 			m <<= 1;
 		}
 		m &= ~0x00800000;
 		e += 0x38800000;
 		mantissa_table[i] = m | e;
 	}
    for (int i = 1024; i < 2048; i++) {
 		mantissa_table[i] = 0x38000000 + ((i - 1024) << 13);
    }
    // Init exponent table.
 	exponent_table[0] = 0;
    for (int i = 1; i < 31; i++) {
 		exponent_table[i] = (i << 23);
    }
 	exponent_table[31] = 0x47800000;
 	exponent_table[32] = 0x80000000;
    for (int i = 33; i < 63; i++) {
 		exponent_table[i] = 0x80000000 + ((i - 32) << 23);
    }
 	exponent_table[63] = 0xC7800000;
    // Init offset table.
 	offset_table[0] = 0;
    for (int i = 1; i < 32; i++) {
 		offset_table[i] = 1024;
    }
 	offset_table[32] = 0;
    for (int i = 33; i < 64; i++) {
 		offset_table[i] = 1024;
    }
    /*for (int i = 0; i < 64; i++) {
        offset_table[i] = ((i & 31) != 0) * 1024;
    }*/
 }
 // Table-driven half to float conversion, based on:
 // http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
 // The tables are only filled by half_init_tables(), so that must run first.
 uint32 nv::fast_half_to_float(uint16 h)
 {
    const uint signExp = h >> 10;       // Top six bits: sign and exponent.
    const uint fraction = h & 0x3ff;    // Low ten bits: mantissa.
    const uint32 mantissaBits = mantissa_table[offset_table[signExp] + fraction];
    return mantissaBits + exponent_table[signExp];
 }
 #if 0
 // Inaccurate conversion suggested at the ffmpeg mailing list:
 // http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html
 uint32 nv::fast_half_to_float(uint16 v)
 {
    if (v & 0x8000) return 0;
    uint exp = v >> 10;
    if (!exp) return (v>>9)&1;
    if (exp >= 15) return 0xffff;
    v <<= 6;
    return (v+(1<<16)) >> (15-exp);
 }
 #endif
 #if 0
 // Some more from a gamedev thread:
 // http://www.devmaster.net/forums/showthread.php?t=10924
 // I believe it does not handle specials either.
 // Mike Acton's code should be fairly easy to vectorize and that would handle all cases too, the table method might still be faster, though.
 static __declspec(align(16)) unsigned half_sign[4]	  = {0x00008000, 0x00008000, 0x00008000, 0x00008000};
 static __declspec(align(16)) unsigned half_exponent[4]	  = {0x00007C00, 0x00007C00, 0x00007C00, 0x00007C00};
 static __declspec(align(16)) unsigned half_mantissa[4]	  = {0x000003FF, 0x000003FF, 0x000003FF, 0x000003FF};
 static __declspec(align(16)) unsigned half_bias_offset[4] = {0x0001C000, 0x0001C000, 0x0001C000, 0x0001C000};
 __asm
 {
 	movaps	xmm1, xmm0  // Input in xmm0
 	movaps	xmm2, xmm0
 	andps	xmm0, half_sign
 	andps	xmm1, half_exponent
 	andps	xmm2, half_mantissa
 	paddd	xmm1, half_bias_offset
 	pslld	xmm0, 16
 	pslld	xmm1, 13
 	pslld	xmm2, 13
 	orps	xmm1, xmm2
 	orps	xmm0, xmm1  // Result in xmm0
 }
 #endif
--- a/src/nvmath/Half.h
+++ b/src/nvmath/Half.h
@ -9,8 +9,9 @@ namespace nv {
    uint32 half_to_float( uint16 h );
    uint16 half_from_float( uint32 f );
-    // Does not handle NaN or infinity.
+    void half_init_tables();
-    uint32 fast_half_to_float( uint16 h );
+
    uint32 fast_half_to_float(uint16 h);
    inline uint16 to_half(float c) {
        union { float f; uint32 u; } f;
--- a/src/nvmath/Matrix.h
+++ b/src/nvmath/Matrix.h
@ -9,15 +9,14 @@
 namespace nv
 {
    enum zero_t { zero };
    enum identity_t { identity };
    class NVMATH_CLASS Matrix3
    {
    public:
        Matrix3();
-        Matrix3(zero_t);
+        explicit Matrix3(float f);
-        Matrix3(identity_t);
+        explicit Matrix3(identity_t);
        Matrix3(const Matrix3 & m);
        Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2);
@ -41,10 +40,10 @@ namespace nv
    inline Matrix3::Matrix3() {}
-    inline Matrix3::Matrix3(zero_t)
+    inline Matrix3::Matrix3(float f)
    {
        for(int i = 0; i < 9; i++) {
-            m_data[i] = 0.0f;
+            m_data[i] = f;
        }
    }
@ -204,11 +203,11 @@ namespace nv
        typedef Matrix const & Arg;
        Matrix();
-        Matrix(zero_t);
+        explicit Matrix(float f);
-        Matrix(identity_t);
+        explicit Matrix(identity_t);
        Matrix(const Matrix & m);
        Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);
-        Matrix(const scalar m[]);	// m is assumed to contain 16 elements
+        //explicit Matrix(const scalar m[]);	// m is assumed to contain 16 elements
        scalar data(uint idx) const;
        scalar & data(uint idx);
@ -237,7 +236,7 @@ namespace nv
    {
    }
-    inline Matrix::Matrix(zero_t)
+    inline Matrix::Matrix(float f)
    {
        for(int i = 0; i < 16; i++) {
            m_data[i] = 0.0f;
@ -268,12 +267,12 @@ namespace nv
        m_data[12] = v3.x; m_data[13] = v3.y; m_data[14] = v3.z; m_data[15] = v3.w;
    }
-    inline Matrix::Matrix(const scalar m[])
+    /*inline Matrix::Matrix(const scalar m[])
    {
        for(int i = 0; i < 16; i++) {
            m_data[i] = m[i];
        }
-    }
+    }*/
    // Accessors
@ -456,7 +455,7 @@ namespace nv
    /// Get frustum matrix.
    inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear, scalar zFar)
    {
-        Matrix m(zero);
+        Matrix m(0.0f);
        scalar doubleznear = 2.0f * zNear;
        scalar one_deltax = 1.0f / (xmax - xmin);
@ -477,7 +476,7 @@ namespace nv
    /// Get infinite frustum matrix.
    inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear)
    {
-        Matrix m(zero);
+        Matrix m(0.0f);
        scalar doubleznear = 2.0f * zNear;
        scalar one_deltax = 1.0f / (xmax - xmin);
--- a/src/nvmath/Vector.h
+++ b/src/nvmath/Vector.h
@ -100,6 +100,7 @@ namespace nv
        explicit Vector4(scalar x);
        Vector4(scalar x, scalar y, scalar z, scalar w);
        Vector4(Vector2::Arg v, scalar z, scalar w);
        Vector4(Vector2::Arg v, Vector2::Arg u);
        Vector4(Vector3::Arg v, scalar w);
        Vector4(Vector4::Arg v);
        //	Vector4(const Quaternion & v);
@ -107,6 +108,7 @@ namespace nv
        const Vector4 & operator=(Vector4::Arg v);
        Vector2 xy() const;
        Vector2 zw() const;
        Vector3 xyz() const;
        const scalar * ptr() const;
@ -290,6 +292,7 @@ namespace nv
    inline Vector4::Vector4(scalar f) : x(f), y(f), z(f), w(f) {}
    inline Vector4::Vector4(scalar x, scalar y, scalar z, scalar w) : x(x), y(y), z(z), w(w) {}
    inline Vector4::Vector4(Vector2::Arg v, scalar z, scalar w) : x(v.x), y(v.y), z(z), w(w) {}
    inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {}
    inline Vector4::Vector4(Vector3::Arg v, scalar w) : x(v.x), y(v.y), z(v.z), w(w) {}
    inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
@ -307,6 +310,11 @@ namespace nv
        return Vector2(x, y);
    }
    // Returns the z and w components packed into a Vector2 (z -> x, w -> y).
    inline Vector2 Vector4::zw() const
    {
        return Vector2(z, w);
    }
    inline Vector3 Vector4::xyz() const
    {
        return Vector3(x, y, z);
@ -469,6 +477,14 @@ namespace nv
        return scale(v, 1.0f / l);
    }
    // Safe, branchless normalization from Andy Firth. All error checking omitted.
    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
    // A tiny bias is added to the length so a zero vector does not divide by zero.
    inline Vector2 normalizeFast(Vector2::Arg v)
    {
        const float tiny = 1.0e-037f;
        float biasedLength = tiny + length(v);
        float inverse = 1.0f / biasedLength;
        return scale(v, inverse);
    }
    inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON)
    {
@ -498,6 +514,14 @@ namespace nv
        return vf;
    }
    // Returns the 2D cross product (a - c) x (b - c). Note that this is twice the
    // signed area of triangle abc; no 0.5 factor is applied.
    inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
    {
        const Vector2 ca = a - c;
        const Vector2 cb = b - c;
        return ca.x * cb.y - ca.y * cb.x;
    }
    // Vector3
@ -570,10 +594,10 @@ namespace nv
        return scale(v, 1.0f/s);
    }
-    inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, scalar s)
+    /*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, scalar s)
    {
        return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s);
-    }
+    }*/
    inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, scalar t)
    {
@ -624,6 +648,15 @@ namespace nv
        return scale(v, 1.0f / l);
    }
    // Safe, branchless normalization from Andy Firth. All error checking omitted.
    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
    // A tiny bias is added to the length so a zero vector does not divide by zero.
    inline Vector3 normalizeFast(Vector3::Arg v)
    {
        const float tiny = 1.0e-037f;
        float biasedLength = tiny + length(v);
        float inverse = 1.0f / biasedLength;
        return scale(v, inverse);
    }
    inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON)
    {
        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon);
@ -762,6 +795,15 @@ namespace nv
        return scale(v, 1.0f / l);
    }
    // Safe, branchless normalization from Andy Firth. All error checking omitted.
    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
    // A tiny bias is added to the length so a zero vector does not divide by zero.
    inline Vector4 normalizeFast(Vector4::Arg v)
    {
        const float tiny = 1.0e-037f;
        float biasedLength = tiny + length(v);
        float inverse = 1.0f / biasedLength;
        return scale(v, inverse);
    }
    inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON)
    {
        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon);
--- a/src/nvmath/nvmath.h
+++ b/src/nvmath/nvmath.h
@ -4,8 +4,9 @@
 #ifndef NV_MATH_H
 #define NV_MATH_H
-#include <nvcore/nvcore.h>
+#include "nvcore/nvcore.h"
-#include <nvcore/Debug.h>
+#include "nvcore/Debug.h"
 #include "nvcore/Utils.h" // clamp
 #include <math.h>
 #include <limits.h> // INT_MAX
@ -194,7 +195,7 @@ namespace nv
        return f - floor(f);
    }
-    inline float fround(float f)
+    inline float fround(float f)    // @@ rename floatRound
    {
        // @@ Do something better.
        return float(iround(f));
@ -210,6 +211,29 @@ namespace nv
        }
    }
    // Clamps f to the [0, 1] range.
    inline float saturate(float f) {
        return clamp(f, 0.0f, 1.0f);
    }
    // Linear ramp: maps x from [edge0, edge1] onto [0, 1] and saturates outside it.
    // The division is unguarded, so edge0 == edge1 is the caller's responsibility.
    inline float linearstep(float edge0, float edge1, float x) {
        const float t = (x - edge0) / (edge1 - edge0);
        return saturate(t);
    }
    // Cubic Hermite ramp between edge0 and edge1: 3t^2 - 2t^3 of the linear step.
    inline float smoothstep(float edge0, float edge1, float x) {
        const float t = linearstep(edge0, edge1, x);
        return t*t*(3 - 2*t);
    }
    // Returns 1 for positive values, -1 for negative values and 0 otherwise
    // (zero of either sign, or NaN, since both comparisons are false).
    inline int sign(float a)
    {
        return (a > 0.0f) - (a < 0.0f);
    }
 } // nv
 #endif // NV_MATH_H
--- a/src/nvthread/CMakeLists.txt
+++ b/src/nvthread/CMakeLists.txt
@ -0,0 +1,26 @@
 PROJECT(nvthreads)
 # NOTE(review): confirm every file listed below exists in this directory; the
 # public header included by the sources is "nvthread.h".
 SET(THREADS_SRCS
 	nvthreads.h
 	Mutex.h Mutex.cpp
 	SpinWaiter.h SpinWaiter.cpp
 	Thread.h Thread.cpp
 	ThreadLocalStorage.h ThreadLocalStorage.cpp)
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 # targets
 # nvthread.h selects DLL_EXPORT when NVTHREAD_EXPORTS is defined, so the
 # definition must use exactly that spelling.
 ADD_DEFINITIONS(-DNVTHREAD_EXPORTS)
 IF(NVTHREADS_SHARED)
 	# nvthread.h only applies import/export decoration when NVTHREAD_SHARED is set.
 	ADD_DEFINITIONS(-DNVTHREAD_SHARED=1)
 	ADD_LIBRARY(nvthreads SHARED ${THREADS_SRCS})
 ELSE(NVTHREADS_SHARED)
 	ADD_LIBRARY(nvthreads ${THREADS_SRCS})
 ENDIF(NVTHREADS_SHARED)
 TARGET_LINK_LIBRARIES(nvthreads ${LIBS} nvcore)
 INSTALL(TARGETS nvthreads
 	RUNTIME DESTINATION bin
 	LIBRARY DESTINATION lib
 	ARCHIVE DESTINATION lib/static)
--- a/src/nvthread/Event.cpp
+++ b/src/nvthread/Event.cpp
@ -0,0 +1,52 @@
 // This code is in the public domain -- castano@gmail.com
 #include "Event.h"
 #if NV_OS_WIN32
 #include "Win32.h"
 #elif NV_OS_UNIX
 #include <pthread.h>
 #endif
 using namespace nv;
 #if NV_OS_WIN32
 // Win32 implementation: a thin wrapper around an event object handle.
 struct Event::Private {
 	HANDLE handle;
 };
 // Creates an unnamed auto-reset event that starts out non-signaled.
 Event::Event() : m(new Private) {
    m->handle = CreateEvent(NULL, FALSE, FALSE, NULL);
    // CreateEvent returns NULL on failure; catch it here rather than at the first post/wait.
    nvDebugCheck(m->handle != NULL);
 }
 Event::~Event() {
    if (m->handle != NULL) {
        CloseHandle(m->handle);
    }
 }
 // Signals the event.
 void Event::post() {
    SetEvent(m->handle);
 }
 // Blocks until the event is signaled. The event was created as auto-reset,
 // so a satisfied wait returns it to the non-signaled state.
 void Event::wait() {
    WaitForSingleObject(m->handle, INFINITE);
 }
 // Signals each of the 'count' events in the array, in order.
 /*static*/ void Event::post(Event * events, uint count) {
    for (uint i = 0; i < count; i++) {
        events[i].post();
    }
 }
 // Waits for each of the 'count' events in turn; returns once all have been signaled.
 /*static*/ void Event::wait(Event * events, uint count) {
    // @@ Use wait for multiple objects?
    for (uint i = 0; i < count; i++) {
        events[i].wait();
    }
 }
 #elif NV_OS_UNIX
    // @@ 
 #endif	
--- a/src/nvthread/Event.h
+++ b/src/nvthread/Event.h
@ -0,0 +1,34 @@
 // This code is in the public domain -- castano@gmail.com
 #pragma once
 #ifndef NV_THREAD_EVENT_H
 #define NV_THREAD_EVENT_H
 #include "nvthread.h"
 #include "nvcore/Ptr.h"
 namespace nv
 {
    // Signalable event object. The platform state lives in the opaque Private struct.
    // This is intended to be used by a single waiter thread.
 	class NVTHREAD_CLASS Event
 	{
 		NV_FORBID_COPY(Event);
 	public:
 		Event();
 		~Event();
 		// Signals the event.
 		void post();
 		void wait();    // Wait resets the event.
        // Array helpers: apply post()/wait() to each of the 'count' events in turn.
        static void post(Event * events, uint count);
        static void wait(Event * events, uint count);
 	private:
 		struct Private;
 		AutoPtr<Private> m;
 	};
 } // nv namespace
 #endif // NV_THREAD_EVENT_H
--- a/src/nvthread/Mutex.cpp
+++ b/src/nvthread/Mutex.cpp
@ -0,0 +1,89 @@
 // This code is in the public domain -- castano@gmail.com
 #include "Mutex.h"
 #if NV_OS_WIN32
 #include "Win32.h"
 #elif NV_OS_UNIX
 #include <pthread.h>
 #include <errno.h> // EBUSY
 #endif // NV_OS
 using namespace nv;
 #if NV_OS_WIN32
 struct Mutex::Private {
 	CRITICAL_SECTION mutex;
 };
 Mutex::Mutex () : m(new Private)
 {
 	InitializeCriticalSection(&m->mutex);
 }
 Mutex::~Mutex ()
 {
 	DeleteCriticalSection(&m->mutex);
 }
 void Mutex::lock()
 {
 	EnterCriticalSection(&m->mutex);
 }
 bool Mutex::tryLock()
 {
 	return TryEnterCriticalSection(&m->mutex) != 0;
 }
 void Mutex::unlock()
 {
 	LeaveCriticalSection(&m->mutex);	
 }
 #elif NV_OS_UNIX
 // pthread implementation. Every pthread call is verified with nvDebugCheck.
 struct Mutex::Private {
 	pthread_mutex_t mutex;
 };
 // Initializes the mutex with default attributes.
 Mutex::Mutex () : m(new Private)
 {
    const int status = pthread_mutex_init(&m->mutex, NULL);
    nvDebugCheck(status == 0);
 }
 Mutex::~Mutex ()
 {
    const int status = pthread_mutex_destroy(&m->mutex);
    nvDebugCheck(status == 0);
 }
 void Mutex::lock()
 {
    const int status = pthread_mutex_lock(&m->mutex);
    nvDebugCheck(status == 0);
 }
 // Non-blocking attempt: EBUSY means another holder owns the mutex and is not an error.
 bool Mutex::tryLock()
 {
    const int status = pthread_mutex_trylock(&m->mutex);
    nvDebugCheck(status == 0 || status == EBUSY);
    return status == 0;
 }
 void Mutex::unlock()
 {
    const int status = pthread_mutex_unlock(&m->mutex);
    nvDebugCheck(status == 0);
 }
 #endif // NV_OS
--- a/src/nvthread/Mutex.h
+++ b/src/nvthread/Mutex.h
@ -0,0 +1,47 @@
 // This code is in the public domain -- castano@gmail.com
 #pragma once
 #ifndef NV_THREAD_MUTEX_H
 #define NV_THREAD_MUTEX_H
 #include "nvthread.h"
 #include "nvcore/Ptr.h"
 namespace nv
 {
 	// Mutual exclusion lock. The platform state lives in the opaque Private struct.
 	class NVTHREAD_CLASS Mutex
 	{
 		NV_FORBID_COPY(Mutex);
 	public:
 		Mutex ();
 		~Mutex ();
 		// Blocks until the mutex is acquired.
 		void lock();
 		// Attempts to acquire without blocking; returns true on success.
 		bool tryLock();
 		void unlock();
 	private:
 		struct Private;
 		AutoPtr<Private> m;
 	};
    // Templated lock that can be used with any mutex.
    // Scoped guard: calls lock() on construction and unlock() on destruction.
    // M only needs to provide lock() and unlock(). The guard stores a reference,
    // so the mutex must outlive it.
    template <class M>
 	class Lock
 	{
 		NV_FORBID_COPY(Lock);
 	public:
 		Lock (M & m) : m_mutex (m) { m_mutex.lock(); }
 		~Lock () { m_mutex.unlock(); }
 	private:
 		M & m_mutex;
 	};
 } // nv namespace
 #endif // NV_THREAD_MUTEX_H
--- a/src/nvthread/ParallelFor.cpp
+++ b/src/nvthread/ParallelFor.cpp
@ -0,0 +1,61 @@
 // This code is in the public domain -- Ignacio Casta<74>o <castano@gmail.com>
 #include "ParallelFor.h"
 #include "Thread.h"
 #include "Atomic.h"
 #include "ThreadPool.h"
 using namespace nv;
 #define ENABLE_PARALLEL_FOR 1
 void worker(void * arg) {
    ParallelFor * owner = (ParallelFor *)arg;
    while(true) {
        // Consume one element at a time. @@ Might be more efficient to have custom grain.
        uint i = atomicIncrement(&owner->idx);
        if (i > owner->count) {
            break;
        }
        owner->task(owner->context, i - 1);
    } 
 }
 // Stores the task and its context and, when the parallel path is enabled,
 // acquires the shared thread pool. The remaining members get defined values so
 // the object is never observed half-initialized before run() is called.
 ParallelFor::ParallelFor(ForTask * task, void * context) : task(task), context(context), pool(0), count(0), idx(0) {
 #if ENABLE_PARALLEL_FOR
    pool = ThreadPool::acquire();
 #endif
 }
 // Hands the thread pool acquired by the constructor back to ThreadPool.
 ParallelFor::~ParallelFor() {
 #if ENABLE_PARALLEL_FOR
    ThreadPool::release(pool);
 #endif
 }
 // Invokes task(context, i) for every i in [0, count). With ENABLE_PARALLEL_FOR
 // the indices are handed out to the pool's workers through the shared counter;
 // otherwise they are processed in order on the calling thread.
 void ParallelFor::run(uint count) {
 #if ENABLE_PARALLEL_FOR
    storeRelease(&this->count, count);
    // Init atomic counter to zero.
    storeRelease(&idx, 0);
    // Start threads.
    pool->start(worker, this);
    // Wait for all threads to complete.
    pool->wait();
    nvDebugCheck(idx >= count);
 #else
    // Use an unsigned index so the comparison with 'count' is not signed/unsigned.
    for (uint i = 0; i < count; i++) {
        task(context, (int)i);
    }
 #endif
 }
--- a/src/nvthread/ParallelFor.h
+++ b/src/nvthread/ParallelFor.h
@ -0,0 +1,38 @@
 // This code is in the public domain -- Ignacio Casta<74>o <castano@gmail.com>
 #pragma once
 #ifndef NV_THREAD_PARALLELFOR_H
 #define NV_THREAD_PARALLELFOR_H
 #include "nvthread.h"
 //#include "Atomic.h" // atomic<uint>
 namespace nv
 {
    class Thread;
    class ThreadPool;
    // Callback run once per index; 'id' is the zero-based element index.
    typedef void ForTask(void * context, int id);
    // Runs a task over an index range, using the shared ThreadPool.
    struct ParallelFor {
        ParallelFor(ForTask * task, void * context);
        ~ParallelFor();
        // Runs the task for ids 0 .. count-1.
        void run(uint count);
        // Invariant:
        ForTask * task;
        void * context;
        ThreadPool * pool;
        //uint workerCount;   // @@ Move to thread pool.
        //Thread * workers;
        // State:
        uint count;                 // Number of elements of the current run.
        /*atomic<uint>*/ uint idx;  // Shared counter incremented atomically by the workers.
    };
 } // nv namespace
 #endif // NV_THREAD_PARALLELFOR_H
--- a/src/nvthread/Thread.cpp
+++ b/src/nvthread/Thread.cpp
@ -0,0 +1,136 @@
 // This code is in the public domain -- castano@gmail.com
 #include "Thread.h"
 #if NV_OS_WIN32
 	#include "Win32.h"
 #elif NV_OS_UNIX
 	#include <pthread.h>
 	#include <unistd.h> // usleep
 #endif
 using namespace nv;
 // Platform-specific thread state.
 struct Thread::Private
 {
 #if NV_OS_WIN32
 	HANDLE thread;
 #elif NV_OS_UNIX
 	pthread_t thread;
 #endif
    // NOTE(review): the code shown in this file reads and writes Thread::func and
    // Thread::arg, not these two members — confirm whether they are still needed.
    ThreadFunc * func;
    void * arg;
 };
 #if NV_OS_WIN32
 // Win32 thread entry point: 'arg' is the Thread that was started. Runs its
 // function with its argument and reports 0 as the exit code.
 unsigned long __stdcall threadFunc(void * arg) {
    Thread * const self = (Thread *)arg;
    ThreadFunc * const entry = self->func;
    entry(self->arg);
    return 0;
 }
 #elif NV_OS_UNIX
 extern "C" void * threadFunc(void * arg) {
    Thread * thread = (Thread *)arg;
 	thread->func(thread->arg);
 	pthread_exit(0);
 }
 #endif
 // Creates a thread object that is not running. The public func/arg members are
 // cleared too, so they hold defined values before start() is called.
 Thread::Thread() : p(new Private), func(NULL), arg(NULL)
 {
    p->thread = 0;
 }
 // Does not stop or join the thread: it only asserts (debug builds) that the
 // handle has already been cleared, which wait() does.
 Thread::~Thread()
 {
 	nvDebugCheck(p->thread == 0);
 }
 // Records the function and argument on this object, then spawns an OS thread
 // whose entry point (threadFunc) reads them back through 'this'.
 void Thread::start(ThreadFunc * func, void * arg)
 {
    this->func = func;
    this->arg = arg;
 #if NV_OS_WIN32
    HANDLE handle = CreateThread(NULL, 0, threadFunc, this, 0, NULL);
    //p->thread = (HANDLE)_beginthreadex (0, 0, threadFunc, this, 0, NULL);     // @@ So that we can call CRT functions...
    p->thread = handle;
    nvDebugCheck(p->thread != NULL);
 #elif NV_OS_UNIX
    const int status = pthread_create(&p->thread, NULL, threadFunc, this);
    nvDebugCheck(status == 0);
 #endif
 }
 // Blocks until the thread has finished, then releases/clears the handle so
 // that isRunning() reports false.
 void Thread::wait()
 {
 #if NV_OS_WIN32
    const DWORD waitStatus = WaitForSingleObject (p->thread, INFINITE);
    nvCheck (waitStatus ==  WAIT_OBJECT_0);
    const BOOL closed = CloseHandle (p->thread);
    p->thread = NULL;
    nvCheck (closed);
 #elif NV_OS_UNIX
    const int status = pthread_join(p->thread, NULL);
    p->thread = 0;
    nvDebugCheck(status == 0);
 #endif
 }
 // Reports whether the thread handle is set. The handle is only cleared by the
 // constructor and by wait(), so this stays true after the thread function has
 // returned until wait() is called.
 bool Thread::isRunning () const
 {
 #if NV_OS_WIN32
 	return p->thread != NULL;
 #elif NV_OS_UNIX
 	return p->thread != 0;
 #endif
 }
 // Busy-waits for 'count' loop iterations.
 /*static*/ void Thread::spinWait(uint count)
 {
 	// The counter is volatile so each iteration is an observable access; a plain
 	// empty loop has no side effects and the optimizer is free to delete it.
 	for (volatile uint i = 0; i < count; i++) {}
 }
 // Gives up the rest of the calling thread's time slice to another ready thread.
 /*static*/ void Thread::yield()
 {
 #if NV_OS_WIN32
    SwitchToThread();
 #elif NV_OS_UNIX
    const int status = sched_yield();
    nvDebugCheck(status == 0);
 #endif
 }
 // Suspends the calling thread for roughly 'ms' milliseconds.
 /*static*/ void Thread::sleep(uint ms)
 {
 #if NV_OS_WIN32
 	Sleep(ms);
 #elif NV_OS_UNIX
 	// POSIX allows usleep() to fail with EINVAL for intervals of one second or
 	// more, and 1000 * ms overflows for large values. Sleep the whole seconds
 	// with ::sleep() and only the sub-second remainder with usleep().
 	const uint seconds = ms / 1000;
 	if (seconds > 0) {
 		::sleep(seconds);	// Global sleep(); the unqualified name would be this function.
 	}
 	usleep(1000 * (ms % 1000));
 #endif
 }
 // Waits for every thread in the array, one after another.
 /*static*/ void Thread::wait(Thread * threads, uint count)
 {
 /*#if NV_OS_WIN32
    // @@ Is there any advantage in doing this?
    nvDebugCheck(count < MAXIMUM_WAIT_OBJECTS);
    HANDLE * handles = new HANDLE[count];
    for (uint i = 0; i < count; i++) {
        handles[i] = threads->p->thread;
    }
    DWORD result = WaitForMultipleObjects(count, handles, TRUE, INFINITE);
    delete [] handles;
 #else*/
    Thread * const end = threads + count;
    for (Thread * current = threads; current != end; ++current) {
        current->wait();
    }
 //#endif
 }
--- a/src/nvthread/Thread.h
+++ b/src/nvthread/Thread.h
@ -0,0 +1,46 @@
 // This code is in the public domain -- castano@gmail.com
 #pragma once
 #ifndef NV_THREAD_THREAD_H
 #define NV_THREAD_THREAD_H
 #include "nvthread.h"
 #include "nvcore/Ptr.h"
 namespace nv
 {
    // Signature of the function executed by a thread.
    typedef void ThreadFunc(void * arg);
 	// OS thread wrapper. The platform handle lives in the opaque Private struct.
 	class NVTHREAD_CLASS Thread
 	{
 		NV_FORBID_COPY(Thread);
 	public:
 		Thread();
 		~Thread();
 		// Spawns the thread, running func(arg).
 		void start(ThreadFunc * func, void * arg);
 		// Blocks until the thread finishes and clears its handle.
 		void wait();
 		bool isRunning() const;
 		static void spinWait(uint count);
 		static void yield();
 		static void sleep(uint ms);
        // Waits for each of the 'count' threads in the array.
        static void wait(Thread * threads, uint count);
 	private:
 		struct Private;
 		AutoPtr<Private> p;
    public:
        // Set by start() and read back by the thread entry point.
        ThreadFunc * func;
        void * arg;
 	};
 } // nv namespace
 #endif // NV_THREAD_THREAD_H
--- a/src/nvthread/ThreadPool.cpp
+++ b/src/nvthread/ThreadPool.cpp
@ -0,0 +1,121 @@
 // This code is in the public domain -- castano@gmail.com
 #include "ThreadPool.h"
 #include "Mutex.h"
 #include "Thread.h"
 // Most of the time it's not necessary to protect the thread pool, but if it doesn't add a significant overhead, then it'd be safer to do it.
 #define PROTECT_THREAD_POOL 1
 using namespace nv;
 #if PROTECT_THREAD_POOL 
 Mutex s_pool_mutex;
 #endif
 AutoPtr<ThreadPool> s_pool;
 // Returns the shared pool, creating it on first use. With PROTECT_THREAD_POOL
 // the pool mutex is taken here and stays held until release() is called.
 /*static*/ ThreadPool * ThreadPool::acquire()
 {
 #if PROTECT_THREAD_POOL 
    s_pool_mutex.lock();    // @@ If same thread tries to lock twice, this should assert.
 #endif
    if (s_pool == NULL) {
        // The constructor stores 'this' in s_pool, so no assignment is needed here.
        ThreadPool * created = new ThreadPool;
        nvDebugCheck(s_pool == created);
    }
    ThreadPool * const shared = s_pool.ptr();
    return shared;
 }
 // Counterpart of acquire(): waits for the pool's workers to go idle and, with
 // PROTECT_THREAD_POOL, drops the pool mutex. The pool itself is kept alive.
 /*static*/ void ThreadPool::release(ThreadPool * pool)
 {
    nvDebugCheck(pool == s_pool);
    // Make sure the threads of the pool are idle.
    ThreadPool * const shared = s_pool.ptr();
    shared->wait();
 #if PROTECT_THREAD_POOL 
    s_pool_mutex.unlock();
 #endif
 }
 // Body of every pool thread. 'arg' carries the worker's index (not a real
 // pointer). Each round waits for the worker's start event, runs the pool's
 // current function and posts the worker's finish event; a NULL function is the
 // signal to exit.
 /*static*/ void ThreadPool::workerFunc(void * arg) {
    // Go through uintptr_t: casting a pointer straight to a 32-bit uint loses
    // precision and is rejected by 64-bit compilers.
    uint i = (uint)(uintptr_t)arg;
    while(true) 
    {
        s_pool->startEvents[i].wait();
        if (s_pool->func == NULL) {
            return; // @@ should we post finish event anyway?
        }
        s_pool->func(s_pool->arg);
        s_pool->finishEvents[i].post();
    }
 }
 // Registers itself as the shared pool, then starts one worker thread per
 // hardware thread. Each worker is handed its index through the void* argument.
 ThreadPool::ThreadPool() 
 {
    s_pool = this;  // Worker threads need this to be initialized before they start.
    // No function is set yet; give the members defined values before any worker runs.
    func = NULL;
    arg = NULL;
    workerCount = nv::hardwareThreadCount();
    workers = new Thread[workerCount];
    startEvents = new Event[workerCount];
    finishEvents = new Event[workerCount];
    for (uint i = 0; i < workerCount; i++) {
        // Widen through uintptr_t so the integer-to-pointer cast has matching size.
        workers[i].start(workerFunc, (void *)(uintptr_t)i);
    }
    allIdle = true;
 }
 // Shuts the pool down: a NULL function makes every worker return from
 // workerFunc, after which the threads are joined and the arrays freed.
 ThreadPool::~ThreadPool()
 {
    // Set threads to terminate.
    ThreadFunc * const noFunction = NULL;
    start(noFunction, NULL);
    // Wait until threads actually exit.
    Thread::wait(workers, workerCount);
    delete [] workers;
    delete [] startEvents;
    delete [] finishEvents;
 }
 // Makes every worker run func(arg) once. Any previous batch is waited for
 // first, so the function is never swapped while workers are still busy.
 void ThreadPool::start(ThreadFunc * func, void * arg)
 {
    // Wait until threads are idle.
    this->wait();
    // Set our desired function.
    this->arg = arg;
    this->func = func;
    allIdle = false;
    // Resume threads.
    Event::post(startEvents, workerCount);
 }
 // Blocks until every worker has posted its finish event for the current batch.
 // Returns immediately when no batch is outstanding.
 void ThreadPool::wait()
 {
    if (allIdle) {
        return;
    }
    // Wait for threads to complete.
    Event::wait(finishEvents, workerCount);
    allIdle = true;
 }
--- a/src/nvthread/ThreadPool.h
+++ b/src/nvthread/ThreadPool.h
@ -0,0 +1,49 @@
 // This code is in the public domain -- castano@gmail.com
 #pragma once
 #ifndef NV_THREAD_THREADPOOL_H
 #define NV_THREAD_THREADPOOL_H
 #include "nvthread.h"
 #include "Event.h"
 #include "Thread.h"
 namespace nv {
    class Thread;
    class Event;
    // Pool of worker threads that all run the same function on demand. Meant to be
    // obtained through acquire()/release(), which share a single instance.
    class ThreadPool {
        NV_FORBID_COPY(ThreadPool);
    public:
        static ThreadPool * acquire();
        static void release(ThreadPool *);
        ThreadPool();
        ~ThreadPool();
        // Runs func(arg) once on every worker.
        void start(ThreadFunc * func, void * arg);
        // Blocks until the workers have finished the current function.
        void wait();
    private:
        static void workerFunc(void * arg);
        uint workerCount;
        Thread * workers;
        Event * startEvents;    // One per worker; posted to wake it up.
        Event * finishEvents;   // One per worker; posted by it when done.
        uint allIdle;           // Used as a boolean: no batch is outstanding.
        // Current function:
        ThreadFunc * func;
        void * arg;
    };
 } // namespace nv
 #endif // NV_THREAD_THREADPOOL_H
--- a/src/nvthread/Win32.h
+++ b/src/nvthread/Win32.h
@ -0,0 +1,9 @@
 // This code is in the public domain -- castano@gmail.com
 // Never include this from a header file.
 #define WIN32_LEAN_AND_MEAN
 #define VC_EXTRALEAN
 #define _WIN32_WINNT 0x0400 // for SwitchToThread, TryEnterCriticalSection
 #include <windows.h>
 //#include <process.h> // for _beginthreadex
--- a/src/nvthread/nvthread.cpp
+++ b/src/nvthread/nvthread.cpp
@ -0,0 +1,51 @@
 #include "nvthread.h"
 #include "Thread.h"
 #if NV_OS_WIN32
 #define WIN32_LEAN_AND_MEAN
 #define VC_EXTRALEAN
 #include <windows.h>
 #elif NV_OS_LINUX
 #include <unistd.h> // sysconf
 #elif NV_OS_DARWIN || NV_OS_FREEBSD
 #include <sys/types.h>
 #include <sys/sysctl.h> // sysctl
 #endif
 using namespace nv;
 // Find the number of cores in the system.
 // Based on: http://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
 // @@ Distinguish between logical and physical cores?
 // Always returns at least 1 on the paths that query the OS at run time.
 uint nv::hardwareThreadCount() {
 #if NV_OS_WIN32
    SYSTEM_INFO sysinfo;
    GetSystemInfo( &sysinfo );
    return sysinfo.dwNumberOfProcessors;
 #elif NV_OS_XBOX
    return 3; // or 6?
 #elif NV_OS_LINUX // Linux, Solaris, & AIX
    // sysconf reports errors as -1; do not let that wrap around to a huge unsigned count.
    const long count = sysconf(_SC_NPROCESSORS_ONLN);
    return (count < 1) ? 1 : (uint)count;
 #elif NV_OS_DARWIN || NV_OS_FREEBSD
    int numCPU = 0;     // Stays 0 if a query fails, so the checks below never read garbage.
    int mib[4];
    size_t len = sizeof(numCPU); 
    // set the mib for hw.ncpu
    mib[0] = CTL_HW;
 #ifdef HW_AVAILCPU      // Not provided by every BSD.
    mib[1] = HW_AVAILCPU;  // alternatively, try HW_NCPU;
    // get the number of CPUs from the system
    if (sysctl(mib, 2, &numCPU, &len, NULL, 0) != 0) {
        numCPU = 0;
    }
 #endif
    if (numCPU < 1) {
         mib[1] = HW_NCPU;
         len = sizeof(numCPU);
         if (sysctl( mib, 2, &numCPU, &len, NULL, 0 ) != 0 || numCPU < 1) {
              return 1; // Assume single core.
         }
    }
    return numCPU;
 #else
    return 1; // Assume single core.
 #endif
 }
--- a/src/nvthread/nvthread.h
+++ b/src/nvthread/nvthread.h
@ -0,0 +1,83 @@
 // This code is in the public domain -- castanyo@yahoo.es
 #pragma once
 #ifndef NV_THREAD_H
 #define NV_THREAD_H
 #include "nvcore/nvcore.h"
 // Function linkage
 // NVTHREAD_API / NVTHREAD_CLASS expand to export decoration while building the
 // shared library (NVTHREAD_EXPORTS defined), to import decoration for its
 // clients, and to nothing for static builds.
 #if NVTHREAD_SHARED
 #ifdef NVTHREAD_EXPORTS
 #define NVTHREAD_API DLL_EXPORT
 #define NVTHREAD_CLASS DLL_EXPORT_CLASS
 #else
 #define NVTHREAD_API DLL_IMPORT
 #define NVTHREAD_CLASS DLL_IMPORT
 #endif
 #else // NVTHREAD_SHARED
 #define NVTHREAD_API
 #define NVTHREAD_CLASS
 #endif // NVTHREAD_SHARED
 // Compiler barriers.
 // See: http://en.wikipedia.org/wiki/Memory_ordering
 // Usage: nvCompilerReadWriteBarrier(); (likewise for the Read/Write variants).
 #if NV_CC_MSVC
 #include <intrin.h>
 #pragma intrinsic(_WriteBarrier)
 #define nvCompilerWriteBarrier      _WriteBarrier
 #pragma intrinsic(_ReadWriteBarrier)
 #define nvCompilerReadWriteBarrier  _ReadWriteBarrier
 #if _MSC_VER >= 1400            // ReadBarrier is VC2005
 #pragma intrinsic(_ReadBarrier)
 #define nvCompilerReadBarrier       _ReadBarrier	
 #else
 #define nvCompilerReadBarrier       _ReadWriteBarrier
 #endif
 #elif NV_CC_GNUC
 // NOTE(review): this expansion already ends in ';', so a call written with its own
 // ';' yields an extra empty statement (breaks an unbraced if/else) — consider dropping it.
 #define nvCompilerReadWriteBarrier()    asm volatile("" ::: "memory");
 // GCC has a single barrier form; the read and write variants alias the full one.
 #define nvCompilerWriteBarrier          nvCompilerReadWriteBarrier
 #define nvCompilerReadBarrier           nvCompilerReadWriteBarrier
 #endif // NV_CC_MSVC
 // @@ Memory barriers / fences.
 // @@ Atomics.
 /* Wrap this up:
 #define YieldProcessor() __asm { rep nop }
 #define YieldProcessor _mm_pause
 #define YieldProcessor __yield
 BOOL WINAPI SwitchToThread(void);
 */
 namespace nv
 {
    // Reentrant.
    // Returns the number of hardware threads (processors) of the system.
    uint hardwareThreadCount();
    // Not thread-safe. Use from main thread only.
    // NOTE(review): no definitions of these three functions are visible alongside
    // this header — confirm that they are implemented somewhere.
    void initWorkers();
    void shutWorkers();
    void setWorkerFunction(void * func);
 } // nv namespace
 #endif // NV_THREAD_H
--- a/src/nvtt/CompressorDX11.cpp
+++ b/src/nvtt/CompressorDX11.cpp
@ -37,7 +37,7 @@ using namespace nv;
 using namespace nvtt;
-void CompressorBC6::compressBlock(Tile & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
+void CompressorBC6::compressBlock(ColorSet & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
 {
    NV_UNUSED(alphaMode); // ZOH does not support alpha.
@ -56,7 +56,7 @@ void CompressorBC6::compressBlock(Tile & tile, AlphaMode alphaMode, const Compre
 }
-void CompressorBC7::compressBlock(Tile & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
+void CompressorBC7::compressBlock(ColorSet & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
 {
    // @@ TODO
 }
--- a/src/nvtt/CompressorDX9.cpp
+++ b/src/nvtt/CompressorDX9.cpp
@ -481,10 +481,10 @@ void D3DXCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode
        err = surface->LockRect(&rect, NULL, D3DLOCK_READONLY);
-	if (outputOptions.outputHandler != NULL) {
+	    if (outputOptions.outputHandler != NULL) {
-	    int size = rect.Pitch * ((h + 3) / 4);
+	        int size = rect.Pitch * ((h + 3) / 4);
-	    outputOptions.outputHandler->writeData(rect.pBits, size);
+	        outputOptions.outputHandler->writeData(rect.pBits, size);
-	}
+	    }
        err = surface->UnlockRect();
    }
--- a/src/nvtt/CompressorRGB.cpp
+++ b/src/nvtt/CompressorRGB.cpp
@ -110,7 +110,7 @@ namespace
        {
            nvDebugCheck(alignment >= 1);
            flush();
-            int remainder = (size_t)ptr % alignment;
+            int remainder = (int)((uintptr_t)ptr % alignment);
            if (remainder != 0) {
                putBits(0, (alignment - remainder) * 8);
            }
--- a/src/nvtt/Context.cpp
+++ b/src/nvtt/Context.cpp
@ -349,6 +349,8 @@ bool Compressor::Private::compress(AlphaMode alphaMode, int w, int h, int d, int
        compressor->compress(alphaMode, w, h, d, rgba, dispatcher, compressionOptions, outputOptions);
    }
    outputOptions.endImage();
    return true;
 }
--- a/src/nvtt/OutputOptions.cpp
+++ b/src/nvtt/OutputOptions.cpp
@ -135,6 +135,11 @@ bool OutputOptions::Private::writeData(const void * data, int size) const
    return outputHandler == NULL || outputHandler->writeData(data, size);
 }
 // Forwards the end-of-image notification to the output handler, when one is set.
 void OutputOptions::Private::endImage() const
 {
    // No handler registered: nothing to forward.
    if (outputHandler == NULL) {
        return;
    }
    outputHandler->endImage();
 }
 void OutputOptions::Private::error(Error e) const
 {
    if (errorHandler != NULL) errorHandler->error(e);
--- a/src/nvtt/OutputOptions.h
+++ b/src/nvtt/OutputOptions.h
@ -52,6 +52,11 @@ namespace nvtt
 			return true;
 		}
 		// End-of-image callback. This handler takes no action here; the body is
 		// intentionally empty.
 		virtual void endImage()
 		{
 			// Nothing to do.
 		}
 		nv::StdOutputStream stream;
 	};
@ -72,6 +77,7 @@ namespace nvtt
 		void beginImage(int size, int width, int height, int depth, int face, int miplevel) const;
 		bool writeData(const void * data, int size) const;
        void endImage() const;
 		void error(Error e) const;
 	};
--- a/src/nvtt/TaskDispatcher.h
+++ b/src/nvtt/TaskDispatcher.h
@ -18,8 +18,8 @@
 // http://msdn.microsoft.com/en-us/library/dd504870.aspx
 #if NV_OS_WIN32 && _MSC_VER >= 1600
 #define HAVE_PPL 1
-//#include <array>
+#include <array>
-#include <ppl.h>
+//#include <ppl.h>
 #endif
 // Intel Thread Building Blocks (TBB).
@ -28,6 +28,8 @@
 #include <tbb/parallel_for.h>
 #endif
 #include "nvthread/ParallelFor.h"
 namespace nvtt {
@ -40,6 +42,15 @@ namespace nvtt {
        }
    };
    struct ParallelTaskDispatcher : public TaskDispatcher
    {
        virtual void dispatch(Task * task, void * context, int count) {
            nv::ParallelFor parallelFor(task, context);
            parallelFor.run(count); // @@ Add support for custom grain.
        }
    };
 #if defined(HAVE_OPENMP)
    struct OpenMPTaskDispatcher : public TaskDispatcher
@ -81,9 +92,24 @@ namespace nvtt {
 #if defined(HAVE_PPL)
    // Minimal iterator-like wrapper around an int counter: dereferencing yields the
    // current value, ++/-- step it by one.
    class CountingIterator
    {
    public:
        // Starts counting at 0.
        CountingIterator() : i(0) {}
        // Copies the current position. (Previously the copy was reset to 0, so any
        // iterator passed by value lost its position.)
        CountingIterator(const CountingIterator & rhs) : i(rhs.i) {}
        // Starts counting at x.
        explicit CountingIterator(int x) : i(x) {}
        // Current counter value.
        const int & operator*() const { return i; }
        CountingIterator & operator++() { i++; return *this; }
        CountingIterator & operator--() { i--; return *this; }
        // Position comparison, so a (begin, end) pair can delimit a range.
        bool operator==(const CountingIterator & rhs) const { return i == rhs.i; }
        bool operator!=(const CountingIterator & rhs) const { return i != rhs.i; }
    private:
        int i;
    };
    struct TaskFunctor {
        TaskFunctor(Task * task, void * context) : task(task), context(context) {}
-        void operator()(int n) const {
+        void operator()(int & n) const {
            task(context, n);
        }
        Task * task;
@ -95,12 +121,16 @@ namespace nvtt {
    {
        virtual void dispatch(Task * task, void * context, int count)
        {
            CountingIterator begin(0);
            CountingIterator end((int)count);
            TaskFunctor func(task, context);
-            Concurrency::parallel_for(0, count, func);
+
            std::for_each(begin, end, func);
            //parallel_for_each(begin, end, func);
        }
    };
-#endif // HAVE_PPL
+#endif
 #if defined(HAVE_TBB)
@ -132,7 +162,8 @@ namespace nvtt {
 #elif defined(HAVE_GCD)
    typedef AppleTaskDispatcher         ConcurrentTaskDispatcher;
 #else
-    typedef SequentialTaskDispatcher    ConcurrentTaskDispatcher;
+    //typedef SequentialTaskDispatcher    ConcurrentTaskDispatcher;
    typedef ParallelTaskDispatcher        ConcurrentTaskDispatcher;
 #endif
 } // namespace nvtt
--- a/src/nvtt/TexImage.cpp
+++ b/src/nvtt/TexImage.cpp
@ -615,7 +615,7 @@ bool TexImage::setImage2D(Format format, Decoder decoder, int w, int h, const vo
 			    block->decodeBlock(&colors, false);
 		    }
 		    else if (decoder == Decoder_NV5x) {
-			block->decodeBlockNV5x(&colors);
+			    block->decodeBlockNV5x(&colors);
 		    }
 		}
 		else if (format == nvtt::Format_BC3)
@ -629,19 +629,19 @@ bool TexImage::setImage2D(Format format, Decoder decoder, int w, int h, const vo
 			    block->decodeBlock(&colors, false);
 		    }
 		    else if (decoder == Decoder_NV5x) {
-			block->decodeBlockNV5x(&colors);
+			    block->decodeBlockNV5x(&colors);
 		    }
 		}
 		else if (format == nvtt::Format_BC4)
 		{
-                    const BlockATI1 * block = (const BlockATI1 *)ptr;
+            const BlockATI1 * block = (const BlockATI1 *)ptr;
-                    block->decodeBlock(&colors, decoder == Decoder_D3D9);
+            block->decodeBlock(&colors, decoder == Decoder_D3D9);
-                }
+        }
-                else if (format == nvtt::Format_BC5)
+        else if (format == nvtt::Format_BC5)
-                {
+        {
-                    const BlockATI2 * block = (const BlockATI2 *)ptr;
+            const BlockATI2 * block = (const BlockATI2 *)ptr;
-                    block->decodeBlock(&colors, decoder == Decoder_D3D9);
+            block->decodeBlock(&colors, decoder == Decoder_D3D9);
-                }
+        }
 		for (int yy = 0; yy < 4; yy++)
 		{
@ -864,6 +864,42 @@ bool TexImage::buildNextMipmap(MipmapFilter filter, float filterWidth, const flo
    return true;
 }
 // Changes the canvas to w x h x d without rescaling: a new cleared 4-channel image of
 // the requested size is allocated and the region that overlaps the old image is copied
 // into it. Does nothing when there is no image or the size is unchanged.
 void TexImage::canvasSize(int w, int h, int d)
 {
    nvDebugCheck(w > 0 && h > 0 && d > 0);
    if (m->image == NULL || (w == m->image->width() && h == m->image->height() && d == m->image->depth())) {
        return;
    }
    detach();
    // Read the image only after detach(). NOTE(review): detach() is defined elsewhere;
    // presumably it may replace m->image with a private copy - confirm. Re-reading the
    // pointer here is correct either way.
    FloatImage * img = m->image;
    FloatImage * new_img = new FloatImage;
    new_img->allocate(4, w, h, d);
    new_img->clear();
    // Extent of the region present in both the old and the new image. Kept separate
    // from w/h/d so the requested size is still available below.
    const int copyW = int(min(uint(w), img->width()));
    const int copyH = int(min(uint(h), img->height()));
    const int copyD = int(min(uint(d), img->depth()));
    for (int z = 0; z < copyD; z++) {
        for (int y = 0; y < copyH; y++) {
            for (int x = 0; x < copyW; x++) {
                for (int c = 0; c < 4; c++) {
                    new_img->pixel(c, x, y, z) = img->pixel(c, x, y, z);
                }
            }
        }
    }
    delete m->image;
    m->image = new_img;
    // The texture type follows the depth of the new canvas, not the clamped copy depth
    // (which previously made a depth-1 image grown to depth > 1 report as 2D).
    m->type = (d == 1) ? TextureType_2D : TextureType_3D;
 }
 // Color transforms.
 void TexImage::toLinear(float gamma)
 {
@ -885,6 +921,66 @@ void TexImage::toGamma(float gamma)
    m->image->toGamma(0, 3, gamma);
 }
 // Maps a linear value to its sRGB-encoded value, clamping the result to [0, 1].
 // Values up to 0.0031308 use the linear segment (12.92 * f); larger values use the
 // power segment with exponent 0.41666 (approximately 1 / 2.4).
 static float toSrgb(float f) {
    if (f <= 0.0f) {
        return 0.0f;
    }
    if (f <= 0.0031308f) {
        return 12.92f * f;
    }
    if (f <= 1.0f) {
        return (powf(f, 0.41666f) * 1.055f) - 0.055f;
    }
    // Above 1, and any value that fails every comparison above, saturates to 1.
    return 1.0f;
 }
 // Applies ::toSrgb to channels 0-2 (r, g, b) of every pixel, in place. Channel 3 is
 // not touched. Does nothing when there is no image.
 void TexImage::toSrgb()
 {
    if (m->image == NULL) return;
    detach();
    // Fetch the image only after detach(). NOTE(review): detach() is defined elsewhere;
    // presumably it may swap in a private copy of shared data, in which case a pointer
    // cached beforehand would make the writes below land on the shared image - confirm.
    FloatImage * img = m->image;
    const uint count = img->pixelCount();
    for (uint j = 0; j < count; j++)
    {
        float & r = img->pixel(0, j);
        float & g = img->pixel(1, j);
        float & b = img->pixel(2, j);
        r = ::toSrgb(r);
        g = ::toSrgb(g);
        b = ::toSrgb(b);
    }
 }
 // Piecewise-linear encoding curve with four segments (breakpoints at 1/16, 1/8 and
 // 1/2); the result is clamped to [0, 1].
 static float toXenonSrgb(float f) {
    if (f < 0) {
        return 0;
    }
    if (f < (1.0f/16.0f)) {
        return 4.0f * f;                            // slope 4
    }
    if (f < (1.0f/8.0f)) {
        return 0.25f  + 2.0f * (f - 0.0625f);       // slope 2
    }
    if (f < 0.5f) {
        return 0.375f + 1.0f * (f - 0.125f);        // slope 1
    }
    if (f < 1.0f) {
        return 0.75f  + 0.5f * (f - 0.50f);         // slope 1/2
    }
    // 1 and above, and any value that fails every comparison above, saturates to 1.
    return 1.0f;
 }
 // Applies ::toXenonSrgb to channels 0-2 (r, g, b) of every pixel, in place. Channel 3
 // is not touched. Does nothing when there is no image.
 void TexImage::toXenonSrgb()
 {
    if (m->image == NULL) return;
    detach();
    // Fetch the image only after detach(). NOTE(review): detach() is defined elsewhere;
    // presumably it may swap in a private copy of shared data, in which case a pointer
    // cached beforehand would make the writes below land on the shared image - confirm.
    FloatImage * img = m->image;
    const uint count = img->pixelCount();
    for (uint j = 0; j < count; j++)
    {
        float & r = img->pixel(0, j);
        float & g = img->pixel(1, j);
        float & b = img->pixel(2, j);
        r = ::toXenonSrgb(r);
        g = ::toXenonSrgb(g);
        b = ::toXenonSrgb(b);
    }
 }
 void TexImage::transform(const float w0[4], const float w1[4], const float w2[4], const float w3[4], const float offset[4])
 {
    if (m->image == NULL) return;
@ -1140,9 +1236,9 @@ void TexImage::toRGBM(float range/*= 1*/, float threshold/*= 0.25*/)
    const uint count = img->pixelCount();
    for (uint i = 0; i < count; i++) {
-        float R = nv::clamp(r[i] * irange, 0.0f, 1.0f);
+        float R = nv::clamp(r[i], 0.0f, 1.0f);
-        float G = nv::clamp(g[i] * irange, 0.0f, 1.0f);
+        float G = nv::clamp(g[i], 0.0f, 1.0f);
-        float B = nv::clamp(b[i] * irange, 0.0f, 1.0f);
+        float B = nv::clamp(b[i], 0.0f, 1.0f);
 #if 1
        float M = max(max(R, G), max(B, threshold));
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@ -294,6 +294,9 @@ namespace nvtt
        /// Output data. Compressed data is output as soon as it's generated to minimize memory allocations.
        virtual bool writeData(const void * data, int size) = 0;
        /// Indicate the end of the compressed image.
        virtual void endImage() = 0;
    };
    /// Error codes.
@ -440,10 +443,13 @@ namespace nvtt
        NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter, float filterWidth, const float * params = 0);
        NVTT_API bool buildNextMipmap(MipmapFilter filter);
        NVTT_API bool buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params = 0);
        NVTT_API void canvasSize(int w, int h, int d);
        // Color transforms.
        NVTT_API void toLinear(float gamma);
        NVTT_API void toGamma(float gamma);
        NVTT_API void toSrgb();
        NVTT_API void toXenonSrgb();
        NVTT_API void transform(const float w0[4], const float w1[4], const float w2[4], const float w3[4], const float offset[4]);
        NVTT_API void swizzle(int r, int g, int b, int a);
        NVTT_API void scaleBias(int channel, float scale, float bias);
--- a/src/nvtt/tools/compress.cpp
+++ b/src/nvtt/tools/compress.cpp
@ -56,6 +56,11 @@ struct MyOutputHandler : public nvtt::OutputHandler
        // ignore.
    }
    // End-of-image callback. This handler has nothing to do when an image ends, so
    // the body is intentionally empty.
    virtual void endImage()
    {
        // Ignore.
    }
    // Output data.
    virtual bool writeData(const void * data, int size)
    {