Merge changes from the witness.

2011-09-27 17:48:46 +00:00 · 2011-09-27 17:48:46 +00:00 · 3c0ab2d3f3
commit 3c0ab2d3f3
parent 9c0658edca
47 changed files with 1811 additions and 186 deletions
--- a/project/vc9/nvthread/nvthread.vcproj
+++ b/project/vc9/nvthread/nvthread.vcproj
@ -0,0 +1,346 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="nvthread"
+	ProjectGUID="{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}"
+	RootNamespace="nvthread"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="131072"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+		<Platform
+			Name="x64"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
+			ConfigurationType="4"
+			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories=""
+				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;__SSE2__;__SSE__;__MMX__"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Debug|x64"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
+			ConfigurationType="4"
+			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
+			CharacterSet="0"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories=""
+				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;__SSE2__;__SSE__;__MMX__"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
+			ConfigurationType="4"
+			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
+			CharacterSet="2"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="3"
+				InlineFunctionExpansion="0"
+				EnableIntrinsicFunctions="true"
+				FavorSizeOrSpeed="0"
+				OmitFramePointers="true"
+				EnableFiberSafeOptimizations="true"
+				AdditionalIncludeDirectories=""
+				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;__SSE2__;__SSE__;__MMX__"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="false"
+				EnableEnhancedInstructionSet="2"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|x64"
+			OutputDirectory="$(ConfigurationName)\$(PlatformName)"
+			IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
+			ConfigurationType="4"
+			InheritedPropertySheets="$(SolutionDir)\nvtt.vsprops"
+			CharacterSet="0"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="3"
+				EnableIntrinsicFunctions="true"
+				OmitFramePointers="true"
+				WholeProgramOptimization="true"
+				AdditionalIncludeDirectories=""
+				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;__SSE2__;__SSE__;__MMX__"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="false"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<File
+			RelativePath="..\..\..\src\nvthread\Atomic.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvthread\Event.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvthread\Event.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvthread\Mutex.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvthread\Mutex.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvthread\nvthread.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvthread\nvthread.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvthread\ParallelFor.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvthread\ParallelFor.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvthread\Thread.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvthread\Thread.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvthread\ThreadPool.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvthread\ThreadPool.h"
+			>
+		</File>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- a/project/vc9/nvtt.sln
+++ b/project/vc9/nvtt.sln
@ -4,6 +4,7 @@ Microsoft Visual Studio Solution File, Format Version 10.00
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvtt", "nvtt\nvtt.vcproj", "{1AEB7681-57D8-48EE-813D-5C41CC38B647}"
 	ProjectSection(ProjectDependencies) = postProject
 		{CE017322-01FC-4851-9C8B-64E9A8E26C38} = {CE017322-01FC-4851-9C8B-64E9A8E26C38}
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB} = {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}
 		{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
 		{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
 		{C33787E3-5564-4834-9FE3-A9020455A669} = {C33787E3-5564-4834-9FE3-A9020455A669}
@ -88,6 +89,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "imperativeapi", "imperative
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bc6h", "bc6h\bc6h.vcproj", "{C33787E3-5564-4834-9FE3-A9020455A669}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvthread", "nvthread\nvthread.vcproj", "{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug (no cuda)|Mixed Platforms = Debug (no cuda)|Mixed Platforms
@ -457,6 +460,28 @@ Global
 		{C33787E3-5564-4834-9FE3-A9020455A669}.Release|Win32.Build.0 = Release|Win32
 		{C33787E3-5564-4834-9FE3-A9020455A669}.Release|x64.ActiveCfg = Release|x64
 		{C33787E3-5564-4834-9FE3-A9020455A669}.Release|x64.Build.0 = Release|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|Mixed Platforms.ActiveCfg = Debug|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|Mixed Platforms.Build.0 = Debug|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|Win32.ActiveCfg = Debug|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|x64.ActiveCfg = Debug|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|x64.Build.0 = Debug|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|Mixed Platforms.Build.0 = Debug|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|Win32.ActiveCfg = Debug|Win32
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|Win32.Build.0 = Debug|Win32
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|x64.ActiveCfg = Debug|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|x64.Build.0 = Debug|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|Mixed Platforms.ActiveCfg = Release|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|Mixed Platforms.Build.0 = Release|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|Win32.ActiveCfg = Release|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|x64.ActiveCfg = Release|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|x64.Build.0 = Release|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|Mixed Platforms.ActiveCfg = Release|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|Mixed Platforms.Build.0 = Release|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|Win32.ActiveCfg = Release|Win32
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|Win32.Build.0 = Release|Win32
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|x64.ActiveCfg = Release|x64
+		{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -6,6 +6,7 @@ INCLUDE_DIRECTORIES(${NV_SOURCE_DIR}/extern/stb)
 SUBDIRS(nvcore)
 SUBDIRS(nvmath)
 SUBDIRS(nvimage)
+SUBDIRS(nvthread)
 SUBDIRS(nvtt)

 # OpenGL
--- a/src/nvcore/Array.h
+++ b/src/nvcore/Array.h
@ -78,8 +78,8 @@ namespace nv
    }

    template <typename T>
-    bool find(const T & element, const T * restrict ptr, uint count, uint * index) {
-        for (uint i = 0; i < count; i++) {
+    bool find(const T & element, const T * restrict ptr, uint begin, uint end, uint * index) {
+        for (uint i = begin; i < end; i++) {
            if (ptr[i] == element) {
                if (index != NULL) *index = i;
                return true;
@ -257,15 +257,15 @@ namespace nv
        }

        /// Return true if element found.
-        NV_FORCEINLINE bool find(const T & element, uint * index) const
+        NV_FORCEINLINE bool find(const T & element, uint * indexPtr) const
        {
-            return find(element, 0, m_size, index);
+            return find(element, 0, m_size, indexPtr);
        }

        /// Return true if element found within the given range.
-        NV_FORCEINLINE bool find(const T & element, uint first, uint count, uint * index) const
+        NV_FORCEINLINE bool find(const T & element, uint begin, uint end, uint * indexPtr) const
        {
-            return ::nv::find(element, m_buffer + first, count, index);
+            return ::nv::find(element, m_buffer, begin, end, indexPtr);
        }

        /// Remove the element at the given index. This is an expensive operation!
--- a/src/nvcore/Debug.cpp
+++ b/src/nvcore/Debug.cpp
@ -448,19 +448,6 @@ namespace
    /** Win32 assert handler. */
    struct Win32AssertHandler : public AssertHandler 
    {
-        // Code from Daniel Vogel.
-        static bool isDebuggerPresent()
-        {
-            HINSTANCE kernel32 = GetModuleHandle("kernel32.dll");
-            if (kernel32) {
-                FARPROC IsDebuggerPresent = GetProcAddress(kernel32, "IsDebuggerPresent");
-                if (IsDebuggerPresent != NULL && IsDebuggerPresent()) {
-                    return true;
-                }
-            }
-            return false;
-        }
-
        // Flush the message queue. This is necessary for the message box to show up.
        static void flushMessageQueue()
        {
@ -487,7 +474,7 @@ namespace
                nvDebug( error_string.str() );
            }

-            if (isDebuggerPresent()) {
+            if (debug::isDebuggerPresent()) {
                return NV_ABORT_DEBUG;
            }

@ -522,15 +509,6 @@ namespace
    /** Xbox360 assert handler. */
    struct Xbox360AssertHandler : public AssertHandler 
    {
-        static bool isDebuggerPresent()
-        {
-#ifdef _DEBUG
-            return DmIsDebuggerPresent() == TRUE;
-#else
-            return false;
-#endif
-        }
-
        // Assert handler method.
        virtual int assertion( const char * exp, const char * file, int line, const char * func/*=NULL*/ )
        {
@ -546,7 +524,7 @@ namespace
                nvDebug( error_string.str() );
            }

-            if (isDebuggerPresent()) {
+            if (debug::isDebuggerPresent()) {
                return NV_ABORT_DEBUG;
            }

@ -563,26 +541,6 @@ namespace
    /** Unix assert handler. */
    struct UnixAssertHandler : public AssertHandler
    {
-        bool isDebuggerPresent()
-        {
-#if NV_OS_DARWIN
-            int mib[4];
-            struct kinfo_proc info;
-            size_t size;
-            mib[0] = CTL_KERN;
-            mib[1] = KERN_PROC;
-            mib[2] = KERN_PROC_PID;
-            mib[3] = getpid();
-            size = sizeof(info);
-            info.kp_proc.p_flag = 0;
-            sysctl(mib,4,&info,&size,NULL,0);
-            return ((info.kp_proc.p_flag & P_TRACED) == P_TRACED);
-#else
-            // if ppid != sid, some process spawned our app, probably a debugger. 
-            return getsid(getpid()) != getppid();
-#endif
-        }
-
        // Assert handler method.
        virtual int assertion(const char * exp, const char * file, int line, const char * func)
        {
@ -594,7 +552,7 @@ namespace
            }

 #if _DEBUG
-            if (isDebuggerPresent()) {
+            if (debug::isDebuggerPresent()) {
                return NV_ABORT_DEBUG;
            }
 #endif
@ -702,7 +660,10 @@ void debug::enableSigHandler()
    // SYMOPT_DEFERRED_LOADS make us not take a ton of time unless we actual log traces
    SymSetOptions(SYMOPT_DEFERRED_LOADS|SYMOPT_FAIL_CRITICAL_ERRORS|SYMOPT_LOAD_LINES|SYMOPT_UNDNAME);

-    SymInitialize(GetCurrentProcess(), NULL, TRUE);
+    // A symbol handler initialization failure is only logged; nothing is returned or aborted here.
+    if (!SymInitialize(GetCurrentProcess(), NULL, TRUE)) {
+        const DWORD error = GetLastError();
+        // DWORD is an unsigned long, so it has to be printed with %lu rather than %d.
+        nvDebug("SymInitialize returned error : %lu\n", (unsigned long)error);
+    }

 #elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H)

@ -743,3 +704,38 @@ void debug::disableSigHandler()
 #endif
 }

+
+/// Report whether a debugger appears to be attached to the current process.
+/// One implementation is selected per platform at compile time.
+bool debug::isDebuggerPresent()
+{
+#if NV_OS_WIN32
+    // Look the entry point up at run time instead of calling it directly.
+    HINSTANCE kernelModule = GetModuleHandle("kernel32.dll");
+    if (kernelModule == NULL) {
+        return false;
+    }
+    FARPROC queryProc = GetProcAddress(kernelModule, "IsDebuggerPresent");
+    if (queryProc == NULL) {
+        return false;
+    }
+    return queryProc() != 0;
+#elif NV_OS_XBOX && defined(_DEBUG)
+    return DmIsDebuggerPresent() == TRUE;
+#elif NV_OS_XBOX
+    // The query is only made in _DEBUG builds.
+    return false;
+#elif NV_OS_DARWIN
+    // Ask the kernel for this process' info record and test its traced flag.
+    int name[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, getpid() };
+    struct kinfo_proc procInfo;
+    size_t procInfoSize = sizeof(procInfo);
+    // Cleared up front; the sysctl result is not checked, so an untouched flag reads as "not traced".
+    procInfo.kp_proc.p_flag = 0;
+    sysctl(name, 4, &procInfo, &procInfoSize, NULL, 0);
+    return (procInfo.kp_proc.p_flag & P_TRACED) == P_TRACED;
+#else
+    // if ppid != sid, some process spawned our app, probably a debugger. 
+    return getppid() != getsid(getpid());
+#endif
+}
--- a/src/nvcore/Debug.h
+++ b/src/nvcore/Debug.h
@ -10,6 +10,9 @@
 #   include <stdarg.h> // va_list
 #endif

+// Make sure we are using our assert.
+#undef assert
+
 #define NV_ABORT_DEBUG      1
 #define NV_ABORT_IGNORE     2
 #define NV_ABORT_EXIT       3
@ -116,12 +119,6 @@
 #endif


-#if __cplusplus > 199711L
-#define nvStaticCheck(x) static_assert(x)
-#else
-#define nvStaticCheck(x) typedef char NV_DO_STRING_JOIN2(__static_assert_,__LINE__)[(x)]
-#endif
-
 NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL);
 NVCORE_API void NV_CDECL nvDebugPrint( const char *msg, ... ) __attribute__((format (printf, 1, 2)));

@ -166,6 +163,8 @@ namespace nv

        NVCORE_API void enableSigHandler();
        NVCORE_API void disableSigHandler();
+
+        NVCORE_API bool isDebuggerPresent();
    }

 } // nv namespace
--- a/src/nvcore/DefsGnucDarwin.h
+++ b/src/nvcore/DefsGnucDarwin.h
@ -2,7 +2,7 @@
 #error "Do not include this file directly."
 #endif

-//#include <stdint.h> // uint8_t, int8_t, ...
+#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
 #include <cstddef> // operator new, size_t, NULL

 // Function linkage
@ -67,4 +67,4 @@ typedef int64_t     int64;

 // Aliases
 typedef uint32      uint;
-*/
+*/
--- a/src/nvcore/Memory.h
+++ b/src/nvcore/Memory.h
@ -12,10 +12,10 @@
 #include <new>	// new and delete


-#if NV_CC_GNUC
-#   define NV_ALIGN_16 __attribute__ ((__aligned__ (16)))
-#else
-#   define NV_ALIGN_16 __declspec(align(16))
+#if NV_CC_GNUC
+#   define NV_ALIGN_16 __attribute__ ((__aligned__ (16)))
+#else
+#   define NV_ALIGN_16 __declspec(align(16))
 #endif


@ -43,15 +43,15 @@ extern "C" {
 namespace nv {

    // C++ helpers.
-    template <typename T> T * malloc(size_t count) {
+    template <typename T> NV_FORCEINLINE T * malloc(size_t count) {
        return (T *)::malloc(sizeof(T) * count);
    }

-    template <typename T> T * realloc(T * ptr, size_t count) {
+    template <typename T> NV_FORCEINLINE T * realloc(T * ptr, size_t count) {
        return (T *)::realloc(ptr, sizeof(T) * count);
    }

-    template <typename T> void free(const T * ptr) {
+    template <typename T> NV_FORCEINLINE void free(const T * ptr) {
        ::free((void *)ptr);
    }

--- a/src/nvcore/StdStream.h
+++ b/src/nvcore/StdStream.h
@ -72,7 +72,7 @@ namespace nv
 #if NV_OS_WIN32
            return _ftell_nolock(m_fp);
 #else
-            return ftell(m_fp);
+            return (uint)ftell(m_fp);
 #endif
        }

@ -85,9 +85,9 @@ namespace nv
            uint end = _ftell_nolock(m_fp);
            _fseek_nolock(m_fp, pos, SEEK_SET);
 #else
-            uint pos = ftell(m_fp);
+            uint pos = (uint)ftell(m_fp);
            fseek(m_fp, 0, SEEK_END);
-            uint end = ftell(m_fp);
+            uint end = (uint)ftell(m_fp);
            fseek(m_fp, pos, SEEK_SET);            
 #endif
            return end;
--- a/src/nvcore/StrLib.cpp
+++ b/src/nvcore/StrLib.cpp
@ -189,7 +189,7 @@ StringBuilder::StringBuilder() : m_size(0), m_str(NULL)
 }

 /** Preallocate space. */
-StringBuilder::StringBuilder( int size_hint ) : m_size(size_hint)
+StringBuilder::StringBuilder( uint size_hint ) : m_size(size_hint)
 {
    nvDebugCheck(m_size > 0);
    m_str = strAlloc(m_size);
@ -203,9 +203,15 @@ StringBuilder::StringBuilder( const StringBuilder & s ) : m_size(0), m_str(NULL)
 }

 /** Copy string. */
-StringBuilder::StringBuilder( const char * s, int extra_size_hint/*=0*/ ) : m_size(0), m_str(NULL)
+StringBuilder::StringBuilder(const char * s) : m_size(0), m_str(NULL)
 {
-    copy(s, extra_size_hint);
+    copy(s);
+}
+
+/** Copy string. The caller passes the length explicitly; the work is delegated to copy(s, len). */
+StringBuilder::StringBuilder(const char * s, uint len) : m_size(0), m_str(NULL)
+{
+    copy(s, len);
+}

 /** Delete the string. */
@ -396,15 +402,25 @@ StringBuilder & StringBuilder::reserve( uint size_hint )


 /** Copy a string safely. */
-StringBuilder & StringBuilder::copy( const char * s, int extra_size/*=0*/ )
+StringBuilder & StringBuilder::copy(const char * s)
 {
    nvCheck( s != NULL );
    const uint str_size = uint(strlen( s )) + 1;
-    reserve(str_size + extra_size);
+    reserve(str_size);
    memcpy(m_str, s, str_size);
    return *this;
 }

+/** Copy a string safely, using a caller-supplied length instead of strlen. */
+StringBuilder & StringBuilder::copy(const char * s, uint len)
+{
+    nvCheck( s != NULL );
+
+    // Reserve one character more than len.
+    // NOTE(review): presumably room for the terminator; strCpy is not visible here -- confirm it writes one.
+    const uint capacity = len + 1;
+    reserve(capacity);
+    strCpy(m_str, capacity, s, len);
+
+    return *this;
+}
+

 /** Copy an StringBuilder. */
 StringBuilder & StringBuilder::copy( const StringBuilder & s )
--- a/src/nvcore/StrLib.h
+++ b/src/nvcore/StrLib.h
@ -59,9 +59,10 @@ namespace nv
    public:

        StringBuilder();
-        explicit StringBuilder( int size_hint );
-        StringBuilder( const char * str, int extra_size_hint = 0);
-        StringBuilder( const StringBuilder & );
+        explicit StringBuilder( uint size_hint );
+        StringBuilder(const char * str);
+        StringBuilder(const char * str, uint len);
+        StringBuilder(const StringBuilder & other);

        ~StringBuilder();

@ -75,9 +76,10 @@ namespace nv
        StringBuilder & number( int i, int base = 10 );
        StringBuilder & number( uint i, int base = 10 );

-        StringBuilder & reserve( uint size_hint );
-        StringBuilder & copy( const char * str, int extra_size/*=0*/ );
-        StringBuilder & copy( const StringBuilder & str );
+        StringBuilder & reserve(uint size_hint);
+        StringBuilder & copy(const char * str);
+        StringBuilder & copy(const char * str, uint len);
+        StringBuilder & copy(const StringBuilder & str);

        StringBuilder & toLower();
        StringBuilder & toUpper();
@ -145,7 +147,7 @@ namespace nv
    public:
        Path() : StringBuilder() {}
        explicit Path(int size_hint) : StringBuilder(size_hint) {}
-        Path(const char * str, int extra_size_hint = 0) : StringBuilder(str, extra_size_hint) {}
+        Path(const char * str) : StringBuilder(str) {}
        Path(const Path & path) : StringBuilder(path) {}

        const char * fileName() const;
--- a/src/nvcore/Utils.h
+++ b/src/nvcore/Utils.h
@ -7,9 +7,76 @@
 #include "nvcore.h"
 #include "Debug.h" // nvDebugCheck

+// Just in case. Grrr.
+#undef min
+#undef max
+
 namespace nv
 {
+    // Less error prone than casting. From CB:
+    // http://cbloomrants.blogspot.com/2011/06/06-17-11-c-casting-is-devil.html
+    // Overload resolution picks the destination type from the argument, so call sites never
+    // spell out a cast target themselves.
+    inline int8  asSigned(uint8 x)  { return static_cast<int8>(x); }
+    inline int16 asSigned(uint16 x) { return static_cast<int16>(x); }
+    inline int32 asSigned(uint32 x) { return static_cast<int32>(x); }
+    inline int64 asSigned(uint64 x) { return static_cast<int64>(x); }

+    // Convert a signed integer to the unsigned type of the matching name (int8 -> uint8, ...).
+    inline uint8  asUnsigned(int8 x)  { return static_cast<uint8>(x); }
+    inline uint16 asUnsigned(int16 x) { return static_cast<uint16>(x); }
+    inline uint32 asUnsigned(int32 x) { return static_cast<uint32>(x); }
+    inline uint64 asUnsigned(int64 x) { return static_cast<uint64>(x); }
+
+    /*
+    template <typename T> inline int8 toI8(T x) { 
+        nvDebugCheck(x <= INT8_MAX);
+        nvDebugCheck(x >= INT8_MIN);
+        int8 y = (int8) x;
+        nvDebugCheck(x == (T)y);
+        return y;
+    }
+    
+    template <typename T> inline uint8 toU8(T x) { 
+        nvDebugCheck(x <= UINT8_MAX);
+        nvDebugCheck(x >= 0);
+        return (uint8) x;
+    }
+
+    template <typename T> inline int16 toI16(T x) { 
+        nvDebugCheck(x <= INT16_MAX);
+        nvDebugCheck(x >= INT16_MIN);
+        return (int16) x;
+    }
+    
+    template <typename T> inline uint16 toU16(T x) { 
+        nvDebugCheck(x <= UINT16_MAX);
+        nvDebugCheck(x >= 0);
+        return (uint16) x;
+    }
+    
+    template <typename T> inline int32 toI32(T x) { 
+        nvDebugCheck(x <= INT32_MAX);
+        nvDebugCheck(x >= INT32_MIN);
+        return (int32) x;
+    }
+
+    template <typename T> inline uint32 toU32(T x) { 
+        nvDebugCheck(x <= UINT32_MAX);
+        nvDebugCheck(x >= 0);
+        return (uint32) x;
+    }
+    
+    template <typename T> inline int64 toI64(T x) { 
+        nvDebugCheck(x <= INT64_MAX);
+        nvDebugCheck(x >= INT64_MIN);
+        return (int64) x;
+    }
+    
+    template <typename T> inline uint64 toU64(T x) { 
+        nvDebugCheck(x <= UINT64_MAX);
+        nvDebugCheck(x >= 0);
+        return (uint64) x;
+    }
+    */
+    
    /// Swap two values.
    template <typename T> 
    inline void swap(T & a, T & b)
--- a/src/nvcore/nvcore.h
+++ b/src/nvcore/nvcore.h
@ -4,9 +4,6 @@
 #ifndef NV_CORE_H
 #define NV_CORE_H

-// cmake config
-#include <nvconfig.h>
-
 // Function linkage
 #if NVCORE_SHARED
 #ifdef NVCORE_EXPORTS
@ -91,7 +88,11 @@
 // @@ NV_CC_MSVC7
 // @@ NV_CC_MSVC8

-#if defined POSH_COMPILER_GCC
+// Clang is tested first. It also sets NV_CC_GNUC -- the same macro the GCC branch defines --
+// so that code guarded by NV_CC_GNUC takes its GCC path when built with Clang.
+#if defined POSH_COMPILER_CLANG
+#   define NV_CC_CLANG  1
+#   define NV_CC_GNUC   1    // Clang is compatible with GCC.
+#   define NV_CC_STRING "clang"
+#elif defined POSH_COMPILER_GCC
 #   define NV_CC_GNUC   1
 #   define NV_CC_STRING "gcc"
 #elif defined POSH_COMPILER_MSVC
@ -108,6 +109,18 @@
 #define NV_ENDIAN_STRING    POSH_ENDIAN_STRING


+// Define the right printf prefix for size_t arguments:
+// posh's 64-bit integer prefix when pointers are 64 bits wide, and nothing otherwise.
+// NOTE(review): this assumes size_t is as wide as a pointer -- confirm for every supported target.
+#if POSH_64BIT_POINTER
+#  define NV_SIZET_PRINTF_PREFIX POSH_I64_PRINTF_PREFIX
+#else
+#  define NV_SIZET_PRINTF_PREFIX
+#endif
+
+
+// cmake config
+#include "nvconfig.h"
+
+
 // Type definitions:
 typedef posh_u8_t   uint8;
 typedef posh_i8_t   int8;
@ -144,6 +157,8 @@ typedef uint32      uint;
    private: \
    void *operator new(size_t size); \
    void *operator new[](size_t size);
+    //static void *operator new(size_t size); \
+    //static void *operator new[](size_t size);

 // String concatenation macros.
 #define NV_STRING_JOIN2(arg1, arg2) NV_DO_STRING_JOIN2(arg1, arg2)
@ -153,6 +168,25 @@ typedef uint32      uint;
 #define NV_STRING2(x) #x
 #define NV_STRING(x) NV_STRING2(x)

+
+// Compile-time assertion: nvStaticCheck(x) breaks the build when the constant expression x is false.
+#if __cplusplus > 199711L
+// The message argument of static_assert is only optional from C++17 on, so always pass one.
+#define nvStaticCheck(x) static_assert((x), #x)
+#else
+// Fallback without static_assert: a false condition declares an array of negative size, which is
+// always an error (a zero size is tolerated by some compilers as an extension).
+#define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x) ? 1 : -1]
+#endif
+#define NV_COMPILER_CHECK(x) nvStaticCheck(x)   // I like this name best.
+
+// Make sure type definitions are fine.
+NV_COMPILER_CHECK(sizeof(int8) == 1);
+NV_COMPILER_CHECK(sizeof(uint8) == 1);
+NV_COMPILER_CHECK(sizeof(int16) == 2);
+NV_COMPILER_CHECK(sizeof(uint16) == 2);
+NV_COMPILER_CHECK(sizeof(int32) == 4);
+NV_COMPILER_CHECK(sizeof(uint32) == 4);
+NV_COMPILER_CHECK(sizeof(int64) == 8);
+NV_COMPILER_CHECK(sizeof(uint64) == 8);
+
+
 #define NV_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))

 #if 1
@ -180,6 +214,7 @@ typedef uint32      uint;

 // Null index. @@ Move this somewhere else... it's only used by nvmesh.
 //const unsigned int NIL = unsigned int(~0);
+//#define NIL uint(~0)

 // Null pointer.
 #ifndef NULL
--- a/src/nvimage/DirectDrawSurface.cpp
+++ b/src/nvimage/DirectDrawSurface.cpp
@ -1418,7 +1418,7 @@ uint DirectDrawSurface::mipmapSize(uint mipmap) const
    {
        nvDebugCheck((header.pf.flags & DDPF_RGB) || (header.pf.flags & DDPF_LUMINANCE));

-        uint pitch = computeBytePitch(w, header.pf.bitcount, 8); // Asuming 8 bit alignment, which is the same D3DX expects.
+        uint pitch = computeBytePitch(w, header.pf.bitcount, 1); // Asuming 1 byte alignment, which is the same D3DX expects.

        return pitch * h * d;
    }
--- a/src/nvimage/FloatImage.cpp
+++ b/src/nvimage/FloatImage.cpp
@ -181,7 +181,7 @@ void FloatImage::normalize(uint baseComponent)
    for (uint i = 0; i < count; i++) {

        Vector3 normal(xChannel[i], yChannel[i], zChannel[i]);
-        normal = normalizeSafe(normal, Vector3(zero), 0.0f);
+        normal = normalizeSafe(normal, Vector3(0), 0.0f);

        xChannel[i] = normal.x;
        yChannel[i] = normal.y;
--- a/src/nvimage/FloatImage.h
+++ b/src/nvimage/FloatImage.h
@ -56,6 +56,7 @@ namespace nv
        //@{
        NVIMAGE_API void clear(float f = 0.0f);
        NVIMAGE_API void clear(uint component, float f = 0.0f);
+        NVIMAGE_API void copyChannel(uint src, uint dst);

        NVIMAGE_API void normalize(uint base_component);

@ -113,8 +114,6 @@ namespace nv
        uint pixelCount() const { return m_pixelCount; }


-        // @@ It would make sense to swap the order of the arguments so that 'c' is always first.
-
        /** @name Pixel access. */
        //@{
        const float * channel(uint c) const;
--- a/src/nvimage/Image.h
+++ b/src/nvimage/Image.h
@ -70,14 +70,14 @@ namespace nv

    inline const Color32 & Image::pixel(uint x, uint y) const
    {
-        nvDebugCheck(x < width() && y < height());
-        return pixel(y * width() + x);
+        nvDebugCheck(x < m_width && y < m_height);
+        return pixel(y * m_width + x);
    }

    inline Color32 & Image::pixel(uint x, uint y)
    {
-        nvDebugCheck(x < width() && y < height());
-        return pixel(y * width() + x);
+        nvDebugCheck(x < m_width && y < m_height);
+        return pixel(y * m_width + x);
    }

 } // nv namespace
--- a/src/nvimage/ImageIO.cpp
+++ b/src/nvimage/ImageIO.cpp
@ -215,7 +215,7 @@ FloatImage * nv::ImageIO::loadFloat(const char * fileName)
    StdInputStream stream(fileName);

    if (stream.isError()) {
-        return false;
+        return NULL;
    }

    return loadFloat(fileName, stream);
@ -324,9 +324,9 @@ bool nv::ImageIO::saveFloat(const char * fileName, Stream & s, const FloatImage

 bool nv::ImageIO::saveFloat(const char * fileName, const FloatImage * fimage, uint baseComponent, uint componentCount)
 {
+#if !defined(HAVE_FREEIMAGE)
    const char * extension = Path::extension(fileName);

-#if !defined(HAVE_FREEIMAGE)
 #if defined(HAVE_OPENEXR)
    if (strCaseCmp(extension, ".exr") == 0) {
        return saveFloatEXR(fileName, fimage, baseComponent, componentCount);
@ -711,7 +711,7 @@ Image * nv::ImageIO::loadTGA(Stream & s)
        case TGA_TYPE_INDEXED:
            if( tga.colormap_type!=1 || tga.colormap_size!=24 || tga.colormap_length>256 ) {
                nvDebug( "*** loadTGA: Error, only 24bit paletted images are supported.\n" );
-                return false;
+                return NULL;
            }
            pal = true;
            break;
@ -732,7 +732,7 @@ Image * nv::ImageIO::loadTGA(Stream & s)

 	default:
 	    nvDebug( "*** loadTGA: Error, unsupported image type.\n" );
-	    return false;
+	    return NULL;
    }

    const uint pixel_size = (tga.pixel_size/8);
@ -1369,7 +1369,7 @@ Image * nv::ImageIO::loadJPG(Stream & s)
    // Read the entire file.
    Array<uint8> byte_array;
    byte_array.resize(s.size());
-    s.serialize(byte_array.mutableBuffer(), s.size());
+    s.serialize(byte_array.buffer(), s.size());

    jpeg_decompress_struct cinfo;
    jpeg_error_mgr jerr;
--- a/src/nvmath/Half.cpp
+++ b/src/nvmath/Half.cpp
@ -487,46 +487,126 @@ nv::half_to_float( uint16 h )
    return (f_result);
 }

-uint32 
-nv::fast_half_to_float( uint16 h )
-{
-    const uint32 h_e_mask              = _uint32_li( 0x00007c00 );
-    const uint32 h_m_mask              = _uint32_li( 0x000003ff );
-    const uint32 h_s_mask              = _uint32_li( 0x00008000 );
-    const uint32 h_f_s_pos_offset      = _uint32_li( 0x00000010 );
-    const uint32 h_f_e_pos_offset      = _uint32_li( 0x0000000d );
-    const uint32 h_f_bias_offset       = _uint32_li( 0x0001c000 );
-    const uint32 f_e_mask              = _uint32_li( 0x7f800000 );
-    const uint32 f_m_mask              = _uint32_li( 0x007fffff );
-    const uint32 h_f_e_denorm_bias     = _uint32_li( 0x0000007e );
-    const uint32 h_f_m_denorm_sa_bias  = _uint32_li( 0x00000008 );
-    const uint32 f_e_pos               = _uint32_li( 0x00000017 );
-    const uint32 h_e_mask_minus_one    = _uint32_li( 0x00007bff );
-    const uint32 h_e                   = _uint32_and( h, h_e_mask );
-    const uint32 h_m                   = _uint32_and( h, h_m_mask );
-    const uint32 h_s                   = _uint32_and( h, h_s_mask );
-    const uint32 h_e_f_bias            = _uint32_add( h_e, h_f_bias_offset );
-    const uint32 h_m_nlz               = _uint32_cntlz( h_m );
-    const uint32 f_s                   = _uint32_sll( h_s,        h_f_s_pos_offset );
-    const uint32 f_e                   = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
-    const uint32 f_m                   = _uint32_sll( h_m,        h_f_e_pos_offset );
-    const uint32 f_em                  = _uint32_or(  f_e,        f_m              );
-    const uint32 h_f_m_sa              = _uint32_sub( h_m_nlz,             h_f_m_denorm_sa_bias );
-    const uint32 f_e_denorm_unpacked   = _uint32_sub( h_f_e_denorm_bias,   h_f_m_sa             );
-    const uint32 h_f_m                 = _uint32_sll( h_m,                 h_f_m_sa             );
-    const uint32 f_m_denorm            = _uint32_and( h_f_m,               f_m_mask             );
-    const uint32 f_e_denorm            = _uint32_sll( f_e_denorm_unpacked, f_e_pos              );
-    const uint32 f_em_denorm           = _uint32_or(  f_e_denorm,          f_m_denorm           );
-    const uint32 f_em_nan              = _uint32_or(  f_e_mask,            f_m                  );
-    const uint32 is_e_eqz_msb          = _uint32_dec(  h_e );
-    const uint32 is_m_nez_msb          = _uint32_neg(  h_m );
-    const uint32 is_e_flagged_msb      = _uint32_sub(  h_e_mask_minus_one, h_e );
-    const uint32 is_zero_msb           = _uint32_andc( is_e_eqz_msb,       is_m_nez_msb );
-    const uint32 is_denorm_msb         = _uint32_and(  is_m_nez_msb,       is_e_eqz_msb );
-    const uint32 is_zero               = _uint32_ext(  is_zero_msb );
-    const uint32 f_zero_result         = _uint32_andc( f_em, is_zero );
-    const uint32 f_denorm_result       = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result );
-    const uint32 f_result              = _uint32_or( f_s, f_denorm_result );

-    return (f_result);
+// @@ These tables could be smaller.
+static uint32 mantissa_table[2048];
+static uint32 exponent_table[64];
+static uint32 offset_table[64];
+
+// Fills the three static lookup tables that fast_half_to_float() reads.
+// The tables are indexed as in fast_half_to_float(): (h >> 10) selects the
+// exponent/offset entry and the low 10 bits of h select the mantissa entry.
+void nv::half_init_tables()
+{
+    // Init mantissa table.
+	mantissa_table[0] = 0;
+
+	// Entries 1..1023 are reached with offset 0, i.e. when the exponent bits of the
+	// half are zero. The mantissa is moved to the float position (<< 13) and then
+	// shifted left until bit 23 is set, lowering the exponent once per shift.
+	for (int i = 1; i < 1024; i++) {
+		uint m = i << 13;
+		uint e = 0;
+
+		while ((m & 0x00800000) == 0) {
+			e -= 0x00800000;	// Unsigned wrap-around is intended; the bias added below undoes it.
+			m <<= 1;
+		}
+		m &= ~0x00800000;	// Drop bit 23 again, only the fraction bits are stored.
+		e += 0x38800000;	// Exponent field 113 (0x38800000 >> 23).
+		mantissa_table[i] = m | e;
+	}
+
+    // Entries 1024..2047 are reached with offset 1024 (non-zero exponent bits).
+    for (int i = 1024; i < 2048; i++) {
+		mantissa_table[i] = 0x38000000 + ((i - 1024) << 13);
+    }
+
+
+    // Init exponent table.
+    // Indices 0..31 have the sign bit clear, 32..63 have it set (0x80000000).
+	exponent_table[0] = 0;
+
+    for (int i = 1; i < 31; i++) {
+		exponent_table[i] = (i << 23);
+    }
+
+	// Exponent 31: 0x38000000 (mantissa table base) + 0x47800000 = 0x7F800000.
+	exponent_table[31] = 0x47800000;
+	exponent_table[32] = 0x80000000;
+
+    for (int i = 33; i < 63; i++) {
+		exponent_table[i] = 0x80000000 + ((i - 32) << 23);
+    }
+
+	exponent_table[63] = 0xC7800000;
+
+
+    // Init offset table.
+    // 0 for the two entries with zero exponent bits (0 and 32), 1024 otherwise.
+	offset_table[0] = 0;
+
+    for (int i = 1; i < 32; i++) {
+		offset_table[i] = 1024;
+    }
+
+	offset_table[32] = 0;
+
+    for (int i = 33; i < 64; i++) {
+		offset_table[i] = 1024;
+    }
+
+    /*for (int i = 0; i < 64; i++) {
+        offset_table[i] = ((i & 31) != 0) * 1024;
+    }*/
 }
+
+// Table-driven half to float conversion, after:
+// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
+// Reads mantissa_table, offset_table and exponent_table, which are filled by
+// nv::half_init_tables().
+uint32 nv::fast_half_to_float(uint16 h)
+{
+	const uint signAndExponent = h >> 10;	// Upper 6 bits of the half.
+	const uint fraction = h & 0x3ff;	// Lower 10 bits of the half.
+	const uint32 mantissaBits = mantissa_table[offset_table[signAndExponent] + fraction];
+	return mantissaBits + exponent_table[signAndExponent];
+}
+
+
+#if 0
+// Inaccurate conversion suggested at the ffmpeg mailing list:
+// http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html
+uint32 nv::fast_half_to_float(uint16 v)
+{
+    if (v & 0x8000) return 0;
+    uint exp = v >> 10;
+    if (!exp) return (v>>9)&1;
+    if (exp >= 15) return 0xffff;
+    v <<= 6;
+    return (v+(1<<16)) >> (15-exp);
+}
+
+#endif
+
+#if 0
+
+// Some more from a gamedev thread:
+// http://www.devmaster.net/forums/showthread.php?t=10924
+
+// I believe it does not handle specials either.
+
+// Mike Acton's code should be fairly easy to vectorize and that would handle all cases too, the table method might still be faster, though.
+
+
+static __declspec(align(16)) unsigned half_sign[4]	  = {0x00008000, 0x00008000, 0x00008000, 0x00008000};
+static __declspec(align(16)) unsigned half_exponent[4]	  = {0x00007C00, 0x00007C00, 0x00007C00, 0x00007C00};
+static __declspec(align(16)) unsigned half_mantissa[4]	  = {0x000003FF, 0x000003FF, 0x000003FF, 0x000003FF};
+static __declspec(align(16)) unsigned half_bias_offset[4] = {0x0001C000, 0x0001C000, 0x0001C000, 0x0001C000};
+
+__asm
+{
+	movaps	xmm1, xmm0  // Input in xmm0
+	movaps	xmm2, xmm0
+
+	andps	xmm0, half_sign
+	andps	xmm1, half_exponent
+	andps	xmm2, half_mantissa
+	paddd	xmm1, half_bias_offset
+
+	pslld	xmm0, 16
+	pslld	xmm1, 13
+	pslld	xmm2, 13
+
+	orps	xmm1, xmm2
+	orps	xmm0, xmm1  // Result in xmm0
+}
+
+
+#endif
--- a/src/nvmath/Half.h
+++ b/src/nvmath/Half.h
@ -9,8 +9,9 @@ namespace nv {
    uint32 half_to_float( uint16 h );
    uint16 half_from_float( uint32 f );

-    // Does not handle NaN or infinity.
-    uint32 fast_half_to_float( uint16 h );
+    // Fills the lookup tables that fast_half_to_float reads.
+    void half_init_tables();
+
+    // Table-based conversion; reads the tables filled by half_init_tables.
+    uint32 fast_half_to_float(uint16 h);

    inline uint16 to_half(float c) {
        union { float f; uint32 u; } f;
--- a/src/nvmath/Matrix.h
+++ b/src/nvmath/Matrix.h
@ -9,15 +9,14 @@

 namespace nv
 {
-    enum zero_t { zero };
    enum identity_t { identity };

    class NVMATH_CLASS Matrix3
    {
    public:
        Matrix3();
-        Matrix3(zero_t);
-        Matrix3(identity_t);
+        explicit Matrix3(float f);
+        explicit Matrix3(identity_t);
        Matrix3(const Matrix3 & m);
        Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2);

@ -41,10 +40,10 @@ namespace nv

    inline Matrix3::Matrix3() {}
    
-    inline Matrix3::Matrix3(zero_t)
+    inline Matrix3::Matrix3(float f)
    {
        for(int i = 0; i < 9; i++) {
-            m_data[i] = 0.0f;
+            m_data[i] = f;
        }
    }

@ -204,11 +203,11 @@ namespace nv
        typedef Matrix const & Arg;

        Matrix();
-        Matrix(zero_t);
-        Matrix(identity_t);
+        explicit Matrix(float f);
+        explicit Matrix(identity_t);
        Matrix(const Matrix & m);
        Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);
-        Matrix(const scalar m[]);	// m is assumed to contain 16 elements
+        //explicit Matrix(const scalar m[]);	// m is assumed to contain 16 elements

        scalar data(uint idx) const;
        scalar & data(uint idx);
@ -237,7 +236,7 @@ namespace nv
    {
    }

-    inline Matrix::Matrix(zero_t)
+    inline Matrix::Matrix(float f)
    {
        for(int i = 0; i < 16; i++) {
-            m_data[i] = 0.0f;
+            m_data[i] = f;  // Fill every element with f, as Matrix3(float) does.
@ -268,12 +267,12 @@ namespace nv
        m_data[12] = v3.x; m_data[13] = v3.y; m_data[14] = v3.z; m_data[15] = v3.w;
    }

-    inline Matrix::Matrix(const scalar m[])
+    /*inline Matrix::Matrix(const scalar m[])
    {
        for(int i = 0; i < 16; i++) {
            m_data[i] = m[i];
        }
-    }
+    }*/


    // Accessors
@ -456,7 +455,7 @@ namespace nv
    /// Get frustum matrix.
    inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear, scalar zFar)
    {
-        Matrix m(zero);
+        Matrix m(0.0f);

        scalar doubleznear = 2.0f * zNear;
        scalar one_deltax = 1.0f / (xmax - xmin);
@ -477,7 +476,7 @@ namespace nv
    /// Get infinite frustum matrix.
    inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear)
    {
-        Matrix m(zero);
+        Matrix m(0.0f);

        scalar doubleznear = 2.0f * zNear;
        scalar one_deltax = 1.0f / (xmax - xmin);
--- a/src/nvmath/Vector.h
+++ b/src/nvmath/Vector.h
@ -100,6 +100,7 @@ namespace nv
        explicit Vector4(scalar x);
        Vector4(scalar x, scalar y, scalar z, scalar w);
        Vector4(Vector2::Arg v, scalar z, scalar w);
+        Vector4(Vector2::Arg v, Vector2::Arg u);
        Vector4(Vector3::Arg v, scalar w);
        Vector4(Vector4::Arg v);
        //	Vector4(const Quaternion & v);
@ -107,6 +108,7 @@ namespace nv
        const Vector4 & operator=(Vector4::Arg v);

        Vector2 xy() const;
+        Vector2 zw() const;
        Vector3 xyz() const;

        const scalar * ptr() const;
@ -290,6 +292,7 @@ namespace nv
    inline Vector4::Vector4(scalar f) : x(f), y(f), z(f), w(f) {}
    inline Vector4::Vector4(scalar x, scalar y, scalar z, scalar w) : x(x), y(y), z(z), w(w) {}
    inline Vector4::Vector4(Vector2::Arg v, scalar z, scalar w) : x(v.x), y(v.y), z(z), w(w) {}
+    inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {}
    inline Vector4::Vector4(Vector3::Arg v, scalar w) : x(v.x), y(v.y), z(v.z), w(w) {}
    inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {}

@ -307,6 +310,11 @@ namespace nv
        return Vector2(x, y);
    }

+    inline Vector2 Vector4::zw() const
+    {
+        return Vector2(z, w);
+    }
+
    inline Vector3 Vector4::xyz() const
    {
        return Vector3(x, y, z);
@ -469,6 +477,14 @@ namespace nv
        return scale(v, 1.0f / l);
    }

+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    // A very small constant is added to the length before dividing by it.
+    inline Vector2 normalizeFast(Vector2::Arg v)
+    {
+        const float very_small_float = 1.0e-037f;
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }

    inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON)
    {
@ -498,6 +514,14 @@ namespace nv
        return vf;
    }

+    // Returns the 2D cross product of (a - c) and (b - c).
+    // There is no 0.5 factor, so the value is twice the signed area of the
+    // triangle (a, b, c); the sign follows the order of the three points.
+    inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
+    {
+	    Vector2 v0 = a - c;
+	    Vector2 v1 = b - c;
+
+	    return (v0.x * v1.y - v0.y * v1.x);
+    }
+

    // Vector3

@ -570,10 +594,10 @@ namespace nv
        return scale(v, 1.0f/s);
    }

-    inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, scalar s)
+    /*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, scalar s)
    {
        return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s);
-    }
+    }*/

    inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, scalar t)
    {
@ -624,6 +648,15 @@ namespace nv
        return scale(v, 1.0f / l);
    }

+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    // A very small constant is added to the length before dividing by it.
+    inline Vector3 normalizeFast(Vector3::Arg v)
+    {
+        const float very_small_float = 1.0e-037f;
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
    inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON)
    {
        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon);
@ -762,6 +795,15 @@ namespace nv
        return scale(v, 1.0f / l);
    }

+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    // A very small constant is added to the length before dividing by it.
+    inline Vector4 normalizeFast(Vector4::Arg v)
+    {
+        const float very_small_float = 1.0e-037f;
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
    inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON)
    {
        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon);
--- a/src/nvmath/nvmath.h
+++ b/src/nvmath/nvmath.h
@ -4,8 +4,9 @@
 #ifndef NV_MATH_H
 #define NV_MATH_H

-#include <nvcore/nvcore.h>
-#include <nvcore/Debug.h>
+#include "nvcore/nvcore.h"
+#include "nvcore/Debug.h"
+#include "nvcore/Utils.h" // clamp

 #include <math.h>
 #include <limits.h> // INT_MAX
@ -194,7 +195,7 @@ namespace nv
        return f - floor(f);
    }

-    inline float fround(float f)
+    inline float fround(float f)    // @@ rename floatRound
    {
        // @@ Do something better.
        return float(iround(f));
@ -210,6 +211,29 @@ namespace nv
        }
    }

+    // Clamps f to the [0, 1] range.
+    inline float saturate(float f) {
+        return clamp(f, 0.0f, 1.0f);
+    }
+
+    // Maps x linearly from [edge0, edge1] to [0, 1] and saturates the result.
+    // NOTE: edge0 == edge1 makes the denominator zero; callers must avoid it.
+    inline float linearstep(float edge0, float edge1, float x) {
+        // Scale, bias and saturate x to 0..1 range
+        return saturate((x - edge0) / (edge1 - edge0));
+    }
+
+    // Cubic step: x is first mapped to [0, 1] with linearstep, then passed
+    // through the polynomial 3t^2 - 2t^3.
+    inline float smoothstep(float edge0, float edge1, float x) {
+        x = linearstep(edge0, edge1, x);
+
+        const float xSquared = x*x;
+        return xSquared*(3 - 2*x);
+    }
+
+    // Returns 1 for positive values, -1 for negative values and 0 otherwise
+    // (zero, or a value that compares neither greater nor less than zero).
+    inline int sign(float a)
+    {
+        const int isPositive = (a > 0.0f) ? 1 : 0;
+        const int isNegative = (a < 0.0f) ? 1 : 0;
+        return isPositive - isNegative;
+    }
+
 } // nv

 #endif // NV_MATH_H
--- a/src/nvthread/CMakeLists.txt
+++ b/src/nvthread/CMakeLists.txt
@ -0,0 +1,26 @@
+PROJECT(nvthreads)
+
+# Sources of the thread library. The sources include "nvthread.h" (no trailing s).
+# NOTE(review): SpinWaiter.* and ThreadLocalStorage.* are kept from the original
+# list; confirm that these files exist in src/nvthread.
+SET(THREADS_SRCS
+	nvthread.h nvthread.cpp
+	Event.h Event.cpp
+	Mutex.h Mutex.cpp
+	ParallelFor.h ParallelFor.cpp
+	SpinWaiter.h SpinWaiter.cpp
+	Thread.h Thread.cpp
+	ThreadLocalStorage.h ThreadLocalStorage.cpp
+	ThreadPool.h ThreadPool.cpp)
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# targets
+# nvthread.h tests NVTHREAD_EXPORTS and NVTHREAD_SHARED, so define exactly those names.
+ADD_DEFINITIONS(-DNVTHREAD_EXPORTS)
+
+IF(NVTHREADS_SHARED)
+	ADD_DEFINITIONS(-DNVTHREAD_SHARED=1)
+	ADD_LIBRARY(nvthreads SHARED ${THREADS_SRCS})
+ELSE(NVTHREADS_SHARED)
+	ADD_LIBRARY(nvthreads ${THREADS_SRCS})
+ENDIF(NVTHREADS_SHARED)
+
+TARGET_LINK_LIBRARIES(nvthreads ${LIBS} nvcore)
+
+INSTALL(TARGETS nvthreads
+	RUNTIME DESTINATION bin
+	LIBRARY DESTINATION lib
+	ARCHIVE DESTINATION lib/static)
--- a/src/nvthread/Event.cpp
+++ b/src/nvthread/Event.cpp
@ -0,0 +1,52 @@
+// This code is in the public domain -- castano@gmail.com
+
+#include "Event.h"
+
+#if NV_OS_WIN32
+#include "Win32.h"
+#elif NV_OS_UNIX
+#include <pthread.h>
+#endif
+
+using namespace nv;
+
+#if NV_OS_WIN32
+
+// Win32 implementation on top of an event object.
+struct Event::Private {
+	HANDLE handle;
+};
+
+Event::Event() : m(new Private) {
+    // Second argument FALSE: auto-reset event, initially non-signaled.
+    m->handle = CreateEvent(NULL, FALSE, FALSE, NULL);
+}
+
+Event::~Event() {
+    CloseHandle(m->handle);
+}
+
+void Event::post() {
+    SetEvent(m->handle);
+}
+
+void Event::wait() {
+    WaitForSingleObject(m->handle, INFINITE);
+}
+
+#elif NV_OS_UNIX
+
+// pthread implementation: a flag guarded by a mutex plus a condition variable.
+// wait() clears the flag, which gives the same auto-reset behavior as the
+// Win32 implementation above.
+struct Event::Private {
+    pthread_mutex_t mutex;
+    pthread_cond_t cond;
+    bool signaled;
+};
+
+Event::Event() : m(new Private) {
+    m->signaled = false;
+
+    int result = pthread_mutex_init(&m->mutex, NULL);
+    nvDebugCheck(result == 0);
+
+    result = pthread_cond_init(&m->cond, NULL);
+    nvDebugCheck(result == 0);
+}
+
+Event::~Event() {
+    int result = pthread_cond_destroy(&m->cond);
+    nvDebugCheck(result == 0);
+
+    result = pthread_mutex_destroy(&m->mutex);
+    nvDebugCheck(result == 0);
+}
+
+void Event::post() {
+    pthread_mutex_lock(&m->mutex);
+    m->signaled = true;
+    pthread_cond_signal(&m->cond);
+    pthread_mutex_unlock(&m->mutex);
+}
+
+void Event::wait() {
+    pthread_mutex_lock(&m->mutex);
+
+    // Loop to cope with spurious wake-ups.
+    while (!m->signaled) {
+        pthread_cond_wait(&m->cond, &m->mutex);
+    }
+
+    m->signaled = false;    // Wait resets the event.
+    pthread_mutex_unlock(&m->mutex);
+}
+
+#endif
+
+
+// The array helpers only use post() and wait(), so they are shared by all platforms.
+
+// Posts each of the 'count' events in order.
+/*static*/ void Event::post(Event * events, uint count) {
+    for (uint i = 0; i < count; i++) {
+        events[i].post();
+    }
+}
+
+// Waits for each of the 'count' events in order; returns once all of them were posted.
+/*static*/ void Event::wait(Event * events, uint count) {
+    // @@ Use wait for multiple objects?
+
+    for (uint i = 0; i < count; i++) {
+        events[i].wait();
+    }
+}
--- a/src/nvthread/Event.h
+++ b/src/nvthread/Event.h
@ -0,0 +1,34 @@
+// This code is in the public domain -- castano@gmail.com
+
+#pragma once
+#ifndef NV_THREAD_EVENT_H
+#define NV_THREAD_EVENT_H
+
+#include "nvthread.h"
+
+#include "nvcore/Ptr.h"
+
+namespace nv
+{
+    // This is intended to be used by a single waiter thread.
+    // The platform state lives in the Private struct defined in Event.cpp.
+	class NVTHREAD_CLASS Event
+	{
+		NV_FORBID_COPY(Event);
+	public:
+		Event();
+		~Event();
+
+		void post();    // Signals the event.
+		void wait();    // Wait resets the event.
+
+        // Array helpers: call post() / wait() on each of the 'count' events in order.
+        static void post(Event * events, uint count);
+        static void wait(Event * events, uint count);
+
+	private:
+		struct Private;
+		AutoPtr<Private> m;
+	};
+
+} // nv namespace
+
+#endif // NV_THREAD_EVENT_H
--- a/src/nvthread/Mutex.cpp
+++ b/src/nvthread/Mutex.cpp
@ -0,0 +1,89 @@
+// This code is in the public domain -- castano@gmail.com
+
+#include "Mutex.h"
+
+#if NV_OS_WIN32
+
+#include "Win32.h"
+
+#elif NV_OS_UNIX
+
+#include <pthread.h>
+#include <errno.h> // EBUSY
+
+#endif // NV_OS
+
+using namespace nv;
+
+
+#if NV_OS_WIN32
+
+// Win32 implementation: the mutex is a critical section.
+struct Mutex::Private {
+	CRITICAL_SECTION mutex;
+};
+
+
+Mutex::Mutex () : m(new Private)
+{
+	CRITICAL_SECTION * const section = &m->mutex;
+	InitializeCriticalSection(section);
+}
+
+Mutex::~Mutex ()
+{
+	CRITICAL_SECTION * const section = &m->mutex;
+	DeleteCriticalSection(section);
+}
+
+// Blocks until the critical section is entered.
+void Mutex::lock()
+{
+	CRITICAL_SECTION * const section = &m->mutex;
+	EnterCriticalSection(section);
+}
+
+// Returns true when the critical section was entered, false otherwise.
+bool Mutex::tryLock()
+{
+	const BOOL entered = TryEnterCriticalSection(&m->mutex);
+	return entered != 0;
+}
+
+void Mutex::unlock()
+{
+	CRITICAL_SECTION * const section = &m->mutex;
+	LeaveCriticalSection(section);
+}
+
+#elif NV_OS_UNIX
+
+// pthread implementation: a mutex created with default attributes.
+struct Mutex::Private {
+	pthread_mutex_t mutex;
+};
+
+
+Mutex::Mutex () : m(new Private)
+{
+	const int status = pthread_mutex_init(&m->mutex , NULL);
+	nvDebugCheck(status == 0);
+}
+
+Mutex::~Mutex ()
+{
+	const int status = pthread_mutex_destroy(&m->mutex);
+	nvDebugCheck(status == 0);
+}
+
+// Blocks until the mutex is acquired.
+void Mutex::lock()
+{
+	const int status = pthread_mutex_lock(&m->mutex);
+	nvDebugCheck(status == 0);
+}
+
+// Returns true when the mutex was acquired; EBUSY is the only expected failure.
+bool Mutex::tryLock()
+{
+	const int status = pthread_mutex_trylock(&m->mutex);
+	const bool acquired = (status == 0);
+	nvDebugCheck(acquired || status == EBUSY);
+	return acquired;
+}
+
+void Mutex::unlock()
+{
+	const int status = pthread_mutex_unlock(&m->mutex);
+	nvDebugCheck(status == 0);
+}
+
+#endif // NV_OS
--- a/src/nvthread/Mutex.h
+++ b/src/nvthread/Mutex.h
@ -0,0 +1,47 @@
+// This code is in the public domain -- castano@gmail.com
+
+#pragma once
+#ifndef NV_THREAD_MUTEX_H
+#define NV_THREAD_MUTEX_H
+
+#include "nvthread.h"
+
+#include "nvcore/Ptr.h"
+
+namespace nv
+{
+
+	// Mutual exclusion lock. The platform state lives in the Private struct
+	// defined in Mutex.cpp (critical section on Win32, pthread mutex on Unix).
+	class NVTHREAD_CLASS Mutex
+	{
+		NV_FORBID_COPY(Mutex);
+	public:
+		Mutex ();
+		~Mutex ();
+
+		void lock();        // Blocks until the mutex is acquired.
+		bool tryLock();     // Returns true when the mutex was acquired.
+		void unlock();
+
+	private:
+		struct Private;
+		AutoPtr<Private> m;
+	};
+
+
+    // Templated lock that can be used with any mutex.
+    // Locks the mutex on construction and unlocks it on destruction. M only
+    // needs lock() and unlock() members. The referenced mutex must outlive the Lock.
+    template <class M>
+	class Lock
+	{
+		NV_FORBID_COPY(Lock);
+	public:
+
+		// explicit: a mutex should never convert to a lock implicitly.
+		explicit Lock (M & m) : m_mutex (m) { m_mutex.lock(); }
+		~Lock () { m_mutex.unlock(); }
+		
+	private:
+		M & m_mutex;
+	};
+
+} // nv namespace
+
+#endif // NV_THREAD_MUTEX_H
--- a/src/nvthread/ParallelFor.cpp
+++ b/src/nvthread/ParallelFor.cpp
@ -0,0 +1,61 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "ParallelFor.h"
+#include "Thread.h"
+#include "Atomic.h"
+#include "ThreadPool.h"
+
+using namespace nv;
+
+#define ENABLE_PARALLEL_FOR 1
+
+
+// Thread pool entry point for ParallelFor::run. Each thread repeatedly claims the
+// next index from the shared counter and runs the task on it, until the counter
+// has moved past 'count'.
+// static: this helper is only used in this file and 'worker' is too generic a
+// name to export.
+static void worker(void * arg) {
+    ParallelFor * owner = (ParallelFor *)arg;
+
+    while(true) {
+        // Consume one element at a time. @@ Might be more efficient to have custom grain.
+        // NOTE(review): assumes atomicIncrement returns the incremented value, hence the 'i - 1' below.
+        uint i = atomicIncrement(&owner->idx);
+        if (i > owner->count) {
+            break;
+        }
+
+        owner->task(owner->context, i - 1);
+    } 
+}
+
+
+// Stores the task and its context and, when ENABLE_PARALLEL_FOR is set, acquires
+// the shared thread pool (released again in the destructor).
+// pool, count and idx are given defined values so that no member is left
+// uninitialized, in particular when ENABLE_PARALLEL_FOR is 0.
+ParallelFor::ParallelFor(ForTask * task, void * context) : task(task), context(context), pool(NULL), count(0), idx(0) {
+#if ENABLE_PARALLEL_FOR
+    pool = ThreadPool::acquire();
+#endif
+}
+
+// Hands the pool acquired in the constructor back to ThreadPool.
+ParallelFor::~ParallelFor() {
+#if ENABLE_PARALLEL_FOR
+    ThreadPool::release(pool);
+#endif
+}
+
+// Runs task(context, i) for every i in [0, count) and returns when all calls are done.
+// With ENABLE_PARALLEL_FOR the indices are handed out to the pool threads through
+// the shared counter 'idx'; otherwise they are processed in order on the calling thread.
+void ParallelFor::run(uint count) {
+#if ENABLE_PARALLEL_FOR
+    storeRelease(&this->count, count);
+
+    // Init atomic counter to zero.
+    storeRelease(&idx, 0);
+
+    // Start threads.
+    pool->start(worker, this);
+
+    // Wait for all threads to complete.
+    pool->wait();
+
+    nvDebugCheck(idx >= count);
+#else
+    // Unsigned index: 'count' is unsigned, so avoid a signed/unsigned comparison.
+    for (uint i = 0; i < count; i++) {
+        task(context, int(i));
+    }
+#endif
+}
+
+
--- a/src/nvthread/ParallelFor.h
+++ b/src/nvthread/ParallelFor.h
@ -0,0 +1,38 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_THREAD_PARALLELFOR_H
+#define NV_THREAD_PARALLELFOR_H
+
+#include "nvthread.h"
+//#include "Atomic.h" // atomic<uint>
+
+namespace nv
+{
+    class Thread;
+    class ThreadPool;
+
+    typedef void ForTask(void * context, int id);
+
+    // Runs a ForTask once for each index of a range, using the shared ThreadPool.
+    struct ParallelFor {
+        ParallelFor(ForTask * task, void * context);
+        ~ParallelFor();
+
+        // Calls task(context, i) for i in [0, count) and returns when all calls finished.
+        void run(uint count);
+
+        // Invariant:
+        ForTask * task;
+        void * context;
+        ThreadPool * pool;  // Acquired in the constructor, released in the destructor.
+        //uint workerCount;   // @@ Move to thread pool.
+        //Thread * workers;
+
+        // State:
+        uint count;                     // Number of indices of the current run.
+        /*atomic<uint>*/ uint idx;      // Shared counter the worker threads increment to claim indices.
+    };
+
+} // nv namespace
+
+
+#endif // NV_THREAD_PARALLELFOR_H
--- a/src/nvthread/Thread.cpp
+++ b/src/nvthread/Thread.cpp
@ -0,0 +1,136 @@
+// This code is in the public domain -- castano@gmail.com
+
+#include "Thread.h"
+
+#if NV_OS_WIN32
+	#include "Win32.h"
+#elif NV_OS_UNIX
+	#include <pthread.h>
+	#include <time.h> // nanosleep
+	#include <unistd.h> // usleep
+#endif
+
+using namespace nv;
+
+// Platform specific state of a Thread.
+struct Thread::Private
+{
+#if NV_OS_WIN32
+	HANDLE thread;      // NULL while no thread is attached.
+#elif NV_OS_UNIX
+	pthread_t thread;   // 0 while no thread is attached.
+#endif
+
+    // NOTE(review): Thread::start() stores the function and argument in the public
+    // Thread::func / Thread::arg members; nothing in this file writes these two.
+    ThreadFunc * func;
+    void * arg;
+};
+
+#if NV_OS_WIN32
+
+// Native entry point handed to CreateThread. 'arg' is the Thread object;
+// runs the user function stored in it.
+unsigned long __stdcall threadFunc(void * arg) {
+    Thread * thread = (Thread *)arg;
+    thread->func(thread->arg);
+    return 0;
+}
+
+#elif NV_OS_UNIX
+// Native entry point handed to pthread_create. 'arg' is the Thread object;
+// runs the user function stored in it.
+extern "C" void * threadFunc(void * arg) {
+    Thread * thread = (Thread *)arg;
+	thread->func(thread->arg);
+	// Returning from the start routine ends the thread just like pthread_exit(),
+	// and gives this non-void function a proper return statement.
+	return NULL;
+}
+#endif
+
+
+// Creates a thread object with no native thread attached.
+Thread::Thread() : p(new Private)
+{
+    p->thread = 0;
+}
+
+// The native thread must have been waited for (wait() resets the handle to 0)
+// before the object is destroyed; this is only checked in debug builds.
+Thread::~Thread()
+{
+	nvDebugCheck(p->thread == 0);
+}
+
+// Stores func and arg in this object and creates a native thread that calls
+// func(arg) through threadFunc. Creation failure is only checked in debug builds.
+void Thread::start(ThreadFunc * func, void * arg)
+{
+    this->func = func;
+    this->arg = arg;
+
+#if NV_OS_WIN32
+    p->thread = CreateThread(NULL, 0, threadFunc, this, 0, NULL);
+	//p->thread = (HANDLE)_beginthreadex (0, 0, threadFunc, this, 0, NULL);     // @@ So that we can call CRT functions...
+	nvDebugCheck(p->thread != NULL);
+#elif NV_OS_UNIX
+	int result = pthread_create(&p->thread, NULL, threadFunc, this);
+	nvDebugCheck(result == 0);
+#endif
+}
+
+// Blocks until the native thread has finished, then releases it and resets
+// the stored handle so that isRunning() returns false.
+void Thread::wait()
+{
+#if NV_OS_WIN32
+    DWORD status = WaitForSingleObject (p->thread, INFINITE);
+    nvCheck (status ==  WAIT_OBJECT_0);
+    BOOL ok = CloseHandle (p->thread);
+    p->thread = NULL;
+    nvCheck (ok);
+#elif NV_OS_UNIX
+	int result = pthread_join(p->thread, NULL); 
+    p->thread = 0;
+	nvDebugCheck(result == 0);
+#endif
+}
+
+// Returns true while a native thread handle is stored, i.e. between start() and
+// wait(). It does not query whether the thread function has already returned.
+bool Thread::isRunning () const
+{
+#if NV_OS_WIN32
+	return p->thread != NULL;
+#elif NV_OS_UNIX
+	return p->thread != 0;
+#endif
+}
+
+// Busy-waits for 'count' loop iterations.
+// The counter is volatile so that the otherwise empty loop cannot be removed by
+// the optimizer.
+/*static*/ void Thread::spinWait(uint count)
+{
+	for (volatile uint i = 0; i < count; i++) {}
+}
+
+// Gives up the rest of the calling thread's time slice.
+/*static*/ void Thread::yield()
+{
+#if NV_OS_WIN32
+	SwitchToThread();
+#elif NV_OS_UNIX
+	int result = sched_yield();
+	nvDebugCheck(result == 0);
+#endif
+}
+
+// Suspends the calling thread for at least 'ms' milliseconds.
+/*static*/ void Thread::sleep(uint ms)
+{
+#if NV_OS_WIN32
+	Sleep(ms);
+#elif NV_OS_UNIX
+	// usleep() may reject intervals of one second or more (EINVAL) and '1000 * ms'
+	// can overflow, so use nanosleep(), which takes seconds and nanoseconds separately.
+	struct timespec interval;
+	interval.tv_sec = ms / 1000;
+	interval.tv_nsec = long(ms % 1000) * 1000000L;
+	nanosleep(&interval, NULL);
+#endif
+}
+
+// Waits for each of the 'count' threads in turn; returns once all have finished.
+/*static*/ void Thread::wait(Thread * threads, uint count)
+{
+/*#if NV_OS_WIN32
+    // @@ Is there any advantage in doing this?
+    nvDebugCheck(count < MAXIMUM_WAIT_OBJECTS);
+
+    HANDLE * handles = new HANDLE[count];
+    for (uint i = 0; i < count; i++) {
+        handles[i] = threads[i].p->thread;
+    }
+
+    DWORD result = WaitForMultipleObjects(count, handles, TRUE, INFINITE);
+
+
+    delete [] handles;
+#else*/
+    for (uint i = 0; i < count; i++) {
+        threads[i].wait();
+    }
+//#endif
+}
--- a/src/nvthread/Thread.h
+++ b/src/nvthread/Thread.h
@ -0,0 +1,46 @@
+// This code is in the public domain -- castano@gmail.com
+
+#pragma once
+#ifndef NV_THREAD_THREAD_H
+#define NV_THREAD_THREAD_H
+
+#include "nvthread.h"
+
+#include "nvcore/Ptr.h"
+
+namespace nv
+{
+    typedef void ThreadFunc(void * arg);
+
+	// Thin wrapper around a native thread (Win32 thread or pthread, see Thread.cpp).
+	class NVTHREAD_CLASS Thread
+	{
+		NV_FORBID_COPY(Thread);
+	public:
+		Thread();
+		~Thread();
+
+		void start(ThreadFunc * func, void * arg);  // Creates the thread; it calls func(arg).
+		void wait();                                // Blocks until the thread has finished.
+
+		bool isRunning() const;                     // True between start() and wait().
+
+		static void spinWait(uint count);           // Busy loop of 'count' iterations.
+		static void yield();
+		static void sleep(uint ms);
+
+        // Waits for each of the 'count' threads in turn.
+        static void wait(Thread * threads, uint count);
+	
+	private:
+
+		struct Private;
+		AutoPtr<Private> p;
+    
+    public:
+        // Set by start() and read by the native entry point in Thread.cpp.
+        ThreadFunc * func;
+        void * arg;
+
+	};
+
+} // nv namespace
+
+#endif // NV_THREAD_THREAD_H
--- a/src/nvthread/ThreadPool.cpp
+++ b/src/nvthread/ThreadPool.cpp
@ -0,0 +1,121 @@
+// This code is in the public domain -- castano@gmail.com
+
+#include "ThreadPool.h"
+#include "Mutex.h"
+#include "Thread.h"
+
+// Most of the time it's not necessary to protect the thread pool, but if it doesn't add a significant overhead, then it'd be safer to do it.
+#define PROTECT_THREAD_POOL 1
+
+
+using namespace nv;
+
+#if PROTECT_THREAD_POOL 
+// Locked by ThreadPool::acquire and unlocked by ThreadPool::release.
+// static: file-local state, as the s_ prefix suggests.
+static Mutex s_pool_mutex;
+#endif
+
+// The shared pool instance. Assigned by the ThreadPool constructor.
+static AutoPtr<ThreadPool> s_pool;
+
+
+// Returns the shared pool, creating it on first use.
+// With PROTECT_THREAD_POOL the pool mutex is locked here and stays locked until
+// release() is called, so only one user holds the pool at a time.
+/*static*/ ThreadPool * ThreadPool::acquire()
+{
+#if PROTECT_THREAD_POOL 
+    s_pool_mutex.lock();    // @@ If same thread tries to lock twice, this should assert.
+#endif
+
+    if (s_pool == NULL) {
+        // The constructor stores the new object in s_pool itself; 'p' is only used for the check.
+        ThreadPool * p = new ThreadPool;
+        nvDebugCheck(s_pool == p);
+    }
+
+    return s_pool.ptr();
+}
+
+// Counterpart of acquire(): waits for the pool to become idle and, with
+// PROTECT_THREAD_POOL, unlocks the pool mutex. The pool itself is kept alive.
+/*static*/ void ThreadPool::release(ThreadPool * pool)
+{
+    nvDebugCheck(pool == s_pool);
+
+    // Make sure the threads of the pool are idle.
+    s_pool->wait();
+
+#if PROTECT_THREAD_POOL 
+    s_pool_mutex.unlock();
+#endif
+}
+
+
+
+
+// Body of every pool thread. 'arg' carries the worker index, not a real pointer.
+// Each round: wait for the start event, run the current function, post the
+// finish event. A NULL function is the signal to exit.
+/*static*/ void ThreadPool::workerFunc(void * arg) {
+    // Go through uintptr_t: casting a 64-bit pointer straight to a 32-bit uint
+    // loses precision and is rejected by GCC.
+    uint i = (uint)(uintptr_t)arg;
+
+    while(true) 
+    {
+        s_pool->startEvents[i].wait();
+
+        if (s_pool->func == NULL) {
+            return; // @@ should we post finish event anyway?
+        }
+        
+        s_pool->func(s_pool->arg);
+
+        s_pool->finishEvents[i].post();
+    }
+}
+
+
+// Creates one worker thread, one start event and one finish event per hardware
+// thread. The workers block on their start event until start() is called.
+ThreadPool::ThreadPool() 
+{
+    s_pool = this;  // Worker threads need this to be initialized before they start.
+
+    // No function is set until start() is called.
+    func = NULL;
+    arg = NULL;
+
+    workerCount = nv::hardwareThreadCount();
+    workers = new Thread[workerCount];
+
+    startEvents = new Event[workerCount];
+    finishEvents = new Event[workerCount];
+
+    for (uint i = 0; i < workerCount; i++) {
+        // The index travels in the pointer argument; widen it to pointer size first.
+        workers[i].start(workerFunc, (void *)(uintptr_t)i);
+    }
+
+    allIdle = true;
+}
+
+// Shuts the pool down: starting a NULL function makes every worker return from
+// workerFunc, then the threads are joined and the arrays are freed.
+ThreadPool::~ThreadPool()
+{
+    // Set threads to terminate.
+    start(NULL, NULL);
+
+    // Wait until threads actually exit.
+    Thread::wait(workers, workerCount);
+
+    delete [] workers;
+    delete [] startEvents;
+    delete [] finishEvents;
+}
+
+// Makes every worker call func(arg) once. Waits for a previous run to finish
+// first; returns without waiting for the new run (use wait() for that).
+void ThreadPool::start(ThreadFunc * func, void * arg)
+{
+    // Wait until threads are idle.
+    wait();
+
+    // Set our desired function.
+    this->func = func;
+    this->arg = arg;
+
+    allIdle = false;
+
+    // Resume threads.
+    Event::post(startEvents, workerCount);
+}
+
+// Blocks until every worker has posted its finish event for the current run.
+// Returns immediately when the pool is already idle.
+void ThreadPool::wait()
+{
+    if (allIdle) {
+        return;
+    }
+
+    // Wait for threads to complete.
+    Event::wait(finishEvents, workerCount);
+    allIdle = true;
+}
--- a/src/nvthread/ThreadPool.h
+++ b/src/nvthread/ThreadPool.h
@ -0,0 +1,49 @@
+// This code is in the public domain -- castano@gmail.com
+
+#pragma once
+#ifndef NV_THREAD_THREADPOOL_H
+#define NV_THREAD_THREADPOOL_H
+
+#include "nvthread.h"
+
+#include "Event.h"
+#include "Thread.h"
+
+namespace nv {
+
+    class Thread;
+    class Event;
+
+    // Fixed set of worker threads that all run the same function when started.
+    // A single shared instance is handed out by acquire() / release().
+    class ThreadPool {
+        NV_FORBID_COPY(ThreadPool);
+    public:
+
+        static ThreadPool * acquire();
+        static void release(ThreadPool *);
+
+        ThreadPool();
+        ~ThreadPool();
+
+        void start(ThreadFunc * func, void * arg);  // Every worker calls func(arg) once.
+        void wait();                                // Blocks until all workers finished.
+
+    private:
+
+        static void workerFunc(void * arg);
+
+        uint workerCount;
+        Thread * workers;
+        Event * startEvents;    // One per worker, posted by start().
+        Event * finishEvents;   // One per worker, posted by the worker after each run.
+
+        uint allIdle;           // Used as a flag: non-zero when no run is pending.
+
+        // Current function:
+        ThreadFunc * func;
+        void * arg;
+    };
+
+} // namespace nv
+
+
+#endif // NV_THREAD_THREADPOOL_H
--- a/src/nvthread/Win32.h
+++ b/src/nvthread/Win32.h
@ -0,0 +1,9 @@
+// This code is in the public domain -- castano@gmail.com
+
+// Never include this from a header file.
+
+// Each macro is only defined when the build has not set it already, so a value
+// given on the command line is neither redefined nor lowered.
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#ifndef VC_EXTRALEAN
+#define VC_EXTRALEAN
+#endif
+#ifndef _WIN32_WINNT
+#define _WIN32_WINNT 0x0400 // for SwitchToThread, TryEnterCriticalSection
+#endif
+#include <windows.h>
+//#include <process.h> // for _beginthreadex
--- a/src/nvthread/nvthread.cpp
+++ b/src/nvthread/nvthread.cpp
@ -0,0 +1,51 @@
+
+#include "nvthread.h"
+
+#include "Thread.h"
+
+// Only pull in the system headers of the platform being built.
+#if NV_OS_WIN32
+#define WIN32_LEAN_AND_MEAN
+#define VC_EXTRALEAN
+#include <windows.h>
+#elif NV_OS_LINUX
+#include <unistd.h> // sysconf
+#elif NV_OS_DARWIN || NV_OS_FREEBSD
+#include <sys/types.h>
+#include <sys/sysctl.h> // sysctl
+#endif
+
+using namespace nv;
+
+
+// Find the number of cores in the system.
+// Based on: http://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
+// @@ Distinguish between logical and physical cores?
+// Falls back to 1 when the platform is unknown or the query fails.
+uint nv::hardwareThreadCount() {
+#if NV_OS_WIN32
+    SYSTEM_INFO sysinfo;
+    GetSystemInfo( &sysinfo );
+    return sysinfo.dwNumberOfProcessors;
+#elif NV_OS_XBOX
+    return 3; // or 6?
+#elif NV_OS_LINUX // Linux, Solaris, & AIX
+    // sysconf returns -1 on error; never report fewer than one thread.
+    long count = sysconf(_SC_NPROCESSORS_ONLN);
+    return (count < 1) ? 1 : uint(count);
+#elif NV_OS_DARWIN || NV_OS_FREEBSD
+    int numCPU = 0;     // Stays 0 when sysctl fails, which triggers the fallbacks below.
+    int mib[4];
+    size_t len = sizeof(numCPU); 
+
+    // set the mib for hw.ncpu
+    mib[0] = CTL_HW;
+    mib[1] = HW_AVAILCPU;  // alternatively, try HW_NCPU;
+
+    // get the number of CPUs from the system
+    sysctl(mib, 2, &numCPU, &len, NULL, 0);
+
+    if (numCPU < 1) {
+         mib[1] = HW_NCPU;
+         sysctl( mib, 2, &numCPU, &len, NULL, 0 );
+
+         if (numCPU < 1) {
+              return 1; // Assume single core.
+         }
+    }
+
+    return numCPU;
+#else
+    return 1; // Assume single core.
+#endif
+}
+
--- a/src/nvthread/nvthread.h
+++ b/src/nvthread/nvthread.h
@ -0,0 +1,83 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_THREAD_H
+#define NV_THREAD_H
+
+#include "nvcore/nvcore.h"
+
+// Function linkage
+#if NVTHREAD_SHARED
+#ifdef NVTHREAD_EXPORTS
+#define NVTHREAD_API DLL_EXPORT
+#define NVTHREAD_CLASS DLL_EXPORT_CLASS
+#else
+#define NVTHREAD_API DLL_IMPORT
+#define NVTHREAD_CLASS DLL_IMPORT
+#endif
+#else // NVTHREAD_SHARED
+#define NVTHREAD_API
+#define NVTHREAD_CLASS
+#endif // NVTHREAD_SHARED
+
+
+// Compiler barriers.
+// See: http://en.wikipedia.org/wiki/Memory_ordering
+#if NV_CC_MSVC
+
+#include <intrin.h>
+
+#pragma intrinsic(_WriteBarrier)
+#define nvCompilerWriteBarrier      _WriteBarrier
+
+#pragma intrinsic(_ReadWriteBarrier)
+#define nvCompilerReadWriteBarrier  _ReadWriteBarrier
+
+#if _MSC_VER >= 1400            // ReadBarrier is VC2005
+#pragma intrinsic(_ReadBarrier)
+#define nvCompilerReadBarrier       _ReadBarrier	
+#else
+#define nvCompilerReadBarrier       _ReadWriteBarrier
+#endif
+
+#elif NV_CC_GNUC
+
+#define nvCompilerReadWriteBarrier()    asm volatile("" ::: "memory");
+#define nvCompilerWriteBarrier          nvCompilerReadWriteBarrier
+#define nvCompilerReadBarrier           nvCompilerReadWriteBarrier
+
+#endif // NV_CC_MSVC
+
+
+// @@ Memory barriers / fences.
+
+// @@ Atomics.
+
+
+/* Wrap this up:
+#define YieldProcessor() __asm { rep nop }
+#define YieldProcessor _mm_pause
+#define YieldProcessor __yield
+
+BOOL WINAPI SwitchToThread(void);
+*/
+
+
+namespace nv
+{
+    // Reentrant.
+    uint hardwareThreadCount();
+
+    // Not thread-safe. Use from main thread only.
+    void initWorkers();
+    void shutWorkers();
+    void setWorkerFunction(void * func);
+
+} // nv namespace
+
+
+
+
+
+
+#endif // NV_THREAD_H
--- a/src/nvtt/CompressorDX11.cpp
+++ b/src/nvtt/CompressorDX11.cpp
@ -37,7 +37,7 @@ using namespace nv;
 using namespace nvtt;


-void CompressorBC6::compressBlock(Tile & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
+void CompressorBC6::compressBlock(ColorSet & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
 {
    NV_UNUSED(alphaMode); // ZOH does not support alpha.

@ -56,7 +56,7 @@ void CompressorBC6::compressBlock(Tile & tile, AlphaMode alphaMode, const Compre
 }


-void CompressorBC7::compressBlock(Tile & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
+void CompressorBC7::compressBlock(ColorSet & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
 {
    // @@ TODO
 }
--- a/src/nvtt/CompressorDX9.cpp
+++ b/src/nvtt/CompressorDX9.cpp
@ -481,10 +481,10 @@ void D3DXCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode

        err = surface->LockRect(&rect, NULL, D3DLOCK_READONLY);

-	if (outputOptions.outputHandler != NULL) {
-	    int size = rect.Pitch * ((h + 3) / 4);
-	    outputOptions.outputHandler->writeData(rect.pBits, size);
-	}
+	    if (outputOptions.outputHandler != NULL) {
+	        int size = rect.Pitch * ((h + 3) / 4);
+	        outputOptions.outputHandler->writeData(rect.pBits, size);
+	    }

        err = surface->UnlockRect();
    }
--- a/src/nvtt/CompressorRGB.cpp
+++ b/src/nvtt/CompressorRGB.cpp
@ -110,7 +110,7 @@ namespace
        {
            nvDebugCheck(alignment >= 1);
            flush();
-            int remainder = (size_t)ptr % alignment;
+            int remainder = (int)((uintptr_t)ptr % alignment);
            if (remainder != 0) {
                putBits(0, (alignment - remainder) * 8);
            }
--- a/src/nvtt/Context.cpp
+++ b/src/nvtt/Context.cpp
@ -349,6 +349,8 @@ bool Compressor::Private::compress(AlphaMode alphaMode, int w, int h, int d, int
        compressor->compress(alphaMode, w, h, d, rgba, dispatcher, compressionOptions, outputOptions);
    }

+    outputOptions.endImage();
+
    return true;
 }

--- a/src/nvtt/OutputOptions.cpp
+++ b/src/nvtt/OutputOptions.cpp
@ -135,6 +135,11 @@ bool OutputOptions::Private::writeData(const void * data, int size) const
    return outputHandler == NULL || outputHandler->writeData(data, size);
 }

+// Forwards the end-of-image notification to the output handler, when one is set.
+void OutputOptions::Private::endImage() const
+{
+    if (outputHandler == NULL) {
+        return;
+    }
+
+    outputHandler->endImage();
+}
+
 void OutputOptions::Private::error(Error e) const
 {
    if (errorHandler != NULL) errorHandler->error(e);
--- a/src/nvtt/OutputOptions.h
+++ b/src/nvtt/OutputOptions.h
@ -52,6 +52,11 @@ namespace nvtt
 			return true;
 		}

+		// End-of-image notification. Intentionally a no-op for this handler.
+		virtual void endImage()
+		{
+			// ignore.
+		}
+
 		nv::StdOutputStream stream;
 	};

@ -72,6 +77,7 @@ namespace nvtt

 		void beginImage(int size, int width, int height, int depth, int face, int miplevel) const;
 		bool writeData(const void * data, int size) const;
+        void endImage() const;
 		void error(Error e) const;
 	};

--- a/src/nvtt/TaskDispatcher.h
+++ b/src/nvtt/TaskDispatcher.h
@ -18,8 +18,8 @@
 // http://msdn.microsoft.com/en-us/library/dd504870.aspx
 #if NV_OS_WIN32 && _MSC_VER >= 1600
 #define HAVE_PPL 1
-//#include <array>
-#include <ppl.h>
+#include <array>
+//#include <ppl.h>
 #endif

 // Intel Thread Building Blocks (TBB).
@ -28,6 +28,8 @@
 #include <tbb/parallel_for.h>
 #endif

+#include "nvthread/ParallelFor.h"
+

 namespace nvtt {

@ -40,6 +42,15 @@ namespace nvtt {
        }
    };

+    // Task dispatcher backed by nv::ParallelFor.
+    struct ParallelTaskDispatcher : public TaskDispatcher
+    {
+        // Builds a ParallelFor from `task` and `context` and hands `count` to its run().
+        virtual void dispatch(Task * task, void * context, int count)
+        {
+            // @@ Add support for custom grain.
+            nv::ParallelFor worker(task, context);
+            worker.run(count);
+        }
+    };
+
+
 #if defined(HAVE_OPENMP)

    struct OpenMPTaskDispatcher : public TaskDispatcher
@ -81,9 +92,24 @@ namespace nvtt {

 #if defined(HAVE_PPL)

+    // Minimal iterator over consecutive ints, so that an integer range can be handed to
+    // iterator-based algorithms. Dereferencing yields the current integer.
+    class CountingIterator
+    {
+    public:
+        CountingIterator() : i(0) {}
+        // A copy must keep the current position; algorithms take iterators by value.
+        CountingIterator(const CountingIterator & rhs) : i(rhs.i) {}
+        explicit CountingIterator(int x) : i(x) {}
+
+        const int & operator*() const { return i; }
+        CountingIterator & operator++() { i++; return *this; }
+        CountingIterator & operator--() { i--; return *this; }
+
+        // Position comparison; required by algorithms to detect the end of a range.
+        bool operator==(const CountingIterator & rhs) const { return i == rhs.i; }
+        bool operator!=(const CountingIterator & rhs) const { return i != rhs.i; }
+
+    private:
+        int i;
+    };
+
    struct TaskFunctor {
        TaskFunctor(Task * task, void * context) : task(task), context(context) {}
-        void operator()(int n) const {
+        void operator()(int & n) const {
            task(context, n);
        }
        Task * task;
@ -95,12 +121,16 @@ namespace nvtt {
    {
        virtual void dispatch(Task * task, void * context, int count)
        {
+            CountingIterator begin(0);
+            CountingIterator end((int)count);
            TaskFunctor func(task, context);
-            Concurrency::parallel_for(0, count, func);
+
+            std::for_each(begin, end, func);
+            //parallel_for_each(begin, end, func);
        }
    };

-#endif // HAVE_PPL
+#endif

 #if defined(HAVE_TBB)

@ -132,7 +162,8 @@ namespace nvtt {
 #elif defined(HAVE_GCD)
    typedef AppleTaskDispatcher         ConcurrentTaskDispatcher;
 #else
-    typedef SequentialTaskDispatcher    ConcurrentTaskDispatcher;
+    //typedef SequentialTaskDispatcher    ConcurrentTaskDispatcher;
+    typedef ParallelTaskDispatcher        ConcurrentTaskDispatcher;
 #endif

 } // namespace nvtt
--- a/src/nvtt/TexImage.cpp
+++ b/src/nvtt/TexImage.cpp
@ -615,7 +615,7 @@ bool TexImage::setImage2D(Format format, Decoder decoder, int w, int h, const vo
 			    block->decodeBlock(&colors, false);
 		    }
 		    else if (decoder == Decoder_NV5x) {
-			block->decodeBlockNV5x(&colors);
+			    block->decodeBlockNV5x(&colors);
 		    }
 		}
 		else if (format == nvtt::Format_BC3)
@ -629,19 +629,19 @@ bool TexImage::setImage2D(Format format, Decoder decoder, int w, int h, const vo
 			    block->decodeBlock(&colors, false);
 		    }
 		    else if (decoder == Decoder_NV5x) {
-			block->decodeBlockNV5x(&colors);
+			    block->decodeBlockNV5x(&colors);
 		    }
 		}
 		else if (format == nvtt::Format_BC4)
 		{
-                    const BlockATI1 * block = (const BlockATI1 *)ptr;
-                    block->decodeBlock(&colors, decoder == Decoder_D3D9);
-                }
-                else if (format == nvtt::Format_BC5)
-                {
-                    const BlockATI2 * block = (const BlockATI2 *)ptr;
-                    block->decodeBlock(&colors, decoder == Decoder_D3D9);
-                }
+            const BlockATI1 * block = (const BlockATI1 *)ptr;
+            block->decodeBlock(&colors, decoder == Decoder_D3D9);
+        }
+        else if (format == nvtt::Format_BC5)
+        {
+            const BlockATI2 * block = (const BlockATI2 *)ptr;
+            block->decodeBlock(&colors, decoder == Decoder_D3D9);
+        }

 		for (int yy = 0; yy < 4; yy++)
 		{
@ -864,6 +864,42 @@ bool TexImage::buildNextMipmap(MipmapFilter filter, float filterWidth, const flo
    return true;
 }

+// Changes the image extents to w x h x d without rescaling. The region shared by the old
+// and the new extents is copied; the rest of the new image keeps whatever
+// FloatImage::clear() left there. Does nothing when there is no image or when the
+// extents already match.
+void TexImage::canvasSize(int w, int h, int d)
+{
+    nvDebugCheck(w > 0 && h > 0 && d > 0);
+
+    if (m->image == NULL) {
+        return;
+    }
+    if (w == m->image->width() && h == m->image->height() && d == m->image->depth()) {
+        return;
+    }
+
+    detach();
+
+    // Read the image pointer only after detach().
+    // NOTE(review): detach() presumably may swap in a private copy of the data - confirm.
+    FloatImage * img = m->image;
+
+    FloatImage * new_img = new FloatImage;
+    new_img->allocate(4, w, h, d);
+    new_img->clear();
+
+    // Extents of the region present in both images. Kept apart from w/h/d so that the
+    // requested size is still available when the texture type is chosen below.
+    const int copy_w = min(uint(w), img->width());
+    const int copy_h = min(uint(h), img->height());
+    const int copy_d = min(uint(d), img->depth());
+
+    for (int z = 0; z < copy_d; z++) {
+        for (int y = 0; y < copy_h; y++) {
+            for (int x = 0; x < copy_w; x++) {
+                for (int c = 0; c < 4; c++) {
+                    new_img->pixel(c, x, y, z) = img->pixel(c, x, y, z);
+                }
+            }
+        }
+    }
+
+    delete m->image;
+    m->image = new_img;
+    // The type follows the depth of the new image, not the clamped copy depth.
+    m->type = (d == 1) ? TextureType_2D : TextureType_3D;
+}
+
+
 // Color transforms.
 void TexImage::toLinear(float gamma)
 {
@ -885,6 +921,66 @@ void TexImage::toGamma(float gamma)
    m->image->toGamma(0, 3, gamma);
 }

+
+// Maps a linear value to the sRGB curve, clamping the result to [0, 1].
+// Values that fail every comparison (NaN) fall through to 1.
+static float toSrgb(float f) {
+    if (f <= 0.0f) {
+        return 0.0f;
+    }
+    if (f <= 0.0031308f) {
+        // Linear segment near black.
+        return 12.92f * f;
+    }
+    if (f <= 1.0f) {
+        // Power segment.
+        return (powf(f, 0.41666f) * 1.055f) - 0.055f;
+    }
+    return 1.0f;
+}
+
+// Applies the sRGB transfer curve to the R, G and B channels of every pixel, in place.
+// Alpha (channel 3) is left untouched. Does nothing when there is no image.
+void TexImage::toSrgb()
+{
+    if (m->image == NULL) return;
+
+    detach();
+
+    // Fetch the image only after detach(), so the writes below go to this object's data.
+    // NOTE(review): detach() presumably may swap in a private copy of the data - confirm.
+    FloatImage * img = m->image;
+
+    const uint count = img->pixelCount();
+    for (uint j = 0; j < count; j++)
+    {
+        float & r = img->pixel(0, j);
+        float & g = img->pixel(1, j);
+        float & b = img->pixel(2, j);
+
+        // The leading :: selects the file-level helper rather than this member.
+        r = ::toSrgb(r);
+        g = ::toSrgb(g);
+        b = ::toSrgb(b);
+    }
+}
+
+// Piecewise-linear transfer curve with four segments, clamped to [0, 1].
+// Values that fail every comparison (NaN) fall through to 1.
+static float toXenonSrgb(float f) {
+    if (f < 0) {
+        return 0;
+    }
+    if (f < (1.0f/16.0f)) {
+        return 4.0f * f;
+    }
+    if (f < (1.0f/8.0f)) {
+        return 0.25f  + 2.0f * (f - 0.0625f);
+    }
+    if (f < 0.5f) {
+        return 0.375f + 1.0f * (f - 0.125f);
+    }
+    if (f < 1.0f) {
+        return 0.75f  + 0.5f * (f - 0.50f);
+    }
+    return 1.0f;
+}
+
+// Applies the piecewise-linear Xenon sRGB curve to the R, G and B channels of every
+// pixel, in place. Alpha (channel 3) is left untouched. Does nothing when there is no image.
+void TexImage::toXenonSrgb()
+{
+    if (m->image == NULL) return;
+
+    detach();
+
+    // Fetch the image only after detach(), so the writes below go to this object's data.
+    // NOTE(review): detach() presumably may swap in a private copy of the data - confirm.
+    FloatImage * img = m->image;
+
+    const uint count = img->pixelCount();
+    for (uint j = 0; j < count; j++)
+    {
+        float & r = img->pixel(0, j);
+        float & g = img->pixel(1, j);
+        float & b = img->pixel(2, j);
+
+        // The leading :: selects the file-level helper rather than this member.
+        r = ::toXenonSrgb(r);
+        g = ::toXenonSrgb(g);
+        b = ::toXenonSrgb(b);
+    }
+}
+
+
 void TexImage::transform(const float w0[4], const float w1[4], const float w2[4], const float w3[4], const float offset[4])
 {
    if (m->image == NULL) return;
@ -1140,9 +1236,9 @@ void TexImage::toRGBM(float range/*= 1*/, float threshold/*= 0.25*/)

    const uint count = img->pixelCount();
    for (uint i = 0; i < count; i++) {
-        float R = nv::clamp(r[i] * irange, 0.0f, 1.0f);
-        float G = nv::clamp(g[i] * irange, 0.0f, 1.0f);
-        float B = nv::clamp(b[i] * irange, 0.0f, 1.0f);
+        float R = nv::clamp(r[i], 0.0f, 1.0f);
+        float G = nv::clamp(g[i], 0.0f, 1.0f);
+        float B = nv::clamp(b[i], 0.0f, 1.0f);
 #if 1
        float M = max(max(R, G), max(B, threshold));

--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@ -294,6 +294,9 @@ namespace nvtt

        /// Output data. Compressed data is output as soon as it's generated to minimize memory allocations.
        virtual bool writeData(const void * data, int size) = 0;
+
+        /// Indicate the end of the compressed image.
+        virtual void endImage() = 0;
    };

    /// Error codes.
@ -440,10 +443,13 @@ namespace nvtt
        NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter, float filterWidth, const float * params = 0);
        NVTT_API bool buildNextMipmap(MipmapFilter filter);
        NVTT_API bool buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params = 0);
+        NVTT_API void canvasSize(int w, int h, int d);

        // Color transforms.
        NVTT_API void toLinear(float gamma);
        NVTT_API void toGamma(float gamma);
+        NVTT_API void toSrgb();
+        NVTT_API void toXenonSrgb();
        NVTT_API void transform(const float w0[4], const float w1[4], const float w2[4], const float w3[4], const float offset[4]);
        NVTT_API void swizzle(int r, int g, int b, int a);
        NVTT_API void scaleBias(int channel, float scale, float bias);
--- a/src/nvtt/tools/compress.cpp
+++ b/src/nvtt/tools/compress.cpp
@ -56,6 +56,11 @@ struct MyOutputHandler : public nvtt::OutputHandler
        // ignore.
    }

+    // End-of-image notification. Nothing to do for this handler.
+    virtual void endImage()
+    {
+        // Ignore.
+    }
+
    // Output data.
    virtual bool writeData(const void * data, int size)
    {