From 04bdc76749793641395f26e3bd2005ed9dce2f74 Mon Sep 17 00:00:00 2001 From: castano Date: Fri, 20 Jul 2012 16:32:26 +0000 Subject: [PATCH] Merge changes from The Witness. --- src/nvcore/Array.inl | 16 ++- src/nvcore/Debug.cpp | 192 +++++++++++++++++------------------ src/nvcore/Debug.h | 2 + src/nvcore/Utils.h | 9 +- src/nvimage/FloatImage.cpp | 58 ++++++++++- src/nvimage/FloatImage.h | 2 +- src/nvmath/Half.cpp | 77 +++++++++++++- src/nvmath/Half.h | 4 + src/nvmath/Matrix.h | 1 + src/nvmath/Matrix.inl | 13 +++ src/nvthread/ParallelFor.cpp | 2 +- src/nvtt/ClusterFit.cpp | 4 +- src/nvtt/CompressorDX9.cpp | 28 ++--- src/nvtt/CompressorDX9.h | 2 +- src/nvtt/CompressorRGB.cpp | 2 +- 15 files changed, 284 insertions(+), 128 deletions(-) diff --git a/src/nvcore/Array.inl b/src/nvcore/Array.inl index 6a7236e..fefa19e 100755 --- a/src/nvcore/Array.inl +++ b/src/nvcore/Array.inl @@ -290,11 +290,23 @@ namespace nv template NV_FORCEINLINE void Array::copy(const T * data, uint count) { - destroy_range(m_buffer, count, m_size); +#if 1 // More simple, but maybe not be as efficient? + destroy_range(m_buffer, 0, m_size); setArraySize(count); - ::nv::copy(m_buffer, data, count); + construct_range(m_buffer, count, 0, data); +#else + const uint old_size = m_size; + + destroy_range(m_buffer, count, old_size); + + setArraySize(count); + + copy_range(m_buffer, data, old_size); + + construct_range(m_buffer, count, old_size, data); +#endif } // Assignment operator. diff --git a/src/nvcore/Debug.cpp b/src/nvcore/Debug.cpp index a662820..9c852d1 100644 --- a/src/nvcore/Debug.cpp +++ b/src/nvcore/Debug.cpp @@ -172,48 +172,53 @@ namespace return false; } - MINIDUMP_EXCEPTION_INFORMATION ExInfo; - ExInfo.ThreadId = ::GetCurrentThreadId(); - ExInfo.ExceptionPointers = pExceptionInfo; - ExInfo.ClientPointers = NULL; - - MINIDUMP_CALLBACK_INFORMATION callback; - MINIDUMP_CALLBACK_INFORMATION * callback_pointer = NULL; - MinidumpCallbackContext context; - - // Find a memory region of 256 bytes centered on the - // faulting instruction pointer. - const ULONG64 instruction_pointer = - #if defined(_M_IX86) - pExceptionInfo->ContextRecord->Eip; - #elif defined(_M_AMD64) - pExceptionInfo->ContextRecord->Rip; - #else - #error Unsupported platform - #endif - - MEMORY_BASIC_INFORMATION info; - - if (VirtualQuery(reinterpret_cast(instruction_pointer), &info, sizeof(MEMORY_BASIC_INFORMATION)) != 0 && info.State == MEM_COMMIT) - { - // Attempt to get 128 bytes before and after the instruction - // pointer, but settle for whatever's available up to the - // boundaries of the memory region. 
- const ULONG64 kIPMemorySize = 256; - context.memory_base = max(reinterpret_cast(info.BaseAddress), instruction_pointer - (kIPMemorySize / 2)); - ULONG64 end_of_range = min(instruction_pointer + (kIPMemorySize / 2), reinterpret_cast(info.BaseAddress) + info.RegionSize); - context.memory_size = static_cast(end_of_range - context.memory_base); - context.finished = false; - - callback.CallbackRoutine = miniDumpWriteDumpCallback; - callback.CallbackParam = reinterpret_cast(&context); - callback_pointer = &callback; + MINIDUMP_EXCEPTION_INFORMATION * pExInfo = NULL; + MINIDUMP_CALLBACK_INFORMATION * pCallback = NULL; + + if (pExceptionInfo != NULL) { + MINIDUMP_EXCEPTION_INFORMATION ExInfo; + ExInfo.ThreadId = ::GetCurrentThreadId(); + ExInfo.ExceptionPointers = pExceptionInfo; + ExInfo.ClientPointers = NULL; + pExInfo = &ExInfo; + + MINIDUMP_CALLBACK_INFORMATION callback; + MinidumpCallbackContext context; + + // Find a memory region of 256 bytes centered on the + // faulting instruction pointer. + const ULONG64 instruction_pointer = + #if defined(_M_IX86) + pExceptionInfo->ContextRecord->Eip; + #elif defined(_M_AMD64) + pExceptionInfo->ContextRecord->Rip; + #else + #error Unsupported platform + #endif + + MEMORY_BASIC_INFORMATION info; + + if (VirtualQuery(reinterpret_cast(instruction_pointer), &info, sizeof(MEMORY_BASIC_INFORMATION)) != 0 && info.State == MEM_COMMIT) + { + // Attempt to get 128 bytes before and after the instruction + // pointer, but settle for whatever's available up to the + // boundaries of the memory region. + const ULONG64 kIPMemorySize = 256; + context.memory_base = max(reinterpret_cast(info.BaseAddress), instruction_pointer - (kIPMemorySize / 2)); + ULONG64 end_of_range = min(instruction_pointer + (kIPMemorySize / 2), reinterpret_cast(info.BaseAddress) + info.RegionSize); + context.memory_size = static_cast(end_of_range - context.memory_base); + context.finished = false; + + callback.CallbackRoutine = miniDumpWriteDumpCallback; + callback.CallbackParam = reinterpret_cast(&context); + pCallback = &callback; + } } MINIDUMP_TYPE miniDumpType = (MINIDUMP_TYPE)(MiniDumpNormal|MiniDumpWithHandleData|MiniDumpWithThreadInfo); // write the dump - BOOL ok = MiniDumpWriteDump(GetCurrentProcess(), GetCurrentProcessId(), hFile, miniDumpType, &ExInfo, NULL, callback_pointer) != 0; + BOOL ok = MiniDumpWriteDump(GetCurrentProcess(), GetCurrentProcessId(), hFile, miniDumpType, pExInfo, NULL, pCallback) != 0; CloseHandle(hFile); if (ok == FALSE) { @@ -402,9 +407,8 @@ namespace // Write mini dump and print stack trace. static LONG WINAPI handleException(EXCEPTION_POINTERS * pExceptionInfo) { -#if USE_SEPARATE_THREAD EnterCriticalSection(&s_handler_critical_section); - +#if USE_SEPARATE_THREAD s_requesting_thread_id = GetCurrentThreadId(); s_exception_info = pExceptionInfo; @@ -418,12 +422,11 @@ namespace // Clean up. s_requesting_thread_id = 0; s_exception_info = NULL; - - LeaveCriticalSection(&s_handler_critical_section); #else // First of all, write mini dump. writeMiniDump(pExceptionInfo); #endif + LeaveCriticalSection(&s_handler_critical_section); nvDebug("\nDump file saved.\n"); @@ -454,62 +457,21 @@ namespace fclose(fp); } - return EXCEPTION_EXECUTE_HANDLER; // Terminate app. + // This should terminate the process and set the error exit code. + TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 2); + + return EXCEPTION_EXECUTE_HANDLER; // Terminate app. In case terminate process did not succeed. 
} - /*static void handlePureVirtualCall() { - // This is an pure virtual function call, not an exception. It's safe to - // play with sprintf here. - AutoExceptionHandler auto_exception_handler; - ExceptionHandler* current_handler = auto_exception_handler.get_handler(); - - MDRawAssertionInfo assertion; - memset(&assertion, 0, sizeof(assertion)); - assertion.type = MD_ASSERTION_INFO_TYPE_PURE_VIRTUAL_CALL; - - // Make up an exception record for the current thread and CPU context - // to make it possible for the crash processor to classify these - // as do regular crashes, and to make it humane for developers to - // analyze them. - EXCEPTION_RECORD exception_record = {}; - CONTEXT exception_context = {}; - EXCEPTION_POINTERS exception_ptrs = { &exception_record, &exception_context }; - - ::RtlCaptureContext(&exception_context); - - exception_record.ExceptionCode = STATUS_NONCONTINUABLE_EXCEPTION; - - // We store pointers to the the expression and function strings, - // and the line as exception parameters to make them easy to - // access by the developer on the far side. - exception_record.NumberParameters = 3; - exception_record.ExceptionInformation[0] = reinterpret_cast(&assertion.expression); - exception_record.ExceptionInformation[1] = reinterpret_cast(&assertion.file); - exception_record.ExceptionInformation[2] = assertion.line; - - bool success = false; - // In case of out-of-process dump generation, directly call - // WriteMinidumpWithException since there is no separate thread running. - - success = current_handler->WriteMinidumpOnHandlerThread(&exception_ptrs, &assertion); - - if (!success) { - if (current_handler->previous_pch_) { - // The handler didn't fully handle the exception. Give it to the - // previous purecall handler. - current_handler->previous_pch_(); - else { - // If there's no previous handler, return and let _purecall handle it. - // This will just put up an assertion dialog. - return; - } - } + static void handlePureVirtualCall() { + nvDebugBreak(); + TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 8); + } - // The handler either took care of the invalid parameter problem itself, - // or passed it on to another handler. "Swallow" it by exiting, paralleling - // the behavior of "swallowing" exceptions. - exit(0); - }*/ + static void handleInvalidParameter(const wchar_t * expresion, const wchar_t * function, const wchar_t * file, unsigned int line, uintptr_t reserved) { + nvDebugBreak(); + TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 8); + } #elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H) // NV_OS_LINUX || NV_OS_DARWIN @@ -755,8 +717,8 @@ namespace } if (ret == NV_ABORT_EXIT) { - // Exit cleanly. - throw "Assertion failed"; + // Exit cleanly. + exit(EXIT_FAILURE + 1); } return ret; @@ -788,7 +750,7 @@ namespace if( ret == NV_ABORT_EXIT ) { // Exit cleanly. - throw "Assertion failed"; + exit(EXIT_FAILURE + 1); } return ret; @@ -825,7 +787,7 @@ namespace #endif // Exit cleanly. - throw "Assertion failed"; + exit(EXIT_FAILURE + 1); } }; @@ -853,6 +815,38 @@ int nvAbort(const char * exp, const char * file, int line, const char * func/*=N } } +// Abnormal termination. Create mini dump and output call stack. +void debug::terminate(int code) +{ + EnterCriticalSection(&s_handler_critical_section); + + writeMiniDump(NULL); + + const int max_stack_size = 64; + void * trace[max_stack_size]; + int size = backtrace(trace, max_stack_size); + + // @@ Use win32's CreateFile? 
+ FILE * fp = fileOpen("crash.txt", "wb"); + if (fp != NULL) { + Array lines; + writeStackTrace(trace, size, 0, lines); + + for (uint i = 0; i < lines.count(); i++) { + fputs(lines[i], fp); + delete lines[i]; + } + + // @@ Add more info to crash.txt? + + fclose(fp); + } + + LeaveCriticalSection(&s_handler_critical_section); + + exit(code); +} + /// Shows a message through the message handler. void NV_CDECL nvDebugPrint(const char *msg, ...) @@ -987,13 +981,11 @@ void debug::enableSigHandler(bool interactive) s_old_exception_filter = ::SetUnhandledExceptionFilter( handleException ); - /* #if _MSC_VER >= 1400 // MSVC 2005/8 _set_invalid_parameter_handler(handleInvalidParameter); #endif // _MSC_VER >= 1400 _set_purecall_handler(handlePureVirtualCall); - */ // SYMOPT_DEFERRED_LOADS make us not take a ton of time unless we actual log traces diff --git a/src/nvcore/Debug.h b/src/nvcore/Debug.h index 0a9e468..f0f73d9 100644 --- a/src/nvcore/Debug.h +++ b/src/nvcore/Debug.h @@ -197,6 +197,8 @@ namespace nv NVCORE_API bool isDebuggerPresent(); NVCORE_API bool attachToDebugger(); + + NVCORE_API void terminate(int code); } } // nv namespace diff --git a/src/nvcore/Utils.h b/src/nvcore/Utils.h index 61cb7ea..cef87ed 100644 --- a/src/nvcore/Utils.h +++ b/src/nvcore/Utils.h @@ -207,6 +207,13 @@ namespace nv } } + template + void construct_range(T * restrict ptr, uint new_size, uint old_size, const T * src) { + for (uint i = old_size; i < new_size; i++) { + new(ptr+i) T(src[i]); // placement new + } + } + template void destroy_range(T * restrict ptr, uint new_size, uint old_size) { for (uint i = new_size; i < old_size; i++) { @@ -223,7 +230,7 @@ namespace nv } template - void copy(T * restrict dst, const T * restrict src, uint count) { + void copy_range(T * restrict dst, const T * restrict src, uint count) { for (uint i = 0; i < count; i++) { dst[i] = src[i]; } diff --git a/src/nvimage/FloatImage.cpp b/src/nvimage/FloatImage.cpp index c91da6b..3b6c070 100644 --- a/src/nvimage/FloatImage.cpp +++ b/src/nvimage/FloatImage.cpp @@ -1338,7 +1338,7 @@ void FloatImage::flipZ() -float FloatImage::alphaTestCoverage(float alphaRef, int alphaChannel) const +float FloatImage::alphaTestCoverage(float alphaRef, int alphaChannel, float alphaScale/*=1*/) const { const uint w = m_width; const uint h = m_height; @@ -1347,16 +1347,41 @@ float FloatImage::alphaTestCoverage(float alphaRef, int alphaChannel) const const float * alpha = channel(alphaChannel); +#if 0 const uint count = m_pixelCount; for (uint i = 0; i < count; i++) { if (alpha[i] > alphaRef) coverage += 1.0f; // @@ gt or lt? 
} - + return coverage / float(w * h); +#else + const uint n = 8; + + // If we want subsampling: + for (uint y = 0; y < h-1; y++) { + for (uint x = 0; x < w-1; x++) { + + float alpha00 = nv::saturate(pixel(alphaChannel, x+0, y+0, 0) * alphaScale); + float alpha10 = nv::saturate(pixel(alphaChannel, x+1, y+0, 0) * alphaScale); + float alpha01 = nv::saturate(pixel(alphaChannel, x+0, y+1, 0) * alphaScale); + float alpha11 = nv::saturate(pixel(alphaChannel, x+1, y+1, 0) * alphaScale); + + for (float fy = 0.5f/n; fy < 1.0f; fy++) { + for (float fx = 0.5f/n; fx < 1.0f; fx++) { + float alpha = alpha00 * (1 - fx) * (1 - fy) + alpha10 * fx * (1 - fy) + alpha01 * (1 - fx) * fy + alpha11 * fx * fy; + if (alpha > alphaRef) coverage += 1.0f; + } + } + } + } + + return coverage / float(w * h * n * n); +#endif } void FloatImage::scaleAlphaToCoverage(float desiredCoverage, float alphaRef, int alphaChannel) { +#if 0 float minAlphaRef = 0.0f; float maxAlphaRef = 1.0f; float midAlphaRef = 0.5f; @@ -1383,8 +1408,35 @@ void FloatImage::scaleAlphaToCoverage(float desiredCoverage, float alphaRef, int // Scale alpha channel. scaleBias(alphaChannel, 1, alphaScale, 0.0f); clamp(alphaChannel, 1, 0.0f, 1.0f); +#else + float minAlphaScale = 0.0f; + float maxAlphaScale = 4.0f; + float alphaScale = 1.0f; + + // Determine desired scale using a binary search. Hardcoded to 8 steps max. + for (int i = 0; i < 10; i++) { + float currentCoverage = alphaTestCoverage(alphaRef, alphaChannel, alphaScale); + + if (currentCoverage < desiredCoverage) { + minAlphaScale = alphaScale; + } + else if (currentCoverage > desiredCoverage) { + maxAlphaScale = alphaScale; + } + else { + break; + } - //float newCoverage = alphaTestCoverage(alphaRef, alphaChannel); + alphaScale = (minAlphaScale + maxAlphaScale) * 0.5f; + } + + // Scale alpha channel. + scaleBias(alphaChannel, 1, alphaScale, 0.0f); + clamp(alphaChannel, 1, 0.0f, 1.0f); +#endif +#if _DEBUG + float newCoverage = alphaTestCoverage(alphaRef, alphaChannel); +#endif } FloatImage* FloatImage::clone() const diff --git a/src/nvimage/FloatImage.h b/src/nvimage/FloatImage.h index 39085c8..10a236f 100644 --- a/src/nvimage/FloatImage.h +++ b/src/nvimage/FloatImage.h @@ -103,7 +103,7 @@ namespace nv NVIMAGE_API void flipY(); NVIMAGE_API void flipZ(); - NVIMAGE_API float alphaTestCoverage(float alphaRef, int alphaChannel) const; + NVIMAGE_API float alphaTestCoverage(float alphaRef, int alphaChannel, float alphaScale = 1.0f) const; NVIMAGE_API void scaleAlphaToCoverage(float coverage, float alphaRef, int alphaChannel); diff --git a/src/nvmath/Half.cpp b/src/nvmath/Half.cpp index 7bd9122..d73f64a 100644 --- a/src/nvmath/Half.cpp +++ b/src/nvmath/Half.cpp @@ -76,6 +76,10 @@ #include "Half.h" #include +#if NV_CC_GNUC +#include +#endif + // Load immediate static inline uint32 _uint32_li( uint32 a ) { @@ -488,10 +492,79 @@ nv::half_to_float( uint16 h ) } -// @@ This code appears to be wrong. 
+static __m128 half_to_float4_SSE2(__m128i h) +{ +#define SSE_CONST4(name, val) static const __declspec(align(16)) uint name[4] = { (val), (val), (val), (val) } +#define CONST(name) *(const __m128i *)&name + + SSE_CONST4(mask_nosign, 0x7fff); + SSE_CONST4(mask_justsign, 0x8000); + SSE_CONST4(mask_shifted_exp, 0x7c00 << 13); + SSE_CONST4(expadjust_normal, (127 - 15) << 23); + SSE_CONST4(expadjust_infnan, (128 - 16) << 23); + SSE_CONST4(expadjust_denorm, 1 << 23); + SSE_CONST4(magic_denorm, 113 << 23); + + __m128i mnosign = CONST(mask_nosign); + __m128i expmant = _mm_and_si128(mnosign, h); + __m128i justsign = _mm_and_si128(h, CONST(mask_justsign)); + __m128i mshiftexp = CONST(mask_shifted_exp); + __m128i eadjust = CONST(expadjust_normal); + __m128i shifted = _mm_slli_epi32(expmant, 13); + __m128i adjusted = _mm_add_epi32(eadjust, shifted); + __m128i justexp = _mm_and_si128(shifted, mshiftexp); + + __m128i zero = _mm_setzero_si128(); + __m128i b_isinfnan = _mm_cmpeq_epi32(mshiftexp, justexp); + __m128i b_isdenorm = _mm_cmpeq_epi32(zero, justexp); + + __m128i adj_infnan = _mm_and_si128(b_isinfnan, CONST(expadjust_infnan)); + __m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan); + + __m128i adj_den = CONST(expadjust_denorm); + __m128i den1 = _mm_add_epi32(adj_den, adjusted2); + __m128 den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm); + __m128 adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm)); + __m128 adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2)); + __m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4); + __m128i sign = _mm_slli_epi32(justsign, 16); + __m128 final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign)); + + // ~21 SSE2 ops. + return final; + +#undef SSE_CONST4 +#undef CONST +} + + +void nv::half_to_float_array(const uint16 * vin, float * vout, int count) { + nvDebugCheck((intptr_t(vin) & 15) == 0); + nvDebugCheck((intptr_t(vout) & 15) == 0); + nvDebugCheck((count & 7) == 0); + + __m128i zero = _mm_setzero_si128(); + + for (int i = 0; i < count; i += 8) + { + __m128i in = _mm_loadu_si128((const __m128i *)(vin + i)); + __m128i a = _mm_unpacklo_epi16(in, zero); + __m128i b = _mm_unpackhi_epi16(in, zero); + + __m128 outa = half_to_float4_SSE2(a); + _mm_storeu_ps((float *)(vout + i), outa); + + __m128 outb = half_to_float4_SSE2(b); + _mm_storeu_ps((float *)(vout + i + 4), outb); + } +} + + + + // @@ These tables could be smaller. namespace nv { - uint32 mantissa_table[2048]; + uint32 mantissa_table[2048] = { 0xDEADBEEF }; uint32 exponent_table[64]; uint32 offset_table[64]; } diff --git a/src/nvmath/Half.h b/src/nvmath/Half.h index 53fceb6..962767a 100644 --- a/src/nvmath/Half.h +++ b/src/nvmath/Half.h @@ -9,6 +9,9 @@ namespace nv { uint32 half_to_float( uint16 h ); uint16 half_from_float( uint32 f ); + // vin,vout must be 16 byte aligned. count must be a multiple of 8. + void half_to_float_array(const uint16 * vin, float * vout, int count); + void half_init_tables(); extern uint32 mantissa_table[2048]; @@ -19,6 +22,7 @@ namespace nv { // http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf inline uint32 fast_half_to_float(uint16 h) { + nvDebugCheck(mantissa_table[0] == 0); // Make sure table was initialized. 
uint exp = h >> 10; return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp]; } diff --git a/src/nvmath/Matrix.h b/src/nvmath/Matrix.h index 8f67c0f..5ccfaf5 100644 --- a/src/nvmath/Matrix.h +++ b/src/nvmath/Matrix.h @@ -62,6 +62,7 @@ namespace nv Matrix(); explicit Matrix(float f); explicit Matrix(identity_t); + Matrix(const Matrix3 & m); Matrix(const Matrix & m); Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3); //explicit Matrix(const float m[]); // m is assumed to contain 16 elements diff --git a/src/nvmath/Matrix.inl b/src/nvmath/Matrix.inl index 4843643..b9460a2 100644 --- a/src/nvmath/Matrix.inl +++ b/src/nvmath/Matrix.inl @@ -250,6 +250,19 @@ namespace nv } } + inline Matrix::Matrix(const Matrix3 & m) + { + for(int i = 0; i < 3; i++) { + for(int j = 0; j < 3; j++) { + operator()(i, j) = m.get(i, j); + } + } + for(int i = 0; i < 4; i++) { + operator()(3, i) = 0; + operator()(i, 3) = 0; + } + } + inline Matrix::Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3) { m_data[ 0] = v0.x; m_data[ 1] = v0.y; m_data[ 2] = v0.z; m_data[ 3] = v0.w; diff --git a/src/nvthread/ParallelFor.cpp b/src/nvthread/ParallelFor.cpp index 9ab8144..9632414 100644 --- a/src/nvthread/ParallelFor.cpp +++ b/src/nvthread/ParallelFor.cpp @@ -16,7 +16,7 @@ using namespace nv; #define ENABLE_PARALLEL_FOR 0 #endif -void worker(void * arg) { +static void worker(void * arg) { ParallelFor * owner = (ParallelFor *)arg; while(true) { diff --git a/src/nvtt/ClusterFit.cpp b/src/nvtt/ClusterFit.cpp index a8e08a4..ebe6ac1 100644 --- a/src/nvtt/ClusterFit.cpp +++ b/src/nvtt/ClusterFit.cpp @@ -92,8 +92,8 @@ void ClusterFit::setColourSet(const ColorSet * set) { int p = order[i]; #if NVTT_USE_SIMD - NV_ALIGN_16 Vector4 tmp(values[p] * set->weights[p], set->weights[p]); - m_weighted[i] = SimdVector(tmp.component); + NV_ALIGN_16 Vector4 tmp(values[p], 1); + m_weighted[i] = SimdVector(tmp.component) * SimdVector(set->weights[p]); m_xxsum += m_weighted[i] * m_weighted[i]; m_xsum += m_weighted[i]; #else diff --git a/src/nvtt/CompressorDX9.cpp b/src/nvtt/CompressorDX9.cpp index 53f979c..f5446a0 100644 --- a/src/nvtt/CompressorDX9.cpp +++ b/src/nvtt/CompressorDX9.cpp @@ -40,6 +40,7 @@ #include "nvimage/BlockDXT.h" #include "nvmath/Vector.inl" +#include "nvmath/Color.inl" #include "nvcore/Memory.h" @@ -111,18 +112,15 @@ void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alpha QuickCompress::compressDXT5(rgba, block); } -#if 1 +#if 0 void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) { set.setUniformWeights(); - set.createMinimalSet(false); - - ClusterFit fit; - fit.setMetric(compressionOptions.colorWeight); + set.createMinimalSet(/*ignoreTransparent*/false); BlockDXT1 * block = new(output) BlockDXT1; - if (set.isSingleColor(true)) + if (set.isSingleColor(/*ignoreAlpha*/true)) { Color32 c; c.r = uint8(clamp(set.colors[0].x, 0.0f, 1.0f) * 255); @@ -133,16 +131,19 @@ void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, co } else { + ClusterFit fit; + fit.setMetric(compressionOptions.colorWeight); fit.setColourSet(&set); Vector3 start, end; - fit.compress4(&start, &end); - QuickCompress::outputBlock4(set, start, end, block); if (fit.compress3(&start, &end)) { QuickCompress::outputBlock3(set, start, end, block); } + else { + QuickCompress::outputBlock4(set, start, end, block); + } } } #else @@ -219,16 +220,15 @@ void 
CompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, nvsquish::WeightedClusterFit fit; fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z); - int flags = 0; - if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + int flags = 0; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; - nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); - fit.SetColourSet(&colours, 0); - fit.Compress(&block->color); + nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); + fit.SetColourSet(&colours, 0); + fit.Compress(&block->color); } } - void CompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) { BlockDXT5 * block = new(output) BlockDXT5; diff --git a/src/nvtt/CompressorDX9.h b/src/nvtt/CompressorDX9.h index 8e65ba6..2a6e6fe 100644 --- a/src/nvtt/CompressorDX9.h +++ b/src/nvtt/CompressorDX9.h @@ -64,7 +64,7 @@ namespace nv // Normal CPU compressors. -#if 1 +#if 0 struct CompressorDXT1 : public ColorSetCompressor { virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); diff --git a/src/nvtt/CompressorRGB.cpp b/src/nvtt/CompressorRGB.cpp index 130f5ce..9a52d16 100644 --- a/src/nvtt/CompressorRGB.cpp +++ b/src/nvtt/CompressorRGB.cpp @@ -310,7 +310,7 @@ void PixelFormatConverter::compress(nvtt::AlphaMode /*alphaMode*/, uint w, uint { for (uint y = 0; y < h; y++) { - const float * src = (const float *)data + y * w; + const float * src = (const float *)data + (z * h + y) * w; BitStream stream(dst);
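
Notes on the main changes, with small usage sketches. Anything not visible in the patch above (helper names, sample values, exit codes) is an assumption and is marked as such.

Array<T>::copy() now destroys the existing elements and placement-constructs copies of the source range via the new construct_range() helper in Utils.h. A compressed sketch of that sequence on a raw buffer; the copy_into name and the assumption that the buffer already has capacity for count elements are illustrative:

    #include "nvcore/Utils.h"   // nv::destroy_range, nv::construct_range

    // What Array<T>::copy() does once setArraySize() has reserved enough room:
    // destroy the old elements, then placement-new copies of data[0..count).
    template <typename T>
    void copy_into(T * buffer, unsigned & size, const T * data, unsigned count)
    {
        nv::destroy_range(buffer, 0, size);           // run destructors on the old elements
        size = count;                                 // caller guarantees capacity >= count
        nv::construct_range(buffer, count, 0, data);  // copy-construct the new elements in place
    }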
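
The Debug.cpp changes replace the old `throw "Assertion failed"` aborts with explicit exit codes and add nv::debug::terminate(), which takes the handler critical section, writes a minidump and a crash.txt stack trace, and then exits. A sketch of how a host application might route unrecoverable errors through it; the fatalError wrapper and its exit code are assumptions, only debug::terminate() itself comes from the patch:

    #include <cstdlib>           // EXIT_FAILURE
    #include "nvcore/Debug.h"    // nv::debug::terminate, nvDebug

    // Hypothetical wrapper: log the reason, then let debug::terminate() write the
    // minidump and crash.txt before the process exits with a distinct code.
    static void fatalError(const char * msg)
    {
        nvDebug("FATAL: %s\n", msg);
        nv::debug::terminate(EXIT_FAILURE + 3);   // code choice is arbitrary here
    }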
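
FloatImage::alphaTestCoverage() now takes an alphaScale parameter and estimates coverage by bilinearly subsampling each texel quad 8x8, and scaleAlphaToCoverage() binary-searches the scale in [0, 4] that best preserves the desired coverage. A self-contained sketch of the same idea on a plain float alpha channel; the function names and the n = 8 subsample count mirror the patch, everything else (raw pointer layout, helper names) is an assumption:

    #include <algorithm>   // std::min, std::max

    // Coverage of a w x h alpha channel against alphaRef, after scaling alpha by
    // alphaScale, estimated with n x n bilinear subsamples per texel quad.
    static float alphaTestCoverage(const float * alpha, int w, int h,
                                   float alphaRef, float alphaScale)
    {
        const int n = 8;
        float coverage = 0.0f;

        for (int y = 0; y < h - 1; y++) {
            for (int x = 0; x < w - 1; x++) {
                // Scaled and clamped alpha at the four corners of the texel quad.
                float a00 = std::min(std::max(alpha[(y + 0) * w + x + 0] * alphaScale, 0.0f), 1.0f);
                float a10 = std::min(std::max(alpha[(y + 0) * w + x + 1] * alphaScale, 0.0f), 1.0f);
                float a01 = std::min(std::max(alpha[(y + 1) * w + x + 0] * alphaScale, 0.0f), 1.0f);
                float a11 = std::min(std::max(alpha[(y + 1) * w + x + 1] * alphaScale, 0.0f), 1.0f);

                for (int sy = 0; sy < n; sy++) {
                    for (int sx = 0; sx < n; sx++) {
                        float fx = (sx + 0.5f) / n;
                        float fy = (sy + 0.5f) / n;
                        float a = a00 * (1 - fx) * (1 - fy) + a10 * fx * (1 - fy)
                                + a01 * (1 - fx) * fy       + a11 * fx * fy;
                        if (a > alphaRef) coverage += 1.0f;
                    }
                }
            }
        }
        return coverage / float(w * h * n * n);
    }

    // Binary search for the alpha scale that preserves desiredCoverage, as in
    // FloatImage::scaleAlphaToCoverage() (up to 10 halving steps).
    static float findAlphaScale(const float * alpha, int w, int h,
                                float alphaRef, float desiredCoverage)
    {
        float minScale = 0.0f, maxScale = 4.0f, scale = 1.0f;
        for (int i = 0; i < 10; i++) {
            float c = alphaTestCoverage(alpha, w, h, alphaRef, scale);
            if (c < desiredCoverage)      minScale = scale;
            else if (c > desiredCoverage) maxScale = scale;
            else                          break;
            scale = (minScale + maxScale) * 0.5f;
        }
        return scale;
    }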
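
Half.cpp gains an SSE2 path (half_to_float4_SSE2) plus a batch conversion, nv::half_to_float_array(); per the new header comment, both buffers must be 16-byte aligned and the count a multiple of 8. A usage sketch; the sample half values and the use of C++11 alignas (instead of NVTT's own alignment macros) are assumptions:

    #include "nvmath/Half.h"   // nv::half_to_float_array

    void convertSample()
    {
        // 16-byte aligned buffers, count a multiple of 8, as Half.h requires.
        alignas(16) uint16 in[8]  = { 0x0000, 0x3C00, 0x4000, 0xBC00,    // 0, 1, 2, -1
                                      0x3800, 0x7C00, 0x8000, 0x7BFF };  // 0.5, +inf, -0, 65504
        alignas(16) float  out[8];

        nv::half_to_float_array(in, out, 8);
        // out[1] == 1.0f, out[3] == -1.0f, out[4] == 0.5f, ...
    }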
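
fast_half_to_float() now debug-asserts that the lookup tables were initialized: mantissa_table[0] is seeded with a 0xDEADBEEF sentinel that half_init_tables() overwrites with 0. The table-driven conversion returns the float's bit pattern, so callers reinterpret it; a small sketch, with the halfToFloat helper being illustrative:

    #include <cstring>         // memcpy
    #include "nvmath/Half.h"   // nv::half_init_tables, nv::fast_half_to_float

    static float halfToFloat(uint16 h)
    {
        uint32 bits = nv::fast_half_to_float(h);   // table lookup, returns IEEE-754 bits
        float f;
        memcpy(&f, &bits, sizeof(f));              // reinterpret the bits as a float
        return f;
    }

    void example()
    {
        nv::half_init_tables();            // must run once before any fast conversion
        float one = halfToFloat(0x3C00);   // 1.0f
        (void)one;
    }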
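
The CompressorRGB change makes the source row pointer account for the z slice when compressing volume images: for a tightly packed w x h x d float image, the row at (y, z) starts at element (z * h + y) * w. As a small single-channel helper (name is illustrative):

    // Start of the row at (y, z) in a tightly packed w x h x d single-channel float image.
    inline const float * rowPtr(const float * data, unsigned w, unsigned h, unsigned y, unsigned z)
    {
        return data + (z * h + y) * w;
    }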