Merge changes from The Witness.

2012-07-20 16:32:26 +00:00 · 2012-07-20 16:32:26 +00:00 · 04bdc76749
commit 04bdc76749
parent 3b4fcd0369
15 changed files with 280 additions and 124 deletions
--- a/src/nvcore/Array.inl
+++ b/src/nvcore/Array.inl
@ -290,11 +290,23 @@ namespace nv
    template <typename T>
    NV_FORCEINLINE void Array<T>::copy(const T * data, uint count)
    {
-        destroy_range(m_buffer, count, m_size);
+#if 1   // Simpler, but possibly less efficient: tear down all current elements, then copy-construct every new one.
+        destroy_range(m_buffer, 0, m_size);  // new_size = 0, so the whole range [0, m_size) is visited.

        setArraySize(count);

-        ::nv::copy(m_buffer, data, count);
+        construct_range(m_buffer, count, 0, data);  // old_size = 0, so elements [0, count) are placement-constructed from data.
+#else
+        const uint old_size = m_size;
+
+        destroy_range(m_buffer, count, old_size);
+
+        setArraySize(count);
+
+        copy_range(m_buffer, data, old_size);  // NOTE(review): assigns old_size elements even when count < old_size (reads past data) -- confirm before enabling this branch.
+
+        construct_range(m_buffer, count, old_size, data);
+#endif
    }

    // Assignment operator.
--- a/src/nvcore/Debug.cpp
+++ b/src/nvcore/Debug.cpp
@ -172,48 +172,53 @@ namespace
            return false;
        }

-        MINIDUMP_EXCEPTION_INFORMATION ExInfo;
-        ExInfo.ThreadId = ::GetCurrentThreadId();
-        ExInfo.ExceptionPointers = pExceptionInfo;
-        ExInfo.ClientPointers = NULL;
+        MINIDUMP_EXCEPTION_INFORMATION * pExInfo = NULL;
+        MINIDUMP_CALLBACK_INFORMATION * pCallback = NULL;

-        MINIDUMP_CALLBACK_INFORMATION callback;
-        MINIDUMP_CALLBACK_INFORMATION * callback_pointer = NULL;
-        MinidumpCallbackContext context;
+        if (pExceptionInfo != NULL) {
+            MINIDUMP_EXCEPTION_INFORMATION ExInfo;
+            ExInfo.ThreadId = ::GetCurrentThreadId();
+            ExInfo.ExceptionPointers = pExceptionInfo;
+            ExInfo.ClientPointers = NULL;
+            pExInfo = &ExInfo;

-        // Find a memory region of 256 bytes centered on the
-        // faulting instruction pointer.
-        const ULONG64 instruction_pointer = 
-        #if defined(_M_IX86)
-            pExceptionInfo->ContextRecord->Eip;
-        #elif defined(_M_AMD64)
-            pExceptionInfo->ContextRecord->Rip;
-        #else
-            #error Unsupported platform
-        #endif
+            MINIDUMP_CALLBACK_INFORMATION callback;
+            MinidumpCallbackContext context;

-        MEMORY_BASIC_INFORMATION info;
-        
-        if (VirtualQuery(reinterpret_cast<LPCVOID>(instruction_pointer), &info, sizeof(MEMORY_BASIC_INFORMATION)) != 0 && info.State == MEM_COMMIT)
-        {
-            // Attempt to get 128 bytes before and after the instruction
-            // pointer, but settle for whatever's available up to the
-            // boundaries of the memory region.
-            const ULONG64 kIPMemorySize = 256;
-            context.memory_base = max(reinterpret_cast<ULONG64>(info.BaseAddress), instruction_pointer - (kIPMemorySize / 2));
-            ULONG64 end_of_range = min(instruction_pointer + (kIPMemorySize / 2), reinterpret_cast<ULONG64>(info.BaseAddress) + info.RegionSize);
-            context.memory_size = static_cast<ULONG>(end_of_range - context.memory_base);
-            context.finished = false;
+            // Find a memory region of 256 bytes centered on the
+            // faulting instruction pointer.
+            const ULONG64 instruction_pointer = 
+            #if defined(_M_IX86)
+                pExceptionInfo->ContextRecord->Eip;
+            #elif defined(_M_AMD64)
+                pExceptionInfo->ContextRecord->Rip;
+            #else
+                #error Unsupported platform
+            #endif

-            callback.CallbackRoutine = miniDumpWriteDumpCallback;
-            callback.CallbackParam = reinterpret_cast<void*>(&context);
-            callback_pointer = &callback;
+            MEMORY_BASIC_INFORMATION info;
+            
+            if (VirtualQuery(reinterpret_cast<LPCVOID>(instruction_pointer), &info, sizeof(MEMORY_BASIC_INFORMATION)) != 0 && info.State == MEM_COMMIT)
+            {
+                // Attempt to get 128 bytes before and after the instruction
+                // pointer, but settle for whatever's available up to the
+                // boundaries of the memory region.
+                const ULONG64 kIPMemorySize = 256;
+                context.memory_base = max(reinterpret_cast<ULONG64>(info.BaseAddress), instruction_pointer - (kIPMemorySize / 2));
+                ULONG64 end_of_range = min(instruction_pointer + (kIPMemorySize / 2), reinterpret_cast<ULONG64>(info.BaseAddress) + info.RegionSize);
+                context.memory_size = static_cast<ULONG>(end_of_range - context.memory_base);
+                context.finished = false;
+
+                callback.CallbackRoutine = miniDumpWriteDumpCallback;
+                callback.CallbackParam = reinterpret_cast<void*>(&context);
+                pCallback = &callback;
+            }
        }

        MINIDUMP_TYPE miniDumpType = (MINIDUMP_TYPE)(MiniDumpNormal|MiniDumpWithHandleData|MiniDumpWithThreadInfo);

        // write the dump
-        BOOL ok = MiniDumpWriteDump(GetCurrentProcess(), GetCurrentProcessId(), hFile, miniDumpType, &ExInfo, NULL, callback_pointer) != 0;
+        BOOL ok = MiniDumpWriteDump(GetCurrentProcess(), GetCurrentProcessId(), hFile, miniDumpType, pExInfo, NULL, pCallback) != 0;
        CloseHandle(hFile);

        if (ok == FALSE) {
@ -402,9 +407,8 @@ namespace
    // Write mini dump and print stack trace.
    static LONG WINAPI handleException(EXCEPTION_POINTERS * pExceptionInfo)
    {
-#if USE_SEPARATE_THREAD
        EnterCriticalSection(&s_handler_critical_section);
-
+#if USE_SEPARATE_THREAD
        s_requesting_thread_id = GetCurrentThreadId();
        s_exception_info = pExceptionInfo;

@ -418,12 +422,11 @@ namespace
        // Clean up.
        s_requesting_thread_id = 0;
        s_exception_info = NULL;
-
-        LeaveCriticalSection(&s_handler_critical_section);
 #else
        // First of all, write mini dump.
        writeMiniDump(pExceptionInfo);
 #endif
+        LeaveCriticalSection(&s_handler_critical_section);

        nvDebug("\nDump file saved.\n");

@ -454,62 +457,21 @@ namespace
            fclose(fp);
        }

-        return EXCEPTION_EXECUTE_HANDLER;   // Terminate app.
+        // This should terminate the process and set the error exit code.
+        TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 2);
+
+        return EXCEPTION_EXECUTE_HANDLER;   // Terminate app. In case terminate process did not succeed.
    }

-    /*static void handlePureVirtualCall() {
-        // This is an pure virtual function call, not an exception.  It's safe to
-        // play with sprintf here.
-        AutoExceptionHandler auto_exception_handler;
-        ExceptionHandler* current_handler = auto_exception_handler.get_handler();
+    static void handlePureVirtualCall() {  // Registered with _set_purecall_handler in debug::enableSigHandler.
+        nvDebugBreak();
+        TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 8);  // Exit code is EXIT_FAILURE + 8; no dump or stack trace is written on this path.
+    }

-        MDRawAssertionInfo assertion;
-        memset(&assertion, 0, sizeof(assertion));
-        assertion.type = MD_ASSERTION_INFO_TYPE_PURE_VIRTUAL_CALL;
-
-        // Make up an exception record for the current thread and CPU context
-        // to make it possible for the crash processor to classify these
-        // as do regular crashes, and to make it humane for developers to
-        // analyze them.
-        EXCEPTION_RECORD exception_record = {};
-        CONTEXT exception_context = {};
-        EXCEPTION_POINTERS exception_ptrs = { &exception_record, &exception_context };
-
-        ::RtlCaptureContext(&exception_context);
-
-        exception_record.ExceptionCode = STATUS_NONCONTINUABLE_EXCEPTION;
-
-        // We store pointers to the the expression and function strings,
-        // and the line as exception parameters to make them easy to
-        // access by the developer on the far side.
-        exception_record.NumberParameters = 3;
-        exception_record.ExceptionInformation[0] = reinterpret_cast<ULONG_PTR>(&assertion.expression);
-        exception_record.ExceptionInformation[1] = reinterpret_cast<ULONG_PTR>(&assertion.file);
-        exception_record.ExceptionInformation[2] = assertion.line;
-
-        bool success = false;
-        // In case of out-of-process dump generation, directly call
-        // WriteMinidumpWithException since there is no separate thread running.
-
-        success = current_handler->WriteMinidumpOnHandlerThread(&exception_ptrs, &assertion);
-
-        if (!success) {
-            if (current_handler->previous_pch_) {
-                // The handler didn't fully handle the exception.  Give it to the
-                // previous purecall handler.
-                current_handler->previous_pch_();
-            else {
-                // If there's no previous handler, return and let _purecall handle it.
-                // This will just put up an assertion dialog.
-                return;
-            }
-        }
-
-        // The handler either took care of the invalid parameter problem itself,
-        // or passed it on to another handler.  "Swallow" it by exiting, paralleling
-        // the behavior of "swallowing" exceptions.
-        exit(0);
-    }*/
+    static void handleInvalidParameter(const wchar_t * expresion, const wchar_t * function, const wchar_t * file, unsigned int line, uintptr_t reserved) {  // Registered with _set_invalid_parameter_handler; all five arguments are ignored.
+        nvDebugBreak();
+        TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 8);  // Exit code is EXIT_FAILURE + 8; no dump or stack trace is written on this path.
+    }


 #elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H) // NV_OS_LINUX || NV_OS_DARWIN
@ -755,8 +717,8 @@ namespace
            }

            if (ret == NV_ABORT_EXIT) {
-                 // Exit cleanly.
-                throw "Assertion failed";
+                // Exit cleanly.
+                exit(EXIT_FAILURE + 1);
            }

            return ret;
@ -788,7 +750,7 @@ namespace

            if( ret == NV_ABORT_EXIT ) {
                 // Exit cleanly.
-                throw "Assertion failed";
+                exit(EXIT_FAILURE + 1);
            }

            return ret;
@ -825,7 +787,7 @@ namespace
 #endif

            // Exit cleanly.
-            throw "Assertion failed";
+            exit(EXIT_FAILURE + 1);
        }
    };

@ -853,6 +815,38 @@ int nvAbort(const char * exp, const char * file, int line, const char * func/*=N
    }
 }

+// Abnormal termination: writes a mini dump, saves the current call stack to "crash.txt", then calls exit(code).
+void debug::terminate(int code)
+{
+    EnterCriticalSection(&s_handler_critical_section);  // NOTE(review): Win32-only call with no platform guard visible in this hunk -- confirm non-Windows builds.
+
+    writeMiniDump(NULL);  // NULL: no EXCEPTION_POINTERS available, so the dump is written without exception info or memory callback.
+
+    const int max_stack_size = 64;
+    void * trace[max_stack_size];
+    int size = backtrace(trace, max_stack_size);
+
+    // @@ Use win32's CreateFile?
+    FILE * fp = fileOpen("crash.txt", "wb");
+    if (fp != NULL) {  // If the file cannot be opened the stack trace is silently skipped.
+        Array<const char *> lines;
+        writeStackTrace(trace, size, 0, lines);
+
+        for (uint i = 0; i < lines.count(); i++) {
+            fputs(lines[i], fp);
+            delete lines[i];  // NOTE(review): assumes writeStackTrace allocates each line with scalar new -- confirm (delete[] or free otherwise).
+        }
+
+        // @@ Add more info to crash.txt?
+
+        fclose(fp);
+    }
+
+    LeaveCriticalSection(&s_handler_critical_section);
+
+    exit(code);  // Unlike the exception handler's TerminateProcess, this goes through exit().
+}
+

 /// Shows a message through the message handler.
 void NV_CDECL nvDebugPrint(const char *msg, ...)
@ -987,13 +981,11 @@ void debug::enableSigHandler(bool interactive)

    s_old_exception_filter = ::SetUnhandledExceptionFilter( handleException );

-    /*
 #if _MSC_VER >= 1400  // MSVC 2005/8
    _set_invalid_parameter_handler(handleInvalidParameter);
 #endif  // _MSC_VER >= 1400

    _set_purecall_handler(handlePureVirtualCall);
-    */


    // SYMOPT_DEFERRED_LOADS make us not take a ton of time unless we actual log traces
--- a/src/nvcore/Debug.h
+++ b/src/nvcore/Debug.h
@ -197,6 +197,8 @@ namespace nv

        NVCORE_API bool isDebuggerPresent();
        NVCORE_API bool attachToDebugger();
+
+        NVCORE_API void terminate(int code);
    }

 } // nv namespace
--- a/src/nvcore/Utils.h
+++ b/src/nvcore/Utils.h
@ -207,6 +207,13 @@ namespace nv
        }
    }

+    template <typename T>  // Copy-constructs ptr[i] from src[i] for every i in [old_size, new_size).
+    void construct_range(T * restrict ptr, uint new_size, uint old_size, const T * src) {
+        for (uint i = old_size; i < new_size; i++) {  // Runs zero times when new_size <= old_size.
+            new(ptr+i) T(src[i]); // placement new: builds a T in the storage at ptr+i; src is indexed with the same i, not from 0.
+        }
+    }
+
    template <typename T>
    void destroy_range(T * restrict ptr, uint new_size, uint old_size) {
        for (uint i = new_size; i < old_size; i++) {
@ -223,7 +230,7 @@ namespace nv
    }

    template <typename T>
-    void copy(T * restrict dst, const T * restrict src, uint count) {
+    void copy_range(T * restrict dst, const T * restrict src, uint count) {
        for (uint i = 0; i < count; i++) {
            dst[i] = src[i];
        }
--- a/src/nvimage/FloatImage.cpp
+++ b/src/nvimage/FloatImage.cpp
@ -1338,7 +1338,7 @@ void FloatImage::flipZ()



-float FloatImage::alphaTestCoverage(float alphaRef, int alphaChannel) const
+float FloatImage::alphaTestCoverage(float alphaRef, int alphaChannel, float alphaScale/*=1*/) const
 {
    const uint w = m_width;
    const uint h = m_height;
@ -1347,16 +1347,41 @@ float FloatImage::alphaTestCoverage(float alphaRef, int alphaChannel) const

    const float * alpha = channel(alphaChannel);

+#if 0
    const uint count = m_pixelCount;
    for (uint i = 0; i < count; i++) {
        if (alpha[i] > alphaRef) coverage += 1.0f; // @@ gt or lt?
    }
-
+    
    return coverage / float(w * h);
+#else
+    const uint n = 8;   // Sub-samples per axis inside each texel cell (n * n samples per cell).
+
+    // Supersampled coverage: bilinearly interpolate alpha at n x n positions inside every 2x2 texel cell.
+    for (uint y = 0; y + 1 < h; y++) {   // 'y + 1 < h' instead of 'y < h-1': no unsigned wrap-around when h == 0.
+        for (uint x = 0; x + 1 < w; x++) {
+            // Corner alphas of the cell, scaled by alphaScale and passed through nv::saturate.
+            float alpha00 = nv::saturate(pixel(alphaChannel, x+0, y+0, 0) * alphaScale);
+            float alpha10 = nv::saturate(pixel(alphaChannel, x+1, y+0, 0) * alphaScale);
+            float alpha01 = nv::saturate(pixel(alphaChannel, x+0, y+1, 0) * alphaScale);
+            float alpha11 = nv::saturate(pixel(alphaChannel, x+1, y+1, 0) * alphaScale);
+            // Step by 1/n so each axis takes n samples at (k + 0.5) / n; stepping by 1 took a single sample per cell.
+            for (float fy = 0.5f/n; fy < 1.0f; fy += 1.0f/n) {   // With n = 8 every step is exactly representable, so this runs exactly n times.
+                for (float fx = 0.5f/n; fx < 1.0f; fx += 1.0f/n) {
+                    float alpha = alpha00 * (1 - fx) * (1 - fy) + alpha10 * fx * (1 - fy) + alpha01 * (1 - fx) * fy + alpha11 * fx * fy;
+                    if (alpha > alphaRef) coverage += 1.0f;
+                }
+            }
+        }
+    }
+
+    return coverage / float(w * h * n * n);
+#endif
 }

 void FloatImage::scaleAlphaToCoverage(float desiredCoverage, float alphaRef, int alphaChannel)
 {
+#if 0
    float minAlphaRef = 0.0f;
    float maxAlphaRef = 1.0f;
    float midAlphaRef = 0.5f;
@ -1383,8 +1408,35 @@ void FloatImage::scaleAlphaToCoverage(float desiredCoverage, float alphaRef, int
    // Scale alpha channel.
    scaleBias(alphaChannel, 1, alphaScale, 0.0f);
    clamp(alphaChannel, 1, 0.0f, 1.0f); 
+#else
+    float minAlphaScale = 0.0f;
+    float maxAlphaScale = 4.0f;
+    float alphaScale = 1.0f;

-    //float newCoverage = alphaTestCoverage(alphaRef, alphaChannel);
+    // Determine desired scale using a binary search. Hardcoded to 10 steps max.
+    for (int i = 0; i < 10; i++) {
+        float currentCoverage = alphaTestCoverage(alphaRef, alphaChannel, alphaScale);
+
+        if (currentCoverage < desiredCoverage) {
+            minAlphaScale = alphaScale;
+        }
+        else if (currentCoverage > desiredCoverage) {
+            maxAlphaScale = alphaScale;
+        }
+        else {
+            break;
+        }
+
+        alphaScale = (minAlphaScale + maxAlphaScale) * 0.5f;
+    }
+
+    // Scale alpha channel.
+    scaleBias(alphaChannel, 1, alphaScale, 0.0f);
+    clamp(alphaChannel, 1, 0.0f, 1.0f); 
+#endif
+#if _DEBUG
+    float newCoverage = alphaTestCoverage(alphaRef, alphaChannel);
+#endif
 }

 FloatImage* FloatImage::clone() const
--- a/src/nvimage/FloatImage.h
+++ b/src/nvimage/FloatImage.h
@ -103,7 +103,7 @@ namespace nv
        NVIMAGE_API void flipY();
        NVIMAGE_API void flipZ();

-        NVIMAGE_API float alphaTestCoverage(float alphaRef, int alphaChannel) const;
+        NVIMAGE_API float alphaTestCoverage(float alphaRef, int alphaChannel, float alphaScale = 1.0f) const;
        NVIMAGE_API void scaleAlphaToCoverage(float coverage, float alphaRef, int alphaChannel);


--- a/src/nvmath/Half.cpp
+++ b/src/nvmath/Half.cpp
@ -76,6 +76,10 @@
 #include "Half.h"
 #include <stdio.h>

+#if NV_CC_GNUC
+#include <xmmintrin.h>
+#endif
+
 // Load immediate
 static inline uint32 _uint32_li( uint32 a )
 {
@ -488,10 +492,79 @@ nv::half_to_float( uint16 h )
 }


-// @@ This code appears to be wrong.
+static __m128 half_to_float4_SSE2(__m128i h)  // Converts four half bit patterns, held in the low 16 bits of each 32-bit lane, to four floats.
+{
+#define SSE_CONST4(name, val) static const __declspec(align(16)) uint name[4] = { (val), (val), (val), (val) }
+#define CONST(name) *(const __m128i *)&name
+
+    SSE_CONST4(mask_nosign,         0x7fff);  // half exponent + mantissa bits
+    SSE_CONST4(mask_justsign,       0x8000);  // half sign bit
+    SSE_CONST4(mask_shifted_exp,    0x7c00 << 13);  // half exponent field after the << 13 below
+    SSE_CONST4(expadjust_normal,    (127 - 15) << 23);  // exponent rebias, half bias 15 -> float bias 127
+    SSE_CONST4(expadjust_infnan,    (128 - 16) << 23);  // extra bias added only to lanes whose exponent is all ones
+    SSE_CONST4(expadjust_denorm,    1 << 23);
+    SSE_CONST4(magic_denorm,        113 << 23);  // reinterpreted as a float and subtracted on the zero-exponent path
+
+    __m128i mnosign     = CONST(mask_nosign);
+    __m128i expmant     = _mm_and_si128(mnosign, h);
+    __m128i justsign    = _mm_and_si128(h, CONST(mask_justsign));
+    __m128i mshiftexp   = CONST(mask_shifted_exp);
+    __m128i eadjust     = CONST(expadjust_normal);
+    __m128i shifted     = _mm_slli_epi32(expmant, 13);  // move exponent/mantissa up to the float field positions
+    __m128i adjusted    = _mm_add_epi32(eadjust, shifted);
+    __m128i justexp     = _mm_and_si128(shifted, mshiftexp);
+
+    __m128i zero        = _mm_setzero_si128();
+    __m128i b_isinfnan  = _mm_cmpeq_epi32(mshiftexp, justexp);  // lane mask: exponent field all ones
+    __m128i b_isdenorm  = _mm_cmpeq_epi32(zero, justexp);  // lane mask: exponent field zero
+
+    __m128i adj_infnan  = _mm_and_si128(b_isinfnan, CONST(expadjust_infnan));
+    __m128i adjusted2   = _mm_add_epi32(adjusted, adj_infnan);
+
+    __m128i adj_den     = CONST(expadjust_denorm);
+    __m128i den1        = _mm_add_epi32(adj_den, adjusted2);
+    __m128  den2        = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);  // float subtract, not integer
+    __m128  adjusted3   = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));  // keep den2 only in zero-exponent lanes
+    __m128  adjusted4   = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));  // keep adjusted2 in all other lanes
+    __m128  adjusted5   = _mm_or_ps(adjusted3, adjusted4);
+    __m128i sign        = _mm_slli_epi32(justsign, 16);  // sign bit 15 -> bit 31
+    __m128  final       = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
+
+    // ~21 SSE2 ops.
+    return final;
+
+#undef SSE_CONST4
+#undef CONST
+}
+
+
+void nv::half_to_float_array(const uint16 * vin, float * vout, int count) {  // Converts count halfs from vin into floats in vout, 8 values per iteration.
+    nvDebugCheck((intptr_t(vin) & 15) == 0);  // NOTE(review): 16-byte alignment is checked, yet the loads/stores below are the unaligned variants -- confirm which is intended.
+    nvDebugCheck((intptr_t(vout) & 15) == 0);
+    nvDebugCheck((count & 7) == 0);  // There is no tail loop: a count that is not a multiple of 8 would process a full group of 8 past the end.
+
+    __m128i zero = _mm_setzero_si128();
+
+    for (int i = 0; i < count; i += 8)
+    {
+        __m128i in = _mm_loadu_si128((const __m128i *)(vin + i));  // 8 halfs
+        __m128i a = _mm_unpacklo_epi16(in, zero);  // halfs 0..3, each widened to a 32-bit lane with zero upper bits
+        __m128i b = _mm_unpackhi_epi16(in, zero);  // halfs 4..7
+        
+        __m128 outa = half_to_float4_SSE2(a);
+        _mm_storeu_ps((float *)(vout + i), outa);
+        
+        __m128 outb = half_to_float4_SSE2(b);
+        _mm_storeu_ps((float *)(vout + i + 4), outb);
+    }
+}
+
+
+
+
 // @@ These tables could be smaller.
 namespace nv {
-    uint32 mantissa_table[2048];
+    uint32 mantissa_table[2048] = { 0xDEADBEEF };
    uint32 exponent_table[64];
    uint32 offset_table[64];
 }
--- a/src/nvmath/Half.h
+++ b/src/nvmath/Half.h
@ -9,6 +9,9 @@ namespace nv {
    uint32 half_to_float( uint16 h );
    uint16 half_from_float( uint32 f );

+    // vin,vout must be 16 byte aligned. count must be a multiple of 8.
+    void half_to_float_array(const uint16 * vin, float * vout, int count);
+
    void half_init_tables();

    extern uint32 mantissa_table[2048];
@ -19,6 +22,7 @@ namespace nv {
    // http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
    inline uint32 fast_half_to_float(uint16 h)
    {
+        nvDebugCheck(mantissa_table[0] == 0); // Make sure table was initialized.
 	    uint exp = h >> 10;
 	    return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
    }
--- a/src/nvmath/Matrix.h
+++ b/src/nvmath/Matrix.h
@ -62,6 +62,7 @@ namespace nv
        Matrix();
        explicit Matrix(float f);
        explicit Matrix(identity_t);
+        Matrix(const Matrix3 & m);
        Matrix(const Matrix & m);
        Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);
        //explicit Matrix(const float m[]);	// m is assumed to contain 16 elements
--- a/src/nvmath/Matrix.inl
+++ b/src/nvmath/Matrix.inl
@ -250,6 +250,19 @@ namespace nv
        }
    }

+    inline Matrix::Matrix(const Matrix3 & m)  // Builds a 4x4 matrix from a 3x3 one: 3x3 block copied, fourth row and column zeroed.
+    {
+        for(int i = 0; i < 3; i++) {
+            for(int j = 0; j < 3; j++) {
+                operator()(i, j) = m.get(i, j);  // same (i, j) index on both sides
+            }
+        }
+        for(int i = 0; i < 4; i++) {  // i runs to 3, so element (3,3) is zeroed as well.
+            operator()(3, i) = 0;
+            operator()(i, 3) = 0;  // NOTE(review): (3,3) ends up 0, not 1 -- confirm callers do not expect a homogeneous (affine) embedding.
+        }
+    }
+
    inline Matrix::Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3)
    {
        m_data[ 0] = v0.x; m_data[ 1] = v0.y; m_data[ 2] = v0.z; m_data[ 3] = v0.w;
--- a/src/nvthread/ParallelFor.cpp
+++ b/src/nvthread/ParallelFor.cpp
@ -16,7 +16,7 @@ using namespace nv;
 #define ENABLE_PARALLEL_FOR 0
 #endif

-void worker(void * arg) {
+static void worker(void * arg) {
    ParallelFor * owner = (ParallelFor *)arg;

    while(true) {
--- a/src/nvtt/ClusterFit.cpp
+++ b/src/nvtt/ClusterFit.cpp
@ -92,8 +92,8 @@ void ClusterFit::setColourSet(const ColorSet * set)
    {
        int p = order[i];
 #if NVTT_USE_SIMD
-        NV_ALIGN_16 Vector4 tmp(values[p] * set->weights[p], set->weights[p]);
-        m_weighted[i] = SimdVector(tmp.component);
+        NV_ALIGN_16 Vector4 tmp(values[p], 1);
+        m_weighted[i] = SimdVector(tmp.component) * SimdVector(set->weights[p]);
        m_xxsum += m_weighted[i] * m_weighted[i];
        m_xsum += m_weighted[i];
 #else
--- a/src/nvtt/CompressorDX9.cpp
+++ b/src/nvtt/CompressorDX9.cpp
@ -40,6 +40,7 @@
 #include "nvimage/BlockDXT.h"

 #include "nvmath/Vector.inl"
+#include "nvmath/Color.inl"

 #include "nvcore/Memory.h"

@ -111,18 +112,15 @@ void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alpha
    QuickCompress::compressDXT5(rgba, block);
 }

-#if 1
+#if 0
 void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
    set.setUniformWeights();
-    set.createMinimalSet(false);
-
-    ClusterFit fit;
-    fit.setMetric(compressionOptions.colorWeight);
+    set.createMinimalSet(/*ignoreTransparent*/false);

    BlockDXT1 * block = new(output) BlockDXT1;
    
-    if (set.isSingleColor(true))
+    if (set.isSingleColor(/*ignoreAlpha*/true))
    {
        Color32 c;
        c.r = uint8(clamp(set.colors[0].x, 0.0f, 1.0f) * 255);
@ -133,16 +131,19 @@ void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, co
    }
    else
    {
+        ClusterFit fit;
+        fit.setMetric(compressionOptions.colorWeight);
        fit.setColourSet(&set);

        Vector3 start, end;
-
        fit.compress4(&start, &end);
-        QuickCompress::outputBlock4(set, start, end, block);

        if (fit.compress3(&start, &end)) {
            QuickCompress::outputBlock3(set, start, end, block);
        }
+        else {
+            QuickCompress::outputBlock4(set, start, end, block);        
+        }
    }
 }
 #else
@ -219,16 +220,15 @@ void CompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode,
        nvsquish::WeightedClusterFit fit;
        fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);

-	int flags = 0;
-	if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
+        int flags = 0;
+        if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;

-	nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-	fit.SetColourSet(&colours, 0);
-	fit.Compress(&block->color);
+        nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
+        fit.SetColourSet(&colours, 0);
+        fit.Compress(&block->color);
    }
 }

-
 void CompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
    BlockDXT5 * block = new(output) BlockDXT5;
--- a/src/nvtt/CompressorDX9.h
+++ b/src/nvtt/CompressorDX9.h
@ -64,7 +64,7 @@ namespace nv


    // Normal CPU compressors.
-#if 1
+#if 0
    struct CompressorDXT1 : public ColorSetCompressor
    {
        virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
--- a/src/nvtt/CompressorRGB.cpp
+++ b/src/nvtt/CompressorRGB.cpp
@ -310,7 +310,7 @@ void PixelFormatConverter::compress(nvtt::AlphaMode /*alphaMode*/, uint w, uint
    {
        for (uint y = 0; y < h; y++)
        {
-            const float * src = (const float *)data + y * w;
+            const float * src = (const float *)data + (z * h + y) * w;

            BitStream stream(dst);