From 04bdc76749793641395f26e3bd2005ed9dce2f74 Mon Sep 17 00:00:00 2001 From: castano Date: Fri, 20 Jul 2012 16:32:26 +0000 Subject: [PATCH] Merge changes from The Witness. --- src/nvcore/Array.inl | 16 ++- src/nvcore/Debug.cpp | 192 +++++++++++++++++------------------ src/nvcore/Debug.h | 2 + src/nvcore/Utils.h | 9 +- src/nvimage/FloatImage.cpp | 58 ++++++++++- src/nvimage/FloatImage.h | 2 +- src/nvmath/Half.cpp | 77 +++++++++++++- src/nvmath/Half.h | 4 + src/nvmath/Matrix.h | 1 + src/nvmath/Matrix.inl | 13 +++ src/nvthread/ParallelFor.cpp | 2 +- src/nvtt/ClusterFit.cpp | 4 +- src/nvtt/CompressorDX9.cpp | 28 ++--- src/nvtt/CompressorDX9.h | 2 +- src/nvtt/CompressorRGB.cpp | 2 +- 15 files changed, 284 insertions(+), 128 deletions(-) diff --git a/src/nvcore/Array.inl b/src/nvcore/Array.inl index 6a7236e..fefa19e 100755 --- a/src/nvcore/Array.inl +++ b/src/nvcore/Array.inl @@ -290,11 +290,23 @@ namespace nv template NV_FORCEINLINE void Array::copy(const T * data, uint count) { - destroy_range(m_buffer, count, m_size); +#if 1 // More simple, but maybe not be as efficient? + destroy_range(m_buffer, 0, m_size); setArraySize(count); - ::nv::copy(m_buffer, data, count); + construct_range(m_buffer, count, 0, data); +#else + const uint old_size = m_size; + + destroy_range(m_buffer, count, old_size); + + setArraySize(count); + + copy_range(m_buffer, data, old_size); + + construct_range(m_buffer, count, old_size, data); +#endif } // Assignment operator. diff --git a/src/nvcore/Debug.cpp b/src/nvcore/Debug.cpp index a662820..9c852d1 100644 --- a/src/nvcore/Debug.cpp +++ b/src/nvcore/Debug.cpp @@ -172,48 +172,53 @@ namespace return false; } - MINIDUMP_EXCEPTION_INFORMATION ExInfo; - ExInfo.ThreadId = ::GetCurrentThreadId(); - ExInfo.ExceptionPointers = pExceptionInfo; - ExInfo.ClientPointers = NULL; - - MINIDUMP_CALLBACK_INFORMATION callback; - MINIDUMP_CALLBACK_INFORMATION * callback_pointer = NULL; - MinidumpCallbackContext context; - - // Find a memory region of 256 bytes centered on the - // faulting instruction pointer. - const ULONG64 instruction_pointer = - #if defined(_M_IX86) - pExceptionInfo->ContextRecord->Eip; - #elif defined(_M_AMD64) - pExceptionInfo->ContextRecord->Rip; - #else - #error Unsupported platform - #endif - - MEMORY_BASIC_INFORMATION info; - - if (VirtualQuery(reinterpret_cast(instruction_pointer), &info, sizeof(MEMORY_BASIC_INFORMATION)) != 0 && info.State == MEM_COMMIT) - { - // Attempt to get 128 bytes before and after the instruction - // pointer, but settle for whatever's available up to the - // boundaries of the memory region. 
- const ULONG64 kIPMemorySize = 256; - context.memory_base = max(reinterpret_cast(info.BaseAddress), instruction_pointer - (kIPMemorySize / 2)); - ULONG64 end_of_range = min(instruction_pointer + (kIPMemorySize / 2), reinterpret_cast(info.BaseAddress) + info.RegionSize); - context.memory_size = static_cast(end_of_range - context.memory_base); - context.finished = false; - - callback.CallbackRoutine = miniDumpWriteDumpCallback; - callback.CallbackParam = reinterpret_cast(&context); - callback_pointer = &callback; + MINIDUMP_EXCEPTION_INFORMATION * pExInfo = NULL; + MINIDUMP_CALLBACK_INFORMATION * pCallback = NULL; + + if (pExceptionInfo != NULL) { + MINIDUMP_EXCEPTION_INFORMATION ExInfo; + ExInfo.ThreadId = ::GetCurrentThreadId(); + ExInfo.ExceptionPointers = pExceptionInfo; + ExInfo.ClientPointers = NULL; + pExInfo = &ExInfo; + + MINIDUMP_CALLBACK_INFORMATION callback; + MinidumpCallbackContext context; + + // Find a memory region of 256 bytes centered on the + // faulting instruction pointer. + const ULONG64 instruction_pointer = + #if defined(_M_IX86) + pExceptionInfo->ContextRecord->Eip; + #elif defined(_M_AMD64) + pExceptionInfo->ContextRecord->Rip; + #else + #error Unsupported platform + #endif + + MEMORY_BASIC_INFORMATION info; + + if (VirtualQuery(reinterpret_cast(instruction_pointer), &info, sizeof(MEMORY_BASIC_INFORMATION)) != 0 && info.State == MEM_COMMIT) + { + // Attempt to get 128 bytes before and after the instruction + // pointer, but settle for whatever's available up to the + // boundaries of the memory region. + const ULONG64 kIPMemorySize = 256; + context.memory_base = max(reinterpret_cast(info.BaseAddress), instruction_pointer - (kIPMemorySize / 2)); + ULONG64 end_of_range = min(instruction_pointer + (kIPMemorySize / 2), reinterpret_cast(info.BaseAddress) + info.RegionSize); + context.memory_size = static_cast(end_of_range - context.memory_base); + context.finished = false; + + callback.CallbackRoutine = miniDumpWriteDumpCallback; + callback.CallbackParam = reinterpret_cast(&context); + pCallback = &callback; + } } MINIDUMP_TYPE miniDumpType = (MINIDUMP_TYPE)(MiniDumpNormal|MiniDumpWithHandleData|MiniDumpWithThreadInfo); // write the dump - BOOL ok = MiniDumpWriteDump(GetCurrentProcess(), GetCurrentProcessId(), hFile, miniDumpType, &ExInfo, NULL, callback_pointer) != 0; + BOOL ok = MiniDumpWriteDump(GetCurrentProcess(), GetCurrentProcessId(), hFile, miniDumpType, pExInfo, NULL, pCallback) != 0; CloseHandle(hFile); if (ok == FALSE) { @@ -402,9 +407,8 @@ namespace // Write mini dump and print stack trace. static LONG WINAPI handleException(EXCEPTION_POINTERS * pExceptionInfo) { -#if USE_SEPARATE_THREAD EnterCriticalSection(&s_handler_critical_section); - +#if USE_SEPARATE_THREAD s_requesting_thread_id = GetCurrentThreadId(); s_exception_info = pExceptionInfo; @@ -418,12 +422,11 @@ namespace // Clean up. s_requesting_thread_id = 0; s_exception_info = NULL; - - LeaveCriticalSection(&s_handler_critical_section); #else // First of all, write mini dump. writeMiniDump(pExceptionInfo); #endif + LeaveCriticalSection(&s_handler_critical_section); nvDebug("\nDump file saved.\n"); @@ -454,62 +457,21 @@ namespace fclose(fp); } - return EXCEPTION_EXECUTE_HANDLER; // Terminate app. + // This should terminate the process and set the error exit code. + TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 2); + + return EXCEPTION_EXECUTE_HANDLER; // Terminate app. In case terminate process did not succeed. 
} - /*static void handlePureVirtualCall() { - // This is an pure virtual function call, not an exception. It's safe to - // play with sprintf here. - AutoExceptionHandler auto_exception_handler; - ExceptionHandler* current_handler = auto_exception_handler.get_handler(); - - MDRawAssertionInfo assertion; - memset(&assertion, 0, sizeof(assertion)); - assertion.type = MD_ASSERTION_INFO_TYPE_PURE_VIRTUAL_CALL; - - // Make up an exception record for the current thread and CPU context - // to make it possible for the crash processor to classify these - // as do regular crashes, and to make it humane for developers to - // analyze them. - EXCEPTION_RECORD exception_record = {}; - CONTEXT exception_context = {}; - EXCEPTION_POINTERS exception_ptrs = { &exception_record, &exception_context }; - - ::RtlCaptureContext(&exception_context); - - exception_record.ExceptionCode = STATUS_NONCONTINUABLE_EXCEPTION; - - // We store pointers to the the expression and function strings, - // and the line as exception parameters to make them easy to - // access by the developer on the far side. - exception_record.NumberParameters = 3; - exception_record.ExceptionInformation[0] = reinterpret_cast(&assertion.expression); - exception_record.ExceptionInformation[1] = reinterpret_cast(&assertion.file); - exception_record.ExceptionInformation[2] = assertion.line; - - bool success = false; - // In case of out-of-process dump generation, directly call - // WriteMinidumpWithException since there is no separate thread running. - - success = current_handler->WriteMinidumpOnHandlerThread(&exception_ptrs, &assertion); - - if (!success) { - if (current_handler->previous_pch_) { - // The handler didn't fully handle the exception. Give it to the - // previous purecall handler. - current_handler->previous_pch_(); - else { - // If there's no previous handler, return and let _purecall handle it. - // This will just put up an assertion dialog. - return; - } - } + static void handlePureVirtualCall() { + nvDebugBreak(); + TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 8); + } - // The handler either took care of the invalid parameter problem itself, - // or passed it on to another handler. "Swallow" it by exiting, paralleling - // the behavior of "swallowing" exceptions. - exit(0); - }*/ + static void handleInvalidParameter(const wchar_t * expresion, const wchar_t * function, const wchar_t * file, unsigned int line, uintptr_t reserved) { + nvDebugBreak(); + TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 8); + } #elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H) // NV_OS_LINUX || NV_OS_DARWIN @@ -755,8 +717,8 @@ namespace } if (ret == NV_ABORT_EXIT) { - // Exit cleanly. - throw "Assertion failed"; + // Exit cleanly. + exit(EXIT_FAILURE + 1); } return ret; @@ -788,7 +750,7 @@ namespace if( ret == NV_ABORT_EXIT ) { // Exit cleanly. - throw "Assertion failed"; + exit(EXIT_FAILURE + 1); } return ret; @@ -825,7 +787,7 @@ namespace #endif // Exit cleanly. - throw "Assertion failed"; + exit(EXIT_FAILURE + 1); } }; @@ -853,6 +815,38 @@ int nvAbort(const char * exp, const char * file, int line, const char * func/*=N } } +// Abnormal termination. Create mini dump and output call stack. +void debug::terminate(int code) +{ + EnterCriticalSection(&s_handler_critical_section); + + writeMiniDump(NULL); + + const int max_stack_size = 64; + void * trace[max_stack_size]; + int size = backtrace(trace, max_stack_size); + + // @@ Use win32's CreateFile? 
+ FILE * fp = fileOpen("crash.txt", "wb"); + if (fp != NULL) { + Array lines; + writeStackTrace(trace, size, 0, lines); + + for (uint i = 0; i < lines.count(); i++) { + fputs(lines[i], fp); + delete lines[i]; + } + + // @@ Add more info to crash.txt? + + fclose(fp); + } + + LeaveCriticalSection(&s_handler_critical_section); + + exit(code); +} + /// Shows a message through the message handler. void NV_CDECL nvDebugPrint(const char *msg, ...) @@ -987,13 +981,11 @@ void debug::enableSigHandler(bool interactive) s_old_exception_filter = ::SetUnhandledExceptionFilter( handleException ); - /* #if _MSC_VER >= 1400 // MSVC 2005/8 _set_invalid_parameter_handler(handleInvalidParameter); #endif // _MSC_VER >= 1400 _set_purecall_handler(handlePureVirtualCall); - */ // SYMOPT_DEFERRED_LOADS make us not take a ton of time unless we actual log traces diff --git a/src/nvcore/Debug.h b/src/nvcore/Debug.h index 0a9e468..f0f73d9 100644 --- a/src/nvcore/Debug.h +++ b/src/nvcore/Debug.h @@ -197,6 +197,8 @@ namespace nv NVCORE_API bool isDebuggerPresent(); NVCORE_API bool attachToDebugger(); + + NVCORE_API void terminate(int code); } } // nv namespace diff --git a/src/nvcore/Utils.h b/src/nvcore/Utils.h index 61cb7ea..cef87ed 100644 --- a/src/nvcore/Utils.h +++ b/src/nvcore/Utils.h @@ -207,6 +207,13 @@ namespace nv } } + template + void construct_range(T * restrict ptr, uint new_size, uint old_size, const T * src) { + for (uint i = old_size; i < new_size; i++) { + new(ptr+i) T(src[i]); // placement new + } + } + template void destroy_range(T * restrict ptr, uint new_size, uint old_size) { for (uint i = new_size; i < old_size; i++) { @@ -223,7 +230,7 @@ namespace nv } template - void copy(T * restrict dst, const T * restrict src, uint count) { + void copy_range(T * restrict dst, const T * restrict src, uint count) { for (uint i = 0; i < count; i++) { dst[i] = src[i]; } diff --git a/src/nvimage/FloatImage.cpp b/src/nvimage/FloatImage.cpp index c91da6b..3b6c070 100644 --- a/src/nvimage/FloatImage.cpp +++ b/src/nvimage/FloatImage.cpp @@ -1338,7 +1338,7 @@ void FloatImage::flipZ() -float FloatImage::alphaTestCoverage(float alphaRef, int alphaChannel) const +float FloatImage::alphaTestCoverage(float alphaRef, int alphaChannel, float alphaScale/*=1*/) const { const uint w = m_width; const uint h = m_height; @@ -1347,16 +1347,41 @@ float FloatImage::alphaTestCoverage(float alphaRef, int alphaChannel) const const float * alpha = channel(alphaChannel); +#if 0 const uint count = m_pixelCount; for (uint i = 0; i < count; i++) { if (alpha[i] > alphaRef) coverage += 1.0f; // @@ gt or lt? 
} - + return coverage / float(w * h); +#else + const uint n = 8; + + // If we want subsampling: + for (uint y = 0; y < h-1; y++) { + for (uint x = 0; x < w-1; x++) { + + float alpha00 = nv::saturate(pixel(alphaChannel, x+0, y+0, 0) * alphaScale); + float alpha10 = nv::saturate(pixel(alphaChannel, x+1, y+0, 0) * alphaScale); + float alpha01 = nv::saturate(pixel(alphaChannel, x+0, y+1, 0) * alphaScale); + float alpha11 = nv::saturate(pixel(alphaChannel, x+1, y+1, 0) * alphaScale); + + for (float fy = 0.5f/n; fy < 1.0f; fy++) { + for (float fx = 0.5f/n; fx < 1.0f; fx++) { + float alpha = alpha00 * (1 - fx) * (1 - fy) + alpha10 * fx * (1 - fy) + alpha01 * (1 - fx) * fy + alpha11 * fx * fy; + if (alpha > alphaRef) coverage += 1.0f; + } + } + } + } + + return coverage / float(w * h * n * n); +#endif } void FloatImage::scaleAlphaToCoverage(float desiredCoverage, float alphaRef, int alphaChannel) { +#if 0 float minAlphaRef = 0.0f; float maxAlphaRef = 1.0f; float midAlphaRef = 0.5f; @@ -1383,8 +1408,35 @@ void FloatImage::scaleAlphaToCoverage(float desiredCoverage, float alphaRef, int // Scale alpha channel. scaleBias(alphaChannel, 1, alphaScale, 0.0f); clamp(alphaChannel, 1, 0.0f, 1.0f); +#else + float minAlphaScale = 0.0f; + float maxAlphaScale = 4.0f; + float alphaScale = 1.0f; + + // Determine desired scale using a binary search. Hardcoded to 8 steps max. + for (int i = 0; i < 10; i++) { + float currentCoverage = alphaTestCoverage(alphaRef, alphaChannel, alphaScale); + + if (currentCoverage < desiredCoverage) { + minAlphaScale = alphaScale; + } + else if (currentCoverage > desiredCoverage) { + maxAlphaScale = alphaScale; + } + else { + break; + } - //float newCoverage = alphaTestCoverage(alphaRef, alphaChannel); + alphaScale = (minAlphaScale + maxAlphaScale) * 0.5f; + } + + // Scale alpha channel. + scaleBias(alphaChannel, 1, alphaScale, 0.0f); + clamp(alphaChannel, 1, 0.0f, 1.0f); +#endif +#if _DEBUG + float newCoverage = alphaTestCoverage(alphaRef, alphaChannel); +#endif } FloatImage* FloatImage::clone() const diff --git a/src/nvimage/FloatImage.h b/src/nvimage/FloatImage.h index 39085c8..10a236f 100644 --- a/src/nvimage/FloatImage.h +++ b/src/nvimage/FloatImage.h @@ -103,7 +103,7 @@ namespace nv NVIMAGE_API void flipY(); NVIMAGE_API void flipZ(); - NVIMAGE_API float alphaTestCoverage(float alphaRef, int alphaChannel) const; + NVIMAGE_API float alphaTestCoverage(float alphaRef, int alphaChannel, float alphaScale = 1.0f) const; NVIMAGE_API void scaleAlphaToCoverage(float coverage, float alphaRef, int alphaChannel); diff --git a/src/nvmath/Half.cpp b/src/nvmath/Half.cpp index 7bd9122..d73f64a 100644 --- a/src/nvmath/Half.cpp +++ b/src/nvmath/Half.cpp @@ -76,6 +76,10 @@ #include "Half.h" #include +#if NV_CC_GNUC +#include +#endif + // Load immediate static inline uint32 _uint32_li( uint32 a ) { @@ -488,10 +492,79 @@ nv::half_to_float( uint16 h ) } -// @@ This code appears to be wrong. 
+static __m128 half_to_float4_SSE2(__m128i h) +{ +#define SSE_CONST4(name, val) static const __declspec(align(16)) uint name[4] = { (val), (val), (val), (val) } +#define CONST(name) *(const __m128i *)&name + + SSE_CONST4(mask_nosign, 0x7fff); + SSE_CONST4(mask_justsign, 0x8000); + SSE_CONST4(mask_shifted_exp, 0x7c00 << 13); + SSE_CONST4(expadjust_normal, (127 - 15) << 23); + SSE_CONST4(expadjust_infnan, (128 - 16) << 23); + SSE_CONST4(expadjust_denorm, 1 << 23); + SSE_CONST4(magic_denorm, 113 << 23); + + __m128i mnosign = CONST(mask_nosign); + __m128i expmant = _mm_and_si128(mnosign, h); + __m128i justsign = _mm_and_si128(h, CONST(mask_justsign)); + __m128i mshiftexp = CONST(mask_shifted_exp); + __m128i eadjust = CONST(expadjust_normal); + __m128i shifted = _mm_slli_epi32(expmant, 13); + __m128i adjusted = _mm_add_epi32(eadjust, shifted); + __m128i justexp = _mm_and_si128(shifted, mshiftexp); + + __m128i zero = _mm_setzero_si128(); + __m128i b_isinfnan = _mm_cmpeq_epi32(mshiftexp, justexp); + __m128i b_isdenorm = _mm_cmpeq_epi32(zero, justexp); + + __m128i adj_infnan = _mm_and_si128(b_isinfnan, CONST(expadjust_infnan)); + __m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan); + + __m128i adj_den = CONST(expadjust_denorm); + __m128i den1 = _mm_add_epi32(adj_den, adjusted2); + __m128 den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm); + __m128 adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm)); + __m128 adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2)); + __m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4); + __m128i sign = _mm_slli_epi32(justsign, 16); + __m128 final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign)); + + // ~21 SSE2 ops. + return final; + +#undef SSE_CONST4 +#undef CONST +} + + +void nv::half_to_float_array(const uint16 * vin, float * vout, int count) { + nvDebugCheck((intptr_t(vin) & 15) == 0); + nvDebugCheck((intptr_t(vout) & 15) == 0); + nvDebugCheck((count & 7) == 0); + + __m128i zero = _mm_setzero_si128(); + + for (int i = 0; i < count; i += 8) + { + __m128i in = _mm_loadu_si128((const __m128i *)(vin + i)); + __m128i a = _mm_unpacklo_epi16(in, zero); + __m128i b = _mm_unpackhi_epi16(in, zero); + + __m128 outa = half_to_float4_SSE2(a); + _mm_storeu_ps((float *)(vout + i), outa); + + __m128 outb = half_to_float4_SSE2(b); + _mm_storeu_ps((float *)(vout + i + 4), outb); + } +} + + + + // @@ These tables could be smaller. namespace nv { - uint32 mantissa_table[2048]; + uint32 mantissa_table[2048] = { 0xDEADBEEF }; uint32 exponent_table[64]; uint32 offset_table[64]; } diff --git a/src/nvmath/Half.h b/src/nvmath/Half.h index 53fceb6..962767a 100644 --- a/src/nvmath/Half.h +++ b/src/nvmath/Half.h @@ -9,6 +9,9 @@ namespace nv { uint32 half_to_float( uint16 h ); uint16 half_from_float( uint32 f ); + // vin,vout must be 16 byte aligned. count must be a multiple of 8. + void half_to_float_array(const uint16 * vin, float * vout, int count); + void half_init_tables(); extern uint32 mantissa_table[2048]; @@ -19,6 +22,7 @@ namespace nv { // http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf inline uint32 fast_half_to_float(uint16 h) { + nvDebugCheck(mantissa_table[0] == 0); // Make sure table was initialized. 
uint exp = h >> 10; return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp]; } diff --git a/src/nvmath/Matrix.h b/src/nvmath/Matrix.h index 8f67c0f..5ccfaf5 100644 --- a/src/nvmath/Matrix.h +++ b/src/nvmath/Matrix.h @@ -62,6 +62,7 @@ namespace nv Matrix(); explicit Matrix(float f); explicit Matrix(identity_t); + Matrix(const Matrix3 & m); Matrix(const Matrix & m); Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3); //explicit Matrix(const float m[]); // m is assumed to contain 16 elements diff --git a/src/nvmath/Matrix.inl b/src/nvmath/Matrix.inl index 4843643..b9460a2 100644 --- a/src/nvmath/Matrix.inl +++ b/src/nvmath/Matrix.inl @@ -250,6 +250,19 @@ namespace nv } } + inline Matrix::Matrix(const Matrix3 & m) + { + for(int i = 0; i < 3; i++) { + for(int j = 0; j < 3; j++) { + operator()(i, j) = m.get(i, j); + } + } + for(int i = 0; i < 4; i++) { + operator()(3, i) = 0; + operator()(i, 3) = 0; + } + } + inline Matrix::Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3) { m_data[ 0] = v0.x; m_data[ 1] = v0.y; m_data[ 2] = v0.z; m_data[ 3] = v0.w; diff --git a/src/nvthread/ParallelFor.cpp b/src/nvthread/ParallelFor.cpp index 9ab8144..9632414 100644 --- a/src/nvthread/ParallelFor.cpp +++ b/src/nvthread/ParallelFor.cpp @@ -16,7 +16,7 @@ using namespace nv; #define ENABLE_PARALLEL_FOR 0 #endif -void worker(void * arg) { +static void worker(void * arg) { ParallelFor * owner = (ParallelFor *)arg; while(true) { diff --git a/src/nvtt/ClusterFit.cpp b/src/nvtt/ClusterFit.cpp index a8e08a4..ebe6ac1 100644 --- a/src/nvtt/ClusterFit.cpp +++ b/src/nvtt/ClusterFit.cpp @@ -92,8 +92,8 @@ void ClusterFit::setColourSet(const ColorSet * set) { int p = order[i]; #if NVTT_USE_SIMD - NV_ALIGN_16 Vector4 tmp(values[p] * set->weights[p], set->weights[p]); - m_weighted[i] = SimdVector(tmp.component); + NV_ALIGN_16 Vector4 tmp(values[p], 1); + m_weighted[i] = SimdVector(tmp.component) * SimdVector(set->weights[p]); m_xxsum += m_weighted[i] * m_weighted[i]; m_xsum += m_weighted[i]; #else diff --git a/src/nvtt/CompressorDX9.cpp b/src/nvtt/CompressorDX9.cpp index 53f979c..f5446a0 100644 --- a/src/nvtt/CompressorDX9.cpp +++ b/src/nvtt/CompressorDX9.cpp @@ -40,6 +40,7 @@ #include "nvimage/BlockDXT.h" #include "nvmath/Vector.inl" +#include "nvmath/Color.inl" #include "nvcore/Memory.h" @@ -111,18 +112,15 @@ void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alpha QuickCompress::compressDXT5(rgba, block); } -#if 1 +#if 0 void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) { set.setUniformWeights(); - set.createMinimalSet(false); - - ClusterFit fit; - fit.setMetric(compressionOptions.colorWeight); + set.createMinimalSet(/*ignoreTransparent*/false); BlockDXT1 * block = new(output) BlockDXT1; - if (set.isSingleColor(true)) + if (set.isSingleColor(/*ignoreAlpha*/true)) { Color32 c; c.r = uint8(clamp(set.colors[0].x, 0.0f, 1.0f) * 255); @@ -133,16 +131,19 @@ void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, co } else { + ClusterFit fit; + fit.setMetric(compressionOptions.colorWeight); fit.setColourSet(&set); Vector3 start, end; - fit.compress4(&start, &end); - QuickCompress::outputBlock4(set, start, end, block); if (fit.compress3(&start, &end)) { QuickCompress::outputBlock3(set, start, end, block); } + else { + QuickCompress::outputBlock4(set, start, end, block); + } } } #else @@ -219,16 +220,15 @@ void 
CompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, nvsquish::WeightedClusterFit fit; fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z); - int flags = 0; - if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + int flags = 0; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; - nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); - fit.SetColourSet(&colours, 0); - fit.Compress(&block->color); + nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); + fit.SetColourSet(&colours, 0); + fit.Compress(&block->color); } } - void CompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) { BlockDXT5 * block = new(output) BlockDXT5; diff --git a/src/nvtt/CompressorDX9.h b/src/nvtt/CompressorDX9.h index 8e65ba6..2a6e6fe 100644 --- a/src/nvtt/CompressorDX9.h +++ b/src/nvtt/CompressorDX9.h @@ -64,7 +64,7 @@ namespace nv // Normal CPU compressors. -#if 1 +#if 0 struct CompressorDXT1 : public ColorSetCompressor { virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); diff --git a/src/nvtt/CompressorRGB.cpp b/src/nvtt/CompressorRGB.cpp index 130f5ce..9a52d16 100644 --- a/src/nvtt/CompressorRGB.cpp +++ b/src/nvtt/CompressorRGB.cpp @@ -310,7 +310,7 @@ void PixelFormatConverter::compress(nvtt::AlphaMode /*alphaMode*/, uint w, uint { for (uint y = 0; y < h; y++) { - const float * src = (const float *)data + y * w; + const float * src = (const float *)data + (z * h + y) * w; BitStream stream(dst);
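
Notes on the main changes, with small usage sketches. Anything not visible in the patch above (helper names, sample values, exit codes) is an assumption and is marked as such.

Array<T>::copy() now destroys the existing elements and placement-constructs copies of the source range via the new construct_range() helper in Utils.h. A compressed sketch of that sequence on a raw buffer; the copy_into name and the assumption that the buffer already has capacity for count elements are illustrative:

    #include "nvcore/Utils.h"   // nv::destroy_range, nv::construct_range

    // What Array<T>::copy() does once setArraySize() has reserved enough room:
    // destroy the old elements, then placement-new copies of data[0..count).
    template <typename T>
    void copy_into(T * buffer, unsigned & size, const T * data, unsigned count)
    {
        nv::destroy_range(buffer, 0, size);           // run destructors on the old elements
        size = count;                                 // caller guarantees capacity >= count
        nv::construct_range(buffer, count, 0, data);  // copy-construct the new elements in place
    }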
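
The Debug.cpp changes replace the old `throw "Assertion failed"` aborts with explicit exit codes and add nv::debug::terminate(), which takes the handler critical section, writes a minidump and a crash.txt stack trace, and then exits. A sketch of how a host application might route unrecoverable errors through it; the fatalError wrapper and its exit code are assumptions, only debug::terminate() itself comes from the patch:

    #include <cstdlib>           // EXIT_FAILURE
    #include "nvcore/Debug.h"    // nv::debug::terminate, nvDebug

    // Hypothetical wrapper: log the reason, then let debug::terminate() write the
    // minidump and crash.txt before the process exits with a distinct code.
    static void fatalError(const char * msg)
    {
        nvDebug("FATAL: %s\n", msg);
        nv::debug::terminate(EXIT_FAILURE + 3);   // code choice is arbitrary here
    }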
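
FloatImage::alphaTestCoverage() now takes an alphaScale parameter and estimates coverage by bilinearly subsampling each texel quad 8x8, and scaleAlphaToCoverage() binary-searches the scale in [0, 4] that best preserves the desired coverage. A self-contained sketch of the same idea on a plain float alpha channel; the function names and the n = 8 subsample count mirror the patch, everything else (raw pointer layout, helper names) is an assumption:

    #include <algorithm>   // std::min, std::max

    // Coverage of a w x h alpha channel against alphaRef, after scaling alpha by
    // alphaScale, estimated with n x n bilinear subsamples per texel quad.
    static float alphaTestCoverage(const float * alpha, int w, int h,
                                   float alphaRef, float alphaScale)
    {
        const int n = 8;
        float coverage = 0.0f;

        for (int y = 0; y < h - 1; y++) {
            for (int x = 0; x < w - 1; x++) {
                // Scaled and clamped alpha at the four corners of the texel quad.
                float a00 = std::min(std::max(alpha[(y + 0) * w + x + 0] * alphaScale, 0.0f), 1.0f);
                float a10 = std::min(std::max(alpha[(y + 0) * w + x + 1] * alphaScale, 0.0f), 1.0f);
                float a01 = std::min(std::max(alpha[(y + 1) * w + x + 0] * alphaScale, 0.0f), 1.0f);
                float a11 = std::min(std::max(alpha[(y + 1) * w + x + 1] * alphaScale, 0.0f), 1.0f);

                for (int sy = 0; sy < n; sy++) {
                    for (int sx = 0; sx < n; sx++) {
                        float fx = (sx + 0.5f) / n;
                        float fy = (sy + 0.5f) / n;
                        float a = a00 * (1 - fx) * (1 - fy) + a10 * fx * (1 - fy)
                                + a01 * (1 - fx) * fy       + a11 * fx * fy;
                        if (a > alphaRef) coverage += 1.0f;
                    }
                }
            }
        }
        return coverage / float(w * h * n * n);
    }

    // Binary search for the alpha scale that preserves desiredCoverage, as in
    // FloatImage::scaleAlphaToCoverage() (up to 10 halving steps).
    static float findAlphaScale(const float * alpha, int w, int h,
                                float alphaRef, float desiredCoverage)
    {
        float minScale = 0.0f, maxScale = 4.0f, scale = 1.0f;
        for (int i = 0; i < 10; i++) {
            float c = alphaTestCoverage(alpha, w, h, alphaRef, scale);
            if (c < desiredCoverage)      minScale = scale;
            else if (c > desiredCoverage) maxScale = scale;
            else                          break;
            scale = (minScale + maxScale) * 0.5f;
        }
        return scale;
    }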
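
Half.cpp gains an SSE2 path (half_to_float4_SSE2) plus a batch conversion, nv::half_to_float_array(); per the new header comment, both buffers must be 16-byte aligned and the count a multiple of 8. A usage sketch; the sample half values and the use of C++11 alignas (instead of NVTT's own alignment macros) are assumptions:

    #include "nvmath/Half.h"   // nv::half_to_float_array

    void convertSample()
    {
        // 16-byte aligned buffers, count a multiple of 8, as Half.h requires.
        alignas(16) uint16 in[8]  = { 0x0000, 0x3C00, 0x4000, 0xBC00,    // 0, 1, 2, -1
                                      0x3800, 0x7C00, 0x8000, 0x7BFF };  // 0.5, +inf, -0, 65504
        alignas(16) float  out[8];

        nv::half_to_float_array(in, out, 8);
        // out[1] == 1.0f, out[3] == -1.0f, out[4] == 0.5f, ...
    }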
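
fast_half_to_float() now debug-asserts that the lookup tables were initialized: mantissa_table[0] is seeded with a 0xDEADBEEF sentinel that half_init_tables() overwrites with 0. The table-driven conversion returns the float's bit pattern, so callers reinterpret it; a small sketch, with the halfToFloat helper being illustrative:

    #include <cstring>         // memcpy
    #include "nvmath/Half.h"   // nv::half_init_tables, nv::fast_half_to_float

    static float halfToFloat(uint16 h)
    {
        uint32 bits = nv::fast_half_to_float(h);   // table lookup, returns IEEE-754 bits
        float f;
        memcpy(&f, &bits, sizeof(f));              // reinterpret the bits as a float
        return f;
    }

    void example()
    {
        nv::half_init_tables();            // must run once before any fast conversion
        float one = halfToFloat(0x3C00);   // 1.0f
        (void)one;
    }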
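
The CompressorRGB change makes the source row pointer account for the z slice when compressing volume images: for a tightly packed w x h x d float image, the row at (y, z) starts at element (z * h + y) * w. As a small single-channel helper (name is illustrative):

    // Start of the row at (y, z) in a tightly packed w x h x d single-channel float image.
    inline const float * rowPtr(const float * data, unsigned w, unsigned h, unsigned y, unsigned z)
    {
        return data + (z * h + y) * w;
    }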