From d019cd7080144a7881f49919b426e8be8c497780 Mon Sep 17 00:00:00 2001
From: castano
Date: Tue, 4 Nov 2014 17:49:29 +0000
Subject: [PATCH] Merge changes from the witness.

---
 src/nvcore/Array.h                          |   4 +
 src/nvcore/Array.inl                        |  14 +-
 src/nvcore/Debug.cpp                        | 198 +++++----
 src/nvcore/Debug.h                          |   6 +-
 src/nvcore/DefsGnucLinux.h                  |  36 +-
 src/nvcore/DefsVcWin32.h                    |   2 +-
 src/nvcore/FileSystem.cpp                   |  13 +
 src/nvcore/ForEach.h                        |   1 +
 src/nvcore/Memory.h                         |   4 +
 src/nvcore/StdStream.h                      |  11 +-
 src/nvcore/StrLib.cpp                       |  49 ++-
 src/nvcore/StrLib.h                         | 110 ++++-
 src/nvcore/Stream.h                         |   2 +-
 src/nvcore/TextWriter.cpp                   |   4 +-
 src/nvcore/TextWriter.h                     |  10 +-
 src/nvcore/Utils.h                          | 144 +++---
 src/nvcore/nvcore.h                         |  17 +-
 src/nvimage/BlockDXT.cpp                    |  28 +-
 src/nvimage/BlockDXT.h                      |   4 +-
 src/nvimage/ColorBlock.cpp                  |  81 +++-
 src/nvimage/ColorBlock.h                    |  37 +-
 src/nvimage/ColorSpace.cpp                  |   7 +-
 src/nvimage/DirectDrawSurface.cpp           |  53 +--
 src/nvimage/FloatImage.cpp                  |   1 +
 src/nvimage/Image.cpp                       |  41 +-
 src/nvimage/Image.h                         |   2 +
 src/nvimage/ImageIO.cpp                     | 123 +++++-
 src/nvmath/Box.inl                          |   2 +-
 src/nvmath/Color.h                          |   1 +
 src/nvmath/Color.inl                        |  25 +-
 src/nvmath/Fitting.cpp                      |  17 +-
 src/nvmath/Half.cpp                         |  76 ++--
 src/nvmath/Matrix.cpp                       | 139 +++++-
 src/nvmath/Matrix.h                         |  12 +-
 src/nvmath/Matrix.inl                       |  64 ++-
 src/nvmath/SimdVector.h                     |  14 +-
 src/nvmath/Vector.h                         |   2 +-
 src/nvmath/Vector.inl                       |  11 +-
 src/nvmath/ftoi.h                           | 256 +++++++++++
 src/nvmath/nvmath.h                         |  73 ++--
 src/nvthread/ThreadPool.cpp                 |   2 +-
 src/nvtt/BlockCompressor.cpp                |   8 +
 src/nvtt/ClusterFit.cpp                     | 159 ++++++-
 src/nvtt/ClusterFit.h                       |   9 +-
 src/nvtt/CompressionOptions.cpp             |   9 +-
 src/nvtt/CompressorDX10.cpp                 |  75 +++-
 src/nvtt/CompressorDX10.h                   |   7 +
 src/nvtt/CompressorDX11.cpp                 |  89 ++--
 src/nvtt/CompressorDX9.cpp                  | 344 ++++++++++++++-
 src/nvtt/CompressorDX9.h                    |   8 +-
 src/nvtt/CompressorDXT1.cpp                 | 461 ++++++++++++++++++++
 src/nvtt/CompressorDXT1.h                   |  38 ++
 src/nvtt/CompressorRGB.cpp                  |  15 +-
 src/nvtt/Context.cpp                        |  52 ++-
 src/nvtt/CubeSurface.cpp                    |   8 +-
 src/nvtt/InputOptions.cpp                   |  12 +-
 src/nvtt/OptimalCompressDXT.cpp             | 360 ++++++++++++---
 src/nvtt/OptimalCompressDXT.h               |  19 +-
 src/nvtt/QuickCompressDXT.cpp               | 305 +++++++++++--
 src/nvtt/QuickCompressDXT.h                 |  13 +-
 src/nvtt/Surface.cpp                        | 234 +++++++---
 src/nvtt/bc6h/bits.h                        |   5 +-
 src/nvtt/bc6h/tile.h                        |   2 +-
 src/nvtt/bc6h/{utils.cpp => zoh_utils.cpp}  |   2 +-
 src/nvtt/bc6h/{utils.h => zoh_utils.h}      |   0
 src/nvtt/bc6h/zohone.cpp                    |  21 +-
 src/nvtt/bc6h/zohtwo.cpp                    |   5 +-
 src/nvtt/bc7/avpcl_mode0.cpp                |  49 ++-
 src/nvtt/bc7/avpcl_mode1.cpp                |  53 ++-
 src/nvtt/bc7/avpcl_mode2.cpp                |  48 +-
 src/nvtt/bc7/avpcl_mode3.cpp                |  49 ++-
 src/nvtt/bc7/avpcl_mode4.cpp                |  53 ++-
 src/nvtt/bc7/avpcl_mode5.cpp                |  53 ++-
 src/nvtt/bc7/avpcl_mode6.cpp                |  45 +-
 src/nvtt/bc7/avpcl_mode7.cpp                |  45 +-
 src/nvtt/bc7/{utils.cpp => avpcl_utils.cpp} |  10 +-
 src/nvtt/bc7/{utils.h => avpcl_utils.h}     |   0
 src/nvtt/bc7/bits.h                         |   3 +-
 src/nvtt/bc7/tile.h                         |   3 +-
 src/nvtt/cuda/CompressKernel.cu             |   6 +-
 src/nvtt/nvtt.h                             |  27 +-
 src/nvtt/tests/testsuite.cpp                |   2 +-
 src/nvtt/tools/assemble.cpp                 |   2 +-
 src/nvtt/tools/compress.cpp                 |  10 +-
 src/nvtt/tools/imgdiff.cpp                  |   4 +-
 src/nvtt/tools/resize.cpp                   |   2 +-
 86 files changed, 3536 insertions(+), 884 deletions(-)
 create mode 100755 src/nvmath/ftoi.h
 create mode 100644 src/nvtt/CompressorDXT1.cpp
 create mode 100644 src/nvtt/CompressorDXT1.h
 rename src/nvtt/bc6h/{utils.cpp => zoh_utils.cpp} (96%)
 rename src/nvtt/bc6h/{utils.h => zoh_utils.h} (100%)
 rename src/nvtt/bc7/{utils.cpp => avpcl_utils.cpp} (93%)
 rename src/nvtt/bc7/{utils.h => avpcl_utils.h} (100%)

diff --git
a/src/nvcore/Array.h b/src/nvcore/Array.h index 984aa90..2e332fb 100644 --- a/src/nvcore/Array.h +++ b/src/nvcore/Array.h @@ -96,8 +96,11 @@ namespace nv /// Get vector pointer. NV_FORCEINLINE T * buffer() { return m_buffer; } + /// Provide begin/end pointers for C++11 range-based for loops. NV_FORCEINLINE T * begin() { return m_buffer; } NV_FORCEINLINE T * end() { return m_buffer + m_size; } + NV_FORCEINLINE const T * begin() const { return m_buffer; } + NV_FORCEINLINE const T * end() const { return m_buffer + m_size; } /// Is vector empty. NV_FORCEINLINE bool isEmpty() const { return m_size == 0; } @@ -106,6 +109,7 @@ namespace nv NV_FORCEINLINE bool isNull() const { return m_buffer == NULL; } + T & append(); void push_back( const T & val ); void pushBack( const T & val ); Array & append( const T & val ); diff --git a/src/nvcore/Array.inl b/src/nvcore/Array.inl index f7369bc..2f7cc44 100755 --- a/src/nvcore/Array.inl +++ b/src/nvcore/Array.inl @@ -16,6 +16,18 @@ namespace nv { + template + NV_FORCEINLINE T & Array::append() + { + uint old_size = m_size; + uint new_size = m_size + 1; + + setArraySize(new_size); + + construct_range(m_buffer, new_size, old_size); + + return m_buffer[old_size]; // Return reference to last element. + } // Push an element at the end of the vector. template @@ -211,7 +223,7 @@ namespace nv void Array::replaceWithLast(uint index) { nvDebugCheck( index < m_size ); - nv::swap(m_buffer[index], back()); + nv::swap(m_buffer[index], back()); // @@ Is this OK when index == size-1? (m_buffer+m_size-1)->~T(); m_size--; } diff --git a/src/nvcore/Debug.cpp b/src/nvcore/Debug.cpp index ebf77b3..99e7d39 100644 --- a/src/nvcore/Debug.cpp +++ b/src/nvcore/Debug.cpp @@ -66,6 +66,10 @@ # endif #endif +#if NV_OS_ORBIS +#include +#endif + #define NV_USE_SEPARATE_THREAD 1 @@ -263,7 +267,7 @@ namespace } /*static NV_NOINLINE int backtrace(void * trace[], int maxcount) { - + // In Windows XP and Windows Server 2003, the sum of the FramesToSkip and FramesToCapture parameters must be less than 63. 
int xp_maxcount = min(63-1, maxcount); @@ -274,7 +278,7 @@ namespace }*/ static NV_NOINLINE int backtraceWithSymbols(CONTEXT * ctx, void * trace[], int maxcount, int skip = 0) { - + // Init the stack frame for this function STACKFRAME64 stackFrame = { 0 }; @@ -344,74 +348,74 @@ namespace StringBuilder builder(512); HANDLE hProcess = GetCurrentProcess(); - - // Resolve PC to function names - for (int i = start; i < size; i++) - { - // Check for end of stack walk - DWORD64 ip = (DWORD64)trace[i]; - if (ip == NULL) - break; - - // Get function name - #define MAX_STRING_LEN (512) - unsigned char byBuffer[sizeof(IMAGEHLP_SYMBOL64) + MAX_STRING_LEN] = { 0 }; - IMAGEHLP_SYMBOL64 * pSymbol = (IMAGEHLP_SYMBOL64*)byBuffer; - pSymbol->SizeOfStruct = sizeof(IMAGEHLP_SYMBOL64); - pSymbol->MaxNameLength = MAX_STRING_LEN; - - DWORD64 dwDisplacement; - - if (SymGetSymFromAddr64(hProcess, ip, &dwDisplacement, pSymbol)) - { - pSymbol->Name[MAX_STRING_LEN-1] = 0; - - /* - // Make the symbol readable for humans - UnDecorateSymbolName( pSym->Name, lpszNonUnicodeUnDSymbol, BUFFERSIZE, - UNDNAME_COMPLETE | - UNDNAME_NO_THISTYPE | - UNDNAME_NO_SPECIAL_SYMS | - UNDNAME_NO_MEMBER_TYPE | - UNDNAME_NO_MS_KEYWORDS | - UNDNAME_NO_ACCESS_SPECIFIERS ); - */ - - // pSymbol->Name - const char * pFunc = pSymbol->Name; - - // Get file/line number - IMAGEHLP_LINE64 theLine = { 0 }; - theLine.SizeOfStruct = sizeof(theLine); - - DWORD dwDisplacement; - if (!SymGetLineFromAddr64(hProcess, ip, &dwDisplacement, &theLine)) - { + + // Resolve PC to function names + for (int i = start; i < size; i++) + { + // Check for end of stack walk + DWORD64 ip = (DWORD64)trace[i]; + if (ip == NULL) + break; + + // Get function name + #define MAX_STRING_LEN (512) + unsigned char byBuffer[sizeof(IMAGEHLP_SYMBOL64) + MAX_STRING_LEN] = { 0 }; + IMAGEHLP_SYMBOL64 * pSymbol = (IMAGEHLP_SYMBOL64*)byBuffer; + pSymbol->SizeOfStruct = sizeof(IMAGEHLP_SYMBOL64); + pSymbol->MaxNameLength = MAX_STRING_LEN; + + DWORD64 dwDisplacement; + + if (SymGetSymFromAddr64(hProcess, ip, &dwDisplacement, pSymbol)) + { + pSymbol->Name[MAX_STRING_LEN-1] = 0; + + /* + // Make the symbol readable for humans + UnDecorateSymbolName( pSym->Name, lpszNonUnicodeUnDSymbol, BUFFERSIZE, + UNDNAME_COMPLETE | + UNDNAME_NO_THISTYPE | + UNDNAME_NO_SPECIAL_SYMS | + UNDNAME_NO_MEMBER_TYPE | + UNDNAME_NO_MS_KEYWORDS | + UNDNAME_NO_ACCESS_SPECIFIERS ); + */ + + // pSymbol->Name + const char * pFunc = pSymbol->Name; + + // Get file/line number + IMAGEHLP_LINE64 theLine = { 0 }; + theLine.SizeOfStruct = sizeof(theLine); + + DWORD dwDisplacement; + if (!SymGetLineFromAddr64(hProcess, ip, &dwDisplacement, &theLine)) + { // Do not print unknown symbols anymore. 
break; //builder.format("unknown(%08X) : %s\n", (uint32)ip, pFunc); - } - else - { - /* - const char* pFile = strrchr(theLine.FileName, '\\'); - if ( pFile == NULL ) pFile = theLine.FileName; - else pFile++; - */ - const char * pFile = theLine.FileName; - - int line = theLine.LineNumber; - + } + else + { + /* + const char* pFile = strrchr(theLine.FileName, '\\'); + if ( pFile == NULL ) pFile = theLine.FileName; + else pFile++; + */ + const char * pFile = theLine.FileName; + + int line = theLine.LineNumber; + builder.format("%s(%d) : %s\n", pFile, line, pFunc); - } + } lines.append(builder.release()); if (pFunc != NULL && strcmp(pFunc, "WinMain") == 0) { break; } - } - } + } + } } @@ -479,42 +483,37 @@ namespace TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 8); } - static void handleInvalidParameter(const wchar_t * expresion, const wchar_t * function, const wchar_t * file, unsigned int line, uintptr_t reserved) { + static void handleInvalidParameter(const wchar_t * wexpresion, const wchar_t * wfunction, const wchar_t * wfile, unsigned int line, uintptr_t reserved) { size_t convertedCharCount = 0; - StringBuilder tmp; - - if (expresion != NULL) { - uint size = toU32(wcslen(expresion) + 1); - tmp.reserve(size); - wcstombs_s(&convertedCharCount, tmp.str(), size, expresion, _TRUNCATE); - - nvDebug("*** Invalid parameter: %s\n", tmp.str()); - - if (file != NULL) { - size = toU32(wcslen(file) + 1); - tmp.reserve(size); - wcstombs_s(&convertedCharCount, tmp.str(), size, file, _TRUNCATE); - - nvDebug(" On file: %s\n", tmp.str()); - - if (function != NULL) { - size = toU32(wcslen(function) + 1); - tmp.reserve(size); - wcstombs_s(&convertedCharCount, tmp.str(), size, function, _TRUNCATE); - - nvDebug(" On function: %s\n", tmp.str()); - } + + StringBuilder expresion; + if (wexpresion != NULL) { + uint size = U32(wcslen(wexpresion) + 1); + expresion.reserve(size); + wcstombs_s(&convertedCharCount, expresion.str(), size, wexpresion, _TRUNCATE); + } - nvDebug(" On line: %u\n", line); - } + StringBuilder file; + if (wfile != NULL) { + uint size = U32(wcslen(wfile) + 1); + file.reserve(size); + wcstombs_s(&convertedCharCount, file.str(), size, wfile, _TRUNCATE); } - nvDebugBreak(); - TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 8); + StringBuilder function; + if (wfunction != NULL) { + uint size = U32(wcslen(wfunction) + 1); + function.reserve(size); + wcstombs_s(&convertedCharCount, function.str(), size, wfunction, _TRUNCATE); + } + + int result = nvAbort(expresion.str(), file.str(), line, function.str()); + if (result == NV_ABORT_DEBUG) { + nvDebugBreak(); + } } - #elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H) // NV_OS_LINUX || NV_OS_DARWIN #if defined(HAVE_EXECINFO_H) @@ -770,7 +769,7 @@ namespace if (s_interactive) { flushMessageQueue(); - int action = MessageBoxA(NULL, error_string.str(), "Assertion failed", MB_ABORTRETRYIGNORE|MB_ICONERROR); + int action = MessageBoxA(NULL, error_string.str(), "Assertion failed", MB_ABORTRETRYIGNORE | MB_ICONERROR | MB_TOPMOST); switch( action ) { case IDRETRY: ret = NV_ABORT_DEBUG; @@ -851,11 +850,10 @@ namespace printStackTrace(trace, size, 2); }*/ - //SBtodoORBIS check for debugger present - //if (debug::isDebuggerPresent()) - nvDebugBreak(); + if (debug::isDebuggerPresent()) + return NV_ABORT_DEBUG; - return NV_ABORT_DEBUG; + return NV_ABORT_IGNORE; } }; @@ -892,9 +890,9 @@ namespace #endif if( ret == NV_ABORT_EXIT ) { - // Exit cleanly. - exit(EXIT_FAILURE + 1); - } + // Exit cleanly. 
+ exit(EXIT_FAILURE + 1); + } return ret; } @@ -1190,6 +1188,12 @@ bool debug::isDebuggerPresent() #else return false; #endif +#elif NV_OS_ORBIS + #if PS4_FINAL_REQUIREMENTS + return false; + #else + return sceDbgIsDebuggerAttached() == 1; + #endif #elif NV_OS_DARWIN int mib[4]; struct kinfo_proc info; diff --git a/src/nvcore/Debug.h b/src/nvcore/Debug.h index c987e10..48e765e 100644 --- a/src/nvcore/Debug.h +++ b/src/nvcore/Debug.h @@ -34,7 +34,9 @@ # if NV_CC_MSVC // @@ Does this work in msvc-6 and earlier? # define nvDebugBreak() __debugbreak() - //#define nvDebugBreak() __asm { int 3 } +//# define nvDebugBreak() __asm { int 3 } +# elif NV_OS_ORBIS +# define nvDebugBreak() __debugbreak() # elif NV_CC_GNUC # define nvDebugBreak() __builtin_trap() # else @@ -158,7 +160,7 @@ #endif -NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL, const char * msg = NULL, ...); +NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL, const char * msg = NULL, ...) __attribute__((format (printf, 5, 6))); NVCORE_API void NV_CDECL nvDebugPrint( const char *msg, ... ) __attribute__((format (printf, 1, 2))); namespace nv diff --git a/src/nvcore/DefsGnucLinux.h b/src/nvcore/DefsGnucLinux.h index 79a64de..f7f9281 100644 --- a/src/nvcore/DefsGnucLinux.h +++ b/src/nvcore/DefsGnucLinux.h @@ -8,24 +8,30 @@ // Function linkage #define DLL_IMPORT #if __GNUC__ >= 4 -# define DLL_EXPORT __attribute__((visibility("default"))) -# define DLL_EXPORT_CLASS DLL_EXPORT +# define DLL_EXPORT __attribute__((visibility("default"))) +# define DLL_EXPORT_CLASS DLL_EXPORT #else -# define DLL_EXPORT -# define DLL_EXPORT_CLASS +# define DLL_EXPORT +# define DLL_EXPORT_CLASS #endif // Function calling modes #if NV_CPU_X86 -# define NV_CDECL __attribute__((cdecl)) -# define NV_STDCALL __attribute__((stdcall)) +# define NV_CDECL __attribute__((cdecl)) +# define NV_STDCALL __attribute__((stdcall)) #else -# define NV_CDECL -# define NV_STDCALL +# define NV_CDECL +# define NV_STDCALL #endif #define NV_FASTCALL __attribute__((fastcall)) -#define NV_FORCEINLINE __attribute__((always_inline)) inline +//#if __GNUC__ > 3 +// It seems that GCC does not assume always_inline implies inline. I think this depends on the GCC version :( +#define NV_FORCEINLINE inline __attribute__((always_inline)) +//#else +// Some compilers complain that inline and always_inline are redundant. +//#define NV_FORCEINLINE __attribute__((always_inline)) +//#endif #define NV_DEPRECATED __attribute__((deprecated)) #define NV_THREAD_LOCAL __thread @@ -41,13 +47,13 @@ // Define __FUNC__ properly. #if __STDC_VERSION__ < 199901L -# if __GNUC__ >= 2 -# define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__ -# else -# define __FUNC__ "" -# endif +# if __GNUC__ >= 2 +# define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__ +# else +# define __FUNC__ "" +# endif #else -# define __FUNC__ __PRETTY_FUNCTION__ +# define __FUNC__ __PRETTY_FUNCTION__ #endif #define restrict __restrict__ diff --git a/src/nvcore/DefsVcWin32.h b/src/nvcore/DefsVcWin32.h index 8b4b970..723965e 100644 --- a/src/nvcore/DefsVcWin32.h +++ b/src/nvcore/DefsVcWin32.h @@ -26,7 +26,7 @@ #define chdir _chdir #define getcwd _getcwd -#if _MSC_VER < 1800 // Not sure what version introduced this. +#if _MSC_VER < 1800 // Not sure what version introduced this. 
#define va_copy(a, b) (a) = (b) #endif diff --git a/src/nvcore/FileSystem.cpp b/src/nvcore/FileSystem.cpp index be19b92..bf64c28 100644 --- a/src/nvcore/FileSystem.cpp +++ b/src/nvcore/FileSystem.cpp @@ -9,6 +9,8 @@ #include // _mkdir #elif NV_OS_XBOX #include +#elif NV_OS_ORBIS +#include #else #include #include @@ -29,6 +31,11 @@ bool FileSystem::exists(const char * path) // PathFileExists requires linking to shlwapi.lib //return PathFileExists(path) != 0; return GetFileAttributesA(path) != INVALID_FILE_ATTRIBUTES; +#elif NV_OS_ORBIS + const int BUFFER_SIZE = 2048; + char file_fullpath[BUFFER_SIZE]; + snprintf(file_fullpath, BUFFER_SIZE, "/app0/%s", path); + return sceFiosExistsSync(NULL, file_fullpath); #else if (FILE * fp = fopen(path, "r")) { @@ -43,6 +50,9 @@ bool FileSystem::createDirectory(const char * path) { #if NV_OS_WIN32 || NV_OS_XBOX return CreateDirectoryA(path, NULL) != 0; +#elif NV_OS_ORBIS + // not implemented + return false; #else return mkdir(path, 0777) != -1; #endif @@ -55,6 +65,9 @@ bool FileSystem::changeDirectory(const char * path) #elif NV_OS_XBOX // Xbox doesn't support Current Working Directory! return false; +#elif NV_OS_ORBIS + // Orbis doesn't support Current Working Directory! + return false; #else return chdir(path) != -1; #endif diff --git a/src/nvcore/ForEach.h b/src/nvcore/ForEach.h index 71a573d..6a86aca 100644 --- a/src/nvcore/ForEach.h +++ b/src/nvcore/ForEach.h @@ -50,6 +50,7 @@ struct PseudoIndexWrapper { // Declare foreach keyword. #if !defined NV_NO_USE_KEYWORDS # define foreach NV_FOREACH +# define foreach_index NV_FOREACH #endif diff --git a/src/nvcore/Memory.h b/src/nvcore/Memory.h index 79ce674..a7fe197 100644 --- a/src/nvcore/Memory.h +++ b/src/nvcore/Memory.h @@ -55,6 +55,10 @@ namespace nv { ::free((void *)ptr); } + template NV_FORCEINLINE void zero(T & data) { + memset(&data, 0, sizeof(T)); + } + } // nv namespace #endif // NV_CORE_MEMORY_H diff --git a/src/nvcore/StdStream.h b/src/nvcore/StdStream.h index 08f399d..dbebff2 100644 --- a/src/nvcore/StdStream.h +++ b/src/nvcore/StdStream.h @@ -37,7 +37,7 @@ namespace nv public: /// Ctor. - StdStream( FILE * fp, bool autoclose=true ) : m_fp(fp), m_autoclose(autoclose) { } + StdStream( FILE * fp, bool autoclose ) : m_fp(fp), m_autoclose(autoclose) { } /// Dtor. virtual ~StdStream() @@ -108,7 +108,8 @@ namespace nv // implementation uses use ftell and fseek to determine our location within the file. virtual bool isAtEnd() const { - nvDebugCheck(m_fp != NULL); + if (m_fp == NULL) return true; + //nvDebugCheck(m_fp != NULL); //return feof( m_fp ) != 0; #if NV_OS_WIN32 uint pos = _ftell_nolock(m_fp); @@ -143,10 +144,10 @@ namespace nv public: /// Construct stream by file name. - StdOutputStream( const char * name ) : StdStream(fileOpen(name, "wb")) { } + StdOutputStream( const char * name ) : StdStream(fileOpen(name, "wb"), /*autoclose=*/true) { } /// Construct stream by file handle. - StdOutputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose) + StdOutputStream( FILE * fp, bool autoclose ) : StdStream(fp, autoclose) { } @@ -193,7 +194,7 @@ namespace nv public: /// Construct stream by file name. - StdInputStream( const char * name ) : StdStream(fileOpen(name, "rb")) { } + StdInputStream( const char * name ) : StdStream(fileOpen(name, "rb"), /*autoclose=*/true) { } /// Construct stream by file handle. 
StdInputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose) diff --git a/src/nvcore/StrLib.cpp b/src/nvcore/StrLib.cpp index 01c498c..3835061 100644 --- a/src/nvcore/StrLib.cpp +++ b/src/nvcore/StrLib.cpp @@ -73,17 +73,17 @@ namespace uint nv::strLen(const char * str) { nvDebugCheck(str != NULL); - return toU32(strlen(str)); + return U32(strlen(str)); } -int nv::strCmp(const char * s1, const char * s2) +int nv::strDiff(const char * s1, const char * s2) { nvDebugCheck(s1 != NULL); nvDebugCheck(s2 != NULL); return strcmp(s1, s2); } -int nv::strCaseCmp(const char * s1, const char * s2) +int nv::strCaseDiff(const char * s1, const char * s2) { nvDebugCheck(s1 != NULL); nvDebugCheck(s1 != NULL); @@ -98,14 +98,14 @@ bool nv::strEqual(const char * s1, const char * s2) { if (s1 == s2) return true; if (s1 == NULL || s2 == NULL) return false; - return strCmp(s1, s2) == 0; + return strcmp(s1, s2) == 0; } bool nv::strCaseEqual(const char * s1, const char * s2) { if (s1 == s2) return true; if (s1 == NULL || s2 == NULL) return false; - return strCaseCmp(s1, s2) == 0; + return strCaseDiff(s1, s2) == 0; } bool nv::strBeginsWith(const char * str, const char * prefix) @@ -122,7 +122,7 @@ bool nv::strEndsWith(const char * str, const char * suffix) return strncmp(str + ml - sl, suffix, sl) == 0; } - +// @@ Add asserts to detect overlap between dst and src? void nv::strCpy(char * dst, uint size, const char * src) { nvDebugCheck(dst != NULL); @@ -142,8 +142,9 @@ void nv::strCpy(char * dst, uint size, const char * src, uint len) #if NV_CC_MSVC && _MSC_VER >= 1400 strncpy_s(dst, size, src, len); #else - NV_UNUSED(size); - strncpy(dst, src, len); + int n = min(len+1, size); + strncpy(dst, src, n); + dst[n-1] = '\0'; #endif } @@ -220,6 +221,13 @@ match: } } +bool nv::isNumber(const char * str) { + while(*str != '\0') { + if (!isDigit(*str)) return false; + str++; + } + return true; +} /** Empty string. */ @@ -326,24 +334,19 @@ StringBuilder & StringBuilder::formatList( const char * fmt, va_list arg ) /** Append a string. */ StringBuilder & StringBuilder::append( const char * s ) { - nvDebugCheck(s != NULL); + return append(s, U32(strlen( s ))); +} - const uint slen = uint(strlen( s )); - if (m_str == NULL) { - m_size = slen + 1; - m_str = strAlloc(m_size); - memcpy(m_str, s, m_size); - } - else { - const uint len = uint(strlen( m_str )); - if (m_size < len + slen + 1) { - m_size = len + slen + 1; - m_str = strReAlloc(m_str, m_size); - } +/** Append a string. */ +StringBuilder & StringBuilder::append(const char * s, uint len) +{ + nvDebugCheck(s != NULL); - memcpy(m_str + len, s, slen + 1); - } + uint offset = length(); + const uint size = offset + len + 1; + reserve(size); + strCpy(m_str + offset, len + 1, s, len); return *this; } diff --git a/src/nvcore/StrLib.h b/src/nvcore/StrLib.h index bc0d05d..5d3d248 100644 --- a/src/nvcore/StrLib.h +++ b/src/nvcore/StrLib.h @@ -35,11 +35,11 @@ namespace nv uint operator()(const char * str) const { return strHash(str); } }; - NVCORE_API uint strLen(const char * str) NV_PURE; + NVCORE_API uint strLen(const char * str) NV_PURE; // Asserts on NULL strings. - NVCORE_API int strCmp(const char * s1, const char * s2) NV_PURE; - NVCORE_API int strCaseCmp(const char * s1, const char * s2) NV_PURE; - NVCORE_API bool strEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings. + NVCORE_API int strDiff(const char * s1, const char * s2) NV_PURE; // Asserts on NULL strings. 
+ NVCORE_API int strCaseDiff(const char * s1, const char * s2) NV_PURE; // Asserts on NULL strings. + NVCORE_API bool strEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings. NVCORE_API bool strCaseEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings. template <> struct Equal { @@ -56,6 +56,35 @@ namespace nv NVCORE_API bool strMatch(const char * str, const char * pat) NV_PURE; + NVCORE_API bool isNumber(const char * str) NV_PURE; + + /* @@ Implement these two functions and modify StringBuilder to use them? + NVCORE_API void strFormat(const char * dst, const char * fmt, ...); + NVCORE_API void strFormatList(const char * dst, const char * fmt, va_list arg); + + template void strFormatSafe(char (&buffer)[count], const char *fmt, ...) __attribute__((format (printf, 2, 3))); + template void strFormatSafe(char (&buffer)[count], const char *fmt, ...) { + va_list args; + va_start(args, fmt); + strFormatList(buffer, count, fmt, args); + va_end(args); + } + template void strFormatListSafe(char (&buffer)[count], const char *fmt, va_list arg) { + va_list tmp; + va_copy(tmp, args); + strFormatList(buffer, count, fmt, tmp); + va_end(tmp); + }*/ + + template void strCpySafe(char (&buffer)[count], const char *src) { + strCpy(buffer, count, src); + } + + template void strCatSafe(char (&buffer)[count], const char * src) { + strCat(buffer, count, src); + } + + /// String builder. class NVCORE_CLASS StringBuilder @@ -73,9 +102,10 @@ namespace nv StringBuilder & format( const char * format, ... ) __attribute__((format (printf, 2, 3))); StringBuilder & formatList( const char * format, va_list arg ); - StringBuilder & append( const char * str ); - StringBuilder & appendFormat( const char * format, ... ) __attribute__((format (printf, 2, 3))); - StringBuilder & appendFormatList( const char * format, va_list arg ); + StringBuilder & append(const char * str); + StringBuilder & append(const char * str, uint len); + StringBuilder & appendFormat(const char * format, ...) __attribute__((format (printf, 2, 3))); + StringBuilder & appendFormatList(const char * format, va_list arg); StringBuilder & appendSpace(uint n); @@ -162,9 +192,9 @@ namespace nv void stripExtension(); // statics - static char separator(); - static const char * fileName(const char *); - static const char * extension(const char *); + NVCORE_API static char separator(); + NVCORE_API static const char * fileName(const char *); + NVCORE_API static const char * extension(const char *); }; @@ -328,6 +358,66 @@ namespace nv uint operator()(const String & str) const { return str.hash(); } }; + + // Like AutoPtr, but for const char strings. + class AutoString + { + NV_FORBID_COPY(AutoString); + NV_FORBID_HEAPALLOC(); + public: + + // Ctor. + AutoString(const char * p = NULL) : m_ptr(p) { } + +#if NV_CC_CPP11 + // Move ctor. + AutoString(AutoString && ap) : m_ptr(ap.m_ptr) { ap.m_ptr = NULL; } +#endif + + // Dtor. Deletes owned pointer. + ~AutoString() { + delete [] m_ptr; + m_ptr = NULL; + } + + // Delete owned pointer and assign new one. + void operator=(const char * p) { + if (p != m_ptr) + { + delete [] m_ptr; + m_ptr = p; + } + } + + // Get pointer. + const char * ptr() const { return m_ptr; } + operator const char *() const { return m_ptr; } + + // Relinquish ownership of the underlying pointer and returns that pointer. + const char * release() { + const char * tmp = m_ptr; + m_ptr = NULL; + return tmp; + } + + // comparison operators. 
+ friend bool operator == (const AutoString & ap, const char * const p) { + return (ap.ptr() == p); + } + friend bool operator != (const AutoString & ap, const char * const p) { + return (ap.ptr() != p); + } + friend bool operator == (const char * const p, const AutoString & ap) { + return (ap.ptr() == p); + } + friend bool operator != (const char * const p, const AutoString & ap) { + return (ap.ptr() != p); + } + + private: + const char * m_ptr; + }; + } // nv namespace #endif // NV_CORE_STRING_H diff --git a/src/nvcore/Stream.h b/src/nvcore/Stream.h index d45cd6c..513cd0c 100644 --- a/src/nvcore/Stream.h +++ b/src/nvcore/Stream.h @@ -78,7 +78,7 @@ namespace nv // friends friend Stream & operator<<( Stream & s, bool & c ) { -#if NV_OS_DARWIN +#if NV_OS_DARWIN && !NV_CC_CPP11 nvStaticCheck(sizeof(bool) == 4); uint8 b = c ? 1 : 0; s.serialize( &b, 1 ); diff --git a/src/nvcore/TextWriter.cpp b/src/nvcore/TextWriter.cpp index 5a57c43..67937f4 100644 --- a/src/nvcore/TextWriter.cpp +++ b/src/nvcore/TextWriter.cpp @@ -26,7 +26,7 @@ void TextWriter::writeString(const char * str, uint len) s->serialize(const_cast(str), len); } -void TextWriter::write(const char * format, ...) +void TextWriter::format(const char * format, ...) { va_list arg; va_start(arg,format); @@ -35,7 +35,7 @@ void TextWriter::write(const char * format, ...) va_end(arg); } -void TextWriter::write(const char * format, va_list arg) +void TextWriter::formatList(const char * format, va_list arg) { va_list tmp; va_copy(tmp, arg); diff --git a/src/nvcore/TextWriter.h b/src/nvcore/TextWriter.h index 8fde3bb..3aac8ad 100644 --- a/src/nvcore/TextWriter.h +++ b/src/nvcore/TextWriter.h @@ -20,8 +20,8 @@ namespace nv void writeString(const char * str); void writeString(const char * str, uint len); - void write(const char * format, ...) __attribute__((format (printf, 2, 3))); - void write(const char * format, va_list arg); + void format(const char * format, ...) __attribute__((format (printf, 2, 3))); + void formatList(const char * format, va_list arg); private: @@ -35,19 +35,19 @@ namespace nv inline TextWriter & operator<<( TextWriter & tw, int i) { - tw.write("%d", i); + tw.format("%d", i); return tw; } inline TextWriter & operator<<( TextWriter & tw, uint i) { - tw.write("%u", i); + tw.format("%u", i); return tw; } inline TextWriter & operator<<( TextWriter & tw, float f) { - tw.write("%f", f); + tw.format("%f", f); return tw; } diff --git a/src/nvcore/Utils.h b/src/nvcore/Utils.h index 29ae96b..79f4ef7 100644 --- a/src/nvcore/Utils.h +++ b/src/nvcore/Utils.h @@ -29,78 +29,96 @@ #define NV_HALF_MAX 65504.0F #define NV_FLOAT_MAX 3.402823466e+38F +#define NV_INTEGER_TO_FLOAT_MAX 16777217 // Largest integer such that it and all smaller integers can be stored in a 32bit float. + namespace nv { // Less error prone than casting. From CB: // http://cbloomrants.blogspot.com/2011/06/06-17-11-c-casting-is-devil.html + // These intentionally look like casts. 
+ // uint32 casts: - template inline uint32 toU32(T x) { return x; } - template <> inline uint32 toU32(uint64 x) { nvDebugCheck(x <= NV_UINT32_MAX); return (uint32)x; } - template <> inline uint32 toU32(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT32_MAX); return (uint32)x; } - //template <> inline uint32 toU32(uint32 x) { return x; } - template <> inline uint32 toU32(int32 x) { nvDebugCheck(x >= 0); return (uint32)x; } - //template <> inline uint32 toU32(uint16 x) { return x; } - template <> inline uint32 toU32(int16 x) { nvDebugCheck(x >= 0); return (uint32)x; } - //template <> inline uint32 toU32(uint8 x) { return x; } - template <> inline uint32 toU32(int8 x) { nvDebugCheck(x >= 0); return (uint32)x; } + template inline uint32 U32(T x) { return x; } + template <> inline uint32 U32(uint64 x) { nvDebugCheck(x <= NV_UINT32_MAX); return (uint32)x; } + template <> inline uint32 U32(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT32_MAX); return (uint32)x; } + //template <> inline uint32 U32(uint32 x) { return x; } + template <> inline uint32 U32(int32 x) { nvDebugCheck(x >= 0); return (uint32)x; } + //template <> inline uint32 U32(uint16 x) { return x; } + template <> inline uint32 U32(int16 x) { nvDebugCheck(x >= 0); return (uint32)x; } + //template <> inline uint32 U32(uint8 x) { return x; } + template <> inline uint32 U32(int8 x) { nvDebugCheck(x >= 0); return (uint32)x; } // int32 casts: - template inline int32 toI32(T x) { return x; } - template <> inline int32 toI32(uint64 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; } - template <> inline int32 toI32(int64 x) { nvDebugCheck(x >= NV_INT32_MIN && x <= NV_UINT32_MAX); return (int32)x; } - template <> inline int32 toI32(uint32 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; } - //template <> inline int32 toI32(int32 x) { return x; } - //template <> inline int32 toI32(uint16 x) { return x; } - //template <> inline int32 toI32(int16 x) { return x; } - //template <> inline int32 toI32(uint8 x) { return x; } - //template <> inline int32 toI32(int8 x) { return x; } + template inline int32 I32(T x) { return x; } + template <> inline int32 I32(uint64 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; } + template <> inline int32 I32(int64 x) { nvDebugCheck(x >= NV_INT32_MIN && x <= NV_UINT32_MAX); return (int32)x; } + template <> inline int32 I32(uint32 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; } + //template <> inline int32 I32(int32 x) { return x; } + //template <> inline int32 I32(uint16 x) { return x; } + //template <> inline int32 I32(int16 x) { return x; } + //template <> inline int32 I32(uint8 x) { return x; } + //template <> inline int32 I32(int8 x) { return x; } // uint16 casts: - template inline uint16 toU16(T x) { return x; } - template <> inline uint16 toU16(uint64 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; } - template <> inline uint16 toU16(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; } - template <> inline uint16 toU16(uint32 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; } - template <> inline uint16 toU16(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; } - //template <> inline uint16 toU16(uint16 x) { return x; } - template <> inline uint16 toU16(int16 x) { nvDebugCheck(x >= 0); return (uint16)x; } - //template <> inline uint16 toU16(uint8 x) { return x; } - template <> inline uint16 toU16(int8 x) { nvDebugCheck(x >= 0); return (uint16)x; } + template inline uint16 U16(T x) { return x; } + template <> inline uint16 
U16(uint64 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; } + template <> inline uint16 U16(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; } + template <> inline uint16 U16(uint32 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; } + template <> inline uint16 U16(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; } + //template <> inline uint16 U16(uint16 x) { return x; } + template <> inline uint16 U16(int16 x) { nvDebugCheck(x >= 0); return (uint16)x; } + //template <> inline uint16 U16(uint8 x) { return x; } + template <> inline uint16 U16(int8 x) { nvDebugCheck(x >= 0); return (uint16)x; } // int16 casts: - template inline int16 toI16(T x) { return x; } - template <> inline int16 toI16(uint64 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; } - template <> inline int16 toI16(int64 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_UINT16_MAX); return (int16)x; } - template <> inline int16 toI16(uint32 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; } - template <> inline int16 toI16(int32 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_UINT16_MAX); return (int16)x; } - template <> inline int16 toI16(uint16 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; } - //template <> inline int16 toI16(int16 x) { return x; } - //template <> inline int16 toI16(uint8 x) { return x; } - //template <> inline int16 toI16(int8 x) { return x; } + template inline int16 I16(T x) { return x; } + template <> inline int16 I16(uint64 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; } + template <> inline int16 I16(int64 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_UINT16_MAX); return (int16)x; } + template <> inline int16 I16(uint32 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; } + template <> inline int16 I16(int32 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_UINT16_MAX); return (int16)x; } + template <> inline int16 I16(uint16 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; } + //template <> inline int16 I16(int16 x) { return x; } + //template <> inline int16 I16(uint8 x) { return x; } + //template <> inline int16 I16(int8 x) { return x; } // uint8 casts: - template inline uint8 toU8(T x) { return x; } - template <> inline uint8 toU8(uint64 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; } - template <> inline uint8 toU8(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; } - template <> inline uint8 toU8(uint32 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; } - template <> inline uint8 toU8(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; } - template <> inline uint8 toU8(uint16 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; } - template <> inline uint8 toU8(int16 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; } - //template <> inline uint8 toU8(uint8 x) { return x; } - template <> inline uint8 toU8(int8 x) { nvDebugCheck(x >= 0); return (uint8)x; } + template inline uint8 U8(T x) { return x; } + template <> inline uint8 U8(uint64 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; } + template <> inline uint8 U8(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; } + template <> inline uint8 U8(uint32 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; } + template <> inline uint8 U8(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; } + template <> inline uint8 U8(uint16 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; } + template <> inline uint8 U8(int16 x) { 
nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; } + //template <> inline uint8 U8(uint8 x) { return x; } + template <> inline uint8 U8(int8 x) { nvDebugCheck(x >= 0); return (uint8)x; } + //template <> inline uint8 U8(int8 x) { nvDebugCheck(x >= 0.0f && x <= 255.0f); return (uint8)x; } // int8 casts: - template inline int8 toI8(T x) { return x; } - template <> inline int8 toI8(uint64 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; } - template <> inline int8 toI8(int64 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; } - template <> inline int8 toI8(uint32 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; } - template <> inline int8 toI8(int32 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; } - template <> inline int8 toI8(uint16 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; } - template <> inline int8 toI8(int16 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; } - template <> inline int8 toI8(uint8 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; } - //template <> inline int8 toI8(int8 x) { return x; } - + template inline int8 I8(T x) { return x; } + template <> inline int8 I8(uint64 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; } + template <> inline int8 I8(int64 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; } + template <> inline int8 I8(uint32 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; } + template <> inline int8 I8(int32 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; } + template <> inline int8 I8(uint16 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; } + template <> inline int8 I8(int16 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; } + template <> inline int8 I8(uint8 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; } + //template <> inline int8 I8(int8 x) { return x; } + + // float casts: + template inline float F32(T x) { return x; } + template <> inline float F32(uint64 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; } + template <> inline float F32(int64 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; } + template <> inline float F32(uint32 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; } + template <> inline float F32(int32 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; } + // The compiler should not complain about these conversions: + //template <> inline float F32(uint16 x) { nvDebugCheck(return (float)x; } + //template <> inline float F32(int16 x) { nvDebugCheck(return (float)x; } + //template <> inline float F32(uint8 x) { nvDebugCheck(return (float)x; } + //template <> inline float F32(int8 x) { nvDebugCheck(return (float)x; } + + /// Swap two values. template inline void swap(T & a, T & b) @@ -112,35 +130,40 @@ namespace nv /// Return the maximum of the two arguments. For floating point values, it returns the second value if the first is NaN. template - inline const T & max(const T & a, const T & b) + //inline const T & max(const T & a, const T & b) + inline T max(const T & a, const T & b) { return (b < a) ? a : b; } /// Return the maximum of the three arguments. template - inline const T & max3(const T & a, const T & b, const T & c) + //inline const T & max3(const T & a, const T & b, const T & c) + inline T max3(const T & a, const T & b, const T & c) { return max(a, max(b, c)); } /// Return the minimum of two values. 
template - inline const T & min(const T & a, const T & b) + //inline const T & min(const T & a, const T & b) + inline T min(const T & a, const T & b) { return (a < b) ? a : b; } /// Return the maximum of the three arguments. template - inline const T & min3(const T & a, const T & b, const T & c) + //inline const T & min3(const T & a, const T & b, const T & c) + inline T min3(const T & a, const T & b, const T & c) { return min(a, min(b, c)); } /// Clamp between two values. template - inline const T & clamp(const T & x, const T & a, const T & b) + //inline const T & clamp(const T & x, const T & a, const T & b) + inline T clamp(const T & x, const T & a, const T & b) { return min(max(x, a), b); } @@ -217,7 +240,6 @@ namespace nv template void destroy_range(T * restrict ptr, uint new_size, uint old_size) { for (uint i = new_size; i < old_size; i++) { - nvDebugCheck(ptr != NULL && isValidPtr(ptr)); (ptr+i)->~T(); // Explicit call to the destructor } } diff --git a/src/nvcore/nvcore.h b/src/nvcore/nvcore.h index b02d5c4..b903f6f 100644 --- a/src/nvcore/nvcore.h +++ b/src/nvcore/nvcore.h @@ -127,6 +127,12 @@ # error "Unsupported compiler" #endif +#if NV_CC_MSVC +#define NV_CC_CPP11 (__cplusplus > 199711L) +#else +// @@ IC: This works in CLANG, about GCC? +#define NV_CC_CPP11 (__has_feature(cxx_deleted_functions) && __has_feature(cxx_rvalue_references) && __has_feature(cxx_static_assert)) +#endif // Endiannes: #define NV_LITTLE_ENDIAN POSH_LITTLE_ENDIAN @@ -170,11 +176,16 @@ typedef uint32 uint; // Disable copy constructor and assignment operator. +#if NV_CC_CPP11 +#define NV_FORBID_COPY(C) \ + C( const C & ) = delete; \ + C &operator=( const C & ) = delete +#else #define NV_FORBID_COPY(C) \ private: \ C( const C & ); \ C &operator=( const C & ) - +#endif // Disable dynamic allocation on the heap. // See Prohibiting Heap-Based Objects in More Effective C++. @@ -205,8 +216,8 @@ typedef uint32 uint; #define NV_MULTI_LINE_MACRO_END } while(false) #endif -#if __cplusplus > 199711L -#define nvStaticCheck(x) static_assert(x, "Static assert "#x" failed") +#if NV_CC_CPP11 +#define nvStaticCheck(x) static_assert((x), "Static assert "#x" failed") #else #define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x)] #endif diff --git a/src/nvimage/BlockDXT.cpp b/src/nvimage/BlockDXT.cpp index cc5d081..42d8c05 100644 --- a/src/nvimage/BlockDXT.cpp +++ b/src/nvimage/BlockDXT.cpp @@ -138,9 +138,9 @@ uint BlockDXT1::evaluatePaletteNV5x(Color32 color_array[4]) const color_array[2].a = 0xFF; // Set all components to 0 to match DXT specs. - color_array[3].r = 0x00; // color_array[2].r; - color_array[3].g = 0x00; // color_array[2].g; - color_array[3].b = 0x00; // color_array[2].b; + color_array[3].r = 0x00; + color_array[3].g = 0x00; + color_array[3].b = 0x00; color_array[3].a = 0x00; return 3; @@ -167,9 +167,9 @@ void BlockDXT1::evaluatePalette3(Color32 color_array[4], bool d3d9) const color_array[2].a = 0xFF; // Set all components to 0 to match DXT specs. 
- color_array[3].r = 0x00; // color_array[2].r; - color_array[3].g = 0x00; // color_array[2].g; - color_array[3].b = 0x00; // color_array[2].b; + color_array[3].r = 0x00; + color_array[3].g = 0x00; + color_array[3].b = 0x00; color_array[3].a = 0x00; } @@ -433,6 +433,22 @@ void AlphaBlockDXT5::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const } } +void AlphaBlockDXT5::decodeBlock(AlphaBlock4x4 * block, bool d3d9/*= false*/) const +{ + nvDebugCheck(block != NULL); + + uint8 alpha_array[8]; + evaluatePalette(alpha_array, d3d9); + + uint8 index_array[16]; + indices(index_array); + + for(uint i = 0; i < 16; i++) { + block->alpha[i] = alpha_array[index_array[i]]; + } +} + + void AlphaBlockDXT5::flip4() { uint64 * b = (uint64 *)this; diff --git a/src/nvimage/BlockDXT.h b/src/nvimage/BlockDXT.h index 40ba7fe..e03cff7 100644 --- a/src/nvimage/BlockDXT.h +++ b/src/nvimage/BlockDXT.h @@ -32,7 +32,8 @@ namespace nv { struct ColorBlock; - struct ColorSet; + struct ColorSet; + struct AlphaBlock4x4; class Stream; @@ -152,6 +153,7 @@ namespace nv void setIndex(uint index, uint value); void decodeBlock(ColorBlock * block, bool d3d9 = false) const; + void decodeBlock(AlphaBlock4x4 * block, bool d3d9 = false) const; void flip4(); void flip2(); diff --git a/src/nvimage/ColorBlock.cpp b/src/nvimage/ColorBlock.cpp index ddf02cb..026bb36 100644 --- a/src/nvimage/ColorBlock.cpp +++ b/src/nvimage/ColorBlock.cpp @@ -6,6 +6,8 @@ #include "nvmath/Box.h" #include "nvmath/Vector.inl" +#include "nvmath/ftoi.h" + #include "nvcore/Utils.h" // swap #include // memcpy @@ -519,11 +521,24 @@ void ColorSet::setColors(const float * data, uint img_w, uint img_h, uint img_x, } } +void ColorSet::setColors(const Vector3 colors[16], const float weights[16]) +{ + +} + +void ColorSet::setColors(const Vector4 colors[16], const float weights[16]) +{ + +} + + + void ColorSet::setAlphaWeights() { for (uint i = 0; i < colorCount; i++) { - weights[i] = max(colors[i].w, 0.001f); // Avoid division by zero. + //weights[i] = max(colors[i].w, 0.001f); // Avoid division by zero. + weights[i] = max(colors[i].w, 0.0f); } } @@ -539,6 +554,7 @@ void ColorSet::setUniformWeights() // @@ Handle complex blocks (not 4x4). void ColorSet::createMinimalSet(bool ignoreTransparent) { + nvDebugCheck(indexCount == 16); nvDebugCheck(colorCount <= 16); Vector4 C[16]; @@ -556,7 +572,7 @@ void ColorSet::createMinimalSet(bool ignoreTransparent) Vector4 ci = C[indices[i]]; float wi = W[indices[i]]; - if (ignoreTransparent && ci.w == 0) { + if (ignoreTransparent && wi == 0) { indices[i] = -1; continue; } @@ -582,9 +598,10 @@ void ColorSet::createMinimalSet(bool ignoreTransparent) n++; } } - nvDebugCheck(n != 0); + //nvDebugCheck(n != 0); // Fully transparent blocks are OK. for (uint i = n; i < colorCount; i++) { + colors[i] = Vector4(0); weights[i] = 0; } @@ -594,6 +611,8 @@ void ColorSet::createMinimalSet(bool ignoreTransparent) if (colorCount == 0) { colorCount = 1; indices[0] = 0; + //colors[0] = Vector4(0); + weights[0] = 1; } } @@ -661,3 +680,59 @@ bool ColorSet::hasAlpha() const } return false; } + + +void AlphaBlock4x4::init(uint8 a) +{ + for (int i = 0; i < 16; i++) { + alpha[i] = a; + weights[i] = 1.0f; + } +} + +void AlphaBlock4x4::init(const ColorBlock & src, uint channel) +{ + nvCheck(channel >= 0 && channel < 4); + + // Colors are in BGRA format. 
+ if (channel == 0) channel = 2; + else if (channel == 2) channel = 0; + + for (int i = 0; i < 16; i++) { + alpha[i] = src.color(i).component[channel]; + weights[i] = 1.0f; + } +} + + + + +void AlphaBlock4x4::init(const ColorSet & src, uint channel) +{ + nvCheck(channel >= 0 && channel < 4); + + for (int i = 0; i < 16; i++) { + float f = src.color(i).component[channel]; + alpha[i] = unitFloatToFixed8(f); + weights[i] = 1.0f; + } +} + +void AlphaBlock4x4::initMaxRGB(const ColorSet & src, float threshold) +{ + for (int i = 0; i < 16; i++) { + float x = src.color(i).x; + float y = src.color(i).y; + float z = src.color(i).z; + alpha[i] = unitFloatToFixed8(max(max(x, y), max(z, threshold))); + weights[i] = 1.0f; + } +} + +void AlphaBlock4x4::initWeights(const ColorSet & src) +{ + for (int i = 0; i < 16; i++) { + weights[i] = src.weight(i); + } +} + diff --git a/src/nvimage/ColorBlock.h b/src/nvimage/ColorBlock.h index 6541fa8..fe78a47 100644 --- a/src/nvimage/ColorBlock.h +++ b/src/nvimage/ColorBlock.h @@ -12,6 +12,7 @@ namespace nv class Image; class FloatImage; + /// Uncompressed 4x4 color block. struct ColorBlock { @@ -89,6 +90,8 @@ namespace nv void allocate(uint w, uint h); void setColors(const float * data, uint img_w, uint img_h, uint img_x, uint img_y); + void setColors(const Vector3 colors[16], const float weights[16]); + void setColors(const Vector4 colors[16], const float weights[16]); void setAlphaWeights(); void setUniformWeights(); @@ -108,6 +111,8 @@ namespace nv Vector4 color(uint i) const { nvDebugCheck(i < indexCount); return colors[indices[i]]; } Vector4 & color(uint i) { nvDebugCheck(i < indexCount); return colors[indices[i]]; } + float weight(uint i) const { nvDebugCheck(i < indexCount); return weights[indices[i]]; } + bool isValidIndex(uint i) const { return i < indexCount && indices[i] >= 0; } uint colorCount; @@ -116,10 +121,40 @@ namespace nv // Allocate color set dynamically and add support for sets larger than 4x4. Vector4 colors[16]; - float weights[16]; + float weights[16]; // @@ Add mask to indicate what color components are weighted? int indices[16]; }; + + /// Uncompressed 4x4 alpha block. 
+ struct AlphaBlock4x4 + { + void init(uint8 value); + void init(const ColorBlock & src, uint channel); + void init(const ColorSet & src, uint channel); + + void initMaxRGB(const ColorSet & src, float threshold); + void initWeights(const ColorSet & src); + + uint8 alpha[4*4]; + float weights[16]; + }; + + + struct FloatAlphaBlock4x4 + { + float alphas[4 * 4]; + float weights[4 * 4]; + }; + + struct FloatColorBlock4x4 + { + Vector4 colors[4 * 4]; + float weights[4 * 4]; + }; + + + } // nv namespace #endif // NV_IMAGE_COLORBLOCK_H diff --git a/src/nvimage/ColorSpace.cpp b/src/nvimage/ColorSpace.cpp index 11bdae0..72807c1 100644 --- a/src/nvimage/ColorSpace.cpp +++ b/src/nvimage/ColorSpace.cpp @@ -1,11 +1,10 @@ // This code is in the public domain -- jim@tilander.org -#include +#include "ColorSpace.h" -#include -#include +#include "nvimage/Image.h" +#include "nvmath/Color.h" -#include "ColorSpace.h" namespace nv { diff --git a/src/nvimage/DirectDrawSurface.cpp b/src/nvimage/DirectDrawSurface.cpp index c2531ab..dff9255 100644 --- a/src/nvimage/DirectDrawSurface.cpp +++ b/src/nvimage/DirectDrawSurface.cpp @@ -952,7 +952,8 @@ bool DirectDrawSurface::isSupported() const header.header10.dxgiFormat == DXGI_FORMAT_BC3_UNORM || header.header10.dxgiFormat == DXGI_FORMAT_BC4_UNORM || header.header10.dxgiFormat == DXGI_FORMAT_BC5_UNORM || - header.header10.dxgiFormat == DXGI_FORMAT_BC6H_UF16) + header.header10.dxgiFormat == DXGI_FORMAT_BC6H_UF16 || + header.header10.dxgiFormat == DXGI_FORMAT_BC7_UNORM) { return true; } @@ -1390,37 +1391,37 @@ void DirectDrawSurface::readBlock(ColorBlock * rgba) *stream << block; block.decodeBlock(rgba); } - else if (header.hasDX10Header() && header.header10.dxgiFormat == DXGI_FORMAT_BC6H_UF16) - { - BlockBC6 block; - *stream << block; - ColorSet set; - block.decodeBlock(&set); - - // Clamp to [0, 1] and round to 8-bit - for (int y = 0; y < 4; ++y) - { - for (int x = 0; x < 4; ++x) - { - Vector4 px = set.colors[y*4 + x]; - rgba->color(x, y).setRGBA( - uint8(clamp(px.x, 0.0f, 1.0f) * 255.0f + 0.5f), - uint8(clamp(px.y, 0.0f, 1.0f) * 255.0f + 0.5f), - uint8(clamp(px.z, 0.0f, 1.0f) * 255.0f + 0.5f), - uint8(clamp(px.w, 0.0f, 1.0f) * 255.0f + 0.5f)); - } - } - } + else if (header.hasDX10Header() && header.header10.dxgiFormat == DXGI_FORMAT_BC6H_UF16) + { + BlockBC6 block; + *stream << block; + ColorSet set; + block.decodeBlock(&set); + + // Clamp to [0, 1] and round to 8-bit + for (int y = 0; y < 4; ++y) + { + for (int x = 0; x < 4; ++x) + { + Vector4 px = set.colors[y*4 + x]; + rgba->color(x, y).setRGBA( + uint8(clamp(px.x, 0.0f, 1.0f) * 255.0f + 0.5f), + uint8(clamp(px.y, 0.0f, 1.0f) * 255.0f + 0.5f), + uint8(clamp(px.z, 0.0f, 1.0f) * 255.0f + 0.5f), + uint8(clamp(px.w, 0.0f, 1.0f) * 255.0f + 0.5f)); + } + } + } else if (header.hasDX10Header() && header.header10.dxgiFormat == DXGI_FORMAT_BC7_UNORM) { BlockBC7 block; *stream << block; block.decodeBlock(rgba); } - else - { - nvDebugCheck(false); - } + else + { + nvDebugCheck(false); + } // If normal flag set, convert to normal. 
if (header.pf.flags & DDPF_NORMAL) diff --git a/src/nvimage/FloatImage.cpp b/src/nvimage/FloatImage.cpp index 51acc98..bae9da1 100644 --- a/src/nvimage/FloatImage.cpp +++ b/src/nvimage/FloatImage.cpp @@ -7,6 +7,7 @@ #include "nvmath/Color.h" #include "nvmath/Vector.inl" #include "nvmath/Matrix.inl" +#include "nvmath/ftoi.h" #include "nvcore/Utils.h" // max #include "nvcore/Ptr.h" diff --git a/src/nvimage/Image.cpp b/src/nvimage/Image.cpp index 3438b32..3d99108 100644 --- a/src/nvimage/Image.cpp +++ b/src/nvimage/Image.cpp @@ -40,7 +40,7 @@ const Image & Image::operator=(const Image & img) } -void Image::allocate(uint w, uint h, uint d) +void Image::allocate(uint w, uint h, uint d/*= 1*/) { free(); m_width = w; @@ -49,6 +49,45 @@ void Image::allocate(uint w, uint h, uint d) m_data = realloc(m_data, w * h * d); } +void Image::resize(uint w, uint h, uint d/*= 1*/) { + + Image img; + img.allocate(w, h, d); + + Color32 background(0,0,0,0); + + // Copy image. + uint x, y, z; + for(z = 0; z < min(d, m_depth); z++) { + for(y = 0; y < min(h, m_height); y++) { + for(x = 0; x < min(w, m_width); x++) { + img.pixel(x, y, z) = pixel(x, y, z); + } + for(; x < w; x++) { + img.pixel(x, y, z) = background; + } + } + for(; y < h; y++) { + for(x = 0; x < w; x++) { + img.pixel(x, y, z) = background; + } + } + } + for(; z < d; z++) { + for(y = 0; y < h; y++) { + for(x = 0; x < w; x++) { + img.pixel(x, y, z) = background; + } + } + } + + swap(m_width, img.m_width); + swap(m_height, img.m_height); + swap(m_depth, img.m_depth); + swap(m_format, img.m_format); + swap(m_data, img.m_data); +} + bool Image::load(const char * name) { free(); diff --git a/src/nvimage/Image.h b/src/nvimage/Image.h index 4ab00a9..e39c41e 100644 --- a/src/nvimage/Image.h +++ b/src/nvimage/Image.h @@ -32,6 +32,8 @@ namespace nv void allocate(uint w, uint h, uint d = 1); bool load(const char * name); + void resize(uint w, uint h, uint d = 1); + void wrap(void * data, uint w, uint h, uint d = 1); void unwrap(); diff --git a/src/nvimage/ImageIO.cpp b/src/nvimage/ImageIO.cpp index 9a81c00..d820cb0 100644 --- a/src/nvimage/ImageIO.cpp +++ b/src/nvimage/ImageIO.cpp @@ -319,9 +319,9 @@ static bool savePPM(Stream & s, const Image * img) uint h = img->height(); TextWriter writer(&s); - writer.write("P6\n"); - writer.write("%d %d\n", w, h); - writer.write("255\n"); + writer.format("P6\n"); + writer.format("%d %d\n", w, h); + writer.writeString("255\n"); for (uint i = 0; i < w * h; i++) { Color32 c = img->pixel(i); s << c.r << c.g << c.b; @@ -501,14 +501,16 @@ static FloatImage * loadFloatDDS(Stream & s) DDSHeader header; s << header; - static const uint D3DFMT_A16B16G16R16F = 113; + // @@ We only support a few formats for now. - // @@ We only support RGBA16F for now. 
if (header.pf.fourcc == D3DFMT_A16B16G16R16F) { const int size = header.width * header.height; uint16 * const data = new uint16[size * 4]; - s.serialize(data, size * 4 * sizeof(uint16)); + //s.serialize(data, size * 4 * sizeof(uint16)); + for (int i = 0; i < 4* size; i++) { + s << data[i]; + } FloatImage * img = new FloatImage; img->allocate(4, header.width, header.height); @@ -530,7 +532,84 @@ static FloatImage * loadFloatDDS(Stream & s) return img; } + else if (header.pf.fourcc == D3DFMT_R32F) { + const int size = header.width * header.height; + float * const data = new float[size]; + + for (int i = 0; i < size; i++) { + s << data[i]; + } + + FloatImage * img = new FloatImage; + img->allocate(4, header.width, header.height); + + float * r = img->channel(0); + + float * ptr = data; + for (int i = 0; i < size; i++) { + *r++ = *ptr++; + } + + delete [] data; + + img->clear(1, 0.0f); + img->clear(2, 0.0f); + img->clear(3, 1.0f); + + return img; + } + else if (header.pf.fourcc == D3DFMT_L16 || (header.pf.bitcount == 16 && header.pf.rmask == 0xFFFF && header.pf.gmask == 0 && header.pf.bmask == 0 && header.pf.amask == 0)) + { + const int size = header.width * header.height; + uint16 * const data = new uint16[size]; + + for (int i = 0; i < size; i++) { + s << data[i]; + } + + FloatImage * img = new FloatImage; + img->allocate(4, header.width, header.height); + float * r = img->channel(0); + + uint16 * ptr = data; + for (int i = 0; i < size; i++) { + *r++ = float(*ptr++) / 65535.0f; + } + + delete [] data; + + img->clear(1, 0.0f); + img->clear(2, 0.0f); + img->clear(3, 1.0f); + + return img; + } + else if (header.pf.fourcc == D3DFMT_L8 || (header.pf.bitcount == 8 && header.pf.rmask == 0xFF && header.pf.gmask == 0 && header.pf.bmask == 0 && header.pf.amask == 0)) + { + const int size = header.width * header.height; + uint8 * const data = new uint8[size]; + + s.serialize(data, size); + + FloatImage * img = new FloatImage; + img->allocate(4, header.width, header.height); + + float * r = img->channel(0); + + uint8 * ptr = data; + for (int i = 0; i < size; i++) { + *r++ = float(*ptr++) / 255.0f; + } + + delete [] data; + + img->clear(1, 0.0f); + img->clear(2, 0.0f); + img->clear(3, 1.0f); + + return img; + } return NULL; } @@ -1713,26 +1792,26 @@ Image * nv::ImageIO::load(const char * fileName, Stream & s) const char * extension = Path::extension(fileName); - if (strCaseCmp(extension, ".tga") == 0) { + if (strCaseDiff(extension, ".tga") == 0) { return loadTGA(s); } - if (strCaseCmp(extension, ".psd") == 0) { + if (strCaseDiff(extension, ".psd") == 0) { return loadPSD(s); } - /*if (strCaseCmp(extension, ".ppm") == 0) { + /*if (strCaseDiff(extension, ".ppm") == 0) { return loadPPM(s); }*/ #if defined(HAVE_JPEG) - if (strCaseCmp(extension, ".jpg") == 0 || strCaseCmp(extension, ".jpeg") == 0) { + if (strCaseDiff(extension, ".jpg") == 0 || strCaseDiff(extension, ".jpeg") == 0) { return loadJPG(s); } #endif #if defined(HAVE_PNG) - if (strCaseCmp(extension, ".png") == 0) { + if (strCaseDiff(extension, ".png") == 0) { return loadPNG(s); } #endif @@ -1759,16 +1838,16 @@ bool nv::ImageIO::save(const char * fileName, Stream & s, const Image * img, con const char * extension = Path::extension(fileName); - if (strCaseCmp(extension, ".tga") == 0) { + if (strCaseDiff(extension, ".tga") == 0) { return saveTGA(s, img); } - if (strCaseCmp(extension, ".ppm") == 0) { + if (strCaseDiff(extension, ".ppm") == 0) { return savePPM(s, img); } #if defined(HAVE_PNG) - if (strCaseCmp(extension, ".png") == 0) { + if 
(strCaseDiff(extension, ".png") == 0) { return savePNG(s, img, tags); } #endif @@ -1816,20 +1895,20 @@ FloatImage * nv::ImageIO::loadFloat(const char * fileName, Stream & s) const char * extension = Path::extension(fileName); - /*if (strCaseCmp(extension, ".pfm") == 0) { + /*if (strCaseDiff(extension, ".pfm") == 0) { return loadFloatPFM(s); }*/ #if defined(HAVE_TIFF) #pragma NV_MESSAGE("TODO: Load TIFF from stream.") - if (strCaseCmp(extension, ".tif") == 0 || strCaseCmp(extension, ".tiff") == 0) { + if (strCaseDiff(extension, ".tif") == 0 || strCaseDiff(extension, ".tiff") == 0) { return loadFloatTIFF(fileName, s); } #endif #if defined(HAVE_OPENEXR) #pragma NV_MESSAGE("TODO: Load EXR from stream.") - if (strCaseCmp(extension, ".exr") == 0) { + if (strCaseDiff(extension, ".exr") == 0) { return loadFloatEXR(fileName, s); } #endif @@ -1841,7 +1920,7 @@ FloatImage * nv::ImageIO::loadFloat(const char * fileName, Stream & s) } #endif - if (strCaseCmp(extension, ".dds") == 0) { + if (strCaseDiff(extension, ".dds") == 0) { const uint spos = s.tell(); // Save stream position. FloatImage * floatImage = loadFloatDDS(s); if (floatImage != NULL) return floatImage; @@ -1868,11 +1947,11 @@ bool nv::ImageIO::saveFloat(const char * fileName, Stream & s, const FloatImage const char * extension = Path::extension(fileName); - if (strCaseCmp(extension, ".dds") == 0) { + if (strCaseDiff(extension, ".dds") == 0) { return saveFloatDDS(s, fimage, baseComponent, componentCount); } - /*if (strCaseCmp(extension, ".pfm") == 0) { + /*if (strCaseDiff(extension, ".pfm") == 0) { return saveFloatPFM(s, fimage, baseComponent, componentCount); }*/ @@ -1922,13 +2001,13 @@ bool nv::ImageIO::saveFloat(const char * fileName, const FloatImage * fimage, ui const char * extension = Path::extension(fileName); #if defined(HAVE_OPENEXR) - if (strCaseCmp(extension, ".exr") == 0) { + if (strCaseDiff(extension, ".exr") == 0) { return saveFloatEXR(fileName, fimage, baseComponent, componentCount); } #endif #if defined(HAVE_TIFF) - if (strCaseCmp(extension, ".tif") == 0 || strCaseCmp(extension, ".tiff") == 0) { + if (strCaseDiff(extension, ".tif") == 0 || strCaseDiff(extension, ".tiff") == 0) { return saveFloatTIFF(fileName, fimage, baseComponent, componentCount); } #endif diff --git a/src/nvmath/Box.inl b/src/nvmath/Box.inl index f6756e4..dcfa70f 100644 --- a/src/nvmath/Box.inl +++ b/src/nvmath/Box.inl @@ -39,7 +39,7 @@ namespace nv // Build a cube centered on center and with edge = 2*dist inline void Box::cube(const Vector3 & center, float dist) { - setCenterExtents(center, Vector3(dist, dist, dist)); + setCenterExtents(center, Vector3(dist)); } // Build a box, given center and extents. 
diff --git a/src/nvmath/Color.h b/src/nvmath/Color.h index 055395b..5cdc374 100644 --- a/src/nvmath/Color.h +++ b/src/nvmath/Color.h @@ -89,6 +89,7 @@ namespace nv uint8 b: 8; #endif }; + uint8 component[4]; uint32 u; }; }; diff --git a/src/nvmath/Color.inl b/src/nvmath/Color.inl index 84ddc59..2b87ee4 100644 --- a/src/nvmath/Color.inl +++ b/src/nvmath/Color.inl @@ -6,6 +6,7 @@ #include "Color.h" #include "Vector.inl" +#include "ftoi.h" namespace nv @@ -123,30 +124,30 @@ namespace nv inline Color32 toColor32(const Vector4 & v) { Color32 color; - color.r = toU8(nv::iround(saturate(v.x) * 255)); - color.g = toU8(nv::iround(saturate(v.y) * 255)); - color.b = toU8(nv::iround(saturate(v.z) * 255)); - color.a = toU8(nv::iround(saturate(v.w) * 255)); + color.r = U8(ftoi_round(saturate(v.x) * 255)); + color.g = U8(ftoi_round(saturate(v.y) * 255)); + color.b = U8(ftoi_round(saturate(v.z) * 255)); + color.a = U8(ftoi_round(saturate(v.w) * 255)); return color; } inline Color32 toColor32_from_bgra(const Vector4 & v) { Color32 color; - color.b = toU8(nv::iround(saturate(v.x) * 255)); - color.g = toU8(nv::iround(saturate(v.y) * 255)); - color.r = toU8(nv::iround(saturate(v.z) * 255)); - color.a = toU8(nv::iround(saturate(v.w) * 255)); + color.b = U8(ftoi_round(saturate(v.x) * 255)); + color.g = U8(ftoi_round(saturate(v.y) * 255)); + color.r = U8(ftoi_round(saturate(v.z) * 255)); + color.a = U8(ftoi_round(saturate(v.w) * 255)); return color; } inline Color32 toColor32_from_argb(const Vector4 & v) { Color32 color; - color.a = toU8(nv::iround(saturate(v.x) * 255)); - color.r = toU8(nv::iround(saturate(v.y) * 255)); - color.g = toU8(nv::iround(saturate(v.z) * 255)); - color.b = toU8(nv::iround(saturate(v.w) * 255)); + color.a = U8(ftoi_round(saturate(v.x) * 255)); + color.r = U8(ftoi_round(saturate(v.y) * 255)); + color.g = U8(ftoi_round(saturate(v.z) * 255)); + color.b = U8(ftoi_round(saturate(v.w) * 255)); return color; } diff --git a/src/nvmath/Fitting.cpp b/src/nvmath/Fitting.cpp index 3cbb712..6ac2ab9 100644 --- a/src/nvmath/Fitting.cpp +++ b/src/nvmath/Fitting.cpp @@ -4,10 +4,11 @@ #include "Vector.inl" #include "Plane.inl" +#include "nvcore/Array.inl" #include "nvcore/Utils.h" // max, swap #include // FLT_MAX -#include +//#include #include using namespace nv; @@ -329,7 +330,7 @@ void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R); Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict points) { // Store the points in an n x n matrix - std::vector Q(n*n, 0.0f); + Array Q; Q.resize(n*n, 0.0f); for (int i = 0; i < n; ++i) { Q[i*n+0] = points[i].x; @@ -338,8 +339,8 @@ Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict } // Alloc space for the SVD outputs - std::vector diag(n, 0.0f); - std::vector R(n*n, 0.0f); + Array diag; diag.resize(n, 0.0f); + Array R; R.resize(n*n, 0.0f); ArvoSVD(n, n, &Q[0], &diag[0], &R[0]); @@ -350,7 +351,7 @@ Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict Vector4 nv::Fit::computePrincipalComponent_SVD(int n, const Vector4 *__restrict points) { // Store the points in an n x n matrix - std::vector Q(n*n, 0.0f); + Array Q; Q.resize(n*n, 0.0f); for (int i = 0; i < n; ++i) { Q[i*n+0] = points[i].x; @@ -360,8 +361,8 @@ Vector4 nv::Fit::computePrincipalComponent_SVD(int n, const Vector4 *__restrict } // Alloc space for the SVD outputs - std::vector diag(n, 0.0f); - std::vector R(n*n, 0.0f); + Array diag; diag.resize(n, 0.0f); + Array R; R.resize(n*n, 0.0f); ArvoSVD(n, n, &Q[0], 
&diag[0], &R[0]); @@ -940,7 +941,7 @@ void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R) float g = 0.0f; float scale = 0.0f; - std::vector temp(cols, 0.0f); + Array temp; temp.resize(cols, 0.0f); for( i = 0; i < cols; i++ ) { diff --git a/src/nvmath/Half.cpp b/src/nvmath/Half.cpp index 512b5d3..81465a4 100644 --- a/src/nvmath/Half.cpp +++ b/src/nvmath/Half.cpp @@ -580,56 +580,56 @@ namespace nv { void nv::half_init_tables() { // Init mantissa table. - mantissa_table[0] = 0; + mantissa_table[0] = 0; // denormals - for (int i = 1; i < 1024; i++) { - uint m = i << 13; - uint e = 0; - - while ((m & 0x00800000) == 0) { - e -= 0x00800000; - m <<= 1; - } - m &= ~0x00800000; - e += 0x38800000; - mantissa_table[i] = m | e; - } + for (int i = 1; i < 1024; i++) { + uint m = i << 13; + uint e = 0; + + while ((m & 0x00800000) == 0) { + e -= 0x00800000; + m <<= 1; + } + m &= ~0x00800000; + e += 0x38800000; + mantissa_table[i] = m | e; + } // normals for (int i = 1024; i < 2048; i++) { - mantissa_table[i] = (i - 1024) << 13; + mantissa_table[i] = (i - 1024) << 13; } // Init exponent table. - exponent_table[0] = 0; + exponent_table[0] = 0; for (int i = 1; i < 31; i++) { - exponent_table[i] = 0x38000000 + (i << 23); + exponent_table[i] = 0x38000000 + (i << 23); } - exponent_table[31] = 0x7f800000; - exponent_table[32] = 0x80000000; + exponent_table[31] = 0x7f800000; + exponent_table[32] = 0x80000000; for (int i = 33; i < 63; i++) { - exponent_table[i] = 0xb8000000 + ((i - 32) << 23); + exponent_table[i] = 0xb8000000 + ((i - 32) << 23); } - exponent_table[63] = 0xff800000; + exponent_table[63] = 0xff800000; // Init offset table. - offset_table[0] = 0; + offset_table[0] = 0; for (int i = 1; i < 32; i++) { - offset_table[i] = 1024; + offset_table[i] = 1024; } - offset_table[32] = 0; + offset_table[32] = 0; for (int i = 33; i < 64; i++) { - offset_table[i] = 1024; + offset_table[i] = 1024; } } @@ -660,27 +660,27 @@ uint32 nv::fast_half_to_float(uint16 v) // Mike Acton's code should be fairly easy to vectorize and that would handle all cases too, the table method might still be faster, though. 
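// For reference, the three tables initialized above are consumed with a single lookup
// per conversion. A minimal sketch, assuming the table layout built in half_init_tables
// (the scalar path elsewhere in this file may differ in details):
//
//     uint32 bits = mantissa_table[offset_table[h >> 10] + (h & 0x3ff)] + exponent_table[h >> 10];
//
// The top 6 bits of the half (sign + exponent) select the exponent adjustment and the
// mantissa table segment; the low 10 bits index the pre-normalized mantissa.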
-static __declspec(align(16)) unsigned half_sign[4] = {0x00008000, 0x00008000, 0x00008000, 0x00008000}; -static __declspec(align(16)) unsigned half_exponent[4] = {0x00007C00, 0x00007C00, 0x00007C00, 0x00007C00}; -static __declspec(align(16)) unsigned half_mantissa[4] = {0x000003FF, 0x000003FF, 0x000003FF, 0x000003FF}; +static __declspec(align(16)) unsigned half_sign[4] = {0x00008000, 0x00008000, 0x00008000, 0x00008000}; +static __declspec(align(16)) unsigned half_exponent[4] = {0x00007C00, 0x00007C00, 0x00007C00, 0x00007C00}; +static __declspec(align(16)) unsigned half_mantissa[4] = {0x000003FF, 0x000003FF, 0x000003FF, 0x000003FF}; static __declspec(align(16)) unsigned half_bias_offset[4] = {0x0001C000, 0x0001C000, 0x0001C000, 0x0001C000}; __asm { - movaps xmm1, xmm0 // Input in xmm0 - movaps xmm2, xmm0 + movaps xmm1, xmm0 // Input in xmm0 + movaps xmm2, xmm0 - andps xmm0, half_sign - andps xmm1, half_exponent - andps xmm2, half_mantissa - paddd xmm1, half_bias_offset + andps xmm0, half_sign + andps xmm1, half_exponent + andps xmm2, half_mantissa + paddd xmm1, half_bias_offset - pslld xmm0, 16 - pslld xmm1, 13 - pslld xmm2, 13 + pslld xmm0, 16 + pslld xmm1, 13 + pslld xmm2, 13 - orps xmm1, xmm2 - orps xmm0, xmm1 // Result in xmm0 + orps xmm1, xmm2 + orps xmm0, xmm1 // Result in xmm0 } diff --git a/src/nvmath/Matrix.cpp b/src/nvmath/Matrix.cpp index bf73026..29bd19f 100644 --- a/src/nvmath/Matrix.cpp +++ b/src/nvmath/Matrix.cpp @@ -7,6 +7,10 @@ #include +#if !NV_CC_MSVC && !NV_OS_ORBIS +#include +#endif + using namespace nv; @@ -20,8 +24,7 @@ static bool ludcmp(float **a, int n, int *indx, float *d) { const float TINY = 1.0e-20f; - Array vv; // vv stores the implicit scaling of each row. - vv.resize(n); + float * vv = (float*)alloca(sizeof(float) * n); // vv stores the implicit scaling of each row. *d = 1.0; // No row interchanges yet. for (int i = 0; i < n; i++) { // Loop over rows to get the implicit scaling information. @@ -149,6 +152,21 @@ bool nv::solveLU(const Matrix & A, const Vector4 & b, Vector4 * x) return true; } +// @@ Not tested. +Matrix nv::inverseLU(const Matrix & A) +{ + Vector4 Ai[4]; + + solveLU(A, Vector4(1, 0, 0, 0), &Ai[0]); + solveLU(A, Vector4(0, 1, 0, 0), &Ai[1]); + solveLU(A, Vector4(0, 0, 1, 0), &Ai[2]); + solveLU(A, Vector4(0, 0, 0, 1), &Ai[3]); + + return Matrix(Ai[0], Ai[1], Ai[2], Ai[3]); +} + + + bool nv::solveLU(const Matrix3 & A, const Vector3 & b, Vector3 * x) { nvDebugCheck(x != NULL); @@ -184,7 +202,7 @@ bool nv::solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x) { nvDebugCheck(x != NULL); - *x = transform(inverse(A), b); + *x = transform(inverseCramer(A), b); return true; // @@ Return false if determinant(A) == 0 ! } @@ -198,7 +216,7 @@ bool nv::solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x) return false; } - Matrix3 Ai = inverse(A); + Matrix3 Ai = inverseCramer(A); *x = transform(Ai, b); @@ -207,6 +225,119 @@ bool nv::solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x) +// Inverse using gaussian elimination. From Jon's code. +Matrix nv::inverse(const Matrix & m) { + + Matrix A = m; + Matrix B(identity); + + int i, j, k; + float max, t, det, pivot; + + det = 1.0; + for (i=0; i<4; i++) { /* eliminate in column i, below diag */ + max = -1.; + for (k=i; k<4; k++) /* find pivot for column i */ + if (fabs(A(k, i)) > max) { + max = fabs(A(k, i)); + j = k; + } + if (max<=0.) 
return B; /* if no nonzero pivot, PUNT */ + if (j!=i) { /* swap rows i and j */ + for (k=i; k<4; k++) + swap(A(i, k), A(j, k)); + for (k=0; k<4; k++) + swap(B(i, k), B(j, k)); + det = -det; + } + pivot = A(i, i); + det *= pivot; + for (k=i+1; k<4; k++) /* only do elems to right of pivot */ + A(i, k) /= pivot; + for (k=0; k<4; k++) + B(i, k) /= pivot; + /* we know that A(i, i) will be set to 1, so don't bother to do it */ + + for (j=i+1; j<4; j++) { /* eliminate in rows below i */ + t = A(j, i); /* we're gonna zero this guy */ + for (k=i+1; k<4; k++) /* subtract scaled row i from row j */ + A(j, k) -= A(i, k)*t; /* (ignore k<=i, we know they're 0) */ + for (k=0; k<4; k++) + B(j, k) -= B(i, k)*t; + } + } + + /*---------- backward elimination ----------*/ + + for (i=4-1; i>0; i--) { /* eliminate in column i, above diag */ + for (j=0; j max) { + max = fabs(A(k, i)); + j = k; + } + if (max<=0.) return B; /* if no nonzero pivot, PUNT */ + if (j!=i) { /* swap rows i and j */ + for (k=i; k<3; k++) + swap(A(i, k), A(j, k)); + for (k=0; k<3; k++) + swap(B(i, k), B(j, k)); + det = -det; + } + pivot = A(i, i); + det *= pivot; + for (k=i+1; k<3; k++) /* only do elems to right of pivot */ + A(i, k) /= pivot; + for (k=0; k<3; k++) + B(i, k) /= pivot; + /* we know that A(i, i) will be set to 1, so don't bother to do it */ + + for (j=i+1; j<3; j++) { /* eliminate in rows below i */ + t = A(j, i); /* we're gonna zero this guy */ + for (k=i+1; k<3; k++) /* subtract scaled row i from row j */ + A(j, k) -= A(i, k)*t; /* (ignore k<=i, we know they're 0) */ + for (k=0; k<3; k++) + B(j, k) -= B(i, k)*t; + } + } + + /*---------- backward elimination ----------*/ + + for (i=3-1; i>0; i--) { /* eliminate in column i, above diag */ + for (j=0; j T to(const nv::Vector2 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector2)); return T(v.x, v.y); } template T to(const nv::Vector3 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector3)); return T(v.x, v.y, v.z); } -template T to(const nv::Vector4 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4)); return T(v.x, v.y, v.z, v.z); } +template T to(const nv::Vector4 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4)); return T(v.x, v.y, v.z, v.w); } #endif // NV_MATH_VECTOR_H diff --git a/src/nvmath/Vector.inl b/src/nvmath/Vector.inl index 769e366..c896885 100644 --- a/src/nvmath/Vector.inl +++ b/src/nvmath/Vector.inl @@ -440,14 +440,17 @@ namespace nv } // Note, this is the area scaled by 2! + inline float triangleArea(Vector2::Arg v0, Vector2::Arg v1) + { + return (v0.x * v1.y - v0.y * v1.x); // * 0.5f; + } inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c) { - Vector2 v0 = a - c; - Vector2 v1 = b - c; - - return (v0.x * v1.y - v0.y * v1.x); + return (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y); // * 0.5f; + //return triangleArea(a-c, b-c); } + template <> inline uint hash(const Vector2 & v, uint h) { diff --git a/src/nvmath/ftoi.h b/src/nvmath/ftoi.h new file mode 100755 index 0000000..4258b8a --- /dev/null +++ b/src/nvmath/ftoi.h @@ -0,0 +1,256 @@ +// This code is in the public domain -- castano@gmail.com + +#pragma once +#ifndef NV_MATH_FTOI_H +#define NV_MATH_FTOI_H + +#include "nvmath/nvmath.h" + +#include + +namespace nv +{ + // Optimized float to int conversions. 
See: + // http://cbloomrants.blogspot.com/2009/01/01-17-09-float-to-int.html + // http://www.stereopsis.com/sree/fpu2006.html + // http://assemblyrequired.crashworks.org/2009/01/12/why-you-should-never-cast-floats-to-ints/ + // http://chrishecker.com/Miscellaneous_Technical_Articles#Floating_Point + + + union DoubleAnd64 { + uint64 i; + double d; + }; + + static const double floatutil_xs_doublemagic = (6755399441055744.0); // 2^52 * 1.5 + static const double floatutil_xs_doublemagicdelta = (1.5e-8); // almost .5f = .5f + 1e^(number of exp bit) + static const double floatutil_xs_doublemagicroundeps = (0.5f - floatutil_xs_doublemagicdelta); // almost .5f = .5f - 1e^(number of exp bit) + + NV_FORCEINLINE int ftoi_round_xs(double val, double magic) { +#if 1 + DoubleAnd64 dunion; + dunion.d = val + magic; + return (int32) dunion.i; // just cast to grab the bottom bits +#else + val += magic; + return ((int*)&val)[0]; // @@ Assumes little endian. +#endif + } + + NV_FORCEINLINE int ftoi_round_xs(float val) { + return ftoi_round_xs(val, floatutil_xs_doublemagic); + } + + NV_FORCEINLINE int ftoi_floor_xs(float val) { + return ftoi_round_xs(val - floatutil_xs_doublemagicroundeps, floatutil_xs_doublemagic); + } + + NV_FORCEINLINE int ftoi_ceil_xs(float val) { + return ftoi_round_xs(val + floatutil_xs_doublemagicroundeps, floatutil_xs_doublemagic); + } + + NV_FORCEINLINE int ftoi_trunc_xs(float val) { + return (val<0) ? ftoi_ceil_xs(val) : ftoi_floor_xs(val); + } + +#if NV_CPU_X86 || NV_CPU_X86_64 + + NV_FORCEINLINE int ftoi_round_sse(float f) { + return _mm_cvt_ss2si(_mm_set_ss(f)); + } + + NV_FORCEINLINE int ftoi_trunc_sse(float f) { + return _mm_cvtt_ss2si(_mm_set_ss(f)); + } + +#endif + + + +#if NV_USE_SSE + + NV_FORCEINLINE int ftoi_round(float val) { + return ftoi_round_sse(val); + } + + NV_FORCEINLINE int ftoi_trunc(float f) { + return ftoi_trunc_sse(f); + } + + // We can probably do better than this. See for example: + // http://dss.stephanierct.com/DevBlog/?p=8 + NV_FORCEINLINE int ftoi_floor(float val) { + return ftoi_round(floorf(val)); + } + + NV_FORCEINLINE int ftoi_ceil(float val) { + return ftoi_round(ceilf(val)); + } + +#else + + // In theory this should work with any double floating point math implementation, but it appears that MSVC produces incorrect code + // when SSE2 is targeted and fast math is enabled (/arch:SSE2 & /fp:fast). These problems go away with /fp:precise, which is the default mode. + + NV_FORCEINLINE int ftoi_round(float val) { + return ftoi_round_xs(val); + } + + NV_FORCEINLINE int ftoi_floor(float val) { + return ftoi_floor_xs(val); + } + + NV_FORCEINLINE int ftoi_ceil(float val) { + return ftoi_ceil_xs(val); + } + + NV_FORCEINLINE int ftoi_trunc(float f) { + return ftoi_trunc_xs(f); + } + +#endif + + + inline void test_ftoi() { + + // Round to nearest integer. + nvCheck(ftoi_round(0.1f) == 0); + nvCheck(ftoi_round(0.6f) == 1); + nvCheck(ftoi_round(-0.2f) == 0); + nvCheck(ftoi_round(-0.7f) == -1); + nvCheck(ftoi_round(10.1f) == 10); + nvCheck(ftoi_round(10.6f) == 11); + nvCheck(ftoi_round(-90.1f) == -90); + nvCheck(ftoi_round(-90.6f) == -91); + + nvCheck(ftoi_round(0) == 0); + nvCheck(ftoi_round(1) == 1); + nvCheck(ftoi_round(-1) == -1); + + nvCheck(ftoi_round(0.5f) == 0); // How are midpoints rounded? Bankers rounding. 
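// With the default FPU/SSE rounding mode (round-to-nearest-even), exact midpoints go
// to the nearest even integer, which is what the following checks assume:
// 0.5 -> 0, 1.5 -> 2, 2.5 -> 2, 3.5 -> 4, -0.5 -> 0, -1.5 -> -2.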
+ nvCheck(ftoi_round(1.5f) == 2); + nvCheck(ftoi_round(2.5f) == 2); + nvCheck(ftoi_round(3.5f) == 4); + nvCheck(ftoi_round(4.5f) == 4); + nvCheck(ftoi_round(-0.5f) == 0); + nvCheck(ftoi_round(-1.5f) == -2); + + + // Truncation (round down if > 0, round up if < 0). + nvCheck(ftoi_trunc(0.1f) == 0); + nvCheck(ftoi_trunc(0.6f) == 0); + nvCheck(ftoi_trunc(-0.2f) == 0); + nvCheck(ftoi_trunc(-0.7f) == 0); // @@ When using /arch:SSE2 in Win32, msvc produce wrong code for this one. It is skipping the addition. + nvCheck(ftoi_trunc(1.99f) == 1); + nvCheck(ftoi_trunc(-1.2f) == -1); + + // Floor (round down). + nvCheck(ftoi_floor(0.1f) == 0); + nvCheck(ftoi_floor(0.6f) == 0); + nvCheck(ftoi_floor(-0.2f) == -1); + nvCheck(ftoi_floor(-0.7f) == -1); + nvCheck(ftoi_floor(1.99f) == 1); + nvCheck(ftoi_floor(-1.2f) == -2); + + nvCheck(ftoi_floor(0) == 0); + nvCheck(ftoi_floor(1) == 1); + nvCheck(ftoi_floor(-1) == -1); + nvCheck(ftoi_floor(2) == 2); + nvCheck(ftoi_floor(-2) == -2); + + // Ceil (round up). + nvCheck(ftoi_ceil(0.1f) == 1); + nvCheck(ftoi_ceil(0.6f) == 1); + nvCheck(ftoi_ceil(-0.2f) == 0); + nvCheck(ftoi_ceil(-0.7f) == 0); + nvCheck(ftoi_ceil(1.99f) == 2); + nvCheck(ftoi_ceil(-1.2f) == -1); + + nvCheck(ftoi_ceil(0) == 0); + nvCheck(ftoi_ceil(1) == 1); + nvCheck(ftoi_ceil(-1) == -1); + nvCheck(ftoi_ceil(2) == 2); + nvCheck(ftoi_ceil(-2) == -2); + } + + + + + + // Safe versions using standard casts. + + inline int iround(float f) + { + return int(floorf(f + 0.5f)); + } + + inline int iround(double f) + { + return int(::floor(f + 0.5)); + } + + inline int ifloor(float f) + { + return int(floorf(f)); + } + + inline int iceil(float f) + { + return int(ceilf(f)); + } + + + + // I'm always confused about which quantizer to use. I think we should choose a quantizer based on how the values are expanded later and this is generally using the 'exact endpoints' rule. + // Some notes from cbloom: http://cbloomrants.blogspot.com/2011/07/07-26-11-pixel-int-to-float-options.html + + // Quantize a float in the [0,1] range, using exact end points or uniform bins. + inline float quantizeFloat(float x, uint bits, bool exactEndPoints = true) { + nvDebugCheck(bits <= 16); + + float range = float(1 << bits); + if (exactEndPoints) { + return floorf(x * (range-1) + 0.5f) / (range-1); + } + else { + return (floorf(x * range) + 0.5f) / range; + } + } + + + // This is the most common rounding mode: + // + // 0 1 2 3 + // |___|_______|_______|___| + // 0 1 + // + // You get that if you take the unit floating point number multiply by 'N-1' and round to nearest. That is, `i = round(f * (N-1))`. + // You reconstruct the original float dividing by 'N-1': `f = i / (N-1)` + + + // 0 1 2 3 + // |_____|_____|_____|_____| + // 0 1 + + /*enum BinningMode { + RoundMode_ExactEndPoints, + RoundMode_UniformBins, + };*/ + + template + inline uint unitFloatToFixed(float f) { + return ftoi_round(f * ((1<(f); + } + + inline uint16 unitFloatToFixed16(float f) { + return (uint16)unitFloatToFixed<16>(f); + } + + +} // nv + +#endif // NV_MATH_FTOI_H diff --git a/src/nvmath/nvmath.h b/src/nvmath/nvmath.h index f2a907c..9626431 100644 --- a/src/nvmath/nvmath.h +++ b/src/nvmath/nvmath.h @@ -14,6 +14,13 @@ #include // finite, isnan #endif +#if NV_CPU_X86 || NV_CPU_X86_64 + //#include + #include +#endif + + + // Function linkage #if NVMATH_SHARED #ifdef NVMATH_EXPORTS @@ -28,6 +35,37 @@ #define NVMATH_CLASS #endif // NVMATH_SHARED +// Set some reasonable defaults. 
+#ifndef NV_USE_ALTIVEC +# define NV_USE_ALTIVEC NV_CPU_PPC +//# define NV_USE_ALTIVEC defined(__VEC__) +#endif + +#ifndef NV_USE_SSE +# if NV_CPU_X86_64 + // x64 always supports at least SSE2 +# define NV_USE_SSE 2 +# elif NV_CC_MSVC && defined(_M_IX86_FP) + // Also on x86 with the /arch:SSE flag in MSVC. +# define NV_USE_SSE _M_IX86_FP // 1=SSE, 2=SS2 +# elif defined(__SSE__) +# define NV_USE_SSE 1 +# elif defined(__SSE2__) +# define NV_USE_SSE 2 +# else + // Otherwise we assume no SSE. +# define NV_USE_SSE 0 +# endif +#endif + + +// Internally set NV_USE_SIMD when either altivec or sse is available. +#if NV_USE_ALTIVEC && NV_USE_SSE +# error "Cannot enable both altivec and sse!" +#endif + + + #ifndef PI #define PI float(3.1415926535897932384626433833) #endif @@ -179,26 +217,6 @@ namespace nv inline float cube(float f) { return f * f * f; } inline int cube(int i) { return i * i * i; } - // @@ Float to int conversions to be optimized at some point. See: - // http://cbloomrants.blogspot.com/2009/01/01-17-09-float-to-int.html - // http://www.stereopsis.com/sree/fpu2006.html - // http://assemblyrequired.crashworks.org/2009/01/12/why-you-should-never-cast-floats-to-ints/ - // http://chrishecker.com/Miscellaneous_Technical_Articles#Floating_Point - inline int iround(float f) - { - return int(floorf(f + 0.5f)); - } - - inline int ifloor(float f) - { - return int(floorf(f)); - } - - inline int iceil(float f) - { - return int(ceilf(f)); - } - inline float frac(float f) { return f - floor(f); @@ -242,21 +260,6 @@ namespace nv return 0; } - // I'm always confused about which quantizer to use. I think we should choose a quantizer based on how the values are expanded later and this is generally using the 'exact endpoints' rule. - - // Quantize a float in the [0,1] range, using exact end points or uniform bins. - inline float quantizeFloat(float x, uint bits, bool exactEndPoints = true) { - nvDebugCheck(bits <= 16); - - float range = float(1 << bits); - if (exactEndPoints) { - return floorf(x * (range-1) + 0.5f) / (range-1); - } - else { - return (floorf(x * range) + 0.5f) / range; - } - } - union Float754 { unsigned int raw; float value; diff --git a/src/nvthread/ThreadPool.cpp b/src/nvthread/ThreadPool.cpp index 1ecec5b..8364c62 100644 --- a/src/nvthread/ThreadPool.cpp +++ b/src/nvthread/ThreadPool.cpp @@ -50,7 +50,7 @@ AutoPtr s_pool; /*static*/ void ThreadPool::workerFunc(void * arg) { - uint i = toU32((uintptr_t)arg); // This is OK, because workerCount should always be much smaller than 2^32 + uint i = U32((uintptr_t)arg); // This is OK, because workerCount should always be much smaller than 2^32 while(true) { diff --git a/src/nvtt/BlockCompressor.cpp b/src/nvtt/BlockCompressor.cpp index 88ebebb..6998e1b 100644 --- a/src/nvtt/BlockCompressor.cpp +++ b/src/nvtt/BlockCompressor.cpp @@ -165,6 +165,10 @@ void ColorBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, u // Use a single thread to compress small textures. if (context.bh < 4) dispatcher = &sequential; +#if _DEBUG + dispatcher = &sequential; +#endif + const uint count = context.bw * context.bh; const uint size = context.bs * count; context.mem = new uint8[size]; @@ -231,6 +235,10 @@ void ColorSetCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d, c // Use a single thread to compress small textures. 
if (context.bh < 4) dispatcher = &sequential; +#if _DEBUG + dispatcher = &sequential; +#endif + const uint count = context.bw * context.bh; const uint size = context.bs * count; context.mem = new uint8[size]; diff --git a/src/nvtt/ClusterFit.cpp b/src/nvtt/ClusterFit.cpp index f9444b5..4f5602a 100644 --- a/src/nvtt/ClusterFit.cpp +++ b/src/nvtt/ClusterFit.cpp @@ -27,6 +27,7 @@ #include "ClusterFit.h" #include "nvmath/Fitting.h" #include "nvmath/Vector.inl" +#include "nvmath/ftoi.h" #include "nvimage/ColorBlock.h" #include // FLT_MAX @@ -37,7 +38,8 @@ ClusterFit::ClusterFit() { } -void ClusterFit::setColourSet(const ColorSet * set) +// @@ Deprecate. Do not use color set directly. +void ClusterFit::setColorSet(const ColorSet * set) { // initialise the best error #if NVTT_USE_SIMD @@ -58,6 +60,7 @@ void ClusterFit::setColourSet(const ColorSet * set) } Vector3 principal = Fit::computePrincipalComponent_PowerMethod(m_count, values, set->weights, metric); + //Vector3 principal = Fit::computePrincipalComponent_EigenSolver(m_count, values, set->weights, metric); // build the list of values int order[16]; @@ -107,7 +110,72 @@ void ClusterFit::setColourSet(const ColorSet * set) } -void ClusterFit::setMetric(Vector4::Arg w) +void ClusterFit::setColorSet(const Vector3 * colors, const float * weights, int count) +{ + // initialise the best error +#if NVTT_USE_SIMD + m_besterror = SimdVector( FLT_MAX ); + Vector3 metric = m_metric.toVector3(); +#else + m_besterror = FLT_MAX; + Vector3 metric = m_metric; +#endif + + m_count = count; + + Vector3 principal = Fit::computePrincipalComponent_PowerMethod(count, colors, weights, metric); + //Vector3 principal = Fit::computePrincipalComponent_EigenSolver(count, colors, weights, metric); + + // build the list of values + int order[16]; + float dps[16]; + for (uint i = 0; i < m_count; ++i) + { + dps[i] = dot(colors[i], principal); + order[i] = i; + } + + // stable sort + for (uint i = 0; i < m_count; ++i) + { + for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j) + { + swap(dps[j], dps[j - 1]); + swap(order[j], order[j - 1]); + } + } + + // weight all the points +#if NVTT_USE_SIMD + m_xxsum = SimdVector( 0.0f ); + m_xsum = SimdVector( 0.0f ); +#else + m_xxsum = Vector3(0.0f); + m_xsum = Vector3(0.0f); + m_wsum = 0.0f; +#endif + + for (uint i = 0; i < m_count; ++i) + { + int p = order[i]; +#if NVTT_USE_SIMD + NV_ALIGN_16 Vector4 tmp(colors[p], 1); + m_weighted[i] = SimdVector(tmp.component) * SimdVector(weights[p]); + m_xxsum += m_weighted[i] * m_weighted[i]; + m_xsum += m_weighted[i]; +#else + m_weighted[i] = colors[p] * weights[p]; + m_xxsum += m_weighted[i] * m_weighted[i]; + m_xsum += m_weighted[i]; + m_weights[i] = weights[p]; + m_wsum += m_weights[i]; +#endif + } +} + + + +void ClusterFit::setColorWeights(Vector4::Arg w) { #if NVTT_USE_SIMD NV_ALIGN_16 Vector4 tmp(w.xyz(), 1); @@ -292,12 +360,21 @@ bool ClusterFit::compress4( Vector3 * start, Vector3 * end ) SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 ); SimdVector e4 = multiplyAdd( two, e3, e1 ); +#if 1 // apply the metric to the error term SimdVector e5 = e4 * m_metricSqr; SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ(); +#else + // @@ Is there a horizontal max SIMD instruction? 
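// (SSE/SSE2 have no horizontal max instruction; it is normally emulated with a couple
// of shuffle + max steps, e.g. movhlps/maxps followed by a lane swap and another max.)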
+ SimdVector error = e4.splatX() + e4.splatY() + e4.splatZ(); + error *= two; + error += max(max(e4.splatX(), e4.splatY()), e4.splatZ()); + error -= min(min(e4.splatX(), e4.splatY()), e4.splatZ()); + +#endif // keep the solution if it wins - if( compareAnyLessThan( error, besterror ) ) + if (compareAnyLessThan(error, besterror)) { besterror = error; beststart = a; @@ -317,7 +394,7 @@ bool ClusterFit::compress4( Vector3 * start, Vector3 * end ) } // save the block if necessary - if( compareAnyLessThan( besterror, m_besterror ) ) + if (compareAnyLessThan(besterror, m_besterror)) { *start = beststart.toVector3(); *end = bestend.toVector3(); @@ -333,6 +410,29 @@ bool ClusterFit::compress4( Vector3 * start, Vector3 * end ) #else +inline Vector3 round565(const Vector3 & v) { + uint r = ftoi_floor(v.x * 31.0f); + float r0 = float(((r+0) << 3) | ((r+0) >> 2)); + float r1 = float(((r+1) << 3) | ((r+1) >> 2)); + if (fabs(v.x - r1) < fabs(v.x - r0)) r = min(r+1, 31U); + r = (r << 3) | (r >> 2); + + uint g = ftoi_floor(v.y * 63.0f); + float g0 = float(((g+0) << 2) | ((g+0) >> 4)); + float g1 = float(((g+1) << 2) | ((g+1) >> 4)); + if (fabs(v.y - g1) < fabs(v.y - g0)) g = min(g+1, 63U); + g = (g << 2) | (g >> 4); + + uint b = ftoi_floor(v.z * 31.0f); + float b0 = float(((b+0) << 3) | ((b+0) >> 2)); + float b1 = float(((b+1) << 3) | ((b+1) >> 2)); + if (fabs(v.z - b1) < fabs(v.z - b0)) b = min(b+1, 31U); + + b = (b << 3) | (b >> 2); + + return Vector3(float(r)/255, float(g)/255, float(b)/255); +} + bool ClusterFit::compress3(Vector3 * start, Vector3 * end) { const uint count = m_count; @@ -374,8 +474,29 @@ bool ClusterFit::compress3(Vector3 * start, Vector3 * end) // clamp to the grid a = clamp(a, 0, 1); b = clamp(b, 0, 1); - a = floor(grid * a + 0.5f) * gridrcp; - b = floor(grid * b + 0.5f) * gridrcp; + //a = floor(grid * a + 0.5f) * gridrcp; + //b = floor(grid * b + 0.5f) * gridrcp; + + //int ar = ftoi_round(31 * a.x); ar = (ar << 3) | (ar >> 2); a.x = float(ar) / 255.0f; + //int ag = ftoi_round(63 * a.y); ar = (ag << 2) | (ag >> 4); a.y = float(ag) / 255.0f; + //int ab = ftoi_round(31 * a.z); ar = (ab << 3) | (ab >> 2); a.z = float(ab) / 255.0f; + //int br = ftoi_round(31 * b.x); br = (br << 3) | (br >> 2); b.x = float(br) / 255.0f; + //int bg = ftoi_round(63 * b.y); br = (bg << 2) | (bg >> 4); b.y = float(bg) / 255.0f; + //int bb = ftoi_round(31 * b.z); br = (bb << 3) | (bb >> 2); b.z = float(bb) / 255.0f; + + /*a = floor(a * grid + 0.5f); + a.x = (a.x * 8 + floorf(a.x / 4)) / 255.0f; + a.y = (a.y * 4 + floorf(a.y / 16)) / 255.0f; + a.z = (a.z * 8 + floorf(a.z / 4)) / 255.0f; + + b = floor(b * grid + 0.5f); + b.x = (b.x * 8 + floorf(b.x / 4)) / 255.0f; + b.y = (b.y * 4 + floorf(b.y / 16)) / 255.0f; + b.z = (b.z * 8 + floorf(b.z / 4)) / 255.0f;*/ + + a = round565(a); + b = round565(b); + // compute the error Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum ); @@ -461,8 +582,30 @@ bool ClusterFit::compress4(Vector3 * start, Vector3 * end) // clamp to the grid a = clamp(a, 0, 1); b = clamp(b, 0, 1); - a = floor(a * grid + 0.5f) * gridrcp; - b = floor(b * grid + 0.5f) * gridrcp; + //a = floor(a * grid + 0.5f) * gridrcp; + //b = floor(b * grid + 0.5f) * gridrcp; + + //int ar = ftoi_round(31 * a.x); ar = (ar << 3) | (ar >> 2); a.x = float(ar) / 255.0f; + //int ag = ftoi_round(63 * a.y); ar = (ag << 2) | (ag >> 4); a.y = float(ag) / 255.0f; + //int ab = ftoi_round(31 * a.z); ar = (ab << 3) | (ab >> 2); a.z = float(ab) / 255.0f; + //int br = ftoi_round(31 * 
b.x); br = (br << 3) | (br >> 2); b.x = float(br) / 255.0f; + //int bg = ftoi_round(63 * b.y); br = (bg << 2) | (bg >> 4); b.y = float(bg) / 255.0f; + //int bb = ftoi_round(31 * b.z); br = (bb << 3) | (bb >> 2); b.z = float(bb) / 255.0f; + + /* + a = floor(a * grid + 0.5f); + a.x = (a.x * 8 + floorf(a.x / 4)) / 255.0f; + a.y = (a.y * 4 + floorf(a.y / 16)) / 255.0f; + a.z = (a.z * 8 + floorf(a.z / 4)) / 255.0f; + + b = floor(b * grid + 0.5f); + b.x = (b.x * 8 + floorf(b.x / 4)) / 255.0f; + b.y = (b.y * 4 + floorf(b.y / 16)) / 255.0f; + b.z = (b.z * 8 + floorf(b.z / 4)) / 255.0f; + */ + + a = round565(a); + b = round565(b); // compute the error Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum ); diff --git a/src/nvtt/ClusterFit.h b/src/nvtt/ClusterFit.h index 49a3ec4..39dcbfb 100644 --- a/src/nvtt/ClusterFit.h +++ b/src/nvtt/ClusterFit.h @@ -31,8 +31,8 @@ #include "nvmath/Vector.h" // Use SIMD version if altivec or SSE are available. -#define NVTT_USE_SIMD (NV_USE_ALTIVEC || NV_USE_SSE) -//#define NVTT_USE_SIMD 0 +//#define NVTT_USE_SIMD (NV_USE_ALTIVEC || NV_USE_SSE) +#define NVTT_USE_SIMD 0 namespace nv { @@ -43,9 +43,10 @@ namespace nv { public: ClusterFit(); - void setColourSet(const ColorSet * set); + void setColorSet(const ColorSet * set); + void setColorSet(const Vector3 * colors, const float * weights, int count); - void setMetric(const Vector4 & w); + void setColorWeights(const Vector4 & w); float bestError() const; bool compress3(Vector3 * start, Vector3 * end); diff --git a/src/nvtt/CompressionOptions.cpp b/src/nvtt/CompressionOptions.cpp index 1bdbd1f..3951c32 100644 --- a/src/nvtt/CompressionOptions.cpp +++ b/src/nvtt/CompressionOptions.cpp @@ -246,11 +246,14 @@ unsigned int CompressionOptions::d3d9Format() const FOURCC_ATI2, // Format_BC5 FOURCC_DXT1, // Format_DXT1n 0, // Format_CTX1 - 0, // Format_BC6 - 0, // Format_BC7 - 0, // Format_RGBE + MAKEFOURCC('B', 'C', '6', 'H'), // Format_BC6 + MAKEFOURCC('B', 'C', '7', 'L'), // Format_BC7 + FOURCC_ATI2, // Format_BC5_Luma + FOURCC_DXT5, // Format_BC3_RGBM }; + NV_COMPILER_CHECK(NV_ARRAY_SIZE(d3d9_formats) == Format_Count); + return d3d9_formats[m.format]; } } diff --git a/src/nvtt/CompressorDX10.cpp b/src/nvtt/CompressorDX10.cpp index bb602ff..d823db8 100644 --- a/src/nvtt/CompressorDX10.cpp +++ b/src/nvtt/CompressorDX10.cpp @@ -31,49 +31,90 @@ #include "nvimage/ColorBlock.h" #include "nvimage/BlockDXT.h" +#include "nvmath/ftoi.h" + #include // placement new using namespace nv; using namespace nvtt; -void FastCompressorBC4::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +void FastCompressorBC4::compressBlock(ColorBlock & src, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) { BlockATI1 * block = new(output) BlockATI1; - rgba.swizzle(0, 1, 2, 0); // Copy red to alpha - QuickCompress::compressDXT5A(rgba, &block->alpha); + AlphaBlock4x4 tmp; + tmp.init(src, 0); // Copy red to alpha + QuickCompress::compressDXT5A(tmp, &block->alpha); } -void FastCompressorBC5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +void FastCompressorBC5::compressBlock(ColorBlock & src, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) { BlockATI2 * block = new(output) BlockATI2; - rgba.swizzle(0, 1, 2, 0); // Copy red to 
alpha - QuickCompress::compressDXT5A(rgba, &block->x); + AlphaBlock4x4 tmp; + + tmp.init(src, 0); // Copy red to alpha + QuickCompress::compressDXT5A(tmp, &block->x); - rgba.swizzle(0, 1, 2, 1); // Copy green to alpha - QuickCompress::compressDXT5A(rgba, &block->y); + tmp.init(src, 1); // Copy green to alpha + QuickCompress::compressDXT5A(tmp, &block->y); } -void ProductionCompressorBC4::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +void ProductionCompressorBC4::compressBlock(ColorBlock & src, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) { BlockATI1 * block = new(output) BlockATI1; - rgba.swizzle(0, 1, 2, 0); // Copy red to alpha - OptimalCompress::compressDXT5A(rgba, &block->alpha); + AlphaBlock4x4 tmp; + tmp.init(src, 0); // Copy red to alpha + OptimalCompress::compressDXT5A(tmp, &block->alpha); } -void ProductionCompressorBC5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +void ProductionCompressorBC5::compressBlock(ColorBlock & src, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) { BlockATI2 * block = new(output) BlockATI2; + + AlphaBlock4x4 tmp; + + tmp.init(src, 0); // Copy red to alpha + OptimalCompress::compressDXT5A(tmp, &block->x); - rgba.swizzle(0, 1, 2, 0); // Copy red to alpha - OptimalCompress::compressDXT5A(rgba, &block->x); - - rgba.swizzle(0, 1, 2, 1); // Copy green to alpha - OptimalCompress::compressDXT5A(rgba, &block->y); + tmp.init(src, 1); // Copy green to alpha + OptimalCompress::compressDXT5A(tmp, &block->y); } +void ProductionCompressorBC5_Luma::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockATI2 * block = new(output) BlockATI2; + + AlphaBlock4x4 tmp; + tmp.init(set, /*channel=*/0); + OptimalCompress::compressDXT5A(tmp, &block->x); + + // Decode block->x + AlphaBlock4x4 decoded; + block->x.decodeBlock(&decoded); + + const float R = 1.0f / 256.0f; // Maximum residual that we can represent. @@ Tweak this. + + // Compute residual block. + for (int i = 0; i < 16; i++) { + float in = set.color(i).x; // [0,1] + float out = float(decoded.alpha[i]) / 255.0f; // [0,1] + + float residual = (out - in); // [-1,1], but usually [-R,R] + + // Normalize residual to [-1,1] range. + residual /= R; + + // Pack in [0,1] range. 
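// (A decoder can then reconstruct the input as: value = x - (2*y - 1) * R, where x is
// the BC5 red channel and y is the packed residual stored in the green channel.)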
+ residual = residual * 0.5f + 0.5f; + + tmp.alpha[i] = nv::ftoi_round(nv::saturate(residual) * 255.0f); + } + + OptimalCompress::compressDXT5A(tmp, &block->y); + +} diff --git a/src/nvtt/CompressorDX10.h b/src/nvtt/CompressorDX10.h index 5be6361..0ea16c3 100644 --- a/src/nvtt/CompressorDX10.h +++ b/src/nvtt/CompressorDX10.h @@ -58,6 +58,13 @@ namespace nv virtual uint blockSize() const { return 16; } }; + struct ProductionCompressorBC5_Luma : public ColorSetCompressor + { + virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + } // nv namespace diff --git a/src/nvtt/CompressorDX11.cpp b/src/nvtt/CompressorDX11.cpp index 98c736d..cf83a69 100644 --- a/src/nvtt/CompressorDX11.cpp +++ b/src/nvtt/CompressorDX11.cpp @@ -24,7 +24,6 @@ #include "CompressorDX11.h" -#include #include "nvtt.h" #include "CompressionOptions.h" #include "nvimage/ColorBlock.h" @@ -34,16 +33,16 @@ #include "bc6h/zoh.h" #include "bc7/avpcl.h" +#include // memset + using namespace nv; using namespace nvtt; void CompressorBC6::compressBlock(ColorSet & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output) { - // !!!UNDONE: support channel weights - // !!!UNDONE: set flags once, not per block (this is especially sketchy since block compression is multithreaded...) - - NV_UNUSED(alphaMode); // ZOH does not support alpha. + // !!!UNDONE: support channel weights + // !!!UNDONE: set flags once, not per block (this is especially sketchy since block compression is multithreaded...) if (compressionOptions.pixelType == PixelType_UnsignedFloat || compressionOptions.pixelType == PixelType_UnsignedNorm || @@ -56,44 +55,60 @@ void CompressorBC6::compressBlock(ColorSet & tile, AlphaMode alphaMode, const Co ZOH::Utils::FORMAT = ZOH::SIGNED_F16; } - // Convert NVTT's tile struct to ZOH's, and convert float to half. - ZOH::Tile zohTile(tile.w, tile.h); - memset(zohTile.data, 0, sizeof(zohTile.data)); - memset(zohTile.importance_map, 0, sizeof(zohTile.importance_map)); - for (uint y = 0; y < tile.h; ++y) - { - for (uint x = 0; x < tile.w; ++x) - { - Vector3 color = tile.color(x, y).xyz(); - uint16 rHalf = to_half(color.x); - uint16 gHalf = to_half(color.y); - uint16 bHalf = to_half(color.z); - zohTile.data[y][x].x = ZOH::Tile::half2float(rHalf); - zohTile.data[y][x].y = ZOH::Tile::half2float(gHalf); - zohTile.data[y][x].z = ZOH::Tile::half2float(bHalf); - zohTile.importance_map[y][x] = 1.0f; - } - } + // Convert NVTT's tile struct to ZOH's, and convert float to half. 
+ ZOH::Tile zohTile(tile.w, tile.h); + memset(zohTile.data, 0, sizeof(zohTile.data)); + memset(zohTile.importance_map, 0, sizeof(zohTile.importance_map)); + for (uint y = 0; y < tile.h; ++y) + { + for (uint x = 0; x < tile.w; ++x) + { + Vector4 color = tile.color(x, y); + uint16 rHalf = to_half(color.x); + uint16 gHalf = to_half(color.y); + uint16 bHalf = to_half(color.z); + zohTile.data[y][x].x = ZOH::Tile::half2float(rHalf); + zohTile.data[y][x].y = ZOH::Tile::half2float(gHalf); + zohTile.data[y][x].z = ZOH::Tile::half2float(bHalf); + + if (alphaMode == AlphaMode_Transparency) { + zohTile.importance_map[y][x] = color.w; + } + else { + zohTile.importance_map[y][x] = 1.0f; + } + } + } ZOH::compress(zohTile, (char *)output); } void CompressorBC7::compressBlock(ColorSet & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output) { - // !!!UNDONE: support channel weights - // !!!UNDONE: set flags once, not per block (this is especially sketchy since block compression is multithreaded...) - - AVPCL::mode_rgb = false; - AVPCL::flag_premult = (alphaMode == AlphaMode_Premultiplied); - AVPCL::flag_nonuniform = false; - AVPCL::flag_nonuniform_ati = false; + // !!!UNDONE: support channel weights + // !!!UNDONE: set flags once, not per block (this is especially sketchy since block compression is multithreaded...) - // Convert NVTT's tile struct to AVPCL's. - AVPCL::Tile avpclTile(tile.w, tile.h); - memset(avpclTile.data, 0, sizeof(avpclTile.data)); - for (uint y = 0; y < tile.h; ++y) - for (uint x = 0; x < tile.w; ++x) - avpclTile.data[y][x] = tile.color(x, y) * 255.0f; + AVPCL::mode_rgb = false; + AVPCL::flag_premult = (alphaMode == AlphaMode_Premultiplied); + AVPCL::flag_nonuniform = false; + AVPCL::flag_nonuniform_ati = false; + + // Convert NVTT's tile struct to AVPCL's. 
+ AVPCL::Tile avpclTile(tile.w, tile.h); + memset(avpclTile.data, 0, sizeof(avpclTile.data)); + for (uint y = 0; y < tile.h; ++y) { + for (uint x = 0; x < tile.w; ++x) { + Vector4 color = tile.color(x, y); + avpclTile.data[y][x] = color * 255.0f; + + /*if (alphaMode == AlphaMode_Transparency) { + avpclTile.importance_map[y][x] = color.w; + } + else*/ { + avpclTile.importance_map[y][x] = 1.0f; + } + } + } - AVPCL::compress(avpclTile, (char *)output); + AVPCL::compress(avpclTile, (char *)output); } diff --git a/src/nvtt/CompressorDX9.cpp b/src/nvtt/CompressorDX9.cpp index c3bee15..1fc3254 100644 --- a/src/nvtt/CompressorDX9.cpp +++ b/src/nvtt/CompressorDX9.cpp @@ -112,7 +112,8 @@ void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alpha QuickCompress::compressDXT5(rgba, block); } -#if 0 + +#if 1 void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) { set.setUniformWeights(); @@ -125,11 +126,14 @@ void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, co Color32 c = toColor32(set.colors[0]); OptimalCompress::compressDXT1(c, block); } + /*else if (set.colorCount == 2) { + QuickCompress::compressDXT1(..., block); + }*/ else { ClusterFit fit; - fit.setMetric(compressionOptions.colorWeight); - fit.setColourSet(&set); + fit.setColorWeights(compressionOptions.colorWeight); + fit.setColorSet(&set); Vector3 start, end; fit.compress4(&start, &end); @@ -142,6 +146,37 @@ void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, co } } } +#elif 1 + + +extern void compress_dxt1_bounding_box_exhaustive(const ColorBlock & input, BlockDXT1 * output); + + +void CompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT1 * block = new(output) BlockDXT1; + + if (rgba.isSingleColor()) + { + OptimalCompress::compressDXT1(rgba.color(0), block); + //compress_dxt1_single_color_optimal(rgba.color(0), block); + } + else + { + // Do an exhaustive search inside the bounding box. + compress_dxt1_bounding_box_exhaustive(rgba, block); + } + + /*else + { + nvsquish::WeightedClusterFit fit; + fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z); + + nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0); + fit.SetColourSet(&colours, nvsquish::kDxt1); + fit.Compress(output); + }*/ +} #else void CompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) { @@ -304,6 +339,309 @@ void CompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode } + + + +void CompressorBC3_RGBM::compressBlock(ColorSet & src, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT5 * block = new(output)BlockDXT5; + + if (alphaMode == AlphaMode_Transparency) { + src.setAlphaWeights(); + } + else { + src.setUniformWeights(); + } + + // Decompress the color block and find the M values that reproduce the input most closely. This should compensate for some of the DXT errors. + + // Compress the resulting M values optimally. + + // Repeat this several times until compression error does not improve? + + //Vector3 rgb_block[16]; + //float m_block[16]; + + + // Init RGB/M block. + const float threshold = 0.15f; // @@ Use compression options. 
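// For reference, a sketch of the RGB/M mapping this compressor assumes (M is remapped
// to the [threshold, 1] range before being quantized into the DXT5 alpha block):
//
//     encode:  M = max(max(R, G), max(B, threshold));   rgb' = rgb / M;
//              m = (M - threshold) / (1 - threshold);   // stored in the alpha block
//     decode:  M = m * (1 - threshold) + threshold;     rgb = rgb' * M;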
+#if 0 + nvsquish::WeightedClusterFit fit; + + ColorBlock rgba; + for (int i = 0; i < 16; i++) { + const Vector4 & c = src.color(i); + float R = saturate(c.x); + float G = saturate(c.y); + float B = saturate(c.z); + + float M = max(max(R, G), max(B, threshold)); + float r = R / M; + float g = G / M; + float b = B / M; + float a = c.w; + + rgba.color(i) = toColor32(Vector4(r, g, b, a)); + } + + if (rgba.isSingleColor()) + { + OptimalCompress::compressDXT1(rgba.color(0), &block->color); + } + else + { + nvsquish::WeightedClusterFit fit; + fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z); + + int flags = 0; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + + nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); + fit.SetColourSet(&colours, 0); + fit.Compress(&block->color); + } +#endif +#if 1 + ColorSet rgb; + rgb.allocate(src.w, src.h); // @@ Handle smaller blocks. + + if (src.colorCount != 16) { + nvDebugBreak(); + } + + for (uint i = 0; i < src.colorCount; i++) { + const Vector4 & c = src.color(i); + + float R = saturate(c.x); + float G = saturate(c.y); + float B = saturate(c.z); + + float M = max(max(R, G), max(B, threshold)); + float r = R / M; + float g = G / M; + float b = B / M; + float a = c.w; + + rgb.colors[i] = Vector4(r, g, b, a); + rgb.indices[i] = i; + rgb.weights[i] = max(c.w, 0.001f);// src.weights[i]; // IC: For some reason 0 weights are causing problems, even if we eliminate the corresponding colors from the set. + } + + rgb.createMinimalSet(/*ignoreTransparent=*/true); + + if (rgb.isSingleColor(/*ignoreAlpha=*/true)) { + OptimalCompress::compressDXT1(toColor32(rgb.color(0)), &block->color); + } + else { + ClusterFit fit; + fit.setColorWeights(compressionOptions.colorWeight); + fit.setColorSet(&rgb); + + Vector3 start, end; + fit.compress4(&start, &end); + + QuickCompress::outputBlock4(rgb, start, end, &block->color); + } +#endif + + // Decompress RGB/M block. + nv::ColorBlock RGB; + block->color.decodeBlock(&RGB); + +#if 1 + AlphaBlock4x4 M; + for (int i = 0; i < 16; i++) { + const Vector4 & c = src.color(i); + float R = saturate(c.x); + float G = saturate(c.y); + float B = saturate(c.z); + + float r = RGB.color(i).r / 255.0f; + float g = RGB.color(i).g / 255.0f; + float b = RGB.color(i).b / 255.0f; + + float m = (R / r + G / g + B / b) / 3.0f; + //float m = max((R / r + G / g + B / b) / 3.0f, threshold); + //float m = max(max(R / r, G / g), max(B / b, threshold)); + //float m = max(max(R, G), max(B, threshold)); + + m = (m - threshold) / (1 - threshold); + + M.alpha[i] = U8(ftoi_round(saturate(m) * 255.0f)); + M.weights[i] = src.weights[i]; + } + + // Compress M. + if (compressionOptions.quality == Quality_Fastest) { + QuickCompress::compressDXT5A(M, &block->alpha); + } + else { + OptimalCompress::compressDXT5A(M, &block->alpha); + } +#else + OptimalCompress::compressDXT5A_RGBM(src, RGB, &block->alpha); +#endif + +#if 0 + // Decompress M. + block->alpha.decodeBlock(&M); + + rgb.allocate(src.w, src.h); // @@ Handle smaller blocks. 
+ + for (uint i = 0; i < src.colorCount; i++) { + const Vector4 & c = src.color(i); + + float R = saturate(c.x); + float G = saturate(c.y); + float B = saturate(c.z); + + //float m = max(max(R, G), max(B, threshold)); + float m = float(M.alpha[i]) / 255.0f * (1 - threshold) + threshold; + float r = R / m; + float g = G / m; + float b = B / m; + float a = c.w; + + rgb.colors[i] = Vector4(r, g, b, a); + rgb.indices[i] = i; + rgb.weights[i] = max(c.w, 0.001f);// src.weights[i]; // IC: For some reason 0 weights are causing problems, even if we eliminate the corresponding colors from the set. + } + + rgb.createMinimalSet(/*ignoreTransparent=*/true); + + if (rgb.isSingleColor(/*ignoreAlpha=*/true)) { + OptimalCompress::compressDXT1(toColor32(rgb.color(0)), &block->color); + } + else { + ClusterFit fit; + fit.setMetric(compressionOptions.colorWeight); + fit.setColourSet(&rgb); + + Vector3 start, end; + fit.compress4(&start, &end); + + QuickCompress::outputBlock4(rgb, start, end, &block->color); + } +#endif + +#if 0 + block->color.decodeBlock(&RGB); + + //AlphaBlock4x4 M; + //M.initWeights(src); + + for (int i = 0; i < 16; i++) { + const Vector4 & c = src.color(i); + float R = saturate(c.x); + float G = saturate(c.y); + float B = saturate(c.z); + + float r = RGB.color(i).r / 255.0f; + float g = RGB.color(i).g / 255.0f; + float b = RGB.color(i).b / 255.0f; + + float m = (R / r + G / g + B / b) / 3.0f; + //float m = max((R / r + G / g + B / b) / 3.0f, threshold); + //float m = max(max(R / r, G / g), max(B / b, threshold)); + //float m = max(max(R, G), max(B, threshold)); + + m = (m - threshold) / (1 - threshold); + + M.alpha[i] = U8(ftoi_round(saturate(m) * 255.0f)); + M.weights[i] = src.weights[i]; + } + + // Compress M. + if (compressionOptions.quality == Quality_Fastest) { + QuickCompress::compressDXT5A(M, &block->alpha); + } + else { + OptimalCompress::compressDXT5A(M, &block->alpha); + } +#endif + + + +#if 0 + src.fromRGBM(M, threshold); + + src.createMinimalSet(/*ignoreTransparent=*/true); + + if (src.isSingleColor(/*ignoreAlpha=*/true)) { + OptimalCompress::compressDXT1(src.color(0), &block->color); + } + else { + // @@ Use our improved compressor. + ClusterFit fit; + fit.setMetric(compressionOptions.colorWeight); + fit.setColourSet(&src); + + Vector3 start, end; + fit.compress4(&start, &end); + + if (fit.compress3(&start, &end)) { + QuickCompress::outputBlock3(src, start, end, block->color); + } + else { + QuickCompress::outputBlock4(src, start, end, block->color); + } + } +#endif // 0 + + // @@ Decompress color and compute M that best approximates src with these colors? Then compress M again? + + + + // RGBM encoding. + // Maximize precision. + // - Number of possible grey levels: + // - Naive: 2^3 = 8 + // - Better: 2^3 + 2^2 = 12 + // - How to choose threshold? + // - Ideal = Adaptive per block, don't know where to store. + // - Adaptive per lightmap. How to compute optimal? + // - Fixed: 0.25 in our case. Lightmaps scaled to a fixed [0, 1] range. + + // - Optimal compressor: Interpolation artifacts. + + // - Color transform. + // - Measure error in post-tone-mapping color space. + // - Assume a simple tone mapping operator. We know minimum and maximum exposure, but don't know exact exposure in game. + // - Guess based on average lighmap color? Use fixed exposure, in scaled lightmap space. + + // - Enhanced DXT compressor. 
+ // - Typical RGBM encoding as follows: + // rgb -> M = max(rgb), RGB=rgb/M -> RGBM + // - If we add a compression step (M' = M) and M' < M, then rgb may be greater than 1. + // - We could ensure that M' >= M during compression. + // - We could clamp RGB anyway. + // - We could add a fixed scale value to take into account compression errors and avoid clamping. + + + + + + // Compress color. + /*if (rgba.isSingleColor()) + { + OptimalCompress::compressDXT1(rgba.color(0), &block->color); + } + else + { + nvsquish::WeightedClusterFit fit; + fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z); + + int flags = 0; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + + nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); + fit.SetColourSet(&colours, 0); + fit.Compress(&block->color); + }*/ +} + + + #if defined(HAVE_ATITC) void AtiCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) diff --git a/src/nvtt/CompressorDX9.h b/src/nvtt/CompressorDX9.h index e3e830b..33c1112 100644 --- a/src/nvtt/CompressorDX9.h +++ b/src/nvtt/CompressorDX9.h @@ -64,7 +64,7 @@ namespace nv // Normal CPU compressors. -#if 0 +#if 1 struct CompressorDXT1 : public ColorSetCompressor { virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); @@ -108,6 +108,12 @@ namespace nv virtual uint blockSize() const { return 16; } }; + struct CompressorBC3_RGBM : public ColorSetCompressor + { + virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + // External compressors. #if defined(HAVE_ATITC) diff --git a/src/nvtt/CompressorDXT1.cpp b/src/nvtt/CompressorDXT1.cpp new file mode 100644 index 0000000..f884a9a --- /dev/null +++ b/src/nvtt/CompressorDXT1.cpp @@ -0,0 +1,461 @@ + +#include "CompressorDXT1.h" +#include "SingleColorLookup.h" +#include "ClusterFit.h" +#include "QuickCompressDXT.h" // Deprecate. + +#include "nvimage/ColorBlock.h" +#include "nvimage/BlockDXT.h" + +#include "nvmath/Color.inl" +#include "nvmath/Vector.inl" +#include "nvmath/Fitting.h" +#include "nvmath/ftoi.h" + +#include "nvcore/Utils.h" // swap + +#include // memset + + +using namespace nv; + + +inline static void color_block_to_vector_block(const ColorBlock & rgba, Vector3 block[16]) +{ + for (int i = 0; i < 16; i++) + { + const Color32 c = rgba.color(i); + block[i] = Vector3(c.r, c.g, c.b); + } +} + +inline Vector3 r5g6b5_to_vector3(int r, int g, int b) +{ + Vector3 c; + c.x = float((r << 3) | (r >> 2)); + c.y = float((g << 2) | (g >> 4)); + c.z = float((b << 3) | (b >> 2)); + return c; +} + +inline Vector3 color_to_vector3(Color32 c) +{ + const float scale = 1.0f / 255.0f; + return Vector3(c.r * scale, c.g * scale, c.b * scale); +} + +inline Color32 vector3_to_color(Vector3 v) +{ + Color32 color; + color.r = U8(ftoi_round(saturate(v.x) * 255)); + color.g = U8(ftoi_round(saturate(v.y) * 255)); + color.b = U8(ftoi_round(saturate(v.z) * 255)); + color.a = 255; +} + + + +// Find first valid color. 
+static bool find_valid_color_rgb(const Vector3 * colors, const float * weights, int count, Vector3 * valid_color) +{ + for (int i = 0; i < count; i++) { + if (weights[i] > 0.0f) { + *valid_color = colors[i]; + return true; + } + } + + // No valid colors. + return false; +} + +static bool is_single_color_rgb(const Vector3 * colors, const float * weights, int count, Vector3 color) +{ + for (int i = 0; i < count; i++) { + if (weights[i] > 0.0f) { + if (colors[i] != color) return false; + } + } + + return true; +} + +// Find similar colors and combine them together. +static int reduce_colors(const Vector3 * input_colors, const float * input_weights, Vector3 * colors, float * weights) +{ + int n = 0; + for (int i = 0; i < 16; i++) + { + Vector3 ci = input_colors[i]; + float wi = input_weights[i]; + + if (wi > 0) { + // Find matching color. + int j; + for (j = 0; j < n; j++) { + if (equal(colors[j].x, ci.x) && equal(colors[j].y, ci.y) && equal(colors[j].z, ci.z)) { + weights[j] += wi; + break; + } + } + + // No match found. Add new color. + if (j == n) { + colors[n] = ci; + weights[n] = wi; + n++; + } + } + } + + nvDebugCheck(n <= 16); + + return n; +} + + + +// Different ways of estimating the error. +static float evaluate_mse(const Vector3 & p, const Vector3 & c) { + return square(p.x-c.x) + square(p.y-c.y) + square(p.z-c.z); +} + +/*static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) { + return ww.x * square(p.x-c.x) + ww.y * square(p.y-c.y) + ww.z * square(p.z-c.z); +}*/ + +static int evaluate_mse_rgb(const Color32 & p, const Color32 & c) { + return square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b); +} + +static float evaluate_mse(const Vector3 palette[4], const Vector3 & c) { + float e0 = evaluate_mse(palette[0], c); + float e1 = evaluate_mse(palette[1], c); + float e2 = evaluate_mse(palette[2], c); + float e3 = evaluate_mse(palette[3], c); + return min(min(e0, e1), min(e2, e3)); +} + +static int evaluate_mse(const Color32 palette[4], const Color32 & c) { + int e0 = evaluate_mse_rgb(palette[0], c); + int e1 = evaluate_mse_rgb(palette[1], c); + int e2 = evaluate_mse_rgb(palette[2], c); + int e3 = evaluate_mse_rgb(palette[3], c); + return min(min(e0, e1), min(e2, e3)); +} + +static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, int index) { + return evaluate_mse(palette[index], c); +} + +static int evaluate_mse(const Color32 palette[4], const Color32 & c, int index) { + return evaluate_mse_rgb(palette[index], c); +} + + +static float evaluate_mse(const BlockDXT1 * output, Vector3 colors[16]) { + Color32 palette[4]; + output->evaluatePalette(palette, /*d3d9=*/false); + + // convert palette to float. + Vector3 vector_palette[4]; + for (int i = 0; i < 4; i++) { + vector_palette[i] = color_to_vector3(palette[i]); + } + + // evaluate error for each index. + float error = 0.0f; + for (int i = 0; i < 16; i++) { + int index = (output->indices >> (2*i)) & 3; // @@ Is this the right order? 
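// (For DXT1 the index bits are packed one row per byte with the leftmost pixel in the
// least significant bits, so with a little-endian read of the 32-bit word,
// (indices >> (2*i)) & 3 is the index of pixel i in row-major order.)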
+ error += evaluate_mse(vector_palette, colors[i], index); + } + + return error; +} + +static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) { + Color32 palette[4]; + output->evaluatePalette(palette, /*d3d9=*/false); + + return evaluate_mse(palette, color, index); +} + + +/*void output_block3(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block) +{ + Vector3 minColor = start * 255.0f; + Vector3 maxColor = end * 255.0f; + uint16 color0 = roundAndExpand(&minColor); + uint16 color1 = roundAndExpand(&maxColor); + + if (color0 > color1) { + swap(maxColor, minColor); + swap(color0, color1); + } + + block->col0 = Color16(color0); + block->col1 = Color16(color1); + block->indices = compute_indices3(colors, weights, count, maxColor / 255.0f, minColor / 255.0f); + + //optimizeEndPoints3(set, block); +}*/ + + + + + + +// Single color compressor, based on: +// https://mollyrocket.com/forums/viewtopic.php?t=392 +float nv::compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) +{ + output->col0.r = OMatch5[c.r][0]; + output->col0.g = OMatch6[c.g][0]; + output->col0.b = OMatch5[c.b][0]; + output->col1.r = OMatch5[c.r][1]; + output->col1.g = OMatch6[c.g][1]; + output->col1.b = OMatch5[c.b][1]; + output->indices = 0xaaaaaaaa; + + if (output->col0.u < output->col1.u) + { + swap(output->col0.u, output->col1.u); + output->indices ^= 0x55555555; + } + + return (float) evaluate_mse(output, c, output->indices & 3); +} + + +float nv::compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output) +{ + return compress_dxt1_single_color_optimal(vector3_to_color(color), output); +} + + +// Low quality baseline compressor. +float nv::compress_dxt1_least_squares_fit(const Vector3 * input_colors, const Vector3 * colors, const float * weights, int count, BlockDXT1 * output) +{ + // @@ Iterative best end point fit. 
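// One way to fill this in (a sketch of the usual iterative fit, not implemented here): pick initial
// endpoints from the bounding-box diagonal or the principal axis of the colors, assign each color to
// the nearest of the four palette entries, then solve the weighted least-squares system for the two
// endpoints given those assignments (palette weights 1, 2/3, 1/3, 0 -- the same normal-equations
// pattern optimizeAlpha8 uses for the alpha block in QuickCompressDXT.cpp), and repeat until the
// assignments stop changing. For now the function reports FLT_MAX, i.e. "no result".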
+ + return FLT_MAX; +} + + +static Color32 bitexpand_color16_to_color32(Color16 c16) { + Color32 c32; + c32.b = (c16.b << 3) | (c16.b >> 2); + c32.g = (c16.g << 2) | (c16.g >> 4); + c32.r = (c16.r << 3) | (c16.r >> 2); + c32.a = 0xFF; + + //c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000); + //c32.u |= (c32.u >> 5) & 0x070007; + //c32.u |= (c32.u >> 6) & 0x000300; + + return c32; +} + +static Color32 bitexpand_color16_to_color32(int r, int g, int b) { + Color32 c32; + c32.b = (b << 3) | (b >> 2); + c32.g = (g << 2) | (g >> 4); + c32.r = (r << 3) | (r >> 2); + c32.a = 0xFF; + return c32; +} + +static Color16 truncate_color32_to_color16(Color32 c32) { + Color16 c16; + c16.b = (c32.b >> 3); + c16.g = (c32.g >> 2); + c16.r = (c32.r >> 3); + return c16; +} + + + + +static float evaluate_palette4(Color32 palette[4]) { + palette[2].r = (2 * palette[0].r + palette[1].r) / 3; + palette[2].g = (2 * palette[0].g + palette[1].g) / 3; + palette[2].b = (2 * palette[0].b + palette[1].b) / 3; + palette[3].r = (2 * palette[1].r + palette[0].r) / 3; + palette[3].g = (2 * palette[1].g + palette[0].g) / 3; + palette[3].b = (2 * palette[1].b + palette[0].b) / 3; +} + +static float evaluate_palette3(Color32 palette[4]) { + palette[2].r = (palette[0].r + palette[1].r) / 2; + palette[2].g = (palette[0].g + palette[1].g) / 2; + palette[2].b = (palette[0].b + palette[1].b) / 2; + palette[3].r = 0; + palette[3].g = 0; + palette[3].b = 0; +} + +static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) { + + float total = 0.0f; + for (int i = 0; i < count; i++) { + total += (weights[i] * weights[i]) * evaluate_mse(palette, colors[i]); + } + + return total; +} + + + + +float nv::compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, int max_volume, BlockDXT1 * output) +{ + // Compute bounding box. + Vector3 min_color(1.0f); + Vector3 max_color(0.0f); + + for (int i = 0; i < count; i++) { + min_color = min(min_color, colors[i]); + max_color = max(max_color, colors[i]); + } + + // Convert to 5:6:5 + int min_r = ftoi_floor(31 * min_color.x); + int min_g = ftoi_floor(63 * min_color.y); + int min_b = ftoi_floor(31 * min_color.z); + int max_r = ftoi_ceil(31 * max_color.x); + int max_g = ftoi_ceil(63 * max_color.y); + int max_b = ftoi_ceil(31 * max_color.z); + + // Expand the box. + int range_r = max_r - min_r; + int range_g = max_g - min_g; + int range_b = max_b - min_b; + + min_r = max(0, min_r - (range_r + 1) / 1 - 1); + min_g = max(0, min_g - (range_g + 1) / 1 - 1); + min_b = max(0, min_b - (range_b + 1) / 1 - 1); + + max_r = min(31, max_r + (range_r + 1) / 2 + 1); + max_g = min(63, max_g + (range_g + 1) / 2 + 1); + max_b = min(31, max_b + (range_b + 1) / 2 + 1); + + // Estimate size of search space. + int volume = (max_r-min_r+1) * (max_g-min_g+1) * (max_b-min_b+1); + + // if size under search_limit, then proceed. Note that search_limit is sqrt of number of evaluations. 
+ if (volume > max_volume) { + return FLT_MAX; + } + + Color32 colors32[16]; + for (int i = 0; i < count; i++) { + colors32[i] = toColor32(Vector4(colors[i], 1)); + } + + float best_error = FLT_MAX; + Color32 best0, best1; + + for(int r0 = min_r; r0 <= max_r; r0++) + for(int r1 = max_r; r1 >= r0; r1--) + for(int g0 = min_g; g0 <= max_g; g0++) + for(int g1 = max_g; g1 >= g0; g1--) + for(int b0 = min_b; b0 <= max_b; b0++) + for(int b1 = max_b; b1 >= b0; b1--) + { + Color32 palette[4]; + palette[0] = bitexpand_color16_to_color32(r1, g1, b1); + palette[1] = bitexpand_color16_to_color32(r0, g0, b0); + + // Evaluate error in 4 color mode. + evaluate_palette4(palette); + + float error = evaluate_palette_error(palette, colors32, weights, count); + + if (error < best_error) { + best_error = error; + best0 = palette[0]; + best1 = palette[1]; + } + +#if 0 + // Evaluate error in 3 color mode. + evaluate_palette3(palette); + + float error = evaluate_palette_error(palette, colors, weights, count); + + if (error < best_error) { + best_error = error; + best0 = palette[1]; + best1 = palette[0]; + } +#endif + } + + output->col0 = truncate_color32_to_color16(best0); + output->col1 = truncate_color32_to_color16(best1); + + if (output->col0.u <= output->col1.u) { + //output->indices = computeIndices3(colors, best0, best1); + } + else { + //output->indices = computeIndices4(colors, best0, best1); + } + + return FLT_MAX; +} + + +float nv::compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, BlockDXT1 * output) +{ + ClusterFit fit; + //fit.setColorWeights(compressionOptions.colorWeight); + fit.setColorWeights(Vector4(1)); // @@ Set color weights. + fit.setColorSet(colors, weights, count); + + // start & end are in [0, 1] range. + Vector3 start, end; + fit.compress4(&start, &end); + + if (fit.compress3(&start, &end)) { + //output_block3(input_colors, start, end, block); + // @@ Output block. + } + else { + //output_block4(input_colors, start, end, block); + // @@ Output block. + } +} + + + +float nv::compress_dxt1(const Vector3 input_colors[16], const float input_weights[16], BlockDXT1 * output) +{ + Vector3 colors[16]; + float weights[16]; + int count = reduce_colors(input_colors, input_weights, colors, weights); + + if (count == 0) { + // Output trivial block. + output->col0.u = 0; + output->col1.u = 0; + output->indices = 0; + return 0; + } + + if (count == 1) { + return compress_dxt1_single_color_optimal(colors[0], output); + } + + // If high quality: + //error = compress_dxt1_bounding_box_exhaustive(colors, weigths, count, 3200, error, output); + //if (error < FLT_MAX) return error; + + // This is pretty fast and in some cases can produces better quality than cluster fit. +// error = compress_dxt1_least_squares_fit(colors, weigths, error, output); + + // + float error = compress_dxt1_cluster_fit(input_colors, colors, weights, count, output); + + return error; +} + diff --git a/src/nvtt/CompressorDXT1.h b/src/nvtt/CompressorDXT1.h new file mode 100644 index 0000000..3c2fe57 --- /dev/null +++ b/src/nvtt/CompressorDXT1.h @@ -0,0 +1,38 @@ + +namespace nv { + + class Color32; + struct ColorBlock; + struct BlockDXT1; + class Vector3; + + // All these functions return MSE. 
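As a usage reference, a minimal caller-side sketch of the main entry point declared later in this header; the array names are illustrative, colors are expected in [0, 1], and a zero weight drops the texel (matching reduce_colors() in the .cpp):

    nv::Vector3 colors[16];   // one entry per texel, RGB in [0, 1]
    float weights[16];        // per-texel weight; 0 == ignore this texel
    // ... fill colors/weights from the source 4x4 block ...
    nv::BlockDXT1 block;
    float mse = nv::compress_dxt1(colors, weights, &block);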
+ + // Optimal compressors: + /*float compress_dxt1_single_color_optimal(const Color32 & rgb, BlockDXT1 * output); + float compress_dxt1_single_color_optimal(const ColorBlock & input, BlockDXT1 * output); + float compress_dxt1_optimal(const ColorBlock & input, BlockDXT1 * output); + + + + // Brute force with restricted search space: + float compress_dxt1_bounding_box_exhaustive(const ColorBlock & input, BlockDXT1 * output); + float compress_dxt1_best_fit_line_exhaustive(const ColorBlock & input, BlockDXT1 * output); + + + // Fast least squres fitting compressors: + float compress_dxt1_least_squares_fit(const ColorBlock & input, BlockDXT1 * output); + float compress_dxt1_least_squares_fit_iterative(const ColorBlock & input, BlockDXT1 * output); + */ + + float compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output); + float compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output); + + float compress_dxt1_least_squares_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, BlockDXT1 * output); + float compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, int search_limit, BlockDXT1 * output); + float compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, BlockDXT1 * output); + + + float compress_dxt1(const Vector3 colors[16], const float weights[16], BlockDXT1 * output); + +} \ No newline at end of file diff --git a/src/nvtt/CompressorRGB.cpp b/src/nvtt/CompressorRGB.cpp index f2b91b6..442c251 100644 --- a/src/nvtt/CompressorRGB.cpp +++ b/src/nvtt/CompressorRGB.cpp @@ -32,6 +32,7 @@ #include "nvmath/Color.h" #include "nvmath/Half.h" +#include "nvmath/ftoi.h" #include "nvcore/Debug.h" @@ -360,7 +361,19 @@ void PixelFormatConverter::compress(nvtt::AlphaMode /*alphaMode*/, uint w, uint ib = iround(clamp(b * 65535.0f, 0.0f, 65535.0f)); ia = iround(clamp(a * 65535.0f, 0.0f, 65535.0f)); } - + else if (compressionOptions.pixelType == nvtt::PixelType_SignedNorm) { + // @@ + } + else if (compressionOptions.pixelType == nvtt::PixelType_UnsignedInt) { + ir = iround(clamp(r, 0.0f, 65535.0f)); + ig = iround(clamp(g, 0.0f, 65535.0f)); + ib = iround(clamp(b, 0.0f, 65535.0f)); + ia = iround(clamp(a, 0.0f, 65535.0f)); + } + else if (compressionOptions.pixelType == nvtt::PixelType_SignedInt) { + // @@ + } + uint p = 0; p |= PixelFormat::convert(ir, 16, rsize) << rshift; p |= PixelFormat::convert(ig, 16, gsize) << gshift; diff --git a/src/nvtt/Context.cpp b/src/nvtt/Context.cpp index c7c9631..fea0017 100644 --- a/src/nvtt/Context.cpp +++ b/src/nvtt/Context.cpp @@ -268,9 +268,6 @@ bool Compressor::Private::compress(const InputOptions::Private & inputOptions, c if (!img.isNormalMap()) { img.toLinear(inputOptions.inputGamma); } - else { - img.expandNormals(); - } // Resize input. 
img.resize(w, h, d, ResizeFilter_Box); @@ -279,9 +276,6 @@ bool Compressor::Private::compress(const InputOptions::Private & inputOptions, c if (!img.isNormalMap()) { tmp.toGamma(inputOptions.outputGamma); } - else { - tmp.packNormals(); - } quantize(tmp, compressionOptions); compress(tmp, f, 0, compressionOptions, outputOptions); @@ -310,9 +304,6 @@ bool Compressor::Private::compress(const InputOptions::Private & inputOptions, c if (!img.isNormalMap()) { img.toLinear(inputOptions.inputGamma); } - else { - img.expandNormals(); - } } else { if (inputOptions.mipmapFilter == MipmapFilter_Kaiser) { @@ -332,7 +323,6 @@ bool Compressor::Private::compress(const InputOptions::Private & inputOptions, c img.normalizeNormalMap(); } tmp = img; - tmp.packNormals(); } else { tmp = img; @@ -485,34 +475,38 @@ bool Compressor::Private::outputHeader(nvtt::TextureType textureType, int w, int else { if (compressionOptions.format == Format_DXT1 || compressionOptions.format == Format_DXT1a || compressionOptions.format == Format_DXT1n) { - header.setDX10Format(DXGI_FORMAT_BC1_UNORM); + header.setDX10Format(outputOptions.srgb ? DXGI_FORMAT_BC1_UNORM_SRGB : DXGI_FORMAT_BC1_UNORM); if (compressionOptions.format == Format_DXT1a) header.setHasAlphaFlag(true); if (isNormalMap) header.setNormalFlag(true); } else if (compressionOptions.format == Format_DXT3) { - header.setDX10Format(DXGI_FORMAT_BC2_UNORM); + header.setDX10Format(outputOptions.srgb ? DXGI_FORMAT_BC2_UNORM_SRGB : DXGI_FORMAT_BC2_UNORM); } - else if (compressionOptions.format == Format_DXT5) { - header.setDX10Format(DXGI_FORMAT_BC3_UNORM); + else if (compressionOptions.format == Format_DXT5 || compressionOptions.format == Format_BC3_RGBM) { + header.setDX10Format(outputOptions.srgb ? DXGI_FORMAT_BC3_UNORM_SRGB : DXGI_FORMAT_BC3_UNORM); } else if (compressionOptions.format == Format_DXT5n) { header.setDX10Format(DXGI_FORMAT_BC3_UNORM); if (isNormalMap) header.setNormalFlag(true); } else if (compressionOptions.format == Format_BC4) { - header.setDX10Format(DXGI_FORMAT_BC4_UNORM); + header.setDX10Format(DXGI_FORMAT_BC4_UNORM); // DXGI_FORMAT_BC4_SNORM ? } - else if (compressionOptions.format == Format_BC5) { - header.setDX10Format(DXGI_FORMAT_BC5_UNORM); + else if (compressionOptions.format == Format_BC5 || compressionOptions.format == Format_BC5_Luma) { + header.setDX10Format(DXGI_FORMAT_BC5_UNORM); // DXGI_FORMAT_BC5_SNORM ? if (isNormalMap) header.setNormalFlag(true); } else if (compressionOptions.format == Format_BC6) { - header.setDX10Format(DXGI_FORMAT_BC6H_UF16); + if (compressionOptions.pixelType == PixelType_Float) header.setDX10Format(DXGI_FORMAT_BC6H_SF16); + /*if (compressionOptions.pixelType == PixelType_UnsignedFloat)*/ header.setDX10Format(DXGI_FORMAT_BC6H_UF16); // By default we assume unsigned. } else if (compressionOptions.format == Format_BC7) { - header.setDX10Format(DXGI_FORMAT_BC7_UNORM); + header.setDX10Format(outputOptions.srgb ? 
DXGI_FORMAT_BC7_UNORM_SRGB : DXGI_FORMAT_BC7_UNORM); if (isNormalMap) header.setNormalFlag(true); } + else if (compressionOptions.format == Format_CTX1) { + supported = false; + } else { supported = false; } @@ -597,7 +591,7 @@ bool Compressor::Private::outputHeader(nvtt::TextureType textureType, int w, int else if (compressionOptions.format == Format_DXT3) { header.setFourCC('D', 'X', 'T', '3'); } - else if (compressionOptions.format == Format_DXT5) { + else if (compressionOptions.format == Format_DXT5 || compressionOptions.format == Format_BC3_RGBM) { header.setFourCC('D', 'X', 'T', '5'); } else if (compressionOptions.format == Format_DXT5n) { @@ -611,19 +605,21 @@ bool Compressor::Private::outputHeader(nvtt::TextureType textureType, int w, int else if (compressionOptions.format == Format_BC4) { header.setFourCC('A', 'T', 'I', '1'); } - else if (compressionOptions.format == Format_BC5) { + else if (compressionOptions.format == Format_BC5 || compressionOptions.format == Format_BC5_Luma) { header.setFourCC('A', 'T', 'I', '2'); if (isNormalMap) { header.setNormalFlag(true); header.setSwizzleCode('A', '2', 'X', 'Y'); } } - else if (compressionOptions.format == Format_BC6) { // @@ This is not supported by D3DX. Always use DX10 header with BC6-7 formats. - header.setFourCC('Z', 'O', 'H', ' '); + else if (compressionOptions.format == Format_BC6) { + header.setFourCC('Z', 'O', 'H', ' '); // This is not supported by D3DX. Always use DX10 header with BC6-7 formats. + supported = false; } else if (compressionOptions.format == Format_BC7) { - header.setFourCC('Z', 'O', 'L', 'A'); + header.setFourCC('Z', 'O', 'L', 'A'); // This is not supported by D3DX. Always use DX10 header with BC6-7 formats. if (isNormalMap) header.setNormalFlag(true); + supported = false; } else if (compressionOptions.format == Format_CTX1) { header.setFourCC('C', 'T', 'X', '1'); @@ -777,6 +773,14 @@ CompressorInterface * Compressor::Private::chooseCpuCompressor(const Compression { return new CompressorBC7; } + else if (compressionOptions.format == Format_BC5_Luma) + { + return new ProductionCompressorBC5_Luma; + } + else if (compressionOptions.format == Format_BC3_RGBM) + { + return new CompressorBC3_RGBM; + } return NULL; } diff --git a/src/nvtt/CubeSurface.cpp b/src/nvtt/CubeSurface.cpp index 5cbb351..22b6a45 100644 --- a/src/nvtt/CubeSurface.cpp +++ b/src/nvtt/CubeSurface.cpp @@ -320,7 +320,7 @@ bool CubeSurface::load(const char * fileName, int mipmap) if (mipmap < 0) { mipmap = dds.mipmapCount() - 1 - mipmap; } - if (mipmap < 0 || mipmap > toI32(dds.mipmapCount())) return false; + if (mipmap < 0 || mipmap > I32(dds.mipmapCount())) return false; nvtt::InputFormat inputFormat = nvtt::InputFormat_RGBA_16F; @@ -328,12 +328,14 @@ bool CubeSurface::load(const char * fileName, int mipmap) if (dds.header.hasDX10Header()) { if (dds.header.header10.dxgiFormat == DXGI_FORMAT_R16G16B16A16_FLOAT) inputFormat = nvtt::InputFormat_RGBA_16F; else if (dds.header.header10.dxgiFormat == DXGI_FORMAT_R32G32B32A32_FLOAT) inputFormat = nvtt::InputFormat_RGBA_32F; + else if (dds.header.header10.dxgiFormat == DXGI_FORMAT_R32_FLOAT) inputFormat = nvtt::InputFormat_R_32F; else return false; } else { if ((dds.header.pf.flags & DDPF_FOURCC) != 0) { if (dds.header.pf.fourcc == D3DFMT_A16B16G16R16F) inputFormat = nvtt::InputFormat_RGBA_16F; else if (dds.header.pf.fourcc == D3DFMT_A32B32G32R32F) inputFormat = nvtt::InputFormat_RGBA_32F; + else if (dds.header.pf.fourcc == D3DFMT_R32F) inputFormat = nvtt::InputFormat_R_32F; else return false; } else { @@ 
-594,7 +596,7 @@ Vector3 CubeSurface::Private::applyAngularFilter(const Vector3 & filterDir, floa continue; } - const int L = toI32(edgeLength-1); + const int L = I32(edgeLength-1); int x0 = 0, x1 = L; int y0 = 0, y1 = L; @@ -715,7 +717,7 @@ Vector3 CubeSurface::Private::applyCosinePowerFilter(const Vector3 & filterDir, continue; } - const int L = toI32(edgeLength-1); + const int L = I32(edgeLength-1); int x0 = 0, x1 = L; int y0 = 0, y1 = L; diff --git a/src/nvtt/InputOptions.cpp b/src/nvtt/InputOptions.cpp index 8052990..33c09e6 100644 --- a/src/nvtt/InputOptions.cpp +++ b/src/nvtt/InputOptions.cpp @@ -202,18 +202,22 @@ bool InputOptions::setMipmapData(const void * data, int width, int height, int d return false; } - int imageSize = width * height * depth * 4; + int imageSize = width * height * depth; if (m.inputFormat == InputFormat_BGRA_8UB) { - imageSize *= sizeof(uint8); + imageSize *= 4 * sizeof(uint8); } else if (m.inputFormat == InputFormat_RGBA_16F) { - imageSize *= sizeof(uint16); + imageSize *= 4 * sizeof(uint16); } else if (m.inputFormat == InputFormat_RGBA_32F) { - imageSize *= sizeof(float); + imageSize *= 4 * sizeof(float); + } + else if (m.inputFormat == InputFormat_R_32F) + { + imageSize *= 1 * sizeof(float); } else { diff --git a/src/nvtt/OptimalCompressDXT.cpp b/src/nvtt/OptimalCompressDXT.cpp index b9eb351..602b6af 100644 --- a/src/nvtt/OptimalCompressDXT.cpp +++ b/src/nvtt/OptimalCompressDXT.cpp @@ -32,7 +32,8 @@ #include // swap -#include +#include // INT_MAX +#include // FLT_MAX using namespace nv; using namespace OptimalCompress; @@ -185,16 +186,16 @@ namespace return totalError; }*/ - static uint computeAlphaError(const ColorBlock & rgba, const AlphaBlockDXT5 * block, int bestError = INT_MAX) + static float computeAlphaError(const AlphaBlock4x4 & src, const AlphaBlockDXT5 * dst, float bestError = FLT_MAX) { uint8 alphas[8]; - block->evaluatePalette(alphas, false); // @@ Use target decoder. + dst->evaluatePalette(alphas, false); // @@ Use target decoder. - int totalError = 0; + float totalError = 0; for (uint i = 0; i < 16; i++) { - uint8 alpha = rgba.color(i).a; + uint8 alpha = src.alpha[i]; int minDist = INT_MAX; for (uint p = 0; p < 8; p++) @@ -203,7 +204,7 @@ namespace minDist = min(dist, minDist); } - totalError += minDist; + totalError += minDist * src.weights[i]; if (totalError > bestError) { @@ -215,14 +216,14 @@ namespace return totalError; } - static void computeAlphaIndices(const ColorBlock & rgba, AlphaBlockDXT5 * block) + static void computeAlphaIndices(const AlphaBlock4x4 & src, AlphaBlockDXT5 * dst) { uint8 alphas[8]; - block->evaluatePalette(alphas, false); // @@ Use target decoder. + dst->evaluatePalette(alphas, /*d3d9=*/false); // @@ Use target decoder. 
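// Palette layout reminder: evaluatePalette() produces 8 alpha values. With alpha0 > alpha1 they are
// the two endpoints plus six interpolated steps; with alpha0 <= alpha1 only four steps are
// interpolated and the last two entries are fixed at 0 and 255. The endpoint searches in
// compressDXT5A below exploit that second mode to get 0 and 255 for free.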
for (uint i = 0; i < 16; i++) { - uint8 alpha = rgba.color(i).a; + uint8 alpha = src.alpha[i]; int minDist = INT_MAX; int bestIndex = 8; @@ -238,7 +239,7 @@ namespace } nvDebugCheck(bestIndex < 8); - block->setIndex(i, bestIndex); + dst->setIndex(i, bestIndex); } } @@ -252,19 +253,19 @@ namespace // https://mollyrocket.com/forums/viewtopic.php?t=392 void OptimalCompress::compressDXT1(Color32 c, BlockDXT1 * dxtBlock) { - dxtBlock->col0.r = OMatch5[c.r][0]; - dxtBlock->col0.g = OMatch6[c.g][0]; - dxtBlock->col0.b = OMatch5[c.b][0]; - dxtBlock->col1.r = OMatch5[c.r][1]; - dxtBlock->col1.g = OMatch6[c.g][1]; - dxtBlock->col1.b = OMatch5[c.b][1]; - dxtBlock->indices = 0xaaaaaaaa; - - if (dxtBlock->col0.u < dxtBlock->col1.u) - { - swap(dxtBlock->col0.u, dxtBlock->col1.u); - dxtBlock->indices ^= 0x55555555; - } + dxtBlock->col0.r = OMatch5[c.r][0]; + dxtBlock->col0.g = OMatch6[c.g][0]; + dxtBlock->col0.b = OMatch5[c.b][0]; + dxtBlock->col1.r = OMatch5[c.r][1]; + dxtBlock->col1.g = OMatch6[c.g][1]; + dxtBlock->col1.b = OMatch5[c.b][1]; + dxtBlock->indices = 0xaaaaaaaa; + + if (dxtBlock->col0.u < dxtBlock->col1.u) + { + swap(dxtBlock->col0.u, dxtBlock->col1.u); + dxtBlock->indices ^= 0x55555555; + } } void OptimalCompress::compressDXT1a(Color32 c, uint alphaMask, BlockDXT1 * dxtBlock) @@ -481,46 +482,68 @@ void OptimalCompress::compressDXT1_Luma(const ColorBlock & rgba, BlockDXT1 * blo } -void OptimalCompress::compressDXT3A(const ColorBlock & rgba, AlphaBlockDXT3 * dxtBlock) +void OptimalCompress::compressDXT3A(const AlphaBlock4x4 & src, AlphaBlockDXT3 * dst) { - dxtBlock->alpha0 = quantize4(rgba.color(0).a); - dxtBlock->alpha1 = quantize4(rgba.color(1).a); - dxtBlock->alpha2 = quantize4(rgba.color(2).a); - dxtBlock->alpha3 = quantize4(rgba.color(3).a); - dxtBlock->alpha4 = quantize4(rgba.color(4).a); - dxtBlock->alpha5 = quantize4(rgba.color(5).a); - dxtBlock->alpha6 = quantize4(rgba.color(6).a); - dxtBlock->alpha7 = quantize4(rgba.color(7).a); - dxtBlock->alpha8 = quantize4(rgba.color(8).a); - dxtBlock->alpha9 = quantize4(rgba.color(9).a); - dxtBlock->alphaA = quantize4(rgba.color(10).a); - dxtBlock->alphaB = quantize4(rgba.color(11).a); - dxtBlock->alphaC = quantize4(rgba.color(12).a); - dxtBlock->alphaD = quantize4(rgba.color(13).a); - dxtBlock->alphaE = quantize4(rgba.color(14).a); - dxtBlock->alphaF = quantize4(rgba.color(15).a); + dst->alpha0 = quantize4(src.alpha[0]); + dst->alpha1 = quantize4(src.alpha[1]); + dst->alpha2 = quantize4(src.alpha[2]); + dst->alpha3 = quantize4(src.alpha[3]); + dst->alpha4 = quantize4(src.alpha[4]); + dst->alpha5 = quantize4(src.alpha[5]); + dst->alpha6 = quantize4(src.alpha[6]); + dst->alpha7 = quantize4(src.alpha[7]); + dst->alpha8 = quantize4(src.alpha[8]); + dst->alpha9 = quantize4(src.alpha[9]); + dst->alphaA = quantize4(src.alpha[10]); + dst->alphaB = quantize4(src.alpha[11]); + dst->alphaC = quantize4(src.alpha[12]); + dst->alphaD = quantize4(src.alpha[13]); + dst->alphaE = quantize4(src.alpha[14]); + dst->alphaF = quantize4(src.alpha[15]); } +void OptimalCompress::compressDXT3A(const ColorBlock & src, AlphaBlockDXT3 * dst) +{ + AlphaBlock4x4 tmp; + tmp.init(src, 3); + compressDXT3A(tmp, dst); +} -void OptimalCompress::compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dxtBlock) +void OptimalCompress::compressDXT5A(const AlphaBlock4x4 & src, AlphaBlockDXT5 * dst) { uint8 mina = 255; uint8 maxa = 0; + uint8 mina_no01 = 255; + uint8 maxa_no01 = 0; + // Get min/max alpha. 
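// Two ranges are tracked in this loop: [mina, maxa] over all texels, used for the regular
// alpha0 > alpha1 mode, and [mina_no01, maxa_no01], which skips texels that are exactly 0 or 255,
// used for the alpha0 <= alpha1 mode where 0 and 255 are already part of the palette.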
for (uint i = 0; i < 16; i++) { - uint8 alpha = rgba.color(i).a; + uint8 alpha = src.alpha[i]; mina = min(mina, alpha); maxa = max(maxa, alpha); + + if (alpha != 0 && alpha != 255) { + mina_no01 = min(mina_no01, alpha); + maxa_no01 = max(maxa_no01, alpha); + } } - dxtBlock->alpha0 = maxa; - dxtBlock->alpha1 = mina; + if (maxa - mina < 8) { + dst->alpha0 = maxa; + dst->alpha1 = mina; - if (maxa - mina > 8) - { - int besterror = computeAlphaError(rgba, dxtBlock); + nvDebugCheck(computeAlphaError(src, dst) == 0); + } + else if (maxa_no01 - mina_no01 < 6) { + dst->alpha0 = mina_no01; + dst->alpha1 = maxa_no01; + + nvDebugCheck(computeAlphaError(src, dst) == 0); + } + else { + float besterror = computeAlphaError(src, dst); int besta0 = maxa; int besta1 = mina; @@ -535,9 +558,9 @@ void OptimalCompress::compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dx { nvDebugCheck(a0 - a1 > 8); - dxtBlock->alpha0 = a0; - dxtBlock->alpha1 = a1; - int error = computeAlphaError(rgba, dxtBlock, besterror); + dst->alpha0 = a0; + dst->alpha1 = a1; + float error = computeAlphaError(src, dst, besterror); if (error < besterror) { @@ -548,10 +571,241 @@ void OptimalCompress::compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dx } } - dxtBlock->alpha0 = besta0; - dxtBlock->alpha1 = besta1; + // Try using the 6 step encoding. + /*if (mina == 0 || maxa == 255)*/ { + + // Expand search space a bit. + const int alphaExpand = 6; + mina_no01 = (mina_no01 <= alphaExpand) ? 0 : mina_no01 - alphaExpand; + maxa_no01 = (maxa_no01 >= 255 - alphaExpand) ? 255 : maxa_no01 + alphaExpand; + + for (int a0 = mina_no01 + 9; a0 < maxa_no01; a0++) + { + for (int a1 = mina_no01; a1 < a0 - 8; a1++) + { + nvDebugCheck(a0 - a1 > 8); + + dst->alpha0 = a1; + dst->alpha1 = a0; + float error = computeAlphaError(src, dst, besterror); + + if (error < besterror) + { + besterror = error; + besta0 = a1; + besta1 = a0; + } + } + } + } + + dst->alpha0 = besta0; + dst->alpha1 = besta1; } - computeAlphaIndices(rgba, dxtBlock); + computeAlphaIndices(src, dst); +} + + +void OptimalCompress::compressDXT5A(const ColorBlock & src, AlphaBlockDXT5 * dst) +{ + AlphaBlock4x4 tmp; + tmp.init(src, 3); + compressDXT5A(tmp, dst); +} + + +#include "nvmath/Vector.inl" +#include "nvmath/ftoi.h" +const float threshold = 0.15f; + +static float computeAlphaError_RGBM(const ColorSet & src, const ColorBlock & RGB, const AlphaBlockDXT5 * dst, float bestError = FLT_MAX) +{ + uint8 alphas[8]; + dst->evaluatePalette(alphas, /*d3d9=*/false); // @@ Use target decoder. + + float totalError = 0; + + for (uint i = 0; i < 16; i++) + { + float R = src.color(i).x; + float G = src.color(i).y; + float B = src.color(i).z; + + float r = float(RGB.color(i).r) / 255.0f; + float g = float(RGB.color(i).g) / 255.0f; + float b = float(RGB.color(i).b) / 255.0f; + + float minDist = FLT_MAX; + for (uint p = 0; p < 8; p++) + { + // Compute M. + float M = float(alphas[p]) / 255.0f * (1 - threshold) + threshold; + + // Decode color. + float fr = r * M; + float fg = g * M; + float fb = b * M; + + // Measure error. + float error = square(R - fr) + square(G - fg) + square(B - fb); + + minDist = min(error, minDist); + } + + totalError += minDist * src.weights[i]; + + if (totalError > bestError) + { + // early out + return totalError; + } + } + + return totalError; +} + +static void computeAlphaIndices_RGBM(const ColorSet & src, const ColorBlock & RGB, AlphaBlockDXT5 * dst) +{ + uint8 alphas[8]; + dst->evaluatePalette(alphas, /*d3d9=*/false); // @@ Use target decoder. 
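For reference, the reconstruction both RGBM routines in this file assume can be written as a small free-standing sketch (illustrative only; rgbm_decode is not part of this patch, and the threshold passed in should be the same file-local constant defined above):

    // Invert the RGBM encoding: alpha carries M remapped to [threshold, 1], RGB carries color / M.
    inline Vector3 rgbm_decode(const Vector3 & storedRGB, float storedM, float threshold) {
        const float M = storedM * (1.0f - threshold) + threshold;   // storedM and storedRGB are in [0, 1]
        return storedRGB * M;
    }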
+ + for (uint i = 0; i < 16; i++) + { + float R = src.color(i).x; + float G = src.color(i).y; + float B = src.color(i).z; + + float r = float(RGB.color(i).r) / 255.0f; + float g = float(RGB.color(i).g) / 255.0f; + float b = float(RGB.color(i).b) / 255.0f; + + float minDist = FLT_MAX; + int bestIndex = 8; + for (uint p = 0; p < 8; p++) + { + // Compute M. + float M = float(alphas[p]) / 255.0f * (1 - threshold) + threshold; + + // Decode color. + float fr = r * M; + float fg = g * M; + float fb = b * M; + + // Measure error. + float error = square(R - fr) + square(G - fg) + square(B - fb); + + if (error < minDist) + { + minDist = error; + bestIndex = p; + } + } + nvDebugCheck(bestIndex < 8); + + dst->setIndex(i, bestIndex); + } } + +void OptimalCompress::compressDXT5A_RGBM(const ColorSet & src, const ColorBlock & RGB, AlphaBlockDXT5 * dst) +{ + uint8 mina = 255; + uint8 maxa = 0; + + uint8 mina_no01 = 255; + uint8 maxa_no01 = 0; + + // Get min/max alpha. + /*for (uint i = 0; i < 16; i++) + { + uint8 alpha = src.alpha[i]; + mina = min(mina, alpha); + maxa = max(maxa, alpha); + + if (alpha != 0 && alpha != 255) { + mina_no01 = min(mina_no01, alpha); + maxa_no01 = max(maxa_no01, alpha); + } + }*/ + mina = 0; + maxa = 255; + mina_no01 = 0; + maxa_no01 = 255; + + /*if (maxa - mina < 8) { + dst->alpha0 = maxa; + dst->alpha1 = mina; + + nvDebugCheck(computeAlphaError(src, dst) == 0); + } + else if (maxa_no01 - mina_no01 < 6) { + dst->alpha0 = mina_no01; + dst->alpha1 = maxa_no01; + + nvDebugCheck(computeAlphaError(src, dst) == 0); + } + else*/ + { + float besterror = computeAlphaError_RGBM(src, RGB, dst); + int besta0 = maxa; + int besta1 = mina; + + // Expand search space a bit. + const int alphaExpand = 8; + mina = (mina <= alphaExpand) ? 0 : mina - alphaExpand; + maxa = (maxa >= 255 - alphaExpand) ? 255 : maxa + alphaExpand; + + for (int a0 = mina + 9; a0 < maxa; a0++) + { + for (int a1 = mina; a1 < a0 - 8; a1++) + { + nvDebugCheck(a0 - a1 > 8); + + dst->alpha0 = a0; + dst->alpha1 = a1; + float error = computeAlphaError_RGBM(src, RGB, dst, besterror); + + if (error < besterror) + { + besterror = error; + besta0 = a0; + besta1 = a1; + } + } + } + + // Try using the 6 step encoding. + /*if (mina == 0 || maxa == 255)*/ { + + // Expand search space a bit. + const int alphaExpand = 6; + mina_no01 = (mina_no01 <= alphaExpand) ? 0 : mina_no01 - alphaExpand; + maxa_no01 = (maxa_no01 >= 255 - alphaExpand) ? 
255 : maxa_no01 + alphaExpand; + + for (int a0 = mina_no01 + 9; a0 < maxa_no01; a0++) + { + for (int a1 = mina_no01; a1 < a0 - 8; a1++) + { + nvDebugCheck(a0 - a1 > 8); + + dst->alpha0 = a1; + dst->alpha1 = a0; + float error = computeAlphaError_RGBM(src, RGB, dst, besterror); + + if (error < besterror) + { + besterror = error; + besta0 = a1; + besta1 = a0; + } + } + } + } + + dst->alpha0 = besta0; + dst->alpha1 = besta1; + } + + computeAlphaIndices_RGBM(src, RGB, dst); +} diff --git a/src/nvtt/OptimalCompressDXT.h b/src/nvtt/OptimalCompressDXT.h index 87bbe70..2c173cf 100644 --- a/src/nvtt/OptimalCompressDXT.h +++ b/src/nvtt/OptimalCompressDXT.h @@ -25,31 +25,38 @@ #ifndef NV_TT_OPTIMALCOMPRESSDXT_H #define NV_TT_OPTIMALCOMPRESSDXT_H -#include +//#include "nvimage/nvimage.h" -#include +#include "nvmath/Color.h" namespace nv { + struct ColorSet; struct ColorBlock; struct BlockDXT1; struct BlockDXT3; struct BlockDXT5; struct AlphaBlockDXT3; struct AlphaBlockDXT5; + struct AlphaBlock4x4; namespace OptimalCompress { + // Single color compressors: void compressDXT1(Color32 rgba, BlockDXT1 * dxtBlock); void compressDXT1a(Color32 rgba, uint alphaMask, BlockDXT1 * dxtBlock); void compressDXT1G(uint8 g, BlockDXT1 * dxtBlock); - void compressDXT1G(const ColorBlock & rgba, BlockDXT1 * block); - void compressDXT3A(const ColorBlock & rgba, AlphaBlockDXT3 * dxtBlock); - void compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dxtBlock); + void compressDXT3A(const AlphaBlock4x4 & src, AlphaBlockDXT3 * dst); + void compressDXT5A(const AlphaBlock4x4 & src, AlphaBlockDXT5 * dst); - void compressDXT1_Luma(const ColorBlock & rgba, BlockDXT1 * block); + void compressDXT1G(const ColorBlock & src, BlockDXT1 * dst); + void compressDXT3A(const ColorBlock & src, AlphaBlockDXT3 * dst); + void compressDXT5A(const ColorBlock & src, AlphaBlockDXT5 * dst); + + void compressDXT1_Luma(const ColorBlock & src, BlockDXT1 * dst); + void compressDXT5A_RGBM(const ColorSet & src, const ColorBlock & RGB, AlphaBlockDXT5 * dst); } } // nv namespace diff --git a/src/nvtt/QuickCompressDXT.cpp b/src/nvtt/QuickCompressDXT.cpp index 50bab8d..69ee3de 100644 --- a/src/nvtt/QuickCompressDXT.cpp +++ b/src/nvtt/QuickCompressDXT.cpp @@ -28,13 +28,13 @@ #include "nvimage/ColorBlock.h" #include "nvimage/BlockDXT.h" -#include "nvmath/Color.h" +#include "nvmath/Color.inl" #include "nvmath/Vector.inl" #include "nvmath/Fitting.h" #include "nvcore/Utils.h" // swap - +#include // memset using namespace nv; using namespace QuickCompress; @@ -115,13 +115,28 @@ inline static void insetBBox(Vector3 * restrict maxColor, Vector3 * restrict min *minColor = clamp(*minColor + inset, 0.0f, 255.0f); } +#include "nvmath/ftoi.h" + // Takes a normalized color in [0, 255] range and returns inline static uint16 roundAndExpand(Vector3 * restrict v) { - uint r = uint(clamp(v->x * (31.0f / 255.0f), 0.0f, 31.0f) + 0.5f); - uint g = uint(clamp(v->y * (63.0f / 255.0f), 0.0f, 63.0f) + 0.5f); - uint b = uint(clamp(v->z * (31.0f / 255.0f), 0.0f, 31.0f) + 0.5f); - + uint r = ftoi_floor(clamp(v->x * (31.0f / 255.0f), 0.0f, 31.0f)); + uint g = ftoi_floor(clamp(v->y * (63.0f / 255.0f), 0.0f, 63.0f)); + uint b = ftoi_floor(clamp(v->z * (31.0f / 255.0f), 0.0f, 31.0f)); + + float r0 = float(((r+0) << 3) | ((r+0) >> 2)); + float r1 = float(((r+1) << 3) | ((r+1) >> 2)); + if (fabs(v->x - r1) < fabs(v->x - r0)) r = min(r+1, 31U); + + float g0 = float(((g+0) << 2) | ((g+0) >> 4)); + float g1 = float(((g+1) << 2) | ((g+1) >> 4)); + if (fabs(v->y - g1) < fabs(v->y - g0)) g = min(g+1, 
63U); + + float b0 = float(((b+0) << 3) | ((b+0) >> 2)); + float b1 = float(((b+1) << 3) | ((b+1) >> 2)); + if (fabs(v->z - b1) < fabs(v->z - b0)) b = min(b+1, 31U); + + uint16 w = (r << 11) | (g << 5) | b; r = (r << 3) | (r >> 2); @@ -132,16 +147,57 @@ inline static uint16 roundAndExpand(Vector3 * restrict v) return w; } +// Takes a normalized color in [0, 255] range and returns +inline static uint16 roundAndExpand01(Vector3 * restrict v) +{ + uint r = ftoi_floor(clamp(v->x * 31.0f, 0.0f, 31.0f)); + uint g = ftoi_floor(clamp(v->y * 63.0f, 0.0f, 63.0f)); + uint b = ftoi_floor(clamp(v->z * 31.0f, 0.0f, 31.0f)); + + float r0 = float(((r+0) << 3) | ((r+0) >> 2)); + float r1 = float(((r+1) << 3) | ((r+1) >> 2)); + if (fabs(v->x - r1) < fabs(v->x - r0)) r = min(r+1, 31U); + + float g0 = float(((g+0) << 2) | ((g+0) >> 4)); + float g1 = float(((g+1) << 2) | ((g+1) >> 4)); + if (fabs(v->y - g1) < fabs(v->y - g0)) g = min(g+1, 63U); + + float b0 = float(((b+0) << 3) | ((b+0) >> 2)); + float b1 = float(((b+1) << 3) | ((b+1) >> 2)); + if (fabs(v->z - b1) < fabs(v->z - b0)) b = min(b+1, 31U); + + + uint16 w = (r << 11) | (g << 5) | b; + + r = (r << 3) | (r >> 2); + g = (g << 2) | (g >> 4); + b = (b << 3) | (b >> 2); + *v = Vector3(float(r) / 255.0f, float(g) / 255.0f, float(b) / 255.0f); + + return w; +} + + + inline static float colorDistance(Vector3::Arg c0, Vector3::Arg c1) { return dot(c0-c1, c0-c1); } +Vector3 round255(const Vector3 & v) { + //return Vector3(ftoi_round(255 * v.x), ftoi_round(255 * v.y), ftoi_round(255 * v.z)) * (1.0f / 255); + //return Vector3(floorf(v.x + 0.5f), floorf(v.y + 0.5f), floorf(v.z + 0.5f)); + return v; +} + + inline static uint computeIndices4(const Vector3 block[16], Vector3::Arg maxColor, Vector3::Arg minColor) { Vector3 palette[4]; palette[0] = maxColor; palette[1] = minColor; + //palette[2] = round255((2 * palette[0] + palette[1]) / 3.0f); + //palette[3] = round255((2 * palette[1] + palette[0]) / 3.0f); palette[2] = lerp(palette[0], palette[1], 1.0f / 3.0f); palette[3] = lerp(palette[0], palette[1], 2.0f / 3.0f); @@ -178,32 +234,58 @@ inline static uint computeIndices4(const ColorSet & set, Vector3::Arg maxColor, palette[2] = lerp(palette[0], palette[1], 1.0f / 3.0f); palette[3] = lerp(palette[0], palette[1], 2.0f / 3.0f); + Vector3 mem[(4+2)*2]; + memset(mem, 0, sizeof(mem)); + + Vector3 * row0 = mem; + Vector3 * row1 = mem + (4+2); + uint indices = 0; - for(int i = 0; i < 16; i++) - { - if (!set.isValidIndex(i)) { - // Skip masked pixels and out of bounds. - continue; + //for(int i = 0; i < 16; i++) + for (uint y = 0; y < 4; y++) { + for (uint x = 0; x < 4; x++) { + int i = y*4+x; + + if (!set.isValidIndex(i)) { + // Skip masked pixels and out of bounds. + continue; + } + + Vector3 color = set.color(i).xyz(); + + // Add error. + color += row0[1+x]; + + float d0 = colorDistance(palette[0], color); + float d1 = colorDistance(palette[1], color); + float d2 = colorDistance(palette[2], color); + float d3 = colorDistance(palette[3], color); + + uint b0 = d0 > d3; + uint b1 = d1 > d2; + uint b2 = d0 > d2; + uint b3 = d1 > d3; + uint b4 = d2 > d3; + + uint x0 = b1 & b2; + uint x1 = b0 & b3; + uint x2 = b0 & b4; + + int index = x2 | ((x0 | x1) << 1); + indices |= index << (2 * i); + + // Compute new error. + Vector3 diff = color - palette[index]; + + // Propagate new error. 
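// The commented-out weights below are the standard Floyd-Steinberg kernel: 7/16 of the quantization
// error goes to the next texel on the right (row0), and 3/16, 5/16 and 1/16 go to the down-left,
// down and down-right neighbours (row1). row0 and row1 are swapped after each row, so the accumulated
// error is what the 'color += row0[1+x]' step above adds back in.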
+ //row0[1+x+1] += 7.0f / 16.0f * diff; + //row1[1+x-1] += 3.0f / 16.0f * diff; + //row1[1+x+0] += 5.0f / 16.0f * diff; + //row1[1+x+1] += 1.0f / 16.0f * diff; } - Vector3 color = set.color(i).xyz(); - - float d0 = colorDistance(palette[0], color); - float d1 = colorDistance(palette[1], color); - float d2 = colorDistance(palette[2], color); - float d3 = colorDistance(palette[3], color); - - uint b0 = d0 > d3; - uint b1 = d1 > d2; - uint b2 = d0 > d2; - uint b3 = d1 > d3; - uint b4 = d2 > d3; - - uint x0 = b1 & b2; - uint x1 = b0 & b3; - uint x2 = b0 & b4; - - indices |= (x2 | ((x0 | x1) << 1)) << (2 * i); + swap(row0, row1); + memset(row1, 0, sizeof(row1)); } return indices; @@ -214,6 +296,8 @@ inline static float evaluatePaletteError4(const Vector3 block[16], Vector3::Arg Vector3 palette[4]; palette[0] = maxColor; palette[1] = minColor; + //palette[2] = round255((2 * palette[0] + palette[1]) / 3.0f); + //palette[3] = round255((2 * palette[1] + palette[0]) / 3.0f); palette[2] = lerp(palette[0], palette[1], 1.0f / 3.0f); palette[3] = lerp(palette[0], palette[1], 2.0f / 3.0f); @@ -231,6 +315,30 @@ inline static float evaluatePaletteError4(const Vector3 block[16], Vector3::Arg return total; } +inline static float evaluatePaletteError3(const Vector3 block[16], Vector3::Arg maxColor, Vector3::Arg minColor) +{ + Vector3 palette[4]; + palette[0] = minColor; + palette[1] = maxColor; + palette[2] = (palette[0] + palette[1]) * 0.5f; + palette[3] = Vector3(0); + + float total = 0.0f; + for (int i = 0; i < 16; i++) + { + float d0 = colorDistance(palette[0], block[i]); + float d1 = colorDistance(palette[1], block[i]); + float d2 = colorDistance(palette[2], block[i]); + //float d3 = colorDistance(palette[3], block[i]); + + //total += min(min(d0, d1), min(d2, d3)); + total += min(min(d0, d1), d2); + } + + return total; +} + + // maxColor and minColor are expected to be in the same range as the color set. inline static uint computeIndices3(const ColorSet & set, Vector3::Arg maxColor, Vector3::Arg minColor) { @@ -392,7 +500,7 @@ static void optimizeEndPoints3(Vector3 block[16], BlockDXT1 * dxtBlock) namespace { - static uint computeAlphaIndices(const ColorBlock & rgba, AlphaBlockDXT5 * block) + static uint computeAlphaIndices(const AlphaBlock4x4 & src, AlphaBlockDXT5 * block) { uint8 alphas[8]; block->evaluatePalette(alphas, false); // @@ Use target decoder. 
@@ -401,7 +509,7 @@ namespace for (uint i = 0; i < 16; i++) { - uint8 alpha = rgba.color(i).a; + uint8 alpha = src.alpha[i]; uint besterror = 256*256; uint best = 8; @@ -425,7 +533,7 @@ namespace return totalError; } - static void optimizeAlpha8(const ColorBlock & rgba, AlphaBlockDXT5 * block) + static void optimizeAlpha8(const AlphaBlock4x4 & src, AlphaBlockDXT5 * block) { float alpha2_sum = 0; float beta2_sum = 0; @@ -445,8 +553,8 @@ namespace alpha2_sum += alpha * alpha; beta2_sum += beta * beta; alphabeta_sum += alpha * beta; - alphax_sum += alpha * rgba.color(i).a; - betax_sum += beta * rgba.color(i).a; + alphax_sum += alpha * src.alpha[i]; + betax_sum += beta * src.alpha[i]; } const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); @@ -653,14 +761,20 @@ void QuickCompress::compressDXT1a(const ColorBlock & rgba, BlockDXT1 * dxtBlock) } -void QuickCompress::compressDXT3(const ColorBlock & rgba, BlockDXT3 * dxtBlock) +void QuickCompress::compressDXT3(const ColorBlock & src, BlockDXT3 * dxtBlock) { - compressDXT1(rgba, &dxtBlock->color); - OptimalCompress::compressDXT3A(rgba, &dxtBlock->alpha); + compressDXT1(src, &dxtBlock->color); + OptimalCompress::compressDXT3A(src, &dxtBlock->alpha); } +void QuickCompress::compressDXT5A(const ColorBlock & src, AlphaBlockDXT5 * dst, int iterationCount/*=8*/) +{ + AlphaBlock4x4 tmp; + tmp.init(src, 3); + compressDXT5A(tmp, dst, iterationCount); +} -void QuickCompress::compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dxtBlock, int iterationCount/*=8*/) +void QuickCompress::compressDXT5A(const AlphaBlock4x4 & src, AlphaBlockDXT5 * dst, int iterationCount/*=8*/) { uint8 alpha0 = 0; uint8 alpha1 = 255; @@ -668,7 +782,7 @@ void QuickCompress::compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dxtB // Get min/max alpha. for (uint i = 0; i < 16; i++) { - uint8 alpha = rgba.color(i).a; + uint8 alpha = src.alpha[i]; alpha0 = max(alpha0, alpha); alpha1 = min(alpha1, alpha); } @@ -676,14 +790,14 @@ void QuickCompress::compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dxtB AlphaBlockDXT5 block; block.alpha0 = alpha0 - (alpha0 - alpha1) / 34; block.alpha1 = alpha1 + (alpha0 - alpha1) / 34; - uint besterror = computeAlphaIndices(rgba, &block); + uint besterror = computeAlphaIndices(src, &block); AlphaBlockDXT5 bestblock = block; for (int i = 0; i < iterationCount; i++) { - optimizeAlpha8(rgba, &block); - uint error = computeAlphaIndices(rgba, &block); + optimizeAlpha8(src, &block); + uint error = computeAlphaIndices(src, &block); if (error >= besterror) { @@ -701,7 +815,7 @@ void QuickCompress::compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dxtB }; // Copy best block to result; - *dxtBlock = bestblock; + *dst = bestblock; } void QuickCompress::compressDXT5(const ColorBlock & rgba, BlockDXT5 * dxtBlock, int iterationCount/*=8*/) @@ -752,3 +866,108 @@ void QuickCompress::outputBlock3(const ColorSet & set, const Vector3 & start, co //optimizeEndPoints3(set, block); } + + + + +inline Vector3 toVectorColor(int r, int g, int b) { + Vector3 c; + c.x = float((r << 3) | (r >> 2)); + c.y = float((g << 2) | (g >> 4)); + c.z = float((b << 3) | (b >> 2)); + return c; +} + +// Do an exhaustive search inside the bounding box. 
+void compress_dxt1_bounding_box_exhaustive(const ColorBlock & input, BlockDXT1 * output) +{ + int min_r = 255, min_g = 255, min_b = 255; + int max_r = 0, max_g = 0, max_b = 0; + + for (int i = 0; i < 16; i++) { + Color32 c = input.color(i); + min_r = min(min_r, int(c.r)); + max_r = max(max_r, int(c.r)); + min_g = min(min_g, int(c.g)); + max_g = max(max_g, int(c.g)); + min_b = min(min_b, int(c.b)); + max_b = max(max_b, int(c.b)); + } + + // Convert to 5:6:5 + min_r >>= 3; min_g >>= 2; min_b >>= 3; + max_r >>= 3; max_g >>= 2; max_b >>= 3; + + // Expand the box. + int range_r = max_r - min_r; + int range_g = max_g - min_g; + int range_b = max_b - min_b; + + min_r = max(0, min_r - (range_r + 1) / 1 - 1); + min_g = max(0, min_g - (range_g + 1) / 1 - 1); + min_b = max(0, min_b - (range_b + 1) / 1 - 1); + + max_r = min(31, max_r + (range_r + 1) / 2 + 1); + max_g = min(63, max_g + (range_g + 1) / 2 + 1); + max_b = min(31, max_b + (range_b + 1) / 2 + 1); + + int count = (max_r-min_r) + (max_g-min_g) + (max_b-min_b); + + Vector3 colors[16]; + extractColorBlockRGB(input, colors); + + + // @@ Use a single loop and remap index to box location? + float bestError = FLT_MAX; + Vector3 best0, best1; + bool threeColorMode; + + for(int r0 = min_r; r0 <= max_r; r0++) + for(int r1 = max_r; r1 >= r0; r1--) + for(int g0 = min_g; g0 <= max_g; g0++) + for(int g1 = max_g; g1 >= g0; g1--) + for(int b0 = min_b; b0 <= max_b; b0++) + for(int b1 = max_b; b1 >= b0; b1--) + { + Vector3 c0 = toVectorColor(r0, g0, b0); + Vector3 c1 = toVectorColor(r1, g1, b1); + + // Compute palette and evaluate error for these endpoints. + float error = evaluatePaletteError4(colors, c1, c0); + + if (error < bestError) { + bestError = error; + best0 = c1; // c0 > c1 + best1 = c0; + threeColorMode = false; + } + +#if 0 + error = evaluatePaletteError3(colors, /*maxColor=*/c1, /*minColor=*/c0); + + if (error < bestError) { + bestError = error; + best0 = c0; + best1 = c1; + threeColorMode = true; + } +#endif + } + + uint16 color0 = roundAndExpand(&best0); + uint16 color1 = roundAndExpand(&best1); + + if (threeColorMode) { + nvCheck(color0 <= color1); + output->col0 = Color16(color1); + output->col1 = Color16(color0); + output->indices = computeIndices3(colors, best0, best1); + } + else { + nvCheck(color0 >= color1); + output->col0 = Color16(color0); + output->col1 = Color16(color1); + output->indices = computeIndices4(colors, best0, best1); + } + +} diff --git a/src/nvtt/QuickCompressDXT.h b/src/nvtt/QuickCompressDXT.h index 56adf3a..dbfc824 100644 --- a/src/nvtt/QuickCompressDXT.h +++ b/src/nvtt/QuickCompressDXT.h @@ -31,6 +31,7 @@ namespace nv { struct ColorBlock; struct ColorSet; + struct AlphaBlock4x4; struct BlockDXT1; struct BlockDXT3; struct BlockDXT5; @@ -40,13 +41,15 @@ namespace nv namespace QuickCompress { - void compressDXT1(const ColorBlock & rgba, BlockDXT1 * dxtBlock); - void compressDXT1a(const ColorBlock & rgba, BlockDXT1 * dxtBlock); + void compressDXT1(const ColorBlock & src, BlockDXT1 * dst); + void compressDXT1a(const ColorBlock & src, BlockDXT1 * dst); - void compressDXT3(const ColorBlock & rgba, BlockDXT3 * dxtBlock); + void compressDXT3(const ColorBlock & src, BlockDXT3 * dst); - void compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dxtBlock, int iterationCount=8); - void compressDXT5(const ColorBlock & rgba, BlockDXT5 * dxtBlock, int iterationCount=8); + void compressDXT5A(const ColorBlock & src, AlphaBlockDXT5 * dst, int iterationCount=8); + void compressDXT5A(const AlphaBlock4x4 & src, AlphaBlockDXT5 * dst, 
int iterationCount=8); + + void compressDXT5(const ColorBlock & src, BlockDXT5 * dst, int iterationCount=8); void outputBlock4(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block); void outputBlock3(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block); diff --git a/src/nvtt/Surface.cpp b/src/nvtt/Surface.cpp index b3ebc0b..51d1521 100644 --- a/src/nvtt/Surface.cpp +++ b/src/nvtt/Surface.cpp @@ -28,6 +28,7 @@ #include "nvmath/Matrix.inl" #include "nvmath/Color.h" #include "nvmath/Half.h" +#include "nvmath/ftoi.h" #include "nvimage/Filter.h" #include "nvimage/ImageIO.h" @@ -78,13 +79,13 @@ namespace else if (format == Format_DXT3) { return 16; } - else if (format == Format_DXT5 || format == Format_DXT5n) { + else if (format == Format_DXT5 || format == Format_DXT5n || format == Format_BC3_RGBM) { return 16; } else if (format == Format_BC4) { return 8; } - else if (format == Format_BC5) { + else if (format == Format_BC5 || format == Format_BC5_Luma) { return 16; } else if (format == Format_CTX1) { @@ -347,13 +348,13 @@ int Surface::countMipmaps(int min_size) const return ::countMipmapsWithMinSize(m->image->width(), m->image->height(), 1, min_size); } -float Surface::alphaTestCoverage(float alphaRef/*= 0.5*/) const +float Surface::alphaTestCoverage(float alphaRef/*= 0.5*/, int alpha_channel/*=3*/) const { if (m->image == NULL) return 0.0f; alphaRef = nv::clamp(alphaRef, 1.0f/256, 255.0f/256); - return m->image->alphaTestCoverage(alphaRef, 3); + return m->image->alphaTestCoverage(alphaRef, alpha_channel); } float Surface::average(int channel, int alpha_channel/*= -1*/, float gamma /*= 2.2f*/) const @@ -419,7 +420,7 @@ void Surface::histogram(int channel, float rangeMin, float rangeMax, int binCoun const uint count = m->image->pixelCount(); for (uint i = 0; i < count; i++) { float f = c[i] * scale + bias; - int idx = ifloor(f); + int idx = ftoi_floor(f); if (idx < 0) idx = 0; if (idx > binCount-1) idx = binCount-1; binPtr[idx]++; @@ -434,18 +435,17 @@ void Surface::range(int channel, float * rangeMin, float * rangeMax, int alpha_c if (alpha_channel == -1) { // no alpha channel; just like the original range function - if (m->image != NULL) - { - float * c = img->channel(channel); + if (m->image != NULL) { + float * c = img->channel(channel); - const uint count = img->pixelCount(); - for (uint p = 0; p < count; p++) { - float f = c[p]; - if (f < range.x) range.x = f; - if (f > range.y) range.y = f; + const uint count = img->pixelCount(); + for (uint p = 0; p < count; p++) { + float f = c[p]; + if (f < range.x) range.x = f; + if (f > range.y) range.y = f; + } } } - } else { // use alpha test to ignore some pixels //note, it's quite possible to get FLT_MAX,-FLT_MAX back if all pixels fail the test @@ -623,6 +623,23 @@ bool Surface::setImage(nvtt::InputFormat format, int w, int h, int d, const void return false; } } + else if (format == InputFormat_R_32F) + { + const float * src = (const float *)data; + + TRY { + for (int i = 0; i < count; i++) + { + rdst[i] = src[i]; + gdst[i] = 0; + bdst[i] = 0; + adst[i] = 0; + } + } + CATCH { + return false; + } + } return true; } @@ -695,6 +712,20 @@ bool Surface::setImage(InputFormat format, int w, int h, int d, const void * r, return false; } } + else if (format == InputFormat_R_32F) + { + const float * rsrc = (const float *)r; + + TRY { + memcpy(rdst, rsrc, count * sizeof(float)); + memset(gdst, 0, count * sizeof(float)); + memset(bdst, 0, count * sizeof(float)); + memset(adst, 0, count * 
sizeof(float)); + } + CATCH { + return false; + } + } return true; } @@ -703,12 +734,12 @@ bool Surface::setImage(InputFormat format, int w, int h, int d, const void * r, bool Surface::setImage2D(Format format, Decoder decoder, int w, int h, const void * data) { if (format != nvtt::Format_BC1 && - format != nvtt::Format_BC2 && - format != nvtt::Format_BC3 && - format != nvtt::Format_BC4 && - format != nvtt::Format_BC5 && - format != nvtt::Format_BC6 && - format != nvtt::Format_BC7) + format != nvtt::Format_BC2 && + format != nvtt::Format_BC3 && + format != nvtt::Format_BC4 && + format != nvtt::Format_BC5 && + format != nvtt::Format_BC6 && + format != nvtt::Format_BC7) { return false; } @@ -1466,7 +1497,7 @@ void Surface::fill(float red, float green, float blue, float alpha) } -void Surface::scaleAlphaToCoverage(float coverage, float alphaRef/*= 0.5f*/) +void Surface::scaleAlphaToCoverage(float coverage, float alphaRef/*= 0.5f*/, int alpha_channel/*= 3*/) { if (isNull()) return; @@ -1474,7 +1505,7 @@ void Surface::scaleAlphaToCoverage(float coverage, float alphaRef/*= 0.5f*/) alphaRef = nv::clamp(alphaRef, 1.0f/256, 255.0f/256); - m->image->scaleAlphaToCoverage(coverage, alphaRef, 3); + m->image->scaleAlphaToCoverage(coverage, alphaRef, alpha_channel); } /*bool Surface::normalizeRange(float * rangeMin, float * rangeMax) @@ -1507,7 +1538,7 @@ void Surface::scaleAlphaToCoverage(float coverage, float alphaRef/*= 0.5f*/) // Ideally you should compress/quantize the RGB and M portions independently. // Once you have M quantized, you would compute the corresponding RGB and quantize that. -void Surface::toRGBM(float range/*= 1*/, float threshold/*= 0.0f*/) +void Surface::toRGBM(float range/*= 1*/, float threshold/*= 0.25*/) { if (isNull()) return; @@ -1523,60 +1554,71 @@ void Surface::toRGBM(float range/*= 1*/, float threshold/*= 0.0f*/) const uint count = img->pixelCount(); for (uint i = 0; i < count; i++) { - float R = r[i]; - float G = g[i]; - float B = b[i]; -#if 1 - float M = nv::clamp(max(max(R, G), max(B, threshold)), 0.0f, range); + float R = nv::clamp(r[i], 0.0f, 1.0f); + float G = nv::clamp(g[i], 0.0f, 1.0f); + float B = nv::clamp(b[i], 0.0f, 1.0f); - r[i] = nv::clamp(R / M, 0.0f, 1.0f); - g[i] = nv::clamp(G / M, 0.0f, 1.0f); - b[i] = nv::clamp(B / M, 0.0f, 1.0f); +#if 0 + // Baseline, no compression: + r[i] = R; + g[i] = G; + b[i] = B; + a[i] = 1; - a[i] = (M - threshold) / (range - threshold); +#elif 0 + float M = max(max(R, G), max(B, threshold)); -#else + r[i] = R / M; + g[i] = G / M; + b[i] = B / M; - // The optimal compressor theoretically produces the best results, but unfortunately introduces - // severe interpolation errors! + a[i] = (M - threshold) / (1 - threshold); + +#else + // The optimal compressor produces the best results, but can introduce interpolation errors! float bestM; float bestError = FLT_MAX; - int minM = iround(min(R, G, B) * 255.0f); + float M = max(max(R, G), max(B, threshold)); + int iM = ftoi_ceil((M - threshold) / (1 - threshold) * 255.0f); - for (int m = minM; m < 256; m++) { + //for (int m = 0; m < 256; m++) { // If we use the entire search space, interpolation errors are very likely to occur. + for (int m = max(iM-16, 0); m < min(iM+16, 256); m++) { // If we constrain the search space, these errors disappear. float fm = float(m) / 255.0f; + // Decode M + float M = fm * (1 - threshold) + threshold; + // Encode. 
- int ir = iround(255.0f * nv::clamp(R / fm, 0.0f, 1.0f)); - int ig = iround(255.0f * nv::clamp(G / fm, 0.0f, 1.0f)); - int ib = iround(255.0f * nv::clamp(B / fm, 0.0f, 1.0f)); + int ir = ftoi_round(255.0f * nv::saturate(R / M)); + int ig = ftoi_round(255.0f * nv::saturate(G / M)); + int ib = ftoi_round(255.0f * nv::saturate(B / M)); // Decode. - float fr = (float(ir) / 255.0f) * fm; - float fg = (float(ig) / 255.0f) * fm; - float fb = (float(ib) / 255.0f) * fm; + float fr = (float(ir) / 255.0f) * M; + float fg = (float(ig) / 255.0f) * M; + float fb = (float(ib) / 255.0f) * M; // Measure error. float error = square(R-fr) + square(G-fg) + square(B-fb); if (error < bestError) { bestError = error; - bestM = fm; + bestM = M; } } M = bestM; - r[i] = nv::clamp(R / M, 0.0f, 1.0f); - g[i] = nv::clamp(G / M, 0.0f, 1.0f); - b[i] = nv::clamp(B / M, 0.0f, 1.0f); - a[i] = M; + r[i] = nv::saturate(R / M); + g[i] = nv::saturate(G / M); + b[i] = nv::saturate(B / M); + a[i] = (M - threshold) / (1 - threshold); #endif } } - -void Surface::fromRGBM(float range/*= 1*/, float threshold/*= 0.0*/) +// @@ IC: Dubious merge. Review! +void Surface::fromRGBM(float range/*= 1*/, float threshold/*= 0.25*/) { if (isNull()) return; @@ -1798,7 +1840,7 @@ void Surface::toRGBE(int mantissaBits, int exponentBits) double denom = pow(2.0, double(E - exponentBias - mantissaBits)); // Refine exponent: - int m = iround(float(M / denom)); + int m = ftoi_round(float(M / denom)); nvDebugCheck(m <= (1 << mantissaBits)); if (m == (1 << mantissaBits)) { @@ -1866,10 +1908,10 @@ void Surface::fromRGBE(int mantissaBits, int exponentBits) const uint count = img->pixelCount(); for (uint i = 0; i < count; i++) { // Expand normalized float to to 9995 - int R = iround(r[i] * ((1 << mantissaBits) - 1)); - int G = iround(g[i] * ((1 << mantissaBits) - 1)); - int B = iround(b[i] * ((1 << mantissaBits) - 1)); - int E = iround(a[i] * ((1 << exponentBits) - 1)); + int R = ftoi_round(r[i] * ((1 << mantissaBits) - 1)); + int G = ftoi_round(g[i] * ((1 << mantissaBits) - 1)); + int B = ftoi_round(b[i] * ((1 << mantissaBits) - 1)); + int E = ftoi_round(a[i] * ((1 << exponentBits) - 1)); //float scale = ldexpf(1.0f, E - exponentBias - mantissaBits); float scale = powf(2, float(E - exponentBias - mantissaBits)); @@ -2741,8 +2783,8 @@ bool Surface::copy(const Surface & srcImage, int xsrc, int ysrc, int zsrc, int x FloatImage * dst = m->image; const FloatImage * src = srcImage.m->image; - if (toU32(xsrc + xsize) > src->width() || toU32(ysrc + ysize) > src->height() || toU32(zsrc + zsize) > src->depth()) return false; - if (toU32(xdst + xsize) > dst->width() || toU32(ydst + ysize) > dst->height() || toU32(zdst + zsize) > dst->depth()) return false; + if (U32(xsrc + xsize) > src->width() || U32(ysrc + ysize) > src->height() || U32(zsrc + zsize) > src->depth()) return false; + if (U32(xdst + xsize) > dst->width() || U32(ydst + ysize) > dst->height() || U32(zdst + zsize) > dst->depth()) return false; detach(); @@ -2765,6 +2807,65 @@ bool Surface::copy(const Surface & srcImage, int xsrc, int ysrc, int zsrc, int x } +// Draw colored border around atlas elements. +void Surface::setAtlasBorder(int aw, int ah, float r, float g, float b, float a) +{ + if (isNull()) return; + if (aw <= 0) return; + if (ah <= 0) return; + + detach(); + + FloatImage * img = m->image; + const uint w = img->width(); + const uint h = img->height(); + const uint d = img->depth(); + + // @@ Ideally the reminder of these divisions should be 0. 
+ uint tile_height = h / ah; + uint tile_width = w / aw; + + // Note that this renders two consecutive lines between tiles. In theory we could just have one, but this way I think we have better rotation invariance. + + for (uint z = 0; z < d; z++) + { + // Horizontal lines: + for (uint i = 0, y = 0; i < uint(ah); i++, y += tile_height) + { + for (uint x = 0; x < w; x++) + { + img->pixel(0, x, y, z) = r; + img->pixel(1, x, y, z) = g; + img->pixel(2, x, y, z) = b; + img->pixel(3, x, y, z) = a; + + img->pixel(0, x, y + tile_height - 1, z) = r; + img->pixel(1, x, y + tile_height - 1, z) = g; + img->pixel(2, x, y + tile_height - 1, z) = b; + img->pixel(3, x, y + tile_height - 1, z) = a; + } + } + + // Vertical lines: + for (uint i = 0, x = 0; i < uint(ah); i++, x += tile_width) + { + for (uint y = 0; y < h; y++) + { + img->pixel(0, x, y, z) = r; + img->pixel(1, x, y, z) = g; + img->pixel(2, x, y, z) = b; + img->pixel(3, x, y, z) = a; + + img->pixel(0, x + tile_width - 1, y, z) = r; + img->pixel(1, x + tile_width - 1, y, z) = g; + img->pixel(2, x + tile_width - 1, y, z) = b; + img->pixel(3, x + tile_width - 1, y, z) = a; + } + } + } +} + + float nvtt::rmsError(const Surface & reference, const Surface & image) { @@ -2839,5 +2940,24 @@ Surface nvtt::diff(const Surface & reference, const Surface & image, float scale return diffImage; } +float nvtt::rmsToneMappedError(const Surface & reference, const Surface & img, float exposure) +{ + // @@ We could do this in the rms function without having to create image copies. + Surface r = reference; + Surface i = img; + + // @@ Ideally we should use our Reindhart operator. Add Reindhart_L & Reindhart_M ? + + float scale = 1.0f / exposure; + + r.scaleBias(0, scale, 0); r.scaleBias(1, scale, 0); r.scaleBias(2, scale, 0); + r.toneMap(ToneMapper_Reindhart, NULL); + r.toSrgb(); + i.scaleBias(0, scale, 0); i.scaleBias(1, scale, 0); i.scaleBias(2, scale, 0); + i.toneMap(ToneMapper_Reindhart, NULL); + i.toSrgb(); + + return nv::rmsColorError(r.m->image, i.m->image, reference.alphaMode() == nvtt::AlphaMode_Transparency); +} diff --git a/src/nvtt/bc6h/bits.h b/src/nvtt/bc6h/bits.h index 9969c9e..35bc38e 100644 --- a/src/nvtt/bc6h/bits.h +++ b/src/nvtt/bc6h/bits.h @@ -41,7 +41,7 @@ public: return out; } int getptr() { return bptr; } - int setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; } + void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; } int getsize() { return bend; } private: @@ -60,8 +60,7 @@ private: return bit != 0; } void writeone(int bit) { - if (readonly) - throw "Writing a read-only bit stream"; + nvAssert (!readonly); // "Writing a read-only bit stream" nvAssert (bptr < maxbits); if (bptr >= maxbits) return; if (bit&1) diff --git a/src/nvtt/bc6h/tile.h b/src/nvtt/bc6h/tile.h index 3a9e068..66d5983 100644 --- a/src/nvtt/bc6h/tile.h +++ b/src/nvtt/bc6h/tile.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations #ifndef _ZOH_TILE_H #define _ZOH_TILE_H -#include "utils.h" +#include "zoh_utils.h" #include "nvmath/Vector.h" #include diff --git a/src/nvtt/bc6h/utils.cpp b/src/nvtt/bc6h/zoh_utils.cpp similarity index 96% rename from src/nvtt/bc6h/utils.cpp rename to src/nvtt/bc6h/zoh_utils.cpp index ff888e8..166d1f3 100644 --- a/src/nvtt/bc6h/utils.cpp +++ b/src/nvtt/bc6h/zoh_utils.cpp @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations // Utility and common routines -#include "utils.h" +#include "zoh_utils.h" #include 
"nvmath/Vector.inl" #include diff --git a/src/nvtt/bc6h/utils.h b/src/nvtt/bc6h/zoh_utils.h similarity index 100% rename from src/nvtt/bc6h/utils.h rename to src/nvtt/bc6h/zoh_utils.h diff --git a/src/nvtt/bc6h/zohone.cpp b/src/nvtt/bc6h/zohone.cpp index 2b246bb..43a302c 100644 --- a/src/nvtt/bc6h/zohone.cpp +++ b/src/nvtt/bc6h/zohone.cpp @@ -16,7 +16,7 @@ See the License for the specific language governing permissions and limitations #include "bits.h" #include "tile.h" #include "zoh.h" -#include "utils.h" +#include "zoh_utils.h" #include "nvmath/Vector.inl" #include "nvmath/Fitting.h" @@ -591,13 +591,14 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e // collect the pixels in the region int np = 0; - for (int y = 0; y < tile.size_y; y++) - for (int x = 0; x < tile.size_x; x++) - if (REGION(x,y,shapeindex) == region) - { - pixels[np] = tile.data[y][x]; - importance[np] = tile.importance_map[y][x]; - ++np; + for (int y = 0; y < tile.size_y; y++) { + for (int x = 0; x < tile.size_x; x++) { + if (REGION(x, y, shapeindex) == region) { + pixels[np] = tile.data[y][x]; + importance[np] = tile.importance_map[y][x]; + ++np; + } + } } optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]); @@ -660,7 +661,9 @@ float ZOH::refineone(const Tile &tile, int shapeindex_best, const FltEndpts endp } } } - throw "No candidate found, should never happen (refineone.)"; + + nvAssert (false); // "No candidate found, should never happen (refineone.)"; + return FLT_MAX; } static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_ONE], Vector3 palette[NREGIONS_ONE][NINDICES]) diff --git a/src/nvtt/bc6h/zohtwo.cpp b/src/nvtt/bc6h/zohtwo.cpp index 4c43fad..c585ed3 100644 --- a/src/nvtt/bc6h/zohtwo.cpp +++ b/src/nvtt/bc6h/zohtwo.cpp @@ -40,7 +40,7 @@ See the License for the specific language governing permissions and limitations #include "bits.h" #include "tile.h" #include "zoh.h" -#include "utils.h" +#include "zoh_utils.h" #include "nvmath/Fitting.h" #include "nvmath/Vector.inl" @@ -747,7 +747,8 @@ float ZOH::refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endp } } } - throw "No candidate found, should never happen (refinetwo.)"; + nvAssert(false); //throw "No candidate found, should never happen (refinetwo.)"; + return FLT_MAX; } static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_TWO], Vector3 palette[NREGIONS_TWO][NINDICES]) diff --git a/src/nvtt/bc7/avpcl_mode0.cpp b/src/nvtt/bc7/avpcl_mode0.cpp index 3464ebc..cc83ad2 100644 --- a/src/nvtt/bc7/avpcl_mode0.cpp +++ b/src/nvtt/bc7/avpcl_mode0.cpp @@ -21,7 +21,7 @@ See the License for the specific language governing permissions and limitations #include "nvmath/Vector.inl" #include "nvmath/Matrix.inl" #include "nvmath/Fitting.h" -#include "utils.h" +#include "avpcl_utils.h" #include "endpts.h" #include #include @@ -394,7 +394,7 @@ void AVPCL::decompress_mode0(const char *block, Tile &t) } // given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr -static float map_colors(const Vector4 colors[], int np, const IntEndptsRGB_2 &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) +static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB_2 &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) { Vector4 palette[NINDICES]; float toterr = 0; @@ -404,11 +404,11 @@ static 
float map_colors(const Vector4 colors[], int np, const IntEndptsRGB_2 &en for (int i = 0; i < np; ++i) { - float err, besterr = FLT_MAX; + float besterr = FLT_MAX; for (int j = 0; j < NINDICES && besterr > 0; ++j) { - err = Utils::metric4(colors[i], palette[j]); + float err = Utils::metric4(colors[i], palette[j]) * importance[i]; if (err > besterr) // error increased, so we're done searching break; @@ -472,7 +472,7 @@ static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endp // note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's // this function returns either old_err or a value smaller (if it was successful in improving the error) -static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, float old_err, int do_b, int indices[Tile::TILE_TOTAL]) { // we have the old endpoints: old_endpts @@ -511,7 +511,7 @@ static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPre continue; } - float err = map_colors(colors, np, temp_endpts, region_prec, min_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); if (err < min_err) { @@ -543,7 +543,7 @@ static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPre // for np = 16 -- adjust error thresholds as a function of np // always ensure endpoint ordering is preserved (no need to overlap the scan) // if orig_err returned from this is less than its input value, then indices[] will contain valid indices -static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec ®ion_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) { IntEndptsRGB_2 temp_endpts; float best_err = orig_err; @@ -593,7 +593,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -613,7 +613,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -636,7 +636,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec return best_err; } -static float optimize_one(const Vector4 colors[], int np, float orig_err, const IntEndptsRGB_2 &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGB_2 &opt_endpts) +static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB_2 &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGB_2 &opt_endpts) { float opt_err = orig_err; @@ -675,8 +675,8 @@ static float optimize_one(const Vector4 colors[], int np, 
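Threading the importance array through map_colors in every BC7 mode boils down to one thing: each pixel's palette error is scaled by its importance before it is accumulated, so endpoint optimization minimizes a weighted total. A condensed sketch of that inner loop; dist2 stands in for Utils::metric4, and the early break relies on the palette being ordered along the endpoint axis, as in the original code:

#include <float.h>

struct Vec4 { float x, y, z, w; };

static inline float dist2(const Vec4 &a, const Vec4 &b) // stand-in for Utils::metric4
{
    float dx = a.x - b.x, dy = a.y - b.y, dz = a.z - b.z, dw = a.w - b.w;
    return dx * dx + dy * dy + dz * dz + dw * dw;
}

static float weightedMapColors(const Vec4 colors[], const float importance[], int np,
                               const Vec4 palette[], int nindices, int indices[])
{
    float toterr = 0.0f;
    for (int i = 0; i < np; ++i)
    {
        float besterr = FLT_MAX;
        for (int j = 0; j < nindices && besterr > 0; ++j)
        {
            float err = dist2(colors[i], palette[j]) * importance[i];
            if (err > besterr) break; // error increased, so we're done searching
            if (err < besterr) { besterr = err; indices[i] = j; }
        }
        toterr += besterr;
    }
    return toterr;
}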
float orig_err, const { // figure out which endpoint when perturbed gives the most improvement and start there // if we just alternate, we can easily end up in a local minima - float err0 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A - float err1 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B if (err0 < err1) { @@ -712,7 +712,7 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const // now alternate endpoints and keep trying until there is no improvement for (;;) { - float err = perturb_one(colors, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); + float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); if (err >= opt_err) break; @@ -746,7 +746,7 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const bool first = true; for (int ch = 0; ch < NCHANNELS_RGB; ++ch) { - float new_err = exhaustive(colors, np, ch, region_prec, opt_err, opt_endpts, temp_indices0); + float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0); if (new_err < opt_err) { @@ -786,6 +786,7 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e const IntEndptsRGB_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB_2 opt_endpts[NREGIONS]) { Vector4 pixels[Tile::TILE_TOTAL]; + float importance[Tile::TILE_TOTAL]; IntEndptsRGB_2 temp_in, temp_out; int temp_indices[Tile::TILE_TOTAL]; @@ -794,10 +795,15 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e // collect the pixels in the region int np = 0; - for (int y = 0; y < tile.size_y; y++) - for (int x = 0; x < tile.size_x; x++) - if (REGION(x,y,shapeindex) == region) - pixels[np++] = tile.data[y][x]; + for (int y = 0; y < tile.size_y; y++) { + for (int x = 0; x < tile.size_x; x++) { + if (REGION(x, y, shapeindex) == region) { + pixels[np] = tile.data[y][x]; + importance[np] = tile.importance_map[y][x]; + np++; + } + } + } opt_endpts[region] = temp_in = orig_endpts[region]; opt_err[region] = orig_err[region]; @@ -812,10 +818,10 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e // make sure we have a valid error for temp_in // we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts // (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position) - float temp_in_err = map_colors(pixels, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); + float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); // now try to optimize these endpoints - float temp_out_err = optimize_one(pixels, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); + float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); // if we find an improvement, update the best so far and correct the output endpoints and errors if (temp_out_err < best_err) @@ -890,7 +896,8 @@ 
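optimize_one, now with the importance array threaded through, is a small hill climb per channel: perturb each endpoint once, start from whichever move helps more to avoid an obvious local minimum, then alternate endpoints until nothing improves. A control-flow sketch with a stubbed perturbation step:

#include <float.h>

struct Endpoints { int A[4]; int B[4]; };

// Stub for perturb_one(): the real code nudges one endpoint of channel 'ch'
// within its precision, re-runs map_colors with the importance weights, and
// returns the best error found (or old_err if nothing helps).
static float perturb(const Endpoints &e, int ch, int do_b, float old_err, Endpoints *out)
{
    *out = e;
    (void)ch; (void)do_b;
    return old_err;
}

static float hillClimb(Endpoints &opt, int nchannels, float opt_err)
{
    for (int ch = 0; ch < nchannels; ++ch)
    {
        Endpoints cand_a, cand_b;
        float err0 = perturb(opt, ch, /*do_b=*/0, opt_err, &cand_a); // move endpoint A
        float err1 = perturb(opt, ch, /*do_b=*/1, opt_err, &cand_b); // move endpoint B

        int do_b = (err0 < err1) ? 1 : 0; // continue by moving the other endpoint next
        if (err0 <  err1 && err0 < opt_err) { opt_err = err0; opt = cand_a; }
        if (err1 <= err0 && err1 < opt_err) { opt_err = err1; opt = cand_b; }

        for (;;) // alternate endpoints until no improvement
        {
            Endpoints cand;
            float err = perturb(opt, ch, do_b, opt_err, &cand);
            if (err >= opt_err) break;
            opt_err = err;
            opt = cand;
            do_b = 1 - do_b;
        }
    }
    return opt_err;
}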
static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpt } } } - throw "No candidate found, should never happen (mode avpcl 0)."; + nvAssert(false); // throw "No candidate found, should never happen (mode avpcl 0)."; + return FLT_MAX; } static void clamp(Vector4 &v) diff --git a/src/nvtt/bc7/avpcl_mode1.cpp b/src/nvtt/bc7/avpcl_mode1.cpp index 2141a0d..f8d03dc 100644 --- a/src/nvtt/bc7/avpcl_mode1.cpp +++ b/src/nvtt/bc7/avpcl_mode1.cpp @@ -21,7 +21,7 @@ See the License for the specific language governing permissions and limitations #include "nvmath/Vector.inl" #include "nvmath/Matrix.inl" #include "nvmath/Fitting.h" -#include "utils.h" +#include "avpcl_utils.h" #include "endpts.h" #include #include @@ -378,7 +378,7 @@ void AVPCL::decompress_mode1(const char *block, Tile &t) } // given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr -static float map_colors(const Vector4 colors[], int np, const IntEndptsRGB_1 &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) +static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB_1 &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) { Vector4 palette[NINDICES]; float toterr = 0; @@ -388,11 +388,11 @@ static float map_colors(const Vector4 colors[], int np, const IntEndptsRGB_1 &en for (int i = 0; i < np; ++i) { - float err, besterr = FLT_MAX; + float besterr = FLT_MAX; for (int j = 0; j < NINDICES && besterr > 0; ++j) { - err = Utils::metric4(colors[i], palette[j]); + float err = Utils::metric4(colors[i], palette[j]) * importance[i]; if (err > besterr) // error increased, so we're done searching break; @@ -456,7 +456,7 @@ static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_1 endp // note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's // this function returns either old_err or a value smaller (if it was successful in improving the error) -static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB_1 &old_endpts, IntEndptsRGB_1 &new_endpts, +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB_1 &old_endpts, IntEndptsRGB_1 &new_endpts, float old_err, int do_b, int indices[Tile::TILE_TOTAL]) { // we have the old endpoints: old_endpts @@ -495,7 +495,7 @@ static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPre continue; } - float err = map_colors(colors, np, temp_endpts, region_prec, min_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); if (err < min_err) { @@ -527,7 +527,7 @@ static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPre // for np = 16 -- adjust error thresholds as a function of np // always ensure endpoint ordering is preserved (no need to overlap the scan) // if orig_err returned from this is less than its input value, then indices[] will contain valid indices -static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGB_1 &opt_endpts, int indices[Tile::TILE_TOTAL]) +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGB_1 &opt_endpts, int indices[Tile::TILE_TOTAL]) { 
IntEndptsRGB_1 temp_endpts; float best_err = orig_err; @@ -577,7 +577,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -597,7 +597,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -619,7 +619,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec return best_err; } -static float optimize_one(const Vector4 colors[], int np, float orig_err, const IntEndptsRGB_1 &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGB_1 &opt_endpts) +static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB_1 &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGB_1 &opt_endpts) { float opt_err = orig_err; @@ -658,8 +658,8 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const { // figure out which endpoint when perturbed gives the most improvement and start there // if we just alternate, we can easily end up in a local minima - float err0 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A - float err1 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B if (err0 < err1) { @@ -695,7 +695,7 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const // now alternate endpoints and keep trying until there is no improvement for (;;) { - float err = perturb_one(colors, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); + float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); if (err >= opt_err) break; @@ -729,7 +729,7 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const bool first = true; for (int ch = 0; ch < NCHANNELS_RGB; ++ch) { - float new_err = exhaustive(colors, np, ch, region_prec, opt_err, opt_endpts, temp_indices0); + float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0); if (new_err < opt_err) { @@ -768,6 +768,7 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e IntEndptsRGB_1 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB_1 opt_endpts[NREGIONS]) { Vector4 pixels[Tile::TILE_TOTAL]; + float importance[Tile::TILE_TOTAL]; IntEndptsRGB_1 temp_in, temp_out; int temp_indices[Tile::TILE_TOTAL]; @@ -776,10 +777,15 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e // collect the pixels in the region int np = 0; - for (int y = 0; y < tile.size_y; y++) - for (int x = 0; x < tile.size_x; x++) - if (REGION(x,y,shapeindex) == 
region) - pixels[np++] = tile.data[y][x]; + for (int y = 0; y < tile.size_y; y++) { + for (int x = 0; x < tile.size_x; x++) { + if (REGION(x, y, shapeindex) == region) { + pixels[np] = tile.data[y][x]; + importance[np] = tile.importance_map[y][x]; + np++; + } + } + } opt_endpts[region] = temp_in = orig_endpts[region]; opt_err[region] = orig_err[region]; @@ -793,10 +799,10 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e // make sure we have a valid error for temp_in // we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts // (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position) - float temp_in_err = map_colors(pixels, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); + float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); // now try to optimize these endpoints - float temp_out_err = optimize_one(pixels, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); + float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); // if we find an improvement, update the best so far and correct the output endpoints and errors if (temp_out_err < best_err) @@ -873,7 +879,8 @@ static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpt } } } - throw "No candidate found, should never happen (mode avpcl 1)."; + nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 1)."; + return FLT_MAX; } static void clamp(Vector4 &v) @@ -909,11 +916,11 @@ static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts for (int x = 0; x < tile.size_x; x++) { int region = REGION(x,y,shapeindex); - float err, besterr = FLT_MAX; + float besterr = FLT_MAX; for (int i = 0; i < NINDICES && besterr > 0; ++i) { - err = Utils::metric4(tile.data[y][x], palette[region][i]); + float err = Utils::metric4(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x]; if (err > besterr) // error increased, so we're done searching. this works for most norms. 
break; diff --git a/src/nvtt/bc7/avpcl_mode2.cpp b/src/nvtt/bc7/avpcl_mode2.cpp index f6a5909..1d903eb 100644 --- a/src/nvtt/bc7/avpcl_mode2.cpp +++ b/src/nvtt/bc7/avpcl_mode2.cpp @@ -21,7 +21,7 @@ See the License for the specific language governing permissions and limitations #include "nvmath/Vector.inl" #include "nvmath/Matrix.inl" #include "nvmath/Fitting.h" -#include "utils.h" +#include "avpcl_utils.h" #include "endpts.h" #include #include @@ -342,7 +342,7 @@ void AVPCL::decompress_mode2(const char *block, Tile &t) } // given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr -static float map_colors(const Vector4 colors[], int np, const IntEndptsRGB &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) +static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) { Vector4 palette[NINDICES]; float toterr = 0; @@ -352,11 +352,11 @@ static float map_colors(const Vector4 colors[], int np, const IntEndptsRGB &endp for (int i = 0; i < np; ++i) { - float err, besterr = FLT_MAX; + float besterr = FLT_MAX; for (int j = 0; j < NINDICES && besterr > 0; ++j) { - err = Utils::metric4(colors[i], palette[j]); + float err = Utils::metric4(colors[i], palette[j]) * importance[i]; if (err > besterr) // error increased, so we're done searching break; @@ -420,7 +420,7 @@ static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB endpts // note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's // this function returns either old_err or a value smaller (if it was successful in improving the error) -static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB &old_endpts, IntEndptsRGB &new_endpts, +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB &old_endpts, IntEndptsRGB &new_endpts, float old_err, int do_b, int indices[Tile::TILE_TOTAL]) { // we have the old endpoints: old_endpts @@ -459,7 +459,7 @@ static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPre continue; } - float err = map_colors(colors, np, temp_endpts, region_prec, min_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); if (err < min_err) { @@ -491,7 +491,7 @@ static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPre // for np = 16 -- adjust error thresholds as a function of np // always ensure endpoint ordering is preserved (no need to overlap the scan) // if orig_err returned from this is less than its input value, then indices[] will contain valid indices -static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGB &opt_endpts, int indices[Tile::TILE_TOTAL]) +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGB &opt_endpts, int indices[Tile::TILE_TOTAL]) { IntEndptsRGB temp_endpts; float best_err = orig_err; @@ -541,7 +541,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices); + float err = 
map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -561,7 +561,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -584,7 +584,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec return best_err; } -static float optimize_one(const Vector4 colors[], int np, float orig_err, const IntEndptsRGB &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGB &opt_endpts) +static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGB &opt_endpts) { float opt_err = orig_err; @@ -623,8 +623,8 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const { // figure out which endpoint when perturbed gives the most improvement and start there // if we just alternate, we can easily end up in a local minima - float err0 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A - float err1 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B if (err0 < err1) { @@ -660,7 +660,7 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const // now alternate endpoints and keep trying until there is no improvement for (;;) { - float err = perturb_one(colors, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); + float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); if (err >= opt_err) break; @@ -694,7 +694,7 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const bool first = true; for (int ch = 0; ch < NCHANNELS_RGB; ++ch) { - float new_err = exhaustive(colors, np, ch, region_prec, opt_err, opt_endpts, temp_indices0); + float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0); if (new_err < opt_err) { @@ -733,6 +733,7 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e const IntEndptsRGB orig_endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB opt_endpts[NREGIONS_THREE]) { Vector4 pixels[Tile::TILE_TOTAL]; + float importance[Tile::TILE_TOTAL]; IntEndptsRGB temp_in, temp_out; for (int region=0; region #include @@ -390,7 +390,7 @@ void AVPCL::decompress_mode3(const char *block, Tile &t) } // given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr -static float map_colors(const Vector4 colors[], int np, const IntEndptsRGB_2 &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) +static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB_2 &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) { 
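exhaustive() is the heavier fallback behind the perturbation step: for a single channel it scans quantized endpoint pairs (a, b) around the current values and keeps the pair whose importance-weighted mapping error is lowest. A much simplified sketch of the idea; the real code adapts the search window to the current error, preserves endpoint ordering, and evaluates every candidate through map_colors() with the importance array:

#include <float.h>

// 'evaluate' is a hypothetical stand-in for "quantize this (a, b) pair into the
// endpoint struct and run map_colors() on the region's colors and importance".
static float exhaustiveChannel(int a0, int b0, int maxval, int radius,
                               float (*evaluate)(int a, int b),
                               int *best_a, int *best_b)
{
    float best_err = FLT_MAX;
    for (int a = a0 - radius; a <= a0 + radius; ++a)
    {
        if (a < 0 || a > maxval) continue;
        for (int b = b0 - radius; b <= b0 + radius; ++b)
        {
            if (b < 0 || b > maxval) continue;
            float err = evaluate(a, b);
            if (err < best_err) { best_err = err; *best_a = a; *best_b = b; }
        }
    }
    return best_err;
}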
Vector4 palette[NINDICES]; float toterr = 0; @@ -400,11 +400,11 @@ static float map_colors(const Vector4 colors[], int np, const IntEndptsRGB_2 &en for (int i = 0; i < np; ++i) { - float err, besterr = FLT_MAX; + float besterr = FLT_MAX; for (int j = 0; j < NINDICES && besterr > 0; ++j) { - err = Utils::metric4(colors[i], palette[j]); + float err = Utils::metric4(colors[i], palette[j]) * importance[i]; if (err > besterr) // error increased, so we're done searching break; @@ -467,7 +467,7 @@ static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endp // note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's // this function returns either old_err or a value smaller (if it was successful in improving the error) -static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, float old_err, int do_b, int indices[Tile::TILE_TOTAL]) { // we have the old endpoints: old_endpts @@ -506,7 +506,7 @@ static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPre continue; } - float err = map_colors(colors, np, temp_endpts, region_prec, min_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); if (err < min_err) { @@ -538,7 +538,7 @@ static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPre // for np = 16 -- adjust error thresholds as a function of np // always ensure endpoint ordering is preserved (no need to overlap the scan) // if orig_err returned from this is less than its input value, then indices[] will contain valid indices -static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec ®ion_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) { IntEndptsRGB_2 temp_endpts; float best_err = orig_err; @@ -588,7 +588,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -608,7 +608,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -631,7 +631,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec return best_err; } -static float optimize_one(const Vector4 colors[], int np, float orig_err, const IntEndptsRGB_2 &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGB_2 &opt_endpts) +static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB_2 &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGB_2 &opt_endpts) { float opt_err = orig_err; @@ 
-670,8 +670,8 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const { // figure out which endpoint when perturbed gives the most improvement and start there // if we just alternate, we can easily end up in a local minima - float err0 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A - float err1 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B if (err0 < err1) { @@ -707,7 +707,7 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const // now alternate endpoints and keep trying until there is no improvement for (;;) { - float err = perturb_one(colors, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); + float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); if (err >= opt_err) break; @@ -741,7 +741,7 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const bool first = true; for (int ch = 0; ch < NCHANNELS_RGB; ++ch) { - float new_err = exhaustive(colors, np, ch, region_prec, opt_err, opt_endpts, temp_indices0); + float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0); if (new_err < opt_err) { @@ -781,6 +781,7 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e const IntEndptsRGB_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB_2 opt_endpts[NREGIONS]) { Vector4 pixels[Tile::TILE_TOTAL]; + float importance[Tile::TILE_TOTAL]; IntEndptsRGB_2 temp_in, temp_out; int temp_indices[Tile::TILE_TOTAL]; @@ -789,10 +790,15 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e // collect the pixels in the region int np = 0; - for (int y = 0; y < tile.size_y; y++) - for (int x = 0; x < tile.size_x; x++) - if (REGION(x,y,shapeindex) == region) - pixels[np++] = tile.data[y][x]; + for (int y = 0; y < tile.size_y; y++) { + for (int x = 0; x < tile.size_x; x++) { + if (REGION(x, y, shapeindex) == region) { + pixels[np] = tile.data[y][x]; + importance[np] = tile.importance_map[y][x]; + np++; + } + } + } opt_endpts[region] = temp_in = orig_endpts[region]; opt_err[region] = orig_err[region]; @@ -807,10 +813,10 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e // make sure we have a valid error for temp_in // we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts // (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position) - float temp_in_err = map_colors(pixels, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); + float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); // now try to optimize these endpoints - float temp_out_err = optimize_one(pixels, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); + float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); // if we find an improvement, update the best so far and correct the output 
endpoints and errors if (temp_out_err < best_err) @@ -885,7 +891,8 @@ static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpt } } } - throw "No candidate found, should never happen (avpcl mode 3)."; + nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 3)."; + return FLT_MAX; } static void clamp(Vector4 &v) diff --git a/src/nvtt/bc7/avpcl_mode4.cpp b/src/nvtt/bc7/avpcl_mode4.cpp index 7ed5585..4be6d3a 100644 --- a/src/nvtt/bc7/avpcl_mode4.cpp +++ b/src/nvtt/bc7/avpcl_mode4.cpp @@ -21,7 +21,7 @@ See the License for the specific language governing permissions and limitations #include "nvmath/Vector.inl" #include "nvmath/Matrix.inl" #include "nvmath/Fitting.h" -#include "utils.h" +#include "avpcl_utils.h" #include "endpts.h" #include #include @@ -353,9 +353,9 @@ static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS]) if (p.transform_mode) { // endpts[0].A[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]); // always positive here - endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[1]); - endpts[1].A[i] = SIGN_EXTEND(endpts[1].A[i], p.chan[i].nbitsizes[2]); - endpts[1].B[i] = SIGN_EXTEND(endpts[1].B[i], p.chan[i].nbitsizes[3]); + endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]); + endpts[1].A[i] = SIGN_EXTEND(endpts[1].A[i], p.chan[i].nbitsizes[1]); + endpts[1].B[i] = SIGN_EXTEND(endpts[1].B[i], p.chan[i].nbitsizes[1]); } } } @@ -422,7 +422,7 @@ void AVPCL::decompress_mode4(const char *block, Tile &t) // given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr // we already have a candidate mapping when we call this function, thus an error. take an early exit if the accumulated error so far // exceeds what we already have -static float map_colors(const Vector4 colors[], int np, int rotatemode, int indexmode, const IntEndptsRGBA &endpts, const RegionPrec ®ion_prec, float current_besterr, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) +static float map_colors(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, const IntEndptsRGBA &endpts, const RegionPrec ®ion_prec, float current_besterr, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) { Vector3 palette_rgb[NINDICES3]; // could be nindices2 float palette_a[NINDICES3]; // could be nindices2 @@ -519,7 +519,7 @@ static float map_colors(const Vector4 colors[], int np, int rotatemode, int inde } palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).x : (rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).y : - (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : (nvCheckMacro(0),0); + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : nvCheckMacro(0); toterr += besterr; // do A index @@ -647,7 +647,7 @@ static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int } palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).x : (rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).y : - (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).z : (nvCheckMacro(0),0); + (rotatemode == ROTATEMODE_RGBA_RGAB) ? 
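The sign_extend fix in modes 4 and 5 above changes which per-channel bit width is applied to each transformed endpoint delta (nbitsizes[0]/[1] instead of [1]/[2]/[3]). For reference, an n-bit sign extension of the kind the SIGN_EXTEND macro is assumed to perform:

// Interpret the low 'nbits' bits of v as a two's-complement value.
static inline int signExtend(int v, int nbits)
{
    int m = 1 << (nbits - 1);                    // sign-bit mask
    return ((v & ((1 << nbits) - 1)) ^ m) - m;
}

// e.g. signExtend(0x3F, 6) == -1, signExtend(0x1F, 6) == 31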
(palette_rgb[region][bestindex]).z : nvCheckMacro(0); toterr[region] += besterr; // do A index @@ -672,7 +672,7 @@ static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int // note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's // this function returns either old_err or a value smaller (if it was successful in improving the error) -static float perturb_one(const Vector4 colors[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts, +static float perturb_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts, float old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) { // we have the old endpoints: old_endpts @@ -712,7 +712,7 @@ static float perturb_one(const Vector4 colors[], int np, int rotatemode, int ind continue; } - float err = map_colors(colors, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices); + float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices); if (err < min_err) { @@ -744,7 +744,7 @@ static float perturb_one(const Vector4 colors[], int np, int rotatemode, int ind // if err > 40 6.25% // for np = 16 -- adjust error thresholds as a function of np // always ensure endpoint ordering is preserved (no need to overlap the scan) -static float exhaustive(const Vector4 colors[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) +static float exhaustive(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) { IntEndptsRGBA temp_endpts; float best_err = orig_err; @@ -795,7 +795,7 @@ static float exhaustive(const Vector4 colors[], int np, int rotatemode, int inde temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -816,7 +816,7 @@ static float exhaustive(const Vector4 colors[], int np, int rotatemode, int inde temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -841,7 +841,7 @@ static float exhaustive(const Vector4 colors[], int np, int rotatemode, int inde return best_err; } -static float optimize_one(const Vector4 colors[], int np, int rotatemode, int indexmode, float orig_err, const IntEndptsRGBA &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGBA &opt_endpts) +static float optimize_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, float orig_err, const IntEndptsRGBA &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGBA &opt_endpts) { float opt_err = orig_err; @@ -878,8 +878,8 @@ static float optimize_one(const Vector4 colors[], int np, int rotatemode, int in { // figure out 
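The mode 4/5 map_colors above also takes the best error found so far (current_besterr) and bails out as soon as the accumulated error of the current candidate exceeds it, since that candidate can no longer win. A minimal sketch of that early-out pattern:

#include <float.h>

// Accumulate per-pixel errors, giving up as soon as the running total can no
// longer beat the best mapping already known to the caller.
static float accumulateWithEarlyOut(const float pixel_err[], int np, float current_besterr)
{
    float toterr = 0.0f;
    for (int i = 0; i < np; ++i)
    {
        toterr += pixel_err[i];
        if (toterr > current_besterr)
            return toterr; // already worse; the caller will discard this candidate
    }
    return toterr;
}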
which endpoint when perturbed gives the most improvement and start there // if we just alternate, we can easily end up in a local minima - float err0 = perturb_one(colors, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A - float err1 = perturb_one(colors, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + float err0 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B if (err0 < err1) { @@ -917,7 +917,7 @@ static float optimize_one(const Vector4 colors[], int np, int rotatemode, int in // now alternate endpoints and keep trying until there is no improvement for (;;) { - float err = perturb_one(colors, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); + float err = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); if (err >= opt_err) break; @@ -950,7 +950,7 @@ static float optimize_one(const Vector4 colors[], int np, int rotatemode, int in bool first = true; for (int ch = 0; ch < NCHANNELS_RGBA; ++ch) { - float new_err = exhaustive(colors, np, rotatemode, indexmode, ch, region_prec, opt_err, opt_endpts, temp_indices0); + float new_err = exhaustive(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_err, opt_endpts, temp_indices0); if (new_err < opt_err) { @@ -990,6 +990,7 @@ static void optimize_endpts(const Tile &tile, int shapeindex, int rotatemode, in const IntEndptsRGBA orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA opt_endpts[NREGIONS]) { Vector4 pixels[Tile::TILE_TOTAL]; + float importance[Tile::TILE_TOTAL]; IntEndptsRGBA temp_in, temp_out; for (int region=0; region #include @@ -354,9 +354,9 @@ static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS]) if (p.transform_mode) { // endpts[0].A[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]); // always positive here - endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[1]); - endpts[1].A[i] = SIGN_EXTEND(endpts[1].A[i], p.chan[i].nbitsizes[2]); - endpts[1].B[i] = SIGN_EXTEND(endpts[1].B[i], p.chan[i].nbitsizes[3]); + endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]); + endpts[1].A[i] = SIGN_EXTEND(endpts[1].A[i], p.chan[i].nbitsizes[1]); + endpts[1].B[i] = SIGN_EXTEND(endpts[1].B[i], p.chan[i].nbitsizes[1]); } } } @@ -423,7 +423,7 @@ void AVPCL::decompress_mode5(const char *block, Tile &t) // given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr // we already have a candidate mapping when we call this function, thus an error. 
take an early exit if the accumulated error so far // exceeds what we already have -static float map_colors(const Vector4 colors[], int np, int rotatemode, int indexmode, const IntEndptsRGBA &endpts, const RegionPrec ®ion_prec, float current_besterr, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) +static float map_colors(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, const IntEndptsRGBA &endpts, const RegionPrec ®ion_prec, float current_besterr, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) { Vector3 palette_rgb[NINDICES3]; // could be nindices2 float palette_a[NINDICES3]; // could be nindices2 @@ -520,7 +520,7 @@ static float map_colors(const Vector4 colors[], int np, int rotatemode, int inde } palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).x : (rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).y : - (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : (nvCheckMacro(0),0); + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : nvCheckMacro(0); toterr += besterr; // do A index @@ -648,7 +648,7 @@ static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int } palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).x : (rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).y : - (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).z : (nvCheckMacro(0),0); + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).z : nvCheckMacro(0); toterr[region] += besterr; // do A index @@ -673,7 +673,7 @@ static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int // note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's // this function returns either old_err or a value smaller (if it was successful in improving the error) -static float perturb_one(const Vector4 colors[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts, +static float perturb_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts, float old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) { // we have the old endpoints: old_endpts @@ -713,7 +713,7 @@ static float perturb_one(const Vector4 colors[], int np, int rotatemode, int ind continue; } - float err = map_colors(colors, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices); + float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices); if (err < min_err) { @@ -745,7 +745,7 @@ static float perturb_one(const Vector4 colors[], int np, int rotatemode, int ind // if err > 40 6.25% // for np = 16 -- adjust error thresholds as a function of np // always ensure endpoint ordering is preserved (no need to overlap the scan) -static float exhaustive(const Vector4 colors[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) +static float exhaustive(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) { IntEndptsRGBA temp_endpts; float best_err 
= orig_err; @@ -796,7 +796,7 @@ static float exhaustive(const Vector4 colors[], int np, int rotatemode, int inde temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -817,7 +817,7 @@ static float exhaustive(const Vector4 colors[], int np, int rotatemode, int inde temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -842,7 +842,7 @@ static float exhaustive(const Vector4 colors[], int np, int rotatemode, int inde return best_err; } -static float optimize_one(const Vector4 colors[], int np, int rotatemode, int indexmode, float orig_err, const IntEndptsRGBA &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGBA &opt_endpts) +static float optimize_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, float orig_err, const IntEndptsRGBA &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGBA &opt_endpts) { float opt_err = orig_err; @@ -879,8 +879,8 @@ static float optimize_one(const Vector4 colors[], int np, int rotatemode, int in { // figure out which endpoint when perturbed gives the most improvement and start there // if we just alternate, we can easily end up in a local minima - float err0 = perturb_one(colors, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A - float err1 = perturb_one(colors, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + float err0 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B if (err0 < err1) { @@ -918,7 +918,7 @@ static float optimize_one(const Vector4 colors[], int np, int rotatemode, int in // now alternate endpoints and keep trying until there is no improvement for (;;) { - float err = perturb_one(colors, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); + float err = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); if (err >= opt_err) break; @@ -951,7 +951,7 @@ static float optimize_one(const Vector4 colors[], int np, int rotatemode, int in bool first = true; for (int ch = 0; ch < NCHANNELS_RGBA; ++ch) { - float new_err = exhaustive(colors, np, rotatemode, indexmode, ch, region_prec, opt_err, opt_endpts, temp_indices0); + float new_err = exhaustive(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_err, opt_endpts, temp_indices0); if (new_err < opt_err) { @@ -991,6 +991,7 @@ static void optimize_endpts(const Tile &tile, int shapeindex, int rotatemode, in const IntEndptsRGBA orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA opt_endpts[NREGIONS]) { Vector4 pixels[Tile::TILE_TOTAL]; + float importance[Tile::TILE_TOTAL]; IntEndptsRGBA temp_in, temp_out; for 
(int region=0; region #include @@ -390,7 +390,7 @@ void AVPCL::decompress_mode6(const char *block, Tile &t) } // given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr -static float map_colors(const Vector4 colors[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) +static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) { Vector4 palette[NINDICES]; float toterr = 0; @@ -470,7 +470,7 @@ static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 end // note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's // this function returns either old_err or a value smaller (if it was successful in improving the error) -static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts, +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts, float old_err, int do_b, int indices[Tile::TILE_TOTAL]) { // we have the old endpoints: old_endpts @@ -509,7 +509,7 @@ static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPre continue; } - float err = map_colors(colors, np, temp_endpts, region_prec, min_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); if (err < min_err) { @@ -541,7 +541,7 @@ static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPre // for np = 16 -- adjust error thresholds as a function of np // always ensure endpoint ordering is preserved (no need to overlap the scan) // if orig_err returned from this is less than its input value, then indices[] will contain valid indices -static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) { IntEndptsRGBA_2 temp_endpts; float best_err = orig_err; @@ -591,7 +591,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -611,7 +611,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -634,7 +634,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec return best_err; } -static float optimize_one(const Vector4 colors[], int np, float orig_err, const IntEndptsRGBA_2 &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGBA_2 &opt_endpts) +static float optimize_one(const Vector4 colors[], 
const float importance[], int np, float orig_err, const IntEndptsRGBA_2 &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGBA_2 &opt_endpts) { float opt_err = orig_err; @@ -673,8 +673,8 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const { // figure out which endpoint when perturbed gives the most improvement and start there // if we just alternate, we can easily end up in a local minima - float err0 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A - float err1 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B if (err0 < err1) { @@ -710,7 +710,7 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const // now alternate endpoints and keep trying until there is no improvement for (;;) { - float err = perturb_one(colors, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); + float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); if (err >= opt_err) break; @@ -744,7 +744,7 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const bool first = true; for (int ch = 0; ch < NCHANNELS_RGBA; ++ch) { - float new_err = exhaustive(colors, np, ch, region_prec, opt_err, opt_endpts, temp_indices0); + float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0); if (new_err < opt_err) { @@ -783,6 +783,7 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e IntEndptsRGBA_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA_2 opt_endpts[NREGIONS]) { Vector4 pixels[Tile::TILE_TOTAL]; + float importance[Tile::TILE_TOTAL]; IntEndptsRGBA_2 temp_in, temp_out; int temp_indices[Tile::TILE_TOTAL]; @@ -791,10 +792,15 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e // collect the pixels in the region int np = 0; - for (int y = 0; y < tile.size_y; y++) - for (int x = 0; x < tile.size_x; x++) - if (REGION(x,y,shapeindex) == region) - pixels[np++] = tile.data[y][x]; + for (int y = 0; y < tile.size_y; y++) { + for (int x = 0; x < tile.size_x; x++) { + if (REGION(x, y, shapeindex) == region) { + pixels[np] = tile.data[y][x]; + importance[np] = tile.importance_map[y][x]; + np++; + } + } + } opt_endpts[region] = temp_in = orig_endpts[region]; opt_err[region] = orig_err[region]; @@ -810,10 +816,10 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e // make sure we have a valid error for temp_in // we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts // (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position) - float temp_in_err = map_colors(pixels, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); + float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); // now try to optimize these endpoints - float temp_out_err = optimize_one(pixels, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); + float temp_out_err = 
optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); // if we find an improvement, update the best so far and correct the output endpoints and errors if (temp_out_err < best_err) @@ -880,7 +886,8 @@ static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpt return orig_toterr; } } - throw "No candidate found, should never happen (avpcl mode 6)."; + nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 6)."; + return FLT_MAX; } static void clamp(Vector4 &v) diff --git a/src/nvtt/bc7/avpcl_mode7.cpp b/src/nvtt/bc7/avpcl_mode7.cpp index fe72d51..6733e02 100644 --- a/src/nvtt/bc7/avpcl_mode7.cpp +++ b/src/nvtt/bc7/avpcl_mode7.cpp @@ -21,7 +21,7 @@ See the License for the specific language governing permissions and limitations #include "nvmath/Vector.inl" #include "nvmath/Matrix.inl" #include "nvmath/Fitting.h" -#include "utils.h" +#include "avpcl_utils.h" #include "endpts.h" #include #include @@ -423,7 +423,7 @@ void AVPCL::decompress_mode7(const char *block, Tile &t) } // given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr -static float map_colors(const Vector4 colors[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) +static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) { Vector4 palette[NINDICES]; float toterr = 0; @@ -503,7 +503,7 @@ static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 end // note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's // this function returns either old_err or a value smaller (if it was successful in improving the error) -static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts, +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts, float old_err, int do_b, int indices[Tile::TILE_TOTAL]) { // we have the old endpoints: old_endpts @@ -542,7 +542,7 @@ static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPre continue; } - float err = map_colors(colors, np, temp_endpts, region_prec, min_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); if (err < min_err) { @@ -574,7 +574,7 @@ static float perturb_one(const Vector4 colors[], int np, int ch, const RegionPre // for np = 16 -- adjust error thresholds as a function of np // always ensure endpoint ordering is preserved (no need to overlap the scan) // if orig_err returned from this is less than its input value, then indices[] will contain valid indices -static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) { IntEndptsRGBA_2 temp_endpts; float best_err = orig_err; @@ -624,7 +624,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec 
temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -644,7 +644,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec temp_endpts.A[ch] = a; temp_endpts.B[ch] = b; - float err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices); + float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices); if (err < best_err) { amin = a; @@ -667,7 +667,7 @@ static float exhaustive(const Vector4 colors[], int np, int ch, const RegionPrec return best_err; } -static float optimize_one(const Vector4 colors[], int np, float orig_err, const IntEndptsRGBA_2 &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGBA_2 &opt_endpts) +static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGBA_2 &orig_endpts, const RegionPrec ®ion_prec, IntEndptsRGBA_2 &opt_endpts) { float opt_err = orig_err; @@ -706,8 +706,8 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const { // figure out which endpoint when perturbed gives the most improvement and start there // if we just alternate, we can easily end up in a local minima - float err0 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A - float err1 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B if (err0 < err1) { @@ -743,7 +743,7 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const // now alternate endpoints and keep trying until there is no improvement for (;;) { - float err = perturb_one(colors, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); + float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0); if (err >= opt_err) break; @@ -777,7 +777,7 @@ static float optimize_one(const Vector4 colors[], int np, float orig_err, const bool first = true; for (int ch = 0; ch < NCHANNELS_RGBA; ++ch) { - float new_err = exhaustive(colors, np, ch, region_prec, opt_err, opt_endpts, temp_indices0); + float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0); if (new_err < opt_err) { @@ -816,6 +816,7 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e IntEndptsRGBA_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA_2 opt_endpts[NREGIONS]) { Vector4 pixels[Tile::TILE_TOTAL]; + float importance[Tile::TILE_TOTAL]; IntEndptsRGBA_2 temp_in, temp_out; int temp_indices[Tile::TILE_TOTAL]; @@ -824,10 +825,15 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e // collect the pixels in the region int np = 0; - for (int y = 0; y < tile.size_y; y++) - for (int x = 0; x < tile.size_x; x++) - if (REGION(x,y,shapeindex) == region) - pixels[np++] = tile.data[y][x]; + for (int y = 0; y < tile.size_y; y++) { + for (int x = 0; x < tile.size_x; x++) { + if (REGION(x, y, 
shapeindex) == region) { + pixels[np] = tile.data[y][x]; + importance[np] = tile.importance_map[y][x]; + np++; + } + } + } opt_endpts[region] = temp_in = orig_endpts[region]; opt_err[region] = orig_err[region]; @@ -843,10 +849,10 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e // make sure we have a valid error for temp_in // we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts // (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position) - float temp_in_err = map_colors(pixels, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); + float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); // now try to optimize these endpoints - float temp_out_err = optimize_one(pixels, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); + float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); // if we find an improvement, update the best so far and correct the output endpoints and errors if (temp_out_err < best_err) @@ -921,7 +927,8 @@ static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpt } } } - throw "No candidate found, should never happen (avpcl mode 7)."; + nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 7)."; + return FLT_MAX; } static void clamp(Vector4 &v) diff --git a/src/nvtt/bc7/utils.cpp b/src/nvtt/bc7/avpcl_utils.cpp similarity index 93% rename from src/nvtt/bc7/utils.cpp rename to src/nvtt/bc7/avpcl_utils.cpp index a581703..f7aa003 100644 --- a/src/nvtt/bc7/utils.cpp +++ b/src/nvtt/bc7/avpcl_utils.cpp @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations // Utility and common routines -#include "utils.h" +#include "avpcl_utils.h" #include "avpcl.h" #include "nvcore/Debug.h" #include "nvmath/Vector.inl" @@ -129,7 +129,7 @@ float Utils::metric4(Vector4::Arg a, Vector4::Arg b) { rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; } - else if (AVPCL::flag_nonuniform_ati) + else /*if (AVPCL::flag_nonuniform_ati)*/ { rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; } @@ -255,7 +255,7 @@ float Utils::metric4premult(Vector4::Arg a, Vector4::Arg b) { rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; } - else if (AVPCL::flag_nonuniform_ati) + else /*if (AVPCL::flag_nonuniform_ati)*/ { rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; } @@ -286,7 +286,7 @@ float Utils::metric3premult_alphaout(Vector3::Arg rgb0, float a0, Vector3::Arg r { rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; } - else if (AVPCL::flag_nonuniform_ati) + else /*if (AVPCL::flag_nonuniform_ati)*/ { rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; } @@ -341,7 +341,7 @@ float Utils::metric3premult_alphain(Vector3::Arg rgb0, Vector3::Arg rgb1, int ro { rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; } - else if (AVPCL::flag_nonuniform_ati) + else /*if (AVPCL::flag_nonuniform_ati)*/ { rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; } diff --git a/src/nvtt/bc7/utils.h b/src/nvtt/bc7/avpcl_utils.h similarity index 100% rename from src/nvtt/bc7/utils.h rename to src/nvtt/bc7/avpcl_utils.h diff --git a/src/nvtt/bc7/bits.h b/src/nvtt/bc7/bits.h index 7b42a70..3730b70 100644 --- a/src/nvtt/bc7/bits.h +++ b/src/nvtt/bc7/bits.h @@ -60,8 +60,7 @@ private: return bit != 0; } void writeone(int bit) { - if (readonly) - throw "Writing a read-only bit stream"; + nvAssert (!readonly); // "Writing a read-only bit 
stream" nvAssert (bptr < maxbits); if (bptr >= maxbits) return; if (bit&1) diff --git a/src/nvtt/bc7/tile.h b/src/nvtt/bc7/tile.h index 85fcc57..99ae2ef 100644 --- a/src/nvtt/bc7/tile.h +++ b/src/nvtt/bc7/tile.h @@ -15,7 +15,7 @@ See the License for the specific language governing permissions and limitations #include "nvmath/Vector.h" #include -#include "utils.h" +#include "avpcl_utils.h" namespace AVPCL { @@ -28,6 +28,7 @@ public: static const int TILE_W = 4; static const int TILE_TOTAL = TILE_H * TILE_W; nv::Vector4 data[TILE_H][TILE_W]; + float importance_map[TILE_H][TILE_W]; int size_x, size_y; // actual size of tile Tile() {}; diff --git a/src/nvtt/cuda/CompressKernel.cu b/src/nvtt/cuda/CompressKernel.cu index 21eca4f..4ec6143 100644 --- a/src/nvtt/cuda/CompressKernel.cu +++ b/src/nvtt/cuda/CompressKernel.cu @@ -1285,9 +1285,9 @@ __device__ void saveBlockDXT1_Parallel(uint endpoints, float3 colors[16], int xr ushort endpoint0 = endpoints & 0xFFFF; ushort endpoint1 = endpoints >> 16; - int3 palette[4]; - palette[0] = color16ToInt3(endpoint0); - palette[1] = color16ToInt3(endpoint1); + int3 palette[4]; + palette[0] = color16ToInt3(endpoint0); + palette[1] = color16ToInt3(endpoint1); int d0 = colorDistance(palette[0], color); int d1 = colorDistance(palette[1], color); diff --git a/src/nvtt/nvtt.h b/src/nvtt/nvtt.h index b64e41a..0195362 100644 --- a/src/nvtt/nvtt.h +++ b/src/nvtt/nvtt.h @@ -102,8 +102,13 @@ namespace nvtt Format_DXT1n, // Not supported. Format_CTX1, // Not supported. - Format_BC6, - Format_BC7, + Format_BC6, // Not supported yet. + Format_BC7, // Not supported yet. + + Format_BC5_Luma, // Two DXT alpha blocks encoding a single float. + Format_BC3_RGBM, // + + Format_Count }; // Pixel types. These basically indicate how the output should be interpreted, but do not have any influence over the input. They are only relevant in RGBA mode. @@ -132,6 +137,7 @@ namespace nvtt Decoder_D3D10, Decoder_D3D9, Decoder_NV5x, + //Decoder_RSX, // To take advantage of DXT5 bug. }; @@ -160,8 +166,9 @@ namespace nvtt NVTT_API void setPitchAlignment(int pitchAlignment); - // @@ I wish this wasn't part of the compression options. Quantization is applied before compression. We don't have compressors with error diffusion. - NVTT_API void setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold = 127); // (Deprecated in NVTT 2.1) + // @@ I wish this wasn't part of the compression options. Quantization is applied before compression. We don't have compressors with error diffusion. + // @@ These options are only taken into account when using the InputOptions API. + NVTT_API void setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold = 127); NVTT_API void setTargetDecoder(Decoder decoder); @@ -205,6 +212,7 @@ namespace nvtt InputFormat_BGRA_8UB, // Normalized [0, 1] 8 bit fixed point. InputFormat_RGBA_16F, // 16 bit floating point. InputFormat_RGBA_32F, // 32 bit floating point. + InputFormat_R_32F, // Single channel 32 bit floating point. }; // Mipmap downsampling filters. @@ -426,6 +434,7 @@ namespace nvtt // A surface is one level of a 2D or 3D texture. (New in NVTT 2.1) + // @@ It would be nice to add support for texture borders for correct resizing of tiled textures and constrained DXT compression. 
struct Surface { NVTT_API Surface(); @@ -450,7 +459,7 @@ namespace nvtt NVTT_API bool isNormalMap() const; NVTT_API int countMipmaps() const; NVTT_API int countMipmaps(int min_size) const; - NVTT_API float alphaTestCoverage(float alphaRef = 0.5) const; + NVTT_API float alphaTestCoverage(float alphaRef = 0.5, int alpha_channel = 3) const; NVTT_API float average(int channel, int alpha_channel = -1, float gamma = 2.2f) const; NVTT_API const float * data() const; NVTT_API const float * channel(int i) const; @@ -496,9 +505,9 @@ namespace nvtt NVTT_API void toGreyScale(float redScale, float greenScale, float blueScale, float alphaScale); NVTT_API void setBorder(float r, float g, float b, float a); NVTT_API void fill(float r, float g, float b, float a); - NVTT_API void scaleAlphaToCoverage(float coverage, float alphaRef = 0.5f); - NVTT_API void toRGBM(float range = 1.0f, float threshold = 0.0f); - NVTT_API void fromRGBM(float range = 1.0f, float threshold = 0.0f); + NVTT_API void scaleAlphaToCoverage(float coverage, float alphaRef = 0.5f, int alpha_channel = 3); + NVTT_API void toRGBM(float range = 1.0f, float threshold = 0.25f); + NVTT_API void fromRGBM(float range = 1.0f, float threshold = 0.25f); NVTT_API void toLM(float range = 1.0f, float threshold = 0.0f); NVTT_API void toRGBE(int mantissaBits, int exponentBits); NVTT_API void fromRGBE(int mantissaBits, int exponentBits); @@ -511,6 +520,7 @@ namespace nvtt NVTT_API void convolve(int channel, int kernelSize, float * kernelData); NVTT_API void toLogScale(int channel, float base); NVTT_API void fromLogScale(int channel, float base); + NVTT_API void setAtlasBorder(int w, int h, float r, float g, float b, float a); NVTT_API void toneMap(ToneMapper tm, float * parameters); @@ -648,6 +658,7 @@ namespace nvtt NVTT_API float angularError(const Surface & reference, const Surface & img); NVTT_API Surface diff(const Surface & reference, const Surface & img, float scale); + NVTT_API float rmsToneMappedError(const Surface & reference, const Surface & img, float exposure); } // nvtt namespace diff --git a/src/nvtt/tests/testsuite.cpp b/src/nvtt/tests/testsuite.cpp index f1d6e99..aceac51 100644 --- a/src/nvtt/tests/testsuite.cpp +++ b/src/nvtt/tests/testsuite.cpp @@ -341,7 +341,7 @@ int main(int argc, char *argv[]) setIndex = atoi(argv[i+1]); for (int j = 0; j < s_imageSetCount; j++) { - if (strCaseCmp(s_imageSets[j].name, argv[i+1]) == 0) { + if (strCaseDiff(s_imageSets[j].name, argv[i+1]) == 0) { setIndex = j; break; } diff --git a/src/nvtt/tools/assemble.cpp b/src/nvtt/tools/assemble.cpp index 9e2510e..2886454 100644 --- a/src/nvtt/tools/assemble.cpp +++ b/src/nvtt/tools/assemble.cpp @@ -96,7 +96,7 @@ int main(int argc, char *argv[]) return 1; } - if (nv::strCaseCmp(output.extension(), ".dds") != 0) + if (nv::strCaseDiff(output.extension(), ".dds") != 0) { //output.stripExtension(); output.append(".dds"); diff --git a/src/nvtt/tools/compress.cpp b/src/nvtt/tools/compress.cpp index 03af1de..c7d662d 100644 --- a/src/nvtt/tools/compress.cpp +++ b/src/nvtt/tools/compress.cpp @@ -376,7 +376,7 @@ int main(int argc, char *argv[]) // Set input options. nvtt::InputOptions inputOptions; - if (nv::strCaseCmp(input.extension(), ".dds") == 0) + if (nv::strCaseDiff(input.extension(), ".dds") == 0) { // Load surface. 
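alphaTestCoverage and scaleAlphaToCoverage gain an alpha_channel argument above, so coverage can be measured on any channel rather than only the fourth. As a reference for what these calls compute, here is a standalone sketch of the technique on a bare float array: count the fraction of texels passing the alpha test, then binary-search a scale factor that restores a desired coverage. This illustrates the general approach, not NVTT's implementation.

    #include <algorithm>
    #include <cstddef>

    // Fraction of texels whose alpha passes the test (alpha >= alphaRef).
    static float alpha_test_coverage(const float * alpha, size_t count, float alphaRef)
    {
        size_t pass = 0;
        for (size_t i = 0; i < count; ++i)
            if (alpha[i] >= alphaRef) ++pass;
        return count ? float(pass) / float(count) : 0.0f;
    }

    // Scale alpha so a mip level keeps roughly the same coverage as the top level.
    static void scale_alpha_to_coverage(float * alpha, size_t count, float desiredCoverage, float alphaRef)
    {
        float minScale = 0.0f, maxScale = 4.0f, scale = 1.0f;

        for (int step = 0; step < 10; ++step)   // a handful of bisection steps is enough
        {
            size_t pass = 0;
            for (size_t i = 0; i < count; ++i)
                if (std::min(alpha[i] * scale, 1.0f) >= alphaRef) ++pass;
            float coverage = count ? float(pass) / float(count) : 0.0f;

            if (coverage < desiredCoverage)      minScale = scale;
            else if (coverage > desiredCoverage) maxScale = scale;
            else                                 break;

            scale = 0.5f * (minScale + maxScale);
        }

        for (size_t i = 0; i < count; ++i)
            alpha[i] = std::min(alpha[i] * scale, 1.0f);
    }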
nv::DirectDrawSurface dds(input.str()); @@ -428,7 +428,7 @@ int main(int argc, char *argv[]) } else { - if (nv::strCaseCmp(input.extension(), ".exr") == 0 || nv::strCaseCmp(input.extension(), ".hdr") == 0) + if (nv::strCaseDiff(input.extension(), ".exr") == 0 || nv::strCaseDiff(input.extension(), ".hdr") == 0) { loadAsFloat = true; } @@ -519,6 +519,8 @@ int main(int argc, char *argv[]) nvtt::CompressionOptions compressionOptions; compressionOptions.setFormat(format); + //compressionOptions.setQuantization(/*color dithering*/true, /*alpha dithering*/false, /*binary alpha*/false); + if (format == nvtt::Format_BC2) { // Dither alpha when using BC2. compressionOptions.setQuantization(/*color dithering*/false, /*alpha dithering*/true, /*binary alpha*/false); @@ -539,6 +541,10 @@ int main(int argc, char *argv[]) // compressionOptions.setPixelFormat(16, 16, 16, 16); // compressionOptions.setPixelType(nvtt::PixelType_UnsignedNorm); // compressionOptions.setPixelFormat(16, 0, 0, 0); + + //compressionOptions.setQuantization(/*color dithering*/true, /*alpha dithering*/false, /*binary alpha*/false); + //compressionOptions.setPixelType(nvtt::PixelType_UnsignedNorm); + //compressionOptions.setPixelFormat(5, 6, 5, 0); } } diff --git a/src/nvtt/tools/imgdiff.cpp b/src/nvtt/tools/imgdiff.cpp index 4cb0bfc..9809cf4 100644 --- a/src/nvtt/tools/imgdiff.cpp +++ b/src/nvtt/tools/imgdiff.cpp @@ -37,7 +37,7 @@ static bool loadImage(nv::Image & image, const char * fileName) { - if (nv::strCaseCmp(nv::Path::extension(fileName), ".dds") == 0) + if (nv::strCaseDiff(nv::Path::extension(fileName), ".dds") == 0) { nv::DirectDrawSurface dds(fileName); if (!dds.isValid()) @@ -246,7 +246,7 @@ int main(int argc, char *argv[]) double g = float(c0.g - c1.g); double b = float(c0.b - c1.b); double a = float(c0.a - c1.a); - + error_r.addSample(r); error_g.addSample(g); error_b.addSample(b); diff --git a/src/nvtt/tools/resize.cpp b/src/nvtt/tools/resize.cpp index a22aa4b..c563c12 100644 --- a/src/nvtt/tools/resize.cpp +++ b/src/nvtt/tools/resize.cpp @@ -40,7 +40,7 @@ static bool loadImage(nv::Image & image, const char * fileName) { - if (nv::strCaseCmp(nv::Path::extension(fileName), ".dds") == 0) + if (nv::strCaseDiff(nv::Path::extension(fileName), ".dds") == 0) { nv::DirectDrawSurface dds(fileName); if (!dds.isValid())
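The commented-out calls added to compress.cpp above hint at emitting dithered 16-bit output through the uncompressed path. A hedged usage sketch of those calls, based only on the setQuantization/setPixelType/setPixelFormat declarations visible in nvtt.h, and keeping in mind the note that setQuantization is only honored through the InputOptions API:

    #include <nvtt/nvtt.h>

    // Illustrative only: configure dithered RGB565 output via nvtt's uncompressed pipeline.
    void setupRGB565(nvtt::CompressionOptions & compressionOptions)
    {
        // Dither the color channels before quantizing down to 5:6:5; alpha is left alone.
        compressionOptions.setQuantization(/*colorDithering*/ true,
                                           /*alphaDithering*/ false,
                                           /*binaryAlpha*/    false);

        // Interpret the output as unsigned-normalized 5:6:5 with no alpha bits.
        compressionOptions.setPixelType(nvtt::PixelType_UnsignedNorm);
        compressionOptions.setPixelFormat(5, 6, 5, 0);
    }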