From 3c0ab2d3f374483f73f85476d223ef1b7cbc40cc Mon Sep 17 00:00:00 2001 From: castano Date: Tue, 27 Sep 2011 17:48:46 +0000 Subject: [PATCH] Merge changes from the witness. --- project/vc9/nvthread/nvthread.vcproj | 346 +++++++++++++++++++++++++++ project/vc9/nvtt.sln | 25 ++ src/CMakeLists.txt | 1 + src/nvcore/Array.h | 12 +- src/nvcore/Debug.cpp | 88 ++++--- src/nvcore/Debug.h | 11 +- src/nvcore/DefsGnucDarwin.h | 4 +- src/nvcore/Memory.h | 14 +- src/nvcore/StdStream.h | 6 +- src/nvcore/StrLib.cpp | 26 +- src/nvcore/StrLib.h | 16 +- src/nvcore/Utils.h | 67 ++++++ src/nvcore/nvcore.h | 43 +++- src/nvimage/DirectDrawSurface.cpp | 2 +- src/nvimage/FloatImage.cpp | 2 +- src/nvimage/FloatImage.h | 3 +- src/nvimage/Image.h | 8 +- src/nvimage/ImageIO.cpp | 10 +- src/nvmath/Half.cpp | 162 +++++++++---- src/nvmath/Half.h | 5 +- src/nvmath/Matrix.h | 25 +- src/nvmath/Vector.h | 46 +++- src/nvmath/nvmath.h | 30 ++- src/nvthread/CMakeLists.txt | 26 ++ src/nvthread/Event.cpp | 52 ++++ src/nvthread/Event.h | 34 +++ src/nvthread/Mutex.cpp | 89 +++++++ src/nvthread/Mutex.h | 47 ++++ src/nvthread/ParallelFor.cpp | 61 +++++ src/nvthread/ParallelFor.h | 38 +++ src/nvthread/Thread.cpp | 136 +++++++++++ src/nvthread/Thread.h | 46 ++++ src/nvthread/ThreadPool.cpp | 121 ++++++++++ src/nvthread/ThreadPool.h | 49 ++++ src/nvthread/Win32.h | 9 + src/nvthread/nvthread.cpp | 51 ++++ src/nvthread/nvthread.h | 83 +++++++ src/nvtt/CompressorDX11.cpp | 4 +- src/nvtt/CompressorDX9.cpp | 8 +- src/nvtt/CompressorRGB.cpp | 2 +- src/nvtt/Context.cpp | 2 + src/nvtt/OutputOptions.cpp | 5 + src/nvtt/OutputOptions.h | 6 + src/nvtt/TaskDispatcher.h | 43 +++- src/nvtt/TexImage.cpp | 122 +++++++++- src/nvtt/nvtt.h | 6 + src/nvtt/tools/compress.cpp | 5 + 47 files changed, 1811 insertions(+), 186 deletions(-) create mode 100644 project/vc9/nvthread/nvthread.vcproj create mode 100644 src/nvthread/CMakeLists.txt create mode 100644 src/nvthread/Event.cpp create mode 100644 src/nvthread/Event.h create mode 100644 
src/nvthread/Mutex.cpp create mode 100644 src/nvthread/Mutex.h create mode 100644 src/nvthread/ParallelFor.cpp create mode 100644 src/nvthread/ParallelFor.h create mode 100644 src/nvthread/Thread.cpp create mode 100644 src/nvthread/Thread.h create mode 100644 src/nvthread/ThreadPool.cpp create mode 100644 src/nvthread/ThreadPool.h create mode 100644 src/nvthread/Win32.h create mode 100644 src/nvthread/nvthread.cpp create mode 100644 src/nvthread/nvthread.h diff --git a/project/vc9/nvthread/nvthread.vcproj b/project/vc9/nvthread/nvthread.vcproj new file mode 100644 index 0000000..e351e0b --- /dev/null +++ b/project/vc9/nvthread/nvthread.vcproj @@ -0,0 +1,346 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/project/vc9/nvtt.sln b/project/vc9/nvtt.sln index 82885e7..271b0b0 100644 --- a/project/vc9/nvtt.sln +++ b/project/vc9/nvtt.sln @@ -4,6 +4,7 @@ Microsoft Visual Studio Solution File, Format Version 10.00 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvtt", "nvtt\nvtt.vcproj", "{1AEB7681-57D8-48EE-813D-5C41CC38B647}" ProjectSection(ProjectDependencies) = postProject {CE017322-01FC-4851-9C8B-64E9A8E26C38} = {CE017322-01FC-4851-9C8B-64E9A8E26C38} + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB} = {3DD3A43D-C6EA-460F-821B-6C339A03C5BB} {F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D} {4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531} {C33787E3-5564-4834-9FE3-A9020455A669} = {C33787E3-5564-4834-9FE3-A9020455A669} @@ -88,6 +89,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "imperativeapi", "imperative EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bc6h", "bc6h\bc6h.vcproj", "{C33787E3-5564-4834-9FE3-A9020455A669}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvthread", 
"nvthread\nvthread.vcproj", "{3DD3A43D-C6EA-460F-821B-6C339A03C5BB}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug (no cuda)|Mixed Platforms = Debug (no cuda)|Mixed Platforms @@ -457,6 +460,28 @@ Global {C33787E3-5564-4834-9FE3-A9020455A669}.Release|Win32.Build.0 = Release|Win32 {C33787E3-5564-4834-9FE3-A9020455A669}.Release|x64.ActiveCfg = Release|x64 {C33787E3-5564-4834-9FE3-A9020455A669}.Release|x64.Build.0 = Release|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|Mixed Platforms.ActiveCfg = Debug|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|Mixed Platforms.Build.0 = Debug|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|Win32.ActiveCfg = Debug|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|x64.ActiveCfg = Debug|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug (no cuda)|x64.Build.0 = Debug|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|Mixed Platforms.ActiveCfg = Debug|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|Mixed Platforms.Build.0 = Debug|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|Win32.ActiveCfg = Debug|Win32 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|Win32.Build.0 = Debug|Win32 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|x64.ActiveCfg = Debug|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Debug|x64.Build.0 = Debug|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|Mixed Platforms.ActiveCfg = Release|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|Mixed Platforms.Build.0 = Release|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|Win32.ActiveCfg = Release|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|x64.ActiveCfg = Release|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release (no cuda)|x64.Build.0 = Release|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|Mixed Platforms.ActiveCfg = Release|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|Mixed Platforms.Build.0 = 
Release|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|Win32.ActiveCfg = Release|Win32 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|Win32.Build.0 = Release|Win32 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|x64.ActiveCfg = Release|x64 + {3DD3A43D-C6EA-460F-821B-6C339A03C5BB}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1e72b65..16fc8df 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -6,6 +6,7 @@ INCLUDE_DIRECTORIES(${NV_SOURCE_DIR}/extern/stb) SUBDIRS(nvcore) SUBDIRS(nvmath) SUBDIRS(nvimage) +SUBDIRS(nvthread) SUBDIRS(nvtt) # OpenGL diff --git a/src/nvcore/Array.h b/src/nvcore/Array.h index fd35c65..51afab6 100644 --- a/src/nvcore/Array.h +++ b/src/nvcore/Array.h @@ -78,8 +78,8 @@ namespace nv } template - bool find(const T & element, const T * restrict ptr, uint count, uint * index) { - for (uint i = 0; i < count; i++) { + bool find(const T & element, const T * restrict ptr, uint begin, uint end, uint * index) { + for (uint i = begin; i < end; i++) { if (ptr[i] == element) { if (index != NULL) *index = i; return true; @@ -257,15 +257,15 @@ namespace nv } /// Return true if element found. - NV_FORCEINLINE bool find(const T & element, uint * index) const + NV_FORCEINLINE bool find(const T & element, uint * indexPtr) const { - return find(element, 0, m_size, index); + return find(element, 0, m_size, indexPtr); } /// Return true if element found within the given range. - NV_FORCEINLINE bool find(const T & element, uint first, uint count, uint * index) const + NV_FORCEINLINE bool find(const T & element, uint begin, uint end, uint * indexPtr) const { - return ::nv::find(element, m_buffer + first, count, index); + return ::nv::find(element, m_buffer, begin, end, indexPtr); } /// Remove the element at the given index. This is an expensive operation! 
diff --git a/src/nvcore/Debug.cpp b/src/nvcore/Debug.cpp index bdde41c..f980c07 100644 --- a/src/nvcore/Debug.cpp +++ b/src/nvcore/Debug.cpp @@ -448,19 +448,6 @@ namespace /** Win32 assert handler. */ struct Win32AssertHandler : public AssertHandler { - // Code from Daniel Vogel. - static bool isDebuggerPresent() - { - HINSTANCE kernel32 = GetModuleHandle("kernel32.dll"); - if (kernel32) { - FARPROC IsDebuggerPresent = GetProcAddress(kernel32, "IsDebuggerPresent"); - if (IsDebuggerPresent != NULL && IsDebuggerPresent()) { - return true; - } - } - return false; - } - // Flush the message queue. This is necessary for the message box to show up. static void flushMessageQueue() { @@ -487,7 +474,7 @@ namespace nvDebug( error_string.str() ); } - if (isDebuggerPresent()) { + if (debug::isDebuggerPresent()) { return NV_ABORT_DEBUG; } @@ -522,15 +509,6 @@ namespace /** Xbox360 assert handler. */ struct Xbox360AssertHandler : public AssertHandler { - static bool isDebuggerPresent() - { -#ifdef _DEBUG - return DmIsDebuggerPresent() == TRUE; -#else - return false; -#endif - } - // Assert handler method. virtual int assertion( const char * exp, const char * file, int line, const char * func/*=NULL*/ ) { @@ -546,7 +524,7 @@ namespace nvDebug( error_string.str() ); } - if (isDebuggerPresent()) { + if (debug::isDebuggerPresent()) { return NV_ABORT_DEBUG; } @@ -563,26 +541,6 @@ namespace /** Unix assert handler. */ struct UnixAssertHandler : public AssertHandler { - bool isDebuggerPresent() - { -#if NV_OS_DARWIN - int mib[4]; - struct kinfo_proc info; - size_t size; - mib[0] = CTL_KERN; - mib[1] = KERN_PROC; - mib[2] = KERN_PROC_PID; - mib[3] = getpid(); - size = sizeof(info); - info.kp_proc.p_flag = 0; - sysctl(mib,4,&info,&size,NULL,0); - return ((info.kp_proc.p_flag & P_TRACED) == P_TRACED); -#else - // if ppid != sid, some process spawned our app, probably a debugger. - return getsid(getpid()) != getppid(); -#endif - } - // Assert handler method. 
virtual int assertion(const char * exp, const char * file, int line, const char * func) { @@ -594,7 +552,7 @@ namespace } #if _DEBUG - if (isDebuggerPresent()) { + if (debug::isDebuggerPresent()) { return NV_ABORT_DEBUG; } #endif @@ -702,7 +660,10 @@ void debug::enableSigHandler() // SYMOPT_DEFERRED_LOADS make us not take a ton of time unless we actual log traces SymSetOptions(SYMOPT_DEFERRED_LOADS|SYMOPT_FAIL_CRITICAL_ERRORS|SYMOPT_LOAD_LINES|SYMOPT_UNDNAME); - SymInitialize(GetCurrentProcess(), NULL, TRUE); + if (!SymInitialize(GetCurrentProcess(), NULL, TRUE)) { + DWORD error = GetLastError(); + nvDebug("SymInitialize returned error : %d\n", error); + } #elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H) @@ -743,3 +704,38 @@ void debug::disableSigHandler() #endif } + +bool debug::isDebuggerPresent() +{ +#if NV_OS_WIN32 + HINSTANCE kernel32 = GetModuleHandle("kernel32.dll"); + if (kernel32) { + FARPROC IsDebuggerPresent = GetProcAddress(kernel32, "IsDebuggerPresent"); + if (IsDebuggerPresent != NULL && IsDebuggerPresent()) { + return true; + } + } + return false; +#elif NV_OS_XBOX +#ifdef _DEBUG + return DmIsDebuggerPresent() == TRUE; +#else + return false; +#endif +#elif NV_OS_DARWIN + int mib[4]; + struct kinfo_proc info; + size_t size; + mib[0] = CTL_KERN; + mib[1] = KERN_PROC; + mib[2] = KERN_PROC_PID; + mib[3] = getpid(); + size = sizeof(info); + info.kp_proc.p_flag = 0; + sysctl(mib,4,&info,&size,NULL,0); + return ((info.kp_proc.p_flag & P_TRACED) == P_TRACED); +#else + // if ppid != sid, some process spawned our app, probably a debugger. + return getsid(getpid()) != getppid(); +#endif +} diff --git a/src/nvcore/Debug.h b/src/nvcore/Debug.h index 28c81d8..c6a0f76 100644 --- a/src/nvcore/Debug.h +++ b/src/nvcore/Debug.h @@ -10,6 +10,9 @@ # include // va_list #endif +// Make sure we are using our assert. 
+#undef assert + #define NV_ABORT_DEBUG 1 #define NV_ABORT_IGNORE 2 #define NV_ABORT_EXIT 3 @@ -116,12 +119,6 @@ #endif -#if __cplusplus > 199711L -#define nvStaticCheck(x) static_assert(x) -#else -#define nvStaticCheck(x) typedef char NV_DO_STRING_JOIN2(__static_assert_,__LINE__)[(x)] -#endif - NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL); NVCORE_API void NV_CDECL nvDebugPrint( const char *msg, ... ) __attribute__((format (printf, 1, 2))); @@ -166,6 +163,8 @@ namespace nv NVCORE_API void enableSigHandler(); NVCORE_API void disableSigHandler(); + + NVCORE_API bool isDebuggerPresent(); } } // nv namespace diff --git a/src/nvcore/DefsGnucDarwin.h b/src/nvcore/DefsGnucDarwin.h index 3826d81..79a47ff 100644 --- a/src/nvcore/DefsGnucDarwin.h +++ b/src/nvcore/DefsGnucDarwin.h @@ -2,7 +2,7 @@ #error "Do not include this file directly." #endif -//#include // uint8_t, int8_t, ... +#include // uint8_t, int8_t, ... uintptr_t #include // operator new, size_t, NULL // Function linkage @@ -67,4 +67,4 @@ typedef int64_t int64; // Aliases typedef uint32 uint; -*/ \ No newline at end of file +*/ diff --git a/src/nvcore/Memory.h b/src/nvcore/Memory.h index 27d7e3d..52f03fa 100644 --- a/src/nvcore/Memory.h +++ b/src/nvcore/Memory.h @@ -12,10 +12,10 @@ #include // new and delete -#if NV_CC_GNUC -# define NV_ALIGN_16 __attribute__ ((__aligned__ (16))) -#else -# define NV_ALIGN_16 __declspec(align(16)) +#if NV_CC_GNUC +# define NV_ALIGN_16 __attribute__ ((__aligned__ (16))) +#else +# define NV_ALIGN_16 __declspec(align(16)) #endif @@ -43,15 +43,15 @@ extern "C" { namespace nv { // C++ helpers. 
- template T * malloc(size_t count) { + template NV_FORCEINLINE T * malloc(size_t count) { return (T *)::malloc(sizeof(T) * count); } - template T * realloc(T * ptr, size_t count) { + template NV_FORCEINLINE T * realloc(T * ptr, size_t count) { return (T *)::realloc(ptr, sizeof(T) * count); } - template void free(const T * ptr) { + template NV_FORCEINLINE void free(const T * ptr) { ::free((void *)ptr); } diff --git a/src/nvcore/StdStream.h b/src/nvcore/StdStream.h index 50fcd84..79912f5 100644 --- a/src/nvcore/StdStream.h +++ b/src/nvcore/StdStream.h @@ -72,7 +72,7 @@ namespace nv #if NV_OS_WIN32 return _ftell_nolock(m_fp); #else - return ftell(m_fp); + return (uint)ftell(m_fp); #endif } @@ -85,9 +85,9 @@ namespace nv uint end = _ftell_nolock(m_fp); _fseek_nolock(m_fp, pos, SEEK_SET); #else - uint pos = ftell(m_fp); + uint pos = (uint)ftell(m_fp); fseek(m_fp, 0, SEEK_END); - uint end = ftell(m_fp); + uint end = (uint)ftell(m_fp); fseek(m_fp, pos, SEEK_SET); #endif return end; diff --git a/src/nvcore/StrLib.cpp b/src/nvcore/StrLib.cpp index 3fee676..55b305d 100644 --- a/src/nvcore/StrLib.cpp +++ b/src/nvcore/StrLib.cpp @@ -189,7 +189,7 @@ StringBuilder::StringBuilder() : m_size(0), m_str(NULL) } /** Preallocate space. */ -StringBuilder::StringBuilder( int size_hint ) : m_size(size_hint) +StringBuilder::StringBuilder( uint size_hint ) : m_size(size_hint) { nvDebugCheck(m_size > 0); m_str = strAlloc(m_size); @@ -203,9 +203,15 @@ StringBuilder::StringBuilder( const StringBuilder & s ) : m_size(0), m_str(NULL) } /** Copy string. */ -StringBuilder::StringBuilder( const char * s, int extra_size_hint/*=0*/ ) : m_size(0), m_str(NULL) +StringBuilder::StringBuilder(const char * s) : m_size(0), m_str(NULL) { - copy(s, extra_size_hint); + copy(s); +} + +/** Copy string. */ +StringBuilder::StringBuilder(const char * s, uint len) : m_size(0), m_str(NULL) +{ + copy(s, len); } /** Delete the string. 
*/ @@ -396,15 +402,25 @@ StringBuilder & StringBuilder::reserve( uint size_hint ) /** Copy a string safely. */ -StringBuilder & StringBuilder::copy( const char * s, int extra_size/*=0*/ ) +StringBuilder & StringBuilder::copy(const char * s) { nvCheck( s != NULL ); const uint str_size = uint(strlen( s )) + 1; - reserve(str_size + extra_size); + reserve(str_size); memcpy(m_str, s, str_size); return *this; } +/** Copy a string safely. */ +StringBuilder & StringBuilder::copy(const char * s, uint len) +{ + nvCheck( s != NULL ); + const uint str_size = len + 1; + reserve(str_size); + strCpy(m_str, str_size, s, len); + return *this; +} + /** Copy an StringBuilder. */ StringBuilder & StringBuilder::copy( const StringBuilder & s ) diff --git a/src/nvcore/StrLib.h b/src/nvcore/StrLib.h index 8b9fc8e..65fb759 100644 --- a/src/nvcore/StrLib.h +++ b/src/nvcore/StrLib.h @@ -59,9 +59,10 @@ namespace nv public: StringBuilder(); - explicit StringBuilder( int size_hint ); - StringBuilder( const char * str, int extra_size_hint = 0); - StringBuilder( const StringBuilder & ); + explicit StringBuilder( uint size_hint ); + StringBuilder(const char * str); + StringBuilder(const char * str, uint len); + StringBuilder(const StringBuilder & other); ~StringBuilder(); @@ -75,9 +76,10 @@ namespace nv StringBuilder & number( int i, int base = 10 ); StringBuilder & number( uint i, int base = 10 ); - StringBuilder & reserve( uint size_hint ); - StringBuilder & copy( const char * str, int extra_size/*=0*/ ); - StringBuilder & copy( const StringBuilder & str ); + StringBuilder & reserve(uint size_hint); + StringBuilder & copy(const char * str); + StringBuilder & copy(const char * str, uint len); + StringBuilder & copy(const StringBuilder & str); StringBuilder & toLower(); StringBuilder & toUpper(); @@ -145,7 +147,7 @@ namespace nv public: Path() : StringBuilder() {} explicit Path(int size_hint) : StringBuilder(size_hint) {} - Path(const char * str, int extra_size_hint = 0) : StringBuilder(str, 
extra_size_hint) {} + Path(const char * str) : StringBuilder(str) {} Path(const Path & path) : StringBuilder(path) {} const char * fileName() const; diff --git a/src/nvcore/Utils.h b/src/nvcore/Utils.h index a5d760b..6c11185 100644 --- a/src/nvcore/Utils.h +++ b/src/nvcore/Utils.h @@ -7,9 +7,76 @@ #include "nvcore.h" #include "Debug.h" // nvDebugCheck +// Just in case. Grrr. +#undef min +#undef max + namespace nv { + // Less error prone than casting. From CB: + // http://cbloomrants.blogspot.com/2011/06/06-17-11-c-casting-is-devil.html + inline int8 asSigned(uint8 x) { return (int8) x; } + inline int16 asSigned(uint16 x) { return (int16) x; } + inline int32 asSigned(uint32 x) { return (int32) x; } + inline int64 asSigned(uint64 x) { return (int64) x; } + inline uint8 asUnsigned(int8 x) { return (uint8) x; } + inline uint16 asUnsigned(int16 x) { return (uint16) x; } + inline uint32 asUnsigned(int32 x) { return (uint32) x; } + inline uint64 asUnsigned(int64 x) { return (uint64) x; } + + /* + template inline int8 toI8(T x) { + nvDebugCheck(x <= INT8_MAX); + nvDebugCheck(x >= INT8_MIN); + int8 y = (int8) x; + nvDebugCheck(x == (T)y); + return y; + } + + template inline uint8 toU8(T x) { + nvDebugCheck(x <= UINT8_MAX); + nvDebugCheck(x >= 0); + return (uint8) x; + } + + template inline int16 toI16(T x) { + nvDebugCheck(x <= INT16_MAX); + nvDebugCheck(x >= INT16_MIN); + return (int16) x; + } + + template inline uint16 toU16(T x) { + nvDebugCheck(x <= UINT16_MAX); + nvDebugCheck(x >= 0); + return (uint16) x; + } + + template inline int32 toI32(T x) { + nvDebugCheck(x <= INT32_MAX); + nvDebugCheck(x >= INT32_MIN); + return (int32) x; + } + + template inline uint32 toU32(T x) { + nvDebugCheck(x <= UINT32_MAX); + nvDebugCheck(x >= 0); + return (uint32) x; + } + + template inline int64 toI64(T x) { + nvDebugCheck(x <= INT64_MAX); + nvDebugCheck(x >= INT64_MIN); + return (int64) x; + } + + template inline uint64 toU64(T x) { + nvDebugCheck(x <= UINT64_MAX); + nvDebugCheck(x >= 
0); + return (uint64) x; + } + */ + /// Swap two values. template inline void swap(T & a, T & b) diff --git a/src/nvcore/nvcore.h b/src/nvcore/nvcore.h index 07050c4..ab2a81a 100644 --- a/src/nvcore/nvcore.h +++ b/src/nvcore/nvcore.h @@ -4,9 +4,6 @@ #ifndef NV_CORE_H #define NV_CORE_H -// cmake config -#include - // Function linkage #if NVCORE_SHARED #ifdef NVCORE_EXPORTS @@ -91,7 +88,11 @@ // @@ NV_CC_MSVC7 // @@ NV_CC_MSVC8 -#if defined POSH_COMPILER_GCC +#if defined POSH_COMPILER_CLANG +# define NV_CC_CLANG 1 +# define NV_CC_GNUC 1 // Clang is compatible with GCC, so take the same NV_CC_GNUC code paths. +# define NV_CC_STRING "clang" +#elif defined POSH_COMPILER_GCC # define NV_CC_GNUC 1 # define NV_CC_STRING "gcc" #elif defined POSH_COMPILER_MSVC @@ -108,6 +109,18 @@ #define NV_ENDIAN_STRING POSH_ENDIAN_STRING +// Define the right printf prefix for size_t arguments: +#if POSH_64BIT_POINTER +# define NV_SIZET_PRINTF_PREFIX POSH_I64_PRINTF_PREFIX +#else +# define NV_SIZET_PRINTF_PREFIX +#endif + + +// cmake config +#include "nvconfig.h" + + // Type definitions: typedef posh_u8_t uint8; typedef posh_i8_t int8; @@ -144,6 +157,8 @@ typedef uint32 uint; private: \ void *operator new(size_t size); \ void *operator new[](size_t size); + //static void *operator new(size_t size); \ + //static void *operator new[](size_t size); // String concatenation macros. #define NV_STRING_JOIN2(arg1, arg2) NV_DO_STRING_JOIN2(arg1, arg2) @@ -153,6 +168,25 @@ typedef uint32 uint; #define NV_STRING2(x) #x #define NV_STRING(x) NV_STRING2(x) + +#if __cplusplus > 199711L +#define nvStaticCheck(x) static_assert((x), #x) +#else +#define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x) ? 1 : -1] +#endif +#define NV_COMPILER_CHECK(x) nvStaticCheck(x) // I like this name best. + +// Make sure type definitions are fine. 
+NV_COMPILER_CHECK(sizeof(int8) == 1); +NV_COMPILER_CHECK(sizeof(uint8) == 1); +NV_COMPILER_CHECK(sizeof(int16) == 2); +NV_COMPILER_CHECK(sizeof(uint16) == 2); +NV_COMPILER_CHECK(sizeof(int32) == 4); +NV_COMPILER_CHECK(sizeof(uint32) == 4); +NV_COMPILER_CHECK(sizeof(int64) == 8); +NV_COMPILER_CHECK(sizeof(uint64) == 8); + + #define NV_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) #if 1 @@ -180,6 +214,7 @@ typedef uint32 uint; // Null index. @@ Move this somewhere else... it's only used by nvmesh. //const unsigned int NIL = unsigned int(~0); +//#define NIL uint(~0) // Null pointer. #ifndef NULL diff --git a/src/nvimage/DirectDrawSurface.cpp b/src/nvimage/DirectDrawSurface.cpp index f67644b..466d397 100644 --- a/src/nvimage/DirectDrawSurface.cpp +++ b/src/nvimage/DirectDrawSurface.cpp @@ -1418,7 +1418,7 @@ uint DirectDrawSurface::mipmapSize(uint mipmap) const { nvDebugCheck((header.pf.flags & DDPF_RGB) || (header.pf.flags & DDPF_LUMINANCE)); - uint pitch = computeBytePitch(w, header.pf.bitcount, 8); // Asuming 8 bit alignment, which is the same D3DX expects. + uint pitch = computeBytePitch(w, header.pf.bitcount, 1); // Assuming 1 byte alignment, which is the same D3DX expects. 
return pitch * h * d; } diff --git a/src/nvimage/FloatImage.cpp b/src/nvimage/FloatImage.cpp index 996b6ea..b25c7d6 100644 --- a/src/nvimage/FloatImage.cpp +++ b/src/nvimage/FloatImage.cpp @@ -181,7 +181,7 @@ void FloatImage::normalize(uint baseComponent) for (uint i = 0; i < count; i++) { Vector3 normal(xChannel[i], yChannel[i], zChannel[i]); - normal = normalizeSafe(normal, Vector3(zero), 0.0f); + normal = normalizeSafe(normal, Vector3(0), 0.0f); xChannel[i] = normal.x; yChannel[i] = normal.y; diff --git a/src/nvimage/FloatImage.h b/src/nvimage/FloatImage.h index f22c6c2..712d1c6 100644 --- a/src/nvimage/FloatImage.h +++ b/src/nvimage/FloatImage.h @@ -56,6 +56,7 @@ namespace nv //@{ NVIMAGE_API void clear(float f = 0.0f); NVIMAGE_API void clear(uint component, float f = 0.0f); + NVIMAGE_API void copyChannel(uint src, uint dst); NVIMAGE_API void normalize(uint base_component); @@ -113,8 +114,6 @@ namespace nv uint pixelCount() const { return m_pixelCount; } - // @@ It would make sense to swap the order of the arguments so that 'c' is always first. - /** @name Pixel access. 
*/ //@{ const float * channel(uint c) const; diff --git a/src/nvimage/Image.h b/src/nvimage/Image.h index 7d44a4a..9161e57 100644 --- a/src/nvimage/Image.h +++ b/src/nvimage/Image.h @@ -70,14 +70,14 @@ namespace nv inline const Color32 & Image::pixel(uint x, uint y) const { - nvDebugCheck(x < width() && y < height()); - return pixel(y * width() + x); + nvDebugCheck(x < m_width && y < m_height); + return pixel(y * m_width + x); } inline Color32 & Image::pixel(uint x, uint y) { - nvDebugCheck(x < width() && y < height()); - return pixel(y * width() + x); + nvDebugCheck(x < m_width && y < m_height); + return pixel(y * m_width + x); } } // nv namespace diff --git a/src/nvimage/ImageIO.cpp b/src/nvimage/ImageIO.cpp index b80bc3e..4a7d3ff 100644 --- a/src/nvimage/ImageIO.cpp +++ b/src/nvimage/ImageIO.cpp @@ -215,7 +215,7 @@ FloatImage * nv::ImageIO::loadFloat(const char * fileName) StdInputStream stream(fileName); if (stream.isError()) { - return false; + return NULL; } return loadFloat(fileName, stream); @@ -324,9 +324,9 @@ bool nv::ImageIO::saveFloat(const char * fileName, Stream & s, const FloatImage bool nv::ImageIO::saveFloat(const char * fileName, const FloatImage * fimage, uint baseComponent, uint componentCount) { +#if !defined(HAVE_FREEIMAGE) const char * extension = Path::extension(fileName); -#if !defined(HAVE_FREEIMAGE) #if defined(HAVE_OPENEXR) if (strCaseCmp(extension, ".exr") == 0) { return saveFloatEXR(fileName, fimage, baseComponent, componentCount); @@ -711,7 +711,7 @@ Image * nv::ImageIO::loadTGA(Stream & s) case TGA_TYPE_INDEXED: if( tga.colormap_type!=1 || tga.colormap_size!=24 || tga.colormap_length>256 ) { nvDebug( "*** loadTGA: Error, only 24bit paletted images are supported.\n" ); - return false; + return NULL; } pal = true; break; @@ -732,7 +732,7 @@ Image * nv::ImageIO::loadTGA(Stream & s) default: nvDebug( "*** loadTGA: Error, unsupported image type.\n" ); - return false; + return NULL; } const uint pixel_size = (tga.pixel_size/8); @@ -1369,7 
+1369,7 @@ Image * nv::ImageIO::loadJPG(Stream & s) // Read the entire file. Array byte_array; byte_array.resize(s.size()); - s.serialize(byte_array.mutableBuffer(), s.size()); + s.serialize(byte_array.buffer(), s.size()); jpeg_decompress_struct cinfo; jpeg_error_mgr jerr; diff --git a/src/nvmath/Half.cpp b/src/nvmath/Half.cpp index 057bbaf..b76794e 100644 --- a/src/nvmath/Half.cpp +++ b/src/nvmath/Half.cpp @@ -487,46 +487,126 @@ nv::half_to_float( uint16 h ) return (f_result); } -uint32 -nv::fast_half_to_float( uint16 h ) -{ - const uint32 h_e_mask = _uint32_li( 0x00007c00 ); - const uint32 h_m_mask = _uint32_li( 0x000003ff ); - const uint32 h_s_mask = _uint32_li( 0x00008000 ); - const uint32 h_f_s_pos_offset = _uint32_li( 0x00000010 ); - const uint32 h_f_e_pos_offset = _uint32_li( 0x0000000d ); - const uint32 h_f_bias_offset = _uint32_li( 0x0001c000 ); - const uint32 f_e_mask = _uint32_li( 0x7f800000 ); - const uint32 f_m_mask = _uint32_li( 0x007fffff ); - const uint32 h_f_e_denorm_bias = _uint32_li( 0x0000007e ); - const uint32 h_f_m_denorm_sa_bias = _uint32_li( 0x00000008 ); - const uint32 f_e_pos = _uint32_li( 0x00000017 ); - const uint32 h_e_mask_minus_one = _uint32_li( 0x00007bff ); - const uint32 h_e = _uint32_and( h, h_e_mask ); - const uint32 h_m = _uint32_and( h, h_m_mask ); - const uint32 h_s = _uint32_and( h, h_s_mask ); - const uint32 h_e_f_bias = _uint32_add( h_e, h_f_bias_offset ); - const uint32 h_m_nlz = _uint32_cntlz( h_m ); - const uint32 f_s = _uint32_sll( h_s, h_f_s_pos_offset ); - const uint32 f_e = _uint32_sll( h_e_f_bias, h_f_e_pos_offset ); - const uint32 f_m = _uint32_sll( h_m, h_f_e_pos_offset ); - const uint32 f_em = _uint32_or( f_e, f_m ); - const uint32 h_f_m_sa = _uint32_sub( h_m_nlz, h_f_m_denorm_sa_bias ); - const uint32 f_e_denorm_unpacked = _uint32_sub( h_f_e_denorm_bias, h_f_m_sa ); - const uint32 h_f_m = _uint32_sll( h_m, h_f_m_sa ); - const uint32 f_m_denorm = _uint32_and( h_f_m, f_m_mask ); - const uint32 f_e_denorm = 
_uint32_sll( f_e_denorm_unpacked, f_e_pos ); - const uint32 f_em_denorm = _uint32_or( f_e_denorm, f_m_denorm ); - const uint32 f_em_nan = _uint32_or( f_e_mask, f_m ); - const uint32 is_e_eqz_msb = _uint32_dec( h_e ); - const uint32 is_m_nez_msb = _uint32_neg( h_m ); - const uint32 is_e_flagged_msb = _uint32_sub( h_e_mask_minus_one, h_e ); - const uint32 is_zero_msb = _uint32_andc( is_e_eqz_msb, is_m_nez_msb ); - const uint32 is_denorm_msb = _uint32_and( is_m_nez_msb, is_e_eqz_msb ); - const uint32 is_zero = _uint32_ext( is_zero_msb ); - const uint32 f_zero_result = _uint32_andc( f_em, is_zero ); - const uint32 f_denorm_result = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result ); - const uint32 f_result = _uint32_or( f_s, f_denorm_result ); - return (f_result); +// @@ These tables could be smaller. +static uint32 mantissa_table[2048]; +static uint32 exponent_table[64]; +static uint32 offset_table[64]; + +void nv::half_init_tables() +{ + // Init mantissa table. + mantissa_table[0] = 0; + + for (int i = 1; i < 1024; i++) { + uint m = i << 13; + uint e = 0; + + while ((m & 0x00800000) == 0) { + e -= 0x00800000; + m <<= 1; + } + m &= ~0x00800000; + e += 0x38800000; + mantissa_table[i] = m | e; + } + + for (int i = 1024; i < 2048; i++) { + mantissa_table[i] = 0x38000000 + ((i - 1024) << 13); + } + + + // Init exponent table. + exponent_table[0] = 0; + + for (int i = 1; i < 31; i++) { + exponent_table[i] = (i << 23); + } + + exponent_table[31] = 0x47800000; + exponent_table[32] = 0x80000000; + + for (int i = 33; i < 63; i++) { + exponent_table[i] = 0x80000000 + ((i - 32) << 23); + } + + exponent_table[63] = 0xC7800000; + + + // Init offset table. 
+ offset_table[0] = 0; + + for (int i = 1; i < 32; i++) { + offset_table[i] = 1024; + } + + offset_table[32] = 0; + + for (int i = 33; i < 64; i++) { + offset_table[i] = 1024; + } + + /*for (int i = 0; i < 64; i++) { + offset_table[i] = ((i & 31) != 0) * 1024; + }*/ } + +// Fast half to float conversion based on: +// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf +uint32 nv::fast_half_to_float(uint16 h) +{ + uint exp = h >> 10; + return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp]; +} + + +#if 0 +// Inaccurate conversion suggested at the ffmpeg mailing list: +// http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html +uint32 nv::fast_half_to_float(uint16 v) +{ + if (v & 0x8000) return 0; + uint exp = v >> 10; + if (!exp) return (v>>9)&1; + if (exp >= 15) return 0xffff; + v <<= 6; + return (v+(1<<16)) >> (15-exp); +} + +#endif + +#if 0 + +// Some more from a gamedev thread: +// http://www.devmaster.net/forums/showthread.php?t=10924 + +// I believe it does not handle specials either. + +// Mike Acton's code should be fairly easy to vectorize and that would handle all cases too, the table method might still be faster, though. 
+ + +static __declspec(align(16)) unsigned half_sign[4] = {0x00008000, 0x00008000, 0x00008000, 0x00008000}; +static __declspec(align(16)) unsigned half_exponent[4] = {0x00007C00, 0x00007C00, 0x00007C00, 0x00007C00}; +static __declspec(align(16)) unsigned half_mantissa[4] = {0x000003FF, 0x000003FF, 0x000003FF, 0x000003FF}; +static __declspec(align(16)) unsigned half_bias_offset[4] = {0x0001C000, 0x0001C000, 0x0001C000, 0x0001C000}; + +__asm +{ + movaps xmm1, xmm0 // Input in xmm0 + movaps xmm2, xmm0 + + andps xmm0, half_sign + andps xmm1, half_exponent + andps xmm2, half_mantissa + paddd xmm1, half_bias_offset + + pslld xmm0, 16 + pslld xmm1, 13 + pslld xmm2, 13 + + orps xmm1, xmm2 + orps xmm0, xmm1 // Result in xmm0 +} + + +#endif \ No newline at end of file diff --git a/src/nvmath/Half.h b/src/nvmath/Half.h index 2341921..08f8f11 100644 --- a/src/nvmath/Half.h +++ b/src/nvmath/Half.h @@ -9,8 +9,9 @@ namespace nv { uint32 half_to_float( uint16 h ); uint16 half_from_float( uint32 f ); - // Does not handle NaN or infinity. 
- uint32 fast_half_to_float( uint16 h ); + void half_init_tables(); + // half_init_tables() must be called once before fast_half_to_float(); it fills the lookup tables the conversion reads. + uint32 fast_half_to_float(uint16 h); inline uint16 to_half(float c) { union { float f; uint32 u; } f; diff --git a/src/nvmath/Matrix.h b/src/nvmath/Matrix.h index 1bf182d..5bd2cab 100644 --- a/src/nvmath/Matrix.h +++ b/src/nvmath/Matrix.h @@ -9,15 +9,14 @@ namespace nv { - enum zero_t { zero }; enum identity_t { identity }; class NVMATH_CLASS Matrix3 { public: Matrix3(); - Matrix3(zero_t); - Matrix3(identity_t); + explicit Matrix3(float f); + explicit Matrix3(identity_t); Matrix3(const Matrix3 & m); Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2); @@ -41,10 +40,10 @@ namespace nv inline Matrix3::Matrix3() {} - inline Matrix3::Matrix3(zero_t) + inline Matrix3::Matrix3(float f) { for(int i = 0; i < 9; i++) { - m_data[i] = 0.0f; + m_data[i] = f; } } @@ -204,11 +203,11 @@ namespace nv typedef Matrix const & Arg; Matrix(); - Matrix(zero_t); - Matrix(identity_t); + explicit Matrix(float f); + explicit Matrix(identity_t); Matrix(const Matrix & m); Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3); - Matrix(const scalar m[]); // m is assumed to contain 16 elements + //explicit Matrix(const scalar m[]); // m is assumed to contain 16 elements scalar data(uint idx) const; scalar & data(uint idx); @@ -237,7 +236,7 @@ namespace nv { } - inline Matrix::Matrix(zero_t) + inline Matrix::Matrix(float f) { for(int i = 0; i < 16; i++) { - m_data[i] = 0.0f; + m_data[i] = f; @@ -268,12 +267,12 @@ namespace nv m_data[12] = v3.x; m_data[13] = v3.y; m_data[14] = v3.z; m_data[15] = v3.w; } - inline Matrix::Matrix(const scalar m[]) + /*inline Matrix::Matrix(const scalar m[]) { for(int i = 0; i < 16; i++) { m_data[i] = m[i]; } - } + }*/ // Accessors @@ -456,7 +455,7 @@ namespace nv /// Get frustum matrix. 
inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear, scalar zFar) { - Matrix m(zero); + Matrix m(0.0f); scalar doubleznear = 2.0f * zNear; scalar one_deltax = 1.0f / (xmax - xmin); @@ -477,7 +476,7 @@ namespace nv /// Get infinite frustum matrix. inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear) { - Matrix m(zero); + Matrix m(0.0f); scalar doubleznear = 2.0f * zNear; scalar one_deltax = 1.0f / (xmax - xmin); diff --git a/src/nvmath/Vector.h b/src/nvmath/Vector.h index dc593aa..c94e306 100644 --- a/src/nvmath/Vector.h +++ b/src/nvmath/Vector.h @@ -100,6 +100,7 @@ namespace nv explicit Vector4(scalar x); Vector4(scalar x, scalar y, scalar z, scalar w); Vector4(Vector2::Arg v, scalar z, scalar w); + Vector4(Vector2::Arg v, Vector2::Arg u); Vector4(Vector3::Arg v, scalar w); Vector4(Vector4::Arg v); // Vector4(const Quaternion & v); @@ -107,6 +108,7 @@ namespace nv const Vector4 & operator=(Vector4::Arg v); Vector2 xy() const; + Vector2 zw() const; Vector3 xyz() const; const scalar * ptr() const; @@ -290,6 +292,7 @@ namespace nv inline Vector4::Vector4(scalar f) : x(f), y(f), z(f), w(f) {} inline Vector4::Vector4(scalar x, scalar y, scalar z, scalar w) : x(x), y(y), z(z), w(w) {} inline Vector4::Vector4(Vector2::Arg v, scalar z, scalar w) : x(v.x), y(v.y), z(z), w(w) {} + inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {} inline Vector4::Vector4(Vector3::Arg v, scalar w) : x(v.x), y(v.y), z(v.z), w(w) {} inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {} @@ -307,6 +310,11 @@ namespace nv return Vector2(x, y); } + inline Vector2 Vector4::zw() const + { + return Vector2(z, w); + } + inline Vector3 Vector4::xyz() const { return Vector3(x, y, z); @@ -469,6 +477,14 @@ namespace nv return scale(v, 1.0f / l); } + // Safe, branchless normalization from Andy Firth. All error checking omitted. 
+ // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/ + inline Vector2 normalizeFast(Vector2::Arg v) + { + const float very_small_float = 1.0e-037f; + float l = very_small_float + length(v); + return scale(v, 1.0f / l); + } inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON) { @@ -498,6 +514,14 @@ namespace nv return vf; } + inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c) + { + Vector2 v0 = a - c; + Vector2 v1 = b - c; + + return (v0.x * v1.y - v0.y * v1.x); + } + // Vector3 @@ -570,10 +594,10 @@ namespace nv return scale(v, 1.0f/s); } - inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, scalar s) + /*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, scalar s) { return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s); - } + }*/ inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, scalar t) { @@ -624,6 +648,15 @@ namespace nv return scale(v, 1.0f / l); } + // Safe, branchless normalization from Andy Firth. All error checking omitted. + // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/ + inline Vector3 normalizeFast(Vector3::Arg v) + { + const float very_small_float = 1.0e-037f; + float l = very_small_float + length(v); + return scale(v, 1.0f / l); + } + inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON) { return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon); @@ -762,6 +795,15 @@ namespace nv return scale(v, 1.0f / l); } + // Safe, branchless normalization from Andy Firth. All error checking omitted. 
+ // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/ + inline Vector4 normalizeFast(Vector4::Arg v) + { + const float very_small_float = 1.0e-037f; + float l = very_small_float + length(v); + return scale(v, 1.0f / l); + } + inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON) { return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon); diff --git a/src/nvmath/nvmath.h b/src/nvmath/nvmath.h index 9b8a2b4..8163eb3 100644 --- a/src/nvmath/nvmath.h +++ b/src/nvmath/nvmath.h @@ -4,8 +4,9 @@ #ifndef NV_MATH_H #define NV_MATH_H -#include -#include +#include "nvcore/nvcore.h" +#include "nvcore/Debug.h" +#include "nvcore/Utils.h" // clamp #include #include // INT_MAX @@ -194,7 +195,7 @@ namespace nv return f - floor(f); } - inline float fround(float f) + inline float fround(float f) // @@ rename floatRound { // @@ Do something better. return float(iround(f)); @@ -210,6 +211,29 @@ namespace nv } } + inline float saturate(float f) { + return clamp(f, 0.0f, 1.0f); + } + + inline float linearstep(float edge0, float edge1, float x) { + // Scale, bias and saturate x to 0..1 range + return saturate((x - edge0) / (edge1 - edge0)); + } + + inline float smoothstep(float edge0, float edge1, float x) { + x = linearstep(edge0, edge1, x); + + // Evaluate polynomial + return x*x*(3 - 2*x); + } + + inline int sign(float a) + { + if (a > 0.0f) return 1; + if (a < 0.0f) return -1; + return 0; + } + } // nv #endif // NV_MATH_H diff --git a/src/nvthread/CMakeLists.txt b/src/nvthread/CMakeLists.txt new file mode 100644 index 0000000..435141c --- /dev/null +++ b/src/nvthread/CMakeLists.txt @@ -0,0 +1,26 @@ +PROJECT(nvthreads) + +SET(THREADS_SRCS + nvthreads.h + Mutex.h Mutex.cpp + SpinWaiter.h SpinWaiter.cpp + Thread.h Thread.cpp + ThreadLocalStorage.h ThreadLocalStorage.cpp) + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# targets +ADD_DEFINITIONS(-DNVTHREADS_EXPORTS) + 
+IF(NVTHREADS_SHARED) + ADD_LIBRARY(nvthreads SHARED ${THREADS_SRCS}) +ELSE(NVTHREADS_SHARED) + ADD_LIBRARY(nvthreads ${THREADS_SRCS}) +ENDIF(NVTHREADS_SHARED) + +TARGET_LINK_LIBRARIES(nvthreads ${LIBS} nvcore) + +INSTALL(TARGETS nvthreads + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib/static) diff --git a/src/nvthread/Event.cpp b/src/nvthread/Event.cpp new file mode 100644 index 0000000..d39f54c --- /dev/null +++ b/src/nvthread/Event.cpp @@ -0,0 +1,52 @@ +// This code is in the public domain -- castano@gmail.com + +#include "Event.h" + +#if NV_OS_WIN32 +#include "Win32.h" +#elif NV_OS_UNIX +#include +#endif + +using namespace nv; + +#if NV_OS_WIN32 + +struct Event::Private { + HANDLE handle; +}; + +Event::Event() : m(new Private) { + m->handle = CreateEvent(NULL, FALSE, FALSE, NULL); +} + +Event::~Event() { + CloseHandle(m->handle); +} + +void Event::post() { + SetEvent(m->handle); +} + +void Event::wait() { + WaitForSingleObject(m->handle, INFINITE); +} + + +/*static*/ void Event::post(Event * events, uint count) { + for (uint i = 0; i < count; i++) { + events[i].post(); + } +} + +/*static*/ void Event::wait(Event * events, uint count) { + // @@ Use wait for multiple objects? + + for (uint i = 0; i < count; i++) { + events[i].wait(); + } +} + +#elif NV_OS_UNIX + // @@ +#endif diff --git a/src/nvthread/Event.h b/src/nvthread/Event.h new file mode 100644 index 0000000..c8ff1d0 --- /dev/null +++ b/src/nvthread/Event.h @@ -0,0 +1,34 @@ +// This code is in the public domain -- castano@gmail.com + +#pragma once +#ifndef NV_THREAD_EVENT_H +#define NV_THREAD_EVENT_H + +#include "nvthread.h" + +#include "nvcore/Ptr.h" + +namespace nv +{ + // This is intended to be used by a single waiter thread. + class NVTHREAD_CLASS Event + { + NV_FORBID_COPY(Event); + public: + Event(); + ~Event(); + + void post(); + void wait(); // Wait resets the event. 
+ + static void post(Event * events, uint count); + static void wait(Event * events, uint count); + + private: + struct Private; + AutoPtr m; + }; + +} // nv namespace + +#endif // NV_THREAD_EVENT_H diff --git a/src/nvthread/Mutex.cpp b/src/nvthread/Mutex.cpp new file mode 100644 index 0000000..698c879 --- /dev/null +++ b/src/nvthread/Mutex.cpp @@ -0,0 +1,89 @@ +// This code is in the public domain -- castano@gmail.com + +#include "Mutex.h" + +#if NV_OS_WIN32 + +#include "Win32.h" + +#elif NV_OS_UNIX + +#include +#include // EBUSY + +#endif // NV_OS + +using namespace nv; + + +#if NV_OS_WIN32 + +struct Mutex::Private { + CRITICAL_SECTION mutex; +}; + + +Mutex::Mutex () : m(new Private) +{ + InitializeCriticalSection(&m->mutex); +} + +Mutex::~Mutex () +{ + DeleteCriticalSection(&m->mutex); +} + +void Mutex::lock() +{ + EnterCriticalSection(&m->mutex); +} + +bool Mutex::tryLock() +{ + return TryEnterCriticalSection(&m->mutex) != 0; +} + +void Mutex::unlock() +{ + LeaveCriticalSection(&m->mutex); +} + +#elif NV_OS_UNIX + +struct Mutex::Private { + pthread_mutex_t mutex; +}; + + +Mutex::Mutex () : m(new Private) +{ + int result = pthread_mutex_init(&m->mutex , NULL); + nvDebugCheck(result == 0); +} + +Mutex::~Mutex () +{ + int result = pthread_mutex_destroy(&m->mutex); + nvDebugCheck(result == 0); +} + +void Mutex::lock() +{ + int result = pthread_mutex_lock(&m->mutex); + nvDebugCheck(result == 0); +} + +bool Mutex::tryLock() +{ + int result = pthread_mutex_trylock(&m->mutex); + nvDebugCheck(result == 0 || result == EBUSY); + return result == 0; +} + +void Mutex::unlock() +{ + int result = pthread_mutex_unlock(&m->mutex); + nvDebugCheck(result == 0); +} + +#endif // NV_OS \ No newline at end of file diff --git a/src/nvthread/Mutex.h b/src/nvthread/Mutex.h new file mode 100644 index 0000000..841fc3d --- /dev/null +++ b/src/nvthread/Mutex.h @@ -0,0 +1,47 @@ +// This code is in the public domain -- castano@gmail.com + +#pragma once +#ifndef NV_THREAD_MUTEX_H +#define 
NV_THREAD_MUTEX_H + +#include "nvthread.h" + +#include "nvcore/Ptr.h" + +namespace nv +{ + + class NVTHREAD_CLASS Mutex + { + NV_FORBID_COPY(Mutex); + public: + Mutex (); + ~Mutex (); + + void lock(); + bool tryLock(); + void unlock(); + + private: + struct Private; + AutoPtr m; + }; + + + // Templated lock that can be used with any mutex. + template + class Lock + { + NV_FORBID_COPY(Lock); + public: + + Lock (M & m) : m_mutex (m) { m_mutex.lock(); } + ~Lock () { m_mutex.unlock(); } + + private: + M & m_mutex; + }; + +} // nv namespace + +#endif // NV_THREAD_MUTEX_H diff --git a/src/nvthread/ParallelFor.cpp b/src/nvthread/ParallelFor.cpp new file mode 100644 index 0000000..fe15416 --- /dev/null +++ b/src/nvthread/ParallelFor.cpp @@ -0,0 +1,61 @@ +// This code is in the public domain -- Ignacio Castaño + +#include "ParallelFor.h" +#include "Thread.h" +#include "Atomic.h" +#include "ThreadPool.h" + +using namespace nv; + +#define ENABLE_PARALLEL_FOR 1 + + +void worker(void * arg) { + ParallelFor * owner = (ParallelFor *)arg; + + while(true) { + // Consume one element at a time. @@ Might be more efficient to have custom grain. + uint i = atomicIncrement(&owner->idx); + if (i > owner->count) { + break; + } + + owner->task(owner->context, i - 1); + } +} + + +ParallelFor::ParallelFor(ForTask * task, void * context) : task(task), context(context) { +#if ENABLE_PARALLEL_FOR + pool = ThreadPool::acquire(); +#endif +} + +ParallelFor::~ParallelFor() { +#if ENABLE_PARALLEL_FOR + ThreadPool::release(pool); +#endif +} + +void ParallelFor::run(uint count) { +#if ENABLE_PARALLEL_FOR + storeRelease(&this->count, count); + + // Init atomic counter to zero. + storeRelease(&idx, 0); + + // Start threads. + pool->start(worker, this); + + // Wait for all threads to complete. 
+ pool->wait(); + + nvDebugCheck(idx >= count); +#else + for (int i = 0; i < count; i++) { + task(context, i); + } +#endif +} + + diff --git a/src/nvthread/ParallelFor.h b/src/nvthread/ParallelFor.h new file mode 100644 index 0000000..e3e0fb8 --- /dev/null +++ b/src/nvthread/ParallelFor.h @@ -0,0 +1,38 @@ +// This code is in the public domain -- Ignacio Castaño + +#pragma once +#ifndef NV_THREAD_PARALLELFOR_H +#define NV_THREAD_PARALLELFOR_H + +#include "nvthread.h" +//#include "Atomic.h" // atomic + +namespace nv +{ + class Thread; + class ThreadPool; + + typedef void ForTask(void * context, int id); + + struct ParallelFor { + ParallelFor(ForTask * task, void * context); + ~ParallelFor(); + + void run(uint count); + + // Invariant: + ForTask * task; + void * context; + ThreadPool * pool; + //uint workerCount; // @@ Move to thread pool. + //Thread * workers; + + // State: + uint count; + /*atomic*/ uint idx; + }; + +} // nv namespace + + +#endif // NV_THREAD_PARALLELFOR_H diff --git a/src/nvthread/Thread.cpp b/src/nvthread/Thread.cpp new file mode 100644 index 0000000..c8c39d8 --- /dev/null +++ b/src/nvthread/Thread.cpp @@ -0,0 +1,136 @@ +// This code is in the public domain -- castano@gmail.com + +#include "Thread.h" + +#if NV_OS_WIN32 + #include "Win32.h" +#elif NV_OS_UNIX + #include + #include // usleep +#endif + +using namespace nv; + +struct Thread::Private +{ +#if NV_OS_WIN32 + HANDLE thread; +#elif NV_OS_UNIX + pthread_t thread; +#endif + + ThreadFunc * func; + void * arg; +}; + +#if NV_OS_WIN32 + +unsigned long __stdcall threadFunc(void * arg) { + Thread * thread = (Thread *)arg; + thread->func(thread->arg); + return 0; +} + +#elif NV_OS_UNIX +extern "C" void * threadFunc(void * arg) { + Thread * thread = (Thread *)arg; + thread->func(thread->arg); + pthread_exit(0); +} +#endif + + +Thread::Thread() : p(new Private) +{ + p->thread = 0; +} + +Thread::~Thread() +{ + nvDebugCheck(p->thread == 0); +} + +void Thread::start(ThreadFunc * func, void * arg) +{ + 
this->func = func; + this->arg = arg; + +#if NV_OS_WIN32 + p->thread = CreateThread(NULL, 0, threadFunc, this, 0, NULL); + //p->thread = (HANDLE)_beginthreadex (0, 0, threadFunc, this, 0, NULL); // @@ So that we can call CRT functions... + nvDebugCheck(p->thread != NULL); +#elif NV_OS_UNIX + int result = pthread_create(&p->thread, NULL, threadFunc, this); + nvDebugCheck(result == 0); +#endif +} + +void Thread::wait() +{ +#if NV_OS_WIN32 + DWORD status = WaitForSingleObject (p->thread, INFINITE); + nvCheck (status == WAIT_OBJECT_0); + BOOL ok = CloseHandle (p->thread); + p->thread = NULL; + nvCheck (ok); +#elif NV_OS_UNIX + int result = pthread_join(p->thread, NULL); + p->thread = 0; + nvDebugCheck(result == 0); +#endif +} + +bool Thread::isRunning () const +{ +#if NV_OS_WIN32 + return p->thread != NULL; +#elif NV_OS_UNIX + return p->thread != 0; +#endif +} + +/*static*/ void Thread::spinWait(uint count) +{ + for (uint i = 0; i < count; i++) {} +} + +/*static*/ void Thread::yield() +{ +#if NV_OS_WIN32 + SwitchToThread(); +#elif NV_OS_UNIX + int result = sched_yield(); + nvDebugCheck(result == 0); +#endif +} + +/*static*/ void Thread::sleep(uint ms) +{ +#if NV_OS_WIN32 + Sleep(ms); +#elif NV_OS_UNIX + usleep(1000 * ms); +#endif +} + +/*static*/ void Thread::wait(Thread * threads, uint count) +{ +/*#if NV_OS_WIN32 + // @@ Is there any advantage in doing this? 
+ nvDebugCheck(count < MAXIMUM_WAIT_OBJECTS); + + HANDLE * handles = new HANDLE[count]; + for (uint i = 0; i < count; i++) { + handles[i] = threads->p->thread; + } + + DWORD result = WaitForMultipleObjects(count, handles, TRUE, INFINITE); + + + delete [] handles; +#else*/ + for (uint i = 0; i < count; i++) { + threads[i].wait(); + } +//#endif +} \ No newline at end of file diff --git a/src/nvthread/Thread.h b/src/nvthread/Thread.h new file mode 100644 index 0000000..cdd5b70 --- /dev/null +++ b/src/nvthread/Thread.h @@ -0,0 +1,46 @@ +// This code is in the public domain -- castano@gmail.com + +#pragma once +#ifndef NV_THREAD_THREAD_H +#define NV_THREAD_THREAD_H + +#include "nvthread.h" + +#include "nvcore/Ptr.h" + +namespace nv +{ + typedef void ThreadFunc(void * arg); + + class NVTHREAD_CLASS Thread + { + NV_FORBID_COPY(Thread); + public: + Thread(); + ~Thread(); + + void start(ThreadFunc * func, void * arg); + void wait(); + + bool isRunning() const; + + static void spinWait(uint count); + static void yield(); + static void sleep(uint ms); + + static void wait(Thread * threads, uint count); + + private: + + struct Private; + AutoPtr p; + + public: + ThreadFunc * func; + void * arg; + + }; + +} // nv namespace + +#endif // NV_THREAD_THREAD_H diff --git a/src/nvthread/ThreadPool.cpp b/src/nvthread/ThreadPool.cpp new file mode 100644 index 0000000..a343fab --- /dev/null +++ b/src/nvthread/ThreadPool.cpp @@ -0,0 +1,121 @@ +// This code is in the public domain -- castano@gmail.com + +#include "ThreadPool.h" +#include "Mutex.h" +#include "Thread.h" + +// Most of the time it's not necessary to protect the thread pool, but if it doesn't add a significant overhead, then it'd be safer to do it. 
+#define PROTECT_THREAD_POOL 1 + + +using namespace nv; + +#if PROTECT_THREAD_POOL +Mutex s_pool_mutex; +#endif + +AutoPtr s_pool; + + +/*static*/ ThreadPool * ThreadPool::acquire() +{ +#if PROTECT_THREAD_POOL + s_pool_mutex.lock(); // @@ If same thread tries to lock twice, this should assert. +#endif + + if (s_pool == NULL) { + ThreadPool * p = new ThreadPool; + nvDebugCheck(s_pool == p); + } + + return s_pool.ptr(); +} + +/*static*/ void ThreadPool::release(ThreadPool * pool) +{ + nvDebugCheck(pool == s_pool); + + // Make sure the threads of the pool are idle. + s_pool->wait(); + +#if PROTECT_THREAD_POOL + s_pool_mutex.unlock(); +#endif +} + + + + +/*static*/ void ThreadPool::workerFunc(void * arg) { + uint i = (uint)arg; + + while(true) + { + s_pool->startEvents[i].wait(); + + if (s_pool->func == NULL) { + return; // @@ should we post finish event anyway? + } + + s_pool->func(s_pool->arg); + + s_pool->finishEvents[i].post(); + } +} + + +ThreadPool::ThreadPool() +{ + s_pool = this; // Worker threads need this to be initialized before they start. + + workerCount = nv::hardwareThreadCount(); + workers = new Thread[workerCount]; + + startEvents = new Event[workerCount]; + finishEvents = new Event[workerCount]; + + for (uint i = 0; i < workerCount; i++) { + workers[i].start(workerFunc, (void *)i); + } + + allIdle = true; +} + +ThreadPool::~ThreadPool() +{ + // Set threads to terminate. + start(NULL, NULL); + + // Wait until threads actually exit. + Thread::wait(workers, workerCount); + + delete [] workers; + delete [] startEvents; + delete [] finishEvents; +} + +void ThreadPool::start(ThreadFunc * func, void * arg) +{ + // Wait until threads are idle. + wait(); + + // Set our desired function. + this->func = func; + this->arg = arg; + + allIdle = false; + + // Resume threads. + Event::post(startEvents, workerCount); +} + +void ThreadPool::wait() +{ + if (!allIdle) + { + // Wait for threads to complete. 
+ Event::wait(finishEvents, workerCount); + + allIdle = true; + } +} \ No newline at end of file diff --git a/src/nvthread/ThreadPool.h b/src/nvthread/ThreadPool.h new file mode 100644 index 0000000..84fc41e --- /dev/null +++ b/src/nvthread/ThreadPool.h @@ -0,0 +1,49 @@ +// This code is in the public domain -- castano@gmail.com + +#pragma once +#ifndef NV_THREAD_THREADPOOL_H +#define NV_THREAD_THREADPOOL_H + +#include "nvthread.h" + +#include "Event.h" +#include "Thread.h" + +namespace nv { + + class Thread; + class Event; + + class ThreadPool { + NV_FORBID_COPY(ThreadPool); + public: + + static ThreadPool * acquire(); + static void release(ThreadPool *); + + ThreadPool(); + ~ThreadPool(); + + void start(ThreadFunc * func, void * arg); + void wait(); + + private: + + static void workerFunc(void * arg); + + uint workerCount; + Thread * workers; + Event * startEvents; + Event * finishEvents; + + uint allIdle; + + // Current function: + ThreadFunc * func; + void * arg; + }; + +} // namespace nv + + +#endif // NV_THREAD_THREADPOOL_H diff --git a/src/nvthread/Win32.h b/src/nvthread/Win32.h new file mode 100644 index 0000000..f5b14b4 --- /dev/null +++ b/src/nvthread/Win32.h @@ -0,0 +1,9 @@ +// This code is in the public domain -- castano@gmail.com + +// Never include this from a header file. + +#define WIN32_LEAN_AND_MEAN +#define VC_EXTRALEAN +#define _WIN32_WINNT 0x0400 // for SwitchToThread, TryEnterCriticalSection +#include +//#include // for _beginthreadex \ No newline at end of file diff --git a/src/nvthread/nvthread.cpp b/src/nvthread/nvthread.cpp new file mode 100644 index 0000000..0d40f86 --- /dev/null +++ b/src/nvthread/nvthread.cpp @@ -0,0 +1,51 @@ + +#include "nvthread.h" + +#include "Thread.h" + +#define WIN32_LEAN_AND_MEAN +#define VC_EXTRALEAN +#include + +using namespace nv; + + +// Find the number of cores in the system. 
+// Based on: http://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine +// @@ Distinguish between logical and physical cores? +uint nv::hardwareThreadCount() { +#if NV_OS_WIN32 + SYSTEM_INFO sysinfo; + GetSystemInfo( &sysinfo ); + return sysinfo.dwNumberOfProcessors; +#elif NV_OS_XBOX + return 3; // or 6? +#elif NV_OS_LINUX // Linux, Solaris, & AIX + return sysconf(_SC_NPROCESSORS_ONLN); +#elif NV_OS_DARWIN || NV_OS_FREEBSD + int numCPU = 0; // stays 0 if sysctl fails, so the fallbacks below still trigger + int mib[4]; + size_t len = sizeof(numCPU); + + // set the mib for hw.ncpu + mib[0] = CTL_HW; + mib[1] = HW_AVAILCPU; // alternatively, try HW_NCPU; + + // get the number of CPUs from the system + sysctl(mib, 2, &numCPU, &len, NULL, 0); + + if (numCPU < 1) { + mib[1] = HW_NCPU; + sysctl( mib, 2, &numCPU, &len, NULL, 0 ); + + if (numCPU < 1) { + return 1; // Assume single core. + } + } + + return numCPU; +#else + return 1; // Assume single core. +#endif +} + diff --git a/src/nvthread/nvthread.h b/src/nvthread/nvthread.h new file mode 100644 index 0000000..bb31435 --- /dev/null +++ b/src/nvthread/nvthread.h @@ -0,0 +1,83 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#pragma once +#ifndef NV_THREAD_H +#define NV_THREAD_H + +#include "nvcore/nvcore.h" + +// Function linkage +#if NVTHREAD_SHARED +#ifdef NVTHREAD_EXPORTS +#define NVTHREAD_API DLL_EXPORT +#define NVTHREAD_CLASS DLL_EXPORT_CLASS +#else +#define NVTHREAD_API DLL_IMPORT +#define NVTHREAD_CLASS DLL_IMPORT +#endif +#else // NVTHREAD_SHARED +#define NVTHREAD_API +#define NVTHREAD_CLASS +#endif // NVTHREAD_SHARED + + +// Compiler barriers. 
+// See: http://en.wikipedia.org/wiki/Memory_ordering +#if NV_CC_MSVC + +#include + +#pragma intrinsic(_WriteBarrier) +#define nvCompilerWriteBarrier _WriteBarrier + +#pragma intrinsic(_ReadWriteBarrier) +#define nvCompilerReadWriteBarrier _ReadWriteBarrier + +#if _MSC_VER >= 1400 // ReadBarrier is VC2005 +#pragma intrinsic(_ReadBarrier) +#define nvCompilerReadBarrier _ReadBarrier +#else +#define nvCompilerReadBarrier _ReadWriteBarrier +#endif + +#elif NV_CC_GNUC + +#define nvCompilerReadWriteBarrier() asm volatile("" ::: "memory"); +#define nvCompilerWriteBarrier nvCompilerReadWriteBarrier +#define nvCompilerReadBarrier nvCompilerReadWriteBarrier + +#endif // NV_CC_MSVC + + +// @@ Memory barriers / fences. + +// @@ Atomics. + + +/* Wrap this up: +#define YieldProcessor() __asm { rep nop } +#define YieldProcessor _mm_pause +#define YieldProcessor __yield + +BOOL WINAPI SwitchToThread(void); +*/ + + +namespace nv +{ + // Reentrant. + uint hardwareThreadCount(); + + // Not thread-safe. Use from main thread only. + void initWorkers(); + void shutWorkers(); + void setWorkerFunction(void * func); + +} // nv namespace + + + + + + +#endif // NV_THREAD_H diff --git a/src/nvtt/CompressorDX11.cpp b/src/nvtt/CompressorDX11.cpp index 2b443e7..63635fe 100644 --- a/src/nvtt/CompressorDX11.cpp +++ b/src/nvtt/CompressorDX11.cpp @@ -37,7 +37,7 @@ using namespace nv; using namespace nvtt; -void CompressorBC6::compressBlock(Tile & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output) +void CompressorBC6::compressBlock(ColorSet & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output) { NV_UNUSED(alphaMode); // ZOH does not support alpha. 
@@ -56,7 +56,7 @@ void CompressorBC6::compressBlock(Tile & tile, AlphaMode alphaMode, const Compre } -void CompressorBC7::compressBlock(Tile & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output) +void CompressorBC7::compressBlock(ColorSet & tile, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output) { // @@ TODO } diff --git a/src/nvtt/CompressorDX9.cpp b/src/nvtt/CompressorDX9.cpp index 532d03b..10c74d9 100644 --- a/src/nvtt/CompressorDX9.cpp +++ b/src/nvtt/CompressorDX9.cpp @@ -481,10 +481,10 @@ void D3DXCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode err = surface->LockRect(&rect, NULL, D3DLOCK_READONLY); - if (outputOptions.outputHandler != NULL) { - int size = rect.Pitch * ((h + 3) / 4); - outputOptions.outputHandler->writeData(rect.pBits, size); - } + if (outputOptions.outputHandler != NULL) { + int size = rect.Pitch * ((h + 3) / 4); + outputOptions.outputHandler->writeData(rect.pBits, size); + } err = surface->UnlockRect(); } diff --git a/src/nvtt/CompressorRGB.cpp b/src/nvtt/CompressorRGB.cpp index 612c955..237b583 100644 --- a/src/nvtt/CompressorRGB.cpp +++ b/src/nvtt/CompressorRGB.cpp @@ -110,7 +110,7 @@ namespace { nvDebugCheck(alignment >= 1); flush(); - int remainder = (size_t)ptr % alignment; + int remainder = (int)((uintptr_t)ptr % alignment); if (remainder != 0) { putBits(0, (alignment - remainder) * 8); } diff --git a/src/nvtt/Context.cpp b/src/nvtt/Context.cpp index 4c6801f..8361b02 100644 --- a/src/nvtt/Context.cpp +++ b/src/nvtt/Context.cpp @@ -349,6 +349,8 @@ bool Compressor::Private::compress(AlphaMode alphaMode, int w, int h, int d, int compressor->compress(alphaMode, w, h, d, rgba, dispatcher, compressionOptions, outputOptions); } + outputOptions.endImage(); + return true; } diff --git a/src/nvtt/OutputOptions.cpp b/src/nvtt/OutputOptions.cpp index f5c9906..59de6b8 100644 --- a/src/nvtt/OutputOptions.cpp +++ 
b/src/nvtt/OutputOptions.cpp @@ -135,6 +135,11 @@ bool OutputOptions::Private::writeData(const void * data, int size) const return outputHandler == NULL || outputHandler->writeData(data, size); } +void OutputOptions::Private::endImage() const +{ + if (outputHandler != NULL) outputHandler->endImage(); +} + void OutputOptions::Private::error(Error e) const { if (errorHandler != NULL) errorHandler->error(e); diff --git a/src/nvtt/OutputOptions.h b/src/nvtt/OutputOptions.h index 5645a70..2a272a0 100644 --- a/src/nvtt/OutputOptions.h +++ b/src/nvtt/OutputOptions.h @@ -52,6 +52,11 @@ namespace nvtt return true; } + virtual void endImage() + { + // ignore. + } + nv::StdOutputStream stream; }; @@ -72,6 +77,7 @@ namespace nvtt void beginImage(int size, int width, int height, int depth, int face, int miplevel) const; bool writeData(const void * data, int size) const; + void endImage() const; void error(Error e) const; }; diff --git a/src/nvtt/TaskDispatcher.h b/src/nvtt/TaskDispatcher.h index 2f62d9d..5ebf92b 100644 --- a/src/nvtt/TaskDispatcher.h +++ b/src/nvtt/TaskDispatcher.h @@ -18,8 +18,8 @@ // http://msdn.microsoft.com/en-us/library/dd504870.aspx #if NV_OS_WIN32 && _MSC_VER >= 1600 #define HAVE_PPL 1 -//#include -#include +#include +//#include #endif // Intel Thread Building Blocks (TBB). @@ -28,6 +28,8 @@ #include #endif +#include "nvthread/ParallelFor.h" + namespace nvtt { @@ -40,6 +42,15 @@ namespace nvtt { } }; + struct ParallelTaskDispatcher : public TaskDispatcher + { + virtual void dispatch(Task * task, void * context, int count) { + nv::ParallelFor parallelFor(task, context); + parallelFor.run(count); // @@ Add support for custom grain. 
+ } + }; + + #if defined(HAVE_OPENMP) struct OpenMPTaskDispatcher : public TaskDispatcher @@ -81,9 +92,24 @@ namespace nvtt { #if defined(HAVE_PPL) + class CountingIterator + { + public: + CountingIterator() : i(0) {} + CountingIterator(const CountingIterator & rhs) : i(rhs.i) {} // copy the current index instead of resetting it to 0 + explicit CountingIterator(int x) : i(x) {} + + const int & operator*() const { return i; } + CountingIterator & operator++() { i++; return *this; } + CountingIterator & operator--() { i--; return *this; } + + private: + int i; + }; + struct TaskFunctor { TaskFunctor(Task * task, void * context) : task(task), context(context) {} - void operator()(int n) const { + void operator()(int & n) const { task(context, n); } Task * task; @@ -95,12 +121,16 @@ namespace nvtt { { virtual void dispatch(Task * task, void * context, int count) { + CountingIterator begin(0); + CountingIterator end((int)count); TaskFunctor func(task, context); - Concurrency::parallel_for(0, count, func); + + std::for_each(begin, end, func); + //parallel_for_each(begin, end, func); } }; -#endif // HAVE_PPL +#endif #if defined(HAVE_TBB) @@ -132,7 +162,8 @@ namespace nvtt { #elif defined(HAVE_GCD) typedef AppleTaskDispatcher ConcurrentTaskDispatcher; #else - typedef SequentialTaskDispatcher ConcurrentTaskDispatcher; + //typedef SequentialTaskDispatcher ConcurrentTaskDispatcher; + typedef ParallelTaskDispatcher ConcurrentTaskDispatcher; #endif } // namespace nvtt diff --git a/src/nvtt/TexImage.cpp b/src/nvtt/TexImage.cpp index 17312be..b753ef1 100644 --- a/src/nvtt/TexImage.cpp +++ b/src/nvtt/TexImage.cpp @@ -615,7 +615,7 @@ bool TexImage::setImage2D(Format format, Decoder decoder, int w, int h, const vo block->decodeBlock(&colors, false); } else if (decoder == Decoder_NV5x) { - block->decodeBlockNV5x(&colors); + block->decodeBlockNV5x(&colors); } } else if (format == nvtt::Format_BC3) @@ -629,19 +629,19 @@ bool TexImage::setImage2D(Format format, Decoder decoder, int w, int 
h, const vo block->decodeBlock(&colors, false); } else 
if (decoder == Decoder_NV5x) { - block->decodeBlockNV5x(&colors); + block->decodeBlockNV5x(&colors); } } else if (format == nvtt::Format_BC4) { - const BlockATI1 * block = (const BlockATI1 *)ptr; - block->decodeBlock(&colors, decoder == Decoder_D3D9); - } - else if (format == nvtt::Format_BC5) - { - const BlockATI2 * block = (const BlockATI2 *)ptr; - block->decodeBlock(&colors, decoder == Decoder_D3D9); - } + const BlockATI1 * block = (const BlockATI1 *)ptr; + block->decodeBlock(&colors, decoder == Decoder_D3D9); + } + else if (format == nvtt::Format_BC5) + { + const BlockATI2 * block = (const BlockATI2 *)ptr; + block->decodeBlock(&colors, decoder == Decoder_D3D9); + } for (int yy = 0; yy < 4; yy++) { @@ -864,6 +864,42 @@ bool TexImage::buildNextMipmap(MipmapFilter filter, float filterWidth, const flo return true; } +void TexImage::canvasSize(int w, int h, int d) +{ + nvDebugCheck(w > 0 && h > 0 && d > 0); + + FloatImage * img = m->image; + if (img == NULL || (w == img->width() && h == img->height() && d == img->depth())) { + return; + } + + detach(); + + FloatImage * new_img = new FloatImage; + new_img->allocate(4, w, h, d); + new_img->clear(); + + w = min(uint(w), img->width()); + h = min(uint(h), img->height()); + d = min(uint(d), img->depth()); + + for (int z = 0; z < d; z++) { + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + new_img->pixel(0, x, y, z) = img->pixel(0, x, y, z); + new_img->pixel(1, x, y, z) = img->pixel(1, x, y, z); + new_img->pixel(2, x, y, z) = img->pixel(2, x, y, z); + new_img->pixel(3, x, y, z) = img->pixel(3, x, y, z); + } + } + } + + delete m->image; + m->image = new_img; + m->type = (d == 1) ? TextureType_2D : TextureType_3D; +} + + // Color transforms. 
void TexImage::toLinear(float gamma) { @@ -885,6 +921,66 @@ void TexImage::toGamma(float gamma) m->image->toGamma(0, 3, gamma); } + +static float toSrgb(float f) { + if (f <= 0.0) f = 0.0f; + else if (f <= 0.0031308f) f = 12.92f * f; + else if (f <= 1.0f) f = (powf(f, 0.41666f) * 1.055f) - 0.055f; + else f = 1.0f; + return f; +} + +void TexImage::toSrgb() +{ + FloatImage * img = m->image; + if (img == NULL) return; + + detach(); + + const uint count = img->pixelCount(); + for (uint j = 0; j < count; j++) + { + float & r = img->pixel(0, j); + float & g = img->pixel(1, j); + float & b = img->pixel(2, j); + + r = ::toSrgb(r); + g = ::toSrgb(g); + b = ::toSrgb(b); + } +} + +static float toXenonSrgb(float f) { + if (f < 0) f = 0; + else if (f < (1.0f/16.0f)) f = 4.0f * f; + else if (f < (1.0f/8.0f)) f = 0.25f + 2.0f * (f - 0.0625f); + else if (f < 0.5f) f = 0.375f + 1.0f * (f - 0.125f); + else if (f < 1.0f) f = 0.75f + 0.5f * (f - 0.50f); + else f = 1.0f; + return f; +} + +void TexImage::toXenonSrgb() +{ + FloatImage * img = m->image; + if (img == NULL) return; + + detach(); + + const uint count = img->pixelCount(); + for (uint j = 0; j < count; j++) + { + float & r = img->pixel(0, j); + float & g = img->pixel(1, j); + float & b = img->pixel(2, j); + + r = ::toXenonSrgb(r); + g = ::toXenonSrgb(g); + b = ::toXenonSrgb(b); + } +} + + void TexImage::transform(const float w0[4], const float w1[4], const float w2[4], const float w3[4], const float offset[4]) { if (m->image == NULL) return; @@ -1140,9 +1236,9 @@ void TexImage::toRGBM(float range/*= 1*/, float threshold/*= 0.25*/) const uint count = img->pixelCount(); for (uint i = 0; i < count; i++) { - float R = nv::clamp(r[i] * irange, 0.0f, 1.0f); - float G = nv::clamp(g[i] * irange, 0.0f, 1.0f); - float B = nv::clamp(b[i] * irange, 0.0f, 1.0f); + float R = nv::clamp(r[i], 0.0f, 1.0f); + float G = nv::clamp(g[i], 0.0f, 1.0f); + float B = nv::clamp(b[i], 0.0f, 1.0f); #if 1 float M = max(max(R, G), max(B, threshold)); diff 
--git a/src/nvtt/nvtt.h b/src/nvtt/nvtt.h index 3188227..ef13771 100644 --- a/src/nvtt/nvtt.h +++ b/src/nvtt/nvtt.h @@ -294,6 +294,9 @@ namespace nvtt /// Output data. Compressed data is output as soon as it's generated to minimize memory allocations. virtual bool writeData(const void * data, int size) = 0; + + /// Indicate the end of the compressed image. + virtual void endImage() = 0; }; /// Error codes. @@ -440,10 +443,13 @@ namespace nvtt NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter, float filterWidth, const float * params = 0); NVTT_API bool buildNextMipmap(MipmapFilter filter); NVTT_API bool buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params = 0); + NVTT_API void canvasSize(int w, int h, int d); // Color transforms. NVTT_API void toLinear(float gamma); NVTT_API void toGamma(float gamma); + NVTT_API void toSrgb(); + NVTT_API void toXenonSrgb(); NVTT_API void transform(const float w0[4], const float w1[4], const float w2[4], const float w3[4], const float offset[4]); NVTT_API void swizzle(int r, int g, int b, int a); NVTT_API void scaleBias(int channel, float scale, float bias); diff --git a/src/nvtt/tools/compress.cpp b/src/nvtt/tools/compress.cpp index 70e729d..fba7a26 100644 --- a/src/nvtt/tools/compress.cpp +++ b/src/nvtt/tools/compress.cpp @@ -56,6 +56,11 @@ struct MyOutputHandler : public nvtt::OutputHandler // ignore. } + virtual void endImage() + { + // Ignore. + } + // Output data. virtual bool writeData(const void * data, int size) {