diff --git a/src/nvcore/Algorithms.h b/src/nvcore/Algorithms.h index 6bb0796..b543b11 100644 --- a/src/nvcore/Algorithms.h +++ b/src/nvcore/Algorithms.h @@ -3,6 +3,8 @@ #ifndef NV_CORE_ALGORITHMS_H #define NV_CORE_ALGORITHMS_H +#include + namespace nv { @@ -45,22 +47,42 @@ namespace nv } } - - // @@ Swap should be implemented here. - - -#if 0 - // This does not use swap, but copies, in some cases swaps are much faster than copies! - // Container should implement operator[], and size() - template - void insertionSort(Container & container) + // @@ Should swap be implemented here? + + + + template class C> + void sort(C & container) { - const uint n = container.size(); - for (uint i=1; i < n; ++i) + introsortLoop(container, 0, container.count()); + insertionSort(container, 0, container.count()); + } + + template class C> + void sort(C & container, uint begin, uint end) + { + if (begin < end) + { + introsortLoop(container, begin, end); + insertionSort(container, begin, end); + } + } + + template class C> + void insertionSort(C & container) + { + insertionSort(container, 0, container.count()); + } + + template class C> + void insertionSort(C & container, uint begin, uint end) + { + for (uint i = begin + 1; i != end; ++i) { T value = container[i]; + uint j = i; - while (j > 0 && container[j-1] > value) + while (j != begin && container[j-1] > value) { container[j] = container[j-1]; --j; @@ -72,75 +94,60 @@ namespace nv } } - template - void quickSort(Container & container) - { - quickSort(container, 0, container.count()); - } - - { - /* threshhold for transitioning to insertion sort */ - while (n > 12) { - int c01,c12,c,m,i,j; + template class C> + void introsortLoop(C & container, uint begin, uint end) + { + while (end-begin > 16) + { + uint p = partition(container, begin, end, medianof3(container, begin, begin+((end-begin)/2)+1, end-1)); + introsortLoop(container, p, end); + end = p; + } + } - /* compute median of three */ - m = n >> 1; - c = p[0] > p[m]; - c01 = c; - c = &p[m] > &p[n-1]; - c12 = c; - /* if 0 >= mid >= end, or 0 < mid < end, then use mid */ - if (c01 != c12) { - /* otherwise, we'll need to swap something else to middle */ - int z; - c = p[0] < p[n-1]; - /* 0>mid && midn => n; 0 0 */ - /* 0n: 0>n => 0; 0 n */ - z = (c == c12) ? 0 : n-1; - swap(p[z], p[m]); + template class C> + uint partition(C & a, uint begin, uint end, const T & x) + { + int i = begin, j = end; + while (true) + { + while (a[i] < x) ++i; + --j; + while (x < a[j]) --j; + if (i >= j) + return i; + swap(a[i], a[j]); + i++; + } + } + + template class C> + const T & medianof3(C & a, uint lo, uint mid, uint hi) + { + if (a[mid] < a[lo]) + { + if (a[hi] < a[mid]) + { + return a[mid]; } - /* now p[m] is the median-of-three */ - /* swap it to the beginning so it won't move around */ - swap(p[0], p[m]); - - /* partition loop */ - i=1; - j=n-1; - for(;;) { - /* handling of equality is crucial here */ - /* for sentinels & efficiency with duplicates */ - for (;;++i) { - c = p[i] > p[0]; - if (!c) break; - } - a = &p[0]; - for (;;--j) { - b=&p[j]; - c = p[j] > p[0] - if (!c) break; - } - /* make sure we haven't crossed */ - if (i >= j) break; - swap(p[i], p[j]); - - ++i; - --j; - } - /* recurse on smaller side, iterate on larger */ - if (j < (n-i)) { - quickSort(p, j); - p = p+i; - n = n-i; - } - else { - quickSort(p+i, n-i); - n = j; + else + { + return (a[hi] < a[lo]) ? a[hi] : a[lo]; } } - - insertionSort(); - } -#endif // 0 + else + { + if (a[hi] < a[mid]) + { + return (a[hi] < a[lo]) ? a[lo] : a[hi]; + } + else + { + return a[mid]; + } + } + } + } // nv namespace diff --git a/src/nvcore/CMakeLists.txt b/src/nvcore/CMakeLists.txt index a94c5df..fd9777d 100644 --- a/src/nvcore/CMakeLists.txt +++ b/src/nvcore/CMakeLists.txt @@ -24,11 +24,14 @@ SET(CORE_SRCS TextReader.cpp TextWriter.h TextWriter.cpp + Tokenizer.h + Tokenizer.cpp Radix.h Radix.cpp CpuInfo.h CpuInfo.cpp Algorithms.h + Timer.h Library.h Library.cpp FileSystem.h @@ -36,6 +39,34 @@ SET(CORE_SRCS INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +# For Windows64 in MSVC we need to add the assembly version of vsscanf +IF(MSVC AND NV_SYSTEM_PROCESSOR STREQUAL "AMD64") + SET(VSSCANF_ASM_NAME "vsscanf_proxy_win64") + IF(MSVC_IDE) + # $(IntDir) is a macro expanded to the intermediate directory of the selected solution configuration + SET(VSSCANF_ASM_INTDIR "$(IntDir)") + ELSE(MSVC_IDE) + # For some reason the NMake generator doesn't work properly with the generated .obj source: + # it requires the absolute path. So this is a hack which worked as of cmake 2.6.0 patch 0 + GET_FILENAME_COMPONENT(VSSCANF_ASM_INTDIR + "${nvcore_BINARY_DIR}/CMakeFiles/nvcore.dir" ABSOLUTE) + ENDIF(MSVC_IDE) + + SET(VSSCANF_ASM_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${VSSCANF_ASM_NAME}.masm") + SET(VSSCANF_ASM_OBJ "${VSSCANF_ASM_INTDIR}/${VSSCANF_ASM_NAME}.obj") + + # Adds the assembly output to the sources and adds the custom command to generate it + SET(CORE_SRCS + ${CORE_SRCS} + ${VSSCANF_ASM_OBJ} + ) + ADD_CUSTOM_COMMAND(OUTPUT ${VSSCANF_ASM_OBJ} + MAIN_DEPENDENCY ${VSSCANF_ASM_SRC} + COMMAND ml64 + ARGS /nologo /Fo ${VSSCANF_ASM_OBJ} /c /Cx ${VSSCANF_ASM_SRC} + ) +ENDIF(MSVC AND NV_SYSTEM_PROCESSOR STREQUAL "AMD64") + # targets ADD_DEFINITIONS(-DNVCORE_EXPORTS) diff --git a/src/nvcore/Constraints.h b/src/nvcore/Constraints.h new file mode 100644 index 0000000..08b9c8b --- /dev/null +++ b/src/nvcore/Constraints.h @@ -0,0 +1,59 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#ifndef NV_CORE_ALGORITHMS_H +#define NV_CORE_ALGORITHMS_H + +#include + +namespace nv +{ + // Cool constraints from "Imperfect C++" + + // must_be_pod + template + struct must_be_pod + { + static void constraints() + { + union { T T_is_not_POD_type; }; + } + }; + + // must_be_pod_or_void + template + struct must_be_pod_or_void + { + static void constraints() + { + union { T T_is_not_POD_type; }; + } + }; + template <> struct must_be_pod_or_void {}; + + // size_of + template + struct size_of + { + enum { value = sizeof(T) }; + }; + template <> + struct size_of + { + enum { value = 0 }; + }; + + // must_be_same_size + template + struct must_be_same_size + { + static void constraints() + { + const int T1_not_same_size_as_T2 = size_of::value == size_of::value; + int i[T1_not_same_size_as_T2]; + } + }; + + +} // nv namespace + +#endif // NV_CORE_ALGORITHMS_H diff --git a/src/nvcore/CpuInfo.cpp b/src/nvcore/CpuInfo.cpp index 0b4ada6..9c09f04 100644 --- a/src/nvcore/CpuInfo.cpp +++ b/src/nvcore/CpuInfo.cpp @@ -33,30 +33,75 @@ static bool isWow64() #endif // NV_OS_WIN32 +#if NV_OS_LINUX +#include +#include +#endif // NV_OS_LINUX + +#if NV_OS_DARWIN +#include +#include +#endif // NV_OS_DARWIN + +// Initialize the data and the local defines, which are designed +// to match the positions in cpuid +uint CpuInfo::m_cpu = ~0x0; +uint CpuInfo::m_procCount = 0; +#define NV_CPUINFO_MMX_MASK (1<<23) +#define NV_CPUINFO_SSE_MASK (1<<25) +#define NV_CPUINFO_SSE2_MASK (1<<26) +#define NV_CPUINFO_SSE3_MASK (1) + uint CpuInfo::processorCount() { + if (m_procCount == 0) { #if NV_OS_WIN32 - SYSTEM_INFO sysInfo; + SYSTEM_INFO sysInfo; - typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL); + typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL); - if (isWow64()) - { - GetNativeSystemInfo(&sysInfo); - } - else - { - GetSystemInfo(&sysInfo); - } + if (isWow64()) + { + GetNativeSystemInfo(&sysInfo); + } + else + { + GetSystemInfo(&sysInfo); + } - uint count = (uint)sysInfo.dwNumberOfProcessors; - nvDebugCheck(count >= 1); + uint count = (uint)sysInfo.dwNumberOfProcessors; + m_procCount = count; + +#elif NV_OS_LINUX + + // Code from x264 (July 6 snapshot) cpu.c:271 + uint bit; + uint np; + cpu_set_t p_aff; + memset( &p_aff, 0, sizeof(p_aff) ); + sched_getaffinity( 0, sizeof(p_aff), &p_aff ); + for( np = 0, bit = 0; bit < sizeof(p_aff); bit++ ) + np += (((uint8 *)&p_aff)[bit / 8] >> (bit % 8)) & 1; + m_procCount = np; + +#elif NV_OS_DARWIN + + // Code from x264 (July 6 snapshot) cpu.c:286 + uint numberOfCPUs; + size_t length = sizeof( numberOfCPUs ); + if( sysctlbyname("hw.ncpu", &numberOfCPUs, &length, NULL, 0) ) + { + numberOfCPUs = 1; + } + m_procCount = numberOfCPUs; - return count; #else - return 1; + m_procCount = 1; #endif + } + nvDebugCheck(m_procCount > 0); + return m_procCount; } uint CpuInfo::coreCount() @@ -66,23 +111,52 @@ uint CpuInfo::coreCount() bool CpuInfo::hasMMX() { - return false; + return (cpu() & NV_CPUINFO_MMX_MASK) != 0; } bool CpuInfo::hasSSE() { - return false; + return (cpu() & NV_CPUINFO_SSE_MASK) != 0; } bool CpuInfo::hasSSE2() { - return false; + return (cpu() & NV_CPUINFO_SSE2_MASK) != 0; } bool CpuInfo::hasSSE3() { - return false; + return (cpu() & NV_CPUINFO_SSE3_MASK) != 0; } +inline int CpuInfo::cpu() { + if (m_cpu == ~0x0) { + m_cpu = 0; +#if NV_CC_MSVC + int CPUInfo[4] = {-1}; + __cpuid(CPUInfo, /*InfoType*/ 1); + + if (CPUInfo[2] & NV_CPUINFO_SSE3_MASK) { + m_cpu |= NV_CPUINFO_SSE3_MASK; + } + if (CPUInfo[3] & NV_CPUINFO_MMX_MASK) { + m_cpu |= NV_CPUINFO_MMX_MASK; + } + if (CPUInfo[3] & NV_CPUINFO_SSE_MASK) { + m_cpu |= NV_CPUINFO_SSE_MASK; + } + if (CPUInfo[3] & NV_CPUINFO_SSE2_MASK) { + m_cpu |= NV_CPUINFO_SSE2_MASK; + } +#elif NV_CC_GNUC + // TODO: add the proper inline assembly +#if NV_CPU_X86 +#elif NV_CPU_X86_64 + +#endif // NV_CPU_X86_64 +#endif // NV_CC_GNUC + } + return m_cpu; +} diff --git a/src/nvcore/CpuInfo.h b/src/nvcore/CpuInfo.h index 099cf61..964d098 100644 --- a/src/nvcore/CpuInfo.h +++ b/src/nvcore/CpuInfo.h @@ -18,6 +18,15 @@ namespace nv // CPU Information. class CpuInfo { + protected: + static int cpu(); + + private: + // Cache of the CPU data + static uint m_cpu; + static uint m_procCount; + + public: static uint processorCount(); static uint coreCount(); @@ -25,7 +34,6 @@ namespace nv static bool hasSSE(); static bool hasSSE2(); static bool hasSSE3(); - }; #if NV_CC_MSVC diff --git a/src/nvcore/Prefetch.h b/src/nvcore/Prefetch.h index 71bd0ed..90a8b52 100644 --- a/src/nvcore/Prefetch.h +++ b/src/nvcore/Prefetch.h @@ -12,16 +12,15 @@ #elif NV_CC_MSVC -#if NV_CPU_X86 +// Uses SSE Intrinsics for both x86 and x86_64 +#include + __forceinline void nvPrefetch(const void * mem) { - __asm mov ecx, mem - __asm prefetcht0 [ecx]; -// __asm prefetchnta [ecx]; + _mm_prefetch(static_cast(mem), _MM_HINT_T0); /* prefetcht0 */ +// _mm_prefetch(static_cast(mem), _MM_HINT_NTA); /* prefetchnta */ } -#endif // NV_CPU_X86 - -#else // NV_CC_MSVC +#else // do nothing in other case. #define nvPrefetch(ptr) diff --git a/src/nvcore/Ptr.h b/src/nvcore/Ptr.h index b5e9fe7..c22317b 100644 --- a/src/nvcore/Ptr.h +++ b/src/nvcore/Ptr.h @@ -34,11 +34,11 @@ class AutoPtr NV_FORBID_HEAPALLOC(); public: - /// Default ctor. - AutoPtr() : m_ptr(NULL) { } - /// Ctor. - explicit AutoPtr( T * p ) : m_ptr(p) { } + AutoPtr(T * p = NULL) : m_ptr(p) { } + + template + AutoPtr(Q * p) : m_ptr(static_cast(p)) { } /** Dtor. Deletes owned pointer. */ ~AutoPtr() { diff --git a/src/nvcore/Radix.cpp b/src/nvcore/Radix.cpp index 713215f..245292f 100644 --- a/src/nvcore/Radix.cpp +++ b/src/nvcore/Radix.cpp @@ -7,6 +7,10 @@ */ /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// References: +// http://www.codercorner.com/RadixSortRevisited.htm +// http://www.stereopsis.com/radix.html + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /** * Revisited Radix Sort. @@ -26,19 +30,25 @@ * - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here) * - 10.11.01: added local ram support * - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting...... + * - 01.02.02: - "mIndices" renamed => "mRanks". That's a rank sorter after all. + * - ranks are not "reset" anymore, but implicit on first calls + * - 07.05.02: offsets rewritten with one less indirection. + * - 11.03.02: "bool" replaced with RadixHint enum + * - 07.15.04: stack-based radix added + * - we want to use the radix sort but without making it static, and without allocating anything. + * - we internally allocate two arrays of ranks. Each of them has N uint32s to sort N values. + * - 1Mb/2/sizeof(uint32) = 131072 values max, at the same time. + * - 09.22.04: - adapted to MacOS by Chris Lamb + * - 01.12.06: - added optimizations suggested by Kyle Hubert + * - 04.06.08: - Fix bug negative zero sorting bug by Ignacio CastaƱo * * \class RadixSort * \author Pierre Terdiman - * \version 1.3 + * \version 1.5 * \date August, 15, 1998 */ /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -/* -To do: - - add an offset parameter between two input values (avoid some data recopy sometimes) - - unroll ? asm ? -*/ /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Header @@ -49,138 +59,170 @@ To do: //using namespace IceCore; -#define DELETEARRAY(a) { delete [] a; a = NULL; } -#define CHECKALLOC(a) +#define INVALIDATE_RANKS mCurrentSize|=0x80000000 +#define VALIDATE_RANKS mCurrentSize&=0x7fffffff +#define CURRENT_SIZE (mCurrentSize&0x7fffffff) +#define INVALID_RANKS (mCurrentSize&0x80000000) - - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -/** - * Constructor. - */ -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -RadixSort::RadixSort() : mCurrentSize(0), mPreviousSize(0), mIndices(NULL), mIndices2(NULL), mTotalCalls(0), mNbHits(0) -{ -#ifndef RADIX_LOCAL_RAM - // Allocate input-independent ram - mHistogram = new uint32[256*4]; - mOffset = new uint32[256]; +#if NV_BIG_ENDIAN + #define H0_OFFSET 768 + #define H1_OFFSET 512 + #define H2_OFFSET 256 + #define H3_OFFSET 0 + #define BYTES_INC (3-j) +#else + #define H0_OFFSET 0 + #define H1_OFFSET 256 + #define H2_OFFSET 512 + #define H3_OFFSET 768 + #define BYTES_INC j #endif + +#define CREATE_HISTOGRAMS(type, buffer) \ + /* Clear counters/histograms */ \ + memset(mHistogram, 0, 256*4*sizeof(uint32)); \ + \ + /* Prepare to count */ \ + const uint8* p = (const uint8*)input; \ + const uint8* pe = &p[nb*4]; \ + uint32* h0= &mHistogram[H0_OFFSET]; /* Histogram for first pass (LSB) */ \ + uint32* h1= &mHistogram[H1_OFFSET]; /* Histogram for second pass */ \ + uint32* h2= &mHistogram[H2_OFFSET]; /* Histogram for third pass */ \ + uint32* h3= &mHistogram[H3_OFFSET]; /* Histogram for last pass (MSB) */ \ + \ + bool AlreadySorted = true; /* Optimism... */ \ + \ + if(INVALID_RANKS) \ + { \ + /* Prepare for temporal coherence */ \ + type* Running = (type*)buffer; \ + type PrevVal = *Running; \ + \ + while(p!=pe) \ + { \ + /* Read input buffer in previous sorted order */ \ + type Val = *Running++; \ + /* Check whether already sorted or not */ \ + if(ValmCurrentSize) resize(n); \ - else resetIndices(); \ - mPreviousSize = n; \ +inline void RadixSort::checkResize(uint32 nb) +{ + uint32 CurSize = CURRENT_SIZE; + if(nb!=CurSize) + { + if(nb>CurSize) resize(nb); + mCurrentSize = nb; + INVALIDATE_RANKS; } - -#define CREATE_HISTOGRAMS(type, buffer) \ - /* Clear counters */ \ - memset(mHistogram, 0, 256*4*sizeof(uint32)); \ - \ - /* Prepare for temporal coherence */ \ - type PrevVal = (type)buffer[mIndices[0]]; \ - bool AlreadySorted = true; /* Optimism... */ \ - uint32* Indices = mIndices; \ - \ - /* Prepare to count */ \ - uint8* p = (uint8*)input; \ - uint8* pe = &p[nb*4]; \ - uint32* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */ \ - uint32* h1= &mHistogram[256]; /* Histogram for second pass */ \ - uint32* h2= &mHistogram[512]; /* Histogram for third pass */ \ - uint32* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */ \ - \ - while(p!=pe) \ - { \ - /* Read input buffer in previous sorted order */ \ - type Val = (type)buffer[*Indices++]; \ - /* Check whether already sorted or not */ \ - if(Val 126; i--) mLink[i] = mLink[i+1] + CurCount[i]; + mLink[0] = mLink[127] + CurCount[127]; + for(uint32 i = 1; i < 127; i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // Perform Radix Sort - for(i=0;i>24; // Radix byte, same as above. AND is useless here (uint32). - // ### cmp to be killed. Not good. Later. - if(Radix<128) mIndices2[mOffset[Radix]++] = mIndices[i]; // Number is positive, same as above - else mIndices2[--mOffset[Radix]] = mIndices[i]; // Number is negative, flip the sorting order + for(uint32 i=0;i>24; // Radix byte, same as above. AND is useless here (uint32). + // ### cmp to be killed. Not good. Later. + if(Radix<128) *mLink[Radix]++ = i; // Number is positive, same as above + else *(--mLink[Radix]) = i; // Number is negative, flip the sorting order + } + VALIDATE_RANKS; } - // Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap. - uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp; + else + { + for(uint32 i=0;i>24; // Radix byte, same as above. AND is useless here (uint32). + // ### cmp to be killed. Not good. Later. + if(Radix<128) *mLink[Radix]++ = mRanks[i]; // Number is positive, same as above + else *(--mLink[Radix]) = mRanks[i]; // Number is negative, flip the sorting order + } + } + // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap. + uint32* Tmp = mRanks; + mRanks = mRanks2; + mRanks2 = Tmp; } else { // The pass is useless, yet we still have to reverse the order of current list if all values are negative. if(UniqueVal>=128) { - for(i=0;i & input) { - uint32 UsedRam = sizeof(RadixSort); -#ifndef RADIX_LOCAL_RAM - UsedRam += 256*4*sizeof(uint32); // Histograms - UsedRam += 256*sizeof(uint32); // Offsets -#endif - UsedRam += 2*mCurrentSize*sizeof(uint32); // 2 lists of indices - return UsedRam; + return sort((const uint32 *)input.buffer(), input.count(), true); +} + +RadixSort & RadixSort::sort(const Array & input) +{ + return sort(input.buffer(), input.count(), false); +} + +RadixSort & RadixSort::sort(const Array & input) +{ + return sort(input.buffer(), input.count()); } diff --git a/src/nvcore/Radix.h b/src/nvcore/Radix.h index dad0f81..35cd884 100644 --- a/src/nvcore/Radix.h +++ b/src/nvcore/Radix.h @@ -13,57 +13,61 @@ #define NV_CORE_RADIXSORT_H #include +#include + +namespace nv +{ + + class NVCORE_CLASS RadixSort + { + NV_FORBID_COPY(RadixSort); + public: + // Constructor/Destructor + RadixSort(); + ~RadixSort(); + + // Sorting methods + RadixSort & sort(const uint32* input, uint32 nb, bool signedValues=true); + RadixSort & sort(const float* input, uint32 nb); + + // Helpers + RadixSort & sort(const Array & input); + RadixSort & sort(const Array & input); + RadixSort & sort(const Array & input); -#define RADIX_LOCAL_RAM + //! Access to results. mRanks is a list of indices in sorted order, i.e. in the order you may further process your data + inline /*const*/ uint32 * ranks() /*const*/ { return mRanks; } + //! mIndices2 gets trashed on calling the sort routine, but otherwise you can recycle it the way you want. + inline uint32 * recyclable() const { return mRanks2; } -class NVCORE_API RadixSort { - NV_FORBID_COPY(RadixSort); -public: - // Constructor/Destructor - RadixSort(); - ~RadixSort(); + // Stats + //! Returns the total number of calls to the radix sorter. + inline uint32 totalCalls() const { return mTotalCalls; } - // Sorting methods - RadixSort & sort(const uint32* input, uint32 nb, bool signedvalues=true); - RadixSort & sort(const float* input, uint32 nb); - - //! Access to results. mIndices is a list of indices in sorted order, i.e. in the order you may further process your data - inline uint32 * indices() const { return mIndices; } - - //! mIndices2 gets trashed on calling the sort routine, but otherwise you can recycle it the way you want. - inline uint32 * recyclable() const { return mIndices2; } - - // Stats - uint32 usedRam() const; - - //! Returns the total number of calls to the radix sorter. - inline uint32 totalCalls() const { return mTotalCalls; } - - //! Returns the number of premature exits due to temporal coherence. - inline uint32 hits() const { return mNbHits; } + //! Returns the number of early exits due to temporal coherence. + inline uint32 hits() const { return mNbHits; } + bool setRankBuffers(uint32* ranks0, uint32* ranks1); private: -#ifndef RADIX_LOCAL_RAM - uint32* mHistogram; //!< Counters for each byte - uint32* mOffset; //!< Offsets (nearly a cumulative distribution function) -#endif - uint32 mCurrentSize; //!< Current size of the indices list - uint32 mPreviousSize; //!< Size involved in previous call - uint32* mIndices; //!< Two lists, swapped each pass - uint32* mIndices2; + uint32 mCurrentSize; //!< Current size of the indices list + uint32 * mRanks; //!< Two lists, swapped each pass + uint32 * mRanks2; - // Stats - uint32 mTotalCalls; - uint32 mNbHits; + // Stats + uint32 mTotalCalls; //!< Total number of calls to the sort routine + uint32 mNbHits; //!< Number of early exits due to coherence - // Internal methods - bool resize(uint32 nb); - void resetIndices(); + // Stack-radix + bool mDeleteRanks; //!< -}; + // Internal methods + void checkResize(uint32 nb); + bool resize(uint32 nb); + }; +} // nv namespace #endif // NV_CORE_RADIXSORT_H diff --git a/src/nvcore/StrLib.cpp b/src/nvcore/StrLib.cpp index 00d9491..c07593d 100644 --- a/src/nvcore/StrLib.cpp +++ b/src/nvcore/StrLib.cpp @@ -208,49 +208,12 @@ StringBuilder::StringBuilder( const StringBuilder & s ) : m_size(0), m_str(NULL) copy(s); } -// Copy string. -/*StringBuilder::StringBuilder( const char * s ) +/** Copy string. */ +StringBuilder::StringBuilder( const char * s ) { copy(s); -}*/ - -/** Allocate and copy string. */ -StringBuilder::StringBuilder( int size_hint, const StringBuilder & s) : m_size(size_hint), m_str(NULL) -{ - nvDebugCheck(m_size > 0); - m_str = strAlloc(m_size); - copy(s); } -/** Allocate and format string. */ -StringBuilder::StringBuilder( const char * fmt, ... ) : m_size(0), m_str(NULL) -{ - nvDebugCheck(fmt != NULL); - va_list arg; - va_start( arg, fmt ); - - format( fmt, arg ); - - va_end( arg ); -} - -/** Allocate and format string. */ -StringBuilder::StringBuilder( int size_hint, const char * fmt, ... ) : m_size(size_hint), m_str(NULL) -{ - nvDebugCheck(m_size > 0); - nvDebugCheck(fmt != NULL); - - m_str = strAlloc(m_size); - - va_list arg; - va_start( arg, fmt ); - - format( fmt, arg ); - - va_end( arg ); -} - - /** Delete the string. */ StringBuilder::~StringBuilder() { @@ -278,8 +241,7 @@ StringBuilder & StringBuilder::format( const char * fmt, ... ) /** Format a string safely. */ StringBuilder & StringBuilder::format( const char * fmt, va_list arg ) { - nvCheck(fmt != NULL); - nvCheck(m_size >= 0); + nvDebugCheck(fmt != NULL); if( m_size == 0 ) { m_size = 64; @@ -327,8 +289,7 @@ StringBuilder & StringBuilder::format( const char * fmt, va_list arg ) /** Append a string. */ StringBuilder & StringBuilder::append( const char * s ) { - nvCheck(s != NULL); - nvCheck(m_size >= 0); + nvDebugCheck(s != NULL); const uint slen = uint(strlen( s )); @@ -475,31 +436,6 @@ void StringBuilder::reset() } -Path::Path(const char * fmt, ...) -{ - nvDebugCheck( fmt != NULL ); - - va_list arg; - va_start( arg, fmt ); - - format( fmt, arg ); - - va_end( arg ); -} - -Path::Path(int size_hint, const char * fmt, ...) : StringBuilder(size_hint) -{ - nvDebugCheck( fmt != NULL ); - - va_list arg; - va_start( arg, fmt ); - - format( fmt, arg ); - - va_end( arg ); -} - - /// Get the file name from a path. const char * Path::fileName() const { diff --git a/src/nvcore/StrLib.h b/src/nvcore/StrLib.h index a66dd3a..d03583b 100644 --- a/src/nvcore/StrLib.h +++ b/src/nvcore/StrLib.h @@ -45,11 +45,8 @@ namespace nv StringBuilder(); explicit StringBuilder( int size_hint ); - //StringBuilder( const char * str ); + StringBuilder( const char * str ); StringBuilder( const StringBuilder & ); - StringBuilder( int size_hint, const StringBuilder & ); - StringBuilder( const char * format, ... ) __attribute__((format (printf, 2, 3))); - StringBuilder( int size_hint, const char * format, ... ) __attribute__((format (printf, 3, 4))); ~StringBuilder(); @@ -120,24 +117,17 @@ namespace nv char * m_str; }; - - /// Path string. + + /// Path string. @@ This should be called PathBuilder. class NVCORE_CLASS Path : public StringBuilder { public: Path() : StringBuilder() {} explicit Path(int size_hint) : StringBuilder(size_hint) {} - //Path(const char * str) : StringBuilder((const char *)str) {} + Path( const char * str ) : StringBuilder(str) {} Path(const StringBuilder & str) : StringBuilder(str) {} - Path(int size_hint, const StringBuilder & str) : StringBuilder(size_hint, str) {} - Path(const char * format, ...) __attribute__((format (printf, 2, 3))); - Path(int size_hint, const char * format, ...) __attribute__((format (printf, 3, 4))); - Path & operator=( const char * s ) { - return (Path &)copy(s); - } - const char * fileName() const; const char * extension() const; @@ -145,11 +135,11 @@ namespace nv void stripFileName(); void stripExtension(); - + // statics - NVCORE_API static char separator(); - NVCORE_API static const char * fileName(const char *); - NVCORE_API static const char * extension(const char *); + static char separator(); + static const char * fileName(const char *); + static const char * extension(const char *); }; diff --git a/src/nvcore/Timer.h b/src/nvcore/Timer.h new file mode 100644 index 0000000..ebdf3ce --- /dev/null +++ b/src/nvcore/Timer.h @@ -0,0 +1,22 @@ +// This code is in the public domain -- castano@gmail.com + +#ifndef NV_CORE_TIMER_H +#define NV_CORE_TIMER_H + +#include + +#include //clock + +class NVCORE_CLASS Timer +{ +public: + Timer() {} + + void start() { m_start = clock(); } + int elapsed() const { return (1000 * (clock() - m_start)) / CLOCKS_PER_SEC; } + +private: + clock_t m_start; +}; + +#endif // NV_CORE_TIMER_H diff --git a/src/nvcore/Tokenizer.cpp b/src/nvcore/Tokenizer.cpp index b67e00f..b2ec4cd 100644 --- a/src/nvcore/Tokenizer.cpp +++ b/src/nvcore/Tokenizer.cpp @@ -8,7 +8,7 @@ #include // atof, atoi #if NV_CC_MSVC -#if 0 // This doesn't work on MSVC for x64 +#if defined NV_CPU_X86 /* vsscanf for Win32 * Written 5/2003 by * This code is in the Public Domain @@ -56,9 +56,39 @@ static int vsscanf(const char * buffer, const char * format, va_list argPtr) } return result; } +#elif defined NV_CPU_X86_64 + +/* Prototype of the helper assembly function */ +#ifdef __cplusplus +extern "C" { #endif + +int vsscanf_proxy_win64(const char * buffer, const char * format, va_list argPtr, __int64 count); + +#ifdef __cplusplus +} #endif +/* MASM64 version of the above vsscanf */ +static int vsscanf(const char * buffer, const char * format, va_list argPtr) +{ + // Get an upper bound for the # of args + __int64 count = 0; + const char *p = format; + while(1) { + char c = *(p++); + if(c==0) break; + if(c=='%' && (p[0]!='*' && p[0]!='%')) ++count; + } + return vsscanf_proxy_win64(buffer, format, argPtr, count); +} + +/*#error vsscanf doesn't work on MSVC for x64*/ +#else +#error Unknown processor for MSVC +#endif +#endif // NV_CC_MSVC + using namespace nv; Token::Token() : diff --git a/src/nvcore/vsscanf_proxy_win64.masm b/src/nvcore/vsscanf_proxy_win64.masm new file mode 100644 index 0000000..faf7c08 --- /dev/null +++ b/src/nvcore/vsscanf_proxy_win64.masm @@ -0,0 +1,124 @@ +; MASM x64 version of +; vsscanf for Win32 +; originally written 5/2003 by +; +; This was done because MSVC does not accept inline assembly code +; for the x64 platform, so this file implements almost the whole +; module in assembly using the amd64 ABI +; +; 06/17/2008 by edgarv [at] nvidia com + +; Definition of memcpy +memcpy PROTO dest:Ptr, src:Ptr, numbytes:QWORD + +; Definition of sscanf +sscanf PROTO buffer:Ptr Byte, format:Ptr Byte, args:VARARG + + + +; Start a code segment named "_TEXT" by default +.CODE + +; Entry point of our function: at this point we can use +; named parameters +ALIGN 16 +PUBLIC vsscanf_proxy_win64 + +; Because the x64 code uses the fast call convention, only +; the arguments beyond the 4th one are available from the stack. +; The first four parameters are in RCX, RDX, R8 and R9 + +; Parameters: +; const char* buffer +; const char* format +; va_list argPtr +; size_t count +vsscanf_proxy_win64 PROC, \ + buffer:PTR Byte, format:PTR Byte, argPtr:PTR, count:QWORD + + ; Allocates space for our local variable, savedRDP + sub rsp, 08h + + ; Copies the parameters from the registers to the memory: before warping to + ; sscanf we will call memcpy, and those registers can just dissapear! + mov buffer, rcx + mov format, rdx + mov argPtr, r8 + mov count, r9 + + + ; Allocate extra space in the stack for (2+count)*sizeof(void*), + ; this is (2+count)*(8) + mov r10, r9 + add r10, 2 ; count += 2 + sal r10, 3 ; count *= 8 + add r10, 0fh ; To force alignment to 16bytes + and r10, 0fffffffffffffff0h + sub rsp, r10 ; Actual stack allocation + + + ; Continues by copying all the arguments in the "alloca" space + mov [rsp], rcx ; newStack[0] = (void*)buffer; + mov [rsp + 08h], rdx ; newStack[1] = (void*)format; + + ; Calls memcpy(newStack+2, argPtr, count*sizeof(void*)); + mov rcx, rsp + add rcx, 010h ; newStack+2 + mov rdx, r8 ; argPtr + mov r8, r9 + sal r8, 3 ; count*sizeof(void*) + + ; Prepares extra stack space as required by the ABI for 4 arguments, and calls memcpy + sub rsp, 020h + call memcpy + + ; Restore the stack + add rsp, 020h + + ; Saves rsp in memory + mov qword ptr [rbp - 8], rsp + + ; Does exactly the same trick as before: warp into system sscanf with the new stack, + ; but this time we also setup the arguments in the registers according to the amd64 ABI + + ; If there was at least one argument (after buffer and format), we need to copy that + ; to r8, and if there was a second one we must copy that to r9 + ; (the first arguments to sscanf are always the buffer and the format) + mov r10, count + + ; Copy the first argument to r8 (if it exists) + cmp r10, 0 + je args_memcpy + mov r8, [rsp + 10h] + + ; Copy the second argument to r9 (if it exists) + cmp r10, 1 + je args_memcpy + mov r9, [rsp + 18h] + +args_memcpy: + + ; Copies the buffer and format to rcx and rdx + mov rdx, [rsp + 08h] + mov rcx, [rsp] + + ; Finally, calls sscanf using the current stack + call sscanf + + ; At this point the return value is alreay in rax + + ; Restores rsp + mov rsp, qword ptr [rbp - 8] + + ; Undoes the alloca + add rsp, r10 + + ; Restores the space for local variables + add rsp, 08h + + ; Remember, the return value is already in rax since the sscanf call + ret + +vsscanf_proxy_win64 ENDP + +END