Merge internal branch.

- Remove old/unused code.
- Remove format string constructors.
- Better win64 support (vsscanf, prefetch, etc).
- Fix radix sort to sort -0 correctly.
- Add misc utilities (constraints, timer, cpuinfo, introsort).
This commit is contained in:
castano 2008-12-29 11:20:06 +00:00
parent a03411e451
commit e5ae0c0e20
14 changed files with 801 additions and 462 deletions

View File

@ -3,6 +3,8 @@
#ifndef NV_CORE_ALGORITHMS_H #ifndef NV_CORE_ALGORITHMS_H
#define NV_CORE_ALGORITHMS_H #define NV_CORE_ALGORITHMS_H
#include <nvcore/nvcore.h>
namespace nv namespace nv
{ {
@ -45,22 +47,42 @@ namespace nv
} }
} }
// @@ Should swap be implemented here?
// @@ Swap should be implemented here.
#if 0 template <typename T, template <typename T> class C>
// This does not use swap, but copies, in some cases swaps are much faster than copies! void sort(C<T> & container)
// Container should implement operator[], and size()
template <class Container, class T>
void insertionSort(Container<T> & container)
{ {
const uint n = container.size(); introsortLoop(container, 0, container.count());
for (uint i=1; i < n; ++i) insertionSort(container, 0, container.count());
}
/// Sorts the elements of @a container whose indices lie in [begin, end).
/// Runs introsortLoop() over the range first and then finishes with
/// insertionSort() over the same range. Does nothing when begin >= end.
///
/// The template-template parameter deliberately leaves its own parameter
/// unnamed: re-using the name T there shadows the outer T, which compilers
/// that enforce the template-parameter scoping rules reject.
template <typename T, template <typename> class C>
void sort(C<T> & container, uint begin, uint end)
{
    if (begin >= end)
    {
        return;
    }

    introsortLoop(container, begin, end);
    insertionSort(container, begin, end);
}
/// Insertion-sorts the whole container by forwarding to the ranged overload
/// with the index range [0, container.count()).
///
/// The template-template parameter leaves its own parameter unnamed so that
/// it does not shadow the outer T.
template <typename T, template <typename> class C>
void insertionSort(C<T> & container)
{
    insertionSort(container, 0, container.count());
}
template <typename T, template <typename T> class C>
void insertionSort(C<T> & container, uint begin, uint end)
{
for (uint i = begin + 1; i != end; ++i)
{ {
T value = container[i]; T value = container[i];
uint j = i; uint j = i;
while (j > 0 && container[j-1] > value) while (j != begin && container[j-1] > value)
{ {
container[j] = container[j-1]; container[j] = container[j-1];
--j; --j;
@ -72,75 +94,60 @@ namespace nv
} }
} }
template <class Container, class T> template <typename T, template <typename T> class C>
void quickSort(Container<T> & container) void introsortLoop(C<T> & container, uint begin, uint end)
{ {
quickSort(container, 0, container.count()); while (end-begin > 16)
} {
uint p = partition(container, begin, end, medianof3(container, begin, begin+((end-begin)/2)+1, end-1));
{ introsortLoop(container, p, end);
/* threshhold for transitioning to insertion sort */ end = p;
while (n > 12) { }
int c01,c12,c,m,i,j; }
/* compute median of three */ template <typename T, template <typename T> class C>
m = n >> 1; uint partition(C<T> & a, uint begin, uint end, const T & x)
c = p[0] > p[m]; {
c01 = c; int i = begin, j = end;
c = &p[m] > &p[n-1]; while (true)
c12 = c; {
/* if 0 >= mid >= end, or 0 < mid < end, then use mid */ while (a[i] < x) ++i;
if (c01 != c12) { --j;
/* otherwise, we'll need to swap something else to middle */ while (x < a[j]) --j;
int z; if (i >= j)
c = p[0] < p[n-1]; return i;
/* 0>mid && mid<n: 0>n => n; 0<n => 0 */ swap(a[i], a[j]);
/* 0<mid && mid>n: 0>n => 0; 0<n => n */ i++;
z = (c == c12) ? 0 : n-1; }
swap(p[z], p[m]); }
template <typename T, template <typename T> class C>
const T & medianof3(C<T> & a, uint lo, uint mid, uint hi)
{
if (a[mid] < a[lo])
{
if (a[hi] < a[mid])
{
return a[mid];
} }
/* now p[m] is the median-of-three */ else
/* swap it to the beginning so it won't move around */ {
swap(p[0], p[m]); return (a[hi] < a[lo]) ? a[hi] : a[lo];
/* partition loop */
i=1;
j=n-1;
for(;;) {
/* handling of equality is crucial here */
/* for sentinels & efficiency with duplicates */
for (;;++i) {
c = p[i] > p[0];
if (!c) break;
}
a = &p[0];
for (;;--j) {
b=&p[j];
c = p[j] > p[0]
if (!c) break;
}
/* make sure we haven't crossed */
if (i >= j) break;
swap(p[i], p[j]);
++i;
--j;
}
/* recurse on smaller side, iterate on larger */
if (j < (n-i)) {
quickSort(p, j);
p = p+i;
n = n-i;
}
else {
quickSort(p+i, n-i);
n = j;
} }
} }
else
insertionSort(); {
} if (a[hi] < a[mid])
#endif // 0 {
return (a[hi] < a[lo]) ? a[lo] : a[hi];
}
else
{
return a[mid];
}
}
}
} // nv namespace } // nv namespace

View File

@ -24,11 +24,14 @@ SET(CORE_SRCS
TextReader.cpp TextReader.cpp
TextWriter.h TextWriter.h
TextWriter.cpp TextWriter.cpp
Tokenizer.h
Tokenizer.cpp
Radix.h Radix.h
Radix.cpp Radix.cpp
CpuInfo.h CpuInfo.h
CpuInfo.cpp CpuInfo.cpp
Algorithms.h Algorithms.h
Timer.h
Library.h Library.h
Library.cpp Library.cpp
FileSystem.h FileSystem.h
@ -36,6 +39,34 @@ SET(CORE_SRCS
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# 64-bit MSVC builds need the assembly implementation of vsscanf: assemble
# vsscanf_proxy_win64.masm with ml64 and add the resulting object to CORE_SRCS.
if(MSVC AND NV_SYSTEM_PROCESSOR STREQUAL "AMD64")
  set(VSSCANF_ASM_NAME "vsscanf_proxy_win64")
  set(VSSCANF_ASM_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${VSSCANF_ASM_NAME}.masm")

  # Pick the directory the object file is written to.
  if(MSVC_IDE)
    # $(IntDir) is a Visual Studio macro, expanded at build time to the
    # intermediate directory of the selected solution configuration.
    set(VSSCANF_ASM_INTDIR "$(IntDir)")
  else()
    # The NMake generator does not cope with a relative path for the generated
    # .obj source, so hand it the absolute one. This is a hack that worked as
    # of cmake 2.6.0 patch 0.
    get_filename_component(VSSCANF_ASM_INTDIR
      "${nvcore_BINARY_DIR}/CMakeFiles/nvcore.dir" ABSOLUTE)
  endif()

  set(VSSCANF_ASM_OBJ "${VSSCANF_ASM_INTDIR}/${VSSCANF_ASM_NAME}.obj")

  # Register the assembled object as a source of the library...
  list(APPEND CORE_SRCS ${VSSCANF_ASM_OBJ})

  # ...and tell the build how to produce it from the .masm file.
  add_custom_command(
    OUTPUT ${VSSCANF_ASM_OBJ}
    MAIN_DEPENDENCY ${VSSCANF_ASM_SRC}
    COMMAND ml64 /nologo /Fo ${VSSCANF_ASM_OBJ} /c /Cx ${VSSCANF_ASM_SRC}
  )
endif()
# targets # targets
ADD_DEFINITIONS(-DNVCORE_EXPORTS) ADD_DEFINITIONS(-DNVCORE_EXPORTS)

59
src/nvcore/Constraints.h Normal file
View File

@ -0,0 +1,59 @@
// This code is in the public domain -- castanyo@yahoo.es
// The include guard must be unique to this header; re-using the guard of
// Algorithms.h would make whichever of the two headers is included second
// expand to nothing.
#ifndef NV_CORE_CONSTRAINTS_H
#define NV_CORE_CONSTRAINTS_H
#include <nvcore/nvcore.h>
namespace nv
{
    // Compile-time constraints from "Imperfect C++".
    //
    // Each constraint is a struct with a static constraints() function whose
    // body only compiles when the requirement holds. Nothing happens at run
    // time; the check fires when constraints() is instantiated.

    // must_be_pod
    // constraints() places a T inside an anonymous union; the name of the
    // member is chosen so that it shows up in the compiler diagnostic.
    // NOTE(review): this relies on the rule that union members must be POD;
    // newer language standards relax that rule -- confirm the target standard.
    template <typename T>
    struct must_be_pod
    {
        static void constraints()
        {
            union { T T_is_not_POD_type; };
        }
    };

    // must_be_pod_or_void
    // Same check as must_be_pod, with an empty specialization for void.
    template <typename T>
    struct must_be_pod_or_void
    {
        static void constraints()
        {
            union { T T_is_not_POD_type; };
        }
    };
    // The void specialization has no members, so it imposes no check.
    template <> struct must_be_pod_or_void<void> {};

    // size_of
    // sizeof(T) as a compile-time constant, with void treated as size 0.
    template <typename T>
    struct size_of
    {
        enum { value = sizeof(T) };
    };
    template <>
    struct size_of<void>
    {
        enum { value = 0 };
    };

    // must_be_same_size
    // constraints() only compiles when size_of<T1> equals size_of<T2>.
    template <typename T1, typename T2>
    struct must_be_same_size
    {
        static void constraints()
        {
            // Use -1 rather than 0 for the failing case: a zero-length array
            // is accepted as an extension by some compilers, which would let
            // a size mismatch go unnoticed. A negative length is always an error.
            const int T1_not_same_size_as_T2 =
                (int(size_of<T1>::value) == int(size_of<T2>::value)) ? 1 : -1;
            int i[T1_not_same_size_as_T2];
            (void)i; // only declared for the check; silence unused warnings
        }
    };
} // nv namespace
#endif // NV_CORE_CONSTRAINTS_H

View File

@ -33,30 +33,75 @@ static bool isWow64()
#endif // NV_OS_WIN32 #endif // NV_OS_WIN32
#if NV_OS_LINUX
#include <string.h>
#include <sched.h>
#endif // NV_OS_LINUX
#if NV_OS_DARWIN
#include <sys/types.h>
#include <sys/sysctl.h>
#endif // NV_OS_DARWIN
// Initialize the data and the local defines, which are designed
// to match the positions in cpuid
// m_cpu caches the feature mask built by CpuInfo::cpu(); ~0x0 is the
// "not queried yet" sentinel that cpu() tests for before running cpuid.
uint CpuInfo::m_cpu = ~0x0;
// m_procCount caches the result of processorCount(); 0 means "not computed yet".
uint CpuInfo::m_procCount = 0;
// Feature bits. cpu() tests the MMX/SSE/SSE2 masks against CPUInfo[3] and the
// SSE3 mask against CPUInfo[2] of __cpuid(CPUInfo, 1), and stores the same
// bit values in m_cpu.
#define NV_CPUINFO_MMX_MASK (1<<23)
#define NV_CPUINFO_SSE_MASK (1<<25)
#define NV_CPUINFO_SSE2_MASK (1<<26)
#define NV_CPUINFO_SSE3_MASK (1)
uint CpuInfo::processorCount() uint CpuInfo::processorCount()
{ {
if (m_procCount == 0) {
#if NV_OS_WIN32 #if NV_OS_WIN32
SYSTEM_INFO sysInfo; SYSTEM_INFO sysInfo;
typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL); typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);
if (isWow64()) if (isWow64())
{ {
GetNativeSystemInfo(&sysInfo); GetNativeSystemInfo(&sysInfo);
} }
else else
{ {
GetSystemInfo(&sysInfo); GetSystemInfo(&sysInfo);
} }
uint count = (uint)sysInfo.dwNumberOfProcessors; uint count = (uint)sysInfo.dwNumberOfProcessors;
nvDebugCheck(count >= 1); m_procCount = count;
#elif NV_OS_LINUX
// Code from x264 (July 6 snapshot) cpu.c:271
uint bit;
uint np;
cpu_set_t p_aff;
memset( &p_aff, 0, sizeof(p_aff) );
sched_getaffinity( 0, sizeof(p_aff), &p_aff );
for( np = 0, bit = 0; bit < sizeof(p_aff); bit++ )
np += (((uint8 *)&p_aff)[bit / 8] >> (bit % 8)) & 1;
m_procCount = np;
#elif NV_OS_DARWIN
// Code from x264 (July 6 snapshot) cpu.c:286
uint numberOfCPUs;
size_t length = sizeof( numberOfCPUs );
if( sysctlbyname("hw.ncpu", &numberOfCPUs, &length, NULL, 0) )
{
numberOfCPUs = 1;
}
m_procCount = numberOfCPUs;
return count;
#else #else
return 1; m_procCount = 1;
#endif #endif
}
nvDebugCheck(m_procCount > 0);
return m_procCount;
} }
uint CpuInfo::coreCount() uint CpuInfo::coreCount()
@ -66,23 +111,52 @@ uint CpuInfo::coreCount()
bool CpuInfo::hasMMX() bool CpuInfo::hasMMX()
{ {
return false; return (cpu() & NV_CPUINFO_MMX_MASK) != 0;
} }
bool CpuInfo::hasSSE() bool CpuInfo::hasSSE()
{ {
return false; return (cpu() & NV_CPUINFO_SSE_MASK) != 0;
} }
bool CpuInfo::hasSSE2() bool CpuInfo::hasSSE2()
{ {
return false; return (cpu() & NV_CPUINFO_SSE2_MASK) != 0;
} }
bool CpuInfo::hasSSE3() bool CpuInfo::hasSSE3()
{ {
return false; return (cpu() & NV_CPUINFO_SSE3_MASK) != 0;
} }
/// Returns the cached CPU feature mask, querying the processor on first use.
///
/// The mask is a combination of the NV_CPUINFO_*_MASK bits. It is computed
/// once and stored in m_cpu; later calls return the stored value.
///
/// This definition is intentionally not marked inline: the class declares
/// cpu() as an ordinary static member, so every translation unit that sees
/// the class must be able to link against this one out-of-line definition.
int CpuInfo::cpu()
{
    // ~0x0 is the "not queried yet" sentinel m_cpu is initialized with.
    if (m_cpu == ~0x0) {
        m_cpu = 0;

#if NV_CC_MSVC
        int CPUInfo[4] = {-1};
        __cpuid(CPUInfo, /*InfoType*/ 1);

        // SSE3 is reported in CPUInfo[2]; MMX, SSE and SSE2 in CPUInfo[3].
        if (CPUInfo[2] & NV_CPUINFO_SSE3_MASK) {
            m_cpu |= NV_CPUINFO_SSE3_MASK;
        }
        if (CPUInfo[3] & NV_CPUINFO_MMX_MASK) {
            m_cpu |= NV_CPUINFO_MMX_MASK;
        }
        if (CPUInfo[3] & NV_CPUINFO_SSE_MASK) {
            m_cpu |= NV_CPUINFO_SSE_MASK;
        }
        if (CPUInfo[3] & NV_CPUINFO_SSE2_MASK) {
            m_cpu |= NV_CPUINFO_SSE2_MASK;
        }
#elif NV_CC_GNUC
        // TODO: add the proper inline assembly
        // Until then no feature bits are set on this compiler, so every
        // hasXXX() query reports false.
#if NV_CPU_X86
#elif NV_CPU_X86_64
#endif // NV_CPU_X86_64
#endif // NV_CC_GNUC
    }
    return m_cpu;
}

View File

@ -18,6 +18,15 @@ namespace nv
// CPU Information. // CPU Information.
class CpuInfo class CpuInfo
{ {
protected:
static int cpu();
private:
// Cache of the CPU data
static uint m_cpu;
static uint m_procCount;
public:
static uint processorCount(); static uint processorCount();
static uint coreCount(); static uint coreCount();
@ -25,7 +34,6 @@ namespace nv
static bool hasSSE(); static bool hasSSE();
static bool hasSSE2(); static bool hasSSE2();
static bool hasSSE3(); static bool hasSSE3();
}; };
#if NV_CC_MSVC #if NV_CC_MSVC

View File

@ -12,16 +12,15 @@
#elif NV_CC_MSVC #elif NV_CC_MSVC
#if NV_CPU_X86 // Uses SSE Intrinsics for both x86 and x86_64
#include <xmmintrin.h>
__forceinline void nvPrefetch(const void * mem) __forceinline void nvPrefetch(const void * mem)
{ {
__asm mov ecx, mem _mm_prefetch(static_cast<const char*>(mem), _MM_HINT_T0); /* prefetcht0 */
__asm prefetcht0 [ecx]; // _mm_prefetch(static_cast<const char*>(mem), _MM_HINT_NTA); /* prefetchnta */
// __asm prefetchnta [ecx];
} }
#endif // NV_CPU_X86 #else
#else // NV_CC_MSVC
// do nothing in other case. // do nothing in other case.
#define nvPrefetch(ptr) #define nvPrefetch(ptr)

View File

@ -34,11 +34,11 @@ class AutoPtr
NV_FORBID_HEAPALLOC(); NV_FORBID_HEAPALLOC();
public: public:
/// Default ctor.
AutoPtr() : m_ptr(NULL) { }
/// Ctor. /// Ctor.
explicit AutoPtr( T * p ) : m_ptr(p) { } AutoPtr(T * p = NULL) : m_ptr(p) { }
template <class Q>
AutoPtr(Q * p) : m_ptr(static_cast<T *>(p)) { }
/** Dtor. Deletes owned pointer. */ /** Dtor. Deletes owned pointer. */
~AutoPtr() { ~AutoPtr() {

View File

@ -7,6 +7,10 @@
*/ */
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// References:
// http://www.codercorner.com/RadixSortRevisited.htm
// http://www.stereopsis.com/radix.html
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/** /**
* Revisited Radix Sort. * Revisited Radix Sort.
@ -26,19 +30,25 @@
* - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here) * - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
* - 10.11.01: added local ram support * - 10.11.01: added local ram support
* - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting...... * - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
* - 01.02.02: - "mIndices" renamed => "mRanks". That's a rank sorter after all.
* - ranks are not "reset" anymore, but implicit on first calls
* - 07.05.02: offsets rewritten with one less indirection.
* - 11.03.02: "bool" replaced with RadixHint enum
* - 07.15.04: stack-based radix added
* - we want to use the radix sort but without making it static, and without allocating anything.
* - we internally allocate two arrays of ranks. Each of them has N uint32s to sort N values.
* - 1Mb/2/sizeof(uint32) = 131072 values max, at the same time.
* - 09.22.04: - adapted to MacOS by Chris Lamb
* - 01.12.06: - added optimizations suggested by Kyle Hubert
* - 04.06.08: - Fix negative zero sorting bug, by Ignacio Castaño
* *
* \class RadixSort * \class RadixSort
* \author Pierre Terdiman * \author Pierre Terdiman
* \version 1.3 * \version 1.5
* \date August, 15, 1998 * \date August, 15, 1998
*/ */
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/*
To do:
- add an offset parameter between two input values (avoid some data recopy sometimes)
- unroll ? asm ?
*/
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Header // Header
@ -49,138 +59,170 @@ To do:
//using namespace IceCore; //using namespace IceCore;
#define DELETEARRAY(a) { delete [] a; a = NULL; } #define INVALIDATE_RANKS mCurrentSize|=0x80000000
#define CHECKALLOC(a) #define VALIDATE_RANKS mCurrentSize&=0x7fffffff
#define CURRENT_SIZE (mCurrentSize&0x7fffffff)
#define INVALID_RANKS (mCurrentSize&0x80000000)
#if NV_BIG_ENDIAN
#define H0_OFFSET 768
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #define H1_OFFSET 512
/** #define H2_OFFSET 256
* Constructor. #define H3_OFFSET 0
*/ #define BYTES_INC (3-j)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #else
RadixSort::RadixSort() : mCurrentSize(0), mPreviousSize(0), mIndices(NULL), mIndices2(NULL), mTotalCalls(0), mNbHits(0) #define H0_OFFSET 0
{ #define H1_OFFSET 256
#ifndef RADIX_LOCAL_RAM #define H2_OFFSET 512
// Allocate input-independent ram #define H3_OFFSET 768
mHistogram = new uint32[256*4]; #define BYTES_INC j
mOffset = new uint32[256];
#endif #endif
// CREATE_HISTOGRAMS(type, buffer)
//
// Builds the four per-byte histograms for one sort call and, while doing so,
// checks whether the input is already in order.
//
//   type   - element type used for the ordering comparison (Val < PrevVal).
//   buffer - the values viewed as `type`; read sequentially when the ranks are
//            invalid, or through mRanks otherwise.
//
// Names that must be in scope where the macro is expanded: input, nb,
// mHistogram, mRanks, mNbHits and mCurrentSize (via INVALID_RANKS).
//
// The macro declares the locals p, pe, h0..h3 and AlreadySorted, so it can be
// expanded only once per scope. It also contains `return *this;`: when every
// value is found in non-decreasing order it bumps mNbHits and leaves the
// enclosing function early (after writing identity ranks if the ranks were
// invalid). If the order check fails part-way, the trailing loop finishes
// counting the remaining bytes without the comparison.
//
// Note that the INVALID_RANKS branch casts `buffer` to a non-const `type*`.
#define CREATE_HISTOGRAMS(type, buffer) \
/* Clear counters/histograms */ \
memset(mHistogram, 0, 256*4*sizeof(uint32)); \
\
/* Prepare to count */ \
const uint8* p = (const uint8*)input; \
const uint8* pe = &p[nb*4]; \
uint32* h0= &mHistogram[H0_OFFSET]; /* Histogram for first pass (LSB) */ \
uint32* h1= &mHistogram[H1_OFFSET]; /* Histogram for second pass */ \
uint32* h2= &mHistogram[H2_OFFSET]; /* Histogram for third pass */ \
uint32* h3= &mHistogram[H3_OFFSET]; /* Histogram for last pass (MSB) */ \
\
bool AlreadySorted = true; /* Optimism... */ \
\
if(INVALID_RANKS) \
{ \
/* Prepare for temporal coherence */ \
type* Running = (type*)buffer; \
type PrevVal = *Running; \
\
while(p!=pe) \
{ \
/* Read input buffer in sequential order (there are no valid ranks yet) */ \
type Val = *Running++; \
/* Check whether already sorted or not */ \
if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
/* Update for next iteration */ \
PrevVal = Val; \
\
/* Create histograms */ \
h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
} \
\
/* If all input values are already sorted, we just have to return and leave the */ \
/* previous list unchanged. That way the routine may take advantage of temporal */ \
/* coherence, for example when used to sort transparent faces. */ \
if(AlreadySorted) \
{ \
mNbHits++; \
for(uint32 i=0;i<nb;i++) mRanks[i] = i; \
return *this; \
} \
} \
else \
{ \
/* Prepare for temporal coherence */ \
const uint32* Indices = mRanks; \
type PrevVal = (type)buffer[*Indices]; \
\
while(p!=pe) \
{ \
/* Read input buffer in previous sorted order */ \
type Val = (type)buffer[*Indices++]; \
/* Check whether already sorted or not */ \
if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
/* Update for next iteration */ \
PrevVal = Val; \
\
/* Create histograms */ \
h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
} \
\
/* If all input values are already sorted, we just have to return and leave the */ \
/* previous list unchanged. That way the routine may take advantage of temporal */ \
/* coherence, for example when used to sort transparent faces. */ \
if(AlreadySorted) { mNbHits++; return *this; } \
} \
\
/* Else there has been an early out and we must finish computing the histograms */ \
while(p!=pe) \
{ \
/* Create histograms without the previous overhead */ \
h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
}
// CHECK_PASS_VALIDITY(pass)
//
// Decides whether radix pass number `pass` can be skipped.
// Declares three locals in the expanding scope:
//   CurCount    - pointer to the 256 counters at mHistogram[pass<<8]
//   PerformPass - true unless a single bucket holds all nb values
//   UniqueVal   - the byte at offset `pass` of the first input value
// Names that must be in scope where the macro is expanded: mHistogram, input, nb.
//
// NOTE(review): the probe byte is read at offset `pass` regardless of
// NV_BIG_ENDIAN, whereas the sorting loops offset the input by BYTES_INC.
// On big-endian builds this looks like it probes a different byte than the
// one counted in CurCount; that should only cost a missed skip, since any
// bucket equal to nb already means the byte is uniform -- confirm.
#define CHECK_PASS_VALIDITY(pass) \
/* Shortcut to current counters */ \
const uint32* CurCount = &mHistogram[pass<<8]; \
\
/* Reset flag. The sorting pass is supposed to be performed. (default) */ \
bool PerformPass = true; \
\
/* Check pass validity */ \
\
/* If all values have the same byte, sorting is useless. */ \
/* It may happen when sorting bytes or words instead of dwords. */ \
/* This routine actually sorts words faster than dwords, and bytes */ \
/* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */ \
/* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
\
/* Get first byte */ \
uint8 UniqueVal = *(((uint8*)input)+pass); \
\
/* Check that byte's counter */ \
if(CurCount[UniqueVal]==nb) PerformPass=false;
using namespace nv;
/// Constructor.
RadixSort::RadixSort() : mRanks(NULL), mRanks2(NULL), mCurrentSize(0), mTotalCalls(0), mNbHits(0), mDeleteRanks(true)
{
// Initialize indices // Initialize indices
resetIndices(); INVALIDATE_RANKS;
} }
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// Destructor.
/**
* Destructor.
*/
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
RadixSort::~RadixSort() RadixSort::~RadixSort()
{ {
// Release everything // Release everything
#ifndef RADIX_LOCAL_RAM if(mDeleteRanks)
DELETEARRAY(mOffset); {
DELETEARRAY(mHistogram); delete [] mRanks2;
#endif delete [] mRanks;
DELETEARRAY(mIndices2); }
DELETEARRAY(mIndices);
} }
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// Resizes the inner lists.
/** /// \param nb [in] new size (number of dwords)
* Resizes the inner lists. /// \return true if success
* \param nb [in] new size (number of dwords)
* \return true if success
*/
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
bool RadixSort::resize(uint32 nb) bool RadixSort::resize(uint32 nb)
{ {
// Free previously used ram if(mDeleteRanks)
DELETEARRAY(mIndices2); {
DELETEARRAY(mIndices); // Free previously used ram
delete [] mRanks2;
// Get some fresh one delete [] mRanks;
mIndices = new uint32[nb]; CHECKALLOC(mIndices);
mIndices2 = new uint32[nb]; CHECKALLOC(mIndices2);
mCurrentSize = nb;
// Initialize indices so that the input buffer is read in sequential order
resetIndices();
// Get some fresh one
mRanks = new uint32[nb];
mRanks2 = new uint32[nb];
}
return true; return true;
} }
#define CHECK_RESIZE(n) \ inline void RadixSort::checkResize(uint32 nb)
if(n!=mPreviousSize) \ {
{ \ uint32 CurSize = CURRENT_SIZE;
if(n>mCurrentSize) resize(n); \ if(nb!=CurSize)
else resetIndices(); \ {
mPreviousSize = n; \ if(nb>CurSize) resize(nb);
mCurrentSize = nb;
INVALIDATE_RANKS;
} }
}
#define CREATE_HISTOGRAMS(type, buffer) \
/* Clear counters */ \
memset(mHistogram, 0, 256*4*sizeof(uint32)); \
\
/* Prepare for temporal coherence */ \
type PrevVal = (type)buffer[mIndices[0]]; \
bool AlreadySorted = true; /* Optimism... */ \
uint32* Indices = mIndices; \
\
/* Prepare to count */ \
uint8* p = (uint8*)input; \
uint8* pe = &p[nb*4]; \
uint32* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */ \
uint32* h1= &mHistogram[256]; /* Histogram for second pass */ \
uint32* h2= &mHistogram[512]; /* Histogram for third pass */ \
uint32* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */ \
\
while(p!=pe) \
{ \
/* Read input buffer in previous sorted order */ \
type Val = (type)buffer[*Indices++]; \
/* Check whether already sorted or not */ \
if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
/* Update for next iteration */ \
PrevVal = Val; \
\
/* Create histograms */ \
h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
} \
\
/* If all input values are already sorted, we just have to return and leave the */ \
/* previous list unchanged. That way the routine may take advantage of temporal */ \
/* coherence, for example when used to sort transparent faces. */ \
if(AlreadySorted) { mNbHits++; return *this; } \
\
/* Else there has been an early out and we must finish computing the histograms */ \
while(p!=pe) \
{ \
/* Create histograms without the previous overhead */ \
h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
}
#define CHECK_PASS_VALIDITY(pass) \
/* Shortcut to current counters */ \
uint32* CurCount = &mHistogram[pass<<8]; \
\
/* Reset flag. The sorting pass is supposed to be performed. (default) */ \
bool PerformPass = true; \
\
/* Check pass validity */ \
\
/* If all values have the same byte, sorting is useless. */ \
/* It may happen when sorting bytes or words instead of dwords. */ \
/* This routine actually sorts words faster than dwords, and bytes */ \
/* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */ \
/* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
\
/* Get first byte */ \
uint8 UniqueVal = *(((uint8*)input)+pass); \
\
/* Check that byte's counter */ \
if(CurCount[UniqueVal]==nb) PerformPass=false;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/** /**
@ -192,46 +234,31 @@ bool RadixSort::resize(uint32 nb)
* \return Self-Reference * \return Self-Reference
*/ */
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues) RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedValues/*=true*/)
{ {
uint32 i, j;
// Checkings // Checkings
if(!input || !nb) return *this; if(!input || !nb || nb&0x80000000) return *this;
// Stats // Stats
mTotalCalls++; mTotalCalls++;
// Resize lists if needed // Resize lists if needed
CHECK_RESIZE(nb); checkResize(nb);
#ifdef RADIX_LOCAL_RAM
// Allocate histograms & offsets on the stack // Allocate histograms & offsets on the stack
uint32 mHistogram[256*4]; uint32 mHistogram[256*4];
uint32 mOffset[256]; uint32* mLink[256];
#endif
// Create histograms (counters). Counters for all passes are created in one run. // Create histograms (counters). Counters for all passes are created in one run.
// Pros: read input buffer once instead of four times // Pros: read input buffer once instead of four times
// Cons: mHistogram is 4Kb instead of 1Kb // Cons: mHistogram is 4Kb instead of 1Kb
// We must take care of signed/unsigned values for temporal coherence.... I just // We must take care of signed/unsigned values for temporal coherence.... I just
// have 2 code paths even if just a single opcode changes. Self-modifying code, someone? // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
if(!signedvalues) { CREATE_HISTOGRAMS(uint32, input); } if(!signedValues) { CREATE_HISTOGRAMS(uint32, input); }
else { CREATE_HISTOGRAMS(int32, input); } else { CREATE_HISTOGRAMS(int32, input); }
// Compute #negative values involved if needed
uint32 NbNegativeValues = 0;
if(signedvalues)
{
// An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
uint32* h3= &mHistogram[768];
for( i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
}
// Radix sort, j is the pass number (0=LSB, 3=MSB) // Radix sort, j is the pass number (0=LSB, 3=MSB)
for( j=0;j<4;j++) for(uint32 j=0;j<4;j++)
{ {
CHECK_PASS_VALIDITY(j); CHECK_PASS_VALIDITY(j);
@ -240,40 +267,47 @@ RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues)
if(PerformPass) if(PerformPass)
{ {
// Should we care about negative values? // Should we care about negative values?
if(j!=3 || !signedvalues) if(j!=3 || !signedValues)
{ {
// Here we deal with positive values only // Here we deal with positive values only
// Create offsets // Create offsets
mOffset[0] = 0; mLink[0] = mRanks2;
for(i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; for(uint32 i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
} }
else else
{ {
// This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place. // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
mLink[128] = mRanks2;
for(uint32 i=129;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
// Create biased offsets, in order for negative numbers to be sorted as well mLink[0] = mLink[255] + CurCount[255];
mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones for(uint32 i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
for(i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
// Fixing the wrong place for negative values
mOffset[128] = 0;
for(i=129;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
} }
// Perform Radix Sort // Perform Radix Sort
uint8* InputBytes = (uint8*)input; const uint8* InputBytes = (const uint8*)input;
uint32* Indices = mIndices; InputBytes += BYTES_INC;
uint32* IndicesEnd = &mIndices[nb]; if(INVALID_RANKS)
InputBytes += j;
while(Indices!=IndicesEnd)
{ {
uint32 id = *Indices++; for(uint32 i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
mIndices2[mOffset[InputBytes[id<<2]]++] = id; VALIDATE_RANKS;
}
else
{
const uint32* Indices = mRanks;
const uint32* IndicesEnd = &mRanks[nb];
while(Indices!=IndicesEnd)
{
uint32 id = *Indices++;
*mLink[InputBytes[id<<2]]++ = id;
}
} }
// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap. // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp; uint32* Tmp = mRanks;
mRanks = mRanks2;
mRanks2 = Tmp;
} }
} }
return *this; return *this;
@ -291,24 +325,20 @@ RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
RadixSort& RadixSort::sort(const float* input2, uint32 nb) RadixSort& RadixSort::sort(const float* input2, uint32 nb)
{ {
uint32 i, j;
// Checkings // Checkings
if(!input2 || !nb) return *this; if(!input2 || !nb || nb&0x80000000) return *this;
// Stats // Stats
mTotalCalls++; mTotalCalls++;
uint32* input = (uint32*)input2; const uint32* input = (const uint32*)input2;
// Resize lists if needed // Resize lists if needed
CHECK_RESIZE(nb); checkResize(nb);
#ifdef RADIX_LOCAL_RAM
// Allocate histograms & offsets on the stack // Allocate histograms & offsets on the stack
uint32 mHistogram[256*4]; uint32 mHistogram[256*4];
uint32 mOffset[256]; uint32* mLink[256];
#endif
// Create histograms (counters). Counters for all passes are created in one run. // Create histograms (counters). Counters for all passes are created in one run.
// Pros: read input buffer once instead of four times // Pros: read input buffer once instead of four times
@ -320,16 +350,8 @@ RadixSort& RadixSort::sort(const float* input2, uint32 nb)
// wouldn't work with mixed positive/negative values.... // wouldn't work with mixed positive/negative values....
{ CREATE_HISTOGRAMS(float, input2); } { CREATE_HISTOGRAMS(float, input2); }
// Compute #negative values involved if needed
uint32 NbNegativeValues = 0;
// An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
uint32* h3= &mHistogram[768];
for( i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
// Radix sort, j is the pass number (0=LSB, 3=MSB) // Radix sort, j is the pass number (0=LSB, 3=MSB)
for( j=0;j<4;j++) for(uint32 j=0;j<4;j++)
{ {
// Should we care about negative values? // Should we care about negative values?
if(j!=3) if(j!=3)
@ -340,22 +362,32 @@ RadixSort& RadixSort::sort(const float* input2, uint32 nb)
if(PerformPass) if(PerformPass)
{ {
// Create offsets // Create offsets
mOffset[0] = 0; mLink[0] = mRanks2;
for( i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; for(uint32 i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
// Perform Radix Sort // Perform Radix Sort
uint8* InputBytes = (uint8*)input; const uint8* InputBytes = (const uint8*)input;
uint32* Indices = mIndices; InputBytes += BYTES_INC;
uint32* IndicesEnd = &mIndices[nb]; if(INVALID_RANKS)
InputBytes += j;
while(Indices!=IndicesEnd)
{ {
uint32 id = *Indices++; for(uint32 i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
mIndices2[mOffset[InputBytes[id<<2]]++] = id; VALIDATE_RANKS;
}
else
{
const uint32* Indices = mRanks;
const uint32* IndicesEnd = &mRanks[nb];
while(Indices!=IndicesEnd)
{
uint32 id = *Indices++;
*mLink[InputBytes[id<<2]]++ = id;
}
} }
// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap. // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp; uint32* Tmp = mRanks;
mRanks = mRanks2;
mRanks2 = Tmp;
} }
} }
else else
@ -365,35 +397,58 @@ RadixSort& RadixSort::sort(const float* input2, uint32 nb)
if(PerformPass) if(PerformPass)
{ {
// Create biased offsets, in order for negative numbers to be sorted as well mLink[255] = mRanks2 + CurCount[255];
mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones for(uint32 i = 254; i > 126; i--) mLink[i] = mLink[i+1] + CurCount[i];
for(i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers mLink[0] = mLink[127] + CurCount[127];
for(uint32 i = 1; i < 127; i++) mLink[i] = mLink[i-1] + CurCount[i-1];
// We must reverse the sorting order for negative numbers!
mOffset[255] = 0;
for(i=0;i<127;i++) mOffset[254-i] = mOffset[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
for(i=128;i<256;i++) mOffset[i] += CurCount[i]; // Fixing the wrong place for negative values
// Perform Radix Sort // Perform Radix Sort
for(i=0;i<nb;i++) if(INVALID_RANKS)
{ {
uint32 Radix = input[mIndices[i]]>>24; // Radix byte, same as above. AND is useless here (uint32). for(uint32 i=0;i<nb;i++)
// ### cmp to be killed. Not good. Later. {
if(Radix<128) mIndices2[mOffset[Radix]++] = mIndices[i]; // Number is positive, same as above uint32 Radix = input[i]>>24; // Radix byte, same as above. AND is useless here (uint32).
else mIndices2[--mOffset[Radix]] = mIndices[i]; // Number is negative, flip the sorting order // ### cmp to be killed. Not good. Later.
if(Radix<128) *mLink[Radix]++ = i; // Number is positive, same as above
else *(--mLink[Radix]) = i; // Number is negative, flip the sorting order
}
VALIDATE_RANKS;
} }
// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap. else
uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp; {
for(uint32 i=0;i<nb;i++)
{
uint32 Radix = input[mRanks[i]]>>24; // Radix byte, same as above. AND is useless here (uint32).
// ### cmp to be killed. Not good. Later.
if(Radix<128) *mLink[Radix]++ = mRanks[i]; // Number is positive, same as above
else *(--mLink[Radix]) = mRanks[i]; // Number is negative, flip the sorting order
}
}
// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
uint32* Tmp = mRanks;
mRanks = mRanks2;
mRanks2 = Tmp;
} }
else else
{ {
// The pass is useless, yet we still have to reverse the order of current list if all values are negative. // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
if(UniqueVal>=128) if(UniqueVal>=128)
{ {
for(i=0;i<nb;i++) mIndices2[i] = mIndices[nb-i-1]; if(INVALID_RANKS)
{
// ###Possible?
for(uint32 i=0;i<nb;i++) mRanks2[i] = nb-i-1;
VALIDATE_RANKS;
}
else
{
for(uint32 i=0;i<nb;i++) mRanks2[i] = mRanks[nb-i-1];
}
// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap. // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp; uint32* Tmp = mRanks;
mRanks = mRanks2;
mRanks2 = Tmp;
} }
} }
} }
@ -401,29 +456,29 @@ RadixSort& RadixSort::sort(const float* input2, uint32 nb)
return *this; return *this;
} }
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/** bool RadixSort::setRankBuffers(uint32* ranks0, uint32* ranks1)
* Resets the inner indices. After the call, mIndices is reset.
*/
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void RadixSort::resetIndices()
{ {
for(uint32 i=0;i<mCurrentSize;i++) mIndices[i] = i; if(!ranks0 || !ranks1) return false;
mRanks = ranks0;
mRanks2 = ranks1;
mDeleteRanks = false;
return true;
} }
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// RadixSort & RadixSort::sort(const Array<int> & input)
/**
* Gets the ram used.
* \return memory used in bytes
*/
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
uint32 RadixSort::usedRam() const
{ {
uint32 UsedRam = sizeof(RadixSort); return sort((const uint32 *)input.buffer(), input.count(), true);
#ifndef RADIX_LOCAL_RAM }
UsedRam += 256*4*sizeof(uint32); // Histograms
UsedRam += 256*sizeof(uint32); // Offsets RadixSort & RadixSort::sort(const Array<uint> & input)
#endif {
UsedRam += 2*mCurrentSize*sizeof(uint32); // 2 lists of indices return sort(input.buffer(), input.count(), false);
return UsedRam; }
RadixSort & RadixSort::sort(const Array<float> & input)
{
return sort(input.buffer(), input.count());
} }

View File

@ -13,57 +13,61 @@
#define NV_CORE_RADIXSORT_H #define NV_CORE_RADIXSORT_H
#include <nvcore/nvcore.h> #include <nvcore/nvcore.h>
#include <nvcore/Containers.h>
namespace nv
{
class NVCORE_CLASS RadixSort
{
NV_FORBID_COPY(RadixSort);
public:
// Constructor/Destructor
RadixSort();
~RadixSort();
// Sorting methods
RadixSort & sort(const uint32* input, uint32 nb, bool signedValues=true);
RadixSort & sort(const float* input, uint32 nb);
// Helpers
RadixSort & sort(const Array<int> & input);
RadixSort & sort(const Array<uint> & input);
RadixSort & sort(const Array<float> & input);
#define RADIX_LOCAL_RAM //! Access to results. mRanks is a list of indices in sorted order, i.e. in the order you may further process your data
inline /*const*/ uint32 * ranks() /*const*/ { return mRanks; }
//! mRanks2 gets trashed on calling the sort routine, but otherwise you can recycle it the way you want.
inline uint32 * recyclable() const { return mRanks2; }
class NVCORE_API RadixSort { // Stats
NV_FORBID_COPY(RadixSort); //! Returns the total number of calls to the radix sorter.
public: inline uint32 totalCalls() const { return mTotalCalls; }
// Constructor/Destructor
RadixSort();
~RadixSort();
// Sorting methods //! Returns the number of early exits due to temporal coherence.
RadixSort & sort(const uint32* input, uint32 nb, bool signedvalues=true); inline uint32 hits() const { return mNbHits; }
RadixSort & sort(const float* input, uint32 nb);
//! Access to results. mIndices is a list of indices in sorted order, i.e. in the order you may further process your data
inline uint32 * indices() const { return mIndices; }
//! mIndices2 gets trashed on calling the sort routine, but otherwise you can recycle it the way you want.
inline uint32 * recyclable() const { return mIndices2; }
// Stats
uint32 usedRam() const;
//! Returns the total number of calls to the radix sorter.
inline uint32 totalCalls() const { return mTotalCalls; }
//! Returns the number of premature exits due to temporal coherence.
inline uint32 hits() const { return mNbHits; }
bool setRankBuffers(uint32* ranks0, uint32* ranks1);
private: private:
#ifndef RADIX_LOCAL_RAM uint32 mCurrentSize; //!< Current size of the indices list
uint32* mHistogram; //!< Counters for each byte uint32 * mRanks; //!< Two lists, swapped each pass
uint32* mOffset; //!< Offsets (nearly a cumulative distribution function) uint32 * mRanks2;
#endif
uint32 mCurrentSize; //!< Current size of the indices list
uint32 mPreviousSize; //!< Size involved in previous call
uint32* mIndices; //!< Two lists, swapped each pass
uint32* mIndices2;
// Stats // Stats
uint32 mTotalCalls; uint32 mTotalCalls; //!< Total number of calls to the sort routine
uint32 mNbHits; uint32 mNbHits; //!< Number of early exits due to coherence
// Internal methods // Stack-radix
bool resize(uint32 nb); bool mDeleteRanks; //!<
void resetIndices();
}; // Internal methods
void checkResize(uint32 nb);
bool resize(uint32 nb);
};
} // nv namespace
#endif // NV_CORE_RADIXSORT_H #endif // NV_CORE_RADIXSORT_H

View File

@ -208,49 +208,12 @@ StringBuilder::StringBuilder( const StringBuilder & s ) : m_size(0), m_str(NULL)
copy(s); copy(s);
} }
// Copy string. /** Copy string. */
/*StringBuilder::StringBuilder( const char * s ) StringBuilder::StringBuilder( const char * s )
{ {
copy(s); copy(s);
}*/
/** Allocate and copy string. */
StringBuilder::StringBuilder( int size_hint, const StringBuilder & s) : m_size(size_hint), m_str(NULL)
{
nvDebugCheck(m_size > 0);
m_str = strAlloc(m_size);
copy(s);
} }
/** Allocate and format string. */
StringBuilder::StringBuilder( const char * fmt, ... ) : m_size(0), m_str(NULL)
{
nvDebugCheck(fmt != NULL);
va_list arg;
va_start( arg, fmt );
format( fmt, arg );
va_end( arg );
}
/** Allocate and format string. */
StringBuilder::StringBuilder( int size_hint, const char * fmt, ... ) : m_size(size_hint), m_str(NULL)
{
nvDebugCheck(m_size > 0);
nvDebugCheck(fmt != NULL);
m_str = strAlloc(m_size);
va_list arg;
va_start( arg, fmt );
format( fmt, arg );
va_end( arg );
}
/** Delete the string. */ /** Delete the string. */
StringBuilder::~StringBuilder() StringBuilder::~StringBuilder()
{ {
@ -278,8 +241,7 @@ StringBuilder & StringBuilder::format( const char * fmt, ... )
/** Format a string safely. */ /** Format a string safely. */
StringBuilder & StringBuilder::format( const char * fmt, va_list arg ) StringBuilder & StringBuilder::format( const char * fmt, va_list arg )
{ {
nvCheck(fmt != NULL); nvDebugCheck(fmt != NULL);
nvCheck(m_size >= 0);
if( m_size == 0 ) { if( m_size == 0 ) {
m_size = 64; m_size = 64;
@ -327,8 +289,7 @@ StringBuilder & StringBuilder::format( const char * fmt, va_list arg )
/** Append a string. */ /** Append a string. */
StringBuilder & StringBuilder::append( const char * s ) StringBuilder & StringBuilder::append( const char * s )
{ {
nvCheck(s != NULL); nvDebugCheck(s != NULL);
nvCheck(m_size >= 0);
const uint slen = uint(strlen( s )); const uint slen = uint(strlen( s ));
@ -475,31 +436,6 @@ void StringBuilder::reset()
} }
Path::Path(const char * fmt, ...)
{
nvDebugCheck( fmt != NULL );
va_list arg;
va_start( arg, fmt );
format( fmt, arg );
va_end( arg );
}
Path::Path(int size_hint, const char * fmt, ...) : StringBuilder(size_hint)
{
nvDebugCheck( fmt != NULL );
va_list arg;
va_start( arg, fmt );
format( fmt, arg );
va_end( arg );
}
/// Get the file name from a path. /// Get the file name from a path.
const char * Path::fileName() const const char * Path::fileName() const
{ {

View File

@ -45,11 +45,8 @@ namespace nv
StringBuilder(); StringBuilder();
explicit StringBuilder( int size_hint ); explicit StringBuilder( int size_hint );
//StringBuilder( const char * str ); StringBuilder( const char * str );
StringBuilder( const StringBuilder & ); StringBuilder( const StringBuilder & );
StringBuilder( int size_hint, const StringBuilder & );
StringBuilder( const char * format, ... ) __attribute__((format (printf, 2, 3)));
StringBuilder( int size_hint, const char * format, ... ) __attribute__((format (printf, 3, 4)));
~StringBuilder(); ~StringBuilder();
@ -120,24 +117,17 @@ namespace nv
char * m_str; char * m_str;
}; };
/// Path string.
/// Path string. @@ This should be called PathBuilder.
class NVCORE_CLASS Path : public StringBuilder class NVCORE_CLASS Path : public StringBuilder
{ {
public: public:
Path() : StringBuilder() {} Path() : StringBuilder() {}
explicit Path(int size_hint) : StringBuilder(size_hint) {} explicit Path(int size_hint) : StringBuilder(size_hint) {}
//Path(const char * str) : StringBuilder((const char *)str) {} Path( const char * str ) : StringBuilder(str) {}
Path(const StringBuilder & str) : StringBuilder(str) {} Path(const StringBuilder & str) : StringBuilder(str) {}
Path(int size_hint, const StringBuilder & str) : StringBuilder(size_hint, str) {}
Path(const char * format, ...) __attribute__((format (printf, 2, 3)));
Path(int size_hint, const char * format, ...) __attribute__((format (printf, 3, 4)));
Path & operator=( const char * s ) {
return (Path &)copy(s);
}
const char * fileName() const; const char * fileName() const;
const char * extension() const; const char * extension() const;
@ -145,11 +135,11 @@ namespace nv
void stripFileName(); void stripFileName();
void stripExtension(); void stripExtension();
// statics // statics
NVCORE_API static char separator(); static char separator();
NVCORE_API static const char * fileName(const char *); static const char * fileName(const char *);
NVCORE_API static const char * extension(const char *); static const char * extension(const char *);
}; };

22
src/nvcore/Timer.h Normal file
View File

@ -0,0 +1,22 @@
// This code is in the public domain -- castano@gmail.com
#ifndef NV_CORE_TIMER_H
#define NV_CORE_TIMER_H
#include <nvcore/nvcore.h>
#include <time.h> //clock
class NVCORE_CLASS Timer
{
public:
Timer() {}
void start() { m_start = clock(); }
int elapsed() const { return (1000 * (clock() - m_start)) / CLOCKS_PER_SEC; }
private:
clock_t m_start;
};
#endif // NV_CORE_TIMER_H

View File

@ -8,7 +8,7 @@
#include <stdlib.h> // atof, atoi #include <stdlib.h> // atof, atoi
#if NV_CC_MSVC #if NV_CC_MSVC
#if 0 // This doesn't work on MSVC for x64 #if defined NV_CPU_X86
/* vsscanf for Win32 /* vsscanf for Win32
* Written 5/2003 by <mgix@mgix.com> * Written 5/2003 by <mgix@mgix.com>
* This code is in the Public Domain * This code is in the Public Domain
@ -56,9 +56,39 @@ static int vsscanf(const char * buffer, const char * format, va_list argPtr)
} }
return result; return result;
} }
#elif defined NV_CPU_X86_64
/* Prototype of the helper assembly function */
#ifdef __cplusplus
extern "C" {
#endif #endif
int vsscanf_proxy_win64(const char * buffer, const char * format, va_list argPtr, __int64 count);
#ifdef __cplusplus
}
#endif #endif
/* MASM64 version of the above vsscanf */
/// Win64 replacement for vsscanf: counts the conversions in @a format and
/// forwards everything to the assembly helper vsscanf_proxy_win64.
static int vsscanf(const char * buffer, const char * format, va_list argPtr)
{
    // Upper bound for the number of arguments: every '%' that is not
    // directly followed by '*' or '%' is counted as one.
    __int64 count = 0;
    for (const char * cursor = format; *cursor != 0; ++cursor)
    {
        if (*cursor != '%') continue;

        const char next = cursor[1];
        if (next != '*' && next != '%') ++count;
    }

    return vsscanf_proxy_win64(buffer, format, argPtr, count);
}
/*#error vsscanf doesn't work on MSVC for x64*/
#else
#error Unknown processor for MSVC
#endif
#endif // NV_CC_MSVC
using namespace nv; using namespace nv;
Token::Token() : Token::Token() :

View File

@ -0,0 +1,124 @@
; MASM x64 version of
; vsscanf for Win32
; originally written 5/2003 by <mgix@mgix.com>
;
; This was done because MSVC does not accept inline assembly code
; for the x64 platform, so this file implements almost the whole
; module in assembly using the amd64 ABI
;
; 06/17/2008 by edgarv [at] nvidia com
; Definition of memcpy
memcpy PROTO dest:Ptr, src:Ptr, numbytes:QWORD
; Definition of sscanf
sscanf PROTO buffer:Ptr Byte, format:Ptr Byte, args:VARARG
; Start a code segment named "_TEXT" by default
.CODE
; Entry point of our function: at this point we can use
; named parameters
ALIGN 16
PUBLIC vsscanf_proxy_win64
; Because the x64 code uses the fast call convention, only
; the arguments beyond the 4th one are available from the stack.
; The first four parameters are in RCX, RDX, R8 and R9
; Parameters:
; const char* buffer
; const char* format
; va_list argPtr
; size_t count
; vsscanf_proxy_win64(buffer, format, argPtr, count)
; Builds a temporary argument block on the stack (buffer, format, then
; `count` 8-byte values copied from argPtr with memcpy) and calls sscanf
; with rsp pointing at that block. The value sscanf leaves in rax is returned.
vsscanf_proxy_win64 PROC, \
buffer:PTR Byte, format:PTR Byte, argPtr:PTR, count:QWORD
; Allocates space for our local variable, savedRSP
; NOTE(review): presumably this is the slot addressed as [rbp - 8] below, which
; relies on the assembler-generated prologue having set up rbp -- TODO confirm
sub rsp, 08h
; Copies the parameters from the registers to the memory: before warping to
; sscanf we will call memcpy, and those registers can just disappear!
mov buffer, rcx
mov format, rdx
mov argPtr, r8
mov count, r9
; Allocate extra space in the stack for (2+count)*sizeof(void*),
; this is (2+count)*(8), rounded up to a multiple of 16
mov r10, r9
add r10, 2 ; count += 2
sal r10, 3 ; count *= 8
add r10, 0fh ; To force alignment to 16bytes
and r10, 0fffffffffffffff0h
sub rsp, r10 ; Actual stack allocation
; Continues by copying all the arguments in the "alloca" space
mov [rsp], rcx ; newStack[0] = (void*)buffer;
mov [rsp + 08h], rdx ; newStack[1] = (void*)format;
; Calls memcpy(newStack+2, argPtr, count*sizeof(void*));
mov rcx, rsp
add rcx, 010h ; newStack+2
mov rdx, r8 ; argPtr
mov r8, r9
sal r8, 3 ; count*sizeof(void*)
; Prepares extra stack space as required by the ABI for 4 arguments, and calls memcpy
sub rsp, 020h
call memcpy
; Restore the stack
add rsp, 020h
; Saves rsp in memory
mov qword ptr [rbp - 8], rsp
; Does exactly the same trick as before: warp into system sscanf with the new stack,
; but this time we also setup the arguments in the registers according to the amd64 ABI
; If there was at least one argument (after buffer and format), we need to copy that
; to r8, and if there was a second one we must copy that to r9
; (the first arguments to sscanf are always the buffer and the format)
; Note: from here on r10 holds count, no longer the rounded allocation size
mov r10, count
; Copy the first argument to r8 (if it exists)
cmp r10, 0
je args_memcpy
mov r8, [rsp + 10h]
; Copy the second argument to r9 (if it exists)
cmp r10, 1
je args_memcpy
mov r9, [rsp + 18h]
args_memcpy:
; Copies the buffer and format to rcx and rdx
mov rdx, [rsp + 08h]
mov rcx, [rsp]
; Finally, calls sscanf using the current stack
call sscanf
; At this point the return value is already in rax
; Restores rsp
mov rsp, qword ptr [rbp - 8]
; Undoes the alloca
; NOTE(review): r10 was reloaded with count above and is a volatile register
; across the sscanf call, so this does not add back the rounded allocation
; size; presumably the assembler-generated epilogue restores rsp -- TODO confirm
add rsp, r10
; Restores the space for local variables
add rsp, 08h
; Remember, the return value is already in rax since the sscanf call
ret
vsscanf_proxy_win64 ENDP
END