Merge internal branch.

- Remove old/unused code.
- Remove format string constructors.
- Better win64 support (vsscanf, prefetch, etc).
- Fix radix sort to sort -0 correctly.
- Add misc utilities (constraints, timer, cpuinfo, introsort).
2008-12-29 11:20:06 +00:00 · 2008-12-29 11:20:06 +00:00 · e5ae0c0e20
commit e5ae0c0e20
parent a03411e451
14 changed files with 801 additions and 462 deletions
--- a/src/nvcore/Algorithms.h
+++ b/src/nvcore/Algorithms.h
@ -3,6 +3,8 @@
 #ifndef NV_CORE_ALGORITHMS_H
 #define NV_CORE_ALGORITHMS_H

+#include <nvcore/nvcore.h>
+
 namespace nv
 {

@ -45,22 +47,42 @@ namespace nv
 		}
 	}
 	
-	
-	// @@ Swap should be implemented here.
-	
-	
-#if 0
-	// This does not use swap, but copies, in some cases swaps are much faster than copies!
-	// Container should implement operator[], and size()
-	template <class Container, class T>
-	void insertionSort(Container<T> & container)
+	// @@ Should swap be implemented here?
+
+
+
+	/// Sorts all elements of @a container in ascending order.
+	///
+	/// Runs introsortLoop() over [0, container.count()), which leaves runs of
+	/// at most 16 elements unordered, and then finishes with one insertionSort()
+	/// pass over the same range.
+	///
+	/// @param container  Container providing count() and operator[].
+	template <typename T, template <typename> class C>
+	void sort(C<T> & container)
+	{
+		const uint count = container.count();
+
+		// Fewer than two elements are already sorted. The guard is required for
+		// the empty case: insertionSort(container, 0, 0) starts at index 1 and
+		// only stops when the index equals 0, so it would run out of bounds.
+		if (count > 1)
+		{
+			introsortLoop(container, 0, count);
+			insertionSort(container, 0, count);
+		}
+	}
+
+	/// Sorts the index range [begin, end) of @a container in ascending order.
+	///
+	/// Does nothing when begin >= end. Otherwise runs introsortLoop() followed
+	/// by an insertionSort() pass over the same range.
+	///
+	/// @param container  Container providing operator[].
+	/// @param begin      Index of the first element to sort.
+	/// @param end        Index one past the last element to sort.
+	template <typename T, template <typename> class C>
+	void sort(C<T> & container, uint begin, uint end)
+	{
+		if (begin < end)
+		{
+			introsortLoop(container, begin, end);
+			insertionSort(container, begin, end);
+		}
+	}
+
+	/// Insertion-sorts all elements of @a container in ascending order.
+	///
+	/// @param container  Container providing count() and operator[].
+	template <typename T, template <typename> class C>
+	void insertionSort(C<T> & container)
+	{
+		const uint count = container.count();
+
+		// The ranged overload starts at begin + 1 and stops when the index
+		// equals end, so an empty range (0, 0) must not be forwarded to it.
+		if (count > 1)
+		{
+			insertionSort(container, 0, count);
+		}
+	}
+
+	template <typename T, template <typename T> class C>
+	void insertionSort(C<T> & container, uint begin, uint end)
+	{
+		for (uint i = begin + 1; i != end; ++i)
 		{
 			T value = container[i];
+
 			uint j = i;
-			while (j > 0 && container[j-1] > value)
+			while (j != begin && container[j-1] > value)
 			{
 				container[j] = container[j-1];
 				--j;
@ -72,75 +94,60 @@ namespace nv
 		}
 	}

-	template <class Container, class T>
-	void quickSort(Container<T> & container)
-	{
-		quickSort(container, 0, container.count());
-	}
-	
-	{
-		/* threshhold for transitioning to insertion sort */
-		while (n > 12) {
-			int c01,c12,c,m,i,j;
+	/// Quicksort phase used by sort(): repeatedly partitions [begin, end) until
+	/// every remaining run holds 16 or fewer elements. Those short runs are left
+	/// unordered for the insertion sort pass that sort() performs afterwards.
+	///
+	/// NOTE(review): despite the name there is no recursion depth limit or
+	/// fallback sort here; worst-case inputs recurse once per partition step.
+	///
+	/// @param container  Container providing operator[].
+	/// @param begin      Index of the first element of the range.
+	/// @param end        Index one past the last element; must not be smaller
+	///                   than @a begin (the unsigned difference would wrap).
+	template <typename T, template <typename> class C>
+	void introsortLoop(C<T> & container, uint begin, uint end)
+	{
+		while (end - begin > 16)
+		{
+			// Pivot candidates: first element, the element just past the midpoint, last element.
+			const uint mid = begin + ((end - begin) / 2) + 1;
+			const uint p = partition(container, begin, end, medianof3(container, begin, mid, end - 1));
+
+			// Recurse into the upper part [p, end) and keep looping on the lower part [begin, p).
+			introsortLoop(container, p, end);
+			end = p;
+		}
+	}

-			/* compute median of three */
-			m = n >> 1;
-			c = p[0] > p[m];
-			c01 = c;
-			c = &p[m] > &p[n-1];
-			c12 = c;
-			/* if 0 >= mid >= end, or 0 < mid < end, then use mid */
-			if (c01 != c12) {
-				/* otherwise, we'll need to swap something else to middle */
-				int z;
-				c = p[0] < p[n-1];
-				/* 0>mid && mid<n:  0>n => n; 0<n => 0 */
-				/* 0<mid && mid>n:  0>n => 0; 0<n => n */
-				z = (c == c12) ? 0 : n-1;
-				swap(p[z], p[m]);
+	/// Partitions the range [begin, end) of @a a around the pivot value @a x.
+	///
+	/// Two indices scan towards each other: the lower one skips elements that
+	/// compare less than the pivot, the upper one skips elements the pivot
+	/// compares less than, and the two elements they stop at are swapped.
+	/// The scans are not bounds-checked, so the caller must guarantee that the
+	/// range contains an element that is not less than the pivot and one that
+	/// is not greater (introsortLoop does so by taking the pivot from the range).
+	///
+	/// @param a      Container providing operator[].
+	/// @param begin  Index of the first element of the range.
+	/// @param end    Index one past the last element of the range.
+	/// @param x      Pivot value; may refer to an element of @a a.
+	/// @return       Index at which the scans met; introsortLoop uses it as the split point.
+	template <typename T, template <typename> class C>
+	uint partition(C<T> & a, uint begin, uint end, const T & x)
+	{
+		// Work on a copy of the pivot. The caller passes a reference to an
+		// element of the container, and the swap below can overwrite that
+		// element, which would silently change the pivot in mid-partition and
+		// invalidate the sentinels the unguarded scans rely on.
+		const T pivot = x;
+
+		// Unsigned indices, matching the uint interface.
+		uint i = begin;
+		uint j = end;
+		while (true)
+		{
+			while (a[i] < pivot) ++i;
+			--j;
+			while (pivot < a[j]) --j;
+			if (i >= j)
+				return i;
+			swap(a[i], a[j]);
+			i++;
+		}
+	}
+
+	template <typename T, template <typename T> class C>
+    const T & medianof3(C<T> & a, uint lo, uint mid, uint hi)
+    {
+		if (a[mid] < a[lo])
+		{
+			if (a[hi] < a[mid])
+			{
+				return a[mid];
 			}
-			/* now p[m] is the median-of-three */
-			/* swap it to the beginning so it won't move around */
-			swap(p[0], p[m]);
-
-			/* partition loop */
-			i=1;
-			j=n-1;
-			for(;;) {
-				/* handling of equality is crucial here */
-				/* for sentinels & efficiency with duplicates */
-				for (;;++i) {
-					c = p[i] > p[0];
-					if (!c) break;
-				}
-				a = &p[0];
-				for (;;--j) {
-					b=&p[j];
-					c = p[j] > p[0]
-					if (!c) break;
-				}
-				/* make sure we haven't crossed */
-				if (i >= j) break;
-				swap(p[i], p[j]);
-
-				++i;
-				--j;
-			}
-			/* recurse on smaller side, iterate on larger */
-			if (j < (n-i)) {
-				quickSort(p, j);
-				p = p+i;
-				n = n-i;
-			} 
-			else {
-				quickSort(p+i, n-i);
-				n = j;
+			else
+			{
+				return (a[hi] < a[lo]) ? a[hi] : a[lo];
 			}
 		}
-		
-		insertionSort();
-	}
-#endif // 0
+		else
+		{
+			if (a[hi] < a[mid])
+			{
+				return (a[hi] < a[lo]) ? a[lo] : a[hi];
+			}
+			else
+			{
+				return a[mid];
+			}
+		}
+    }
+

 } // nv namespace

--- a/src/nvcore/CMakeLists.txt
+++ b/src/nvcore/CMakeLists.txt
@ -24,11 +24,14 @@ SET(CORE_SRCS
 	TextReader.cpp
 	TextWriter.h
 	TextWriter.cpp
+	Tokenizer.h
+	Tokenizer.cpp
 	Radix.h
 	Radix.cpp
 	CpuInfo.h
 	CpuInfo.cpp
 	Algorithms.h
+	Timer.h
 	Library.h
 	Library.cpp
 	FileSystem.h
@ -36,6 +39,34 @@ SET(CORE_SRCS

 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

+# For Windows64 in MSVC we need to add the assembly version of vsscanf
+IF(MSVC AND NV_SYSTEM_PROCESSOR STREQUAL "AMD64")
+  SET(VSSCANF_ASM_NAME "vsscanf_proxy_win64")
+  IF(MSVC_IDE)
+    # $(IntDir) is a macro expanded to the intermediate directory of the selected solution configuration
+    SET(VSSCANF_ASM_INTDIR "$(IntDir)")
+  ELSE(MSVC_IDE)
+    # For some reason the NMake generator doesn't work properly with the generated .obj source:
+	# it requires the absolute path. So this is a hack which worked as of cmake 2.6.0 patch 0
+	GET_FILENAME_COMPONENT(VSSCANF_ASM_INTDIR 
+	    "${nvcore_BINARY_DIR}/CMakeFiles/nvcore.dir" ABSOLUTE)
+  ENDIF(MSVC_IDE)
+  
+  SET(VSSCANF_ASM_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${VSSCANF_ASM_NAME}.masm")
+  SET(VSSCANF_ASM_OBJ "${VSSCANF_ASM_INTDIR}/${VSSCANF_ASM_NAME}.obj")
+
+  # Adds the assembly output to the sources and adds the custom command to generate it
+  SET(CORE_SRCS
+	${CORE_SRCS}
+	${VSSCANF_ASM_OBJ}
+  )
+  ADD_CUSTOM_COMMAND(OUTPUT ${VSSCANF_ASM_OBJ}
+					 MAIN_DEPENDENCY ${VSSCANF_ASM_SRC}
+					 COMMAND ml64
+					 ARGS  /nologo /Fo ${VSSCANF_ASM_OBJ} /c /Cx ${VSSCANF_ASM_SRC}
+  )
+ENDIF(MSVC AND NV_SYSTEM_PROCESSOR STREQUAL "AMD64")
+	
 # targets
 ADD_DEFINITIONS(-DNVCORE_EXPORTS)

--- a/src/nvcore/Constraints.h
+++ b/src/nvcore/Constraints.h
@ -0,0 +1,59 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+// Include guard. It must be unique to this header: the previous guard name
+// was the one used by Algorithms.h, so whichever of the two headers was
+// included second ended up empty.
+#ifndef NV_CORE_CONSTRAINTS_H
+#define NV_CORE_CONSTRAINTS_H
+
+#include <nvcore/nvcore.h>
+
+namespace nv
+{
+	// Cool constraints from "Imperfect C++"
+	//
+	// Each constraint is a struct whose static constraints() function only
+	// compiles when the requirement holds. The names of the locals are chosen
+	// so that they show up in the compiler's error message.
+
+	// must_be_pod
+	// Places T inside a union, which the compiler rejects for types that are
+	// not allowed as union members.
+	template <typename T>
+	struct must_be_pod
+	{
+		static void constraints()
+		{
+			union { T T_is_not_POD_type; };
+		}
+	};
+
+	// must_be_pod_or_void
+	// Same check as must_be_pod, with a specialization that accepts void.
+	template <typename T>
+	struct must_be_pod_or_void
+	{
+		static void constraints()
+		{
+			union { T T_is_not_POD_type; };
+		}
+	};
+	// The void specialization is an empty struct (it declares no constraints()).
+	template <> struct must_be_pod_or_void<void> {};
+
+	// size_of
+	// sizeof(T) as a compile-time constant, with void defined as size 0.
+	template <typename T>
+	struct size_of
+	{
+		enum { value = sizeof(T) };
+	};
+	template <> 
+	struct size_of<void>
+	{
+		enum { value = 0 };
+	};
+	
+	// must_be_same_size
+	// Declares an array whose extent is 1 when both sizes match and 0 when
+	// they differ; the zero-sized array is what triggers the diagnostic.
+	template <typename T1, typename T2>
+	struct must_be_same_size
+	{
+		static void constraints()
+		{
+			const int T1_not_same_size_as_T2 = size_of<T1>::value == size_of<T2>::value;
+			int i[T1_not_same_size_as_T2];
+		}
+	};
+
+
+} // nv namespace
+
+#endif // NV_CORE_CONSTRAINTS_H
--- a/src/nvcore/CpuInfo.cpp
+++ b/src/nvcore/CpuInfo.cpp
@ -33,30 +33,75 @@ static bool isWow64()
 #endif // NV_OS_WIN32


+#if NV_OS_LINUX
+#include <string.h>
+#include <sched.h>
+#endif // NV_OS_LINUX
+
+#if NV_OS_DARWIN
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif // NV_OS_DARWIN
+
+// Initialize the data and the local defines, which are designed
+// to match the positions in cpuid
+uint CpuInfo::m_cpu = ~0x0;
+uint CpuInfo::m_procCount = 0;
+#define NV_CPUINFO_MMX_MASK  (1<<23)
+#define NV_CPUINFO_SSE_MASK  (1<<25)
+#define NV_CPUINFO_SSE2_MASK (1<<26)
+#define NV_CPUINFO_SSE3_MASK (1)
+

 uint CpuInfo::processorCount()
 {
+	if (m_procCount == 0) {
 #if NV_OS_WIN32
-	SYSTEM_INFO sysInfo;
+		SYSTEM_INFO sysInfo;

-	typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);
+		typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);

-	if (isWow64())
-	{
-		GetNativeSystemInfo(&sysInfo);
-	}
-	else
-	{
-		GetSystemInfo(&sysInfo);
-	}
+		if (isWow64())
+		{
+			GetNativeSystemInfo(&sysInfo);
+		}
+		else
+		{
+			GetSystemInfo(&sysInfo);
+		}

-	uint count = (uint)sysInfo.dwNumberOfProcessors;
-	nvDebugCheck(count >= 1);
+		uint count = (uint)sysInfo.dwNumberOfProcessors;
+		m_procCount = count;
+
+#elif NV_OS_LINUX
+
+		// Code from x264 (July 6 snapshot) cpu.c:271
+		uint bit;
+		uint np;
+		cpu_set_t p_aff;
+		memset( &p_aff, 0, sizeof(p_aff) );
+		sched_getaffinity( 0, sizeof(p_aff), &p_aff );
+		for( np = 0, bit = 0; bit < sizeof(p_aff); bit++ )
+			np += (((uint8 *)&p_aff)[bit / 8] >> (bit % 8)) & 1;
+		m_procCount = np;
+
+#elif NV_OS_DARWIN
+
+		// Code from x264 (July 6 snapshot) cpu.c:286
+		uint numberOfCPUs;
+		size_t length = sizeof( numberOfCPUs );
+		if( sysctlbyname("hw.ncpu", &numberOfCPUs, &length, NULL, 0) )
+		{
+			numberOfCPUs = 1;
+		}
+		m_procCount = numberOfCPUs;

-	return count;
 #else
-	return 1;
+		m_procCount = 1;
 #endif
+	}
+	nvDebugCheck(m_procCount > 0);
+	return m_procCount;
 }

 uint CpuInfo::coreCount()
@ -66,23 +111,52 @@ uint CpuInfo::coreCount()

 bool CpuInfo::hasMMX()
 {
-	return false;
+	return (cpu() & NV_CPUINFO_MMX_MASK) != 0;
 }

 bool CpuInfo::hasSSE()
 {
-	return false;
+	return (cpu() & NV_CPUINFO_SSE_MASK) != 0;
 }

 bool CpuInfo::hasSSE2()
 {
-	return false;
+	return (cpu() & NV_CPUINFO_SSE2_MASK) != 0;
 }

 bool CpuInfo::hasSSE3()
 {
-	return false;
+	return (cpu() & NV_CPUINFO_SSE3_MASK) != 0;
 }

+inline int CpuInfo::cpu() {
+	if (m_cpu == ~0x0) {
+		m_cpu = 0;

+#if NV_CC_MSVC
+		int CPUInfo[4] = {-1};
+		__cpuid(CPUInfo, /*InfoType*/ 1);
+		
+		if (CPUInfo[2] & NV_CPUINFO_SSE3_MASK) {
+			m_cpu |= NV_CPUINFO_SSE3_MASK;
+		}
+		if (CPUInfo[3] & NV_CPUINFO_MMX_MASK) {
+			m_cpu |= NV_CPUINFO_MMX_MASK;
+		}
+		if (CPUInfo[3] & NV_CPUINFO_SSE_MASK) {
+			m_cpu |= NV_CPUINFO_SSE_MASK;
+		}
+		if (CPUInfo[3] & NV_CPUINFO_SSE2_MASK) {
+			m_cpu |= NV_CPUINFO_SSE2_MASK;
+		}
+#elif NV_CC_GNUC
+		// TODO: add the proper inline assembly
+#if NV_CPU_X86

+#elif NV_CPU_X86_64
+
+#endif	// NV_CPU_X86_64
+#endif	// NV_CC_GNUC
+	}
+	return m_cpu;
+}
--- a/src/nvcore/CpuInfo.h
+++ b/src/nvcore/CpuInfo.h
@ -18,6 +18,15 @@ namespace nv
 	// CPU Information.
 	class CpuInfo
 	{
+	protected:
+		static int cpu();
+
+	private:
+		// Cache of the CPU data
+		static uint m_cpu;
+		static uint m_procCount;
+
+	public:
 		static uint processorCount();
 		static uint coreCount();

@ -25,7 +34,6 @@ namespace nv
 		static bool hasSSE();
 		static bool hasSSE2();
 		static bool hasSSE3();
-
 	};

 #if NV_CC_MSVC
--- a/src/nvcore/Prefetch.h
+++ b/src/nvcore/Prefetch.h
@ -12,16 +12,15 @@

 #elif NV_CC_MSVC 

-#if NV_CPU_X86
+// Uses SSE Intrinsics for both x86 and x86_64
+#include <xmmintrin.h>
+
 __forceinline void nvPrefetch(const void * mem)
 {
-	__asm mov ecx, mem
-	__asm prefetcht0 [ecx];
-//	__asm prefetchnta [ecx];
+	_mm_prefetch(static_cast<const char*>(mem), _MM_HINT_T0);	/* prefetcht0  */
+//	_mm_prefetch(static_cast<const char*>(mem), _MM_HINT_NTA);	/* prefetchnta */
 }
-#endif // NV_CPU_X86
-
-#else // NV_CC_MSVC
+#else

 // do nothing in other case.
 #define nvPrefetch(ptr)
--- a/src/nvcore/Ptr.h
+++ b/src/nvcore/Ptr.h
@ -34,11 +34,11 @@ class AutoPtr
 	NV_FORBID_HEAPALLOC();
 public:
 	
-	/// Default ctor.
-	AutoPtr() : m_ptr(NULL) { }
-	
 	/// Ctor.
-	explicit AutoPtr( T * p ) : m_ptr(p) { }
+	AutoPtr(T * p = NULL) : m_ptr(p) { }
+
+	template <class Q>
+	AutoPtr(Q * p) : m_ptr(static_cast<T *>(p)) { }
 	
 	/** Dtor. Deletes owned pointer. */
 	~AutoPtr() {
--- a/src/nvcore/Radix.cpp
+++ b/src/nvcore/Radix.cpp
@ -7,6 +7,10 @@
 */
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

+// References:
+// http://www.codercorner.com/RadixSortRevisited.htm
+// http://www.stereopsis.com/radix.html
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 /**
 *	Revisited Radix Sort.
@ -26,19 +30,25 @@
 *	- 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
 *	- 10.11.01: added local ram support
 *	- 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
+ *	- 01.02.02:	- "mIndices" renamed => "mRanks". That's a rank sorter after all.
+ *				- ranks are not "reset" anymore, but implicit on first calls
+ *	- 07.05.02:	offsets rewritten with one less indirection.
+ *	- 11.03.02:	"bool" replaced with RadixHint enum
+ *	- 07.15.04:	stack-based radix added
+ *				- we want to use the radix sort but without making it static, and without allocating anything.
+ *				- we internally allocate two arrays of ranks. Each of them has N uint32s to sort N values.
+ *				- 1Mb/2/sizeof(uint32) = 131072 values max, at the same time.
+ *	- 09.22.04:	- adapted to MacOS by Chris Lamb
+ *	- 01.12.06:	- added optimizations suggested by Kyle Hubert
+ *	- 04.06.08:	- Fix negative zero sorting bug by Ignacio Castaño
 *
 *	\class		RadixSort
 *	\author		Pierre Terdiman
- *	\version	1.3
+ *	\version	1.5
 *	\date		August, 15, 1998
 */
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

-/*
-To do:
-	- add an offset parameter between two input values (avoid some data recopy sometimes)
-	- unroll ? asm ?
-*/

 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Header
@ -49,138 +59,170 @@ To do:

 //using namespace IceCore;

-#define DELETEARRAY(a)	{ delete [] a; a = NULL; }
-#define CHECKALLOC(a)
+#define INVALIDATE_RANKS	mCurrentSize|=0x80000000
+#define VALIDATE_RANKS		mCurrentSize&=0x7fffffff
+#define CURRENT_SIZE		(mCurrentSize&0x7fffffff)
+#define INVALID_RANKS		(mCurrentSize&0x80000000)

-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/**
- *	Constructor.
- */
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-RadixSort::RadixSort() : mCurrentSize(0), mPreviousSize(0), mIndices(NULL), mIndices2(NULL), mTotalCalls(0), mNbHits(0)
-{
-#ifndef RADIX_LOCAL_RAM
-	// Allocate input-independent ram
-	mHistogram		= new uint32[256*4];
-	mOffset			= new uint32[256];
+#if NV_BIG_ENDIAN
+	#define H0_OFFSET	768
+	#define H1_OFFSET	512
+	#define H2_OFFSET	256
+	#define H3_OFFSET	0
+	#define BYTES_INC	(3-j)
+#else 
+	#define H0_OFFSET	0
+	#define H1_OFFSET	256
+	#define H2_OFFSET	512
+	#define H3_OFFSET	768
+	#define BYTES_INC	j
 #endif
+
+#define CREATE_HISTOGRAMS(type, buffer)														\
+	/* Clear counters/histograms */															\
+	memset(mHistogram, 0, 256*4*sizeof(uint32));											\
+																							\
+	/* Prepare to count */																	\
+	const uint8* p = (const uint8*)input;													\
+	const uint8* pe = &p[nb*4];																\
+	uint32* h0= &mHistogram[H0_OFFSET];	/* Histogram for first pass (LSB)	*/				\
+	uint32* h1= &mHistogram[H1_OFFSET];	/* Histogram for second pass		*/				\
+	uint32* h2= &mHistogram[H2_OFFSET];	/* Histogram for third pass			*/				\
+	uint32* h3= &mHistogram[H3_OFFSET];	/* Histogram for last pass (MSB)	*/				\
+																							\
+	bool AlreadySorted = true;	/* Optimism... */											\
+																							\
+	if(INVALID_RANKS)																		\
+	{																						\
+		/* Prepare for temporal coherence */												\
+		type* Running = (type*)buffer;														\
+		type PrevVal = *Running;															\
+																							\
+		while(p!=pe)																		\
+		{																					\
+			/* Read input buffer in previous sorted order */								\
+			type Val = *Running++;															\
+			/* Check whether already sorted or not */										\
+			if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */				\
+			/* Update for next iteration */													\
+			PrevVal = Val;																	\
+																							\
+			/* Create histograms */															\
+			h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;									\
+		}																					\
+																							\
+		/* If all input values are already sorted, we just have to return and leave the */	\
+		/* previous list unchanged. That way the routine may take advantage of temporal */	\
+		/* coherence, for example when used to sort transparent faces.					*/	\
+		if(AlreadySorted)																	\
+		{																					\
+			mNbHits++;																		\
+			for(uint32 i=0;i<nb;i++)	mRanks[i] = i;										\
+			return *this;																	\
+		}																					\
+	}																						\
+	else																					\
+	{																						\
+		/* Prepare for temporal coherence */												\
+		const uint32* Indices = mRanks;														\
+		type PrevVal = (type)buffer[*Indices];												\
+																							\
+		while(p!=pe)																		\
+		{																					\
+			/* Read input buffer in previous sorted order */								\
+			type Val = (type)buffer[*Indices++];											\
+			/* Check whether already sorted or not */										\
+			if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */				\
+			/* Update for next iteration */													\
+			PrevVal = Val;																	\
+																							\
+			/* Create histograms */															\
+			h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;									\
+		}																					\
+																							\
+		/* If all input values are already sorted, we just have to return and leave the */	\
+		/* previous list unchanged. That way the routine may take advantage of temporal */	\
+		/* coherence, for example when used to sort transparent faces.					*/	\
+		if(AlreadySorted)	{ mNbHits++; return *this;	}									\
+	}																						\
+																							\
+	/* Else there has been an early out and we must finish computing the histograms */		\
+	while(p!=pe)																			\
+	{																						\
+		/* Create histograms without the previous overhead */								\
+		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;										\
+	}
+
+#define CHECK_PASS_VALIDITY(pass)															\
+	/* Shortcut to current counters */														\
+	const uint32* CurCount = &mHistogram[pass<<8];											\
+																							\
+	/* Reset flag. The sorting pass is supposed to be performed. (default) */				\
+	bool PerformPass = true;																\
+																							\
+	/* Check pass validity */																\
+																							\
+	/* If all values have the same byte, sorting is useless. */								\
+	/* It may happen when sorting bytes or words instead of dwords. */						\
+	/* This routine actually sorts words faster than dwords, and bytes */					\
+	/* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */				\
+	/* for words and O(n) for bytes. Running time for floats depends on actual values... */	\
+																							\
+	/* Get first byte */																	\
+	uint8 UniqueVal = *(((uint8*)input)+pass);												\
+																							\
+	/* Check that byte's counter */															\
+	if(CurCount[UniqueVal]==nb)	PerformPass=false;
+
+using namespace nv;
+
+/// Constructor.
+RadixSort::RadixSort() : mRanks(NULL), mRanks2(NULL), mCurrentSize(0), mTotalCalls(0), mNbHits(0), mDeleteRanks(true)
+{
 	// Initialize indices
-	resetIndices();
+	INVALIDATE_RANKS;
 }

-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/**
- *	Destructor.
- */
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Destructor.
 RadixSort::~RadixSort()
 {
 	// Release everything
-#ifndef RADIX_LOCAL_RAM
-	DELETEARRAY(mOffset);
-	DELETEARRAY(mHistogram);
-#endif
-	DELETEARRAY(mIndices2);
-	DELETEARRAY(mIndices);
+	if(mDeleteRanks)
+	{
+		delete [] mRanks2;
+		delete [] mRanks;
+	}
 }

-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/**
- *	Resizes the inner lists.
- *	\param		nb				[in] new size (number of dwords)
- *	\return		true if success
- */
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Resizes the inner lists.
+/// \param		nb				[in] new size (number of dwords)
+/// \return		true if success
 bool RadixSort::resize(uint32 nb)
 {
-	// Free previously used ram
-	DELETEARRAY(mIndices2);
-	DELETEARRAY(mIndices);
-
-	// Get some fresh one
-	mIndices		= new uint32[nb];	CHECKALLOC(mIndices);
-	mIndices2		= new uint32[nb];	CHECKALLOC(mIndices2);
-	mCurrentSize	= nb;
-
-	// Initialize indices so that the input buffer is read in sequential order
-	resetIndices();
+	if(mDeleteRanks)
+	{
+		// Free previously used ram
+		delete [] mRanks2;
+		delete [] mRanks;

+		// Get some fresh one
+		mRanks	= new uint32[nb];
+		mRanks2	= new uint32[nb];
+	}
 	return true;
+
 }

-#define CHECK_RESIZE(n)																			\
-	if(n!=mPreviousSize)																		\
-	{																							\
-				if(n>mCurrentSize)	resize(n);													\
-		else						resetIndices();												\
-		mPreviousSize = n;																		\
+inline void RadixSort::checkResize(uint32 nb)
+{
+	uint32 CurSize = CURRENT_SIZE;
+	if(nb!=CurSize)
+	{
+		if(nb>CurSize) resize(nb);
+		mCurrentSize = nb;
+		INVALIDATE_RANKS;
 	}
-
-#define CREATE_HISTOGRAMS(type, buffer)															\
-	/* Clear counters */																		\
-	memset(mHistogram, 0, 256*4*sizeof(uint32));												\
-																								\
-	/* Prepare for temporal coherence */														\
-	type PrevVal = (type)buffer[mIndices[0]];													\
-	bool AlreadySorted = true;	/* Optimism... */												\
-	uint32* Indices = mIndices;																	\
-																								\
-	/* Prepare to count */																		\
-	uint8* p = (uint8*)input;																	\
-	uint8* pe = &p[nb*4];																		\
-	uint32* h0= &mHistogram[0];		/* Histogram for first pass (LSB)	*/						\
-	uint32* h1= &mHistogram[256];	/* Histogram for second pass		*/						\
-	uint32* h2= &mHistogram[512];	/* Histogram for third pass			*/						\
-	uint32* h3= &mHistogram[768];	/* Histogram for last pass (MSB)	*/						\
-																								\
-	while(p!=pe)																				\
-	{																							\
-		/* Read input buffer in previous sorted order */										\
-		type Val = (type)buffer[*Indices++];													\
-		/* Check whether already sorted or not */												\
-		if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */						\
-		/* Update for next iteration */															\
-		PrevVal = Val;																			\
-																								\
-		/* Create histograms */																	\
-		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;											\
-	}																							\
-																								\
-	/* If all input values are already sorted, we just have to return and leave the */			\
-	/* previous list unchanged. That way the routine may take advantage of temporal */			\
-	/* coherence, for example when used to sort transparent faces.					*/			\
-	if(AlreadySorted)	{ mNbHits++; return *this;	}											\
-																								\
-	/* Else there has been an early out and we must finish computing the histograms */			\
-	while(p!=pe)																				\
-	{																							\
-		/* Create histograms without the previous overhead */									\
-		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;											\
-	}
-
-#define CHECK_PASS_VALIDITY(pass)																\
-	/* Shortcut to current counters */															\
-	uint32* CurCount = &mHistogram[pass<<8];													\
-																								\
-	/* Reset flag. The sorting pass is supposed to be performed. (default) */					\
-	bool PerformPass = true;																	\
-																								\
-	/* Check pass validity */																	\
-																								\
-	/* If all values have the same byte, sorting is useless. */									\
-	/* It may happen when sorting bytes or words instead of dwords. */							\
-	/* This routine actually sorts words faster than dwords, and bytes */						\
-	/* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */					\
-	/* for words and O(n) for bytes. Running time for floats depends on actual values... */		\
-																								\
-	/* Get first byte */																		\
-	uint8 UniqueVal = *(((uint8*)input)+pass);													\
-																								\
-	/* Check that byte's counter */																\
-	if(CurCount[UniqueVal]==nb)	PerformPass=false;
-
+}

 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 /**
@ -192,46 +234,31 @@ bool RadixSort::resize(uint32 nb)
 *	\return		Self-Reference
 */
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues)
+RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedValues/*=true*/)
 {
-	uint32 i, j;
-	
 	// Checkings
-	if(!input || !nb)	return *this;
+	if(!input || !nb || nb&0x80000000)	return *this;

 	// Stats
 	mTotalCalls++;

 	// Resize lists if needed
-	CHECK_RESIZE(nb);
+	checkResize(nb);

-#ifdef RADIX_LOCAL_RAM
 	// Allocate histograms & offsets on the stack
 	uint32 mHistogram[256*4];
-	uint32 mOffset[256];
-#endif
+	uint32* mLink[256];

 	// Create histograms (counters). Counters for all passes are created in one run.
 	// Pros:	read input buffer once instead of four times
 	// Cons:	mHistogram is 4Kb instead of 1Kb
 	// We must take care of signed/unsigned values for temporal coherence.... I just
 	// have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
-	if(!signedvalues)	{ CREATE_HISTOGRAMS(uint32, input);	}
+	if(!signedValues)	{ CREATE_HISTOGRAMS(uint32, input);	}
 	else				{ CREATE_HISTOGRAMS(int32, input);	}

-	// Compute #negative values involved if needed
-	uint32 NbNegativeValues = 0;
-	if(signedvalues)
-	{
-		// An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
-		// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
-		// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
-		uint32* h3= &mHistogram[768];
-		for( i=128;i<256;i++)	NbNegativeValues += h3[i];	// 768 for last histogram, 128 for negative part
-	}
-
 	// Radix sort, j is the pass number (0=LSB, 3=MSB)
-	for( j=0;j<4;j++)
+	for(uint32 j=0;j<4;j++)
 	{
 		CHECK_PASS_VALIDITY(j);

@ -240,40 +267,47 @@ RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues)
 		if(PerformPass)
 		{
 			// Should we care about negative values?
-			if(j!=3 || !signedvalues)
+			if(j!=3 || !signedValues)
 			{
 				// Here we deal with positive values only

 				// Create offsets
-				mOffset[0] = 0;
-				for(i=1;i<256;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];
+				mLink[0] = mRanks2;
+				for(uint32 i=1;i<256;i++)		mLink[i] = mLink[i-1] + CurCount[i-1];
 			}
 			else
 			{
 				// This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
+				mLink[128] = mRanks2;
+				for(uint32 i=129;i<256;i++)	mLink[i] = mLink[i-1] + CurCount[i-1];

-				// Create biased offsets, in order for negative numbers to be sorted as well
-				mOffset[0] = NbNegativeValues;												// First positive number takes place after the negative ones
-				for(i=1;i<128;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];	// 1 to 128 for positive numbers
-
-				// Fixing the wrong place for negative values
-				mOffset[128] = 0;
-				for(i=129;i<256;i++)			mOffset[i] = mOffset[i-1] + CurCount[i-1];
+				mLink[0] = mLink[255] + CurCount[255];
+				for(uint32 i=1;i<128;i++)	mLink[i] = mLink[i-1] + CurCount[i-1];
 			}

 			// Perform Radix Sort
-			uint8* InputBytes	= (uint8*)input;
-			uint32* Indices		= mIndices;
-			uint32* IndicesEnd	= &mIndices[nb];
-			InputBytes += j;
-			while(Indices!=IndicesEnd)
+			const uint8* InputBytes	= (const uint8*)input;
+			InputBytes += BYTES_INC;
+			if(INVALID_RANKS)
 			{
-				uint32 id = *Indices++;
-				mIndices2[mOffset[InputBytes[id<<2]]++] = id;
+				for(uint32 i=0;i<nb;i++)	*mLink[InputBytes[i<<2]]++ = i;
+				VALIDATE_RANKS;
+			}
+			else
+			{
+				const uint32* Indices		= mRanks;
+				const uint32* IndicesEnd	= &mRanks[nb];
+				while(Indices!=IndicesEnd)
+				{
+					uint32 id = *Indices++;
+					*mLink[InputBytes[id<<2]]++ = id;
+				}
 			}

-			// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
-			uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+			// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
+			uint32* Tmp = mRanks;
+			mRanks = mRanks2;
+			mRanks2 = Tmp;
 		}
 	}
 	return *this;
@ -291,24 +325,20 @@ RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues)
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 RadixSort& RadixSort::sort(const float* input2, uint32 nb)
 {
-	uint32 i, j;
-	
 	// Checkings
-	if(!input2 || !nb)	return *this;
+	if(!input2 || !nb || nb&0x80000000)	return *this;

 	// Stats
 	mTotalCalls++;

-	uint32* input = (uint32*)input2;
+	const uint32* input = (const uint32*)input2;

 	// Resize lists if needed
-	CHECK_RESIZE(nb);
+	checkResize(nb);

-#ifdef RADIX_LOCAL_RAM
 	// Allocate histograms & offsets on the stack
 	uint32 mHistogram[256*4];
-	uint32 mOffset[256];
-#endif
+	uint32* mLink[256];

 	// Create histograms (counters). Counters for all passes are created in one run.
 	// Pros:	read input buffer once instead of four times
@ -320,16 +350,8 @@ RadixSort& RadixSort::sort(const float* input2, uint32 nb)
 	// wouldn't work with mixed positive/negative values....
 	{ CREATE_HISTOGRAMS(float, input2); }

-	// Compute #negative values involved if needed
-	uint32 NbNegativeValues = 0;
-	// An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
-	// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
-	// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
-	uint32* h3= &mHistogram[768];
-	for( i=128;i<256;i++)	NbNegativeValues += h3[i];	// 768 for last histogram, 128 for negative part
-
 	// Radix sort, j is the pass number (0=LSB, 3=MSB)
-	for( j=0;j<4;j++)
+	for(uint32 j=0;j<4;j++)
 	{
 		// Should we care about negative values?
 		if(j!=3)
@ -340,22 +362,32 @@ RadixSort& RadixSort::sort(const float* input2, uint32 nb)
 			if(PerformPass)
 			{
 				// Create offsets
-				mOffset[0] = 0;
-				for( i=1;i<256;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];
+				mLink[0] = mRanks2;
+				for(uint32 i=1;i<256;i++)		mLink[i] = mLink[i-1] + CurCount[i-1];

 				// Perform Radix Sort
-				uint8* InputBytes	= (uint8*)input;
-				uint32* Indices		= mIndices;
-				uint32* IndicesEnd	= &mIndices[nb];
-				InputBytes += j;
-				while(Indices!=IndicesEnd)
+				const uint8* InputBytes = (const uint8*)input;
+				InputBytes += BYTES_INC;
+				if(INVALID_RANKS)
 				{
-					uint32 id = *Indices++;
-					mIndices2[mOffset[InputBytes[id<<2]]++] = id;
+					for(uint32 i=0;i<nb;i++)	*mLink[InputBytes[i<<2]]++ = i;
+					VALIDATE_RANKS;
+				}
+				else
+				{
+					const uint32* Indices		= mRanks;
+					const uint32* IndicesEnd	= &mRanks[nb];
+					while(Indices!=IndicesEnd)
+					{
+						uint32 id = *Indices++;
+						*mLink[InputBytes[id<<2]]++ = id;
+					}
 				}

-				// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
-				uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+				// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
+				uint32* Tmp = mRanks;
+				mRanks = mRanks2;
+				mRanks2 = Tmp;
 			}
 		}
 		else
@ -365,35 +397,58 @@ RadixSort& RadixSort::sort(const float* input2, uint32 nb)

 			if(PerformPass)
 			{
-				// Create biased offsets, in order for negative numbers to be sorted as well
-				mOffset[0] = NbNegativeValues;												// First positive number takes place after the negative ones
-				for(i=1;i<128;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];	// 1 to 128 for positive numbers
-
-				// We must reverse the sorting order for negative numbers!
-				mOffset[255] = 0;
-				for(i=0;i<127;i++)		mOffset[254-i] = mOffset[255-i] + CurCount[255-i];	// Fixing the wrong order for negative values
-				for(i=128;i<256;i++)	mOffset[i] += CurCount[i];							// Fixing the wrong place for negative values
+				mLink[255] = mRanks2 + CurCount[255];
+				for(uint32 i = 254; i > 126; i--) mLink[i] = mLink[i+1] + CurCount[i];
+				mLink[0] = mLink[127] + CurCount[127];
+				for(uint32 i = 1; i < 127; i++) mLink[i] = mLink[i-1] + CurCount[i-1];

 				// Perform Radix Sort
-				for(i=0;i<nb;i++)
+				if(INVALID_RANKS)
 				{
-					uint32 Radix = input[mIndices[i]]>>24;								// Radix byte, same as above. AND is useless here (uint32).
-					// ### cmp to be killed. Not good. Later.
-					if(Radix<128)		mIndices2[mOffset[Radix]++] = mIndices[i];		// Number is positive, same as above
-					else				mIndices2[--mOffset[Radix]] = mIndices[i];		// Number is negative, flip the sorting order
+					for(uint32 i=0;i<nb;i++)
+					{
+						uint32 Radix = input[i]>>24;							// Radix byte, same as above. AND is useless here (uint32).
+						// ### cmp to be killed. Not good. Later.
+						if(Radix<128)		*mLink[Radix]++ = i;		// Number is positive, same as above
+						else				*(--mLink[Radix]) = i;		// Number is negative, flip the sorting order
+					}
+					VALIDATE_RANKS;
 				}
-				// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
-				uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+				else
+				{
+					for(uint32 i=0;i<nb;i++)
+					{
+						uint32 Radix = input[mRanks[i]]>>24;							// Radix byte, same as above. AND is useless here (uint32).
+						// ### cmp to be killed. Not good. Later.
+						if(Radix<128)		*mLink[Radix]++ = mRanks[i];		// Number is positive, same as above
+						else				*(--mLink[Radix]) = mRanks[i];		// Number is negative, flip the sorting order
+					}
+				}
+				// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
+				uint32* Tmp = mRanks;
+				mRanks = mRanks2;
+				mRanks2 = Tmp;
 			}
 			else
 			{
 				// The pass is useless, yet we still have to reverse the order of current list if all values are negative.
 				if(UniqueVal>=128)
 				{
-					for(i=0;i<nb;i++)	mIndices2[i] = mIndices[nb-i-1];
+					if(INVALID_RANKS)
+					{
+						// ###Possible?
+						for(uint32 i=0;i<nb;i++)	mRanks2[i] = nb-i-1;
+						VALIDATE_RANKS;
+					}
+					else
+					{
+						for(uint32 i=0;i<nb;i++)	mRanks2[i] = mRanks[nb-i-1];
+					}

-					// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
-					uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+					// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
+					uint32* Tmp = mRanks;
+					mRanks = mRanks2;
+					mRanks2 = Tmp;
 				}
 			}
 		}
@ -401,29 +456,29 @@ RadixSort& RadixSort::sort(const float* input2, uint32 nb)
 	return *this;
 }

-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/**
- *	Resets the inner indices. After the call, mIndices is reset.
- */
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void RadixSort::resetIndices()
+
+bool RadixSort::setRankBuffers(uint32* ranks0, uint32* ranks1)
 {
-	for(uint32 i=0;i<mCurrentSize;i++)	mIndices[i] = i;
+	if(!ranks0 || !ranks1)	return false;
+
+	mRanks			= ranks0;
+	mRanks2			= ranks1;
+	mDeleteRanks	= false;
+
+	return true;
 }

-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/**
- *	Gets the ram used.
- *	\return		memory used in bytes
- */
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-uint32 RadixSort::usedRam() const
+RadixSort & RadixSort::sort(const Array<int> & input)
 {
-	uint32 UsedRam = sizeof(RadixSort);
-#ifndef RADIX_LOCAL_RAM
-	UsedRam += 256*4*sizeof(uint32);			// Histograms
-	UsedRam += 256*sizeof(uint32);				// Offsets
-#endif
-	UsedRam += 2*mCurrentSize*sizeof(uint32);	// 2 lists of indices
-	return UsedRam;
+	return sort((const uint32 *)input.buffer(), input.count(), true);
+}
+
+RadixSort & RadixSort::sort(const Array<uint> & input)
+{
+	return sort(input.buffer(), input.count(), false);
+}
+
+RadixSort &	RadixSort::sort(const Array<float> & input)
+{
+	return sort(input.buffer(), input.count());
 }
--- a/src/nvcore/Radix.h
+++ b/src/nvcore/Radix.h
@ -13,57 +13,61 @@
 #define NV_CORE_RADIXSORT_H

 #include <nvcore/nvcore.h>
+#include <nvcore/Containers.h>
+
+namespace nv
+{
+
+	class NVCORE_CLASS RadixSort
+	{
+		NV_FORBID_COPY(RadixSort);
+	public:
+		// Constructor/Destructor
+		RadixSort();
+		~RadixSort();
+
+		// Sorting methods
+		RadixSort & sort(const uint32* input, uint32 nb, bool signedValues=true);
+		RadixSort &	sort(const float* input, uint32 nb);
+
+		// Helpers
+		RadixSort & sort(const Array<int> & input);
+		RadixSort & sort(const Array<uint> & input);
+		RadixSort & sort(const Array<float> & input);


-#define RADIX_LOCAL_RAM
+		//! Access to results. mRanks is a list of indices in sorted order, i.e. in the order you may further process your data
+		inline /*const*/ uint32 * ranks() /*const*/ { return mRanks; }

+		//! mRanks2 gets trashed on calling the sort routine, but otherwise you can recycle it the way you want.
+		inline uint32 * recyclable() const { return mRanks2; }

-class NVCORE_API RadixSort {
-	NV_FORBID_COPY(RadixSort);
-public:
-	// Constructor/Destructor
-	RadixSort();
-	~RadixSort();
+		// Stats
+		//! Returns the total number of calls to the radix sorter.
+		inline uint32 totalCalls() const { return mTotalCalls; }

-	// Sorting methods
-	RadixSort & sort(const uint32* input, uint32 nb, bool signedvalues=true);
-	RadixSort & sort(const float* input, uint32 nb);
-
-	//! Access to results. mIndices is a list of indices in sorted order, i.e. in the order you may further process your data
-	inline uint32 * indices() const { return mIndices; }
-
-	//! mIndices2 gets trashed on calling the sort routine, but otherwise you can recycle it the way you want.
-	inline uint32 * recyclable() const { return mIndices2; }
-
-	// Stats
-	uint32 usedRam() const;
-
-	//! Returns the total number of calls to the radix sorter.
-	inline uint32 totalCalls()	const { return mTotalCalls;	}
-
-	//! Returns the number of premature exits due to temporal coherence.
-	inline uint32 hits() const { return mNbHits; }
+		//! Returns the number of early exits due to temporal coherence.
+		inline uint32 hits() const { return mNbHits; }

+		bool setRankBuffers(uint32* ranks0, uint32* ranks1);

 	private:
-#ifndef RADIX_LOCAL_RAM
-	uint32*			mHistogram;					//!< Counters for each byte
-	uint32*			mOffset;					//!< Offsets (nearly a cumulative distribution function)
-#endif
-	uint32			mCurrentSize;				//!< Current size of the indices list
-	uint32			mPreviousSize;				//!< Size involved in previous call
-	uint32*			mIndices;					//!< Two lists, swapped each pass
-	uint32*			mIndices2;
+		uint32 mCurrentSize;    //!< Current size of the indices list
+		uint32 * mRanks;        //!< Two lists, swapped each pass
+		uint32 * mRanks2;

-	// Stats
-	uint32			mTotalCalls;
-	uint32			mNbHits;
+		// Stats
+		uint32 mTotalCalls;     //!< Total number of calls to the sort routine
+		uint32 mNbHits;         //!< Number of early exits due to coherence

-	// Internal methods
-	bool			resize(uint32 nb);
-	void			resetIndices();
+		// Stack-radix
+		bool mDeleteRanks;      //!<

-};
+		// Internal methods
+		void checkResize(uint32 nb);
+		bool resize(uint32 nb);
+	};

+} // nv namespace

 #endif // NV_CORE_RADIXSORT_H
--- a/src/nvcore/StrLib.cpp
+++ b/src/nvcore/StrLib.cpp
@ -208,49 +208,12 @@ StringBuilder::StringBuilder( const StringBuilder & s ) : m_size(0), m_str(NULL)
 	copy(s);
 }

-// Copy string. 
-/*StringBuilder::StringBuilder( const char * s )
+/** Copy string.
+ *  m_size and m_str are initialized before copy() is called, matching the
+ *  copy constructor; without the initializers copy() would run on
+ *  indeterminate members.
+ */
+StringBuilder::StringBuilder( const char * s ) : m_size(0), m_str(NULL)
 {
 	copy(s);
-}*/
-
-/** Allocate and copy string. */
-StringBuilder::StringBuilder( int size_hint, const StringBuilder & s) : m_size(size_hint), m_str(NULL)
-{
-	nvDebugCheck(m_size > 0);
-	m_str = strAlloc(m_size);
-	copy(s);
 }

-/** Allocate and format string. */
-StringBuilder::StringBuilder( const char * fmt, ... ) : m_size(0), m_str(NULL)
-{
-	nvDebugCheck(fmt != NULL);
-	va_list arg;
-	va_start( arg, fmt );
-
-	format( fmt, arg );
-
-	va_end( arg );
-}
-
-/** Allocate and format string. */
-StringBuilder::StringBuilder( int size_hint, const char * fmt, ... ) : m_size(size_hint), m_str(NULL)
-{
-	nvDebugCheck(m_size > 0);	
-	nvDebugCheck(fmt != NULL);
-	
-	m_str = strAlloc(m_size);
-
-	va_list arg;
-	va_start( arg, fmt );
-
-	format( fmt, arg );
-
-	va_end( arg );
-}
-
-
 /** Delete the string. */
 StringBuilder::~StringBuilder()
 {
@ -278,8 +241,7 @@ StringBuilder & StringBuilder::format( const char * fmt, ... )
 /** Format a string safely. */
 StringBuilder & StringBuilder::format( const char * fmt, va_list arg )
 {
-	nvCheck(fmt != NULL);
-	nvCheck(m_size >= 0);
+	nvDebugCheck(fmt != NULL);

 	if( m_size == 0 ) {
 		m_size = 64;
@ -327,8 +289,7 @@ StringBuilder & StringBuilder::format( const char * fmt, va_list arg )
 /** Append a string. */
 StringBuilder & StringBuilder::append( const char * s )
 {
-	nvCheck(s != NULL);
-	nvCheck(m_size >= 0);
+	nvDebugCheck(s != NULL);

 	const uint slen = uint(strlen( s ));

@ -475,31 +436,6 @@ void StringBuilder::reset()
 }


-Path::Path(const char * fmt, ...)
-{
-	nvDebugCheck( fmt != NULL );
-
-	va_list arg;
-	va_start( arg, fmt );
-
-	format( fmt, arg );
-
-	va_end( arg );
-}
-
-Path::Path(int size_hint, const char * fmt, ...) : StringBuilder(size_hint)
-{
-	nvDebugCheck( fmt != NULL );
-
-	va_list arg;
-	va_start( arg, fmt );
-
-	format( fmt, arg );
-
-	va_end( arg );
-}
-
-
 /// Get the file name from a path.
 const char * Path::fileName() const
 {
--- a/src/nvcore/StrLib.h
+++ b/src/nvcore/StrLib.h
@ -45,11 +45,8 @@ namespace nv
 	
 		StringBuilder();
 		explicit StringBuilder( int size_hint );
-		//StringBuilder( const char * str );
+		StringBuilder( const char * str );
 		StringBuilder( const StringBuilder & );
-		StringBuilder( int size_hint, const StringBuilder & );	
-		StringBuilder( const char * format, ... ) __attribute__((format (printf, 2, 3)));
-		StringBuilder( int size_hint, const char * format, ... ) __attribute__((format (printf, 3, 4)));
 	
 		~StringBuilder();
 	
@ -120,24 +117,17 @@ namespace nv
 		char * m_str;
 		
 	};
-	

-	/// Path string.
+
+	/// Path string. @@ This should be called PathBuilder.
 	class NVCORE_CLASS Path : public StringBuilder
 	{
 	public:
 		Path() : StringBuilder() {}
 		explicit Path(int size_hint) : StringBuilder(size_hint) {}
-		//Path(const char * str) : StringBuilder((const char *)str) {}
+		Path( const char * str ) : StringBuilder(str) {}
 		Path(const StringBuilder & str) : StringBuilder(str) {}
-		Path(int size_hint, const StringBuilder & str) : StringBuilder(size_hint, str) {}	
-		Path(const char * format, ...) __attribute__((format (printf, 2, 3)));
-		Path(int size_hint, const char * format, ...) __attribute__((format (printf, 3, 4)));
 		
-		Path & operator=( const char * s ) {
-			return (Path &)copy(s);
-		}
-
 		const char * fileName() const;
 		const char * extension() const;
 		
@ -145,11 +135,11 @@ namespace nv
 		
 		void stripFileName();
 		void stripExtension();
-		
+
 		// statics
-		NVCORE_API static char separator();
-		NVCORE_API static const char * fileName(const char *);
-		NVCORE_API static const char * extension(const char *);
+		static char separator();
+		static const char * fileName(const char *);
+		static const char * extension(const char *);
 	};
 	
 	
--- a/src/nvcore/Timer.h
+++ b/src/nvcore/Timer.h
@ -0,0 +1,22 @@
+// This code is in the public domain -- castano@gmail.com
+
+#ifndef NV_CORE_TIMER_H
+#define NV_CORE_TIMER_H
+
+#include <nvcore/nvcore.h>
+
+#include <time.h> //clock
+
+/// Simple timer built on clock(); reports elapsed time in milliseconds.
+class NVCORE_CLASS Timer
+{
+public:
+	/// Captures the current clock() value so that elapsed() is well defined
+	/// even when start() is never called.
+	Timer() : m_start(clock()) {}
+	
+	/// Restarts the measurement from the current clock() value.
+	void start() { m_start = clock(); }
+
+	/// Returns the clock() time, in milliseconds, elapsed since construction
+	/// or since the last call to start().
+	/// The scaling is done in double so that multiplying the tick count by
+	/// 1000 cannot overflow when clock_t is a 32-bit integer.
+	int elapsed() const { return int((1000.0 * double(clock() - m_start)) / double(CLOCKS_PER_SEC)); }
+	
+private:
+	clock_t m_start;	///< clock() value captured by the constructor or by start().
+};
+
+#endif // NV_CORE_TIMER_H
--- a/src/nvcore/Tokenizer.cpp
+++ b/src/nvcore/Tokenizer.cpp
@ -8,7 +8,7 @@
 #include <stdlib.h>	// atof, atoi

 #if NV_CC_MSVC
-#if 0 // This doesn't work on MSVC for x64
+#if defined NV_CPU_X86
 /* vsscanf for Win32
 * Written 5/2003 by <mgix@mgix.com>
 * This code is in the Public Domain
@ -56,9 +56,39 @@ static int vsscanf(const char * buffer, const char * format, va_list argPtr)
 	}
 	return result;
 }
+#elif defined NV_CPU_X86_64
+
+/* Prototype of the helper assembly function */
+#ifdef __cplusplus
+extern "C" {
 #endif
+
+int vsscanf_proxy_win64(const char * buffer, const char * format, va_list argPtr, __int64 count);
+
+#ifdef __cplusplus
+}
 #endif

+/* MASM64 version of the above vsscanf.
+ * Counts how many argument slots the format string may consume and forwards
+ * everything to the assembly helper vsscanf_proxy_win64, which copies that
+ * many pointer-sized slots from argPtr before calling sscanf.
+ */
+static int vsscanf(const char * buffer, const char * format, va_list argPtr)
+{
+	// Get an upper bound for the # of args. It is still only an upper bound:
+	// a '%' inside a scanset such as "%[%d]" is counted as a conversion.
+	__int64 count = 0;
+	for (const char * p = format; *p != 0; ++p) {
+		if (*p != '%') continue;
+
+		const char next = p[1];
+		if (next == '%') {
+			// "%%" is a literal percent sign: skip both characters so the
+			// second '%' is not mistaken for the start of a conversion.
+			++p;
+			continue;
+		}
+
+		// "%*..." suppresses assignment and therefore consumes no argument.
+		if (next != '*') ++count;
+	}
+	return vsscanf_proxy_win64(buffer, format, argPtr, count);
+}
+
+/*#error vsscanf doesn't work on MSVC for x64*/
+#else
+#error Unknown processor for MSVC
+#endif
+#endif // NV_CC_MSVC
+
 using namespace nv;

 Token::Token() :
--- a/src/nvcore/vsscanf_proxy_win64.masm
+++ b/src/nvcore/vsscanf_proxy_win64.masm
@ -0,0 +1,124 @@
+; MASM x64 version of
+; vsscanf for Win32
+; originally written 5/2003 by <mgix@mgix.com>
+;
+; This was done because MSVC does not accept inline assembly code
+; for the x64 platform, so this file implements almost the whole
+; module in assembly using the amd64 ABI
+;
+; 06/17/2008 by edgarv [at] nvidia com
+
+; Definition of memcpy
+memcpy	PROTO dest:Ptr, src:Ptr, numbytes:QWORD
+
+; Definition of sscanf
+sscanf PROTO buffer:Ptr Byte, format:Ptr Byte, args:VARARG
+
+
+
+; Start a code segment named "_TEXT" by default
+.CODE
+
+; Entry point of our function: at this point we can use
+; named parameters
+ALIGN 16
+PUBLIC vsscanf_proxy_win64
+
+; Because the x64 code uses the fast call convention, only
+; the arguments beyond the 4th one are available from the stack.
+; The first four parameters are in RCX, RDX, R8 and R9
+
+; Parameters:
+;    const char* buffer
+;    const char* format
+;    va_list argPtr
+;    size_t  count
+vsscanf_proxy_win64 PROC, \
+	buffer:PTR Byte, format:PTR Byte, argPtr:PTR, count:QWORD
+
+  ; Builds a fresh argument area on the stack laid out as
+  ;   newStack[0] = buffer, newStack[1] = format, newStack[2..] = *argPtr
+  ; and then calls sscanf on it. The return value of sscanf is returned in rax.
+  ;
+  ; NOTE(review): the named parameters and the [rbp - 8] local used below
+  ; presume that the assembler emits a frame prologue/epilogue for this PROC;
+  ; confirm against the generated listing.
+
+  ; Allocates space for our local variable, savedRSP (addressed as [rbp - 8])
+  sub rsp, 08h
+
+  ; Copies the parameters from the registers to the memory: before warping to
+  ; sscanf we will call memcpy, and those registers can just disappear!
+  mov buffer, rcx
+  mov format, rdx
+  mov argPtr, r8
+  mov count,  r9
+
+
+  ; Allocate extra space in the stack for (2+count)*sizeof(void*),
+  ; this is (2+count)*(8)
+  mov r10, r9
+  add r10, 2		; count += 2
+  sal r10, 3		; count *= 8
+  add r10, 0fh	; To force alignment to 16bytes
+  and r10, 0fffffffffffffff0h
+  sub rsp, r10	; Actual stack allocation
+
+
+  ; Continues by copying all the arguments in the "alloca" space
+  mov [rsp], rcx		    ; newStack[0] = (void*)buffer;
+  mov [rsp + 08h], rdx		; newStack[1] = (void*)format;
+
+  ; Calls memcpy(newStack+2, argPtr, count*sizeof(void*));
+  mov rcx, rsp
+  add rcx, 010h		; newStack+2
+  mov rdx, r8		; argPtr
+  mov r8, r9
+  sal r8, 3			; count*sizeof(void*)
+
+  ; Prepares extra stack space as required by the ABI for 4 arguments, and calls memcpy
+  ; NOTE(review): the Microsoft x64 convention requires rsp to be 16-byte
+  ; aligned at every call; whether it is here depends on the prologue the
+  ; assembler generates plus the 8-byte local above -- verify.
+  sub rsp, 020h
+  call memcpy
+
+  ; Restore the stack
+  add rsp, 020h
+
+  ; Saves rsp in memory
+  mov qword ptr [rbp - 8], rsp
+
+  ; Does exactly the same trick as before: warp into system sscanf with the new stack,
+  ; but this time we also setup the arguments in the registers according to the amd64 ABI
+
+  ; If there was at least one argument (after buffer and format), we need to copy that
+  ; to r8, and if there was a second one we must copy that to r9
+  ; (the first arguments to sscanf are always the buffer and the format)
+  mov r10, count
+
+  ; Copy the first argument to r8 (if it exists)
+  cmp r10, 0
+  je args_memcpy
+  mov r8, [rsp + 10h]
+
+  ; Copy the second argument to r9 (if it exists)
+  cmp r10, 1
+  je args_memcpy
+  mov r9, [rsp + 18h]
+
+args_memcpy:
+
+  ; Copies the buffer and format to rcx and rdx
+  mov rdx, [rsp + 08h]
+  mov rcx, [rsp]
+
+  ; Finally, calls sscanf using the current stack: the first four slots of the
+  ; new stack double as the register home area, the rest are the stack arguments
+  call sscanf
+
+  ; At this point the return value is already in rax
+
+  ; Restores rsp
+  mov rsp, qword ptr [rbp - 8]
+
+  ; Undoes the alloca. r10 cannot be reused for this: it was reloaded with
+  ; count before the call and is a volatile register across sscanf, so the
+  ; allocation size is recomputed from the saved count parameter.
+  mov r10, count
+  add r10, 2		; count + 2 slots (buffer and format)
+  sal r10, 3		; * sizeof(void*)
+  add r10, 0fh	; same 16-byte rounding as the allocation
+  and r10, 0fffffffffffffff0h
+  add rsp, r10
+
+  ; Restores the space for local variables
+  add rsp, 08h
+
+  ; Remember, the return value is already in rax since the sscanf call
+  ret
+
+vsscanf_proxy_win64 ENDP
+
+END