Merge internal branch.

- Remove old/unused code. - Remove format string constructors. - Better win64 support (vsscanf, prefetch, etc). - Fix radix sort to sort -0 correctly. - Add misc utilities (constraints, timer, cpuinfo, introsort).
2008-12-29 11:20:06 +00:00
parent a03411e451
commit e5ae0c0e20
14 changed files with 801 additions and 462 deletions
--- a/src/nvcore/Algorithms.h
+++ b/src/nvcore/Algorithms.h
@ -3,6 +3,8 @@
 #ifndef NV_CORE_ALGORITHMS_H
 #define NV_CORE_ALGORITHMS_H
 #include <nvcore/nvcore.h>
 namespace nv
 {
@ -45,22 +47,42 @@ namespace nv
 		}
 	}
-	
+	// @@ Should swap be implemented here?
-	// @@ Swap should be implemented here.
+
-	
+
-	
+
-#if 0
+	template <typename T, template <typename T> class C>
-	// This does not use swap, but copies, in some cases swaps are much faster than copies!
+	void sort(C<T> & container)
 	// Container should implement operator[], and size()
 	template <class Container, class T>
 	void insertionSort(Container<T> & container)
 	{
-		const uint n = container.size();
+		introsortLoop(container, 0, container.count());
-		for (uint i=1; i < n; ++i)
+		insertionSort(container, 0, container.count());
 	}
 	// Sorts the elements of 'container' whose indices lie in the half-open range [begin, end).
 	// The container must provide operator[]; elements must be comparable and swappable.
 	// The inner parameter of the template template argument is left unnamed: naming it 'T'
 	// redeclares the outer template parameter, which stricter compilers reject.
 	template <typename T, template <typename> class C>
 	void sort(C<T> & container, uint begin, uint end)
 	{
 		// Nothing to do for an empty (or inverted) range.
 		if (begin < end)
 		{
 			// Partition first, then let insertion sort finish the range.
 			introsortLoop(container, begin, end);
 			insertionSort(container, begin, end);
 		}
 	}
 	// Insertion-sorts the whole container, i.e. the index range [0, container.count()).
 	// Container should implement operator[] and count().
 	// The inner parameter of the template template argument is left unnamed: naming it 'T'
 	// redeclares the outer template parameter, which stricter compilers reject.
 	template <typename T, template <typename> class C>
 	void insertionSort(C<T> & container)
 	{
 		insertionSort(container, 0, container.count());
 	}
 	template <typename T, template <typename T> class C>
 	void insertionSort(C<T> & container, uint begin, uint end)
 	{
 		for (uint i = begin + 1; i != end; ++i)
 		{
 			T value = container[i];
 			uint j = i;
-			while (j > 0 && container[j-1] > value)
+			while (j != begin && container[j-1] > value)
 			{
 				container[j] = container[j-1];
 				--j;
@ -72,75 +94,60 @@ namespace nv
 		}
 	}
-	template <class Container, class T>
+	template <typename T, template <typename T> class C>
-	void quickSort(Container<T> & container)
+    void introsortLoop(C<T> & container, uint begin, uint end)
-	{
+    {
-		quickSort(container, 0, container.count());
+    	while (end-begin > 16)
-	}
+    	{
-	
+			uint p = partition(container, begin, end, medianof3(container, begin, begin+((end-begin)/2)+1, end-1));
-	{
+			introsortLoop(container, p, end);
-		/* threshhold for transitioning to insertion sort */
+			end = p;
-		while (n > 12) {
+    	}
-			int c01,c12,c,m,i,j;
+    }
-			/* compute median of three */
+	template <typename T, template <typename T> class C>
-			m = n >> 1;
+    uint partition(C<T> & a, uint begin, uint end, const T & x)
-			c = p[0] > p[m];
+    {
-			c01 = c;
+    	int i = begin, j = end;
-			c = &p[m] > &p[n-1];
+    	while (true)
-			c12 = c;
+    	{
-			/* if 0 >= mid >= end, or 0 < mid < end, then use mid */
+    	    while (a[i] < x) ++i;
-			if (c01 != c12) {
+    	    --j;
-				/* otherwise, we'll need to swap something else to middle */
+    	    while (x < a[j]) --j;
-				int z;
+    	    if (i >= j)
-				c = p[0] < p[n-1];
+    			return i;
-				/* 0>mid && mid<n:  0>n => n; 0<n => 0 */
+    	    swap(a[i], a[j]);
-				/* 0<mid && mid>n:  0>n => 0; 0<n => n */
+    	    i++;
-				z = (c == c12) ? 0 : n-1;
+    	}
-				swap(p[z], p[m]);
+    }
 	template <typename T, template <typename T> class C>
    const T & medianof3(C<T> & a, uint lo, uint mid, uint hi)
    {
 		if (a[mid] < a[lo])
 		{
 			if (a[hi] < a[mid])
 			{
 				return a[mid];
 			}
-			/* now p[m] is the median-of-three */
+			else
-			/* swap it to the beginning so it won't move around */
+			{
-			swap(p[0], p[m]);
+				return (a[hi] < a[lo]) ? a[hi] : a[lo];
 			/* partition loop */
 			i=1;
 			j=n-1;
 			for(;;) {
 				/* handling of equality is crucial here */
 				/* for sentinels & efficiency with duplicates */
 				for (;;++i) {
 					c = p[i] > p[0];
 					if (!c) break;
 				}
 				a = &p[0];
 				for (;;--j) {
 					b=&p[j];
 					c = p[j] > p[0]
 					if (!c) break;
 				}
 				/* make sure we haven't crossed */
 				if (i >= j) break;
 				swap(p[i], p[j]);
 				++i;
 				--j;
 			}
 			/* recurse on smaller side, iterate on larger */
 			if (j < (n-i)) {
 				quickSort(p, j);
 				p = p+i;
 				n = n-i;
 			} 
 			else {
 				quickSort(p+i, n-i);
 				n = j;
 			}
 		}
-		
+		else
-		insertionSort();
+		{
-	}
+			if (a[hi] < a[mid])
-#endif // 0
+			{
 				return (a[hi] < a[lo]) ? a[lo] : a[hi];
 			}
 			else
 			{
 				return a[mid];
 			}
 		}
    }
 } // nv namespace
--- a/src/nvcore/CMakeLists.txt
+++ b/src/nvcore/CMakeLists.txt
@ -24,11 +24,14 @@ SET(CORE_SRCS
 	TextReader.cpp
 	TextWriter.h
 	TextWriter.cpp
 	Tokenizer.h
 	Tokenizer.cpp
 	Radix.h
 	Radix.cpp
 	CpuInfo.h
 	CpuInfo.cpp
 	Algorithms.h
 	Timer.h
 	Library.h
 	Library.cpp
 	FileSystem.h
@ -36,6 +39,34 @@ SET(CORE_SRCS
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 # 64-bit Windows builds with MSVC need the assembly version of vsscanf.
 if(MSVC AND NV_SYSTEM_PROCESSOR STREQUAL "AMD64")
   set(VSSCANF_ASM_NAME "vsscanf_proxy_win64")
   set(VSSCANF_ASM_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${VSSCANF_ASM_NAME}.masm")
   # Choose the directory that receives the assembled object file.
   if(MSVC_IDE)
     # $(IntDir) is a macro expanded to the intermediate directory of the
     # selected solution configuration.
     set(VSSCANF_ASM_INTDIR "$(IntDir)")
   else(MSVC_IDE)
     # The NMake generator does not cope with a relative path to the generated
     # .obj source, so the absolute path is computed here. This is a hack that
     # worked as of cmake 2.6.0 patch 0.
     get_filename_component(VSSCANF_ASM_INTDIR
       "${nvcore_BINARY_DIR}/CMakeFiles/nvcore.dir" ABSOLUTE)
   endif(MSVC_IDE)
   set(VSSCANF_ASM_OBJ "${VSSCANF_ASM_INTDIR}/${VSSCANF_ASM_NAME}.obj")
   # Add the assembled object to the sources and register the rule that produces it.
   list(APPEND CORE_SRCS ${VSSCANF_ASM_OBJ})
   add_custom_command(OUTPUT ${VSSCANF_ASM_OBJ}
     MAIN_DEPENDENCY ${VSSCANF_ASM_SRC}
     COMMAND ml64
     ARGS /nologo /Fo ${VSSCANF_ASM_OBJ} /c /Cx ${VSSCANF_ASM_SRC}
   )
 endif(MSVC AND NV_SYSTEM_PROCESSOR STREQUAL "AMD64")
 # targets
 ADD_DEFINITIONS(-DNVCORE_EXPORTS)
--- a/src/nvcore/Constraints.h
+++ b/src/nvcore/Constraints.h
@ -0,0 +1,59 @@
 // This code is in the public domain -- castanyo@yahoo.es
 // Compile-time constraint helpers. Each constraint is a struct whose static
 // constraints() function only compiles when the stated property holds.
 // The include guard is specific to this header so that it can be included
 // together with Algorithms.h.
 #ifndef NV_CORE_CONSTRAINTS_H
 #define NV_CORE_CONSTRAINTS_H
 #include <nvcore/nvcore.h>
 namespace nv
 {
 	// Cool constraints from "Imperfect C++"
 	// must_be_pod
 	// Placing T inside a union is the check; the member name is chosen to
 	// show up in the compiler diagnostic when the check fails.
 	template <typename T>
 	struct must_be_pod
 	{
 		static void constraints()
 		{
 			union { T T_is_not_POD_type; };
 		}
 	};
 	// must_be_pod_or_void
 	// Same check as must_be_pod, with a specialization that accepts void.
 	template <typename T>
 	struct must_be_pod_or_void
 	{
 		static void constraints()
 		{
 			union { T T_is_not_POD_type; };
 		}
 	};
 	template <> struct must_be_pod_or_void<void> {};
 	// size_of
 	// sizeof(T) as a compile-time constant, with void defined to have size 0.
 	template <typename T>
 	struct size_of
 	{
 		enum { value = sizeof(T) };
 	};
 	template <> 
 	struct size_of<void>
 	{
 		enum { value = 0 };
 	};
 	// must_be_same_size
 	// Fails to compile when size_of<T1> and size_of<T2> differ.
 	template <typename T1, typename T2>
 	struct must_be_same_size
 	{
 		static void constraints()
 		{
 			// The array extent is 1 when the sizes match and -1 otherwise. A negative
 			// extent is always an error, whereas a zero extent is accepted by some
 			// compilers as an extension and would let a mismatch slip through.
 			const int T1_not_same_size_as_T2 = (int(size_of<T1>::value) == int(size_of<T2>::value)) ? 1 : -1;
 			int i[T1_not_same_size_as_T2];
 			(void)i; // silence unused-variable warnings
 		}
 	};
 } // nv namespace
 #endif // NV_CORE_CONSTRAINTS_H
--- a/src/nvcore/CpuInfo.cpp
+++ b/src/nvcore/CpuInfo.cpp
@ -33,30 +33,75 @@ static bool isWow64()
 #endif // NV_OS_WIN32
 #if NV_OS_LINUX
 #include <string.h>
 #include <sched.h>
 #endif // NV_OS_LINUX
 #if NV_OS_DARWIN
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #endif // NV_OS_DARWIN
 // Initialize the data and the local defines, which are designed
 // to match the positions in cpuid
 uint CpuInfo::m_cpu = ~0x0;
 uint CpuInfo::m_procCount = 0;
 #define NV_CPUINFO_MMX_MASK  (1<<23)
 #define NV_CPUINFO_SSE_MASK  (1<<25)
 #define NV_CPUINFO_SSE2_MASK (1<<26)
 #define NV_CPUINFO_SSE3_MASK (1)
 uint CpuInfo::processorCount()
 {
 	if (m_procCount == 0) {
 #if NV_OS_WIN32
-	SYSTEM_INFO sysInfo;
+		SYSTEM_INFO sysInfo;
-	typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);
+		typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);
-	if (isWow64())
+		if (isWow64())
-	{
+		{
-		GetNativeSystemInfo(&sysInfo);
+			GetNativeSystemInfo(&sysInfo);
-	}
+		}
-	else
+		else
-	{
+		{
-		GetSystemInfo(&sysInfo);
+			GetSystemInfo(&sysInfo);
-	}
+		}
-	uint count = (uint)sysInfo.dwNumberOfProcessors;
+		uint count = (uint)sysInfo.dwNumberOfProcessors;
-	nvDebugCheck(count >= 1);
+		m_procCount = count;
 #elif NV_OS_LINUX
 		// Code from x264 (July 6 snapshot) cpu.c:271
 		uint bit;
 		uint np;
 		cpu_set_t p_aff;
 		memset( &p_aff, 0, sizeof(p_aff) );
 		sched_getaffinity( 0, sizeof(p_aff), &p_aff );
 		for( np = 0, bit = 0; bit < sizeof(p_aff); bit++ )
 			np += (((uint8 *)&p_aff)[bit / 8] >> (bit % 8)) & 1;
 		m_procCount = np;
 #elif NV_OS_DARWIN
 		// Code from x264 (July 6 snapshot) cpu.c:286
 		uint numberOfCPUs;
 		size_t length = sizeof( numberOfCPUs );
 		if( sysctlbyname("hw.ncpu", &numberOfCPUs, &length, NULL, 0) )
 		{
 			numberOfCPUs = 1;
 		}
 		m_procCount = numberOfCPUs;
 	return count;
 #else
-	return 1;
+		m_procCount = 1;
 #endif
 	}
 	nvDebugCheck(m_procCount > 0);
 	return m_procCount;
 }
 uint CpuInfo::coreCount()
@ -66,23 +111,52 @@ uint CpuInfo::coreCount()
 bool CpuInfo::hasMMX()
 {
-	return false;
+	return (cpu() & NV_CPUINFO_MMX_MASK) != 0;
 }
 bool CpuInfo::hasSSE()
 {
-	return false;
+	return (cpu() & NV_CPUINFO_SSE_MASK) != 0;
 }
 bool CpuInfo::hasSSE2()
 {
-	return false;
+	return (cpu() & NV_CPUINFO_SSE2_MASK) != 0;
 }
 bool CpuInfo::hasSSE3()
 {
-	return false;
+	return (cpu() & NV_CPUINFO_SSE3_MASK) != 0;
 }
 // Returns the cached CPU feature mask, computing it on the first call.
 // The mask uses the NV_CPUINFO_*_MASK bits; it stays 0 when no detection
 // code is available for the current compiler.
 inline int CpuInfo::cpu() {
 	// ~0 marks the cache as not yet initialized.
 	if (m_cpu != ~0x0) {
 		return m_cpu;
 	}
 	m_cpu = 0;
 #if NV_CC_MSVC
 	int words[4] = {-1};
 	__cpuid(words, /*InfoType*/ 1);
 	// Keep only the feature bits of interest from the third and fourth result words.
 	const int featuresWord2 = words[2] & NV_CPUINFO_SSE3_MASK;
 	const int featuresWord3 = words[3] & (NV_CPUINFO_MMX_MASK | NV_CPUINFO_SSE_MASK | NV_CPUINFO_SSE2_MASK);
 	m_cpu |= featuresWord2;
 	m_cpu |= featuresWord3;
 #elif NV_CC_GNUC
 	// TODO: add the proper inline assembly
 #if NV_CPU_X86
 #elif NV_CPU_X86_64
 #endif	// NV_CPU_X86_64
 #endif	// NV_CC_GNUC
 	return m_cpu;
 }
--- a/src/nvcore/CpuInfo.h
+++ b/src/nvcore/CpuInfo.h
@ -18,6 +18,15 @@ namespace nv
 	// CPU Information.
 	class CpuInfo
 	{
 	protected:
 		static int cpu();
 	private:
 		// Cache of the CPU data
 		static uint m_cpu;
 		static uint m_procCount;
 	public:
 		static uint processorCount();
 		static uint coreCount();
@ -25,7 +34,6 @@ namespace nv
 		static bool hasSSE();
 		static bool hasSSE2();
 		static bool hasSSE3();
 	};
 #if NV_CC_MSVC
--- a/src/nvcore/Prefetch.h
+++ b/src/nvcore/Prefetch.h
@ -12,16 +12,15 @@
 #elif NV_CC_MSVC 
-#if NV_CPU_X86
+// Uses SSE Intrinsics for both x86 and x86_64
 #include <xmmintrin.h>
 __forceinline void nvPrefetch(const void * mem)
 {
-	__asm mov ecx, mem
+	_mm_prefetch(static_cast<const char*>(mem), _MM_HINT_T0);	/* prefetcht0  */
-	__asm prefetcht0 [ecx];
+//	_mm_prefetch(static_cast<const char*>(mem), _MM_HINT_NTA);	/* prefetchnta */
 //	__asm prefetchnta [ecx];
 }
-#endif // NV_CPU_X86
+#else
 #else // NV_CC_MSVC
 // do nothing in other case.
 #define nvPrefetch(ptr)
--- a/src/nvcore/Ptr.h
+++ b/src/nvcore/Ptr.h
@ -34,11 +34,11 @@ class AutoPtr
 	NV_FORBID_HEAPALLOC();
 public:
 	/// Default ctor.
 	AutoPtr() : m_ptr(NULL) { }
 	/// Ctor.
-	explicit AutoPtr( T * p ) : m_ptr(p) { }
+	AutoPtr(T * p = NULL) : m_ptr(p) { }
 	template <class Q>
 	AutoPtr(Q * p) : m_ptr(static_cast<T *>(p)) { }
 	/** Dtor. Deletes owned pointer. */
 	~AutoPtr() {
--- a/src/nvcore/Radix.cpp
+++ b/src/nvcore/Radix.cpp
@ -7,6 +7,10 @@
 */
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // References:
 // http://www.codercorner.com/RadixSortRevisited.htm
 // http://www.stereopsis.com/radix.html
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 /**
 *	Revisited Radix Sort.
@ -26,19 +30,25 @@
 *	- 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
 *	- 10.11.01: added local ram support
 *	- 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
 *	- 01.02.02:	- "mIndices" renamed => "mRanks". That's a rank sorter after all.
 *				- ranks are not "reset" anymore, but implicit on first calls
 *	- 07.05.02:	offsets rewritten with one less indirection.
 *	- 11.03.02:	"bool" replaced with RadixHint enum
 *	- 07.15.04:	stack-based radix added
 *				- we want to use the radix sort but without making it static, and without allocating anything.
 *				- we internally allocate two arrays of ranks. Each of them has N uint32s to sort N values.
 *				- 1Mb/2/sizeof(uint32) = 131072 values max, at the same time.
 *	- 09.22.04:	- adapted to MacOS by Chris Lamb
 *	- 01.12.06:	- added optimizations suggested by Kyle Hubert
 *	- 04.06.08:	- Fix negative zero sorting bug, by Ignacio Castaño
 *
 *	\class		RadixSort
 *	\author		Pierre Terdiman
- *	\version	1.3
+ *	\version	1.5
 *	\date		August, 15, 1998
 */
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 /*
 To do:
 	- add an offset parameter between two input values (avoid some data recopy sometimes)
 	- unroll ? asm ?
 */
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Header
@ -49,138 +59,170 @@ To do:
 //using namespace IceCore;
-#define DELETEARRAY(a)	{ delete [] a; a = NULL; }
+#define INVALIDATE_RANKS	mCurrentSize|=0x80000000
-#define CHECKALLOC(a)
+#define VALIDATE_RANKS		mCurrentSize&=0x7fffffff
 #define CURRENT_SIZE		(mCurrentSize&0x7fffffff)
 #define INVALID_RANKS		(mCurrentSize&0x80000000)
-
+#if NV_BIG_ENDIAN
-
+	#define H0_OFFSET	768
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	#define H1_OFFSET	512
-/**
+	#define H2_OFFSET	256
- *	Constructor.
+	#define H3_OFFSET	0
- */
+	#define BYTES_INC	(3-j)
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+#else 
-RadixSort::RadixSort() : mCurrentSize(0), mPreviousSize(0), mIndices(NULL), mIndices2(NULL), mTotalCalls(0), mNbHits(0)
+	#define H0_OFFSET	0
-{
+	#define H1_OFFSET	256
-#ifndef RADIX_LOCAL_RAM
+	#define H2_OFFSET	512
-	// Allocate input-independent ram
+	#define H3_OFFSET	768
-	mHistogram		= new uint32[256*4];
+	#define BYTES_INC	j
 	mOffset			= new uint32[256];
 #endif
 // CREATE_HISTOGRAMS(type, buffer): fills the four 256-entry byte histograms stored in
 // mHistogram with a single sweep over 'input', while comparing consecutive values of
 // 'buffer' (read as 'type') to detect a sequence that is already sorted.
 // When the ranks are invalid the buffer is walked sequentially; otherwise it is walked
 // in the order stored in mRanks.
 // If no out-of-order pair is found, the macro increments mNbHits and returns *this from
 // the enclosing function (writing identity ranks first when the ranks were invalid).
 // Otherwise the remaining histogram entries are counted without the ordering test.
 // Expects 'input', 'nb', 'mHistogram', 'mRanks', 'mNbHits' and 'mCurrentSize'
 // (through INVALID_RANKS) to be visible where the macro is expanded.
 #define CREATE_HISTOGRAMS(type, buffer)														\
 	/* Clear counters/histograms */															\
 	memset(mHistogram, 0, 256*4*sizeof(uint32));											\
 																							\
 	/* Prepare to count */																	\
 	const uint8* p = (const uint8*)input;													\
 	const uint8* pe = &p[nb*4];																\
 	uint32* h0= &mHistogram[H0_OFFSET];	/* Histogram for first pass (LSB)	*/				\
 	uint32* h1= &mHistogram[H1_OFFSET];	/* Histogram for second pass		*/				\
 	uint32* h2= &mHistogram[H2_OFFSET];	/* Histogram for third pass			*/				\
 	uint32* h3= &mHistogram[H3_OFFSET];	/* Histogram for last pass (MSB)	*/				\
 																							\
 	bool AlreadySorted = true;	/* Optimism... */											\
 																							\
 	if(INVALID_RANKS)																		\
 	{																						\
 		/* Prepare for temporal coherence */												\
 		type* Running = (type*)buffer;														\
 		type PrevVal = *Running;															\
 																							\
 		while(p!=pe)																		\
 		{																					\
 			/* Read input buffer in previous sorted order */								\
 			type Val = *Running++;															\
 			/* Check whether already sorted or not */										\
 			if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */				\
 			/* Update for next iteration */													\
 			PrevVal = Val;																	\
 																							\
 			/* Create histograms */															\
 			h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;									\
 		}																					\
 																							\
 		/* If all input values are already sorted, we just have to return and leave the */	\
 		/* previous list unchanged. That way the routine may take advantage of temporal */	\
 		/* coherence, for example when used to sort transparent faces.					*/	\
 		if(AlreadySorted)																	\
 		{																					\
 			mNbHits++;																		\
 			for(uint32 i=0;i<nb;i++)	mRanks[i] = i;										\
 			return *this;																	\
 		}																					\
 	}																						\
 	else																					\
 	{																						\
 		/* Prepare for temporal coherence */												\
 		const uint32* Indices = mRanks;														\
 		type PrevVal = (type)buffer[*Indices];												\
 																							\
 		while(p!=pe)																		\
 		{																					\
 			/* Read input buffer in previous sorted order */								\
 			type Val = (type)buffer[*Indices++];											\
 			/* Check whether already sorted or not */										\
 			if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */				\
 			/* Update for next iteration */													\
 			PrevVal = Val;																	\
 																							\
 			/* Create histograms */															\
 			h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;									\
 		}																					\
 																							\
 		/* If all input values are already sorted, we just have to return and leave the */	\
 		/* previous list unchanged. That way the routine may take advantage of temporal */	\
 		/* coherence, for example when used to sort transparent faces.					*/	\
 		if(AlreadySorted)	{ mNbHits++; return *this;	}									\
 	}																						\
 																							\
 	/* Else there has been an early out and we must finish computing the histograms */		\
 	while(p!=pe)																			\
 	{																						\
 		/* Create histograms without the previous overhead */								\
 		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;										\
 	}
 // CHECK_PASS_VALIDITY(pass): decides whether radix pass 'pass' has to be performed.
 // Declares 'CurCount' (the counters of mHistogram starting at index pass<<8) and the
 // flag 'PerformPass' in the scope where the macro is expanded.
 // PerformPass is cleared when the counter selected by the byte at offset 'pass' from
 // the start of 'input' equals 'nb', i.e. when every value shares that byte.
 // Expects 'input', 'nb' and 'mHistogram' to be visible at the expansion site.
 #define CHECK_PASS_VALIDITY(pass)															\
 	/* Shortcut to current counters */														\
 	const uint32* CurCount = &mHistogram[pass<<8];											\
 																							\
 	/* Reset flag. The sorting pass is supposed to be performed. (default) */				\
 	bool PerformPass = true;																\
 																							\
 	/* Check pass validity */																\
 																							\
 	/* If all values have the same byte, sorting is useless. */								\
 	/* It may happen when sorting bytes or words instead of dwords. */						\
 	/* This routine actually sorts words faster than dwords, and bytes */					\
 	/* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */				\
 	/* for words and O(n) for bytes. Running time for floats depends on actual values... */	\
 																							\
 	/* Get first byte */																	\
 	uint8 UniqueVal = *(((uint8*)input)+pass);												\
 																							\
 	/* Check that byte's counter */															\
 	if(CurCount[UniqueVal]==nb)	PerformPass=false;
 using namespace nv;
 /// Constructor.
 RadixSort::RadixSort() : mRanks(NULL), mRanks2(NULL), mCurrentSize(0), mTotalCalls(0), mNbHits(0), mDeleteRanks(true)
 {
 	// Initialize indices
-	resetIndices();
+	INVALIDATE_RANKS;
 }
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Destructor.
 /**
 *	Destructor.
 */
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 RadixSort::~RadixSort()
 {
 	// Release everything
-#ifndef RADIX_LOCAL_RAM
+	if(mDeleteRanks)
-	DELETEARRAY(mOffset);
+	{
-	DELETEARRAY(mHistogram);
+		delete [] mRanks2;
-#endif
+		delete [] mRanks;
-	DELETEARRAY(mIndices2);
+	}
 	DELETEARRAY(mIndices);
 }
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Resizes the inner lists.
-/**
+/// \param		nb				[in] new size (number of dwords)
- *	Resizes the inner lists.
+/// \return		true if success
 *	\param		nb				[in] new size (number of dwords)
 *	\return		true if success
 */
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 bool RadixSort::resize(uint32 nb)
 {
-	// Free previously used ram
+	if(mDeleteRanks)
-	DELETEARRAY(mIndices2);
+	{
-	DELETEARRAY(mIndices);
+		// Free previously used ram
-
+		delete [] mRanks2;
-	// Get some fresh one
+		delete [] mRanks;
 	mIndices		= new uint32[nb];	CHECKALLOC(mIndices);
 	mIndices2		= new uint32[nb];	CHECKALLOC(mIndices2);
 	mCurrentSize	= nb;
 	// Initialize indices so that the input buffer is read in sequential order
 	resetIndices();
 		// Get some fresh one
 		mRanks	= new uint32[nb];
 		mRanks2	= new uint32[nb];
 	}
 	return true;
 }
-#define CHECK_RESIZE(n)																			\
+inline void RadixSort::checkResize(uint32 nb)
-	if(n!=mPreviousSize)																		\
+{
-	{																							\
+	uint32 CurSize = CURRENT_SIZE;
-				if(n>mCurrentSize)	resize(n);													\
+	if(nb!=CurSize)
-		else						resetIndices();												\
+	{
-		mPreviousSize = n;																		\
+		if(nb>CurSize) resize(nb);
 		mCurrentSize = nb;
 		INVALIDATE_RANKS;
 	}
-
+}
 #define CREATE_HISTOGRAMS(type, buffer)															\
 	/* Clear counters */																		\
 	memset(mHistogram, 0, 256*4*sizeof(uint32));												\
 																								\
 	/* Prepare for temporal coherence */														\
 	type PrevVal = (type)buffer[mIndices[0]];													\
 	bool AlreadySorted = true;	/* Optimism... */												\
 	uint32* Indices = mIndices;																	\
 																								\
 	/* Prepare to count */																		\
 	uint8* p = (uint8*)input;																	\
 	uint8* pe = &p[nb*4];																		\
 	uint32* h0= &mHistogram[0];		/* Histogram for first pass (LSB)	*/						\
 	uint32* h1= &mHistogram[256];	/* Histogram for second pass		*/						\
 	uint32* h2= &mHistogram[512];	/* Histogram for third pass			*/						\
 	uint32* h3= &mHistogram[768];	/* Histogram for last pass (MSB)	*/						\
 																								\
 	while(p!=pe)																				\
 	{																							\
 		/* Read input buffer in previous sorted order */										\
 		type Val = (type)buffer[*Indices++];													\
 		/* Check whether already sorted or not */												\
 		if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */						\
 		/* Update for next iteration */															\
 		PrevVal = Val;																			\
 																								\
 		/* Create histograms */																	\
 		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;											\
 	}																							\
 																								\
 	/* If all input values are already sorted, we just have to return and leave the */			\
 	/* previous list unchanged. That way the routine may take advantage of temporal */			\
 	/* coherence, for example when used to sort transparent faces.					*/			\
 	if(AlreadySorted)	{ mNbHits++; return *this;	}											\
 																								\
 	/* Else there has been an early out and we must finish computing the histograms */			\
 	while(p!=pe)																				\
 	{																							\
 		/* Create histograms without the previous overhead */									\
 		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;											\
 	}
 #define CHECK_PASS_VALIDITY(pass)																\
 	/* Shortcut to current counters */															\
 	uint32* CurCount = &mHistogram[pass<<8];													\
 																								\
 	/* Reset flag. The sorting pass is supposed to be performed. (default) */					\
 	bool PerformPass = true;																	\
 																								\
 	/* Check pass validity */																	\
 																								\
 	/* If all values have the same byte, sorting is useless. */									\
 	/* It may happen when sorting bytes or words instead of dwords. */							\
 	/* This routine actually sorts words faster than dwords, and bytes */						\
 	/* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */					\
 	/* for words and O(n) for bytes. Running time for floats depends on actual values... */		\
 																								\
 	/* Get first byte */																		\
 	uint8 UniqueVal = *(((uint8*)input)+pass);													\
 																								\
 	/* Check that byte's counter */																\
 	if(CurCount[UniqueVal]==nb)	PerformPass=false;
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 /**
@ -192,46 +234,31 @@ bool RadixSort::resize(uint32 nb)
 *	\return		Self-Reference
 */
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues)
+RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedValues/*=true*/)
 {
 	uint32 i, j;
 	// Checkings
-	if(!input || !nb)	return *this;
+	if(!input || !nb || nb&0x80000000)	return *this;
 	// Stats
 	mTotalCalls++;
 	// Resize lists if needed
-	CHECK_RESIZE(nb);
+	checkResize(nb);
 #ifdef RADIX_LOCAL_RAM
 	// Allocate histograms & offsets on the stack
 	uint32 mHistogram[256*4];
-	uint32 mOffset[256];
+	uint32* mLink[256];
 #endif
 	// Create histograms (counters). Counters for all passes are created in one run.
 	// Pros:	read input buffer once instead of four times
 	// Cons:	mHistogram is 4Kb instead of 1Kb
 	// We must take care of signed/unsigned values for temporal coherence.... I just
 	// have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
-	if(!signedvalues)	{ CREATE_HISTOGRAMS(uint32, input);	}
+	if(!signedValues)	{ CREATE_HISTOGRAMS(uint32, input);	}
 	else				{ CREATE_HISTOGRAMS(int32, input);	}
 	// Compute #negative values involved if needed
 	uint32 NbNegativeValues = 0;
 	if(signedvalues)
 	{
 		// An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
 		// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
 		// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
 		uint32* h3= &mHistogram[768];
 		for( i=128;i<256;i++)	NbNegativeValues += h3[i];	// 768 for last histogram, 128 for negative part
 	}
 	// Radix sort, j is the pass number (0=LSB, 3=MSB)
-	for( j=0;j<4;j++)
+	for(uint32 j=0;j<4;j++)
 	{
 		CHECK_PASS_VALIDITY(j);
@ -240,40 +267,47 @@ RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues)
 		if(PerformPass)
 		{
 			// Should we care about negative values?
-			if(j!=3 || !signedvalues)
+			if(j!=3 || !signedValues)
 			{
 				// Here we deal with positive values only
 				// Create offsets
-				mOffset[0] = 0;
+				mLink[0] = mRanks2;
-				for(i=1;i<256;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];
+				for(uint32 i=1;i<256;i++)		mLink[i] = mLink[i-1] + CurCount[i-1];
 			}
 			else
 			{
 				// This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
 				mLink[128] = mRanks2;
 				for(uint32 i=129;i<256;i++)	mLink[i] = mLink[i-1] + CurCount[i-1];
-				// Create biased offsets, in order for negative numbers to be sorted as well
+				mLink[0] = mLink[255] + CurCount[255];
-				mOffset[0] = NbNegativeValues;												// First positive number takes place after the negative ones
+				for(uint32 i=1;i<128;i++)	mLink[i] = mLink[i-1] + CurCount[i-1];
 				for(i=1;i<128;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];	// 1 to 128 for positive numbers
 				// Fixing the wrong place for negative values
 				mOffset[128] = 0;
 				for(i=129;i<256;i++)			mOffset[i] = mOffset[i-1] + CurCount[i-1];
 			}
 			// Perform Radix Sort
-			uint8* InputBytes	= (uint8*)input;
+			const uint8* InputBytes	= (const uint8*)input;
-			uint32* Indices		= mIndices;
+			InputBytes += BYTES_INC;
-			uint32* IndicesEnd	= &mIndices[nb];
+			if(INVALID_RANKS)
 			InputBytes += j;
 			while(Indices!=IndicesEnd)
 			{
-				uint32 id = *Indices++;
+				for(uint32 i=0;i<nb;i++)	*mLink[InputBytes[i<<2]]++ = i;
-				mIndices2[mOffset[InputBytes[id<<2]]++] = id;
+				VALIDATE_RANKS;
 			}
 			else
 			{
 				const uint32* Indices		= mRanks;
 				const uint32* IndicesEnd	= &mRanks[nb];
 				while(Indices!=IndicesEnd)
 				{
 					uint32 id = *Indices++;
 					*mLink[InputBytes[id<<2]]++ = id;
 				}
 			}
-			// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
+			// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
-			uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+			uint32* Tmp = mRanks;
 			mRanks = mRanks2;
 			mRanks2 = Tmp;
 		}
 	}
 	return *this;
@ -291,24 +325,20 @@ RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues)
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 RadixSort& RadixSort::sort(const float* input2, uint32 nb)
 {
 	uint32 i, j;
 	// Checkings
-	if(!input2 || !nb)	return *this;
+	if(!input2 || !nb || nb&0x80000000)	return *this;
 	// Stats
 	mTotalCalls++;
-	uint32* input = (uint32*)input2;
+	const uint32* input = (const uint32*)input2;
 	// Resize lists if needed
-	CHECK_RESIZE(nb);
+	checkResize(nb);
 #ifdef RADIX_LOCAL_RAM
 	// Allocate histograms & offsets on the stack
 	uint32 mHistogram[256*4];
-	uint32 mOffset[256];
+	uint32* mLink[256];
 #endif
 	// Create histograms (counters). Counters for all passes are created in one run.
 	// Pros:	read input buffer once instead of four times
@ -320,16 +350,8 @@ RadixSort& RadixSort::sort(const float* input2, uint32 nb)
 	// wouldn't work with mixed positive/negative values....
 	{ CREATE_HISTOGRAMS(float, input2); }
 	// Compute #negative values involved if needed
 	uint32 NbNegativeValues = 0;
 	// An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
 	// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
 	// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
 	uint32* h3= &mHistogram[768];
 	for( i=128;i<256;i++)	NbNegativeValues += h3[i];	// 768 for last histogram, 128 for negative part
 	// Radix sort, j is the pass number (0=LSB, 3=MSB)
-	for( j=0;j<4;j++)
+	for(uint32 j=0;j<4;j++)
 	{
 		// Should we care about negative values?
 		if(j!=3)
@ -340,22 +362,32 @@ RadixSort& RadixSort::sort(const float* input2, uint32 nb)
 			if(PerformPass)
 			{
 				// Create offsets
-				mOffset[0] = 0;
+				mLink[0] = mRanks2;
-				for( i=1;i<256;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];
+				for(uint32 i=1;i<256;i++)		mLink[i] = mLink[i-1] + CurCount[i-1];
 				// Perform Radix Sort
-				uint8* InputBytes	= (uint8*)input;
+				const uint8* InputBytes = (const uint8*)input;
-				uint32* Indices		= mIndices;
+				InputBytes += BYTES_INC;
-				uint32* IndicesEnd	= &mIndices[nb];
+				if(INVALID_RANKS)
 				InputBytes += j;
 				while(Indices!=IndicesEnd)
 				{
-					uint32 id = *Indices++;
+					for(uint32 i=0;i<nb;i++)	*mLink[InputBytes[i<<2]]++ = i;
-					mIndices2[mOffset[InputBytes[id<<2]]++] = id;
+					VALIDATE_RANKS;
 				}
 				else
 				{
 					const uint32* Indices		= mRanks;
 					const uint32* IndicesEnd	= &mRanks[nb];
 					while(Indices!=IndicesEnd)
 					{
 						uint32 id = *Indices++;
 						*mLink[InputBytes[id<<2]]++ = id;
 					}
 				}
-				// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
+				// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
-				uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+				uint32* Tmp = mRanks;
 				mRanks = mRanks2;
 				mRanks2 = Tmp;
 			}
 		}
 		else
@ -365,35 +397,58 @@ RadixSort& RadixSort::sort(const float* input2, uint32 nb)
 			if(PerformPass)
 			{
-				// Create biased offsets, in order for negative numbers to be sorted as well
+				mLink[255] = mRanks2 + CurCount[255];
-				mOffset[0] = NbNegativeValues;												// First positive number takes place after the negative ones
+				for(uint32 i = 254; i > 126; i--) mLink[i] = mLink[i+1] + CurCount[i];
-				for(i=1;i<128;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];	// 1 to 128 for positive numbers
+				mLink[0] = mLink[127] + CurCount[127];
-
+				for(uint32 i = 1; i < 127; i++) mLink[i] = mLink[i-1] + CurCount[i-1];
 				// We must reverse the sorting order for negative numbers!
 				mOffset[255] = 0;
 				for(i=0;i<127;i++)		mOffset[254-i] = mOffset[255-i] + CurCount[255-i];	// Fixing the wrong order for negative values
 				for(i=128;i<256;i++)	mOffset[i] += CurCount[i];							// Fixing the wrong place for negative values
 				// Perform Radix Sort
-				for(i=0;i<nb;i++)
+				if(INVALID_RANKS)
 				{
-					uint32 Radix = input[mIndices[i]]>>24;								// Radix byte, same as above. AND is useless here (uint32).
+					for(uint32 i=0;i<nb;i++)
-					// ### cmp to be killed. Not good. Later.
+					{
-					if(Radix<128)		mIndices2[mOffset[Radix]++] = mIndices[i];		// Number is positive, same as above
+						uint32 Radix = input[i]>>24;							// Radix byte, same as above. AND is useless here (uint32).
-					else				mIndices2[--mOffset[Radix]] = mIndices[i];		// Number is negative, flip the sorting order
+						// ### cmp to be killed. Not good. Later.
 						if(Radix<128)		*mLink[Radix]++ = i;		// Number is positive, same as above
 						else				*(--mLink[Radix]) = i;		// Number is negative, flip the sorting order
 					}
 					VALIDATE_RANKS;
 				}
-				// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
+				else
-				uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+				{
 					for(uint32 i=0;i<nb;i++)
 					{
 						uint32 Radix = input[mRanks[i]]>>24;							// Radix byte, same as above. AND is useless here (uint32).
 						// ### cmp to be killed. Not good. Later.
 						if(Radix<128)		*mLink[Radix]++ = mRanks[i];		// Number is positive, same as above
 						else				*(--mLink[Radix]) = mRanks[i];		// Number is negative, flip the sorting order
 					}
 				}
 				// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
 				uint32* Tmp = mRanks;
 				mRanks = mRanks2;
 				mRanks2 = Tmp;
 			}
 			else
 			{
 				// The pass is useless, yet we still have to reverse the order of current list if all values are negative.
 				if(UniqueVal>=128)
 				{
-					for(i=0;i<nb;i++)	mIndices2[i] = mIndices[nb-i-1];
+					if(INVALID_RANKS)
 					{
 						// ###Possible?
 						for(uint32 i=0;i<nb;i++)	mRanks2[i] = nb-i-1;
 						VALIDATE_RANKS;
 					}
 					else
 					{
 						for(uint32 i=0;i<nb;i++)	mRanks2[i] = mRanks[nb-i-1];
 					}
-					// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
+					// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
-					uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+					uint32* Tmp = mRanks;
 					mRanks = mRanks2;
 					mRanks2 = Tmp;
 				}
 			}
 		}
@ -401,29 +456,29 @@ RadixSort& RadixSort::sort(const float* input2, uint32 nb)
 	return *this;
 }
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
-/**
+bool RadixSort::setRankBuffers(uint32* ranks0, uint32* ranks1)
 *	Resets the inner indices. After the call, mIndices is reset.
 */
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void RadixSort::resetIndices()
 {
-	for(uint32 i=0;i<mCurrentSize;i++)	mIndices[i] = i;
+	if(!ranks0 || !ranks1)	return false;
 	mRanks			= ranks0;
 	mRanks2			= ranks1;
 	mDeleteRanks	= false;
 	return true;
 }
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+RadixSort & RadixSort::sort(const Array<int> & input)
 /**
 *	Gets the ram used.
 *	\return		memory used in bytes
 */
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 uint32 RadixSort::usedRam() const
 {
-	uint32 UsedRam = sizeof(RadixSort);
+	return sort((const uint32 *)input.buffer(), input.count(), true);
-#ifndef RADIX_LOCAL_RAM
+}
-	UsedRam += 256*4*sizeof(uint32);			// Histograms
+
-	UsedRam += 256*sizeof(uint32);				// Offsets
+RadixSort & RadixSort::sort(const Array<uint> & input)
-#endif
+{
-	UsedRam += 2*mCurrentSize*sizeof(uint32);	// 2 lists of indices
+	return sort(input.buffer(), input.count(), false);
-	return UsedRam;
+}
 RadixSort & RadixSort::sort(const Array<float> & input)
 {
 	// Convenience overload: hand the array's storage and element count to the raw-pointer float sort.
 	const float * values = input.buffer();
 	return sort(values, input.count());
 }
--- a/src/nvcore/Radix.h
+++ b/src/nvcore/Radix.h
@ -13,57 +13,61 @@
 #define NV_CORE_RADIXSORT_H
 #include <nvcore/nvcore.h>
 #include <nvcore/Containers.h>
 namespace nv
 {
 	class NVCORE_CLASS RadixSort
 	{
 		NV_FORBID_COPY(RadixSort);
 	public:
 		// Constructor/Destructor
 		RadixSort();
 		~RadixSort();
 		// Sorting methods
 		RadixSort & sort(const uint32* input, uint32 nb, bool signedValues=true);
 		RadixSort &	sort(const float* input, uint32 nb);
 		// Helpers
 		RadixSort & sort(const Array<int> & input);
 		RadixSort & sort(const Array<uint> & input);
 		RadixSort & sort(const Array<float> & input);
-#define RADIX_LOCAL_RAM
+		//! Access to results. mRanks is a list of indices in sorted order, i.e. in the order you may further process your data
 		inline /*const*/ uint32 * ranks() /*const*/ { return mRanks; }
 		//! mRanks2 gets trashed on calling the sort routine, but otherwise you can recycle it the way you want.
 		inline uint32 * recyclable() const { return mRanks2; }
-class NVCORE_API RadixSort {
+		// Stats
-	NV_FORBID_COPY(RadixSort);
+		//! Returns the total number of calls to the radix sorter.
-public:
+		inline uint32 totalCalls() const { return mTotalCalls; }
 	// Constructor/Destructor
 	RadixSort();
 	~RadixSort();
-	// Sorting methods
+		//! Returns the number of early exits due to temporal coherence.
-	RadixSort & sort(const uint32* input, uint32 nb, bool signedvalues=true);
+		inline uint32 hits() const { return mNbHits; }
 	RadixSort & sort(const float* input, uint32 nb);
 	//! Access to results. mIndices is a list of indices in sorted order, i.e. in the order you may further process your data
 	inline uint32 * indices() const { return mIndices; }
 	//! mIndices2 gets trashed on calling the sort routine, but otherwise you can recycle it the way you want.
 	inline uint32 * recyclable() const { return mIndices2; }
 	// Stats
 	uint32 usedRam() const;
 	//! Returns the total number of calls to the radix sorter.
 	inline uint32 totalCalls()	const { return mTotalCalls;	}
 	//! Returns the number of premature exits due to temporal coherence.
 	inline uint32 hits() const { return mNbHits; }
 		bool setRankBuffers(uint32* ranks0, uint32* ranks1);
 	private:
-#ifndef RADIX_LOCAL_RAM
+		uint32 mCurrentSize;    //!< Current size of the indices list
-	uint32*			mHistogram;					//!< Counters for each byte
+		uint32 * mRanks;        //!< Two lists, swapped each pass
-	uint32*			mOffset;					//!< Offsets (nearly a cumulative distribution function)
+		uint32 * mRanks2;
 #endif
 	uint32			mCurrentSize;				//!< Current size of the indices list
 	uint32			mPreviousSize;				//!< Size involved in previous call
 	uint32*			mIndices;					//!< Two lists, swapped each pass
 	uint32*			mIndices2;
-	// Stats
+		// Stats
-	uint32			mTotalCalls;
+		uint32 mTotalCalls;     //!< Total number of calls to the sort routine
-	uint32			mNbHits;
+		uint32 mNbHits;         //!< Number of early exits due to coherence
-	// Internal methods
+		// Stack-radix
-	bool			resize(uint32 nb);
+		bool mDeleteRanks;      //!<
 	void			resetIndices();
-};
+		// Internal methods
 		void checkResize(uint32 nb);
 		bool resize(uint32 nb);
 	};
 } // nv namespace
 #endif // NV_CORE_RADIXSORT_H
--- a/src/nvcore/StrLib.cpp
+++ b/src/nvcore/StrLib.cpp
@ -208,49 +208,12 @@ StringBuilder::StringBuilder( const StringBuilder & s ) : m_size(0), m_str(NULL)
 	copy(s);
 }
-// Copy string. 
+/** Copy string. */
-/*StringBuilder::StringBuilder( const char * s )
+StringBuilder::StringBuilder( const char * s )
 {
 	copy(s);
 }*/
 /** Allocate and copy string. */
 StringBuilder::StringBuilder( int size_hint, const StringBuilder & s)
 	: m_size(size_hint)
 	, m_str(NULL)
 {
 	// The size hint must be strictly positive.
 	nvDebugCheck(m_size > 0);
 	// Allocate storage from the hint via strAlloc(), then copy the contents of `s`.
 	m_str = strAlloc(m_size);
 	copy(s);
 }
 /** Allocate and format string. */
 StringBuilder::StringBuilder( const char * fmt, ... )
 	: m_size(0)
 	, m_str(NULL)
 {
 	nvDebugCheck(NULL != fmt);
 	// Forward the variadic arguments to the va_list overload of format().
 	va_list args;
 	va_start(args, fmt);
 	format(fmt, args);
 	va_end(args);
 }
 /** Allocate and format string. */
 StringBuilder::StringBuilder( int size_hint, const char * fmt, ... )
 	: m_size(size_hint)
 	, m_str(NULL)
 {
 	// Both the size hint and the format string are required.
 	nvDebugCheck(m_size > 0);
 	nvDebugCheck(NULL != fmt);
 	// Allocate storage from the hint via strAlloc() before formatting.
 	m_str = strAlloc(m_size);
 	// Forward the variadic arguments to the va_list overload of format().
 	va_list args;
 	va_start(args, fmt);
 	format(fmt, args);
 	va_end(args);
 }
 /** Delete the string. */
 StringBuilder::~StringBuilder()
 {
@ -278,8 +241,7 @@ StringBuilder & StringBuilder::format( const char * fmt, ... )
 /** Format a string safely. */
 StringBuilder & StringBuilder::format( const char * fmt, va_list arg )
 {
-	nvCheck(fmt != NULL);
+	nvDebugCheck(fmt != NULL);
 	nvCheck(m_size >= 0);
 	if( m_size == 0 ) {
 		m_size = 64;
@ -327,8 +289,7 @@ StringBuilder & StringBuilder::format( const char * fmt, va_list arg )
 /** Append a string. */
 StringBuilder & StringBuilder::append( const char * s )
 {
-	nvCheck(s != NULL);
+	nvDebugCheck(s != NULL);
 	nvCheck(m_size >= 0);
 	const uint slen = uint(strlen( s ));
@ -475,31 +436,6 @@ void StringBuilder::reset()
 }
 Path::Path(const char * fmt, ...)
 {
 	nvDebugCheck( NULL != fmt );
 	// Forward the variadic arguments to the va_list overload of format().
 	va_list args;
 	va_start(args, fmt);
 	format(fmt, args);
 	va_end(args);
 }
 Path::Path(int size_hint, const char * fmt, ...)
 	: StringBuilder(size_hint)
 {
 	nvDebugCheck( NULL != fmt );
 	// The base class is constructed from the size hint; the variadic
 	// arguments are then forwarded to the va_list overload of format().
 	va_list args;
 	va_start(args, fmt);
 	format(fmt, args);
 	va_end(args);
 }
 /// Get the file name from a path.
 const char * Path::fileName() const
 {
--- a/src/nvcore/StrLib.h
+++ b/src/nvcore/StrLib.h
@ -45,11 +45,8 @@ namespace nv
 		StringBuilder();
 		explicit StringBuilder( int size_hint );
-		//StringBuilder( const char * str );
+		StringBuilder( const char * str );
 		StringBuilder( const StringBuilder & );
 		StringBuilder( int size_hint, const StringBuilder & );	
 		StringBuilder( const char * format, ... ) __attribute__((format (printf, 2, 3)));
 		StringBuilder( int size_hint, const char * format, ... ) __attribute__((format (printf, 3, 4)));
 		~StringBuilder();
@ -120,24 +117,17 @@ namespace nv
 		char * m_str;
 	};
-	/// Path string.
+
 	/// Path string. @@ This should be called PathBuilder.
 	class NVCORE_CLASS Path : public StringBuilder
 	{
 	public:
 		Path() : StringBuilder() {}
 		explicit Path(int size_hint) : StringBuilder(size_hint) {}
-		//Path(const char * str) : StringBuilder((const char *)str) {}
+		Path( const char * str ) : StringBuilder(str) {}
 		Path(const StringBuilder & str) : StringBuilder(str) {}
 		Path(int size_hint, const StringBuilder & str) : StringBuilder(size_hint, str) {}	
 		Path(const char * format, ...) __attribute__((format (printf, 2, 3)));
 		Path(int size_hint, const char * format, ...) __attribute__((format (printf, 3, 4)));
 		Path & operator=( const char * s ) {
 			return (Path &)copy(s);
 		}
 		const char * fileName() const;
 		const char * extension() const;
@ -145,11 +135,11 @@ namespace nv
 		void stripFileName();
 		void stripExtension();
-		
+
 		// statics
-		NVCORE_API static char separator();
+		static char separator();
-		NVCORE_API static const char * fileName(const char *);
+		static const char * fileName(const char *);
-		NVCORE_API static const char * extension(const char *);
+		static const char * extension(const char *);
 	};
--- a/src/nvcore/Timer.h
+++ b/src/nvcore/Timer.h
@ -0,0 +1,22 @@
 // This code is in the public domain -- castano@gmail.com
 #ifndef NV_CORE_TIMER_H
 #define NV_CORE_TIMER_H
 #include <nvcore/nvcore.h>
 #include <time.h> //clock
 /// Minimal stopwatch built on the C library's clock().
 class NVCORE_CLASS Timer
 {
 public:
 	/// Captures clock() at construction so elapsed() is well defined even if start() is never called.
 	Timer() : m_start(clock()) {}
 	/// Restarts the measurement from the current clock() value.
 	void start() { m_start = clock(); }
 	/// Returns the clock() time, in milliseconds, since the last start() (or construction), truncated to int.
 	int elapsed() const
 	{
 		// Scale in floating point: `1000 * ticks` can overflow when clock_t is a 32-bit integer type.
 		const double ticks = double(clock() - m_start);
 		return int((ticks * 1000.0) / double(CLOCKS_PER_SEC));
 	}
 private:
 	clock_t m_start;	///< clock() value captured by the constructor or the last start().
 };
 #endif // NV_CORE_TIMER_H
--- a/src/nvcore/Tokenizer.cpp
+++ b/src/nvcore/Tokenizer.cpp
@ -8,7 +8,7 @@
 #include <stdlib.h>	// atof, atoi
 #if NV_CC_MSVC
-#if 0 // This doesn't work on MSVC for x64
+#if defined NV_CPU_X86
 /* vsscanf for Win32
 * Written 5/2003 by <mgix@mgix.com>
 * This code is in the Public Domain
@ -56,9 +56,39 @@ static int vsscanf(const char * buffer, const char * format, va_list argPtr)
 	}
 	return result;
 }
 #elif defined NV_CPU_X86_64
 /* Prototype of the helper assembly function */
 #ifdef __cplusplus
 extern "C" {
 #endif
 int vsscanf_proxy_win64(const char * buffer, const char * format, va_list argPtr, __int64 count);
 #ifdef __cplusplus
 }
 #endif
 /* MASM64 version of the above vsscanf */
 static int vsscanf(const char * buffer, const char * format, va_list argPtr)
 {
 	// Count the '%' characters that are not immediately followed by '*' or '%'.
 	// This is an upper bound on the number of pointer arguments the proxy must forward.
 	__int64 count = 0;
 	for (const char * p = format; *p != 0; ++p)
 	{
 		if (*p == '%' && p[1] != '*' && p[1] != '%') ++count;
 	}
 	return vsscanf_proxy_win64(buffer, format, argPtr, count);
 }
 /*#error vsscanf doesn't work on MSVC for x64*/
 #else
 #error Unknown processor for MSVC
 #endif
 #endif // NV_CC_MSVC
 using namespace nv;
 Token::Token() :
--- a/src/nvcore/vsscanf_proxy_win64.masm
+++ b/src/nvcore/vsscanf_proxy_win64.masm
@ -0,0 +1,124 @@
 ; MASM x64 version of
 ; vsscanf for Win32
 ; originally written 5/2003 by <mgix@mgix.com>
 ;
 ; This was done because MSVC does not accept inline assembly code
 ; for the x64 platform, so this file implements almost the whole
 ; module in assembly using the amd64 ABI
 ;
 ; 06/17/2008 by edgarv [at] nvidia com
 ; Definition of memcpy
 memcpy	PROTO dest:Ptr, src:Ptr, numbytes:QWORD
 ; Definition of sscanf
 sscanf PROTO buffer:Ptr Byte, format:Ptr Byte, args:VARARG
 ; Start a code segment named "_TEXT" by default
 .CODE
 ; Entry point of our function: at this point we can use
 ; named parameters
 ALIGN 16
 PUBLIC vsscanf_proxy_win64
 ; Because the x64 code uses the fast call convention, only
 ; the arguments beyond the 4th one are available from the stack.
 ; The first four parameters are in RCX, RDX, R8 and R9
 ; Parameters:
 ;    const char* buffer
 ;    const char* format
 ;    va_list argPtr
 ;    size_t  count
 ; vsscanf_proxy_win64(buffer, format, argPtr, count)
 ; Builds a temporary argument area on the stack holding buffer, format and
 ; `count` pointer-sized values copied from argPtr, then calls sscanf on it.
 ; The value sscanf leaves in rax is returned unchanged.
 ; NOTE(review): the code addresses a local through [rbp - 8] without setting up
 ; rbp itself; presumably the assembler-generated prologue for a PROC with
 ; parameters does that -- confirm.
 vsscanf_proxy_win64 PROC, \
 	buffer:PTR Byte, format:PTR Byte, argPtr:PTR, count:QWORD
  ; Allocates space for our 8-byte local variable (it later holds the saved rsp, see [rbp - 8] below)
  sub rsp, 08h
  ; Copies the parameters from the registers to memory: before warping to
  ; sscanf we will call memcpy, and those registers can just disappear!
  mov buffer, rcx
  mov format, rdx
  mov argPtr, r8
  mov count,  r9
  ; Allocate extra space in the stack for (2+count)*sizeof(void*),
  ; this is (2+count)*(8), rounded up to a multiple of 16 bytes
  mov r10, r9
  add r10, 2		; count += 2
  sal r10, 3		; count *= 8
  add r10, 0fh	; To force alignment to 16 bytes
  and r10, 0fffffffffffffff0h
  sub rsp, r10	; Actual stack allocation
  ; Continues by copying all the arguments into the "alloca" space
  mov [rsp], rcx		    ; newStack[0] = (void*)buffer;
  mov [rsp + 08h], rdx		; newStack[1] = (void*)format;
  ; Calls memcpy(newStack+2, argPtr, count*sizeof(void*));
  mov rcx, rsp
  add rcx, 010h		; newStack+2
  mov rdx, r8		; argPtr
  mov r8, r9
  sal r8, 3			; count*sizeof(void*)
  ; Prepares extra stack space as required by the ABI for 4 arguments, and calls memcpy
  sub rsp, 020h
  call memcpy
  ; Restore the stack
  add rsp, 020h
  ; Saves rsp (now pointing at newStack[0]) in the local variable
  mov qword ptr [rbp - 8], rsp
  ; Does exactly the same trick as before: warp into system sscanf with the new stack,
  ; but this time we also set up the arguments in the registers according to the amd64 ABI.
  ; If there was at least one argument (after buffer and format), we need to copy that
  ; to r8, and if there was a second one we must copy that to r9
  ; (the first arguments to sscanf are always the buffer and the format)
  mov r10, count
  ; Copy the first argument to r8 (if it exists)
  cmp r10, 0
  je args_memcpy
  mov r8, [rsp + 10h]
  ; Copy the second argument to r9 (if it exists)
  cmp r10, 1
  je args_memcpy
  mov r9, [rsp + 18h]
 args_memcpy:
  ; Copies the buffer and format to rcx and rdx
  mov rdx, [rsp + 08h]
  mov rcx, [rsp]
  ; Finally, calls sscanf using the current stack
  call sscanf
  ; At this point the return value is already in rax
  ; Restores rsp
  mov rsp, qword ptr [rbp - 8]
  ; Undoes the alloca
  ; NOTE(review): r10 was reloaded with `count` above (mov r10, count), not the
  ; aligned byte size that was subtracted from rsp, and it is not saved across
  ; the sscanf call -- verify that rsp ends up correct here, or that the
  ; assembler-generated epilogue restores it regardless.
  add rsp, r10
  ; Restores the space for local variables
  add rsp, 08h
  ; Remember, the return value is already in rax since the sscanf call
  ret
 vsscanf_proxy_win64 ENDP
 END