Tag 2.0.8 for release.

2010-05-14 18:01:41 +00:00
parent f6a39d6eab
commit eb01ca604f
375 changed files with 12760 additions and 28091 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -1,12 +1,11 @@

-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
-INCLUDE_DIRECTORIES(${NV_SOURCE_DIR}/extern/poshlib)
-
 SUBDIRS(nvcore)
 SUBDIRS(nvmath)
 SUBDIRS(nvimage)
 SUBDIRS(nvtt)

+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+
 # OpenGL
 INCLUDE(FindOpenGL)
 IF(OPENGL_FOUND)
@ -16,7 +15,8 @@ ELSE(OPENGL_FOUND)
 ENDIF(OPENGL_FOUND)

 # GLUT
-INCLUDE(FindGLUT)
+INCLUDE(${NV_CMAKE_DIR}/FindGLUT.cmake)
+#INCLUDE(FindGLUT)
 IF(GLUT_FOUND)
 	MESSAGE(STATUS "Looking for GLUT - found")
 ELSE(GLUT_FOUND)
@ -48,7 +48,7 @@ ELSE(CG_FOUND)
 ENDIF(CG_FOUND)

 # CUDA
-FIND_PACKAGE(CUDA)
+INCLUDE(${NV_CMAKE_DIR}/FindCUDA.cmake)
 IF(CUDA_FOUND)
 	SET(HAVE_CUDA ${CUDA_FOUND} CACHE BOOL "Set to TRUE if CUDA is found, FALSE otherwise")
 	MESSAGE(STATUS "Looking for CUDA - found")
@ -65,15 +65,6 @@ ELSE(MAYA_FOUND)
 	MESSAGE(STATUS "Looking for Maya - not found")
 ENDIF(MAYA_FOUND)

-# FreeImage
-INCLUDE(${NV_CMAKE_DIR}/FindFreeImage.cmake)
-IF(FREEIMAGE_FOUND)
-	SET(HAVE_FREEIMAGE ${FREEIMAGE_FOUND} CACHE BOOL "Set to TRUE if FreeImage is found, FALSE otherwise")
-	MESSAGE(STATUS "Looking for FreeImage - found")
-ELSE(FREEIMAGE_FOUND)
-	MESSAGE(STATUS "Looking for FreeImage - not found")
-ENDIF(FREEIMAGE_FOUND)
-
 # JPEG
 INCLUDE(FindJPEG)
 IF(JPEG_FOUND)
@ -93,7 +84,6 @@ ELSE(PNG_FOUND)
 ENDIF(PNG_FOUND)

 # TIFF
-SET(TIFF_NAMES libtiff)
 INCLUDE(FindTIFF)
 IF(TIFF_FOUND)
 	SET(HAVE_TIFF ${TIFF_FOUND} CACHE BOOL "Set to TRUE if TIFF is found, FALSE otherwise")
@ -111,15 +101,6 @@ ELSE(OPENEXR_FOUND)
 	MESSAGE(STATUS "Looking for OpenEXR - not found")
 ENDIF(OPENEXR_FOUND)

-# OpenMP
-INCLUDE(FindOpenMP)
-IF(OPENMP_FOUND)
-	SET(HAVE_OPENMP ${OPENMP_FOUND} CACHE BOOL "Set to TRUE if OpenMP is found, FALSE otherwise")
-	MESSAGE(STATUS "Looking for OpenMP - found")
-ELSE(OPENMP_FOUND)
-	MESSAGE(STATUS "Looking for OpenMP - not found")
-ENDIF(OPENMP_FOUND)
-	
 # Qt
 FIND_PACKAGE(Qt4)

@ -138,3 +119,5 @@ CHECK_INCLUDE_FILES(malloc.h HAVE_MALLOC_H)

 CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/nvconfig.h.in ${CMAKE_CURRENT_BINARY_DIR}/nvconfig.h)

+#INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/nvconfig.h DESTINATION include)
+
--- a/src/nvconfig.h.in
+++ b/src/nvconfig.h.in
@ -7,13 +7,10 @@
 #cmakedefine HAVE_EXECINFO_H
 #cmakedefine HAVE_MALLOC_H

-#cmakedefine HAVE_OPENMP
-
 #cmakedefine HAVE_PNG
 #cmakedefine HAVE_JPEG
 #cmakedefine HAVE_TIFF
 #cmakedefine HAVE_OPENEXR
-#cmakedefine HAVE_FREEIMAGE

 #cmakedefine HAVE_MAYA

--- a/src/nvcore/Algorithms.h
+++ b/src/nvcore/Algorithms.h
@ -1,154 +0,0 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#ifndef NV_CORE_ALGORITHMS_H
-#define NV_CORE_ALGORITHMS_H
-
-#include "nvcore.h"
-
-namespace nv
-{
-
-	/// Return the maximum of two values.
-	template <typename T> 
-	inline const T & max(const T & a, const T & b)
-	{
-		//return std::max(a, b);
-		if( a < b ) {
-			return b; 
-		}
-		return a;
-	}
-	
-	/// Return the minimum of two values.
-	template <typename T> 
-	inline const T & min(const T & a, const T & b)
-	{
-		//return std::min(a, b);
-		if( b < a ) {
-			return b; 
-		}
-		return a;
-	}
-
-	/// Clamp between two values.
-	template <typename T> 
-	inline const T & clamp(const T & x, const T & a, const T & b)
-	{
-		return min(max(x, a), b);
-	}
-	
-	/// Delete all the elements of a container.
-	template <typename T>
-	void deleteAll(T & container)
-	{
-		for (typename T::PseudoIndex i = container.start(); !container.isDone(i); container.advance(i))
-		{
-			delete container[i];
-		}
-	}
-	
-	// @@ Should swap be implemented here?
-
-
-
-	template <typename T, template <typename T> class C>
-	void sort(C<T> & container)
-	{
-		introsortLoop(container, 0, container.count());
-		insertionSort(container, 0, container.count());
-	}
-
-	template <typename T, template <typename T> class C>
-	void sort(C<T> & container, uint begin, uint end)
-	{
-		if (begin < end)
-		{
-			introsortLoop(container, begin, end);
-			insertionSort(container, begin, end);
-		}
-	}
-
-	template <typename T, template <typename T> class C>
-	void insertionSort(C<T> & container)
-	{
-		insertionSort(container, 0, container.count());
-	}
-
-	template <typename T, template <typename T> class C>
-	void insertionSort(C<T> & container, uint begin, uint end)
-	{
-		for (uint i = begin + 1; i != end; ++i)
-		{
-			T value = container[i];
-
-			uint j = i;
-			while (j != begin && container[j-1] > value)
-			{
-				container[j] = container[j-1];
-				--j;
-			}
-			if (i != j)
-			{
-				container[j] = value;
-			}
-		}
-	}
-
-	template <typename T, template <typename T> class C>
-    void introsortLoop(C<T> & container, uint begin, uint end)
-    {
-    	while (end-begin > 16)
-    	{
-			uint p = partition(container, begin, end, medianof3(container, begin, begin+((end-begin)/2)+1, end-1));
-			introsortLoop(container, p, end);
-			end = p;
-    	}
-    }
-
-	template <typename T, template <typename T> class C>
-    uint partition(C<T> & a, uint begin, uint end, const T & x)
-    {
-    	int i = begin, j = end;
-    	while (true)
-    	{
-    	    while (a[i] < x) ++i;
-    	    --j;
-    	    while (x < a[j]) --j;
-    	    if (i >= j)
-    			return i;
-    	    swap(a[i], a[j]);
-    	    i++;
-    	}
-    }
-
-	template <typename T, template <typename T> class C>
-    const T & medianof3(C<T> & a, uint lo, uint mid, uint hi)
-    {
-		if (a[mid] < a[lo])
-		{
-			if (a[hi] < a[mid])
-			{
-				return a[mid];
-			}
-			else
-			{
-				return (a[hi] < a[lo]) ? a[hi] : a[lo];
-			}
-		}
-		else
-		{
-			if (a[hi] < a[mid])
-			{
-				return (a[hi] < a[lo]) ? a[lo] : a[hi];
-			}
-			else
-			{
-				return a[mid];
-			}
-		}
-    }
-
-
-} // nv namespace
-
-#endif // NV_CORE_ALGORITHMS_H
--- a/src/nvcore/BitArray.h
+++ b/src/nvcore/BitArray.h
@ -0,0 +1,168 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_CORE_BITARRAY_H
+#define NV_CORE_BITARRAY_H
+
+#include <nvcore/nvcore.h>
+#include <nvcore/Containers.h>
+
+namespace nv
+{
+
+/// Return the number of bits that are set to 1 in the byte @a x.
+inline uint bitsSet(uint8 x) {
+	uint total = 0;
+	// Inspect the lowest bit, then shift it out, until no set bits remain.
+	while (x != 0) {
+		total += (x & 1);
+		x >>= 1;
+	}
+	return total;
+}
+
+
+/// Return the number of bits set to 1 among the lowest @a bits bits of @a x.
+/// Scanning stops as soon as @a bits reaches zero or no set bits remain in @a x.
+inline uint bitsSet(uint32 x, int bits) {
+	uint total = 0;
+	while (x != 0 && bits != 0) {
+		total += (x & 1);
+		x >>= 1;
+		bits--;
+	}
+	return total;
+}
+
+
+/// Simple bit array.
+///
+/// Bits are packed eight per byte into an Array<uint8>: bit b is stored in
+/// byte (b >> 3) at bit position (b & 7). The last byte may contain padding
+/// bits beyond size(); those are never reported by the counting methods.
+class BitArray
+{
+public:
+
+	/// Default ctor. Creates an empty array holding zero bits.
+	BitArray() : m_size(0) {}
+
+	/// Ctor with initial size, in bits.
+	BitArray(uint sz)
+	{
+		resize(sz);
+	}
+
+	/// Get the number of bits stored.
+	uint size() const { return m_size; }
+
+	/// Remove all the bits, leaving an array of size zero.
+	void clear() { resize(0); }
+
+	/// Set the number of bits stored. The byte storage is resized to the
+	/// smallest number of bytes that can hold @a sz bits.
+	void resize(uint sz)
+	{
+		m_size = sz;
+		m_bitArray.resize( (m_size + 7) >> 3 );
+	}
+
+	/// Get bit.
+	bool bitAt(uint b) const
+	{
+		nvDebugCheck( b < m_size );
+		return (m_bitArray[b >> 3] & (1 << (b & 7))) != 0;
+	}
+
+	/// Set a bit.
+	void setBitAt(uint b)
+	{
+		nvDebugCheck( b < m_size );
+		m_bitArray[b >> 3] |=  (1 << (b & 7));
+	}
+
+	/// Clear a bit.
+	void clearBitAt( uint b )
+	{
+		nvDebugCheck( b < m_size );
+		m_bitArray[b >> 3] &= ~(1 << (b & 7));
+	}
+
+	/// Clear all the bits.
+	void clearAll()
+	{
+		memset(m_bitArray.unsecureBuffer(), 0, m_bitArray.size());
+	}
+
+	/// Set all the bits (padding bits of the last byte included).
+	void setAll()
+	{
+		memset(m_bitArray.unsecureBuffer(), 0xFF, m_bitArray.size());
+	}
+
+	/// Toggle all the bits (padding bits of the last byte included).
+	void toggleAll()
+	{
+		const uint byte_num = m_bitArray.size();
+		for(uint b = 0; b < byte_num; b++) {
+			m_bitArray[b] ^= 0xFF;
+		}
+	}
+
+	/// Get a byte of the bit array.
+	const uint8 & byteAt(uint index) const
+	{
+		return m_bitArray[index];
+	}
+
+	/// Set the given byte of the byte array.
+	void setByteAt(uint index, uint8 b)
+	{
+		m_bitArray[index] = b;
+	}
+
+	/// Count the number of bits set.
+	uint countSetBits() const
+	{
+		const uint num = m_bitArray.size();
+		if( num == 0 ) {
+			return 0;
+		}
+
+		uint count = 0;
+		for(uint i = 0; i < num - 1; i++) {
+			count += bitsSet(m_bitArray[i]);
+		}
+		// Only the valid (non-padding) bits of the last byte are counted.
+		count += bitsSet(m_bitArray[num-1], lastByteBits());
+
+		//piDebugCheck(count + countClearBits() == m_size);
+		return count;
+	}
+
+	/// Count the number of bits clear.
+	uint countClearBits() const {
+
+		const uint num = m_bitArray.size();
+		if( num == 0 ) {
+			return 0;
+		}
+
+		uint count = 0;
+		for(uint i = 0; i < num - 1; i++) {
+			// The complement is computed as an int; truncate it back to a byte.
+			count += bitsSet(uint8(~m_bitArray[i]));
+		}
+		// Only the valid (non-padding) bits of the last byte are counted; the
+		// bit limit also discards the high bits produced by the complement.
+		count += bitsSet(uint32(~m_bitArray[num-1]), lastByteBits());
+
+		//piDebugCheck(count + countSetBits() == m_size);
+		return count;
+	}
+
+	/// Exchange the contents of two bit arrays.
+	friend void swap(BitArray & a, BitArray & b)
+	{
+		swap(a.m_size, b.m_size);
+		swap(a.m_bitArray, b.m_bitArray);
+	}
+
+
+private:
+
+	/// Number of valid bits in the last byte of a non-empty array: 1..8.
+	/// When the size is a multiple of eight the last byte is completely used,
+	/// so the answer is 8 rather than (m_size & 7) == 0.
+	int lastByteBits() const
+	{
+		const uint rem = m_size & 0x7;
+		return (rem != 0) ? int(rem) : 8;
+	}
+
+	/// Number of bits stored.
+	uint m_size;
+
+	/// Array of bits.
+	Array<uint8> m_bitArray;
+
+};
+
+} // nv namespace
+
+#endif // NV_CORE_BITARRAY_H
--- a/src/nvcore/CMakeLists.txt
+++ b/src/nvcore/CMakeLists.txt
@ -1,25 +1,27 @@
 PROJECT(nvcore)
+ADD_SUBDIRECTORY(poshlib)

 SET(CORE_SRCS
-    nvcore.h
-    Algorithms.h
-    Containers.h
-    Debug.h Debug.cpp
-    DefsGnucDarwin.h
-    DefsGnucLinux.h
-    DefsGnucWin32.h
-    DefsVcWin32.h
-    FileSystem.h FileSystem.cpp
-    Library.h Library.cpp
-    Memory.h Memory.cpp
-    Ptr.h
-    RefCounted.h RefCounted.cpp
-    StrLib.h StrLib.cpp
-    Stream.h
-    StdStream.h
-    TextReader.h TextReader.cpp
-    TextWriter.h TextWriter.cpp
-    Timer.h)
+	nvcore.h
+	Ptr.h
+	BitArray.h
+	Memory.h
+	Memory.cpp
+	Debug.h
+	Debug.cpp
+	Containers.h
+	StrLib.h
+	StrLib.cpp
+	Stream.h
+	StdStream.h
+	TextReader.h
+	TextReader.cpp
+	TextWriter.h
+	TextWriter.cpp
+	Radix.h
+	Radix.cpp
+	Library.h
+	Library.cpp)

 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

@ -27,19 +29,19 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 ADD_DEFINITIONS(-DNVCORE_EXPORTS)

 IF(UNIX)
-    SET(LIBS ${LIBS} ${CMAKE_DL_LIBS})
+	SET(LIBS ${LIBS} ${CMAKE_DL_LIBS})
 ENDIF(UNIX)

 IF(NVCORE_SHARED)
-    ADD_DEFINITIONS(-DNVCORE_SHARED=1)
-    ADD_LIBRARY(nvcore SHARED ${CORE_SRCS})
+	ADD_DEFINITIONS(-DNVCORE_SHARED=1)
+	ADD_LIBRARY(nvcore SHARED ${CORE_SRCS})
 ELSE(NVCORE_SHARED)
-    ADD_LIBRARY(nvcore ${CORE_SRCS})
+	ADD_LIBRARY(nvcore ${CORE_SRCS})
 ENDIF(NVCORE_SHARED)

 TARGET_LINK_LIBRARIES(nvcore ${LIBS})

 INSTALL(TARGETS nvcore
-    RUNTIME DESTINATION bin
-    LIBRARY DESTINATION lib
-    ARCHIVE DESTINATION lib/static)
+	RUNTIME DESTINATION bin
+	LIBRARY DESTINATION lib
+	ARCHIVE DESTINATION lib/static)
--- a/src/nvcore/Containers.h
+++ b/src/nvcore/Containers.h
@ -16,10 +16,9 @@ Do not use memmove in insert & remove, use copy ctors instead.


 // nvcore
-#include "nvcore.h"
-#include "Memory.h"
-#include "Debug.h"
-//#include "Stream.h"
+#include <nvcore/nvcore.h>
+#include <nvcore/Memory.h>
+#include <nvcore/Debug.h>

 #include <string.h>	// memmove
 #include <new>		// for placement new
@ -71,10 +70,40 @@ namespace nv
 {
 	// Templates

+	/// Return the maximum of two values.
+	template <typename T> 
+	inline const T & max(const T & a, const T & b)
+	{
+		//return std::max(a, b);
+		if( a < b ) {
+			return b; 
+		}
+		return a;
+	}
+	
+	/// Return the minimum of two values.
+	template <typename T> 
+	inline const T & min(const T & a, const T & b)
+	{
+		//return std::min(a, b);
+		if( b < a ) {
+			return b; 
+		}
+		return a;
+	}
+	
+	/// Clamp between two values.
+	template <typename T> 
+	inline const T & clamp(const T & x, const T & a, const T & b)
+	{
+		return min(max(x, a), b);
+	}
+	
 	/// Swap two values.
 	template <typename T> 
 	inline void swap(T & a, T & b)
 	{
+		//return std::swap(a, b);
 		T temp = a; 
 		a = b; 
 		b = temp;
@ -105,6 +134,16 @@ namespace nv
 		uint operator()(uint x) const { return x; }
 	};
 	
+	/// Delete all the elements of a container.
+	template <typename T>
+	void deleteAll(T & container)
+	{
+		for(typename T::PseudoIndex i = container.start(); !container.isDone(i); container.advance(i))
+		{
+			delete container[i];
+		}
+	}
+
 	
 	/** Return the next power of two. 
 	* @see http://graphics.stanford.edu/~seander/bithacks.html
@ -115,7 +154,7 @@ namespace nv
 	inline uint nextPowerOfTwo( uint x )
 	{
 		nvDebugCheck( x != 0 );
-	#if 1	// On modern CPUs this is supposed to be as fast as using the bsr instruction.
+	#if 1	// On modern CPUs this is as fast as using the bsr instruction.
 		x--;
 		x |= x >> 1;
 		x |= x >> 2;
@ -138,6 +177,15 @@ namespace nv
 		return (n & (n-1)) == 0;
 	}

+	/// Simple iterator interface.
+	///
+	/// Abstract base: implementations override advance() to move to the next
+	/// element, isDone() to report the end of the sequence and current() to
+	/// return the element at the current position. The methods are pure
+	/// virtual because this interface provides no definitions for them.
+	template <typename T>
+	struct Iterator
+	{
+		/// Virtual dtor so implementations can be destroyed through the interface.
+		virtual ~Iterator() {}
+		virtual void advance() = 0;
+		virtual bool isDone() = 0;
+		virtual T current() = 0;
+	};
+

 	/**
 	* Replacement for std::vector that is easier to debug and provides
@ -179,29 +227,20 @@ namespace nv
 		}
 	
 	
-		/// Const element access.
+		/// Const and safe vector access.
 		const T & operator[]( uint index ) const
 		{
 			nvDebugCheck(index < m_size);
 			return m_buffer[index];
 		}
-		const T & at( uint index ) const
-		{
-			nvDebugCheck(index < m_size);
-			return m_buffer[index];
-		}
-
-		/// Element access.
+	
+		/// Safe vector access.
 		T & operator[] ( uint index )
 		{
 			nvDebugCheck(index < m_size);
 			return m_buffer[index];
 		}
-		T & at( uint index )
-		{
-			nvDebugCheck(index < m_size);
-			return m_buffer[index];
-		}
+	
 	
 		/// Get vector size.
 		uint size() const { return m_size; }
@ -213,7 +252,7 @@ namespace nv
 		const T * buffer() const { return m_buffer; }
 	
 		/// Get vector pointer.
-		T * mutableBuffer() { return m_buffer; }
+		T * unsecureBuffer() { return m_buffer; }
 	
 		/// Is vector empty.
 		bool isEmpty() const { return m_size == 0; }
@ -294,22 +333,15 @@ namespace nv
 			return m_buffer[0];
 		}
 		
-		/// Return index of the 
-		bool find(const T & element, uint * index)
-		{
-			for (uint i = 0; i < m_size; i++) {
-				if (index != NULL) *index = i;
-				return true;
-			}
-			return false;
-		}
-
 		/// Check if the given element is contained in the array.
 		bool contains(const T & e) const
 		{
-			return find(e, NULL);
+			for (uint i = 0; i < m_size; i++) {
+				if (m_buffer[i] == e) return true;
+			}
+			return false;
 		}
-
+		
 		/// Remove the element at the given index. This is an expensive operation!
 		void removeAt( uint index )
 		{
@ -495,10 +527,9 @@ namespace nv
 		}
 		
 		/// Assignment operator.
-		Array<T> & operator=( const Array<T> & a )
+		void operator=( const Array<T> & a )
 		{
 			copy( a.m_buffer, a.m_size );
-			return *this;
 		}
 		
 		/*
@ -595,43 +626,18 @@ namespace nv
 	template<typename T, typename U, typename hash_functor = hash<T> >
 	class NVCORE_CLASS HashMap
 	{
+		NV_FORBID_COPY(HashMap)
 	public:

 		/// Default ctor.
 		HashMap() : entry_count(0), size_mask(-1), table(NULL) { }

-		// Copy ctor.
-		HashMap(const HashMap & map) : entry_count(0), size_mask(-1), table(NULL)
-		{
-			operator = (map);
-		}
-
 		/// Ctor with size hint.
 		explicit HashMap(int size_hint) : entry_count(0), size_mask(-1), table(NULL) { setCapacity(size_hint); }

 		/// Dtor.
 		~HashMap() { clear(); }
-
-		// Assignment operator.
-		void operator= (const HashMap & map)
-		{
-			clear();
-
-			if (entry_count > 0)
-			{
-				entry_count = map.entry_count;
-				size_mask = map.size_mask;
-
-				const uint size = uint(size_mask + 1);
-				table = (Entry *)nv::mem::malloc(sizeof(Entry) * size);
-
-				// Copy elements using copy ctor.
-				for (uint i = 0; i < size; i++)
-				{
-					new (table + i) Entry(map.table[i]);
-				}
-			}
-		}
+	
 	
 		/// Set a new or existing value under the key, to the value.
 		void set(const T& key, const U& value)
--- a/src/nvcore/Debug.cpp
+++ b/src/nvcore/Debug.cpp
@ -1,7 +1,7 @@
 // This code is in the public domain -- castanyo@yahoo.es

-#include "Debug.h"
-#include "StrLib.h"
+#include <nvcore/Debug.h>
+#include <nvcore/StrLib.h>

 // Extern
 #if NV_OS_WIN32 //&& NV_CC_MSVC
@ -34,7 +34,7 @@
 #	endif
 #endif

-#if NV_OS_DARWIN || NV_OS_FREEBSD
+#if NV_OS_DARWIN
 #	include <unistd.h>	// getpid
 #	include <sys/types.h>
 #	include <sys/sysctl.h>	// sysctl
@ -199,14 +199,6 @@ namespace
 				return (void *) ucp->uc_mcontext->ss.eip;
 #			endif
 #		endif
-#	elif NV_OS_FREEBSD
-#		if NV_CPU_X86_64
-			ucontext_t * ucp = (ucontext_t *)secret;
-			return (void *)ucp->uc_mcontext.mc_rip;
-#		elif NV_CPU_X86
-			ucontext_t * ucp = (ucontext_t *)secret;
-			return (void *)ucp->uc_mcontext.mc_eip;
-#		endif
 #	else
 #		if NV_CPU_X86_64
 			// #define REG_RIP REG_INDEX(rip) // seems to be 16
--- a/src/nvcore/Debug.h
+++ b/src/nvcore/Debug.h
@ -3,7 +3,7 @@
 #ifndef NV_CORE_DEBUG_H
 #define NV_CORE_DEBUG_H

-#include "nvcore.h"
+#include <nvcore/nvcore.h>

 #if defined(HAVE_STDARG_H)
 #	include <stdarg.h>	// va_list
--- a/src/nvcore/FileSystem.cpp
+++ b/src/nvcore/FileSystem.cpp
@ -1,56 +0,0 @@
-// This code is in the public domain -- castano@gmail.com
-
-#include "FileSystem.h"
-#include <nvcore/nvcore.h>
-
-#if NV_OS_WIN32
-#define _CRT_NONSTDC_NO_WARNINGS // _chdir is defined deprecated, but that's a bug, chdir is deprecated, _chdir is *not*.
-//#include <shlwapi.h> // PathFileExists
-#include <windows.h> // GetFileAttributes
-#include <direct.h> // _mkdir
-#else
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-#endif
-
-using namespace nv;
-
-
-bool FileSystem::exists(const char * path)
-{
-#if NV_OS_UNIX
-	return access(path, F_OK|R_OK) == 0;
-	//struct stat buf;
-	//return stat(path, &buf) == 0;
-#elif NV_OS_WIN32
-    // PathFileExists requires linking to shlwapi.lib
-    //return PathFileExists(path) != 0;
-    return GetFileAttributes(path) != 0xFFFFFFFF;
-#else
-	if (FILE * fp = fopen(path, "r"))
-	{
-		fclose(fp);
-		return true;
-	}
-	return false;
-#endif
-}
-
-bool FileSystem::createDirectory(const char * path)
-{
-#if NV_OS_WIN32
-    return _mkdir(path) != -1;
-#else
-    return mkdir(path, 0777) != -1;
-#endif
-}
-
-bool FileSystem::changeDirectory(const char * path)
-{
-#if NV_OS_WIN32
-    return _chdir(path) != -1;
-#else
-    return chdir(path) != -1;
-#endif
-}
--- a/src/nvcore/FileSystem.h
+++ b/src/nvcore/FileSystem.h
@ -1,23 +0,0 @@
-// This code is in the public domain -- castano@gmail.com
-
-#ifndef NV_CORE_FILESYSTEM_H
-#define NV_CORE_FILESYSTEM_H
-
-#include "nvcore.h"
-
-namespace nv
-{
-
-	namespace FileSystem
-	{
-
-		NVCORE_API bool exists(const char * path);
-		NVCORE_API bool createDirectory(const char * path);
-		NVCORE_API bool changeDirectory(const char * path);
-
-	} // FileSystem namespace
-
-} // nv namespace
-
-
-#endif // NV_CORE_FILESYSTEM_H
--- a/src/nvcore/Library.h
+++ b/src/nvcore/Library.h
@ -3,7 +3,7 @@
 #ifndef NV_CORE_LIBRARY_H
 #define NV_CORE_LIBRARY_H

-#include "nvcore.h"
+#include <nvcore/nvcore.h>

 #if NV_OS_WIN32
 #define LIBRARY_NAME(name)	#name ".dll"
--- a/src/nvcore/Memory.h
+++ b/src/nvcore/Memory.h
@ -3,7 +3,7 @@
 #ifndef NV_CORE_MEMORY_H
 #define NV_CORE_MEMORY_H

-#include "nvcore.h"
+#include <nvcore/nvcore.h>

 #include <stdlib.h> // malloc(), realloc() and free()
 #include <stddef.h>	// size_t
--- a/src/nvcore/Prefetch.h
+++ b/src/nvcore/Prefetch.h
@ -0,0 +1,31 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_CORE_PREFETCH_H
+#define NV_CORE_PREFETCH_H
+
+#include <nvcore/nvcore.h>
+
+// nvPrefetch(ptr): hint that the memory at ptr will be accessed soon.
+// It is always defined; where no implementation is available it expands
+// to nothing.
+#if NV_CC_GNUC
+
+#define nvPrefetch(ptr)	__builtin_prefetch(ptr)
+
+#elif NV_CC_MSVC && NV_CPU_X86
+
+// Inline assembly is only used for MSVC on x86. Other MSVC targets take the
+// no-op fallback below instead of being left without a definition.
+__forceinline void nvPrefetch(const void * mem)
+{
+	__asm mov ecx, mem
+	__asm prefetcht0 [ecx];
+//	__asm prefetchnta [ecx];
+}
+
+#else // other compilers, or MSVC on a non-x86 CPU
+
+// do nothing in other case.
+#define nvPrefetch(ptr)
+
+#endif // nvPrefetch
+
+#endif // NV_CORE_PREFETCH_H
--- a/src/nvcore/Ptr.h
+++ b/src/nvcore/Ptr.h
@ -3,12 +3,11 @@
 #ifndef NV_CORE_PTR_H
 #define NV_CORE_PTR_H

-#include "nvcore.h"
-#include "Debug.h"
+#include <nvcore/nvcore.h>
+#include <nvcore/Debug.h>

 #include <stdio.h>	// NULL

-
 namespace nv
 {
 	
@ -30,11 +29,11 @@ class AutoPtr
 	NV_FORBID_HEAPALLOC();
 public:
 	
+	/// Default ctor.
+	AutoPtr() : m_ptr(NULL) { }
+	
 	/// Ctor.
-	AutoPtr(T * p = NULL) : m_ptr(p) { }
-
-	template <class Q>
-	AutoPtr(Q * p) : m_ptr(static_cast<T *>(p)) { }
+	explicit AutoPtr( T * p ) : m_ptr(p) { }
 	
 	/** Dtor. Deletes owned pointer. */
 	~AutoPtr() {
@ -51,15 +50,6 @@ public:
 		}
 	}

-	template <class Q>
-    void operator=( Q * p ) {
-		if (p != m_ptr)
-		{
-			delete m_ptr;
-			m_ptr = static_cast<T *>(p);
-		}
-    }
-
 	/** Member access. */
 	T * operator -> () const {
 		nvDebugCheck(m_ptr != NULL);
@ -106,23 +96,125 @@ private:
 	T * m_ptr;
 };

+#if 0
+/** Reference counted base class to be used with Pointer.
+ *
+ * The only requirement of the Pointer class is that the RefCounted class implements the 
+ * addRef and release methods.
+ *
+ * The object deletes itself when release() brings the count down to zero.
+ * NOTE(review): this class is compiled out and depends on a WeakProxy type
+ * that is not declared in this header - confirm before re-enabling.
+ */
+class RefCounted
+{
+	NV_FORBID_COPY(RefCounted);
+public:
+
+	/// Ctor. Starts with a reference count of zero and no weak proxy.
+	RefCounted() : m_count(0), m_weak_proxy(NULL)
+	{
+		s_total_obj_count++;
+	}
+
+	/// Virtual dtor.
+	virtual ~RefCounted()
+	{
+		nvCheck( m_count == 0 );
+		nvCheck( s_total_obj_count > 0 );
+		s_total_obj_count--;
+	}
+
+
+	/// Increase reference count. Returns the new count.
+	uint addRef() const
+	{
+		s_total_ref_count++;
+		m_count++;
+		return m_count;
+	}
+
+
+	/// Decrease reference count and remove when 0. Returns the new count.
+	uint release() const
+	{
+		nvCheck( m_count > 0 );
+		
+		s_total_ref_count--;
+		m_count--;
+		if( m_count == 0 ) {
+			// Notify and drop the weak proxy before the object goes away.
+			releaseWeakProxy();
+			delete this;
+			return 0;
+		}
+		return m_count;
+	}
+
+	/// Get weak proxy, creating it on first use.
+	WeakProxy * getWeakProxy() const
+	{
+		if (m_weak_proxy == NULL) {
+			m_weak_proxy = new WeakProxy;
+			m_weak_proxy->AddRef();
+		}
+		return m_weak_proxy;
+	}
+
+	/// Release the weak proxy.	
+	void releaseWeakProxy() const
+	{
+		if (m_weak_proxy != NULL) {
+			m_weak_proxy->NotifyObjectDied();
+			m_weak_proxy->Release();
+			m_weak_proxy = NULL;
+		}
+	}
+
+	/** @name Debug methods: */
+	//@{
+		/// Get reference count.
+		int refCount() const
+		{
+			return m_count;
+		}
+
+		/// Get total number of objects.
+		static int totalObjectCount()
+		{
+			return s_total_obj_count;
+		}
+
+		/// Get total number of references.
+		static int totalReferenceCount()
+		{
+			return s_total_ref_count;
+		}
+	//@}
+
+
+private:
+
+	NVCORE_API static int s_total_ref_count;
+	NVCORE_API static int s_total_obj_count;
+
+	/// Reference count of this object; mutable so const objects can be shared.
+	mutable int m_count;
+
+	/// Lazily created weak proxy. Named m_weak_proxy to match every use in
+	/// the methods above and the constructor's initializer list.
+	mutable WeakProxy * m_weak_proxy;
+
+};
+#endif

 /// Smart pointer template class.
 template <class BaseClass>
-class SmartPtr {
+class Pointer {
 public:

 	// BaseClass must implement addRef() and release().
-	typedef SmartPtr<BaseClass>	ThisType;
+	typedef Pointer<BaseClass>	ThisType;

 	/// Default ctor.
-	SmartPtr() : m_ptr(NULL) 
+	Pointer() : m_ptr(NULL) 
 	{
 	}

 	/** Other type assignment. */
 	template <class OtherBase>
-	SmartPtr( const SmartPtr<OtherBase> & tc )
+	Pointer( const Pointer<OtherBase> & tc )
 	{
 		m_ptr = static_cast<BaseClass *>( tc.ptr() );
 		if( m_ptr ) {
@ -131,7 +223,7 @@ public:
 	}

 	/** Copy ctor. */
-	SmartPtr( const ThisType & bc )
+	Pointer( const ThisType & bc )
 	{
 		m_ptr = bc.ptr();
 		if( m_ptr ) {
@ -139,8 +231,8 @@ public:
 		}
 	}

-	/** Copy cast ctor. SmartPtr(NULL) is valid. */
-	explicit SmartPtr( BaseClass * bc )
+	/** Copy cast ctor. Pointer(NULL) is valid. */
+	explicit Pointer( BaseClass * bc )
 	{
 		m_ptr = bc;
 		if( m_ptr ) {
@ -149,7 +241,7 @@ public:
 	}

 	/** Dtor. */
-	~SmartPtr()
+	~Pointer()
 	{
 		set(NULL);
 	}
@ -183,7 +275,7 @@ public:
 	//@{
 		/** Other type assignment. */
 		template <class OtherBase>
-		void operator = ( const SmartPtr<OtherBase> & tc )
+		void operator = ( const Pointer<OtherBase> & tc )
 		{
 			set( static_cast<BaseClass *>(tc.ptr()) );
 		}
@ -206,7 +298,7 @@ public:
 	//@{
 		/** Other type equal comparation. */
 		template <class OtherBase>
-		bool operator == ( const SmartPtr<OtherBase> & other ) const
+		bool operator == ( const Pointer<OtherBase> & other ) const
 		{
 			return m_ptr == other.ptr();
 		}
@ -225,7 +317,7 @@ public:

 		/** Other type not equal comparation. */
 		template <class OtherBase>
-		bool operator != ( const SmartPtr<OtherBase> & other ) const
+		bool operator != ( const Pointer<OtherBase> & other ) const
 		{
 			return m_ptr != other.ptr();
 		}
--- a/src/nvcore/Radix.cpp
+++ b/src/nvcore/Radix.cpp
@ -0,0 +1,429 @@
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Contains source code from the article "Radix Sort Revisited".
+ *	\file		Radix.cpp
+ *	\author		Pierre Terdiman
+ *	\date		April, 4, 2000
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Revisited Radix Sort.
+ *	This is my new radix routine:
+ *  - it uses indices and doesn't recopy the values anymore, hence wasting less ram
+ *  - it creates all the histograms in one run instead of four
+ *  - it sorts words faster than dwords and bytes faster than words
+ *  - it correctly sorts negative floating-point values by patching the offsets
+ *  - it automatically takes advantage of temporal coherence
+ *  - multiple keys support is a side effect of temporal coherence
+ *  - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
+ *
+ *	History:
+ *	- 08.15.98: very first version
+ *	- 04.04.00: recoded for the radix article
+ *	- 12.xx.00: code lifting
+ *	- 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
+ *	- 10.11.01: added local ram support
+ *	- 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
+ *
+ *	\class		RadixSort
+ *	\author		Pierre Terdiman
+ *	\version	1.3
+ *	\date		August, 15, 1998
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+To do:
+	- add an offset parameter between two input values (avoid some data recopy sometimes)
+	- unroll ? asm ?
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Header
+
+#include <nvcore/Radix.h>
+
+#include <string.h> // memset
+
+//using namespace IceCore;
+
+// Releases an array obtained from new[] and clears the pointer afterwards, so releasing the
+// same pointer a second time is harmless (delete [] on NULL does nothing).
+// The do/while(0) wrapper makes "DELETEARRAY(p);" a single statement that stays correct under
+// an unbraced if/else, and the argument is parenthesized so any pointer expression can be passed.
+// Note that the argument is evaluated twice.
+#define DELETEARRAY(a)	do { delete [] (a); (a) = NULL; } while(0)
+// Allocation-check hook placed after each new[]. It expands to nothing here.
+#define CHECKALLOC(a)
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Constructor.
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+RadixSort::RadixSort()
+	: mCurrentSize(0)
+	, mPreviousSize(0)
+	, mIndices(NULL)
+	, mIndices2(NULL)
+	, mTotalCalls(0)
+	, mNbHits(0)
+{
+#ifndef RADIX_LOCAL_RAM
+	// Counters and offsets do not depend on the input size, so they are allocated once here:
+	// four histograms of 256 entries (one per byte of a dword) and one table of 256 offsets.
+	mHistogram	= new uint32[256*4];
+	mOffset		= new uint32[256];
+#endif
+	// The index lists are still empty (current size is 0), so there is nothing to fill in yet;
+	// the call is kept so that construction and resizing share the same initialization path.
+	resetIndices();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Destructor.
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+RadixSort::~RadixSort()
+{
+	// The two index lists are always owned by the object
+	delete [] mIndices;		mIndices = NULL;
+	delete [] mIndices2;	mIndices2 = NULL;
+#ifndef RADIX_LOCAL_RAM
+	// Counters and offsets only live on the heap when the stack-based variant is disabled
+	delete [] mHistogram;	mHistogram = NULL;
+	delete [] mOffset;		mOffset = NULL;
+#endif
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Resizes the inner lists.
+ *	\param		nb				[in] new size (number of dwords)
+ *	\return		true if success
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+bool RadixSort::resize(uint32 nb)
+{
+	// Free previously used ram
+	DELETEARRAY(mIndices2);
+	DELETEARRAY(mIndices);
+
+	// Nothing is owned any more. Record that before allocating: if operator new throws below,
+	// the object must not keep reporting the old capacity (or the old "previous size") over
+	// NULL lists, otherwise the next sort() call would skip the resize and dereference them.
+	mCurrentSize	= 0;
+	mPreviousSize	= 0;
+
+	// Get some fresh one
+	mIndices		= new uint32[nb];	CHECKALLOC(mIndices);
+	mIndices2		= new uint32[nb];	CHECKALLOC(mIndices2);
+	mCurrentSize	= nb;
+
+	// Initialize indices so that the input buffer is read in sequential order
+	resetIndices();
+
+	// Allocation failure is reported by operator new, so reaching this point means success
+	return true;
+}
+
+// Prepares the index lists for a sort of n values.
+// Nothing happens when n equals the size used by the previous call, which is what lets the
+// previous ordering be reused. Otherwise the lists are grown if they are too small, or simply
+// reset to sequential order, and n is remembered as the new previous size.
+// Expands to one statement (do/while(0)), so it is safe under an unbraced if/else.
+// n is evaluated several times; pass a plain variable.
+#define CHECK_RESIZE(n)	\
+	do	\
+	{	\
+		if((n)!=mPreviousSize)	\
+		{	\
+			if((n)>mCurrentSize)	resize(n);	\
+			else					resetIndices();	\
+			mPreviousSize = (n);	\
+		}	\
+	} while(0)
+
+// Builds the four per-byte histograms for one sort() call and performs the temporal-coherence check.
+//
+//	type	: type the keys are compared as (uint32, int32 or float at the call sites in this file)
+//	buffer	: the keys, read through mIndices
+//
+// Besides its two parameters the macro uses these names from the expansion site: input (raw
+// view of the keys), nb (number of keys), mHistogram, mIndices and mNbHits. It declares locals,
+// so it must be expanded inside its own braces, and it executes "return *this" when the keys
+// are found to be already in order.
+//
+// Byte 0 of every dword is counted in the histogram used by the first (LSB) pass, i.e. the
+// layout assumed here is little-endian.
+#define CREATE_HISTOGRAMS(type, buffer)	\
+	/* Clear counters */	\
+	memset(mHistogram, 0, 256*4*sizeof(uint32));	\
+	\
+	/* Prepare for temporal coherence */	\
+	type PrevVal = (type)(buffer)[mIndices[0]];	\
+	bool AlreadySorted = true;	/* Optimism... */	\
+	const uint32* Indices = mIndices;	\
+	\
+	/* Prepare to count. The end pointer is computed in size_t: nb*4 evaluated in 32 bits */	\
+	/* wraps once nb reaches 2^30, which would end the loops before any key is visited. */	\
+	const uint8* p = (const uint8*)input;	\
+	const uint8* pe = p + (size_t)nb*4;	\
+	uint32* h0= &mHistogram[0];		/* Histogram for first pass (LSB)	*/	\
+	uint32* h1= &mHistogram[256];	/* Histogram for second pass		*/	\
+	uint32* h2= &mHistogram[512];	/* Histogram for third pass			*/	\
+	uint32* h3= &mHistogram[768];	/* Histogram for last pass (MSB)	*/	\
+	\
+	while(p!=pe)	\
+	{	\
+		/* Read input buffer in previous sorted order */	\
+		type Val = (type)(buffer)[*Indices++];	\
+		/* Check whether already sorted or not */	\
+		if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */	\
+		/* Update for next iteration */	\
+		PrevVal = Val;	\
+	\
+		/* Create histograms */	\
+		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;	\
+	}	\
+	\
+	/* If all input values are already sorted, we just have to return and leave the */	\
+	/* previous list unchanged. That way the routine may take advantage of temporal */	\
+	/* coherence, for example when used to sort transparent faces. */	\
+	if(AlreadySorted)	{ mNbHits++; return *this;	}	\
+	\
+	/* Else there has been an early out and we must finish computing the histograms. */	\
+	/* The dword at which the early out happened has not been counted yet, so p resumes on it. */	\
+	while(p!=pe)	\
+	{	\
+		/* Create histograms without the previous overhead */	\
+		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;	\
+	}
+
+// Decides whether radix pass number `pass` (0..3) has to be run.
+// Declares three locals at the expansion site: CurCount (the 256 counters of that pass),
+// PerformPass (false when the pass can be skipped) and UniqueVal (byte `pass` of the first key).
+// Uses input, nb and mHistogram from the enclosing scope. Because it declares locals, each
+// expansion needs its own scope.
+#define CHECK_PASS_VALIDITY(pass)	\
+	/* Shortcut to current counters */	\
+	uint32* CurCount = &mHistogram[(pass)<<8];	\
+	\
+	/* Reset flag. The sorting pass is supposed to be performed. (default) */	\
+	bool PerformPass = true;	\
+	\
+	/* Check pass validity */	\
+	\
+	/* If all values have the same byte, sorting is useless. */	\
+	/* It may happen when sorting bytes or words instead of dwords. */	\
+	/* This routine actually sorts words faster than dwords, and bytes */	\
+	/* faster than words. Standard running time (O(4*n)) is reduced to O(2*n) */	\
+	/* for words and O(n) for bytes. Running time for floats depends on actual values... */	\
+	\
+	/* Get the byte of the first key that this pass looks at */	\
+	uint8 UniqueVal = *(((const uint8*)input)+(pass));	\
+	\
+	/* If that byte value accounts for every key, all keys share it and the pass is a no-op */	\
+	if(CurCount[UniqueVal]==nb)	PerformPass=false;
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Main sort routine.
+ *	This one is for integer values. After the call, mIndices contains a list of indices in sorted order, i.e. in the order you may process your data.
+ *	\param		input			[in] a list of integer values to sort
+ *	\param		nb				[in] number of values to sort
+ *	\param		signedvalues	[in] true to handle negative values, false if you know your input buffer only contains positive values
+ *	\return		Self-Reference
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues)
+{
+	// Outline: validate the arguments, make sure the index lists can hold nb entries, build the
+	// four byte histograms in a single read of the input (returning early when the current index
+	// order is already sorted), then run up to four counting passes, one per byte, swapping the
+	// two index lists after every pass that is actually performed.
+	uint32 i, j;
+
+	// Checkings
+	if(!input || !nb)	return *this;
+
+	// Stats
+	mTotalCalls++;
+
+	// Resize lists if needed
+	CHECK_RESIZE(nb);
+
+#ifdef RADIX_LOCAL_RAM
+	// Allocate histograms & offsets on the stack
+	uint32 mHistogram[256*4];
+	uint32 mOffset[256];
+#endif
+
+	// Create histograms (counters). Counters for all passes are created in one run.
+	// Pros:	read input buffer once instead of four times
+	// Cons:	mHistogram is 4Kb instead of 1Kb
+	// We must take care of signed/unsigned values for temporal coherence.... I just
+	// have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
+	if(!signedvalues)	{ CREATE_HISTOGRAMS(uint32, input);	}
+	else				{ CREATE_HISTOGRAMS(int32, input);	}
+
+	// Compute #negative values involved if needed
+	uint32 NbNegativeValues = 0;
+	if(signedvalues)
+	{
+		// An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
+		// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
+		// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
+		uint32* h3= &mHistogram[768];
+		for( i=128;i<256;i++)	NbNegativeValues += h3[i];	// 768 for last histogram, 128 for negative part
+	}
+
+	// Radix sort, j is the pass number (0=LSB, 3=MSB)
+	for( j=0;j<4;j++)
+	{
+		CHECK_PASS_VALIDITY(j);
+
+		// Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
+		// not a problem, numbers are correctly sorted anyway.
+		if(PerformPass)
+		{
+			// Should we care about negative values?
+			if(j!=3 || !signedvalues)
+			{
+				// Here we deal with positive values only
+
+				// Create offsets
+				mOffset[0] = 0;
+				for(i=1;i<256;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];
+			}
+			else
+			{
+				// This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
+
+				// Create biased offsets, in order for negative numbers to be sorted as well
+				mOffset[0] = NbNegativeValues;												// First positive number takes place after the negative ones
+				for(i=1;i<128;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];	// 1 to 128 for positive numbers
+
+				// Fixing the wrong place for negative values
+				mOffset[128] = 0;
+				for(i=129;i<256;i++)			mOffset[i] = mOffset[i-1] + CurCount[i-1];
+			}
+
+			// Perform Radix Sort. The input is only read, so it is viewed through const pointers.
+			const uint8* InputBytes		= (const uint8*)input;
+			const uint32* Indices		= mIndices;
+			const uint32* IndicesEnd	= &mIndices[nb];
+			InputBytes += j;
+			while(Indices!=IndicesEnd)
+			{
+				uint32 id = *Indices++;
+				// Byte j of key #id lives at offset id*4. The offset is computed in size_t because
+				// id<<2 evaluated in 32 bits wraps around as soon as id reaches 2^30.
+				mIndices2[mOffset[InputBytes[(size_t)id<<2]]++] = id;
+			}
+
+			// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
+			uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+		}
+	}
+	return *this;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Main sort routine.
+ *	This one is for floating-point values. After the call, mIndices contains a list of indices in sorted order, i.e. in the order you may process your data.
+ *	\param		input			[in] a list of floating-point values to sort
+ *	\param		nb				[in] number of values to sort
+ *	\return		Self-Reference
+ *	\warning	only sorts IEEE floating-point values
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+RadixSort& RadixSort::sort(const float* input2, uint32 nb)
+{
+	// Outline: same structure as the integer version. The first three passes are plain counting
+	// passes on the raw bytes; the last pass (sign/exponent byte) places negative values first
+	// and writes them in reverse order. Even when that last pass is skipped because every key
+	// has the same top byte, the list is still reversed if that byte denotes negative values.
+	uint32 i, j;
+
+	// Checkings
+	if(!input2 || !nb)	return *this;
+
+	// Stats
+	mTotalCalls++;
+
+	// Raw 32-bit view of the floats, used by the byte histograms and the radix passes.
+	// The data is only read through it, so the caller's const qualifier is kept.
+	const uint32* input = (const uint32*)input2;
+
+	// Resize lists if needed
+	CHECK_RESIZE(nb);
+
+#ifdef RADIX_LOCAL_RAM
+	// Allocate histograms & offsets on the stack
+	uint32 mHistogram[256*4];
+	uint32 mOffset[256];
+#endif
+
+	// Create histograms (counters). Counters for all passes are created in one run.
+	// Pros:	read input buffer once instead of four times
+	// Cons:	mHistogram is 4Kb instead of 1Kb
+	// Floating-point values are always supposed to be signed values, so there's only one code path there.
+	// Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
+	// is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
+	// generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
+	// wouldn't work with mixed positive/negative values....
+	{ CREATE_HISTOGRAMS(float, input2); }
+
+	// Compute #negative values involved if needed
+	uint32 NbNegativeValues = 0;
+	// An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
+	// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
+	// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
+	uint32* h3= &mHistogram[768];
+	for( i=128;i<256;i++)	NbNegativeValues += h3[i];	// 768 for last histogram, 128 for negative part
+
+	// Radix sort, j is the pass number (0=LSB, 3=MSB)
+	for( j=0;j<4;j++)
+	{
+		// Should we care about negative values?
+		if(j!=3)
+		{
+			// Here we deal with positive values only
+			CHECK_PASS_VALIDITY(j);
+
+			if(PerformPass)
+			{
+				// Create offsets
+				mOffset[0] = 0;
+				for( i=1;i<256;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];
+
+				// Perform Radix Sort
+				const uint8* InputBytes		= (const uint8*)input;
+				const uint32* Indices		= mIndices;
+				const uint32* IndicesEnd	= &mIndices[nb];
+				InputBytes += j;
+				while(Indices!=IndicesEnd)
+				{
+					uint32 id = *Indices++;
+					// Byte j of key #id lives at offset id*4. The offset is computed in size_t because
+					// id<<2 evaluated in 32 bits wraps around as soon as id reaches 2^30.
+					mIndices2[mOffset[InputBytes[(size_t)id<<2]]++] = id;
+				}
+
+				// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
+				uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+			}
+		}
+		else
+		{
+			// This is a special case to correctly handle negative values
+			CHECK_PASS_VALIDITY(j);
+
+			if(PerformPass)
+			{
+				// Create biased offsets, in order for negative numbers to be sorted as well
+				mOffset[0] = NbNegativeValues;												// First positive number takes place after the negative ones
+				for(i=1;i<128;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];	// 1 to 128 for positive numbers
+
+				// We must reverse the sorting order for negative numbers!
+				mOffset[255] = 0;
+				for(i=0;i<127;i++)		mOffset[254-i] = mOffset[255-i] + CurCount[255-i];	// Fixing the wrong order for negative values
+				for(i=128;i<256;i++)	mOffset[i] += CurCount[i];							// Fixing the wrong place for negative values
+
+				// Perform Radix Sort
+				for(i=0;i<nb;i++)
+				{
+					uint32 Radix = input[mIndices[i]]>>24;								// Radix byte, same as above. AND is useless here (uint32).
+					// ### cmp to be killed. Not good. Later.
+					if(Radix<128)		mIndices2[mOffset[Radix]++] = mIndices[i];		// Number is positive, same as above
+					else				mIndices2[--mOffset[Radix]] = mIndices[i];		// Number is negative, flip the sorting order
+				}
+				// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
+				uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+			}
+			else
+			{
+				// The pass is useless, yet we still have to reverse the order of current list if all values are negative.
+				if(UniqueVal>=128)
+				{
+					for(i=0;i<nb;i++)	mIndices2[i] = mIndices[nb-i-1];
+
+					// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
+					uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+				}
+			}
+		}
+	}
+	return *this;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Resets the inner indices. After the call, mIndices is reset.
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+void RadixSort::resetIndices()
+{
+	// Fill the primary list with 0, 1, 2, ... so that slot k designates input element k.
+	// All mCurrentSize slots are rewritten; the secondary list is left untouched.
+	uint32 k = 0;
+	while(k != mCurrentSize)
+	{
+		mIndices[k] = k;
+		++k;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Gets the ram used.
+ *	\return		memory used in bytes
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+uint32 RadixSort::usedRam() const
+{
+	// The two index lists, mCurrentSize dwords each
+	uint32 total = 2*mCurrentSize*sizeof(uint32);
+#ifndef RADIX_LOCAL_RAM
+	// Heap-allocated counters: four histograms of 256 entries plus one table of 256 offsets.
+	// In the stack-based variant these are not counted.
+	total += (256*4 + 256)*sizeof(uint32);
+#endif
+	// Plus the object itself
+	total += sizeof(RadixSort);
+	return total;
+}
--- a/src/nvcore/Radix.h
+++ b/src/nvcore/Radix.h
@ -0,0 +1,69 @@
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Contains source code from the article "Radix Sort Revisited".
+ *	\file		Radix.h
+ *	\author		Pierre Terdiman
+ *	\date		April, 4, 2000
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Include Guard
+#ifndef NV_CORE_RADIXSORT_H
+#define NV_CORE_RADIXSORT_H
+
+#include <nvcore/nvcore.h>
+
+
+#define RADIX_LOCAL_RAM
+
+
+/// Index-based radix sorter for 32-bit keys (integers or floats).
+/// sort() never modifies the keys: it takes them through a const pointer and produces a list of
+/// indices, available through indices(), giving the order in which the keys should be visited.
+/// The object keeps its index lists between calls; see hits() for how often that paid off.
+class NVCORE_API RadixSort {
+	NV_FORBID_COPY(RadixSort);
+public:
+	// Constructor/Destructor
+	RadixSort();
+	~RadixSort();
+
+	// Sorting methods
+	// Both overloads return a reference to the sorter itself and do nothing when input is NULL
+	// or nb is 0. For the integer overload, signedvalues selects whether the keys are ordered
+	// as signed (the default) or unsigned 32-bit values.
+	RadixSort & sort(const uint32* input, uint32 nb, bool signedvalues=true);
+	RadixSort & sort(const float* input, uint32 nb);
+
+	//! Access to results. mIndices is a list of indices in sorted order, i.e. in the order you may further process your data
+	inline uint32 * indices() const { return mIndices; }
+
+	//! mIndices2 gets trashed on calling the sort routine, but otherwise you can recycle it the way you want.
+	inline uint32 * recyclable() const { return mIndices2; }
+
+	// Stats
+	//! Memory used in bytes: the object, the two index lists and, when they are heap-allocated, the counters.
+	uint32 usedRam() const;
+
+	//! Returns the total number of calls to the radix sorter.
+	inline uint32 totalCalls()	const { return mTotalCalls;	}
+
+	//! Returns the number of premature exits due to temporal coherence.
+	inline uint32 hits() const { return mNbHits; }
+
+
+	private:
+	// The two members below only exist when RADIX_LOCAL_RAM is not defined; otherwise the sort
+	// routines use local arrays of the same names.
+#ifndef RADIX_LOCAL_RAM
+	uint32*			mHistogram;					//!< Counters for each byte
+	uint32*			mOffset;					//!< Offsets (nearly a cumulative distribution function)
+#endif
+	uint32			mCurrentSize;				//!< Current size of the indices list
+	uint32			mPreviousSize;				//!< Size involved in previous call
+	uint32*			mIndices;					//!< Two lists, swapped each pass
+	uint32*			mIndices2;
+
+	// Stats
+	// mTotalCalls counts sort() calls that got past the argument check; mNbHits counts the
+	// calls that returned early because the keys were already in order.
+	uint32			mTotalCalls;
+	uint32			mNbHits;
+
+	// Internal methods
+	// resize() reallocates both index lists for nb entries; resetIndices() refills mIndices
+	// with 0..mCurrentSize-1.
+	bool			resize(uint32 nb);
+	void			resetIndices();
+
+};
+
+
+#endif // NV_CORE_RADIXSORT_H
--- a/src/nvcore/RefCounted.cpp
+++ b/src/nvcore/RefCounted.cpp
@ -1,9 +0,0 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#include "RefCounted.h"
-
-using namespace nv;
-
-int nv::RefCounted::s_total_ref_count = 0;
-int nv::RefCounted::s_total_obj_count = 0;
-
--- a/src/nvcore/RefCounted.h
+++ b/src/nvcore/RefCounted.h
@ -1,114 +0,0 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#ifndef NV_CORE_REFCOUNTED_H
-#define NV_CORE_REFCOUNTED_H
-
-#include "nvcore.h"
-#include "Debug.h"
-
-
-namespace nv
-{
-
-	/// Reference counted base class to be used with SmartPtr and WeakPtr.
-	class RefCounted
-	{
-		NV_FORBID_COPY(RefCounted);
-	public:
-
-		/// Ctor.
-		RefCounted() : m_count(0)/*, m_weak_proxy(NULL)*/
-		{
-			s_total_obj_count++;
-		}
-
-		/// Virtual dtor.
-		virtual ~RefCounted()
-		{
-			nvCheck( m_count == 0 );
-			nvCheck( s_total_obj_count > 0 );
-			s_total_obj_count--;
-		}
-
-
-		/// Increase reference count.
-		uint addRef() const
-		{
-			s_total_ref_count++;
-			m_count++;
-			return m_count;
-		}
-
-
-		/// Decrease reference count and remove when 0.
-		uint release() const
-		{
-			nvCheck( m_count > 0 );
-			
-			s_total_ref_count--;
-			m_count--;
-			if( m_count == 0 ) {
-			//	releaseWeakProxy();
-				delete this;
-				return 0;
-			}
-			return m_count;
-		}
-	/*
-		/// Get weak proxy.
-		WeakProxy * getWeakProxy() const
-		{
-			if (m_weak_proxy == NULL) {
-				m_weak_proxy = new WeakProxy;
-				m_weak_proxy->AddRef();
-			}
-			return m_weak_proxy;
-		}
-
-		/// Release the weak proxy.	
-		void releaseWeakProxy() const
-		{
-			if (m_weak_proxy != NULL) {
-				m_weak_proxy->NotifyObjectDied();
-				m_weak_proxy->Release();
-				m_weak_proxy = NULL;
-			}
-		}
-	*/
-		/** @name Debug methods: */
-		//@{
-			/// Get reference count.
-			int refCount() const
-			{
-				return m_count;
-			}
-
-			/// Get total number of objects.
-			static int totalObjectCount()
-			{
-				return s_total_obj_count;
-			}
-
-			/// Get total number of references.
-			static int totalReferenceCount()
-			{
-				return s_total_ref_count;
-			}
-		//@}
-
-
-	private:
-
-		NVCORE_API static int s_total_ref_count;
-		NVCORE_API static int s_total_obj_count;
-
-		mutable int m_count;
-	//	mutable WeakProxy * weak_proxy;
-
-	};
-
-
-} // nv namespace
-
-
-#endif // NV_CORE_REFCOUNTED_H
--- a/src/nvcore/StdStream.h
+++ b/src/nvcore/StdStream.h
@ -1,7 +1,5 @@
-// This code is in the public domain -- castano@gmail.com
-
-#ifndef NV_CORE_STDSTREAM_H
-#define NV_CORE_STDSTREAM_H
+#ifndef NV_STDSTREAM_H
+#define NV_STDSTREAM_H

 #include <nvcore/Stream.h>

@ -368,4 +366,4 @@ private:
 } // nv namespace


-#endif // NV_CORE_STDSTREAM_H
+#endif // NV_STDSTREAM_H
--- a/src/nvcore/StrLib.cpp
+++ b/src/nvcore/StrLib.cpp
@ -1,6 +1,6 @@
 // This code is in the public domain -- castanyo@yahoo.es

-#include "StrLib.h"
+#include <nvcore/StrLib.h>

 #include <math.h>	// log
 #include <stdio.h>	// vsnprintf
--- a/src/nvcore/StrLib.h
+++ b/src/nvcore/StrLib.h
@ -3,8 +3,8 @@
 #ifndef NV_CORE_STRING_H
 #define NV_CORE_STRING_H

-#include "nvcore.h"
-#include "Containers.h"	// swap
+#include <nvcore/nvcore.h>
+#include <nvcore/Containers.h>	// swap

 #include <string.h> // strlen, strcmp, etc.

--- a/src/nvcore/Stream.h
+++ b/src/nvcore/Stream.h
@ -1,160 +1,160 @@
-// This code is in the public domain -- castano@gmail.com
+// This code is in the public domain -- castanyo@yahoo.es

-#ifndef NV_CORE_STREAM_H
-#define NV_CORE_STREAM_H
+#ifndef NVCORE_STREAM_H
+#define NVCORE_STREAM_H

-#include "nvcore.h"
-#include "Debug.h"
+#include <nvcore/nvcore.h>
+#include <nvcore/Debug.h>

 namespace nv
 {

-	/// Base stream class.
-	class NVCORE_CLASS Stream {
-	public:
-
-		enum ByteOrder {
-			LittleEndian = false,
-			BigEndian = true,
-		};
-
-		/// Get the byte order of the system.
-		static ByteOrder getSystemByteOrder() { 
-		#if NV_LITTLE_ENDIAN
-			return LittleEndian;
-		#else
-			return BigEndian;
-		#endif
-		}
-
-
-		/// Ctor.
-		Stream() : m_byteOrder(LittleEndian) { }
-
-		/// Virtual destructor.
-		virtual ~Stream() {}
-
-		/// Set byte order.
-		void setByteOrder(ByteOrder bo) { m_byteOrder = bo; }
-		
-		/// Get byte order.
-		ByteOrder byteOrder() const { return m_byteOrder; }
-
-		
-		/// Serialize the given data.
-		virtual uint serialize( void * data, uint len ) = 0;
-
-		/// Move to the given position in the archive.
-		virtual void seek( uint pos ) = 0;
-
-		/// Return the current position in the archive.
-		virtual uint tell() const = 0;
-
-		/// Return the current size of the archive.
-		virtual uint size() const = 0;
-
-		/// Determine if there has been any error.
-		virtual bool isError() const = 0;
-
-		/// Clear errors.
-		virtual void clearError() = 0;
-
-		/// Return true if the stream is at the end.
-		virtual bool isAtEnd() const = 0;
-
-		/// Return true if the stream is seekable.
-		virtual bool isSeekable() const = 0;
-		
-		/// Return true if this is an input stream.
-		virtual bool isLoading() const = 0;
-
-		/// Return true if this is an output stream.
-		virtual bool isSaving() const = 0;
-
-		
-		// friends	
-		friend Stream & operator<<( Stream & s, bool & c ) {
-		#if NV_OS_DARWIN
-			nvStaticCheck(sizeof(bool) == 4);
-			uint8 b = c ? 1 : 0;
-			s.serialize( &b, 1 );
-			c = (b == 1);
-		#else
-			nvStaticCheck(sizeof(bool) == 1);
-			s.serialize( &c, 1 );
-		#endif
-			return s;
-		}
-		friend Stream & operator<<( Stream & s, char & c ) {
-			nvStaticCheck(sizeof(char) == 1);
-			s.serialize( &c, 1 );
-			return s;
-		}
-		friend Stream & operator<<( Stream & s, uint8 & c ) {
-			nvStaticCheck(sizeof(uint8) == 1);
-			s.serialize( &c, 1 );
-			return s;
-		}
-		friend Stream & operator<<( Stream & s, int8 & c ) {
-			nvStaticCheck(sizeof(int8) == 1);
-			s.serialize( &c, 1 );
-			return s;
-		}
-		friend Stream & operator<<( Stream & s, uint16 & c ) {
-			nvStaticCheck(sizeof(uint16) == 2);
-			return s.byteOrderSerialize( &c, 2 );
-		}
-		friend Stream & operator<<( Stream & s, int16 & c ) {
-			nvStaticCheck(sizeof(int16) == 2);
-			return s.byteOrderSerialize( &c, 2 );
-		}
-		friend Stream & operator<<( Stream & s, uint32 & c ) {
-			nvStaticCheck(sizeof(uint32) == 4);
-			return s.byteOrderSerialize( &c, 4 );
-		}
-		friend Stream & operator<<( Stream & s, int32 & c ) {
-			nvStaticCheck(sizeof(int32) == 4);
-			return s.byteOrderSerialize( &c, 4 );
-		}
-		friend Stream & operator<<( Stream & s, uint64 & c ) {
-			nvStaticCheck(sizeof(uint64) == 8);
-			return s.byteOrderSerialize( &c, 8 );
-		}
-		friend Stream & operator<<( Stream & s, int64 & c ) {
-			nvStaticCheck(sizeof(int64) == 8);
-			return s.byteOrderSerialize( &c, 8 );
-		}
-		friend Stream & operator<<( Stream & s, float & c ) {
-			nvStaticCheck(sizeof(float) == 4);
-			return s.byteOrderSerialize( &c, 4 );
-		}
-		friend Stream & operator<<( Stream & s, double & c ) {
-			nvStaticCheck(sizeof(double) == 8);
-			return s.byteOrderSerialize( &c, 8 );
-		}
-
-	protected:
-
-		/// Serialize in the stream byte order.
-		Stream & byteOrderSerialize( void * v, uint len ) {
-			if( m_byteOrder == getSystemByteOrder() ) {
-				serialize( v, len );
-			}
-			else {
-				for( uint i = len; i > 0; i-- ) {
-					serialize( (uint8 *)v + i - 1, 1 );
-				}
-			}
-			return *this;
-		}
-
-
-	private:
-
-		ByteOrder m_byteOrder;
+/// Base stream class.
+///
+/// Abstract interface: concrete streams supply serialize() and the positioning/state queries
+/// declared below. The operator<< overloads are written purely in terms of serialize(). The
+/// bool overload copies the serialized byte back into its argument, which suggests the same
+/// operator serves both reading and writing, depending on the concrete stream.
+class NVCORE_CLASS Stream {
+public:

+	/// Byte order tag. The enumerators take the values false and true.
+	enum ByteOrder {
+		LittleEndian = false,
+		BigEndian = true,
 	};

+	/// Get the byte order of the system.
+	/// Selected at compile time from NV_LITTLE_ENDIAN.
+	static ByteOrder getSystemByteOrder() { 
+#	if NV_LITTLE_ENDIAN
+		return LittleEndian;
+#	else
+		return BigEndian;
+#	endif
+	}
+
+
+	/// Ctor.
+	/// A new stream starts in little-endian byte order, whatever the system byte order is.
+	Stream() : m_byteOrder(LittleEndian) { }
+
+	/// Virtual destructor.
+	virtual ~Stream() {}
+
+	/// Set byte order.
+	/// Affects the multi-byte operator<< overloads, which go through byteOrderSerialize().
+	void setByteOrder(ByteOrder bo) { m_byteOrder = bo; }
+	
+	/// Get byte order.
+	ByteOrder byteOrder() const { return m_byteOrder; }
+
+	
+	/// Serialize the given data.
+	virtual uint serialize( void * data, uint len ) = 0;
+
+	/// Move to the given position in the archive.
+	virtual void seek( uint pos ) = 0;
+
+	/// Return the current position in the archive.
+	virtual uint tell() const = 0;
+
+	/// Return the current size of the archive.
+	virtual uint size() const = 0;
+
+	/// Determine if there has been any error.
+	virtual bool isError() const = 0;
+
+	/// Clear errors.
+	virtual void clearError() = 0;
+
+	/// Return true if the stream is at the end.
+	virtual bool isAtEnd() const = 0;
+
+	/// Return true if the stream is seekable.
+	virtual bool isSeekable() const = 0;
+	
+	/// Return true if this is an input stream.
+	virtual bool isLoading() const = 0;
+
+	/// Return true if this is an output stream.
+	virtual bool isSaving() const = 0;
+
+	
+	// friends
+	// Serialization operators, one per fundamental type. Each one statically checks the size
+	// it relies on. One-byte types call serialize() directly; wider types go through
+	// byteOrderSerialize() so that the stream's byte order is honoured.
+
+	/// bool is always transferred as a single byte. On Darwin builds this goes through a
+	/// uint8 temporary holding 0 or 1, and the flag is rebuilt from that byte after the call
+	/// (only the value 1 maps back to true).
+	friend Stream & operator<<( Stream & s, bool & c ) {
+#	if NV_OS_DARWIN
+		nvStaticCheck(sizeof(bool) == 4);
+		uint8 b = c ? 1 : 0;
+		s.serialize( &b, 1 );
+		c = (b == 1);
+#	else
+		nvStaticCheck(sizeof(bool) == 1);
+		s.serialize( &c, 1 );
+#	endif
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, char & c ) {
+		nvStaticCheck(sizeof(char) == 1);
+		s.serialize( &c, 1 );
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, uint8 & c ) {
+		nvStaticCheck(sizeof(uint8) == 1);
+		s.serialize( &c, 1 );
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, int8 & c ) {
+		nvStaticCheck(sizeof(int8) == 1);
+		s.serialize( &c, 1 );
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, uint16 & c ) {
+		nvStaticCheck(sizeof(uint16) == 2);
+		return s.byteOrderSerialize( &c, 2 );
+	}
+	friend Stream & operator<<( Stream & s, int16 & c ) {
+		nvStaticCheck(sizeof(int16) == 2);
+		return s.byteOrderSerialize( &c, 2 );
+	}
+	friend Stream & operator<<( Stream & s, uint32 & c ) {
+		nvStaticCheck(sizeof(uint32) == 4);
+		return s.byteOrderSerialize( &c, 4 );
+	}
+	friend Stream & operator<<( Stream & s, int32 & c ) {
+		nvStaticCheck(sizeof(int32) == 4);
+		return s.byteOrderSerialize( &c, 4 );
+	}
+	friend Stream & operator<<( Stream & s, uint64 & c ) {
+		nvStaticCheck(sizeof(uint64) == 8);
+		return s.byteOrderSerialize( &c, 8 );
+	}
+	friend Stream & operator<<( Stream & s, int64 & c ) {
+		nvStaticCheck(sizeof(int64) == 8);
+		return s.byteOrderSerialize( &c, 8 );
+	}
+	friend Stream & operator<<( Stream & s, float & c ) {
+		nvStaticCheck(sizeof(float) == 4);
+		return s.byteOrderSerialize( &c, 4 );
+	}
+	friend Stream & operator<<( Stream & s, double & c ) {
+		nvStaticCheck(sizeof(double) == 8);
+		return s.byteOrderSerialize( &c, 8 );
+	}
+
+protected:
+
+	/// Serialize in the stream byte order.
+	/// When the stream order matches the system order the value is passed to serialize() in one
+	/// call. Otherwise its bytes are serialized one at a time from the last to the first, which
+	/// reverses their order in the stream. The return value of serialize() is not examined.
+	Stream & byteOrderSerialize( void * v, uint len ) {
+		if( m_byteOrder == getSystemByteOrder() ) {
+			serialize( v, len );
+		}
+		else {
+			for( uint i = len; i > 0; i-- ) {
+				serialize( (uint8 *)v + i - 1, 1 );
+			}
+		}
+		return *this;
+	}
+
+
+private:
+
+	/// Byte order used by byteOrderSerialize(); see setByteOrder().
+	ByteOrder m_byteOrder;
+
+};
+
 } // nv namespace

-#endif // NV_CORE_STREAM_H
+#endif // NVCORE_STREAM_H
--- a/src/nvcore/TextReader.cpp
+++ b/src/nvcore/TextReader.cpp
@ -1,6 +1,6 @@
-// This code is in the public domain -- castano@gmail.com
+// This code is in the public domain -- castanyo@yahoo.es

-#include "TextReader.h"
+#include <nvcore/TextReader.h>

 using namespace nv;

@ -48,7 +48,7 @@ const char * TextReader::readToEnd()
 	m_text.reserve(size + 1);
 	m_text.resize(size);
 	
-	m_stream->serialize(m_text.mutableBuffer(), size);
+	m_stream->serialize(m_text.unsecureBuffer(), size);
 	m_text.pushBack('\0');
 	
 	return m_text.buffer();
--- a/src/nvcore/TextReader.h
+++ b/src/nvcore/TextReader.h
@ -1,10 +1,11 @@
-// This code is in the public domain -- castano@gmail.com
+// This code is in the public domain -- castanyo@yahoo.es

-#ifndef NV_CORE_TEXTREADER_H
-#define NV_CORE_TEXTREADER_H
+#ifndef NVCORE_TEXTREADER_H
+#define NVCORE_TEXTREADER_H

-#include "Containers.h"
-#include "Stream.h"
+#include <nvcore/nvcore.h>
+#include <nvcore/Stream.h>
+#include <nvcore/Containers.h>

 namespace nv
 {
@ -34,4 +35,4 @@ private:

 } // nv namespace

-#endif // NV_CORE_TEXTREADER_H
+#endif // NVCORE_TEXTREADER_H
--- a/src/nvcore/TextWriter.cpp
+++ b/src/nvcore/TextWriter.cpp
@ -1,6 +1,6 @@
-// This code is in the public domain -- castano@gmail.com
+// This code is in the public domain -- castanyo@yahoo.es

-#include "TextWriter.h"
+#include <nvcore/TextWriter.h>

 using namespace nv;

--- a/src/nvcore/TextWriter.h
+++ b/src/nvcore/TextWriter.h
@ -1,10 +1,11 @@
-// This code is in the public domain -- castano@gmail.com
+// This code is in the public domain -- castanyo@yahoo.es

-#ifndef NV_CORE_TEXTWRITER_H
-#define NV_CORE_TEXTWRITER_H
+#ifndef NVCORE_TEXTWRITER_H
+#define NVCORE_TEXTWRITER_H

-#include "StrLib.h"
-#include "Stream.h"
+#include <nvcore/nvcore.h>
+#include <nvcore/Stream.h>
+#include <nvcore/StrLib.h>

 namespace nv
 {
--- a/src/nvcore/Timer.h
+++ b/src/nvcore/Timer.h
@ -1,60 +0,0 @@
-// This code is in the public domain -- castano@gmail.com
-
-#ifndef NV_CORE_TIMER_H
-#define NV_CORE_TIMER_H
-
-#include "nvcore.h"
-
-#if 1
-
-#include <time.h> //clock
-
-class NVCORE_CLASS Timer
-{
-public:
-	Timer() {}
-	
-	void start() { m_start = clock(); }
-	void stop() { m_stop = clock(); }
-
-	float elapsed() const { return float(m_stop - m_start) / CLOCKS_PER_SEC; }
-	
-private:
-	clock_t m_start;
-	clock_t m_stop;
-};
-
-#else
-
-#define WINDOWS_LEAN_AND_MEAN
-#define VC_EXTRALEAN
-#define NOMINMAX
-#include <windows.h>
-
-class NVCORE_CLASS Timer
-{
-public:
-	Timer() {
-		// get the tick frequency from the OS
-		QueryPerformanceFrequency((LARGE_INTEGER*) &m_frequency);
-	}
-	
-	void start() { QueryPerformanceCounter((LARGE_INTEGER*) &m_start); }
-	void stop() { QueryPerformanceCounter((LARGE_INTEGER*) &m_stop); }
-
-	int elapsed() const {
-		return (int)1000 * ((double)m_stop.QuadPart - (double)m_start.QuadPart) / (double)m_frequency.QuadPart;
-	}
-	
-private:
-	LARGE_INTEGER  m_frequency;
-	LARGE_INTEGER  m_start;
-	LARGE_INTEGER  m_stop;
-
-};
-
-#endif // 0
-
-
-
-#endif // NV_CORE_TIMER_H
--- a/src/nvcore/Tokenizer.cpp
+++ b/src/nvcore/Tokenizer.cpp
@ -0,0 +1,229 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/Tokenizer.h>
+#include <nvcore/StrLib.h>
+
+#include <stdio.h> // vsscanf
+#include <stdarg.h>	// va_list
+#include <stdlib.h>	// atof, atoi
+
+#if NV_CC_MSVC
+#if 0 // This doesn't work on MSVC for x64
+/* vsscanf for Win32
+ * Written 5/2003 by <mgix@mgix.com>
+ * This code is in the Public Domain
+ */
+
+#include <malloc.h> // alloca
+//#include <string.h>
+
+// Stand-in for vsscanf: copies the buffer, the format and the variadic
+// arguments onto a temporary stack and calls sscanf (sscanf_s when
+// _MSC_VER >= 1400) directly on it. The inline assembly uses the 32-bit
+// esp/eax registers, and the enclosing "#if 0" keeps the function out of
+// the build.
+static int vsscanf(const char * buffer, const char * format, va_list argPtr)
+{
+	// Get an upper bound for the # of args
+	// ("%*" and "%%" do not consume an argument, so they are not counted.)
+	size_t count = 0;
+	const char *p = format;
+	while(1) {
+		char c = *(p++);
+		if(c==0) break;
+		if(c=='%' && (p[0]!='*' && p[0]!='%')) ++count;
+	}
+
+	// Make a local stack
+	// (two extra slots hold the buffer and format pointers.)
+	size_t stackSize = (2+count)*sizeof(void*);
+	void **newStack = (void**)alloca(stackSize);
+
+	// Fill local stack the way sscanf likes it
+	newStack[0] = (void*)buffer;
+	newStack[1] = (void*)format;
+	memcpy(newStack+2, argPtr, count*sizeof(void*));
+
+	// @@ Use: CALL DWORD PTR [sscanf]
+	
+	// Warp into system sscanf with new stack
+	int result;
+	void *savedESP;
+	__asm
+	{
+		mov     savedESP, esp
+		mov     esp, newStack
+#if _MSC_VER >= 1400
+		call	DWORD PTR [sscanf_s]
+#else
+		call	DWORD PTR [sscanf]
+#endif
+		mov     esp, savedESP
+		mov     result, eax
+	}
+	return result;
+}
+#endif
+#endif
+
+using namespace nv;
+
+/// Construct an empty token: it points at an empty string literal and has length 0.
+Token::Token() :
+	m_str(""), m_len(0)
+{
+}
+
+/// Copy a token. Only the pointer and the length are copied, so both tokens
+/// refer to the same characters.
+Token::Token(const Token & token) : 
+	m_str(token.m_str), m_len(token.m_len)
+{
+}
+
+/// Construct a token covering the first @a len characters at @a str.
+/// The pointer is stored as is; the characters are not copied.
+Token::Token(const char * str, int len) : 
+	m_str(str), m_len(len)
+{
+}
+
+/// Compare the token text against a null-terminated string.
+/// The token is not null-terminated, so the lengths are compared first and
+/// then the m_len characters; comparing only the first m_len characters
+/// would make the token "v" equal to the string "vn".
+bool Token::operator==(const char * str) const
+{
+	return strlen(str) == size_t(m_len) && strncmp(m_str, str, m_len) == 0;
+}
+/// Return true when the token text differs from the null-terminated string
+/// @a str, either in length or in any of its m_len characters. A string
+/// that merely starts with the token text counts as different.
+bool Token::operator!=(const char * str) const
+{
+	return strlen(str) != size_t(m_len) || strncmp(m_str, str, m_len) != 0;
+}
+
+/// Return true when the token is empty, i.e. its length is 0 as for a
+/// default-constructed token.
+bool Token::isNull()
+{
+	// The test used to be inverted and reported non-empty tokens as null.
+	return m_len == 0;
+}
+
+/// Parse the token text as a float using atof.
+/// m_len is not passed to atof, so parsing ends at the first character that
+/// cannot belong to the number rather than at the token length.
+float Token::toFloat() const
+{
+	return float(atof(m_str));
+}
+
+/// Parse the token text as an int using atoi.
+/// m_len is not passed to atoi, so parsing ends at the first non-digit
+/// character rather than at the token length.
+int Token::toInt() const
+{
+	return atoi(m_str);
+}
+
+/// Parse the token text as an unsigned decimal integer.
+/// m_len is not passed to the parser, so parsing ends at the first
+/// non-digit character rather than at the token length.
+uint Token::toUnsignedInt() const
+{
+	// strtoul covers the whole unsigned range; atoi cannot represent values
+	// above INT_MAX.
+	return uint(strtoul(m_str, NULL, 10));
+}
+
+/// Build a String from the token's pointer and length.
+String Token::toString() const
+{
+	return String(m_str, m_len);
+}
+
+/// Scan the token text with a scanf-style @a format.
+/// @a count is the number of conversions the caller expects; the variadic
+/// arguments receive the converted values. Returns true only when vsscanf
+/// reports exactly @a count conversions.
+bool Token::parse(const char * format, int count, ...) const
+{
+	va_list args;
+	va_start(args, count);
+	const int matched = vsscanf(m_str, format, args);
+	va_end(args);
+
+	return matched == count;
+}
+
+
+/// Create a tokenizer that reads lines from @a stream.
+/// Delimiters default to "{}()=" and spaces to " \t"; line and column
+/// numbers start at 0.
+Tokenizer::Tokenizer(Stream * stream) : 
+	m_reader(stream), m_line(NULL), m_lineNumber(0), m_columnNumber(0), m_delimiters("{}()="), m_spaces(" \t")
+{
+	// m_line starts out NULL so that "no line read yet" is a defined state
+	// instead of an uninitialized pointer.
+}
+
+/// Advance to the next line and try to read its first token.
+/// Returns false when no further line can be read. With @a skipEmptyLines
+/// set, lines that yield no token are skipped; otherwise the function
+/// returns true after the first line read, even if it holds no token.
+bool Tokenizer::nextLine(bool skipEmptyLines /*= true*/)
+{
+	for (;;) {
+		if (!readLine()) {
+			return false;
+		}
+
+		// readToken() is evaluated first so that a token is always attempted.
+		if (readToken() || !skipEmptyLines) {
+			return true;
+		}
+	}
+}
+
+/// Read the next token of the current line.
+/// When the line has no token left, the result is false unless
+/// @a skipEndOfLine is set, in which case the search continues on the
+/// following non-empty lines through nextLine(true).
+bool Tokenizer::nextToken(bool skipEndOfLine /*= false*/)
+{
+	if (readToken()) {
+		return true;
+	}
+
+	return skipEndOfLine && nextLine(true);
+}
+	
+/// Read the next token of the current line into m_token.
+/// A delimiter character forms a token by itself; any other token runs up
+/// to the next space, delimiter or end of line. Returns false when there is
+/// no current line or only the end of the line is left.
+bool Tokenizer::readToken()
+{
+	// readLine() leaves m_line NULL when it fails; scanning would then
+	// dereference a null pointer.
+	if (m_line == NULL) {
+		return false;
+	}
+
+	skipSpaces();
+	
+	const char * begin = m_line + m_columnNumber;
+	
+	if (*begin == '\0') {
+		return false;
+	}
+	
+	char c = readChar();
+	if (isDelimiter(c)) {
+		m_token = Token(begin, 1);
+		return true;
+	}
+	
+	// @@ Add support for quoted tokens "", ''
+	
+	int len = 0;
+	while (!isDelimiter(c) && !isSpace(c) && c != '\0') {
+		c = readChar();
+		len++;
+	}
+	// The loop consumed the terminating character; step back onto it.
+	m_columnNumber--;
+	
+	m_token = Token(begin, len);
+	
+	return true;
+}
+
+/// Return the character at the current column of the current line and
+/// advance the column by one. No bounds check is performed here.
+char Tokenizer::readChar()
+{
+	return m_line[m_columnNumber++];
+}
+
+/// Fetch the next line from the text reader and reset the column to 0.
+/// The line counter is incremented before the read, so it also advances
+/// when the read fails. Returns false when the reader returns NULL.
+bool Tokenizer::readLine()
+{
+	m_lineNumber++;
+	m_columnNumber = 0;
+	m_line = m_reader.readLine();
+	return m_line != NULL;
+}
+
+/// Move the column forward to the first character of the current line that
+/// is not one of the configured space characters.
+void Tokenizer::skipSpaces()
+{
+	// Peek instead of consume: the column only moves past actual spaces.
+	while (isSpace(m_line[m_columnNumber])) {
+		m_columnNumber++;
+	}
+}
+
+/// Return true when @a c is one of the characters in m_spaces.
+/// The terminating '\0' of m_spaces is never matched.
+bool Tokenizer::isSpace(char c)
+{
+	for (const char * p = m_spaces; *p != '\0'; p++) {
+		if (*p == c) {
+			return true;
+		}
+	}
+	return false;
+}
+
+/// Return true when @a c is one of the characters in m_delimiters.
+/// The terminating '\0' of m_delimiters is never matched.
+bool Tokenizer::isDelimiter(char c)
+{
+	for (const char * p = m_delimiters; *p != '\0'; p++) {
+		if (*p == c) {
+			return true;
+		}
+	}
+	return false;
+}
+
--- a/src/nvcore/Tokenizer.h
+++ b/src/nvcore/Tokenizer.h
@ -0,0 +1,99 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_CORE_TOKENIZER_H
+#define NV_CORE_TOKENIZER_H
+
+#include <nvcore/nvcore.h>
+#include <nvcore/Stream.h>
+#include <nvcore/TextReader.h>
+#include <nvcore/StrLib.h>
+
+namespace nv
+{
+	/// A token produced by the Tokenizer.
+	/// A token is a (pointer, length) pair; the class declares no destructor
+	/// and holds no storage of its own for the characters.
+	class NVCORE_CLASS Token
+	{
+	public:
+		Token();
+		Token(const Token & token);
+		Token(const char * str, int len);		
+		
+		// Comparison against a null-terminated string.
+		bool operator==(const char * str) const;
+		bool operator!=(const char * str) const;
+
+		bool isNull();
+		
+		// Conversions of the token text.
+		float toFloat() const;
+		int toInt() const;
+		uint toUnsignedInt() const;
+		String toString() const;
+		
+		// scanf-style parsing; count is the number of conversions expected.
+		// In the format attribute the implicit this pointer counts as
+		// argument 1, so the format string is argument 2 and the variadic
+		// arguments start at 4.
+		bool parse(const char * format, int count, ...) const __attribute__((format (scanf, 2, 4)));
+		
+	private:
+		// Start of the token text and its length in characters.
+		const char * m_str;
+		int m_len;
+	};
+	
+	/// Exception thrown by the tokenizer.
+	/// Carries the line and column passed to its constructor.
+	/// NOTE(review): the Tokenizer declaration in this header has no inline
+	/// throw site; confirm where this type is actually thrown.
+	class TokenizerException
+	{
+	public:
+		TokenizerException(int line, int column) : m_line(line), m_column(column) {}
+		
+		int line() const { return m_line; }
+		int column() const { return m_column; }
+		
+	private:
+		int m_line;
+		int m_column;
+	};
+	
+	// @@ Use enums instead of bools for clarity!
+	//enum SkipEmptyLines { skipEmptyLines, noSkipEmptyLines };
+	//enum SkipEndOfLine { skipEndOfLine, noSkipEndOfLine };
+
+	/// A simple stream tokenizer.
+	/// Lines are pulled from a TextReader built on the given stream; the
+	/// current token is exposed through token().
+	class NVCORE_CLASS Tokenizer
+	{
+	public:
+		Tokenizer(Stream * stream);
+		
+		// Both return false when no token could be produced.
+		bool nextLine(bool skipEmptyLines = true);
+		bool nextToken(bool skipEndOfLine = false);
+		
+		// Reference to the member token; it is overwritten by later reads.
+		const Token & token() const { return m_token; }
+		
+		int lineNumber() const { return m_lineNumber; }
+		int columnNumber() const { return m_columnNumber; }
+		
+		// The setters store the pointer only; the string is not copied, so
+		// it has to outlive the tokenizer's use of it.
+		void setDelimiters(const char * str) { m_delimiters = str; }
+		const char * delimiters() const { return m_delimiters; }
+		
+		void setSpaces(const char * str) { m_spaces = str; }
+		const char * spaces() const { return m_spaces; }
+		
+	private:
+		char readChar();
+		bool readLine();
+		bool readToken(); 
+		void skipSpaces();
+		bool isSpace(char c);
+		bool isDelimiter(char c);
+		
+	private:
+		TextReader m_reader;
+		// Current line as returned by the reader.
+		const char * m_line;
+		Token m_token;
+		
+		int m_lineNumber;
+		int m_columnNumber;
+		
+		// Null-terminated character sets used by isDelimiter() and isSpace().
+		const char * m_delimiters;
+		const char * m_spaces;
+	};
+	
+} // nv namespace
+
+
+#endif // NV_CORE_TOKENIZER_H
--- a/src/nvcore/nvcore.h
+++ b/src/nvcore/nvcore.h
@ -22,7 +22,7 @@


 // Platform definitions
-#include <posh.h>
+#include "poshlib/posh.h"

 // OS:
 // NV_OS_WIN32
@ -38,9 +38,6 @@
 #if defined POSH_OS_LINUX
 #	define NV_OS_LINUX 1
 #	define NV_OS_UNIX 1
-#elif defined POSH_OS_FREEBSD
-#	define NV_OS_FREEBSD 1
-#	define NV_OS_UNIX 1
 #elif defined POSH_OS_CYGWIN32
 #	define NV_OS_CYGWIN 1
 #elif defined POSH_OS_MINGW
@ -129,13 +126,6 @@
 #define NV_DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2
 #define NV_STRING_JOIN3(arg1, arg2, arg3) NV_DO_STRING_JOIN3(arg1, arg2, arg3)
 #define NV_DO_STRING_JOIN3(arg1, arg2, arg3) arg1 ## arg2 ## arg3
-#define NV_STRING2(x) #x
-#define NV_STRING(x) NV_STRING2(x)
-#if NV_CC_GNUC
-#define NV_FILE_LINE __FILE__ ":" NV_STRING(__LINE__) ": "
-#else
-#define NV_FILE_LINE __FILE__ "(" NV_STRING(__LINE__) ") : "
-#endif

 // Startup initialization macro.
 #define NV_AT_STARTUP(some_code) \
@ -168,7 +158,7 @@
 #elif NV_CC_GNUC
 #	if NV_OS_LINUX
 #		include "DefsGnucLinux.h"
-#	elif NV_OS_DARWIN || NV_OS_FREEBSD
+#	elif NV_OS_DARWIN
 #		include "DefsGnucDarwin.h"
 #	elif NV_OS_MINGW
 #		include "DefsGnucWin32.h"
--- a/src/nvcore/poshlib/CMakeLists.txt
+++ b/src/nvcore/poshlib/CMakeLists.txt
@ -0,0 +1,7 @@
+
+# Static posh library; posh.h is listed next to posh.c as a source.
+ADD_LIBRARY(posh STATIC
+	posh.c
+	posh.h)
+
--- a/src/nvcore/poshlib/posh.c
+++ b/src/nvcore/poshlib/posh.c
--- a/src/nvcore/poshlib/posh.h
+++ b/src/nvcore/poshlib/posh.h
--- a/src/nvimage/BlockDXT.cpp
+++ b/src/nvimage/BlockDXT.cpp
@ -21,13 +21,10 @@
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 // OTHER DEALINGS IN THE SOFTWARE.

-#include "BlockDXT.h"
-
-#include <nvimage/ColorBlock.h>
-
 #include <nvcore/Stream.h>
-#include <nvcore/Containers.h> // swap

+#include "ColorBlock.h"
+#include "BlockDXT.h"

 using namespace nv;

@ -39,9 +36,9 @@ using namespace nv;
 uint BlockDXT1::evaluatePalette(Color32 color_array[4]) const
 {
 	// Does bit expansion before interpolation.
-	color_array[0].r = (col0.r << 3) | (col0.r >> 2);
-	color_array[0].g = (col0.g << 2) | (col0.g >> 4);
 	color_array[0].b = (col0.b << 3) | (col0.b >> 2);
+	color_array[0].g = (col0.g << 2) | (col0.g >> 4);
+	color_array[0].r = (col0.r << 3) | (col0.r >> 2);
 	color_array[0].a = 0xFF;
 	
 	// @@ Same as above, but faster?
@ -93,51 +90,6 @@ uint BlockDXT1::evaluatePalette(Color32 color_array[4]) const
 	}
 }

-
-uint BlockDXT1::evaluatePaletteNV5x(Color32 color_array[4]) const
-{
-	// Does bit expansion before interpolation.
-	color_array[0].r = (3 * col0.r * 22) / 8;
-	color_array[0].g = (col0.g << 2) | (col0.g >> 4);
-	color_array[0].b = (3 * col0.b * 22) / 8;
-	color_array[0].a = 0xFF;
-
-	color_array[1].r = (3 * col1.r * 22) / 8;
-	color_array[1].g = (col1.g << 2) | (col1.g >> 4);
-	color_array[1].b = (3 * col1.b * 22) / 8;
-	color_array[1].a = 0xFF;
-	
-	if( col0.u > col1.u ) {
-		// Four-color block: derive the other two colors.
-		color_array[2].r = (2 * col0.r + col1.r) * 22 / 8;
-		color_array[2].g = (256 * color_array[0].g + (color_array[1].g - color_array[0].g)/4 + 128 + (color_array[1].g - color_array[0].g) * 80) / 256;
-		color_array[2].b = (2 * col0.b + col1.b) * 22 / 8;
-		color_array[2].a = 0xFF;
-		
-		color_array[3].r = (2 * col1.r + col0.r) * 22 / 8;
-		color_array[3].g = (256 * color_array[1].g + (color_array[0].g - color_array[1].g)/4 + 128 + (color_array[0].g - color_array[1].g) * 80) / 256;
-		color_array[3].b = (2 * col1.b + col0.b) * 22 / 8;
-		
-		color_array[3].a = 0xFF;
-		return 4;
-	}
-	else {
-		// Three-color block: derive the other color.
-		color_array[2].r = (col0.r + col1.r) * 33 / 8;
-		color_array[2].g = (256 * color_array[0].g + (color_array[1].g - color_array[0].g)/4 + 128 + (color_array[1].g - color_array[0].g) * 128) / 256;
-		color_array[2].b = (col0.b + col1.b) * 33 / 8;
-		color_array[2].a = 0xFF;
-		
-		// Set all components to 0 to match DXT specs.
-		color_array[3].r = 0x00; // color_array[2].r;
-		color_array[3].g = 0x00; // color_array[2].g;
-		color_array[3].b = 0x00; // color_array[2].b;
-		color_array[3].a = 0x00;
-		
-		return 3;
-	}
-}
-
 // Evaluate palette assuming 3 color block.
 void BlockDXT1::evaluatePalette3(Color32 color_array[4]) const
 {
@ -190,6 +142,95 @@ void BlockDXT1::evaluatePalette4(Color32 color_array[4]) const
 }


+/* Jason Dorie's code.
+// ----------------------------------------------------------------------------
+// Build palette for a 3 color + transparent black block
+// ----------------------------------------------------------------------------
+void DXTCGen::BuildCodes3(cbVector *pVects, cbVector &v1, cbVector &v2)
+{
+	//pVects[0] = v1;
+	//pVects[2] = v2;
+	//pVects[1][0] = v1[0];
+	//pVects[1][1] = (BYTE)( ((long)v1[1] + (long)v2[1]) / 2 );
+	//pVects[1][2] = (BYTE)( ((long)v1[2] + (long)v2[2]) / 2 );
+	//pVects[1][3] = (BYTE)( ((long)v1[3] + (long)v2[3]) / 2 );
+
+	__asm {
+		mov			ecx, dword ptr pVects
+		mov			eax, dword ptr v1
+		mov			ebx, dword ptr v2
+
+		movd		mm0, [eax]
+		movd		mm1, [ebx]
+		pxor		mm2, mm2
+		nop
+
+		movd		[ecx], mm0
+		movd		[ecx+8], mm1
+
+		punpcklbw	mm0, mm2
+		punpcklbw	mm1, mm2
+
+		paddw		mm0, mm1
+		psrlw		mm0, 1
+
+		packuswb	mm0, mm0
+		movd		[ecx+4], mm0
+	}
+	// *(long *)&pVects[1] = r1;
+}
+
+__int64 ScaleOneThird = 0x5500550055005500;
+
+// ----------------------------------------------------------------------------
+// Build palette for a 4 color block
+// ----------------------------------------------------------------------------
+void DXTCGen::BuildCodes4(cbVector *pVects, cbVector &v1, cbVector &v2)
+{
+// 	pVects[0] = v1;
+// 	pVects[3] = v2;
+// 
+// 	pVects[1][0] = v1[0];
+// 	pVects[1][1] = (BYTE)( ((long)v1[1] * 2 + (long)v2[1]) / 3 );
+// 	pVects[1][2] = (BYTE)( ((long)v1[2] * 2 + (long)v2[2]) / 3 );
+// 	pVects[1][3] = (BYTE)( ((long)v1[3] * 2 + (long)v2[3]) / 3 );
+// 
+// 	pVects[2][0] = v1[0];
+// 	pVects[2][1] = (BYTE)( ((long)v2[1] * 2 + (long)v1[1]) / 3 );
+// 	pVects[2][2] = (BYTE)( ((long)v2[2] * 2 + (long)v1[2]) / 3 );
+// 	pVects[2][3] = (BYTE)( ((long)v2[3] * 2 + (long)v1[3]) / 3 );
+
+	__asm {
+		mov			ecx, dword ptr pVects
+		mov			eax, dword ptr v1
+		mov			ebx, dword ptr v2
+
+		movd		mm0, [eax]
+		movd		mm1, [ebx]
+
+		pxor		mm2, mm2
+		movd		[ecx], mm0
+		movd		[ecx+12], mm1
+
+		punpcklbw	mm0, mm2
+		punpcklbw	mm1, mm2
+		movq		mm3, mm0		// mm3 = v0
+
+		paddw		mm0, mm1		// mm0 = v0 + v1
+		paddw		mm3, mm3		// mm3 = v0*2
+
+		paddw		mm0, mm1		// mm0 = v0 + v1*2
+		paddw		mm1, mm3		// mm1 = v0*2 + v1
+
+		pmulhw		mm0, ScaleOneThird
+		pmulhw		mm1, ScaleOneThird
+		packuswb	mm1, mm0
+
+		movq		[ecx+4], mm1
+	}
+}
+*/
+
 void BlockDXT1::decodeBlock(ColorBlock * block) const
 {
 	nvDebugCheck(block != NULL);
@ -207,24 +248,6 @@ void BlockDXT1::decodeBlock(ColorBlock * block) const
 	}	
 }

-void BlockDXT1::decodeBlockNV5x(ColorBlock * block) const
-{
-	nvDebugCheck(block != NULL);
-
-	// Decode color block.
-	Color32 color_array[4];
-	evaluatePaletteNV5x(color_array);
-
-	// Write color block.
-	for( uint j = 0; j < 4; j++ ) {
-		for( uint i = 0; i < 4; i++ ) {
-			uint idx = (row[j] >> (2 * i)) & 3;
-			block->color(i, j) = color_array[idx];
-		}
-	}
-}
-
-
 void BlockDXT1::setIndices(int * idx)
 {
 	indices = 0;
@ -263,14 +286,6 @@ void BlockDXT3::decodeBlock(ColorBlock * block) const
 	alpha.decodeBlock(block);
 }

-void BlockDXT3::decodeBlockNV5x(ColorBlock * block) const
-{
-	nvDebugCheck(block != NULL);
-	
-	color.decodeBlockNV5x(block);
-	alpha.decodeBlock(block);
-}
-
 void AlphaBlockDXT3::decodeBlock(ColorBlock * block) const
 {
 	nvDebugCheck(block != NULL);
@ -451,17 +466,7 @@ void BlockDXT5::decodeBlock(ColorBlock * block) const
 	
 	// Decode alpha.
 	alpha.decodeBlock(block);
-}

-void BlockDXT5::decodeBlockNV5x(ColorBlock * block) const
-{
-	nvDebugCheck(block != NULL);
-	
-	// Decode color.
-	color.decodeBlockNV5x(block);
-	
-	// Decode alpha.
-	alpha.decodeBlock(block);
 }

 /// Flip DXT5 block vertically.
--- a/src/nvimage/BlockDXT.h
+++ b/src/nvimage/BlockDXT.h
@ -47,13 +47,11 @@ namespace nv
 		bool isFourColorMode() const;
 	
 		uint evaluatePalette(Color32 color_array[4]) const;
-		uint evaluatePaletteNV5x(Color32 color_array[4]) const;
-
+		uint evaluatePaletteFast(Color32 color_array[4]) const;
 		void evaluatePalette3(Color32 color_array[4]) const;
 		void evaluatePalette4(Color32 color_array[4]) const;
 		
 		void decodeBlock(ColorBlock * block) const;
-		void decodeBlockNV5x(ColorBlock * block) const;
 		
 		void setIndices(int * idx);

@ -107,7 +105,6 @@ namespace nv
 		BlockDXT1 color;
 		
 		void decodeBlock(ColorBlock * block) const;
-		void decodeBlockNV5x(ColorBlock * block) const;
 		
 		void flip4();
 		void flip2();
@ -163,7 +160,6 @@ namespace nv
 		BlockDXT1 color;
 		
 		void decodeBlock(ColorBlock * block) const;
-		void decodeBlockNV5x(ColorBlock * block) const;
 		
 		void flip4();
 		void flip2();
--- a/src/nvimage/CMakeLists.txt
+++ b/src/nvimage/CMakeLists.txt
@ -1,71 +1,68 @@
 PROJECT(nvimage)

 SET(IMAGE_SRCS	
-    nvimage.h
-    FloatImage.h 
-    FloatImage.cpp
-    Filter.h
-    Filter.cpp
-    Image.h
-    Image.cpp
-    ImageIO.h
-    ImageIO.cpp
-    ColorBlock.h
-    ColorBlock.cpp
-    BlockDXT.h
-    BlockDXT.cpp
-    DirectDrawSurface.h
-    DirectDrawSurface.cpp
-    Quantize.h
-    Quantize.cpp
-    NormalMap.h
-    NormalMap.cpp
-    PsdFile.h
-    TgaFile.h
-    ColorSpace.h
-    ColorSpace.cpp)
+	nvimage.h
+	FloatImage.h
+	FloatImage.cpp
+	Filter.h
+	Filter.cpp
+	Image.h
+	Image.cpp
+	ImageIO.h
+	ImageIO.cpp
+	ColorBlock.h
+	ColorBlock.cpp
+	BlockDXT.h
+	BlockDXT.cpp
+	HoleFilling.h
+	HoleFilling.cpp
+	DirectDrawSurface.h
+	DirectDrawSurface.cpp
+	Quantize.h
+	Quantize.cpp
+	NormalMap.h
+	NormalMap.cpp
+	NormalMipmap.h
+	NormalMipmap.cpp
+	PsdFile.h
+	TgaFile.h)

 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

 IF(PNG_FOUND)
-    SET(LIBS ${LIBS} ${PNG_LIBRARIES})
-    INCLUDE_DIRECTORIES(${PNG_INCLUDE_DIR})
+	SET(LIBS ${LIBS} ${PNG_LIBRARIES})
+	INCLUDE_DIRECTORIES(${PNG_INCLUDE_DIR})
 ENDIF(PNG_FOUND)

 IF(JPEG_FOUND)
-    SET(LIBS ${LIBS} ${JPEG_LIBRARIES})
-    INCLUDE_DIRECTORIES(${JPEG_INCLUDE_DIR})
+	SET(LIBS ${LIBS} ${JPEG_LIBRARIES})
+	INCLUDE_DIRECTORIES(${JPEG_INCLUDE_DIR})
 ENDIF(JPEG_FOUND)

 IF(TIFF_FOUND)
-    SET(LIBS ${LIBS} ${TIFF_LIBRARIES})
-    INCLUDE_DIRECTORIES(${TIFF_INCLUDE_DIR})
+	SET(LIBS ${LIBS} ${TIFF_LIBRARIES})
+	INCLUDE_DIRECTORIES(${TIFF_INCLUDE_DIR})
 ENDIF(TIFF_FOUND)

 IF(OPENEXR_FOUND)
-    SET(LIBS ${LIBS} ${OPENEXR_LIBRARIES})
-    INCLUDE_DIRECTORIES(${OPENEXR_INCLUDE_PATHS})
+	SET(LIBS ${LIBS} ${OPENEXR_LIBRARIES})
+	INCLUDE_DIRECTORIES(${OPENEXR_INCLUDE_PATHS})
 ENDIF(OPENEXR_FOUND)

-IF(FREEIMAGE_FOUND)
-    SET(LIBS ${LIBS} ${FREEIMAGE_LIBRARIES})
-    INCLUDE_DIRECTORIES(${FREEIMAGE_INCLUDE_PATH})
-ENDIF(FREEIMAGE_FOUND)
-
 # targets
 ADD_DEFINITIONS(-DNVIMAGE_EXPORTS)

-IF(NVIMAGE_SHARED)
-    ADD_DEFINITIONS(-DNVIMAGE_SHARED=1)
-    ADD_LIBRARY(nvimage SHARED ${IMAGE_SRCS})
+IF(NVIMAGE_SHARED)	
+	ADD_DEFINITIONS(-DNVIMAGE_SHARED=1)
+	ADD_LIBRARY(nvimage SHARED ${IMAGE_SRCS})
 ELSE(NVIMAGE_SHARED)
-    ADD_LIBRARY(nvimage ${IMAGE_SRCS})
+	ADD_LIBRARY(nvimage ${IMAGE_SRCS})
 ENDIF(NVIMAGE_SHARED)

 TARGET_LINK_LIBRARIES(nvimage ${LIBS} nvcore nvmath posh)

 INSTALL(TARGETS nvimage
-    RUNTIME DESTINATION bin
-    LIBRARY DESTINATION lib
-    ARCHIVE DESTINATION lib/static)
+	RUNTIME DESTINATION bin
+	LIBRARY DESTINATION lib
+	ARCHIVE DESTINATION lib/static)

--- a/src/nvimage/ColorBlock.cpp
+++ b/src/nvimage/ColorBlock.cpp
@ -1,6 +1,5 @@
 // This code is in the public domain -- castanyo@yahoo.es

-#include <nvcore/Containers.h> // swap
 #include <nvmath/Box.h>
 #include <nvimage/ColorBlock.h>
 #include <nvimage/Image.h>
@ -58,9 +57,11 @@ void ColorBlock::init(const Image * img, uint x, uint y)
 	
 	const uint bw = min(img->width() - x, 4U);
 	const uint bh = min(img->height() - y, 4U);
-	nvDebugCheck(bw != 0 && bh != 0);

-	static const int remainder[] = {
+	nvDebugCheck(bw != 0);
+	nvDebugCheck(bh != 0);
+
+	static int remainder[] = {
 		0, 0, 0, 0,
 		0, 1, 0, 1,
 		0, 1, 2, 0,
@ -81,129 +82,51 @@ void ColorBlock::init(const Image * img, uint x, uint y)
 	}
 }

-void ColorBlock::init(uint w, uint h, uint * data, uint x, uint y)
+
+void ColorBlock::swizzleDXT5n()
 {
-	nvDebugCheck(data != NULL);
-
-	const uint bw = min(w - x, 4U);
-	const uint bh = min(h - y, 4U);
-	nvDebugCheck(bw != 0 && bh != 0);
-
-	// Blocks that are smaller than 4x4 are handled by repeating the pixels.
-	// @@ Thats only correct when block size is 1, 2 or 4, but not with 3. :(
-
-	for (uint i = 0; i < 4; i++)
-	{
-		const int by = i % bh;
-		
-		for (uint e = 0; e < 4; e++)
-		{
-			const int bx = e % bw;
-			const uint idx = (y + by) * w + x + bx;
-
-			color(e, i).u = data[idx];
-		}
-	}
-}
-
-void ColorBlock::init(uint w, uint h, float * data, uint x, uint y)
-{
-	nvDebugCheck(data != NULL);
-
-	const uint bw = min(w - x, 4U);
-	const uint bh = min(h - y, 4U);
-	nvDebugCheck(bw != 0 && bh != 0);
-
-	// Blocks that are smaller than 4x4 are handled by repeating the pixels.
-	// @@ Thats only correct when block size is 1, 2 or 4, but not with 3. :(
-
-	for (uint i = 0; i < 4; i++)
-	{
-		const uint by = i % bh;
-		
-		for (uint e = 0; e < 4; e++)
-		{
-			const uint bx = e % bw;
-			const uint idx = ((y + by) * w + x + bx) * 4;
-			
-			Color32 & c = color(e, i);
-			c.r = uint8(255 * clamp(data[idx + 0], 0.0f, 1.0f));
-			c.g = uint8(255 * clamp(data[idx + 1], 0.0f, 1.0f));
-			c.b = uint8(255 * clamp(data[idx + 2], 0.0f, 1.0f));
-			c.a = uint8(255 * clamp(data[idx + 3], 0.0f, 1.0f));
-		}
-	}
-}
-
-static inline uint8 component(Color32 c, uint i)
-{
-	if (i == 0) return c.r;
-	if (i == 1) return c.g;
-	if (i == 2) return c.b;
-	if (i == 3) return c.a;
-	if (i == 4) return 0xFF;
-	return 0;
-}
-
-void ColorBlock::swizzle(uint x, uint y, uint z, uint w)
-{
-	for (int i = 0; i < 16; i++)
+	for(int i = 0; i < 16; i++)
 	{
 		Color32 c = m_color[i];
-		m_color[i].r = component(c, x);
-		m_color[i].g = component(c, y);
-		m_color[i].b = component(c, z);
-		m_color[i].a = component(c, w);
+		m_color[i] = Color32(0xFF, c.g, 0, c.r);
 	}
 }

+/// Replace each of the 16 colors by one whose four components all equal
+/// the color's original r component.
+void ColorBlock::splatX()
+{
+	for (int idx = 0; idx < 16; ++idx)
+	{
+		Color32 & pixel = m_color[idx];
+		const uint8 red = pixel.r;
+		pixel = Color32(red, red, red, red);
+	}
+}
+
+/// Replace each of the 16 colors by one whose four components all equal
+/// the color's original g component.
+void ColorBlock::splatY()
+{
+	for (int idx = 0; idx < 16; ++idx)
+	{
+		Color32 & pixel = m_color[idx];
+		const uint8 green = pixel.g;
+		pixel = Color32(green, green, green, green);
+	}
+}

 /// Returns true if the block has a single color.
 bool ColorBlock::isSingleColor() const
 {
-	Color32 mask(0xFF, 0xFF, 0xFF, 0x00);
-	uint u = m_color[0].u & mask.u;
-	
-	for (int i = 1; i < 16; i++)
-	{
-		if (u != (m_color[i].u & mask.u))
-		{
-			return false;
-		}
-	}
-	
-	return true;
+	Color32 mask(0xFF, 0xFF, 0xFF, 0x00);
+	uint u = m_color[0].u & mask.u;
+        
+	for (int i = 1; i < 16; i++)
+	{
+		if (u != (m_color[i].u & mask.u))
+		{
+			return false;
+		}
+	}
+        
+	return true;
 }

-/*
-/// Returns true if the block has a single color, ignoring transparent pixels.
-bool ColorBlock::isSingleColorNoAlpha() const
-{
-	Color32 c;
-	int i;
-	for(i = 0; i < 16; i++)
-	{
-		if (m_color[i].a != 0) {
-			c = m_color[i];
-			break;
-		}
-	}
-
-	Color32 mask(0xFF, 0xFF, 0xFF, 0x00);
-	uint u = c.u & mask.u;
-
-	for(; i < 16; i++)
-	{
-		if (u != (m_color[i].u & mask.u))
-		{
-			return false;
-		}
-	}
-	
-	return true;
-}
-*/
-
 /// Count number of unique colors in this color block.
 uint ColorBlock::countUniqueColors() const
 {
--- a/src/nvimage/ColorBlock.h
+++ b/src/nvimage/ColorBlock.h
@ -18,13 +18,12 @@ namespace nv
 		ColorBlock(const Image * img, uint x, uint y);
 		
 		void init(const Image * img, uint x, uint y);
-		void init(uint w, uint h, uint * data, uint x, uint y);
-		void init(uint w, uint h, float * data, uint x, uint y);
 		
-		void swizzle(uint x, uint y, uint z, uint w); // 0=r, 1=g, 2=b, 3=a, 4=0xFF, 5=0
+		void swizzleDXT5n();
+		void splatX();
+		void splatY();
 		
 		bool isSingleColor() const;
-		//bool isSingleColorNoAlpha() const;
 		uint countUniqueColors() const;
 		Color32 averageColor() const;
 		bool hasAlpha() const;
--- a/src/nvimage/ColorSpace.cpp
+++ b/src/nvimage/ColorSpace.cpp
@ -1,70 +0,0 @@
-// This code is in the public domain -- jim@tilander.org
-
-#include <nvcore/nvcore.h>
-
-#include <nvmath/Color.h>
-#include <nvimage/Image.h>
-
-#include "ColorSpace.h"
-
-namespace nv
-{
-	void ColorSpace::RGBtoYCoCg_R(Image* img)
-	{
-		const uint w = img->width();
-		const uint h = img->height();
-		
-		for( uint y=0; y < h; y++ )
-		{
-			for( uint x=0; x < w; x++ )
-			{
-				Color32 pixel = img->pixel(x, y);
-				
-				const int r = pixel.r;
-				const int g = pixel.g;
-				const int b = pixel.b;
-				
-				const int Co = r - b;
-				const int t  = b + Co/2;
-				const int Cg = g - t;
-				const int Y  = t + Cg/2;
-				
-				// Just saturate the chroma here (we loose out of one bit in each channel)
-				// this just means that we won't have as high dynamic range. Perhaps a better option
-				// is to loose the least significant bit instead?
-				pixel.r = clamp(Co + 128, 0, 255);
-				pixel.g = clamp(Cg + 128, 0, 255);
-				pixel.b = 0;
-				pixel.a = Y;
-			}
-		}
-	}
-	
-	void ColorSpace::YCoCg_RtoRGB(Image* img)
-	{
-		const uint w = img->width();
-		const uint h = img->height();
-		
-		for( uint y=0; y < h; y++ )
-		{
-			for( uint x=0; x < w; x++ )
-			{
-				Color32 pixel = img->pixel(x, y);
-				
-				const int Co = (int)pixel.r - 128;
-				const int Cg = (int)pixel.g - 128;
-				const int Y  =      pixel.a;
-				
-				const int t = Y - Cg/2;
-				const int g = Cg + t;
-				const int b = t - Co/2;
-				const int r = b + Co;
-				
-				pixel.r = r;
-				pixel.g = g;
-				pixel.b = b;
-				pixel.a = 1;
-			}
-		}
-	}
-}
--- a/src/nvimage/ColorSpace.h
+++ b/src/nvimage/ColorSpace.h
@ -1,21 +0,0 @@
-// This code is in the public domain -- jim@tilander.org
-
-#ifndef NV_IMAGE_COLORSPACE_H
-#define NV_IMAGE_COLORSPACE_H
-
-namespace nv 
-{
-	class Image;
-	
-	// Defines simple mappings between different color spaces and encodes them in the 
-	// input image.
-	namespace ColorSpace
-	{
-		void RGBtoYCoCg_R(Image* img);
-		void YCoCg_RtoRGB(Image* img);
-	}
-}
-
-
-
-#endif
--- a/src/nvimage/ConeMap.cpp
+++ b/src/nvimage/ConeMap.cpp
@ -0,0 +1,122 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Ptr.h>
+
+#include <nvmath/Color.h>
+
+#include <nvimage/NormalMap.h>
+#include <nvimage/Filter.h>
+#include <nvimage/FloatImage.h>
+#include <nvimage/Image.h>
+
+using namespace nv;
+
+
+/// Scan channel 0 of @a img for the texel that is lower than the one at
+/// (x, y) and minimizes squared distance / height difference.
+/// Returns sqrt(cw / ch) clamped to 1, where cw is the distance to that
+/// texel in normalized image coordinates and ch the height difference;
+/// returns 1 when no lower texel exists.
+static float processPixel(const FloatImage * img, uint x, uint y)
+{
+	nvDebugCheck(img != NULL);
+	
+	const uint w = img->width();
+	const uint h = img->height();
+	
+	const float d = img->pixel(x, y, 0);
+	
+	// NOTE(review): INF is not defined in this file; presumably it comes
+	// from one of the included nvmath headers -- confirm.
+	float best_ratio = INF;
+	// (w, h) lies outside the image and marks "no candidate found".
+	uint best_x = w;
+	uint best_y = h;
+	
+	for (uint yy = 0; yy < h; yy++)
+	{
+		for (uint xx = 0; xx < w; xx++)
+		{
+			float ch = d - img->pixel(xx, yy, 0);
+			
+			if (ch > 0)
+			{
+				// Convert before subtracting: the coordinates are unsigned,
+				// so xx - x would wrap around whenever xx < x.
+				float dx = float(xx) - float(x);
+				float dy = float(yy) - float(y);
+				
+				float ratio = (dx * dx + dy * dy) / ch;
+				
+				if (ratio < best_ratio)
+				{
+					// Keep the ratio too, otherwise every later candidate
+					// would replace the current best one.
+					best_ratio = ratio;
+					best_x = xx;
+					best_y = yy;
+				}
+			}
+		}
+	}
+
+	if (best_x != w)
+	{
+		nvDebugCheck(best_y != h);
+		
+		float dx = (float(best_x) - float(x)) / w;
+		float dy = (float(best_y) - float(y)) / h;
+		
+		float cw = sqrtf(dx*dx + dy*dy);
+		// Height difference to the selected texel; the loop variables
+		// xx/yy are out of scope here.
+		float ch = d - img->pixel(best_x, best_y, 0);
+		
+		return min(1.0f, sqrtf(cw / ch));
+	}
+	
+	return 1;
+}
+
+
+// Create a cone map from the given image.
+// Allocates a 4-channel FloatImage of the same size, stores in channel 0 the
+// dot product of each pixel's color with heightWeights, and then runs
+// processPixel() on every pixel. The caller receives the released pointer.
+FloatImage * createConeMap(const Image * img, Vector4::Arg heightWeights)
+{
+	nvCheck(img != NULL);
+	
+	const uint w = img->width();
+	const uint h = img->height();
+	
+	AutoPtr<FloatImage> fimage(new FloatImage());
+	//fimage->allocate(2, w, h);
+	fimage->allocate(4, w, h);
+	
+	// Compute height and store in red channel:
+	float * heightChannel = fimage->channel(0);
+	for(uint i = 0; i < w*h; i++)
+	{
+		Vector4 color = toVector4(img->pixel(i));
+		heightChannel[i] = dot(color, heightWeights);
+	}
+	
+	// Compute cones:
+	// NOTE(review): the value returned by processPixel() is discarded, so
+	// no channel receives the cone data; the intended destination channel
+	// is not visible here -- confirm and store the result.
+	for(uint y = 0; y < h; y++)
+	{
+		for(uint x = 0; x < w; x++)
+		{
+			processPixel(fimage, x, y);
+		}
+	}
+	
+	return fimage.release();
+}
+
--- a/src/nvtt/CompressorRGB.h
+++ b/src/nvtt/CompressorRGB.h
@ -1,40 +1,39 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#ifndef NV_TT_COMPRESSORRGB_H
-#define NV_TT_COMPRESSORRGB_H
-
-#include "nvtt.h"
-#include "Compressor.h"
-
-namespace nv
-{
-    struct PixelFormatConverter : public CompressorInterface
-	{
-		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, const void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-	};
-
-} // nv namespace
-
-
-#endif // NV_TT_COMPRESSORRGB_H
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_IMAGE_CONEMAP_H
+#define NV_IMAGE_CONEMAP_H
+
+#include <nvmath/Vector.h>
+#include <nvimage/nvimage.h>
+
+namespace nv
+{
+	class Image;
+	class FloatImage;
+
+	// Build a cone map from img. heightWeights holds the per-channel weights
+	// that are dotted with each pixel color to obtain its height.
+	// The result is a newly allocated FloatImage owned by the caller.
+	FloatImage * createConeMap(const Image * img, Vector4::Arg heightWeights);
+
+} // nv namespace
+
+#endif // NV_IMAGE_CONEMAP_H
--- a/src/nvimage/DirectDrawSurface.cpp
+++ b/src/nvimage/DirectDrawSurface.cpp
@ -21,16 +21,16 @@
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 // OTHER DEALINGS IN THE SOFTWARE.

+#include <nvcore/Debug.h>
+#include <nvcore/Containers.h> // max
+#include <nvcore/StdStream.h>
+
 #include <nvimage/DirectDrawSurface.h>
 #include <nvimage/ColorBlock.h>
 #include <nvimage/Image.h>
 #include <nvimage/BlockDXT.h>
 #include <nvimage/PixelFormat.h>

-#include <nvcore/Debug.h>
-#include <nvcore/Containers.h> // max
-#include <nvcore/StdStream.h>
-
 #include <string.h> // memset


@ -406,14 +406,10 @@ namespace nv
 		s << pf.flags;
 		s << pf.fourcc;
 		s << pf.bitcount;
-		s.serialize(&pf.rmask, sizeof(pf.rmask));
-		s.serialize(&pf.gmask, sizeof(pf.gmask));
-		s.serialize(&pf.bmask, sizeof(pf.bmask));
-		s.serialize(&pf.amask, sizeof(pf.amask));
-	//	s << pf.rmask;
-	//	s << pf.gmask;
-	//	s << pf.bmask;
-	//	s << pf.amask;
+		s << pf.rmask;
+		s << pf.gmask;
+		s << pf.bmask;
+		s << pf.amask;
 		return s;
 	}

@ -449,9 +445,7 @@ namespace nv
 		s << header.pitch;
 		s << header.depth;
 		s << header.mipmapcount;
-		for (int i = 0; i < 11; i++) {
-			s << header.reserved[i];
-		}
+		s.serialize(header.reserved, 11 * sizeof(uint));
 		s << header.pf;
 		s << header.caps;
 		s << header.notused;
@ -538,7 +532,7 @@ DDSHeader::DDSHeader()

 	// Store version information on the reserved header attributes.
 	this->reserved[9] = MAKEFOURCC('N', 'V', 'T', 'T');
-	this->reserved[10] = (2 << 16) | (1 << 8) | (0);	// major.minor.revision
+	this->reserved[10] = (2 << 16) | (0 << 8) | (8);	// major.minor.revision

 	this->pf.size = 32;
 	this->pf.flags = 0;
@ -576,7 +570,7 @@ void DDSHeader::setHeight(uint h)
 void DDSHeader::setDepth(uint d)
 {
 	this->flags |= DDSD_DEPTH;
-	this->depth = d;
+	this->depth = d;	// Must write depth, not height: assigning height here would clobber the value set by setHeight().
 }

 void DDSHeader::setMipmapCount(uint count)
@ -605,7 +599,6 @@ void DDSHeader::setMipmapCount(uint count)
 void DDSHeader::setTexture2D()
 {
 	this->header10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D;
-	this->header10.arraySize = 1;
 }

 void DDSHeader::setTexture3D()
@ -613,7 +606,6 @@ void DDSHeader::setTexture3D()
 	this->caps.caps2 = DDSCAPS2_VOLUME;
 	
 	this->header10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE3D;
-	this->header10.arraySize = 1;
 }

 void DDSHeader::setTextureCube()
@ -644,33 +636,22 @@ void DDSHeader::setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3)
 	// set fourcc pixel format.
 	this->pf.flags = DDPF_FOURCC;
 	this->pf.fourcc = MAKEFOURCC(c0, c1, c2, c3);
-
-	this->pf.bitcount = 0;
-	this->pf.rmask = 0;
-	this->pf.gmask = 0;
-	this->pf.bmask = 0;
-	this->pf.amask = 0;
-}
-
-void DDSHeader::setFormatCode(uint32 code)
-{
-	// set fourcc pixel format.
-	this->pf.flags = DDPF_FOURCC;
-	this->pf.fourcc = code;
 	
-	this->pf.bitcount = 0;
+	if (this->pf.fourcc == FOURCC_ATI2)
+	{
+		this->pf.bitcount = FOURCC_A2XY;
+	}
+	else
+	{
+		this->pf.bitcount = 0;
+	}
+	
 	this->pf.rmask = 0;
 	this->pf.gmask = 0;
 	this->pf.bmask = 0;
 	this->pf.amask = 0;
 }

-void DDSHeader::setSwizzleCode(uint8 c0, uint8 c1, uint8 c2, uint8 c3)
-{
-	this->pf.bitcount = MAKEFOURCC(c0, c1, c2, c3);
-}
-
-
 void DDSHeader::setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask)
 {
 	// Make sure the masks are correct.
@ -681,17 +662,10 @@ void DDSHeader::setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask
 	nvCheck((gmask & amask) == 0);
 	nvCheck((bmask & amask) == 0);

-	if (rmask != 0 || gmask != 0 || bmask != 0)
-	{
-		this->pf.flags = DDPF_RGB;
-		
-		if (amask != 0) {
-			this->pf.flags |= DDPF_ALPHAPIXELS;
-		}
-	}
-	else if (amask != 0)
-	{
-		this->pf.flags |= DDPF_ALPHA;
+	this->pf.flags = DDPF_RGB;
+
+	if (amask != 0) {
+		this->pf.flags |= DDPF_ALPHAPIXELS;
 	}

 	if (bitcount == 0)
@ -733,12 +707,6 @@ void DDSHeader::setNormalFlag(bool b)
 	else this->pf.flags &= ~DDPF_NORMAL;
 }

-void DDSHeader::setHasAlphaFlag(bool b)
-{
-	if (b) this->pf.flags |= DDPF_ALPHAPIXELS;
-	else this->pf.flags &= ~DDPF_ALPHAPIXELS;
-}
-
 void DDSHeader::swapBytes()
 {
 	this->fourcc = POSH_LittleU32(this->fourcc);
@ -791,15 +759,6 @@ DirectDrawSurface::DirectDrawSurface(const char * name) : stream(new StdInputStr
 	}
 }

-DirectDrawSurface::DirectDrawSurface(Stream * s) : stream(s)
-{
-	if (!stream->isError())
-	{
-		(*stream) << header;
-	}
-}
-
-
 DirectDrawSurface::~DirectDrawSurface()
 {
 	delete stream;
@ -839,16 +798,6 @@ bool DirectDrawSurface::isSupported() const
 	
 	if (header.hasDX10Header())
 	{
-		if (header.header10.dxgiFormat == DXGI_FORMAT_BC1_UNORM ||
-			header.header10.dxgiFormat == DXGI_FORMAT_BC2_UNORM ||
-			header.header10.dxgiFormat == DXGI_FORMAT_BC3_UNORM ||
-			header.header10.dxgiFormat == DXGI_FORMAT_BC4_UNORM ||
-			header.header10.dxgiFormat == DXGI_FORMAT_BC5_UNORM)
-		{
-			return true;
-		}
-
-		return false;
 	}
 	else
 	{
@ -892,41 +841,6 @@ bool DirectDrawSurface::isSupported() const
 	return true;
 }

-bool DirectDrawSurface::hasAlpha() const
-{
-	if (header.hasDX10Header())
-	{
-#pragma message(NV_FILE_LINE "TODO: Update hasAlpha to handle all DX10 formats.")
-		return 
-			header.header10.dxgiFormat == DXGI_FORMAT_BC1_UNORM ||
-			header.header10.dxgiFormat == DXGI_FORMAT_BC2_UNORM ||
-			header.header10.dxgiFormat == DXGI_FORMAT_BC3_UNORM;
-	}
-	else
-	{
-		if (header.pf.flags & DDPF_RGB) 
-		{
-			return header.pf.amask != 0;
-		}
-		else if (header.pf.flags & DDPF_FOURCC)
-		{
-			if (header.pf.fourcc == FOURCC_RXGB ||
-				header.pf.fourcc == FOURCC_ATI1 ||
-				header.pf.fourcc == FOURCC_ATI2 ||
-				header.pf.flags & DDPF_NORMAL)
-			{
-				return false;
-			}
-			else
-			{
-				// @@ Here we could check the ALPHA_PIXELS flag, but nobody sets it.
-				return true;
-			}
-		}
-
-		return false;
-	}
-}

 uint DirectDrawSurface::mipmapCount() const
 {
@ -1005,13 +919,6 @@ void DirectDrawSurface::setNormalFlag(bool b)
 	header.setNormalFlag(b);
 }

-void DirectDrawSurface::setHasAlphaFlag(bool b)
-{
-	nvDebugCheck(isValid());
-	header.setHasAlphaFlag(b);
-}
-
-
 void DirectDrawSurface::mipmap(Image * img, uint face, uint mipmap)
 {
 	nvDebugCheck(isValid());
@ -1029,32 +936,15 @@ void DirectDrawSurface::mipmap(Image * img, uint face, uint mipmap)
 	}
 	
 	img->allocate(w, h);
-
-	if (hasAlpha())
+	
+	if (header.pf.flags & DDPF_RGB) 
 	{
-		img->setFormat(Image::Format_ARGB);
+		readLinearImage(img);
 	}
-	else
+	else if (header.pf.flags & DDPF_FOURCC)
 	{
-		img->setFormat(Image::Format_RGB);
-	}
-
-	if (header.hasDX10Header())
-	{
-		// So far only block formats supported.
 		readBlockImage(img);
 	}
-	else
-	{
-		if (header.pf.flags & DDPF_RGB) 
-		{
-			readLinearImage(img);
-		}
-		else if (header.pf.flags & DDPF_FOURCC)
-		{
-			readBlockImage(img);
-		}
-	}
 }

 void DirectDrawSurface::readLinearImage(Image * img)
@ -1079,7 +969,16 @@ void DirectDrawSurface::readLinearImage(Image * img)

 	uint byteCount = (header.pf.bitcount + 7) / 8;

-#pragma message(NV_FILE_LINE "TODO: Support floating point linear images and other FOURCC codes.")
+	// set image format: RGB or ARGB
+	// alpha channel exists if and only if the alpha mask is non-zero
+	if (header.pf.amask == 0)
+ 	{
+		img->setFormat(Image::Format_RGB);
+	}
+	else
+	{
+		img->setFormat(Image::Format_ARGB);
+	}

 	// Read linear RGB images.
 	for (uint y = 0; y < h; y++)
@ -1105,6 +1004,19 @@ void DirectDrawSurface::readBlockImage(Image * img)
 	nvDebugCheck(stream != NULL);
 	nvDebugCheck(img != NULL);

+	// set image format: RGB or ARGB
+	if (header.pf.fourcc == FOURCC_RXGB ||
+		header.pf.fourcc == FOURCC_ATI1 ||
+		header.pf.fourcc == FOURCC_ATI2 ||
+		header.pf.flags & DDPF_NORMAL)
+	{
+		img->setFormat(Image::Format_RGB);
+	}
+	else
+	{
+		img->setFormat(Image::Format_ARGB);
+	}
+
 	const uint w = img->width();
 	const uint h = img->height();
 	
@ -1149,33 +1061,20 @@ void DirectDrawSurface::readBlock(ColorBlock * rgba)
 	nvDebugCheck(stream != NULL);
 	nvDebugCheck(rgba != NULL);
 	
-	uint fourcc = header.pf.fourcc;
-
-	// Map DX10 block formats to fourcc codes.
-	if (header.hasDX10Header())
-	{
-		if (header.header10.dxgiFormat == DXGI_FORMAT_BC1_UNORM) fourcc = FOURCC_DXT1;
-		if (header.header10.dxgiFormat == DXGI_FORMAT_BC2_UNORM) fourcc = FOURCC_DXT3;
-		if (header.header10.dxgiFormat == DXGI_FORMAT_BC3_UNORM) fourcc = FOURCC_DXT5;
-		if (header.header10.dxgiFormat == DXGI_FORMAT_BC4_UNORM) fourcc = FOURCC_ATI1;
-		if (header.header10.dxgiFormat == DXGI_FORMAT_BC5_UNORM) fourcc = FOURCC_ATI2;
-	}
-
-
-	if (fourcc == FOURCC_DXT1)
+	if (header.pf.fourcc == FOURCC_DXT1)
 	{
 		BlockDXT1 block;
 		*stream << block;
 		block.decodeBlock(rgba);
 	}
-	else if (fourcc == FOURCC_DXT2 ||
+	else if (header.pf.fourcc == FOURCC_DXT2 ||
 	    header.pf.fourcc == FOURCC_DXT3)
 	{
 		BlockDXT3 block;
 		*stream << block;
 		block.decodeBlock(rgba);
 	}
-	else if (fourcc == FOURCC_DXT4 ||
+	else if (header.pf.fourcc == FOURCC_DXT4 ||
 	    header.pf.fourcc == FOURCC_DXT5 ||
 	    header.pf.fourcc == FOURCC_RXGB)
 	{
@ -1183,7 +1082,7 @@ void DirectDrawSurface::readBlock(ColorBlock * rgba)
 		*stream << block;
 		block.decodeBlock(rgba);
 		
-		if (fourcc == FOURCC_RXGB)
+		if (header.pf.fourcc == FOURCC_RXGB)
 		{
 			// Swap R & A.
 			for (int i = 0; i < 16; i++)
@ -1195,13 +1094,13 @@ void DirectDrawSurface::readBlock(ColorBlock * rgba)
 			}
 		}
 	}
-	else if (fourcc == FOURCC_ATI1)
+	else if (header.pf.fourcc == FOURCC_ATI1)
 	{
 		BlockATI1 block;
 		*stream << block;
 		block.decodeBlock(rgba);
 	}
-	else if (fourcc == FOURCC_ATI2)
+	else if (header.pf.fourcc == FOURCC_ATI2)
 	{
 		BlockATI2 block;
 		*stream << block;
@ -1211,7 +1110,7 @@ void DirectDrawSurface::readBlock(ColorBlock * rgba)
 	// If normal flag set, convert to normal.
 	if (header.pf.flags & DDPF_NORMAL)
 	{
-		if (fourcc == FOURCC_ATI2)
+		if (header.pf.fourcc == FOURCC_ATI2)
 		{
 			for (int i = 0; i < 16; i++)
 			{
@ -1219,7 +1118,7 @@ void DirectDrawSurface::readBlock(ColorBlock * rgba)
 				c = buildNormal(c.r, c.g);
 			}
 		}
-		else if (fourcc == FOURCC_DXT5)
+		else if (header.pf.fourcc == FOURCC_DXT5)
 		{
 			for (int i = 0; i < 16; i++)
 			{
@ -1245,27 +1144,6 @@ uint DirectDrawSurface::blockSize() const
 		case FOURCC_RXGB:
 		case FOURCC_ATI2:
 			return 16;
-		case FOURCC_DX10:
-			switch(header.header10.dxgiFormat)
-			{
-				case DXGI_FORMAT_BC1_TYPELESS:
-				case DXGI_FORMAT_BC1_UNORM:
-				case DXGI_FORMAT_BC1_UNORM_SRGB:
-				case DXGI_FORMAT_BC4_TYPELESS:
-				case DXGI_FORMAT_BC4_UNORM:
-				case DXGI_FORMAT_BC4_SNORM:
-					return 8;
-				case DXGI_FORMAT_BC2_TYPELESS:
-				case DXGI_FORMAT_BC2_UNORM:
-				case DXGI_FORMAT_BC2_UNORM_SRGB:
-				case DXGI_FORMAT_BC3_TYPELESS:
-				case DXGI_FORMAT_BC3_UNORM:
-				case DXGI_FORMAT_BC3_UNORM_SRGB:
-				case DXGI_FORMAT_BC5_TYPELESS:
-				case DXGI_FORMAT_BC5_UNORM:
-				case DXGI_FORMAT_BC5_SNORM:
-					return 16;
-			};
 	};

 	// Not a block image.
--- a/src/nvimage/DirectDrawSurface.h
+++ b/src/nvimage/DirectDrawSurface.h
@ -93,12 +93,9 @@ namespace nv
 		void setLinearSize(uint size);
 		void setPitch(uint pitch);
 		void setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3);
-		void setFormatCode(uint code);
-		void setSwizzleCode(uint8 c0, uint8 c1, uint8 c2, uint8 c3);
 		void setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask);
 		void setDX10Format(uint format);
 		void setNormalFlag(bool b);
-		void setHasAlphaFlag(bool b);
 		
 		void swapBytes();
 		
@ -113,13 +110,10 @@ namespace nv
 	{
 	public:
 		DirectDrawSurface(const char * file);
-		DirectDrawSurface(Stream * stream);
 		~DirectDrawSurface();
 		
 		bool isValid() const;
 		bool isSupported() const;
-
-		bool hasAlpha() const;
 		
 		uint mipmapCount() const;
 		uint width() const;
@ -131,7 +125,6 @@ namespace nv
 		bool isTextureCube() const;

 		void setNormalFlag(bool b);
-		void setHasAlphaFlag(bool b);
 		
 		void mipmap(Image * img, uint f, uint m);
 		//	void mipmap(FloatImage * img, uint f, uint m);
--- a/src/nvimage/FloatImage.cpp
+++ b/src/nvimage/FloatImage.cpp
@ -17,6 +17,21 @@ using namespace nv;

 namespace 
 {
+	// Convert f to int by truncating toward zero.
+	// Note that, despite the name, no rounding to nearest is performed.
+	static int iround(float f)
+	{
+		const int truncated = static_cast<int>(f);
+		return truncated;
+	}
+
+	// Largest integer that is not greater than f.
+	static int ifloor(float f)
+	{
+		return static_cast<int>(floor(f));
+	}
+
+	// Fractional part of f, measured from floor(f).
+	static float frac(float f)
+	{
+		return static_cast<float>(f - floor(f));
+	}
+
 	static int mirror(int x, int w)
 	{
 		x = abs(x);
@ -172,12 +187,12 @@ void FloatImage::normalize(uint base_component)

 void FloatImage::packNormals(uint base_component)
 {
-	scaleBias(base_component, 3, 0.5f, 0.5f);
+	scaleBias(base_component, 3, 0.5f, 1.0f);
 }

 void FloatImage::expandNormals(uint base_component)
 {
-	scaleBias(base_component, 3, 2.0f, -1.0f);
+	scaleBias(base_component, 3, 2, -0.5);
 }

 void FloatImage::scaleBias(uint base_component, uint num, float scale, float bias)
@ -188,7 +203,7 @@ void FloatImage::scaleBias(uint base_component, uint num, float scale, float bia
 		float * ptr = this->channel(base_component + c);
 		
 		for(uint i = 0; i < size; i++) {
-			ptr[i] = scale * ptr[i] + bias;
+			ptr[i] = scale * (ptr[i] + bias);
 		}
 	}
 }
@ -227,57 +242,6 @@ void FloatImage::exponentiate(uint base_component, uint num, float power)
 	}
 }

-/// Apply linear transform.
-void FloatImage::transform(uint base_component, const Matrix & m, Vector4::Arg offset)
-{
-	nvCheck(base_component + 4 <= m_componentNum);
-
-	const uint size = m_width * m_height;
-
-	float * r = this->channel(base_component + 0);
-	float * g = this->channel(base_component + 1);
-	float * b = this->channel(base_component + 2);
-	float * a = this->channel(base_component + 3);
-
-	for (uint i = 0; i < size; i++)
-	{
-		Vector4 color = nv::transform(m, Vector4(*r, *g, *b, *a)) + offset;
-
-		*r++ = color.x();
-		*g++ = color.y();
-		*b++ = color.z();
-		*a++ = color.w();
-	}
-}
-
-void FloatImage::swizzle(uint base_component, uint r, uint g, uint b, uint a)
-{
-	nvCheck(base_component + 4 <= m_componentNum);
-	nvCheck(r < 7 && g < 7 && b < 7 && a < 7);
-
-	const uint size = m_width * m_height;
-
-	float consts[] = { 1.0f, 0.0f, -1.0f };
-	float * c[7];
-	c[0] = this->channel(base_component + 0);
-	c[1] = this->channel(base_component + 1);
-	c[2] = this->channel(base_component + 2);
-	c[3] = this->channel(base_component + 3);
-	c[4] = consts;
-	c[5] = consts + 1;
-	c[6] = consts + 2;
-
-	for (uint i = 0; i < size; i++)
-	{
-		float tmp[4] = { *c[r], *c[g], *c[b], *c[a] };
-
-		*c[0]++ = tmp[0];
-		*c[1]++ = tmp[1];
-		*c[2]++ = tmp[2];
-		*c[3]++ = tmp[3];
-	}
-}
-
 float FloatImage::sampleNearest(const float x, const float y, const int c, const WrapMode wm) const
 {
 	if( wm == WrapMode_Clamp ) return sampleNearestClamp(x, y, c);
@ -628,7 +592,7 @@ FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, WrapMode
 			float * dst_channel = dst_image->channel(c);
 			
 			for (uint x = 0; x < w; x++) {
-				tmp_image->applyKernelVertical(ykernel, x, c, wm, tmp_column.mutableBuffer());
+				tmp_image->applyKernelVertical(ykernel, x, c, wm, tmp_column.unsecureBuffer());
 				
 				for (uint y = 0; y < h; y++) {
 					dst_channel[y * w + x] = tmp_column[y];
@ -649,7 +613,7 @@ FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, WrapMode
 			float * tmp_channel = tmp_image->channel(c);

 			for (uint x = 0; x < w; x++) {
-				tmp_image->applyKernelVertical(ykernel, x, c, wm, tmp_column.mutableBuffer());
+				tmp_image->applyKernelVertical(ykernel, x, c, wm, tmp_column.unsecureBuffer());
 				
 				for (uint y = 0; y < h; y++) {
 					tmp_channel[y * w + x] = tmp_column[y];
@ -701,7 +665,7 @@ FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, WrapMode
 			float * dst_channel = dst_image->channel(c);
 			
 			for (uint x = 0; x < w; x++) {
-				tmp_image->applyKernelVertical(ykernel, x, c, alpha, wm, tmp_column.mutableBuffer());
+				tmp_image->applyKernelVertical(ykernel, x, c, alpha, wm, tmp_column.unsecureBuffer());
 				
 				for (uint y = 0; y < h; y++) {
 					dst_channel[y * w + x] = tmp_column[y];
@ -926,25 +890,6 @@ void FloatImage::applyKernelHorizontal(const PolyphaseKernel & k, int y, uint c,
 	}
 }

-
-// Vertical flip in place.
-void FloatImage::flip()
-{
-    const uint w = m_width;
-    const uint h = m_height;
-    const uint h2 = h / 2;
-
-    for (uint c = 0; c < m_componentNum; c++) {
-        for (uint y = 0; y < h2; y++) {
-            float * src = scanline(y, c);
-            float * dst = scanline(h - 1 - y, c);
-            for (uint x = 0; x < w; x++) {
-                swap(src[x], dst[x]);
-            }
-        }
-    }
-}
-
 FloatImage* FloatImage::clone() const
 {
 	FloatImage* copy = new FloatImage();
--- a/src/nvimage/FloatImage.h
+++ b/src/nvimage/FloatImage.h
@ -8,7 +8,7 @@
 #include <nvmath/Vector.h>

 #include <nvcore/Debug.h>
-#include <nvcore/Algorithms.h> // clamp
+#include <nvcore/Containers.h> // clamp

 #include <stdlib.h> // abs

@ -68,15 +68,14 @@ public:
 	NVIMAGE_API void toGamma(uint base_component, uint num, float gamma = 2.2f);
 	NVIMAGE_API void exponentiate(uint base_component, uint num, float power);
 	
-	NVIMAGE_API void transform(uint base_component, const Matrix & m, const Vector4 & offset);
-	NVIMAGE_API void swizzle(uint base_component, uint r, uint g, uint b, uint a);
-	
+
 	NVIMAGE_API FloatImage * fastDownSample() const;
 	NVIMAGE_API FloatImage * downSample(const Filter & filter, WrapMode wm) const;
 	NVIMAGE_API FloatImage * downSample(const Filter & filter, WrapMode wm, uint alpha) const;
 	NVIMAGE_API FloatImage * resize(const Filter & filter, uint w, uint h, WrapMode wm) const;

 	NVIMAGE_API FloatImage * resize(const Filter & filter, uint w, uint h, WrapMode wm, uint alpha) const;
+	//@}

 	NVIMAGE_API float applyKernel(const Kernel2 * k, int x, int y, uint c, WrapMode wm) const;
 	NVIMAGE_API float applyKernelVertical(const Kernel1 * k, int x, int y, uint c, WrapMode wm) const;
@ -85,9 +84,7 @@ public:
 	NVIMAGE_API void applyKernelHorizontal(const PolyphaseKernel & k, int y, uint c, WrapMode wm, float * output) const;
 	NVIMAGE_API void applyKernelVertical(const PolyphaseKernel & k, int x, uint c, uint a, WrapMode wm, float * output) const;
 	NVIMAGE_API void applyKernelHorizontal(const PolyphaseKernel & k, int y, uint c, uint a, WrapMode wm, float * output) const;
-
-    NVIMAGE_API void flip();
-	//@}
+	
 	
 	uint width() const { return m_width; }
 	uint height() const { return m_height; }
--- a/src/nvimage/HoleFilling.cpp
+++ b/src/nvimage/HoleFilling.cpp
@ -0,0 +1,753 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/Containers.h>
+#include <nvcore/Ptr.h>
+
+#include <nvmath/nvmath.h>
+
+#include <nvimage/HoleFilling.h>
+#include <nvimage/FloatImage.h>
+
+using namespace nv;
+
+
+// This is a variation of Sapiro's inpainting method.
+//
+// Grows the valid region of img by passCount rings. bmap marks the valid
+// pixels (bit set) and the holes (bit clear). In every pass each hole that has
+// at least one valid pixel among its 8 neighbours receives a value derived from
+// those neighbours and is marked as valid for the next pass.
+//
+// Both img and bmap are modified: on return bmap holds the grown mask.
+void nv::fillExtrapolate(int passCount, FloatImage * img, BitMap * bmap)
+{
+	nvCheck(img != NULL);
+	nvCheck(bmap != NULL);
+
+	const int w = img->width();
+	const int h = img->height();
+	const int count = img->componentNum();
+
+	nvCheck(bmap->width() == uint(w));
+	nvCheck(bmap->height() == uint(h));
+
+	// Mask under construction for the current pass; swapped with bmap at the
+	// end of every pass.
+	AutoPtr<BitMap> newbmap(new BitMap(w, h));
+
+	for(int p = 0; p < passCount; p++)
+	{
+		for(int c = 0; c < count; c++)
+		{
+			float * channel = img->channel(c);
+			
+			for(int y = 0; y < h; y++) {
+				for(int x = 0; x < w; x++) {
+					
+					if (bmap->bitAt(x, y)) {
+						// Not a hole.
+						newbmap->setBitAt(x, y);
+						continue;
+					}
+					
+					// Validity of the 8 neighbours, always read from the mask of
+					// the previous pass so that pixels filled during this pass
+					// are not used as sources.
+					// NOTE(review): indexClamp presumably clamps the coordinate
+					// to the image and returns its linear index -- confirm.
+					const bool west = bmap->bitAt(img->indexClamp(x-1, y));
+					const bool east = bmap->bitAt(img->indexClamp(x+1, y));
+					const bool north = bmap->bitAt(img->indexClamp(x, y-1));
+					const bool south = bmap->bitAt(img->indexClamp(x, y+1));
+					const bool northwest = bmap->bitAt(img->indexClamp(x-1, y-1));
+					const bool northeast = bmap->bitAt(img->indexClamp(x+1, y-1));
+					const bool southwest = bmap->bitAt(img->indexClamp(x-1, y+1));
+					const bool southeast = bmap->bitAt(img->indexClamp(x+1, y+1));
+					
+					int num = west + east + north + south + northwest + northeast + southwest + southeast;
+					
+					if (num != 0) {
+
+						float average = 0.0f;
+						// When the only valid neighbours are the three on one
+						// side, copy the directly adjacent pixel of that side
+						// instead of averaging.
+						if (num == 3 && west && northwest && southwest) {
+							average = channel[img->indexClamp(x-1, y)];
+						}
+						else if (num == 3 && east && northeast && southeast) {
+							average = channel[img->indexClamp(x+1, y)];
+						}
+						else if (num == 3 && north && northwest && northeast) {
+							average = channel[img->indexClamp(x, y-1)];
+						}
+						else if (num == 3 && south && southwest && southeast) {
+							average = channel[img->indexClamp(x, y+1)];
+						}
+						else {
+							// Otherwise use the unweighted mean of all valid
+							// neighbours (every neighbour has weight 1).
+							float total = 0.0f;
+							if (west) { average += 1 * channel[img->indexClamp(x-1, y)]; total += 1; }
+							if (east) { average += 1 * channel[img->indexClamp(x+1, y)]; total += 1; }
+							if (north) { average += 1 * channel[img->indexClamp(x, y-1)]; total += 1; }
+							if (south) { average += 1 * channel[img->indexClamp(x, y+1)]; total += 1; }
+						
+							if (northwest) { average += channel[img->indexClamp(x-1, y-1)]; ++total; }
+							if (northeast) { average += channel[img->indexClamp(x+1, y-1)]; ++total; }
+							if (southwest) { average += channel[img->indexClamp(x-1, y+1)]; ++total; }
+							if (southeast) { average += channel[img->indexClamp(x+1, y+1)]; ++total; }
+							
+							average /= total;
+						}
+
+						// The hole is filled in place and becomes valid in the
+						// mask used by the next pass.
+						channel[img->indexClamp(x, y)] = average;
+						newbmap->setBitAt(x, y);
+					}
+				}
+			}
+		}
+
+		// Update the bit mask.
+		swap(*newbmap, *bmap);
+	}
+}
+
+
+namespace {
+
+	// Entry of the distance map used by the Voronoi fill: the coordinates of a
+	// candidate source pixel and the squared distance to it.
+	struct Neighbor {
+		uint16 x;
+		uint16 y;
+		uint32 d;
+	};
+
+	// Squared euclidean distance between the points (ax, ay) and (bx, by).
+	static uint dist( uint16 ax, uint16 ay, uint16 bx, uint16 by ) {
+		const int deltaX = int(bx) - int(ax);
+		const int deltaY = int(by) - int(ay);
+		const int squared = deltaX*deltaX + deltaY*deltaY;
+		return uint(squared);
+	}
+
+	// Core step of the EDT algorithm: if the source pixel recorded in n is
+	// closer to (x, y) than the one currently recorded in e, copy it into e
+	// together with its squared distance to (x, y).
+	static void checkNeighbour( int x, int y, Neighbor * e, const Neighbor & n ) {
+		nvDebugCheck(e != NULL);
+
+		const uint candidate = dist( x, y, n.x, n.y );
+		if( candidate >= e->d ) {
+			// The current entry is at least as close; keep it.
+			return;
+		}
+
+		e->x = n.x;
+		e->y = n.y;
+		e->d = candidate;
+	}
+
+} // namespace
+
+// Offer the source pixel recorded in n to e, unless n has no source pixel yet.
+// Entries without a source still carry the (w, h) marker written when the
+// distance map is initialized; that marker is not a real pixel and must not be
+// propagated, otherwise holes close to the bottom-right corner can end up
+// pointing outside of the image.
+static void checkAssignedNeighbour( int x, int y, int w, int h, Neighbor * e, const Neighbor & n )
+{
+	if( int(n.x) < w && int(n.y) < h ) {
+		checkNeighbour(x, y, e, n);
+	}
+}
+
+// Voronoi filling using EDT-4
+//
+// Every hole of img (bit clear in bmap) receives, for all components, the
+// value of the valid pixel (bit set in bmap) that the distance propagation
+// finds closest to it. Valid pixels are left untouched. img is modified in
+// place; bmap is only read and must have the same size as img.
+void nv::fillVoronoi(FloatImage * img, const BitMap * bmap)
+{
+	nvCheck(img != NULL);
+	nvCheck(bmap != NULL);
+
+	const int w = img->width();
+	const int h = img->height();
+	const int count = img->componentNum();
+
+	nvCheck(bmap->width() == uint(w));
+	nvCheck(bmap->height() == uint(h));
+
+	Array<Neighbor> edm;
+	edm.resize(w * h);
+
+	int x, y;
+	int x0, x1, y0, y1;
+
+	// Init edm: valid pixels point at themselves, holes get the (w, h) marker
+	// and a distance larger than any distance inside the image.
+	for( y = 0; y < h; y++ ) {
+		for( x = 0; x < w; x++ ) {
+			Neighbor & e = edm[y * w + x];
+			if( bmap->bitAt(x, y) ) {
+				e.x = x;
+				e.y = y;
+				e.d = 0;
+			}
+			else {
+				e.x = w;
+				e.y = h;
+				e.d = w*w + h*h;
+			}
+		}
+	}
+
+	// First pass: top to bottom.
+	for( y = 0; y < h; y++ ) {
+		for( x = 0; x < w; x++ ) {
+			x0 = clamp(x-1, 0, w-1);	// @@ Wrap?
+			x1 = clamp(x+1, 0, w-1);
+			y0 = clamp(y-1, 0, h-1);
+
+			Neighbor & e = edm[y * w + x];
+			checkAssignedNeighbour(x, y, w, h, &e, edm[y0 * w + x0]);
+			checkAssignedNeighbour(x, y, w, h, &e, edm[y0 * w + x]);
+			checkAssignedNeighbour(x, y, w, h, &e, edm[y0 * w + x1]);
+			checkAssignedNeighbour(x, y, w, h, &e, edm[y * w + x0]);
+		}
+
+		for( x = w-1; x >= 0; x-- ) {
+			x1 = clamp(x+1, 0, w-1);
+
+			Neighbor & e = edm[y * w + x];
+			checkAssignedNeighbour(x, y, w, h, &e, edm[y * w + x1]);
+		}
+	}
+
+	// Second pass: bottom to top.
+	for( y = h-1; y >= 0; y-- ) {
+		for( x = w-1; x >= 0; x-- ) {
+			x0 = clamp(x-1, 0, w-1);
+			x1 = clamp(x+1, 0, w-1);
+			y1 = clamp(y+1, 0, h-1);
+
+			Neighbor & e = edm[y * w + x];
+			checkAssignedNeighbour(x, y, w, h, &e, edm[y * w + x1]);
+			checkAssignedNeighbour(x, y, w, h, &e, edm[y1 * w + x0]);
+			checkAssignedNeighbour(x, y, w, h, &e, edm[y1 * w + x]);
+			checkAssignedNeighbour(x, y, w, h, &e, edm[y1 * w + x1]);
+		}
+
+		for( x = 0; x < w; x++ ) {
+			x0 = clamp(x-1, 0, w-1);
+
+			Neighbor & e = edm[y * w + x];
+			checkAssignedNeighbour(x, y, w, h, &e, edm[y * w + x0]);
+		}
+	}
+
+	// Fill empty holes.
+	for( y = 0; y < h; y++ ) {
+		for( x = 0; x < w; x++ ) {
+			const int sx = edm[y * w + x].x;
+			const int sy = edm[y * w + x].y;
+
+			if( sx >= w || sy >= h ) {
+				// No source pixel was propagated to this hole (for example
+				// when the mask has no bit set); leave the pixel as it is
+				// instead of reading outside of the image.
+				continue;
+			}
+
+			if( sx != x || sy != y ) {
+				for(int c = 0; c < count; c++ ) {
+					img->setPixel(img->pixel(sx, sy, c), x, y, c);
+				}
+			}
+		}
+	}
+
+}
+
+
+// Placeholder for a blur based hole filling. At the moment this function only
+// validates img: no filtering is performed and bmap is not used.
+void nv::fillBlur(FloatImage * img, const BitMap * bmap)
+{
+	nvCheck(img != NULL);
+	
+	// @@ Apply a 3x3 kernel.
+}
+
+
+// Build the next coarser level of a masked image with a 2x2 box filter.
+//
+// src / srcMask   Image and validity mask (bit set = valid pixel) of the
+//                 current level.
+// _dst / _dstMask Receive the new level, allocated with new; the caller takes
+//                 ownership. Only written when the function returns true.
+//
+// A destination pixel is valid when at least one of its four source pixels is
+// valid; its value is the mean of the valid source pixels only. Destination
+// pixels without any valid source pixel are not written.
+//
+// Returns false, without creating a level, when srcMask has no holes or when
+// the image is too small to be halved again.
+static bool downsample(const FloatImage * src, const BitMap * srcMask, const FloatImage ** _dst, const BitMap ** _dstMask)
+{
+	const uint w = src->width();
+	const uint h = src->height();
+	const uint count = src->componentNum();
+
+	// count holes in srcMask, return false if fully filled.
+	uint holes = 0;
+	for(uint y = 0; y < h; y++) {
+		for(uint x = 0; x < w; x++) {
+			holes += srcMask->bitAt(x, y) == 0;
+		}
+	}
+	// Stop when no holes or when the texture is very small. The size test uses
+	// <= so that an extent of 1 (reached from odd sizes such as 3) stops here
+	// too instead of producing a level with zero width or height.
+	if (holes == 0 || (w <= 2 || h <= 2)) {
+		return false;
+	}
+
+	// Apply box filter to image and mask and return true.
+	const uint nw = w / 2;
+	const uint nh = h / 2;
+
+	FloatImage * dst = new FloatImage();
+	dst->allocate(count, nw, nh);
+	BitMap * dstMask = new BitMap(nw, nh);
+
+	for(uint c = 0; c < count; c++) {
+		for(uint y = 0; y < nh; y++) {
+			for(uint x = 0; x < nw; x++) {
+
+				const uint x0 = 2 * x + 0;
+				const uint x1 = 2 * x + 1;
+				const uint y0 = 2 * y + 0;
+				const uint y1 = 2 * y + 1;
+
+				const float f0 = src->pixel(x0, y0, c);
+				const float f1 = src->pixel(x1, y0, c);
+				const float f2 = src->pixel(x0, y1, c);
+				const float f3 = src->pixel(x1, y1, c);
+
+				const bool b0 = srcMask->bitAt(x0, y0);
+				const bool b1 = srcMask->bitAt(x1, y0);
+				const bool b2 = srcMask->bitAt(x0, y1);
+				const bool b3 = srcMask->bitAt(x1, y1);
+
+				if (b0 || b1 || b2 || b3) {
+					// Set bit mask.
+					dstMask->setBitAt(x, y);
+
+					// Set pixel: average of the valid source pixels only.
+					float value = 0.0f;
+					int total = 0;
+					if (b0) { value += f0; total++; }
+					if (b1) { value += f1; total++; }
+					if (b2) { value += f2; total++; }
+					if (b3) { value += f3; total++; }
+					dst->setPixel(value / total, x, y, c);
+				}
+			}
+		}
+	}
+
+	*_dst = dst;
+	*_dstMask = dstMask;
+
+	return true;
+}
+
+// This is the filter used in the Lumigraph paper.
+//
+// Pull: a chain of coarser levels is built from img and bmap with downsample
+// until no holes remain or the image becomes too small.
+// Push: every pixel of img is replaced by the value of the finest level in
+// which the pixel that covers it is valid (bit set in that level's mask).
+// Pixels that are valid in bmap keep their value, since level 0 is img itself;
+// pixels that are not valid in any level are left unchanged.
+//
+// img is modified in place; bmap is only read.
+void nv::fillPullPush(FloatImage * img, const BitMap * bmap)
+{
+	nvCheck(img != NULL);
+	nvCheck(bmap != NULL);
+
+	const uint count = img->componentNum();
+	const uint w = img->width();
+	const uint h = img->height();
+	const uint num = log2(max(w,h));
+
+	// Build mipmap chain.
+	Array<const FloatImage *> mipmaps(num);
+	Array<const BitMap *> mipmapMasks(num);
+
+	mipmaps.append(img);
+	mipmapMasks.append(bmap);
+
+	const FloatImage * current;
+	const BitMap * currentMask;
+
+	// Compute mipmap chain.
+	while(downsample(mipmaps.back(), mipmapMasks.back(), &current, &currentMask))
+	{
+		mipmaps.append(current);
+		mipmapMasks.append(currentMask);
+	}
+
+	const uint levelCount = mipmaps.count();
+
+	// Sample mipmaps until non-hole is found.
+	for(uint y = 0; y < h; y++) {
+		for(uint x = 0; x < w; x++) {
+
+			int sx = x;
+			int sy = y;
+
+			for (uint l = 0; l < levelCount; l++)
+			{
+				const int lw = mipmaps[l]->width();
+				const int lh = mipmaps[l]->height();
+
+				if (lw == 0 || lh == 0)
+				{
+					// Degenerate level, nothing to sample from here on.
+					break;
+				}
+
+				// Halving the coordinate can land one past the last pixel of
+				// a level when an extent of the finer level is odd (5 -> 2,
+				// but 4 / 2 == 2); clamp so that the lookup stays in bounds.
+				const int cx = clamp(sx, 0, lw - 1);
+				const int cy = clamp(sy, 0, lh - 1);
+
+				if (mipmapMasks[l]->bitAt(cx, cy))
+				{
+					// Sample mipmaps[l](cx, cy) and copy to img(x, y)
+					for(uint c = 0; c < count; c++) {
+						img->setPixel(mipmaps[l]->pixel(cx, cy, c), x, y, c);
+					}
+					break;
+				}
+
+				sx /= 2;
+				sy /= 2;
+			}
+		}
+	}
+
+	// Don't delete the original image and mask.
+	mipmaps[0] = NULL;
+	mipmapMasks[0] = NULL;
+
+	// Delete the mipmaps.
+	deleteAll(mipmaps);
+	deleteAll(mipmapMasks);
+}
+
+
+
+/*
+
+This Code is from Charles Bloom:
+
+DoPixelSeamFix
+10-20-02
+
+Looks in the 5x5 local neighborhood (LocalPixels) of the desired pixel to fill.
+It tries to build a quadratic model of the neighborhood surface to use in
+extrapolating.  You need 5 pixels to establish a 2d quadratic curve.
+
+This is really just a nice generic way to extrapolate pixels.  It also happens
+to work great for seam-fixing.
+
+Note that I'm working on normals, but I treat them just as 3 scalars and normalize
+at the end.  To be more correct, I would work on the surface of a sphere, but that
+just seems like way too much work.
+
+*/
+
+// Working set for filling one hole pixel of one channel: the 5x5 neighborhood centered on
+// the hole, plus the accumulated estimate. Each try* method adds candidate values to
+// 'result' and bumps 'weight' by one per candidate; the caller uses result / weight.
+struct LocalPixels
+{
+	// 5x5 neighborhood
+	// the center is at result
+	// index [y][x]
+	// fill[y][x] tells whether data[y][x] holds a valid pixel; the methods below only
+	// read data[y][x] after checking fill[y][x].
+	bool fill[5][5];
+	float data[5][5];
+	
+	// Sum of candidate estimates and number of candidates accumulated so far.
+	// Mutable because the estimators are const methods that accumulate into them.
+	mutable float result;
+	mutable float weight;
+
+	// Second difference (left - 2*middle + right) of three consecutive filled pixels in
+	// the given row. Tries columns 1..3 first, then 0..2, then 2..4; stores the value in
+	// *pQ and returns true for the first fully filled triple, or returns false (leaving
+	// *pQ untouched) when there is none.
+	bool Quad3SubH(float * pQ, int row) const
+	{
+		const bool * pFill = fill[row];
+		const float * pDat = data[row];
+	
+		if ( pFill[1] && pFill[2] && pFill[3] )
+		{
+			// good row
+			*pQ = pDat[1] - 2.f * pDat[2] + pDat[3];
+			return true;
+		}
+		else if ( pFill[0] && pFill[1] && pFill[2] )
+		{
+			// good row
+			*pQ = pDat[0] - 2.f * pDat[1] + pDat[2];
+			return true;
+		}
+		else if ( pFill[2] && pFill[3] && pFill[4] )
+		{
+			// good row
+			*pQ = pDat[2] - 2.f * pDat[3] + pDat[4];
+			return true;
+		}
+		return false;
+	}
+
+	// Vertical counterpart of Quad3SubH: second difference of three consecutive filled
+	// pixels in the given column, trying rows 1..3 first, then 0..2, then 2..4.
+	bool Quad3SubV(float * pQ, int col) const
+	{
+		if ( fill[1][col] && fill[2][col] && fill[3][col] )
+		{
+			// good column
+			*pQ = data[1][col] - 2.f * data[2][col] + data[3][col];
+			return true;
+		}
+		else if ( fill[0][col] && fill[1][col] && fill[2][col] )
+		{
+			// good column
+			*pQ = data[0][col] - 2.f * data[1][col] + data[2][col];
+			return true;
+		}
+		else if ( fill[2][col] && fill[3][col] && fill[4][col] )
+		{
+			// good column
+			*pQ = data[2][col] - 2.f * data[3][col] + data[4][col];
+			return true;
+		}
+		return false;
+	}
+	
+	// Horizontal second difference taken from row 1 and/or row 3 (the rows next to the
+	// center row). Averages both when both are available, uses the single available one
+	// otherwise, and returns false when neither row yields a value.
+	bool Quad3H(float * pQ) const
+	{
+		if (!Quad3SubH(pQ,1))
+		{
+			return Quad3SubH(pQ,3);	
+		}
+		float q = 0.0f; // initializer not needed, just make it shut up
+		if (Quad3SubH(&q, 3))
+		{
+			// got q and pQ
+			*pQ = (*pQ+q)*0.5f;
+		}
+		return true;
+	}
+	
+	// Vertical second difference taken from column 1 and/or column 3, combined the same
+	// way as in Quad3H.
+	bool Quad3V(float * pQ) const
+	{
+		if (!Quad3SubV(pQ, 1))
+		{
+			return Quad3SubV(pQ, 3);	
+		}
+		float q = 0.0f; // initializer not needed, just make it shut up
+		if (Quad3SubV(&q, 3))
+		{
+			// got q and pQ
+			*pQ = (*pQ + q) * 0.5f;
+		}
+		return true;
+	}
+	// Quad returns ([0]+[2] - 2.f*[1])
+	//	a common want is [1] - ([0]+[2])*0.5f ;
+	// so use -0.5f*Quad
+
+	// Quadratic estimates of the center pixel. Each case pairs two filled pixels on the
+	// center row/column with a second difference q borrowed from a neighboring row/column.
+	// Every successful case adds one candidate to result/weight; returns true if any did.
+	bool tryQuads() const
+	{
+		bool res = false;
+	
+		// look for a pair that straddles the middle:
+		if ( fill[2][1] && fill[2][3] )
+		{
+			// got horizontal straddle
+			// solve left - 2*center + right = q for center
+			float q;
+			if ( Quad3H(&q) )
+			{
+				result += (data[2][1] + data[2][3] - q) * 0.5f;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		if ( fill[1][2] && fill[3][2] )
+		{
+			// got vertical straddle
+			float q;
+			if ( Quad3V(&q) )
+			{
+				result += (data[1][2] + data[3][2] - q) * 0.5f;
+				weight += 1.f;
+				res = true;
+			}
+		}
+	
+		// look for pairs that lead into the middle :
+		if ( fill[2][0] && fill[2][1] )
+		{
+			// got left-side pair
+			// solve far - 2*near + center = q for center
+			float q;
+			if ( Quad3H(&q) )
+			{
+				result += data[2][1]*2.f - data[2][0] + q;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		if ( fill[2][3] && fill[2][4] )
+		{
+			// got right-side pair
+			float q;
+			if ( Quad3H(&q) )
+			{
+				result += data[2][3]*2.f - data[2][4] + q;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		if ( fill[0][2] && fill[1][2] )
+		{
+			// got the vertical pair in rows 0 and 1 of the center column
+			float q;
+			if ( Quad3V(&q) )
+			{
+				result += data[1][2]*2.f - data[0][2] + q;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		if ( fill[3][2] && fill[4][2] )
+		{
+			// got the vertical pair in rows 3 and 4 of the center column
+			float q;
+			if ( Quad3V(&q) )
+			{
+				result += data[3][2]*2.f - data[4][2] + q;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		return res;
+	}
+	
+	// Planar estimates of the center pixel from the four 2x2 corners around it: for each
+	// corner whose three pixels are filled, adds (edge neighbor + edge neighbor - diagonal
+	// neighbor) as one candidate. Returns true if at least one corner was usable.
+	bool tryPlanar() const
+	{
+		// four cases :
+		// each case is three [y][x] index pairs: two edge neighbors, then the diagonal between them
+		const int indices[] =
+		{
+			2,1, 1,2, 1,1,
+			2,1, 3,2, 3,1,
+			2,3, 1,2, 1,3,
+			2,3, 3,2, 3,3
+		};
+		bool res = false;
+		for (int i = 0; i < 4; i++)
+		{
+			const int * I = indices + i*6;
+			if (!fill[ I[0] ][ I[1] ])
+				continue;
+			if (!fill[ I[2] ][ I[3] ])
+				continue;
+			if (!fill[ I[4] ][ I[5] ])
+				continue;
+	
+			result += data[ I[0] ][ I[1] ] + data[ I[2] ][ I[3] ] - data[ I[4] ][ I[5] ];
+			weight += 1.0f;
+			res = true;
+		}
+		return res;
+	}
+	
+	// Two-pixel estimates of the center pixel: the average of a pair that straddles the
+	// center, and linear extrapolation (2*near - far) from a pair that leads into it.
+	// Returns true if at least one pair was usable.
+	bool tryTwos() const
+	{
+		bool res = false;
+	
+		if (fill[2][1] && fill[2][3])
+		{
+			result += (data[2][1] + data[2][3]) * 0.5f;
+			weight += 1.0f;
+			res = true;
+		}
+		if (fill[1][2] && fill[3][2])
+		{
+			result += (data[1][2] + data[3][2]) * 0.5f;
+			weight += 1.0f;
+			res = true;
+		}
+		
+		// four side-rotates :
+		// each case is two [y][x] index pairs: the pixel next to the center, then the one behind it
+		const int indices[] =
+		{
+			2,1, 2,0,
+			2,3, 2,4,
+			1,2, 0,2,
+			3,2, 4,2,
+		};
+		for (int i = 0; i < 4; i++)
+		{
+			const int * I = indices + i*4;
+			if (!fill[ I[0] ][ I[1] ])
+				continue;
+			if (!fill[ I[2] ][ I[3] ])
+				continue;
+	
+			result += data[ I[0] ][ I[1] ]*2.0f - data[ I[2] ][ I[3] ];
+			weight += 1.0f;
+			res = true;
+		}
+	
+		return res;
+	}
+
+	// Entry point: resets result/weight and runs the estimators from the highest-order
+	// model to the lowest (quadratic, planar, two-pixel), stopping at the first one that
+	// produces any candidate. Returns false when none does; on true, weight is >= 1.
+	bool doLocalPixelFill() const
+	{
+		result = 0.0f;
+		weight = 0.0f;
+		
+		if (tryQuads()) {
+			return true;
+		}
+		
+		if (tryPlanar()) {
+			return true;
+		}
+		
+		return tryTwos();
+	}
+
+}; // struct LocalPixels
+
+
+
+// This is a quadratic extrapolation filter from Charles Bloom (DoPixelSeamFix). Used with his permission.
+//
+// Runs passCount passes over img. In each pass, every pixel whose bit is clear in bmap (a
+// hole) gathers the valid pixels of its 5x5 neighborhood and, if LocalPixels can produce an
+// estimate, is overwritten with it in every channel except coverageIndex. Pixels filled
+// during a pass are collected in a second mask that replaces bmap only when the pass is
+// over, so a pass reads only pixels that were valid when it started.
+//
+// passCount      number of passes; must be > 0.
+// img            image to fill in place; must not be NULL.
+// bmap           validity mask with the same size as img; updated in place.
+// coverageIndex  channel that is left untouched, or -1 to process every channel.
+void nv::fillQuadraticExtrapolate(int passCount, FloatImage * img, BitMap * bmap, int coverageIndex /*= -1*/)
+{
+	nvCheck(passCount > 0);
+	nvCheck(img != NULL);
+	nvCheck(bmap != NULL);
+
+	const int w = img->width();
+	const int h = img->height();
+	const int count = img->componentNum();
+
+	nvCheck(bmap->width() == uint(w));
+	nvCheck(bmap->height() == uint(h));
+
+	// The mask below is only ever added to, so it must start out empty. Clear it
+	// explicitly instead of relying on the state the BitMap constructor leaves behind.
+	AutoPtr<BitMap> newbmap( new BitMap(w, h) );
+	newbmap->clearAll();
+
+	float * coverageChannel = NULL;
+	if (coverageIndex != -1)
+	{
+		coverageChannel = img->channel(coverageIndex);
+	}
+
+	// First channel that is actually processed; it is the one that updates the mask.
+	const int firstChannel = (coverageIndex == 0) ? 1 : 0;
+	if (firstChannel >= count)
+	{
+		// Nothing to fill (no channels, or the only channel is the coverage channel).
+		// Return before the swap below would replace the caller's mask with an empty one.
+		return;
+	}
+
+	for (int p = 0; p < passCount; p++)
+	{
+		for (int c = 0; c < count; c++)
+		{
+			if (c == coverageIndex) continue;
+
+			float * channel = img->channel(c);
+			
+			for (int yb = 0; yb < h; yb++) {
+				for (int xb = 0; xb < w; xb++) {
+					
+					if (bmap->bitAt(xb, yb)) {
+						// Not a hole.
+						newbmap->setBitAt(xb, yb);
+						continue;
+					}
+					
+					int numFill = 0;
+					
+					// Gather the 5x5 neighborhood; anything outside the image or
+					// not yet valid is marked as unfilled.
+					LocalPixels lp;
+					for (int ny = 0; ny < 5; ny++)
+					{
+						int y = (yb + ny - 2);
+						if ( y < 0 || y >= h )
+						{
+							// out of range
+							for(int i = 0; i < 5; i++) 
+							{
+								lp.fill[ny][i] = false;
+							}
+							continue;
+						}
+
+						for (int nx = 0; nx < 5; nx++)
+						{
+							int x = (xb + nx - 2);
+							if (x < 0 || x >= w)
+							{
+								lp.fill[ny][nx] = false;
+							}
+							else
+							{
+								int idx = img->index(x, y);
+								if (!bmap->bitAt(idx))
+								{
+									lp.fill[ny][nx] = false;
+								}
+								else
+								{
+									lp.fill[ny][nx] = true;
+									lp.data[ny][nx] = channel[idx];
+									numFill++;
+								}
+							}
+						}
+					}
+
+					// Every estimator in LocalPixels needs at least two valid
+					// neighbors, so a single one is not enough. (The original
+					// comment asked for 3, but the test has always been "< 2".)
+					if (numFill < 2)
+						continue;
+
+					nvDebugCheck(lp.fill[2][2] == false);
+					
+					if (lp.doLocalPixelFill())
+					{
+						const int idx = img->index(xb, yb);
+						channel[idx] = lp.result / lp.weight;
+
+						// The fill pattern is the same for every channel, so the
+						// mask only needs to be updated once per pixel.
+						if (c == firstChannel)
+						{
+							//coverageChannel[idx] /= lp.weight;	// @@ Not sure what this was for, coverageChannel[idx] is always zero.
+							newbmap->setBitAt(xb, yb);
+						}
+					}
+				}
+			}
+		}
+
+		// Update the bit mask.
+		swap(*newbmap, *bmap);
+	}
+}
--- a/src/nvimage/HoleFilling.h
+++ b/src/nvimage/HoleFilling.h
@ -0,0 +1,96 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_HOLEFILLING_H
+#define NV_IMAGE_HOLEFILLING_H
+
+#include <nvcore/BitArray.h>
+#include <nvimage/nvimage.h>
+
+namespace nv 
+{
+	class FloatImage;
+
+	/// Fixed-size two-dimensional bit mask, addressed either by (x, y) or by the
+	/// linear index y * width + x.
+	class BitMap
+	{
+	public:
+		/// Create a w x h mask.
+		/// NOTE(review): the initial bit values are whatever BitArray(uint) leaves behind;
+		/// call clearAll() or setAll() when a known starting state is required.
+		BitMap(uint w, uint h) : 
+			m_width(w), m_height(h), m_bitArray(w*h) 
+		{
+		}
+		
+		// Returned by value: a top-level const on the return type would have no effect.
+		uint width() const { return m_width; }
+		uint height() const { return m_height; }
+		
+		/// Test the bit at (x, y).
+		bool bitAt(uint x, uint y) const
+		{
+			nvDebugCheck(x < m_width && y < m_height);
+			return m_bitArray.bitAt(y * m_width + x);
+		}
+		/// Test the bit at the given linear index.
+		bool bitAt(uint idx) const
+		{
+			nvDebugCheck(idx < m_width * m_height);
+			return m_bitArray.bitAt(idx);
+		}
+	
+		/// Set the bit at (x, y).
+		void setBitAt(uint x, uint y)
+		{
+			nvDebugCheck(x < m_width && y < m_height);
+			m_bitArray.setBitAt(y * m_width + x);
+		}
+		/// Set the bit at the given linear index.
+		void setBitAt(uint idx)
+		{
+			nvDebugCheck(idx < m_width * m_height);
+			m_bitArray.setBitAt(idx);
+		}
+	
+		/// Clear the bit at (x, y).
+		void clearBitAt(uint x, uint y)
+		{
+			nvDebugCheck(x < m_width && y < m_height);
+			m_bitArray.clearBitAt(y * m_width + x);
+		}
+		/// Clear the bit at the given linear index.
+		void clearBitAt(uint idx)
+		{
+			nvDebugCheck(idx < m_width * m_height);
+			m_bitArray.clearBitAt(idx);
+		}
+	
+		/// Clear every bit.
+		void clearAll()
+		{
+			m_bitArray.clearAll();
+		}
+	
+		/// Set every bit.
+		void setAll()
+		{
+			m_bitArray.setAll();
+		}
+	
+		/// Invert every bit.
+		void toggleAll()
+		{
+			m_bitArray.toggleAll();
+		}
+		
+		/// Exchange the contents of two masks of identical size.
+		friend void swap(BitMap & a, BitMap & b)
+		{
+			nvCheck(a.m_width == b.m_width);
+			nvCheck(a.m_height == b.m_height);
+			// The dimensions are const members and are required to match, so only the
+			// bit storage has to be exchanged.
+			swap(a.m_bitArray, b.m_bitArray);
+		}
+		
+	private:
+		
+		const uint m_width;
+		const uint m_height;
+		BitArray m_bitArray;
+		
+	};
+
+	NVIMAGE_API void fillVoronoi(FloatImage * img, const BitMap * bmap);
+	NVIMAGE_API void fillBlur(FloatImage * img, const BitMap * bmap);
+	NVIMAGE_API void fillPullPush(FloatImage * img, const BitMap * bmap);
+	
+	NVIMAGE_API void fillExtrapolate(int passCount, FloatImage * img, BitMap * bmap);
+	NVIMAGE_API void fillQuadraticExtrapolate(int passCount, FloatImage * img, BitMap * bmap, int coverageIndex = -1);
+	
+} // nv namespace
+
+#endif // NV_IMAGE_HOLEFILLING_H
--- a/src/nvimage/Image.cpp
+++ b/src/nvimage/Image.cpp
@ -1,13 +1,12 @@
 // This code is in the public domain -- castanyo@yahoo.es

-#include <nvimage/Image.h>
-#include <nvimage/ImageIO.h>
-
-#include <nvmath/Color.h>
-
 #include <nvcore/Debug.h>
 #include <nvcore/Ptr.h>
-#include <nvcore/Containers.h> // swap
+
+#include <nvmath/Color.h>
+
+#include <nvimage/Image.h>
+#include <nvimage/ImageIO.h>


 using namespace nv;
@ -41,7 +40,7 @@ void Image::allocate(uint w, uint h)
 {
 	m_width = w;
 	m_height = h;
-	m_data = (Color32 *)nv::mem::realloc(m_data, w * h * sizeof(Color32));
+	m_data = (Color32 *)realloc(m_data, w * h * sizeof(Color32));
 }

 bool Image::load(const char * name)
--- a/src/nvimage/ImageIO.cpp
+++ b/src/nvimage/ImageIO.cpp
--- a/src/nvimage/ImageIO.h
+++ b/src/nvimage/ImageIO.h
@ -5,9 +5,6 @@

 #include <nvimage/nvimage.h>

-#include <nvcore/StrLib.h>
-
-
 namespace nv
 {
 	class Image;
@ -16,22 +13,43 @@ namespace nv

 	namespace ImageIO
 	{
-		struct ImageMetaData
-		{
-			HashMap<String, String> tagMap;
-		};
-
 		NVIMAGE_API Image * load(const char * fileName);
 		NVIMAGE_API Image * load(const char * fileName, Stream & s);

 		NVIMAGE_API FloatImage * loadFloat(const char * fileName);
 		NVIMAGE_API FloatImage * loadFloat(const char * fileName, Stream & s);
 		
-		NVIMAGE_API bool save(const char * fileName, const Image * img, const ImageMetaData * tags=NULL);
-		NVIMAGE_API bool save(const char * fileName, Stream & s, const Image * img, const ImageMetaData * tags=NULL);
+		NVIMAGE_API bool save(const char * fileName, Stream & s, Image * img);
+		NVIMAGE_API bool save(const char * fileName, Image * img);
+		NVIMAGE_API bool saveFloat(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components);

-		NVIMAGE_API bool saveFloat(const char * fileName, const FloatImage * fimage, uint baseComponent, uint componentCount);
-		NVIMAGE_API bool saveFloat(const char * fileName, Stream & s, const FloatImage * fimage, uint baseComponent, uint componentCount);
+		NVIMAGE_API Image * loadTGA(Stream & s);
+		NVIMAGE_API bool saveTGA(Stream & s, const Image * img);
+
+		NVIMAGE_API Image * loadPSD(Stream & s);
+
+#if defined(HAVE_PNG)
+		NVIMAGE_API Image * loadPNG(Stream & s);
+#endif
+
+#if defined(HAVE_JPEG)
+		NVIMAGE_API Image * loadJPG(Stream & s);
+#endif
+
+#if defined(HAVE_TIFF)
+		NVIMAGE_API FloatImage * loadFloatTIFF(const char * fileName, Stream & s);
+		
+		NVIMAGE_API bool saveFloatTIFF(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components);
+#endif
+
+#if defined(HAVE_OPENEXR)
+		NVIMAGE_API FloatImage * loadFloatEXR(const char * fileName, Stream & s);
+		
+		NVIMAGE_API bool saveFloatEXR(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components);
+#endif
+
+	//	NVIMAGE_API FloatImage * loadFloatPFM(const char * fileName, Stream & s);
+	//	NVIMAGE_API bool saveFloatPFM(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components);

 	} // ImageIO namespace
 	
--- a/src/nvimage/NormalMap.cpp
+++ b/src/nvimage/NormalMap.cpp
@ -36,9 +36,9 @@ using namespace nv;
 // Create normal map using the given kernels.
 static FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, const Kernel2 * kdu, const Kernel2 * kdv)
 {
-	nvDebugCheck(kdu != NULL);
-	nvDebugCheck(kdv != NULL);
-	nvDebugCheck(img != NULL);
+	nvCheck(kdu != NULL);
+	nvCheck(kdv != NULL);
+	nvCheck(img != NULL);
 	
 	const uint w = img->width();
 	const uint h = img->height();
@ -75,54 +75,10 @@ static FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm,
 }


-// Create normal map using the given kernels.
-static FloatImage * createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, const Kernel2 * kdu, const Kernel2 * kdv)
-{
-	nvDebugCheck(kdu != NULL);
-	nvDebugCheck(kdv != NULL);
-	nvDebugCheck(img != NULL);
-
-#pragma message(NV_FILE_LINE "FIXME: Height scale parameter should go away. It should be a sensible value that produces good results when the heightmap is in the [0, 1] range.")
-	const float heightScale = 1.0f / 16.0f;
-
-	const uint w = img->width();
-	const uint h = img->height();
-
-	AutoPtr<FloatImage> img_out(new FloatImage());
-	img_out->allocate(4, w, h);
-
-	for (uint y = 0; y < h; y++)
-	{
-		for (uint x = 0; x < w; x++)
-		{
-			const float du = img->applyKernel(kdu, x, y, 3, wm);
-			const float dv = img->applyKernel(kdv, x, y, 3, wm);
-
-			Vector3 n = normalize(Vector3(du, dv, heightScale));
-
-			img_out->setPixel(n.x(), x, y, 0);
-			img_out->setPixel(n.y(), x, y, 1);
-			img_out->setPixel(n.z(), x, y, 2);
-		}
-	}
-
-	// Copy alpha channel.
-	for (uint y = 0; y < h; y++)
-	{
-		for (uint x = 0; x < w; x++)
-		{
-			img_out->setPixel(img->pixel(x, y, 3), x, y, 3);
-		}
-	}
-
-	return img_out.release();
-}
-
-
 /// Create normal map using the given filter.
 FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter /*= Sobel3x3*/)
 {
-	nvDebugCheck(img != NULL);
+	nvCheck(img != NULL);
 	
 	// Init the kernels.
 	Kernel2 * kdu = NULL;
@ -159,7 +115,7 @@ FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vec
 /// Create normal map combining multiple sobel filters.
 FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights)
 {
-	nvDebugCheck(img != NULL);
+	nvCheck(img != NULL);

 	Kernel2 * kdu = NULL;
 	Kernel2 * kdv = NULL;
@ -174,32 +130,10 @@ FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vec
 	return ::createNormalMap(img, wm, heightWeights, kdu, kdv);
 }

-
-FloatImage * nv::createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, Vector4::Arg filterWeights)
-{
-	nvDebugCheck(img != NULL);
-
-	Kernel2 * kdu = NULL;
-	Kernel2 * kdv = NULL;
-
-	kdu = new Kernel2(9);
-	kdu->initBlendedSobel(filterWeights);
-	kdu->normalize();
-
-	kdv = new Kernel2(*kdu);
-	kdv->transpose();
-
-	return ::createNormalMap(img, wm, kdu, kdv);
-}
-
-
 /// Normalize the given image in place.
 void nv::normalizeNormalMap(FloatImage * img)
 {
-	nvDebugCheck(img != NULL);
-
-#pragma message(NV_FILE_LINE "TODO: Pack and expand normals explicitly")
-
+	nvCheck(img != NULL);
 	img->expandNormals(0);
 	img->normalize(0);
 	img->packNormals(0);
--- a/src/nvimage/NormalMap.h
+++ b/src/nvimage/NormalMap.h
@ -41,11 +41,9 @@ namespace nv
 		NormalMapFilter_Sobel9x9,	// very large
 	};

-	// @@ These two functions should be deprecated:
 	FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter = NormalMapFilter_Sobel3x3);
-	FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights);

-	FloatImage * createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, Vector4::Arg filterWeights);
+	FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights);

 	void normalizeNormalMap(FloatImage * img);

--- a/src/nvimage/NormalMipmap.cpp
+++ b/src/nvimage/NormalMipmap.cpp
@ -0,0 +1,98 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/Ptr.h>
+
+#include <nvmath/Montecarlo.h>
+#include <nvmath/SphericalHarmonic.h>
+
+#include <nvimage/NormalMipmap.h>
+#include <nvimage/FloatImage.h>
+
+using namespace nv;
+
+/// Build a half-resolution normal mipmap from a normal map.
+///
+/// For each of a set of sample directions, computes dot(direction, normal) per texel,
+/// downsamples that image, and accumulates the result projected onto a 9-coefficient
+/// spherical harmonic basis per output texel.
+///
+/// @param img  Source normal map; must not be NULL. Channels 0, 1 and 2 are read as x, y, z.
+/// @return     Newly allocated 4-channel image of size (width/2, height/2); the caller owns it.
+///
+/// NOTE(review): this function is unfinished. The convolved SH coefficients are computed
+/// but never written to the returned image, so its pixels are whatever allocate() leaves.
+FloatImage * nv::createNormalMipmapMap(const FloatImage * img)
+{
+	nvDebugCheck(img != NULL);
+	
+	uint w = img->width();
+	uint h = img->height();
+	
+	uint hw = w / 2;
+	uint hh = h / 2;
+	
+	FloatImage dotImg;
+	dotImg.allocate(1, w, h);
+	
+	FloatImage shImg;
+	shImg.allocate(9, hw, hh);
+	
+	// The SH coefficients are accumulated with +=, so start every accumulator at zero
+	// instead of relying on the state allocate() leaves the storage in.
+	for (uint i = 0; i < 9; i++)
+	{
+		float * shChannel = shImg.channel(i);
+		for (uint p = 0; p < hw*hh; p++)
+		{
+			shChannel[p] = 0.0f;
+		}
+	}
+	
+	SampleDistribution distribution(256);
+	const uint sampleCount = distribution.sampleCount();
+	
+	// The source channels do not change between samples; fetch them once.
+	const float * xChannel = img->channel(0);
+	const float * yChannel = img->channel(1);
+	const float * zChannel = img->channel(2);
+	
+	for (uint d = 0; d < sampleCount; d++)
+	{
+		Vector3 dir = distribution.sampleDir(d);
+		
+		Sh2 basis;
+		basis.eval(dir);
+		
+		for(uint i = 0; i < w*h; i++)
+		{
+			Vector3 normal(xChannel[i], yChannel[i], zChannel[i]);
+			normal = normalizeSafe(normal, Vector3(zero), 0.0f);
+			
+			// Store at the pixel index i. The original wrote at the sample index d,
+			// which overwrote a single pixel w*h times and left the rest stale.
+			dotImg.setPixel(dot(dir, normal), i);
+		}
+		
+		// @@ It would be nice to have a fastDownSample that took an existing image as an argument, to avoid allocations.
+		AutoPtr<FloatImage> dotMip(dotImg.fastDownSample());
+		
+		for(uint p = 0; p < hw*hh; p++)
+		{
+			float f = dotMip->pixel(p);
+			
+			// Project irradiance to sh basis and accumulate.
+			for (uint i = 0; i < 9; i++)
+			{
+				float & sum = shImg.channel(i)[p];
+				sum += f * basis.elemAt(i);
+			}
+		}
+	}
+	
+	
+	
+	FloatImage * normalMipmap = new FloatImage;
+	normalMipmap->allocate(4, hw, hh);
+	
+	// Precompute the clamped cosine radiance transfer.
+	Sh2 prt;
+	prt.cosineTransfer();
+	
+	// Allocate outside the loop.
+	Sh2 sh;
+	
+	for(uint p = 0; p < hw*hh; p++)
+	{
+		for (uint i = 0; i < 9; i++)
+		{
+			sh.elemAt(i) = shImg.channel(i)[p];
+		}
+		
+		// Convolve sh irradiance by radiance transfer.
+		sh *= prt;
+		
+		// Now sh(0) is the ambient occlusion.
+		// and sh(1) is the normal direction.
+		
+		// Should we use SVD to fit only the normals to the SH?
+		
+		// TODO: nothing is written to normalMipmap yet (see the note above the function).
+	}
+	
+	return normalMipmap;
+}
+
--- a/src/nvimage/NormalMipmap.h
+++ b/src/nvimage/NormalMipmap.h
@ -0,0 +1,17 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_NORMALMIPMAP_H
+#define NV_IMAGE_NORMALMIPMAP_H
+
+#include <nvimage/nvimage.h>
+
+
+namespace nv
+{
+	class FloatImage;
+
+	/// Create a half-resolution normal mipmap from the given normal map.
+	/// @param img  Source image; must not be NULL. Channels 0-2 are read as the normal's x, y, z.
+	/// @return     Newly allocated 4-channel image of size (width/2, height/2); the caller owns it.
+	FloatImage * createNormalMipmapMap(const FloatImage * img);
+
+} // nv namespace
+
+#endif // NV_IMAGE_NORMALMIPMAP_H
--- a/src/nvimage/TiledImage.cpp
+++ b/src/nvimage/TiledImage.cpp
@ -1,102 +0,0 @@
-// This code is in the public domain -- castano@gmail.com
-
-#include "TiledImage.h"
-
-#include <nvcore/StdStream.h>
-
-
-using namespace nv;
-
-namespace
-{
-	// MRU helpers:
-	// ...
-
-
-}
-
-
-bool Tile::load(const char * name)
-{
-	StdInputStream stream(name);
-
-	if (stream.isError()) {
-		return false;
-	}
-
-	uint header;
-	stream << header;
-
-	if (header == 'NVTC') {
-		return false;
-	}
-
-	uint count;
-	stream << count;
-
-	if (count != w*h) {
-		return false;
-	}
-
-	const uint size = count * sizeof(float);
-
-	return stream.serialize(data, size) == size;
-}
-
-
-bool Tile::unload(const char * name)
-{
-	StdOutputStream stream(name);
-
-	if (stream.isError()) {
-		return false;
-	}
-
-	uint header = 'NVTC';
-	uint count = w * h;
-	const uint size = w * h * sizeof(float);
-
-	stream << header << count;
-
-	return stream.serialize(data, size) == size;
-}
-
-
-
-
-
-TiledImage::TiledImage()
-{
-}
-
-void TiledImage::allocate(uint c, uint w, uint h, uint pageCount)
-{
-	// Allocate page map:
-	const uint pw = ((w + TILE_SIZE - 1) / TILE_SIZE);
-	const uint ph = ((h + TILE_SIZE - 1) / TILE_SIZE);
-	const uint size = c * pw * ph;
-	m_pageMap.resize(size);
-
-	m_residentArray.resize(pageCount, ~0);
-}
-
-void TiledImage::prefetch(uint c, uint x, uint y)
-{
-}
-
-void TiledImage::prefetch(uint c, uint x, uint y, uint w, uint h)
-{
-}
-
-void TiledImage::loadPage(uint x, uint y)
-{
-	const uint pw = ((w + TILE_SIZE - 1) / TILE_SIZE);
-	const uint ph = ((h + TILE_SIZE - 1) / TILE_SIZE);
-
-	nvDebugCheck(x < pw);
-	nvDebugCheck(y < ph);
-
-
-}
-
-
--- a/src/nvimage/TiledImage.h
+++ b/src/nvimage/TiledImage.h
@ -1,152 +0,0 @@
-// This code is in the public domain -- castano@gmail.com
-
-#ifndef NV_IMAGE_TILEDIMAGE_H
-#define NV_IMAGE_TILEDIMAGE_H
-
-#include <nvcore/Debug.h>
-#include <nvcore/StrLib.h>
-
-#include <nvimage/nvimage.h>
-
-// For simplicity the tile size is fixed at compile time.
-#define TILE_SIZE 256
-
-// 256 * 256 * 4 = 2^(8+8+2) = 2^18 = 256 KB
-// 512 * 512 * 4 = 2^(9+9+2) = 2^20 = 1 MB
-
-
-namespace nv
-{
-#if 0
-	struct ImageConcept
-	{
-		float pixel(uint x, uint y) const;
-	};
-
-	enum WrapMode {
-		WrapMode_Clamp,
-		WrapMode_Repeat,
-		WrapMode_Mirror
-	};
-
-	template <class T>
-	class Sampler
-	{
-		// ...
-	};
-#endif
-
-
-	class Tile
-	{
-		Tile(uint x, uint y, uint w, uint h) : xoffset(x), yoffset(y), w(w), h(h)
-		{
-			data = new float[w*h];
-		}
-		~Tile()
-		{
-			delete [] data;
-		}
-
-		uint size() const
-		{
-			return w * h * sizeof(float);
-		}
-		
-		float pixel(uint x, uint y) const
-		{
-			x -= xoffset;
-			y -= yoffset;
-			
-			nvDebugCheck (x < w);
-			nvDebugCheck (y < h);
-			
-			return data[y * w + x];
-		}
-
-		bool load(const char * name);
-		void unload(const char * name);
-
-
-		uint xoffset, yoffset;
-		uint w, h;
-		float * data;
-	};
-
-
-	class TiledImage
-	{
-	public:
-		
-		TiledImage();
-
-		void allocate(uint c, uint w, uint h, uint pageCount);
-
-		uint componentCount() const { return m_componentCount; }
-		uint width() const { return m_width; }
-		uint height() const { return m_height; }
-		uint pageCount() const { return m_residentArray.count(); }
-
-		void prefetch(uint c, uint x, uint y);
-		void prefetch(uint c, uint x, uint y, uint w, uint h);
-
-		float pixel(uint c, uint x, uint y);
-
-	private:
-		Tile * tileAt(uint c, uint x, uint y);
-		Tile * tileAt(uint idx);
-
-		uint loadPage(uint x, uint y);
-		void unloadPage(Tile *);
-
-		uint addAndReplace(uint newPage);
-		
-	private:
-		uint m_componentCount;
-		uint m_width;
-		uint m_height;
-
-		struct Page {
-			Page() : tile(NULL) {}
-
-			String tmpFileName;
-			Tile * tile;
-		};
-
-		mutable Array<Page> m_pageMap;
-		mutable Array<uint> m_residentArray; // MRU
-	};
-
-	inline float TiledImage::pixel(uint c, uint x, uint y)
-	{
-		nvDebugCheck (c < m_componentCount);
-		nvDebugCheck (x < m_width);
-		nvDebugCheck (y < m_height);
-
-		uint px = x / TILE_SIZE;
-		uint py = y / TILE_SIZE;
-
-		Tile * tile = tileAt(c, px, py);
-
-		if (tile == NULL) {
-			tile = loadPage(c, px, py);
-		}
-
-		return tile->pixel(x, y);
-	}
-	
-	inline Tile * TiledImage::tileAt(uint c, uint x, uint y)
-	{
-		uint idx = (c * h  + y) * w + x;
-		return tileAt(idx);
-	}
-	inline Tile * TiledImage::tileAt(uint idx)
-	{
-		return m_pageMap[idx].tile;
-	}
-	
-} // nv namespace
-
-
-
-#endif // NV_IMAGE_TILEDIMAGE_H
--- a/src/nvmath/Basis.cpp
+++ b/src/nvmath/Basis.cpp
@ -0,0 +1,173 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvmath/Basis.h>
+
+using namespace nv;
+
+
+/// Normalize the normal, tangent and bitangent, in that order, each on its own
+/// (the vectors are not made orthogonal to each other).
+void Basis::normalize(float epsilon /*= NV_EPSILON*/)
+{
+	Vector3 * const axes[3] = { &normal, &tangent, &bitangent };
+
+	for (int i = 0; i < 3; i++)
+	{
+		*axes[i] = ::normalize(*axes[i], epsilon);
+	}
+}
+
+
+/// Gram-Schmidt orthonormalization: the normal keeps its direction, the tangent is made
+/// orthogonal to the normal, and the bitangent is made orthogonal to both.
+/// @note Works only if the vectors are close to orthogonal.
+void Basis::orthonormalize(float epsilon /*= NV_EPSILON*/)
+{
+	// N' = |N|
+	normal = ::normalize(normal, epsilon);
+
+	// T' = |T - (N' dot T) N'|
+	const float normalDotTangent = dot(normal, tangent);
+	tangent -= normal * normalDotTangent;
+	tangent = ::normalize(tangent, epsilon);
+
+	// B' = |B - (N' dot B) N' - (T' dot B) T'|
+	// The tangent projection is measured after the normal component has been removed.
+	const float normalDotBitangent = dot(normal, bitangent);
+	bitangent -= normal * normalDotBitangent;
+
+	const float tangentDotBitangent = dot(tangent, bitangent);
+	bitangent -= tangent * tangentDotBitangent;
+
+	bitangent = ::normalize(bitangent, epsilon);
+}
+
+
+/// Robust orthonormalization. 
+/// Returns an orthonormal basis even when the original is degenerate.
+///
+/// The normal is kept when it is usable; otherwise it is rebuilt from tangent x bitangent,
+/// and if that is degenerate too the canonical axes are returned. The tangent and
+/// bitangent are then projected onto the plane of the normal, and whichever of them
+/// collapses (length below epsilon) is rebuilt from the other one and the normal.
+/// @param epsilon  Length below which a vector is treated as degenerate; also the
+///                 tolerance of the final consistency checks.
+void Basis::robustOrthonormalize(float epsilon /*= NV_EPSILON*/)
+{
+	if (length(normal) < epsilon)
+	{
+		normal = cross(tangent, bitangent);
+		
+		if (length(normal) < epsilon)
+		{
+			// Nothing usable at all: fall back to the canonical frame.
+			tangent = Vector3(1, 0, 0);
+			bitangent = Vector3(0, 1, 0);
+			normal = Vector3(0, 0, 1);
+			return;
+		}
+	}
+	normal = ::normalize(normal, epsilon);
+	
+	// Remove the normal component from both tangent vectors.
+	tangent -= normal * dot(normal, tangent);
+	bitangent -= normal * dot(normal, bitangent);
+	
+	if (length(tangent) < epsilon)
+	{
+		if (length(bitangent) < epsilon)
+		{
+			// Both tangent vectors are degenerate: pick an arbitrary frame around the normal.
+			buildFrameForDirection(normal);
+		}
+		else
+		{
+			// The bitangent is orthogonal to the normal but not unit length yet. Normalize
+			// it first, otherwise neither it nor the cross product below is normalized.
+			bitangent = ::normalize(bitangent, epsilon);
+			tangent = cross(bitangent, normal);
+			nvCheck(isNormalized(tangent, epsilon));
+		}
+	}
+	else
+	{
+		tangent = ::normalize(tangent, epsilon);
+		bitangent -= tangent * dot(tangent, bitangent);
+		
+		if (length(bitangent) < epsilon)
+		{
+			bitangent = cross(tangent, normal);
+			nvCheck(isNormalized(bitangent));
+		}
+		else
+		{
+			// The tangent is already unit length here; it is the bitangent that still has
+			// to be normalized (the original normalized the tangent a second time).
+			bitangent = ::normalize(bitangent, epsilon);
+		}
+	}
+	
+	// Check vector lengths.
+	nvCheck(isNormalized(normal, epsilon));
+	nvCheck(isNormalized(tangent, epsilon));
+	nvCheck(isNormalized(bitangent, epsilon));
+
+	// Check vector angles.
+	nvCheck(equal(dot(normal, tangent), 0.0f, epsilon));
+	nvCheck(equal(dot(normal, bitangent), 0.0f, epsilon));
+	nvCheck(equal(dot(tangent, bitangent), 0.0f, epsilon));
+
+	// Check vector orientation.
+	const float det = dot(cross(normal, tangent), bitangent);
+	nvCheck(equal(det, 1.0f, epsilon) || equal(det, -1.0f, epsilon));
+}
+
+
+/// Build an arbitrary frame whose normal is the given direction.
+/// @param d  Direction to use as the normal; must be normalized.
+void Basis::buildFrameForDirection(Vector3::Arg d)
+{
+	nvCheck(isNormalized(d));
+	normal = d;
+
+	// Start from a coordinate axis along which the normal is small: x when |x| is
+	// strictly below both others, otherwise y when |y| < |z|, otherwise z.
+	const float ax = fabsf(normal.x());
+	const float ay = fabsf(normal.y());
+	const float az = fabsf(normal.z());
+
+	const bool xIsSmallest = (ax < ay) && (ax < az);
+	if (xIsSmallest)
+	{
+		tangent = Vector3(1, 0, 0);
+	}
+	else
+	{
+		tangent = (ay < az) ? Vector3(0, 1, 0) : Vector3(0, 0, 1);
+	}
+
+	// Orthogonalize the chosen axis against the normal, then renormalize it.
+	const float normalDotTangent = dot(normal, tangent);
+	tangent -= normal * normalDotTangent;
+	tangent = ::normalize(tangent);
+
+	// The remaining axis completes the frame.
+	bitangent = cross(normal, tangent);
+}
+
+
+
+/*
+/// Transform by this basis. (From this basis to object space).
+Vector3 Basis::transform(Vector3::Arg v) const
+{
+	Vector3 o = tangent * v.x();
+	o += bitangent * v.y();
+	o += normal * v.z();
+	return o;
+}
+
+/// Transform by the transpose. (From object space to this basis).
+Vector3 Basis::transformT(Vector3::Arg v)
+{
+	return Vector3(dot(tangent, v), dot(bitangent, v), dot(normal, v));
+}
+
+/// Transform by the inverse. (From object space to this basis).
+/// @note Uses Cramer's rule so the inverse is not accurate if the basis is ill-conditioned.
+Vector3 Basis::transformI(Vector3::Arg v) const
+{
+	const float det = determinant();
+	nvCheck(!equalf(det, 0.0f));
+	
+	const float idet = 1.0f / det;
+
+	// Rows of the inverse matrix.
+	Vector3 r0, r1, r2;
+	r0.x =  (bitangent.y() * normal.z() - bitangent.z() * normal.y()) * idet;
+	r0.y = -(bitangent.x() * normal.z() - bitangent.z() * normal.x()) * idet;
+	r0.z =  (bitangent.x() * normal.y() - bitangent.y() * normal.x()) * idet;
+
+	r1.x = -(tangent.y() * normal.z() - tangent.z() * normal.y()) * idet;
+	r1.y =  (tangent.x() * normal.z() - tangent.z() * normal.x()) * idet;
+	r1.z = -(tangent.x() * normal.y() - tangent.y() * normal.x()) * idet;
+
+	r2.x =  (tangent.y() * bitangent.z() - tangent.z() * bitangent.y()) * idet;
+	r2.y = -(tangent.x() * bitangent.z() - tangent.z() * bitangent.x()) * idet;
+	r2.z =  (tangent.x() * bitangent.y() - tangent.y() * bitangent.x()) * idet;
+
+	return Vector3(dot(v, r0), dot(v, r1), dot(v, r2));
+}	
+*/
+
+
--- a/src/nvmath/Basis.h
+++ b/src/nvmath/Basis.h
@ -0,0 +1,78 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_BASIS_H
+#define NV_MATH_BASIS_H
+
+#include <nvmath/nvmath.h>
+#include <nvmath/Vector.h>
+#include <nvmath/Matrix.h>
+
+namespace nv
+{
+
+	/// Tangent-space frame (tangent, bitangent, normal) with helpers to build it,
+	/// orthogonalize it and query its handedness.
+	struct Basis
+	{
+		/// Null basis: all three vectors are zero.
+		Basis() : tangent(0, 0, 0), bitangent(0, 0, 0), normal(0, 0, 0) {}
+
+		/// Basis from an explicit normal, tangent and bitangent.
+		Basis(Vector3::Arg n, Vector3::Arg t, Vector3::Arg b) : tangent(t), bitangent(b), normal(n) {}
+
+		/// Basis from a normal, a tangent and a handedness sign; the bitangent is
+		/// derived as sign * cross(t, n), exactly as build() does.
+		Basis(Vector3::Arg n, Vector3::Arg t, float sign) : tangent(t), bitangent(sign * cross(t, n)), normal(n) {}
+
+		NVMATH_API void normalize(float epsilon = NV_EPSILON);
+		NVMATH_API void orthonormalize(float epsilon = NV_EPSILON);
+		NVMATH_API void robustOrthonormalize(float epsilon = NV_EPSILON);
+		NVMATH_API void buildFrameForDirection(Vector3::Arg d);
+
+		/// Handedness of the basis: +1 when the determinant is positive, -1 otherwise
+		/// (including a zero determinant).
+		float handness() const
+		{
+			if (determinant() > 0.0f)
+			{
+				return 1.0f;
+			}
+			return -1.0f;
+		}
+
+		/// Rebuild the basis from a normal, a tangent and a handedness sign.
+		void build(Vector3::Arg n, Vector3::Arg t, float sign)
+		{
+			normal = n;
+			tangent = t;
+
+			const Vector3 tangentCrossNormal = cross(t, n);
+			bitangent = sign * tangentCrossNormal;
+		}
+
+		/// Determinant of the matrix whose rows are tangent, bitangent and normal.
+		float determinant() const
+		{
+			return 
+				tangent.x() * bitangent.y() * normal.z() - tangent.z() * bitangent.y() * normal.x() +
+				tangent.y() * bitangent.z() * normal.x() - tangent.y() * bitangent.x() * normal.z() + 
+				tangent.z() * bitangent.x() * normal.y() - tangent.x() * bitangent.z() * normal.y();
+		}
+		
+		/*
+		// Get transform matrix for this basis.
+		NVMATH_API Matrix matrix() const;
+		
+		// Transform by this basis. (From this basis to object space).
+		NVMATH_API Vector3 transform(Vector3::Arg v) const;
+
+		// Transform by the transpose. (From object space to this basis).
+		NVMATH_API Vector3 transformT(Vector3::Arg v);
+
+		// Transform by the inverse. (From object space to this basis).
+		NVMATH_API Vector3 transformI(Vector3::Arg v) const;
+		*/
+		
+		Vector3 tangent;
+		Vector3 bitangent;
+		Vector3 normal;
+	};
+
+} // nv namespace
+
+#endif // NV_MATH_BASIS_H
--- a/src/nvmath/Box.h
+++ b/src/nvmath/Box.h
@ -9,7 +9,6 @@

 namespace nv
 {
-class Stream;

 /// Axis Aligned Bounding Box.
 class Box
@ -28,13 +27,11 @@ public:
 	// Cast operators.
 	operator const float * () const { return reinterpret_cast<const float *>(this); }

-	// Min corner of the box.
-	Vector3 minCorner() const { return m_mins; }
-	Vector3 & minCorner() { return m_mins; }
+	/// Min corner of the box.
+	Vector3 mins() const { return m_mins; }

-	// Max corner of the box.
-	Vector3 maxCorner() const { return m_maxs; }
-	Vector3 & maxCorner() { return m_maxs; }
+	/// Max corner of the box.
+	Vector3 maxs() const { return m_maxs; }

 	/// Clear the bounds.
 	void clearBounds()
@ -129,8 +126,6 @@ public:
 			m_maxs.x() > p.x() && m_maxs.y() > p.y() && m_maxs.z() > p.z();
 	}

-	friend Stream & operator<< (Stream & s, Box & box);
-
 private:

 	Vector3 m_mins;
--- a/src/nvmath/CMakeLists.txt
+++ b/src/nvmath/CMakeLists.txt
@ -1,14 +1,17 @@
 PROJECT(nvmath)

 SET(MATH_SRCS
-    nvmath.h
-    Vector.h
-    Matrix.h
-    Plane.h Plane.cpp
-    Box.h
-    Color.h
-    Half.h Half.cpp
-    Fitting.h Fitting.cpp)
+	nvmath.h
+	Vector.h
+	Matrix.h
+	Quaternion.h
+	Box.h
+	Color.h
+	Montecarlo.h Montecarlo.cpp
+	Random.h Random.cpp
+	SphericalHarmonic.h SphericalHarmonic.cpp
+	Basis.h Basis.cpp
+	Triangle.h Triangle.cpp TriBox.cpp)

 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

@ -16,15 +19,15 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 ADD_DEFINITIONS(-DNVMATH_EXPORTS)

 IF(NVMATH_SHARED)
-    ADD_DEFINITIONS(-DNVMATH_SHARED=1)
-    ADD_LIBRARY(nvmath SHARED ${MATH_SRCS})
+	ADD_DEFINITIONS(-DNVMATH_SHARED=1)
+	ADD_LIBRARY(nvmath SHARED ${MATH_SRCS})
 ELSE(NVMATH_SHARED)
-    ADD_LIBRARY(nvmath ${MATH_SRCS})
+	ADD_LIBRARY(nvmath ${MATH_SRCS})
 ENDIF(NVMATH_SHARED)

 TARGET_LINK_LIBRARIES(nvmath ${LIBS} nvcore)

 INSTALL(TARGETS nvmath
-    RUNTIME DESTINATION bin
-    LIBRARY DESTINATION lib
-    ARCHIVE DESTINATION lib/static)
+	RUNTIME DESTINATION bin
+	LIBRARY DESTINATION lib
+	ARCHIVE DESTINATION lib/static)
--- a/src/nvmath/Fitting.cpp
+++ b/src/nvmath/Fitting.cpp
@ -1,247 +0,0 @@
-// This code is in the public domain -- icastano@gmail.com
-
-#include "Fitting.h"
-
-#include <nvcore/Algorithms.h> // max
-#include <nvcore/Containers.h> // swap
-
-#include <float.h> // FLT_MAX
-
-using namespace nv;
-
-// @@ Move to EigenSolver.h
-static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix)
-{
-	if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0)
-	{
-		return Vector3(zero);
-	}
-	
-	const int NUM = 8;
-
-	Vector3 v(1, 1, 1);
-	for (int i = 0; i < NUM; i++)
-	{
-		float x = v.x() * matrix[0] + v.y() * matrix[1] + v.z() * matrix[2];
-		float y = v.x() * matrix[1] + v.y() * matrix[3] + v.z() * matrix[4];
-		float z = v.x() * matrix[2] + v.y() * matrix[4] + v.z() * matrix[5];
-		
-		float norm = max(max(x, y), z);
-	
-		v = Vector3(x, y, z) / norm;
-	}
-
-	return v;	
-}
-
-
-Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points)
-{
-	Vector3 centroid(zero);
-
-	for (int i = 0; i < n; i++)
-	{
-		centroid += points[i];
-	}
-	centroid /= float(n);
-
-	return centroid;
-}
-
-Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
-{
-	Vector3 centroid(zero);
-	float total = 0.0f;
-
-	for (int i = 0; i < n; i++)
-	{
-		total += weights[i];
-		centroid += weights[i]*points[i];
-	}
-	centroid /= total;
-
-	return centroid;
-}
-
-
-Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, float *__restrict covariance)
-{
-	// compute the centroid
-	Vector3 centroid = computeCentroid(n, points);
-
-	// compute covariance matrix
-	for (int i = 0; i < 6; i++)
-	{
-		covariance[i] = 0.0f;
-	}
-
-	for (int i = 0; i < n; i++)
-	{
-		Vector3 v = points[i] - centroid;
-		
-		covariance[0] += v.x() * v.x();
-		covariance[1] += v.x() * v.y();
-		covariance[2] += v.x() * v.z();
-		covariance[3] += v.y() * v.y();
-		covariance[4] += v.y() * v.z();
-		covariance[5] += v.z() * v.z();
-	}
-
-	return centroid;
-}
-
-Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, float *__restrict covariance)
-{
-	// compute the centroid
-	Vector3 centroid = computeCentroid(n, points, weights, metric);
-
-	// compute covariance matrix
-	for (int i = 0; i < 6; i++)
-	{
-		covariance[i] = 0.0f;
-	}
-
-	for (int i = 0; i < n; i++)
-	{
-		Vector3 a = (points[i] - centroid) * metric;
-		Vector3 b = weights[i]*a;
-		
-		covariance[0] += a.x()*b.x();
-		covariance[1] += a.x()*b.y();
-		covariance[2] += a.x()*b.z();
-		covariance[3] += a.y()*b.y();
-		covariance[4] += a.y()*b.z();
-		covariance[5] += a.z()*b.z();
-	}
-
-	return centroid;
-}
-
-Vector3 nv::Fit::computePrincipalComponent(int n, const Vector3 *__restrict points)
-{
-	float matrix[6];
-	computeCovariance(n, points, matrix);
-
-	return firstEigenVector_PowerMethod(matrix);
-}
-
-Vector3 nv::Fit::computePrincipalComponent(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
-{
-	float matrix[6];
-	computeCovariance(n, points, weights, metric, matrix);
-
-	return firstEigenVector_PowerMethod(matrix);
-}
-
-
-Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points)
-{
-	// compute the centroid and covariance
-	float matrix[6];
-	Vector3 centroid = computeCovariance(n, points, matrix);
-
-	if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0)
-	{
-		// If no plane defined, then return a horizontal plane.
-		return Plane(Vector3(0, 0, 1), centroid);
-	}
-
-#pragma message(NV_FILE_LINE "TODO: need to write an eigensolver!")
-
-	// - Numerical Recipes in C is a good reference. Householder transforms followed by QL decomposition seems to be the best approach.
-	// - The one from magic-tools is now LGPL. For the 3D case it uses a cubic root solver, which is not very accurate.
-	// - Charles' Galaxy3 contains an implementation of the tridiagonalization method, but is under BPL.
-
-	//EigenSolver3 solver(matrix);
-
-	return Plane();
-}
-
-
-int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster)
-{
-	// Compute principal component.
-	float matrix[6];
-	Vector3 centroid = computeCovariance(n, points, weights, metric, matrix);
-	Vector3 principal = firstEigenVector_PowerMethod(matrix);
-	
-	// Pick initial solution.
-	int mini, maxi;
-	mini = maxi = 0;
-	
-	float mindps, maxdps;
-	mindps = maxdps = dot(points[0] - centroid, principal);
-	
-	for (int i = 1; i < n; ++i)
-	{
-		float dps = dot(points[i] - centroid, principal);
-		
-		if (dps < mindps) {
-			mindps = dps;
-			mini = i;
-		}
-		else {
-			maxdps = dps;
-			maxi = i;
-		}
-	}
-
-	cluster[0] = centroid + mindps * principal;
-	cluster[1] = centroid + maxdps * principal;
-	cluster[2] = (2 * cluster[0] + cluster[1]) / 3;
-	cluster[3] = (2 * cluster[1] + cluster[0]) / 3;
-
-	// Now we have to iteratively refine the clusters.
-	while (true)
-	{
-		Vector3 newCluster[4] = { Vector3(zero), Vector3(zero), Vector3(zero), Vector3(zero) };
-		float total[4] = {0, 0, 0, 0};
-		
-		for (int i = 0; i < n; ++i)
-		{
-			// Find nearest cluster.
-			int nearest = 0;
-			float mindist = FLT_MAX;
-			for (int j = 0; j < 4; j++)
-			{
-				float dist = length_squared((cluster[j] - points[i]) * metric);
-				if (dist < mindist)
-				{
-					mindist = dist;
-					nearest = j;
-				}
-			}
-			
-			newCluster[nearest] += weights[i] * points[i];
-			total[nearest] += weights[i];
-		}
-
-		for (int j = 0; j < 4; j++)
-		{
-            if (total[j] != 0)
-			    newCluster[j] /= total[j];
-		}
-
-		if (equal(cluster[0], newCluster[0]) && equal(cluster[1], newCluster[1]) && 
-			equal(cluster[2], newCluster[2]) && equal(cluster[3], newCluster[3]))
-		{
-			return (total[0] != 0) + (total[1] != 0) + (total[2] != 0) + (total[3] != 0);
-		}
-
-		cluster[0] = newCluster[0];
-		cluster[1] = newCluster[1];
-		cluster[2] = newCluster[2];
-		cluster[3] = newCluster[3];
-
-		// Sort clusters by weight.
-		for (int i = 0; i < 4; i++)
-		{
-			for (int j = i; j > 0 && total[j] > total[j - 1]; j--)
-			{
-				swap( total[j], total[j - 1] );
-				swap( cluster[j], cluster[j - 1] );
-			}
-		}
-	}
-}
-
--- a/src/nvmath/Fitting.h
+++ b/src/nvmath/Fitting.h
@ -1,31 +0,0 @@
-// This code is in the public domain -- icastano@gmail.com
-
-#ifndef NV_MATH_FITTING_H
-#define NV_MATH_FITTING_H
-
-#include <nvmath/nvmath.h>
-#include <nvmath/Vector.h>
-#include <nvmath/Plane.h>
-
-namespace nv
-{
-	namespace Fit
-	{
-		Vector3 computeCentroid(int n, const Vector3 * points);
-		Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, Vector3::Arg metric);
-
-		Vector3 computeCovariance(int n, const Vector3 * points, float * covariance);
-		Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, Vector3::Arg metric, float * covariance);
-
-		Vector3 computePrincipalComponent(int n, const Vector3 * points);
-		Vector3 computePrincipalComponent(int n, const Vector3 * points, const float * weights, Vector3::Arg metric);
-
-		Plane bestPlane(int n, const Vector3 * points);
-
-		// Returns number of clusters [1-4].
-		int compute4Means(int n, const Vector3 * points, const float * weights, Vector3::Arg metric, Vector3 * cluster);
-	}
-
-} // nv namespace
-
-#endif // NV_MATH_FITTING_H
--- a/src/nvmath/Half.cpp
+++ b/src/nvmath/Half.cpp
@ -1,563 +0,0 @@
-// Branch-free implementation of half-precision (16 bit) floating point
-// Copyright 2006 Mike Acton <macton@gmail.com>
-// 
-// Permission is hereby granted, free of charge, to any person obtaining a 
-// copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense, 
-// and/or sell copies of the Software, and to permit persons to whom the 
-// Software is furnished to do so, subject to the following conditions:
-// 
-// The above copyright notice and this permission notice shall be included 
-// in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE
-//
-// Half-precision floating point format
-// ------------------------------------
-//
-//   | Field    | Last | First | Note
-//   |----------|------|-------|----------
-//   | Sign     | 15   | 15    |
-//   | Exponent | 14   | 10    | Bias = 15
-//   | Mantissa | 9    | 0     |
-//
-// Compiling
-// ---------
-//
-//  Preferred compile flags for GCC: 
-//     -O3 -fstrict-aliasing -std=c99 -pedantic -Wall -Wstrict-aliasing
-//
-//     This file is a C99 source file, intended to be compiled with a C99 
-//     compliant compiler. However, for the moment it remains combatible
-//     with C++98. Therefore if you are using a compiler that poorly implements
-//     C standards (e.g. MSVC), it may be compiled as C++. This is not
-//     guaranteed for future versions. 
-//
-// Features
-// --------
-//
-//  * QNaN + <x>  = QNaN
-//  * <x>  + +INF = +INF
-//  * <x>  - -INF = -INF
-//  * INF  - INF  = SNaN
-//  * Denormalized values
-//  * Difference of ZEROs is always +ZERO
-//  * Sum round with guard + round + sticky bit (grs)
-//  * And of course... no branching
-// 
-// Precision of Sum
-// ----------------
-//
-//  (SUM)        uint16 z = half_add( x, y );
-//  (DIFFERENCE) uint16 z = half_add( x, -y );
-//
-//     Will have exactly (0 ulps difference) the same result as:
-//     (For 32 bit IEEE 784 floating point and same rounding mode)
-//
-//     union FLOAT_32
-//     {
-//       float    f32;
-//       uint32 u32;
-//     };
-//
-//     union FLOAT_32 fx = { .u32 = half_to_float( x ) };
-//     union FLOAT_32 fy = { .u32 = half_to_float( y ) };
-//     union FLOAT_32 fz = { .f32 = fx.f32 + fy.f32    };
-//     uint16       z  = float_to_half( fz );
-//
-
-#include "Half.h"
-#include <stdio.h>
-
-// Load immediate
-static inline uint32 _uint32_li( uint32 a )
-{
-  return (a);
-}
-
-// Decrement
-static inline uint32 _uint32_dec( uint32 a )
-{
-  return (a - 1);
-}
-
-// Complement
-static inline uint32 _uint32_not( uint32 a )
-{
-  return (~a);
-}
-
-// Negate
-static inline uint32 _uint32_neg( uint32 a )
-{
-#if NV_CC_MSVC
-  // prevent msvc warning.
-  return ~a + 1;
-#else
-  return (-a);
-#endif
-}
-
-// Extend sign
-static inline uint32 _uint32_ext( uint32 a )
-{
-  return (((int32)a)>>31);
-}
-
-// And
-static inline uint32 _uint32_and( uint32 a, uint32 b )
-{
-  return (a & b);
-}
-
-// And with Complement
-static inline uint32 _uint32_andc( uint32 a, uint32 b )
-{
-  return (a & ~b);
-}
-
-// Or
-static inline uint32 _uint32_or( uint32 a, uint32 b )
-{
-  return (a | b);
-}
-
-// Shift Right Logical
-static inline uint32 _uint32_srl( uint32 a, int sa )
-{
-  return (a >> sa);
-}
-
-// Shift Left Logical
-static inline uint32 _uint32_sll( uint32 a, int sa )
-{
-  return (a << sa);
-}
-
-// Add
-static inline uint32 _uint32_add( uint32 a, uint32 b )
-{
-  return (a + b);
-}
-
-// Subtract
-static inline uint32 _uint32_sub( uint32 a, uint32 b )
-{
-  return (a - b);
-}
-
-// Select on Sign bit
-static inline uint32 _uint32_sels( uint32 test, uint32 a, uint32 b )
-{
-  const uint32 mask   = _uint32_ext( test );
-  const uint32 sel_a  = _uint32_and(  a,     mask  );
-  const uint32 sel_b  = _uint32_andc( b,     mask  );
-  const uint32 result = _uint32_or(   sel_a, sel_b );
-
-  return (result);
-}
-
-// Load Immediate
-static inline uint16 _uint16_li( uint16 a )
-{
-  return (a);
-}
-
-// Extend sign
-static inline uint16 _uint16_ext( uint16 a )
-{
-  return (((int16)a)>>15);
-}
-
-// Negate
-static inline uint16 _uint16_neg( uint16 a )
-{
-  return (-a);
-}
-
-// Complement
-static inline uint16 _uint16_not( uint16 a )
-{
-  return (~a);
-}
-
-// Decrement
-static inline uint16 _uint16_dec( uint16 a )
-{
-  return (a - 1);
-}
-
-// Shift Left Logical
-static inline uint16 _uint16_sll( uint16 a, int sa )
-{
-  return (a << sa);
-}
-
-// Shift Right Logical
-static inline uint16 _uint16_srl( uint16 a, int sa )
-{
-  return (a >> sa);
-}
-
-// Add
-static inline uint16 _uint16_add( uint16 a, uint16 b )
-{
-  return (a + b);
-}
-
-// Subtract
-static inline uint16 _uint16_sub( uint16 a, uint16 b )
-{
-  return (a - b);
-}
-
-// And
-static inline uint16 _uint16_and( uint16 a, uint16 b )
-{
-  return (a & b);
-}
-
-// Or
-static inline uint16 _uint16_or( uint16 a, uint16 b )
-{
-  return (a | b);
-}
-
-// Exclusive Or
-static inline uint16 _uint16_xor( uint16 a, uint16 b )
-{
-  return (a ^ b);
-}
-
-// And with Complement
-static inline uint16 _uint16_andc( uint16 a, uint16 b )
-{
-  return (a & ~b);
-}
-
-// And then Shift Right Logical
-static inline uint16 _uint16_andsrl( uint16 a, uint16 b, int sa )
-{
-  return ((a & b) >> sa);
-}
-
-// Shift Right Logical then Mask
-static inline uint16 _uint16_srlm( uint16 a, int sa, uint16 mask )
-{
-  return ((a >> sa) & mask);
-}
-
-// Add then Mask
-static inline uint16 _uint16_addm( uint16 a, uint16 b, uint16 mask )
-{
-  return ((a + b) & mask);
-}
-
-
-// Select on Sign bit
-static inline uint16 _uint16_sels( uint16 test, uint16 a, uint16 b )
-{
-  const uint16 mask   = _uint16_ext( test );
-  const uint16 sel_a  = _uint16_and(  a,     mask  );
-  const uint16 sel_b  = _uint16_andc( b,     mask  );
-  const uint16 result = _uint16_or(   sel_a, sel_b );
-
-  return (result);
-}
-
-// Count Leading Zeros
-static inline uint32 _uint32_cntlz( uint32 x )
-{
-#ifdef __GNUC__
-  /* On PowerPC, this will map to insn: cntlzw */
-  /* On Pentium, this will map to insn: clz    */
-  uint32 nlz = __builtin_clz( x );
-  return (nlz);
-#else
-  const uint32 x0  = _uint32_srl(  x,  1 );
-  const uint32 x1  = _uint32_or(   x,  x0 );
-  const uint32 x2  = _uint32_srl(  x1, 2 );
-  const uint32 x3  = _uint32_or(   x1, x2 );
-  const uint32 x4  = _uint32_srl(  x3, 4 );
-  const uint32 x5  = _uint32_or(   x3, x4 );
-  const uint32 x6  = _uint32_srl(  x5, 8 );
-  const uint32 x7  = _uint32_or(   x5, x6 );
-  const uint32 x8  = _uint32_srl(  x7, 16 );
-  const uint32 x9  = _uint32_or(   x7, x8 );
-  const uint32 xA  = _uint32_not(  x9 );
-  const uint32 xB  = _uint32_srl(  xA, 1 );
-  const uint32 xC  = _uint32_and(  xB, 0x55555555 );
-  const uint32 xD  = _uint32_sub(  xA, xC );
-  const uint32 xE  = _uint32_and(  xD, 0x33333333 );
-  const uint32 xF  = _uint32_srl(  xD, 2 );
-  const uint32 x10 = _uint32_and(  xF, 0x33333333 );
-  const uint32 x11 = _uint32_add(  xE, x10 );
-  const uint32 x12 = _uint32_srl(  x11, 4 );
-  const uint32 x13 = _uint32_add(  x11, x12 );
-  const uint32 x14 = _uint32_and(  x13, 0x0f0f0f0f );
-  const uint32 x15 = _uint32_srl(  x14, 8 );
-  const uint32 x16 = _uint32_add(  x14, x15 );
-  const uint32 x17 = _uint32_srl(  x16, 16 );
-  const uint32 x18 = _uint32_add(  x16, x17 );
-  const uint32 x19 = _uint32_and(  x18, 0x0000003f );
-  return ( x19 );
-#endif
-}
-
-// Count Leading Zeros
-static inline uint16 _uint16_cntlz( uint16 x )
-{
-#ifdef __GNUC__
-  /* On PowerPC, this will map to insn: cntlzw */
-  /* On Pentium, this will map to insn: clz    */
-  uint32 x32   = _uint32_sll( x, 16 );
-  uint16 nlz   = (uint16)__builtin_clz( x32 );
-  return (nlz);
-#else
-  const uint16 x0  = _uint16_srl(  x,  1 );
-  const uint16 x1  = _uint16_or(   x,  x0 );
-  const uint16 x2  = _uint16_srl(  x1, 2 );
-  const uint16 x3  = _uint16_or(   x1, x2 );
-  const uint16 x4  = _uint16_srl(  x3, 4 );
-  const uint16 x5  = _uint16_or(   x3, x4 );
-  const uint16 x6  = _uint16_srl(  x5, 8 );
-  const uint16 x7  = _uint16_or(   x5, x6 );
-  const uint16 x8  = _uint16_not(  x7 );
-  const uint16 x9  = _uint16_srlm( x8, 1, 0x5555 );
-  const uint16 xA  = _uint16_sub(  x8, x9 );
-  const uint16 xB  = _uint16_and(  xA, 0x3333 );
-  const uint16 xC  = _uint16_srlm( xA, 2, 0x3333 );
-  const uint16 xD  = _uint16_add(  xB, xC );
-  const uint16 xE  = _uint16_srl(  xD, 4 );
-  const uint16 xF  = _uint16_addm( xD, xE, 0x0f0f );
-  const uint16 x10 = _uint16_srl(  xF, 8 );
-  const uint16 x11 = _uint16_addm( xF, x10, 0x001f );
-  return ( x11 );
-#endif
-}
-
-uint16
-half_from_float( uint32 f )
-{
-  const uint32 one                        = _uint32_li( 0x00000001 );
-  const uint32 f_e_mask                   = _uint32_li( 0x7f800000 );
-  const uint32 f_m_mask                   = _uint32_li( 0x007fffff );
-  const uint32 f_s_mask                   = _uint32_li( 0x80000000 );
-  const uint32 h_e_mask                   = _uint32_li( 0x00007c00 );
-  const uint32 f_e_pos                    = _uint32_li( 0x00000017 );
-  const uint32 f_m_round_bit              = _uint32_li( 0x00001000 );
-  const uint32 h_nan_em_min               = _uint32_li( 0x00007c01 );
-  const uint32 f_h_s_pos_offset           = _uint32_li( 0x00000010 );
-  const uint32 f_m_hidden_bit             = _uint32_li( 0x00800000 );
-  const uint32 f_h_m_pos_offset           = _uint32_li( 0x0000000d );
-  const uint32 f_h_bias_offset            = _uint32_li( 0x38000000 );
-  const uint32 f_m_snan_mask              = _uint32_li( 0x003fffff );
-  const uint16 h_snan_mask                = _uint32_li( 0x00007e00 );
-  const uint32 f_e                        = _uint32_and( f, f_e_mask  );
-  const uint32 f_m                        = _uint32_and( f, f_m_mask  );
-  const uint32 f_s                        = _uint32_and( f, f_s_mask  );
-  const uint32 f_e_h_bias                 = _uint32_sub( f_e,               f_h_bias_offset );
-  const uint32 f_e_h_bias_amount          = _uint32_srl( f_e_h_bias,        f_e_pos         );
-  const uint32 f_m_round_mask             = _uint32_and( f_m,               f_m_round_bit     );
-  const uint32 f_m_round_offset           = _uint32_sll( f_m_round_mask,    one               );
-  const uint32 f_m_rounded                = _uint32_add( f_m,               f_m_round_offset  );
-  const uint32 f_m_rounded_overflow       = _uint32_and( f_m_rounded,       f_m_hidden_bit    );
-  const uint32 f_m_denorm_sa              = _uint32_sub( one,               f_e_h_bias_amount );
-  const uint32 f_m_with_hidden            = _uint32_or(  f_m_rounded,       f_m_hidden_bit    );
-  const uint32 f_m_denorm                 = _uint32_srl( f_m_with_hidden,   f_m_denorm_sa     );
-  const uint32 f_em_norm_packed           = _uint32_or(  f_e_h_bias,        f_m_rounded       );
-  const uint32 f_e_overflow               = _uint32_add( f_e_h_bias,        f_m_hidden_bit    );
-  const uint32 h_s                        = _uint32_srl( f_s,               f_h_s_pos_offset );
-  const uint32 h_m_nan                    = _uint32_srl( f_m,               f_h_m_pos_offset );
-  const uint32 h_m_denorm                 = _uint32_srl( f_m_denorm,        f_h_m_pos_offset );
-  const uint32 h_em_norm                  = _uint32_srl( f_em_norm_packed,  f_h_m_pos_offset );
-  const uint32 h_em_overflow              = _uint32_srl( f_e_overflow,      f_h_m_pos_offset );
-  const uint32 is_e_eqz_msb               = _uint32_dec(  f_e     );
-  const uint32 is_m_nez_msb               = _uint32_neg(  f_m     );
-  const uint32 is_h_m_nan_nez_msb         = _uint32_neg(  h_m_nan );
-  const uint32 is_e_nflagged_msb          = _uint32_sub(  f_e,                 f_e_mask          );
-  const uint32 is_ninf_msb                = _uint32_or(   is_e_nflagged_msb,   is_m_nez_msb      );
-  const uint32 is_underflow_msb           = _uint32_sub(  is_e_eqz_msb,        f_h_bias_offset   );
-  const uint32 is_nan_nunderflow_msb      = _uint32_or(   is_h_m_nan_nez_msb,  is_e_nflagged_msb );
-  const uint32 is_m_snan_msb              = _uint32_sub(  f_m_snan_mask,       f_m               );
-  const uint32 is_snan_msb                = _uint32_andc( is_m_snan_msb,       is_e_nflagged_msb );
-  const uint32 is_overflow_msb            = _uint32_neg(  f_m_rounded_overflow );
-  const uint32 h_nan_underflow_result     = _uint32_sels( is_nan_nunderflow_msb, h_em_norm,                h_nan_em_min       );
-  const uint32 h_inf_result               = _uint32_sels( is_ninf_msb,           h_nan_underflow_result,   h_e_mask           );
-  const uint32 h_underflow_result         = _uint32_sels( is_underflow_msb,      h_m_denorm,               h_inf_result       );
-  const uint32 h_overflow_result          = _uint32_sels( is_overflow_msb,       h_em_overflow,            h_underflow_result );
-  const uint32 h_em_result                = _uint32_sels( is_snan_msb,           h_snan_mask,              h_overflow_result  );
-  const uint32 h_result                   = _uint32_or( h_em_result, h_s );
-
-  return (h_result);
-}
-
-uint32 
-half_to_float( uint16 h )
-{
-  const uint32 h_e_mask              = _uint32_li( 0x00007c00 );
-  const uint32 h_m_mask              = _uint32_li( 0x000003ff );
-  const uint32 h_s_mask              = _uint32_li( 0x00008000 );
-  const uint32 h_f_s_pos_offset      = _uint32_li( 0x00000010 );
-  const uint32 h_f_e_pos_offset      = _uint32_li( 0x0000000d );
-  const uint32 h_f_bias_offset       = _uint32_li( 0x0001c000 );
-  const uint32 f_e_mask              = _uint32_li( 0x7f800000 );
-  const uint32 f_m_mask              = _uint32_li( 0x007fffff );
-  const uint32 h_f_e_denorm_bias     = _uint32_li( 0x0000007e );
-  const uint32 h_f_m_denorm_sa_bias  = _uint32_li( 0x00000008 );
-  const uint32 f_e_pos               = _uint32_li( 0x00000017 );
-  const uint32 h_e_mask_minus_one    = _uint32_li( 0x00007bff );
-  const uint32 h_e                   = _uint32_and( h, h_e_mask );
-  const uint32 h_m                   = _uint32_and( h, h_m_mask );
-  const uint32 h_s                   = _uint32_and( h, h_s_mask );
-  const uint32 h_e_f_bias            = _uint32_add( h_e, h_f_bias_offset );
-  const uint32 h_m_nlz               = _uint32_cntlz( h_m );
-  const uint32 f_s                   = _uint32_sll( h_s,        h_f_s_pos_offset );
-  const uint32 f_e                   = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
-  const uint32 f_m                   = _uint32_sll( h_m,        h_f_e_pos_offset );
-  const uint32 f_em                  = _uint32_or(  f_e,        f_m              );
-  const uint32 h_f_m_sa              = _uint32_sub( h_m_nlz,             h_f_m_denorm_sa_bias );
-  const uint32 f_e_denorm_unpacked   = _uint32_sub( h_f_e_denorm_bias,   h_f_m_sa             );
-  const uint32 h_f_m                 = _uint32_sll( h_m,                 h_f_m_sa             );
-  const uint32 f_m_denorm            = _uint32_and( h_f_m,               f_m_mask             );
-  const uint32 f_e_denorm            = _uint32_sll( f_e_denorm_unpacked, f_e_pos              );
-  const uint32 f_em_denorm           = _uint32_or(  f_e_denorm,          f_m_denorm           );
-  const uint32 f_em_nan              = _uint32_or(  f_e_mask,            f_m                  );
-  const uint32 is_e_eqz_msb          = _uint32_dec(  h_e );
-  const uint32 is_m_nez_msb          = _uint32_neg(  h_m );
-  const uint32 is_e_flagged_msb      = _uint32_sub(  h_e_mask_minus_one, h_e );
-  const uint32 is_zero_msb           = _uint32_andc( is_e_eqz_msb,       is_m_nez_msb );
-  const uint32 is_inf_msb            = _uint32_andc( is_e_flagged_msb,   is_m_nez_msb );
-  const uint32 is_denorm_msb         = _uint32_and(  is_m_nez_msb,       is_e_eqz_msb );
-  const uint32 is_nan_msb            = _uint32_and(  is_e_flagged_msb,   is_m_nez_msb ); 
-  const uint32 is_zero               = _uint32_ext(  is_zero_msb );
-  const uint32 f_zero_result         = _uint32_andc( f_em, is_zero );
-  const uint32 f_denorm_result       = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result );
-  const uint32 f_inf_result          = _uint32_sels( is_inf_msb,    f_e_mask,    f_denorm_result );
-  const uint32 f_nan_result          = _uint32_sels( is_nan_msb,    f_em_nan,    f_inf_result    );
-  const uint32 f_result              = _uint32_or( f_s, f_nan_result );
- 
-  return (f_result);
-}
-
-uint16
-half_add( uint16 x, uint16 y )
-{
-  const uint16 one                       = _uint16_li( 0x0001 );
-  const uint16 msb_to_lsb_sa             = _uint16_li( 0x000f );
-  const uint16 h_s_mask                  = _uint16_li( 0x8000 );
-  const uint16 h_e_mask                  = _uint16_li( 0x7c00 );
-  const uint16 h_m_mask                  = _uint16_li( 0x03ff );
-  const uint16 h_m_msb_mask              = _uint16_li( 0x2000 );
-  const uint16 h_m_msb_sa                = _uint16_li( 0x000d );
-  const uint16 h_m_hidden                = _uint16_li( 0x0400 );
-  const uint16 h_e_pos                   = _uint16_li( 0x000a );
-  const uint16 h_e_bias_minus_one        = _uint16_li( 0x000e );
-  const uint16 h_m_grs_carry             = _uint16_li( 0x4000 );
-  const uint16 h_m_grs_carry_pos         = _uint16_li( 0x000e );
-  const uint16 h_grs_size                = _uint16_li( 0x0003 );
-  const uint16 h_snan                    = _uint16_li( 0xfe00 );
-  const uint16 h_e_mask_minus_one        = _uint16_li( 0x7bff );
-  const uint16 h_grs_round_carry         = _uint16_sll( one, h_grs_size );
-  const uint16 h_grs_round_mask          = _uint16_sub( h_grs_round_carry, one );
-  const uint16 x_e                       = _uint16_and( x, h_e_mask );
-  const uint16 y_e                       = _uint16_and( y, h_e_mask );
-  const uint16 is_y_e_larger_msb         = _uint16_sub( x_e, y_e );
-  const uint16 a                         = _uint16_sels( is_y_e_larger_msb, y, x);
-  const uint16 a_s                       = _uint16_and( a, h_s_mask );
-  const uint16 a_e                       = _uint16_and( a, h_e_mask );
-  const uint16 a_m_no_hidden_bit         = _uint16_and( a, h_m_mask );
-  const uint16 a_em_no_hidden_bit        = _uint16_or( a_e, a_m_no_hidden_bit );
-  const uint16 b                         = _uint16_sels( is_y_e_larger_msb, x, y);
-  const uint16 b_s                       = _uint16_and( b, h_s_mask );
-  const uint16 b_e                       = _uint16_and( b, h_e_mask );
-  const uint16 b_m_no_hidden_bit         = _uint16_and( b, h_m_mask );
-  const uint16 b_em_no_hidden_bit        = _uint16_or( b_e, b_m_no_hidden_bit );
-  const uint16 is_diff_sign_msb          = _uint16_xor( a_s, b_s );
-  const uint16 is_a_inf_msb              = _uint16_sub( h_e_mask_minus_one, a_em_no_hidden_bit );
-  const uint16 is_b_inf_msb              = _uint16_sub( h_e_mask_minus_one, b_em_no_hidden_bit );
-  const uint16 is_undenorm_msb           = _uint16_dec( a_e );
-  const uint16 is_undenorm               = _uint16_ext( is_undenorm_msb );
-  const uint16 is_both_inf_msb           = _uint16_and( is_a_inf_msb, is_b_inf_msb );
-  const uint16 is_invalid_inf_op_msb     = _uint16_and( is_both_inf_msb, b_s );
-  const uint16 is_a_e_nez_msb            = _uint16_neg( a_e );
-  const uint16 is_b_e_nez_msb            = _uint16_neg( b_e );
-  const uint16 is_a_e_nez                = _uint16_ext( is_a_e_nez_msb );
-  const uint16 is_b_e_nez                = _uint16_ext( is_b_e_nez_msb );
-  const uint16 a_m_hidden_bit            = _uint16_and( is_a_e_nez, h_m_hidden );
-  const uint16 b_m_hidden_bit            = _uint16_and( is_b_e_nez, h_m_hidden );
-  const uint16 a_m_no_grs                = _uint16_or( a_m_no_hidden_bit, a_m_hidden_bit );
-  const uint16 b_m_no_grs                = _uint16_or( b_m_no_hidden_bit, b_m_hidden_bit );
-  const uint16 diff_e                    = _uint16_sub( a_e,        b_e );
-  const uint16 a_e_unbias                = _uint16_sub( a_e,        h_e_bias_minus_one );
-  const uint16 a_m                       = _uint16_sll( a_m_no_grs, h_grs_size );
-  const uint16 a_e_biased                = _uint16_srl( a_e,        h_e_pos );
-  const uint16 m_sa_unbias               = _uint16_srl( a_e_unbias, h_e_pos );
-  const uint16 m_sa_default              = _uint16_srl( diff_e,     h_e_pos );
-  const uint16 m_sa_unbias_mask          = _uint16_andc( is_a_e_nez_msb,   is_b_e_nez_msb );
-  const uint16 m_sa                      = _uint16_sels( m_sa_unbias_mask, m_sa_unbias, m_sa_default );
-  const uint16 b_m_no_sticky             = _uint16_sll( b_m_no_grs,        h_grs_size );
-  const uint16 sh_m                      = _uint16_srl( b_m_no_sticky,     m_sa );
-  const uint16 sticky_overflow           = _uint16_sll( one,               m_sa );
-  const uint16 sticky_mask               = _uint16_dec( sticky_overflow );
-  const uint16 sticky_collect            = _uint16_and( b_m_no_sticky, sticky_mask );
-  const uint16 is_sticky_set_msb         = _uint16_neg( sticky_collect );
-  const uint16 sticky                    = _uint16_srl( is_sticky_set_msb, msb_to_lsb_sa);
-  const uint16 b_m                       = _uint16_or( sh_m, sticky );
-  const uint16 is_c_m_ab_pos_msb         = _uint16_sub( b_m, a_m );
-  const uint16 c_inf                     = _uint16_or( a_s, h_e_mask );
-  const uint16 c_m_sum                   = _uint16_add( a_m, b_m );
-  const uint16 c_m_diff_ab               = _uint16_sub( a_m, b_m );
-  const uint16 c_m_diff_ba               = _uint16_sub( b_m, a_m );
-  const uint16 c_m_smag_diff             = _uint16_sels( is_c_m_ab_pos_msb, c_m_diff_ab, c_m_diff_ba );
-  const uint16 c_s_diff                  = _uint16_sels( is_c_m_ab_pos_msb, a_s,         b_s         );
-  const uint16 c_s                       = _uint16_sels( is_diff_sign_msb,  c_s_diff,    a_s         );
-  const uint16 c_m_smag_diff_nlz         = _uint16_cntlz( c_m_smag_diff );
-  const uint16 diff_norm_sa              = _uint16_sub( c_m_smag_diff_nlz, one );
-  const uint16 is_diff_denorm_msb        = _uint16_sub( a_e_biased, diff_norm_sa );
-  const uint16 is_diff_denorm            = _uint16_ext( is_diff_denorm_msb );
-  const uint16 is_a_or_b_norm_msb        = _uint16_neg( a_e_biased );
-  const uint16 diff_denorm_sa            = _uint16_dec( a_e_biased );
-  const uint16 c_m_diff_denorm           = _uint16_sll( c_m_smag_diff, diff_denorm_sa );
-  const uint16 c_m_diff_norm             = _uint16_sll( c_m_smag_diff, diff_norm_sa );
-  const uint16 c_e_diff_norm             = _uint16_sub( a_e_biased,  diff_norm_sa );
-  const uint16 c_m_diff_ab_norm          = _uint16_sels( is_diff_denorm_msb, c_m_diff_denorm, c_m_diff_norm );
-  const uint16 c_e_diff_ab_norm          = _uint16_andc( c_e_diff_norm, is_diff_denorm );
-  const uint16 c_m_diff                  = _uint16_sels( is_a_or_b_norm_msb, c_m_diff_ab_norm, c_m_smag_diff );
-  const uint16 c_e_diff                  = _uint16_sels( is_a_or_b_norm_msb, c_e_diff_ab_norm, a_e_biased    );
-  const uint16 is_diff_eqz_msb           = _uint16_dec( c_m_diff );
-  const uint16 is_diff_exactly_zero_msb  = _uint16_and( is_diff_sign_msb, is_diff_eqz_msb );
-  const uint16 is_diff_exactly_zero      = _uint16_ext( is_diff_exactly_zero_msb );
-  const uint16 c_m_added                 = _uint16_sels( is_diff_sign_msb, c_m_diff, c_m_sum );
-  const uint16 c_e_added                 = _uint16_sels( is_diff_sign_msb, c_e_diff, a_e_biased );
-  const uint16 c_m_carry                 = _uint16_and( c_m_added, h_m_grs_carry );
-  const uint16 is_c_m_carry_msb          = _uint16_neg( c_m_carry );
-  const uint16 c_e_hidden_offset         = _uint16_andsrl( c_m_added, h_m_grs_carry, h_m_grs_carry_pos );
-  const uint16 c_m_sub_hidden            = _uint16_srl( c_m_added, one );
-  const uint16 c_m_no_hidden             = _uint16_sels( is_c_m_carry_msb, c_m_sub_hidden, c_m_added );
-  const uint16 c_e_no_hidden             = _uint16_add( c_e_added,         c_e_hidden_offset  );
-  const uint16 c_m_no_hidden_msb         = _uint16_and( c_m_no_hidden,     h_m_msb_mask       );
-  const uint16 undenorm_m_msb_odd        = _uint16_srl( c_m_no_hidden_msb, h_m_msb_sa         );
-  const uint16 undenorm_fix_e            = _uint16_and( is_undenorm,       undenorm_m_msb_odd );
-  const uint16 c_e_fixed                 = _uint16_add( c_e_no_hidden,     undenorm_fix_e     );
-  const uint16 c_m_round_amount          = _uint16_and( c_m_no_hidden,     h_grs_round_mask   );
-  const uint16 c_m_rounded               = _uint16_add( c_m_no_hidden,     c_m_round_amount   );
-  const uint16 c_m_round_overflow        = _uint16_andsrl( c_m_rounded, h_m_grs_carry, h_m_grs_carry_pos );
-  const uint16 c_e_rounded               = _uint16_add( c_e_fixed, c_m_round_overflow );
-  const uint16 c_m_no_grs                = _uint16_srlm( c_m_rounded, h_grs_size,  h_m_mask );
-  const uint16 c_e                       = _uint16_sll( c_e_rounded, h_e_pos );
-  const uint16 c_em                      = _uint16_or( c_e, c_m_no_grs );
-  const uint16 c_normal                  = _uint16_or( c_s, c_em );
-  const uint16 c_inf_result              = _uint16_sels( is_a_inf_msb, c_inf, c_normal );
-  const uint16 c_zero_result             = _uint16_andc( c_inf_result, is_diff_exactly_zero );
-  const uint16 c_result                  = _uint16_sels( is_invalid_inf_op_msb, h_snan, c_zero_result );
-
-  return (c_result);
-}
--- a/src/nvmath/Half.h
+++ b/src/nvmath/Half.h
@ -1,9 +0,0 @@
-#ifndef NV_MATH_HALF_H
-#define NV_MATH_HALF_H
-
-#include <nvmath/nvmath.h>
-
-uint32 half_to_float( uint16 h );
-uint16 half_from_float( uint32 f );
-
-#endif /* NV_MATH_HALF_H */
--- a/src/nvmath/Matrix.h
+++ b/src/nvmath/Matrix.h
@ -24,8 +24,6 @@ public:
 	Matrix(zero_t);
 	Matrix(identity_t);
 	Matrix(const Matrix & m);
-	Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);
-	Matrix(const scalar m[]);	// m is assumed to contain 16 elements

 	scalar data(uint idx) const;
 	scalar & data(uint idx);
@ -77,21 +75,6 @@ inline Matrix::Matrix(const Matrix & m)
 	}
 }

-inline Matrix::Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3)
-{
-	m_data[ 0] = v0.x(); m_data[ 1] = v0.y(); m_data[ 2] = v0.z(); m_data[ 3] = v0.w();
-	m_data[ 4] = v1.x(); m_data[ 5] = v1.y(); m_data[ 6] = v1.z(); m_data[ 7] = v1.w();
-	m_data[ 8] = v2.x(); m_data[ 9] = v2.y(); m_data[10] = v2.z(); m_data[11] = v2.w();
-	m_data[12] = v3.x(); m_data[13] = v3.y(); m_data[14] = v3.z(); m_data[15] = v3.w();
-}
-
-inline Matrix::Matrix(const scalar m[])
-{
-	for(int i = 0; i < 16; i++) {
-		m_data[i] = m[i];
-	}
-}
-

 // Accessors
 inline scalar Matrix::data(uint idx) const
--- a/src/nvmath/Montecarlo.cpp
+++ b/src/nvmath/Montecarlo.cpp
@ -0,0 +1,156 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvmath/Montecarlo.h>
+
+using namespace nv;
+
+
+/// Regenerate all the samples with the requested sampling method and
+/// distribution. A method value that matches none of the enumerators leaves
+/// the samples untouched.
+void SampleDistribution::redistribute(Method method/*=Method_NRook*/, Distribution dist/*=Distribution_Cosine*/)
+{
+	if (method == Method_Random) {
+		redistributeRandom(dist);
+	}
+	else if (method == Method_Stratified) {
+		redistributeStratified(dist);
+	}
+	else if (method == Method_NRook) {
+		redistributeNRook(dist);
+	}
+}
+	
+/// Fill the sample array with independent random samples (no stratification).
+void SampleDistribution::redistributeRandom(const Distribution dist)
+{
+	const uint count = m_sampleArray.count();
+
+	// Plain random sampling: the worst of the available methods.
+	for (uint idx = 0; idx != count; ++idx)
+	{
+		const float x = m_rand.getFloat();
+		const float y = m_rand.getFloat();
+
+		// Map the point of the unit square to spherical angles.
+		float theta;
+		if (dist == Distribution_Uniform) {
+			theta = acosf(1 - 2 * x);
+		}
+		else {
+			nvDebugCheck(dist == Distribution_Cosine);
+			theta = acosf(sqrtf(x));
+		}
+
+		m_sampleArray[idx].setUV(theta, 2 * PI * y);
+	}
+}
+
+
+/// Fill the sample array with jittered samples on a regular grid: one random
+/// sample inside each cell of a gridSize x gridSize subdivision of the unit
+/// square. The sample count is expected to be a perfect square.
+void SampleDistribution::redistributeStratified(const Distribution dist)
+{
+	const uint sampleCount = m_sampleArray.count();
+	const uint gridSize = uint(sqrtf(float(sampleCount)));
+
+	nvDebugCheck(gridSize*gridSize == sampleCount);	// Must be a perfect square!
+
+	// Walk the grid cells row by row; cell i lies at column u, row v.
+	const uint cellCount = gridSize * gridSize;
+	for (uint i = 0; i < cellCount; i++) {
+		const uint u = i % gridSize;
+		const uint v = i / gridSize;
+
+		const float x = (u + m_rand.getFloat()) / float(gridSize);
+		const float y = (v + m_rand.getFloat()) / float(gridSize);
+
+		// Map the point of the unit square to spherical angles.
+		float theta;
+		if (dist == Distribution_Uniform) {
+			theta = acosf(1 - 2 * x);
+		}
+		else {
+			nvDebugCheck(dist == Distribution_Cosine);
+			theta = acosf(sqrtf(x));
+		}
+
+		m_sampleArray[i].setUV(theta, 2 * PI * y);
+	}
+}
+
+
+/** Multi-Stage N-rooks Sampling Method.
+ * See: http://www.acm.org/jgt/papers/WangSung9/9
+ *
+ * Shuffles the @a size entries of @a cells in place: consecutive pairs are
+ * randomly split between an upper and a lower half, each half is shuffled
+ * recursively, and the halves are written back one after the other.
+ *
+ * @param size number of entries in @a cells.
+ * @param cells array to permute in place.
+ */
+void SampleDistribution::multiStageNRooks(const int size, int* cells)
+{
+	// Nothing to permute. This also stops the recursion for an empty range,
+	// which would otherwise keep splitting 0 into 0 + 0 forever.
+	if (size <= 1) {
+		return;
+	}
+
+	int size1 = size >> 1;
+	int size2 = size >> 1;
+
+	// An odd size leaves one extra cell; give it to a random half.
+	if (size & 1) {
+		if (m_rand.getFloat() > 0.5) {
+			size1++;
+		}
+		else {
+			size2++;
+		}
+	}
+
+	int* upper_cells = new int[size1];
+	int* lower_cells = new int[size2];
+
+	// Send one cell of every consecutive pair to each half, in random order.
+	int i, j;
+	for(i = 0, j = 0; i < size - 1; i += 2, j++) {
+		if (m_rand.get() & 1) {
+			upper_cells[j] = cells[i];
+			lower_cells[j] = cells[i + 1];
+		}
+		else {
+			upper_cells[j] = cells[i + 1];
+			lower_cells[j] = cells[i];
+		}
+	}
+
+	// The unpaired last cell goes to whichever half was made larger.
+	if (size1 != size2) {
+		if (size1 > size2) {
+			upper_cells[j] = cells[i];
+		}
+		else {
+			lower_cells[j] = cells[i];
+		}
+	}
+
+	multiStageNRooks(size1, upper_cells);
+	memcpy(cells, upper_cells, size1 * sizeof(int));
+	delete [] upper_cells;
+
+	multiStageNRooks(size2, lower_cells);
+	memcpy(cells + size1, lower_cells, size2 * sizeof(int));
+	delete [] lower_cells;
+}
+
+
+/// Fill the sample array using N-rooks sampling: sample i falls in column i
+/// of an N x N grid and in the row given by a random permutation of the
+/// columns, so that every row and every column holds exactly one sample.
+void SampleDistribution::redistributeNRook(const Distribution dist)
+{
+	const uint sampleCount = m_sampleArray.count();
+
+	// Nothing to generate; also avoids handing an empty range to the
+	// recursive shuffle.
+	if (sampleCount == 0) {
+		return;
+	}
+	
+	// Generate nrook cells
+	int * cells = new int[sampleCount];
+	for(uint32 i = 0; i < sampleCount; i++)
+	{
+		cells[i] = i;
+	}
+	multiStageNRooks(sampleCount, cells);
+
+	for(uint i = 0; i < sampleCount; i++)
+	{
+		float x = (i + m_rand.getFloat()) / sampleCount;
+		float y = (cells[i] + m_rand.getFloat()) / sampleCount;
+
+		// Map uniform distribution in the square to the (hemi)sphere.
+		if( dist == Distribution_Uniform ) {
+			m_sampleArray[i].setUV(acosf(1 - 2 * x), 2 * PI * y);
+		}
+		else {
+			nvDebugCheck(dist == Distribution_Cosine);
+			m_sampleArray[i].setUV(acosf(sqrtf(x)), 2 * PI * y);
+		}
+	}
+
+	delete [] cells;
+}
+
--- a/src/nvmath/Montecarlo.h
+++ b/src/nvmath/Montecarlo.h
@ -0,0 +1,84 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_MONTECARLO_H
+#define NV_MATH_MONTECARLO_H
+
+#include <nvmath/Vector.h>
+#include <nvmath/Random.h>
+
+namespace nv
+{
+
+/// A random sample distribution.
+///
+/// Owns an array of samples, each stored both as a pair of spherical angles
+/// and as a 3d direction, plus the random number generator used to generate
+/// them.
+class SampleDistribution
+{
+public:
+	
+	// Sampling method.
+	enum Method {
+		Method_Random,
+		Method_Stratified,
+		Method_NRook
+	};
+
+	// Distribution functions.
+	enum Distribution {
+		Distribution_Uniform,
+		Distribution_Cosine
+	};
+	
+	/// Constructor. Sizes the sample array to @a num entries; call
+	/// redistribute() to generate the samples.
+	SampleDistribution(int num)
+	{
+		m_sampleArray.resize(num);
+	}
+	
+	/// Regenerate all the samples with the given method and distribution.
+	void redistribute(Method method=Method_NRook, Distribution dist=Distribution_Cosine);
+	
+	/// Get parametric coordinates of the sample.
+	Vector2 sample(int i) { return m_sampleArray[i].uv; }
+	
+	/// Get sample direction.
+	Vector3 sampleDir(int i) { return m_sampleArray[i].dir; }
+
+	/// Get number of samples.
+	uint sampleCount() const { return m_sampleArray.count(); }
+	
+private:
+	
+	// One generator per sampling method, plus the shuffle used by N-rooks.
+	void redistributeRandom(const Distribution dist);
+	void redistributeStratified(const Distribution dist);
+	void multiStageNRooks(const int size, int* cells);
+	void redistributeNRook(const Distribution dist);
+	
+	
+	/// A sample of the random distribution.
+	struct Sample
+	{
+		/// Set sample given the 3d coordinates.
+		/// The angles are derived as (acos(z), atan2(y, x)).
+		void setDir(float x, float y, float z) {
+			dir.set(x, y, z);
+			uv.set(acosf(z), atan2f(y, x));
+		}
+		
+		/// Set sample given the 2d parametric coordinates.
+		/// The direction is (sin(u)cos(v), sin(u)sin(v), cos(u)).
+		void setUV(float u, float v) {
+			uv.set(u, v);
+			dir.set(sinf(u) * cosf(v), sinf(u) * sinf(v), cosf(u));
+		}
+		
+		Vector2 uv;
+		Vector3 dir;
+	};
+	
+	/// Random number generator. It is default-constructed, so every
+	/// SampleDistribution starts from MTRand's default seed.
+	MTRand m_rand;
+	
+	/// Samples.
+	Array<Sample> m_sampleArray;
+	
+};
+
+} // nv namespace
+
+#endif // NV_MATH_MONTECARLO_H
--- a/src/nvmath/Plane.h
+++ b/src/nvmath/Plane.h
@ -3,8 +3,8 @@
 #ifndef NV_MATH_PLANE_H
 #define NV_MATH_PLANE_H

-#include "nvmath.h"
-#include "Vector.h"
+#include <nvmath/nvmath.h>
+#include <nvmath/Vector.h>

 namespace nv
 {
@ -59,7 +59,7 @@ namespace nv
 		return Plane(plane.asVector() * inv);
 	}

-	// Get the signed distance from the given point to this plane.
+	// Get the distance from the given point to this plane.
 	inline float distance(Plane::Arg plane, Vector3::Arg point)
 	{
 		return dot(plane.vector(), point) - plane.offset();
--- a/src/nvmath/Quaternion.h
+++ b/src/nvmath/Quaternion.h
@ -0,0 +1,128 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_QUATERNION_H
+#define NV_MATH_QUATERNION_H
+
+#include <nvmath/nvmath.h>
+#include <nvmath/Vector.h>
+
+namespace nv
+{
+
+	/// Quaternion stored as a Vector4 (x, y, z, w). The mul() function in
+	/// this header treats (x, y, z) as the vector part and w as the scalar
+	/// part.
+	class NVMATH_CLASS Quaternion
+	{
+	public:
+		typedef Quaternion const & Arg;
+		
+		// NOTE(review): the default constructor only default-constructs the
+		// underlying Vector4; whether that zero-initializes is not visible here.
+		Quaternion();
+		explicit Quaternion(zero_t);
+		Quaternion(float x, float y, float z, float w);
+		// Non-explicit: a Vector4 converts implicitly to a Quaternion.
+		Quaternion(Vector4::Arg v);
+		
+		const Quaternion & operator=(Quaternion::Arg v);
+		
+		// Component accessors.
+		scalar x() const;
+		scalar y() const;
+		scalar z() const;
+		scalar w() const;
+		
+		// Access to the underlying vector.
+		const Vector4 & asVector() const;
+		Vector4 & asVector();
+		
+	private:
+		Vector4 q;
+	};
+
+	inline Quaternion::Quaternion() {}
+	inline Quaternion::Quaternion(zero_t) : q(zero) {}
+	inline Quaternion::Quaternion(float x, float y, float z, float w) : q(x, y, z, w) {}
+	inline Quaternion::Quaternion(Vector4::Arg v) : q(v) {}
+	
+	inline const Quaternion & Quaternion::operator=(Quaternion::Arg v) { q = v.q; return *this; }
+	
+	inline scalar Quaternion::x() const { return q.x(); }
+	inline scalar Quaternion::y() const { return q.y(); }
+	inline scalar Quaternion::z() const { return q.z(); }
+	inline scalar Quaternion::w() const { return q.w(); }
+
+	inline const Vector4 & Quaternion::asVector() const { return q; }
+	inline Vector4 & Quaternion::asVector() { return q; }
+
+
+	/// Quaternion product a * b (Hamilton product), with (x, y, z) as the
+	/// vector part and w as the scalar part. The product is not commutative:
+	/// mul(a, b) and mul(b, a) differ in general.
+	inline Quaternion mul(Quaternion::Arg a, Quaternion::Arg b)
+	{
+		// @@ Efficient SIMD implementation?
+		// Rows are the x, y, z and w components of the result, in that order.
+		return Quaternion(
+			+ a.x() * b.w() + a.y()*b.z() - a.z()*b.y() + a.w()*b.x(),
+			- a.x() * b.z() + a.y()*b.w() + a.z()*b.x() + a.w()*b.y(),
+			+ a.x() * b.y() - a.y()*b.x() + a.z()*b.w() + a.w()*b.z(),
+			- a.x() * b.x() - a.y()*b.y() - a.z()*b.z() + a.w()*b.w());
+	}
+
+	/// Scale the quaternion by the scalar @a s. Forwards to the Vector4
+	/// scale(); the result converts back through Quaternion(Vector4::Arg).
+	inline Quaternion scale(Quaternion::Arg q, float s)
+	{
+		return scale(q.asVector(), s);
+	}
+	/// Scalar multiplication, quaternion on the left.
+	inline Quaternion operator *(Quaternion::Arg q, float s)
+	{
+		return scale(q, s);
+	}
+	/// Scalar multiplication, quaternion on the right.
+	inline Quaternion operator *(float s, Quaternion::Arg q)
+	{
+		return scale(q, s);
+	}
+
+	/// Scale the quaternion by the vector @a s. Forwards to the Vector4
+	/// overload of scale(), presumably component-wise (conjugate() in this
+	/// header relies on that) -- TODO confirm against Vector.h.
+	inline Quaternion scale(Quaternion::Arg q, Vector4::Arg s)
+	{
+		return scale(q.asVector(), s);
+	}
+	/*inline Quaternion operator *(Quaternion::Arg q, Vector4::Arg s)
+	{
+		return scale(q, s);
+	}
+	inline Quaternion operator *(Vector4::Arg s, Quaternion::Arg q)
+	{
+		return scale(q, s);
+	}*/
+
+	/// Conjugate of @a q: scales it by (-1, -1, -1, 1), which negates x, y
+	/// and z and keeps w, provided the Vector4 scale() is component-wise.
+	inline Quaternion conjugate(Quaternion::Arg q)
+	{
+		return scale(q, Vector4(-1, -1, -1, 1));
+	}
+
+	/// Length of the quaternion, computed as the length of its Vector4.
+	inline float length(Quaternion::Arg q)
+	{
+		return length(q.asVector());
+	}
+
+	/// Return true if the length of @a q equals 1 within @a epsilon.
+	inline bool isNormalized(Quaternion::Arg q, float epsilon = NV_NORMAL_EPSILON)
+	{
+		return equal(length(q), 1, epsilon);
+	}
+
+	/// Return @a q rescaled to unit length. In debug builds it checks that
+	/// the length of @a q is not zero (within @a epsilon) and that the result
+	/// is normalized.
+	inline Quaternion normalize(Quaternion::Arg q, float epsilon = NV_EPSILON)
+	{
+		const float len = length(q);
+		nvDebugCheck(!isZero(len, epsilon));
+
+		const Quaternion unit = scale(q, 1.0f / len);
+		nvDebugCheck(isNormalized(unit));
+
+		return unit;
+	}
+
+	/// Multiplicative inverse of @a q: conjugate(q) / |q|^2, so that
+	/// mul(q, inverse(q)) is the identity quaternion for any non-zero @a q.
+	/// For unit quaternions this is simply the conjugate. Dividing by the
+	/// length only once (conjugating the normalized quaternion) gives the
+	/// inverse for unit quaternions only.
+	inline Quaternion inverse(Quaternion::Arg q)
+	{
+		const float l = length(q);
+		nvDebugCheck(!isZero(l, NV_EPSILON));
+		return scale(conjugate(q), 1.0f / (l * l));
+	}
+
+	/// Create a rotation quaternion for an angle @a alpha around the axis
+	/// @a v: the vector part is v * sin(alpha / 2) and the scalar part is
+	/// cos(alpha / 2). @a v is used as given, without normalization.
+	inline Quaternion axisAngle(Vector3::Arg v, float alpha)
+	{
+		const float halfAngle = alpha * 0.5f;
+		return Quaternion(Vector4(v * sinf(halfAngle), cosf(halfAngle)));
+	}
+
+
+} // nv namespace
+
+#endif // NV_MATH_QUATERNION_H
--- a/src/nvmath/Random.cpp
+++ b/src/nvmath/Random.cpp
@ -0,0 +1,54 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvmath/Random.h>
+#include <time.h>
+
+using namespace nv;
+
+// Statics
+// The Rand48 multiplier 0x5DEECE66D split into 16-bit words (a0 is the least
+// significant word), and the additive constant c0 = 0xB.
+const uint16 Rand48::a0 = 0xE66D; 
+const uint16 Rand48::a1 = 0xDEEC; 
+const uint16 Rand48::a2 = 0x0005;
+const uint16 Rand48::c0 = 0x000B;
+
+
+/// Get a seed derived from the current calendar time, as returned by time().
+uint Rand::randomSeed()
+{
+	const time_t now = time(NULL);
+	return (uint)now;
+}
+
+
+/// Fill the whole state array from a single 32-bit seed.
+void MTRand::initialize( uint32 seed )
+{
+	// Each entry is derived from the previous one.
+	// See Knuth TAOCP Vol 2, 3rd Ed, p.106 for multiplier.
+	// In previous versions, most significant bits (MSBs) of the seed affect
+	// only MSBs of the state array.  Modified 9 Jan 2002 by Makoto Matsumoto.
+	state[0] = seed & 0xffffffffUL;
+
+	for( int i = 1; i < N; ++i )
+	{
+		const uint32 prev = state[i - 1];
+		state[i] = ( 1812433253UL * ( prev ^ (prev >> 30) ) + i ) & 0xffffffffUL;
+	}
+}
+
+
+/// Regenerate the whole state array in place and rewind the read position.
+void MTRand::reload()
+{
+	// Generate N new values in state
+	// Made clearer and faster by Matthew Bellew (matthew.bellew@home.com)
+	uint32 *p = state;
+	int i;
+	// First N - M entries: the partner value lies M entries ahead.
+	for( i = N - M; i--; ++p )
+		*p = twist( p[M], p[0], p[1] );
+	// Next M - 1 entries: the partner index wraps around, so it lies N - M
+	// entries behind (p[M-N] is a negative offset into already updated state).
+	for( i = M; --i; ++p )
+		*p = twist( p[M-N], p[0], p[1] );
+	// Last entry: its successor wraps around to state[0].
+	*p = twist( p[M-N], p[0], state[0] );
+
+	left = N, next = state;
+}
+
--- a/src/nvmath/Random.h
+++ b/src/nvmath/Random.h
@ -0,0 +1,368 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_RANDOM_H
+#define NV_MATH_RANDOM_H
+
+#include <nvcore/Containers.h> // nextPowerOfTwo
+#include <nvmath/nvmath.h>
+
+namespace nv
+{
+
+/// Interface of the random number generators.
+class Rand
+{
+public:
+
+	virtual ~Rand() {}
+
+	/// Tag type that selects the constructors seeding from the current time.
+	enum time_e { Time };
+
+	/// Provide a new seed. The base implementation ignores it.
+	virtual void seed( uint s ) { /* empty */ }
+	
+	/// Get an integer random number.
+	virtual uint get() = 0;
+
+	/// Get a random number on [0, max] interval.
+	/// Uses rejection sampling: values are masked to the bits of @a max and
+	/// drawn again while they exceed it.
+	uint getRange( uint max )
+	{
+		// Smallest all-ones mask that covers max. Deriving it from max itself
+		// (instead of from a rounded-up power of two minus one) keeps max
+		// reachable when it is a power of two, and handles max == 0.
+		uint mask = max;
+		mask |= mask >> 1;
+		mask |= mask >> 2;
+		mask |= mask >> 4;
+		mask |= mask >> 8;
+		mask |= mask >> 16;
+
+		uint n;
+		do { n = get() & mask; } while( n > max );
+		return n;
+	}
+
+	/// Random number on [0.0, 1.0) interval.
+	float getFloat()
+	{
+    	union
+		{
+			uint32 i;
+			float f;
+		} pun;
+
+		// 0x3f800000 is the bit pattern of 1.0f; filling the 23 mantissa bits
+		// with random bits gives a float in [1.0, 2.0), shifted down by one.
+		pun.i = 0x3f800000UL | (get() & 0x007fffffUL);
+		return pun.f - 1.0f;
+	}
+
+	/*
+	/// Random number on [0.0, 1.0] interval.
+	double getReal()
+	{
+		return double(get()) * (1.0/4294967295.0); // 2^32-1
+	}
+	
+	/// Random number on [0.0, 1.0) interval.
+	double getRealExclusive()
+	{
+		return double(get()) * (1.0/4294967296.0); // 2^32
+	}
+	*/
+
+	/// Get the max value of the random number.
+	uint max() const { return 4294967295U; }
+
+	// Get a random seed.
+	static uint randomSeed();
+	
+};
+
+
+/// Very simple random number generator with low storage requirements: a
+/// linear congruential generator whose whole state is a single uint.
+class SimpleRand : public Rand
+{
+public:
+
+	/// Constructor that uses the current time as the seed.
+	SimpleRand( time_e ) : current(randomSeed())
+	{
+	}
+	
+	/// Constructor that uses the given seed.
+	SimpleRand( uint s = 0 ) : current(s)
+	{
+	}
+	
+	/// Set the given seed.
+	virtual void seed( uint s )
+	{
+		current = s;
+	}
+	
+	/// Get a random number: advances the state and returns it.
+	virtual uint get()
+	{
+		current = current * 1103515245 + 12345;
+		return current;
+	}
+
+private:
+
+	/// Current state of the generator.
+	uint current;
+	
+};
+
+
+/// Mersenne twister random number generator.
+class MTRand : public Rand
+{
+public:
+
+	enum { N = 624 };       // length of state vector
+	enum { M = 397 };       // offset of the partner entry used by reload()
+
+	/// Constructor that uses the current time as the seed.
+	MTRand( time_e )
+	{
+		seed(randomSeed());
+	}
+	
+	/// Constructor that uses the given seed.
+	MTRand( uint s = 0 )
+	{
+		seed(s);
+	}
+	
+	/// Constructor that uses the given seeds.
+	NVMATH_API MTRand( const uint * seed_array, uint length );
+
+	
+	/// Provide a new seed.
+	virtual void seed( uint s )
+	{
+		initialize(s);
+		reload();
+	}	
+
+	/// Get a 32-bit random number.
+	virtual uint get()
+	{
+		// Pull a 32-bit integer from the generator state
+		// Every other access function simply transforms the numbers extracted here
+		if( left == 0 ) { 
+			reload(); 
+		}
+		left--;
+		
+		// Scramble (temper) the raw state word before returning it.
+		uint s1;
+		s1 = *next++;
+		s1 ^= (s1 >> 11);
+		s1 ^= (s1 <<  7) & 0x9d2c5680U;
+		s1 ^= (s1 << 15) & 0xefc60000U;
+		return ( s1 ^ (s1 >> 18) );		
+	};
+
+	
+private:
+	
+	NVMATH_API void initialize( uint32 seed );
+	NVMATH_API void reload();
+	
+	// Bit helpers used by reload().
+	uint hiBit( uint u ) const { return u & 0x80000000U; }
+	uint loBit( uint u ) const { return u & 0x00000001U; }
+	uint loBits( uint u ) const { return u & 0x7fffffffU; }
+	uint mixBits( uint u, uint v ) const { return hiBit(u) | loBits(v); }
+	// (~loBit(s1)+1) is 0 or all ones, so the constant is xor-ed in only when
+	// the low bit of s1 is set.
+	uint twist( uint m, uint s0, uint s1 ) const { return m ^ (mixBits(s0,s1)>>1) ^ ((~loBit(s1)+1) & 0x9908b0dfU); }
+		
+private:
+
+	uint state[N];	// internal state
+	uint * next;	// next value to get from state
+	int left;		// number of values left before reload needed		
+
+};
+
+
+
+/** George Marsaglia's random number generator. 
+ * Code based on Thatcher Ulrich public domain source code:
+ * http://cvs.sourceforge.net/viewcvs.py/tu-testbed/tu-testbed/base/tu_random.cpp?rev=1.7&view=auto
+ *
+ * PRNG code adapted from the complementary-multiply-with-carry
+ * code in the article: George Marsaglia, "Seeds for Random Number
+ * Generators", Communications of the ACM, May 2003, Vol 46 No 5,
+ * pp90-93.
+ * 
+ * The article says:
+ * 
+ * "Any one of the choices for seed table size and multiplier will
+ * provide a RNG that has passed extensive tests of randomness,
+ * particularly those in [3], yet is simple and fast --
+ * approximately 30 million random 32-bit integers per second on a
+ * 850MHz PC.  The period is a*b^n, where a is the multiplier, n
+ * the size of the seed table and b=2^32-1.  (a is chosen so that
+ * b is a primitive root of the prime a*b^n + 1.)"
+ * 
+ * [3] Marsaglia, G., Zaman, A., and Tsang, W.  Toward a universal
+ * random number generator.  _Statistics and Probability Letters
+ * 8_ (1990), 35-39.
+ */
+class GMRand : public Rand
+{
+public:
+
+	enum { SEED_COUNT = 8 };	// size of the seed table; must be a power of two
+
+//	const uint64 a = 123471786;		// for SEED_COUNT=1024
+//	const uint64 a = 123554632;		// for SEED_COUNT=512
+//	const uint64 a = 8001634;		// for SEED_COUNT=255
+//	const uint64 a = 8007626;		// for SEED_COUNT=128
+//	const uint64 a = 647535442;		// for SEED_COUNT=64
+//	const uint64 a = 547416522;		// for SEED_COUNT=32
+//	const uint64 a = 487198574;		// for SEED_COUNT=16
+//	const uint64 a = 716514398U;	// for SEED_COUNT=8
+	enum { a = 716514398U };
+	
+
+	/// Constructor that uses the current time as the seed.
+	GMRand( time_e )
+	{
+		seed(randomSeed());
+	}
+	
+	/// Constructor that uses the given seed.
+	GMRand(uint s = 987654321)
+	{
+		seed(s);
+	}
+
+
+	/// Provide a new seed.
+	virtual void seed( uint s )
+	{
+		c = 362436;
+		i = SEED_COUNT - 1;
+		
+		// Fill the seed table with successive xorshift values derived from s.
+		// The loop index has its own name so it does not hide the member i.
+		for(int k = 0; k < SEED_COUNT; k++) {
+			s = s ^ (s << 13);
+			s = s ^ (s >> 17);
+			s = s ^ (s << 5);
+			Q[k] = s;
+		}
+	}
+
+	/// Get a 32-bit random number.
+	virtual uint get()
+	{
+		const uint32 r = 0xFFFFFFFE;
+		
+		uint64 t;
+		uint32 x;
+
+		i = (i + 1) & (SEED_COUNT - 1);
+
+		// The multiplier is an enumerator, so it must be widened explicitly:
+		// a 32-bit product would be truncated before the assignment and the
+		// carry taken from the upper half of t would always be zero.
+		t = uint64(a) * Q[i] + c;
+		c = uint32(t >> 32);
+		x = uint32(t + c);
+		
+		if( x < c ) {
+			x++;
+			c++;
+		}
+		
+		uint32  val = r - x;
+		Q[i] = val;
+		return val;
+	}
+
+
+private:
+
+	uint32 c;				// carry
+	uint32 i;				// index of the seed table entry used last
+	uint32 Q[SEED_COUNT];	// seed table
+
+};
+
+
+/** Random number implementation from the GNU Sci. Lib. (GSL).
+ * Adapted from Nicholas Chapman version:
+ * 
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000 James Theiler, Brian Gough
+ * This is the Unix rand48() generator. The generator returns the
+ * upper 32 bits from each term of the sequence,
+ * 
+ * x_{n+1} = (a x_n + c) mod m 
+ * 
+ * using 48-bit unsigned arithmetic, with a = 0x5DEECE66D , c = 0xB
+ * and m = 2^48. The seed specifies the upper 32 bits of the initial
+ * value, x_1, with the lower 16 bits set to 0x330E.
+ * 
+ * The theoretical value of x_{10001} is 244131582646046.
+ * 
+ * The period of this generator is ? FIXME (probably around 2^48). 
+ */
+class Rand48 : public Rand
+{
+public:
+
+	/// Constructor that uses the current time as the seed.
+	Rand48( time_e )
+	{
+		seed(randomSeed());
+	}
+	
+	/// Constructor that uses the given seed.
+	Rand48( uint s = 0x1234ABCD )
+	{
+		seed(s);
+	}	
+
+
+	/** Set the given seed. */
+	virtual void seed( uint s ) {
+		// The seed provides the upper 32 bits of the 48-bit state; the lower
+		// 16 bits are fixed.
+		vstate.x0 = 0x330E;
+		vstate.x1 = uint16(s & 0xFFFF);
+		vstate.x2 = uint16((s >> 16) & 0xFFFF);
+	}
+	
+	/** Get a random number. */
+	virtual uint get() {
+		
+		advance();
+
+		// Return the upper 32 bits of the 48-bit state.
+		uint x1 = vstate.x1;
+		uint x2 = vstate.x2;
+		return (x2 << 16) + x1;
+	}
+	
+	
+private:
+	
+	/// Advance the 48-bit state one step: x = a * x + c (mod 2^48), computed
+	/// as a schoolbook multiplication on 16-bit words.
+	void advance()
+	{
+		/* work with 32-bit unsigned ints throughout to get correct integer
+		promotions of any unsigned short ints */
+		const uint32 x0 = vstate.x0;
+		const uint32 x1 = vstate.x1;
+		const uint32 x2 = vstate.x2;
+		
+		uint32 a;
+		a = a0 * x0 + c0;
+		
+		vstate.x0 = uint16(a & 0xFFFF);
+		a >>= 16;
+		
+		/* although the next line may overflow we only need the top 16 bits
+		in the following stage, so it does not matter */
+		
+		a += a0 * x1 + a1 * x0; 
+		vstate.x1 = uint16(a & 0xFFFF);
+		
+		a >>= 16;
+		a += a0 * x2 + a1 * x1 + a2 * x0;
+		vstate.x2 = uint16(a & 0xFFFF);
+	}
+
+	
+private:	
+	// 16-bit words of the multiplier (a0 is the lowest) and the increment.
+	NVMATH_API static const uint16 a0, a1, a2, c0;
+
+	// 48-bit state as three 16-bit words; x0 is the least significant.
+	struct rand48_state_t { 
+		uint16 x0, x1, x2; 
+	} vstate;
+
+};
+
+} // nv namespace
+
+#endif // NV_MATH_RANDOM_H
--- a/src/nvmath/SphericalHarmonic.cpp
+++ b/src/nvmath/SphericalHarmonic.cpp
@ -0,0 +1,241 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvmath/SphericalHarmonic.h>
+
+using namespace nv;
+
+
+namespace
+{
+	
+	// Basic integer factorial: v * (v - 1) * ... * 1, with 0! = 1.
+	// Negative arguments are returned unchanged.
+	inline static int factorial( int v )
+	{
+		int product = (v == 0) ? 1 : v;
+
+		for (int k = v - 1; k > 0; k--) {
+			product *= k;
+		}
+
+		return product;
+	}
+	
+	
+	// Double factorial: the product of x and every second integer below it
+	// down to 1 or 2, i.e. n!! = n*(n - 2)*(n - 4)...
+	// By definition 0!! = (-1)!! = 1.
+	inline static int doubleFactorial( int x )
+	{
+		if (x == 0 || x == -1) {
+			return 1;
+		}
+
+		int product = x;
+		for (int k = x - 2; k > 0; k -= 2) {
+			product *= k;
+		}
+
+		return product;
+	}	
+	
+	/// Normalization constant for spherical harmonic:
+	/// sqrt((2l + 1) * (l - m)! / (4 * PI * (l + m)!)).
+	/// @param l is the band.
+	/// @param m is the argument, in the range [0, l]
+	inline static float K( int l, int m )
+	{
+		nvDebugCheck( m >= 0 );
+
+		// (l - m)! / (l + m)! == 1 / ((l - m + 1) * ... * (l + m)).
+		// Accumulating only that denominator, in double, avoids the overflow
+		// of the integer factorials (13! does not fit in a 32-bit int).
+		double denominator = 1.0;
+		for (int i = l - m + 1; i <= l + m; i++) {
+			denominator *= i;
+		}
+
+		return sqrtf(float((2 * l + 1) / (4 * PI * denominator)));
+	}
+	
+	/// Normalization constant for hemispherical harmonic:
+	/// sqrt((2l + 1) * (l - m)! / (2 * PI * (l + m)!)).
+	/// @param l is the band.
+	/// @param m is the argument, in the range [0, l]
+	inline static float HK( int l, int m )
+	{
+		nvDebugCheck( m >= 0 );
+
+		// (l - m)! / (l + m)! == 1 / ((l - m + 1) * ... * (l + m)).
+		// Accumulating only that denominator, in double, avoids the overflow
+		// of the integer factorials (13! does not fit in a 32-bit int).
+		double denominator = 1.0;
+		for (int i = l - m + 1; i <= l + m; i++) {
+			denominator *= i;
+		}
+
+		return sqrtf(float((2 * l + 1) / (2 * PI * denominator)));
+	}
+
+	/// Evaluate the Legendre polynomial for band @a l and argument @a m at
+	/// @a x using the generic recurrence rules. The recursive calls go through
+	/// legendrePolynomial(), so lower bands use the closed-form versions.
+	/// Expects 0 <= m <= l and |x| <= 1 (see the disabled checks below).
+	static float legendre( int l, int m, float x )
+	{
+	//	piDebugCheck( m >= 0 );
+	//	piDebugCheck( m <= l );
+	//	piDebugCheck( fabs(x) <= 1 );
+	
+		// Rule 2 needs no previous results
+		if (l == m) {
+			return powf(-1.0f, float(m)) * doubleFactorial(2 * m - 1) * powf(1 - x*x, 0.5f * m);
+		}
+	
+		// Rule 3 requires the result for the same argument of the previous band
+		if (l == m + 1) {
+			return x * (2 * m + 1) * legendrePolynomial(m, m, x);
+		}
+	
+		// Main recurrence used by rule 1 that uses result of the same argument from
+		// the previous two bands
+		return (x * (2 * l - 1) * legendrePolynomial(l - 1, m, x) - (l + m - 1) * legendrePolynomial(l - 2, m, x)) / (l - m);
+	}
+	
+	
+	// Closed-form Legendre polynomials for bands 0 to 4, selected at compile
+	// time by band l and argument m. Only the specializations below exist;
+	// the primary template is declared but not defined.
+	template <int l, int m> float legendre(float x);
+	
+	// Band 0.
+	template <> float legendre<0, 0>(float x) {
+		return 1;
+	}
+	
+	// Band 1.
+	template <> float legendre<1, 0>(float x) {
+		return x;
+	}
+	template <> float legendre<1, 1>(float x) {
+		return -sqrtf(1 - x * x);
+	}
+	
+	// Band 2.
+	template <> float legendre<2, 0>(float x) {
+		return -0.5f + (3 * x * x) / 2;
+	}
+	template <> float legendre<2, 1>(float x) {
+		return -3 * x * sqrtf(1 - x * x);
+	}
+	template <> float legendre<2, 2>(float x) {
+		return -3 * (-1 + x * x);
+	}
+	
+	// Band 3.
+	template <> float legendre<3, 0>(float x) {
+		return -(3 * x) / 2 + (5 * x * x * x) / 2;
+	}
+	template <> float legendre<3, 1>(float x) {
+		return -3 * sqrtf(1 - x * x) / 2 * (-1 + 5 * x * x);
+	}
+	template <> float legendre<3, 2>(float x) {
+		return -15 * (-x + x * x * x);
+	}
+	template <> float legendre<3, 3>(float x) {
+		return -15 * powf(1 - x * x, 1.5f);
+	}
+	
+	// Band 4.
+	template <> float legendre<4, 0>(float x) {
+		return 0.125f * (3.0f - 30.0f * x * x + 35.0f * x * x * x * x);
+	}
+	template <> float legendre<4, 1>(float x) {
+		return -2.5f * x * sqrtf(1.0f - x * x) * (7.0f * x * x - 3.0f);
+	}
+	template <> float legendre<4, 2>(float x) {
+		return -7.5f * (1.0f - 8.0f * x * x + 7.0f * x * x * x * x);
+	}
+	template <> float legendre<4, 3>(float x) {
+		return -105.0f * x * powf(1 - x * x, 1.5f);
+	}
+	template <> float legendre<4, 4>(float x) {
+		return 105.0f * (x * x - 1.0f) * (x * x - 1.0f);
+	}
+
+} // namespace
+
+
+/// Evaluate the Legendre polynomial for band @a l and argument @a m at @a x.
+/// Bands 0 to 4 use the closed-form specializations; within those bands, any
+/// @a m that is not one of the lower arguments selects the highest argument
+/// of the band. Other bands use the generic recurrence.
+float nv::legendrePolynomial(int l, int m, float x)
+{
+	if (l == 0) {
+		return legendre<0, 0>(x);
+	}
+
+	if (l == 1) {
+		return (m == 0) ? legendre<1, 0>(x) : legendre<1, 1>(x);
+	}
+
+	if (l == 2) {
+		switch(m) {
+			case 0:  return legendre<2, 0>(x);
+			case 1:  return legendre<2, 1>(x);
+			default: return legendre<2, 2>(x);
+		}
+	}
+
+	if (l == 3) {
+		switch(m) {
+			case 0:  return legendre<3, 0>(x);
+			case 1:  return legendre<3, 1>(x);
+			case 2:  return legendre<3, 2>(x);
+			default: return legendre<3, 3>(x);
+		}
+	}
+
+	if (l == 4) {
+		switch(m) {
+			case 0:  return legendre<4, 0>(x);
+			case 1:  return legendre<4, 1>(x);
+			case 2:  return legendre<4, 2>(x);
+			case 3:  return legendre<4, 3>(x);
+			default: return legendre<4, 4>(x);
+		}
+	}
+
+	// Fallback to the expensive version.
+	return legendre(l, m, x);
+}
+
+
+/**
+ * Evaluate the real spherical harmonic function for the given angles.
+ * Positive arguments use the cosine of the azimuth, negative ones the sine.
+ * @param l is the band.
+ * @param m is the argument, in the range [-l,l]
+ * @param theta is the altitude, in the range [0, PI]
+ * @param phi is the azimuth, in the range [0, 2*PI]
+ */
+float nv::y( int l, int m, float theta, float phi )
+{
+	const float cosTheta = cosf(theta);
+
+	if( m == 0 ) {
+		// K(l, 0) = sqrt((2*l+1)/(4*PI))
+		return sqrtf((2 * l + 1) / (4 * PI)) * legendrePolynomial(l, 0, cosTheta);
+	}
+
+	const int absM = (m > 0) ? m : -m;
+	const float azimuthal = (m > 0) ? cosf(m * phi) : sinf(-m * phi);
+
+	return sqrtf(2.0f) * K(l, absM) * azimuthal * legendrePolynomial(l, absM, cosTheta);
+}
+
+
+/**
+ * Real spherical harmonic function of a unit vector. The vector is converted
+ * to angles and passed on to the angular function, using:
+ * x = sin(theta)*cos(phi)
+ * y = sin(theta)*sin(phi)
+ * z = cos(theta)
+ */
+float nv::y( int l, int m, Vector3::Arg v )
+{
+	return y( l, m, acosf(v.z()), atan2f(v.y(), v.x()) );
+}
+
+
+/**
+ * Evaluate the hemispherical harmonic function for the given angles. It has
+ * the same form as the spherical one, with the hemispherical normalization
+ * and the Legendre polynomials evaluated at 2*cos(theta)-1.
+ * @param l is the band.
+ * @param m is the argument, in the range [-l,l]
+ * @param theta is the altitude, in the range [0, PI/2]
+ * @param phi is the azimuth, in the range [0, 2*PI]
+ */
+float nv::hy( int l, int m, float theta, float phi )
+{
+	const float x = 2*cosf(theta)-1;
+
+	if( m == 0 ) {
+		// HK(l, 0) = sqrt((2*l+1)/(2*PI))
+		return sqrtf((2 * l + 1) / (2 * PI)) * legendrePolynomial(l, 0, x);
+	}
+
+	const int absM = (m > 0) ? m : -m;
+	const float azimuthal = (m > 0) ? cosf(m * phi) : sinf(-m * phi);
+
+	return sqrtf(2.0f) * HK(l, absM) * azimuthal * legendrePolynomial(l, absM, x);
+}
+
+
+/**
+ * Real hemispherical harmonic function of a unit vector. Uses the following
+ * equalities to call the angular function:
+ * x = sin(theta)*cos(phi)
+ * y = sin(theta)*sin(phi)
+ * z = cos(theta)
+ */
+float nv::hy( int l, int m, Vector3::Arg v )
+{
+	float theta = acosf(v.z());
+	float phi = atan2f(v.y(), v.x());
+	// Forward to the hemispherical angular function, not the spherical y().
+	return hy( l, m, theta, phi );
+}
+
+
+
--- a/src/nvmath/SphericalHarmonic.h
+++ b/src/nvmath/SphericalHarmonic.h
@ -0,0 +1,419 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_SPHERICALHARMONIC_H
+#define NV_MATH_SPHERICALHARMONIC_H
+
+#include <nvmath/Vector.h>
+
+namespace nv
+{
+
+	NVMATH_API float legendrePolynomial( int l, int m, float x ) NV_CONST;
+	NVMATH_API float y( int l, int m, float theta, float phi ) NV_CONST;
+	NVMATH_API float y( int l, int m, Vector3::Arg v ) NV_CONST;
+	NVMATH_API float hy( int l, int m, float theta, float phi ) NV_CONST;
+	NVMATH_API float hy( int l, int m, Vector3::Arg v ) NV_CONST;
+	
+	class Sh;
+	float dot(const Sh & a, const Sh & b) NV_CONST;
+
+
+	/// Spherical harmonic class.
+	///
+	/// Stores one float coefficient per basis function, (order + 1)^2 in
+	/// total, in a heap array owned by the object. The order is fixed at
+	/// construction.
+	class Sh
+	{
+		friend class Sh2;
+		friend class ShMatrix;
+	public:
+		
+		/// Construct a spherical harmonic of the given order.
+		/// The coefficients are not initialized; see reset().
+		Sh(int o) : m_order(o)
+		{
+			m_elemArray = new float[basisNum()];
+		}
+		
+		/// Copy constructor.
+		Sh(const Sh & sh) : m_order(sh.order())
+		{
+			m_elemArray = new float[basisNum()];
+			memcpy(m_elemArray, sh.m_elemArray, sizeof(float) * basisNum());
+		}
+		
+		/// Destructor.
+		~Sh()
+		{
+			delete [] m_elemArray;
+			m_elemArray = NULL;
+		}
+		
+		/// Get number of bands.
+		static int bandNum(int order) {
+			return order + 1;
+		}
+		
+		/// Get number of sh basis.
+		static int basisNum(int order) {
+			return (order + 1) * (order + 1);
+		}
+		
+		/// Get the index for the given coefficients.
+		/// Band l starts at l * l and its arguments m = -l..l follow in order.
+		static int index( int l, int m ) {
+			return l * l + l + m;
+		}
+		
+		/// Get sh order.
+		int order() const
+		{
+			return m_order;
+		}
+		
+		/// Get number of bands of this sh.
+		int bandNum() const
+		{
+			return bandNum(m_order);
+		}
+		
+		/// Get number of sh basis of this sh.
+		int basisNum() const
+		{
+			return basisNum(m_order);
+		}
+		
+		/// Get sh coefficient indexed by l,m.
+		float elem( int l, int m ) const
+		{
+			return m_elemArray[index(l, m)];
+		}
+		
+		/// Get sh coefficient indexed by l,m.
+		float & elem( int l, int m )
+		{
+			return m_elemArray[index(l, m)];
+		}
+		
+		
+		/// Get sh coefficient indexed by i.
+		float elemAt( int i ) const {
+			return m_elemArray[i];
+		}
+		
+		/// Get sh coefficient indexed by i.
+		float & elemAt( int i )
+		{
+			return m_elemArray[i];
+		}
+		
+		
+		/// Reset the sh coefficients.
+		void reset()
+		{
+			for( int i = 0; i < basisNum(); i++ ) {
+				m_elemArray[i] = 0.0f;
+			}
+		}
+		
+		/// Copy spherical harmonic.
+		/// Copies as many coefficients as this object holds, so the source
+		/// must have at least the same order; the order itself is not changed.
+		void operator= ( const Sh & sh )
+		{
+			nvDebugCheck(order() <= sh.order());
+			
+			for(int i = 0; i < basisNum(); i++) {
+				m_elemArray[i] = sh.m_elemArray[i];
+			}
+		}
+		
+		/// Add spherical harmonics.
+		void operator+= ( const Sh & sh )
+		{
+			nvDebugCheck(order() == sh.order());
+			
+			for(int i = 0; i < basisNum(); i++) {
+				m_elemArray[i] += sh.m_elemArray[i];
+			}
+		}
+		
+		/// Subtract spherical harmonics.
+		void operator-= ( const Sh & sh )
+		{
+			nvDebugCheck(order() == sh.order());
+			
+			for(int i = 0; i < basisNum(); i++) {
+				m_elemArray[i] -= sh.m_elemArray[i];
+			}
+		}
+		
+		// Not exactly convolution, nor product.
+		// Multiplies the coefficients element by element.
+		void operator*= ( const Sh & sh )
+		{
+			nvDebugCheck(order() == sh.order());
+			
+			for(int i = 0; i < basisNum(); i++) {
+				m_elemArray[i] *= sh.m_elemArray[i];
+			}
+		}
+		
+		/// Scale spherical harmonics.
+		void operator*= ( float f )
+		{
+			for(int i = 0; i < basisNum(); i++) {
+				m_elemArray[i] *= f;
+			}
+		}
+		
+		/// Add scaled spherical harmonics.
+		void addScaled( const Sh & sh, float f )
+		{
+			nvDebugCheck(order() == sh.order());
+			
+			for(int i = 0; i < basisNum(); i++) {
+				m_elemArray[i] += sh.m_elemArray[i] * f;
+			}
+		}
+		
+		
+		/*/// Add a weighted sample to the sh coefficients.
+		void AddSample( const Vec3 & dir, const Color3f & color, float w=1.0f ) {
+			for(int l = 0; l <= order; l++) {
+				for(int m = -l; m <= l; m++) {
+					Color3f & elem = GetElem(l, m);
+					elem.Mad( elem, color, w * y(l, m, dir) );
+				}
+			}
+		}*/
+		
+		/// Evaluate every basis function in the given direction and store the
+		/// values as the coefficients, overwriting the previous ones.
+		void eval(Vector3::Arg dir)
+		{
+			for(int l = 0; l <= m_order; l++) {
+				for(int m = -l; m <= l; m++) {
+					elem(l, m) = y(l, m, dir);
+				}
+			}
+		}
+		
+		
+		/// Evaluate the spherical harmonic function.
+		/// Builds a temporary Sh holding the basis values for @a dir and
+		/// returns its dot product with this one.
+		float sample(Vector3::Arg dir) const
+		{
+			Sh sh(order());
+			sh.eval(dir);
+			
+			return dot(sh, *this);
+		}
+		
+		
+	protected:
+		
+		const int m_order;		// sh order, fixed at construction
+		float * m_elemArray;	// owned array of basisNum() coefficients
+		
+	};
+
+
+	/// Compute dot product of the spherical harmonics: the sum of the
+	/// products of matching coefficients. Both are expected to have the same
+	/// order.
+	inline float dot(const Sh & a, const Sh & b)
+	{
+		nvDebugCheck(a.order() == b.order());
+
+		const int count = Sh::basisNum(a.order());
+
+		float total = 0;
+		for (int k = 0; k != count; ++k) {
+			total += a.elemAt(k) * b.elemAt(k);
+		}
+
+		return total;
+	}
+
+	
+	/// Second order spherical harmonic (3 bands, 9 coefficients).
+	class Sh2 : public Sh
+	{
+	public:
+		
+		/// Constructor.
+		Sh2() : Sh(2) {}
+		
+		/// Copy constructor.
+		Sh2(const Sh2 & sh) : Sh(sh) {}
+		
+		/// Spherical harmonic resulting from projecting the clamped cosine transfer function to the SH basis.
+		/// Overwrites all nine coefficients with constants.
+		void cosineTransfer()
+		{
+			const float c1 = 0.282095f;	// K(0, 0)
+			const float c2 = 0.488603f; // K(1, 0)
+			const float c3 = 1.092548f; // sqrt(15.0f / PI) / 2.0f = K(2, -2)
+			const float c4 = 0.315392f; // sqrt(5.0f / PI) / 4.0f = K(2, 0)
+			const float c5 = 0.546274f; // sqrt(15.0f / PI) / 4.0f = K(2, 2)
+			
+			const float normalization = PI * 16.0f / 17.0f;
+			
+			// Per-band weights: 1 for band 0, 2/3 for band 1, 1/4 for band 2.
+			const float const1 = c1 * normalization * 1.0f;
+			const float const2 = c2 * normalization * (2.0f / 3.0f);
+			const float const3 = c3 * normalization * (1.0f / 4.0f);
+			const float const4 = c4 * normalization * (1.0f / 4.0f);
+			const float const5 = c5 * normalization * (1.0f / 4.0f);
+			
+			// Band 0.
+			m_elemArray[0] = const1;
+			
+			// Band 1.
+			m_elemArray[1] = -const2;
+			m_elemArray[2] = const2;
+			m_elemArray[3] = -const2;
+			
+			// Band 2.
+			m_elemArray[4] = const3;
+			m_elemArray[5] = -const3;
+			m_elemArray[6] = const4;
+			m_elemArray[7] = -const3;
+			m_elemArray[8] = const5;
+		}
+	};
+	
+	
+
+#if 0
+
+/// Spherical harmonic matrix.
+class ShMatrix
+{
+public:
+
+	/// Create an identity matrix of the given order.
+	ShMatrix(int o = 2) : order(o), identity(true)
+	{
+		nvCheck(order > 0);
+		e = new float[Size()];
+		band = new float *[GetBandNum()];		
+		setupBands();
+	}
+
+	/// Destroy and free matrix elements.
+	~ShMatrix()
+	{
+		delete e;
+		delete band;
+	}
+
+	/// Set identity matrix.
+	void setIdentity()
+	{
+		identity = true;
+	}
+
+	/// Return true if this is an identity matrix, false in other case.
+	bool isIdentity() const {
+		return identity;
+	}
+	
+	/// Get number of bands of this matrix.
+	int bandNum() const
+	{
+		return order+1;
+	}
+	
+	/// Get total number of elements in the matrix.
+	int size() const
+	{
+		int size = 0;
+		for( int i = 0; i < bandNum(); i++ ) {
+			size += SQ(i * 2 + 1);
+		}
+		return size;
+	}
+
+	/// Get element at the given raw index.
+	float elem(const int idx) const
+	{
+		return e[idx];
+	}
+	
+	/// Get element at the given with the given indices.
+	float & elem( const int b, const int x, const int y )
+	{
+		nvDebugCheck(b >= 0);
+		nvDebugCheck(b < bandNum());
+		return band[b][(b + y) * (b * 2 + 1) + (b + x)];
+	}
+
+	/// Get element at the given with the given indices.
+	float elem( const int b, const int x, const int y ) const
+	{
+		nvDebugCheck(b >= 0);
+		nvDebugCheck(b < bandNum());
+		return band[b][(b + y) * (b * 2 + 1) + (b + x)];
+	}
+
+	/** Copy matrix. */
+	void Copy( const ShMatrix & m )
+	{
+		nvDebugCheck(order == m.order);
+		memcpy(e, m.e, Size() * sizeof(float));
+	}
+	
+	/** Rotate the given coefficients. */
+	void transform( const Sh & restrict source,  Sh * restrict dest ) const {
+		piCheck( &source != dest );	// Make sure there's no aliasing.
+		piCheck( dest->order <= order );
+		piCheck( order <= source.order );
+		
+		if( identity ) {
+			*dest = source;
+			return;
+		}
+		
+		// Loop through each band.
+		for( int l = 0; l <= dest->order; l++ ) {
+			
+			for( int mo = -l; mo <= l; mo++ ) {
+				
+				Color3f rgb = Color3f::Black;
+				
+				for( int mi = -l; mi <= l; mi++ ) {
+					rgb.Mad( rgb, source.elem(l, mi), elem(l, mo, mi) );
+				}
+				
+				dest->elem(l, mo) = rgb;
+			}
+		}
+	}
+
+
+	MATHLIB_API void multiply( const ShMatrix &A, const ShMatrix &B );
+	MATHLIB_API void rotation( const Matrix & m );
+	MATHLIB_API void rotation( int axis, float angles );
+	MATHLIB_API void print();
+	
+
+private:
+
+	// @@ These could be static indices precomputed only once.
+	/// Setup the band pointers.
+	void setupBands()
+	{
+		int size = 0;
+		for( int i = 0; i < bandNum(); i++ ) {
+			band[i] = &e[size];
+			size += SQ(i * 2 + 1);
+		}
+	}
+	
+	
+private:
+
+	// Matrix order.
+	const int m_order;
+
+	// Identity flag for quick transform.
+	bool m_identity;
+
+	// Array of elements.
+	float * m_e;
+	
+	// Band pointers.
+	float ** m_band;
+	
+};
+
+#endif // 0
+
+
+
+} // nv namespace
+
+#endif // NV_MATH_SPHERICALHARMONIC_H
--- a/src/nvmath/TriBox.cpp
+++ b/src/nvmath/TriBox.cpp
@ -0,0 +1,226 @@
+/********************************************************/
+/* AABB-triangle overlap test code                      */
+/* by Tomas Akenine-Möller                              */
+/* Function: int triBoxOverlap(float boxcenter[3],      */
+/*          float boxhalfsize[3],float triverts[3][3]); */
+/* History:                                             */
+/*   2001-03-05: released the code in its first version */
+/*   2001-06-18: changed the order of the tests, faster */
+/*                                                      */
+/* Acknowledgement: Many thanks to Pierre Terdiman for  */
+/* suggestions and discussions on how to optimize code. */
+/* Thanks to David Hunt for finding a ">="-bug!         */
+/********************************************************/
+
+#include <nvmath/Vector.h>
+#include <nvmath/Triangle.h>
+
+using namespace nv;
+
+// Axis indices kept from the original array-based code.
+// NOTE(review): nothing in this file appears to use X, Y or Z any more.
+#define X 0
+#define Y 1
+#define Z 2
+
+// Store the minimum and maximum of x0, x1 and x2 into 'min' and 'max'.
+// The macro expands to several statements, so it must be used as a complete
+// statement inside a braced scope, never as the unbraced body of an if/else.
+#define FINDMINMAX(x0,x1,x2,min,max) \
+  min = max = x0;   \
+  if(x1<min) min=x1;\
+  if(x1>max) max=x1;\
+  if(x2<min) min=x2;\
+  if(x2>max) max=x2;
+
+
+// Test whether the plane with the given normal passing through 'vert'
+// intersects the origin-centered box with half size 'maxbox'.
+static bool planeBoxOverlap(Vector3::Arg normal, Vector3::Arg vert, Vector3::Arg maxbox)	// -NJMP-
+{
+	// Per axis: -1 when the normal component is <= 0, +1 otherwise.
+	const float sx = (normal.x() <= 0.0f) ? -1.0f : 1.0f;
+	const float sy = (normal.y() <= 0.0f) ? -1.0f : 1.0f;
+	const float sz = (normal.z() <= 0.0f) ? -1.0f : 1.0f;
+
+	// Box corner that lies furthest along the normal direction.
+	Vector3 corner = scale(Vector3(sx, sy, sz), maxbox);
+
+	// Nearest and furthest corners along the normal, relative to the plane point.
+	Vector3 vmin = -corner - vert;
+	Vector3 vmax = corner - vert;
+
+	// Even the nearest corner is in front of the plane: no overlap.
+	if (dot(normal, vmin) > 0.0f) return false;
+
+	// Otherwise the box overlaps only if the furthest corner reaches the plane.
+	return dot(normal, vmax) >= 0.0f;
+}
+
+
+// The AXISTEST_* macros implement the nine edge cross-product axis tests.
+// They are not self-contained: each one expands in place and relies on the
+// calling function having locals named v0, v1, v2, boxhalfsize, p0, p1, p2,
+// min, max and rad, and each executes 'return false' in the caller when the
+// projected triangle interval [min, max] lies outside [-rad, rad].
+// In the calls below, a and b are two components of a triangle edge and
+// fa and fb are their absolute values.
+
+/*======================== X-tests ========================*/
+#define AXISTEST_X01(a, b, fa, fb) \
+	p0 = a*v0.y() - b*v0.z(); \
+	p2 = a*v2.y() - b*v2.z(); \
+	if(p0<p2) {min=p0; max=p2;} else {min=p2; max=p0;} \
+	rad = fa * boxhalfsize.y() + fb * boxhalfsize.z(); \
+	if(min>rad || max<-rad) return false;
+
+#define AXISTEST_X2(a, b, fa, fb) \
+	p0 = a*v0.y() - b*v0.z(); \
+	p1 = a*v1.y() - b*v1.z(); \
+	if(p0<p1) {min=p0; max=p1;} else {min=p1; max=p0;} \
+	rad = fa * boxhalfsize.y() + fb * boxhalfsize.z(); \
+	if(min>rad || max<-rad) return false;
+
+/*======================== Y-tests ========================*/
+#define AXISTEST_Y02(a, b, fa, fb) \
+	p0 = -a*v0.x() + b*v0.z(); \
+	p2 = -a*v2.x() + b*v2.z(); \
+	if(p0<p2) {min=p0; max=p2;} else {min=p2; max=p0;} \
+	rad = fa * boxhalfsize.x() + fb * boxhalfsize.z(); \
+	if(min>rad || max<-rad) return false;
+
+#define AXISTEST_Y1(a, b, fa, fb) \
+	p0 = -a*v0.x() + b*v0.z(); \
+	p1 = -a*v1.x() + b*v1.z(); \
+	if(p0<p1) {min=p0; max=p1;} else {min=p1; max=p0;} \
+	rad = fa * boxhalfsize.x() + fb * boxhalfsize.z(); \
+	if(min>rad || max<-rad) return false;
+
+/*======================== Z-tests ========================*/
+
+#define AXISTEST_Z12(a, b, fa, fb) \
+	p1 = a*v1.x() - b*v1.y();	\
+	p2 = a*v2.x() - b*v2.y();	\
+	if(p2<p1) {min=p2; max=p1;} else {min=p1; max=p2;} \
+	rad = fa * boxhalfsize.x() + fb * boxhalfsize.y(); \
+	if(min>rad || max<-rad) return false;
+
+#define AXISTEST_Z0(a, b, fa, fb) \
+	p0 = a*v0.x() - b*v0.y();	\
+	p1 = a*v1.x() - b*v1.y();	\
+	if(p0<p1) {min=p0; max=p1;} else {min=p1; max=p0;} \
+	rad = fa * boxhalfsize.x() + fb * boxhalfsize.y(); \
+	if(min>rad || max<-rad) return false;
+
+
+// Shared implementation of triBoxOverlap and triBoxOverlapNoBounds.
+//
+// Uses the separating axis theorem to test overlap between a triangle and an
+// axis-aligned box given by its center and half size. The candidate axes are:
+// 1) the {x,y,z}-directions, i.e. the AABB of the triangle against the box
+//    (only tested when testBounds is true),
+// 2) the normal of the triangle,
+// 3) crossproduct(edge from tri, {x,y,z}-direction), which gives 3x3=9 tests.
+// Returns false as soon as one tested axis separates the two shapes, and
+// true when none of the tested axes does.
+static bool triBoxOverlapImpl(Vector3::Arg boxcenter, Vector3::Arg boxhalfsize, const Triangle & tri, bool testBounds)
+{
+	// Scratch variables read and written by the FINDMINMAX and AXISTEST_* macros.
+	float min, max, p0, p1, p2, rad, fex, fey, fez;
+
+	// Move everything so that the boxcenter is in (0,0,0).
+	Vector3 v0 = tri.v[0] - boxcenter;
+	Vector3 v1 = tri.v[1] - boxcenter;
+	Vector3 v2 = tri.v[2] - boxcenter;
+
+	// Compute triangle edges.
+	Vector3 e0 = v1 - v0;	// tri edge 0
+	Vector3 e1 = v2 - v1;	// tri edge 1
+	Vector3 e2 = v0 - v2;	// tri edge 2
+
+	// Bullet 3:
+	//  test the 9 edge cross-product axes first (this was faster).
+	//  Each AXISTEST_* macro returns false from this function on separation.
+	fex = fabsf(e0.x());
+	fey = fabsf(e0.y());
+	fez = fabsf(e0.z());
+	AXISTEST_X01(e0.z(), e0.y(), fez, fey);
+	AXISTEST_Y02(e0.z(), e0.x(), fez, fex);
+	AXISTEST_Z12(e0.y(), e0.x(), fey, fex);
+
+	fex = fabsf(e1.x());
+	fey = fabsf(e1.y());
+	fez = fabsf(e1.z());
+	AXISTEST_X01(e1.z(), e1.y(), fez, fey);
+	AXISTEST_Y02(e1.z(), e1.x(), fez, fex);
+	AXISTEST_Z0(e1.y(), e1.x(), fey, fex);
+
+	fex = fabsf(e2.x());
+	fey = fabsf(e2.y());
+	fez = fabsf(e2.z());
+	AXISTEST_X2(e2.z(), e2.y(), fez, fey);
+	AXISTEST_Y1(e2.z(), e2.x(), fez, fex);
+	AXISTEST_Z12(e2.y(), e2.x(), fey, fex);
+
+	if (testBounds) {
+		// Bullet 1:
+		//  test overlap in the {x,y,z}-directions:
+		//  find min, max of the triangle in each direction, and test for overlap in
+		//  that direction -- this is equivalent to testing a minimal AABB around
+		//  the triangle against the AABB.
+
+		// test in X-direction
+		FINDMINMAX(v0.x(), v1.x(), v2.x(), min, max);
+		if(min > boxhalfsize.x() || max < -boxhalfsize.x()) return false;
+
+		// test in Y-direction
+		FINDMINMAX(v0.y(), v1.y(), v2.y(), min, max);
+		if(min > boxhalfsize.y() || max < -boxhalfsize.y()) return false;
+
+		// test in Z-direction
+		FINDMINMAX(v0.z(), v1.z(), v2.z(), min, max);
+		if(min > boxhalfsize.z() || max < -boxhalfsize.z()) return false;
+	}
+
+	// Bullet 2:
+	//  test if the box intersects the plane of the triangle
+	//  compute plane equation of triangle: normal*x+d=0
+	Vector3 normal = cross(e0, e1);
+
+	return planeBoxOverlap(normal, v0, boxhalfsize);
+}
+
+
+// Triangle/box overlap test that checks all separating axes, including the
+// box axes. Defined with an explicit nv:: qualifier so that it implements the
+// function declared in <nvmath/Triangle.h>; an unqualified definition after
+// 'using namespace nv' would define a different, global function.
+bool nv::triBoxOverlap(Vector3::Arg boxcenter, Vector3::Arg boxhalfsize, const Triangle & tri)
+{
+	return triBoxOverlapImpl(boxcenter, boxhalfsize, tri, true);
+}
+
+
+// Triangle/box overlap test that skips the three box-axis tests; presumably
+// meant for callers that already know the bounding boxes overlap.
+// Qualified with nv:: for the same reason as triBoxOverlap.
+bool nv::triBoxOverlapNoBounds(Vector3::Arg boxcenter, Vector3::Arg boxhalfsize, const Triangle & tri)
+{
+	return triBoxOverlapImpl(boxcenter, boxhalfsize, tri, false);
+}
--- a/src/nvmath/Triangle.cpp
+++ b/src/nvmath/Triangle.cpp
@ -0,0 +1,168 @@
+// This code is in the public domain -- Ignacio Castaño <castanyo@yahoo.es>
+
+#include <nvmath/Triangle.h>
+
+using namespace nv;
+
+
+/// Tomas Moller, barycentric ray-triangle test.
+///
+/// Intersects the ray starting at @a orig with direction @a dir against the
+/// triangle @a t. Only triangles with a positive determinant (front-facing
+/// with respect to the ray) can be hit.
+/// On a hit, returns true and writes the ray parameter to @a out_t and the
+/// barycentric weights of t.v[1] and t.v[2] to @a out_u and @a out_v.
+/// On a miss, returns false and leaves the outputs untouched.
+/// Defined with an explicit nv:: qualifier so that it implements the function
+/// declared in <nvmath/Triangle.h>; an unqualified definition after
+/// 'using namespace nv' would define a different, global function.
+bool nv::rayTest_Moller(const Triangle & t, Vector3::Arg orig, Vector3::Arg dir, float * out_t, float * out_u, float * out_v)
+{
+	// find vectors for two edges sharing vert0 
+	Vector3 e1 = t.v[1] - t.v[0];
+	Vector3 e2 = t.v[2] - t.v[0];
+
+	// begin calculating determinant - also used to calculate U parameter
+	Vector3 pvec = cross(dir, e2);
+	
+	// A negative determinant (back-facing triangle) can never satisfy the
+	// 0 <= u <= det test below, and a zero determinant (ray parallel to the
+	// triangle plane) would lead to a division by zero, so reject both here.
+	float det = dot(e1, pvec);
+	if (det <= 0.0f) {
+		return false;
+	}
+
+	// calculate distance from vert0 to ray origin
+	Vector3 tvec = orig - t.v[0];
+
+	// calculate U parameter (still scaled by det) and test bounds
+	float u = dot(tvec, pvec);
+	if( u < 0.0f || u > det ) {
+		return false;
+	}
+
+	// prepare to test V parameter
+	Vector3 qvec = cross(tvec, e1);
+
+	// calculate V parameter (still scaled by det) and test bounds
+	float v = dot(dir, qvec);
+	if (v < 0.0f || u + v > det) {
+		return false;
+	}
+
+	// calculate t, scale parameters, ray intersects triangle
+	float inv_det = 1.0f / det;
+	*out_t = dot(e2, qvec) * inv_det;
+	*out_u = u * inv_det;	// weight of t.v[1]
+	*out_v = v * inv_det;	// weight of t.v[2]; t.v[0] gets 1-(u+v)
+
+	return true;
+}
+
+
+
+
+
+#if 0
+
+
+// IC: This code is adapted from my Pi.MathLib code, based on Moller-Trumbore triangle test.
+FXVector3 edge1, edge2, pvec, tvec, qvec;
+
+edge1 = tri.V1 - tri.V0;
+edge2 = tri.V2 - tri.V0;
+
+pvec.Cross(ray.Direction, edge2);
+
+float det = FXVector3.Dot(edge1, pvec);
+
+// calculate distance from vert0 to ray origin.
+FXVector3 tvec = ray.Origin - vert0;
+
+if( det < 0 ) 
+{
+	// calculate U parameter and test bounds.
+	float u = FXVector3.Dot(tvec, pvec);
+	if (u > 0.0 || u < det)
+	{
+		return false;
+	}
+
+	// prepare to test V parameter.
+	qvec.Cross(tvec, edge1);
+
+	// calculate V parameter and test bounds.
+	float v = FXVector3.Dot(dir, qvec);
+
+	return v <= 0.0 && u + v >= det;
+}
+else
+{
+	// calculate U parameter and test bounds.
+	float u = FXVector3.Dot(tvec, pvec);
+	if (u < 0.0 || u > det)
+	{
+		return false;
+	}
+
+	// prepare to test V parameter.
+	qvec.Cross(tvec, edge1);
+
+	// calculate V parameter and test bounds.
+	float v = FXVector3.Dot(dir, qvec);
+
+	return v >= 0.0 && u + v <= det;
+}
+
+
+
+/** 
+ * Dan Sunday, parametric ray-triangle test.
+ *
+ * Tests the ray from p0 towards p1 against the triangle (v0, v1, v2) whose
+ * plane normal n is supplied by the caller.
+ */
+//    Output: I = intersection point of the ray with the triangle plane
+//                (written once the parallel and behind-the-origin checks pass)
+//    Return: true  = the ray hits the triangle
+//            false = the ray is parallel to the plane, points away from it,
+//                    or meets the plane outside the triangle
+//    NOTE(review): a degenerate triangle makes D zero and is not rejected
+//    before the divisions below - confirm before re-enabling this code.
+bool RayTriangleTest( const Vec3 &p0, const Vec3 &p1, 
+					  const Vec3 &v0, const Vec3 &v1, const Vec3 &v2, const Vec3 &n,
+					  Vec3 &I ) {
+    Vec3 u, v;					// triangle vectors
+    Vec3 dir, w0, w;			// ray vectors
+    float r, a, b;				// params to calc ray-plane intersect
+
+    // get triangle edge vectors (the plane normal n is an input)
+    u.Sub( v1, v0 );
+    v.Sub( v2, v0 );
+
+    dir.Sub( p1, p0 );			// ray direction vector
+	w0.Sub( p0, v0 );
+    a = Vec3DotProduct( n, w0 );
+    b = Vec3DotProduct( n, dir );
+
+    if( fabs(b) < TI_EPSILON ) 	// ray is parallel to triangle plane
+		return false;
+
+
+    // get intersect point of ray with triangle plane
+    r = -a / b;
+    if( r < 0.0f )				// ray goes away from triangle
+        return false;			// => no intersect
+    
+	// for a segment, also test if (r > 1.0) => no intersect
+
+	I.Mad( p0, dir, r );		// intersect point of ray and plane
+
+    // is I inside T?
+    float    uu, uv, vv, wu, wv, D;
+    uu = Vec3DotProduct( u, u );
+    uv = Vec3DotProduct( u, v );
+    vv = Vec3DotProduct( v, v );
+    w = I - v0;
+    wu = Vec3DotProduct( w, u );
+    wv = Vec3DotProduct( w, v );
+    D = uv * uv - uu * vv;
+
+    // get and test parametric coords
+    float s, t;
+    s = (uv * wv - vv * wu) / D;
+    if( s<0.0 || s > 1.0)        // I is outside T
+        return false;
+    t = (uv * wu - uu * wv) / D;
+    if( t<0.0 || (s + t) > 1.0)  // I is outside T
+        return false;
+
+    return true;                      // I is in T
+}
+
+
+#endif // 0
--- a/src/nvmath/Triangle.h
+++ b/src/nvmath/Triangle.h
@ -0,0 +1,81 @@
+// This code is in the public domain -- Ignacio Castaño <castanyo@yahoo.es>
+
+#ifndef NV_MATH_TRIANGLE_H
+#define NV_MATH_TRIANGLE_H
+
+#include <nvmath/nvmath.h>
+#include <nvmath/Vector.h>
+#include <nvmath/Box.h>
+
+namespace nv
+{
+
+	/// Triangle defined by three vertices.
+	class Triangle
+	{
+	public:
+		/// Default constructor; the vertices are whatever Vector3 default-constructs.
+		Triangle() {}
+
+		/// Construct a triangle from its three vertices.
+		Triangle(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2)
+		{
+			v[0] = v0;
+			v[1] = v1;
+			v[2] = v2;
+		}
+
+		/// Get the bounds of the triangle.
+		Box bounds() const
+		{
+			Box box;
+			box.clearBounds();
+			for (int i = 0; i < 3; i++) {
+				box.addPointToBounds(v[i]);
+			}
+			return box;
+		}
+
+		/// Get the triangle plane as (n, d), where n = (v1 - v0) x (v2 - v0)
+		/// is the unnormalized normal and d = dot(n, v0).
+		Vector4 plane() const
+		{
+			Vector3 edge01 = v[1] - v[0];
+			Vector3 edge02 = v[2] - v[0];
+			Vector3 normal = cross(edge01, edge02);
+			return Vector4(normal, dot(normal, v[0]));
+		}
+
+		/// The three vertices of the triangle.
+		Vector3 v[3];
+	};
+
+
+	// Tomas Akenine-Möller box-triangle test.
+	NVMATH_API bool triBoxOverlap(Vector3::Arg boxcenter, Vector3::Arg boxhalfsize, const Triangle & triangle);
+	NVMATH_API bool triBoxOverlapNoBounds(Vector3::Arg boxcenter, Vector3::Arg boxhalfsize, const Triangle & triangle);
+
+
+	// Moller ray triangle test.
+	NVMATH_API bool rayTest_Moller(const Triangle & t, Vector3::Arg orig, Vector3::Arg dir, float * out_t, float * out_u, float * out_v);
+
+	/// Default ray-triangle test; forwards all arguments to rayTest_Moller
+	/// and returns its result.
+	inline bool rayTest(const Triangle & t, Vector3::Arg orig, Vector3::Arg dir, float * out_t, float * out_u, float * out_v)
+	{
+		return rayTest_Moller(t, orig, dir, out_t, out_u, out_v);
+	}
+	
+	/// Test whether triangle @a t overlaps box @a b, using triBoxOverlap on
+	/// the center and extents of the box.
+	inline bool overlap(const Triangle & t, const Box & b)
+	{
+		const Vector3 boxCenter = b.center();
+		const Vector3 boxExtents = b.extents();
+		const bool hit = triBoxOverlap(boxCenter, boxExtents, t);
+		return hit;
+	}
+
+	/// Same test with the arguments swapped; forwards to overlap(t, b).
+	inline bool overlap(const Box & b, const Triangle & t)
+	{
+		return overlap(t, b);
+	}
+
+	/// Test whether triangle @a t overlaps box @a b, using
+	/// triBoxOverlapNoBounds on the center and extents of the box.
+	inline bool overlapNoBounds(const Triangle & t, const Box & b)
+	{
+		const Vector3 boxCenter = b.center();
+		const Vector3 boxExtents = b.extents();
+		const bool hit = triBoxOverlapNoBounds(boxCenter, boxExtents, t);
+		return hit;
+	}
+
+} // nv namespace
+
+#endif	// NV_MATH_TRIANGLE_H
--- a/src/nvmath/Vector.h
+++ b/src/nvmath/Vector.h
@ -4,7 +4,7 @@
 #define NV_MATH_VECTOR_H

 #include <nvmath/nvmath.h>
-#include <nvcore/Algorithms.h> // min, max
+#include <nvcore/Containers.h> // min, max

 namespace nv
 {
@ -27,7 +27,6 @@ public:
 	Vector2(Vector2::Arg v);
 	
 	const Vector2 & operator=(Vector2::Arg v);
-	void setComponent(uint idx, scalar f);
 	
 	scalar x() const;
 	scalar y() const;
@ -72,7 +71,6 @@ public:
 	const Vector2 & xy() const;

 	scalar component(uint idx) const;
-	void setComponent(uint idx, scalar f);

 	const scalar * ptr() const;

@ -117,7 +115,6 @@ public:
 	const Vector3 & xyz() const;

 	scalar component(uint idx) const;
-	void setComponent(uint idx, scalar f);

 	const scalar * ptr() const;

@ -164,14 +161,6 @@ inline scalar Vector2::component(uint idx) const
 	return 0.0f;
 }

-inline void Vector2::setComponent(uint idx, float f)
-{
-	nvDebugCheck(idx < 2);
-	if (idx == 0) m_x = f;
-	else if (idx == 1) m_y = f;
-}
-
-
 inline const scalar * Vector2::ptr() const
 {
 	return &m_x;
@ -250,21 +239,13 @@ inline const Vector2 & Vector3::xy() const
 inline scalar Vector3::component(uint idx) const
 {
 	nvDebugCheck(idx < 3);
-	if (idx == 0) return m_x;
-	if (idx == 1) return m_y;
-	if (idx == 2) return m_z;
+	if (idx == 0) return x();
+	if (idx == 1) return y();
+	if (idx == 2) return z();
 	nvAssume(false);
 	return 0.0f;
 }

-inline void Vector3::setComponent(uint idx, float f)
-{
-	nvDebugCheck(idx < 3);
-	if (idx == 0) m_x = f;
-	else if (idx == 1) m_y = f;
-	else if (idx == 2) m_z = f;
-}
-
 inline const scalar * Vector3::ptr() const
 {
 	return &m_x;
@ -372,15 +353,6 @@ inline scalar Vector4::component(uint idx) const
 	return 0.0f;
 }

-inline void Vector4::setComponent(uint idx, float f)
-{
-	nvDebugCheck(idx < 4);
-	if (idx == 0) m_x = f;
-	else if (idx == 1) m_y = f;
-	else if (idx == 2) m_z = f;
-	else if (idx == 3) m_w = f;
-}
-
 inline const scalar * Vector4::ptr() const
 {
 	return &m_x;
@ -505,35 +477,6 @@ inline scalar length(Vector2::Arg v)
 	return sqrtf(length_squared(v));
 }

-inline scalar inverse_length(Vector2::Arg v)
-{
-	return 1.0f / sqrtf(length_squared(v));
-}
-
-inline bool isNormalized(Vector2::Arg v, float epsilon = NV_NORMAL_EPSILON)
-{
-	return equal(length(v), 1, epsilon);
-}
-
-inline Vector2 normalize(Vector2::Arg v, float epsilon = NV_EPSILON)
-{
-	float l = length(v);
-	nvDebugCheck(!isZero(l, epsilon));
-	Vector2 n = scale(v, 1.0f / l);
-	nvDebugCheck(isNormalized(n));
-	return n;
-}
-
-inline Vector2 normalizeSafe(Vector2::Arg v, Vector2::Arg fallback, float epsilon = NV_EPSILON)
-{
-	float l = length(v);
-	if (isZero(l, epsilon)) {
-		return fallback;
-	}
-	return scale(v, 1.0f / l);
-}
-
-
 inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON)
 {
 	return equal(v1.x(), v2.x(), epsilon) && equal(v1.y(), v2.y(), epsilon);
@ -652,11 +595,6 @@ inline scalar length(Vector3::Arg v)
 	return sqrtf(length_squared(v));
 }

-inline scalar inverse_length(Vector3::Arg v)
-{
-	return 1.0f / sqrtf(length_squared(v));
-}
-
 inline bool isNormalized(Vector3::Arg v, float epsilon = NV_NORMAL_EPSILON)
 {
 	return equal(length(v), 1, epsilon);
@ -778,11 +716,6 @@ inline scalar length(Vector4::Arg v)
 	return sqrtf(length_squared(v));
 }

-inline scalar inverse_length(Vector4::Arg v)
-{
-	return 1.0f / sqrtf(length_squared(v));
-}
-
 inline bool isNormalized(Vector4::Arg v, float epsilon = NV_NORMAL_EPSILON)
 {
 	return equal(length(v), 1, epsilon);
--- a/src/nvmath/nvmath.h
+++ b/src/nvmath/nvmath.h
@ -1,196 +1,164 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#ifndef NV_MATH_H
-#define NV_MATH_H
-
-#include <nvcore/nvcore.h>
-#include <nvcore/Debug.h>
-
-#include <math.h>
-#include <limits.h> // INT_MAX
-
-#if NV_OS_WIN32
-#include <float.h>
-#endif
-
-// Function linkage
-#if NVMATH_SHARED
-#ifdef NVMATH_EXPORTS
-#define NVMATH_API DLL_EXPORT
-#define NVMATH_CLASS DLL_EXPORT_CLASS
-#else
-#define NVMATH_API DLL_IMPORT
-#define NVMATH_CLASS DLL_IMPORT
-#endif
-#else // NVMATH_SHARED
-#define NVMATH_API
-#define NVMATH_CLASS
-#endif // NVMATH_SHARED
-
-#ifndef PI
-#define PI      			float(3.1415926535897932384626433833)
-#endif
-
-#define NV_EPSILON			(0.0001f)
-#define NV_NORMAL_EPSILON	(0.001f)
-
-/*
-#define SQ(r)				((r)*(r))
-
-#define	SIGN_BITMASK		0x80000000
-
-/// Integer representation of a floating-point value.
-#define IR(x)					((uint32 &)(x))
-
-/// Absolute integer representation of a floating-point value
-#define AIR(x)					(IR(x) & 0x7fffffff)
-
-/// Floating-point representation of an integer value.
-#define FR(x)					((float&)(x))
-
-/// Integer-based comparison of a floating point value.
-/// Don't use it blindly, it can be faster or slower than the FPU comparison, depends on the context.
-#define IS_NEGATIVE_FLOAT(x)	(IR(x)&SIGN_BITMASK)
-*/
-
-inline double sqrt_assert(const double f)
-{
-    nvDebugCheck(f >= 0.0f);
-    return sqrt(f);
-}
-
-inline float sqrtf_assert(const float f)
-{
-    nvDebugCheck(f >= 0.0f);
-    return sqrtf(f);
-}
-
-inline double acos_assert(const double f)
-{
-    nvDebugCheck(f >= -1.0f && f <= 1.0f);
-    return acos(f);
-}
-
-inline float acosf_assert(const float f)
-{
-    nvDebugCheck(f >= -1.0f && f <= 1.0f);
-    return acosf(f);
-}
-
-inline double asin_assert(const double f)
-{
-    nvDebugCheck(f >= -1.0f && f <= 1.0f);
-    return asin(f);
-}
-
-inline float asinf_assert(const float f)
-{
-    nvDebugCheck(f >= -1.0f && f <= 1.0f);
-    return asinf(f);
-}
-
-// Replace default functions with asserting ones.
-#define sqrt sqrt_assert
-#define sqrtf sqrtf_assert
-#define acos acos_assert
-#define acosf acosf_assert
-#define asin asin_assert
-#define asinf asinf_assert
-
-namespace nv
-{
-    inline float toRadian(float degree) { return degree * (PI / 180.0f); }
-    inline float toDegree(float radian) { return radian * (180.0f / PI); }
-
-    inline bool equal(const float f0, const float f1, const float epsilon = NV_EPSILON)
-    {
-        return fabs(f0-f1) <= epsilon;
-    }
-
-    inline bool isZero(const float f, const float epsilon = NV_EPSILON)
-    {
-        return fabs(f) <= epsilon;
-    }
-
-    inline bool isFinite(const float f)
-    {
-#if NV_OS_WIN32
-        return _finite(f) != 0;
-#elif NV_OS_DARWIN || NV_OS_FREEBSD
-        return isfinite(f);
-#elif NV_OS_LINUX
-        return finitef(f);
-#else
-#	error "isFinite not supported"
-#endif
-        //return std::isfinite (f);
-        //return finite (f);
-    }
-
-    inline bool isNan(const float f)
-    {
-#if NV_OS_WIN32
-        return _isnan(f) != 0;
-#elif NV_OS_DARWIN || NV_OS_FREEBSD
-        return isnan(f);
-#elif NV_OS_LINUX
-        return isnanf(f);
-#else
-#	error "isNan not supported"
-#endif
-    }
-
-    inline uint log2(uint i)
-    {
-        uint value = 0;
-        while( i >>= 1 ) {
-            value++;
-        }
-        return value;
-    }
-
-    inline float log2f(float x)
-    {
-        nvCheck(x >= 0);
-        return logf(x) / logf(2.0f);
-    }
-
-    inline float lerp(float f0, float f1, float t)
-    {
-        const float s = 1.0f - t;
-        return f0 * s + f1 * t;
-    }
-
-    inline float square(float f)
-    {
-        return f * f;
-    }
-
-    // @@ Float to int conversions to be optimized at some point. See:
-    // http://cbloomrants.blogspot.com/2009/01/01-17-09-float-to-int.html
-    // http://www.stereopsis.com/sree/fpu2006.html
-    // http://assemblyrequired.crashworks.org/2009/01/12/why-you-should-never-cast-floats-to-ints/
-    // http://chrishecker.com/Miscellaneous_Technical_Articles#Floating_Point
-    inline int iround(float f)
-    {
-        return int(f);
-    }
-
-    inline int ifloor(float f)
-    {
-        return int(floorf(f));
-    }
-
-    inline int iceil(float f)
-    {
-        return int(ceilf(f));
-    }
-
-    inline float frac(float f)
-    {
-        return f - floor(f);
-    }
-
-} // nv
-
-#endif // NV_MATH_H
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_H
+#define NV_MATH_H
+
+#include <nvcore/nvcore.h>
+#include <nvcore/Debug.h>
+
+#include <math.h>
+
+// Function linkage
+#if NVMATH_SHARED
+#ifdef NVMATH_EXPORTS
+#define NVMATH_API DLL_EXPORT
+#define NVMATH_CLASS DLL_EXPORT_CLASS
+#else
+#define NVMATH_API DLL_IMPORT
+#define NVMATH_CLASS DLL_IMPORT
+#endif
+#else // NVMATH_SHARED
+#define NVMATH_API
+#define NVMATH_CLASS
+#endif // NVMATH_SHARED
+
+#ifndef PI
+#define PI      			float(3.1415926535897932384626433833)
+#endif
+
+#define NV_EPSILON			(0.0001f)
+#define NV_NORMAL_EPSILON	(0.001f)
+
+/*
+#define SQ(r)				((r)*(r))
+
+#define	SIGN_BITMASK		0x80000000
+
+/// Integer representation of a floating-point value.
+#define IR(x)					((uint32 &)(x))
+
+/// Absolute integer representation of a floating-point value
+#define AIR(x)					(IR(x) & 0x7fffffff)
+
+/// Floating-point representation of an integer value.
+#define FR(x)					((float&)(x))
+
+/// Integer-based comparison of a floating point value.
+/// Don't use it blindly, it can be faster or slower than the FPU comparison, depends on the context.
+#define IS_NEGATIVE_FLOAT(x)	(IR(x)&SIGN_BITMASK)
+*/
+
+// Checked wrappers around the C math functions. Each one verifies with
+// nvDebugCheck that its argument lies in the function's domain and then
+// calls the real function. They are defined before the macros that redirect
+// sqrt, acos, asin and their float variants to them, so the calls inside
+// these bodies still reach the C library.
+
+/// sqrt with a debug check that f is non-negative.
+inline double sqrt_assert(const double f)
+{
+	nvDebugCheck(f >= 0.0f);
+	return sqrt(f);
+}
+
+/// sqrtf with a debug check that f is non-negative.
+inline float sqrtf_assert(const float f)
+{
+	nvDebugCheck(f >= 0.0f);
+	return sqrtf(f);
+}
+
+/// acos with a debug check that f is in [-1, 1].
+inline double acos_assert(const double f)
+{
+	nvDebugCheck(f >= -1.0f && f <= 1.0f);
+	return acos(f);
+}
+
+/// acosf with a debug check that f is in [-1, 1].
+inline float acosf_assert(const float f)
+{
+	nvDebugCheck(f >= -1.0f && f <= 1.0f);
+	return acosf(f);
+}
+
+/// asin with a debug check that f is in [-1, 1].
+inline double asin_assert(const double f)
+{
+	nvDebugCheck(f >= -1.0f && f <= 1.0f);
+	return asin(f);
+}
+
+/// asinf with a debug check that f is in [-1, 1].
+inline float asinf_assert(const float f)
+{
+	nvDebugCheck(f >= -1.0f && f <= 1.0f);
+	return asinf(f);
+}
+
+// Replace default functions with asserting ones.
+#define sqrt sqrt_assert
+#define sqrtf sqrtf_assert
+#define acos acos_assert
+#define acosf acosf_assert
+#define asin asin_assert
+#define asinf asinf_assert
+
+#if NV_OS_WIN32
+#include <float.h>
+#endif
+
+namespace nv
+{
+/// Convert an angle from degrees to radians.
+inline float toRadian(float degree) { return degree * (PI / 180.0f); }
+/// Convert an angle from radians to degrees.
+inline float toDegree(float radian) { return radian * (180.0f / PI); }
+	
+/// Return true when f0 and f1 differ by at most epsilon.
+inline bool equal(const float f0, const float f1, const float epsilon = NV_EPSILON)
+{
+	return fabs(f0-f1) <= epsilon;
+}
+
+/// Return true when the absolute value of f is at most epsilon.
+inline bool isZero(const float f, const float epsilon = NV_EPSILON)
+{
+	return fabs(f) <= epsilon;
+}
+
+/// Return whether f is a finite value, using the runtime function selected
+/// by the NV_OS_* macros. Compilation fails with an #error on any platform
+/// other than Win32, Darwin and Linux.
+inline bool isFinite(const float f)
+{
+#if NV_OS_WIN32
+	return _finite(f) != 0;
+#elif NV_OS_DARWIN
+	return isfinite(f);
+#elif NV_OS_LINUX
+	return finitef(f);
+#else
+#	error "isFinite not supported"
+#endif
+//return std::isfinite (f);
+//return finite (f);
+}
+
+/// Return whether f is NaN, using the runtime function selected by the
+/// NV_OS_* macros. Compilation fails with an #error on any platform other
+/// than Win32, Darwin and Linux.
+inline bool isNan(const float f)
+{
+#if NV_OS_WIN32
+	return _isnan(f) != 0;
+#elif NV_OS_DARWIN
+	return isnan(f);
+#elif NV_OS_LINUX
+	return isnanf(f);
+#else
+#	error "isNan not supported"
+#endif
+}
+
+/// Integer base-2 logarithm: the number of times i can be shifted right by
+/// one bit before it becomes zero, minus the initial shift (0 for i <= 1).
+inline uint log2(uint i)
+{
+	uint shifts = 0;
+	for( i >>= 1; i != 0; i >>= 1 ) {
+		shifts++;
+	}
+	return shifts;
+}
+
+/// Linear interpolation: returns f0 when t is 0 and f1 when t is 1.
+inline float lerp(float f0, float f1, float t)
+{
+	return f0 * (1.0f - t) + f1 * t;
+}
+
+/// Return f multiplied by itself.
+inline float square(float f)
+{
+	return f * f;
+}
+
+} // nv
+
+#endif // NV_MATH_H
--- a/src/nvtt/CMakeLists.txt
+++ b/src/nvtt/CMakeLists.txt
@ -5,17 +5,14 @@ ADD_SUBDIRECTORY(squish)
 SET(NVTT_SRCS
 	nvtt.h 
 	nvtt.cpp
-	Context.h
-	Context.cpp
+	Compressor.h
+	Compressor.cpp
 	nvtt_wrapper.h
 	nvtt_wrapper.cpp
-	Compressor.h
-	CompressorDXT.h
-	CompressorDXT.cpp
-	CompressorRGB.h
-	CompressorRGB.cpp
-	CompressorRGBE.h
-	CompressorRGBE.cpp
+	CompressDXT.h
+	CompressDXT.cpp
+	CompressRGB.h
+	CompressRGB.cpp
 	QuickCompressDXT.h
 	QuickCompressDXT.cpp
 	OptimalCompressDXT.h
@ -27,27 +24,27 @@ SET(NVTT_SRCS
 	InputOptions.cpp
 	OutputOptions.h
 	OutputOptions.cpp
-	TexImage.h TexImage.cpp
 	cuda/CudaUtils.h
 	cuda/CudaUtils.cpp
 	cuda/CudaMath.h
-	cuda/BitmapTable.h
-	cuda/CudaCompressorDXT.h
-	cuda/CudaCompressorDXT.cpp)
+	cuda/Bitmaps.h
+	cuda/CudaCompressDXT.h
+	cuda/CudaCompressDXT.cpp)

-IF (CUDA_FOUND)
+IF(CUDA_FOUND)
 	ADD_DEFINITIONS(-DHAVE_CUDA)
-	CUDA_COMPILE(CUDA_SRCS cuda/CompressKernel.cu)
+	WRAP_CUDA(CUDA_SRCS cuda/CompressKernel.cu)
 	SET(NVTT_SRCS ${NVTT_SRCS} ${CUDA_SRCS})
 	SET(LIBS ${LIBS} ${CUDA_LIBRARIES})
-	INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
-ENDIF (CUDA_FOUND)
+	INCLUDE_DIRECTORIES(${CUDA_INCLUDE_PATH})
+ENDIF(CUDA_FOUND)

 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

 ADD_DEFINITIONS(-DNVTT_EXPORTS)

-IF(NVTT_SHARED)	
+IF(NVTT_SHARED)
+	ADD_DEFINITIONS(-DNVTT_SHARED=1)
 	ADD_LIBRARY(nvtt SHARED ${NVTT_SRCS})
 ELSE(NVTT_SHARED)
 	ADD_LIBRARY(nvtt ${NVTT_SRCS})
@ -63,5 +60,54 @@ INSTALL(TARGETS nvtt
 INSTALL(FILES nvtt.h DESTINATION include/nvtt)


-ADD_SUBDIRECTORY(tools)
-ADD_SUBDIRECTORY(tests)
+
+# Command line tools and test executables.
+# Every target also lists tools/cmdline.h among its sources.
+
+# nvcompress is the only tool here that links against nvtt itself.
+ADD_EXECUTABLE(nvcompress tools/compress.cpp tools/cmdline.h)
+TARGET_LINK_LIBRARIES(nvcompress nvcore nvmath nvimage nvtt)
+
+# The remaining tools link only nvcore, nvmath and nvimage.
+ADD_EXECUTABLE(nvdecompress tools/decompress.cpp tools/cmdline.h)
+TARGET_LINK_LIBRARIES(nvdecompress nvcore nvmath nvimage)
+
+ADD_EXECUTABLE(nvddsinfo tools/ddsinfo.cpp tools/cmdline.h)
+TARGET_LINK_LIBRARIES(nvddsinfo nvcore nvmath nvimage)
+
+ADD_EXECUTABLE(nvimgdiff tools/imgdiff.cpp tools/cmdline.h)
+TARGET_LINK_LIBRARIES(nvimgdiff nvcore nvmath nvimage)
+
+ADD_EXECUTABLE(nvassemble tools/assemble.cpp tools/cmdline.h)
+TARGET_LINK_LIBRARIES(nvassemble nvcore nvmath nvimage)
+
+# filtertest is built from tests/ and is not part of the install list below.
+ADD_EXECUTABLE(filtertest tests/filtertest.cpp tools/cmdline.h)
+TARGET_LINK_LIBRARIES(filtertest nvcore nvmath nvimage)
+
+# The resize tool is built under the target name nvzoom.
+ADD_EXECUTABLE(nvzoom tools/resize.cpp tools/cmdline.h)
+TARGET_LINK_LIBRARIES(nvzoom nvcore nvmath nvimage)
+
+# Install the command line tools into <prefix>/bin.
+INSTALL(TARGETS nvcompress nvdecompress nvddsinfo nvimgdiff nvassemble nvzoom DESTINATION bin)
+
+# UI tools
+# The Qt front end is only built when Qt4 was found and the compiler is not MSVC.
+IF(QT4_FOUND AND NOT MSVC)
+	SET(QT_USE_QTOPENGL TRUE)
+	# CMAKE_CURRENT_BINARY_DIR is added to the include path, presumably so that
+	# the headers generated by QT4_WRAP_UI can be found - confirm.
+	INCLUDE_DIRECTORIES(${QT_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+	
+	SET(SRCS
+		tools/main.cpp
+		tools/configdialog.h
+		tools/configdialog.cpp)
+
+	# NOTE: this replaces any value LIBS had earlier in this file (for example
+	# the CUDA libraries appended above); the Qt libraries are listed explicitly.
+	SET(LIBS
+		nvtt
+		${QT_QTCORE_LIBRARY}
+		${QT_QTGUI_LIBRARY}
+		${QT_QTOPENGL_LIBRARY})
+
+	# Generate the UI header and the moc source for the configuration dialog.
+	QT4_WRAP_UI(UICS tools/configdialog.ui)
+	QT4_WRAP_CPP(MOCS tools/configdialog.h)
+	#QT4_ADD_RESOURCES(RCCS tools/configdialog.rc)
+
+	# nvcompressui is not covered by the INSTALL(TARGETS ...) rule for the command line tools.
+	ADD_EXECUTABLE(nvcompressui MACOSX_BUNDLE ${SRCS} ${UICS} ${MOCS})
+	TARGET_LINK_LIBRARIES(nvcompressui ${LIBS})
+
+ENDIF(QT4_FOUND AND NOT MSVC)
+
+
--- a/src/nvtt/CompressDXT.cpp
+++ b/src/nvtt/CompressDXT.cpp
@ -0,0 +1,597 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Memory.h>
+
+#include <nvimage/Image.h>
+#include <nvimage/ColorBlock.h>
+#include <nvimage/BlockDXT.h>
+
+#include "nvtt.h"
+#include "CompressDXT.h"
+#include "QuickCompressDXT.h"
+#include "OptimalCompressDXT.h"
+#include "CompressionOptions.h"
+#include "OutputOptions.h"
+
+// squish
+#include "squish/colourset.h"
+//#include "squish/clusterfit.h"
+#include "squish/fastclusterfit.h"
+#include "squish/weightedclusterfit.h"
+
+
+// s3_quant
+#if defined(HAVE_S3QUANT)
+#include "s3tc/s3_quant.h"
+#endif
+
+// ati tc
+#if defined(HAVE_ATITC)
+#include "atitc/ATI_Compress.h"
+#endif
+
+//#include <time.h>
+
+using namespace nv;
+using namespace nvtt;
+
+// Starts with no image attached; setImage() has to be called before any compress*() method.
+nv::FastCompressor::FastCompressor() : m_image(NULL), m_alphaMode(AlphaMode_None)
+{
+}
+// Empty destructor: the stored image pointer is not released here.
+nv::FastCompressor::~FastCompressor()
+{
+}
+// Stores the image pointer (the image itself is not copied) together with the alpha mode.
+void nv::FastCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode)
+{
+	m_image = image;
+	m_alphaMode = alphaMode;
+}
+// Encodes the attached image as DXT1 with QuickCompress, stepping through it 4 texels at a time.
+void nv::FastCompressor::compressDXT1(const OutputOptions::Private & outputOptions)
+{
+	const uint w = m_image->width();
+	const uint h = m_image->height();
+	// Scratch storage reused for every tile: the source texels and the encoded block.
+	ColorBlock rgba;
+	BlockDXT1 block;
+	// m_image is dereferenced without a NULL check, so setImage() must have been called.
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			rgba.init(m_image, x, y);
+			// Encode the tile that starts at (x, y).
+			QuickCompress::compressDXT1(rgba, &block);
+			// Hand the block to the output handler if one is set; otherwise it is discarded.
+			if (outputOptions.outputHandler != NULL) {
+				outputOptions.outputHandler->writeData(&block, sizeof(block));
+			}
+		}
+	}
+}
+
+// Same traversal as compressDXT1, but each tile is encoded with QuickCompress::compressDXT1a.
+void nv::FastCompressor::compressDXT1a(const OutputOptions::Private & outputOptions)
+{
+	const uint w = m_image->width();
+	const uint h = m_image->height();
+	// Scratch storage reused for every tile.
+	ColorBlock rgba;
+	BlockDXT1 block;
+	// Tiles are visited row by row, left to right, in steps of 4 texels.
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			rgba.init(m_image, x, y);
+			
+			QuickCompress::compressDXT1a(rgba, &block);
+			// Blocks reach the handler in visiting order; without a handler nothing is written.
+			if (outputOptions.outputHandler != NULL) {
+				outputOptions.outputHandler->writeData(&block, sizeof(block));
+			}
+		}
+	}
+}
+
+// Encodes the attached image as DXT3 blocks with QuickCompress::compressDXT3.
+void nv::FastCompressor::compressDXT3(const nvtt::OutputOptions::Private & outputOptions)
+{
+	const uint w = m_image->width();
+	const uint h = m_image->height();
+	// Scratch storage reused for every tile.
+	ColorBlock rgba;
+	BlockDXT3 block;
+	// Tiles are visited row by row, left to right, in steps of 4 texels.
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			rgba.init(m_image, x, y);
+
+			QuickCompress::compressDXT3(rgba, &block);
+			// Write the whole BlockDXT3 when a handler is installed.
+			if (outputOptions.outputHandler != NULL) {
+				outputOptions.outputHandler->writeData(&block, sizeof(block));
+			}
+		}
+	}
+}
+
+// Encodes the attached image as DXT5 blocks with QuickCompress::compressDXT5.
+void nv::FastCompressor::compressDXT5(const nvtt::OutputOptions::Private & outputOptions)
+{
+	const uint w = m_image->width();
+	const uint h = m_image->height();
+	// Scratch storage reused for every tile.
+	ColorBlock rgba;
+	BlockDXT5 block;
+	// Tiles are visited row by row, left to right, in steps of 4 texels.
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			rgba.init(m_image, x, y);
+			// NOTE(review): the meaning of the trailing 0 argument is defined by QuickCompress - confirm there.
+			QuickCompress::compressDXT5(rgba, &block, 0);
+			// Write the whole BlockDXT5 when a handler is installed.
+			if (outputOptions.outputHandler != NULL) {
+				outputOptions.outputHandler->writeData(&block, sizeof(block));
+			}
+		}
+	}
+}
+
+// Like compressDXT5, except that every tile goes through ColorBlock::swizzleDXT5n() before encoding.
+void nv::FastCompressor::compressDXT5n(const nvtt::OutputOptions::Private & outputOptions)
+{
+	const uint w = m_image->width();
+	const uint h = m_image->height();
+	// Scratch storage reused for every tile.
+	ColorBlock rgba;
+	BlockDXT5 block;
+	// Tiles are visited row by row, left to right, in steps of 4 texels.
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			rgba.init(m_image, x, y);
+			// NOTE(review): presumably rearranges the channels into the DXT5n layout - see ColorBlock.
+			rgba.swizzleDXT5n();
+
+			QuickCompress::compressDXT5(rgba, &block, 0);
+			// Write the whole BlockDXT5 when a handler is installed.
+			if (outputOptions.outputHandler != NULL) {
+				outputOptions.outputHandler->writeData(&block, sizeof(block));
+			}
+		}
+	}
+}
+
+// Starts with no image attached; setImage() has to be called before any compress*() method.
+nv::SlowCompressor::SlowCompressor() : m_image(NULL), m_alphaMode(AlphaMode_None)
+{
+}
+// Empty destructor: the stored image pointer is not released here.
+nv::SlowCompressor::~SlowCompressor()
+{
+}
+// Stores the image pointer (the image itself is not copied) together with the alpha mode.
+void nv::SlowCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode)
+{
+	m_image = image;
+	m_alphaMode = alphaMode;
+}
+// Encodes the attached image as DXT1 with squish's weighted cluster fit; single-color tiles use OptimalCompress.
+void nv::SlowCompressor::compressDXT1(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+{
+	const uint w = m_image->width();
+	const uint h = m_image->height();
+	// Scratch storage reused for every tile.
+	ColorBlock rgba;
+	BlockDXT1 block;
+	// One fitter for the whole image, configured with the x/y/z components of colorWeight.
+	squish::WeightedClusterFit fit;
+	//squish::ClusterFit fit;
+	//squish::FastClusterFit fit;
+	fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
+
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			
+			rgba.init(m_image, x, y);
+			// Uniform tiles skip the fitter and encode their one color directly.
+			if (rgba.isSingleColor())
+			{
+				OptimalCompress::compressDXT1(rgba.color(0), &block);
+			}
+			else
+			{
+				squish::ColourSet colours((uint8 *)rgba.colors(), 0, true);
+				fit.SetColourSet(&colours, squish::kDxt1);
+				fit.Compress(&block);
+			}
+			// Write the block when a handler is installed; otherwise it is discarded.
+			if (outputOptions.outputHandler != NULL) {
+				outputOptions.outputHandler->writeData(&block, sizeof(block));
+			}
+		}
+	}
+}
+
+// Encodes the attached image as DXT1 with 1-bit alpha; texels with alpha below 128 count as transparent.
+void nv::SlowCompressor::compressDXT1a(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+{
+	const uint w = m_image->width();
+	const uint h = m_image->height();
+	// Scratch storage reused for every tile.
+	ColorBlock rgba;
+	BlockDXT1 block;
+	// One fitter for the whole image, configured with the x/y/z components of colorWeight.
+	squish::WeightedClusterFit fit;
+	fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
+
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			
+			rgba.init(m_image, x, y);
+			// anyAlpha: at least one texel is transparent; allAlpha: every texel is transparent.
+			bool anyAlpha = false;
+			bool allAlpha = true;
+			
+			for (uint i = 0; i < 16; i++)
+			{
+				if (rgba.color(i).a < 128) anyAlpha = true;
+				else allAlpha = false;
+			}
+			// Fully transparent tiles and opaque single-color tiles are encoded from texel 0 alone.
+			if ((!anyAlpha && rgba.isSingleColor()) || allAlpha)
+			{
+				OptimalCompress::compressDXT1a(rgba.color(0), &block);
+			}
+			else
+			{
+				squish::ColourSet colours((uint8 *)rgba.colors(), squish::kDxt1|squish::kWeightColourByAlpha);
+				fit.SetColourSet(&colours, squish::kDxt1);
+				fit.Compress(&block);
+			}
+			// Write the block when a handler is installed; otherwise it is discarded.
+			if (outputOptions.outputHandler != NULL) {
+				outputOptions.outputHandler->writeData(&block, sizeof(block));
+			}
+		}
+	}
+}
+
+// Encodes the attached image as DXT3: explicit alpha via OptimalCompress, color via squish's weighted cluster fit.
+void nv::SlowCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+{
+	const uint w = m_image->width();
+	const uint h = m_image->height();
+	// Scratch storage reused for every tile.
+	ColorBlock rgba;
+	BlockDXT3 block;
+	// One fitter for the whole image, configured with the x/y/z components of colorWeight.
+	squish::WeightedClusterFit fit;
+	//squish::FastClusterFit fit;
+	fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
+
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			
+			rgba.init(m_image, x, y);
+			
+			// Compress explicit alpha.
+			OptimalCompress::compressDXT3A(rgba, &block.alpha);
+
+			// Compress color: uniform tiles are encoded directly, all others go through the fitter.
+			if (rgba.isSingleColor())
+			{
+				OptimalCompress::compressDXT1(rgba.color(0), &block.color);
+			}
+			else
+			{
+				squish::ColourSet colours((uint8 *)rgba.colors(), squish::kWeightColourByAlpha);
+				fit.SetColourSet(&colours, 0);
+				fit.Compress(&block.color);
+			}
+			// Write the alpha and color parts together as one BlockDXT3.
+			if (outputOptions.outputHandler != NULL) {
+				outputOptions.outputHandler->writeData(&block, sizeof(block));
+			}
+		}
+	}
+}
+// Encodes the attached image as DXT5: alpha quality depends on compressionOptions.quality, color uses the cluster fit.
+void nv::SlowCompressor::compressDXT5(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+{
+	const uint w = m_image->width();
+	const uint h = m_image->height();
+	// Scratch storage reused for every tile.
+	ColorBlock rgba;
+	BlockDXT5 block;
+	// One fitter for the whole image, configured with the x/y/z components of colorWeight.
+	squish::WeightedClusterFit fit;
+	fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
+
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			
+			rgba.init(m_image, x, y);
+
+			// Compress alpha: OptimalCompress only for Quality_Highest, QuickCompress otherwise.
+			if (compressionOptions.quality == Quality_Highest)
+			{
+				OptimalCompress::compressDXT5A(rgba, &block.alpha);
+			}
+			else
+			{
+				QuickCompress::compressDXT5A(rgba, &block.alpha);
+			}
+		
+			// Compress color: uniform tiles are encoded directly, all others go through the fitter.
+			if (rgba.isSingleColor())
+			{
+				OptimalCompress::compressDXT1(rgba.color(0), &block.color);
+			}
+			else
+			{
+				squish::ColourSet colours((uint8 *)rgba.colors(), squish::kWeightColourByAlpha);
+				fit.SetColourSet(&colours, 0);
+				fit.Compress(&block.color);
+			}
+			// Write the alpha and color parts together as one BlockDXT5.
+			if (outputOptions.outputHandler != NULL) {
+				outputOptions.outputHandler->writeData(&block, sizeof(block));
+			}
+		}
+	}
+}
+
+// Encodes the attached image as DXT5n: tiles are swizzled, then the alpha and color parts are encoded separately.
+void nv::SlowCompressor::compressDXT5n(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+{
+	const uint w = m_image->width();
+	const uint h = m_image->height();
+	// Scratch storage reused for every tile.
+	ColorBlock rgba;
+	BlockDXT5 block;
+	// No cluster fitter is used here; colorWeight is not read by this method.
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			
+			rgba.init(m_image, x, y);
+			// NOTE(review): presumably moves X into alpha and keeps Y in green - confirm in ColorBlock.
+			rgba.swizzleDXT5n();			
+			
+			// Compress X into the alpha part: OptimalCompress only for Quality_Highest.
+			if (compressionOptions.quality == Quality_Highest)
+			{
+				OptimalCompress::compressDXT5A(rgba, &block.alpha);
+			}
+			else
+			{
+				QuickCompress::compressDXT5A(rgba, &block.alpha);
+			}
+			
+			// Compress Y into the color part.
+			OptimalCompress::compressDXT1G(rgba, &block.color);
+			// Write the alpha and color parts together as one BlockDXT5.
+			if (outputOptions.outputHandler != NULL) {
+				outputOptions.outputHandler->writeData(&block, sizeof(block));
+			}
+		}
+	}
+}
+
+// Encodes the attached image as BC4: one AlphaBlockDXT5 per tile, produced by a compressDXT5A routine.
+void nv::SlowCompressor::compressBC4(const CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+	const uint w = m_image->width();
+	const uint h = m_image->height();
+	// Scratch storage reused for every tile.
+	ColorBlock rgba;
+	AlphaBlockDXT5 block;
+	// NOTE(review): which channel compressDXT5A reads from the tile is not visible here - confirm.
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			
+			rgba.init(m_image, x, y);
+			// OptimalCompress only for Quality_Highest, QuickCompress otherwise.
+			if (compressionOptions.quality == Quality_Highest)
+			{
+				OptimalCompress::compressDXT5A(rgba, &block);
+			}
+			else
+			{
+				QuickCompress::compressDXT5A(rgba, &block);
+			}
+			// Write the block when a handler is installed; otherwise it is discarded.
+			if (outputOptions.outputHandler != NULL) {
+				outputOptions.outputHandler->writeData(&block, sizeof(block));
+			}
+		}
+	}
+}
+
+// Encodes the attached image as BC5: each tile yields a BlockATI2 whose x and y parts are encoded separately.
+void nv::SlowCompressor::compressBC5(const CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+	const uint w = m_image->width();
+	const uint h = m_image->height();
+	// Two copies of each tile are kept, one prepared per output channel.
+	ColorBlock xcolor;
+	ColorBlock ycolor;
+
+	BlockATI2 block;
+
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			// NOTE(review): splatX()/splatY() presumably replicate one channel across the texel - confirm in ColorBlock.
+			xcolor.init(m_image, x, y);
+			xcolor.splatX();
+			
+			ycolor.init(m_image, x, y);
+			ycolor.splatY();
+			// OptimalCompress only for Quality_Highest, QuickCompress otherwise.
+			if (compressionOptions.quality == Quality_Highest)
+			{
+				OptimalCompress::compressDXT5A(xcolor, &block.x);
+				OptimalCompress::compressDXT5A(ycolor, &block.y);
+			}
+			else
+			{
+				QuickCompress::compressDXT5A(xcolor, &block.x);
+				QuickCompress::compressDXT5A(ycolor, &block.y);
+			}
+			// Write both parts together as one BlockATI2.
+			if (outputOptions.outputHandler != NULL) {
+				outputOptions.outputHandler->writeData(&block, sizeof(block));
+			}
+		}
+	}
+}
+
+
+#if defined(HAVE_S3QUANT)
+// DXT1 via s3_quant: each tile is coded at inLevel 4 and inLevel 3, the lower-error block is written, and the mean error is printed.
+void nv::s3CompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions)
+{
+	const uint w = image->width();
+	const uint h = image->height();
+	// Sum of the per-tile errors of the blocks that were actually written.
+	float error = 0.0f;
+
+	BlockDXT1 dxtBlock3;
+	BlockDXT1 dxtBlock4;
+	ColorBlock block;
+
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			block.init(image, x, y);
+
+			// Init rgb block.
+			RGBBlock rgbBlock;
+			rgbBlock.n = 16;
+			for (uint i = 0; i < 16; i++) {
+				rgbBlock.colorChannel[i][0] = clamp(float(block.color(i).r) / 255.0f, 0.0f, 1.0f);
+				rgbBlock.colorChannel[i][1] = clamp(float(block.color(i).g) / 255.0f, 0.0f, 1.0f);
+				rgbBlock.colorChannel[i][2] = clamp(float(block.color(i).b) / 255.0f, 0.0f, 1.0f);
+			}
+			rgbBlock.weight[0] = 1.0f;
+			rgbBlock.weight[1] = 1.0f;
+			rgbBlock.weight[2] = 1.0f;
+
+			rgbBlock.inLevel = 4;
+			CodeRGBBlock(&rgbBlock);
+
+			// Copy results to DXT block.
+			dxtBlock4.col0.r = rgbBlock.endPoint[0][0];
+			dxtBlock4.col0.g = rgbBlock.endPoint[0][1];
+			dxtBlock4.col0.b = rgbBlock.endPoint[0][2];
+
+			dxtBlock4.col1.r = rgbBlock.endPoint[1][0];
+			dxtBlock4.col1.g = rgbBlock.endPoint[1][1];
+			dxtBlock4.col1.b = rgbBlock.endPoint[1][2];
+
+			dxtBlock4.setIndices(rgbBlock.index);
+			// Keep col0 >= col1; after a swap, XOR with 0x55555555 flips the low bit of every 2-bit index.
+			if (dxtBlock4.col0.u < dxtBlock4.col1.u) {
+				swap(dxtBlock4.col0.u, dxtBlock4.col1.u);
+				dxtBlock4.indices ^= 0x55555555;
+			}
+
+			uint error4 = blockError(block, dxtBlock4);
+
+			rgbBlock.inLevel = 3;
+
+			CodeRGBBlock(&rgbBlock);
+
+			// Copy results to DXT block.
+			dxtBlock3.col0.r = rgbBlock.endPoint[0][0];
+			dxtBlock3.col0.g = rgbBlock.endPoint[0][1];
+			dxtBlock3.col0.b = rgbBlock.endPoint[0][2];
+
+			dxtBlock3.col1.r = rgbBlock.endPoint[1][0];
+			dxtBlock3.col1.g = rgbBlock.endPoint[1][1];
+			dxtBlock3.col1.b = rgbBlock.endPoint[1][2];
+
+			dxtBlock3.setIndices(rgbBlock.index);
+			// Keep col0 <= col1; after a swap, flip the low bit only of indices whose high bit is clear.
+			if (dxtBlock3.col0.u > dxtBlock3.col1.u) {
+				swap(dxtBlock3.col0.u, dxtBlock3.col1.u);
+				dxtBlock3.indices ^= (~dxtBlock3.indices  >> 1) & 0x55555555;
+			}
+
+			uint error3 = blockError(block, dxtBlock3);
+			// Write whichever block has the lower error; ties go to the inLevel 4 block.
+			if (error3 < error4) {
+				error += error3;
+
+				if (outputOptions.outputHandler != NULL) {
+					outputOptions.outputHandler->writeData(&dxtBlock3, sizeof(dxtBlock3));
+				}
+			}
+			else {
+				error += error4;
+
+				if (outputOptions.outputHandler != NULL) {
+					outputOptions.outputHandler->writeData(&dxtBlock4, sizeof(dxtBlock4));
+				}
+			}
+		}
+	}
+	const uint blockCount = ((w + 3) / 4) * ((h + 3) / 4); // Each factor is rounded up separately; the ungrouped form divided the product by 4.
+	printf("error = %f\n", error / blockCount);
+}
+
+#endif // defined(HAVE_S3QUANT)
+
+
+#if defined(HAVE_ATITC)
+// Compresses the whole image to DXT1 with the ATI_Compress library and writes the result in a single call.
+void nv::atiCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions)
+{
+	// Init source texture
+	ATI_TC_Texture srcTexture;
+	srcTexture.dwSize = sizeof(srcTexture);
+	srcTexture.dwWidth = image->width();
+	srcTexture.dwHeight = image->height();
+	srcTexture.dwPitch = image->width() * 4;
+	srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
+	srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
+	srcTexture.pData = (ATI_TC_BYTE*) image->pixels();
+
+	// Init dest texture
+	ATI_TC_Texture destTexture;
+	destTexture.dwSize = sizeof(destTexture);
+	destTexture.dwWidth = image->width();
+	destTexture.dwHeight = image->height();
+	destTexture.dwPitch = 0;
+	destTexture.format = ATI_TC_FORMAT_DXT1;
+	destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
+	destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);
+
+	// Compress
+	ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL);
+	if (outputOptions.outputHandler != NULL) {
+		outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
+	}
+	mem::free(destTexture.pData); // Release the temporary output buffer; it was leaked on every call before.
+}
+
+#endif // defined(HAVE_ATITC)
--- a/src/nvtt/CompressDXT.h
+++ b/src/nvtt/CompressDXT.h
@ -0,0 +1,87 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_COMPRESSDXT_H
+#define NV_TT_COMPRESSDXT_H
+
+#include <nvimage/nvimage.h>
+#include "nvtt.h"
+
+namespace nv
+{
+	class Image;
+	class FloatImage;
+	// Block compressor whose methods take only output options; CompressDXT.cpp implements them with QuickCompress.
+	class FastCompressor
+	{
+	public:
+		FastCompressor();
+		~FastCompressor();
+		// Stores the pointer and the alpha mode; must be called before any compress*() method.
+		void setImage(const Image * image, nvtt::AlphaMode alphaMode);
+		// One entry point per output format; each writes its blocks to the output handler, if any.
+		void compressDXT1(const nvtt::OutputOptions::Private & outputOptions);
+		void compressDXT1a(const nvtt::OutputOptions::Private & outputOptions);
+		void compressDXT3(const nvtt::OutputOptions::Private & outputOptions);
+		void compressDXT5(const nvtt::OutputOptions::Private & outputOptions);
+		void compressDXT5n(const nvtt::OutputOptions::Private & outputOptions);
+		// m_alphaMode is stored by setImage(); no compress method in CompressDXT.cpp reads it.
+	private:
+		const Image * m_image;
+		nvtt::AlphaMode m_alphaMode;
+	};
+	// Block compressor whose methods also take compression options (color weights, quality).
+	class SlowCompressor
+	{
+	public:
+		SlowCompressor();
+		~SlowCompressor();
+		// Stores the pointer and the alpha mode; must be called before any compress*() method.
+		void setImage(const Image * image, nvtt::AlphaMode alphaMode);
+		// One entry point per output format; each writes its blocks to the output handler, if any.
+		void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+		void compressDXT1a(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+		void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+		void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+		void compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+		void compressBC4(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+		void compressBC5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+		// m_alphaMode is stored by setImage(); no compress method in CompressDXT.cpp reads it.
+	private:
+		const Image * m_image;
+		nvtt::AlphaMode m_alphaMode;
+	};
+
+	// External compressors, each declared only when its HAVE_* macro is defined.
+#if defined(HAVE_S3QUANT)
+	void s3CompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
+#endif
+	
+#if defined(HAVE_ATITC)
+	void atiCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
+#endif
+
+} // nv namespace
+
+
+#endif // NV_TT_COMPRESSDXT_H
--- a/src/nvtt/CompressRGB.cpp
+++ b/src/nvtt/CompressRGB.cpp
@ -0,0 +1,140 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Debug.h>
+
+#include <nvimage/Image.h>
+#include <nvimage/PixelFormat.h>
+#include <nvmath/Color.h>
+
+#include "CompressRGB.h"
+#include "CompressionOptions.h"
+#include "OutputOptions.h"
+
+using namespace nv;
+using namespace nvtt;
+
+namespace 
+{
+	// Bytes per output row: w pixels of ceil(bitsize / 8) bytes each, rounded up to a multiple of 4.
+	inline uint computePitch(uint w, uint bitsize)
+	{
+		uint p = w * ((bitsize + 7) / 8);
+
+		// Align to 32 bits: round p up to the next multiple of 4 bytes.
+		return ((p + 3) / 4) * 4;
+	}
+	// Row copy for the 32-bit ARGB fast path: 4 bytes per pixel are copied unchanged.
+	inline void convert_to_a8r8g8b8(const void * src, void * dst, uint w)
+	{
+		memcpy(dst, src, 4 * w);
+	}
+	// Row copy for the 32-bit XRGB fast path: the same copy as above, alpha bytes included.
+	inline void convert_to_x8r8g8b8(const void * src, void * dst, uint w)
+	{
+		memcpy(dst, src, 4 * w);
+	}
+
+} // namespace
+
+
+// Pixel format converter: writes the image row by row in the layout given by compressionOptions (bitcount and channel masks).
+void nv::compressRGB(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	nvCheck(image != NULL);
+
+	const uint w = image->width();
+	const uint h = image->height();
+	// Only whole-byte pixel sizes of 8, 16, 24 or 32 bits pass the check below.
+	const uint bitCount = compressionOptions.bitcount;
+	nvCheck(bitCount == 8 || bitCount == 16 || bitCount == 24 || bitCount == 32);
+
+	const uint byteCount = bitCount / 8;
+	// For each channel, derive the shift and size of its mask.
+	const uint rmask = compressionOptions.rmask;
+	uint rshift, rsize;
+	PixelFormat::maskShiftAndSize(rmask, &rshift, &rsize);
+	
+	const uint gmask = compressionOptions.gmask;
+	uint gshift, gsize;
+	PixelFormat::maskShiftAndSize(gmask, &gshift, &gsize);
+	
+	const uint bmask = compressionOptions.bmask;
+	uint bshift, bsize;
+	PixelFormat::maskShiftAndSize(bmask, &bshift, &bsize);
+	
+	const uint amask = compressionOptions.amask;
+	uint ashift, asize;
+	PixelFormat::maskShiftAndSize(amask, &ashift, &asize);
+
+	// Determine pitch: bytes per output row, rounded up to a multiple of 4 by computePitch().
+	uint pitch = computePitch(w, compressionOptions.bitcount);
+	// One row buffer reused for every scanline; 4 spare bytes are allocated past the pitch.
+	uint8 * dst = (uint8 *)mem::malloc(pitch + 4);
+
+	for (uint y = 0; y < h; y++)
+	{
+		const Color32 * src = image->scanline(y);
+		// Fast paths: these two 32-bit layouts are copied with memcpy; pitch equals 4 * w, so there is no padding.
+		if (bitCount == 32 && rmask == 0xFF0000 && gmask == 0xFF00 && bmask == 0xFF && amask == 0xFF000000)
+		{
+			convert_to_a8r8g8b8(src, dst, w);
+		}
+		else if (bitCount == 32 && rmask == 0xFF0000 && gmask == 0xFF00 && bmask == 0xFF && amask == 0)
+		{
+			convert_to_x8r8g8b8(src, dst, w);
+		}
+		else
+		{
+			// Generic pixel format conversion: convert each 8-bit channel to its mask size and shift it into place.
+			for (uint x = 0; x < w; x++)
+			{
+				uint c = 0;
+				c |= PixelFormat::convert(src[x].r, 8, rsize) << rshift;
+				c |= PixelFormat::convert(src[x].g, 8, gsize) << gshift;
+				c |= PixelFormat::convert(src[x].b, 8, bsize) << bshift;
+				c |= PixelFormat::convert(src[x].a, 8, asize) << ashift;
+				
+				// Output one byte at a time, least significant byte first.
+				for (uint i = 0; i < byteCount; i++)
+				{
+					*(dst + x * byteCount + i) = (c >> (i * 8)) & 0xFF;
+				}
+			}
+			
+			// Zero padding: clear the bytes between the last pixel and the pitch.
+			for (uint x = w * byteCount; x < pitch; x++)
+			{
+				*(dst + x) = 0;
+			}
+		}
+		// Emit the finished row (pitch bytes) when a handler is installed.
+		if (outputOptions.outputHandler != NULL)
+		{
+			outputOptions.outputHandler->writeData(dst, pitch);
+		}
+	}
+
+	mem::free(dst);
+}
+
--- a/src/nvtt/CompressorRGBE.h
+++ b/src/nvtt/CompressorRGBE.h
@ -21,20 +21,19 @@
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 // OTHER DEALINGS IN THE SOFTWARE.

-#ifndef NV_TT_COMPRESSORRGBE_H
-#define NV_TT_COMPRESSORRGBE_H
+#ifndef NV_TT_COMPRESSRGB_H
+#define NV_TT_COMPRESSRGB_H

 #include "nvtt.h"
-#include "Compressor.h"

 namespace nv
 {
-    struct CompressorRGBE : public CompressorInterface
-	{
-		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, const void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-	};
+	class Image;

+	// Pixel format converter.
+	void compressRGB(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	
 } // nv namespace


-#endif // NV_TT_COMPRESSORRGBE_H
+#endif // NV_TT_COMPRESSRGB_H
--- a/src/nvtt/CompressionOptions.cpp
+++ b/src/nvtt/CompressionOptions.cpp
@ -55,12 +55,6 @@ void CompressionOptions::reset()
 	m.rmask = 0x00FF0000;
 	m.amask = 0xFF000000;

-	m.rsize = 8;
-	m.gsize = 8;
-	m.bsize = 8;
-	m.asize = 8;
-	m.pixelType = PixelType_UnsignedNorm;
-
 	m.enableColorDithering = false;
 	m.enableAlphaDithering = false;
 	m.binaryAlpha = false;
@ -123,36 +117,8 @@ void CompressionOptions::setPixelFormat(uint bitcount, uint rmask, uint gmask, u
 	m.gmask = gmask;
 	m.bmask = bmask;
 	m.amask = amask;
-
-	m.rsize = 0;
-	m.gsize = 0;
-	m.bsize = 0;
-	m.asize = 0;
 }

-void CompressionOptions::setPixelFormat(uint8 rsize, uint8 gsize, uint8 bsize, uint8 asize)
-{
-	nvCheck(rsize <= 32 || gsize <= 32 || bsize <= 32 || asize <= 32);
-
-	m.bitcount = 0;
-	m.rmask = 0;
-	m.gmask = 0;
-	m.bmask = 0;
-	m.amask = 0;
-
-	m.rsize = rsize;
-	m.gsize = gsize;
-	m.bsize = bsize;
-	m.asize = asize;
-}
-
-/// Set pixel type.
-void CompressionOptions::setPixelType(PixelType pixelType)
-{
-	m.pixelType = pixelType;
-}
-
-
 /// Use external compressor.
 void CompressionOptions::setExternalCompressor(const char * name)
 {
--- a/src/nvtt/CompressionOptions.h
+++ b/src/nvtt/CompressionOptions.h
@ -45,12 +45,6 @@ namespace nvtt
 		uint gmask;
 		uint bmask;
 		uint amask;
-		uint8 rsize;
-		uint8 gsize;
-		uint8 bsize;
-		uint8 asize;
-		
-		PixelType pixelType;
 		
 		nv::String externalCompressor;

@ -59,15 +53,6 @@ namespace nvtt
 		bool enableAlphaDithering;
 		bool binaryAlpha;
 		int alphaThreshold;			// reference value used for binary alpha quantization.
-
-		uint getBitCount() const
-		{
-			if (format == Format_RGBA) {
-				if (bitcount != 0) return bitcount;
-				else return rsize + gsize + bsize + asize;
-			}
-			return 0;
-		}
 	};

 } // nvtt namespace
--- a/src/nvtt/Compressor.cpp
+++ b/src/nvtt/Compressor.cpp
@ -0,0 +1,853 @@
+// Copyright NVIDIA Corporation 2008 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvtt/nvtt.h>
+
+#include <nvcore/Memory.h>
+#include <nvcore/Ptr.h>
+
+#include <nvimage/DirectDrawSurface.h>
+#include <nvimage/ColorBlock.h>
+#include <nvimage/BlockDXT.h>
+#include <nvimage/Image.h>
+#include <nvimage/FloatImage.h>
+#include <nvimage/Filter.h>
+#include <nvimage/Quantize.h>
+#include <nvimage/NormalMap.h>
+#include <nvimage/PixelFormat.h>
+
+#include "Compressor.h"
+#include "InputOptions.h"
+#include "CompressionOptions.h"
+#include "OutputOptions.h"
+
+#include "CompressDXT.h"
+#include "CompressRGB.h"
+#include "cuda/CudaUtils.h"
+#include "cuda/CudaCompressDXT.h"
+
+
+using namespace nv;
+using namespace nvtt;
+
+
+namespace
+{
+
+	// Storage size of one compressed 4x4 block in the given format.
+	// Formats that are not block compressed yield 0.
+	static int blockSize(Format format)
+	{
+		// Formats stored in 8 units per block.
+		const bool smallBlock =
+			format == Format_DXT1 ||
+			format == Format_DXT1a ||
+			format == Format_BC4;
+
+		if (smallBlock) {
+			return 8;
+		}
+
+		// Formats stored in 16 units per block.
+		const bool largeBlock =
+			format == Format_DXT3 ||
+			format == Format_DXT5 ||
+			format == Format_DXT5n ||
+			format == Format_BC5;
+
+		if (largeBlock) {
+			return 16;
+		}
+
+		return 0;
+	}
+
+	// Length in bytes of one row of `w` pixels at `bitsize` bits per pixel,
+	// rounded up to a multiple of four bytes.
+	inline uint computePitch(uint w, uint bitsize)
+	{
+		// A pixel always occupies a whole number of bytes.
+		const uint bytesPerPixel = (bitsize + 7) / 8;
+		const uint rowBytes = w * bytesPerPixel;
+
+		// Round up to the next 32 bit boundary.
+		const uint words = (rowBytes + 3) / 4;
+		return words * 4;
+	}
+
+	// Storage size of a single image with the given extents.
+	// Format_RGBA images are sized from the aligned row pitch and include the
+	// depth; every other format is counted in 4x4 blocks and ignores `d` and
+	// `bitCount`.
+	static int computeImageSize(uint w, uint h, uint d, uint bitCount, Format format)
+	{
+		if (format != Format_RGBA) {
+			// @@ Handle 3D textures. DXT and VTC have different behaviors.
+			const uint blocksAcross = (w + 3) / 4;
+			const uint blocksDown = (h + 3) / 4;
+			return blocksAcross * blocksDown * blockSize(format);
+		}
+
+		return d * h * computePitch(w, bitCount);
+	}
+
+} // namespace
+
+namespace nvtt
+{
+	// One mipmap level as it moves through the compression pipeline.
+	//
+	// The level can be held as:
+	// - a pointer to an input image (not owned, never modified),
+	// - a fixed point image (owned),
+	// - a floating point image (owned).
+	struct Mipmap
+	{
+		Mipmap() : m_inputImage(NULL) {}
+		~Mipmap() {}
+
+		// Reference input image `idx` and reset both owned images.
+		void setFromInput(const InputOptions::Private & inputOptions, uint idx)
+		{
+			m_inputImage = inputOptions.image(idx);
+			m_fixedImage = NULL;
+			m_floatImage = NULL;
+		}
+
+		// Assign and take ownership of given image.
+		// The input reference and the fixed image are reset.
+		void setImage(FloatImage * image)
+		{
+			m_inputImage = NULL;
+			m_fixedImage = NULL;
+			m_floatImage = image;
+		}
+
+
+		// Convert linear float image to fixed image ready for compression.
+		// Does nothing when no float image is held. Gamma correction is skipped
+		// for normal maps and when the output gamma is exactly 1.
+		void toFixedImage(const InputOptions::Private & inputOptions)
+		{
+			if (m_floatImage != NULL) // apfaffe - We should check that we have a float image, if so convert it!
+			{
+				if (inputOptions.isNormalMap || inputOptions.outputGamma == 1.0f)
+				{
+					m_fixedImage = m_floatImage->createImage();
+				}
+				else
+				{
+					m_fixedImage = m_floatImage->createImageGammaCorrect(inputOptions.outputGamma);
+				}
+			}
+		}
+
+		// Convert input image to linear float image.
+		// Does nothing when a float image is already held; otherwise the float
+		// image is built from whatever asFixedImage() returns.
+		void toFloatImage(const InputOptions::Private & inputOptions)
+		{
+			if (m_floatImage == NULL)
+			{
+				nvDebugCheck(this->asFixedImage() != NULL);
+
+				m_floatImage = new FloatImage(this->asFixedImage());
+
+				if (inputOptions.isNormalMap)
+				{
+					// Expand normals to [-1, 1] range.
+					//	floatImage->expandNormals(0);
+				}
+				else if (inputOptions.inputGamma != 1.0f)
+				{
+					// Convert to linear space.
+					m_floatImage->toLinear(0, 3, inputOptions.inputGamma);
+				}
+			}
+		}
+
+		// Float representation, or NULL when none has been created.
+		const FloatImage * asFloatImage() const
+		{
+			return m_floatImage.ptr();
+		}
+
+		// Float representation, or NULL when none has been created.
+		FloatImage * asFloatImage()
+		{
+			return m_floatImage.ptr();
+		}
+
+		// Fixed point representation: the processed image when there is one,
+		// otherwise the referenced input image (which may be NULL).
+		const Image * asFixedImage() const
+		{
+			// - apfaffe - switched logic to return the 'processed image' rather than the input!
+			if (m_fixedImage.ptr() != NULL)
+			{
+				return m_fixedImage.ptr();
+			}
+			return m_inputImage;
+		}
+
+		// Fixed point representation that the caller is allowed to modify.
+		// The input image is never handed out; it is copied first. A processed
+		// fixed image that already exists is kept, so that it agrees with what
+		// asFixedImage() reports and earlier processing is not discarded.
+		Image * asMutableFixedImage()
+		{
+			if (m_inputImage != NULL)
+			{
+				if (m_fixedImage.ptr() == NULL)
+				{
+					// Do not modify input image, create a copy.
+					m_fixedImage = new Image(*m_inputImage);
+				}
+
+				// From here on the owned fixed image is the only fixed representation.
+				m_inputImage = NULL;
+			}
+			return m_fixedImage.ptr();
+		}
+
+
+	private:
+		const Image * m_inputImage;			// not owned
+		AutoPtr<Image> m_fixedImage;		// owned
+		AutoPtr<FloatImage> m_floatImage;	// owned
+	};
+
+} // nvtt namespace
+
+
+// Allocate the private state and switch CUDA acceleration on when the
+// hardware is reported as present.
+Compressor::Compressor() : m(*new Compressor::Private())
+{
+	// Start from a known "CUDA off" state.
+	m.cudaDevice = -1;
+	m.cudaEnabled = false;
+
+	// Probe the hardware once; enableCudaAcceleration() relies on this flag.
+	m.cudaSupported = cuda::isHardwarePresent();
+
+	enableCudaAcceleration(m.cudaSupported);
+}
+
+// Switch CUDA acceleration off (which does nothing unless it is currently
+// enabled) and free the private state.
+Compressor::~Compressor()
+{
+	enableCudaAcceleration(false);
+	// `m` is a reference to the object allocated with `new` in the constructor.
+	delete &m;
+}
+
+
+/// Switch CUDA acceleration on or off.
+/// Nothing happens when no CUDA hardware was detected, or when the requested
+/// state is already the current one.
+void Compressor::enableCudaAcceleration(bool enable)
+{
+	if (!m.cudaSupported)
+	{
+		return;
+	}
+
+	if (m.cudaEnabled == enable)
+	{
+		// Already in the requested state.
+		return;
+	}
+
+	if (!enable)
+	{
+		// Shutting down: release the compressor, then leave the device.
+		m.cudaEnabled = false;
+		m.cuda = NULL;
+
+		if (m.cudaDevice != -1)
+		{
+			// Exit device.
+			cuda::exitDevice();
+		}
+		return;
+	}
+
+	// Init the CUDA device. This may return -1 if CUDA was already initialized by the app.
+	m.cudaEnabled = cuda::initDevice(&m.cudaDevice);
+	if (!m.cudaEnabled)
+	{
+		return;
+	}
+
+	// Create compressor if initialization succeeds.
+	m.cuda = new CudaCompressor();
+
+	// But cleanup if failed.
+	if (!m.cuda->isValid())
+	{
+		enableCudaAcceleration(false);
+	}
+}
+
+/// Check if CUDA acceleration is enabled.
+/// Reports the flag maintained by enableCudaAcceleration(); it is true only
+/// after the device was initialized and a valid CUDA compressor was created.
+bool Compressor::isCudaAccelerationEnabled() const
+{
+	return m.cudaEnabled;
+}
+
+
+/// Compress the input texture with the given compression options.
+/// Forwards the private parts of the three option objects to
+/// Compressor::Private::compress() and returns its result.
+bool Compressor::process(const InputOptions & inputOptions, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const
+{
+	return m.compress(inputOptions.m, compressionOptions.m, outputOptions.m);
+}
+
+
+/// Estimate the size of compressing the input with the given options.
+/// Forwards to Compressor::Private::estimateSize(), which sums the image
+/// sizes of all faces and mipmap levels; no image data is processed.
+int Compressor::estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions) const
+{
+	return m.estimateSize(inputOptions.m, compressionOptions.m);
+}
+
+
+
+
+// Run the whole pipeline: open the output, write the DDS header and compress
+// every mipmap of every face.
+// Returns false when the output cannot be opened, the header cannot be
+// written, or a face fails to compress. The output is closed in every case
+// once it has been opened.
+bool Compressor::Private::compress(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const
+{
+	// Make sure enums match.
+	nvStaticCheck(FloatImage::WrapMode_Clamp == (FloatImage::WrapMode)WrapMode_Clamp);
+	nvStaticCheck(FloatImage::WrapMode_Mirror == (FloatImage::WrapMode)WrapMode_Mirror);
+	nvStaticCheck(FloatImage::WrapMode_Repeat == (FloatImage::WrapMode)WrapMode_Repeat);
+
+	// Get output handler.
+	if (!outputOptions.openFile())
+	{
+		if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_FileOpen);
+		return false;
+	}
+
+	inputOptions.computeTargetExtents();
+
+	// Output DDS header.
+	bool succeeded = outputHeader(inputOptions, compressionOptions, outputOptions);
+
+	// Stop at the first face that fails.
+	for (uint f = 0; succeeded && f < inputOptions.faceCount; f++)
+	{
+		succeeded = compressMipmaps(f, inputOptions, compressionOptions, outputOptions);
+	}
+
+	// Close the output on failure as well, so an early exit does not leave it open.
+	outputOptions.closeFile();
+
+	return succeeded;
+}
+
+
+// Build the DDS header for the target texture and pass it to the output handler.
+// Returns true when no header has to be written or when the write succeeded;
+// a failed write is reported to the error handler (if any) and returns false.
+bool Compressor::Private::outputHeader(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const
+{
+	// Nothing to write without a handler, or when the header was disabled.
+	const bool headerWanted = (outputOptions.outputHandler != NULL) && outputOptions.outputHeader;
+	if (!headerWanted)
+	{
+		return true;
+	}
+
+	DDSHeader header;
+
+	// Extents and mipmap chain.
+	header.setWidth(inputOptions.targetWidth);
+	header.setHeight(inputOptions.targetHeight);
+
+	int mipmapCount = inputOptions.realMipmapCount();
+	nvDebugCheck(mipmapCount > 0);
+
+	header.setMipmapCount(mipmapCount);
+
+	// Texture type.
+	if (inputOptions.textureType == TextureType_2D) {
+		header.setTexture2D();
+	}
+	else if (inputOptions.textureType == TextureType_Cube) {
+		header.setTextureCube();
+	}
+	/*else if (inputOptions.textureType == TextureType_3D) {
+	header.setTexture3D();
+	header.setDepth(inputOptions.targetDepth);
+	}*/
+
+	const Format format = compressionOptions.format;
+
+	if (format == Format_RGBA)
+	{
+		// Uncompressed: describe the row pitch and the channel masks.
+		const uint pitch = computePitch(inputOptions.targetWidth, compressionOptions.bitcount);
+		header.setPitch(pitch);
+		header.setPixelFormat(compressionOptions.bitcount, compressionOptions.rmask, compressionOptions.gmask, compressionOptions.bmask, compressionOptions.amask);
+	}
+	else
+	{
+		// Block compressed: describe the size of the top level and the FourCC.
+		const int linearSize = computeImageSize(inputOptions.targetWidth, inputOptions.targetHeight, inputOptions.targetDepth, compressionOptions.bitcount, format);
+		header.setLinearSize(linearSize);
+
+		// Set for the formats that get the normal flag when the input is a normal map.
+		bool normalFlagCandidate = false;
+
+		if (format == Format_DXT1 || format == Format_DXT1a) {
+			header.setFourCC('D', 'X', 'T', '1');
+			normalFlagCandidate = true;
+		}
+		else if (format == Format_DXT3) {
+			header.setFourCC('D', 'X', 'T', '3');
+		}
+		else if (format == Format_DXT5) {
+			header.setFourCC('D', 'X', 'T', '5');
+		}
+		else if (format == Format_DXT5n) {
+			header.setFourCC('D', 'X', 'T', '5');
+			normalFlagCandidate = true;
+		}
+		else if (format == Format_BC4) {
+			header.setFourCC('A', 'T', 'I', '1');
+		}
+		else if (format == Format_BC5) {
+			header.setFourCC('A', 'T', 'I', '2');
+			normalFlagCandidate = true;
+		}
+
+		if (normalFlagCandidate && inputOptions.isNormalMap)
+		{
+			header.setNormalFlag(true);
+		}
+	}
+
+	// Swap bytes if necessary.
+	header.swapBytes();
+
+	// The DX10 extension adds 20 bytes to the 128 byte base header.
+	nvStaticCheck(sizeof(DDSHeader) == 128 + 20);
+	const uint headerSize = header.hasDX10Header() ? (128 + 20) : 128;
+
+	if (outputOptions.outputHandler->writeData(&header, headerSize))
+	{
+		return true;
+	}
+
+	if (outputOptions.errorHandler != NULL)
+	{
+		outputOptions.errorHandler->error(Error_FileWrite);
+	}
+
+	return false;
+}
+
+
+// Compress every mipmap level of face `f`, starting at the target extents and
+// halving each dimension (never below 1) for the following level.
+// The number of levels comes from inputOptions.realMipmapCount().
+// Returns false as soon as a level cannot be initialized or compressed.
+bool Compressor::Private::compressMipmaps(uint f, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const
+{
+	uint w = inputOptions.targetWidth;
+	uint h = inputOptions.targetHeight;
+	uint d = inputOptions.targetDepth;
+
+	// Reused across levels so that a level can be generated from the previous one.
+	Mipmap mipmap;
+
+	const uint mipmapCount = inputOptions.realMipmapCount();
+	nvDebugCheck(mipmapCount > 0);
+
+	for (uint m = 0; m < mipmapCount; m++)
+	{
+		if (outputOptions.outputHandler)
+		{
+			// Announce the image that is about to be written.
+			int size = computeImageSize(w, h, d, compressionOptions.bitcount, compressionOptions.format);
+			outputOptions.outputHandler->beginImage(size, w, h, d, f, m);
+		}
+
+		// @@ Where to do the color transform?
+		// - Color transform may not be linear, so we cannot do before computing mipmaps.
+		// - Should be done in linear space, that is, after gamma correction.
+
+		if (!initMipmap(mipmap, inputOptions, w, h, d, f, m))
+		{
+			if (outputOptions.errorHandler != NULL)
+			{
+				outputOptions.errorHandler->error(Error_InvalidInput);
+			}
+
+			// Abort whether or not an error handler is installed: the mipmap
+			// holds no usable image and must not be quantized or compressed.
+			return false;
+		}
+
+		quantizeMipmap(mipmap, compressionOptions);
+
+		if (!compressMipmap(mipmap, inputOptions, compressionOptions, outputOptions))
+		{
+			return false;
+		}
+
+		// Compute extents of next mipmap:
+		w = max(1U, w / 2);
+		h = max(1U, h / 2);
+		d = max(1U, d / 2);
+	}
+
+	return true;
+}
+
+// Fill `mipmap` with the image for level `m` of face `f` at extents w x h x d.
+// The image comes from one of three places:
+// - the previous level, downsampled (levels after the first, when no exact
+//   input exists or normal map conversion is enabled),
+// - an input image with exactly the requested extents,
+// - the closest input image, resized.
+// Returns false when the face has no usable input image at all.
+bool Compressor::Private::initMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f, uint m) const
+{
+	// Find image from input.
+	const int exactIdx = findExactMipmap(inputOptions, w, h, d, f);
+	const bool exactFound = (exactIdx != -1);
+
+	// Generate from last, when mipmap not found, or normal map conversion enabled.
+	const bool generateFromPrevious = (!exactFound || inputOptions.convertToNormalMap) && m != 0;
+
+	if (generateFromPrevious)
+	{
+		downsampleMipmap(mipmap, inputOptions);
+	}
+	else if (exactFound)
+	{
+		// If input mipmap found, then get from input.
+		mipmap.setFromInput(inputOptions, exactIdx);
+
+		processInputImage(mipmap, inputOptions);
+	}
+	else
+	{
+		// If not found, resize closest mipmap.
+		const int closestIdx = findClosestMipmap(inputOptions, w, h, d, f);
+		if (closestIdx == -1)
+		{
+			return false;
+		}
+
+		mipmap.setFromInput(inputOptions, closestIdx);
+		scaleMipmap(mipmap, inputOptions, w, h, d);
+
+		processInputImage(mipmap, inputOptions);
+	}
+
+	// Convert linear float image to fixed image ready for compression.
+	mipmap.toFixedImage(inputOptions);
+
+	return true;
+}
+
+// Search the input images of face `f` for one whose extents are exactly
+// w x h x d. Returns its index into inputOptions.images, or -1 when there is
+// no such entry or the entry carries no data.
+int Compressor::Private::findExactMipmap(const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f) const
+{
+	const int levelCount = int(inputOptions.mipmapCount);
+	const int width = int(w);
+	const int height = int(h);
+	const int depth = int(d);
+
+	for (int level = 0; level < levelCount; level++)
+	{
+		const int idx = f * inputOptions.mipmapCount + level;
+		const InputOptions::Private::InputImage & candidate = inputOptions.images[idx];
+
+		const bool sameExtents =
+			candidate.width == width &&
+			candidate.height == height &&
+			candidate.depth == depth;
+
+		if (sameExtents)
+		{
+			// The slot exists, but it only counts when data was supplied for it.
+			return (candidate.data != NULL) ? idx : -1;
+		}
+
+		const bool smallerThanRequested =
+			candidate.width < width ||
+			candidate.height < height ||
+			candidate.depth < depth;
+
+		if (smallerThanRequested)
+		{
+			// An entry smaller than the request ends the search.
+			return -1;
+		}
+	}
+
+	return -1;
+}
+
+// Pick the input image of face `f` that is best suited for resizing to
+// w x h x d. Entries without data are ignored. Entries are visited in storage
+// order; the search stops at the first entry whose summed extents are smaller
+// than the request and returns the entry accepted just before it (or that
+// entry itself when nothing was accepted yet).
+// Returns an index into inputOptions.images, or -1 when no entry has data.
+int Compressor::Private::findClosestMipmap(const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f) const
+{
+	int bestIdx = -1;
+
+	for (int m = 0; m < int(inputOptions.mipmapCount); m++)
+	{
+		int idx = f * inputOptions.mipmapCount + m;
+		const InputOptions::Private::InputImage & inputImage = inputOptions.images[idx];
+
+		if (inputImage.data != NULL)
+		{
+			// Compute the difference in signed arithmetic. Mixing the unsigned
+			// request with the image extents would wrap around and depend on an
+			// implementation-defined conversion for the sign test below.
+			const int difference =
+				(int(inputImage.width) - int(w)) +
+				(int(inputImage.height) - int(h)) +
+				(int(inputImage.depth) - int(d));
+
+			if (difference < 0)
+			{
+				// This entry is smaller than requested; prefer the previous one.
+				if (bestIdx == -1)
+				{
+					bestIdx = idx;
+				}
+
+				return bestIdx;
+			}
+
+			bestIdx = idx;
+		}
+	}
+
+	return bestIdx;
+}
+
+// Replace the image held by `mipmap` with a downsampled version of itself,
+// using the mipmap filter selected in the input options. Normal maps are
+// renormalized afterwards when normalizeMipmaps is set.
+void Compressor::Private::downsampleMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions) const
+{
+	// Make sure that floating point linear representation is available.
+	mipmap.toFloatImage(inputOptions);
+
+	const FloatImage * source = mipmap.asFloatImage();
+	const FloatImage::WrapMode wrapMode = (FloatImage::WrapMode)inputOptions.wrapMode;
+
+	FloatImage * reduced = NULL;
+
+	if (inputOptions.mipmapFilter == MipmapFilter_Box)
+	{
+		// Use fast downsample.
+		reduced = source->fastDownSample();
+	}
+	else if (inputOptions.mipmapFilter == MipmapFilter_Triangle)
+	{
+		TriangleFilter triangle;
+		reduced = source->downSample(triangle, wrapMode);
+	}
+	else /*if (inputOptions.mipmapFilter == MipmapFilter_Kaiser)*/
+	{
+		nvDebugCheck(inputOptions.mipmapFilter == MipmapFilter_Kaiser);
+		KaiserFilter kaiser(inputOptions.kaiserWidth);
+		kaiser.setParameters(inputOptions.kaiserAlpha, inputOptions.kaiserStretch);
+		reduced = source->downSample(kaiser, wrapMode);
+	}
+
+	// The mipmap takes ownership of the new image.
+	mipmap.setImage(reduced);
+
+	// Normalize mipmap.
+	const bool holdsNormals = inputOptions.isNormalMap || inputOptions.convertToNormalMap;
+	if (holdsNormals && inputOptions.normalizeMipmaps)
+	{
+		normalizeNormalMap(mipmap.asFloatImage());
+	}
+}
+
+
+// Resize the image held by `mipmap` to w x h with a box filter.
+// The depth `d` is accepted but not used by the resize.
+void Compressor::Private::scaleMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions, uint w, uint h, uint d) const
+{
+	// The resize operates on the floating point representation.
+	mipmap.toFloatImage(inputOptions);
+
+	// @@ Add more filters.
+	// @@ Select different filters for downscaling and reconstruction.
+
+	const FloatImage::WrapMode wrapMode = (FloatImage::WrapMode)inputOptions.wrapMode;
+
+	// Resize image.
+	BoxFilter boxFilter;
+	FloatImage * resized = mipmap.asFloatImage()->resize(boxFilter, w, h, wrapMode);
+
+	// The mipmap takes ownership of the resized image.
+	mipmap.setImage(resized);
+}
+
+
+// Prepare a freshly selected input image, depending on the input options:
+// - convertToNormalMap: build a normal map from the fixed image,
+// - isNormalMap: renormalize, but only when normalizeMipmaps is set,
+// - otherwise: create the linear float image when input and output gamma differ.
+void Compressor::Private::processInputImage(Mipmap & mipmap, const InputOptions::Private & inputOptions) const
+{
+	if (inputOptions.convertToNormalMap)
+	{
+		// The conversion reads the fixed point representation.
+		mipmap.toFixedImage(inputOptions);
+
+		Vector4 heightScale = inputOptions.heightFactors;
+		const FloatImage::WrapMode wrapMode = (FloatImage::WrapMode)inputOptions.wrapMode;
+
+		FloatImage * normalMap = createNormalMap(mipmap.asFixedImage(), wrapMode, heightScale, inputOptions.bumpFrequencyScale);
+		mipmap.setImage(normalMap);
+		return;
+	}
+
+	if (!inputOptions.isNormalMap)
+	{
+		// Color data: a float image is only needed when the gamma changes.
+		if (inputOptions.inputGamma != inputOptions.outputGamma)
+		{
+			mipmap.toFloatImage(inputOptions);
+		}
+		return;
+	}
+
+	if (!inputOptions.normalizeMipmaps)
+	{
+		return;
+	}
+
+	// If floating point image available, normalize in place; otherwise create
+	// one from the fixed image first.
+	FloatImage * floatImage = mipmap.asFloatImage();
+	if (floatImage == NULL)
+	{
+		floatImage = new FloatImage(mipmap.asFixedImage());
+	}
+
+	normalizeNormalMap(floatImage);
+	mipmap.setImage(floatImage);
+}
+
+
+// Apply the quantization requested by the compression options to the fixed
+// image of `mipmap`: binary alpha first, then Floyd-Steinberg dithering to the
+// channel sizes of the target format.
+void Compressor::Private::quantizeMipmap(Mipmap & mipmap, const CompressionOptions::Private & compressionOptions) const
+{
+	nvDebugCheck(mipmap.asFixedImage() != NULL);
+
+	const bool ditherColor = compressionOptions.enableColorDithering;
+	const bool ditherAlpha = compressionOptions.enableAlphaDithering;
+	const bool binaryAlpha = compressionOptions.binaryAlpha;
+
+	if (binaryAlpha)
+	{
+		Image * image = mipmap.asMutableFixedImage();
+
+		if (ditherAlpha)
+		{
+			Quantize::FloydSteinberg_BinaryAlpha(image, compressionOptions.alphaThreshold);
+		}
+		else
+		{
+			Quantize::BinaryAlpha(image, compressionOptions.alphaThreshold);
+		}
+	}
+
+	if (!ditherColor && !ditherAlpha)
+	{
+		// No dithering requested.
+		return;
+	}
+
+	// Channels keep 8 bits unless the target format says otherwise.
+	uint rsize = 8;
+	uint gsize = 8;
+	uint bsize = 8;
+	uint asize = 8;
+
+	const bool targetIsRGB = (compressionOptions.format == Format_RGB);
+
+	if (ditherColor)
+	{
+		const bool targetIsDXT = compressionOptions.format >= Format_DXT1 && compressionOptions.format <= Format_DXT5;
+
+		if (targetIsDXT)
+		{
+			rsize = 5;
+			gsize = 6;
+			bsize = 5;
+		}
+		else if (targetIsRGB)
+		{
+			// Take the channel sizes from the pixel format masks.
+			uint rshift, gshift, bshift;
+			PixelFormat::maskShiftAndSize(compressionOptions.rmask, &rshift, &rsize);
+			PixelFormat::maskShiftAndSize(compressionOptions.gmask, &gshift, &gsize);
+			PixelFormat::maskShiftAndSize(compressionOptions.bmask, &bshift, &bsize);
+		}
+	}
+
+	if (ditherAlpha)
+	{
+		if (compressionOptions.format == Format_DXT3)
+		{
+			asize = 4;
+		}
+		else if (targetIsRGB)
+		{
+			uint ashift;
+			PixelFormat::maskShiftAndSize(compressionOptions.amask, &ashift, &asize);
+		}
+	}
+
+	if (binaryAlpha)
+	{
+		asize = 8; // Already quantized.
+	}
+
+	Quantize::FloydSteinberg(mipmap.asMutableFixedImage(), rsize, gsize, bsize, asize);
+}
+
+
+// Compress the given mipmap.
+// Dispatches the fixed image of `mipmap` to the compressor that matches the
+// requested format and quality:
+// - Format_RGBA / Format_RGB go to compressRGB,
+// - Quality_Fastest uses the fast compressor where the format has one,
+// - otherwise the CUDA compressor is used for DXT1, DXT3 and DXT5 when
+//   `useCuda` holds, and the slow compressor in all remaining cases.
+// Always returns true; a format that matches no branch is silently ignored.
+bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const
+{
+	const Image * image = mipmap.asFixedImage();
+	nvDebugCheck(image != NULL);
+
+	// Both CPU compressors are set up front, whichever one ends up being used.
+	FastCompressor fast;
+	fast.setImage(image, inputOptions.alphaMode);
+
+	SlowCompressor slow;
+	slow.setImage(image, inputOptions.alphaMode);
+
+	// CUDA is only used when enabled and the image has at least 512 pixels.
+	const bool useCuda = cudaEnabled && image->width() * image->height() >= 512;
+
+	if (compressionOptions.format == Format_RGBA || compressionOptions.format == Format_RGB)
+	{
+		compressRGB(image, outputOptions, compressionOptions);
+	}
+	else if (compressionOptions.format == Format_DXT1)
+	{
+		// The two optional blocks below each end in a dangling `else`, so that
+		// whichever blocks are compiled in form one if / else-if chain with the
+		// quality test that follows them.
+#if defined(HAVE_S3QUANT)
+		if (compressionOptions.externalCompressor == "s3")
+		{
+			s3CompressDXT1(image, outputOptions);
+		}
+		else
+#endif
+
+#if defined(HAVE_ATITC)
+			if (compressionOptions.externalCompressor == "ati")
+			{
+				atiCompressDXT1(image, outputOptions);
+			}
+			else
+#endif
+				if (compressionOptions.quality == Quality_Fastest)
+				{
+					fast.compressDXT1(outputOptions);
+				}
+				else
+				{
+					if (useCuda)
+					{
+						nvDebugCheck(cudaSupported);
+						cuda->setImage(image, inputOptions.alphaMode);
+						cuda->compressDXT1(compressionOptions, outputOptions);
+					}
+					else
+					{
+						slow.compressDXT1(compressionOptions, outputOptions);
+					}
+				}
+	}
+	else if (compressionOptions.format == Format_DXT1a)
+	{
+		if (compressionOptions.quality == Quality_Fastest)
+		{
+			fast.compressDXT1a(outputOptions);
+		}
+		else
+		{
+			// Both branches use the slow compressor: the CUDA call is commented out.
+			if (useCuda)
+			{
+				nvDebugCheck(cudaSupported);
+				/*cuda*/slow.compressDXT1a(compressionOptions, outputOptions);
+			}
+			else
+			{
+				slow.compressDXT1a(compressionOptions, outputOptions);
+			}
+		}
+	}
+	else if (compressionOptions.format == Format_DXT3)
+	{
+		if (compressionOptions.quality == Quality_Fastest)
+		{
+			fast.compressDXT3(outputOptions);
+		}
+		else
+		{
+			if (useCuda)
+			{
+				nvDebugCheck(cudaSupported);
+				cuda->setImage(image, inputOptions.alphaMode);
+				cuda->compressDXT3(compressionOptions, outputOptions);
+			}
+			else
+			{
+				slow.compressDXT3(compressionOptions, outputOptions);
+			}
+		}
+	}
+	else if (compressionOptions.format == Format_DXT5)
+	{
+		if (compressionOptions.quality == Quality_Fastest)
+		{
+			fast.compressDXT5(outputOptions);
+		}
+		else
+		{
+			if (useCuda)
+			{
+				nvDebugCheck(cudaSupported);
+				cuda->setImage(image, inputOptions.alphaMode);
+				cuda->compressDXT5(compressionOptions, outputOptions);
+			}
+			else
+			{
+				slow.compressDXT5(compressionOptions, outputOptions);
+			}
+		}
+	}
+	else if (compressionOptions.format == Format_DXT5n)
+	{
+		// No CUDA path for this format.
+		if (compressionOptions.quality == Quality_Fastest)
+		{
+			fast.compressDXT5n(outputOptions);
+		}
+		else
+		{
+			slow.compressDXT5n(compressionOptions, outputOptions);
+		}
+	}
+	else if (compressionOptions.format == Format_BC4)
+	{
+		// The slow compressor is used at every quality setting.
+		slow.compressBC4(compressionOptions, outputOptions);
+	}
+	else if (compressionOptions.format == Format_BC5)
+	{
+		// The slow compressor is used at every quality setting.
+		slow.compressBC5(compressionOptions, outputOptions);
+	}
+
+	return true;
+}
+
+
+// Add up the image sizes of every mipmap level of every face for the given
+// options. The header is not included and no image data is touched.
+int Compressor::Private::estimateSize(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions) const
+{
+	inputOptions.computeTargetExtents();
+
+	const uint levelCount = inputOptions.realMipmapCount();
+
+	int total = 0;
+
+	for (uint face = 0; face < inputOptions.faceCount; face++)
+	{
+		// Every face starts again at the full target extents.
+		uint w = inputOptions.targetWidth;
+		uint h = inputOptions.targetHeight;
+		uint d = inputOptions.targetDepth;
+
+		for (uint level = 0; level < levelCount; level++)
+		{
+			total += computeImageSize(w, h, d, compressionOptions.bitcount, compressionOptions.format);
+
+			// Compute extents of next mipmap:
+			w = max(1U, w / 2);
+			h = max(1U, h / 2);
+			d = max(1U, d / 2);
+		}
+	}
+
+	return total;
+}
--- a/src/nvtt/Compressor.h
+++ b/src/nvtt/Compressor.h
@ -1,4 +1,4 @@
-// Copyright Ignacio Castano <icastano@nvidia.com> 2009
+// Copyright NVIDIA Corporation 2008 -- Ignacio Castano <icastano@nvidia.com>
 // 
 // Permission is hereby granted, free of charge, to any person
 // obtaining a copy of this software and associated documentation
@ -24,17 +24,57 @@
 #ifndef NV_TT_COMPRESSOR_H
 #define NV_TT_COMPRESSOR_H

-#include <nvcore/nvcore.h> // uint
+#include <nvcore/Ptr.h>
+
+#include <nvtt/cuda/CudaCompressDXT.h>
+
 #include "nvtt.h"

 namespace nv
 {
-	struct CompressorInterface
+	class Image;
+}
+
+namespace nvtt
+{
+	struct Mipmap;
+
+	// Private implementation of nvtt::Compressor: the compression pipeline
+	// plus the CUDA state that the public class manages.
+	struct Compressor::Private
 	{
-		virtual ~CompressorInterface() {}
-		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, const void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) = 0;
+		// Leaves the CUDA fields uninitialized; the Compressor constructor sets them.
+		Private() {}
+
+		// Entry points called by Compressor::process() and Compressor::estimateSize().
+		bool compress(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
+		int estimateSize(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions) const;
+
+	private:
+
+		// Header output and the per-face mipmap loop.
+		bool outputHeader(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
+		bool compressMipmaps(uint f, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
+
+		// Selects or generates the image for one mipmap level.
+		bool initMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f, uint m) const;
+
+		// Both return an index into the input images, or -1.
+		int findExactMipmap(const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f) const;
+		int findClosestMipmap(const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f) const;
+
+		// Individual stages applied to a mipmap level.
+		void downsampleMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions) const;
+		void scaleMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions, uint w, uint h, uint d) const;
+		void processInputImage(Mipmap & mipmap, const InputOptions::Private & inputOptions) const;
+		void quantizeMipmap(Mipmap & mipmap, const CompressionOptions::Private & compressionOptions) const;
+		bool compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
+
+
+
+	public:
+
+		// CUDA state, written by the Compressor constructor and by
+		// Compressor::enableCudaAcceleration().
+		bool cudaSupported;
+		bool cudaEnabled;
+		int cudaDevice;
+
+		// CUDA compressor; created when acceleration is enabled, reset when disabled.
+		nv::AutoPtr<nv::CudaCompressor> cuda;
+
 	};

-} // nv namespace
+} // nvtt namespace

-#endif // NV_TT_COMPRESSOR_H
+
+#endif // NV_TT_COMPRESSOR_H
--- a/src/nvtt/CompressorDXT.cpp
+++ b/src/nvtt/CompressorDXT.cpp
@ -1,676 +0,0 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#include "CompressorDXT.h"
-#include "QuickCompressDXT.h"
-#include "OptimalCompressDXT.h"
-#include "CompressionOptions.h"
-#include "OutputOptions.h"
-
-// squish
-#include "squish/colourset.h"
-#include "squish/fastclusterfit.h"
-#include "squish/weightedclusterfit.h"
-
-#include "nvtt.h"
-
-#include "nvcore/Memory.h"
-
-#include "nvimage/Image.h"
-#include "nvimage/ColorBlock.h"
-#include "nvimage/BlockDXT.h"
-
-
-// s3_quant
-#if defined(HAVE_S3QUANT)
-#include "s3tc/s3_quant.h"
-#endif
-
-// ati tc
-#if defined(HAVE_ATITC)
-typedef int BOOL;
-typedef _W64 unsigned long ULONG_PTR;
-typedef ULONG_PTR DWORD_PTR;
-#include "atitc/ATI_Compress.h"
-#endif
-
-// squish
-#if defined(HAVE_SQUISH)
-//#include "squish/squish.h"
-#include "squish-1.10/squish.h"
-#endif
-
-// d3dx
-#if defined(HAVE_D3DX)
-#include <d3dx9.h>
-#endif
-
-// stb
-#if defined(HAVE_STB)
-#define STB_DEFINE
-#include "stb/stb_dxt.h"
-#endif
-
-// OpenMP
-#if defined(HAVE_OPENMP)
-#include <omp.h>
-#endif
-
-using namespace nv;
-using namespace nvtt;
-
-
-void FixedBlockCompressor::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, const void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-	const uint bs = blockSize();
-	const uint bw = (w + 3) / 4;
-	const uint bh = (h + 3) / 4;
-	const uint size = bs * bw * bh;
-
-#if defined(HAVE_OPENMP)
-	bool singleThreaded = false;
-#else
-	bool singleThreaded = true;
-#endif
-
-	// Use a single thread to compress small textures.
-	if (bw * bh < 16) singleThreaded = true;
-
-	if (singleThreaded)
-	{
-		nvDebugCheck(bs <= 16);
-		uint8 mem[16];
-
-		for (int y = 0; y < int(h); y += 4) {
-			for (uint x = 0; x < w; x += 4) {
-
-				ColorBlock rgba;
-				if (inputFormat == nvtt::InputFormat_BGRA_8UB) {
-					rgba.init(w, h, (uint *)data, x, y);
-				}
-				else {
-					nvDebugCheck(inputFormat == nvtt::InputFormat_RGBA_32F);
-					rgba.init(w, h, (float *)data, x, y);
-				}
-
-				compressBlock(rgba, alphaMode, compressionOptions, mem);
-
-				if (outputOptions.outputHandler != NULL) {
-					outputOptions.outputHandler->writeData(mem, bs);
-				}
-			}
-		}
-	}
-#if defined(HAVE_OPENMP)
-	else
-	{
-		uint8 * mem = new uint8[size];
-
-	#pragma omp parallel
-		{
-	#pragma omp for
-			for (int i = 0; i < int(bw*bh); i++)
-			{
-				const uint x = i % bw;
-				const uint y = i / bw;
-
-				ColorBlock rgba;
-				if (inputFormat == nvtt::InputFormat_BGRA_8UB) {
-					rgba.init(w, h, (uint *)data, 4*x, 4*y);
-				}
-				else {
-					nvDebugCheck(inputFormat == nvtt::InputFormat_RGBA_32F);
-					rgba.init(w, h, (float *)data, 4*x, 4*y);
-				}
-
-				uint8 * ptr = mem + (y * bw + x) * bs;
-				compressBlock(rgba, alphaMode, compressionOptions, ptr);
-			} // omp for
-		} // omp parallel
-
-		if (outputOptions.outputHandler != NULL) {
-			outputOptions.outputHandler->writeData(mem, size);
-		}
-
-		delete [] mem;
-	}
-#endif
-}
-
-
-void FastCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockDXT1 * block = new(output) BlockDXT1;
-	QuickCompress::compressDXT1(rgba, block);
-}
-
-void FastCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockDXT1 * block = new(output) BlockDXT1;
-	QuickCompress::compressDXT1a(rgba, block);
-}
-
-void FastCompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockDXT3 * block = new(output) BlockDXT3;
-	QuickCompress::compressDXT3(rgba, block);
-}
-
-void FastCompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockDXT5 * block = new(output) BlockDXT5;
-	QuickCompress::compressDXT5(rgba, block);
-}
-
-void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	rgba.swizzle(4, 1, 5, 0); // 0xFF, G, 0, R
-
-	BlockDXT5 * block = new(output) BlockDXT5;
-	QuickCompress::compressDXT5(rgba, block);
-}
-
-void FastCompressorBC4::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockATI1 * block = new(output) BlockATI1;
-	
-	rgba.swizzle(0, 1, 2, 0); // Copy red to alpha
-	QuickCompress::compressDXT5A(rgba, &block->alpha);
-}
-
-void FastCompressorBC5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockATI2 * block = new(output) BlockATI2;
-	
-	rgba.swizzle(0, 1, 2, 0); // Copy red to alpha
-	QuickCompress::compressDXT5A(rgba, &block->x);
-	
-	rgba.swizzle(0, 1, 2, 1); // Copy green to alpha
-	QuickCompress::compressDXT5A(rgba, &block->y);
-}
-
-
-void NormalCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	nvsquish::WeightedClusterFit fit;
-	fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
-
-	if (rgba.isSingleColor())
-	{
-		BlockDXT1 * block = new(output) BlockDXT1;
-		OptimalCompress::compressDXT1(rgba.color(0), block);
-	}
-	else
-	{
-		nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0);
-		fit.SetColourSet(&colours, nvsquish::kDxt1);
-		fit.Compress(output);
-	}
-}
-
-
-void NormalCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	bool anyAlpha = false;
-	bool allAlpha = true;
-		
-	for (uint i = 0; i < 16; i++)
-	{
-		if (rgba.color(i).a < 128) anyAlpha = true;
-		else allAlpha = false;
-	}
-
-	const bool isSingleColor = rgba.isSingleColor();
-		
-	if ((!anyAlpha && isSingleColor || allAlpha))
-	{
-		BlockDXT1 * block = new(output) BlockDXT1;
-		OptimalCompress::compressDXT1a(rgba.color(0), block);
-	}
-	else
-	{
-		nvsquish::WeightedClusterFit fit;
-		fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
-
-		int flags = nvsquish::kDxt1;
-		if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
-
-		nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-		fit.SetColourSet(&colours, nvsquish::kDxt1);
-
-		fit.Compress(output);
-	}
-}
-
-
-void NormalCompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockDXT3 * block = new(output) BlockDXT3;
-
-	// Compress explicit alpha.
-	OptimalCompress::compressDXT3A(rgba, &block->alpha);
-
-	// Compress color.
-	if (rgba.isSingleColor())
-	{
-		OptimalCompress::compressDXT1(rgba.color(0), &block->color);
-	}
-	else
-	{
-		nvsquish::WeightedClusterFit fit;
-		fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
-
-		int flags = 0;
-		if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
-
-		nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-		fit.SetColourSet(&colours, 0);
-		fit.Compress(&block->color);
-	}
-}
-
-
-void NormalCompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockDXT5 * block = new(output) BlockDXT5;
-
-	// Compress alpha.
-	if (compressionOptions.quality == Quality_Highest)
-	{
-		OptimalCompress::compressDXT5A(rgba, &block->alpha);
-	}
-	else
-	{
-		QuickCompress::compressDXT5A(rgba, &block->alpha);
-	}
-
-	// Compress color.
-	if (rgba.isSingleColor())
-	{
-		OptimalCompress::compressDXT1(rgba.color(0), &block->color);
-	}
-	else
-	{
-		nvsquish::WeightedClusterFit fit;
-		fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
-
-		int flags = 0;
-		if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
-
-		nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-		fit.SetColourSet(&colours, 0);
-		fit.Compress(&block->color);
-	}
-}
-
-
-void NormalCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	rgba.swizzle(4, 1, 5, 0); // 0xFF, G, 0, R
-
-	BlockDXT5 * block = new(output) BlockDXT5;
-
-	// Compress X.
-	if (compressionOptions.quality == Quality_Highest)
-	{
-		OptimalCompress::compressDXT5A(rgba, &block->alpha);
-	}
-	else
-	{
-		QuickCompress::compressDXT5A(rgba, &block->alpha);
-	}
-
-	// Compress Y.
-	if (compressionOptions.quality == Quality_Highest)
-	{
-		OptimalCompress::compressDXT1G(rgba, &block->color);
-	}
-	else
-	{
-		if (rgba.isSingleColor())
-		{
-			OptimalCompress::compressDXT1G(rgba.color(0), &block->color);
-		}
-		else
-		{
-			nvsquish::WeightedClusterFit fit;
-			fit.SetMetric(0, 1, 0);
-
-			int flags = 0;
-			if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
-
-			nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-			fit.SetColourSet(&colours, 0);
-			fit.Compress(&block->color);
-		}
-	}
-}
-
-
-void ProductionCompressorBC4::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockATI1 * block = new(output) BlockATI1;
-
-	rgba.swizzle(0, 1, 2, 0); // Copy red to alpha
-	OptimalCompress::compressDXT5A(rgba, &block->alpha);
-}
-
-void ProductionCompressorBC5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	BlockATI2 * block = new(output) BlockATI2;
-	
-	rgba.swizzle(0, 1, 2, 0); // Copy red to alpha
-	OptimalCompress::compressDXT5A(rgba, &block->x);
-	
-	rgba.swizzle(0, 1, 2, 1); // Copy green to alpha
-	OptimalCompress::compressDXT5A(rgba, &block->y);
-}
-
-
-
-#if defined(HAVE_S3QUANT)
-
-void S3CompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-	float error = 0.0f;
-
-	BlockDXT1 dxtBlock3;
-	BlockDXT1 dxtBlock4;
-	ColorBlock block;
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			block.init(inputFormat, w, h, data, x, y);
-
-			// Init rgb block.
-			RGBBlock rgbBlock;
-			rgbBlock.n = 16;
-			for (uint i = 0; i < 16; i++) {
-				rgbBlock.colorChannel[i][0] = clamp(float(block.color(i).r) / 255.0f, 0.0f, 1.0f);
-				rgbBlock.colorChannel[i][1] = clamp(float(block.color(i).g) / 255.0f, 0.0f, 1.0f);
-				rgbBlock.colorChannel[i][2] = clamp(float(block.color(i).b) / 255.0f, 0.0f, 1.0f);
-			}
-			rgbBlock.weight[0] = 1.0f;
-			rgbBlock.weight[1] = 1.0f;
-			rgbBlock.weight[2] = 1.0f;
-
-			rgbBlock.inLevel = 4;
-			CodeRGBBlock(&rgbBlock);
-
-			// Copy results to DXT block.
-			dxtBlock4.col0.r = rgbBlock.endPoint[0][0];
-			dxtBlock4.col0.g = rgbBlock.endPoint[0][1];
-			dxtBlock4.col0.b = rgbBlock.endPoint[0][2];
-
-			dxtBlock4.col1.r = rgbBlock.endPoint[1][0];
-			dxtBlock4.col1.g = rgbBlock.endPoint[1][1];
-			dxtBlock4.col1.b = rgbBlock.endPoint[1][2];
-
-			dxtBlock4.setIndices(rgbBlock.index);
-
-			if (dxtBlock4.col0.u < dxtBlock4.col1.u) {
-				swap(dxtBlock4.col0.u, dxtBlock4.col1.u);
-				dxtBlock4.indices ^= 0x55555555;
-			}
-
-			uint error4 = blockError(block, dxtBlock4);
-
-			rgbBlock.inLevel = 3;
-
-			CodeRGBBlock(&rgbBlock);
-
-			// Copy results to DXT block.
-			dxtBlock3.col0.r = rgbBlock.endPoint[0][0];
-			dxtBlock3.col0.g = rgbBlock.endPoint[0][1];
-			dxtBlock3.col0.b = rgbBlock.endPoint[0][2];
-
-			dxtBlock3.col1.r = rgbBlock.endPoint[1][0];
-			dxtBlock3.col1.g = rgbBlock.endPoint[1][1];
-			dxtBlock3.col1.b = rgbBlock.endPoint[1][2];
-
-			dxtBlock3.setIndices(rgbBlock.index);
-
-			if (dxtBlock3.col0.u > dxtBlock3.col1.u) {
-				swap(dxtBlock3.col0.u, dxtBlock3.col1.u);
-				dxtBlock3.indices ^= (~dxtBlock3.indices  >> 1) & 0x55555555;
-			}
-
-			uint error3 = blockError(block, dxtBlock3);
-
-			if (error3 < error4) {
-				error += error3;
-
-				if (outputOptions.outputHandler != NULL) {
-					outputOptions.outputHandler->writeData(&dxtBlock3, sizeof(dxtBlock3));
-				}
-			}
-			else {
-				error += error4;
-
-				if (outputOptions.outputHandler != NULL) {
-					outputOptions.outputHandler->writeData(&dxtBlock4, sizeof(dxtBlock4));
-				}
-			}
-		}
-	}
-}
-
-#endif // defined(HAVE_S3QUANT)
-
-
-#if defined(HAVE_ATITC)
-
-void AtiCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-	// Init source texture
-	ATI_TC_Texture srcTexture;
-	srcTexture.dwSize = sizeof(srcTexture);
-	srcTexture.dwWidth = w;
-	srcTexture.dwHeight = h;
-	if (inputFormat == nvtt::InputFormat_BGRA_8UB)
-	{
-		srcTexture.dwPitch = w * 4;
-		srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
-	}
-	else
-	{
-		srcTexture.dwPitch = w * 16;
-		srcTexture.format = ATI_TC_FORMAT_ARGB_32F;
-	}
-	srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
-	srcTexture.pData = (ATI_TC_BYTE*) data;
-
-	// Init dest texture
-	ATI_TC_Texture destTexture;
-	destTexture.dwSize = sizeof(destTexture);
-	destTexture.dwWidth = w;
-	destTexture.dwHeight = h;
-	destTexture.dwPitch = 0;
-	destTexture.format = ATI_TC_FORMAT_DXT1;
-	destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
-	destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);
-
-	ATI_TC_CompressOptions options;
-	options.dwSize = sizeof(options);
-	options.bUseChannelWeighting = false;
-	options.bUseAdaptiveWeighting = false;
-	options.bDXT1UseAlpha = false;
-	options.nCompressionSpeed = ATI_TC_Speed_Normal;
-	options.bDisableMultiThreading = false;
-	//options.bDisableMultiThreading = true;
-
-	// Compress
-	ATI_TC_ConvertTexture(&srcTexture, &destTexture, &options, NULL, NULL, NULL);
-
-	if (outputOptions.outputHandler != NULL) {
-		outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
-	}
-
-	mem::free(destTexture.pData);
-}
-
-void AtiCompressorDXT5::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-	// Init source texture
-	ATI_TC_Texture srcTexture;
-	srcTexture.dwSize = sizeof(srcTexture);
-	srcTexture.dwWidth = w;
-	srcTexture.dwHeight = h;
-	if (inputFormat == nvtt::InputFormat_BGRA_8UB)
-	{
-		srcTexture.dwPitch = w * 4;
-		srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
-	}
-	else
-	{
-		srcTexture.dwPitch = w * 16;
-		srcTexture.format = ATI_TC_FORMAT_ARGB_32F;
-	}
-	srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
-	srcTexture.pData = (ATI_TC_BYTE*) data;
-
-	// Init dest texture
-	ATI_TC_Texture destTexture;
-	destTexture.dwSize = sizeof(destTexture);
-	destTexture.dwWidth = w;
-	destTexture.dwHeight = h;
-	destTexture.dwPitch = 0;
-	destTexture.format = ATI_TC_FORMAT_DXT5;
-	destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
-	destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);
-
-	// Compress
-	ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL);
-
-	if (outputOptions.outputHandler != NULL) {
-		outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
-	}
-
-	mem::free(destTexture.pData);
-}
-
-#endif // defined(HAVE_ATITC)
-
-#if defined(HAVE_SQUISH)
-
-void SquishCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-#pragma message(NV_FILE_LINE "TODO: Convert input to fixed point ABGR format instead of ARGB")
-	/*
-	Image img(*image);
-	int count = img.width() * img.height();
-	for (int i = 0; i < count; i++)
-	{
-		Color32 c = img.pixel(i);
-		img.pixel(i) = Color32(c.b, c.g, c.r, c.a);
-	}
-
-	int size = squish::GetStorageRequirements(img.width(), img.height(), squish::kDxt1);
-	void * blocks = mem::malloc(size);
-
-	squish::CompressImage((const squish::u8 *)img.pixels(), img.width(), img.height(), blocks, squish::kDxt1 | squish::kColourClusterFit);
-
-	if (outputOptions.outputHandler != NULL) {
-		outputOptions.outputHandler->writeData(blocks, size);
-	}
-
-	mem::free(blocks);
-	*/
-}
-
-#endif // defined(HAVE_SQUISH)
-
-
-#if defined(HAVE_D3DX)
-
-void D3DXCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-	IDirect3D9 * d3d = Direct3DCreate9(D3D_SDK_VERSION);
-
-	D3DPRESENT_PARAMETERS presentParams;
-	ZeroMemory(&presentParams, sizeof(presentParams));
-	presentParams.Windowed = TRUE;
-	presentParams.SwapEffect = D3DSWAPEFFECT_COPY;
-	presentParams.BackBufferWidth = 8;
-	presentParams.BackBufferHeight = 8;
-	presentParams.BackBufferFormat = D3DFMT_UNKNOWN;
-
-	HRESULT err;
-
-	IDirect3DDevice9 * device = NULL;
-	err = d3d->CreateDevice(D3DADAPTER_DEFAULT, D3DDEVTYPE_REF, GetDesktopWindow(), D3DCREATE_SOFTWARE_VERTEXPROCESSING, &presentParams, &device);
-
-	IDirect3DTexture9 * texture = NULL;
-	err = D3DXCreateTexture(device, w, h, 1, 0, D3DFMT_DXT1, D3DPOOL_SYSTEMMEM, &texture);
-	
-	IDirect3DSurface9 * surface = NULL;
-	err = texture->GetSurfaceLevel(0, &surface);
-
-	RECT rect;
-	rect.left = 0; 
-	rect.top = 0; 
-	rect.bottom = h;
-	rect.right = w;
-
-	if (inputFormat == nvtt::InputFormat_BGRA_8UB)
-	{
-		err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A8R8G8B8, w * 4, NULL, &rect, D3DX_DEFAULT, 0);
-	}
-	else
-	{
-		err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A32B32G32R32F, w * 16, NULL, &rect, D3DX_DEFAULT, 0);
-	}
-
-	if (err != D3DERR_INVALIDCALL && err != D3DXERR_INVALIDDATA)
-	{
-		D3DLOCKED_RECT rect;
-		ZeroMemory(&rect, sizeof(rect));
-
-		err = surface->LockRect(&rect, NULL, D3DLOCK_READONLY);
-
-		if (outputOptions.outputHandler != NULL) {
-			int size = rect.Pitch * ((h + 3) / 4);
-			outputOptions.outputHandler->writeData(rect.pBits, size);
-		}
-
-		err = surface->UnlockRect();
-	}
-
-	surface->Release();
-	device->Release();
-	d3d->Release();
-}
-
-#endif // defined(HAVE_D3DX)
-
-
-#if defined(HAVE_STB)
-
-void StbCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-	rgba.swizzle(2, 1, 0, 3); // Swap R and B
-	stb_compress_dxt_block((unsigned char *)output, (unsigned char *)rgba.colors(), 0, 0);
-}
-
-
-#endif // defined(HAVE_STB)
--- a/src/nvtt/CompressorDXT.h
+++ b/src/nvtt/CompressorDXT.h
@ -1,179 +0,0 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#ifndef NV_TT_COMPRESSORDXT_H
-#define NV_TT_COMPRESSORDXT_H
-
-#include <nvcore/nvcore.h>
-#include "nvtt.h"
-#include "Compressor.h"
-
-namespace nv
-{
-	struct ColorBlock;
-
-	struct FixedBlockCompressor : public CompressorInterface
-	{
-		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, const void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
-		virtual uint blockSize() const = 0;
-	};
-
-
-	// Fast CPU compressors.
-	struct FastCompressorDXT1 : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 8; }
-	};
-
-	struct FastCompressorDXT1a : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 8; }
-	};
-
-	struct FastCompressorDXT3 : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 16; }
-	};
-
-	struct FastCompressorDXT5 : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 16; }
-	};
-
-	struct FastCompressorDXT5n : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 16; }
-	};
-
-	struct FastCompressorBC4 : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 8; }
-	};
-
-	struct FastCompressorBC5 : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 16; }
-	};
-
-
-	// Normal CPU compressors.
-	struct NormalCompressorDXT1 : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 8; }
-	};
-
-	struct NormalCompressorDXT1a : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 8; }
-	};
-
-	struct NormalCompressorDXT3 : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 16; }
-	};
-
-	struct NormalCompressorDXT5 : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 16; }
-	};
-
-	struct NormalCompressorDXT5n : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 16; }
-	};
-
-
-	// Production CPU compressors.
-	struct ProductionCompressorBC4 : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 8; }
-	};
-
-	struct ProductionCompressorBC5 : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 16; }
-	};
-
-
-	// External compressors.
-#if defined(HAVE_S3QUANT)
-	struct S3CompressorDXT1 : public CompressorInterface
-	{
-		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-	};
-#endif
-	
-#if defined(HAVE_ATITC)
-	struct AtiCompressorDXT1 : public CompressorInterface
-	{
-		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-	};
-
-	struct AtiCompressorDXT5 : public CompressorInterface
-	{
-		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-	};
-#endif
-
-#if defined(HAVE_SQUISH)
-	struct SquishCompressorDXT1 : public CompressorInterface
-	{
-		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-	};
-#endif
-
-#if defined(HAVE_D3DX)
-	struct D3DXCompressorDXT1 : public CompressorInterface
-	{
-		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-	};
-#endif
-
-#if defined(HAVE_STB)
-	struct StbCompressorDXT1 : public FixedBlockCompressor
-	{
-		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-		virtual uint blockSize() const { return 8; }
-	};
-#endif
-
-} // nv namespace
-
-
-#endif // NV_TT_COMPRESSORDXT_H
--- a/src/nvtt/CompressorRGB.cpp
+++ b/src/nvtt/CompressorRGB.cpp
@ -1,230 +0,0 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#include "CompressorRGB.h"
-#include "CompressionOptions.h"
-#include "OutputOptions.h"
-
-#include <nvimage/Image.h>
-#include <nvimage/FloatImage.h>
-#include <nvimage/PixelFormat.h>
-
-#include <nvmath/Color.h>
-#include <nvmath/Half.h>
-
-#include <nvcore/Debug.h>
-
-using namespace nv;
-using namespace nvtt;
-
-namespace 
-{
-
-	// Returns the byte length of one output scanline: w pixels of
-	// ceil(bitsize / 8) bytes each, rounded up to a multiple of 4 bytes.
-	inline uint computePitch(uint w, uint bitsize)
-	{
-		uint p = w * ((bitsize + 7) / 8);
-
-		// Align to 32 bits.
-		return ((p + 3) / 4) * 4;
-	}
-
-	// Copies w pixels of 4 bytes each from src to dst unchanged (a single memcpy of 4 * w bytes).
-	inline void convert_to_a8r8g8b8(const void * src, void * dst, uint w)
-	{
-		memcpy(dst, src, 4 * w);
-	}
-
-	// Copies w pixels of 4 bytes each from src to dst unchanged; the body is identical to convert_to_a8r8g8b8.
-	inline void convert_to_x8r8g8b8(const void * src, void * dst, uint w)
-	{
-		memcpy(dst, src, 4 * w);
-	}
-
-    // Reinterprets the bits of f as a uint32 through a union and returns
-    // the 16-bit result of half_from_float() for that bit pattern.
-    static uint16 to_half(float f)
-    {
-	    union { float f; uint32 u; } c;
-        c.f = f;
-        return half_from_float(c.u);
-    }
-
-} // namespace
-
-
-
-// Converts a w x h image to an uncompressed pixel layout and hands it, one
-// scanline at a time, to outputOptions.outputHandler (when one is set).
-//
-// Input: when inputFormat is InputFormat_BGRA_8UB, data is read as one packed
-// 32-bit color per pixel; otherwise it is read as planar floats, i.e. four
-// consecutive planes of w * h values holding r, g, b and a.
-//
-// Output layout, taken from compressionOptions:
-//  - PixelType_Float: each channel is written as a 32-bit float, a 16-bit half,
-//    or omitted, according to rsize/gsize/bsize/asize (0, 16 or 32).
-//  - otherwise: channels are packed into bitCount bits, either with the explicit
-//    masks (bitcount != 0) or with the sizes laid out as a, b, g, r from bit 0 up.
-// Every scanline is zero-padded to a multiple of 4 bytes. alphaMode is not used.
-void PixelFormatConverter::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, const void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-	uint bitCount;
-	uint rmask, rshift, rsize;
-	uint gmask, gshift, gsize;
-	uint bmask, bshift, bsize;
-	uint amask, ashift, asize;
-
-    if (compressionOptions.pixelType == nvtt::PixelType_Float)
-    {
-	    // Float output only needs the channel sizes; masks and shifts stay unused.
-	    rsize = compressionOptions.rsize;
-	    gsize = compressionOptions.gsize;
-	    bsize = compressionOptions.bsize;
-	    asize = compressionOptions.asize;
-
-	    nvCheck(rsize == 0 || rsize == 16 || rsize == 32);
-	    nvCheck(gsize == 0 || gsize == 16 || gsize == 32);
-	    nvCheck(bsize == 0 || bsize == 16 || bsize == 32);
-	    nvCheck(asize == 0 || asize == 16 || asize == 32);
-
-	    bitCount = rsize + gsize + bsize + asize;
-    }
-    else
-    {
-	    if (compressionOptions.bitcount != 0)
-	    {
-		    // Explicit masks: derive each channel's shift and size from its mask.
-		    bitCount = compressionOptions.bitcount;
-		    nvCheck(bitCount == 8 || bitCount == 16 || bitCount == 24 || bitCount == 32);
-
-		    rmask = compressionOptions.rmask;
-		    gmask = compressionOptions.gmask;
-		    bmask = compressionOptions.bmask;
-		    amask = compressionOptions.amask;
-
-		    PixelFormat::maskShiftAndSize(rmask, &rshift, &rsize);
-		    PixelFormat::maskShiftAndSize(gmask, &gshift, &gsize);
-		    PixelFormat::maskShiftAndSize(bmask, &bshift, &bsize);
-		    PixelFormat::maskShiftAndSize(amask, &ashift, &asize);
-	    }
-	    else
-	    {
-		    // Sizes only: pack the channels as a, b, g, r starting at bit 0.
-		    rsize = compressionOptions.rsize;
-		    gsize = compressionOptions.gsize;
-		    bsize = compressionOptions.bsize;
-		    asize = compressionOptions.asize;
-
-		    bitCount = rsize + gsize + bsize + asize;
-		    nvCheck(bitCount <= 32);
-
-		    ashift = 0;
-		    bshift = ashift + asize;
-		    gshift = bshift + bsize;
-		    rshift = gshift + gsize;
-
-		    // NOTE(review): a channel size or shift of 32 makes these shifts reach the
-		    // width of the operand — confirm such configurations cannot get here.
-		    rmask = ((1 << rsize) - 1) << rshift;
-		    gmask = ((1 << gsize) - 1) << gshift;
-		    bmask = ((1 << bsize) - 1) << bshift;
-		    amask = ((1 << asize) - 1) << ashift;
-	    }
-    }
-
-	uint byteCount = (bitCount + 7) / 8;
-    uint pitch = computePitch(w, bitCount);
-
-    uint srcPitch = w;
-    uint srcPlane = w * h;
-
-
-    // Allocate output scanline.
-	uint8 * dst = (uint8 *)mem::malloc(pitch + 4);
-
-	for (uint y = 0; y < h; y++)
-	{
-        // Two views of the same row: packed 32-bit colors and planar floats.
-        const uint * src = (const uint *)data + y * srcPitch;
-        const float * fsrc = (const float *)data + y * srcPitch;
-
-        uint8 * ptr = dst;
-
-		for (uint x = 0; x < w; x++)
-		{
-            float r, g, b, a;
-
-            if (inputFormat == nvtt::InputFormat_BGRA_8UB) {
-                Color32 c = Color32(src[x]);
-                r = float(c.r) / 255.0f;
-                g = float(c.g) / 255.0f;
-                b = float(c.b) / 255.0f;
-                a = float(c.a) / 255.0f;
-            }
-            else {
-                nvDebugCheck (inputFormat == nvtt::InputFormat_RGBA_32F);
-
-			    //r = ((float *)src)[4 * x + 0]; // Color components not interleaved.
-			    //g = ((float *)src)[4 * x + 1];
-			    //b = ((float *)src)[4 * x + 2];
-			    //a = ((float *)src)[4 * x + 3];
-			    r = fsrc[x + 0 * srcPlane];
-			    g = fsrc[x + 1 * srcPlane];
-			    b = fsrc[x + 2 * srcPlane];
-			    a = fsrc[x + 3 * srcPlane];
-            }
-
-            if (compressionOptions.pixelType == nvtt::PixelType_Float)
-            {
-			    // Write each enabled channel in r, g, b, a order and advance the cursor.
-			    if (rsize == 32) *((float *)ptr) = r;
-			    else if (rsize == 16) *((uint16 *)ptr) = to_half(r);
-			    ptr += rsize / 8;
-
-			    if (gsize == 32) *((float *)ptr) = g;
-			    else if (gsize == 16) *((uint16 *)ptr) = to_half(g);
-			    ptr += gsize / 8;
-
-			    if (bsize == 32) *((float *)ptr) = b;
-			    else if (bsize == 16) *((uint16 *)ptr) = to_half(b);
-			    ptr += bsize / 8;
-
-			    if (asize == 32) *((float *)ptr) = a;
-			    else if (asize == 16) *((uint16 *)ptr) = to_half(a);
-			    ptr += asize / 8;
-            }
-            else
-            {
-                // Start from zero so that pixel types not handled below produce a
-                // defined value instead of reading uninitialized memory.
-                Color32 c(0, 0, 0, 0);
-                if (compressionOptions.pixelType == nvtt::PixelType_UnsignedNorm) {
-                    c.r = uint8(clamp(r * 255, 0.0f, 255.0f));
-                    c.g = uint8(clamp(g * 255, 0.0f, 255.0f));
-                    c.b = uint8(clamp(b * 255, 0.0f, 255.0f));
-                    c.a = uint8(clamp(a * 255, 0.0f, 255.0f));
-                }
-                // @@ Add support for nvtt::PixelType_SignedInt, nvtt::PixelType_SignedNorm, nvtt::PixelType_UnsignedInt
-
-				uint p = 0;
-				p |= PixelFormat::convert(c.r, 8, rsize) << rshift;
-				p |= PixelFormat::convert(c.g, 8, gsize) << gshift;
-				p |= PixelFormat::convert(c.b, 8, bsize) << bshift;
-				p |= PixelFormat::convert(c.a, 8, asize) << ashift;
-				
-				// Output one byte at a time.
-				for (uint i = 0; i < byteCount; i++)
-				{
-					*(dst + x * byteCount + i) = (p >> (i * 8)) & 0xFF;
-				}
-            }
-        }
-
-		// Zero padding.
-		for (uint x = w * byteCount; x < pitch; x++)
-		{
-			*(dst + x) = 0;
-		}
-
-		if (outputOptions.outputHandler != NULL)
-		{
-			outputOptions.outputHandler->writeData(dst, pitch);
-		}
-    }
-
-	mem::free(dst);
-}
--- a/src/nvtt/CompressorRGBE.cpp
+++ b/src/nvtt/CompressorRGBE.cpp
@ -1,102 +0,0 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#include "CompressorRGBE.h"
-#include "CompressionOptions.h"
-#include "OutputOptions.h"
-
-#include <nvimage/Image.h>
-#include <nvimage/FloatImage.h>
-
-#include <nvmath/Color.h>
-
-#include <nvcore/Debug.h>
-
-using namespace nv;
-using namespace nvtt;
-
-// Packs an RGB float triple into a Color32 that shares one exponent:
-// r, g, b hold scaled 8-bit values and a holds the exponent plus 128.
-// When the largest component is below 1e-32 all four bytes are zero.
-static Color32 toRgbe8(float r, float g, float b)
-{
-    Color32 c;
-    float v = max(max(r, g), b);
-    if (v < 1e-32) {
-        c.r = c.g = c.b = c.a = 0;
-    }
-    else {
-        int e;
-        // frexp splits v into a fraction and exponent e; v is then reused as the
-        // per-channel scale factor (fraction * 256 / largest component).
-        v = frexp(v, &e) * 256.0f / v;
-        c.r = uint8(clamp(r * v, 0.0f, 255.0f));
-        c.g = uint8(clamp(g * v, 0.0f, 255.0f));
-        c.b = uint8(clamp(b * v, 0.0f, 255.0f));
-        c.a = e + 128;
-    }
-
-    return c;
-}
-
-
-// Converts a w x h image to RGBE (one Color32 per pixel, see toRgbe8) and hands
-// it, one scanline of w * 4 bytes at a time, to outputOptions.outputHandler
-// (when one is set).
-//
-// When inputFormat is InputFormat_BGRA_8UB, data is read as one packed 32-bit
-// color per pixel; otherwise it is read as planar floats, with the r, g and b
-// planes of w * h values stored one after another. alphaMode is not used.
-void CompressorRGBE::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, const void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-    nvDebugCheck (compressionOptions.format == nvtt::Format_RGBE);
-
-    uint srcPitch = w;
-    uint srcPlane = w * h;
-
-    // Allocate output scanline: w pixels of sizeof(Color32) bytes each.
-    // (Allocating only w bytes would overflow the buffer in the loop below.)
-	Color32 * dst = (Color32 *)mem::malloc(w * sizeof(Color32));
-
-	for (uint y = 0; y < h; y++)
-	{
-        // Two views of the same row: packed 32-bit colors and planar floats.
-        const uint * src = (const uint *)data + y * srcPitch;
-        const float * fsrc = (const float *)data + y * srcPitch;
-
-		for (uint x = 0; x < w; x++)
-		{
-            float r, g, b;
-
-            if (inputFormat == nvtt::InputFormat_BGRA_8UB) {
-                Color32 c = Color32(src[x]);
-                r = float(c.r) / 255.0f;
-                g = float(c.g) / 255.0f;
-                b = float(c.b) / 255.0f;
-            }
-            else {
-                nvDebugCheck (inputFormat == nvtt::InputFormat_RGBA_32F);
-
-			    // Color components not interleaved.
-			    r = fsrc[x + 0 * srcPlane];
-			    g = fsrc[x + 1 * srcPlane];
-			    b = fsrc[x + 2 * srcPlane];
-            }
-            
-            dst[x] = toRgbe8(r, g, b);
-        }
-
-		if (outputOptions.outputHandler != NULL)
-		{
-			outputOptions.outputHandler->writeData(dst, w * 4);
-		}
-    }
-
-	mem::free(dst);
-}
--- a/src/nvtt/Context.cpp
+++ b/src/nvtt/Context.cpp
--- a/src/nvtt/Context.h
+++ b/src/nvtt/Context.h
@ -1,87 +0,0 @@
-// Copyright NVIDIA Corporation 2008 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#ifndef NV_TT_CONTEXT_H
-#define NV_TT_CONTEXT_H
-
-#include "nvcore/Ptr.h"
-
-#include "nvtt/Compressor.h"
-#include "nvtt/cuda/CudaCompressorDXT.h"
-#include "nvtt.h"
-
-namespace nv
-{
-	class Image;
-}
-
-namespace nvtt
-{
-	struct Mipmap;
-
-	// Private data and helper declarations for nvtt::Compressor.
-	// Only declarations live here; the member functions are defined elsewhere.
-	struct Compressor::Private
-	{
-		Private() {}
-
-		// Entry points taking the private parts of the option objects.
-		bool compress(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
-
-        bool compress2D(InputFormat inputFormat, AlphaMode alphaMode, int w, int h, const void * data, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
-
-		int estimateSize(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions) const;
-
-		// Public outputHeader overload that takes a TexImage and a mipmap count.
-		bool outputHeader(const TexImage & tex, int mipmapCount, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions);
-
-	private:
-
-		// Private outputHeader overload that takes the input options instead.
-		bool outputHeader(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
-
-		// Both return a CompressorInterface pointer; ownership is not visible here — TODO confirm.
-		nv::CompressorInterface * chooseCpuCompressor(const CompressionOptions::Private & compressionOptions) const;
-		nv::CompressorInterface * chooseGpuCompressor(const CompressionOptions::Private & compressionOptions) const;
-
-		// f is presumably the face index (it is passed next to w/h/d and m elsewhere) — TODO confirm.
-		bool compressMipmaps(uint f, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
-
-		bool initMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f, uint m) const;
-
-		int findExactMipmap(const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f) const;
-		int findClosestMipmap(const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f) const;
-
-		// Per-mipmap processing steps; each operates on the Mipmap passed by reference.
-		void downsampleMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions) const;
-		void scaleMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions, uint w, uint h, uint d) const;
-		void premultiplyAlphaMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions) const;
-		void processInputImage(Mipmap & mipmap, const InputOptions::Private & inputOptions) const;
-		void quantizeMipmap(Mipmap & mipmap, const CompressionOptions::Private & compressionOptions) const;
-
-
-	public:
-
-		// CUDA state flags; how they are set is not visible in this header.
-		bool cudaSupported;
-		bool cudaEnabled;
-
-		// CUDA context held through an AutoPtr.
-		nv::AutoPtr<nv::CudaContext> cuda;
-
-	};
-
-} // nvtt namespace
-
-
-#endif // NV_TT_CONTEXT_H
--- a/src/nvtt/InputOptions.cpp
+++ b/src/nvtt/InputOptions.cpp
@ -23,11 +23,8 @@

 #include <string.h> // memcpy

-#include <nvcore/Containers.h> // nextPowerOfTwo
 #include <nvcore/Memory.h>

-#include <nvmath/Color.h>
-
 #include "nvtt.h"
 #include "InputOptions.h"

@ -104,8 +101,6 @@ void InputOptions::reset()
 	
 	m.colorTransform = ColorTransform_None;
 	m.linearTransform = Matrix(identity);
-	for (int i = 0; i < 4; i++) m.colorOffsets[i] = 0;
-	for (int i = 0; i < 4; i++) m.swizzleTransform[i] = i;

 	m.generateMipmaps = true;
 	m.maxLevel = -1;
@ -123,8 +118,6 @@ void InputOptions::reset()
 	
 	m.maxExtent = 0;
 	m.roundMode = RoundMode_None;
-
-	m.premultiplyAlpha = false;
 }


@ -168,8 +161,7 @@ void InputOptions::setTextureLayout(TextureType type, int width, int height, int
 			img.mipLevel = mipLevel;
 			img.face = f;
 			
-			img.uint8data = NULL;
-			img.floatdata = NULL;
+			img.data = NULL;
 			
 			w = max(1U, w / 2);
 			h = max(1U, h / 2);
@ -207,116 +199,14 @@ bool InputOptions::setMipmapData(const void * data, int width, int height, int d
 		return false;
 	}
 	
-	switch(m.inputFormat)
-	{
-		case InputFormat_BGRA_8UB:
-			if (Image * image = new nv::Image())
-			{
-				image->allocate(width, height);
-				memcpy(image->pixels(), data, width * height * 4);
-				m.images[idx].uint8data = image;
-			}
-			else
-			{
-				// @@ Out of memory error.
-				return false;
-			}
-			break;
-		case InputFormat_RGBA_32F:
-			if (FloatImage * image = new nv::FloatImage())
-			{
-				const float * floatData = (const float *)data;
-				image->allocate(4, width, height);
-				
-				for (int c = 0; c < 4; c++)
-				{
-					float * channel = image->channel(c);
-					for (int i = 0; i < width * height; i++)
-					{
-						channel[i] = floatData[i*4 + c];
-					}
-				}
-				
-				m.images[idx].floatdata = image;
-			}
-			else
-			{
-				// @@ Out of memory error.
-				return false;
-			}
-			break;
-		default:
-			return false;
-	}
+	m.images[idx].data = new nv::Image();
+	m.images[idx].data->allocate(width, height);
+	memcpy(m.images[idx].data->pixels(), data, width * height * 4); 
 	
 	return true;
 }


-// Copies the data of a single channel into the image stored for the given face and mip level.
-// Returns false when the dimensions or indices do not match the stored layout, or when the
-// input format is neither InputFormat_BGRA_8UB nor InputFormat_RGBA_32F; returns true otherwise.
-// The image is allocated (and cleared) on first use. Only depth == 1 and channel 0..3 are accepted.
-// NOTE(review): for InputFormat_BGRA_8UB the image is allocated but no channel data is copied
-// (see the TODO below), yet the function still returns true.
-bool InputOptions::setMipmapChannelData(const void * data, int channel, int width, int height, int depth /*= 1*/, int face /*= 0*/, int mipLevel /*= 0*/)
-{
-	nvCheck(depth == 1);
-	nvCheck(channel >= 0 && channel < 4);
-	
-	// Images are stored face-major: all mip levels of face 0, then face 1, and so on.
-	const int idx = face * m.mipmapCount + mipLevel;
-	
-	if (m.images[idx].width != width || m.images[idx].height != height || m.images[idx].depth != depth || m.images[idx].mipLevel != mipLevel || m.images[idx].face != face)
-	{
-		// Invalid dimension or index.
-		return false;
-	}
-	
-	// Allocate image if not allocated already.
-	if (m.inputFormat == InputFormat_BGRA_8UB)
-	{
-		m.images[idx].floatdata = NULL;
-		if (m.images[idx].uint8data == NULL)
-		{
-			m.images[idx].uint8data = new Image();
-			m.images[idx].uint8data->allocate(width, height);
-			m.images[idx].uint8data->fill(Color32(0,0,0,0));
-		}
-	}
-	else if (m.inputFormat == InputFormat_RGBA_32F)
-	{
-		m.images[idx].uint8data = NULL;
-		if (m.images[idx].floatdata == NULL)
-		{
-			m.images[idx].floatdata = new FloatImage();
-			m.images[idx].floatdata->allocate(4, width, height);
-			m.images[idx].floatdata->clear();
-		}
-
-		
-	}
-	else
-	{
-		m.images[idx].floatdata = NULL;
-		m.images[idx].uint8data = NULL;
-		return false;
-	}
-
-	// Copy channel data to image.
-	if (m.inputFormat == InputFormat_BGRA_8UB)
-	{
-		// @@ TODO
-	}
-	else if (m.inputFormat == InputFormat_RGBA_32F)
-	{
-		// data is read as width * height consecutive floats for this one channel.
-		const float * floatData = (const float *)data;
-		float * channelPtr = m.images[idx].floatdata->channel(channel);
-
-		for (int i = 0; i < width * height; i++)
-		{
-			channelPtr[i] = floatData[i];
-		}
-	}
-
-	return true;
-}
-
-
 /// Describe the format of the input.
 void InputOptions::setFormat(InputFormat format)
 {
@ -411,32 +301,8 @@ void InputOptions::setLinearTransform(int channel, float w0, float w1, float w2,
 {
 	nvCheck(channel >= 0 && channel < 4);

-	m.linearTransform(channel, 0) = w0;
-	m.linearTransform(channel, 1) = w1;
-	m.linearTransform(channel, 2) = w2;
-	m.linearTransform(channel, 3) = w3;
-}
-
-// Overload that also takes an offset: sets the four weights through the
-// four-weight overload and stores offset in m.colorOffsets[channel].
-// channel must be in 0..3.
-void InputOptions::setLinearTransform(int channel, float w0, float w1, float w2, float w3, float offset)
-{
-	nvCheck(channel >= 0 && channel < 4);
-
-	setLinearTransform(channel, w0, w1, w2, w3);
-
-	m.colorOffsets[channel] = offset;
-}
-
-void InputOptions::setSwizzleTransform(int x, int y, int z, int w)
-{
-	nvCheck(x >= 0 && x <= 6);
-	nvCheck(y >= 0 && y <= 6);
-	nvCheck(z >= 0 && z <= 6);
-	nvCheck(w >= 0 && w <= 6);
-	
-	m.swizzleTransform[0] = x;
-	m.swizzleTransform[1] = y;
-	m.swizzleTransform[2] = z;
-	m.swizzleTransform[3] = w;
+	Vector4 w(w0, w1, w2, w3);
+	//m.linearTransform.setRow(channel, w);
 }

 void InputOptions::setMaxExtents(int e)
@ -450,10 +316,6 @@ void InputOptions::setRoundMode(RoundMode mode)
 	m.roundMode = mode;
 }

-// Stores the premultiply-alpha flag in the private options; nothing else happens here.
-void InputOptions::setPremultiplyAlpha(bool b)
-{
-	m.premultiplyAlpha = b;
-}

 void InputOptions::Private::computeTargetExtents() const
 {
@ -533,7 +395,7 @@ const Image * InputOptions::Private::image(uint face, uint mipmap) const
 	nvDebugCheck(image.face == face);
 	nvDebugCheck(image.mipLevel == mipmap);

-	return image.uint8data.ptr();
+	return image.data.ptr();
 }

 const Image * InputOptions::Private::image(uint idx) const
@ -542,14 +404,5 @@ const Image * InputOptions::Private::image(uint idx) const

 	const InputImage & image = this->images[idx];

-	return image.uint8data.ptr();
-}
-
-const FloatImage * InputOptions::Private::floatImage(uint idx) const
-{
-	nvDebugCheck(idx < faceCount * mipmapCount);
-
-	const InputImage & image = this->images[idx];
-
-	return image.floatdata.ptr();
+	return image.data.ptr();
 }
--- a/src/nvtt/InputOptions.h
+++ b/src/nvtt/InputOptions.h
@ -28,7 +28,6 @@
 #include <nvmath/Vector.h>
 #include <nvmath/Matrix.h>
 #include <nvimage/Image.h>
-#include <nvimage/FloatImage.h>
 #include "nvtt.h"

 namespace nvtt
@ -57,8 +56,6 @@ namespace nvtt
 		// Color transform.
 		ColorTransform colorTransform;
 		nv::Matrix linearTransform;
-		float colorOffsets[4];
-		uint swizzleTransform[4];
 		
 		// Mipmap generation options.
 		bool generateMipmaps;
@ -81,8 +78,6 @@ namespace nvtt
 		uint maxExtent;
 		RoundMode roundMode;
 		
-		bool premultiplyAlpha;
-
 		// @@ These are computed in nvtt::compress, so they should be mutable or stored elsewhere...
 		mutable uint targetWidth;
 		mutable uint targetHeight;
@ -94,9 +89,7 @@ namespace nvtt
 		int realMipmapCount() const;
 		
 		const nv::Image * image(uint face, uint mipmap) const;
-		const nv::Image * image(uint idx) const;
-
-		const nv::FloatImage * floatImage(uint idx) const;
+		const nv::Image * image(uint idx) const;

 	};

@ -105,8 +98,6 @@ namespace nvtt
 	{
 		InputImage() {}
 		
-		bool hasValidData() const { return uint8data != NULL || floatdata != NULL; }
-		
 		int mipLevel;
 		int face;
 		
@ -114,8 +105,7 @@ namespace nvtt
 		int height;
 		int depth;
 		
-		nv::AutoPtr<nv::Image> uint8data;
-		nv::AutoPtr<nv::FloatImage> floatdata;
+		nv::AutoPtr<nv::Image> data;
 	};

 } // nvtt namespace
--- a/src/nvtt/OptimalCompressDXT.cpp
+++ b/src/nvtt/OptimalCompressDXT.cpp
@ -21,17 +21,16 @@
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 // OTHER DEALINGS IN THE SOFTWARE.

-#include "OptimalCompressDXT.h"
-#include "SingleColorLookup.h"
+#include <nvcore/Containers.h> // swap
+
+#include <nvmath/Color.h>

 #include <nvimage/ColorBlock.h>
 #include <nvimage/BlockDXT.h>

-#include <nvmath/Color.h>
+#include "OptimalCompressDXT.h"
+#include "SingleColorLookup.h"

-#include <nvcore/Containers.h> // swap
-
-#include <limits.h>

 using namespace nv;
 using namespace OptimalCompress;
@ -40,37 +39,10 @@ using namespace OptimalCompress;

 namespace
 {
-	// Squared difference between two green values (the abs() variant is left commented out).
-	static int greenDistance(int g0, int g1)
-	{
-		//return abs(g0 - g1);
-		int d = g0 - g1;
-		return d * d;
-	}
-
-	// Squared difference between two alpha values (the abs() variant is left commented out).
-	static int alphaDistance(int a0, int a1)
-	{
-		//return abs(a0 - a1);
-		int d = a0 - a1;
-		return d * d;
-	}
-
-	// Picks one of the four values interpolated between maxGreen and minGreen in thirds:
-	// computes an index in 0..3 from green's position in the range and returns
-	// (index * minGreen + (3 - index) * maxGreen) / 3. The index stays 0 when max == min.
-	// NOTE(review): all arithmetic is unsigned, so (bias - green) wraps when green > bias;
-	// the clamp then yields index 3 — confirm that is the intended result for such inputs.
-	static uint nearestGreen4(uint green, uint maxGreen, uint minGreen)
-	{
-		uint bias = maxGreen + (maxGreen - minGreen) / 6;
-
-		uint index = 0;
-		if (maxGreen - minGreen != 0) index = clamp(3 * (bias - green) / (maxGreen - minGreen), 0U, 3U);
-
-		return (index * minGreen + (3 - index) * maxGreen) / 3;
-	}
-
-	static int computeGreenError(const ColorBlock & rgba, const BlockDXT1 * block, int bestError = INT_MAX)
+	static int computeGreenError(const ColorBlock & rgba, const BlockDXT1 * block)
 	{
 		nvDebugCheck(block != NULL);

-	//	uint g0 = (block->col0.g << 2) | (block->col0.g >> 4);
-	//	uint g1 = (block->col1.g << 2) | (block->col1.g >> 4);
-
 		int palette[4];
 		palette[0] = (block->col0.g << 2) | (block->col0.g >> 4);
 		palette[1] = (block->col1.g << 2) | (block->col1.g >> 4);
@ -78,24 +50,17 @@ namespace
 		palette[3] = (2 * palette[1] + palette[0]) / 3;

 		int totalError = 0;
+
 		for (int i = 0; i < 16; i++)
 		{
 			const int green = rgba.color(i).g;
 			
-			int error = greenDistance(green, palette[0]);
-			error = min(error, greenDistance(green, palette[1]));
-			error = min(error, greenDistance(green, palette[2]));
-			error = min(error, greenDistance(green, palette[3]));
-
+			int error = abs(green - palette[0]);
+			error = min(error, abs(green - palette[1]));
+			error = min(error, abs(green - palette[2]));
+			error = min(error, abs(green - palette[3]));
+			
 			totalError += error;
-
-		//	totalError += nearestGreen4(green, g0, g1);
-
-			if (totalError > bestError)
-			{
-				// early out
-				return totalError;
-			}
 		}

 		return totalError;
@ -113,10 +78,10 @@ namespace
 		{
 			const int color = rgba.color(i).g;
 			
-			uint d0 = greenDistance(color0, color);
-			uint d1 = greenDistance(color1, color);
-			uint d2 = greenDistance(color2, color);
-			uint d3 = greenDistance(color3, color);
+			uint d0 = abs(color0 - color);
+			uint d1 = abs(color1 - color);
+			uint d2 = abs(color2 - color);
+			uint d3 = abs(color3 - color);
 			
 			uint b0 = d0 > d3;
 			uint b1 = d1 > d2;
@ -137,78 +102,49 @@ namespace
 	// Choose quantized color that produces less error. Used by DXT3 compressor.
+	// Tries the 4-bit codes (a >> 4) - 1, (a >> 4) and (a >> 4) + 1, expands each to
+	// 8 bits by repeating the nibble, and returns the code whose expansion is nearest to a.
+	// NOTE(review): with the clamps removed, q0 can be -1 and q2 can be 16 before
+	// expansion — confirm those out-of-range candidates can never be selected and that
+	// left-shifting the negative q0 is acceptable on the target compilers.
 	inline static uint quantize4(uint8 a)
 	{
-		int q0 = max(int(a >> 4) - 1, 0);
+		int q0 = (a >> 4) - 1;
 		int q1 = (a >> 4);
-		int q2 = min(int(a >> 4) + 1, 0xF);
+		int q2 = (a >> 4) + 1;
 		
 		q0 = (q0 << 4) | q0;
 		q1 = (q1 << 4) | q1;
 		q2 = (q2 << 4) | q2;
 		
-		int d0 = alphaDistance(q0, a);
-		int d1 = alphaDistance(q1, a);
-		int d2 = alphaDistance(q2, a);
+		int d0 = abs(q0 - a);
+		int d1 = abs(q1 - a);
+		int d2 = abs(q2 - a);

 		if (d0 < d1 && d0 < d2) return q0 >> 4;
 		if (d1 < d2) return q1 >> 4;
 		return q2 >> 4;
 	}
 	
-	// Picks one of the eight values interpolated between maxAlpha and minAlpha in sevenths:
-	// computes an index in 0..7 from alpha's position in the range and returns
-	// (index * minAlpha + (7 - index) * maxAlpha) / 7.
-	// NOTE(review): scale divides by (maxAlpha - minAlpha) with no guard for max == min,
-	// unlike nearestGreen4 — confirm callers never pass equal endpoints.
-	static uint nearestAlpha8(uint alpha, uint maxAlpha, uint minAlpha)
-	{
-		float bias = maxAlpha + float(maxAlpha - minAlpha) / (2.0f * 7.0f);
-		float scale = 7.0f / float(maxAlpha - minAlpha);
-
-		uint index = (uint)clamp((bias - float(alpha)) * scale, 0.0f, 7.0f);
-
-		return (index * minAlpha + (7 - index) * maxAlpha) / 7;
-	}
-
-	// Sums, over the 16 texels of rgba, the squared distance between each alpha and the
-	// value nearestAlpha8() picks for it from block->alpha0 / block->alpha1.
-	// Returns as soon as the running total exceeds bestError, so the result is then
-	// only a lower bound on the full error.
-	static uint computeAlphaError8(const ColorBlock & rgba, const AlphaBlockDXT5 * block, int bestError = INT_MAX)
-	{
-		int totalError = 0;
-
-		for (uint i = 0; i < 16; i++)
-		{
-			uint8 alpha = rgba.color(i).a;
-
-			totalError += alphaDistance(alpha, nearestAlpha8(alpha, block->alpha0, block->alpha1));
-
-			if (totalError > bestError)
-			{
-				// early out
-				return totalError;
-			}
-		}
-
-		return totalError;
-	}
-
-	static uint computeAlphaError(const ColorBlock & rgba, const AlphaBlockDXT5 * block, int bestError = INT_MAX)
+	static uint computeAlphaError(const ColorBlock & rgba, const AlphaBlockDXT5 * block)
 	{
 		uint8 alphas[8];
 		block->evaluatePalette(alphas);

-		int totalError = 0;
+		uint totalError = 0;

 		for (uint i = 0; i < 16; i++)
 		{
 			uint8 alpha = rgba.color(i).a;

-			int minDist = INT_MAX;
+			uint besterror = 256*256;
+			uint best;
 			for (uint p = 0; p < 8; p++)
 			{
-				int dist = alphaDistance(alpha, alphas[p]);
-				minDist = min(dist, minDist);
+				int d = alphas[p] - alpha;
+				uint error = d * d;
+
+				if (error < besterror)
+				{
+					besterror = error;
+					best = p;
+				}
 			}

-			totalError += minDist;
-
-			if (totalError > bestError)
-			{
-				// early out
-				return totalError;
-			}
+			totalError += besterror;
 		}

 		return totalError;
@ -223,21 +159,22 @@ namespace
 		{
 			uint8 alpha = rgba.color(i).a;

-			int minDist = INT_MAX;
-			int bestIndex = 8;
-			for (uint p = 0; p < 8; p++)
+			uint besterror = 256*256;
+			uint best = 8;
+			for(uint p = 0; p < 8; p++)
 			{
-				int dist = alphaDistance(alpha, alphas[p]);
+				int d = alphas[p] - alpha;
+				uint error = d * d;

-				if (dist < minDist)
+				if (error < besterror)
 				{
-					minDist = dist;
-					bestIndex = p;
+					besterror = error;
+					best = p;
 				}
 			}
-			nvDebugCheck(bestIndex < 8);
+			nvDebugCheck(best < 8);

-			block->setIndex(i, bestIndex);
+			block->setIndex(i, best);
 		}
 	}

@ -280,23 +217,6 @@ void OptimalCompress::compressDXT1a(Color32 rgba, BlockDXT1 * dxtBlock)
 	}
 }

-// Fills a DXT1 block for a single green value g.
-// Both endpoints get r = 31 and b = 0; their green components come from the
-// OMatch6 lookup table row for g. Every 2-bit index is set to 2 (0xaaaaaaaa).
-void OptimalCompress::compressDXT1G(uint8 g, BlockDXT1 * dxtBlock)
-{
-	dxtBlock->col0.r = 31;
-	dxtBlock->col0.g = OMatch6[g][0];
-	dxtBlock->col0.b = 0;
-	dxtBlock->col1.r = 31;
-	dxtBlock->col1.g = OMatch6[g][1];
-	dxtBlock->col1.b = 0;
-	dxtBlock->indices = 0xaaaaaaaa;
-
-	// Keep col0 >= col1: swap the endpoints and flip the low bit of every index to match.
-	if (dxtBlock->col0.u < dxtBlock->col1.u)
-	{
-		swap(dxtBlock->col0.u, dxtBlock->col1.u);
-		dxtBlock->indices ^= 0x55555555;
-	}
-}
-

 // Brute force green channel compressor
 void OptimalCompress::compressDXT1G(const ColorBlock & rgba, BlockDXT1 * block)
@ -306,23 +226,12 @@ void OptimalCompress::compressDXT1G(const ColorBlock & rgba, BlockDXT1 * block)
 	uint8 ming = 63;
 	uint8 maxg = 0;
 	
-	bool isSingleColor = true;
-	uint8 singleColor = rgba.color(0).g;
-
 	// Get min/max green.
 	for (uint i = 0; i < 16; i++)
 	{
-		uint8 green = (rgba.color(i).g + 1) >> 2;
+		uint8 green = rgba.color(i).g >> 2;
 		ming = min(ming, green);
 		maxg = max(maxg, green);
-
-		if (rgba.color(i).g != singleColor) isSingleColor = false;
-	}
-
-	if (isSingleColor)
-	{
-		compressDXT1G(singleColor, block);
-		return;
 	}

 	block->col0.r = 31;
@ -332,38 +241,36 @@ void OptimalCompress::compressDXT1G(const ColorBlock & rgba, BlockDXT1 * block)
 	block->col0.b = 0;
 	block->col1.b = 0;

-	int bestError = computeGreenError(rgba, block);
-	int bestg0 = maxg;
-	int bestg1 = ming;
-
-	// Expand search space a bit.
-	const int greenExpand = 4;
-	ming = (ming <= greenExpand) ? 0 : ming - greenExpand;
-	maxg = (maxg >= 63-greenExpand) ? 63 : maxg + greenExpand;
-
-	for (int g0 = ming+1; g0 <= maxg; g0++)
+	if (maxg - ming > 4)
 	{
-		for (int g1 = ming; g1 < g0; g1++)
+		int besterror = computeGreenError(rgba, block);
+		int bestg0 = maxg;
+		int bestg1 = ming;
+		
+		for (int g0 = ming+5; g0 < maxg; g0++)
 		{
-			block->col0.g = g0;
-			block->col1.g = g1;
-			int error = computeGreenError(rgba, block, bestError);
-			
-			if (error < bestError)
+			for (int g1 = ming; g1 < g0-4; g1++)
 			{
-				bestError = error;
-				bestg0 = g0;
-				bestg1 = g1;
+				if ((maxg-g0) + (g1-ming) > besterror)
+					continue;
+				
+				block->col0.g = g0;
+				block->col1.g = g1;
+				int error = computeGreenError(rgba, block);
+				
+				if (error < besterror)
+				{
+					besterror = error;
+					bestg0 = g0;
+					bestg1 = g1;
+				}
 			}
 		}
+		
+		block->col0.g = bestg0;
+		block->col1.g = bestg1;
 	}
 	
-	block->col0.g = bestg0;
-	block->col1.g = bestg1;
-
-	nvDebugCheck(bestg0 == bestg1 || block->isFourColorMode());
-
-
 	Color32 palette[4];
 	block->evaluatePalette(palette);
 	block->indices = computeGreenIndices(rgba, palette);
@ -406,26 +313,42 @@ void OptimalCompress::compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dx
 	dxtBlock->alpha0 = maxa;
 	dxtBlock->alpha1 = mina;

+	/*int centroidDist = 256;
+	int centroid;
+
+	// Get the closest to the centroid.
+	for (uint i = 0; i < 16; i++)
+	{
+		uint8 alpha = rgba.color(i).a;
+		int dist = abs(alpha - (maxa + mina) / 2);
+		if (dist < centroidDist)
+		{
+			centroidDist = dist;
+			centroid = alpha;
+		}
+	}*/
+
 	if (maxa - mina > 8)
 	{
 		int besterror = computeAlphaError(rgba, dxtBlock);
 		int besta0 = maxa;
 		int besta1 = mina;

-		// Expand search space a bit.
-		const int alphaExpand = 8;
-		mina = (mina <= alphaExpand) ? 0 : mina - alphaExpand;
-		maxa = (maxa >= 255-alphaExpand) ? 255 : maxa + alphaExpand;
-
 		for (int a0 = mina+9; a0 < maxa; a0++)
 		{
 			for (int a1 = mina; a1 < a0-8; a1++)
+			//for (int a1 = mina; a1 < maxa; a1++)
 			{
-				nvDebugCheck(a0 - a1 > 8);
+				//nvCheck(abs(a1-a0) > 8);
+
+				//if (abs(a0 - a1) < 8) continue;
+				//if ((maxa-a0) + (a1-mina) + min(abs(centroid-a0), abs(centroid-a1)) > besterror)
+				if ((maxa-a0) + (a1-mina) > besterror)
+					continue;

 				dxtBlock->alpha0 = a0;
 				dxtBlock->alpha1 = a1;
-				int error = computeAlphaError(rgba, dxtBlock, besterror);
+				int error = computeAlphaError(rgba, dxtBlock);

 				if (error < besterror)
 				{
--- a/src/nvtt/OptimalCompressDXT.h
+++ b/src/nvtt/OptimalCompressDXT.h
@ -26,8 +26,6 @@

 #include <nvimage/nvimage.h>

-#include <nvmath/Color.h>
-
 namespace nv
 {
 	struct ColorBlock;
@ -41,7 +39,6 @@ namespace nv
 	{
 		void compressDXT1(Color32 rgba, BlockDXT1 * dxtBlock);
 		void compressDXT1a(Color32 rgba, BlockDXT1 * dxtBlock);
-		void compressDXT1G(uint8 g, BlockDXT1 * dxtBlock);
 		
 		void compressDXT1G(const ColorBlock & rgba, BlockDXT1 * block);
 		void compressDXT3A(const ColorBlock & rgba, AlphaBlockDXT3 * dxtBlock);
--- a/src/nvtt/OutputOptions.cpp
+++ b/src/nvtt/OutputOptions.cpp
@ -33,9 +33,6 @@ OutputOptions::OutputOptions() : m(*new OutputOptions::Private())

 OutputOptions::~OutputOptions()
 {
-	// Cleanup output handler.
-	setOutputHandler(NULL);
-
 	delete &m;
 }

@ -46,31 +43,20 @@ void OutputOptions::reset()
 	m.outputHandler = NULL;
 	m.errorHandler = NULL;
 	m.outputHeader = true;
-	m.container = Container_DDS;
 }


 /// Set output file name.
 void OutputOptions::setFileName(const char * fileName)
 {
-	m.fileName = fileName; // @@ Do we need to record filename?
+	m.fileName = fileName; // Only recorded here; Private::openFile() builds the DefaultOutputHandler from it.
 	m.outputHandler = NULL;
-
-	DefaultOutputHandler * oh = new DefaultOutputHandler(fileName);
-	if (!oh->stream.isError())
-	{
-		m.outputHandler = oh;
-	}
 }

 /// Set output handler.
 void OutputOptions::setOutputHandler(OutputHandler * outputHandler)
 {
-	if (!m.fileName.isNull())
-	{
-		delete m.outputHandler;
-		m.fileName.reset();
-	}
+	m.fileName.reset(); // Reset the recorded file name; the supplied handler is stored as-is below.
 	m.outputHandler = outputHandler;
 }

@ -86,20 +72,38 @ void OutputOptions::setOutputHeader(bool outputHeader)
 	m.outputHeader = outputHeader;
 }

-/// Set container.
-void OutputOptions::setContainer(Container container)
-{
-	m.container = container;
-}

-
-bool OutputOptions::Private::hasValidOutputHandler() const
+/// Open the output file when a file name has been set.
+/// Creates a DefaultOutputHandler for fileName and stores it in outputHandler.
+/// Returns false if the stream reports an error; returns true otherwise, including when no file name is set.
+bool OutputOptions::Private::openFile() const
 {
 	if (!fileName.isNull())
 	{
-		return outputHandler != NULL;
+		nvCheck(outputHandler == NULL);
+		
+		DefaultOutputHandler * oh = new DefaultOutputHandler(fileName.str());
+		if (oh->stream.isError())
+		{
+			// The handler is not stored on failure, so release it here instead of leaking it.
+			delete oh;
+			return false;
+		}
+		
+		outputHandler = oh;
 	}
 	
 	return true;
 }

+/// Delete the output handler and set it back to NULL, but only when a file name is set;
+/// when no file name is set the handler is left untouched.
+void OutputOptions::Private::closeFile() const
+{
+	if (!fileName.isNull())
+	{
+		delete outputHandler;
+		outputHandler = NULL;
+	}
+}
+
--- a/src/nvtt/OutputOptions.h
+++ b/src/nvtt/OutputOptions.h
@ -52,7 +52,7 @@ namespace nvtt
 			//return !stream.isError();
 			return true;
 		}
-
+		
 		nv::StdOutputStream stream;
 	};
 	
@ -61,12 +61,12 @@ namespace nvtt
 	{
 		nv::Path fileName;
 		
-		OutputHandler * outputHandler;
+		mutable OutputHandler * outputHandler;
 		ErrorHandler * errorHandler;
 		bool outputHeader;
-		Container container;
 		
-		bool hasValidOutputHandler() const;
+		bool openFile() const;
+		void closeFile() const;
 	};

 	
--- a/Show More
+++ b/Show More