Import all sources from perforce.

2007-04-17 08:49:19 +00:00
commit 7543dd1efa
197 changed files with 49819 additions and 0 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -0,0 +1,92 @@
+
+SUBDIRS(nvcore)
+SUBDIRS(nvmath)
+SUBDIRS(nvimage)
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+
+# OpenGL
+INCLUDE(FindOpenGL)
+IF(OPENGL_FOUND)
+	MESSAGE(STATUS "Looking for OpenGL - found")
+ELSE(OPENGL_FOUND)
+	MESSAGE(STATUS "Looking for OpenGL - not found")
+ENDIF(OPENGL_FOUND)
+
+# GLUT
+INCLUDE(${NV_CMAKE_DIR}/FindGLUT.cmake)
+#INCLUDE(FindGLUT)
+IF(GLUT_FOUND)
+	MESSAGE(STATUS "Looking for GLUT - found")
+ELSE(GLUT_FOUND)
+	MESSAGE(STATUS "Looking for GLUT - not found")
+ENDIF(GLUT_FOUND)
+
+# GLEW
+INCLUDE(${NV_CMAKE_DIR}/FindGLEW.cmake)
+IF(GLEW_FOUND)
+	MESSAGE(STATUS "Looking for GLEW - found")
+ELSE(GLEW_FOUND)
+	MESSAGE(STATUS "Looking for GLEW - not found")
+ENDIF(GLEW_FOUND)
+
+# Cg
+INCLUDE(${NV_CMAKE_DIR}/FindCg.cmake)
+IF(CG_FOUND)
+	MESSAGE(STATUS "Looking for Cg - found")
+ELSE(CG_FOUND)
+	MESSAGE(STATUS "Looking for Cg - not found")
+ENDIF(CG_FOUND)
+
+# CUDA
+INCLUDE(${NV_CMAKE_DIR}/FindCUDA.cmake)
+IF(CUDA_FOUND)
+	MESSAGE(STATUS "Looking for CUDA - found")
+ELSE(CUDA_FOUND)
+	MESSAGE(STATUS "Looking for CUDA - not found")
+ENDIF(CUDA_FOUND)
+
+# JPEG
+INCLUDE(FindJPEG)
+IF(JPEG_FOUND)
+	# Store the value of JPEG_FOUND rather than the literal text "JPEG_FOUND";
+	# HAVE_JPEG feeds the "#cmakedefine HAVE_JPEG" line of nvconfig.h.in.
+	SET(HAVE_JPEG ${JPEG_FOUND})
+	MESSAGE(STATUS "Looking for JPEG - found")
+ELSE(JPEG_FOUND)
+	MESSAGE(STATUS "Looking for JPEG - not found")
+ENDIF(JPEG_FOUND)
+
+# PNG
+INCLUDE(FindPNG)
+IF(PNG_FOUND)
+	# Same as above: HAVE_PNG drives "#cmakedefine HAVE_PNG".
+	SET(HAVE_PNG ${PNG_FOUND})
+	MESSAGE(STATUS "Looking for PNG - found")
+ELSE(PNG_FOUND)
+	MESSAGE(STATUS "Looking for PNG - not found")
+ENDIF(PNG_FOUND)
+
+# TIFF
+INCLUDE(FindTIFF)
+IF(TIFF_FOUND)
+	# Same as above: HAVE_TIFF drives "#cmakedefine HAVE_TIFF".
+	SET(HAVE_TIFF ${TIFF_FOUND})
+	MESSAGE(STATUS "Looking for TIFF - found")
+ELSE(TIFF_FOUND)
+	MESSAGE(STATUS "Looking for TIFF - not found")
+ENDIF(TIFF_FOUND)
+
+# Qt
+FIND_PACKAGE(Qt4)
+
+# Threads
+FIND_PACKAGE(Threads)
+
+# configuration file
+INCLUDE(CheckIncludeFiles)
+
+CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
+CHECK_INCLUDE_FILES(stdarg.h HAVE_STDARG_H)
+CHECK_INCLUDE_FILES(signal.h HAVE_SIGNAL_H)
+CHECK_INCLUDE_FILES(execinfo.h HAVE_EXECINFO_H)
+CHECK_INCLUDE_FILES(malloc.h HAVE_MALLOC_H)
+
+CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/nvconfig.h.in ${CMAKE_CURRENT_BINARY_DIR}/nvconfig.h)
+
--- a/src/nvconfig.h.in
+++ b/src/nvconfig.h.in
@ -0,0 +1,14 @@
+#ifndef NV_CONFIG
+#define NV_CONFIG
+
+#cmakedefine HAVE_UNISTD_H
+#cmakedefine HAVE_STDARG_H
+#cmakedefine HAVE_SIGNAL_H
+#cmakedefine HAVE_EXECINFO_H
+#cmakedefine HAVE_MALLOC_H
+
+#cmakedefine HAVE_PNG
+#cmakedefine HAVE_JPEG
+#cmakedefine HAVE_TIFF
+
+#endif // NV_CONFIG
--- a/src/nvcore/BitArray.h
+++ b/src/nvcore/BitArray.h
@ -0,0 +1,168 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_CORE_BITARRAY_H
+#define NV_CORE_BITARRAY_H
+
+#include <nvcore/nvcore.h>
+#include <nvcore/Containers.h>
+
+namespace nv
+{
+
+/// Return how many bits of the byte @a x are set to one.
+inline uint bitsSet(uint8 x)
+{
+	uint total = 0;
+	while( x != 0 ) {
+		// Account for the lowest bit, then shift the next one into its place.
+		if( (x & 1) != 0 ) {
+			total++;
+		}
+		x >>= 1;
+	}
+	return total;
+}
+
+
+/// Count the bits set in @a x, looking at the low-order bits first and
+/// stopping once @a bits positions have been examined or no set bit is left.
+inline uint bitsSet(uint32 x, int bits)
+{
+	uint total = 0;
+	while( x != 0 && bits != 0 ) {
+		total += (x & 1);
+		x >>= 1;
+		bits--;
+	}
+	return total;
+}
+
+
+/// Simple bit array.
+/// Bits are packed eight per byte into an Array<uint8>; bit @c b lives in
+/// byte @c b>>3 at position @c b&7 (least significant bit first).
+class BitArray
+{
+public:
+
+	/// Default ctor. Creates an empty bit array.
+	BitArray() : m_size(0) {}
+
+	/// Ctor with initial size, in bits.
+	BitArray(uint sz)
+	{
+		resize(sz);
+	}
+
+	/// Get the array size, in bits.
+	uint size() const { return m_size; }
+
+	/// Clear the array, leaving it with zero bits.
+	void clear() { resize(0); }
+
+	/// Set the array size, in bits. The byte storage is rounded up to hold them.
+	void resize(uint sz)
+	{ 
+		m_size = sz;
+		m_bitArray.resize( (m_size + 7) >> 3 );
+	}
+	
+	/// Get the bit at index @a b.
+	bool bitAt(uint b) const
+	{
+		nvDebugCheck( b < m_size );
+		return (m_bitArray[b >> 3] & (1 << (b & 7))) != 0;
+	}
+
+	/// Set the bit at index @a b.
+	void setBitAt(uint b)
+	{
+		nvDebugCheck( b < m_size );
+		m_bitArray[b >> 3] |=  (1 << (b & 7));
+	}
+
+	/// Clear the bit at index @a b.
+	void clearBitAt( uint b )
+	{
+		nvDebugCheck( b < m_size );
+		m_bitArray[b >> 3] &= ~(1 << (b & 7));
+	}
+
+	/// Clear all the bits.
+	void clearAll()
+	{
+		memset(m_bitArray.unsecureBuffer(), 0, m_bitArray.size());
+	}
+
+	/// Set all the bits. The unused padding bits of the last byte are set too;
+	/// the counting methods ignore them.
+	void setAll()
+	{
+		memset(m_bitArray.unsecureBuffer(), 0xFF, m_bitArray.size());
+	}
+
+	/// Toggle all the bits (padding bits of the last byte included).
+	void toggleAll()
+	{
+		const uint byte_num = m_bitArray.size();
+		for(uint b = 0; b < byte_num; b++) {
+			m_bitArray[b] ^= 0xFF;
+		}
+	}
+	
+	/// Get a byte of the bit array. @a index is a byte index, not a bit index.
+	const uint8 & byteAt(uint index) const
+	{
+		return m_bitArray[index];
+	}
+
+	/// Set the given byte of the bit array. @a index is a byte index.
+	void setByteAt(uint index, uint8 b)
+	{
+		m_bitArray[index] = b;
+	}
+	
+	/// Count the number of bits set, ignoring the padding bits of the last byte.
+	uint countSetBits() const
+	{
+		const uint num = m_bitArray.size();
+		if( num == 0 ) {
+			return 0;
+		}
+		
+		uint count = 0;
+		for(uint i = 0; i < num - 1; i++) {
+			count += bitsSet(m_bitArray[i]);
+		}
+		// Only part of the last byte may be in use.
+		count += bitsSet(m_bitArray[num-1], bitsInLastByte());
+		
+		return count;
+	}
+
+	/// Count the number of bits clear, ignoring the padding bits of the last byte.
+	uint countClearBits() const {
+		
+		const uint num = m_bitArray.size();
+		if( num == 0 ) {
+			return 0;
+		}
+		
+		uint count = 0;
+		for(uint i = 0; i < num - 1; i++) {
+			// The cast drops the high bits introduced by integer promotion of '~'.
+			count += bitsSet(uint8(~m_bitArray[i]));
+		}
+		count += bitsSet(uint8(~m_bitArray[num-1]), bitsInLastByte());
+		
+		return count;
+	}
+
+	/// Exchange the contents of two bit arrays.
+	friend void swap(BitArray & a, BitArray & b)
+	{
+		swap(a.m_size, b.m_size);
+		swap(a.m_bitArray, b.m_bitArray);
+	}
+
+
+private:
+
+	/// Number of bits in use in the last byte of storage. When the size is a
+	/// multiple of eight the last byte is completely used, so this returns 8
+	/// rather than 0. Only meaningful when the array is not empty.
+	int bitsInLastByte() const
+	{
+		const int rem = int(m_size & 0x7);
+		return (rem == 0) ? 8 : rem;
+	}
+
+	/// Number of bits stored.
+	uint m_size;
+
+	/// Array of bits.
+	Array<uint8> m_bitArray;
+
+};
+
+} // nv namespace
+
+#endif // NV_CORE_BITARRAY_H
--- a/src/nvcore/CMakeLists.txt
+++ b/src/nvcore/CMakeLists.txt
@ -0,0 +1,36 @@
+PROJECT(nvcore)
+ADD_SUBDIRECTORY(poshlib)
+
+SET(CORE_SRCS
+	nvcore.h
+	BitArray.h
+	Memory.h
+	Memory.cpp
+	Debug.h
+	Debug.cpp
+	Containers.h
+	StrLib.h
+	StrLib.cpp
+	Stream.h
+	StdStream.h
+	TextReader.h
+	TextReader.cpp
+	TextWriter.h
+	Tokenizer.h
+	Tokenizer.cpp
+	Radix.h
+	Radix.cpp)
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# targets
+ADD_DEFINITIONS(-DNVCORE_EXPORTS)
+
+IF(NVCORE_SHARED)
+	ADD_LIBRARY(nvcore SHARED ${CORE_SRCS})
+ELSE(NVCORE_SHARED)
+	ADD_LIBRARY(nvcore ${CORE_SRCS})
+ENDIF(NVCORE_SHARED)
+
+TARGET_LINK_LIBRARIES(nvcore ${LIBS})
+
--- a/src/nvcore/Containers.h
+++ b/src/nvcore/Containers.h
--- a/src/nvcore/Debug.cpp
+++ b/src/nvcore/Debug.cpp
@ -0,0 +1,456 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/Debug.h>
+#include <nvcore/StrLib.h>
+
+// Extern
+#if NV_OS_WIN32 //&& NV_CC_MSVC
+#	define WIN32_LEAN_AND_MEAN
+#	define VC_EXTRALEAN
+#	include <windows.h>
+#	include <direct.h>
+#	if NV_CC_MSVC
+#		include <crtdbg.h>
+#		if _MSC_VER < 1300
+#			define DECLSPEC_DEPRECATED
+			// VC6: change this path to your Platform SDK headers
+#			include <dbghelp.h>	// must be XP version of file
+//			include "M:\\dev7\\vs\\devtools\\common\\win32sdk\\include\\dbghelp.h"
+#		else
+			// VC7: ships with updated headers
+#			include <dbghelp.h>
+#		endif
+#	endif
+#endif
+
+#if !NV_OS_WIN32 && defined(HAVE_SIGNAL_H)
+#	include <signal.h>
+#endif
+
+#if NV_OS_LINUX && defined(HAVE_EXECINFO_H)
+#	include <execinfo.h>
+#	if NV_CC_GNUC // defined(HAVE_CXXABI_H)
+#		include <cxxabi.h>
+#	endif
+#endif
+
+#if NV_OS_DARWIN
+#	include <unistd.h>	// getpid
+#	include <sys/types.h>
+#	include <sys/sysctl.h>	// sysctl
+#endif
+
+#include <stdexcept> // std::runtime_error
+#undef assert // defined on mingw
+
+using namespace nv;
+
+namespace 
+{
+
+	static MessageHandler * s_message_handler = NULL;
+	static AssertHandler * s_assert_handler = NULL;
+	
+	static bool s_sig_handler_enabled = false;
+
+#if NV_OS_WIN32 && NV_CC_MSVC
+
+	// Old exception filter.
+	static LPTOP_LEVEL_EXCEPTION_FILTER s_old_exception_filter = NULL;
+
+#elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H)
+
+	// Old signal handlers.
+	struct sigaction s_old_sigsegv;
+	struct sigaction s_old_sigtrap;
+	struct sigaction s_old_sigfpe;
+	struct sigaction s_old_sigbus;
+	
+#endif
+
+
+#if NV_OS_WIN32 && NV_CC_MSVC
+
+	// TODO write minidump
+	
+	// Top level exception filter installed by debug::enableSigHandler().
+	// The minidump code below is disabled, so the filter currently does nothing
+	// but return EXCEPTION_CONTINUE_SEARCH.
+	static LONG WINAPI nvTopLevelFilter( struct _EXCEPTION_POINTERS *pExceptionInfo ) {
+	/*	BOOL (WINAPI * Dump) (HANDLE, DWORD, HANDLE, MINIDUMP_TYPE, PMINIDUMP_EXCEPTION_INFORMATION, PMINIDUMP_USER_STREAM_INFORMATION, PMINIDUMP_CALLBACK_INFORMATION );
+	
+		AutoString dbghelp_path(512);
+		getcwd(dbghelp_path, 512);
+		dbghelp_path.Append("\\DbgHelp.dll");
+		nvTranslatePath(dbghelp_path);
+		
+		PiLibrary DbgHelp_lib(dbghelp_path, true);
+		
+		if( !DbgHelp_lib.IsValid() ) {
+			nvDebug("*** 'DbgHelp.dll' not found.\n");
+			return EXCEPTION_CONTINUE_SEARCH;
+		}
+		
+		if( !DbgHelp_lib.BindSymbol( (void **)&Dump, "MiniDumpWriteDump" ) ) {
+			nvDebug("*** 'DbgHelp.dll' too old.\n");
+			return EXCEPTION_CONTINUE_SEARCH;
+		}
+		
+		// create the file
+		HANDLE hFile = ::CreateFile( "nv.dmp", GENERIC_WRITE, FILE_SHARE_WRITE, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL );
+		if( hFile == INVALID_HANDLE_VALUE ) {
+			nvDebug("*** Failed to create dump file.\n");
+			return EXCEPTION_CONTINUE_SEARCH;
+		}
+		
+		
+		_MINIDUMP_EXCEPTION_INFORMATION ExInfo;
+	
+		ExInfo.ThreadId = ::GetCurrentThreadId();
+		ExInfo.ExceptionPointers = pExceptionInfo;
+		ExInfo.ClientPointers = NULL;
+	
+		// write the dump
+		bool ok = Dump( GetCurrentProcess(), GetCurrentProcessId(), hFile, MiniDumpNormal, &ExInfo, NULL, NULL )!=0;
+		::CloseHandle(hFile);
+		
+		if( !ok ) {
+			nvDebug("*** Failed to save dump file.\n");
+			return EXCEPTION_CONTINUE_SEARCH;
+		}
+		
+		nvDebug("--- Dump file saved.\n");
+		*/
+		return EXCEPTION_CONTINUE_SEARCH;
+	}
+
+#elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H) // NV_OS_LINUX || NV_OS_OSX
+
+#if defined(HAVE_EXECINFO_H) // NV_OS_LINUX
+
+	/// Print a stack trace through nvDebug.
+	/// @param trace  Return addresses as filled in by backtrace().
+	/// @param size   Number of valid entries in @a trace.
+	/// @param start  Index of the first entry to print; lets callers hide their own frames.
+	/// The last entry of @a trace is never printed.
+	static void nvPrintStackTrace(void * trace[], int size, int start=0) {
+		char ** string_array = backtrace_symbols(trace, size);
+		if( string_array == NULL ) {
+			// backtrace_symbols allocates its result and returns NULL on failure.
+			nvDebug( "\nStacktrace not available.\n" );
+			return;
+		}
+	
+		nvDebug( "\nDumping stacktrace:\n" );
+		for(int i = start; i < size-1; i++ ) {
+#		if NV_CC_GNUC // defined(HAVE_CXXABI_H)
+			// Entries look like "module(symbol+offset) [address]": isolate the
+			// symbol between '(' and '+' and try to demangle it.
+			char * begin = strchr(string_array[i], '(');
+			char * end = strchr(string_array[i], '+');
+			if( begin != 0 && begin < end ) {
+				int stat;
+				*end = '\0';
+				*begin = '\0';
+				char * module = string_array[i];
+				char * name = abi::__cxa_demangle(begin+1, 0, 0, &stat);
+				if( name == NULL || begin[1] != '_' || begin[2] != 'Z' ) {
+					// Not a mangled C++ name: print the raw symbol.
+					nvDebug( "  In: [%s] '%s'\n", module, begin+1 );
+				}
+				else {
+					nvDebug( "  In: [%s] '%s'\n", module, name );
+				}
+				// __cxa_demangle returns a malloc'ed buffer (or NULL).
+				free(name);
+			}
+			else {
+				nvDebug( "  In: '%s'\n", string_array[i] );
+			}
+#		else
+			nvDebug( "  In: '%s'\n", string_array[i] );
+#		endif
+		}
+		nvDebug("\n");
+	
+		// The strings live inside this single allocation; one free releases all.
+		free(string_array);
+	}
+
+#endif // defined(HAVE_EXECINFO_H)
+
+	/// Signal handler installed by debug::enableSigHandler().
+	/// Logs the signal (with the faulting address for SIGSEGV), prints a stack
+	/// trace when execinfo is available and then terminates the process.
+	/// @param sig     Signal number.
+	/// @param info    Signal information; si_addr is reported for SIGSEGV.
+	/// @param secret  Context pointer; read as a ucontext_t on x86 builds.
+	static void nvSigHandler(int sig, siginfo_t *info, void *secret)
+	{
+		// Do something useful with siginfo_t
+		if (sig == SIGSEGV) {
+#		if NV_CPU_X86
+			ucontext_t * uc = (ucontext_t *)secret;
+			nvDebug("Got signal %d, faulty address is %p, from %p\n", sig, info->si_addr, (void *)uc->uc_mcontext.gregs[REG_EIP]);
+#		else
+			nvDebug("Got signal %d, faulty address is %p\n", sig, info->si_addr);
+#		endif
+		}
+		else if(sig == SIGTRAP) {
+			nvDebug("Breakpoint hit.\n");
+		}
+		else {
+			nvDebug("Got signal %d\n", sig);
+		}
+		
+#	if defined(HAVE_EXECINFO_H)
+		
+		void * trace[64];
+		int size = backtrace(trace, 64);
+		
+#	if NV_CPU_X86
+		// Overwrite sigaction with caller's address.
+		ucontext_t * uc = (ucontext_t *)secret;
+		trace[1] = (void *) uc->uc_mcontext.gregs[REG_EIP];
+#	endif // NV_CPU_X86
+		
+		nvPrintStackTrace(trace, size, 1);
+		
+#	endif // defined(HAVE_EXECINFO_H)
+		
+		// We only get here after a fatal signal, so do not report a successful
+		// exit status to the parent process.
+		exit(EXIT_FAILURE);
+	}
+
+#endif // defined(HAVE_SIGNAL_H)
+
+
+
+#if NV_OS_WIN32 //&& NV_CC_MSVC
+	
+	/** Win32 assert handler. */
+	struct Win32AssertHandler : public AssertHandler 
+	{
+		// Code from Daniel Vogel.
+		// Looks up IsDebuggerPresent in kernel32.dll at run time and calls it
+		// if the symbol exists; returns false when it cannot be resolved.
+		static bool isDebuggerPresent()
+		{
+			bool result = false;
+			
+			HINSTANCE kern_lib = LoadLibraryEx( "kernel32.dll", NULL, 0 );
+			if( kern_lib ) {
+				FARPROC lIsDebuggerPresent = GetProcAddress( kern_lib, "IsDebuggerPresent" );
+				if( lIsDebuggerPresent && lIsDebuggerPresent() ) {
+					result = true;
+				}
+				
+				FreeLibrary( kern_lib );
+			}
+			return result;
+		}
+		
+		// Flush the message queue. This is necessary for the message box to show up.
+		// Stops early if a WM_QUIT message is found.
+		static void flushMessageQueue()
+		{
+			MSG msg;
+			while( PeekMessage( &msg, NULL, 0, 0, PM_REMOVE ) ) {
+				if( msg.message == WM_QUIT ) break;
+				TranslateMessage( &msg );
+				DispatchMessage( &msg );
+			}
+		}
+	
+		// Assert handler method.
+		// Logs the failed assertion through nvDebug. In _DEBUG builds it returns
+		// NV_ABORT_DEBUG when a debugger is attached, otherwise it asks the user
+		// through a message box (Retry -> NV_ABORT_DEBUG, Ignore -> NV_ABORT_IGNORE,
+		// Abort -> NV_ABORT_EXIT). Whenever the outcome is NV_ABORT_EXIT, which is
+		// always the case in non-_DEBUG builds, a std::runtime_error is thrown.
+		virtual int assert( const char * exp, const char * file, int line, const char * func/*=NULL*/ )
+		{
+			int ret = NV_ABORT_EXIT;
+			
+			StringBuilder error_string;
+			if( func != NULL ) {
+				error_string.format( "*** Assertion failed: %s\n    On file: %s\n    On function: %s\n    On line: %d\n ", exp, file, func, line );
+				nvDebug( error_string );
+			}
+			else {
+				error_string.format( "*** Assertion failed: %s\n    On file: %s\n    On line: %d\n ", exp, file, line );
+				nvDebug( error_string );
+			}
+			
+		#if _DEBUG
+			
+			if( isDebuggerPresent() ) {
+				return NV_ABORT_DEBUG;
+			}
+			
+			flushMessageQueue();
+			int action = MessageBox(NULL, error_string, "Assertion failed", MB_ABORTRETRYIGNORE|MB_ICONERROR);
+			switch( action ) {
+				case IDRETRY:
+					ret = NV_ABORT_DEBUG;
+					break;
+				case IDIGNORE:
+					ret = NV_ABORT_IGNORE;
+					break;
+				case IDABORT:
+				default:
+					ret = NV_ABORT_EXIT;
+					break;
+			}
+			/*if( _CrtDbgReport( _CRT_ASSERT, file, line, module, exp ) == 1 ) {
+				return NV_ABORT_DEBUG;
+			}*/
+			
+		#endif
+			
+			if( ret == NV_ABORT_EXIT ) {
+				// Exit cleanly.
+				throw std::runtime_error("Assertion failed");
+			}
+			
+			return ret;			
+		}
+	};
+	
+#else
+	
+	/** Unix assert handler. */
+	struct UnixAssertHandler : public AssertHandler
+	{
+		// Tell whether the process appears to be running under a debugger.
+		// On Darwin the kernel is queried through sysctl and the P_TRACED flag is
+		// tested; elsewhere a heuristic on the session and parent ids is used.
+		bool isDebuggerPresent()
+		{
+#		if NV_OS_DARWIN
+			int mib[4];
+			struct kinfo_proc info;
+			size_t size;
+			mib[0] = CTL_KERN;
+			mib[1] = KERN_PROC;
+			mib[2] = KERN_PROC_PID;
+			mib[3] = getpid();
+			size = sizeof(info);
+			// Pre-clear the flag so that a failed sysctl call reads as "not traced".
+			info.kp_proc.p_flag = 0;
+			sysctl(mib,4,&info,&size,NULL,0);
+			return ((info.kp_proc.p_flag & P_TRACED) == P_TRACED);
+#		else
+			// if ppid != sid, some process spawned our app, probably a debugger. 
+			return getsid(getpid()) != getppid();
+#		endif
+		}
+		
+		// Assert handler method.
+		// Logs the failed assertion through nvDebug. In _DEBUG builds it returns
+		// NV_ABORT_DEBUG when a debugger seems to be attached. In every other case
+		// it prints a stack trace (when execinfo is available) and throws a
+		// std::runtime_error, so it never returns normally on that path.
+		virtual int assert(const char * exp, const char * file, int line, const char * func)
+		{
+			if( func != NULL ) {
+				nvDebug( "*** Assertion failed: %s\n    On file: %s\n    On function: %s\n    On line: %d\n ", exp, file, func, line );
+			}
+			else {
+				nvDebug( "*** Assertion failed: %s\n    On file: %s\n    On line: %d\n ", exp, file, line );
+			}
+			
+#		if _DEBUG
+			if( isDebuggerPresent() ) {
+				return NV_ABORT_DEBUG;
+			}
+#		endif
+
+#		if defined(HAVE_EXECINFO_H)
+			void * trace[64];
+			int size = backtrace(trace, 64);
+			// Start at entry 3 -- presumably to hide the assert machinery frames.
+			nvPrintStackTrace(trace, size, 3);
+#		endif
+
+			// Exit cleanly.
+			throw std::runtime_error("Assertion failed");
+		}
+	};
+	
+#endif
+
+} // namespace
+
+
+/// Report a failed assertion to the assert handler set by the user, or to
+/// the default handler of the platform when none has been set.
+/// Returns whatever the selected handler returns.
+int nvAbort(const char * exp, const char * file, int line, const char * func)
+{
+#if NV_OS_WIN32 //&& NV_CC_MSVC
+	static Win32AssertHandler s_default_assert_handler;
+#else
+	static UnixAssertHandler s_default_assert_handler;
+#endif
+	
+	// Pick the handler first, then dispatch once.
+	AssertHandler * handler = s_assert_handler;
+	if( handler == NULL ) {
+		handler = &s_default_assert_handler;
+	}
+	
+	return handler->assert( exp, file, line, func );
+}
+
+
+/// Forward a printf-style message to the message handler.
+/// The message is dropped when no handler has been set.
+void NV_CDECL nvDebug(const char *msg, ...)
+{
+	if( s_message_handler == NULL ) {
+		return;
+	}
+	
+	va_list args;
+	va_start(args, msg);
+	s_message_handler->log( msg, args );
+	va_end(args);
+}
+
+
+/// Dump debug info: prints the current stack trace when execinfo is
+/// available, and does nothing otherwise.
+void debug::dumpInfo()
+{
+#if defined(HAVE_EXECINFO_H)
+	const int max_frames = 64;
+	void * frames[max_frames];
+	const int frame_count = backtrace(frames, max_frames);
+	
+	// Start at entry 1 so that this function itself is not listed.
+	nvPrintStackTrace(frames, frame_count, 1);
+#endif
+}
+
+
+/// Set the debug message handler.
+/// Only the pointer is stored; the handler object is not copied, and nvDebug
+/// calls its log method for as long as it stays installed.
+void debug::setMessageHandler(MessageHandler * message_handler)
+{
+	s_message_handler = message_handler;
+}
+
+/// Reset the debug message handler.
+/// With no handler installed, nvDebug discards its messages.
+void debug::resetMessageHandler()
+{
+	s_message_handler = NULL;
+}
+
+/// Set the assert handler.
+/// Only the pointer is stored; nvAbort calls this handler instead of the
+/// default one of the platform for as long as it stays installed.
+void debug::setAssertHandler(AssertHandler * assert_handler)
+{
+	s_assert_handler = assert_handler;
+}
+
+/// Reset the assert handler.
+/// nvAbort goes back to the default handler of the platform.
+void debug::resetAssertHandler()
+{
+	s_assert_handler = NULL;
+}
+
+
+/// Enable signal handler.
+/// On Win32/MSVC installs nvTopLevelFilter as the unhandled exception filter;
+/// on other systems with <signal.h> installs nvSigHandler for SIGSEGV, SIGTRAP,
+/// SIGFPE and SIGBUS. The previous filter/handlers are saved so that
+/// disableSigHandler can restore them. Must not be called while already enabled.
+void debug::enableSigHandler()
+{
+	nvCheck(s_sig_handler_enabled != true);
+	s_sig_handler_enabled = true;
+	
+#if NV_OS_WIN32 && NV_CC_MSVC
+	
+	s_old_exception_filter = ::SetUnhandledExceptionFilter( nvTopLevelFilter );
+	
+#elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H)
+	
+	// Install our signal handler
+	struct sigaction sa;
+	sa.sa_sigaction = nvSigHandler;
+	sigemptyset (&sa.sa_mask);
+	// SA_SIGINFO selects the three-argument sa_sigaction form of the handler.
+	sa.sa_flags = SA_RESTART | SA_SIGINFO;
+
+	sigaction(SIGSEGV, &sa, &s_old_sigsegv);
+	sigaction(SIGTRAP, &sa, &s_old_sigtrap);
+	sigaction(SIGFPE, &sa, &s_old_sigfpe);
+	sigaction(SIGBUS, &sa, &s_old_sigbus);
+	
+#endif
+}
+
+/// Disable signal handler.
+/// Restores the exception filter or the signal actions that were saved by
+/// enableSigHandler. Must only be called while the handler is enabled.
+void debug::disableSigHandler()
+{
+	nvCheck(s_sig_handler_enabled == true);
+	s_sig_handler_enabled = false;
+
+#if NV_OS_WIN32 && NV_CC_MSVC
+
+	::SetUnhandledExceptionFilter( s_old_exception_filter );
+	s_old_exception_filter = NULL;
+
+#elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H)
+	
+	sigaction(SIGSEGV, &s_old_sigsegv, NULL);
+	sigaction(SIGTRAP, &s_old_sigtrap, NULL);
+	sigaction(SIGFPE, &s_old_sigfpe, NULL);
+	sigaction(SIGBUS, &s_old_sigbus, NULL);
+	
+#endif
+}
+
--- a/src/nvcore/Debug.h
+++ b/src/nvcore/Debug.h
@ -0,0 +1,129 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_CORE_DEBUG_H
+#define NV_CORE_DEBUG_H
+
+#include <nvcore/nvcore.h>
+
+#if defined(HAVE_STDARG_H)
+#	include <stdarg.h>	// va_list
+#endif
+
+#define NV_ABORT_DEBUG		1
+#define NV_ABORT_IGNORE		2
+#define NV_ABORT_EXIT		3
+
+#if NV_CC_MSVC
+#define nvNoAssert	__noop
+#else
+#define nvNoAssert(exp)
+#endif
+
+
+// Assertion macros.
+// With NV_NO_ASSERT every macro expands to nothing. Otherwise nvAssert and
+// nvCheck are always active, while nvDebugAssert and nvDebugCheck are only
+// active when _DEBUG is defined.
+#if NV_NO_ASSERT
+
+#	define nvAssert(exp) nvNoAssert()
+#	define nvCheck(exp) nvNoAssert()
+#	define nvDebugAssert(exp) nvNoAssert()
+#	define nvDebugCheck(exp) nvNoAssert()
+#	define nvDebugBreak()
+
+#else // NV_NO_ASSERT
+
+	// nvDebugBreak: stop in the debugger, using whatever the compiler/CPU offers.
+	// NOTE(review): some of the definitions below end with a semicolon, so
+	// "nvDebugBreak();" expands to two statements on those platforms.
+#	if NV_CC_MSVC && NV_CPU_X86 && 0
+#		define nvDebugBreak()		__asm int 3
+#	elif NV_CC_MSVC	// this is only on recent versions...
+		// Do I have to include <intrin.h> ?
+#		define nvDebugBreak()		__debugbreak()
+#	elif NV_CC_GNUC && NV_CPU_PPC && NV_OS_DARWIN
+#		define nvDebugBreak()		__asm__ volatile ("trap");
+#	elif NV_CC_GNUC && NV_CPU_X86 && NV_OS_DARWIN
+#		define nvDebugBreak()		__asm__ volatile ("int3");
+#	elif NV_CC_GNUC && NV_CPU_X86 
+#		define nvDebugBreak()		__asm__ ( "int %0" : :"I"(3) )
+#	else
+#		include <signal.h>
+#		define nvDebugBreak()		raise(SIGTRAP); //*((int *)(0)) = 0
+#	endif
+
+	// Evaluate exp once; on failure report it through nvAbort and break into
+	// the debugger when nvAbort returns NV_ABORT_DEBUG.
+#	define nvAssertMacro(exp) \
+		do { \
+			if(!(exp)) { \
+				if( nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG ) { \
+					nvDebugBreak(); \
+				} \
+			} \
+		} while(false)
+
+#	define nvAssert(exp)	nvAssertMacro(exp)
+#	define nvCheck(exp)		nvAssertMacro(exp)
+
+#	if defined(_DEBUG)
+#		define nvDebugAssert(exp)	nvAssertMacro(exp)
+#		define nvDebugCheck(exp)	nvAssertMacro(exp)
+#	else // _DEBUG
+#		define nvDebugAssert(exp)	nvNoAssert(exp)
+#		define nvDebugCheck(exp)	nvNoAssert(exp)
+#	endif // _DEBUG
+
+#endif // NV_NO_ASSERT
+
+// Use nvAssume for very simple expressions only: nvAssume(0), nvAssume(value == true), etc.
+// Debug builds verify the expression with nvCheck. Release builds on MSVC pass
+// it to the optimizer through __assume, which is only safe when the expression
+// really holds; that is why it must not replace the check in debug builds.
+#if !defined(_DEBUG)
+#	if NV_CC_MSVC
+#		define nvAssume(exp)	__assume(exp)
+#	else
+#		define nvAssume(exp)	nvCheck(exp)
+#	endif
+#else
+#	define nvAssume(exp)	nvCheck(exp)
+#endif
+
+
+#define nvError(x)		nvAbort(x, __FILE__, __LINE__, __FUNC__)
+#define nvWarning(x)	nvDebug("*** Warning %s/%d: %s\n", __FILE__, __LINE__, (x))
+
+
+// Compile-time check. A false condition produces an array of negative size,
+// which every compiler rejects. A zero-sized array cannot be used to signal
+// the failure, because GCC accepts it as an extension.
+// NOTE(review): PI_CC_MSVC is not defined in this header (the other tests use
+// NV_CC_MSVC), so the typedef version is presumably always selected -- confirm.
+#if PI_CC_MSVC
+// I'm not sure it's a good idea to use the default static assert.
+#define nvStaticCheck(x) _STATIC_ASSERT(x)
+#else
+#define nvStaticCheck(x) typedef char NV_DO_STRING_JOIN2(__static_assert_,__LINE__)[(x) ? 1 : -1]
+//#define nvStaticCheck(x) switch(0) { case 0: case x:; }
+#endif
+
+// Report a failed assertion; the result is one of the NV_ABORT_* codes.
+NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = 0);
+// Send a printf-style message to the message handler. The format attribute
+// lets GCC check the arguments against the format string.
+NVCORE_API void NV_CDECL nvDebug( const char *msg, ... ) __attribute__((format (printf, 1, 2)));
+
+namespace nv
+{
+	/** Message handler interface. Receives the messages sent through nvDebug. */
+	struct MessageHandler {
+		/// Handle a printf-style message: @a str is the format, @a arg its arguments.
+		virtual void log(const char * str, va_list arg) = 0;
+		virtual ~MessageHandler() {}	
+	};
+	
+	/** Assert handler interface. Receives the failures reported through nvAbort. */
+	struct AssertHandler {
+		/// Handle a failed assertion. The result is returned to the caller of
+		/// nvAbort; the assert macros break into the debugger on NV_ABORT_DEBUG.
+		virtual int assert(const char *exp, const char *file, int line, const char *func = 0) = 0;
+		virtual ~AssertHandler() {}	
+	};
+
+
+	namespace debug
+	{
+		/// Dump debug info (the current stack trace, when available).
+		NVCORE_API void dumpInfo();
+	
+		/// Install / remove the handler that receives nvDebug messages.
+		NVCORE_API void setMessageHandler( MessageHandler * messageHandler );
+		NVCORE_API void resetMessageHandler();
+	
+		/// Install / remove the handler that receives assertion failures.
+		NVCORE_API void setAssertHandler( AssertHandler * assertHanlder );
+		NVCORE_API void resetAssertHandler();
+	
+		/// Install / remove the crash signal (or exception filter) handler.
+		NVCORE_API void enableSigHandler();
+		NVCORE_API void disableSigHandler();
+	}
+
+} // nv namespace
+
+#endif	// NV_CORE_DEBUG_H
--- a/src/nvcore/DefsGnucDarwin.h
+++ b/src/nvcore/DefsGnucDarwin.h
@ -0,0 +1,66 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+#include <stdlib.h>	// uint8_t, int8_t, ...
+
+
+// Function linkage
+#define DLL_IMPORT
+#if __GNUC__ >= 4
+#	define DLL_EXPORT	__attribute__((visibility("default")))
+#	define DLL_EXPORT_CLASS DLL_EXPORT
+#else
+#	define DLL_EXPORT
+#	define DLL_EXPORT_CLASS
+#endif
+
+// Function calling modes
+#if NV_CPU_X86
+#	define NV_CDECL 	__attribute__((cdecl))
+#	define NV_STDCALL	__attribute__((stdcall))
+#else
+#	define NV_CDECL 
+#	define NV_STDCALL
+#endif
+
+#define NV_FASTCALL		__attribute__((fastcall))
+#define NV_FORCEINLINE	__attribute__((always_inline))
+
+#if __GNUC__ > 2
+#define NV_PURE		__attribute__((pure))
+#define NV_CONST	__attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+// Define __FUNC__ properly.
+#if __STDC_VERSION__ < 199901L
+#	if __GNUC__ >= 2
+#		define __FUNC__ __PRETTY_FUNCTION__	// __FUNCTION__
+#	else
+#		define __FUNC__ "<unknown>"
+#	endif
+#else
+#	define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict	__restrict__
+
+
+// Type definitions
+typedef uint8_t		uint8;
+typedef int8_t		int8;
+
+typedef uint16_t	uint16;
+typedef int16_t		int16;
+
+typedef uint32_t	uint32;
+typedef int32_t		int32;
+
+typedef uint64_t	uint64;
+typedef int64_t		int64;
+
+// Aliases
+typedef uint32				uint;
--- a/src/nvcore/DefsGnucLinux.h
+++ b/src/nvcore/DefsGnucLinux.h
@ -0,0 +1,63 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+// Function linkage
+#define DLL_IMPORT
+#if __GNUC__ >= 4
+#	define DLL_EXPORT	__attribute__((visibility("default")))
+#	define DLL_EXPORT_CLASS DLL_EXPORT
+#else
+#	define DLL_EXPORT
+#	define DLL_EXPORT_CLASS
+#endif
+
+// Function calling modes
+#if NV_CPU_X86
+#	define NV_CDECL 	__attribute__((cdecl))
+#	define NV_STDCALL	__attribute__((stdcall))
+#else
+#	define NV_CDECL 
+#	define NV_STDCALL
+#endif
+
+#define NV_FASTCALL		__attribute__((fastcall))
+#define NV_FORCEINLINE	__attribute__((always_inline))
+
+#if __GNUC__ > 2
+#define NV_PURE		__attribute__((pure))
+#define NV_CONST	__attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+// Define __FUNC__ properly.
+#if __STDC_VERSION__ < 199901L
+#	if __GNUC__ >= 2
+#		define __FUNC__ __PRETTY_FUNCTION__	// __FUNCTION__
+#	else
+#		define __FUNC__ "<unknown>"
+#	endif
+#else
+#	define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict	__restrict__
+
+
+// Type definitions
+typedef unsigned char		uint8;
+typedef signed char			int8;
+
+typedef unsigned short		uint16;
+typedef signed short		int16;
+
+typedef unsigned int		uint32;
+typedef signed int			int32;
+
+typedef unsigned long long	uint64;
+typedef signed long long	int64;
+
+// Aliases
+typedef uint32				uint;
--- a/src/nvcore/DefsGnucWin32.h
+++ b/src/nvcore/DefsGnucWin32.h
@ -0,0 +1,58 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+// Function linkage
+#define DLL_IMPORT	__declspec(dllimport)
+#define DLL_EXPORT	__declspec(dllexport)
+#define DLL_EXPORT_CLASS DLL_EXPORT
+
+// Function calling modes
+#if NV_CPU_X86
+#	define NV_CDECL 	__attribute__((cdecl))
+#	define NV_STDCALL	__attribute__((stdcall))
+#else
+#	define NV_CDECL 
+#	define NV_STDCALL
+#endif
+
+#define NV_FASTCALL		__attribute__((fastcall))
+#define NV_FORCEINLINE	__attribute__((always_inline))
+
+#if __GNUC__ > 2
+#define NV_PURE		__attribute__((pure))
+#define NV_CONST	__attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+// Define __FUNC__ properly.
+#if __STDC_VERSION__ < 199901L
+#	if __GNUC__ >= 2
+#		define __FUNC__ __PRETTY_FUNCTION__	// __FUNCTION__
+#	else
+#		define __FUNC__ "<unknown>"
+#	endif
+#else
+#	define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict	__restrict__
+
+
+// Type definitions
+typedef unsigned char		uint8;
+typedef signed char			int8;
+
+typedef unsigned short		uint16;
+typedef signed short		int16;
+
+typedef unsigned int		uint32;
+typedef signed int			int32;
+
+typedef unsigned long long	uint64;
+typedef signed long long	int64;
+
+// Aliases
+typedef uint32				uint;
--- a/src/nvcore/DefsVcWin32.h
+++ b/src/nvcore/DefsVcWin32.h
@ -0,0 +1,94 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+// Function linkage
+#define DLL_IMPORT	__declspec(dllimport)
+#define DLL_EXPORT	__declspec(dllexport)
+#define DLL_EXPORT_CLASS DLL_EXPORT
+
+// Function calling modes
+#define NV_CDECL		__cdecl
+#define NV_STDCALL		__stdcall
+#define NV_FASTCALL		__fastcall
+#define NV_FORCEINLINE	__forceinline
+
+#define NV_PURE
+#define NV_CONST
+
+// Set standard function names.
+#define snprintf _snprintf
+#define vsnprintf _vsnprintf
+#define vsscanf _vsscanf
+#define chdir _chdir
+#define getcwd _getcwd 
+
+#define va_copy(a, b)	a = b
+
+#if !defined restrict
+#define restrict
+#endif
+
+#if !defined __attribute__
+#define __attribute__(X)
+#endif
+
+#if !defined __FUNC__
+#define __FUNC__	__FUNCTION__ 
+#endif
+
+
+// Type definitions
+typedef unsigned char		uint8;
+typedef signed char			int8;
+
+typedef unsigned short		uint16;
+typedef signed short		int16;
+
+typedef unsigned int		uint32;
+typedef signed int			int32;
+
+typedef unsigned __int64	uint64;
+typedef signed __int64		int64;
+
+// Aliases
+typedef uint32				uint;
+
+
+// Unwanted VC++ warnings to disable.
+#pragma warning(disable : 4244)		// conversion to float, possible loss of data
+#pragma warning(disable : 4245)		// conversion from 'enum ' to 'unsigned long', signed/unsigned mismatch
+#pragma warning(disable : 4100)		// unreferenced formal parameter
+#pragma warning(disable : 4514)		// unreferenced inline function has been removed
+#pragma warning(disable : 4710)		// inline function not expanded
+#pragma warning(disable : 4127)		// Conditional expression is constant
+#pragma warning(disable : 4305)		// truncation from 'const double' to 'float'
+#pragma warning(disable : 4505)		// unreferenced local function has been removed
+
+//#pragma warning(disable : 4699)	// creating precompiled header
+//#pragma warning(disable : 4201)	// nonstandard extension used : nameless struct/union
+#pragma warning(disable : 4702)		// unreachable code in inline expanded function
+#pragma warning(disable : 4711)		// function selected for automatic inlining
+#pragma warning(disable : 4725)		// Pentium fdiv bug
+
+//#pragma warning(disable : 4512)	// assignment operator could not be generated
+//#pragma warning(disable : 4530)	// C++ exception handler used, but unwind semantics are not enabled
+//#pragma warning(disable : 4238)	// nonstandard extension used : class rvalue used as lvalue
+//#pragma warning(disable : 4251)	// needs to have dll-interface to be used by clients of class 'ULinker'
+//#pragma warning(disable : 4275)	// non dll-interface class used as base for dll-interface class
+
+//#pragma warning(disable : 4511)	// copy constructor could not be generated
+//#pragma warning(disable : 4284)	// return type is not a UDT or reference to a UDT
+//#pragma warning(disable : 4355)	// this used in base initializer list
+//#pragma warning(disable : 4097)	// typedef-name '' used as synonym for class-name ''
+//#pragma warning(disable : 4291)	// typedef-name '' used as synonym for class-name ''
+
+#pragma warning(disable : 4345)		// behavior change: an object of POD type constructed with an initializer of the form () will be default-initialized
+
+#pragma warning(disable : 4786)		// Identifier was truncated and cannot be debugged.
+
+//#pragma warning(disable : 4996)		// function was declared deprecated.
+//#pragma warning(disable : 4146)		// unary minus operator applied to unsigned type, result still unsigned
+
+#pragma warning(disable : 4675)		// resolved overload was found by argument-dependent lookup
+
--- a/src/nvcore/Memory.cpp
+++ b/src/nvcore/Memory.cpp
@ -0,0 +1,34 @@
+
+#include "Memory.h"
+#include "Debug.h"
+
+//#if HAVE_MALLOC_H
+//#include <malloc.h>
+//#endif
+
+#include <stdlib.h>
+
+
+using namespace nv;
+
+/// Allocate @a size bytes; forwards to the C library malloc.
+void * nv::mem::malloc(size_t size)
+{
+	return ::malloc(size);
+}
+
+/// Allocate @a size bytes; forwards to the C library malloc.
+/// @a file and @a line are accepted but currently ignored.
+void * nv::mem::malloc(size_t size, const char * file, int line)
+{
+	return ::malloc(size);
+}
+
+/// Release a block; forwards to the C library free.
+/// Takes a const pointer so that callers can free const data without a cast.
+void nv::mem::free(const void * ptr)
+{
+	::free(const_cast<void *>(ptr));
+}
+
+/// Resize a block; forwards to the C library realloc.
+/// Debug builds reject the combination of a NULL pointer and a zero size.
+void * nv::mem::realloc(void * ptr, size_t size)
+{
+	nvDebugCheck(ptr != NULL || size != 0);	// undefined realloc behavior.
+	return ::realloc(ptr, size);
+}
+
--- a/src/nvcore/Memory.h
+++ b/src/nvcore/Memory.h
@ -0,0 +1,186 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_CORE_MEMORY_H
+#define NV_CORE_MEMORY_H
+
+#include <nvcore/nvcore.h>
+
+#include <stdlib.h> // malloc(), realloc() and free()
+#include <stddef.h>	// size_t
+
+#include <new>	// new and delete
+
+// Custom memory allocator: the allocation entry points used by the global new/delete overrides in this header.
+namespace nv
+{
+	namespace mem 
+	{
+		NVCORE_API void * malloc(size_t size);	// allocate size bytes
+		NVCORE_API void * malloc(size_t size, const char * file, int line);	// same, with the caller's source location
+		
+		NVCORE_API void free(const void * ptr);	// release a block; takes a const pointer so const objects can be freed without a cast at the call site
+		NVCORE_API void * realloc(void * ptr, size_t size);	// resize the block at ptr to size bytes
+		
+	} // mem namespace
+	
+} // nv namespace
+
+
+// Override new/delete
+
+/// Replacement for the global scalar operator new: storage comes from nv::mem::malloc.
+/// NOTE(review): with the empty throw() specification nothing is thrown here, so a failed
+/// allocation would reach the caller as a null result - confirm callers are prepared for that.
+/// NOTE(review): the C++ standard does not allow replacement allocation functions to be
+/// declared inline - confirm every supported toolchain accepts this.
+inline void * operator new (size_t size) throw()
+{
+	void * const storage = nv::mem::malloc(size);
+	return storage;
+}
+
+/// Replacement for the global scalar operator delete: storage goes back through nv::mem::free.
+inline void operator delete (void *p) throw()
+{
+	const void * const storage = p;
+	nv::mem::free(storage);
+}
+
+/// Replacement for the global array operator new: storage comes from nv::mem::malloc,
+/// exactly like the scalar form.
+inline void * operator new [] (size_t size) throw()
+{
+	void * const storage = nv::mem::malloc(size);
+	return storage;
+}
+
+/// Replacement for the global array operator delete: storage goes back through nv::mem::free,
+/// exactly like the scalar form.
+inline void operator delete [] (void * p) throw()
+{
+	const void * const storage = p;
+	nv::mem::free(storage);
+}
+
+/*
+#ifdef _DEBUG
+#define new new(__FILE__, __LINE__)
+#define malloc(i) malloc(i, __FILE__, __LINE__)
+#endif
+*/
+
+#if 0
+/*
+    File:	main.cpp
+    
+    Version:	1.0
+
+	Abstract: Overrides the C++ 'operator new' and 'operator delete'.
+
+    Disclaimer:	IMPORTANT:  This Apple software is supplied to you by Apple Computer, Inc.
+		("Apple") in consideration of your agreement to the following terms, and your
+		use, installation, modification or redistribution of this Apple software
+		constitutes acceptance of these terms.  If you do not agree with these terms,
+		please do not use, install, modify or redistribute this Apple software.
+
+		In consideration of your agreement to abide by the following terms, and subject
+		to these terms, Apple grants you a personal, non-exclusive license, under Apple’s
+		copyrights in this original Apple software (the "Apple Software"), to use,
+		reproduce, modify and redistribute the Apple Software, with or without
+		modifications, in source and/or binary forms; provided that if you redistribute
+		the Apple Software in its entirety and without modifications, you must retain
+		this notice and the following text and disclaimers in all such redistributions of
+		the Apple Software.  Neither the name, trademarks, service marks or logos of
+		Apple Computer, Inc. may be used to endorse or promote products derived from the
+		Apple Software without specific prior written permission from Apple.  Except as
+		expressly stated in this notice, no other rights or licenses, express or implied,
+		are granted by Apple herein, including but not limited to any patent rights that
+		may be infringed by your derivative works or by other works in which the Apple
+		Software may be incorporated.
+
+		The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+		WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+		WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+		PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN
+		COMBINATION WITH YOUR PRODUCTS.
+
+		IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+		CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+		GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+		ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION
+		OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT
+		(INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN
+		ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+	Copyright © 2006 Apple Computer, Inc., All Rights Reserved
+*/
+
+/* This sample shows how to override the C++ global 'new' and 'delete' operators.  */
+#include <new>
+#include <iostream>
+#include <cstdlib>
+#include <stdexcept>
+#include <locale>
+
+/* Some variables and code to make the example do something.  */
+namespace {
+  unsigned long long gNewCounter; // number of times 'new' was called
+  unsigned long long gDeleteCounter;  // number of times 'delete' was called
+  
+  void printCounters()  // print the counters above
+  {
+	std::cout << "new was called " << gNewCounter << " times and delete was called " << gDeleteCounter << " times\n";
+  }
+}
+
+/* These are the overridden new and delete routines.
+   Most applications will want to override at least these four versions of new/delete if they override any of them.
+
+   In Mac OS, it's not necessary to override the array versions of operator new and delete if all
+   they would do is call the non-array versions; the C++ standard library, as an extension
+   to the C++ standard, does this for you.
+
+   Developers should consult the section [lib.support.dynamic] in the C++ standard to see the requirements
+   on the generic operators new and delete; the system may expect that your overridden operators meet all these
+   requirements.
+
+   Your operators may be called by the system, even early in start-up before constructors have been executed.  */
+void* operator new(std::size_t sz) throw (std::bad_alloc)
+{
+	void *result = std::malloc (sz == 0 ? 1 : sz);
+	if (result == NULL)
+		throw std::bad_alloc();
+	gNewCounter++;
+	return result;
+}
+void operator delete(void* p) throw()
+{
+	if (p == NULL)
+		return;
+	std::free (p);
+	gDeleteCounter++;
+}
+
+/* These are the 'nothrow' versions of the above operators.
+   The system version will try to call a std::new_handler if they
+   fail, but your overriding versions are not required to do this.  */
+void* operator new(std::size_t sz, const std::nothrow_t&) throw()
+{
+	try {
+		void * result = ::operator new (sz);  // calls our overridden operator new
+		return result;
+	} catch (std::bad_alloc &) {
+	  return NULL;
+	}
+}
+void operator delete(void* p, const std::nothrow_t&) throw()
+{
+	::operator delete (p);
+}
+
+/* Bug 4067110 is that if your program has no weak symbols at all, the linker will not set the
+   WEAK_DEFINES bit in the Mach-O header and as a result the new and delete operators above won't
+   be seen by system libraries.  This is mostly a problem for test programs and small examples,
+   since almost all real C++ programs complicated enough to override new and delete will have at
+   least one weak symbol.  However, this is a small example, so:  */
+void __attribute__((weak, visibility("default"))) workaroundFor4067110 () { }
+
+/* This is a simple test program that causes the runtime library to call new and delete.  */
+int main() 
+{
+	atexit (printCounters);
+	try {
+	  std::locale example("does_not_exist");
+	} catch (std::runtime_error &x) {
+	}
+	return 0;
+}
+#endif // 0
+
+#endif // NV_CORE_MEMORY_H
--- a/src/nvcore/Prefetch.h
+++ b/src/nvcore/Prefetch.h
@ -0,0 +1,24 @@
+
+// nvPrefetch(ptr): hint that the memory at ptr is about to be accessed.
+// - NV_CC_GNUC:               forwards to __builtin_prefetch.
+// - NV_CC_MSVC on NV_CPU_X86: issues prefetcht0 through inline assembly.
+// - anything else:            expands to nothing, so callers can use it unconditionally.
+#if NV_CC_GNUC
+
+#define nvPrefetch(ptr)	__builtin_prefetch(ptr)
+
+#elif NV_CC_MSVC && NV_CPU_X86
+
+__forceinline void nvPrefetch(const void * mem)
+{
+	__asm mov ecx, mem
+	__asm prefetcht0 [ecx];
+//	__asm prefetchnta [ecx];
+}
+
+#else // NV_CC_MSVC && NV_CPU_X86
+
+// do nothing in other case (this includes MSVC builds for CPUs other than x86).
+#define nvPrefetch(ptr)
+
+#endif // NV_CC_MSVC && NV_CPU_X86
+
--- a/src/nvcore/Ptr.h
+++ b/src/nvcore/Ptr.h
@ -0,0 +1,420 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_CORE_PTR_H
+#define NV_CORE_PTR_H
+
+#include <nvcore/nvcore.h>
+#include <nvcore/Debug.h>
+
+#include <stdio.h>	// NULL
+
+namespace nv
+{
+	
+/** Simple auto pointer template class.
+ *
+ * This is very similar to the standard auto_ptr class, but with some 
+ * additional limitations to make its use less error prone:
+ * - Copy constructor and assignment operator are disabled.
+ * - reset method is removed.
+ * 
+ * The semantics of the standard auto_ptr are not clear and change depending
+ * on the std implementation. For a discussion of the problems of auto_ptr read:
+ * http://www.awprofessional.com/content/images/020163371X/autoptrupdate\auto_ptr_update.html
+ */
+template <class T>
+class AutoPtr
+{
+	NV_FORBID_COPY(AutoPtr);
+	NV_FORBID_HEAPALLOC();
+public:
+	
+	/** Ctor. Takes ownership of @a p; NULL is accepted. */
+	explicit AutoPtr( T * p ) {
+		m_ptr = p;
+	}
+	
+	/** Dtor. Deletes owned pointer. */
+	~AutoPtr() {
+		delete m_ptr;
+		m_ptr = NULL;
+	}
+
+	/** Delete owned pointer and assign new one.
+	 * Assigning the pointer that is already owned is a no-op; deleting it first
+	 * would leave this object holding a dangling pointer.
+	 */
+	void operator=( T * p ) {
+		if (p != m_ptr) {
+			delete m_ptr;
+			m_ptr = p;
+		}
+	}
+
+	/** Member access. The owned pointer must not be NULL. */
+	T * operator -> () const {
+		nvDebugCheck(m_ptr != NULL);
+		return m_ptr;
+	}
+
+	/** Get reference. The owned pointer must not be NULL. */
+	T & operator*() const {
+		nvDebugCheck(m_ptr != NULL);
+		return *m_ptr;
+	}
+
+	/** Get pointer. Ownership stays with this object. */
+	T * ptr() const { return m_ptr; }
+	
+	/** Relinquish ownership of the underlying pointer and returns that pointer. */
+	T * release() {
+		T * tmp = m_ptr;
+		m_ptr = NULL;
+		return tmp;
+	}
+	
+	/** Const pointer equality comparison. */
+	bool operator == (const T * const p) const {
+		return (m_ptr == p);
+	}
+
+	/** Const pointer inequality comparison. */
+	bool operator != (const T * const p) const {
+		return (m_ptr != p);
+	}
+
+private:
+	T * m_ptr;	///< Owned object, or NULL.
+};
+
+#if 0
+/** Reference counted base class to be used with Pointer.
+ *
+ * The only requirement of the Pointer class is that the RefCounted class implements the 
+ * addRef and release methods.
+ *
+ * The counters are mutable so that const objects can be shared as well.
+ */
+class RefCounted
+{
+	NV_FORBID_COPY(RefCounted);
+public:
+
+	/// Ctor. Objects start with a reference count of zero and no weak proxy.
+	RefCounted() : m_count(0), m_weak_proxy(NULL)
+	{
+		s_total_obj_count++;
+	}
+
+	/// Virtual dtor. The object must not be referenced anymore.
+	virtual ~RefCounted()
+	{
+		nvCheck( m_count == 0 );
+		nvCheck( s_total_obj_count > 0 );
+		s_total_obj_count--;
+	}
+
+
+	/// Increase reference count. Returns the new count.
+	uint addRef() const
+	{
+		s_total_ref_count++;
+		m_count++;
+		return m_count;
+	}
+
+
+	/// Decrease reference count and remove when 0. Returns the new count.
+	uint release() const
+	{
+		nvCheck( m_count > 0 );
+		
+		s_total_ref_count--;
+		m_count--;
+		if( m_count == 0 ) {
+			// Notify the weak proxy before the object goes away.
+			releaseWeakProxy();
+			delete this;
+			return 0;
+		}
+		return m_count;
+	}
+
+	/// Get weak proxy. The proxy is created on first request.
+	WeakProxy * getWeakProxy() const
+	{
+		if (m_weak_proxy == NULL) {
+			m_weak_proxy = new WeakProxy;
+			m_weak_proxy->AddRef();
+		}
+		return m_weak_proxy;
+	}
+
+	/// Release the weak proxy.	
+	void releaseWeakProxy() const
+	{
+		if (m_weak_proxy != NULL) {
+			m_weak_proxy->NotifyObjectDied();
+			m_weak_proxy->Release();
+			m_weak_proxy = NULL;
+		}
+	}
+
+	/** @name Debug methods: */
+	//@{
+		/// Get reference count.
+		int refCount() const
+		{
+			return m_count;
+		}
+
+		/// Get total number of objects.
+		static int totalObjectCount()
+		{
+			return s_total_obj_count;
+		}
+
+		/// Get total number of references.
+		static int totalReferenceCount()
+		{
+			return s_total_ref_count;
+		}
+	//@}
+
+
+private:
+
+	NVCORE_API static int s_total_ref_count;
+	NVCORE_API static int s_total_obj_count;
+
+	mutable int m_count;
+	// Named m_weak_proxy to match the constructor and every method above.
+	mutable WeakProxy * m_weak_proxy;
+
+};
+#endif
+
+/// Smart pointer template class.
+///
+/// Holds a BaseClass pointer and calls addRef() on an object when it starts
+/// referring to it and release() when it stops referring to it.
+template <class BaseClass>
+class Pointer {
+public:
+
+	// BaseClass must implement addRef() and release().
+	typedef Pointer<BaseClass>	ThisType;
+
+	/// Default ctor. Refers to nothing.
+	Pointer() : m_ptr(NULL) 
+	{
+	}
+
+	/** Other type copy ctor. The other pointee is converted with static_cast. */
+	template <class OtherBase>
+	Pointer( const Pointer<OtherBase> & tc )
+	{
+		m_ptr = static_cast<BaseClass *>( tc.ptr() );
+		if( m_ptr ) {
+			m_ptr->addRef();
+		}
+	}
+
+	/** Copy ctor. */
+	Pointer( const ThisType & bc )
+	{
+		m_ptr = bc.ptr();
+		if( m_ptr ) {
+			m_ptr->addRef();
+		}
+	}
+
+	/** Copy cast ctor. Pointer(NULL) is valid. */
+	explicit Pointer( BaseClass * bc )
+	{
+		m_ptr = bc;
+		if( m_ptr ) {
+			m_ptr->addRef();
+		}
+	}
+
+	/** Dtor. Drops the reference, if any. */
+	~Pointer()
+	{
+		set(NULL);
+	}
+
+
+	/** @name Accessors: */
+	//@{
+		/** -> operator. The pointer must not be NULL. */
+		BaseClass * operator -> () const
+		{
+			nvDebugCheck( m_ptr != NULL );
+			return m_ptr;
+		}
+
+		/** * operator. The pointer must not be NULL. */
+		BaseClass & operator*() const
+		{
+			nvDebugCheck( m_ptr != NULL );
+			return *m_ptr;
+		}
+
+		/** Get pointer. The reference count is not touched. */
+		BaseClass * ptr() const
+		{
+			return m_ptr;
+		}
+	//@}
+
+
+	/** @name Mutators: */
+	//@{
+		/** Other type assignment. */
+		template <class OtherBase>
+		void operator = ( const Pointer<OtherBase> & tc )
+		{
+			set( static_cast<BaseClass *>(tc.ptr()) );
+		}
+
+		/** This type assignment. */
+		void operator = ( const ThisType & bc )
+		{
+			set( bc.ptr() );
+		}
+
+		/** Pointer assignment. */
+		void operator = ( BaseClass * bc )
+		{
+			set( bc );
+		}
+	//@}
+
+
+	/** @name Comparators: */
+	//@{
+		/** Other type equality comparison. */
+		template <class OtherBase>
+		bool operator == ( const Pointer<OtherBase> & other ) const
+		{
+			return m_ptr == other.ptr();
+		}
+
+		/** This type equality comparison. */
+		bool operator == ( const ThisType & bc ) const
+		{
+			return m_ptr == bc.ptr();
+		}
+
+		/** Const pointer equality comparison. */
+		bool operator == ( const BaseClass * const bc ) const
+		{
+			return m_ptr == bc;
+		}
+
+		/** Other type inequality comparison. */
+		template <class OtherBase>
+		bool operator != ( const Pointer<OtherBase> & other ) const
+		{
+			return m_ptr != other.ptr();
+		}
+		
+		/** This type inequality comparison. */
+		bool operator != ( const ThisType & bc ) const
+		{
+			return m_ptr != bc.ptr();
+		}
+
+		/** Const pointer inequality comparison. */
+		bool operator != (const BaseClass * const bc) const
+		{
+			return m_ptr != bc;
+		}
+
+		/** This type less-than comparison (orders by address). */
+		bool operator < (const ThisType & p) const
+		{
+			return m_ptr < p.ptr();
+		}
+	//@}
+
+private:
+
+	/** Set this pointer.
+	 *
+	 * The new object is acquired before the old one is released: releasing the
+	 * old object may destroy it, and it may be holding the only other reference
+	 * to the new object.
+	 */
+	void set( BaseClass * p )
+	{
+		if( m_ptr != p ) {
+			if( p ) p->addRef();
+			BaseClass * previous = m_ptr;
+			m_ptr = p;
+			if( previous ) previous->release();
+		}
+	}
+
+private:
+
+	BaseClass * m_ptr;	///< Referenced object, or NULL.
+
+};
+
+
+
+/*	
+template <class T> 
+class QSharedDataPointer
+{
+public:
+	inline void detach() { if (d && d->ref != 1) detach_helper(); }
+	inline T &operator*() { detach(); return *d; }
+	inline const T &operator*() const { return *d; }
+	inline T *operator->() { detach(); return d; }
+	inline const T *operator->() const { return d; }
+	inline operator T *() { detach(); return d; }
+	inline operator const T *() const { return d; }
+	inline T *data() { detach(); return d; }
+	inline const T *data() const { return d; }
+	inline const T *constData() const { return d; }
+
+	inline bool operator==(const QSharedDataPointer<T> &other) const { return d == other.d; }
+	inline bool operator!=(const QSharedDataPointer<T> &other) const { return d != other.d; }
+
+	inline QSharedDataPointer() { d = 0; }
+	inline ~QSharedDataPointer() { if (d && !d->ref.deref()) delete d; }
+
+	explicit QSharedDataPointer(T *data);
+	inline QSharedDataPointer(const QSharedDataPointer<T> &o) : d(o.d) { if (d) d->ref.ref(); }
+	inline QSharedDataPointer<T> & operator=(const QSharedDataPointer<T> &o) {
+		if (o.d != d) {
+			T *x = o.d;
+			if (x) x->ref.ref();
+			x = qAtomicSetPtr(&d, x);
+			if (x && !x->ref.deref())
+				delete x;
+		}
+		return *this;
+	}
+	inline QSharedDataPointer &operator=(T *o) {
+		if (o != d) {
+			T *x = o;
+			if (x) x->ref.ref();
+			x = qAtomicSetPtr(&d, x);
+			if (x && !x->ref.deref())
+				delete x;
+		}
+		return *this;
+	}
+
+	inline bool operator!() const { return !d; }
+
+private:
+	void detach_helper();
+
+	T *d;
+};
+
+template <class T>
+Q_INLINE_TEMPLATE QSharedDataPointer<T>::QSharedDataPointer(T *adata) : d(adata)
+{ if (d) d->ref.ref(); }
+
+template <class T>
+Q_OUTOFLINE_TEMPLATE void QSharedDataPointer<T>::detach_helper()
+{
+	T *x = new T(*d);
+	x->ref.ref();
+	x = qAtomicSetPtr(&d, x);
+	if (!x->ref.deref())
+		delete x;
+}
+*/
+
+} // nv namespace
+
+#endif // NV_CORE_PTR_H
--- a/src/nvcore/Radix.cpp
+++ b/src/nvcore/Radix.cpp
@ -0,0 +1,429 @@
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Contains source code from the article "Radix Sort Revisited".
+ *	\file		Radix.cpp
+ *	\author		Pierre Terdiman
+ *	\date		April, 4, 2000
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Revisited Radix Sort.
+ *	This is my new radix routine:
+ *  - it uses indices and doesn't recopy the values anymore, hence wasting less ram
+ *  - it creates all the histograms in one run instead of four
+ *  - it sorts words faster than dwords and bytes faster than words
+ *  - it correctly sorts negative floating-point values by patching the offsets
+ *  - it automatically takes advantage of temporal coherence
+ *  - multiple keys support is a side effect of temporal coherence
+ *  - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
+ *
+ *	History:
+ *	- 08.15.98: very first version
+ *	- 04.04.00: recoded for the radix article
+ *	- 12.xx.00: code lifting
+ *	- 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
+ *	- 10.11.01: added local ram support
+ *	- 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
+ *
+ *	\class		RadixSort
+ *	\author		Pierre Terdiman
+ *	\version	1.3
+ *	\date		August, 15, 1998
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+To do:
+	- add an offset parameter between two input values (avoid some data recopy sometimes)
+	- unroll ? asm ?
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Header
+
+#include <nvcore/Radix.h>
+
+#include <string.h> // memset
+
+//using namespace IceCore;
+
+#define DELETEARRAY(a)	{ delete [] a; a = NULL; }
+#define CHECKALLOC(a)
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Constructor.
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+RadixSort::RadixSort() : mCurrentSize(0), mPreviousSize(0), mIndices(NULL), mIndices2(NULL), mTotalCalls(0), mNbHits(0)
+{
+	// The histogram and offset tables live on the heap only when RADIX_LOCAL_RAM is not defined.
+#if !defined(RADIX_LOCAL_RAM)
+	mHistogram	= new uint32[4*256];	// four 256-entry histograms
+	mOffset		= new uint32[256];
+#endif
+	// Put the index list in its initial order (a no-op while the size is still zero).
+	resetIndices();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Destructor.
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+RadixSort::~RadixSort()
+{
+#if !defined(RADIX_LOCAL_RAM)
+	// Heap-backed tables exist only in this configuration.
+	{ delete [] mOffset; mOffset = NULL; }
+	{ delete [] mHistogram; mHistogram = NULL; }
+#endif
+	// The two index lists are released in every configuration.
+	{ delete [] mIndices2; mIndices2 = NULL; }
+	{ delete [] mIndices; mIndices = NULL; }
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Resizes the inner lists.
+ *	\param		nb				[in] new size (number of dwords)
+ *	\return		true if success
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+bool RadixSort::resize(uint32 nb)
+{
+	// Drop the current lists before allocating their replacements
+	{ delete [] mIndices2; mIndices2 = NULL; }
+	{ delete [] mIndices; mIndices = NULL; }
+
+	// Allocate both lists with room for nb entries
+	mIndices = new uint32[nb];
+	CHECKALLOC(mIndices);
+	mIndices2 = new uint32[nb];
+	CHECKALLOC(mIndices2);
+	mCurrentSize = nb;
+
+	// Fill mIndices with 0..nb-1 so that the input buffer is first read sequentially
+	resetIndices();
+
+	// No failure is reported from here: CHECKALLOC expands to nothing
+	return true;
+}
+
+/* Prepares the index lists for n values whenever n differs from the size seen by the */
+/* previous call: grows the lists if n exceeds the current capacity, otherwise just    */
+/* restores the initial index order.                                                   */
+#define CHECK_RESIZE(n)							\
+	if(mPreviousSize!=n)						\
+	{											\
+		if(mCurrentSize<n)	resize(n);			\
+		else				resetIndices();		\
+		mPreviousSize = n;						\
+	}
+
+#define CREATE_HISTOGRAMS(type, buffer)															\
+	/* Clear counters (four 256-entry histograms, one per byte of a dword) */					\
+	memset(mHistogram, 0, 256*4*sizeof(uint32));												\
+																								\
+	/* Prepare for temporal coherence: 'buffer' is read as 'type' in the order left in mIndices */	\
+	type PrevVal = (type)buffer[mIndices[0]];													\
+	bool AlreadySorted = true;	/* Optimism... */												\
+	uint32* Indices = mIndices;																	\
+																								\
+	/* Prepare to count. 'input' and 'nb' are not macro parameters: they come from the expanding function */	\
+	uint8* p = (uint8*)input;																	\
+	uint8* pe = &p[nb*4];																		\
+	uint32* h0= &mHistogram[0];		/* Histogram for first pass (LSB)	*/						\
+	uint32* h1= &mHistogram[256];	/* Histogram for second pass		*/						\
+	uint32* h2= &mHistogram[512];	/* Histogram for third pass			*/						\
+	uint32* h3= &mHistogram[768];	/* Histogram for last pass (MSB)	*/						\
+																								\
+	while(p!=pe)																				\
+	{																							\
+		/* Read input buffer in previous sorted order */										\
+		type Val = (type)buffer[*Indices++];													\
+		/* Check whether already sorted or not */												\
+		if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */						\
+		/* Update for next iteration */															\
+		PrevVal = Val;																			\
+																								\
+		/* Create histograms: one counter per byte of the dword. NOTE(review): byte 0 being the LSB presumes a little-endian host - confirm */	\
+		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;											\
+	}																							\
+																								\
+	/* If all input values are already sorted, we just have to return and leave the */			\
+	/* previous list unchanged. That way the routine may take advantage of temporal */			\
+	/* coherence, for example when used to sort transparent faces. This returns from the expanding function. */	\
+	if(AlreadySorted)	{ mNbHits++; return *this;	}											\
+																								\
+	/* Else there has been an early out and we must finish computing the histograms */			\
+	while(p!=pe)																				\
+	{																							\
+		/* Create histograms without the previous overhead */									\
+		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;											\
+	}
+
+#define CHECK_PASS_VALIDITY(pass)																\
+	/* Shortcut to current counters (256 entries per pass) */									\
+	uint32* CurCount = &mHistogram[pass<<8];													\
+																								\
+	/* Reset flag. The sorting pass is supposed to be performed. (default) */					\
+	bool PerformPass = true;																	\
+																								\
+	/* Check pass validity */																	\
+																								\
+	/* If all values have the same byte, sorting is useless. */									\
+	/* It may happen when sorting bytes or words instead of dwords. */							\
+	/* This routine actually sorts words faster than dwords, and bytes */						\
+	/* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */					\
+	/* for words and O(n) for bytes. Running time for floats depends on actual values... */		\
+																								\
+	/* Get byte number 'pass' of the first input value ('input' and 'nb' come from the expanding function) */	\
+	uint8 UniqueVal = *(((uint8*)input)+pass);													\
+																								\
+	/* If that byte value was counted nb times, every value shares it and the pass can be skipped */	\
+	if(CurCount[UniqueVal]==nb)	PerformPass=false;
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Main sort routine.
+ *	This one is for integer values. After the call, mIndices contains a list of indices in sorted order, i.e. in the order you may process your data.
+ *	\param		input			[in] a list of integer values to sort
+ *	\param		nb				[in] number of values to sort
+ *	\param		signedvalues	[in] true to handle negative values, false if you know your input buffer only contains positive values
+ *	\return		Self-Reference
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues)
+{
+	uint32 i, j;
+	
+	// Sanity checks: nothing to do for a null buffer or an empty list
+	if(!input || !nb)	return *this;
+
+	// Stats
+	mTotalCalls++;
+
+	// Resize lists if needed
+	CHECK_RESIZE(nb);
+
+#ifdef RADIX_LOCAL_RAM
+	// Allocate histograms & offsets on the stack
+	uint32 mHistogram[256*4];
+	uint32 mOffset[256];
+#endif
+
+	// Create histograms (counters). Counters for all passes are created in one run.
+	// Pros:	read input buffer once instead of four times
+	// Cons:	mHistogram is 4Kb instead of 1Kb
+	// We must take care of signed/unsigned values for temporal coherence.... I just
+	// have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
+	// Note: the macro returns *this directly when the input is already in sorted order.
+	if(!signedvalues)	{ CREATE_HISTOGRAMS(uint32, input);	}
+	else				{ CREATE_HISTOGRAMS(int32, input);	}
+
+	// Compute #negative values involved if needed
+	uint32 NbNegativeValues = 0;
+	if(signedvalues)
+	{
+		// An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
+		// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
+		// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
+		uint32* h3= &mHistogram[768];
+		for( i=128;i<256;i++)	NbNegativeValues += h3[i];	// 768 for last histogram, 128 for negative part
+	}
+
+	// Radix sort, j is the pass number (0=LSB, 3=MSB)
+	for( j=0;j<4;j++)
+	{
+		CHECK_PASS_VALIDITY(j);
+
+		// Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
+		// not a problem, numbers are correctly sorted anyway.
+		if(PerformPass)
+		{
+			// Should we care about negative values?
+			if(j!=3 || !signedvalues)
+			{
+				// Here we deal with positive values only
+
+				// Create offsets: running sum of the counters of this pass
+				mOffset[0] = 0;
+				for(i=1;i<256;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];
+			}
+			else
+			{
+				// This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
+
+				// Create biased offsets, in order for negative numbers to be sorted as well
+				mOffset[0] = NbNegativeValues;												// First positive number takes place after the negative ones
+				for(i=1;i<128;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];	// 1 to 127 for positive numbers
+
+				// Fixing the wrong place for negative values
+				mOffset[128] = 0;
+				for(i=129;i<256;i++)			mOffset[i] = mOffset[i-1] + CurCount[i-1];
+			}
+
+			// Perform Radix Sort: scatter the indices into mIndices2 according to byte j of each value
+			uint8* InputBytes	= (uint8*)input;
+			uint32* Indices		= mIndices;
+			uint32* IndicesEnd	= &mIndices[nb];
+			InputBytes += j;
+			while(Indices!=IndicesEnd)
+			{
+				uint32 id = *Indices++;
+				mIndices2[mOffset[InputBytes[id<<2]]++] = id;	// id<<2 is the byte offset of value 'id'; InputBytes already points at byte j
+			}
+
+			// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
+			uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+		}
+	}
+	return *this;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Main sort routine.
+ *	This one is for floating-point values. After the call, mIndices contains a list of indices in sorted order, i.e. in the order you may process your data.
+ *	\param		input			[in] a list of floating-point values to sort
+ *	\param		nb				[in] number of values to sort
+ *	\return		Self-Reference
+ *	\warning	only sorts IEEE floating-point values
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+RadixSort& RadixSort::sort(const float* input2, uint32 nb)
+{
+	uint32 i, j;
+	
+	// Sanity checks: nothing to do for a null buffer or an empty list
+	if(!input2 || !nb)	return *this;
+
+	// Stats
+	mTotalCalls++;
+
+	uint32* input = (uint32*)input2;	// same buffer viewed as raw dwords; NOTE(review): presumes float is 32 bits wide - confirm
+
+	// Resize lists if needed
+	CHECK_RESIZE(nb);
+
+#ifdef RADIX_LOCAL_RAM
+	// Allocate histograms & offsets on the stack
+	uint32 mHistogram[256*4];
+	uint32 mOffset[256];
+#endif
+
+	// Create histograms (counters). Counters for all passes are created in one run.
+	// Pros:	read input buffer once instead of four times
+	// Cons:	mHistogram is 4Kb instead of 1Kb
+	// Floating-point values are always supposed to be signed values, so there's only one code path there.
+	// Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
+	// is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
+	// generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
+	// wouldn't work with mixed positive/negative values....
+	// Note: the macro returns *this directly when the input is already in sorted order.
+	{ CREATE_HISTOGRAMS(float, input2); }
+
+	// Compute #negative values involved if needed
+	uint32 NbNegativeValues = 0;
+	// An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
+	// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
+	// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
+	uint32* h3= &mHistogram[768];
+	for( i=128;i<256;i++)	NbNegativeValues += h3[i];	// 768 for last histogram, 128 for negative part
+
+	// Radix sort, j is the pass number (0=LSB, 3=MSB)
+	for( j=0;j<4;j++)
+	{
+		// Should we care about negative values?
+		if(j!=3)
+		{
+			// Here we deal with positive values only
+			CHECK_PASS_VALIDITY(j);
+
+			if(PerformPass)
+			{
+				// Create offsets: running sum of the counters of this pass
+				mOffset[0] = 0;
+				for( i=1;i<256;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];
+
+				// Perform Radix Sort: scatter the indices into mIndices2 according to byte j of each value
+				uint8* InputBytes	= (uint8*)input;
+				uint32* Indices		= mIndices;
+				uint32* IndicesEnd	= &mIndices[nb];
+				InputBytes += j;
+				while(Indices!=IndicesEnd)
+				{
+					uint32 id = *Indices++;
+					mIndices2[mOffset[InputBytes[id<<2]]++] = id;	// id<<2 is the byte offset of value 'id'; InputBytes already points at byte j
+				}
+
+				// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
+				uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+			}
+		}
+		else
+		{
+			// This is a special case to correctly handle negative values
+			CHECK_PASS_VALIDITY(j);
+
+			if(PerformPass)
+			{
+				// Create biased offsets, in order for negative numbers to be sorted as well
+				mOffset[0] = NbNegativeValues;												// First positive number takes place after the negative ones
+				for(i=1;i<128;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];	// 1 to 127 for positive numbers
+
+				// We must reverse the sorting order for negative numbers!
+				mOffset[255] = 0;
+				for(i=0;i<127;i++)		mOffset[254-i] = mOffset[255-i] + CurCount[255-i];	// Fixing the wrong order for negative values
+				for(i=128;i<256;i++)	mOffset[i] += CurCount[i];							// Fixing the wrong place for negative values
+
+				// Perform Radix Sort
+				for(i=0;i<nb;i++)
+				{
+					uint32 Radix = input[mIndices[i]]>>24;								// Radix byte, same as above. AND is useless here (uint32).
+					// ### cmp to be killed. Not good. Later.
+					if(Radix<128)		mIndices2[mOffset[Radix]++] = mIndices[i];		// Number is positive, same as above
+					else				mIndices2[--mOffset[Radix]] = mIndices[i];		// Number is negative, flip the sorting order
+				}
+				// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
+				uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+			}
+			else
+			{
+				// The pass is useless, yet we still have to reverse the order of current list if all values are negative.
+				if(UniqueVal>=128)
+				{
+					for(i=0;i<nb;i++)	mIndices2[i] = mIndices[nb-i-1];
+
+					// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
+					uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
+				}
+			}
+		}
+	}
+	return *this;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Resets the inner indices. After the call, mIndices is reset.
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+void RadixSort::resetIndices()
+{
+	// Store the sequence 0, 1, ..., mCurrentSize-1 so that entry k refers to input element k.
+	uint32 slot = 0;
+	while(slot != mCurrentSize)
+	{
+		mIndices[slot] = slot;
+		++slot;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Gets the ram used.
+ *	\return		memory used in bytes
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+uint32 RadixSort::usedRam() const
+{
+	// Start from the footprint of the object itself
+	uint32 total = sizeof(RadixSort);
+#if !defined(RADIX_LOCAL_RAM)
+	// Heap-backed tables are only counted when they are actually allocated
+	total += 256*4*sizeof(uint32);				// Histograms
+	total += 256*sizeof(uint32);				// Offsets
+#endif
+	// Both index lists hold mCurrentSize entries each
+	total += 2*mCurrentSize*sizeof(uint32);
+	return total;
+}
--- a/src/nvcore/Radix.h
+++ b/src/nvcore/Radix.h
@ -0,0 +1,69 @@
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ *	Contains source code from the article "Radix Sort Revisited".
+ *	\file		Radix.h
+ *	\author		Pierre Terdiman
+ *	\date		April, 4, 2000
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Include Guard
+#ifndef NV_CORE_RADIXSORT_H
+#define NV_CORE_RADIXSORT_H
+
+#include <nvcore/nvcore.h>
+
+
+#define RADIX_LOCAL_RAM
+
+
+/// Radix sorter that produces a list of indices in sorted order instead of reordering the input.
+/// Copying is disabled through NV_FORBID_COPY.
+class NVCORE_API RadixSort {
+	NV_FORBID_COPY(RadixSort);
+public:
+	// Constructor/Destructor
+	RadixSort();
+	~RadixSort();
+
+	// Sorting methods. Both return *this; the result is read back through indices().
+	RadixSort & sort(const uint32* input, uint32 nb, bool signedvalues=true);
+	RadixSort & sort(const float* input, uint32 nb);
+
+	//! Access to results. mIndices is a list of indices in sorted order, i.e. in the order you may further process your data
+	inline uint32 * indices() const { return mIndices; }
+
+	//! mIndices2 gets trashed on calling the sort routine, but otherwise you can recycle it the way you want.
+	inline uint32 * recyclable() const { return mIndices2; }
+
+	// Stats
+	//! Returns the memory used by the sorter, in bytes.
+	uint32 usedRam() const;
+
+	//! Returns the total number of calls to the radix sorter.
+	inline uint32 totalCalls()	const { return mTotalCalls;	}
+
+	//! Returns the number of premature exits due to temporal coherence.
+	inline uint32 hits() const { return mNbHits; }
+
+
+	private:
+#ifndef RADIX_LOCAL_RAM
+	// These tables are only members when RADIX_LOCAL_RAM is not defined.
+	uint32*			mHistogram;					//!< Counters for each byte
+	uint32*			mOffset;					//!< Offsets (nearly a cumulative distribution function)
+#endif
+	uint32			mCurrentSize;				//!< Current size of the indices list
+	uint32			mPreviousSize;				//!< Size involved in previous call
+	uint32*			mIndices;					//!< Two lists, swapped each pass
+	uint32*			mIndices2;					//!< Second list; see recyclable()
+
+	// Stats
+	uint32			mTotalCalls;				//!< Value reported by totalCalls()
+	uint32			mNbHits;					//!< Value reported by hits()
+
+	// Internal methods
+	bool			resize(uint32 nb);
+	void			resetIndices();				//!< Sets mIndices[i] = i for every entry
+
+};
+
+
+#endif // NV_CORE_RADIXSORT_H
--- a/src/nvcore/StdStream.h
+++ b/src/nvcore/StdStream.h
@ -0,0 +1,336 @@
+#ifndef NV_STDSTREAM_H
+#define NV_STDSTREAM_H
+
+#include <nvcore/Stream.h>
+
+#include <stdio.h> // fopen
+#include <string.h> // memcpy
+#include <exception> // std::exception
+
+namespace nv
+{
+
+// Open a file with stdio, using fopen_s on MSVC 2005 and later and plain fopen elsewhere.
+// Returns NULL when the file cannot be opened.
+inline FILE * fileOpen(const char * fileName, const char * mode)
+{
+#if NV_CC_MSVC && _MSC_VER >= 1400
+	FILE * handle;
+	const bool opened = (fopen_s(&handle, fileName, mode) == 0);
+	if (!opened) {
+		return NULL;
+	}
+	return handle;
+#else
+	FILE * handle = fopen(fileName, mode);
+	return handle;
+#endif
+}
+
+
+/// Base stdio stream. Wraps a FILE handle and implements the positioning and
+/// status queries of Stream; serialize() is left to the derived classes.
+class StdStream : public Stream
+{
+public:
+
+	/// Ctor. When @a autoclose is true the handle is closed by the destructor.
+	StdStream( FILE * fp, bool autoclose=true ) :
+		m_fp(fp), m_autoclose(autoclose) { }
+
+	/// Dtor. Closes the handle if it is valid and autoclose was requested.
+	virtual ~StdStream()
+	{
+		if( m_fp != NULL && m_autoclose ) {
+			fclose( m_fp );
+		}
+	}
+
+
+	/** @name Stream implementation. */
+	//@{
+		/// Move to the absolute position @a pos.
+		virtual void seek( int pos )
+		{
+			nvDebugCheck(m_fp != NULL);
+			fseek(m_fp, pos, SEEK_SET);
+		}
+
+		/// Return the current position.
+		virtual int tell() const
+		{
+			nvDebugCheck(m_fp != NULL);
+			return ftell(m_fp);
+		}
+
+		/// Return the size of the file by seeking to its end and restoring the position.
+		virtual int size() const
+		{
+			// Same precondition as the other accessors; it was missing here.
+			nvDebugCheck(m_fp != NULL);
+			int pos = ftell(m_fp);
+			fseek(m_fp, 0, SEEK_END);
+			int end = ftell(m_fp);
+			fseek(m_fp, pos, SEEK_SET);
+			return end;
+		}
+
+		/// True when there is no handle or the stdio error flag is set.
+		virtual bool isError() const
+		{
+			return m_fp == NULL || ferror( m_fp ) != 0;
+		}
+
+		/// True when the stdio end-of-file flag is set.
+		virtual bool isAtEnd() const
+		{
+			nvDebugCheck(m_fp != NULL);
+			return feof( m_fp ) != 0;
+		}
+
+		/// Always true.
+		virtual bool isSeekable() const { return true; }
+	//@}
+
+protected:
+
+	FILE * m_fp;		///< Wrapped stdio handle, may be NULL if opening failed.
+	bool m_autoclose;	///< Close m_fp in the destructor.
+
+};
+
+
+/// Output stream backed by a stdio file handle.
+class StdOutputStream : public StdStream
+{
+public:
+
+	/// Open the named file for binary writing.
+	StdOutputStream( const char * name ) : StdStream(fileOpen(name, "wb"))
+	{
+	}
+
+	/// Wrap an existing file handle.
+	StdOutputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose) { }
+
+	/** @name Stream implementation. */
+	//@{
+		/// Write @a len bytes from @a data as a single item.
+		virtual void serialize( void * data, int len )
+		{
+			nvDebugCheck(data != NULL);
+			nvDebugCheck(m_fp != NULL);
+			fwrite(data, len, 1, m_fp);
+		}
+
+		/// This stream never loads.
+		virtual bool isLoading() const { return false; }
+
+		/// This stream always saves.
+		virtual bool isSaving() const { return true; }
+	//@}
+
+};
+
+
+/// Input stream backed by a stdio file handle.
+class StdInputStream : public StdStream
+{
+public:
+
+	/// Open the named file for binary reading.
+	StdInputStream( const char * name ) : StdStream(fileOpen(name, "rb"))
+	{
+	}
+
+	/// Wrap an existing file handle.
+	StdInputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose) { }
+
+	/** @name Stream implementation. */
+	//@{
+		/// Read @a len bytes into @a data as a single item.
+		virtual void serialize( void * data, int len )
+		{
+			nvDebugCheck(data != NULL);
+			nvDebugCheck(m_fp != NULL);
+			fread(data, len, 1, m_fp);
+		}
+
+		/// This stream always loads.
+		virtual bool isLoading() const { return true; }
+
+		/// This stream never saves.
+		virtual bool isSaving() const { return false; }
+	//@}
+};
+
+
+
+/// Memory input stream. Reads from a caller-provided buffer; the buffer is not
+/// copied and not freed by this class.
+class MemoryInputStream : public Stream
+{
+public:
+
+	/// Ctor. @a mem points to @a size readable bytes.
+	MemoryInputStream( const uint8 * mem, int size ) :
+		m_mem(mem), m_ptr(mem), m_size(size) { }
+
+	/** @name Stream implementation. */
+	//@{
+		/// Read data. Never copies bytes from outside the buffer: a request that
+		/// runs past the end copies only what is available. The read position is
+		/// still advanced by @a len, so isError() reports the overrun afterwards.
+		virtual void serialize( void * data, int len )
+		{
+			nvDebugCheck(data != NULL);
+			nvDebugCheck(len >= 0);
+			nvDebugCheck(!isError());
+
+			if( len > 0 && !isError() ) {
+				// isError() is false, so m_ptr lies inside [m_mem, m_mem + m_size].
+				const int available = m_size - int(m_ptr - m_mem);
+				const int count = (len < available) ? len : available;
+				if( count > 0 ) {
+					memcpy( data, m_ptr, count );
+				}
+			}
+			m_ptr += len;
+		}
+
+		/// Move to the absolute position @a pos.
+		virtual void seek( int pos )
+		{
+			nvDebugCheck(!isError());
+			m_ptr = m_mem + pos;
+			nvDebugCheck(!isError());
+		}
+
+		/// Return the current offset from the start of the buffer.
+		virtual int tell() const
+		{
+			return m_ptr - m_mem;
+		}
+
+		/// Return the size of the buffer given to the constructor.
+		virtual int size() const
+		{
+			return m_size;
+		}
+
+		/// True when there is no buffer or the read position is outside of it.
+		virtual bool isError() const
+		{
+			return m_mem == NULL || m_ptr > m_mem + m_size || m_ptr < m_mem;
+		}
+
+		/// True when the read position is exactly at the end of the buffer.
+		virtual bool isAtEnd() const
+		{
+			return m_ptr == m_mem + m_size;
+		}
+
+		/// Always true.
+		virtual bool isSeekable() const
+		{
+			return true;
+		}
+
+		/// Always true.
+		virtual bool isLoading() const
+		{
+			return true;
+		}
+
+		/// Always false.
+		virtual bool isSaving() const
+		{
+			return false;
+		}
+	//@}
+
+
+private:
+
+	const uint8 * m_mem;	///< Start of the buffer.
+	const uint8 * m_ptr;	///< Current read position.
+	int m_size;				///< Size of the buffer in bytes.
+
+};
+
+
+/// Stream wrapper that forwards every call to another stream and throws
+/// std::exception when that stream reports an error after serialize() or seek().
+class ProtectedStream : public Stream
+{
+public:
+
+	/// Wrap a stream by reference; the wrapped stream is never deleted.
+	ProtectedStream( Stream & s ) : m_s(&s), m_autodelete(false) { }
+
+	/// Wrap a stream by pointer; it is deleted in the destructor when @a autodelete is true.
+	ProtectedStream( Stream * s, bool autodelete = true ) : m_s(s), m_autodelete(autodelete)
+	{
+		nvDebugCheck(m_s != NULL);
+	}
+
+	/// Dtor.
+	virtual ~ProtectedStream()
+	{
+		if( !m_autodelete ) {
+			return;
+		}
+		delete m_s;
+	}
+
+	/** @name Stream implementation. */
+	//@{
+		/// Forward the transfer, then throw if the wrapped stream is in error.
+		virtual void serialize( void * data, int len )
+		{
+			nvDebugCheck(data != NULL);
+			m_s->serialize( data, len );
+			throwOnError();
+		}
+
+		/// Forward the seek, then throw if the wrapped stream is in error.
+		virtual void seek( int pos )
+		{
+			m_s->seek( pos );
+			throwOnError();
+		}
+
+		virtual int tell() const { return m_s->tell(); }
+
+		virtual int size() const { return m_s->size(); }
+
+		virtual bool isError() const { return m_s->isError(); }
+
+		virtual bool isAtEnd() const { return m_s->isAtEnd(); }
+
+		virtual bool isSeekable() const { return m_s->isSeekable(); }
+
+		virtual bool isLoading() const { return m_s->isLoading(); }
+
+		virtual bool isSaving() const { return m_s->isSaving(); }
+	//@}
+
+
+private:
+
+	/// Throw std::exception when the wrapped stream reports an error.
+	void throwOnError() const
+	{
+		if( m_s->isError() ) {
+			throw std::exception();
+		}
+	}
+
+	Stream * m_s;
+	bool m_autodelete;
+
+};
+
+} // nv namespace
+
+
+#endif // NV_STDSTREAM_H
--- a/src/nvcore/StrLib.cpp
+++ b/src/nvcore/StrLib.cpp
@ -0,0 +1,632 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/StrLib.h>
+
+#include <math.h>	// log
+#include <stdio.h>	// vsnprintf
+
+#if NV_CC_MSVC
+#include <stdarg.h> // vsnprintf
+#endif
+
+#if NV_OS_WIN32
+#define NV_PATH_SEPARATOR '\\'
+#else
+#define NV_PATH_SEPARATOR '/'
+#endif
+
+using namespace nv;
+
+namespace
+{
+	// Allocate a character buffer of the given size.
+	static char * strAlloc(uint size)
+	{
+		void * block = mem::malloc(size);
+		return static_cast<char *>(block);
+	}
+
+	// Resize a character buffer.
+	static char * strReAlloc(char * str, uint size)
+	{
+		void * block = mem::realloc(str, size);
+		return static_cast<char *>(block);
+	}
+
+	// Release a character buffer.
+	static void strFree(const char * str)
+	{
+		mem::free(const_cast<char *>(str));
+	}
+
+	/*static char * strDup( const char * str ) 
+	{
+		nvDebugCheck( str != NULL );
+		uint len = uint(strlen( str ) + 1);
+		char * dup = strAlloc( len );
+		memcpy( dup, str, len );
+		return dup;
+	}*/
+
+	// Integer to string helper: writes the digits of i in radix r starting at a,
+	// most significant first, and returns the position after the last digit.
+	// No terminator is written.
+	static char * i2a( uint i, char *a, uint r )
+	{
+		const uint quotient = i / r;
+		if( quotient != 0 ) {
+			// Emit the higher-order digits first.
+			a = i2a( quotient, a, r );
+		}
+		const char * const digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+		*a = digits[i % r];
+		return a + 1;
+	}
+
+	// Locale independent character helpers.
+	static inline char toUpper( char c ) {
+		if( c >= 'a' && c <= 'z' ) {
+			return c+'A'-'a';
+		}
+		return c;
+	}
+	static inline char toLower( char c ) {
+		if( c >= 'A' && c <= 'Z' ) {
+			return c+'a'-'A';
+		}
+		return c;
+	}
+	static inline bool isAlpha( char c ) {
+		const bool lower = (c>='a' && c<='z');
+		const bool upper = (c>='A' && c<='Z');
+		return lower || upper;
+	}
+	static inline bool isDigit( char c ) {
+		return !(c<'0' || c>'9');
+	}
+	static inline bool isAlnum( char c ) {
+		const bool lower = (c>='a' && c<='z');
+		const bool upper = (c>='A' && c<='Z');
+		const bool digit = (c>='0' && c<='9');
+		return lower || upper || digit;
+	}
+
+}
+
+/// Case sensitive comparison; both strings must be non-NULL.
+int nv::strCmp(const char * s1, const char * s2)
+{
+	nvDebugCheck(s1 != NULL);
+	nvDebugCheck(s2 != NULL);
+	const int order = strcmp(s1, s2);
+	return order;
+}
+
+/// Case insensitive comparison; both strings must be non-NULL.
+int nv::strCaseCmp(const char * s1, const char * s2)
+{
+	nvDebugCheck(s1 != NULL);
+	nvDebugCheck(s2 != NULL);	// was checking s1 twice
+#if NV_CC_MSVC
+	return _stricmp(s1, s2);
+#else
+	return strcasecmp(s1, s2);
+#endif
+}
+
+/// Copy @a src into @a dst, which has room for @a size characters including
+/// the terminator. The copy is truncated when @a src does not fit, and the
+/// result is always null terminated when @a size is positive.
+void nv::strCpy(char * dst, int size, const char * src)
+{
+	nvDebugCheck(dst != NULL);
+	nvDebugCheck(src != NULL);
+	nvDebugCheck(size > 0);
+#if NV_CC_MSVC && _MSC_VER >= 1400
+	strncpy_s(dst, size, src, _TRUNCATE);
+#else
+	if( size <= 0 ) {
+		return;
+	}
+	// Honour the destination size instead of copying an unbounded string.
+	size_t count = strlen(src);
+	if( count > size_t(size - 1) ) {
+		count = size_t(size - 1);
+	}
+	memmove(dst, src, count);	// memmove: tolerate dst == src
+	dst[count] = '\0';
+#endif
+}
+
+/// Append @a src to the string in @a dst, which has room for @a size
+/// characters including the terminator. The appended text is truncated when it
+/// does not fit.
+void nv::strCat(char * dst, int size, const char * src)
+{
+	nvDebugCheck(dst != NULL);
+	nvDebugCheck(src != NULL);
+	nvDebugCheck(size > 0);
+#if NV_CC_MSVC && _MSC_VER >= 1400
+	strncat_s(dst, size, src, _TRUNCATE);
+#else
+	if( size <= 0 ) {
+		return;
+	}
+	// Honour the destination size instead of appending an unbounded string.
+	const size_t used = strlen(dst);
+	if( used + 1 >= size_t(size) ) {
+		return;		// no room left for any character
+	}
+	strncat(dst, src, size_t(size) - used - 1);
+#endif
+}
+
+
+/**
+ * Pattern matching routine (origin of the algorithm unknown).
+ *
+ * Returns true when the whole of @a str matches @a pat. Pattern syntax as
+ * implemented below:
+ *  - '*' matches any sequence of characters, including the empty one.
+ *  - '?' matches exactly one character.
+ *  - '[...]' matches one character from a set; 'a-z' inside the set is a range.
+ *  - NV_PATH_SEPARATOR makes the following pattern character compare literally.
+ *    NOTE(review): that is '/' on non-Windows builds, which looks unintended
+ *    for an escape character -- confirm.
+ */
+bool nv::strMatch(const char * str, const char * pat)
+{
+	nvDebugCheck(str != NULL);
+	nvDebugCheck(pat != NULL);
+
+    char c2;
+
+    while (true) {
+        // End of pattern: match only if the string is exhausted too.
+        if (*pat==0) {
+            if (*str==0) return true;
+            else         return false;
+        }
+        // End of string: only a '*' can still match.
+        if ((*str==0) && (*pat!='*')) return false;
+        if (*pat=='*') {
+            pat++;
+            // A trailing '*' matches whatever is left.
+            if (*pat==0) return true;
+            // Try the rest of the pattern at every remaining position of str.
+            while (true) {
+                if (strMatch(str, pat)) return true;
+                if (*str==0) return false;
+                str++;
+            }
+        }
+        if (*pat=='?') goto match;
+        if (*pat=='[') {
+            pat++;
+            // Walk the set until an element matches *str.
+            while (true) {
+                // Set exhausted (or pattern ended) without a match.
+                if ((*pat==']') || (*pat==0)) return false;
+                if (*pat==*str) break;
+                if (pat[1] == '-') {
+                    // Range: accept bounds given in either order.
+                    c2 = pat[2];
+                    if (c2==0) return false;
+                    if ((*pat<=*str) && (c2>=*str)) break;
+                    if ((*pat>=*str) && (c2<=*str)) break;
+                    pat+=2;
+                }
+                pat++;
+            }
+            // Skip to the closing bracket of the set.
+            while (*pat!=']') {
+                if (*pat==0) {
+                    // Unterminated set: step back so the pat++ at 'match' lands on the terminator.
+                    pat--;
+                    break;
+                }
+                pat++;
+            }
+            goto match;
+        }
+
+        // Escape: skip the separator and compare the next pattern character literally.
+        if (*pat == NV_PATH_SEPARATOR) {
+            pat++;
+            if (*pat==0) return false;
+        }
+        if (*pat!=*str) return false;
+
+match:
+        // Current characters match; advance both.
+        pat++;
+        str++;
+    }
+}
+
+
+
+/** Construct without allocating any storage. */
+StringBuilder::StringBuilder() : m_size(0), m_str(NULL)
+{
+}
+
+/** Construct an empty string with a buffer of size_hint characters. */
+StringBuilder::StringBuilder( int size_hint ) : m_size(size_hint)
+{
+	nvDebugCheck(m_size > 0);
+	m_str = strAlloc(m_size);
+	m_str[0] = '\0';
+}
+
+/** Copy constructor: duplicates the contents of s. */
+StringBuilder::StringBuilder( const StringBuilder & s ) : m_size(0), m_str(NULL)
+{
+	copy(s);
+}
+
+/** Allocate size_hint characters, then copy s into the builder. */
+StringBuilder::StringBuilder( int size_hint, const StringBuilder & s) : m_size(size_hint), m_str(NULL)
+{
+	nvDebugCheck(m_size > 0);
+	m_str = strAlloc(m_size);
+	copy(s);
+}
+
+/** Construct from a printf-style format and its arguments. */
+StringBuilder::StringBuilder( const char * fmt, ... ) : m_size(0), m_str(NULL)
+{
+	nvDebugCheck(fmt != NULL);
+
+	va_list args;
+	va_start( args, fmt );
+	format( fmt, args );
+	va_end( args );
+}
+
+/** Allocate size_hint characters, then format into the builder. */
+StringBuilder::StringBuilder( int size_hint, const char * fmt, ... ) : m_size(size_hint), m_str(NULL)
+{
+	nvDebugCheck(m_size > 0);
+	nvDebugCheck(fmt != NULL);
+
+	m_str = strAlloc(m_size);
+
+	va_list args;
+	va_start( args, fmt );
+	format( fmt, args );
+	va_end( args );
+}
+
+
+/** Release the buffer and leave the builder empty. */
+StringBuilder::~StringBuilder()
+{
+	strFree(m_str);
+	m_str = NULL;
+	m_size = 0;
+}
+
+
+/** Replace the contents with a printf-style formatted string. */
+StringBuilder & StringBuilder::format( const char * fmt, ... )
+{
+	nvDebugCheck(fmt != NULL);
+
+	// Forward the variadic arguments to the va_list overload.
+	va_list args;
+	va_start( args, fmt );
+	format( fmt, args );
+	va_end( args );
+
+	return *this;
+}
+
+
+/**
+ * Replace the contents with a formatted string, growing the buffer until the
+ * whole result fits.
+ */
+StringBuilder & StringBuilder::format( const char * fmt, va_list arg )
+{
+	nvCheck(fmt != NULL);
+	nvCheck(m_size >= 0);
+
+	// Start with a default-sized buffer when nothing is allocated yet.
+	if( m_size == 0 ) {
+		m_size = 64;
+		m_str = strAlloc( m_size );
+	}
+
+	int written;
+	for(;;) {
+		// Each attempt works on a fresh copy of the argument list.
+		va_list attempt;
+		va_copy(attempt, arg);
+#if NV_CC_MSVC && _MSC_VER >= 1400
+		written = vsnprintf_s(m_str, m_size, _TRUNCATE, fmt, attempt);
+#else
+		written = vsnprintf(m_str, m_size, fmt, attempt);
+#endif
+		va_end(attempt);
+
+		if( written >= 0 && written < int(m_size) ) {
+			break;	// the result and its terminator fit
+		}
+
+		if( written > -1 ) {
+			// The required length is known: make room for it plus the terminator.
+			m_size = written + 1;
+		}
+		else {
+			// The required length is unknown: double the buffer and retry.
+			m_size *= 2;
+		}
+
+		m_str = strReAlloc(m_str, m_size);
+	}
+
+	nvDebugCheck(written < int(m_size));
+
+	// Make sure it's null terminated.
+	nvDebugCheck(m_str[written] == '\0');
+	//str[n] = '\0';
+
+	return *this;
+}
+
+
+/** Append s to the current contents, growing the buffer when needed. */
+StringBuilder & StringBuilder::append( const char * s )
+{
+	nvCheck(s != NULL);
+	nvCheck(m_size >= 0);
+
+	const uint extra = uint(strlen( s ));
+
+	if( m_str != NULL ) {
+		// Existing contents: ensure room for both strings plus the terminator.
+		const uint current = uint(strlen( m_str ));
+		const uint required = current + extra + 1;
+
+		if( m_size < required ) {
+			m_size = required;
+			m_str = strReAlloc(m_str, m_size);
+		}
+
+		strCat( m_str, m_size, s );
+		return *this;
+	}
+
+	// Nothing allocated yet: the result is simply a copy of s.
+	m_size = extra + 1;
+	m_str = strAlloc(m_size);
+	strCpy( m_str, m_size, s );
+
+	return *this;
+}
+
+
+/** Append a printf-style formatted string to the current contents. */
+StringBuilder & StringBuilder::appendFormat( const char * format, ... )
+{
+	nvDebugCheck( format != NULL );
+
+	// Forward the variadic arguments to the va_list overload.
+	va_list args;
+	va_start( args, format );
+	appendFormat( format, args );
+	va_end( args );
+
+	return *this;
+}
+
+
+/** Format into a temporary builder and append the result to this one. */
+StringBuilder & StringBuilder::appendFormat( const char * format, va_list arg )
+{
+	nvDebugCheck( format != NULL );
+
+	va_list args_copy;
+	va_copy(args_copy, arg);
+
+	StringBuilder formatted;
+	formatted.format( format, args_copy );
+	append( formatted );
+
+	va_end(args_copy);
+
+	return *this;
+}
+
+
+/**
+ * Convert a signed number to a string in the given base (2 to 36), replacing
+ * the current contents. Negative values get a leading '-'.
+ */
+StringBuilder & StringBuilder::number( int i, int base )
+{
+	nvCheck( base >= 2 );
+	nvCheck( base <= 36 );
+
+	const bool negative = (i < 0);
+
+	// Negate in unsigned arithmetic so that the most negative int does not overflow.
+	const uint magnitude = negative ? (0u - uint(i)) : uint(i);
+
+	// Count the digits exactly. The previous log() based estimate was undefined
+	// for zero and negative values.
+	uint digits = 1;
+	for( uint rest = magnitude / uint(base); rest != 0; rest /= uint(base) ) {
+		digits++;
+	}
+
+	// Digits, optional sign and the terminator.
+	reserve(digits + (negative ? 2 : 1));
+
+	char * out = m_str;
+	if( negative ) {
+		*out++ = '-';
+	}
+	*i2a(magnitude, out, uint(base)) = 0;
+
+	return *this;
+}
+
+
+/**
+ * Convert an unsigned number to a string in the given base (2 to 36),
+ * replacing the current contents.
+ */
+StringBuilder & StringBuilder::number( uint i, int base )
+{
+	nvCheck( base >= 2 );
+	nvCheck( base <= 36 );
+
+	// Count the digits exactly. The previous log() based estimate was undefined
+	// for zero and left no room for the terminator.
+	uint digits = 1;
+	for( uint rest = i / uint(base); rest != 0; rest /= uint(base) ) {
+		digits++;
+	}
+
+	// Digits plus the terminator.
+	reserve(digits + 1);
+
+	*i2a(i, m_str, uint(base)) = 0;
+
+	return *this;
+}
+
+
+/** Grow the buffer to at least size_hint characters; never shrinks it. */
+StringBuilder & StringBuilder::reserve( uint size_hint )
+{
+	nvCheck(size_hint != 0);
+
+	const bool mustGrow = (size_hint > m_size);
+	if( mustGrow ) {
+		m_str = strReAlloc(m_str, size_hint);
+		m_size = size_hint;
+	}
+
+	return *this;
+}
+
+
+/** Replace the contents with a copy of s, growing the buffer when needed. */
+StringBuilder & StringBuilder::copy( const char * s )
+{
+	nvCheck( s != NULL );
+
+	// Room for every character of s plus the terminator.
+	const uint required = uint(strlen( s )) + 1;
+	reserve(required);
+	strCpy( m_str, required, s );
+
+	return *this;
+}
+
+
+/**
+ * Copy a StringBuilder. Copying a builder without storage releases the
+ * storage of this one; otherwise the buffer grows to the capacity of s.
+ */
+StringBuilder & StringBuilder::copy( const StringBuilder & s )
+{
+	// Self copy: nothing to do, and it avoids copying a buffer onto itself.
+	if( this == &s ) {
+		return *this;
+	}
+
+	if( s.m_str == NULL ) {
+		nvCheck( s.m_size == 0 );
+		m_size = 0;
+		strFree( m_str );
+		m_str = NULL;
+	}
+	else {
+		reserve( s.m_size );
+		strCpy( m_str, s.m_size, s.m_str );
+	}
+	return *this;
+}
+
+/** Release the buffer and return to the unallocated state. */
+void StringBuilder::reset()
+{
+	strFree( m_str );
+	m_str = NULL;
+	m_size = 0;
+}
+
+
+/// Construct a path from a printf-style format and its arguments.
+Path::Path(const char * fmt, ...)
+{
+	nvDebugCheck( fmt != NULL );
+
+	va_list args;
+	va_start( args, fmt );
+	format( fmt, args );
+	va_end( args );
+}
+
+/// Preallocate size_hint characters, then format the path.
+Path::Path(int size_hint, const char * fmt, ...) : StringBuilder(size_hint)
+{
+	nvDebugCheck( fmt != NULL );
+
+	va_list args;
+	va_start( args, fmt );
+	format( fmt, args );
+	va_end( args );
+}
+
+
+/// Return the file name part of this path; the result points into this path's buffer.
+const char * Path::fileName() const
+{
+	const char * name = fileName(m_str);
+	return name;
+}
+
+
+/// Return the extension of this path; the result points into this path's buffer.
+const char * Path::extension() const
+{
+	const char * ext = extension(m_str);
+	return ext;
+}
+
+
+/// Replace the separator of the other platform by NV_PATH_SEPARATOR (ie. \\ into /).
+void Path::translatePath()
+{
+	nvCheck( m_str != NULL );
+
+	// The character to replace depends on which separator is the native one.
+#if NV_PATH_SEPARATOR == '/'
+	const char foreign = '\\';
+#else
+	const char foreign = '/';
+#endif
+
+	for( char * p = m_str; *p != '\0'; ++p ) {
+		if( *p == foreign ) {
+			*p = NV_PATH_SEPARATOR;
+		}
+	}
+}
+
+
+/**
+ * Strip the file name from a path, keeping the directory part including its
+ * trailing separator. Both '/' and '\\' are recognized as separators.
+ * A path without any separator becomes the empty string.
+ * @warning The path should not end with '/' or '\\'.
+ * NOTE(review): a separator at index 0 is never examined by the loop, so
+ * "/file" becomes "" rather than "/" -- confirm this is intended.
+ */
+void Path::stripFileName()
+{
+	nvCheck( m_str != NULL );
+
+	// Scan backwards from the last character for a separator; index 0 is not tested.
+	int length = (int)strlen(m_str) - 1;
+	while (length > 0 && m_str[length] != '/' && m_str[length] != '\\'){
+		length--;
+	}
+	if( length ) {
+		// Cut right after the separator. For an empty string length is -1 and this writes m_str[0].
+		m_str[length+1] = 0;
+	}
+	else {
+		// No separator found after index 0: nothing but a file name.
+		m_str[0] = 0;
+	}
+}
+
+
+/// Strip the extension from a path name. The path is left untouched when the
+/// file name has no extension, or when the only '.' is the first character.
+void Path::stripExtension()
+{
+	nvCheck( m_str != NULL );
+
+	// Scan backwards from the last character for the '.' that starts the extension.
+	int length = (int)strlen(m_str) - 1;
+	while( length > 0 && m_str[length] != '.' ) {
+		length--;
+		if( m_str[length] == NV_PATH_SEPARATOR ) {
+			return;		// no extension
+		}
+	}
+	// length is -1 for an empty string; testing for non-zero here used to write m_str[-1].
+	if( length > 0 ) {
+		m_str[length] = 0;
+	}
+}
+
+
+/// Return the path separator selected at compile time (NV_PATH_SEPARATOR).
+// static
+char Path::separator()
+{
+	const char sep = NV_PATH_SEPARATOR;
+	return sep;
+}
+
+/// Return the part of str that follows the last separator, or str itself when
+/// it contains no separator. The result points into str.
+// static
+const char * Path::fileName(const char * str)
+{
+	nvCheck( str != NULL );
+
+	// Walk back from the terminator until the preceding character is a separator.
+	const char * name = str + (int)strlen(str);
+	while( name != str && name[-1] != separator() ) {
+		--name;
+	}
+
+	return name;
+}
+
+/// Return the extension of str starting at its last '.', or a pointer to the
+/// terminator when the file name has no extension. A '.' at index 0 does not
+/// count as an extension. The result points into str.
+// static
+const char * Path::extension(const char * str)
+{
+	nvCheck( str != NULL );
+
+	const int end = (int)strlen( str );
+
+	for( int pos = end; pos > 0; ) {
+		if( str[pos] == '.' ) {
+			return &str[pos];
+		}
+		--pos;
+		if( str[pos] == separator() ) {
+			return &str[end];		// no extension
+		}
+	}
+
+	// Reached the first character without finding an extension.
+	return &str[end];
+}
+
+
+// static
+// Shared null string, built by the private null_t constructor. Default
+// constructed strings and strings set from NULL reference its data.
+String String::s_null(String::null);
+
+/// Return a string built from this string's characters.
+String String::clone() const
+{
+	return String(data);
+}
+
+/// Point this string at a new copy of str, or at the shared null string when
+/// str is NULL, and take a reference on it.
+void String::setString(const char * str)
+{
+	if( str != NULL ) {
+		allocString( str );
+	}
+	else {
+		data = s_null.data;
+	}
+	addRef();
+}
+
+/// Point this string at a new allocation made from str and length, and take a
+/// reference on it.
+void String::setString(const char * str, int length)
+{
+	nvDebugCheck(str != NULL);
+
+	allocString(str, length);
+	addRef();
+}
+
+/// Point this string at a copy of the builder's contents, or at the shared
+/// null string when the builder has no storage, and take a reference on it.
+void String::setString(const StringBuilder & str)
+{
+	if( str.str() != NULL ) {
+		allocString(str);
+	}
+	else {
+		data = s_null.data;
+	}
+	addRef();
+}
--- a/src/nvcore/StrLib.h
+++ b/src/nvcore/StrLib.h
@ -0,0 +1,348 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_CORE_STRING_H
+#define NV_CORE_STRING_H
+
+#include <nvcore/nvcore.h>
+#include <nvcore/Containers.h>	// swap
+
+#include <string.h> // strlen, strcmp, etc.
+
+
+namespace nv
+{
+
+	uint strHash(const char * str, uint h) NV_PURE;
+
+	/// String hash based on Bernstein's hash.
+	/// @param data null terminated string to hash.
+	/// @param h initial hash value; pass a previous result to chain hashes.
+	inline uint strHash(const char * data, uint h = 5381)
+	{
+		// The index must start at zero; it used to be left uninitialized.
+		uint i = 0;
+		while(data[i] != 0) {
+			h = (33 * h) ^ uint(data[i]);
+			i++;
+		}
+		return h;
+	}
+	
+	/// Hash functor specialization for C strings: hashes the characters with strHash.
+	template <> struct hash<const char *> {
+		uint operator()(const char * str) const
+		{
+			const uint h = strHash(str);
+			return h;
+		}
+	};
+	
+	NVCORE_API int strCaseCmp(const char * s1, const char * s2) NV_PURE;
+	NVCORE_API int strCmp(const char * s1, const char * s2) NV_PURE;
+	NVCORE_API void strCpy(char * dst, int size, const char * src);
+	NVCORE_API void strCat(char * dst, int size, const char * src);
+
+	NVCORE_API bool strMatch(const char * str, const char * pat) NV_PURE;
+
+	
+	/// String builder. Owns a growable character buffer of m_size characters
+	/// and gives it value semantics.
+	class StringBuilder
+	{
+	public:
+	
+		NVCORE_API StringBuilder();
+		NVCORE_API explicit StringBuilder( int size_hint );
+		NVCORE_API StringBuilder( const StringBuilder & );
+		NVCORE_API StringBuilder( int size_hint, const StringBuilder & );	
+		NVCORE_API StringBuilder( const char * format, ... ) __attribute__((format (printf, 2, 3)));
+		NVCORE_API StringBuilder( int size_hint, const char * format, ... ) __attribute__((format (printf, 3, 4)));
+	
+		NVCORE_API ~StringBuilder();
+	
+		// Replace the contents with a formatted string.
+		NVCORE_API StringBuilder & format( const char * format, ... ) __attribute__((format (printf, 2, 3)));
+		NVCORE_API StringBuilder & format( const char * format, va_list arg );
+	
+		// Append to the current contents.
+		NVCORE_API StringBuilder & append( const char * str );
+		NVCORE_API StringBuilder & appendFormat( const char * format, ... ) __attribute__((format (printf, 2, 3)));
+		NVCORE_API StringBuilder & appendFormat( const char * format, va_list arg );
+	
+		// Replace the contents with the textual form of a number.
+		NVCORE_API StringBuilder & number( int i, int base = 10 );
+		NVCORE_API StringBuilder & number( uint i, int base = 10 );
+	
+		NVCORE_API StringBuilder & reserve( uint size_hint );
+		NVCORE_API StringBuilder & copy( const char * str );
+		NVCORE_API StringBuilder & copy( const StringBuilder & str );
+		
+		NVCORE_API StringBuilder & toLower();
+		NVCORE_API StringBuilder & toUpper();
+		
+		NVCORE_API void reset();
+
+		/// True when no buffer is allocated (m_size is zero). An allocated
+		/// buffer holding "" is not reported as empty.
+		NVCORE_API bool empty() const { return m_size == 0; }
+	
+		// const char * accessors
+		operator const char * () const { return m_str; }
+		operator char * () { return m_str; }
+		const char * str() const { return m_str; }
+		char * str() { return m_str; }
+	
+		/// Implement value semantics.
+		StringBuilder & operator=( const StringBuilder & s ) {
+			return copy(s);
+		}
+	
+		/// Equal operator. True when both builders hold the same characters.
+		bool operator==( const StringBuilder & s ) const {
+			nvCheck(m_str != NULL);
+			nvCheck(s.m_str != NULL);
+			// strcmp returns 0 for equal strings; the test used to be inverted.
+			return strcmp(s.m_str, m_str) == 0;
+		}
+		
+		/// Return the exact length.
+		uint length() const { nvCheck(m_str != NULL); return uint(strlen(m_str)); }
+	
+		/// Return the size of the string container.
+		uint capacity() const { nvCheck(m_str != NULL); return m_size; }
+	
+		/// Return the hash of the string.
+		uint hash() const { nvCheck(m_str != NULL); return strHash(m_str); }
+	
+		///	Swap strings.
+		friend void swap(StringBuilder & a, StringBuilder & b) {
+			nv::swap(a.m_size, b.m_size);
+			nv::swap(a.m_str, b.m_str);
+		}
+	
+		static char separator();
+		
+	protected:
+		
+		/// Size of the string container.
+		uint m_size;
+		
+		/// String.
+		char * m_str;
+		
+	};
+	
+
+	/// Path string. A StringBuilder with helpers to query and edit the
+	/// file name, extension and separators of the path it holds.
+	class Path : public StringBuilder
+	{
+	public:
+		// Constructors mirror the ones of StringBuilder.
+		Path() : StringBuilder() {}
+		explicit Path(int size_hint) : StringBuilder(size_hint) {}
+		Path(const StringBuilder & str) : StringBuilder(str) {}
+		Path(int size_hint, const StringBuilder & str) : StringBuilder(size_hint, str) {}	
+		NVCORE_API Path(const char * format, ...) __attribute__((format (printf, 2, 3)));
+		NVCORE_API Path(int size_hint, const char * format, ...) __attribute__((format (printf, 3, 4)));
+		
+		
+		// Queries on this path; the results point into this path's buffer.
+		NVCORE_API const char * fileName() const;
+		NVCORE_API const char * extension() const;
+		
+		// Convert the separators of the other platform to the native one, in place.
+		NVCORE_API void translatePath();
+		
+		// In-place truncation of the path.
+		NVCORE_API void stripFileName();
+		NVCORE_API void stripExtension();
+		
+		// statics
+		NVCORE_API static char separator();
+		NVCORE_API static const char * fileName(const char *);
+		NVCORE_API static const char * extension(const char *);
+
+	};
+	
+	
+	/// String class. Immutable, reference counted string: copies share one
+	/// allocation. The 16 bit reference count is stored in the two bytes that
+	/// precede the characters pointed to by data.
+	class String
+	{
+	public:
+
+		/// Constructs a null string. @sa isNull()
+		String()
+		{
+			data = s_null.data;
+			addRef();
+		}
+
+		/// Constructs a shared copy of str.
+		String(const String & str)
+		{
+			data = str.data;
+			addRef();
+		}
+
+		/// Constructs a shared string from a standard string.
+		String(const char * str)
+		{
+			setString(str);
+		}
+
+		/// Constructs a shared string from at most length characters of a standard string.
+		String(const char * str, int length)
+		{
+			setString(str, length);
+		}
+
+		/// Constructs a shared string from a StringBuilder.
+		String(const StringBuilder & str)
+		{
+			setString(str);
+		}
+
+		/// Dtor.
+		~String()
+		{
+			nvDebugCheck(data != NULL);
+			release();
+		}
+
+		NVCORE_API String clone() const;
+	
+		/// Release the current string and allocate a new one.
+		const String & operator=( const char * str )
+		{
+			// Build the new string before dropping the old one, so that str
+			// may point into this string's own characters.
+			String tmp(str);
+			const char * old = data;
+			data = tmp.data;
+			tmp.data = old;		// released by the destructor of tmp
+			return *this;
+		}
+
+		/// Release the current string and allocate a new one.
+		const String & operator=( const StringBuilder & str )
+		{
+			// Same order as above: create first, release afterwards.
+			String tmp(str);
+			const char * old = data;
+			data = tmp.data;
+			tmp.data = old;		// released by the destructor of tmp
+			return *this;
+		}
+	
+		/// Implement value semantics.
+		String & operator=( const String & str )
+		{
+			// Nothing to do when both already share the same characters. This
+			// also covers self assignment, which used to free the characters
+			// before taking a new reference on them.
+			if( str.data != data ) {
+				release();
+				data = str.data;
+				addRef();
+			}
+			return *this;
+		}
+
+		/// Equal operator.
+		bool operator==( const String & str ) const
+		{
+			nvDebugCheck(data != NULL);
+			nvDebugCheck(str.data != NULL);
+			if( str.data == data ) {
+				return true;
+			}
+			return strcmp(data, str.data) == 0;
+		}
+
+		/// Equal operator.
+		bool operator==( const char * str ) const
+		{
+			nvDebugCheck(data != NULL);
+			nvCheck(str != NULL);	// Use isNull!
+			return strcmp(data, str) == 0;
+		}
+
+		/// Not equal operator.
+		bool operator!=( const String & str ) const
+		{
+			nvDebugCheck(data != NULL);
+			nvDebugCheck(str.data != NULL);
+			if( str.data == data ) {
+				return false;
+			}
+			return strcmp(data, str.data) != 0;
+		}
+	
+		/// Not equal operator.
+		bool operator!=( const char * str ) const
+		{
+			nvDebugCheck(data != NULL);
+			nvCheck(str != NULL);	// Use isNull!
+			return strcmp(data, str) != 0;
+		}
+	
+		/// Returns true if this string is the null string.
+		bool isNull() const { nvDebugCheck(data != NULL); return data == s_null.data; }
+	
+		/// Return the exact length.
+		uint length() const { nvDebugCheck(data != NULL); return uint(strlen(data)); }
+	
+		/// Return the hash of the string.
+		uint hash() const { nvDebugCheck(data != NULL); return strHash(data); }
+	
+		/// const char * cast operator.
+		operator const char * () const { nvDebugCheck(data != NULL); return data; }
+	
+		/// Get string pointer.
+		const char * str() const { nvDebugCheck(data != NULL); return data; }
+	
+
+	private:
+
+		enum null_t { null };
+		
+		// Private constructor for null string.
+		String(null_t) {
+			setString("");
+		}
+
+		// Add reference count.
+		void addRef() {
+			nvDebugCheck(data != NULL);
+			setRefCount(getRefCount() + 1);
+		}
+		
+		// Decrease reference count and free the allocation when it drops to zero.
+		void release() {
+			nvDebugCheck(data != NULL);
+
+			const uint16 count = getRefCount();
+			setRefCount(count - 1);
+			if( count - 1 == 0 ) {
+				// The allocation starts two bytes before the characters.
+				mem::free(data - 2);
+				data = NULL;
+			}
+		}
+		
+		// Read the reference count stored in front of the characters.
+		uint16 getRefCount() const {
+			return *reinterpret_cast<const uint16 *>(data - 2);
+		}
+		
+		// Write the reference count stored in front of the characters.
+		void setRefCount(uint16 count) {
+			nvCheck(count < 0xFFFF);
+			*reinterpret_cast<uint16 *>(const_cast<char *>(data - 2)) = uint16(count);
+		}
+		
+		// Point data at the characters of an allocation, skipping the reference count.
+		void setData(const char * str) {
+			data = str + 2;
+		}
+		
+		// Allocate a copy of the whole string with a reference count of zero.
+		void allocString(const char * str)
+		{
+			allocString(str, (int)strlen(str));
+		}
+
+		// Allocate a copy of at most len characters of str with a reference count of zero.
+		void allocString(const char * str, int len)
+		{
+			const char * ptr = static_cast<const char *>(mem::malloc(2 + len + 1));
+	
+			setData( ptr );				
+			setRefCount( 0 );
+			
+			// Copy string. The copy is bounded by len: str may be longer than
+			// len when only a prefix is wanted, and the buffer only has room
+			// for len characters plus the terminator.
+			char * dst = const_cast<char *>(data);
+			int i = 0;
+			for( ; i < len && str[i] != '\0'; i++ ) {
+				dst[i] = str[i];
+			}
+			dst[i] = '\0';
+		}
+	
+		NVCORE_API void setString(const char * str);
+		NVCORE_API void setString(const char * str, int length);
+		NVCORE_API void setString(const StringBuilder & str);	
+	
+		///	Swap strings.
+		friend void swap(String & a, String & b) {
+			nv::swap(a.data, b.data);
+		}
+	
+	private:
+
+		NVCORE_API static String s_null;
+
+		// Characters of the string; the reference count lives at data - 2.
+		const char * data;
+		
+	};
+
+} // nv namespace
+
+#endif // NV_CORE_STRING_H
--- a/src/nvcore/Stream.h
+++ b/src/nvcore/Stream.h
@ -0,0 +1,165 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NVCORE_STREAM_H
+#define NVCORE_STREAM_H
+
+#include <nvcore/nvcore.h>
+#include <nvcore/Debug.h>
+
+namespace nv
+{
+
+/**
+ * Base stream class. Declares the interface shared by input and output
+ * streams. The operator<< overloads below call serialize(), so the same
+ * operator is used whether the concrete stream loads or saves. Multi-byte
+ * values are transferred in the byte order selected with setByteOrder().
+ */
+class Stream {
+public:
+
+	enum ByteOrder {
+		LittleEndian = false,
+		BigEndian = true,
+	};
+
+	/// Get the byte order of the system.
+	static ByteOrder getSystemByteOrder() { 
+#	if NV_LITTLE_ENDIAN
+		return LittleEndian;
+#	else
+		return BigEndian;
+#	endif
+	}
+
+
+	/// Ctor. The stream byte order defaults to little endian.
+	Stream() : m_byteOrder(LittleEndian) { }
+
+	/// Virtual destructor.
+	virtual ~Stream() {}
+
+	/// Set byte order.
+	void setByteOrder(ByteOrder bo) { m_byteOrder = bo; }
+	
+	/// Get byte order.
+	ByteOrder byteOrder() const { return m_byteOrder; }
+
+	
+	/// Serialize the given data.
+	virtual void serialize( void * data, int len ) = 0;
+
+	/// Move to the given position in the archive.
+	virtual void seek( int pos ) = 0;
+
+	/// Return the current position in the archive.
+	virtual int tell() const = 0;
+
+	/// Return the current size of the archive.
+	virtual int size() const = 0;
+
+	/// Determine if there has been any error.
+	virtual bool isError() const = 0;
+	
+	/// Return true if the stream is at the end.
+	virtual bool isAtEnd() const = 0;
+
+	/// Return true if the stream is seekable.
+	virtual bool isSeekable() const = 0;
+	
+	/// Return true if this is an input stream.
+	virtual bool isLoading() const = 0;
+
+	/// Return true if this is an output stream.
+	virtual bool isSaving() const = 0;
+
+	
+	// friends
+	// Serialization operators. Single byte types go straight to serialize();
+	// wider types go through byteOrderSerialize().
+	friend Stream & operator<<( Stream & s, bool & c ) {
+#	if NV_OS_DARWIN
+		// This branch expects a four byte bool and transfers it as one byte.
+		nvStaticCheck(sizeof(bool) == 4);
+		uint8 b = c ? 1 : 0;
+		s.serialize( &b, 1 );
+		c = (b == 1);
+#	else
+		nvStaticCheck(sizeof(bool) == 1);
+		s.serialize( &c, 1 );
+#	endif
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, char & c ) {
+		nvStaticCheck(sizeof(char) == 1);
+		s.serialize( &c, 1 );
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, uint8 & c ) {
+		nvStaticCheck(sizeof(uint8) == 1);
+		s.serialize( &c, 1 );
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, int8 & c ) {
+		nvStaticCheck(sizeof(int8) == 1);
+		s.serialize( &c, 1 );
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, uint16 & c ) {
+		nvStaticCheck(sizeof(uint16) == 2);
+		s.byteOrderSerialize( &c, 2 );
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, int16 & c ) {
+		nvStaticCheck(sizeof(int16) == 2);
+		s.byteOrderSerialize( &c, 2 );
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, uint32 & c ) {
+		nvStaticCheck(sizeof(uint32) == 4);
+		s.byteOrderSerialize( &c, 4 );
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, int32 & c ) {
+		nvStaticCheck(sizeof(int32) == 4);
+		s.byteOrderSerialize( &c, 4 );
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, uint64 & c ) {
+		nvStaticCheck(sizeof(uint64) == 8);
+		s.byteOrderSerialize( &c, 8 );
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, int64 & c ) {
+		nvStaticCheck(sizeof(int64) == 8);
+		s.byteOrderSerialize( &c, 8 );
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, float & c ) {
+		nvStaticCheck(sizeof(float) == 4);
+		s.byteOrderSerialize( &c, 4 );
+		return s;
+	}
+	friend Stream & operator<<( Stream & s, double & c ) {
+		nvStaticCheck(sizeof(double) == 8);
+		s.byteOrderSerialize( &c, 8 );
+		return s;
+	}
+
+protected:
+
+	/** Serialize in the stream byte order. */
+	Stream & byteOrderSerialize( void * v, int len ) {
+		if( m_byteOrder == getSystemByteOrder() ) {
+			// Same order as the system: transfer the value in one call.
+			serialize( v, len );
+		}
+		else {
+			// Opposite order: transfer one byte at a time, last byte first.
+			for( int i=len-1; i>=0; i-- ) {
+				serialize( (uint8 *)v + i, 1 );
+			}
+		}
+		return *this;
+	}
+
+
+private:
+
+	/// Byte order used for multi-byte values.
+	ByteOrder m_byteOrder;
+
+};
+
+} // nv namespace
+
+#endif // NVCORE_STREAM_H
--- a/src/nvcore/TextReader.cpp
+++ b/src/nvcore/TextReader.cpp
@ -0,0 +1,85 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/TextReader.h>
+
+using namespace nv;
+
+/// Peek next character.
+/// Returns 0 when the stream is at its end; otherwise reads one character
+/// and seeks back to where the stream was, so the position is unchanged.
+char TextReader::peek()
+{
+	nvDebugCheck(m_stream != NULL);
+	nvDebugCheck(m_stream->isSeekable());
+
+	char next = 0;
+	if (!m_stream->isAtEnd()) {
+		const uint start = m_stream->tell();
+		m_stream->serialize(&next, 1);
+		m_stream->seek(start);
+	}
+	return next;
+}
+
+/// Read a single char.
+/// Returns 0 when the stream is already at its end.
+char TextReader::read()
+{
+	nvDebugCheck(m_stream != NULL);
+
+	char next = 0;
+	if (!m_stream->isAtEnd()) {
+		m_stream->serialize(&next, 1);
+	}
+	return next;
+}
+
+/// Read from the current location to the end of the stream.
+/// Returns a null terminated string stored in the reader's internal buffer;
+/// the buffer is reused by the next readToEnd() or readLine() call.
+const char * TextReader::readToEnd()
+{
+	nvDebugCheck(m_stream != NULL);
+
+	// Only the bytes after the current position are left to read. Using the
+	// full stream size would read past the end whenever some text has
+	// already been consumed.
+	const int total = int(m_stream->size());
+	const int consumed = int(m_stream->tell());
+	const int size = (total > consumed) ? (total - consumed) : 0;
+	
+	m_text.clear();
+	
+	// Reserve one extra element for the terminator pushed below.
+	m_text.reserve(size + 1);
+	m_text.resize(size);
+	
+	if (size > 0) {
+		m_stream->serialize(m_text.unsecureBuffer(), size);
+	}
+	m_text.pushBack('\0');
+	
+	return m_text.buffer();
+}
+
+/// Read from the current location to the end of the line.
+/// Returns NULL when the stream is already at its end. Otherwise returns the
+/// line without its terminator ("\n", "\r" or "\r\n"), null terminated, in
+/// the reader's internal buffer.
+const char * TextReader::readLine()
+{
+	m_text.clear();
+
+	if (m_stream->isAtEnd()) {
+		return NULL;
+	}
+
+	for (;;) {
+		const char c = read();
+
+		if (c == '\r') {
+			// Treat "\r\n" as a single line terminator.
+			if( peek() == '\n' ) {
+				read();
+			}
+			break;
+		}
+		if (c == '\n' || c == 0) {
+			break;
+		}
+
+		m_text.pushBack(c);
+	}
+
+	m_text.pushBack('\0');
+	return m_text.buffer();
+}
+
+
--- a/src/nvcore/TextReader.h
+++ b/src/nvcore/TextReader.h
@ -0,0 +1,38 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NVCORE_TEXTREADER_H
+#define NVCORE_TEXTREADER_H
+
+#include <nvcore/nvcore.h>
+#include <nvcore/Stream.h>
+#include <nvcore/Containers.h>
+
+namespace nv
+{
+
+/// Text reader.
+/// Reads characters and lines of text from a loading Stream. The stream is
+/// referenced by pointer only; the reader keeps an internal character buffer
+/// for the strings it returns.
+class NVCORE_CLASS TextReader {
+public:
+	
+	/// Ctor. The stream must be non-NULL and in loading mode.
+	TextReader(Stream * stream) : m_stream(stream), m_text(512) {
+		nvCheck(stream != NULL);
+		nvCheck(stream->isLoading());
+	}
+	
+	/// Return the next character without consuming it.
+	char peek();
+	/// Consume and return the next character.
+	char read();
+	
+	/// Read up to the end of the stream into the internal buffer.
+	const char *readToEnd();
+
+	// Returns a temporary string.
+	const char * readLine(); 
+
+private:
+	/// Source stream (not owned as far as this class shows; it is never deleted here).
+	Stream * m_stream;
+	/// Buffer holding the most recently returned string.
+	Array<char> m_text;
+};
+
+} // nv namespace
+
+#endif // NVCORE_TEXTREADER_H
--- a/src/nvcore/TextWriter.h
+++ b/src/nvcore/TextWriter.h
@ -0,0 +1,44 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NVCORE_TEXTWRITER_H
+#define NVCORE_TEXTWRITER_H
+
+#include <nvcore/nvcore.h>
+#include <nvcore/Stream.h>
+#include <nvcore/StrLib.h>
+
+// @@ NOT IMPLEMENTED !!!
+
+
+namespace nv
+{
+
+	/// Text writer.
+	/// Declares formatted text output on top of a saving Stream. The write
+	/// methods are only declared here (see the "NOT IMPLEMENTED" note above).
+	class NVCORE_CLASS TextWriter
+	{
+	public:
+	
+		/// Ctor. The stream must be non-NULL and in saving mode.
+		TextWriter(Stream * s) : s(s), str(1024) {
+			nvDebugCheck(s != NULL);
+			// Stream queries are lower-camel (isLoading, isSeekable, isAtEnd).
+			nvCheck(s->isSaving());
+		}
+	
+		/// Write @a len characters of @a str.
+		void write( const char * str, uint len );
+		/// Write printf-style formatted text.
+		/// NOTE(review): a call such as write("%d", 5) resolves to the
+		/// (str, len) overload, because an int-to-uint conversion is preferred
+		/// over an ellipsis match. Confirm this is intended before implementing.
+		void write( const char * format, ... ) __attribute__((format (printf, 2, 3)));
+		/// Write printf-style formatted text from an argument list.
+		void write( const char * format, va_list arg );
+	
+	
+	private:
+	
+		/// Destination stream.
+		Stream * s;
+		
+		// Temporary string.
+		StringBuilder str;
+	
+	};
+
+} // nv namespace
+
+
+#endif // NVCORE_TEXTWRITER_H
--- a/src/nvcore/Tokenizer.cpp
+++ b/src/nvcore/Tokenizer.cpp
@ -0,0 +1,245 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/Tokenizer.h>
+#include <nvcore/StrLib.h>
+
+#include <stdio.h> // vsscanf
+#include <stdarg.h>	// va_list
+#include <stdlib.h>	// atof, atoi
+
+#if NV_CC_MSVC
+/* vsscanf for Win32
+ * Written 5/2003 by <mgix@mgix.com>
+ * This code is in the Public Domain
+ */
+
+#include <malloc.h> // alloca
+//#include <string.h>
+
+// Replacement vsscanf for MSVC.
+// Builds a fake argument stack (buffer, format, then the caller's variadic
+// arguments) and calls the C runtime sscanf / sscanf_s on it via inline
+// assembly. It uses the esp/eax registers, so it is x86 (32-bit) only, and it
+// copies the arguments as an array of pointer-sized slots read straight from
+// argPtr.
+// NOTE(review): with _MSC_VER >= 1400 this calls sscanf_s, which expects extra
+// buffer-size arguments for some conversions; confirm callers' formats.
+// Returns whatever the called function leaves in eax (the sscanf result).
+static int vsscanf(const char * buffer, const char * format, va_list argPtr)
+{
+	// Get an upper bound for the # of args: every '%' that is not followed
+	// by '*' or '%' is counted.
+	size_t count = 0;
+	const char *p = format;
+	while(1) {
+		char c = *(p++);
+		if(c==0) break;
+		if(c=='%' && (p[0]!='*' && p[0]!='%')) ++count;
+	}
+
+	// Make a local stack: two slots for buffer and format, plus one per argument.
+	size_t stackSize = (2+count)*sizeof(void*);
+	void **newStack = (void**)alloca(stackSize);
+
+	// Fill local stack the way sscanf likes it
+	newStack[0] = (void*)buffer;
+	newStack[1] = (void*)format;
+	memcpy(newStack+2, argPtr, count*sizeof(void*));
+
+	// @@ Use: CALL DWORD PTR [sscanf]
+	
+	// Warp into system sscanf with new stack: esp is pointed at the fake
+	// stack for the duration of the call and restored right after it.
+	int result;
+	void *savedESP;
+	__asm
+	{
+		mov     savedESP, esp
+		mov     esp, newStack
+#if _MSC_VER >= 1400
+		call	DWORD PTR [sscanf_s]
+#else
+		call	DWORD PTR [sscanf]
+#endif
+		mov     esp, savedESP
+		mov     result, eax
+	}
+	return result;
+}
+
+/*
+int hacky_vsscanf(const char *str, int count, const char *format, va_list ap) {
+	nvCheck(count < 8)
+	if (count == 0) {
+	}
+	void * arg0 = va_arg(ap, void *);
+	void * arg1 = va_arg(ap, void *);
+	void * arg2 = va_arg(ap, void *);
+	void * arg3 = va_arg(ap, void *);
+	void * arg4 = va_arg(ap, void *);
+	void * arg5 = va_arg(ap, void *);
+	void * arg6 = va_arg(ap, void *);
+	void * arg7 = va_arg(ap, void *);
+	return sscanf(str, format, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
+}
+*/
+
+#endif
+
+using namespace nv;
+
+/// Construct an empty token: zero length, pointing at an empty string literal.
+Token::Token()
+	: m_str("")
+	, m_len(0)
+{
+}
+
+/// Copy a token. Only the pointer and the length are copied; the characters
+/// themselves are shared with @a token.
+Token::Token(const Token & token)
+	: m_str(token.m_str)
+	, m_len(token.m_len)
+{
+}
+
+/// Construct a token that refers to @a len characters starting at @a str.
+/// The characters are not copied.
+Token::Token(const char * str, int len)
+	: m_str(str)
+	, m_len(len)
+{
+}
+
+/// Compare the token text with a null terminated string.
+/// Both the first m_len characters and the length must match; comparing only
+/// the first m_len characters would make the token "fo" equal to "foo" and an
+/// empty token equal to every string.
+bool Token::operator==(const char * str) const
+{
+	// When the first m_len characters match, str has at least m_len
+	// characters, so str[m_len] is safe to read.
+	return strncmp(m_str, str, m_len) == 0 && str[m_len] == '\0';
+}
+/// Return true when the token text differs from a null terminated string,
+/// either in its first m_len characters or in its length.
+bool Token::operator!=(const char * str) const
+{
+	// str[m_len] is only evaluated when the first m_len characters match,
+	// which guarantees that str is at least m_len characters long.
+	return strncmp(m_str, str, m_len) != 0 || str[m_len] != '\0';
+}
+
+/// Return true when the token is empty (zero length), as a default
+/// constructed token is.
+bool Token::isNull()
+{
+	// The previous test (m_len != 0) reported the opposite.
+	return m_len == 0;
+}
+
+/// Convert the text at the start of the token to a float using atof.
+/// The conversion is not bounded by the token length; atof stops at the first
+/// character that cannot be part of a number.
+float Token::toFloat() const
+{
+	const double parsed = atof(m_str);
+	return float(parsed);
+}
+
+/// Convert the text at the start of the token to an int using atoi.
+/// The conversion is not bounded by the token length; atoi stops at the first
+/// character that cannot be part of a number.
+int Token::toInt() const
+{
+	const int parsed = atoi(m_str);
+	return parsed;
+}
+
+/// Convert the text at the start of the token to an unsigned int.
+/// Uses strtoul (base 10) so that values above INT_MAX, which atoi cannot
+/// represent, are converted correctly. The conversion is not bounded by the
+/// token length; it stops at the first non-digit character.
+uint Token::toUnsignedInt() const
+{
+	return uint(strtoul(m_str, NULL, 10));
+}
+
+/// Build a String from the m_len characters of the token.
+String Token::toString() const
+{
+	String text(m_str, m_len);
+	return text;
+}
+
+/// Parse the token text with a scanf-style format.
+/// @param format scanf-style format string.
+/// @param count number of conversions expected to succeed.
+/// @return true when vsscanf reports exactly @a count converted items.
+/// Scanning starts at the token's first character and is not bounded by the
+/// token length.
+bool Token::parse(const char * format, int count, ...) const
+{
+	va_list args;
+	va_start(args, count);
+	const int converted = vsscanf(m_str, format, args);
+	va_end(args);
+
+	return converted == count;
+}
+
+
+/// Create a tokenizer that reads lines from @a stream.
+/// Default delimiters are "{}()=" and default spaces are space and tab.
+/// The current line starts out as an empty string, so asking for a token
+/// before the first nextLine() finds nothing instead of reading through an
+/// uninitialized pointer.
+Tokenizer::Tokenizer(Stream * stream) : 
+	m_reader(stream), m_line(""), m_lineNumber(0), m_columnNumber(0), m_delimiters("{}()="), m_spaces(" \t")
+{
+}
+
+/// Advance to the next line and read its first token.
+/// @param skipEmptyLines when true, lines without any token are skipped.
+/// @return false when no more lines can be read, true otherwise.
+bool Tokenizer::nextLine(bool skipEmptyLines /*= true*/)
+{
+	for (;;) {
+		if (!readLine()) {
+			return false;
+		}
+
+		// The first token of the line is always read; a line without
+		// tokens is only accepted when empty lines are not skipped.
+		if (readToken() || !skipEmptyLines) {
+			return true;
+		}
+	}
+}
+
+/// Read the next token of the current line.
+/// @param skipEndOfLine when true and the current line has no more tokens,
+///        continue with the first token of the next non-empty line.
+/// @return true when a token was read.
+bool Tokenizer::nextToken(bool skipEndOfLine /*= false*/)
+{
+	if (readToken()) {
+		return true;
+	}
+
+	// End of the current line reached.
+	return skipEndOfLine && nextLine(true);
+}
+	
+/// Read the next token of the current line into m_token.
+/// A delimiter character forms a one-character token; any other token runs up
+/// to the next delimiter, space or end of line.
+/// @return false when there is no current line or no token is left on it.
+bool Tokenizer::readToken()
+{
+	// readLine() stores NULL in m_line once the stream is exhausted; do not
+	// dereference it.
+	if (m_line == NULL) {
+		return false;
+	}
+
+	skipSpaces();
+	
+	const char * begin = m_line + m_columnNumber;
+	
+	if (*begin == '\0') {
+		return false;
+	}
+	
+	char c = readChar();
+	if (isDelimiter(c)) {
+		m_token = Token(begin, 1);
+		return true;
+	}
+	
+	// @@ Add support for quoted tokens "", ''
+	
+	int len = 0;
+	while (!isDelimiter(c) && !isSpace(c) && c != '\0') {
+		c = readChar();
+		len++;
+	}
+	// The loop consumed the character that ended the token; put it back.
+	m_columnNumber--;
+	
+	m_token = Token(begin, len);
+	
+	return true;
+}
+
+/// Return the character at the current column and advance the column.
+char Tokenizer::readChar()
+{
+	const char current = m_line[m_columnNumber];
+	m_columnNumber++;
+	return current;
+}
+
+/// Fetch the next line from the reader, bump the line number and reset the
+/// column. Returns false when the reader has no more lines.
+bool Tokenizer::readLine()
+{
+	++m_lineNumber;
+	m_columnNumber = 0;
+
+	m_line = m_reader.readLine();
+	if (m_line == NULL) {
+		return false;
+	}
+	return true;
+}
+
+/// Advance the column to the first character of the current line that is not
+/// a space character.
+void Tokenizer::skipSpaces()
+{
+	while (isSpace(m_line[m_columnNumber])) {
+		m_columnNumber++;
+	}
+}
+
+/// Return true when @a c is one of the characters of the spaces set.
+/// The terminating '\0' of the set never matches.
+bool Tokenizer::isSpace(char c)
+{
+	for (const char * s = m_spaces; *s != '\0'; ++s) {
+		if (*s == c) {
+			return true;
+		}
+	}
+	return false;
+}
+
+/// Return true when @a c is one of the characters of the delimiters set.
+/// The terminating '\0' of the set never matches.
+bool Tokenizer::isDelimiter(char c)
+{
+	for (const char * d = m_delimiters; *d != '\0'; ++d) {
+		if (*d == c) {
+			return true;
+		}
+	}
+	return false;
+}
+
--- a/src/nvcore/Tokenizer.h
+++ b/src/nvcore/Tokenizer.h
@ -0,0 +1,95 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_CORE_TOKENIZER_H
+#define NV_CORE_TOKENIZER_H
+
+#include <nvcore/nvcore.h>
+#include <nvcore/Stream.h>
+#include <nvcore/TextReader.h>
+#include <nvcore/StrLib.h>
+
+namespace nv
+{
+	/// A token produced by the Tokenizer.
+	/// Stores a pointer to the first character and a length; the characters
+	/// are not copied, so the token refers to memory held elsewhere.
+	class NVCORE_CLASS Token
+	{
+	public:
+		/// Create an empty token.
+		Token();
+		/// Copy the pointer and length of another token.
+		Token(const Token & token);
+		/// Create a token referring to @a len characters starting at @a str.
+		Token(const char * str, int len);		
+		
+		/// Compare the token text against a null terminated string.
+		bool operator==(const char * str) const;
+		bool operator!=(const char * str) const;
+
+		bool isNull();
+		
+		/// Numeric conversions of the text starting at the token.
+		float toFloat() const;
+		int toInt() const;
+		uint toUnsignedInt() const;
+		/// Copy the token text into a String.
+		String toString() const;
+		
+		/// Parse the token with a scanf-style format; @a count is the number
+		/// of conversions that must succeed for the call to return true.
+		bool parse(const char * format, int count, ...) const __attribute__((format (scanf, 2, 4)));
+		
+	private:
+		/// First character of the token (not owned).
+		const char * m_str;
+		/// Number of characters in the token.
+		int m_len;
+	};
+	
+	/// Exception thrown by the tokenizer.
+	/// Carries the line and column it was constructed with.
+	class TokenizerException
+	{
+	public:
+		/// Record the position the exception refers to.
+		TokenizerException(int line, int column)
+			: m_line(line)
+			, m_column(column)
+		{
+		}
+		
+		/// Line passed at construction.
+		int line() const
+		{
+			return m_line;
+		}
+
+		/// Column passed at construction.
+		int column() const
+		{
+			return m_column;
+		}
+		
+	private:
+		int m_line;
+		int m_column;
+	};
+	
+	/// A simple stream tokenizer.
+	/// Reads text line by line through a TextReader and splits each line
+	/// into tokens separated by space characters; delimiter characters are
+	/// returned as one-character tokens.
+	class NVCORE_CLASS Tokenizer
+	{
+	public:
+		/// Create a tokenizer reading from @a stream.
+		Tokenizer(Stream * stream);
+		
+		/// Move to the next line and read its first token.
+		bool nextLine(bool skipEmptyLines = true);
+		/// Read the next token, optionally continuing on the following lines.
+		bool nextToken(bool skipEndOfLine = false);
+		
+		/// Last token read.
+		const Token & token() const { return m_token; }
+		
+		/// Number of lines read so far and current column in the line.
+		int lineNumber() const { return m_lineNumber; }
+		int columnNumber() const { return m_columnNumber; }
+		
+		/// Set of delimiter characters. The pointer is stored, not copied,
+		/// so the string must outlive its use by the tokenizer.
+		void setDelimiters(const char * str) { m_delimiters = str; }
+		const char * delimiters() const { return m_delimiters; }
+		
+		/// Set of space characters. The pointer is stored, not copied.
+		void setSpaces(const char * str) { m_spaces = str; }
+		const char * spaces() const { return m_spaces; }
+		
+	private:
+		char readChar();
+		bool readLine();
+		bool readToken(); 
+		void skipSpaces();
+		bool isSpace(char c);
+		bool isDelimiter(char c);
+		
+	private:
+		/// Line source.
+		TextReader m_reader;
+		/// Current line, as returned by the reader.
+		const char * m_line;
+		/// Last token read; points into the current line.
+		Token m_token;
+		
+		int m_lineNumber;
+		int m_columnNumber;
+		
+		const char * m_delimiters;
+		const char * m_spaces;
+	};
+	
+} // nv namespace
+
+
+#endif // NV_CORE_TOKENIZER_H
--- a/src/nvcore/nvcore.h
+++ b/src/nvcore/nvcore.h
@ -0,0 +1,172 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_CORE_H
+#define NV_CORE_H
+
+// cmake config
+#include <nvconfig.h>
+
+// Function linkage
+#if NVCORE_SHARED
+#ifdef NVCORE_EXPORTS
+#define NVCORE_API DLL_EXPORT
+#define NVCORE_CLASS DLL_EXPORT_CLASS
+#else
+#define NVCORE_API DLL_IMPORT
+#define NVCORE_CLASS DLL_IMPORT
+#endif
+#else // NVCORE_SHARED
+#define NVCORE_API
+#define NVCORE_CLASS
+#endif // NVCORE_SHARED
+
+
+// Platform definitions
+#include "poshlib/posh.h"
+
+// OS:
+// NV_OS_WIN32
+// NV_OS_WIN64
+// NV_OS_MINGW
+// NV_OS_CYGWIN
+// NV_OS_LINUX
+// NV_OS_UNIX
+// NV_OS_DARWIN
+
+#define NV_OS_STRING 	POSH_OS_STRING
+
+#if defined POSH_OS_LINUX
+#	define NV_OS_LINUX 1
+#	define NV_OS_UNIX 1
+#elif defined POSH_OS_CYGWIN32
+#	define NV_OS_CYGWIN 1
+#elif defined POSH_OS_MINGW
+#	define NV_OS_MINGW 1
+#	define NV_OS_WIN32 1
+#elif defined POSH_OS_OSX
+#	define NV_OS_DARWIN 1
+#	define NV_OS_UNIX 1
+#elif defined POSH_OS_UNIX
+#	define NV_OS_UNIX 1
+#elif defined POSH_OS_WIN32
+#	define NV_OS_WIN32 1
+#elif defined POSH_OS_WIN64
+#	define NV_OS_WIN64 1
+#else
+#	error "Unsupported OS"
+#endif
+
+
+// CPUs:
+// NV_CPU_X86
+// NV_CPU_X86_64
+// NV_CPU_PPC
+
+#define NV_CPU_STRING 	POSH_CPU_STRING
+
+#if defined POSH_CPU_X86_64
+#	define NV_CPU_X86_64 1
+#elif defined POSH_CPU_X86
+#	define NV_CPU_X86 1
+#elif defined POSH_CPU_PPC
+#	define NV_CPU_PPC 1
+#else
+#	error "Unsupported CPU"
+#endif
+
+
+// Compiler:
+// NV_CC_GNUC
+// NV_CC_MSVC
+// @@ NV_CC_MSVC6
+// @@ NV_CC_MSVC7
+// @@ NV_CC_MSVC8
+
+#if defined POSH_COMPILER_GCC
+#	define NV_CC_GNUC	1
+#	define NV_CC_STRING "gcc"
+#elif defined POSH_COMPILER_MSVC
+#	define NV_CC_MSVC	1
+#	define NV_CC_STRING "msvc"
+#else
+#	error "Unsupported compiler"
+#endif
+
+
+// Endianness:
+#define NV_LITTLE_ENDIAN 	POSH_LITTLE_ENDIAN
+#define NV_BIG_ENDIAN		POSH_BIG_ENDIAN
+#define NV_ENDIAN_STRING	POSH_ENDIAN_STRING
+
+
+// Version string:
+#define NV_VERSION_STRING \
+	NV_OS_STRING "/" NV_CC_STRING "/" NV_CPU_STRING"/" \
+	NV_ENDIAN_STRING"-endian - " __DATE__ "-" __TIME__
+
+
+/// Disable copy constructor and assignment operator. 
+/// @hideinitializer
+#define NV_FORBID_COPY(C) \
+    private: \
+    C( const C & ); \
+    C &operator=( const C & );
+
+
+/// Disable dynamic allocation on the heap. 
+/// See Prohibiting Heap-Based Objects in More Effective C++.
+/// @hideinitializer 
+#define NV_FORBID_HEAPALLOC() \
+	private: \
+	static void *operator new(size_t size); \
+	static void *operator new[](size_t size);
+
+// String concatenation macros.
+#define NV_STRING_JOIN2(arg1, arg2) NV_DO_STRING_JOIN2(arg1, arg2)
+#define NV_DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2
+#define NV_STRING_JOIN3(arg1, arg2, arg3) NV_DO_STRING_JOIN3(arg1, arg2, arg3)
+#define NV_DO_STRING_JOIN3(arg1, arg2, arg3) arg1 ## arg2 ## arg3
+
+// Startup initialization macro.
+// Expands to a struct, declared in an anonymous namespace, whose constructor
+// executes some_code, together with a static instance of that struct. The
+// struct and instance names are built from __LINE__, so two uses on the same
+// source line of one translation unit would produce the same names.
+#define NV_AT_STARTUP(some_code) \
+	namespace { \
+		static struct NV_STRING_JOIN2(AtStartup_, __LINE__) { \
+			NV_STRING_JOIN2(AtStartup_, __LINE__)() { some_code; } \
+		} \
+		NV_STRING_JOIN3(AtStartup_, __LINE__, Instance); \
+	};
+
+/// Tell the compiler that the parameter is intentionally unused, to suppress compiler warnings.
+/// The void cast references the parameter without modifying it, so it also
+/// works on const objects and does not trigger self-assignment warnings.
+/// @hideinitializer 
+#define NV_UNUSED(a) ((void)(a))
+
+/// Null index. @@ Move this somewhere else... This could have collisions with other definitions!
+#define NIL uint(~0)
+
+/// Null pointer.
+#ifndef NULL
+#define NULL 0
+#endif
+
+// Platform includes
+#if NV_CC_MSVC
+#	if NV_OS_WIN32
+#		include "DefsVcWin32.h"
+#	else
+#		error "MSVC: Platform not supported"
+#	endif
+#elif NV_CC_GNUC
+#	if NV_OS_LINUX
+#		include "DefsGnucLinux.h"
+#	elif NV_OS_DARWIN
+#		include "DefsGnucDarwin.h"
+#	elif NV_OS_MINGW
+#		include "DefsGnucWin32.h"
+#	elif NV_OS_CYGWIN
+#		error "GCC: Cygwin not supported"
+#	else
+#		error "GCC: Platform not supported"
+#	endif
+#endif
+
+#endif // NV_CORE_H
--- a/src/nvcore/poshlib/CMakeLists.txt
+++ b/src/nvcore/poshlib/CMakeLists.txt
@ -0,0 +1,14 @@
+
+SET(POSHLIB_SRCS
+	posh.c
+	posh.h)
+
+ADD_LIBRARY(posh STATIC ${POSHLIB_SRCS})
+
+ADD_EXECUTABLE(archtest tests/arch/archtest.c)
+TARGET_LINK_LIBRARIES(archtest posh)
+
+#ADD_EXECUTABLE(linktest tests/linktest/linktest.cpp tests/linktest/testlib.cpp)
+#TARGET_LINK_LIBRARIES(linktest posh)
+
+ADD_TEST(POSHTEST archtest)
--- a/src/nvcore/poshlib/posh.c
+++ b/src/nvcore/poshlib/posh.c
@ -0,0 +1,926 @@
+/*
+LICENSE:
+
+Copyright (c) 2004, Brian Hook
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+    * The names of this package'ss contributors contributors may not
+      be used to endorse or promote products derived from this
+      software without specific prior written permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/** 
+ @file    posh.c
+ @author  Brian Hook
+ @date    2002
+ @brief   Portable Open Source Harness primary source file
+*/
+#include "posh.h"
+
+#if !defined FORCE_DOXYGEN
+
+#if !defined POSH_NO_FLOAT
+#  define POSH_FLOAT_STRING "enabled"
+#else
+#  define POSH_FLOAT_STRING "disabled"
+#endif
+
+#if defined POSH_64BIT_INTEGER
+#  define POSH_64BIT_INTEGER_STRING "yes"
+#else
+#  define POSH_64BIT_INTEGER_STRING "no"
+#endif
+
+#if defined POSH_64BIT_POINTER
+#  define POSH_POINTER_STRING "64-bits"
+#else
+#  define POSH_POINTER_STRING "32-bits"
+#endif
+
+#if defined POSH_LITTLE_ENDIAN
+#  define IS_BIG_ENDIAN    0
+
+#  define NATIVE16  POSH_LittleU16
+#  define NATIVE32  POSH_LittleU32
+#  define NATIVE64  POSH_LittleU64
+#  define FOREIGN16 POSH_BigU16
+#  define FOREIGN32 POSH_BigU32
+#  define FOREIGN64 POSH_BigU64
+#else
+#  define IS_BIG_ENDIAN    1
+
+#  define NATIVE16  POSH_BigU16
+#  define NATIVE32  POSH_BigU32
+#  define NATIVE64  POSH_BigU64
+#  define FOREIGN16 POSH_LittleU16
+#  define FOREIGN32 POSH_LittleU32
+#  define FOREIGN64 POSH_LittleU64
+#endif /* POSH_LITTLE_ENDIAN */
+
+/* Run-time endianness probe: returns 1 when the first byte in memory of the
+   32-bit value 1 is not 1 (big endian), and 0 otherwise. */
+static 
+int 
+s_testBigEndian( void )
+{
+   union 
+   {
+      posh_byte_t c[ 4 ];
+      posh_u32_t  i;
+   } probe;
+
+   probe.i = 1;
+
+   /* the low-order byte comes first in memory on little-endian hosts */
+   return ( probe.c[ 0 ] == 1 ) ? 0 : 1;
+}
+
+/* Writes reference values to a scratch buffer with the little- and big-endian
+   write helpers and reads them back with the matching read helpers.
+   Returns 0 when every value survives the round trip, or an "*ERROR: ..."
+   message naming the first failing case. */
+static
+const char *
+s_testSerialization( void )
+{
+   posh_byte_t serbuf[ 8 ];
+   posh_u16_t  tmp16;
+   posh_u32_t  tmp32;
+
+   /* 16-bit serialization */
+   POSH_WriteU16ToLittle( serbuf, 0xABCD );
+   tmp16 = POSH_ReadU16FromLittle( serbuf );
+   if ( tmp16 != 0xABCD )
+   {
+      return "*ERROR: failed little-endian 16-bit serialization test";
+   }
+
+   POSH_WriteU16ToBig( serbuf, 0xABCD );
+   tmp16 = POSH_ReadU16FromBig( serbuf );
+   if ( tmp16 != 0xABCD )
+   {
+      return "*ERROR: failed big-endian 16-bit serialization test";
+   }
+
+   /* 32-bit serialization */
+   POSH_WriteU32ToLittle( serbuf, 0xABCD1234L );
+   tmp32 = POSH_ReadU32FromLittle( serbuf );
+   if ( tmp32 != 0xABCD1234 )
+   {
+      return "*ERROR: failed little-endian 32-bit serialization test";
+   }
+
+   POSH_WriteU32ToBig( serbuf, 0xABCD1234L );
+   tmp32 = POSH_ReadU32FromBig( serbuf );
+   if ( tmp32 != 0xABCD1234 )
+   {
+      return "*ERROR: failed big-endian 32-bit serialization test";
+   }
+
+#if defined POSH_64BIT_INTEGER
+   {
+#define REF64 POSH_U64(0xFEDCBA9876543210)
+
+      posh_u64_t tmp64;
+
+      /* 64-bit serialization */
+      POSH_WriteU64ToLittle( serbuf, REF64 );
+      tmp64 = POSH_ReadU64FromLittle( serbuf );
+      if ( tmp64 != REF64 )
+      {
+         return "*ERROR: failed little-endian 64-bit serialization test";
+      }
+
+      POSH_WriteU64ToBig( serbuf, REF64 );
+      tmp64 = POSH_ReadU64FromBig( serbuf );
+      if ( tmp64 != REF64 )
+      {
+         return "*ERROR: failed big-endian 64-bit serialization test";
+      }
+   }
+#endif
+
+   return 0;
+}
+
+#if !defined POSH_NO_FLOAT
+/* Round-trips a float through the little- and big-endian bit conversions and
+   a double through POSH_DoubleBits / POSH_DoubleFromBits.
+   Returns 0 on success or an "*ERROR: ..." message for the first failure. */
+static
+const char *
+s_testFloatingPoint( void )
+{
+   float fRef = 10.0f/30.0f;
+   double dRef = 10.0/30.0;
+   posh_byte_t dbuf[ 8 ];
+   float fLittle;
+   float fBig;
+   double dBack;
+
+   /* single precision, little endian */
+   fLittle = POSH_FloatFromLittleBits( POSH_LittleFloatBits( fRef ) );
+   if ( fLittle != fRef )
+   {
+      return "*ERROR: POSH little endian floating point conversion failed.  Please report this to poshlib@poshlib.org!\n";
+   }
+
+   /* single precision, big endian */
+   fBig = POSH_FloatFromBigBits( POSH_BigFloatBits( fRef ) );
+   if ( fBig != fRef )
+   {
+      return "*ERROR: POSH big endian floating point conversion failed.  Please report this to poshlib@poshlib.org!\n";
+   }
+
+   /* double precision through a byte buffer */
+   POSH_DoubleBits( dRef, dbuf );
+   dBack = POSH_DoubleFromBits( dbuf );
+   if ( dBack != dRef )
+   {
+      return "*ERROR: POSH double precision floating point serialization failed.  Please report this to poshlib@poshlib.org!\n";
+   }
+
+   return 0;
+}
+#endif /* !defined POSH_NO_FLOAT */
+
+/* Checks that the compile-time endianness matches the run-time probe and that
+   the NATIVE and FOREIGN 16/32-bit macros leave and swap values as expected.
+   Returns 0 on success or an "*ERROR: ..." message. */
+static
+const char *
+s_testEndianess( void )
+{
+   /* compile-time choice versus run-time probe */
+   if ( s_testBigEndian() != IS_BIG_ENDIAN )
+   {
+      return "*ERROR: POSH compile time endianess does not match run-time endianess verification.  Please report this to poshlib@poshlib.org!\n";
+   }
+
+   /* 32-bit macros: native keeps the value, foreign reverses the bytes */
+   if ( ( NATIVE32( 0x11223344L ) != 0x11223344L ) || 
+        ( FOREIGN32( 0x11223344L ) != 0x44332211L ) )
+   {
+      return "*ERROR: POSH endianess macro selection failed.  Please report this to poshlib@poshlib.org!\n";
+   }
+
+   /* 16-bit macros, same expectation */
+   if ( ( NATIVE16( 0x1234 ) != 0x1234 ) ||
+        ( FOREIGN16( 0x1234 ) != 0x3412 ) )
+   {
+      return "*ERROR: POSH endianess macro selection failed.  Please report this to poshlib@poshlib.org!\n";
+   }
+
+   return 0;
+}
+#endif /* !defined FORCE_DOXYGEN */
+
+/**
+  Returns a string describing this platform's basic attributes.  
+
+  The description is assembled at compile time from the POSH_*_STRING macros.
+  Before it is returned, the endianness, serialization and (unless
+  POSH_NO_FLOAT is defined) floating point self tests are run in that order;
+  the message of the first failing test is returned instead.
+  @returns a string describing this platform on success, or a string in the 
+           form "*ERROR: [text]" on failure.  You can simply check to see if
+           the first character returned is '*' to verify an error condition.
+*/
+const char *
+POSH_GetArchString( void )
+{
+   const char *err;
+   const char *s = "OS:.............."POSH_OS_STRING"\n"
+                   "CPU:............."POSH_CPU_STRING"\n"
+                   "endian:.........."POSH_ENDIAN_STRING"\n"
+                   "ptr size:........"POSH_POINTER_STRING"\n"
+                   "64-bit ints......"POSH_64BIT_INTEGER_STRING"\n"
+                   "floating point..."POSH_FLOAT_STRING"\n"
+                   "compiler........."POSH_COMPILER_STRING"\n";
+
+   /* each test only runs while no earlier test has failed */
+   err = s_testEndianess();
+
+   if ( err == 0 )
+   {
+      err = s_testSerialization();
+   }
+
+#if !defined POSH_NO_FLOAT
+   if ( err == 0 )
+   {
+      err = s_testFloatingPoint();
+   }
+#endif
+
+   return ( err != 0 ) ? err : s;
+}
+
+/* ---------------------------------------------------------------------------*/
+/*                           BYTE SWAPPING SUPPORT                            */
+/* ---------------------------------------------------------------------------*/
+/** 
+ * Byte swaps a 16-bit unsigned value
+ *
+   @param v [in] unsigned 16-bit input value to swap
+   @returns v with its two bytes exchanged
+ */
+posh_u16_t
+POSH_SwapU16( posh_u16_t v )
+{
+   /* the cast drops the bits shifted above bit 15 */
+   return ( posh_u16_t ) ( ( v << 8 ) | ( v >> 8 ) );
+}
+
+/** 
+ * Byte swaps a 16-bit signed value
+ *
+   @param v [in] signed 16-bit input value to swap
+   @returns a byte swapped version of v
+   @remarks Forwards to POSH_SwapU16(), since byte swapping does not depend
+            on sign; the signed entry point avoids signed/unsigned mismatch
+            warnings in callers.
+ */
+posh_i16_t
+POSH_SwapI16( posh_i16_t v )
+{
+   const posh_u16_t bits = ( posh_u16_t ) v;
+
+   return ( posh_i16_t ) POSH_SwapU16( bits );
+}
+
+/** 
+ * Byte swaps a 32-bit unsigned value
+ *
+   @param v [in] unsigned 32-bit input value to swap
+   @returns v with its four bytes in reverse order
+ */
+posh_u32_t
+POSH_SwapU32( posh_u32_t v )
+{
+   /* each term moves one source byte to its mirrored position */
+   const posh_u32_t fromByte0 = ( v & 0xFF ) << 24;
+   const posh_u32_t fromByte1 = ( v & 0xFF00 ) << 8;
+   const posh_u32_t fromByte2 = ( v >> 8 ) & 0xFF00;
+   const posh_u32_t fromByte3 = ( v >> 24 );
+
+   return fromByte0 | fromByte1 | fromByte2 | fromByte3;
+}
+
+/** 
+ * Byte swaps a 32-bit signed value
+ *
+   @param v [in] signed 32-bit input value to swap
+   @returns a byte swapped version of v
+   @remarks Forwards to POSH_SwapU32(), since byte swapping does not depend
+            on sign; the signed entry point avoids signed/unsigned mismatch
+            warnings in callers.
+ */
+posh_i32_t
+POSH_SwapI32( posh_i32_t v )
+{
+   const posh_u32_t bits = ( posh_u32_t ) v;
+
+   return ( posh_i32_t ) POSH_SwapU32( bits );
+}
+
+#if defined POSH_64BIT_INTEGER
+/**
+ * Byte swaps a 64-bit unsigned value
+
+   @param v [in] a 64-bit input value to swap
+   @ingroup SixtyFourBit
+   @returns v with its eight bytes in reverse order
+*/
+posh_u64_t 
+POSH_SwapU64( posh_u64_t v )
+{
+   union {
+      posh_byte_t bytes[ 8 ];
+      posh_u64_t  u64;
+   } u;
+   int lo;
+   int hi;
+
+   u.u64 = v;
+
+   /* exchange mirrored byte pairs: 0<->7, 1<->6, 2<->5, 3<->4 */
+   for ( lo = 0, hi = 7; lo < hi; lo++, hi-- )
+   {
+      const posh_byte_t tmp = u.bytes[ lo ];
+
+      u.bytes[ lo ] = u.bytes[ hi ];
+      u.bytes[ hi ] = tmp;
+   }
+
+   return u.u64;
+}
+
+/**
+ * Byte swaps a 64-bit signed value
+
+   @param v [in] a signed 64-bit input value to swap
+   @ingroup SixtyFourBit
+   @returns a byte swapped version of v
+   @remarks Forwards to POSH_SwapU64() with the appropriate casts.
+*/
+posh_i64_t 
+POSH_SwapI64( posh_i64_t v )
+{
+   const posh_u64_t bits = ( posh_u64_t ) v;
+
+   return ( posh_i64_t ) POSH_SwapU64( bits );
+}
+
+#endif /* defined POSH_64BIT_INTEGER */
+
+/* ---------------------------------------------------------------------------*/
+/*                           IN-MEMORY SERIALIZATION                          */
+/* ---------------------------------------------------------------------------*/
+
+/**
+ * Writes an unsigned 16-bit value to a little endian buffer
+
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian unsigned 16-bit value
+ @returns a pointer to the location two bytes after dst
+ @remarks does no validation of the inputs; the value is stored through a
+          16-bit pointer to dst
+*/
+posh_u16_t *
+POSH_WriteU16ToLittle( void *dst, posh_u16_t value )
+{
+   posh_u16_t * const out = ( posh_u16_t * ) dst;
+
+   out[ 0 ] = POSH_LittleU16(value);
+
+   return out + 1;
+}
+
+/**
+ * Writes a signed 16-bit value to a little endian buffer
+
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian signed 16-bit value
+ @returns a pointer to the location two bytes after dst
+ @remarks does no validation of the inputs.  Forwards to
+          POSH_WriteU16ToLittle() with the appropriate casts.
+*/
+posh_i16_t *
+POSH_WriteI16ToLittle( void *dst, posh_i16_t value )
+{
+   posh_u16_t * const next = POSH_WriteU16ToLittle( dst, ( posh_u16_t ) value );
+
+   return ( posh_i16_t * ) next;
+}
+
+/**
+ * Writes an unsigned 32-bit value to a little endian buffer
+
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian unsigned 32-bit value
+ @returns a pointer to the location four bytes after dst
+ @remarks does no validation of the inputs; the value is stored through a
+          32-bit pointer to dst
+*/
+posh_u32_t *
+POSH_WriteU32ToLittle( void *dst, posh_u32_t value )
+{
+   posh_u32_t * const out = ( posh_u32_t * ) dst;
+
+   out[ 0 ] = POSH_LittleU32(value);
+
+   return out + 1;
+}
+
+/**
+ * Writes a signed 32-bit value to a little endian buffer
+
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian signed 32-bit value
+ @returns a pointer to the location four bytes after dst
+ @remarks does no validation of the inputs.  Forwards to
+          POSH_WriteU32ToLittle() with the appropriate casts.
+*/
+posh_i32_t *
+POSH_WriteI32ToLittle( void *dst, posh_i32_t value )
+{
+   posh_u32_t * const next = POSH_WriteU32ToLittle( dst, ( posh_u32_t ) value );
+
+   return ( posh_i32_t * ) next;
+}
+
+/**
+ * Writes an unsigned 16-bit value to a big endian buffer
+
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian unsigned 16-bit value
+ @returns a pointer to the location two bytes after dst
+ @remarks does no validation of the inputs; the value is stored through a
+          16-bit pointer to dst
+*/
+posh_u16_t *
+POSH_WriteU16ToBig( void *dst, posh_u16_t value )
+{
+   posh_u16_t * const out = ( posh_u16_t * ) dst;
+
+   out[ 0 ] = POSH_BigU16(value);
+
+   return out + 1;
+}
+
+/**
+ * Writes a signed 16-bit value to a big endian buffer
+
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian signed 16-bit value
+ @returns a pointer to the location two bytes after dst
+ @remarks does no validation of the inputs.  Forwards to
+          POSH_WriteU16ToBig() with the appropriate casts.
+*/
+posh_i16_t *
+POSH_WriteI16ToBig( void *dst, posh_i16_t value )
+{
+   posh_u16_t * const next = POSH_WriteU16ToBig( dst, ( posh_u16_t ) value );
+
+   return ( posh_i16_t * ) next;
+}
+
+/**
+ * Writes an unsigned 32-bit value to a big endian buffer
+
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian unsigned 32-bit value
+ @returns a pointer to the location four bytes after dst
+ @remarks does no validation of the inputs; the value is stored through a
+          32-bit pointer to dst
+*/
+posh_u32_t *
+POSH_WriteU32ToBig( void *dst, posh_u32_t value )
+{
+   posh_u32_t * const out = ( posh_u32_t * ) dst;
+
+   out[ 0 ] = POSH_BigU32(value);
+
+   return out + 1;
+}
+
+/**
+ * Writes a signed 32-bit value to a big endian buffer
+
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian signed 32-bit value
+ @returns a pointer to the location four bytes after dst
+ @remarks does no validation of the inputs.  Forwards to
+          POSH_WriteU32ToBig() with the appropriate casts.
+*/
+posh_i32_t *
+POSH_WriteI32ToBig( void *dst, posh_i32_t value )
+{
+   posh_u32_t * const next = POSH_WriteU32ToBig( dst, ( posh_u32_t ) value );
+
+   return ( posh_i32_t * ) next;
+}
+
+#if defined POSH_64BIT_INTEGER
+/**
+ * Writes an unsigned 64-bit value to a little-endian buffer
+
+ @ingroup SixtyFourBit
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian unsigned 64-bit value
+ @returns a pointer to the location eight bytes after dst
+ @remarks does no validation of the inputs; the value is stored through a
+          64-bit pointer to dst
+*/
+posh_u64_t *
+POSH_WriteU64ToLittle( void *dst, posh_u64_t value )
+{
+   posh_u64_t * const out = ( posh_u64_t * ) dst;
+
+   out[ 0 ] = POSH_LittleU64(value);
+
+   return out + 1;
+}
+
+/**
+ * Writes a signed 64-bit value to a little-endian buffer
+
+ @ingroup SixtyFourBit
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian signed 64-bit value
+ @returns a pointer to the location eight bytes after dst
+ @remarks does no validation of the inputs.  Forwards to
+          POSH_WriteU64ToLittle() with the appropriate casts.
+*/
+posh_i64_t *
+POSH_WriteI64ToLittle( void *dst, posh_i64_t value )
+{
+   posh_u64_t * const next = POSH_WriteU64ToLittle( dst, ( posh_u64_t ) value );
+
+   return ( posh_i64_t * ) next;
+}
+
+/**
+ * Writes an unsigned 64-bit value to a big-endian buffer
+
+ @ingroup SixtyFourBit
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian unsigned 64-bit value
+ @returns a pointer to the location eight bytes after dst
+ @remarks does no validation of the inputs.
+*/
+posh_u64_t *
+POSH_WriteU64ToBig( void *dst, posh_u64_t value )
+{
+   posh_u64_t *p64 = ( posh_u64_t * ) dst;
+
+   *p64 = POSH_BigU64(value);
+
+   /* p64 is a 64-bit pointer, so advancing it by one element moves eight
+      bytes; "p64 + 8" would point 64 bytes past dst */
+   return p64 + 1;
+}
+
+/**
+ * Writes a signed 64-bit value to a big-endian buffer
+
+ @ingroup SixtyFourBit
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian signed 64-bit value
+ @returns the pointer returned by POSH_WriteU64ToBig(), cast to a signed
+          64-bit pointer
+ @remarks does no validation of the inputs.  Forwards to
+          POSH_WriteU64ToBig() with the appropriate casts.
+*/
+posh_i64_t *
+POSH_WriteI64ToBig( void *dst, posh_i64_t value )
+{
+   posh_u64_t * const next = POSH_WriteU64ToBig( dst, ( posh_u64_t ) value );
+
+   return ( posh_i64_t * ) next;
+}
+
+#endif /* POSH_64BIT_INTEGER */
+
+/* ---------------------------------------------------------------------------*/
+/*                         IN-MEMORY DESERIALIZATION                          */
+/* ---------------------------------------------------------------------------*/
+
+/** 
+ * Reads an unsigned 16-bit value from a little-endian buffer
+ @param src [in] source buffer
+ @returns host-endian unsigned 16-bit value
+ @remarks the buffer is read through a 16-bit pointer
+*/
+posh_u16_t  
+POSH_ReadU16FromLittle( const void *src )
+{
+   const posh_u16_t * const in = ( const posh_u16_t * ) src;
+
+   return POSH_LittleU16( (*in) );
+}
+
+/** 
+ * Reads a signed 16-bit value from a little-endian buffer
+ @param src [in] source buffer
+ @returns host-endian signed 16-bit value
+ @remarks the buffer is read through a 16-bit pointer
+*/
+posh_i16_t  
+POSH_ReadI16FromLittle( const void *src )
+{
+   const posh_i16_t * const in = ( const posh_i16_t * ) src;
+
+   return POSH_LittleI16( (*in) );
+}
+
+/** 
+ * Reads an unsigned 32-bit value from a little-endian buffer
+ @param src [in] source buffer
+ @returns host-endian unsigned 32-bit value
+ @remarks the buffer is read through a 32-bit pointer
+*/
+posh_u32_t  
+POSH_ReadU32FromLittle( const void *src )
+{
+   const posh_u32_t * const in = ( const posh_u32_t * ) src;
+
+   return POSH_LittleU32( (*in) );
+}
+
+/**
+ * Fetches a signed 32-bit value stored in little-endian order
+ @param src [in] buffer holding the stored value
+ @returns host-endian signed 32-bit value
+ @remarks src is dereferenced directly as a posh_i32_t.  NOTE(review):
+          presumably src must be suitably aligned for that type -- confirm
+          against callers.
+*/
+posh_i32_t
+POSH_ReadI32FromLittle( const void *src )
+{
+   const posh_i32_t *stored = ( const posh_i32_t * ) src;
+   const posh_i32_t  raw    = *stored;
+
+   return POSH_LittleI32( raw );
+}
+
+
+/**
+ * Fetches an unsigned 16-bit value stored in big-endian order
+ @param src [in] buffer holding the stored value
+ @returns host-endian unsigned 16-bit value
+ @remarks src is dereferenced directly as a posh_u16_t.  NOTE(review):
+          presumably src must be suitably aligned for that type -- confirm
+          against callers.
+*/
+posh_u16_t
+POSH_ReadU16FromBig( const void *src )
+{
+   const posh_u16_t *stored = ( const posh_u16_t * ) src;
+   const posh_u16_t  raw    = *stored;
+
+   return POSH_BigU16( raw );
+}
+
+/**
+ * Fetches a signed 16-bit value stored in big-endian order
+ @param src [in] buffer holding the stored value
+ @returns host-endian signed 16-bit value
+ @remarks src is dereferenced directly as a posh_i16_t.  NOTE(review):
+          presumably src must be suitably aligned for that type -- confirm
+          against callers.
+*/
+posh_i16_t
+POSH_ReadI16FromBig( const void *src )
+{
+   const posh_i16_t *stored = ( const posh_i16_t * ) src;
+   const posh_i16_t  raw    = *stored;
+
+   return POSH_BigI16( raw );
+}
+
+/**
+ * Fetches an unsigned 32-bit value stored in big-endian order
+ @param src [in] buffer holding the stored value
+ @returns host-endian unsigned 32-bit value
+ @remarks src is dereferenced directly as a posh_u32_t.  NOTE(review):
+          presumably src must be suitably aligned for that type -- confirm
+          against callers.
+*/
+posh_u32_t
+POSH_ReadU32FromBig( const void *src )
+{
+   const posh_u32_t *stored = ( const posh_u32_t * ) src;
+   const posh_u32_t  raw    = *stored;
+
+   return POSH_BigU32( raw );
+}
+
+/**
+ * Fetches a signed 32-bit value stored in big-endian order
+ @param src [in] buffer holding the stored value
+ @returns host-endian signed 32-bit value
+ @remarks src is dereferenced directly as a posh_i32_t.  NOTE(review):
+          presumably src must be suitably aligned for that type -- confirm
+          against callers.
+*/
+posh_i32_t
+POSH_ReadI32FromBig( const void *src )
+{
+   const posh_i32_t *stored = ( const posh_i32_t * ) src;
+   const posh_i32_t  raw    = *stored;
+
+   return POSH_BigI32( raw );
+}
+
+#if defined POSH_64BIT_INTEGER
+
+/**
+ * Fetches an unsigned 64-bit value stored in little-endian order
+ @param src [in] buffer holding the stored value
+ @returns host-endian unsigned 64-bit value
+ @remarks src is dereferenced directly as a posh_u64_t.  NOTE(review):
+          presumably src must be suitably aligned for that type -- confirm
+          against callers.
+*/
+posh_u64_t
+POSH_ReadU64FromLittle( const void *src )
+{
+   const posh_u64_t *stored = ( const posh_u64_t * ) src;
+   const posh_u64_t  raw    = *stored;
+
+   return POSH_LittleU64( raw );
+}
+
+/**
+ * Fetches a signed 64-bit value stored in little-endian order
+ @param src [in] buffer holding the stored value
+ @returns host-endian signed 64-bit value
+ @remarks src is dereferenced directly as a posh_i64_t.  NOTE(review):
+          presumably src must be suitably aligned for that type -- confirm
+          against callers.
+*/
+posh_i64_t
+POSH_ReadI64FromLittle( const void *src )
+{
+   const posh_i64_t *stored = ( const posh_i64_t * ) src;
+   const posh_i64_t  raw    = *stored;
+
+   return POSH_LittleI64( raw );
+}
+
+/**
+ * Fetches an unsigned 64-bit value stored in big-endian order
+ @param src [in] buffer holding the stored value
+ @returns host-endian unsigned 64-bit value
+ @remarks src is dereferenced directly as a posh_u64_t.  NOTE(review):
+          presumably src must be suitably aligned for that type -- confirm
+          against callers.
+*/
+posh_u64_t
+POSH_ReadU64FromBig( const void *src )
+{
+   const posh_u64_t *stored = ( const posh_u64_t * ) src;
+   const posh_u64_t  raw    = *stored;
+
+   return POSH_BigU64( raw );
+}
+
+/**
+ * Fetches a signed 64-bit value stored in big-endian order
+ @param src [in] buffer holding the stored value
+ @returns host-endian signed 64-bit value
+ @remarks src is dereferenced directly as a posh_i64_t.  NOTE(review):
+          presumably src must be suitably aligned for that type -- confirm
+          against callers.
+*/
+posh_i64_t
+POSH_ReadI64FromBig( const void *src )
+{
+   const posh_i64_t *stored = ( const posh_i64_t * ) src;
+   const posh_i64_t  raw    = *stored;
+
+   return POSH_BigI64( raw );
+}
+
+#endif /* POSH_64BIT_INTEGER */
+
+/* ---------------------------------------------------------------------------*/
+/*                           FLOATING POINT SUPPORT                           */
+/* ---------------------------------------------------------------------------*/
+
+#if !defined POSH_NO_FLOAT
+
+/**
+ * Extracts raw little-endian bits from a 32-bit floating point value
+ *
+   @ingroup FloatingPoint
+   @param   f [in] floating point value
+   @returns a little-endian bit representation of f
+ */
+posh_u32_t
+POSH_LittleFloatBits( float f )
+{
+   /* the union gives access to the storage of the float as a 32-bit integer */
+   union
+   {
+      float      as_float;
+      posh_u32_t as_bits;
+   } pun;
+   posh_u32_t bits;
+
+   pun.as_float = f;
+   bits         = pun.as_bits;
+
+#if !defined POSH_LITTLE_ENDIAN
+   /* host is not flagged little-endian: pass the bits through POSH_SwapU32 */
+   bits = POSH_SwapU32( bits );
+#endif
+
+   return bits;
+}
+
+/**
+ * Returns the raw bits of a 32-bit floating point value in big-endian form
+ *
+   @ingroup FloatingPoint
+   @param   f [in] floating point value
+   @returns a big-endian bit representation of f
+ */
+posh_u32_t
+POSH_BigFloatBits( float f )
+{
+   /* the union gives access to the storage of the float as a 32-bit integer */
+   union
+   {
+      float      as_float;
+      posh_u32_t as_bits;
+   } pun;
+   posh_u32_t bits;
+
+   pun.as_float = f;
+   bits         = pun.as_bits;
+
+#if defined POSH_LITTLE_ENDIAN
+   /* host is flagged little-endian: pass the bits through POSH_SwapU32 */
+   bits = POSH_SwapU32( bits );
+#endif
+
+   return bits;
+}
+
+/**
+ * Copies the raw bytes of a 64-bit double into a buffer, least significant
+ * byte first.
+ *
+   @param d [in] 64-bit double precision value
+   @param dst [out] 8-byte storage buffer
+   @ingroup FloatingPoint
+   @returns nothing; the raw bits used to represent the value 'd' are written
+            to dst, in the form dst[0]=LSB
+ */
+void
+POSH_DoubleBits( double d, posh_byte_t dst[ 8 ] )
+{
+   union
+   {
+      double      value;
+      posh_byte_t raw[ 8 ];
+   } pun;
+   int i;
+
+   pun.value = d;
+
+   for ( i = 0; i < 8; i++ )
+   {
+#if defined POSH_LITTLE_ENDIAN
+      /* host flagged little-endian: bytes are copied in storage order */
+      dst[ i ] = pun.raw[ i ];
+#else
+      /* otherwise the storage order is mirrored */
+      dst[ i ] = pun.raw[ 7 - i ];
+#endif
+   }
+}
+
+/**
+ * Builds a double-precision, 64-bit floating point value from eight raw
+ * bytes given in little-endian order
+
+   @ingroup FloatingPoint
+   @param src [in] little-endian byte representation of 64-bit double precision
+                  floating point value
+   @returns double precision floating point representation of the raw bits
+   @remarks No error checking is performed, so there are no guarantees that the
+            result is a valid number, nor is there any check to ensure that src is
+            non-NULL.  BE CAREFUL USING THIS.
+ */
+double
+POSH_DoubleFromBits( const posh_byte_t src[ 8 ] )
+{
+   union
+   {
+      double      value;
+      posh_byte_t raw[ 8 ];
+   } pun;
+   int i;
+
+   for ( i = 0; i < 8; i++ )
+   {
+#if defined POSH_LITTLE_ENDIAN
+      /* host flagged little-endian: bytes are taken in the given order */
+      pun.raw[ i ] = src[ i ];
+#else
+      /* otherwise the given order is mirrored */
+      pun.raw[ i ] = src[ 7 - i ];
+#endif
+   }
+
+   return pun.value;
+}
+
+/**
+ * Builds a floating point number from its little-endian bit pattern
+ *
+   @ingroup FloatingPoint
+   @param   bits [in] raw floating point bits in little-endian form
+   @returns a floating point number based on the given bit representation
+   @remarks No error checking is performed, so there are no guarantees that the
+            result is a valid number.  BE CAREFUL USING THIS.
+ */
+float
+POSH_FloatFromLittleBits( posh_u32_t bits )
+{
+   union
+   {
+      float      as_float;
+      posh_u32_t as_bits;
+   } pun;
+
+#if defined POSH_BIG_ENDIAN
+   /* host flagged big-endian: pass the bits through POSH_SwapU32 first */
+   pun.as_bits = POSH_SwapU32( bits );
+#else
+   pun.as_bits = bits;
+#endif
+
+   return pun.as_float;
+}
+
+/**
+ * Builds a floating point number from its big-endian bit pattern
+ *
+   @ingroup FloatingPoint
+   @param   bits [in] raw floating point bits in big-endian form
+   @returns a floating point number based on the given bit representation
+   @remarks No error checking is performed, so there are no guarantees that the
+            result is a valid number.  BE CAREFUL USING THIS.
+ */
+float
+POSH_FloatFromBigBits( posh_u32_t bits )
+{
+   union
+   {
+      float      as_float;
+      posh_u32_t as_bits;
+   } pun;
+
+#if defined POSH_LITTLE_ENDIAN
+   /* host flagged little-endian: pass the bits through POSH_SwapU32 first */
+   pun.as_bits = POSH_SwapU32( bits );
+#else
+   pun.as_bits = bits;
+#endif
+
+   return pun.as_float;
+}
+
+#endif /* !defined POSH_NO_FLOAT */
--- a/src/nvcore/poshlib/posh.h
+++ b/src/nvcore/poshlib/posh.h
@ -0,0 +1,989 @@
+/**
+@file posh.h
+@author Brian Hook
+
+Header file for POSH, the Portable Open Source Harness project.
+
+NOTE: POSH was designed so that this header could be included
+multiple times, which is why it was written without the usual
+@#ifndef/@#define preamble.  This copy, however, does open a
+HAVE_POSH_H include guard further down.
+
+POSH relies on environment specified preprocessor symbols in order
+to infer as much as possible about the target OS/architecture and
+the host compiler capabilities.
+
+NOTE: POSH is simple and focused. It attempts to provide basic
+functionality and information, but it does NOT attempt to emulate
+missing functionality.  I am also not willing to make POSH dirty
+and hackish to support truly ancient and/or outmoded and/or bizarre
+technologies such as non-ANSI compilers, systems with non-IEEE
+floating point formats, segmented 16-bit operating systems, etc.
+
+Please refer to the accompanying HTML documentation or visit
+http://www.poshlib.org for more information on how to use POSH.
+
+LICENSE:
+
+Copyright (c) 2004, Brian Hook
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+    * The names of this package'ss contributors contributors may not
+      be used to endorse or promote products derived from this
+      software without specific prior written permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+/*
+I have yet to find an authoritative reference on preprocessor
+symbols, but so far this is what I've gleaned:
+
+GNU GCC/G++:
+   - __GNUC__: GNU C version
+   - __GNUG__: GNU C++ compiler
+   - __sun__ : on Sun platforms
+   - __svr4__: on Solaris and other SysV R4 platforms
+   - __mips__: on MIPS processor platforms
+   - __sparc_v9__: on Sparc 64-bit CPUs
+   - __sparcv9: 64-bit Solaris
+   - __MIPSEL__: mips processor, compiled for little endian
+   - __MIPSEB__: mips processor, compiled for big endian
+   - _R5900: MIPS/Sony/Toshiba R5900 (PS2)
+   - mc68000: 68K
+   - m68000: 68K
+   - m68k: 68K
+   - __palmos__: PalmOS
+
+Intel C/C++ Compiler:
+   - __ECC      : compiler version, IA64 only
+   - __EDG__
+   - __ELF__
+   - __GXX_ABI_VERSION
+   - __i386     : IA-32 only
+   - __i386__   : IA-32 only
+   - i386       : IA-32 only
+   - __ia64     : IA-64 only
+   - __ia64__   : IA-64 only
+   - ia64       : IA-64 only
+   - __ICC      : IA-32 only
+   - __INTEL_COMPILER : IA-32 or IA-64, newer versions only
+
+Apple's C/C++ Compiler for OS X:
+   - __APPLE_CC__
+   - __APPLE__
+   - __BIG_ENDIAN__
+   - __APPLE__
+   - __ppc__
+   - __MACH__
+
+DJGPP:
+   - __MSDOS__
+   - __unix__
+   - __unix
+   - __GNUC__
+   - __GO32
+   - DJGPP
+   - __i386, __i386, i386
+
+Cray's C compiler:
+   - _ADDR64: if 64-bit pointers
+   - _UNICOS: 
+   - __unix:
+
+SGI's CC compiler predefines the following (and more) with -ansi:
+   - __sgi
+   - __unix
+   - __host_mips
+   - _SYSTYPE_SVR4
+   - __mips
+   - _MIPSEB
+   - anyone know if there is a predefined symbol for the compiler?!
+
+MinGW:
+   - as GnuC but also defines _WIN32, __WIN32, WIN32, _X86_, __i386, __i386__, and several others
+   - __MINGW32__
+
+Cygwin:
+   - as Gnu C, but also
+   - __unix__
+   - __CYGWIN32__
+
+Microsoft Visual Studio predefines the following:
+   - _MSC_VER
+   - _WIN32: on Win32
+   - _M_IX86 (on x86 systems)
+   - _M_ALPHA (on DEC AXP systems)
+   - _SH3: WinCE, Hitachi SH-3
+   - _MIPS: WinCE, MIPS
+   - _ARM: WinCE, ARM
+
+Sun's C Compiler:
+   - sun and _sun
+   - unix and _unix
+   - sparc and _sparc (SPARC systems only)
+   - i386 and _i386 (x86 systems only)
+   - __SVR4 (Solaris only)
+   - __sparcv9: 64-bit solaris
+   - __SUNPRO_C
+   - _LP64: defined in 64-bit LP64 mode, but only if <sys/types.h> is included
+
+Borland C/C++ predefines the following:
+   - __BORLANDC__:
+
+DEC/Compaq C/C++ on Alpha:
+   - __alpha
+   - __arch64__
+   - __unix__ (on Tru64 Unix)
+   - __osf__
+   - __DECC
+   - __DECCXX (C++ compilation)
+   - __DECC_VER
+   - __DECCXX_VER
+
+IBM's AIX compiler:
+   - __64BIT__ if 64-bit mode
+   - _AIX
+   - __IBMC__: C compiler version
+   - __IBMCPP__: C++ compiler version
+   - _LONG_LONG: compiler allows long long
+
+Watcom:
+   - __WATCOMC__
+   - __DOS__ : if targeting DOS
+   - __386__ : if 32-bit support
+   - __WIN32__ : if targeting 32-bit Windows
+
+HP-UX C/C++ Compiler:
+   - __hpux
+   - __unix
+   - __hppa (on PA-RISC)
+   - __LP64__: if compiled in 64-bit mode
+
+Metrowerks:
+   - __MWERKS__
+   - __powerpc__
+   - _powerc
+   - __MC68K__
+   - macintosh when compiling for MacOS
+   - __INTEL__ for x86 targets
+   - __POWERPC__
+
+*/
+
+#ifndef HAVE_POSH_H
+#define HAVE_POSH_H
+
+/*
+** ----------------------------------------------------------------------------
+** Include <limits.h> optionally
+**
+** <limits.h> is only pulled in when the including code defines
+** POSH_USE_LIMITS_H beforehand.  The same symbol later selects whether the
+** 16- and 32-bit types are chosen from the USHRT_MAX / INT_MAX / LONG_MAX
+** values or from fixed typedefs.
+** ----------------------------------------------------------------------------
+*/
+#ifdef POSH_USE_LIMITS_H
+#  include <limits.h>
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Determine compilation environment
+**
+** Each recognized compiler defines a POSH_COMPILER_<NAME> symbol to 1 and
+** (except for Apple's compiler) sets POSH_COMPILER_STRING to a readable name.
+** The tests are independent #if blocks rather than an #elif chain, so more
+** than one of them can match in the same build.  When none of them sets
+** POSH_COMPILER_STRING it falls back to "Unknown compiler".
+** ----------------------------------------------------------------------------
+*/
+#if defined __ECC || defined __ICC || defined __INTEL_COMPILER
+#  define POSH_COMPILER_STRING "Intel C/C++"
+#  define POSH_COMPILER_INTEL 1
+#endif
+
+/* SGI and HP-UX are only attributed to the vendor compiler when GCC is not
+   the one doing the compiling */
+#if ( defined __host_mips || defined __sgi ) && !defined __GNUC__
+#  define POSH_COMPILER_STRING    "MIPSpro C/C++"
+#  define POSH_COMPILER_MIPSPRO 1 
+#endif
+
+#if defined __hpux && !defined __GNUC__
+#  define POSH_COMPILER_STRING "HP-UX CC"
+#  define POSH_COMPILER_HPCC 1 
+#endif
+
+#if defined __GNUC__
+#  define POSH_COMPILER_STRING "Gnu GCC"
+#  define POSH_COMPILER_GCC 1
+#endif
+
+#if defined __APPLE_CC__
+   /* we don't define the compiler string here, let it be GNU */
+#  define POSH_COMPILER_APPLECC 1
+#endif
+
+#if defined __IBMC__ || defined __IBMCPP__
+#  define POSH_COMPILER_STRING "IBM C/C++"
+#  define POSH_COMPILER_IBM 1
+#endif
+
+#if defined _MSC_VER
+#  define POSH_COMPILER_STRING "Microsoft Visual C++"
+#  define POSH_COMPILER_MSVC 1
+#endif
+
+#if defined __SUNPRO_C
+#  define POSH_COMPILER_STRING "Sun Pro" 
+#  define POSH_COMPILER_SUN 1
+#endif
+
+#if defined __BORLANDC__
+#  define POSH_COMPILER_STRING "Borland C/C++"
+#  define POSH_COMPILER_BORLAND 1
+#endif
+
+#if defined __MWERKS__
+#  define POSH_COMPILER_STRING     "MetroWerks CodeWarrior"
+#  define POSH_COMPILER_METROWERKS 1
+#endif
+
+#if defined __DECC || defined __DECCXX
+#  define POSH_COMPILER_STRING "Compaq/DEC C/C++"
+#  define POSH_COMPILER_DEC 1
+#endif
+
+#if defined __WATCOMC__
+#  define POSH_COMPILER_STRING "Watcom C/C++"
+#  define POSH_COMPILER_WATCOM 1
+#endif
+
+/* fallback so that POSH_COMPILER_STRING is always defined */
+#if !defined POSH_COMPILER_STRING
+#  define POSH_COMPILER_STRING "Unknown compiler"
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Determine target operating system
+**
+** Each recognized target defines a POSH_OS_<NAME> symbol to 1 together with
+** POSH_OS_STRING.  The tests are independent #if blocks.  In this part of the
+** section only the SunOS fallback and the generic Unix block check whether
+** POSH_OS_STRING has already been set; the others define it unconditionally.
+** ----------------------------------------------------------------------------
+*/
+#if defined linux || defined __linux__
+#  define POSH_OS_LINUX 1 
+#  define POSH_OS_STRING "Linux"
+#endif
+
+#if defined __CYGWIN32__
+#  define POSH_OS_CYGWIN32 1
+#  define POSH_OS_STRING "Cygwin"
+#endif
+
+#if defined __MINGW32__
+#  define POSH_OS_MINGW 1
+#  define POSH_OS_STRING "MinGW"
+#endif
+
+/* NOTE(review): the symbol list at the top of this file mentions __GO32 for
+   DJGPP, whereas this test looks for GO32 -- confirm which spelling the
+   compiler actually predefines. */
+#if defined GO32 && defined DJGPP && defined __MSDOS__ 
+#  define POSH_OS_GO32 1
+#  define POSH_OS_STRING "GO32/MS-DOS"
+#endif
+
+/* NOTE: make sure you use /bt=DOS if compiling for 32-bit DOS,
+   otherwise Watcom assumes host=target */
+#if defined __WATCOMC__  && defined __386__ && defined __DOS__
+#  define POSH_OS_DOS32 1
+#  define POSH_OS_STRING "DOS/32-bit"
+#endif
+
+#if defined _UNICOS
+#  define POSH_OS_UNICOS 1
+#  define POSH_OS_STRING "UNICOS"
+#endif
+
+/* OS X: Metrowerks targeting PowerPC without the classic "macintosh" symbol,
+   Apple's compiler, or an explicit macosx define.
+   NOTE(review): the symbol list at the top of this file spells the
+   Metrowerks symbol _powerc, this test uses __powerc -- confirm. */
+#if ( defined __MWERKS__ && defined __powerc && !defined macintosh ) || defined __APPLE_CC__ || defined macosx
+#  define POSH_OS_OSX 1
+#  define POSH_OS_STRING "MacOS X"
+#endif
+
+/* Sun platforms: Solaris when one of the SVR4/solaris symbols is present,
+   otherwise SunOS (provided no earlier block has set POSH_OS_STRING) */
+#if defined __sun__ || defined sun || defined __sun || defined __solaris__
+#  if defined __SVR4 || defined __svr4__ || defined __solaris__
+#     define POSH_OS_STRING "Solaris"
+#     define POSH_OS_SOLARIS 1
+#  endif
+#  if !defined POSH_OS_STRING
+#     define POSH_OS_STRING "SunOS"
+#     define POSH_OS_SUNOS 1
+#  endif
+#endif
+
+#if defined __sgi__ || defined sgi || defined __sgi
+#  define POSH_OS_IRIX 1
+#  define POSH_OS_STRING "Irix"
+#endif
+
+#if defined __hpux__ || defined __hpux
+#  define POSH_OS_HPUX 1
+#  define POSH_OS_STRING "HP-UX"
+#endif
+
+#if defined _AIX
+#  define POSH_OS_AIX 1
+#  define POSH_OS_STRING "AIX"
+#endif
+
+#if ( defined __alpha && defined __osf__ )
+#  define POSH_OS_TRU64 1
+#  define POSH_OS_STRING "Tru64"
+#endif
+
+#if defined __BEOS__ || defined __beos__
+#  define POSH_OS_BEOS 1
+#  define POSH_OS_STRING "BeOS"
+#endif
+
+#if defined amiga || defined amigados || defined AMIGA || defined _AMIGA
+#  define POSH_OS_AMIGA 1
+#  define POSH_OS_STRING "Amiga"
+#endif
+
+/* POSH_OS_UNIX is set in addition to any more specific symbol above; the
+   generic string is only used when nothing more specific was found */
+#if defined __unix__
+#  define POSH_OS_UNIX 1 
+#  if !defined POSH_OS_STRING
+#     define POSH_OS_STRING "Unix-like(generic)"
+#  endif
+#endif
+
+#if defined _WIN32_WCE
+#  define POSH_OS_WINCE 1
+#  define POSH_OS_STRING "Windows CE"
+#endif
+
+/* Xbox detection: _XBOX_VER equal to 200 selects the XBOX-360, any other
+   build that defines _XBOX is treated as the original XBOX.
+   The version has to be tested with a separate "defined" check: the
+   expression "defined _XBOX_VER == 200" compares the 0/1 result of
+   "defined" with 200 and is therefore never true. */
+#if defined _XBOX_VER && ( _XBOX_VER == 200 )
+#  define POSH_OS_XBOX360 1
+#  define POSH_OS_STRING "XBOX-360"
+#elif defined _XBOX
+#  define POSH_OS_XBOX 1
+#  define POSH_OS_STRING "XBOX"
+#endif
+
+
+/* Windows family: POSH_OS_WIN32 is set for every Windows-like target.  Unless
+   POSH_OS_XBOX is defined, _WIN64 additionally selects POSH_OS_WIN64 and
+   the "Win64" string; otherwise "Win32" is used only when no earlier block
+   has set POSH_OS_STRING. */
+#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__
+#  define POSH_OS_WIN32 1
+#  if !defined POSH_OS_XBOX
+#     if defined _WIN64
+#        define POSH_OS_WIN64 1
+#        define POSH_OS_STRING "Win64"
+#     else
+#        if !defined POSH_OS_STRING
+#           define POSH_OS_STRING "Win32"
+#        endif
+#     endif
+#  endif
+#endif
+
+#if defined __palmos__
+#  define POSH_OS_PALM 1
+#  define POSH_OS_STRING "PalmOS"
+#endif
+
+/* classic MacOS (THINK C or any compiler defining "macintosh") */
+#if defined THINK_C || defined macintosh
+#  define POSH_OS_MACOS 1
+#  define POSH_OS_STRING "MacOS"
+#endif
+
+/*
+** -----------------------------------------------------------------------------
+** Determine target CPU
+**
+** Each recognized processor defines a POSH_CPU_<NAME> symbol to 1 together
+** with POSH_CPU_STRING.  Unlike the compiler and OS sections there is no
+** "unknown" fallback: if no block sets POSH_CPU_STRING the build is stopped
+** with an #error.
+** -----------------------------------------------------------------------------
+*/
+#if defined mc68000 || defined m68k || defined __MC68K__ || defined m68000
+#  define POSH_CPU_68K 1
+#  define POSH_CPU_STRING "MC68000"
+#endif
+
+/* PowerPC; __powerpc64__ only changes the descriptive string */
+#if defined __PPC__ || defined __POWERPC__  || defined powerpc || defined _POWER || defined __ppc__ || defined __powerpc__
+#  define POSH_CPU_PPC 1
+#  if defined __powerpc64__
+#     define POSH_CPU_STRING "PowerPC64"
+#  else
+#     define POSH_CPU_STRING "PowerPC"
+#  endif
+#endif
+
+#if defined _CRAYT3E || defined _CRAYMPP
+#  define POSH_CPU_CRAYT3E 1 /* target processor is a DEC Alpha 21164 used in a Cray T3E*/
+#  define POSH_CPU_STRING "Cray T3E (Alpha 21164)"
+#endif
+
+/* NOTE(review): && binds tighter than ||, so this condition reads as
+   CRAY || ( _CRAY && !_CRAYT3E ) -- confirm that this is the intended
+   grouping. */
+#if defined CRAY || defined _CRAY && !defined _CRAYT3E
+#  error Non-AXP Cray systems not supported
+#endif
+
+#if defined _SH3
+#  define POSH_CPU_SH3 1
+#  define POSH_CPU_STRING "Hitachi SH-3"
+#endif
+
+/* SH-4 targets also get POSH_CPU_SH3 */
+#if defined __sh4__ || defined __SH4__
+#  define POSH_CPU_SH3 1
+#  define POSH_CPU_SH4 1
+#  define POSH_CPU_STRING "Hitachi SH-4"
+#endif
+
+/* SPARC; the 64-bit symbols additionally select POSH_CPU_SPARC64 */
+#if defined __sparc__ || defined __sparc
+#  if defined __arch64__ || defined __sparcv9 || defined __sparc_v9__
+#     define POSH_CPU_SPARC64 1 
+#     define POSH_CPU_STRING "Sparc/64"
+#  else
+#     define POSH_CPU_STRING "Sparc/32"
+#  endif
+#  define POSH_CPU_SPARC 1
+#endif
+
+/* every ARM target is reported through the POSH_CPU_STRONGARM symbol */
+#if defined ARM || defined __arm__ || defined _ARM
+#  define POSH_CPU_STRONGARM 1
+#  define POSH_CPU_STRING "ARM"
+#endif
+
+/* MIPS; _R5900 only changes the descriptive string */
+#if defined mips || defined __mips__ || defined __MIPS__ || defined _MIPS
+#  define POSH_CPU_MIPS 1 
+#  if defined _R5900
+#    define POSH_CPU_STRING "MIPS R5900 (PS2)"
+#  else
+#    define POSH_CPU_STRING "MIPS"
+#  endif
+#endif
+
+#if defined __ia64 || defined _M_IA64 || defined __ia64__ 
+#  define POSH_CPU_IA64 1
+#  define POSH_CPU_STRING "IA64"
+#endif
+
+/* x86 family: POSH_CPU_X86 covers both 32- and 64-bit targets, the 64-bit
+   ones additionally get POSH_CPU_X86_64 */
+#if defined __X86__ || defined __i386__ || defined i386 || defined _M_IX86 || defined __386__ || defined __x86_64__ || defined _M_X64
+#  define POSH_CPU_X86 1
+#  if defined __x86_64__ || defined _M_X64
+#     define POSH_CPU_X86_64 1 
+#  endif
+#  if defined POSH_CPU_X86_64
+#     define POSH_CPU_STRING "AMD x86-64"
+#  else
+#     define POSH_CPU_STRING "Intel 386+"
+#  endif
+#endif
+
+#if defined __alpha || defined alpha || defined _M_ALPHA || defined __alpha__
+#  define POSH_CPU_AXP 1
+#  define POSH_CPU_STRING "AXP"
+#endif
+
+#if defined __hppa || defined hppa
+#  define POSH_CPU_HPPA 1
+#  define POSH_CPU_STRING "PA-RISC"
+#endif
+
+/* an unrecognized CPU is a hard error */
+#if !defined POSH_CPU_STRING
+#  error POSH cannot determine target CPU
+#  define POSH_CPU_STRING "Unknown" /* this is here for Doxygen's benefit */
+#endif
+
+/*
+** -----------------------------------------------------------------------------
+** Attempt to autodetect building for embedded on Sony PS2
+**
+** This is the fallback for the OS detection: when no earlier block has set
+** POSH_OS_STRING the target is treated as embedded (POSH_OS_EMBEDDED, not
+** defined for FORCE_DOXYGEN runs) and the string is chosen by whether _R5900
+** is defined.
+** -----------------------------------------------------------------------------
+*/
+#if !defined POSH_OS_STRING
+#  if !defined FORCE_DOXYGEN
+#    define POSH_OS_EMBEDDED 1 
+#  endif
+#  if defined _R5900
+#     define POSH_OS_STRING "Sony PS2(embedded)"
+#  else
+#     define POSH_OS_STRING "Embedded/Unknown"
+#  endif
+#endif
+
+/*
+** ---------------------------------------------------------------------------
+** Handle cdecl, stdcall, fastcall, etc.
+**
+** POSH_CDECL, POSH_STDCALL and POSH_FASTCALL expand to the compiler-specific
+** calling convention keyword on 32-bit x86 and to nothing everywhere else.
+** They also expand to nothing on 32-bit x86 compilers that are not listed
+** below, so the three macros are always defined after this section.
+** ---------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_X86 && !defined POSH_CPU_X86_64
+#  if defined __GNUC__
+#     define POSH_CDECL __attribute__((cdecl))
+#     define POSH_STDCALL __attribute__((stdcall))
+#     define POSH_FASTCALL __attribute__((fastcall))
+#  elif ( defined _MSC_VER || defined __WATCOMC__ || defined __BORLANDC__ || defined __MWERKS__ )
+#     define POSH_CDECL    __cdecl
+#     define POSH_STDCALL  __stdcall
+#     define POSH_FASTCALL __fastcall
+#  else
+      /* unlisted compiler: leave the calling convention at its default */
+#     define POSH_CDECL
+#     define POSH_STDCALL
+#     define POSH_FASTCALL
+#  endif
+#else
+#  define POSH_CDECL
+#  define POSH_STDCALL
+#  define POSH_FASTCALL
+#endif
+
+/*
+** ---------------------------------------------------------------------------
+** Define POSH_IMPORTEXPORT signature based on POSH_DLL and POSH_BUILDING_LIB
+**
+** Only Win32 targets built with POSH_DLL get a non-empty definition:
+** the export form when POSH_BUILDING_LIB is defined, the import form
+** otherwise.  In every other configuration POSH_IMPORTEXPORT ends up empty.
+** ---------------------------------------------------------------------------
+*/
+
+/*
+** We undefine this so that multiple inclusions will work
+*/
+#if defined POSH_IMPORTEXPORT
+#  undef POSH_IMPORTEXPORT
+#endif
+
+#if defined POSH_DLL
+#   if defined POSH_OS_WIN32
+       /* Microsoft: __declspec from _MSC_VER 800 on, __export before that */
+#      if defined _MSC_VER 
+#         if ( _MSC_VER >= 800 )
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __declspec( dllexport )
+#            else
+#               define POSH_IMPORTEXPORT __declspec( dllimport )
+#            endif
+#         else
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __export
+#            else
+#               define POSH_IMPORTEXPORT 
+#            endif
+#         endif
+#      endif  /* defined _MSC_VER */
+       /* Borland: __declspec from __BORLANDC__ 0x500 on, __export before that */
+#      if defined __BORLANDC__
+#         if ( __BORLANDC__ >= 0x500 )
+#            if defined POSH_BUILDING_LIB 
+#               define POSH_IMPORTEXPORT __declspec( dllexport )
+#            else
+#               define POSH_IMPORTEXPORT __declspec( dllimport )
+#            endif
+#         else
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __export
+#            else
+#               define POSH_IMPORTEXPORT 
+#            endif
+#         endif
+#      endif /* defined __BORLANDC__ */
+       /* for all other compilers, we're just making a blanket assumption */
+#      if defined __GNUC__ || defined __WATCOMC__ || defined __MWERKS__
+#         if defined POSH_BUILDING_LIB
+#            define POSH_IMPORTEXPORT __declspec( dllexport )
+#         else
+#            define POSH_IMPORTEXPORT __declspec( dllimport )
+#         endif
+#      endif /* all other compilers */
+       /* a Win32 DLL build with a compiler not handled above is an error */
+#      if !defined POSH_IMPORTEXPORT
+#         error Building DLLs not supported on this compiler (poshlib@poshlib.org if you know how)
+#      endif
+#   endif /* defined POSH_OS_WIN32 */
+#endif
+
+/* On pretty much everything else, we can thankfully just ignore this */
+#if !defined POSH_IMPORTEXPORT
+#  define POSH_IMPORTEXPORT
+#endif
+
+/* For FORCE_DOXYGEN runs both symbols are defined and immediately undefined
+   again, so neither remains defined afterwards.  NOTE(review): presumably
+   this only exists so that Doxygen sees the names -- confirm. */
+#if defined FORCE_DOXYGEN
+#  define POSH_DLL    
+#  define POSH_BUILDING_LIB
+#  undef POSH_DLL
+#  undef POSH_BUILDING_LIB
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** (Re)define POSH_PUBLIC_API export signature
+**
+** POSH_PUBLIC_API(rtype) expands to the start of an extern declaration that
+** returns rtype.  Microsoft compilers before _MSC_VER 800 and Borland
+** compilers before 0x500 get POSH_IMPORTEXPORT after the return type, all
+** other compilers get it before the return type.
+** ----------------------------------------------------------------------------
+*/
+#ifdef POSH_PUBLIC_API
+#  undef POSH_PUBLIC_API
+#endif
+
+#if ( ( defined _MSC_VER ) && ( _MSC_VER < 800 ) ) || ( defined __BORLANDC__ && ( __BORLANDC__ < 0x500 ) )
+#  define POSH_PUBLIC_API(rtype) extern rtype POSH_IMPORTEXPORT 
+#else
+#  define POSH_PUBLIC_API(rtype) extern POSH_IMPORTEXPORT rtype
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Try to infer endianess.  Basically we just go through the CPUs we know are
+** little endian, and assume anything that isn't one of those is big endian.
+** As a sanity check, we also do this with operating systems we know are
+** little endian, such as Windows.  Some processors are bi-endian, such as
+** the MIPS series, so we have to be careful about those.
+**
+** The #if/#else below defines exactly one of POSH_LITTLE_ENDIAN and
+** POSH_BIG_ENDIAN (to 1) together with POSH_ENDIAN_STRING.
+**
+** NOTE(review): POSH_CPU_IA64 is not part of the little-endian list, so an
+** IA64 build that does not also define POSH_OS_WIN32 or POSH_OS_WINCE is
+** classified as big endian -- confirm that this is intended.
+** ----------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_X86 || defined POSH_CPU_AXP || defined POSH_CPU_STRONGARM || defined POSH_OS_WIN32 || defined POSH_OS_WINCE || defined __MIPSEL__
+#  define POSH_ENDIAN_STRING "little"
+#  define POSH_LITTLE_ENDIAN 1
+#else
+#  define POSH_ENDIAN_STRING "big"
+#  define POSH_BIG_ENDIAN 1
+#endif
+
+/* FORCE_DOXYGEN runs get POSH_LITTLE_ENDIAN regardless of the result above */
+#if defined FORCE_DOXYGEN
+#  define POSH_LITTLE_ENDIAN
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Cross-platform compile time assertion macro
+**
+** POSH_COMPILE_TIME_ASSERT(name, x) declares a typedef for an int array named
+** _POSH_dummy_<name>.  The array has 1 element when x is true and -1 elements
+** when x is false; a negative array size is rejected by the compiler, which
+** is what turns a false condition into a compile error.  name only has to
+** make the typedef name unique.
+** ----------------------------------------------------------------------------
+*/
+#define POSH_COMPILE_TIME_ASSERT(name, x) typedef int _POSH_dummy_ ## name[(x) ? 1 : -1 ]
+
+/*
+** ----------------------------------------------------------------------------
+** 64-bit Integer
+**
+** We don't require 64-bit support, nor do we emulate its functionality, we
+** simply export it if it's available.  Since we can't count on <limits.h>
+** for 64-bit support, we ignore the POSH_USE_LIMITS_H directive.
+**
+** When a 64-bit type is found this section provides:
+**   - POSH_64BIT_INTEGER         defined to 1
+**   - posh_i64_t / posh_u64_t    the signed / unsigned 64-bit typedefs
+**   - POSH_I64(x) / POSH_U64(x)  wrappers for 64-bit constants
+**   - POSH_I64_PRINTF_PREFIX     the length modifier string for printf
+** If no branch matches, none of these are defined.
+** ----------------------------------------------------------------------------
+*/
+/* targets where plain long is used as the 64-bit type */
+#if defined ( __LP64__ ) || defined ( __powerpc64__ ) || defined POSH_CPU_SPARC64
+#  define POSH_64BIT_INTEGER 1
+typedef long posh_i64_t; 
+typedef unsigned long posh_u64_t;
+   /* here (and in the __int64 branch) the constant wrappers are plain casts
+      and x is not wrapped in parentheses of its own */
+#  define POSH_I64( x ) ((posh_i64_t)x)
+#  define POSH_U64( x ) ((posh_u64_t)x)
+#  define POSH_I64_PRINTF_PREFIX "l"
+/* compilers providing the __int64 extension */
+#elif defined _MSC_VER || defined __BORLANDC__ || defined __WATCOMC__ || ( defined __alpha && defined __DECC )
+#  define POSH_64BIT_INTEGER 1
+typedef __int64 posh_i64_t;
+typedef unsigned __int64 posh_u64_t;
+#  define POSH_I64( x ) ((posh_i64_t)x)
+#  define POSH_U64( x ) ((posh_u64_t)x)
+#  define POSH_I64_PRINTF_PREFIX "I64"
+/* compilers providing long long */
+#elif defined __GNUC__ || defined __MWERKS__ || defined __SUNPRO_C || defined __SUNPRO_CC || defined __APPLE_CC__ || defined POSH_OS_IRIX || defined _LONG_LONG || defined _CRAYC
+#  define POSH_64BIT_INTEGER 1
+typedef long long posh_i64_t;
+typedef unsigned long long posh_u64_t;
+   /* here the wrappers paste an LL suffix onto x, so x has to be something
+      that still forms a valid token with LL appended */
+#  define POSH_U64( x ) ((posh_u64_t)(x##LL))
+#  define POSH_I64( x ) ((posh_i64_t)(x##LL))
+#  define POSH_I64_PRINTF_PREFIX "ll"
+#endif
+
+/* hack */
+/* MinGW: whatever was chosen above, the constant wrappers are replaced by the
+   LL-suffix form and the printf prefix by "I64" */
+#ifdef __MINGW32__
+#  undef POSH_I64
+#  undef POSH_U64
+#  undef POSH_I64_PRINTF_PREFIX
+#  define POSH_U64( x ) ((posh_u64_t)(x##LL))
+#  define POSH_I64( x ) ((posh_i64_t)(x##LL))
+#  define POSH_I64_PRINTF_PREFIX "I64"
+#endif
+
+/* Declarations used for FORCE_DOXYGEN runs only.
+   NOTE(review): posh_u64_t is "unsigned long" here but posh_i64_t is
+   "long long" -- confirm whether the mismatch is intentional. */
+#ifdef FORCE_DOXYGEN
+typedef long long posh_i64_t;
+typedef unsigned long posh_u64_t;
+#  define POSH_64BIT_INTEGER
+#  define POSH_I64_PRINTF_PREFIX
+#  define POSH_I64(x)
+#  define POSH_U64(x)
+#endif
+
+/* The four limits below are built with POSH_I64() / POSH_U64() and are
+   defined unconditionally; they can only be expanded where those wrappers
+   exist. */
+/** Minimum value for a 64-bit signed integer */
+#define POSH_I64_MIN  POSH_I64(0x8000000000000000)
+/** Maximum value for a 64-bit signed integer */
+#define POSH_I64_MAX  POSH_I64(0x7FFFFFFFFFFFFFFF)
+/** Minimum value for a 64-bit unsigned integer */
+#define POSH_U64_MIN  POSH_U64(0)
+/** Maximum value for a 64-bit unsigned integer */
+#define POSH_U64_MAX  POSH_U64(0xFFFFFFFFFFFFFFFF)
+
+/* ----------------------------------------------------------------------------
+** Basic Sized Types
+**
+** These types are expected to be EXACTLY sized so you can use them for
+** serialization.
+**
+** When POSH_USE_LIMITS_H is defined the 16- and 32-bit types are selected by
+** checking USHRT_MAX, INT_MAX and LONG_MAX from <limits.h>.  Otherwise short
+** is used for the 16-bit types and int (long on PalmOS) for the 32-bit types.
+** ----------------------------------------------------------------------------
+*/
+#define POSH_FALSE 0
+#define POSH_TRUE  1
+
+typedef int            posh_bool_t;
+typedef unsigned char  posh_byte_t;
+
+/* NOTE: These assume that CHAR_BIT is 8!! */
+typedef unsigned char  posh_u8_t;
+typedef signed char    posh_i8_t;
+
+#if defined POSH_USE_LIMITS_H
+   /* <limits.h> names this macro CHAR_BIT.  An unknown identifier such as
+      CHAR_BITS evaluates to 0 inside #if, which silently disables the check. */
+#  if CHAR_BIT > 8
+#    error This machine uses 9-bit characters.  This is a warning, you can comment this out now.
+#  endif /* CHAR_BIT > 8 */
+
+/* 16-bit */
+#  if ( USHRT_MAX == 65535 )
+   typedef unsigned short posh_u16_t;
+   typedef short          posh_i16_t;
+#  else
+   /* Yes, in theory there could still be a 16-bit character type and shorts are
+      32-bits in size...if you find such an architecture, let me know =P */
+#    error No 16-bit type found
+#  endif
+
+/* 32-bit */
+#  if ( INT_MAX == 2147483647 )
+  typedef unsigned       posh_u32_t;
+  typedef int            posh_i32_t;
+#  elif ( LONG_MAX == 2147483647 )
+  typedef unsigned long  posh_u32_t;
+  typedef long           posh_i32_t;
+#  else
+   /* this has to be a preprocessor #error directive; a bare "error" line
+      would be handed to the compiler as ordinary source text */
+#    error No 32-bit type found
+#  endif
+
+#else /* POSH_USE_LIMITS_H */
+
+  typedef unsigned short posh_u16_t;
+  typedef short          posh_i16_t;
+
+#  if !defined POSH_OS_PALM
+  typedef unsigned       posh_u32_t;
+  typedef int            posh_i32_t;
+#  else
+  typedef unsigned long  posh_u32_t;
+  typedef long           posh_i32_t;
+#  endif
+#endif
+
+/** Minimum value for a byte */
+#define POSH_BYTE_MIN    0
+/** Maximum value for an 8-bit unsigned value */
+#define POSH_BYTE_MAX    255
+/** Minimum value for a 16-bit signed value */
+#define POSH_I16_MIN     ( ( posh_i16_t ) 0x8000 )
+/** Maximum value for a 16-bit signed value */
+#define POSH_I16_MAX     ( ( posh_i16_t ) 0x7FFF ) 
+/** Minimum value for a 16-bit unsigned value */
+#define POSH_U16_MIN     0
+/** Maximum value for a 16-bit unsigned value */
+#define POSH_U16_MAX     ( ( posh_u16_t ) 0xFFFF )
+/** Minimum value for a 32-bit signed value */
+#define POSH_I32_MIN     ( ( posh_i32_t ) 0x80000000 )
+/** Maximum value for a 32-bit signed value */
+#define POSH_I32_MAX     ( ( posh_i32_t ) 0x7FFFFFFF )
+/** Minimum value for a 32-bit unsigned value */
+#define POSH_U32_MIN     0
+/** Maximum value for a 32-bit unsigned value */
+#define POSH_U32_MAX     ( ( posh_u32_t ) 0xFFFFFFFF )
+
+/*
+** ----------------------------------------------------------------------------
+** Sanity checks on expected sizes
+**
+** Each line is a compile-time assertion, so a type with an unexpected size
+** stops the build instead of producing wrong serialization code.  The checks
+** are skipped for FORCE_DOXYGEN runs.
+** ----------------------------------------------------------------------------
+*/
+#if !defined FORCE_DOXYGEN
+
+POSH_COMPILE_TIME_ASSERT(posh_byte_t, sizeof(posh_byte_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_u8_t, sizeof(posh_u8_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_i8_t, sizeof(posh_i8_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_u16_t, sizeof(posh_u16_t) == 2);
+POSH_COMPILE_TIME_ASSERT(posh_i16_t, sizeof(posh_i16_t) == 2);
+POSH_COMPILE_TIME_ASSERT(posh_u32_t, sizeof(posh_u32_t) == 4);
+POSH_COMPILE_TIME_ASSERT(posh_i32_t, sizeof(posh_i32_t) == 4);
+
+/* float and double are only checked when floating point support is enabled */
+#if !defined POSH_NO_FLOAT
+   POSH_COMPILE_TIME_ASSERT(posh_testfloat_t, sizeof(float)==4 );
+   POSH_COMPILE_TIME_ASSERT(posh_testdouble_t, sizeof(double)==8);
+#endif
+
+/* the 64-bit typedefs only exist when POSH_64BIT_INTEGER is defined */
+#if defined POSH_64BIT_INTEGER
+   POSH_COMPILE_TIME_ASSERT(posh_u64_t, sizeof(posh_u64_t) == 8);
+   POSH_COMPILE_TIME_ASSERT(posh_i64_t, sizeof(posh_i64_t) == 8);
+#endif
+
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** 64-bit pointer support
+** ----------------------------------------------------------------------------
+*/
+/* POSH_64BIT_POINTER is defined to 1 by any of the three platform tests
+   below; the size of void* is then asserted to match. */
+
+/* POSH_CPU_AXP together with Tru64 or Linux. */
+#if defined POSH_CPU_AXP && ( defined POSH_OS_TRU64 || defined POSH_OS_LINUX )
+#  define POSH_64BIT_POINTER 1
+#endif
+
+/* POSH_CPU_X86_64 together with Linux. */
+#if defined POSH_CPU_X86_64 && defined POSH_OS_LINUX
+#  define POSH_64BIT_POINTER 1
+#endif
+
+/* SPARC64, Win64, or any of the listed compiler-provided 64-bit markers. */
+#if defined POSH_CPU_SPARC64 || defined POSH_OS_WIN64 || defined __64BIT__ || defined __LP64 || defined _LP64 || defined __LP64__ || defined _ADDR64 || defined _CRAYC
+#   define POSH_64BIT_POINTER 1
+#endif
+
+#if defined POSH_64BIT_POINTER
+   POSH_COMPILE_TIME_ASSERT( posh_64bit_pointer, sizeof( void * ) == 8 );
+#elif !defined FORCE_DOXYGEN
+/* if this assertion is hit then you're on a system that either has 64-bit
+   addressing and we didn't catch it, or you're on a system with 16-bit
+   pointers.  In the latter case, POSH doesn't actually care, we're just
+   triggering this assertion to make sure you're aware of the situation,
+   so feel free to delete it.
+
+   If this assertion is triggered on a known 32 or 64-bit platform, 
+   please let us know (poshlib@poshlib.org) */
+   POSH_COMPILE_TIME_ASSERT( posh_32bit_pointer, sizeof( void * ) == 4 );
+#endif
+
+/* Under FORCE_DOXYGEN the macro is defined (with no value) regardless of the
+   platform tests above. */
+#if defined FORCE_DOXYGEN
+#  define POSH_64BIT_POINTER
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** POSH Utility Functions
+**
+** These are optional POSH utility functions that are not required if you don't
+** need anything except static checking of your host and target environment.
+** 
+** These functions are NOT wrapped with POSH_PUBLIC_API because I didn't want
+** to enforce their export if your own library is only using them internally.
+** ----------------------------------------------------------------------------
+*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Returns a C string describing the detected environment; the archtest and
+   linktest programs print it with "%s". */
+const char *POSH_GetArchString( void );
+
+#if !defined POSH_NO_FLOAT
+
+/* Conversions between a float and a 32-bit pattern, in "little" and "big"
+   flavours.  NOTE(review): the names suggest the pattern is laid out in the
+   named byte order -- confirm against posh.c, which is not shown here. */
+posh_u32_t  POSH_LittleFloatBits( float f );
+posh_u32_t  POSH_BigFloatBits( float f );
+float       POSH_FloatFromLittleBits( posh_u32_t bits );
+float       POSH_FloatFromBigBits( posh_u32_t bits );
+
+/* Conversions between a double and an 8-byte buffer. */
+void        POSH_DoubleBits( double d, posh_byte_t dst[ 8 ] );
+double      POSH_DoubleFromBits( const posh_byte_t src[ 8 ] );
+
+/* unimplemented
+float      *POSH_WriteFloatToLittle( void *dst, float f );
+float      *POSH_WriteFloatToBig( void *dst, float f );
+float       POSH_ReadFloatFromLittle( const void *src );
+float       POSH_ReadFloatFromBig( const void *src );
+
+double     *POSH_WriteDoubleToLittle( void *dst, double d );
+double     *POSH_WriteDoubleToBig( void *dst, double d );
+double      POSH_ReadDoubleFromLittle( const void *src );
+double      POSH_ReadDoubleFromBig( const void *src );
+*/
+#endif /* !defined POSH_NO_FLOAT */
+
+#if defined FORCE_DOXYGEN
+#  define POSH_NO_FLOAT
+#  undef  POSH_NO_FLOAT
+#endif
+
+
+/* Swap functions for 16- and 32-bit values.  The POSH_Little and POSH_Big
+   macros at the end of this header call these whenever the requested byte
+   order is not the host's.  NOTE(review): presumably each reverses the byte
+   order of its argument -- confirm in posh.c. */
+extern posh_u16_t  POSH_SwapU16( posh_u16_t u );
+extern posh_i16_t  POSH_SwapI16( posh_i16_t u );
+extern posh_u32_t  POSH_SwapU32( posh_u32_t u );
+extern posh_i32_t  POSH_SwapI32( posh_i32_t u );
+
+#if defined POSH_64BIT_INTEGER
+
+/* 64-bit variants, declared only when a 64-bit integer type is available. */
+extern posh_u64_t  POSH_SwapU64( posh_u64_t u );
+extern posh_i64_t  POSH_SwapI64( posh_i64_t u );
+
+#endif /*POSH_64BIT_INTEGER */
+
+/* Writers: store `value` at `dst` in the byte order named by the function.
+   NOTE(review): what the returned pointer refers to is not visible from this
+   header -- check posh.c before relying on it. */
+extern posh_u16_t *POSH_WriteU16ToLittle( void *dst, posh_u16_t value );
+extern posh_i16_t *POSH_WriteI16ToLittle( void *dst, posh_i16_t value );
+extern posh_u32_t *POSH_WriteU32ToLittle( void *dst, posh_u32_t value );
+extern posh_i32_t *POSH_WriteI32ToLittle( void *dst, posh_i32_t value );
+
+extern posh_u16_t *POSH_WriteU16ToBig( void *dst, posh_u16_t value );
+extern posh_i16_t *POSH_WriteI16ToBig( void *dst, posh_i16_t value );
+extern posh_u32_t *POSH_WriteU32ToBig( void *dst, posh_u32_t value );
+extern posh_i32_t *POSH_WriteI32ToBig( void *dst, posh_i32_t value );
+
+/* Readers: return the value found at `src`, interpreted in the byte order
+   named by the function. */
+extern posh_u16_t  POSH_ReadU16FromLittle( const void *src );
+extern posh_i16_t  POSH_ReadI16FromLittle( const void *src );
+extern posh_u32_t  POSH_ReadU32FromLittle( const void *src );
+extern posh_i32_t  POSH_ReadI32FromLittle( const void *src );
+
+extern posh_u16_t  POSH_ReadU16FromBig( const void *src );
+extern posh_i16_t  POSH_ReadI16FromBig( const void *src );
+extern posh_u32_t  POSH_ReadU32FromBig( const void *src );
+extern posh_i32_t  POSH_ReadI32FromBig( const void *src );
+
+/* 64-bit writers and readers, declared only when POSH_64BIT_INTEGER is
+   defined. */
+#if defined POSH_64BIT_INTEGER
+extern posh_u64_t *POSH_WriteU64ToLittle( void *dst, posh_u64_t value );
+extern posh_i64_t *POSH_WriteI64ToLittle( void *dst, posh_i64_t value );
+extern posh_u64_t *POSH_WriteU64ToBig( void *dst, posh_u64_t value );
+extern posh_i64_t *POSH_WriteI64ToBig( void *dst, posh_i64_t value );
+
+extern posh_u64_t  POSH_ReadU64FromLittle( const void *src );
+extern posh_i64_t  POSH_ReadI64FromLittle( const void *src );
+extern posh_u64_t  POSH_ReadU64FromBig( const void *src );
+extern posh_i64_t  POSH_ReadI64FromBig( const void *src );
+#endif /* POSH_64BIT_INTEGER */
+
+/* Host <-> fixed byte order conversion macros.
+   When POSH_LITTLE_ENDIAN is defined, the POSH_Little macros expand to their
+   argument unchanged and the POSH_Big macros call the matching POSH_Swap
+   function; in the #else branch the roles are reversed.  The 64-bit forms
+   exist only when POSH_64BIT_INTEGER is defined. */
+#if defined POSH_LITTLE_ENDIAN
+
+#  define POSH_LittleU16(x) (x)
+#  define POSH_LittleU32(x) (x)
+#  define POSH_LittleI16(x) (x)
+#  define POSH_LittleI32(x) (x)
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_LittleU64(x) (x)
+#    define POSH_LittleI64(x) (x)
+#  endif /* defined POSH_64BIT_INTEGER */
+
+#  define POSH_BigU16(x) POSH_SwapU16(x)
+#  define POSH_BigU32(x) POSH_SwapU32(x)
+#  define POSH_BigI16(x) POSH_SwapI16(x)
+#  define POSH_BigI32(x) POSH_SwapI32(x)
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_BigU64(x) POSH_SwapU64(x)
+#    define POSH_BigI64(x) POSH_SwapI64(x)
+#  endif /* defined POSH_64BIT_INTEGER */
+
+#else /* POSH_LITTLE_ENDIAN not defined */
+
+#  define POSH_BigU16(x) (x)
+#  define POSH_BigU32(x) (x)
+#  define POSH_BigI16(x) (x)
+#  define POSH_BigI32(x) (x)
+
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_BigU64(x) (x)
+#    define POSH_BigI64(x) (x)
+#  endif /* POSH_64BIT_INTEGER */
+
+#  define POSH_LittleU16(x) POSH_SwapU16(x)
+#  define POSH_LittleU32(x) POSH_SwapU32(x)
+#  define POSH_LittleI16(x) POSH_SwapI16(x)
+#  define POSH_LittleI32(x) POSH_SwapI32(x)
+
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_LittleU64(x) POSH_SwapU64(x)
+#    define POSH_LittleI64(x) POSH_SwapI64(x)
+#  endif /* POSH_64BIT_INTEGER */
+
+#endif /* POSH_LITTLE_ENDIAN */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* HAVE_POSH_H */
--- a/src/nvcore/poshlib/tests/arch/archtest.c
+++ b/src/nvcore/poshlib/tests/arch/archtest.c
@ -0,0 +1,29 @@
+#include "../../posh.c"
+
+#include <stdio.h>
+
+/* archtest entry point: prints the POSH architecture string followed by the
+   minimum and maximum of every POSH integer type.  Always returns 0. */
+int main( void )
+{
+   /* banner */
+   fputs( "archtest:\n", stdout );
+   fputs( "--------\n", stdout );
+
+   /* detected architecture, printed exactly as POSH reports it */
+   printf( "%s", POSH_GetArchString() );
+
+   /* signed and byte ranges */
+   printf( "byte min:  %d\n", POSH_BYTE_MIN );
+   printf( "byte max:  %d\n", POSH_BYTE_MAX );
+   printf( "i16  min:  %d\n",  POSH_I16_MIN );
+   printf( "i16  max:  %d\n",  POSH_I16_MAX );
+   printf( "i32  min:  %d\n",  POSH_I32_MIN );
+   printf( "i32  max:  %d\n",  POSH_I32_MAX );
+
+   /* unsigned ranges */
+   printf( "u16  min:  %u\n", POSH_U16_MIN );
+   printf( "u16  max:  %u\n", POSH_U16_MAX );
+   printf( "u32  min:  %u\n", POSH_U32_MIN );
+   printf( "u32  max:  %u\n", POSH_U32_MAX );
+
+#ifdef POSH_64BIT_INTEGER
+   /* 64-bit ranges use the length prefix supplied by POSH */
+   printf( "i64  min:  %"POSH_I64_PRINTF_PREFIX"d\n", POSH_I64_MIN );
+   printf( "i64  max:  %"POSH_I64_PRINTF_PREFIX"d\n", POSH_I64_MAX );
+   printf( "u64  min:  %"POSH_I64_PRINTF_PREFIX"u\n", POSH_U64_MIN );
+   printf( "u64  max:  %"POSH_I64_PRINTF_PREFIX"u\n", POSH_U64_MAX );
+#endif
+
+   return 0;
+}
--- a/src/nvcore/poshlib/tests/linktest/linktest.cpp
+++ b/src/nvcore/poshlib/tests/linktest/linktest.cpp
@ -0,0 +1,23 @@
+#include "testlib.hpp"
+#include "testdll.h"
+
+#include <stdio.h>
+
+// linktest entry point: prints a description of the test, the POSH
+// architecture string, then calls one function from the static library and
+// one from the DLL.  Always returns 0.
+int main( void )
+{
+  // Fixed banner text.
+  fputs( "linktest:\n", stdout );
+  fputs( "---------\n", stdout );
+  fputs( "linktest is a simple verification test that tests:\n", stdout );
+  fputs( "  * correct linkage between C and C++\n", stdout );
+  fputs( "  * proper handling when multiple libs use posh\n", stdout );
+  fputs( "  * correct handling of DLL vs. LIB linkage (Windows)\n", stdout );
+  fputs( "\n\n", stdout );
+
+  printf( "POSH_GetArchString() reporting:\n%s\n\n", POSH_GetArchString() );
+
+  // Library first, DLL second.
+  TestLib_Foo();
+  TestDLL_Foo();
+
+  fputs( "\n\nlinktest succeeded!\n", stdout );
+
+  return 0;
+}
--- a/src/nvcore/poshlib/tests/linktest/testdll.h
+++ b/src/nvcore/poshlib/tests/linktest/testdll.h
@ -0,0 +1,20 @@
+#ifndef TESTDLL_H
+#define TESTDLL_H
+
+/* POSH_DLL is defined only for the duration of this include, since
+   poshtestdll is a DLL. */
+#define POSH_DLL 1
+#include "../../posh.h"
+/* Undefine so that another include of posh.h doesn't cause problems. */
+#undef POSH_DLL
+
+/* Export decoration for this DLL; it simply reuses POSH_PUBLIC_API. */
+#define TESTDLL_PUBLIC_API POSH_PUBLIC_API
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+/* Test entry point called by linktest. */
+TESTDLL_PUBLIC_API(void) TestDLL_Foo( void );
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TESTDLL_H */
--- a/src/nvcore/poshlib/tests/linktest/testlib.cpp
+++ b/src/nvcore/poshlib/tests/linktest/testlib.cpp
@ -0,0 +1,11 @@
+#define POSH_BUILDING_LIB 1
+#include "testlib.hpp"
+
+#include <stdio.h>
+
+// Prints a fixed confirmation line so linktest can show that the call into
+// the static library succeeded.
+void TestLib_Foo( void )
+{
+   fputs( "...TestLib_Foo called successfully!\n", stdout );
+}
+
+
--- a/src/nvcore/poshlib/tests/linktest/testlib.hpp
+++ b/src/nvcore/poshlib/tests/linktest/testlib.hpp
@ -0,0 +1,19 @@
+#ifndef TESTLIB_HPP
+#define TESTLIB_HPP
+
+#undef POSH_DLL
+#include "../../posh.h"
+
+#define TESTLIB_PUBLIC_API POSH_PUBLIC_API
+
+#if defined __cplusplus && defined POSH_DLL
+extern "C" {
+#endif
+
+TESTLIB_PUBLIC_API(void) TestLib_Foo( void );
+
+#if defined __cplusplus && defined POSH_DLL
+}
+#endif
+
+#endif /* TESTLIB_HPP */
--- a/src/nvimage/CMakeLists.txt
+++ b/src/nvimage/CMakeLists.txt
@ -0,0 +1,54 @@
+PROJECT(nvimage)
+
+# The nvtt subdirectory is processed as part of this project.
+SUBDIRS(nvtt)
+
+# Headers and sources that make up the nvimage library.
+SET(IMAGE_SRCS	
+	nvimage.h
+	FloatImage.h
+	FloatImage.cpp
+	Filter.h
+	Filter.cpp
+	Image.h
+	Image.cpp
+	ImageIO.h
+	ImageIO.cpp
+	ColorBlock.h
+	ColorBlock.cpp
+	HoleFilling.h
+	HoleFilling.cpp
+	DirectDrawSurface.h
+	DirectDrawSurface.cpp
+	Quantize.h
+	Quantize.cpp
+	NormalMap.h
+	NormalMap.cpp)
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# Optional image libraries: each one that was found adds its libraries to
+# LIBS and its include directory to the include path.
+IF(PNG_FOUND)
+	SET(LIBS ${LIBS} ${PNG_LIBRARIES})
+	INCLUDE_DIRECTORIES(${PNG_INCLUDE_DIR})
+ENDIF(PNG_FOUND)
+
+IF(JPEG_FOUND)
+	SET(LIBS ${LIBS} ${JPEG_LIBRARIES})
+	INCLUDE_DIRECTORIES(${JPEG_INCLUDE_DIR})
+ENDIF(JPEG_FOUND)
+
+IF(TIFF_FOUND)
+	SET(LIBS ${LIBS} ${TIFF_LIBRARIES})
+	INCLUDE_DIRECTORIES(${TIFF_INCLUDE_DIR})
+ENDIF(TIFF_FOUND)
+
+# targets
+# NVIMAGE_EXPORTS is defined for everything compiled in this directory.
+ADD_DEFINITIONS(-DNVIMAGE_EXPORTS)
+
+# Shared library when NVIMAGE_SHARED is set; otherwise ADD_LIBRARY's default
+# library type is used.
+IF(NVIMAGE_SHARED)	
+	ADD_LIBRARY(nvimage SHARED ${IMAGE_SRCS})
+ELSE(NVIMAGE_SHARED)
+	ADD_LIBRARY(nvimage ${IMAGE_SRCS})
+ENDIF(NVIMAGE_SHARED)
+
+# Link against the optional image libraries collected above plus the sibling
+# nvcore, nvmath and posh targets.
+TARGET_LINK_LIBRARIES(nvimage ${LIBS} nvcore nvmath posh)
+
+
--- a/src/nvimage/ColorBlock.cpp
+++ b/src/nvimage/ColorBlock.cpp
@ -0,0 +1,392 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvmath/Box.h>
+#include <nvimage/ColorBlock.h>
+#include <nvimage/Image.h>
+
+using namespace nv;
+
+namespace {
+	
+	// Cheap luminance estimate: the plain sum of the red, green and blue channels.
+	inline static uint colorLuminance(Color32 c)
+	{
+		const int total = c.r + c.g + c.b;
+		return total;
+	}
+	
+	// Squared euclidean distance between two colors in RGB space (alpha is ignored).
+	inline static uint colorDistance(Color32 c0, Color32 c1)
+	{
+		const int dr = c0.r - c1.r;
+		const int dg = c0.g - c1.g;
+		const int db = c0.b - c1.b;
+		return dr * dr + dg * dg + db * db;
+	}
+	
+} // namespace
+
+
+/// Default constructor.
+/// The body is empty, so nothing is assigned to the 16 colors here; fill the
+/// block (for example with init()) before reading from it.
+ColorBlock::ColorBlock()
+{
+}
+
+/// Copy constructor: duplicates all 16 colors of the given block.
+ColorBlock::ColorBlock(const ColorBlock & block)
+{
+	uint idx = 0;
+	while (idx < 16) {
+		m_color[idx] = block.m_color[idx];
+		idx++;
+	}
+}
+
+
+/// Construct a color block from the pixels of @a img starting at (x, y).
+/// All the work is delegated to init(), which has the same parameters.
+ColorBlock::ColorBlock(const Image * img, uint x, uint y)
+{
+	init(img, x, y);
+}
+
+/// Fill the block with the 4x4 region of @a img whose top-left pixel is (x, y).
+/// When fewer than 4 columns or rows remain in the image, the available
+/// pixels are repeated to fill the block.
+/// @param img  Source image; must not be NULL.
+/// @param x    Column of the block origin; must be less than the image width.
+/// @param y    Row of the block origin; must be less than the image height.
+void ColorBlock::init(const Image * img, uint x, uint y)
+{
+	nvDebugCheck(img != NULL);
+
+	// The origin has to lie inside the image. Without these checks an origin
+	// past the edge makes the unsigned subtractions below wrap around, the
+	// clamp to 4 hides the error, and pixels are read outside the image.
+	nvDebugCheck(x < img->width());
+	nvDebugCheck(y < img->height());
+	
+	const uint bw = min(img->width() - x, 4U);
+	const uint bh = min(img->height() - y, 4U);
+
+	nvDebugCheck(bw != 0);
+	nvDebugCheck(bh != 0);
+
+	// Row (size - 1) maps a block coordinate 0..3 to a source offset that is
+	// always smaller than size.
+	static const int remainder[] = {
+		0, 0, 0, 0,
+		0, 1, 0, 1,
+		0, 1, 2, 0,
+		0, 1, 2, 3,
+	};
+
+	// Blocks that are smaller than 4x4 are handled by repeating the pixels.
+	// @@ Thats only correct when block size is 1, 2 or 4, but not with 3.
+
+	for(uint i = 0; i < 4; i++) {
+		//const int by = i % bh;
+		const int by = remainder[(bh - 1) * 4 + i];
+		for(uint e = 0; e < 4; e++) {
+			//const int bx = e % bw;
+			const int bx = remainder[(bw - 1) * 4 + e];
+			color(e, i) = img->pixel(x + bx, y + by);
+		}
+	}
+}
+
+
+/// Rearrange every color for DXT5n: the red value moves to the second
+/// constructor slot, the green value to the fourth, and the first and third
+/// slots are set to 0.
+void ColorBlock::swizzleDXT5n()
+{
+	for(Color32 * p = m_color; p != m_color + 16; ++p)
+	{
+		const Color32 src = *p;
+		*p = Color32(0, src.r, 0, src.g);
+	}
+}
+
+/// Replace every color by one whose four channels all equal its red value.
+void ColorBlock::splatX()
+{
+	for(Color32 * p = m_color; p != m_color + 16; ++p)
+	{
+		const uint8 value = p->r;
+		*p = Color32(value, value, value, value);
+	}
+}
+
+/// Replace every color by one whose four channels all equal its green value.
+void ColorBlock::splatY()
+{
+	for(Color32 * p = m_color; p != m_color + 16; ++p)
+	{
+		const uint8 value = p->g;
+		*p = Color32(value, value, value, value);
+	}
+}
+
+
+/// Count number of unique colors in this color block.
+/// A color is counted the first time it appears; an entry that equals any
+/// earlier entry is not counted again.
+/// @return Number of distinct colors, between 1 and 16.
+uint ColorBlock::countUniqueColors() const
+{
+	uint count = 0;
+
+	// @@ This does not have to be o(n^2)
+	for(int i = 0; i < 16; i++)
+	{
+		bool unique = true;
+		for(int j = 0; j < i; j++) {
+			// Entry i is a duplicate as soon as one earlier entry matches it.
+			// The test is spelled with != because that is the comparison
+			// this function already relied on.
+			if( !(m_color[i] != m_color[j]) ) {
+				unique = false;
+				break;
+			}
+		}
+		
+		if( unique ) {
+			count++;
+		}
+	}
+	
+	return count;
+}
+
+/// Compute the per-channel average of the 16 colors (integer division, so
+/// each channel is rounded down).
+Color32 ColorBlock::averageColor() const
+{
+	uint sumR = 0;
+	uint sumG = 0;
+	uint sumB = 0;
+	uint sumA = 0;
+
+	for(const Color32 * c = m_color; c != m_color + 16; ++c) {
+		sumR += c->r;
+		sumG += c->g;
+		sumB += c->b;
+		sumA += c->a;
+	}
+	
+	const uint8 avgR = uint8(sumR / 16);
+	const uint8 avgG = uint8(sumG / 16);
+	const uint8 avgB = uint8(sumB / 16);
+	const uint8 avgA = uint8(sumA / 16);
+	return Color32(avgR, avgG, avgB, avgA);
+}
+
+
+/// Get diameter color range: the pair of block colors that are farthest
+/// apart according to colorDistance().
+/// @param start  Receives the first color of the pair; must not be NULL.
+/// @param end    Receives the second color of the pair; must not be NULL.
+/// When all 16 colors are at distance 0 from each other, both outputs
+/// receive the first color of the block.
+void ColorBlock::diameterRange(Color32 * start, Color32 * end) const
+{
+	nvDebugCheck(start != NULL);
+	nvDebugCheck(end != NULL);
+	
+	// Start from a valid pair: if no distance ever exceeds 0 the loop below
+	// assigns nothing, and the outputs must still be well defined.
+	Color32 c0 = m_color[0];
+	Color32 c1 = m_color[0];
+	uint best_dist = 0;
+	
+	for(int i = 0; i < 16; i++) {
+		for (int j = i+1; j < 16; j++) {
+			uint dist = colorDistance(m_color[i], m_color[j]);
+			if( dist > best_dist ) {
+				best_dist = dist;
+				c0 = m_color[i];
+				c1 = m_color[j];
+			}
+		}
+	}
+	
+	*start = c0;
+	*end = c1;
+}
+
+/// Get luminance color range: the block colors with the lowest and the
+/// highest colorLuminance().
+/// @param start  Receives the color with the lowest luminance; must not be NULL.
+/// @param end    Receives the color with the highest luminance; must not be NULL.
+void ColorBlock::luminanceRange(Color32 * start, Color32 * end) const
+{
+	nvDebugCheck(start != NULL);
+	nvDebugCheck(end != NULL);
+	
+	// Entry 0 seeds both the luminance bounds and the matching colors, so
+	// the outputs stay valid when entry 0 itself is the darkest or the
+	// brightest color of the block.
+	Color32 minColor = m_color[0];
+	Color32 maxColor = m_color[0];
+	uint minLuminance, maxLuminance;
+	
+	maxLuminance = minLuminance = colorLuminance(m_color[0]);
+	
+	for(uint i = 1; i < 16; i++)
+	{
+		uint luminance = colorLuminance(m_color[i]);
+		
+		if (luminance > maxLuminance) {
+			maxLuminance = luminance;
+			maxColor = m_color[i];
+		}
+		else if (luminance < minLuminance) {
+			minLuminance = luminance;
+			minColor = m_color[i];
+		}
+	}
+
+	*start = minColor;
+	*end = maxColor;
+}
+
+/// Get the color range given by the RGB bounding box of the block, with each
+/// side moved inwards by 1/16 of the box extent. Alpha is not examined.
+/// @param start  Receives the inset minimum corner; must not be NULL.
+/// @param end    Receives the inset maximum corner; must not be NULL.
+void ColorBlock::boundsRange(Color32 * start, Color32 * end) const
+{
+	nvDebugCheck(start != NULL);
+	nvDebugCheck(end != NULL);
+
+	Color32 lo(255, 255, 255);
+	Color32 hi(0, 0, 0);
+
+	// Per-channel minimum and maximum over the 16 colors.
+	for(uint i = 0; i < 16; i++)
+	{
+		const Color32 & c = m_color[i];
+
+		if (c.r < lo.r) { lo.r = c.r; }
+		if (c.r > hi.r) { hi.r = c.r; }
+
+		if (c.g < lo.g) { lo.g = c.g; }
+		if (c.g > hi.g) { hi.g = c.g; }
+
+		if (c.b < lo.b) { lo.b = c.b; }
+		if (c.b > hi.b) { hi.b = c.b; }
+	}
+
+	// Inset amount: 1/16 of the extent of each channel.
+	Color32 inset;
+	inset.r = (hi.r - lo.r) >> 4;
+	inset.g = (hi.g - lo.g) >> 4;
+	inset.b = (hi.b - lo.b) >> 4;
+
+	// Raise the minimum corner, saturating at 255.
+	if (lo.r + inset.r <= 255) { lo.r += inset.r; } else { lo.r = 255; }
+	if (lo.g + inset.g <= 255) { lo.g += inset.g; } else { lo.g = 255; }
+	if (lo.b + inset.b <= 255) { lo.b += inset.b; } else { lo.b = 255; }
+
+	// Lower the maximum corner, saturating at 0.
+	if (hi.r >= inset.r) { hi.r -= inset.r; } else { hi.r = 0; }
+	if (hi.g >= inset.g) { hi.g -= inset.g; } else { hi.g = 0; }
+	if (hi.b >= inset.b) { hi.b -= inset.b; } else { hi.b = 0; }
+
+	*start = lo;
+	*end = hi;
+}
+
+/// Get the color range given by the RGBA bounding box of the block, with
+/// each side moved inwards by 1/16 of the box extent. Unlike boundsRange()
+/// the alpha channel is included.
+/// @param start  Receives the inset minimum corner; must not be NULL.
+/// @param end    Receives the inset maximum corner; must not be NULL.
+void ColorBlock::boundsRangeAlpha(Color32 * start, Color32 * end) const
+{
+	nvDebugCheck(start != NULL);
+	nvDebugCheck(end != NULL);
+
+	Color32 lo(255, 255, 255, 255);
+	Color32 hi(0, 0, 0, 0);
+
+	// Per-channel minimum and maximum over the 16 colors.
+	for(uint i = 0; i < 16; i++)
+	{
+		const Color32 & c = m_color[i];
+
+		if (c.r < lo.r) { lo.r = c.r; }
+		if (c.r > hi.r) { hi.r = c.r; }
+
+		if (c.g < lo.g) { lo.g = c.g; }
+		if (c.g > hi.g) { hi.g = c.g; }
+
+		if (c.b < lo.b) { lo.b = c.b; }
+		if (c.b > hi.b) { hi.b = c.b; }
+
+		if (c.a < lo.a) { lo.a = c.a; }
+		if (c.a > hi.a) { hi.a = c.a; }
+	}
+
+	// Inset amount: 1/16 of the extent of each channel.
+	Color32 inset;
+	inset.r = (hi.r - lo.r) >> 4;
+	inset.g = (hi.g - lo.g) >> 4;
+	inset.b = (hi.b - lo.b) >> 4;
+	inset.a = (hi.a - lo.a) >> 4;
+
+	// Raise the minimum corner, saturating at 255.
+	if (lo.r + inset.r <= 255) { lo.r += inset.r; } else { lo.r = 255; }
+	if (lo.g + inset.g <= 255) { lo.g += inset.g; } else { lo.g = 255; }
+	if (lo.b + inset.b <= 255) { lo.b += inset.b; } else { lo.b = 255; }
+	if (lo.a + inset.a <= 255) { lo.a += inset.a; } else { lo.a = 255; }
+
+	// Lower the maximum corner, saturating at 0.
+	if (hi.r >= inset.r) { hi.r -= inset.r; } else { hi.r = 0; }
+	if (hi.g >= inset.g) { hi.g -= inset.g; } else { hi.g = 0; }
+	if (hi.b >= inset.b) { hi.b -= inset.b; } else { hi.b = 0; }
+	if (hi.a >= inset.a) { hi.a -= inset.a; } else { hi.a = 0; }
+	
+	*start = lo;
+	*end = hi;
+}
+
+
+/// Get the color range along the direction of the best fit line: the fit
+/// comes from bestFitLine() and the extremes from computeRange().
+/// @param start  Receives the color with the smallest projection; must not be NULL.
+/// @param end    Receives the color with the largest projection; must not be NULL.
+void ColorBlock::bestFitRange(Color32 * start, Color32 * end) const
+{
+	nvDebugCheck(start != NULL);
+	nvDebugCheck(end != NULL);
+
+	Line3 fit = bestFitLine();
+	const Vector3 dir = fit.direction();
+	computeRange(dir, start, end);
+}
+
+/// Sort colors by absolute value in their 16 bit representation, largest
+/// value first.
+void ColorBlock::sortColorsByAbsoluteValue()
+{
+	// Plain selection sort: put the largest remaining color into slot `dst`.
+	for( uint dst = 0; dst < 16; dst++ ) {
+		uint best = dst;
+		Color16 bestValue(m_color[dst]);
+		
+		for( uint src = dst + 1; src < 16; src++ ) {
+			Color16 candidate(m_color[src]);
+			
+			if( candidate.u <= bestValue.u ) {
+				continue;
+			}
+			best = src;
+			bestValue = candidate;
+		}
+		swap( m_color[dst], m_color[best] );
+	}
+}
+
+
+/// Find the two block colors whose RGB vectors have the smallest and the
+/// largest dot product with @a axis.
+/// @param axis   Direction to project the colors onto.
+/// @param start  Receives the color with the smallest projection; must not be NULL.
+/// @param end    Receives the color with the largest projection; must not be NULL.
+void ColorBlock::computeRange(Vector3::Arg axis, Color32 * start, Color32 * end) const
+{
+	nvDebugCheck(start != NULL);
+	nvDebugCheck(end != NULL);
+	
+	// Entry 0 seeds both extremes.
+	uint lowIndex = 0;
+	uint highIndex = 0;
+	float lowest = dot(Vector3(m_color[0].r, m_color[0].g, m_color[0].b), axis);
+	float highest = lowest;
+
+	for(uint i = 1; i < 16; i++)
+	{
+		const float projection = dot(Vector3(m_color[i].r, m_color[i].g, m_color[i].b), axis);
+		
+		if( projection < lowest ) {
+			lowest = projection;
+			lowIndex = i;
+		}
+		else if( projection > highest ) {
+			highest = projection;
+			highIndex = i;
+		}
+	}
+	
+	*start = m_color[lowIndex];
+	*end = m_color[highIndex];
+}
+
+
+/// Sort the colors by the dot product of their RGB vector with @a axis,
+/// smallest projection first.
+void ColorBlock::sortColors(const Vector3 & axis)
+{
+	// Projection of every color onto the axis; kept in step with m_color
+	// while sorting.
+	float projection[16];
+	
+	for(uint i = 0; i < 16; i++) {
+		projection[i] = dot(Vector3(m_color[i].r, m_color[i].g, m_color[i].b), axis);
+	}
+	
+	// Plain selection sort: put the smallest remaining projection into `slot`.
+	for( uint slot = 0; slot < 16; slot++ ) {
+		uint lowest = slot;
+		for( uint probe = slot + 1; probe < 16; probe++ ) {
+			if( projection[probe] < projection[lowest] ) {
+				lowest = probe;
+			}
+		}
+		swap( projection[slot], projection[lowest] );
+		swap( m_color[slot], m_color[lowest] );
+	}
+}
+
+
+/// Get the line that Fit::bestLine() computes for the RGB points of the
+/// color block (alpha is not used).
+Line3 ColorBlock::bestFitLine() const
+{
+	// NOTE(review): assumes Array(16) only reserves room, so that the 16
+	// append() calls below yield exactly 16 points -- confirm in nvcore.
+	Array<Vector3> points(16);
+	
+	for(const Color32 * c = m_color; c != m_color + 16; ++c) {
+		points.append(Vector3(c->r, c->g, c->b));
+	}
+	
+	return Fit::bestLine(points);
+}
+
+
+/// Get the volume of the box that bounds the RGB points of the color block.
+float ColorBlock::volume() const
+{
+	Box extent;
+	extent.clearBounds();
+	
+	for(const Color32 * c = m_color; c != m_color + 16; ++c) {
+		extent.addPointToBounds(Vector3(c->r, c->g, c->b));
+	}
+	
+	return extent.volume();
+}
+
+
--- a/src/nvimage/ColorBlock.h
+++ b/src/nvimage/ColorBlock.h
@ -0,0 +1,96 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_COLORBLOCK_H
+#define NV_IMAGE_COLORBLOCK_H
+
+#include <nvmath/Color.h>
+#include <nvmath/Fitting.h>	// Line3
+
+namespace nv
+{
+	class Image;
+
+	/// Uncompressed 4x4 color block.
+	/// Holds 16 Color32 values; the (x, y) accessors address entry y * 4 + x.
+	struct ColorBlock
+	{
+		// Construction. The default constructor assigns no colors.
+		ColorBlock();
+		ColorBlock(const ColorBlock & block);
+		ColorBlock(const Image * img, uint x, uint y);
+		
+		// Load the block from the image region that starts at (x, y).
+		void init(const Image * img, uint x, uint y);
+		
+		// In-place channel rearrangement of all 16 colors.
+		void swizzleDXT5n();
+		void splatX();
+		void splatY();
+		
+		// Whole-block statistics.
+		uint countUniqueColors() const;
+		Color32 averageColor() const;
+		
+		// Different ways to choose a pair of end colors for the block; the
+		// results are written to *start and *end, which must not be NULL.
+		void diameterRange(Color32 * start, Color32 * end) const;
+		void luminanceRange(Color32 * start, Color32 * end) const;
+		void boundsRange(Color32 * start, Color32 * end) const;
+		void boundsRangeAlpha(Color32 * start, Color32 * end) const;
+		void bestFitRange(Color32 * start, Color32 * end) const;
+		
+		// In-place reordering of the colors.
+		void sortColorsByAbsoluteValue();
+		
+		void computeRange(const Vector3 & axis, Color32 * start, Color32 * end) const;
+		void sortColors(const Vector3 & axis);
+		
+		Line3 bestFitLine() const;
+		float volume() const;
+		// NOTE(review): no definition of diameterLine() appears in
+		// ColorBlock.cpp as added by this change -- confirm it is defined
+		// elsewhere before calling it.
+		Line3 diameterLine() const;
+		
+		// Accessors
+		const Color32 * colors() const;
+
+		// Linear index, 0..15.
+		Color32 color(uint i) const;
+		Color32 & color(uint i);
+		
+		// Column x and row y, each 0..3.
+		Color32 color(uint x, uint y) const;
+		Color32 & color(uint x, uint y);
+		
+	private:
+		
+		// The 16 colors, one row of 4 after another.
+		Color32 m_color[4*4];
+		
+	};
+	
+
+	/// Get pointer to block colors (the first of the 16 entries).
+	inline const Color32 * ColorBlock::colors() const
+	{
+		return m_color;
+	}
+	
+	/// Get a copy of the block color at linear index @a i (must be below 16).
+	inline Color32 ColorBlock::color(uint i) const
+	{
+		nvDebugCheck(i < 16);
+		return m_color[i];
+	}
+	
+	/// Get a writable reference to the block color at linear index @a i
+	/// (must be below 16).
+	inline Color32 & ColorBlock::color(uint i)
+	{
+		nvDebugCheck(i < 16);
+		return m_color[i];
+	}
+	
+	/// Get a copy of the block color at column @a x, row @a y (both below 4).
+	inline Color32 ColorBlock::color(uint x, uint y) const
+	{
+		nvDebugCheck(x < 4 && y < 4);
+		return m_color[y * 4 + x];
+	}
+	
+	/// Get a writable reference to the block color at column @a x, row @a y
+	/// (both below 4).
+	inline Color32 & ColorBlock::color(uint x, uint y)
+	{
+		nvDebugCheck(x < 4 && y < 4);
+		return m_color[y * 4 + x];
+	}
+	
+} // nv namespace
+
+#endif // NV_IMAGE_COLORBLOCK_H
--- a/src/nvimage/DirectDrawSurface.cpp
+++ b/src/nvimage/DirectDrawSurface.cpp
@ -0,0 +1,258 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Debug.h>
+
+#include <nvimage/DirectDrawSurface.h>
+
+#include <string.h> // memset
+
+
+using namespace nv;
+
+// Pack four characters into one 32-bit code: ch0 occupies the lowest byte and
+// ch3 the highest. Only defined here when no MAKEFOURCC macro exists yet.
+#if !defined(MAKEFOURCC)
+#	define MAKEFOURCC(ch0, ch1, ch2, ch3) \
+		(uint(uint8(ch0)) | (uint(uint8(ch1)) << 8) | \
+		(uint(uint8(ch2)) << 16) | (uint(uint8(ch3)) << 24 ))
+#endif
+
+// File-local constants for the DDS header fields written by DDSHeader.
+namespace
+{
+	// Four character codes: the file magic and the compressed pixel formats.
+	static const uint FOURCC_DDS = MAKEFOURCC('D', 'D', 'S', ' ');
+	static const uint FOURCC_DXT1 = MAKEFOURCC('D', 'X', 'T', '1');
+	static const uint FOURCC_DXT2 = MAKEFOURCC('D', 'X', 'T', '2');
+	static const uint FOURCC_DXT3 = MAKEFOURCC('D', 'X', 'T', '3');
+	static const uint FOURCC_DXT4 = MAKEFOURCC('D', 'X', 'T', '4');
+	static const uint FOURCC_DXT5 = MAKEFOURCC('D', 'X', 'T', '5');
+	static const uint FOURCC_RXGB = MAKEFOURCC('R', 'X', 'G', 'B');
+	static const uint FOURCC_ATI1 = MAKEFOURCC('A', 'T', 'I', '1');
+	static const uint FOURCC_ATI2 = MAKEFOURCC('A', 'T', 'I', '2');
+
+	// Bits of DDSHeader::flags: which header fields carry valid data.
+	static const uint DDSD_CAPS = 0x00000001U;
+	static const uint DDSD_PIXELFORMAT = 0x00001000U;
+	static const uint DDSD_WIDTH = 0x00000004U;
+	static const uint DDSD_HEIGHT = 0x00000002U;
+	static const uint DDSD_PITCH = 0x00000008U;
+	static const uint DDSD_MIPMAPCOUNT = 0x00020000U;
+	static const uint DDSD_LINEARSIZE = 0x00080000U;
+	static const uint DDSD_DEPTH = 0x00800000U;
+		
+	// Bits of DDSCaps::caps1 and DDSCaps::caps2.
+	static const uint DDSCAPS_COMPLEX = 0x00000008U;
+	static const uint DDSCAPS_TEXTURE = 0x00001000U;
+	static const uint DDSCAPS_MIPMAP = 0x00400000U;
+	static const uint DDSCAPS2_VOLUME = 0x00200000U;
+	static const uint DDSCAPS2_CUBEMAP = 0x00000200U;
+
+	// One caps2 bit per cube map face.
+	static const uint DDSCAPS2_CUBEMAP_POSITIVEX = 0x00000400U;
+	static const uint DDSCAPS2_CUBEMAP_NEGATIVEX = 0x00000800U;
+	static const uint DDSCAPS2_CUBEMAP_POSITIVEY = 0x00001000U;
+	static const uint DDSCAPS2_CUBEMAP_NEGATIVEY = 0x00002000U;
+	static const uint DDSCAPS2_CUBEMAP_POSITIVEZ = 0x00004000U;
+	static const uint DDSCAPS2_CUBEMAP_NEGATIVEZ = 0x00008000U;
+	// All six face bits ORed together (0x400 through 0x8000). The previous
+	// value 0xF000 left out the +X and -X faces.
+	static const uint DDSCAPS2_CUBEMAP_ALL_FACES = 0x0000FC00U;
+
+	// Bits of DDSPixelFormat::flags.
+	static const uint DDPF_RGB = 0x00000040U;
+	static const uint DDPF_FOURCC = 0x00000004U;
+	static const uint DDPF_ALPHAPIXELS = 0x00000001U;
+}
+
+/// Build a header for an empty texture: magic and structure sizes are set,
+/// only the CAPS and PIXELFORMAT flags are raised, and every dimension,
+/// pixel format field and capability other than DDSCAPS_TEXTURE is zero.
+DDSHeader::DDSHeader()
+{
+	// Fixed identification and sizes.
+	fourcc = FOURCC_DDS;
+	size = 124;
+	flags = (DDSD_CAPS|DDSD_PIXELFORMAT);
+
+	// No dimensions yet; the set* helpers fill these in.
+	height = 0;
+	width = 0;
+	pitch = 0;
+	depth = 0;
+	mipmapcount = 0;
+
+	memset(reserved, 0, sizeof(reserved));
+
+	// Store version information on the reserved header attributes.
+	reserved[9] = MAKEFOURCC('N', 'V', 'T', 'T');
+	reserved[10] = (0 << 16) | (1 << 8) | (0);	// major.minor.revision
+
+	// Pixel format: only the structure size is known at this point.
+	pf.size = 32;
+	pf.flags = 0;
+	pf.fourcc = 0;
+	pf.bitcount = 0;
+	pf.rmask = 0;
+	pf.gmask = 0;
+	pf.bmask = 0;
+	pf.amask = 0;
+
+	// Capabilities: a plain texture.
+	caps.caps1 = DDSCAPS_TEXTURE;
+	caps.caps2 = 0;
+	caps.caps3 = 0;
+	caps.caps4 = 0;
+
+	notused = 0;
+}
+
+/// Store the texture width and mark the width field as valid.
+void DDSHeader::setWidth(uint w)
+{
+	width = w;
+	flags |= DDSD_WIDTH;
+}
+
+/// Store the texture height and mark the height field as valid.
+void DDSHeader::setHeight(uint h)
+{
+	height = h;
+	flags |= DDSD_HEIGHT;
+}
+
+/// Store the texture depth and mark the depth field as valid.
+/// @param d  Depth of the texture.
+void DDSHeader::setDepth(uint d)
+{
+	this->flags |= DDSD_DEPTH;
+	// The value belongs in the depth field; it used to be written to height,
+	// which both lost the depth and clobbered a height set earlier.
+	this->depth = d;
+}
+
+/// Store the number of mipmaps and update the flags and caps1 to match.
+/// A count of 0 clears the mipmap flag and resets caps1 to a plain texture,
+/// keeping DDSCAPS_COMPLEX only when caps2 is non-zero.
+void DDSHeader::setMipmapCount(uint count)
+{
+	if (count != 0)
+	{
+		flags |= DDSD_MIPMAPCOUNT;
+		mipmapcount = count;
+
+		caps.caps1 |= DDSCAPS_COMPLEX | DDSCAPS_MIPMAP;
+		return;
+	}
+
+	flags &= ~DDSD_MIPMAPCOUNT;
+	mipmapcount = 0;
+
+	caps.caps1 = (caps.caps2 == 0) ? DDSCAPS_TEXTURE : (DDSCAPS_TEXTURE | DDSCAPS_COMPLEX);
+}
+
+/// Mark the header as describing a 2D texture.
+/// Intentionally empty: the header needs no change for this case.
+void DDSHeader::setTexture2D()
+{
+	// nothing to do here.
+}
+
+/// Mark the header as describing a volume texture by replacing caps2 with
+/// the volume bit.
+void DDSHeader::setTexture3D()
+{
+	caps.caps2 = DDSCAPS2_VOLUME;
+}
+
+/// Mark the header as describing a cube map: caps2 is replaced with the cube
+/// map bit plus DDSCAPS2_CUBEMAP_ALL_FACES, and caps1 gains the complex bit.
+void DDSHeader::setTextureCube()
+{
+	caps.caps2 = DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_ALL_FACES;
+	caps.caps1 |= DDSCAPS_COMPLEX;
+}
+
+/// Store a linear size in the pitch field; the LINEARSIZE flag is raised and
+/// the PITCH flag cleared, since both share the same field.
+void DDSHeader::setLinearSize(uint size)
+{
+	flags = (flags & ~DDSD_PITCH) | DDSD_LINEARSIZE;
+	pitch = size;
+}
+
+/// Store a pitch in the pitch field; the PITCH flag is raised and the
+/// LINEARSIZE flag cleared, since both share the same field.
+void DDSHeader::setPitch(uint pitch)
+{
+	flags = (flags & ~DDSD_LINEARSIZE) | DDSD_PITCH;
+	this->pitch = pitch;	// the parameter hides the member of the same name
+}
+
+/// Select a four character code pixel format. The bit count and all channel
+/// masks are reset to 0.
+void DDSHeader::setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3)
+{
+	pf.flags = DDPF_FOURCC;
+	pf.fourcc = MAKEFOURCC(c0, c1, c2, c3);
+
+	// A fourcc format carries no explicit bit layout.
+	pf.bitcount = 0;
+	pf.rmask = 0;
+	pf.gmask = 0;
+	pf.bmask = 0;
+	pf.amask = 0;
+}
+
+/// Select an uncompressed pixel format described by channel masks.
+/// @param bitcount  Bits per pixel; pass 0 to derive it from the position of
+///                  the highest bit set in any mask.
+/// @param rmask, gmask, bmask, amask  Channel masks; no two may share a bit.
+///                  A non-zero amask also raises the alpha pixels flag.
+void DDSHeader::setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask)
+{
+	// Make sure the masks are correct.
+	nvCheck((rmask & gmask) == 0);
+	nvCheck((rmask & bmask) == 0);
+	nvCheck((rmask & amask) == 0);
+	nvCheck((gmask & bmask) == 0);
+	nvCheck((gmask & amask) == 0);
+	nvCheck((bmask & amask) == 0);
+
+	pf.flags = (amask != 0) ? (DDPF_RGB | DDPF_ALPHAPIXELS) : DDPF_RGB;
+
+	if (bitcount == 0)
+	{
+		// Compute bit count from the masks.
+		for (uint bits = rmask | gmask | bmask | amask; bits != 0; bits >>= 1) {
+			bitcount++;
+		}
+		// @@ Align to 8?
+	}
+
+	pf.fourcc = 0;
+	pf.bitcount = bitcount;
+	pf.rmask = rmask;
+	pf.gmask = gmask;
+	pf.bmask = bmask;
+	pf.amask = amask;
+}
+
+
+/// Pass every 32-bit field of the header, including the pixel format and the
+/// capabilities, through POSH_LittleU32.
+void DDSHeader::swapBytes()
+{
+	// Top-level fields.
+	fourcc = POSH_LittleU32(fourcc);
+	size = POSH_LittleU32(size);
+	flags = POSH_LittleU32(flags);
+	height = POSH_LittleU32(height);
+	width = POSH_LittleU32(width);
+	pitch = POSH_LittleU32(pitch);
+	depth = POSH_LittleU32(depth);
+	mipmapcount = POSH_LittleU32(mipmapcount);
+	
+	for(int i = 0; i < 11; i++) {
+		reserved[i] = POSH_LittleU32(reserved[i]);
+	}
+
+	// Pixel format.
+	pf.size = POSH_LittleU32(pf.size);
+	pf.flags = POSH_LittleU32(pf.flags);
+	pf.fourcc = POSH_LittleU32(pf.fourcc);
+	pf.bitcount = POSH_LittleU32(pf.bitcount);
+	pf.rmask = POSH_LittleU32(pf.rmask);
+	pf.gmask = POSH_LittleU32(pf.gmask);
+	pf.bmask = POSH_LittleU32(pf.bmask);
+	pf.amask = POSH_LittleU32(pf.amask);
+
+	// Capabilities.
+	caps.caps1 = POSH_LittleU32(caps.caps1);
+	caps.caps2 = POSH_LittleU32(caps.caps2);
+	caps.caps3 = POSH_LittleU32(caps.caps3);
+	caps.caps4 = POSH_LittleU32(caps.caps4);
+
+	notused = POSH_LittleU32(notused);
+}
+
--- a/src/nvimage/DirectDrawSurface.h
+++ b/src/nvimage/DirectDrawSurface.h
@ -0,0 +1,85 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_IMAGE_DIRECTDRAWSURFACE_H
+#define NV_IMAGE_DIRECTDRAWSURFACE_H
+
+#include <nvcore/nvcore.h>
+
+namespace nv
+{
+
+	/// Pixel format part of the DDS header: eight 32-bit fields.
+	struct DDSPixelFormat {
+		uint size;		// set to 32 by the DDSHeader constructor
+		uint flags;		// DDPF_* bits, see DirectDrawSurface.cpp
+		uint fourcc;	// four character code, 0 for mask-based formats
+		uint bitcount;	// bits per pixel, 0 for fourcc formats
+		uint rmask;
+		uint gmask;
+		uint bmask;
+		uint amask;
+	};
+
+	/// Capability part of the DDS header. The helpers in DDSHeader only ever
+	/// set bits in caps1 and caps2; caps3 and caps4 stay 0.
+	struct DDSCaps {
+		uint caps1;
+		uint caps2;
+		uint caps3;
+		uint caps4;
+	};
+
+	/// DDS file header.
+	/// A plain sequence of 32-bit fields; the helper methods keep `flags` and
+	/// `caps` in step with the fields they set.
+	struct DDSHeader {
+		uint fourcc;		// file magic, set to 'DDS ' by the constructor
+		uint size;			// set to 124 by the constructor
+		uint flags;			// DDSD_* bits telling which fields are valid
+		uint height;
+		uint width;
+		uint pitch;			// pitch or linear size, depending on flags
+		uint depth;
+		uint mipmapcount;
+		uint reserved[11];	// entries 9 and 10 hold an 'NVTT' tag and a version
+		DDSPixelFormat pf;
+		DDSCaps caps;
+		uint notused;
+
+		// Helper methods.
+		DDSHeader();
+		void setWidth(uint w);
+		void setHeight(uint h);
+		void setDepth(uint d);
+		void setMipmapCount(uint count);
+		void setLinearSize(uint size);
+		void setPitch(uint pitch);
+		void setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3);
+		void setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask);
+		void setTexture2D();
+		void setTexture3D();
+		void setTextureCube();
+		
+		// Applies POSH_LittleU32 to every field.
+		void swapBytes();
+	};
+
+
+} // nv namespace
+
+#endif // NV_IMAGE_DIRECTDRAWSURFACE_H
--- a/src/nvimage/Filter.cpp
+++ b/src/nvimage/Filter.cpp
@ -0,0 +1,572 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+/** @file Filter.cpp
+ * @brief Image filters.
+ *
+ * Jonathan Blow articles:
+ * http://number-none.com/product/Mipmapping, Part 1/index.html
+ * http://number-none.com/product/Mipmapping, Part 2/index.html
+ *
+ * References from Thatcher Ulrich:
+ * See _Graphics Gems III_ "General Filtered Image Rescaling", Dale A. Schumacher
+ *
+ * References from Paul Heckbert:
+ * A.V. Oppenheim, R.W. Schafer, Digital Signal Processing, Prentice-Hall, 1975
+ *
+ * R.W. Hamming, Digital Filters, Prentice-Hall, Englewood Cliffs, NJ, 1983
+ *
+ * W.K. Pratt, Digital Image Processing, John Wiley and Sons, 1978
+ *
+ * H.S. Hou, H.C. Andrews, "Cubic Splines for Image Interpolation and
+ *	Digital Filtering", IEEE Trans. Acoustics, Speech, and Signal Proc.,
+ *	vol. ASSP-26, no. 6, Dec. 1978, pp. 508-517
+ *
+ * Paul Heckbert's zoom library.
+ * http://www.xmission.com/~legalize/zoom.html
+ * 
+ * Reconstruction Filters in Computer Graphics
+ * http://www.mentallandscape.com/Papers_siggraph88.pdf 
+ *
+ */
+
+
+#include <nvcore/Containers.h>	// swap
+#include <nvmath/nvmath.h>	// fabs
+#include <nvmath/Vector.h>	// Vector4
+#include <nvimage/Filter.h>
+
+using namespace nv;
+
+namespace
+{
+
+// Box filter: 1 on the closed interval [-0.5, 0.5], 0 everywhere else.
+// support = 0.5
+inline static float filter_box(float x)
+{
+	const bool inside = (x >= -0.5f) && (x <= 0.5);
+	return inside ? 1.0f : 0.0f;
+}
+
+// Triangle (tent) filter: 1 - |x| inside [-1, 1), 0 outside.
+// support = 1.0
+inline static float filter_triangle(float x)
+{
+	// Written as a negated range test so that NaN also yields 0.
+	if( !(x >= -1.0f && x < 1.0f) ) {
+		return 0.0f;
+	}
+	return (x < 0.0f) ? (1.0f + x) : (1.0f - x);
+}
+
+// Quadratic (bell) filter, symmetric around zero:
+//   0.75 - x^2            for |x| < 0.5
+//   0.5 * (|x| - 1.5)^2   for |x| < 1.5
+// support = 1.5
+inline static float filter_quadratic(float x)
+{
+	const float ax = (x < 0.0f) ? -x : x;
+	if( ax < 0.5f ) {
+		return 0.75f - ax * ax;
+	}
+	if( ax < 1.5f ) {
+		const float d = ax - 1.5f;
+		return 0.5f * d * d;
+	}
+	return 0.0f;
+}
+
+// @@ Filter from tulrich.
+// Cubic falloff f(t) = 2|t|^3 - 3|t|^2 + 1 for |t| < 1, 0 outside.
+// support 1.0
+inline static float filter_cubic(float x)
+{
+	const float t = (x < 0.0f) ? -x : x;
+	if( t < 1.0f ) {
+		// Horner form of the polynomial above.
+		return (2.0f * t - 3.0f) * t * t + 1.0f;
+	}
+	return 0.0f;
+}
+
+
+// @@ Paul Heckbert calls this cubic instead of spline.
+// Piecewise cubic, symmetric around zero:
+//   (4 - 6 x^2 + 3 |x|^3) / 6   for |x| < 1
+//   (2 - |x|)^3 / 6             for |x| < 2
+// support = 2.0
+inline static float filter_spline(float x)
+{
+	const float ax = (x < 0.0f) ? -x : x;
+	if( ax < 1.0f ) {
+		return (4.0f + ax * ax * (-6.0f + ax * 3.0f)) / 6.0f;
+	}
+	if( ax < 2.0f ) {
+		const float r = 2.0f - ax;
+		return r * r * r / 6.0f;
+	}
+	return 0.0f;
+}
+
+/// Unnormalized sinc function: sin(x) / x.
+/// Returns 1 when |x| is below NV_EPSILON to avoid the 0/0 division.
+inline float sincf( const float x )
+{
+	const bool nearZero = fabs(x) < NV_EPSILON;
+	// A series expansion could be used near zero instead:
+	//   1.0f + x*x*(-1.0f/6.0f + x*x*1.0f/120.0f)
+	return nearZero ? 1.0 : sin(x) / x;
+}
+
+// Lanczos filter with three lobes:
+//   L(x) = sinc(pi x) * sinc(pi x / 3)   for |x| < 3, 0 outside.
+// support = 3.0
+inline static float filter_lanczos3(float x)
+{
+	if( x < 0.0f ) x = -x;
+	// sincf() is the unnormalized sin(x)/x, so the argument has to be scaled
+	// by PI for the zero crossings to fall on the integers (the same scaling
+	// Kernel1::initSinc applies).
+	if( x < 3.0f ) return sincf(PI * x) * sincf(PI * x / 3.0f);
+	return 0.0f;
+}
+
+
+
+// Mitchell & Netravali's two-param cubic
+// see "Reconstruction Filters in Computer Graphics", SIGGRAPH 88
+// The inner piece (|x| < 1) uses the p coefficients, the outer piece
+// (1 <= |x| < 2) the q coefficients; the result is 0 beyond that.
+// support = 2.0
+inline static float filter_mitchell(float x, float b, float c)
+{
+	// @@ Coefficients could be precomputed.
+	// @@ if b and c are fixed, these are constants.
+	const float ax = (x < 0.0f) ? -x : x;
+
+	if( ax < 1.0f ) {
+		const float p0 = (6.0f -  2.0f * b) / 6.0f;
+		const float p2 = (-18.0f + 12.0f * b + 6.0f * c) / 6.0f;
+		const float p3 = (12.0f - 9.0f * b - 6.0f * c) / 6.0f;
+		return p0 + ax * ax * (p2 + ax * p3);
+	}
+
+	if( ax < 2.0f ) {
+		const float q0 = (8.0f * b + 24.0f * c) / 6.0f;
+		const float q1 = (-12.0f * b - 48.0f * c) / 6.0f;
+		const float q2 = (6.0f * b + 30.0f * c) / 6.0f;
+		const float q3 = (-b - 6.0f * c) / 6.0f;
+		return q0 + ax * (q1 + ax * (q2 + ax * q3));
+	}
+
+	return 0.0f;
+}
+
+// Mitchell filter with the B = C = 1/3 parameters.
+inline static float filter_mitchell(float x)
+{
+	const float third = 1.0f/3.0f;
+	return filter_mitchell(x, third, third);
+}
+
+// Bessel function of the first kind from Jon Blow's article.
+// http://mathworld.wolfram.com/BesselFunctionoftheFirstKind.html
+// http://en.wikipedia.org/wiki/Bessel_function
+//
+// Sums the series  sum_k ((x/2)^k / k!)^2  until the newest addend drops
+// below EPSILON_RATIO times the running sum.
+static float bessel0(float x)
+{
+	const float EPSILON_RATIO = 1E-6;
+	const float halfX = 0.5 * x;
+
+	float total = 1.0;
+	float term = 1.0;	// (x/2)^k / k!
+	float addend = 1.0;	// term squared
+
+	for (int k = 1; addend > total * EPSILON_RATIO; ++k) {
+		term = term * (halfX / k);
+		addend = term * term;
+		total = total + addend;
+	}
+
+	return total;
+}
+
+// Alternative bessel function from Paul Heckbert.
+// Accumulates the terms (x^2/4)^k / (k!)^2 until a term drops below the
+// absolute threshold EPSILON_RATIO.
+static float _bessel0(float x)
+{
+	const float EPSILON_RATIO = 1E-6;
+	const float y = x * x / 4.0f;
+
+	float sum = 1.0f;
+	float t = y;
+	int i = 2;
+	while (t > EPSILON_RATIO) {
+		sum += t;
+		t *= y / float(i * i);
+		i++;
+	}
+	return sum;
+}
+
+// Kaiser window:
+//   bessel0(alpha * sqrt(1 - x^2)) / bessel0(alpha)   for |x| <= 1, 0 outside.
+// support = 1.0
+inline static float filter_kaiser(float x, float alpha)
+{
+	const float t = 1.0f - x * x;
+	// Outside the support the square root of a negative number would turn
+	// the weight into NaN; the window is zero there.
+	if( t < 0.0f ) return 0.0f;
+	return bessel0(alpha * sqrtf(t)) / bessel0(alpha);
+}
+
+// Kaiser window with alpha = 4.
+inline static float filter_kaiser(float x)
+{
+	return filter_kaiser(x, 4.0f);
+}
+
+
+// Array of filters.
+// Indexed by Filter::Enum in Kernel1::initFilter, so the entries must stay
+// in the same order as the enumerators. Each entry pairs the filter
+// function with the support used to map kernel samples onto the function.
+// NOTE(review): filter_mitchell is documented above as "support = 2.0" but
+// is registered here with 1.0 -- confirm which one is intended.
+static Filter s_filter_array[] = {
+	{filter_box, 		0.5f},	// Box
+	{filter_triangle, 	1.0f},	// Triangle
+	{filter_quadratic, 	1.5f},	// Quadratic
+	{filter_cubic, 		1.0f},	// Cubic
+	{filter_spline,		2.0f},	// Spline
+	{filter_lanczos3,	3.0f},	// Lanczos
+	{filter_mitchell,	1.0f},	// Mitchell
+	{filter_kaiser,		1.0f},	// Kaiser
+};
+
+} // namespace
+
+
+
+/// Ctor. Allocates room for @a width weights; their values stay
+/// uninitialized until one of the init methods fills them in.
+Kernel1::Kernel1(uint width) : w(width), data(new float[width])
+{
+}
+
+/// Copy ctor. Makes a deep copy of the weights of @a k.
+Kernel1::Kernel1(const Kernel1 & k) : w(k.w), data(new float[k.w])
+{
+	const float * src = k.data;
+	float * dst = data;
+	for(uint remaining = w; remaining != 0; remaining--) {
+		*dst++ = *src++;
+	}
+}
+
+/// Dtor. Releases the weight array.
+Kernel1::~Kernel1()
+{
+	// The array is allocated with new[] in the constructors, so it has to
+	// be released with the array form of delete.
+	delete [] data;
+}
+
+/// Normalize the filter: scale the weights so that their (signed) sum is 1.
+void Kernel1::normalize()
+{
+	float total = 0.0f;
+	for(const float * p = data; p != data + w; ++p) {
+		total += *p;
+	}
+	
+	const float inv = 1.0f / total;
+	for(float * p = data; p != data + w; ++p) {
+		*p *= inv;
+	}
+}
+
+
+/// Init the kernel from one of the standard filters.
+///
+/// The kernel width must be even. Sample i is taken at the center of its
+/// cell, and the cell centers are mapped onto the support registered for
+/// the filter in s_filter_array. The result is normalized.
+void Kernel1::initFilter(Filter::Enum f)
+{
+	nvCheck((w & 1) == 0);
+	nvCheck(f < Filter::Num);
+	
+	const Filter & entry = s_filter_array[f];
+	float (* const evaluate)(float) = entry.function;
+	const float support = entry.support;
+	
+	const float half_width = float(w / 2);
+	
+	for(uint i = 0; i < w; i++) {
+		// Offset by half a sample so that the samples are symmetric around zero.
+		const float x = (i + (-half_width)) + 0.5f;
+		data[i] = evaluate(x * support / half_width);
+	}
+	
+	normalize();
+}
+
+
+/// Init 1D sinc filter.
+///
+/// The kernel width must be even. Weight i is sincf(PI * x * stretch), with
+/// x the center of sample i measured from the middle of the kernel. The
+/// result is normalized.
+void Kernel1::initSinc(float stretch /*= 1*/)
+{
+	nvCheck((w & 1) == 0);
+	
+	const float half_width = float(w / 2);
+	
+	uint i = 0;
+	while (i < w) {
+		const float x = (i + (-half_width)) + 0.5f;
+		data[i] = sincf(PI * x * stretch);
+		i++;
+	}
+
+	normalize();
+}
+
+
+/// Init 1D windowed Kaiser filter.
+///
+/// The kernel width must be even. Every weight is the sinc value at the
+/// sample center multiplied by the Kaiser window evaluated at the sample
+/// position rescaled to the [-1, 1] range. The result is normalized.
+void Kernel1::initKaiser(float alpha, float stretch /*= 1*/)
+{
+	nvCheck((w & 1) == 0);
+	
+	const float half_width = float(w / 2);
+	
+	for(uint i = 0; i < w; i++) {
+		const float x = (i + (-half_width)) + 0.5f;
+		const float sinc_value = sincf(PI * x * stretch);
+		const float window_value = filter_kaiser(x / half_width, alpha);
+		
+		// @@ sinc windowed by kaiser
+		data[i] = sinc_value * window_value;
+	}
+
+	normalize();
+}
+
+
+/// Init 1D Mitchell filter.
+///
+/// The kernel width must be even. The sample centers are rescaled to the
+/// [-1, 1] range before evaluating filter_mitchell with the given b and c.
+/// The result is normalized.
+void Kernel1::initMitchell(float b, float c)
+{
+	nvCheck((w & 1) == 0);
+	
+	const float half_width = float(w / 2);
+	
+	uint i = 0;
+	while (i < w) {
+		const float x = (i + (-half_width)) + 0.5f;
+		data[i] = filter_mitchell(x / half_width, b, c);
+		i++;
+	}
+	
+	normalize();
+}
+
+
+/// Print the kernel for debugging purposes, one "index: weight" line per entry.
+void Kernel1::debugPrint()
+{
+	uint i = 0;
+	while (i < w) {
+		nvDebug("%d: %f\n", i, data[i]);
+		i++;
+	}
+}
+
+
+
+/// Ctor. Allocates a width x width array of weights; their values stay
+/// uninitialized until one of the init methods fills them in.
+Kernel2::Kernel2(uint width) : w(width), data(new float[width*width])
+{
+}
+
+/// Copy ctor. Makes a deep copy of the weights of @a k.
+Kernel2::Kernel2(const Kernel2 & k) : w(k.w), data(new float[k.w*k.w])
+{
+	const float * src = k.data;
+	float * dst = data;
+	for(uint remaining = w*w; remaining != 0; remaining--) {
+		*dst++ = *src++;
+	}
+}
+
+
+/// Dtor. Releases the weight array.
+Kernel2::~Kernel2()
+{
+	// The array is allocated with new[] in the constructors, so it has to
+	// be released with the array form of delete.
+	delete [] data;
+}
+
+/// Normalize the filter: divide every weight by the sum of the absolute
+/// values of all weights.
+void Kernel2::normalize()
+{
+	const uint count = w*w;
+
+	float total = 0.0f;
+	for(const float * p = data; p != data + count; ++p) {
+		total += fabs(*p);
+	}
+	
+	const float inv = 1.0f / total;
+	for(float * p = data; p != data + count; ++p) {
+		*p *= inv;
+	}
+}
+
+/// Transpose the kernel in place by swapping every element above the
+/// diagonal with its mirror below the diagonal.
+void Kernel2::transpose()
+{
+	for(uint row = 0; row < w; row++) {
+		for(uint col = row+1; col < w; col++) {
+			float & upper = data[row*w + col];
+			float & lower = data[col*w + row];
+			swap(upper, lower);
+		}
+	}
+}
+
+/// Init laplacian filter, usually used for sharpening.
+///
+/// Requires a 3x3 kernel. The 4-neighbour variant is used; two alternatives
+/// that were considered are:
+///   -1 -1 -1        +1 -2 +1
+///   -1 +8 -1   and  -2 +4 -2
+///   -1 -1 -1        +1 -2 +1
+void Kernel2::initLaplacian()
+{
+	nvDebugCheck(w == 3);
+
+	const float elements[3*3] = {
+		+0, -1, +0,
+		-1, +4, -1,
+		+0, -1, +0
+	};
+
+	for (int i = 0; i < 3*3; i++) {
+		data[i] = elements[i];
+	}
+}
+
+
+/// Init simple edge detection filter: the horizontal central difference
+/// (-1 0 1) in the middle row of a 3x3 kernel.
+void Kernel2::initEdgeDetection()
+{
+	nvCheck(w == 3);
+
+	const float elements[3*3] = {
+		 0, 0, 0,
+		-1, 0, 1,
+		 0, 0, 0
+	};
+
+	for (int i = 0; i < 3*3; i++) {
+		data[i] = elements[i];
+	}
+}
+
+/// Init sobel filter.
+///
+/// Tables exist for kernels of width 3, 5, 7 and 9. For any other width
+/// the kernel contents are left untouched.
+void Kernel2::initSobel()
+{
+	static const float sobel3[3*3] = {
+		-1, 0, 1,
+		-2, 0, 2,
+		-1, 0, 1
+	};
+	static const float sobel5[5*5] = {
+		-1, -2, 0, 2, 1,
+		-2, -3, 0, 3, 2,
+		-3, -4, 0, 4, 3,
+		-2, -3, 0, 3, 2,
+		-1, -2, 0, 2, 1
+	};
+	static const float sobel7[7*7] = {
+		-1, -2, -3, 0, 3, 2, 1,
+		-2, -3, -4, 0, 4, 3, 2,
+		-3, -4, -5, 0, 5, 4, 3,
+		-4, -5, -6, 0, 6, 5, 4,
+		-3, -4, -5, 0, 5, 4, 3,
+		-2, -3, -4, 0, 4, 3, 2,
+		-1, -2, -3, 0, 3, 2, 1
+	};
+	static const float sobel9[9*9] = {
+		-1, -2, -3, -4, 0, 4, 3, 2, 1,
+		-2, -3, -4, -5, 0, 5, 4, 3, 2,
+		-3, -4, -5, -6, 0, 6, 5, 4, 3,
+		-4, -5, -6, -7, 0, 7, 6, 5, 4,
+		-5, -6, -7, -8, 0, 8, 7, 6, 5,
+		-4, -5, -6, -7, 0, 7, 6, 5, 4,
+		-3, -4, -5, -6, 0, 6, 5, 4, 3,
+		-2, -3, -4, -5, 0, 5, 4, 3, 2,
+		-1, -2, -3, -4, 0, 4, 3, 2, 1
+	};
+
+	const float * table = 0;
+	switch (w) {
+		case 3: table = sobel3; break;
+		case 5: table = sobel5; break;
+		case 7: table = sobel7; break;
+		case 9: table = sobel9; break;
+		default: return;	// No table for this width.
+	}
+
+	const uint count = w * w;
+	for (uint i = 0; i < count; i++) {
+		data[i] = table[i];
+	}
+}
+
+/// Init prewitt filter (horizontal derivative).
+///
+/// Tables exist for kernels of width 3 and 5. For any other width the
+/// kernel contents are left untouched.
+void Kernel2::initPrewitt()
+{
+	if (w == 3)
+	{
+		// The right column is positive: a derivative kernel is antisymmetric,
+		// like the 5x5 table below and the sobel tables.
+		data[0] = -1; data[1] = 0; data[2] = 1;
+		data[3] = -1; data[4] = 0; data[5] = 1;
+		data[6] = -1; data[7] = 0; data[8] = 1;
+	}
+	else if (w == 5)
+	{
+		// @@ Is this correct?
+		float elements[] = {
+            -2, -1, 0, 1, 2,
+            -2, -1, 0, 1, 2,
+            -2, -1, 0, 1, 2,
+            -2, -1, 0, 1, 2,
+            -2, -1, 0, 1, 2
+		};
+
+		for (int i = 0; i < 5*5; i++) {
+			data[i] = elements[i];
+		}
+	}
+}
+
+/// Init blended sobel filter.
+///
+/// Requires a 9x9 kernel. The result is the weighted sum of the 9x9, 7x7,
+/// 5x5 and 3x3 sobel kernels, each one centered in the 9x9 grid, with the
+/// weights scale.w(), scale.z(), scale.y() and scale.x() respectively.
+void Kernel2::initBlendedSobel(const Vector4 & scale)
+{
+	nvCheck(w == 9);
+
+	{
+		float elements[] = {
+            -1, -2, -3, -4, 0, 4, 3, 2, 1,
+            -2, -3, -4, -5, 0, 5, 4, 3, 2,
+            -3, -4, -5, -6, 0, 6, 5, 4, 3,
+            -4, -5, -6, -7, 0, 7, 6, 5, 4,
+            -5, -6, -7, -8, 0, 8, 7, 6, 5,
+            -4, -5, -6, -7, 0, 7, 6, 5, 4,
+            -3, -4, -5, -6, 0, 6, 5, 4, 3,
+            -2, -3, -4, -5, 0, 5, 4, 3, 2,
+            -1, -2, -3, -4, 0, 4, 3, 2, 1
+		};
+		
+		for (int i = 0; i < 9*9; i++) {
+			data[i] = elements[i] * scale.w();
+		}
+	}
+	{
+		float elements[] = {
+            -1, -2, -3, 0, 3, 2, 1,
+            -2, -3, -4, 0, 4, 3, 2,
+            -3, -4, -5, 0, 5, 4, 3,
+            -4, -5, -6, 0, 6, 5, 4,
+            -3, -4, -5, 0, 5, 4, 3,
+            -2, -3, -4, 0, 4, 3, 2,
+            -1, -2, -3, 0, 3, 2, 1,
+		};
+
+		// A 7x7 kernel is centered with a border of one row and one column.
+		for (int i = 0; i < 7; i++) {
+			for (int e = 0; e < 7; e++) {
+				data[(i + 1) * 9 + e + 1] += elements[i * 7 + e] * scale.z();
+			}
+		}
+	}
+	{
+		float elements[] = {
+            -1, -2, 0, 2, 1,
+            -2, -3, 0, 3, 2,
+            -3, -4, 0, 4, 3,
+            -2, -3, 0, 3, 2,
+            -1, -2, 0, 2, 1
+		};
+
+		// A 5x5 kernel is centered with a border of two rows and two columns.
+		for (int i = 0; i < 5; i++) {
+			for (int e = 0; e < 5; e++) {
+				data[(i + 2) * 9 + e + 2] += elements[i * 5 + e] * scale.y();
+			}
+		}
+	}
+	{
+		float elements[] = {
+            -1, 0, 1,
+            -2, 0, 2,
+            -1, 0, 1,
+		};
+
+		// A 3x3 kernel is centered with a border of three rows and three columns.
+		for (int i = 0; i < 3; i++) {
+			for (int e = 0; e < 3; e++) {
+				data[(i + 3) * 9 + e + 3] += elements[i * 3 + e] * scale.x();
+			}
+		}
+	}
+}
+
+
+/*PI_DECLARE_TEST(BesselTest) {
+
+	for(int i = 0; i < 8; i++) {
+		nvDebug("bessel0(%i) %f =? %f\n", i, bessel0(i), _bessel0(i));
+		PI_TEST(equalf(bessel0(i), _bessel0(i)));
+	}
+
+	return PiTestUnit::Succeed;
+}*/
+
--- a/src/nvimage/Filter.h
+++ b/src/nvimage/Filter.h
@ -0,0 +1,103 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_FILTER_H
+#define NV_IMAGE_FILTER_H
+
+#include <nvimage/nvimage.h>
+
+namespace nv
+{
+	class Vector4;
+
+	/// A filter function.
+	///
+	/// Pairs a one-dimensional filter function with the support used when
+	/// sampling it. Filter.cpp keeps one entry per enumerator in a table
+	/// that Kernel1::initFilter indexes with Filter::Enum.
+	struct Filter 
+	{
+		// Standard filters.
+		// Num is the number of filters, not a filter itself.
+		enum Enum 
+		{
+			Box,
+			Triangle,
+			Quadratic,	// Bell
+			Cubic,
+			Spline,
+			Lanczos,
+			Mitchell,
+			Kaiser,
+			Num
+		};
+
+		float (*function)(float x);	// Evaluates the filter at x.
+		float support;				// Kernel1::initFilter samples the function over [-support, support].
+	};
+
+
+	/// A 1D kernel. Used to precompute filter weights.
+	///
+	/// Owns an array of width() weights. The width is fixed at construction
+	/// (w is const). The weights are only defined after calling one of the
+	/// init methods, all of which require an even width and normalize the
+	/// result.
+	class Kernel1
+	{
+	public:
+		NVIMAGE_API Kernel1(uint width);
+		NVIMAGE_API Kernel1(const Kernel1 & k);
+		NVIMAGE_API ~Kernel1();
+		
+		/// Scale the weights so that they add up to 1.
+		NVIMAGE_API void normalize();
+		
+		/// Weight at index x. No bounds checking is done.
+		float valueAt(uint x) const {
+			return data[x];
+		}
+		
+		/// Number of weights.
+		uint width() const {
+			return w;
+		}
+		
+		NVIMAGE_API void initFilter(Filter::Enum filter);
+		NVIMAGE_API void initSinc(float stretch = 1);
+		NVIMAGE_API void initKaiser(float alpha = 4.0f, float stretch = 1.0f);
+		NVIMAGE_API void initMitchell(float b = 1.0f/3.0f, float c = 1.0f/3.0f);
+		
+		NVIMAGE_API void debugPrint();
+		
+	private:
+		const uint w;	// Number of weights.
+		float * data;	// Owned array of w weights.
+	};
+
+
+	/// A 2D kernel.
+	///
+	/// Owns a square, row-major array of width() * width() weights. The
+	/// width is fixed at construction (w is const). The weights are only
+	/// defined after calling one of the init methods, and each init method
+	/// supports specific widths only.
+	class Kernel2 
+	{
+	public:
+		NVIMAGE_API Kernel2(uint width);
+		NVIMAGE_API Kernel2(const Kernel2 & k);
+		NVIMAGE_API ~Kernel2();
+		
+		/// Divide the weights by the sum of their absolute values.
+		NVIMAGE_API void normalize();
+		NVIMAGE_API void transpose();
+		
+		/// Weight at column x of row y. No bounds checking is done.
+		float valueAt(uint x, uint y) const {
+			return data[y * w + x];
+		}
+		
+		/// Number of rows, which equals the number of columns.
+		uint width() const {
+			return w;
+		}
+		
+		NVIMAGE_API void initLaplacian();
+		NVIMAGE_API void initEdgeDetection();
+		NVIMAGE_API void initSobel();
+		NVIMAGE_API void initPrewitt();
+
+		NVIMAGE_API void initBlendedSobel(const Vector4 & scale);
+
+	private:
+		const uint w;	// Side length of the kernel.
+		float * data;	// Owned array of w * w weights, row-major.
+	};
+
+
+	// @@ Implement non linear filters:
+	// Kuwahara filter
+	// Median filter
+
+} // nv namespace
+
+#endif // NV_IMAGE_FILTER_H
--- a/src/nvimage/FloatImage.cpp
+++ b/src/nvimage/FloatImage.cpp
@ -0,0 +1,839 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/Containers.h>
+#include <nvcore/Ptr.h>
+
+#include <nvmath/Color.h>
+
+#include "FloatImage.h"
+#include "Filter.h"
+#include "Image.h"
+
+#include <math.h>
+
+using namespace nv;
+
+namespace 
+{
+	// Converts to int by truncating toward zero (this is not round-to-nearest,
+	// despite the name).
+	static int round(float f)
+	{
+		const int truncated = static_cast<int>(f);
+		return truncated;
+	}
+
+	// Fractional part in the floor sense, so the result is not negative
+	// for negative inputs.
+	static float frac(float f)
+	{
+		return f - floor(f);
+	}
+}
+
+
+/// Ctor. Creates an empty image: zero extents and no memory allocated.
+FloatImage::FloatImage() :
+	m_width(0),
+	m_height(0),
+	m_componentNum(0),
+	m_count(0),
+	m_mem(NULL)
+{
+}
+
+/// Ctor. Init from image.
+/// Starts out empty and then converts @a img with initFrom.
+FloatImage::FloatImage(const Image * img) :
+	m_width(0),
+	m_height(0),
+	m_componentNum(0),
+	m_count(0),
+	m_mem(NULL)
+{
+	this->initFrom(img);
+}
+
+/// Dtor. Releases the pixel memory through free().
+FloatImage::~FloatImage()
+{
+	this->free();
+}
+
+
+/// Init the floating point image from a regular image.
+///
+/// Allocates four channels with the extents of @a img and stores every
+/// 8 bit component divided by 255. The image must not have memory
+/// allocated already (allocate checks that).
+void FloatImage::initFrom(const Image * img)
+{
+	nvCheck(img != NULL);
+	
+	allocate(4, img->width(), img->height());
+	
+	float * const r = channel(0);
+	float * const g = channel(1);
+	float * const b = channel(2);
+	float * const a = channel(3);
+	
+	const uint pixelCount = m_width * m_height;
+	uint i = 0;
+	while (i < pixelCount) {
+		const Color32 texel = img->pixel(i);
+		r[i] = float(texel.r) / 255.0f;
+		g[i] = float(texel.g) / 255.0f;
+		b[i] = float(texel.b) / 255.0f;
+		a[i] = float(texel.a) / 255.0f;
+		i++;
+	}
+}
+
+/// Convert the floating point image to a regular image.
+///
+/// Copies @a num channels (at most 4) starting at @a base_component into
+/// the first components of the output color, scaled by 255 and clamped to
+/// [0, 255]. Components that are not copied are set to 0, except for the
+/// fourth one, which is set to 0xff.
+/// The caller owns the returned image.
+Image * FloatImage::createImage(uint base_component/*= 0*/, uint num/*= 4*/) const
+{
+	nvCheck(num <= 4);
+	nvCheck(base_component + num <= m_componentNum);
+	
+	AutoPtr<Image> img(new Image());
+	img->allocate(m_width, m_height);
+	
+	const uint size = m_width * m_height;
+	for(uint i = 0; i < size; i++) {
+		
+		// Start from 0xff000000 and overwrite the components that are copied.
+		uint8 rgba[4] = { 0, 0, 0, 0xff };
+
+		for(uint c = 0; c < num; c++) {
+			const float f = m_mem[size * (base_component + c) + i];
+			rgba[c] = nv::clamp(int(255.0f * f), 0, 255);
+		}
+		
+		img->pixel(i) = Color32(rgba[0], rgba[1], rgba[2], rgba[3]);
+	}
+	
+	return img.release();
+}
+
+
+/// Convert the floating point image to a regular image. Correct gamma of rgb, but not alpha.
+///
+/// Requires exactly four channels. The first three are raised to 1/gamma
+/// before the conversion to 8 bits; the fourth is converted directly.
+/// The caller owns the returned image.
+Image * FloatImage::createImageGammaCorrect(float gamma/*= 2.2f*/) const
+{
+	nvCheck(m_componentNum == 4);
+	
+	AutoPtr<Image> img(new Image());
+	img->allocate(m_width, m_height);
+	
+	const float * rChannel = this->channel(0);
+	const float * gChannel = this->channel(1);
+	const float * bChannel = this->channel(2);
+	const float * aChannel = this->channel(3);
+
+	const float invGamma = 1.0f/gamma;
+
+	const uint size = m_width * m_height;
+	uint i = 0;
+	while (i < size)
+	{
+		const uint8 r = nv::clamp(int(255.0f * pow(rChannel[i], invGamma)), 0, 255);
+		const uint8 g = nv::clamp(int(255.0f * pow(gChannel[i], invGamma)), 0, 255);
+		const uint8 b = nv::clamp(int(255.0f * pow(bChannel[i], invGamma)), 0, 255);
+		const uint8 a = nv::clamp(int(255.0f * aChannel[i]), 0, 255);
+
+		img->pixel(i) = Color32(r, g, b, a);
+		i++;
+	}
+	
+	return img.release();
+}
+
+/// Allocate a 2d float image of the given format and the given extents.
+///
+/// @param c number of components (channels).
+/// @param w width in pixels.
+/// @param h height in pixels.
+/// The image must not have memory allocated already. The new memory is
+/// not initialized.
+void FloatImage::allocate(uint c, uint w, uint h)
+{
+	nvCheck(m_mem == NULL);
+
+	m_componentNum = c;
+	m_width = w;
+	m_height = h;
+	m_count = w * h * c;
+
+	void * block = nv::mem::malloc(m_count * sizeof(float));
+	m_mem = reinterpret_cast<float *>(block);
+}
+
+/// Free the image, but don't clear the members.
+///
+/// Only the memory pointer is reset; extents and component count keep
+/// their values. Calling it on an image without memory is a no-op.
+void FloatImage::free()
+{
+	// The destructor calls free() unconditionally, so an image that was
+	// default constructed and never allocated has to be accepted here.
+	if (m_mem == NULL) {
+		return;
+	}
+	nv::mem::free( reinterpret_cast<void *>(m_mem) );
+	m_mem = NULL;
+}
+
+/// Set every element of every channel to @a f.
+void FloatImage::clear(float f/*=0.0f*/)
+{
+	float * const end = m_mem + m_count;
+	for(float * p = m_mem; p != end; ++p) {
+		*p = f;
+	}
+}
+
+/// Normalize the vectors stored in three consecutive channels, starting at
+/// @a base_component. normalizeSafe is given the zero vector as fallback.
+void FloatImage::normalize(uint base_component)
+{
+	nvCheck(base_component + 3 <= m_componentNum);
+	
+	float * const xs = this->channel(base_component + 0);
+	float * const ys = this->channel(base_component + 1);
+	float * const zs = this->channel(base_component + 2);
+
+	const uint size = m_width * m_height;
+	uint i = 0;
+	while (i < size) {
+		Vector3 normal(xs[i], ys[i], zs[i]);
+		normal = normalizeSafe(normal, Vector3(zero));
+		
+		xs[i] = normal.x();
+		ys[i] = normal.y();
+		zs[i] = normal.z();
+		i++;
+	}
+}
+
+/// Pack three channels starting at @a base_component from the [-1, 1] range
+/// into the [0, 1] range: v' = 0.5 * (v + 1).
+void FloatImage::packNormals(uint base_component)
+{
+	const float scale = 0.5f;
+	const float bias = 1.0f;
+	scaleBias(base_component, 3, scale, bias);
+}
+
+/// Expand three channels starting at @a base_component from the [0, 1]
+/// range into the [-1, 1] range: v' = 2 * (v - 0.5). Inverse of packNormals.
+void FloatImage::expandNormals(uint base_component)
+{
+	// scaleBias adds the bias before scaling, so the bias has to be
+	// negative to obtain 2 * v - 1.
+	scaleBias(base_component, 3, 2.0f, -0.5f);
+}
+
+/// Apply v' = scale * (v + bias) to @a num channels starting at
+/// @a base_component. Note that the bias is added before scaling.
+void FloatImage::scaleBias(uint base_component, uint num, float scale, float bias)
+{
+	const uint size = m_width * m_height;
+	
+	for(uint c = 0; c < num; c++) {
+		float * p = this->channel(base_component + c);
+		float * const end = p + size;
+		
+		for(; p != end; ++p) {
+			*p = scale * (*p + bias);
+		}
+	}
+}
+
+/// Clamp the elements of the image, in all channels, to [low, high].
+void FloatImage::clamp(float low, float high)
+{
+	float * const end = m_mem + m_count;
+	for(float * p = m_mem; p != end; ++p) {
+		*p = nv::clamp(*p, low, high);
+	}
+}
+
+/// From gamma to linear space: raises @a num channels starting at
+/// @a base_component to the power of gamma.
+void FloatImage::toLinear(uint base_component, uint num, float gamma /*= 2.2f*/)
+{
+	const float power = gamma;
+	exponentiate(base_component, num, power);
+}
+
+/// From linear to gamma space: raises @a num channels starting at
+/// @a base_component to the power of 1/gamma.
+void FloatImage::toGamma(uint base_component, uint num, float gamma /*= 2.2f*/)
+{
+	const float power = 1.0f/gamma;
+	exponentiate(base_component, num, power);
+}
+
+/// Exponentiate the elements of the image: raises every element of @a num
+/// channels starting at @a base_component to @a power.
+void FloatImage::exponentiate(uint base_component, uint num, float power)
+{
+	const uint size = m_width * m_height;
+
+	for(uint c = 0; c < num; c++) {
+		float * p = this->channel(base_component + c);
+		float * const end = p + size;
+		
+		for(; p != end; ++p) {
+			*p = pow(*p, power);
+		}
+	}
+}
+
+#if 0
+float FloatImage::nearest(float x, float y, int c, WrapMode wm) const
+{
+	if( wm == WrapMode_Clamp ) return nearest_clamp(x, y, c);
+	/*if( wm == WrapMode_Repeat )*/ return nearest_repeat(x, y, c);
+	//if( wm == WrapMode_Mirror ) return nearest_mirror(x, y, c);
+}
+
+float FloatImage::nearest_clamp(int x, int y, const int c) const
+{
+	const int w = m_width;
+	const int h = m_height;
+	int ix = ::clamp(x, 0, w-1);
+	int iy = ::clamp(y, 0, h-1);
+	return pixel(ix, iy, c);
+}
+
+float FloatImage::nearest_repeat(int x, int y, const int c) const
+{
+	const int w = m_width;
+	const int h = m_height;
+	int ix = x % w;
+	int iy = y % h;
+	return pixel(ix, iy, c);
+}
+#endif
+
+/// Nearest sample of channel @a c at the normalized coordinates (x, y).
+/// WrapMode_Clamp clamps the coordinates; every other mode is handled as
+/// repeat (mirror is not implemented yet).
+float FloatImage::nearest(float x, float y, int c, WrapMode wm) const
+{
+	return (wm == WrapMode_Clamp) ? nearest_clamp(x, y, c) : nearest_repeat(x, y, c);
+}
+
+/// Bilinear sample of channel @a c at the normalized coordinates (x, y).
+/// WrapMode_Clamp clamps the coordinates; every other mode is handled as
+/// repeat (mirror is not implemented yet).
+float FloatImage::linear(float x, float y, int c, WrapMode wm) const
+{
+	return (wm == WrapMode_Clamp) ? linear_clamp(x, y, c) : linear_repeat(x, y, c);
+}
+
+/// Nearest sample with clamping: the normalized coordinates are scaled by
+/// the image extents, truncated and clamped to the valid pixel range.
+float FloatImage::nearest_clamp(float x, float y, const int c) const
+{
+	const int w = m_width;
+	const int h = m_height;
+	const int px = round(x * w);
+	const int py = round(y * h);
+	return pixel(::clamp(px, 0, w-1), ::clamp(py, 0, h-1), c);
+}
+
+/// Nearest sample with repetition: only the fractional part of the
+/// normalized coordinates is used, scaled by the image extents and truncated.
+float FloatImage::nearest_repeat(float x, float y, const int c) const
+{
+	const int w = m_width;
+	const int h = m_height;
+	const float u = frac(x);
+	const float v = frac(y);
+	return pixel(round(u * w), round(v * h), c);
+}
+
+/// Nearest sample with mirroring. Not implemented yet: ignores its
+/// arguments and always returns 0.
+float FloatImage::nearest_mirror(float x, float y, const int c) const
+{
+	// @@ TBD
+	return 0.0f;
+}
+
+/// Bilinear sample with clamping.
+///
+/// The normalized coordinates are scaled by the image extents. The four
+/// neighbours are the truncated position and the next pixel in each
+/// direction, clamped to the valid range, and they are blended with the
+/// fractional parts of the scaled coordinates.
+float FloatImage::linear_clamp(float x, float y, const int c) const
+{
+	const int w = m_width;
+	const int h = m_height;
+	
+	x *= w;
+	y *= h;
+	
+	const float fracX = frac(x);
+	const float fracY = frac(y);
+	
+	const int baseX = round(x);
+	const int baseY = round(y);
+	
+	const int ix0 = ::clamp(baseX, 0, w-1);
+	const int iy0 = ::clamp(baseY, 0, h-1);
+	const int ix1 = ::clamp(baseX+1, 0, w-1);
+	const int iy1 = ::clamp(baseY+1, 0, h-1);
+
+	// Blend along x in both rows, then along y.
+	const float top = lerp(pixel(ix0, iy0, c), pixel(ix1, iy0, c), fracX);
+	const float bottom = lerp(pixel(ix0, iy1, c), pixel(ix1, iy1, c), fracX);
+
+	return lerp(top, bottom, fracY);
+}
+
+/// Bilinear sample with repetition.
+///
+/// The neighbours are located from the fractional part of the normalized
+/// coordinates; the second neighbour in each direction is one pixel
+/// further and wraps around. They are blended with the fractional parts
+/// of the scaled coordinates.
+float FloatImage::linear_repeat(float x, float y, int c) const
+{
+	const int w = m_width;
+	const int h = m_height;
+	
+	const float fracX = frac(x * w);
+	const float fracY = frac(y * h);
+	
+	const int ix0 = round(frac(x) * w);
+	const int iy0 = round(frac(y) * h);
+	const int ix1 = round(frac(x + 1.0f/w) * w);
+	const int iy1 = round(frac(y + 1.0f/h) * h);
+	
+	// Blend along x in both rows, then along y.
+	const float top = lerp(pixel(ix0, iy0, c), pixel(ix1, iy0, c), fracX);
+	const float bottom = lerp(pixel(ix0, iy1, c), pixel(ix1, iy1, c), fracX);
+
+	return lerp(top, bottom, fracY);
+}
+
+/// Bilinear sample with mirroring. Not implemented yet: ignores its
+/// arguments and always returns 0.
+float FloatImage::linear_mirror(float x, float y, int c) const
+{
+	// @@ TBD
+	return 0.0f;
+}
+
+
+/// Fast downsampling using box filter. 
+///
+/// The extents of the image are divided by two and rounded down.
+///
+/// When the size of the image is odd, this uses a polyphase box filter as explained in:
+/// http://developer.nvidia.com/object/np2_mipmapping.html
+///
+/// For an odd extent of 2n+1 texels, output texel x blends three input
+/// texels with the weights (n - x), n and (1 + x), which add up to 2n+1.
+///
+/// The image must not be 1x1. The caller owns the returned image.
+FloatImage * FloatImage::fastDownSample() const
+{
+	nvDebugCheck(m_width != 1 || m_height != 1);
+	
+	AutoPtr<FloatImage> dst_image( new FloatImage() );
+
+	const uint w = max(1, m_width / 2);
+	const uint h = max(1, m_height / 2);
+	dst_image->allocate(m_componentNum, w, h);
+
+	// 1D box filter.
+	if (m_width == 1 || m_height == 1)
+	{
+		// One of the extents is 1, so every channel is a contiguous run of
+		// m_width * m_height texels that is reduced to n = w * h texels.
+		const uint n = w * h;
+		
+		if ((m_width * m_height) & 1)
+		{
+			// Odd length 2n+1: polyphase weights.
+			const float scale = 1.0f / (2 * n + 1);
+			
+			for(uint c = 0; c < m_componentNum; c++)
+			{
+				const float * src = this->channel(c);
+				float * dst = dst_image->channel(c);
+				
+				for(uint x = 0; x < n; x++)
+				{
+					const float w0 = float(n - x);
+					const float w1 = float(n - 0);
+					const float w2 = float(1 + x);
+					
+					*dst++ = scale * (w0 * src[0] + w1 * src[1] + w2 * src[2]);
+					src += 2;
+				}
+			}
+		}
+		else
+		{
+			// Even length 2n: average pairs of texels.
+			for(uint c = 0; c < m_componentNum; c++)
+			{
+				const float * src = this->channel(c);
+				float * dst = dst_image->channel(c);
+				
+				for(uint x = 0; x < n; x++)
+				{
+					*dst = 0.5f * (src[0] + src[1]);
+					dst++;
+					src += 2;
+				}
+			}
+		}
+	}
+	
+	// Regular box filter.
+	else if ((m_width & 1) == 0 && (m_height & 1) == 0)
+	{
+		for(uint c = 0; c < m_componentNum; c++)
+		{
+			const float * src = this->channel(c);
+			float * dst = dst_image->channel(c);
+			
+			for(uint y = 0; y < h; y++)
+			{
+				for(uint x = 0; x < w; x++)
+				{
+					*dst = 0.25f * (src[0] + src[1] + src[m_width] + src[m_width + 1]);
+					dst++;
+					src += 2;
+				}
+				
+				// The inner loop consumed one row; skip the second row of the pair.
+				src += m_width;
+			}
+		}
+	}
+	
+	// Polyphase filters.
+	else if (m_width & 1 && m_height & 1)
+	{
+		nvDebugCheck(m_width == 2 * w + 1);
+		nvDebugCheck(m_height == 2 * h + 1);
+		
+		const float scale = 1.0f / (m_width * m_height);
+		
+		for(uint c = 0; c < m_componentNum; c++)
+		{
+			const float * src = this->channel(c);
+			float * dst = dst_image->channel(c);
+			
+			for(uint y = 0; y < h; y++)
+			{
+				const float v0 = float(h - y);
+				const float v1 = float(h - 0);
+				const float v2 = float(1 + y);
+				
+				for (uint x = 0; x < w; x++)
+				{
+					const float w0 = float(w - x);
+					const float w1 = float(w - 0);
+					const float w2 = float(1 + x);
+					
+					// Three consecutive rows, starting at column 2 * x. Each row
+					// contributes three texels of its own.
+					const float * row0 = src + 2 * x;
+					const float * row1 = row0 + m_width;
+					const float * row2 = row1 + m_width;
+					
+					float f = 0.0f;
+					f += v0 * (w0 * row0[0] + w1 * row0[1] + w2 * row0[2]);
+					f += v1 * (w0 * row1[0] + w1 * row1[1] + w2 * row1[2]);
+					f += v2 * (w0 * row2[0] + w1 * row2[1] + w2 * row2[2]);
+					
+					*dst = f * scale;
+					dst++;
+				}
+				
+				src += 2 * m_width;
+			}
+		}
+	}
+	else if (m_width & 1)
+	{
+		nvDebugCheck(m_width == 2 * w + 1);
+		const float scale = 1.0f / (2 * m_width);
+		
+		for(uint c = 0; c < m_componentNum; c++)
+		{
+			const float * src = this->channel(c);
+			float * dst = dst_image->channel(c);
+			
+			for(uint y = 0; y < h; y++)
+			{
+				for (uint x = 0; x < w; x++)
+				{
+					const float w0 = float(w - x);
+					const float w1 = float(w - 0);
+					const float w2 = float(1 + x);
+					
+					float f = 0.0f;
+					f += w0 * (src[2 * x + 0] + src[m_width + 2 * x + 0]);
+					f += w1 * (src[2 * x + 1] + src[m_width + 2 * x + 1]);
+					f += w2 * (src[2 * x + 2] + src[m_width + 2 * x + 2]);
+					
+					*dst = f * scale;
+					dst++;
+				}
+				
+				src += 2 * m_width;
+			}
+		}
+	}
+	else if (m_height & 1)
+	{
+		nvDebugCheck(m_height == 2 * h + 1);
+		
+		const float scale = 1.0f / (2 * m_height);
+		
+		for(uint c = 0; c < m_componentNum; c++)
+		{
+			const float * src = this->channel(c);
+			float * dst = dst_image->channel(c);
+			
+			for(uint y = 0; y < h; y++)
+			{
+				const float v0 = float(h - y);
+				const float v1 = float(h - 0);
+				const float v2 = float(1 + y);
+				
+				for (uint x = 0; x < w; x++)
+				{
+					float f = 0.0f;
+					f += v0 * (src[0 * m_width + 2 * x] + src[0 * m_width + 2 * x + 1]);
+					f += v1 * (src[1 * m_width + 2 * x] + src[1 * m_width + 2 * x + 1]);
+					f += v2 * (src[2 * m_width + 2 * x] + src[2 * m_width + 2 * x + 1]);
+					
+					*dst = f * scale;
+					dst++;
+				}
+				
+				src += 2 * m_width;
+			}
+		}
+	}
+	
+	return dst_image.release();
+}
+
+
+/// Downsample applying a 1D kernel separately in each dimension.
+/// The extents are halved, rounding down, but never go below 1.
+/// The caller owns the returned image.
+FloatImage * FloatImage::downSample(const Kernel1 & kernel, WrapMode wm) const
+{
+	const uint halfWidth = max(1, m_width / 2);
+	const uint halfHeight = max(1, m_height / 2);
+	
+	return downSample(kernel, halfWidth, halfHeight, wm);
+}
+
+
+/// Downsample applying a 1D kernel separately in each dimension.
+///
+/// First the kernel is applied horizontally, producing an intermediate
+/// image of w x m_height texels; then the kernel is applied vertically on
+/// that intermediate image to produce the final w x h image.
+///
+/// @param kernel kernel to apply; its width must be even.
+/// @param w width of the result.
+/// @param h height of the result.
+/// @param wm wrap mode used for texels outside of the image.
+/// The caller owns the returned image.
+FloatImage * FloatImage::downSample(const Kernel1 & kernel, uint w, uint h, WrapMode wm) const
+{
+	nvCheck(!(kernel.width() & 1));	// Make sure that kernel m_width is even.
+
+	AutoPtr<FloatImage> tmp_image( new FloatImage() );
+	tmp_image->allocate(m_componentNum, w, m_height);
+	
+	AutoPtr<FloatImage> dst_image( new FloatImage() );	
+	dst_image->allocate(m_componentNum, w, h);
+	
+	const float xscale = float(m_width) / float(w);
+	const float yscale = float(m_height) / float(h);
+	
+	for(uint c = 0; c < m_componentNum; c++) {
+		float * tmp_channel = tmp_image->channel(c);
+		
+		// Horizontal pass: source image -> intermediate image.
+		for(uint y = 0; y < m_height; y++) {
+			for(uint x = 0; x < w; x++) {
+				
+				float sum = this->applyKernelHorizontal(&kernel, uint(x*xscale), y, c, wm);
+				
+				const uint tmp_index = tmp_image->index(x, y);
+				tmp_channel[tmp_index] = sum;
+			}
+		}
+		
+		float * dst_channel = dst_image->channel(c);
+		
+		// Vertical pass: it has to read the horizontally filtered image, not
+		// the source, or the first pass would be discarded. The intermediate
+		// image already has the final width, so x is used without scaling.
+		for(uint y = 0; y < h; y++) {
+			for(uint x = 0; x < w; x++) {
+				
+				float sum = tmp_image->applyKernelVertical(&kernel, x, uint(y*yscale), c, wm);
+				
+				const uint dst_index = dst_image->index(x, y);
+				dst_channel[dst_index] = sum;
+			}
+		}
+	}
+	
+	return dst_image.release();
+}
+
+
+/// Apply 2D kernel at the given coordinates and return result.
+///
+/// The kernel tap at index (width/2 - 1) in each direction is aligned with
+/// (x, y). Texels outside of the image are resolved by index() according
+/// to the wrap mode.
+float FloatImage::applyKernel(const Kernel2 * k, int x, int y, int c, WrapMode wm) const
+{
+	nvDebugCheck(k != NULL);
+	
+	const uint size = k->width();
+	const int shift = int(size / 2) - 1;
+	
+	const float * plane = this->channel(c);
+
+	float acc = 0.0f;
+	for(uint row = 0; row < size; row++)
+	{
+		const int src_y = int(y + row) - shift;
+		
+		for(uint col = 0; col < size; col++)
+		{
+			const int src_x = int(x + col) - shift;
+			const int idx = this->index(src_x, src_y, wm);
+			
+			acc += k->valueAt(col, row) * plane[idx];
+		}
+	}
+	
+	return acc;
+}
+
+
+/// Apply 1D vertical kernel at the given coordinates and return result.
+///
+/// The kernel tap at index (width/2 - 1) is aligned with row y. Texels
+/// outside of the image are resolved by index() according to the wrap mode.
+float FloatImage::applyKernelVertical(const Kernel1 * k, int x, int y, int c, WrapMode wm) const
+{
+	nvDebugCheck(k != NULL);
+	
+	const uint size = k->width();
+	const int shift = int(size / 2) - 1;
+	
+	const float * plane = this->channel(c);
+
+	float acc = 0.0f;
+	uint tap = 0;
+	while (tap < size)
+	{
+		const int src_y = int(y + tap) - shift;
+		const int idx = this->index(x, src_y, wm);
+		
+		acc += k->valueAt(tap) * plane[idx];
+		tap++;
+	}
+	
+	return acc;
+}
+
+/// Apply 1D horizontal kernel at the given coordinates and return result.
+///
+/// The kernel tap at index (width/2 - 1) is aligned with column x. Texels
+/// outside of the image are resolved by index() according to the wrap mode.
+float FloatImage::applyKernelHorizontal(const Kernel1 * k, int x, int y, int c, WrapMode wm) const
+{
+	nvDebugCheck(k != NULL);
+	
+	const uint size = k->width();
+	const int shift = int(size / 2) - 1;
+	
+	const float * plane = this->channel(c);
+
+	float acc = 0.0f;
+	uint tap = 0;
+	while (tap < size)
+	{
+		const int src_x = int(x + tap) - shift;
+		const int idx = this->index(src_x, y, wm);
+		
+		acc += k->valueAt(tap) * plane[idx];
+		tap++;
+	}
+	
+	return acc;
+}
+
+
+
+#if 0
+
+// Bilinear lookup of a three channel image at the coordinates (u, v).
+// Compiled out. NOTE(review): W, H, nchan and the two argument pixel() used
+// here are not members of the FloatImage class declared in FloatImage.h, so
+// this looks like reference code taken from a different image class.
+Vec3d bilinear(double u, double v) const
+{
+	// Scale the coordinates by the image size and reduce them with mod()
+	// (presumably a wrapping modulo, its definition is not visible here).
+	u = mod(u*(W-1),W);
+	v = mod(v*(H-1),H);
+
+	Vec3d v1,v2,v3,v4;
+
+	// The four texels that surround the sample position.
+	int x_small	= (int)floor(u);
+	int x_big	  = x_small + 1;
+	int y_small = (int)floor(v);
+	int y_big   = y_small + 1;
+
+	// Texels that fall off one side of the image are taken from the opposite side.
+	if (x_small < 0)
+		x_small = W-1;
+	else if (x_big >= W)
+		x_big = 0;
+	if (y_small < 0)
+		y_small = H-1;
+	else if (y_big >= H)
+		y_big = 0;
+	
+	// Interpolation weights.
+	double fractional_X = u - x_small;
+	double fractional_Y = v - y_small;
+
+	// Only three channel images are sampled; for any other channel count
+	// v1..v4 keep whatever value the Vec3d default constructor gives them.
+	if (nchan == 3)
+	{
+		v1 = Vec3d(pixel(x_small, y_small)[0], pixel(x_small, y_small)[1], pixel(x_small, y_small)[2]);
+		v2 = Vec3d(pixel(x_big, y_small)[0], pixel(x_big, y_small)[1], pixel(x_big, y_small)[2]);
+		v3 = Vec3d(pixel(x_small, y_big)[0], pixel(x_small, y_big)[1], pixel(x_small, y_big)[2]);
+		v4 = Vec3d(pixel(x_big, y_big)[0], pixel(x_big, y_big)[1], pixel(x_big, y_big)[2]);
+	}
+			
+	// Interpolate along x in both rows, then along y between the rows.
+	Vec3d i1 = lerp(v1, v2, fractional_X);
+	Vec3d i2 = lerp(v3, v4, fractional_X);
+
+	return lerp(i1, i2, fractional_Y);
+}
+
+// Bicubic lookup of a three channel image at the coordinates (u, v), built
+// from four cubic() interpolations along the rows followed by one along the
+// column. Returns Vec3d(0.0) when the image does not have three channels.
+// Compiled out. NOTE(review): W, H, nchan, the two argument pixel(), mod() and
+// cubic() are not declared in the FloatImage class shown in FloatImage.h.
+Vec3d bicubic(double u, double v) const
+{
+	// Scale the coordinates by the image size and reduce them with mod()
+	// (presumably a wrapping modulo, its definition is not visible here).
+	u = mod(u*(W-1),W);
+	v = mod(v*(H-1),H);
+
+	// Columns of the 4x4 footprint: one to the left of the sample, two to the right.
+	int x_small1	= (int)floor(u),
+		x_small2	= x_small1 - 1,
+		x_big1		= x_small1 + 1,				
+		x_big2		= x_small1 + 2;
+
+	// Rows of the 4x4 footprint.
+	int y_small1	= (int)floor(v),
+		y_small2	= y_small1 - 1,
+		y_big1		= y_small1 + 1,
+		y_big2		= y_small1 + 2;
+
+	// Bring the footprint coordinates back into the image.
+	x_small1	= (int)mod(x_small1,W);
+	x_small2	= (int)mod(x_small2,W);
+	x_big1		= (int)mod(x_big1,W);
+	x_big2		= (int)mod(x_big2,W);
+	
+	y_small1	= (int)mod(y_small1,H);
+	y_small2	= (int)mod(y_small2,H);
+	y_big1		= (int)mod(y_big1,H);
+	y_big2		= (int)mod(y_big2,H);
+	
+	// Interpolation weights.
+	double fractional_X = u - x_small1;
+	double fractional_Y = v - y_small1;
+
+	if (nchan == 3)
+	{
+		// the interpolations across the rows
+		Vec3d row1 = cubic(Vec3d(pixel(x_small2, y_small2)[0], pixel(x_small2, y_small2)[1], pixel(x_small2, y_small2)[2]),
+							Vec3d(pixel(x_small1, y_small2)[0], pixel(x_small1, y_small2)[1], pixel(x_small1, y_small2)[2]),
+							Vec3d(pixel(x_big1, y_small2)[0], pixel(x_big1, y_small2)[1], pixel(x_big1, y_small2)[2]),
+							Vec3d(pixel(x_big2, y_small2)[0], pixel(x_big2, y_small2)[1], pixel(x_big2, y_small2)[2]),
+							fractional_X);
+
+		Vec3d row2 = cubic(Vec3d(pixel(x_small2, y_small1)[0], pixel(x_small2, y_small1)[1], pixel(x_small2, y_small1)[2]),
+							Vec3d(pixel(x_small1, y_small1)[0], pixel(x_small1, y_small1)[1], pixel(x_small1, y_small1)[2]),
+							Vec3d(pixel(x_big1, y_small1)[0], pixel(x_big1, y_small1)[1], pixel(x_big1, y_small1)[2]),
+							Vec3d(pixel(x_big2, y_small1)[0], pixel(x_big2, y_small1)[1], pixel(x_big2, y_small1)[2]),
+							fractional_X);
+
+		Vec3d row3 = cubic(Vec3d(pixel(x_small2, y_big1)[0], pixel(x_small2, y_big1)[1], pixel(x_small2, y_big1)[2]),
+							Vec3d(pixel(x_small1, y_big1)[0], pixel(x_small1, y_big1)[1], pixel(x_small1, y_big1)[2]),
+							Vec3d(pixel(x_big1, y_big1)[0], pixel(x_big1, y_big1)[1], pixel(x_big1, y_big1)[2]),
+							Vec3d(pixel(x_big2, y_big1)[0], pixel(x_big2, y_big1)[1], pixel(x_big2, y_big1)[2]),
+							fractional_X);
+
+		Vec3d row4 = cubic(Vec3d(pixel(x_small2, y_big2)[0], pixel(x_small2, y_big2)[1], pixel(x_small2, y_big2)[2]),
+							Vec3d(pixel(x_small1, y_big2)[0], pixel(x_small1, y_big2)[1], pixel(x_small1, y_big2)[2]),
+							Vec3d(pixel(x_big1, y_big2)[0], pixel(x_big1, y_big2)[1], pixel(x_big1, y_big2)[2]),
+							Vec3d(pixel(x_big2, y_big2)[0], pixel(x_big2, y_big2)[1], pixel(x_big2, y_big2)[2]),
+							fractional_X);
+
+		// now interpolate across the interpolated rows (the columns)
+
+		return cubic(row1,row2,row3,row4,fractional_Y);
+	}
+	else
+		return Vec3d(0.0);
+}
+
+// Variant of bicubic() that has the same structure but interpolates with
+// cubic2() instead of cubic() and takes the integer coordinates with floorf().
+// Returns Vec3d(0.0) when the image does not have three channels.
+// Compiled out. NOTE(review): W, H, nchan, the two argument pixel(), mod() and
+// cubic2() are not declared in the FloatImage class shown in FloatImage.h.
+Vec3d bicubic2(double u, double v) const
+{
+	// Scale the coordinates by the image size and reduce them with mod()
+	// (presumably a wrapping modulo, its definition is not visible here).
+	u = mod(u*(W-1),W);
+	v = mod(v*(H-1),H);
+
+	// Columns of the 4x4 footprint: one to the left of the sample, two to the right.
+	int x_small1	= floorf(u),
+		x_small2	= x_small1 - 1,
+		x_big1		= int(x_small1 + 1),
+		x_big2		= int(x_small1 + 2);
+
+	// Rows of the 4x4 footprint.
+	int y_small1	= floorf(v),
+		y_small2	= y_small1 - 1,
+		y_big1		= y_small1 + 1,
+		y_big2		= y_small1 + 2;
+
+	// Bring the footprint coordinates back into the image.
+	x_small1	= (int)mod(x_small1,W);
+	x_small2	= (int)mod(x_small2,W);
+	x_big1		= (int)mod(x_big1,W);
+	x_big2		= (int)mod(x_big2,W);
+	
+	y_small1	= (int)mod(y_small1,H);
+	y_small2	= (int)mod(y_small2,H);
+	y_big1		= (int)mod(y_big1,H);
+	y_big2		= (int)mod(y_big2,H);
+	
+	// Interpolation weights.
+	double fractional_X = u - x_small1;
+	double fractional_Y = v - y_small1;
+
+	if (nchan == 3)
+	{
+		// the interpolations across the rows
+		Vec3d row1 = cubic2(Vec3d(pixel(x_small2, y_small2)[0], pixel(x_small2, y_small2)[1], pixel(x_small2, y_small2)[2]),
+							Vec3d(pixel(x_small1, y_small2)[0], pixel(x_small1, y_small2)[1], pixel(x_small1, y_small2)[2]),
+							Vec3d(pixel(x_big1, y_small2)[0], pixel(x_big1, y_small2)[1], pixel(x_big1, y_small2)[2]),
+							Vec3d(pixel(x_big2, y_small2)[0], pixel(x_big2, y_small2)[1], pixel(x_big2, y_small2)[2]),
+							fractional_X);
+
+		Vec3d row2 = cubic2(Vec3d(pixel(x_small2, y_small1)[0], pixel(x_small2, y_small1)[1], pixel(x_small2, y_small1)[2]),
+							Vec3d(pixel(x_small1, y_small1)[0], pixel(x_small1, y_small1)[1], pixel(x_small1, y_small1)[2]),
+							Vec3d(pixel(x_big1, y_small1)[0], pixel(x_big1, y_small1)[1], pixel(x_big1, y_small1)[2]),
+							Vec3d(pixel(x_big2, y_small1)[0], pixel(x_big2, y_small1)[1], pixel(x_big2, y_small1)[2]),
+							fractional_X);
+
+		Vec3d row3 = cubic2(Vec3d(pixel(x_small2, y_big1)[0], pixel(x_small2, y_big1)[1], pixel(x_small2, y_big1)[2]),
+							Vec3d(pixel(x_small1, y_big1)[0], pixel(x_small1, y_big1)[1], pixel(x_small1, y_big1)[2]),
+							Vec3d(pixel(x_big1, y_big1)[0], pixel(x_big1, y_big1)[1], pixel(x_big1, y_big1)[2]),
+							Vec3d(pixel(x_big2, y_big1)[0], pixel(x_big2, y_big1)[1], pixel(x_big2, y_big1)[2]),
+							fractional_X);
+
+		Vec3d row4 = cubic2(Vec3d(pixel(x_small2, y_big2)[0], pixel(x_small2, y_big2)[1], pixel(x_small2, y_big2)[2]),
+							Vec3d(pixel(x_small1, y_big2)[0], pixel(x_small1, y_big2)[1], pixel(x_small1, y_big2)[2]),
+							Vec3d(pixel(x_big1, y_big2)[0], pixel(x_big1, y_big2)[1], pixel(x_big1, y_big2)[2]),
+							Vec3d(pixel(x_big2, y_big2)[0], pixel(x_big2, y_big2)[1], pixel(x_big2, y_big2)[2]),
+							fractional_X);
+
+		// now interpolate across the interpolated rows (the columns)
+
+		return cubic2(row1,row2,row3,row4,fractional_Y);
+	}
+	else
+		return Vec3d(0.0);
+}
+
+#endif
--- a/src/nvimage/FloatImage.h
+++ b/src/nvimage/FloatImage.h
@ -0,0 +1,241 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_FLOATIMAGE_H
+#define NV_IMAGE_FLOATIMAGE_H
+
+#include <nvcore/Debug.h>
+#include <nvcore/Containers.h> // clamp
+#include <nvimage/nvimage.h>
+
+namespace nv
+{
+class Image;
+class Kernel1;
+class Kernel2;
+
+
+/// Multicomponent floating point image class.
+/// The samples are stored in a single buffer, one component after another:
+/// each component is a plane of width * height floats laid out row by row
+/// (see channel() and scanline()).
+class FloatImage
+{
+public:
+
+	/// How coordinates that fall outside of the image are mapped back into it.
+	enum WrapMode {
+		WrapMode_Clamp,		///< Clamp to the closest valid coordinate (indexClamp).
+		WrapMode_Repeat,	///< Tile the image (indexRepeat).
+		WrapMode_Mirror		///< Reflect at the image borders (indexMirror).
+	};
+	
+	NVIMAGE_API FloatImage();
+	NVIMAGE_API FloatImage(const Image * img);
+	NVIMAGE_API virtual ~FloatImage();
+
+	/** @name Conversion. */
+	//@{
+	NVIMAGE_API void initFrom(const Image * img);
+	NVIMAGE_API Image * createImage(uint base_component = 0, uint num = 4) const;
+	NVIMAGE_API Image * createImageGammaCorrect(float gamma = 2.2f) const;
+	//@}
+
+	/** @name Allocation. */
+	//@{
+	NVIMAGE_API void allocate(uint c, uint w, uint h);
+	NVIMAGE_API void free(); // Does not clear members.
+	//@}
+
+	/** @name Manipulation. */
+	//@{
+	NVIMAGE_API void clear(float f=0.0f);
+
+	//NVIMAGE_API void ComputeMipmaps();
+	//NVIMAGE_API void ComputeNormalMap(const float height_scale = 1.0f);
+	
+	//NVIMAGE_API void Clamp(uint base_component, uint num);
+	//NVIMAGE_API void NormalizeColor(uint base_component);
+	NVIMAGE_API void normalize(uint base_component);
+	
+	NVIMAGE_API void packNormals(uint base_component);
+	NVIMAGE_API void expandNormals(uint base_component);
+	NVIMAGE_API void scaleBias(uint base_component, uint num, float scale, float add);
+	NVIMAGE_API void clamp(float low, float high);
+	
+	NVIMAGE_API void toLinear(uint base_component, uint num, float gamma = 2.2f);
+	NVIMAGE_API void toGamma(uint base_component, uint num, float gamma = 2.2f);
+	NVIMAGE_API void exponentiate(uint base_component, uint num, float power);
+	
+	
+	NVIMAGE_API FloatImage * fastDownSample() const;
+	NVIMAGE_API FloatImage * downSample(const Kernel1 & filter, WrapMode wm) const;
+	NVIMAGE_API FloatImage * downSample(const Kernel1 & filter, uint w, uint h, WrapMode wm) const;
+	//@}
+
+	// Evaluate a kernel at the pixel (x, y) of the component c. Samples outside
+	// of the image are fetched according to the wrap mode.
+	NVIMAGE_API float applyKernel(const Kernel2 * k, int x, int y, int c, WrapMode wm) const;
+	NVIMAGE_API float applyKernelVertical(const Kernel1 * k, int x, int y, int c, WrapMode wm) const;
+	NVIMAGE_API float applyKernelHorizontal(const Kernel1 * k, int x, int y, int c, WrapMode wm) const;
+	
+	uint width() const { return m_width; }
+	uint height() const { return m_height; }
+	uint componentNum() const { return m_componentNum; }
+	uint count() const { return m_count; }
+
+
+	/** @name Pixel access. */
+	//@{
+	// Pointer to the first sample of a component.
+	const float * channel(uint c) const;
+	float * channel(uint c);
+	
+	// Pointer to the first sample of a row of a component.
+	const float * scanline(uint y, uint c) const;
+	float * scanline(uint y, uint c);
+	
+	// Access by pixel coordinates and component.
+	void setPixel(float f, uint x, uint y, uint c);
+	float pixel(uint x, uint y, uint c) const;
+	
+	// Access by linear index into the whole buffer (idx < count()).
+	void setPixel(float f, uint idx);
+	float pixel(uint idx) const;
+	
+	float nearest(int x, int y, int c, WrapMode wm) const;
+	
+	float nearest(float x, float y, int c, WrapMode wm) const;
+	float linear(float x, float y, int c, WrapMode wm) const;
+	
+	float nearest_clamp(float x, float y, int c) const;
+	float nearest_repeat(float x, float y, int c) const;
+	float nearest_mirror(float x, float y, int c) const;
+	
+	float linear_clamp(float x, float y, int c) const;
+	float linear_repeat(float x, float y, int c) const;
+	float linear_mirror(float x, float y, int c) const;
+	//@}
+	
+public:
+	
+	// Offset of the pixel (x, y) inside of a component plane. The first overload
+	// expects coordinates inside of the image, the other ones accept any
+	// coordinate and map it into the image first.
+	uint index(uint x, uint y) const;
+	uint indexClamp(int x, int y) const;
+	uint indexRepeat(int x, int y) const;
+	uint indexMirror(int x, int y) const;
+	uint index(int x, int y, WrapMode wm) const;
+
+public:
+
+	uint16 m_width;			///< Width of the texture.
+	uint16 m_height;		///< Height of the texture.
+	uint32 m_componentNum;	///< Number of components.
+	uint32 m_count;			///< Image pixel count.
+	float * m_mem;			///< Sample buffer, one plane per component.
+
+};
+
+
+/// Read-only pointer to the first sample of the component @a c.
+/// Components are stored one after another, width * height floats each.
+inline const float * FloatImage::channel(uint c) const
+{
+	nvDebugCheck(m_mem != NULL);
+	nvDebugCheck(c < m_componentNum);
+
+	const uint planeOffset = c * m_width * m_height;
+	return &m_mem[planeOffset];
+}
+
+/// Writable pointer to the first sample of the component @a c.
+/// Components are stored one after another, width * height floats each.
+inline float * FloatImage::channel(uint c)
+{
+	nvDebugCheck(m_mem != NULL);
+	nvDebugCheck(c < m_componentNum);
+
+	const uint planeOffset = c * m_width * m_height;
+	return &m_mem[planeOffset];
+}
+
+/// Read-only pointer to the first sample of the row @a y of the component @a c.
+inline const float * FloatImage::scanline(uint y, uint c) const
+{
+	nvDebugCheck(y < m_height);
+
+	const float * plane = channel(c);
+	const uint rowOffset = y * m_width;
+	return plane + rowOffset;
+}
+
+/// Writable pointer to the first sample of the row @a y of the component @a c.
+inline float * FloatImage::scanline(uint y, uint c)
+{
+	nvDebugCheck(y < m_height);
+
+	float * plane = channel(c);
+	const uint rowOffset = y * m_width;
+	return plane + rowOffset;
+}
+
+/// Store @a f in the component @a c of the pixel (@a x, @a y).
+inline void FloatImage::setPixel(float f, uint x, uint y, uint c)
+{
+	nvDebugCheck(m_mem != NULL);
+	nvDebugCheck(x < m_width);
+	nvDebugCheck(y < m_height);
+	nvDebugCheck(c < m_componentNum);
+
+	// Rows of all the components are consecutive in memory.
+	const uint row = c * m_height + y;
+	m_mem[row * m_width + x] = f;
+}
+
+/// Read the component @a c of the pixel (@a x, @a y).
+inline float FloatImage::pixel(uint x, uint y, uint c) const
+{
+	nvDebugCheck(m_mem != NULL);
+	nvDebugCheck(x < m_width);
+	nvDebugCheck(y < m_height);
+	nvDebugCheck(c < m_componentNum);
+
+	// Rows of all the components are consecutive in memory.
+	const uint row = c * m_height + y;
+	return m_mem[row * m_width + x];
+}
+
+/// Set pixel component.
+/// @a idx is a linear index into the whole sample buffer, it has to be less
+/// than count().
+inline void FloatImage::setPixel(float f, uint idx)
+{
+	nvDebugCheck(idx < m_count);
+	m_mem[idx] = f;
+}
+
+/// Get pixel component.
+/// @a idx is a linear index into the whole sample buffer, it has to be less
+/// than count().
+inline float FloatImage::pixel(uint idx) const
+{
+	nvDebugCheck(idx < m_count);
+	return m_mem[idx];
+}
+
+/// Get the offset of the pixel (@a x, @a y) inside of a component plane.
+/// The coordinates have to be inside of the image.
+inline uint FloatImage::index(uint x, uint y) const
+{
+	nvDebugCheck(x < m_width);
+	nvDebugCheck(y < m_height);
+	return y * m_width + x;
+}
+
+/// Get the offset of the pixel (@a x, @a y) inside of a component plane,
+/// clamping each coordinate to the valid range of the image first.
+inline uint FloatImage::indexClamp(int x, int y) const
+{
+	const int cx = nv::clamp(x, int(0), int(m_width-1));
+	const int cy = nv::clamp(y, int(0), int(m_height-1));
+	return cy * m_width + cx;
+}
+
+/// Remainder of a / b that is always in the range [0, b-1] for b > 0, also for
+/// negative values of a, where the % operator would give a negative result.
+inline int repeat_remainder(int a, int b)
+{
+	return (a >= 0) ? (a % b) : ((a + 1) % b + b - 1);
+}
+
+/// Get the offset of the pixel (@a x, @a y) inside of a component plane,
+/// wrapping each coordinate around the image size first (tiling).
+inline uint FloatImage::indexRepeat(int x, int y) const
+{
+	const int wy = repeat_remainder(y, m_height);
+	const int wx = repeat_remainder(x, m_width);
+	return wy * m_width + wx;
+}
+
+/// Get the offset of the pixel (@a x, @a y) inside of a component plane,
+/// reflecting each coordinate back into the image first.
+/// Negative coordinates are reflected around pixel 0 (x -> -x), coordinates
+/// past the end around the far edge (x -> 2 * size - x - 1); that is repeated
+/// until the coordinate is inside of the image.
+// @@ This could be way more efficient.
+inline uint FloatImage::indexMirror(int x, int y) const
+{
+	const int w = m_width;
+	const int h = m_height;
+
+	while (x < 0 || x >= w) {
+		if (x < 0) x = -x;
+		if (x >= w) x = w + w - x - 1;
+	}
+
+	while (y < 0 || y >= h) {
+		if (y < 0) y = -y;
+		if (y >= h) y = h + h - y - 1;
+	}
+
+	return index(x, y);
+}
+
+/// Get the offset of the pixel (@a x, @a y) inside of a component plane,
+/// mapping the coordinates into the image with the given wrap mode.
+/// Any mode other than clamp and repeat is handled as mirror.
+inline uint FloatImage::index(int x, int y, WrapMode wm) const
+{
+	switch (wm) {
+		case WrapMode_Clamp:
+			return indexClamp(x, y);
+		case WrapMode_Repeat:
+			return indexRepeat(x, y);
+		default: // WrapMode_Mirror
+			return indexMirror(x, y);
+	}
+}
+
+} // nv namespace
+
+
+
+#endif // NV_IMAGE_FLOATIMAGE_H
--- a/src/nvimage/HoleFilling.cpp
+++ b/src/nvimage/HoleFilling.cpp
@ -0,0 +1,751 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/Containers.h>
+
+#include <nvmath/nvmath.h>
+
+#include <nvimage/HoleFilling.h>
+#include <nvimage/FloatImage.h>
+
+using namespace nv;
+
+
+// This is a variation of Sapiro's inpainting method.
+//
+// Grows the valid region of the image by one pixel. Every hole (pixel whose
+// bit in bmap is clear) that has at least one valid pixel among its 8
+// neighbours receives, in all the components, a value extrapolated from them
+// and is flagged as valid in bmap on return. Holes without valid neighbours
+// are left untouched. Neighbour coordinates are clamped to the image, so next
+// to the borders the same neighbour can be counted more than once.
+//
+// img  - image to fill in place, must not be NULL.
+// bmap - mask of valid pixels with the size of the image, updated on return.
+void nv::fillExtrapolateOnce(FloatImage * img, BitMap * bmap)
+{
+	nvCheck(img != NULL);
+	nvCheck(bmap != NULL);
+
+	const int w = img->width();
+	const int h = img->height();
+	const int count = img->componentNum();
+
+	nvCheck(bmap->width() == uint(w));
+	nvCheck(bmap->height() == uint(h));
+
+	if (w == 0 || h == 0) {
+		// Nothing to fill.
+		return;
+	}
+
+	// Mask of the pixels that are valid after this pass. The input mask is
+	// only replaced at the end, so that pixels filled during this pass are
+	// not used as sources. It is an automatic variable so that its storage is
+	// released on return (it used to be allocated with new and never deleted).
+	// The BitMap constructor does not clear the bits explicitly, so do it
+	// here to make sure that the remaining holes are reported as such.
+	BitMap newbmap(w, h);
+	newbmap.clearAll();
+
+	// Neighbours in the order in which they are accumulated.
+	enum { West, East, North, South, NorthWest, NorthEast, SouthWest, SouthEast, NeighbourCount };
+
+	for(int y = 0; y < h; y++) {
+		for(int x = 0; x < w; x++) {
+
+			if (bmap->bitAt(x, y)) {
+				// Not a hole.
+				newbmap.setBitAt(x, y);
+				continue;
+			}
+
+			// The neighbourhood only depends on the mask, so evaluate it once
+			// per pixel instead of once per component.
+			const uint neighbour[NeighbourCount] = {
+				img->indexClamp(x-1, y),
+				img->indexClamp(x+1, y),
+				img->indexClamp(x, y-1),
+				img->indexClamp(x, y+1),
+				img->indexClamp(x-1, y-1),
+				img->indexClamp(x+1, y-1),
+				img->indexClamp(x-1, y+1),
+				img->indexClamp(x+1, y+1)
+			};
+
+			bool valid[NeighbourCount];
+			int num = 0;
+			for(int i = 0; i < NeighbourCount; i++) {
+				valid[i] = bmap->bitAt(neighbour[i]);
+				if (valid[i]) num++;
+			}
+
+			if (num == 0) {
+				// Isolated hole, a later pass may reach it.
+				continue;
+			}
+
+			// When the valid neighbours are exactly the three pixels of one
+			// side, the value of the middle one is copied instead of averaging.
+			int source = -1;
+			if (num == 3) {
+				if (valid[West] && valid[NorthWest] && valid[SouthWest]) source = West;
+				else if (valid[East] && valid[NorthEast] && valid[SouthEast]) source = East;
+				else if (valid[North] && valid[NorthWest] && valid[NorthEast]) source = North;
+				else if (valid[South] && valid[SouthWest] && valid[SouthEast]) source = South;
+			}
+
+			const uint center = img->indexClamp(x, y);
+
+			for(int c = 0; c < count; c++) {
+				float * channel = img->channel(c);
+
+				float average = 0.0f;
+				if (source >= 0) {
+					average = channel[neighbour[source]];
+				}
+				else {
+					// Plain average of the valid neighbours.
+					float total = 0.0f;
+					for(int i = 0; i < NeighbourCount; i++) {
+						if (valid[i]) { average += channel[neighbour[i]]; total += 1; }
+					}
+					average /= total;
+				}
+
+				channel[center] = average;
+			}
+
+			newbmap.setBitAt(x, y);
+		}
+	}
+
+	// Update the bit mask.
+	swap(newbmap, *bmap);
+
+}
+
+// Run fillExtrapolateOnce n times in a row on the same image and mask, so that
+// the valid region grows by up to n pixels.
+void nv::fillExtrapolateNTimes(FloatImage * img, BitMap * bmap, int n)
+{
+	nvCheck(img != NULL);
+	nvCheck(bmap != NULL);
+	nvCheck(n > 0);
+
+	for(int remaining = n; remaining > 0; --remaining)
+	{
+		fillExtrapolateOnce(img, bmap);
+	}
+}
+
+
+namespace {
+
+	// Distance stored in the entries that do not know of any seed yet.
+	const uint32 NO_SEED = 0xFFFFFFFF;
+
+	// Entry of the distance map: coordinates of the closest valid pixel (seed)
+	// found so far and squared distance to it.
+	struct Neighbor {
+		uint16 x;
+		uint16 y;
+		uint32 d;
+	};
+
+	// Compute euclidean squared distance.
+	// Evaluated with unsigned arithmetic, the squares do not fit in an int
+	// when the points are far enough from each other.
+	static uint dist( uint16 ax, uint16 ay, uint16 bx, uint16 by ) {
+		const uint dx = (bx > ax) ? uint(bx - ax) : uint(ax - bx);
+		const uint dy = (by > ay) ? uint(by - ay) : uint(ay - by);
+		return dx*dx + dy*dy;
+	}
+	
+	// Check neighbour, this is the core of the EDT algorithm.
+	// If the seed known by the neighbour n is closer to (x, y) than the seed
+	// stored in e, it replaces it.
+	static void checkNeighbour( int x, int y, Neighbor * e, const Neighbor & n ) {
+		nvDebugCheck(e != NULL);
+		
+		if( n.d == NO_SEED ) {
+			// The neighbour does not know any seed yet. Its coordinates are
+			// just a placeholder, using them would make the placeholder
+			// compete with the real seeds and end up as source of the holes
+			// that are closer to it than to any valid pixel.
+			return;
+		}
+		
+		uint d = dist( x, y, n.x, n.y );
+		if( d < e->d ) {
+			e->x = n.x;
+			e->y = n.y;
+			e->d = d;
+		}
+	}
+
+} // namespace
+
+// Voronoi filling using EDT-4
+//
+// Replaces every hole of the image (pixel whose bit in bmap is clear) with a
+// copy of the closest valid pixel, in all the components. The closest pixel is
+// found by propagating the seed coordinates through the image in a forward
+// and a backward sweep. The image is not modified when the mask does not
+// contain any valid pixel.
+//
+// img  - image to fill in place, must not be NULL.
+// bmap - mask of valid pixels, with the size of the image.
+void nv::fillVoronoi(FloatImage * img, const BitMap & bmap)
+{
+	nvCheck(img != NULL);
+
+	const int w = img->width();
+	const int h = img->height();
+	const int count = img->componentNum();
+
+	nvCheck(bmap.width() == uint(w));
+	nvCheck(bmap.height() == uint(h));
+
+	Array<Neighbor> edm;
+	edm.resize(w * h);
+	
+	int x, y;
+
+	// Init edm. Valid pixels are their own seed, holes do not have one yet.
+	for( y = 0; y < h; y++ ) {
+		for( x = 0; x < w; x++ ) {
+			Neighbor & e = edm[y * w + x];
+			if( bmap.bitAt(x, y) ) {
+				e.x = x;
+				e.y = y;
+				e.d = 0;
+			}
+			else {
+				e.x = w;
+				e.y = h;
+				e.d = NO_SEED;
+			}
+		}
+	}
+	
+	// Forward sweep, top to bottom.
+	for( y = 0; y < h; y++ ) {
+		// Left to right: look at the row above and at the pixel to the left.
+		for( x = 0; x < w; x++ ) {
+			const int x0 = clamp(x-1, 0, w-1);	// @@ Wrap?
+			const int x1 = clamp(x+1, 0, w-1);
+			const int y0 = clamp(y-1, 0, h-1);
+			
+			Neighbor & e = edm[y * w + x];
+			checkNeighbour(x, y, &e, edm[y0 * w + x0]);
+			checkNeighbour(x, y, &e, edm[y0 * w + x]);
+			checkNeighbour(x, y, &e, edm[y0 * w + x1]);
+			checkNeighbour(x, y, &e, edm[y * w + x0]);
+		}
+		
+		// Right to left: look at the pixel to the right.
+		for( x = w-1; x >= 0; x-- ) {
+			const int x1 = clamp(x+1, 0, w-1);
+			
+			Neighbor & e = edm[y * w + x];
+			checkNeighbour(x, y, &e, edm[y * w + x1]);
+		}
+	}
+	
+	// Backward sweep, bottom to top.
+	for( y = h-1; y >= 0; y-- ) {
+		// Right to left: look at the row below and at the pixel to the right.
+		for( x = w-1; x >= 0; x-- ) {
+			const int x0 = clamp(x-1, 0, w-1);
+			const int x1 = clamp(x+1, 0, w-1);
+			const int y1 = clamp(y+1, 0, h-1);
+			
+			Neighbor & e = edm[y * w + x];
+			checkNeighbour(x, y, &e, edm[y * w + x1]);
+			checkNeighbour(x, y, &e, edm[y1 * w + x0]);
+			checkNeighbour(x, y, &e, edm[y1 * w + x]);
+			checkNeighbour(x, y, &e, edm[y1 * w + x1]);
+		}
+		
+		// Left to right: look at the pixel to the left.
+		for( x = 0; x < w; x++ ) {
+			const int x0 = clamp(x-1, 0, w-1);
+			
+			Neighbor & e = edm[y * w + x];
+			checkNeighbour(x, y, &e, edm[y * w + x0]);
+		}
+	}
+	
+	// Fill empty holes.
+	for( y = 0; y < h; y++ ) {
+		for( x = 0; x < w; x++ ) {
+			const Neighbor & e = edm[y * w + x];
+			if( e.d == NO_SEED ) {
+				// Only happens when the mask has no valid pixel at all.
+				continue;
+			}
+			
+			const int sx = e.x;
+			const int sy = e.y;
+			nvDebugCheck(sx < w && sy < h);
+			
+			if( sx != x || sy != y ) {
+				for(int c = 0; c < count; c++ ) {
+					img->setPixel(img->pixel(sx, sy, c), x, y, c);
+				}
+			}
+		}
+	}
+
+}
+
+
+// Hole filling by blurring. Not implemented yet: the function only checks the
+// image pointer, it does not read the mask nor modify the image.
+void nv::fillBlur(FloatImage * img, const BitMap & bmap)
+{
+	nvCheck(img != NULL);
+	
+	// @@ Apply a 3x3 kernel.
+}
+
+
+// Build the next level of the pull-push pyramid.
+//
+// Creates an image and a mask with half the size of the source, rounded up so
+// that the last row and column of odd sized levels are not lost. A pixel of
+// the new level is valid when at least one of the source pixels under it is
+// valid, its value is the average of the valid ones.
+//
+// src, srcMask    - level to reduce and mask of its valid pixels.
+// _dst, _dstMask  - receive the new level, only when the function returns
+//                   true. They are allocated with new and owned by the caller.
+//
+// Returns false, without creating anything, when the source has no holes or
+// is too small to be reduced any further.
+static bool downsample(const FloatImage * src, const BitMap * srcMask, const FloatImage ** _dst, const BitMap ** _dstMask)
+{
+	const uint w = src->width();
+	const uint h = src->height();
+	const uint count = src->componentNum();
+
+	if (w <= 2 || h <= 2) {
+		// Stop when the texture is very small. This also covers sizes below 2,
+		// that would produce an empty level.
+		return false;
+	}
+
+	// Look for holes in srcMask, return false if fully filled.
+	bool hasHoles = false;
+	for(uint y = 0; y < h && !hasHoles; y++) {
+		for(uint x = 0; x < w; x++) {
+			if (!srcMask->bitAt(x, y)) {
+				hasHoles = true;
+				break;
+			}
+		}
+	}
+	if (!hasHoles) {
+		return false;
+	}
+
+	// Apply box filter to image and mask and return true.
+	const uint nw = (w + 1) / 2;
+	const uint nh = (h + 1) / 2;
+
+	FloatImage * dst = new FloatImage();
+	dst->allocate(count, nw, nh);
+
+	// The BitMap constructor does not clear the bits explicitly, so do it here
+	// to make sure that the pixels that are not set below are seen as holes.
+	BitMap * dstMask = new BitMap(nw, nh);
+	dstMask->clearAll();
+
+	for(uint y = 0; y < nh; y++) {
+		for(uint x = 0; x < nw; x++) {
+
+			// Collect the valid source pixels of the 2x2 footprint. The
+			// footprint is cut at the border of odd sized sources.
+			uint sx[4];
+			uint sy[4];
+			uint total = 0;
+			for(uint dy = 0; dy < 2; dy++) {
+				for(uint dx = 0; dx < 2; dx++) {
+					const uint px = 2 * x + dx;
+					const uint py = 2 * y + dy;
+					if (px < w && py < h && srcMask->bitAt(px, py)) {
+						sx[total] = px;
+						sy[total] = py;
+						total++;
+					}
+				}
+			}
+
+			if (total == 0) {
+				// Still a hole at this level. Its value is never read.
+				continue;
+			}
+
+			// Set bit mask.
+			dstMask->setBitAt(x, y);
+
+			// Set pixel.
+			for(uint c = 0; c < count; c++) {
+				float value = 0.0f;
+				for(uint i = 0; i < total; i++) {
+					value += src->pixel(sx[i], sy[i], c);
+				}
+				dst->setPixel(value / total, x, y, c);
+			}
+		}
+	}
+
+	*_dst = dst;
+	*_dstMask = dstMask;
+
+	return true;
+}
+
+// This is the filter used in the Lumigraph paper. The Unreal engine uses something similar.
+//
+// Builds a pyramid of reduced versions of the image in which the holes get
+// progressively filled (pull), and then replaces every hole of the image with
+// the value of the finest level in which the corresponding pixel is valid
+// (push). Holes that are not valid in any level are left untouched.
+//
+// img  - image to fill in place, must not be NULL.
+// bmap - mask of valid pixels, with the size of the image. It is not modified.
+void nv::fillPullPush(FloatImage * img, const BitMap & bmap)
+{
+	nvCheck(img != NULL);
+
+	const uint count = img->componentNum();
+	const uint w = img->width();
+	const uint h = img->height();
+	const uint num = log2(max(w,h));
+
+	nvCheck(bmap.width() == w);
+	nvCheck(bmap.height() == h);
+
+	// Build mipmap chain.
+	Array<const FloatImage *> mipmaps(num);
+	Array<const BitMap *> mipmapMasks(num);
+
+	// Level 0 is the input itself, it is owned by the caller.
+	mipmaps.append(img);
+	mipmapMasks.append(&bmap);
+
+	const FloatImage * current = NULL;
+	const BitMap * currentMask = NULL;
+
+	// Compute mipmap chain.
+	while(downsample(mipmaps.back(), mipmapMasks.back(), &current, &currentMask))
+	{
+		mipmaps.append(current);
+		mipmapMasks.append(currentMask);
+	}
+
+	const uint levelCount = mipmaps.count();
+
+	// Sample mipmaps until non-hole is found.
+	for(uint y = 0; y < h; y++) {
+		for(uint x = 0; x < w; x++) {
+
+			if (bmap.bitAt(x, y)) {
+				// Not a hole, keep the pixel of the image.
+				continue;
+			}
+
+			uint sx = x;
+			uint sy = y;
+
+			for(uint l = 1; l < levelCount; l++) {
+
+				// Coordinates of the pixel in the level l.
+				sx /= 2;
+				sy /= 2;
+
+				if (sx >= mipmaps[l]->width() || sy >= mipmaps[l]->height()) {
+					// The level does not cover this pixel.
+					break;
+				}
+
+				if (mipmapMasks[l]->bitAt(sx, sy))
+				{
+					// Sample mipmaps[l](sx, sy) and copy to img(x, y)
+					for(uint c = 0; c < count; c++) {
+						img->setPixel(mipmaps[l]->pixel(sx, sy, c), x, y, c);
+					}
+					break;
+				}
+			}
+		}
+	}
+
+	// Delete the levels created by downsample. Level 0 is skipped on purpose:
+	// it is the image and the mask of the caller, deleting them (as deleteAll
+	// on the whole arrays did) destroys objects that this function does not own.
+	for(uint l = 1; l < levelCount; l++) {
+		delete mipmaps[l];
+		delete mipmapMasks[l];
+	}
+}
+
+
+/*
+void nv::fillSeamFix(FloatImage * img, const BitMap & bmap)
+{
+}
+*/
+#if 0 // Code below is under the BPL license.
+
+
+/**
+
+DoPixelSeamFix
+10-20-02
+
+Looks in the 5x5 local neighborhood (LocalPixels) of the desired pixel to fill.
+It tries to build a quadratic model of the neighborhood surface to use in
+extrapolating.  You need 5 pixels to establish a 2d quadratic curve.
+
+This is really just a nice generic way to extrapolate pixels.  It also happens
+to work great for seam-fixing.
+
+Note that I'm working on normals, but I treat them just as 3 scalars and normalize
+at the end.  To be more correct, I would work on the surface of a sphere, but that
+just seems like way too much work.
+
+**/
+
+// 5x5 neighbourhood of the pixel that is being filled, together with the
+// extrapolators that operate on it. DoLocalPixelFill() is the entry point: it
+// accumulates the estimates in result and counts them in weight.
+// NOTE(review): data and result are declared as float, but the methods assign
+// them to and from gVec4 values (see Quad3SubH and DoLocalPixelFill) and the
+// caller invokes gVec4 methods on result. This block is compiled out and looks
+// only partially ported.
+struct LocalPixels
+{
+	// 5x5 neighborhood
+	// the center is at result
+	// index [y][x]
+	bool fill[5][5];
+	float data[5][5];
+	mutable float result;
+	mutable float weight;
+
+
+	// Second difference (a - 2b + c) along the given row, using the first
+	// complete triple found among columns 1-3, 0-2 and 2-4.
+	// Returns false when none of them is completely filled.
+	bool Quad3SubH(gVec4 * pQ,int row) const
+	{
+		const bool * pFill = fill[row];
+		const float * pDat = data[row];
+	
+		if ( pFill[1] && pFill[2] && pFill[3] )
+		{
+			// good row
+			*pQ = pDat[1] - 2.f * pDat[2] + pDat[3];
+			return true;
+		}
+		else if ( pFill[0] && pFill[1] && pFill[2] )
+		{
+			// good row
+			*pQ = pDat[0] - 2.f * pDat[1] + pDat[2];
+			return true;
+		}
+		else if ( pFill[2] && pFill[3] && pFill[4] )
+		{
+			// good row
+			*pQ = pDat[2] - 2.f * pDat[3] + pDat[4];
+			return true;
+		}
+		return false;
+	}
+
+	// Second difference (a - 2b + c) along the given column, using the first
+	// complete triple found among rows 1-3, 0-2 and 2-4.
+	// Returns false when none of them is completely filled.
+	bool Quad3SubV(gVec4 * pQ,int col) const
+	{
+		if ( fill[1][col] && fill[2][col] && fill[3][col] )
+		{
+			// good column
+			*pQ = data[1][col] - 2.f * data[2][col] + data[3][col];
+			return true;
+		}
+		else if ( fill[0][col] && fill[1][col] && fill[2][col] )
+		{
+			// good column
+			*pQ = data[0][col] - 2.f * data[1][col] + data[2][col];
+			return true;
+		}
+		else if ( fill[2][col] && fill[3][col] && fill[4][col] )
+		{
+			// good column
+			*pQ = data[2][col] - 2.f * data[3][col] + data[4][col];
+			return true;
+		}
+		return false;
+	}
+	
+	// Horizontal second difference taken from the rows 1 and 3: the average
+	// of both when both are available, otherwise the one that is.
+	bool Quad3H(gVec4 * pQ) const
+	{
+		if ( ! Quad3SubH(pQ,1) )
+		{
+			return Quad3SubH(pQ,3);	
+		}
+		gVec4 q(0,0,0,0); // initializer not needed, just make it shut up
+		if ( Quad3SubH(&q,3) )
+		{
+			// got q and pQ
+			*pQ = (*pQ+q)*0.5f;
+		}
+		return true;
+	}
+	
+	// Vertical second difference taken from the columns 1 and 3: the average
+	// of both when both are available, otherwise the one that is.
+	bool Quad3V(gVec4 * pQ) const
+	{
+		if ( ! Quad3SubV(pQ,1) )
+		{
+			return Quad3SubV(pQ,3);	
+		}
+		gVec4 q(0,0,0,0); // initializer not needed, just make it shut up
+		if ( Quad3SubV(&q,3) )
+		{
+			// got q and pQ
+			*pQ = (*pQ+q)*0.5f;
+		}
+		return true;
+	}
+	// Quad returns ([0]+[2] - 2.f*[1])
+	//	a common want is [1] - ([0]+[2])*0.5f ;
+	// so use -0.5f*Quad
+
+	// Quadratic estimates of the center: every pair of filled pixels that
+	// straddles the center or leads into it adds one estimate, corrected with
+	// the second difference measured in the neighbouring rows or columns.
+	// Returns true if at least one estimate was added.
+	bool TryQuads() const
+	{
+		bool res = false;
+	
+		// look for a pair that straddles the middle:
+		if ( fill[2][1] && fill[2][3] )
+		{
+			// got horizontal straddle
+			gVec4 q;
+			if ( Quad3H(&q) )
+			{
+				result += (data[2][1] + data[2][3] - q) * 0.5f;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		if ( fill[1][2] && fill[3][2] )
+		{
+			// got vertical straddle
+			gVec4 q;
+			if ( Quad3V(&q) )
+			{
+				result += (data[1][2] + data[3][2] - q) * 0.5f;
+				weight += 1.f;
+				res = true;
+			}
+		}
+	
+		// look for pairs that lead into the middle :
+		if ( fill[2][0] && fill[2][1] )
+		{
+			// got left-side pair
+			gVec4 q;
+			if ( Quad3H(&q) )
+			{
+				result += data[2][1]*2.f - data[2][0] + q;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		if ( fill[2][3] && fill[2][4] )
+		{
+			// got right-side pair
+			gVec4 q;
+			if ( Quad3H(&q) )
+			{
+				result += data[2][3]*2.f - data[2][4] + q;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		if ( fill[0][2] && fill[1][2] )
+		{
+			// got the pair in rows 0 and 1 of the center column
+			gVec4 q;
+			if ( Quad3V(&q) )
+			{
+				result += data[1][2]*2.f - data[0][2] + q;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		if ( fill[3][2] && fill[4][2] )
+		{
+			// got the pair in rows 3 and 4 of the center column
+			gVec4 q;
+			if ( Quad3V(&q) )
+			{
+				result += data[3][2]*2.f - data[4][2] + q;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		return res;
+	}
+	
+	// Planar estimates of the center: for each of the four 2x2 quads that
+	// contain the center, when its other three pixels are filled, add the sum
+	// of the two pixels adjacent to the center minus the diagonal one.
+	// Returns true if at least one estimate was added.
+	bool TryPlanar() const
+	{
+		// four cases :
+		const int indices[] =
+		{
+			2,1, 1,2, 1,1,
+			2,1, 3,2, 3,1,
+			2,3, 1,2, 1,3,
+			2,3, 3,2, 3,3
+		};
+		bool res = false;
+		for(int i=0;i<4;i++)
+		{
+			const int * I = indices + i*6;
+			if ( ! fill[ I[0] ][ I[1] ] )
+				continue;
+			if ( ! fill[ I[2] ][ I[3] ] )
+				continue;
+			if ( ! fill[ I[4] ][ I[5] ] )
+				continue;
+	
+			result += data[ I[0] ][ I[1] ] + data[ I[2] ][ I[3] ] - data[ I[4] ][ I[5] ];
+			weight += 1.f;
+			res = true;
+		}
+		return res;
+	}
+	
+	// Linear estimates of the center from pairs of pixels: the average of
+	// two pixels on opposite sides of the center, or the extrapolation
+	// 2*near - far of two pixels on the same side.
+	// Returns true if at least one estimate was added.
+	bool TryTwos() const
+	{
+		bool res = false;
+	
+		if ( fill[2][1] && fill[2][3] )
+		{
+			result += (data[2][1] + data[2][3]) * 0.5f;
+			weight += 1.f;
+			res = true;
+		}
+		if ( fill[1][2] && fill[3][2] )
+		{
+			result += (data[1][2] + data[3][2]) * 0.5f;
+			weight += 1.f;
+			res = true;
+		}
+		
+		// four side-rotates :
+		const int indices[] =
+		{
+			2,1, 2,0,
+			2,3, 2,4,
+			1,2, 0,2,
+			3,2, 4,2,
+		};
+		for(int i=0;i<4;i++)
+		{
+			const int * I = indices + i*4;
+			if ( ! fill[ I[0] ][ I[1] ] )
+				continue;
+			if ( ! fill[ I[2] ][ I[3] ] )
+				continue;
+	
+			result += data[ I[0] ][ I[1] ]*2.f - data[ I[2] ][ I[3] ];
+			weight += 1.f;
+			res = true;
+		}
+	
+		return res;
+	}
+
+
+	// Estimate the center pixel. Resets result and weight and tries the
+	// quadratic, planar and linear estimators in that order, stopping at the
+	// first one that succeeds. result holds the sum of the estimates and
+	// weight their number, the sum is not divided here.
+	bool DoLocalPixelFill() const
+	{
+		result = gVec4::zero;
+		weight = 0.f;
+	
+		if ( TryQuads() )
+			return true;
+			
+		if ( TryPlanar() )
+			return true;
+	
+		return TryTwos();
+	}
+
+}; // LocalPixels -----------------------------------------------
+
+// Fill the empty and edge pixels of the normal map by extrapolating from
+// their 5x5 neighbourhood (see LocalPixels). Runs NUM_SEAMFIX_PASSES passes
+// over the map; after each pass the pixels that received a value are marked
+// as eSeamFixed, so that the following pass can use them as sources.
+// Progress dots are written to the log while it runs. Compiled out.
+void gNormalMap::DoPixelSeamFix()
+{
+	gLog::Printf("gNormalMap::DoPixelSeamFix..");
+
+	// Number of rows to process between two progress dots.
+	const int desiredTicks = 30;
+	const int heightPerTick = NUM_SEAMFIX_PASSES * m_height / desiredTicks;
+	int tick = 0;
+
+	for(int pass=0;pass<NUM_SEAMFIX_PASSES;pass++)
+	{
+		for(int yb=0;yb<m_height;yb++)
+		{
+			gVec4 * pRow = m_normals + m_width * yb;
+			const EState * pStateRow = m_states + m_width * yb;
+			for(int xb=0;xb<m_width;xb++)
+			{
+				if ( pStateRow[xb] != eNull && pStateRow[xb] != eEdge )
+				{
+					ASSERT( ! IsNull(pRow[xb]) );
+					continue; // it's got a pixel
+				}
+				// can be non-null, if it wasn't actually inside any tri,
+				//	but got the anti-aliased edge effect of a tri
+				// replace edge pixels with seam-fix here
+				//ASSERT( IsNull(pRow[xb]) );
+
+				// make the local neighborhood:
+				int numFill = 0;
+				LocalPixels lp;
+				for(int ny=0;ny<5;ny++)
+				{
+					int y = (yb + ny - 2);
+					if ( y < 0 || y >= m_height )
+					{
+						// out of range
+						for(int i=0;i<5;i++)
+						{
+							lp.fill[ny][i] = false;
+						}
+						continue;
+					}
+					// These two shadow the row pointers of the pixel being
+					// filled for the rest of this loop body.
+					gVec4 * pRow = m_normals + m_width * y;
+					const EState * pStateRow = m_states + m_width * y;
+					for(int nx=0;nx<5;nx++)
+					{
+						int x = (xb + nx - 2);
+						if ( x < 0 || x >= m_width )
+						{
+							lp.fill[ny][nx] = false;
+						}
+						else if ( pStateRow[x] == eNull || pStateRow[x] == eEdge )
+						{
+							lp.fill[ny][nx] = false;
+						}
+						else
+						{
+							lp.fill[ny][nx] = true;
+							lp.data[ny][nx] = pRow[x];
+							numFill++;
+						}
+					}
+				}
+
+				// need at least 3 to do anything decent
+				// NOTE(review): the test below only rejects neighbourhoods
+				// with less than 2 filled pixels, confirm which one is meant.
+				if ( numFill < 2 )
+					continue;
+
+				ASSERT(lp.fill[2][2] == false);
+				if ( lp.DoLocalPixelFill() )
+				{
+					// The sum of the estimates is only stored when its xyz
+					// part can be normalized; the fourth component is divided
+					// by the number of estimates instead.
+					if ( lp.result.MutableVec3().NormalizeSafe() )
+					{
+						pRow[xb] = lp.result;
+						pRow[xb][3] /= lp.weight;
+					}
+				}
+			}
+
+			if ( ++tick == heightPerTick )
+			{
+				tick = 0;
+				gLog::Printf(".");
+			}
+		}
+
+		// now run back over and stamp anything that's not null as being ok
+
+		for(int y=0;y<m_height;y++)
+		{
+			const gVec4 * pRow = m_normals + m_width * y;
+			EState * pStateRow = m_states + m_width * y;
+			for(int x=0;x<m_width;x++)
+			{
+				if ( ( pStateRow[x] == eNull || pStateRow[x] == eEdge ) && ! IsNull(pRow[x]) )
+				{
+					pStateRow[x] = eSeamFixed;
+				}
+			}
+		}
+	}
+	
+	gLog::Printf("done\n");
+}
+
+#endif // 0
--- a/src/nvimage/HoleFilling.h
+++ b/src/nvimage/HoleFilling.h
@ -0,0 +1,96 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_HOLEFILLING_H
+#define NV_IMAGE_HOLEFILLING_H
+
+#include <nvcore/BitArray.h>
+#include <nvimage/nvimage.h>
+
+namespace nv 
+{
+	class FloatImage;
+
+	/// Bit mask.
+	/// Two dimensional array of bits of fixed size, stored row by row: the
+	/// bit of the element (x, y) is at the linear index y * width + x. The
+	/// accessors that take a single argument use that linear index.
+	class BitMap
+	{
+	public:
+		/// Create a bitmap of @a w by @a h bits, all of them cleared.
+		BitMap(uint w, uint h) : 
+			m_width(w), m_height(h), m_bitArray(w*h) 
+		{
+			// The hole filling functions only set the bits that they need
+			// and expect the remaining ones to be clear. BitArray is not
+			// known to zero its storage on construction, so do it here.
+			if (w*h != 0) m_bitArray.clearAll();
+		}
+		
+		/// Get the number of columns.
+		uint width() const { return m_width; }
+		/// Get the number of rows.
+		uint height() const { return m_height; }
+		
+		/// Get the bit of the element (x, y).
+		bool bitAt(uint x, uint y) const
+		{
+			nvDebugCheck(x < m_width && y < m_height);
+			return m_bitArray.bitAt(y * m_width + x);
+		}
+		/// Get the bit at the given linear index.
+		bool bitAt(uint idx) const
+		{
+			nvDebugCheck(idx < m_width * m_height);
+			return m_bitArray.bitAt(idx);
+		}
+	
+		/// Set the bit of the element (x, y).
+		void setBitAt(uint x, uint y)
+		{
+			nvDebugCheck(x < m_width && y < m_height);
+			m_bitArray.setBitAt(y * m_width + x);
+		}
+		/// Set the bit at the given linear index.
+		void setBitAt(uint idx)
+		{
+			nvDebugCheck(idx < m_width * m_height);
+			m_bitArray.setBitAt(idx);
+		}
+	
+		/// Clear the bit of the element (x, y).
+		void clearBitAt(uint x, uint y)
+		{
+			nvDebugCheck(x < m_width && y < m_height);
+			m_bitArray.clearBitAt(y * m_width + x);
+		}
+		/// Clear the bit at the given linear index.
+		void clearBitAt(uint idx)
+		{
+			nvDebugCheck(idx < m_width * m_height);
+			m_bitArray.clearBitAt(idx);
+		}
+	
+		/// Clear all the bits.
+		void clearAll()
+		{
+			m_bitArray.clearAll();
+		}
+	
+		/// Set all the bits.
+		void setAll()
+		{
+			m_bitArray.setAll();
+		}
+	
+		/// Invert all the bits.
+		void toggleAll()
+		{
+			m_bitArray.toggleAll();
+		}
+		
+		/// Exchange the contents of two bitmaps of the same size.
+		/// The dimensions are constant members, so only the bits are swapped.
+		friend void swap(BitMap & a, BitMap & b)
+		{
+			nvCheck(a.m_width == b.m_width);
+			nvCheck(a.m_height == b.m_height);
+			//swap(const_cast<uint &>(a.m_width), const_cast<uint &>(b.m_width));
+			//swap(const_cast<uint &>(a.m_height), const_cast<uint &>(b.m_height));
+			swap(a.m_bitArray, b.m_bitArray);
+		}
+		
+	private:
+		
+		const uint m_width;		///< Number of columns.
+		const uint m_height;	///< Number of rows.
+		BitArray m_bitArray;	///< The bits, width * height of them.
+		
+	};
+
+	NVIMAGE_API void fillVoronoi(FloatImage * img, const BitMap & bmap);
+	NVIMAGE_API void fillBlur(FloatImage * img, const BitMap & bmap);
+	NVIMAGE_API void fillPullPush(FloatImage * img, const BitMap & bmap);
+	
+	NVIMAGE_API void fillExtrapolateOnce(FloatImage * img, BitMap * bmap);
+	NVIMAGE_API void fillExtrapolateNTimes(FloatImage * img, BitMap * bmap, int n);
+	
+} // nv namespace
+
+#endif // NV_IMAGE_HOLEFILLING_H
--- a/src/nvimage/Image.cpp
+++ b/src/nvimage/Image.cpp
@ -0,0 +1,125 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/Debug.h>
+#include <nvcore/Ptr.h>
+
+#include <nvmath/Color.h>
+
+#include <nvimage/Image.h>
+#include <nvimage/ImageIO.h>
+
+
+using namespace nv;
+
+/// Construct an empty image: 0x0 pixels, RGB format and no pixel buffer.
+Image::Image() : m_width(0), m_height(0), m_format(Format_RGB), m_data(NULL)
+{
+}
+
+/// Destroy the image, releasing the pixel buffer through free().
+/// A buffer attached with wrap() is released as well unless unwrap() was
+/// called beforehand.
+Image::~Image()
+{
+	free();
+}
+
+/// Replace the current pixel buffer with a newly allocated one of w*h pixels.
+/// The previous buffer is freed first; the format is left unchanged.
+void Image::allocate(uint w, uint h)
+{
+	free();
+	m_width = w;
+	m_height = h;
+	m_data = new Color32[w*h];
+}
+
+/// Load the image file `name` through ImageIO::load and take over its contents
+/// (size, format and pixels).
+/// Returns true on success. Returns false when the file could not be loaded;
+/// in that case this image keeps its previous contents.
+bool Image::load(const char * name)
+{
+	// Load before touching our own state, so that a failure does not leave
+	// the image with a freed buffer and stale dimensions.
+	AutoPtr<Image> img(ImageIO::load(name));
+	if (img == NULL) {
+		return false;
+	}
+	
+	// Exchange contents with the loaded image. Our previous buffer now belongs
+	// to img and is released when img is destroyed at the end of this scope.
+	swap(m_width, img->m_width);
+	swap(m_height, img->m_height);
+	swap(m_format, img->m_format);
+	swap(m_data, img->m_data);
+	
+	return true;
+}
+
+/// Point the image at externally provided pixel memory of w by h pixels.
+/// The current buffer is freed first. The pointer is stored as is (no copy),
+/// so it will be released by free()/the destructor unless unwrap() is called
+/// before that happens.
+void Image::wrap(void * data, uint w, uint h)
+{
+	free();
+	m_data = (Color32 *)data;
+	m_width = w;
+	m_height = h;
+}
+
+/// Detach the current pixel buffer without freeing it and reset the size to 0x0.
+void Image::unwrap()
+{
+	m_data = NULL;
+	m_width = 0;
+	m_height = 0;
+}
+
+
+/// Release the pixel buffer and reset the pointer to NULL.
+/// Width, height and format are left untouched.
+void Image::free()
+{
+	// The buffer is created with new Color32[...] in allocate(), so it has to
+	// be released with the array form of delete.
+	delete [] m_data;
+	m_data = NULL;
+}
+
+
+/// Image width in pixels.
+uint Image::width() const
+{
+	return m_width;
+}
+
+/// Image height in pixels.
+uint Image::height() const
+{
+	return m_height;
+}
+
+/// Read-only pointer to the first pixel of row h.
+/// h must be smaller than the image height (checked in debug builds only).
+const Color32 * Image::scanline(uint h) const
+{
+	nvDebugCheck(h < m_height);
+	return m_data + h * m_width;
+}
+
+/// Writable pointer to the first pixel of row h.
+/// h must be smaller than the image height (checked in debug builds only).
+Color32 * Image::scanline(uint h)
+{
+	nvDebugCheck(h < m_height);
+	return m_data + h * m_width;
+}
+
+/// Read-only pointer to the pixel buffer; NULL when no buffer is attached.
+const Color32 * Image::pixels() const
+{
+	return m_data;
+}
+
+/// Writable pointer to the pixel buffer; NULL when no buffer is attached.
+Color32 * Image::pixels()
+{
+	return m_data;
+}
+
+/// Read-only access to the pixel at linear index idx.
+/// idx must be smaller than width*height (checked in debug builds only).
+const Color32 & Image::pixel(uint idx) const
+{
+	nvDebugCheck(idx < m_width * m_height);
+	return m_data[idx];
+}
+
+/// Writable access to the pixel at linear index idx.
+/// idx must be smaller than width*height (checked in debug builds only).
+Color32 & Image::pixel(uint idx)
+{
+	nvDebugCheck(idx < m_width * m_height);
+	return m_data[idx];
+}
+
+
+/// Current format tag of the image.
+Image::Format Image::format() const
+{
+	return m_format;
+}
+
+/// Change the format tag. Only the tag is updated; pixel data is not converted.
+void Image::setFormat(Image::Format f)
+{
+	m_format = f;
+}
+
--- a/src/nvimage/Image.h
+++ b/src/nvimage/Image.h
@ -0,0 +1,77 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_IMAGE_H
+#define NV_IMAGE_IMAGE_H
+
+#include <nvcore/Debug.h>
+#include <nvimage/nvimage.h>
+
+namespace nv
+{
+	class Color32;
+	
+	/// 32 bit RGBA image.
+	///
+	/// Stores width*height Color32 pixels row by row: pixel (x, y) is found at
+	/// linear index y * width + x. The pixel buffer is either allocated by the
+	/// image itself (allocate, load) or attached from outside (wrap).
+	///
+	/// NOTE(review): no copy constructor or assignment operator is declared, so
+	/// the compiler-generated ones copy the raw buffer pointer and two images
+	/// would end up releasing the same buffer. Confirm whether copying should
+	/// be disabled or implemented.
+	class Image
+	{
+	public:
+		
+		/// Format tag carried alongside the pixels.
+		enum Format 
+		{
+			Format_RGB,
+			Format_ARGB,
+		};
+		
+		NVIMAGE_API Image();
+		NVIMAGE_API ~Image();
+		
+		/// Allocate a new buffer of w*h pixels, or load the image from a file.
+		NVIMAGE_API void allocate(uint w, uint h);
+		NVIMAGE_API bool load(const char * name);
+		
+		/// Attach external pixel memory / detach the current buffer without freeing it.
+		NVIMAGE_API void wrap(void * data, uint w, uint h);
+		NVIMAGE_API void unwrap();
+		
+		NVIMAGE_API uint width() const;
+		NVIMAGE_API uint height() const;
+		
+		/// Pointer to the first pixel of row h.
+		NVIMAGE_API const Color32 * scanline(uint h) const;
+		NVIMAGE_API Color32 * scanline(uint h);
+		
+		/// Pointer to the whole pixel buffer.
+		NVIMAGE_API const Color32 * pixels() const;
+		NVIMAGE_API Color32 * pixels();
+		
+		/// Pixel access by linear index.
+		NVIMAGE_API const Color32 & pixel(uint idx) const;
+		NVIMAGE_API Color32 & pixel(uint idx);
+		
+		/// Pixel access by column and row (defined inline below).
+		const Color32 & pixel(uint x, uint y) const;
+		Color32 & pixel(uint x, uint y);
+		
+		NVIMAGE_API Format format() const;
+		NVIMAGE_API void setFormat(Format f);
+		
+	private:
+		/// Release the pixel buffer.
+		void free();
+		
+	private:
+		uint m_width;
+		uint m_height;
+		Format m_format;
+		Color32 * m_data;
+	};
+
+
+	/// Read-only access to the pixel at column x, row y.
+	/// Both coordinates must lie inside the image (checked in debug builds only).
+	inline const Color32 & Image::pixel(uint x, uint y) const
+	{
+		nvDebugCheck(x < width() && y < height());
+		const uint linearIndex = y * width() + x;
+		return pixel(linearIndex);
+	}
+	
+	/// Writable access to the pixel at column x, row y.
+	/// Both coordinates must lie inside the image (checked in debug builds only).
+	inline Color32 & Image::pixel(uint x, uint y)
+	{
+		nvDebugCheck(x < width() && y < height());
+		const uint linearIndex = y * width() + x;
+		return pixel(linearIndex);
+	}
+
+} // nv namespace
+
+
+#endif // NV_IMAGE_IMAGE_H
--- a/src/nvimage/ImageIO.cpp
+++ b/src/nvimage/ImageIO.cpp
@ -0,0 +1,863 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/Ptr.h>
+#include <nvcore/Containers.h>
+#include <nvcore/StrLib.h>
+#include <nvcore/StdStream.h>
+
+#include <nvmath/Color.h>
+
+#include "ImageIO.h"
+#include "Image.h"
+#include "FloatImage.h"
+#include "TgaFile.h"
+
+// Extern
+#if defined(HAVE_JPEG)
+extern "C" {
+#	include <jpeglib.h>
+}
+#endif
+
+#if defined(HAVE_PNG)
+#	include <png.h>
+#endif
+
+#if defined(HAVE_TIFF)
+#	define _TIFF_DATA_TYPEDEFS_
+#	include <tiffio.h>
+#endif
+
+using namespace nv;
+
+namespace {
+
+	// Array of image load plugins.
+//	static HashMap<String, ImageInput_Plugin> s_plugin_load_map;
+
+	// Array of image save plugins.
+//	static HashMap<String, ImageOutput_Plugin> s_plugin_save_map;
+	
+	// 16 bit pixel with three 5 bit channels declared in the order blue, green,
+	// red; one bit of the 16 is unused. loadTGA reinterprets the bytes of 16 bit
+	// pixels as this struct.
+	// NOTE(review): bit-field layout is implementation defined, so this relies
+	// on the compiler placing `b` in the lowest bits -- confirm for each target.
+	struct Color555 {
+		uint16 b : 5;
+		uint16 g : 5;
+		uint16 r : 5;
+	};
+	
+} // namespace
+
+
+/// Open the file `name` and decode it with the loader selected by its extension.
+/// Returns NULL when the file cannot be opened or no loader produced an image.
+/// The caller owns the returned image.
+Image * nv::ImageIO::load(const char * name)
+{
+	StdInputStream stream(name);
+	
+	if (stream.isError()) {
+		// The function returns a pointer: report failure with NULL, not with
+		// the boolean `false`.
+		return NULL;
+	}
+	
+	return load(name, stream);
+}
+
+/// Decode an image from the stream `s`, choosing the decoder from the file
+/// extension of `name` (compared case-insensitively). TGA is always available;
+/// JPEG and PNG only when the library was built with HAVE_JPEG / HAVE_PNG.
+/// Returns NULL for any other extension.
+Image * nv::ImageIO::load(const char * name, Stream & s)
+{
+	const char * const ext = Path::extension(name);
+	
+	const bool isTga = (strCaseCmp(ext, ".tga") == 0);
+	if (isTga) {
+		return loadTGA(s);
+	}
+#if defined(HAVE_JPEG)
+	const bool isJpeg = (strCaseCmp(ext, ".jpg") == 0) || (strCaseCmp(ext, ".jpeg") == 0);
+	if (isJpeg) {
+		return loadJPG(s);
+	}
+#endif
+#if defined(HAVE_PNG)
+	const bool isPng = (strCaseCmp(ext, ".png") == 0);
+	if (isPng) {
+		return loadPNG(s);
+	}
+#endif
+	// @@ use image plugins?
+
+	// Unknown extension: nothing can decode this stream.
+	return NULL;
+}
+
+
+/// Load TGA image.
+///
+/// Reads the header, the optional 24 bit palette and the pixel data (raw or
+/// RLE packed) from the stream and converts them to a 32 bit image. Indexed,
+/// grey and 16/24/32 bit RGB images are handled; grey and 32 bit RGB images
+/// are tagged Format_ARGB.
+/// Returns NULL when the image type, palette layout or pixel size is not
+/// supported. The caller owns the returned image.
+Image * nv::ImageIO::loadTGA(Stream & s)
+{
+	nvCheck(!s.isError());
+	
+	TgaHeader tga;
+	s << tga;
+	s.seek(TgaHeader::Size + tga.id_length);
+
+	// Get header info.
+	bool rle = false;
+	bool pal = false;
+	bool grey = false;
+
+	switch( tga.image_type ) {
+		case TGA_TYPE_RLE_INDEXED:
+			rle = true;
+			// no break is intended!
+		case TGA_TYPE_INDEXED:
+			if( tga.colormap_type!=1 || tga.colormap_size!=24 || tga.colormap_length>256 ) {
+				nvDebug( "*** ImageIO::loadTGA: Error, only 24bit paletted images are supported.\n" );
+				return NULL;
+			}
+			pal = true;
+			break;
+
+		case TGA_TYPE_RLE_RGB:
+			rle = true;
+			// no break is intended!
+		case TGA_TYPE_RGB:
+			break;
+
+		case TGA_TYPE_RLE_GREY:
+			rle = true;
+			// no break is intended!
+		case TGA_TYPE_GREY:
+			grey = true;
+			break;
+
+		default:
+			nvDebug( "*** ImageIO::loadTGA: Error, unsupported image type.\n" );
+			return NULL;
+	}
+
+	// Bytes per stored pixel. Reject sizes the decoder cannot handle at run
+	// time, not only in debug builds: the RLE scratch pixel below holds at most
+	// 4 bytes, and the RGB conversion only knows 16, 24 and 32 bit pixels.
+	const uint pixel_size = (tga.pixel_size/8);
+	const bool rgbDepthOk = (tga.pixel_size == 16 || tga.pixel_size == 24 || tga.pixel_size == 32);
+	if( pixel_size == 0 || pixel_size > 4 || (!pal && !grey && !rgbDepthOk) ) {
+		nvDebug( "*** ImageIO::loadTGA: Error, unsupported pixel size.\n" );
+		return NULL;
+	}
+	
+	const uint size = tga.width * tga.height * pixel_size;
+
+	
+	// Read palette. It is zero filled first so that indices beyond the stored
+	// colormap decode to black instead of reading uninitialized memory.
+	uint8 palette[768] = { 0 };
+	if( pal ) {
+		// 256 entries are accepted above and fit exactly in the buffer.
+		nvDebugCheck(tga.colormap_length <= 256);
+		s.serialize(palette, 3 * tga.colormap_length);
+	}
+
+	// Decode image.
+	uint8 * mem = new uint8[size];
+	if( rle ) {
+		// Decompress image in src.
+		uint8 * dst = mem;
+		uint remaining = size;	// Bytes still to be produced; always a multiple of pixel_size.
+
+		while (remaining > 0) {
+			// Get packet header
+			uint8 c = 0; 
+			s << c;
+
+			uint count = (c & 0x7f) + 1;
+			
+			// Clamp the packet to the space left, so that a malformed file
+			// cannot make the decoder write past the end of mem.
+			if (count * pixel_size > remaining) {
+				count = remaining / pixel_size;
+			}
+			remaining -= count * pixel_size;
+
+			if (c & 0x80) {
+				// RLE pixels: one pixel repeated count times.
+				uint8 pixel[4];	// uint8 pixel[pixel_size];
+				s.serialize( pixel, pixel_size );
+				do {
+					memcpy(dst, pixel, pixel_size);
+					dst += pixel_size;
+				} while (--count);
+			}
+			else {
+				// Raw pixels: count pixels stored verbatim.
+				count *= pixel_size;
+				//file->Read8(dst, count);
+				s.serialize(dst, count);
+				dst += count;
+			}
+		}
+	}
+	else {
+		s.serialize(mem, size);
+	}
+
+	// Allocate image.
+	AutoPtr<Image> img(new Image());
+	img->allocate(tga.width, tga.height);
+
+	// Rows are written top to bottom when the file has an upper origin,
+	// bottom to top otherwise; lstep is the signed row stride in pixels.
+	int lstep;
+	Color32 * dst;
+	if( tga.flags & TGA_ORIGIN_UPPER ) {
+		lstep = tga.width;
+		dst = img->pixels();
+	}
+	else {
+		lstep = - tga.width;
+		dst = img->pixels() + (tga.height-1) * tga.width;
+	}
+
+	// Write image.
+	uint8 * src = mem;
+	if( pal ) {
+		// One index byte per pixel, looked up in the BGR palette.
+		for( int y = 0; y < tga.height; y++ ) {
+			for( int x = 0; x < tga.width; x++ ) {
+				uint8 idx = *src++;
+				dst[x].setBGRA(palette[3*idx+0], palette[3*idx+1], palette[3*idx+2], 0xFF);
+			}
+			dst += lstep;
+		}
+	}
+	else if( grey ) {
+		img->setFormat(Image::Format_ARGB);
+		
+		// The grey value is replicated into all four channels.
+		for( int y = 0; y < tga.height; y++ ) {
+			for( int x = 0; x < tga.width; x++ ) {
+				dst[x].setBGRA(*src, *src, *src, *src);
+				src++;
+			}
+			dst += lstep;
+		}
+	}
+	else {
+		
+		if( tga.pixel_size == 16 ) {
+			// 5 bit channels are expanded to 8 bits by bit replication.
+			for( int y = 0; y < tga.height; y++ ) {
+				for( int x = 0; x < tga.width; x++ ) {
+					Color555 c = *reinterpret_cast<Color555 *>(src);
+					uint8 b = (c.b << 3) | (c.b >> 2);					
+					uint8 g = (c.g << 3) | (c.g >> 2);
+					uint8 r = (c.r << 3) | (c.r >> 2);
+					dst[x].setBGRA(b, g, r, 0xFF);
+					src += 2;
+				}
+				dst += lstep;
+			}
+		}
+		else if( tga.pixel_size == 24 ) {
+			for( int y = 0; y < tga.height; y++ ) {
+				for( int x = 0; x < tga.width; x++ ) {
+					dst[x].setBGRA(src[0], src[1], src[2], 0xFF);
+					src += 3;
+				}
+				dst += lstep;
+			}
+		}
+		else if( tga.pixel_size == 32 ) {
+			img->setFormat(Image::Format_ARGB);
+			
+			for( int y = 0; y < tga.height; y++ ) {
+				for( int x = 0; x < tga.width; x++ ) {
+					dst[x].setBGRA(src[0], src[1], src[2], src[3]);
+					src += 4;
+				}
+				dst += lstep;
+			}
+		}
+	}
+
+	// free uncompressed data.
+	delete [] mem;
+
+	return img.release();
+}
+
+/// Save TGA image.
+///
+/// Writes `img` as an uncompressed RGB TGA with an upper origin: 32 bits per
+/// pixel when the image format is Format_ARGB, 24 bits per pixel otherwise.
+/// Channels are stored in B, G, R(, A) order. Always returns true.
+bool nv::ImageIO::saveTGA(Stream & s, const Image * img)
+{
+	nvCheck(!s.isError());
+	nvCheck(img != NULL);
+	nvCheck(img->pixels() != NULL);
+	
+	const bool hasAlpha = (img->format() == Image::Format_ARGB);
+	
+	// Fill in the header.
+	TgaFile tga;
+	tga.head.id_length = 0;
+	tga.head.colormap_type = 0;
+	tga.head.image_type = TGA_TYPE_RGB;
+
+	tga.head.colormap_index = 0;
+	tga.head.colormap_length = 0;
+	tga.head.colormap_size = 0;
+
+	tga.head.x_origin = 0;
+	tga.head.y_origin = 0;
+	tga.head.width = img->width();
+	tga.head.height = img->height();
+	tga.head.pixel_size = hasAlpha ? 32 : 24;
+	tga.head.flags = TGA_ORIGIN_UPPER;
+
+	// @@ Serialize directly.
+	tga.allocate();
+
+	// Copy the pixels into the file buffer, 4 or 3 bytes per pixel.
+	const uint pixelCount = img->width() * img->height();
+	const uint bytesPerPixel = hasAlpha ? 4 : 3;
+	for(uint p = 0; p < pixelCount; p++) {
+		const Color32 texel = img->pixel(p);
+		const uint base = bytesPerPixel * p;
+		tga.mem[base + 0] = texel.b;
+		tga.mem[base + 1] = texel.g;
+		tga.mem[base + 2] = texel.r;
+		if (hasAlpha) {
+			tga.mem[base + 3] = texel.a;
+		}
+	}
+
+	s << tga;
+	
+	tga.free();
+	
+	return true;
+}
+
+
+#if defined(HAVE_PNG)
+
+/// libpng read callback: pulls `length` bytes from the Stream registered with
+/// png_set_read_fn into `data`, and raises a libpng error if the stream
+/// reports a failure.
+static void user_read_data(png_structp png_ptr, png_bytep data, png_size_t length)
+{
+	nvDebugCheck(png_ptr != NULL);
+	
+	// Fetch the user pointer through the accessor rather than by reaching into
+	// the png struct, whose members are not part of the public interface.
+	Stream * s = (Stream *)png_get_io_ptr(png_ptr);
+	s->serialize(data, (int)length);
+	
+	if (s->isError()) {
+		png_error(png_ptr, "Read Error");
+	}
+}
+
+
+/// Load a PNG image from the stream.
+///
+/// The file is decoded with libpng into 8 bit per channel, four channel
+/// pixels: palettes, low bit depths, 16 bit samples and grey images are
+/// expanded, and a 0xFF alpha filler is requested when the file has no alpha
+/// channel. Gamma is adjusted for a 2.2 screen gamma. The result is tagged
+/// Format_ARGB when the decoded data has an alpha channel.
+/// Returns NULL on any libpng error. The caller owns the returned image.
+Image * nv::ImageIO::loadPNG(Stream & s)
+{
+	nvCheck(!s.isError());
+	
+	// Set up a read buffer and check the library version
+	png_structp png_ptr;
+	png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+	if (png_ptr == NULL) {
+	//	nvDebug( "*** LoadPNG: Error allocating read buffer in file '%s'.\n", name );
+		return NULL;
+	}
+
+	// Allocate/initialize a memory block for the image information
+	png_infop info_ptr = png_create_info_struct(png_ptr);
+	if (info_ptr == NULL) {
+		png_destroy_read_struct(&png_ptr, NULL, NULL);
+	//	nvDebug( "*** LoadPNG: Error allocating image information for '%s'.\n", name );
+		return NULL;
+	}
+
+	// Resources created after setjmp. libpng reports errors with longjmp,
+	// which does not run C++ destructors, so they are held in plain pointers
+	// and released by hand in the error handler. They are volatile because
+	// they change between setjmp and a possible longjmp.
+	Image * volatile img = NULL;
+	png_bytep * volatile row_data = NULL;
+
+	// Set up the error handling
+	if (setjmp(png_jmpbuf(png_ptr))) {
+		delete [] row_data;
+		delete img;
+		png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+	//	nvDebug( "*** LoadPNG: Error reading png file '%s'.\n", name );
+		return NULL;
+	}
+
+	// Set up the I/O functions.
+	png_set_read_fn(png_ptr, (void*)&s, user_read_data);
+
+
+	// Retrieve the image header information
+	png_uint_32 width, height;
+	int bit_depth, color_type, interlace_type;
+	png_read_info(png_ptr, info_ptr);
+	png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type, &interlace_type, NULL, NULL);
+
+
+	if (color_type == PNG_COLOR_TYPE_PALETTE && bit_depth <= 8) {
+		// Convert indexed images to RGB.
+		png_set_expand(png_ptr);
+	}
+	else if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) {
+		// Convert grayscale to RGB.
+		png_set_expand(png_ptr);
+	}
+	else if (png_get_valid(png_ptr, info_ptr, PNG_INFO_tRNS)) {
+		// Expand images with transparency to full alpha channels
+		// so the data will be available as RGBA quartets.
+		png_set_expand(png_ptr);
+	}
+	else if (bit_depth < 8) {
+		// If we have < 8 scale it up to 8.
+		//png_set_expand(png_ptr);
+		png_set_packing(png_ptr);
+	}
+
+	// Reduce bit depth.
+	if (bit_depth == 16) {
+		png_set_strip_16(png_ptr);
+	}
+
+	// Represent gray as RGB
+	if (color_type == PNG_COLOR_TYPE_GRAY || color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
+		png_set_gray_to_rgb(png_ptr);
+	}
+
+	// Convert to RGBA filling alpha with 0xFF.
+	if (!(color_type & PNG_COLOR_MASK_ALPHA)) {
+		png_set_filler(png_ptr, 0xFF, PNG_FILLER_AFTER);
+	}
+
+	// @todo Choose gamma according to the platform?
+	double screen_gamma = 2.2;
+	int intent;
+	if (png_get_sRGB(png_ptr, info_ptr, &intent)) {
+		png_set_gamma(png_ptr, screen_gamma, 0.45455);
+	}
+	else {
+		double image_gamma;
+		if (png_get_gAMA(png_ptr, info_ptr, &image_gamma)) {
+			png_set_gamma(png_ptr, screen_gamma, image_gamma);
+		}
+		else {
+			png_set_gamma(png_ptr, screen_gamma, 0.45455);
+		}
+	}
+
+	// Perform the selected transforms.
+	png_read_update_info(png_ptr, info_ptr);
+
+	png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type, &interlace_type, NULL, NULL);
+
+	img = new Image();
+	img->allocate(width, height);
+
+	// Set internal format flags.
+	if(color_type & PNG_COLOR_MASK_COLOR) {
+		//img->flags |= PI_IF_HAS_COLOR;
+	}
+	if(color_type & PNG_COLOR_MASK_ALPHA) {
+		//img->flags |= PI_IF_HAS_ALPHA;
+		img->setFormat(Image::Format_ARGB);
+	}
+
+	// Read the image. The row table holds one pointer per image row, each
+	// pointing at the start of that row inside the image buffer (4 bytes per
+	// pixel).
+	uint8 * pixels = (uint8 *)img->pixels();
+	row_data = new png_bytep[height];
+	for (uint i = 0; i < height; i++) {
+		row_data[i] = &(pixels[width * 4 * i]);
+	}
+
+	png_read_image(png_ptr, row_data);
+	delete [] row_data;
+	row_data = NULL;	// Keep the error handler from deleting it a second time.
+
+	// Finish things up
+	png_read_end(png_ptr, info_ptr);
+	png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+
+	// RGBA to BGRA.
+	uint num = width * height;
+	for(uint i = 0; i < num; i++)
+	{
+		Color32 c = img->pixel(i);
+		img->pixel(i) = Color32(c.b, c.g, c.r, c.a);
+	}
+	
+	// Compute alpha channel if needed.
+	/*if( img->flags & PI_IU_BUMPMAP || img->flags & PI_IU_ALPHAMAP ) {
+		if( img->flags & PI_IF_HAS_COLOR && !(img->flags & PI_IF_HAS_ALPHA)) {
+			img->ComputeAlphaFromColor();
+		}
+	}*/
+
+	return img;
+}
+
+
+/// Placeholder: floating point PNG loading is not implemented.
+/// The stream is not read and NULL is always returned.
+FloatImage * nv::ImageIO::loadFloatPNG(Stream & s)
+{
+	return NULL;
+}
+
+
+#endif // defined(HAVE_PNG)
+
+#if defined(HAVE_JPEG)
+
+// libjpeg source manager callback; intentionally empty because loadJPG sets up
+// the input buffer before decompression starts.
+static void init_source (j_decompress_ptr /*cinfo*/){
+}
+
+// libjpeg source manager callback, invoked when the decoder has consumed the
+// whole input buffer. The complete file is handed over up front, so getting
+// here means the data ended early: log it and feed the decoder a fake
+// end-of-image marker so that it can finish.
+static boolean fill_input_buffer (j_decompress_ptr cinfo){
+	static JOCTET FakeEOI[] = { 0xFF, JPEG_EOI };
+
+	// Generate warning
+	nvDebug("jpeglib: Premature end of file\n");
+
+	// Insert a fake EOI marker
+	cinfo->src->next_input_byte = FakeEOI;
+	cinfo->src->bytes_in_buffer = 2;
+
+	return TRUE;
+}
+
+// libjpeg source manager callback: skip num_bytes bytes of input.
+// Skipping past the end of the buffer is treated as a premature end of file.
+static void skip_input_data (j_decompress_ptr cinfo, long num_bytes) {
+	// A zero or negative count must be a no-op. Without this guard a negative
+	// value would enlarge bytes_in_buffer and move the read pointer backwards.
+	if(num_bytes <= 0) {
+		return;
+	}
+
+	struct jpeg_source_mgr * src = cinfo->src;
+
+	if(num_bytes >= (long)src->bytes_in_buffer) {
+		fill_input_buffer(cinfo);
+		return;
+	}
+
+	src->bytes_in_buffer -= num_bytes;
+	src->next_input_byte += num_bytes;
+}
+
+// libjpeg source manager callback; nothing to release because the input
+// buffer is owned by loadJPG.
+static void term_source (j_decompress_ptr /*cinfo*/){
+	// no work necessary here
+}
+
+
+/// Load a JPEG image from the stream.
+///
+/// The whole stream is read into memory and decoded with libjpeg through a
+/// memory source manager. Three component output becomes a Format_RGB image;
+/// single component output is replicated into all four channels and tagged
+/// Format_ARGB. Any other output layout is rejected.
+/// Returns NULL for unsupported layouts. The caller owns the returned image.
+/// NOTE(review): errors use the stock jpeg_std_error handler -- confirm that
+/// its behaviour on a fatal decoding error is acceptable for library use.
+Image * nv::ImageIO::loadJPG(Stream & s)
+{
+	nvCheck(!s.isError());
+	
+	// Read the entire file.
+	Array<uint8> byte_array;
+	byte_array.resize(s.size());
+	s.serialize(byte_array.unsecureBuffer(), s.size());
+	
+	jpeg_decompress_struct cinfo;
+	jpeg_error_mgr jerr;
+
+	cinfo.err = jpeg_std_error(&jerr);
+	jpeg_create_decompress(&cinfo);
+
+	// Install a source manager that serves the in-memory copy of the file.
+	cinfo.src = (struct jpeg_source_mgr *) (*cinfo.mem->alloc_small)
+			((j_common_ptr) &cinfo, JPOOL_PERMANENT, sizeof(struct jpeg_source_mgr));
+	cinfo.src->init_source = init_source;
+	cinfo.src->fill_input_buffer = fill_input_buffer;
+	cinfo.src->skip_input_data = skip_input_data;
+	cinfo.src->resync_to_restart = jpeg_resync_to_restart;	// use default method
+	cinfo.src->term_source = term_source;
+	cinfo.src->bytes_in_buffer = byte_array.size();
+	cinfo.src->next_input_byte = byte_array.buffer();
+
+	jpeg_read_header(&cinfo, TRUE);
+	jpeg_start_decompress(&cinfo);
+
+	/*
+	cinfo.do_fancy_upsampling = FALSE;	// fast decompression
+	cinfo.dct_method = JDCT_FLOAT;			// Choose floating point DCT method.
+	*/
+
+	// The layout of the decoded rows is given by output_components, which can
+	// differ from the number of components stored in the file.
+	const uint components = cinfo.output_components;
+	if( components != 1 && components != 3 ) {
+		nvDebug("*** ImageIO::loadJPG: Error, unsupported number of components.\n");
+		jpeg_destroy_decompress (&cinfo);
+		return NULL;
+	}
+
+	uint8 * tmp_buffer = new uint8 [cinfo.output_width * cinfo.output_height * components];
+	uint8 * scanline = tmp_buffer;
+
+	// Decode one row at a time into the temporary buffer.
+	while( cinfo.output_scanline < cinfo.output_height ){
+		int num_scanlines = jpeg_read_scanlines (&cinfo, &scanline, 1);
+		scanline += num_scanlines * cinfo.output_width * components;
+	}
+
+	jpeg_finish_decompress(&cinfo);
+
+	AutoPtr<Image> img(new Image());
+	img->allocate(cinfo.output_width, cinfo.output_height);
+
+	Color32 * dst = img->pixels();
+	const int size = img->height() * img->width();
+	const uint8 * src = tmp_buffer;
+
+	if( components == 3 ) {
+		img->setFormat(Image::Format_RGB);
+		for( int i = 0; i < size; i++ ) {
+			*dst++ = Color32(src[0], src[1], src[2]);
+			src += 3;
+		}
+	}
+	else {
+		// Single component: replicate the value into every channel.
+		img->setFormat(Image::Format_ARGB);
+		for( int i = 0; i < size; i++ ) {
+			*dst++ = Color32(*src, *src, *src, *src);
+			src++;
+		}
+	}
+
+	delete [] tmp_buffer;
+	jpeg_destroy_decompress (&cinfo);
+
+	return img.release();
+}
+
+#endif // defined(HAVE_JPEG)
+
+#if defined(HAVE_TIFF)
+
+/// Placeholder: loading a TIFF from a stream is not implemented.
+/// Only checks that the stream is not in an error state, then returns NULL.
+FloatImage * nv::ImageIO::loadFloatTIFF(Stream & s)
+{
+	nvCheck(!s.isError());
+	return NULL;
+}
+
+/// Load a single channel TIFF file into a one component FloatImage.
+///
+/// Only images with one sample per pixel and 8, 16 or 32 bits per sample are
+/// accepted. Samples are read as unsigned integers and scaled to [0, 1]; for
+/// 32 bit samples the low 8 bits are dropped first.
+/// Returns NULL when the file cannot be opened, has no size information or
+/// uses an unsupported layout. The caller owns the returned image.
+FloatImage * nv::ImageIO::loadFloatTIFF(const char * fileName)
+{
+	TIFF * tif = TIFFOpen(fileName, "r");
+	if (!tif)
+	{
+		nvDebug("Can't open '%s' for reading\n", fileName);
+		return NULL;
+	}
+	
+	::uint16 spp = 1, bpp = 1;
+	::uint32 width = 0, height = 0;
+	
+	// Width and height have no default value, so their absence is an error.
+	const bool hasHeight = TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height) != 0;
+	const bool hasWidth = TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width) != 0;
+	if (!hasHeight || !hasWidth) {
+		nvDebug("Can't load '%s', image size is missing\n", fileName);
+		TIFFClose(tif);
+		return NULL;
+	}
+	
+	// These tags are optional; the defaulted getter supplies the standard
+	// default instead of leaving the variables unset.
+	TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &bpp);
+	TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &spp);
+	
+	if (spp != 1 || (bpp != 8 && bpp != 16 && bpp != 32)) {
+		nvDebug("Can't load '%s', only 1 sample per pixel supported\n", fileName);
+		TIFFClose(tif);
+		return NULL;
+	}
+	
+	FloatImage * fimage = new FloatImage();
+	fimage->allocate(spp, width, height);
+	
+	// Scratch buffer for one decoded row.
+	int linesize = TIFFScanlineSize(tif);
+	tdata_t buf = (::uint8 *)nv::mem::malloc(linesize);
+	
+	for (uint y = 0; y < height; y++) {
+		TIFFReadScanline(tif, buf, y, 0);
+		
+		float * dst = fimage->scanline(y, 0);
+		
+		if (bpp == 8) {
+			for(uint x = 0; x < width; x++) {
+				dst[x] = float(((::uint8 *)buf)[x]) / float(0xFF);
+			}
+		}
+		else if (bpp == 16) {
+			for(uint x = 0; x < width; x++) {
+				dst[x] = float(((::uint16 *)buf)[x]) / float(0xFFFF);
+			}
+		}
+		else /*if (bpp == 32)*/ {
+			// Mantissa has only 24 bits, so drop 8 bits.
+			for(uint x = 0; x < width; x++) {
+				dst[x] = float(((::uint32 *)buf)[x] >> 8) / float(0xFFFFFF);
+			}
+		}
+	}
+
+	nv::mem::free(buf);
+	
+	TIFFClose(tif);
+	
+	return fimage;
+}
+
+#endif
+
+#if 0
+
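+/** Save PNG. Disabled (inside #if 0) and unfinished: the body stops after opening the output file; the comment inside it holds a reference PNG encoder written in another language. */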
+static bool SavePNG(const PiImage * img, const char * name) {
+	nvCheck( img != NULL );
+	nvCheck( img->mem != NULL );
+
+	if( piStrCmp(piExtension(name), ".png" ) != 0 ) {
+		return false;
+	}
+	
+	if( img->flags & PI_IT_CUBEMAP ) {
+		nvDebug("*** Cannot save cubemaps as PNG.");
+		return false;
+	}
+	if( img->flags & PI_IT_DDS ) {
+		nvDebug("*** Cannot save DDS surface as PNG.");
+		return false;
+	}
+
+	nvDebug( "--- Saving '%s'.\n", name );
+	
+	PiAutoPtr<PiStream> ar( PiFileSystem::CreateFileWriter( name ) );
+	if( ar == NULL ) {
+		nvDebug( "*** SavePNG: Error, cannot save file '%s'.\n", name );
+		return false;
+	}
+
+/*
+public class PNGEnc {
+
+    public static function encode(img:BitmapData):ByteArray {
+        // Create output byte array
+        var png:ByteArray = new ByteArray();
+        // Write PNG signature
+        png.writeUnsignedInt(0x89504e47);
+        png.writeUnsignedInt(0x0D0A1A0A);
+        // Build IHDR chunk
+        var IHDR:ByteArray = new ByteArray();
+        IHDR.writeInt(img.width);
+        IHDR.writeInt(img.height);
+        IHDR.writeUnsignedInt(0x08060000); // 32bit RGBA
+        IHDR.writeByte(0);
+        writeChunk(png,0x49484452,IHDR);
+        // Build IDAT chunk
+        var IDAT:ByteArray= new ByteArray();
+        for(var i:int=0;i < img.height;i++) {
+            // no filter
+            IDAT.writeByte(0);
+            var p:uint;
+            if ( !img.transparent ) {
+                for(var j:int=0;j < img.width;j++) {
+                    p = img.getPixel(j,i);
+                    IDAT.writeUnsignedInt(
+                        uint(((p&0xFFFFFF) << 8)|0xFF));
+                }
+            } else {
+                for(var j:int=0;j < img.width;j++) {
+                    p = img.getPixel32(j,i);
+                    IDAT.writeUnsignedInt(
+                        uint(((p&0xFFFFFF) << 8)|
+                        (shr(p,24))));
+                }
+            }
+        }
+        IDAT.compress();
+        writeChunk(png,0x49444154,IDAT);
+        // Build IEND chunk
+        writeChunk(png,0x49454E44,null);
+        // return PNG
+        return png;
+    }
+
+    private static var crcTable:Array;
+    private static var crcTableComputed:Boolean = false;
+
+    private static function writeChunk(png:ByteArray, 
+            type:uint, data:ByteArray) {
+        if (!crcTableComputed) {
+            crcTableComputed = true;
+            crcTable = [];
+            for (var n:uint = 0; n < 256; n++) {
+                var c:uint = n;
+                for (var k:uint = 0; k < 8; k++) {
+                    if (c & 1) {
+                        c = uint(uint(0xedb88320) ^ 
+                            uint(c >>> 1));
+                    } else {
+                        c = uint(c >>> 1);
+                    }
+                }
+                crcTable[n] = c;
+            }
+        }
+        var len:uint = 0;
+        if (data != null) {
+            len = data.length;
+        }
+        png.writeUnsignedInt(len);
+        var p:uint = png.position;
+        png.writeUnsignedInt(type);
+        if ( data != null ) {
+            png.writeBytes(data);
+        }
+        var e:uint = png.position;
+        png.position = p;
+        var c:uint = 0xffffffff;
+        for (var i:int = 0; i < (e-p); i++) {
+            c = uint(crcTable[
+                (c ^ png.readUnsignedByte()) & 
+                uint(0xff)] ^ uint(c >>> 8));
+        }
+        c = uint(c^uint(0xffffffff));
+        png.position = e;
+        png.writeUnsignedInt(c);
+    }
+}
+*/
+}
+
+#endif // 0
+
+#if 0
+
+
+namespace ImageIO {
+
+	/** Init ImageIO plugins. */
+	void InitPlugins() {
+	//	AddInputPlugin( "", LoadANY );
+		AddInputPlugin( "tga", LoadTGA );
+#if HAVE_PNG
+		AddInputPlugin( "png", LoadPNG );
+#endif
+#if HAVE_JPEG
+		AddInputPlugin( "jpg", LoadJPG );
+#endif
+		AddInputPlugin( "dds", LoadDDS );
+		
+		AddOutputPlugin( "tga", SaveTGA );
+	}
+	
+	/** Reset ImageIO plugins. */
+	void ResetPlugins() {
+		s_plugin_load_map.Clear();
+		s_plugin_save_map.Clear();
+	}
+	
+	/** Add an input plugin. */
+	void AddInputPlugin( const char * ext, ImageInput_Plugin plugin ) {
+		s_plugin_load_map.Add(ext, plugin);
+	}
+	
+	/** Add an output plugin. */
+	void AddOutputPlugin( const char * ext, ImageOutput_Plugin plugin ) {
+		s_plugin_save_map.Add(ext, plugin);
+	}
+
+	
+	bool Load(PiImage * img, const char * name, PiStream & stream) {
+			
+		// Get name extension.
+		const char * extension = piExtension(name);
+		
+		// Skip the dot.
+		if( *extension == '.' ) {
+			extension++;
+		}
+		
+		// Lookup plugin in the map.
+		ImageInput_Plugin plugin = NULL;
+		if( s_plugin_load_map.Get(extension, &plugin) ) {
+			return plugin(img, stream);
+		}
+		
+		/*foreach(i, s_plugin_load_map) {
+			nvDebug("%s %s %d\n", s_plugin_load_map[i].key.GetStr(), extension, 0 == strcmp(extension, s_plugin_load_map[i].key));
+		}
+		
+		nvDebug("No plugin found for '%s' %d.\n", extension, s_plugin_load_map.Size());*/
+		
+		return false;
+	}
+
+	bool Save(const PiImage * img, const char * name, PiStream & stream) {
+				
+		// Get name extension.
+		const char * extension = piExtension(name);
+		
+		// Skip the dot.
+		if( *extension == '.' ) {
+			extension++;
+		}
+		
+		// Lookup plugin in the map.
+		ImageOutput_Plugin plugin = NULL;
+		if( s_plugin_save_map.Get(extension, &plugin) ) {
+			return plugin(img, stream);
+		}
+		
+		return false;
+	}
+	
+} // ImageIO
+
+#endif // 0
+
--- a/src/nvimage/ImageIO.h
+++ b/src/nvimage/ImageIO.h
@ -0,0 +1,43 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_IMAGEIO_H
+#define NV_IMAGE_IMAGEIO_H
+
+#include <nvimage/nvimage.h>
+
+namespace nv
+{
+	class Image;
+	class FloatImage;
+	class Stream;
+
+	/// Image file loading and saving.
+	/// The load functions return a newly allocated image owned by the caller,
+	/// or NULL on failure.
+	namespace ImageIO
+	{
+		/// Load an image from a file, or from an open stream; the decoder is
+		/// selected from the file extension of `name`.
+		NVIMAGE_API Image * load(const char * name);
+		NVIMAGE_API Image * load(const char * name, Stream & s);
+		
+		/// Read / write TGA images.
+		NVIMAGE_API Image * loadTGA(Stream & s);
+		NVIMAGE_API bool saveTGA(Stream & s, const Image * img);
+
+#if defined(HAVE_PNG)
+		/// PNG support, only available when built with HAVE_PNG.
+		/// loadFloatPNG is a placeholder that always returns NULL.
+		NVIMAGE_API Image * loadPNG(Stream & s);
+		NVIMAGE_API FloatImage * loadFloatPNG(Stream & s);
+#endif
+
+#if defined(HAVE_JPEG)
+		/// JPEG support, only available when built with HAVE_JPEG.
+		NVIMAGE_API Image * loadJPG(Stream & s);
+#endif
+		
+#if defined(HAVE_TIFF)
+		// Hack!
+		/// Load a single channel TIFF directly from a file name.
+		NVIMAGE_API FloatImage * loadFloatTIFF(const char * fileName);
+
+		/// Stream variant; currently a placeholder that always returns NULL.
+		NVIMAGE_API FloatImage * loadFloatTIFF(Stream & s);
+#endif
+
+	} // ImageIO namespace
+	
+} // nv namespace
+
+
+#endif // NV_IMAGE_IMAGEIO_H
--- a/src/nvimage/NormalMap.cpp
+++ b/src/nvimage/NormalMap.cpp
@ -0,0 +1,138 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Ptr.h>
+
+#include <nvmath/Color.h>
+
+#include <nvimage/NormalMap.h>
+#include <nvimage/Filter.h>
+#include <nvimage/FloatImage.h>
+#include <nvimage/Image.h>
+
+using namespace nv;
+
+// Build a normal map from `img` with explicit derivative kernels.
+//
+// The height of every pixel is the dot product of its color with
+// heightWeights and is stored in channel 3 of the result. kdu and kdv are then
+// applied to that channel to obtain the two slopes, and the normalized vector
+// (du, dv, heightScale) is written to channels 0..2 remapped from [-1, 1] to
+// [0, 1]. The kernels are only read; the caller keeps ownership of them and
+// receives ownership of the returned image.
+static FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, const Kernel2 * kdu, const Kernel2 * kdv)
+{
+	nvCheck(kdu != NULL);
+	nvCheck(kdv != NULL);
+	nvCheck(img != NULL);
+	
+	const uint width = img->width();
+	const uint height = img->height();
+	const uint pixelCount = width * height;
+	
+	AutoPtr<FloatImage> result(new FloatImage());
+	result->allocate(4, width, height);
+	
+	// Height field goes into the alpha channel.
+	float * heights = result->channel(3);
+	for(uint p = 0; p < pixelCount; p++)
+	{
+		Vector4 rgba = toVector4(img->pixel(p));
+		heights[p] = dot(rgba, heightWeights);
+	}
+	
+	const float heightScale = 1.0f / 16.0f;	// @@ Use a user defined factor.
+	
+	for(uint row = 0; row < height; row++)
+	{
+		for(uint col = 0; col < width; col++)
+		{
+			const float slopeU = result->applyKernel(kdu, col, row, 3, wm);
+			const float slopeV = result->applyKernel(kdv, col, row, 3, wm);
+			
+			Vector3 normal = normalize(Vector3(slopeU, slopeV, heightScale));
+			
+			// Remap each component from [-1, 1] to [0, 1].
+			result->setPixel(0.5f * normal.x() + 0.5f, col, row, 0);
+			result->setPixel(0.5f * normal.y() + 0.5f, col, row, 1);
+			result->setPixel(0.5f * normal.z() + 0.5f, col, row, 2);
+		}
+	}
+	
+	return result.release();
+}
+
+
+/// Create normal map using the given filter.
+///
+/// Builds a Sobel kernel of the size selected by `filter` (3, 5, 7 or 9),
+/// normalizes it, derives the second kernel by transposing a copy, and
+/// computes the normal map of `img` with both.
+/// Returns NULL for an unknown filter value. The caller owns the returned image.
+FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter /*= Sobel3x3*/)
+{
+	nvCheck(img != NULL);
+	
+	// Init the kernels.
+	Kernel2 * kdu = NULL;
+	Kernel2 * kdv = NULL;
+
+	switch(filter)
+	{
+		case NormalMapFilter_Sobel3x3:
+			kdu = new Kernel2(3);
+			break;
+		case NormalMapFilter_Sobel5x5:
+			kdu = new Kernel2(5);
+			break;
+		case NormalMapFilter_Sobel7x7:
+			kdu = new Kernel2(7);
+			break;
+		case NormalMapFilter_Sobel9x9:
+			kdu = new Kernel2(9);
+			break;
+		default:
+			// Unknown filter: no kernel was created, so bail out instead of
+			// dereferencing a null pointer when debug checks are compiled out.
+			nvDebugCheck(false);
+			return NULL;
+	};
+
+	kdu->initSobel();
+	kdu->normalize();
+
+	kdv = new Kernel2(*kdu);
+	kdv->transpose();
+
+	FloatImage * result = ::createNormalMap(img, wm, heightWeights, kdu, kdv);
+	
+	// The kernels are only needed while the map is computed.
+	delete kdv;
+	delete kdu;
+	
+	return result;
+}
+
+
+/// Create normal map combining multiple sobel filters.
+///
+/// Builds a 9x9 kernel from the blended Sobel filters weighted by
+/// `filterWeights`, normalizes it, derives the second kernel by transposing a
+/// copy, and computes the normal map of `img` with both.
+/// The caller owns the returned image.
+FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights)
+{
+	nvCheck(img != NULL);
+
+	Kernel2 * kdu = NULL;
+	Kernel2 * kdv = NULL;
+
+	kdu = new Kernel2(9);
+	kdu->initBlendedSobel(filterWeights);
+	kdu->normalize();
+	
+	kdv = new Kernel2(*kdu);
+	kdv->transpose();
+	
+	FloatImage * result = ::createNormalMap(img, wm, heightWeights, kdu, kdv);
+	
+	// The kernels are only needed while the map is computed.
+	delete kdv;
+	delete kdu;
+	
+	return result;
+}
+
+/// Normalize the given image in place.
+/// Forwards to FloatImage::normalize with argument 0.
+/// NOTE(review): presumably 0 is the first channel of the normal vector --
+/// confirm against FloatImage::normalize.
+void nv::normalize(FloatImage * img)
+{
+	nvCheck(img != NULL);
+	img->normalize(0);
+}
+
--- a/src/nvimage/NormalMap.h
+++ b/src/nvimage/NormalMap.h
@ -0,0 +1,55 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_IMAGE_NORMALMAP_H
+#define NV_IMAGE_NORMALMAP_H
+
+#include <nvmath/Vector.h>
+#include <nvimage/nvimage.h>
+#include <nvimage/FloatImage.h>
+
+
+namespace nv
+{
+	class Image;
+
+	/// Size of the Sobel kernel used to estimate the slopes of the height field.
+	enum NormalMapFilter
+	{
+		NormalMapFilter_Sobel3x3,	// fine detail
+		NormalMapFilter_Sobel5x5,	// medium detail
+		NormalMapFilter_Sobel7x7,	// large detail
+		NormalMapFilter_Sobel9x9,	// very large
+	};
+
+	// The functions below carry NVIMAGE_API like the other public nvimage
+	// entry points, so that they are visible to users of the library.
+
+	/// Create a normal map from `img` with a single Sobel filter. The height of
+	/// a pixel is the dot product of its color with heightWeights.
+	NVIMAGE_API FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter = NormalMapFilter_Sobel3x3);
+
+	/// Create a normal map from `img` with a blend of Sobel filters weighted by
+	/// filterWeights.
+	NVIMAGE_API FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights);
+
+	/// Normalize the given image in place.
+	NVIMAGE_API void normalize(FloatImage * img);
+
+	// @@ Add generation of DU/DV maps.
+
+
+} // nv namespace
+
+#endif // NV_IMAGE_NORMALMAP_H
--- a/src/nvimage/Quantize.cpp
+++ b/src/nvimage/Quantize.cpp
@ -0,0 +1,234 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+/*
+http://www.visgraf.impa.br/Courses/ip00/proj/Dithering1/floyd_steinberg_dithering.html
+http://www.gamedev.net/reference/articles/article341.asp
+
+@@ Look at LPS: http://www.cs.rit.edu/~pga/pics2000/i.html
+ 
+This is a really nice guide to dithering algorithms:
+http://www.efg2.com/Lab/Library/ImageProcessing/DHALF.TXT
+
+@@ This code needs to be reviewed, I'm not sure it's correct.
+*/
+
+#include <nvmath/Color.h>
+
+#include <nvimage/Image.h>
+#include <nvimage/Quantize.h>
+
+using namespace nv;
+
+
+// Threshold the alpha channel: texels whose alpha is above alpha_threshold
+// become fully opaque (255), all others fully transparent (0).
+void nv::Quantize::BinaryAlpha( Image * image, int alpha_threshold /*= 127*/ )
+{
+	nvCheck(image != NULL);
+
+	const uint width = image->width();
+	const uint height = image->height();
+
+	for (uint row = 0; row < height; row++)
+	{
+		for (uint col = 0; col < width; col++)
+		{
+			// Read a copy, replace its alpha and write the texel back.
+			Color32 texel = image->pixel(col, row);
+			texel.a = (texel.a > alpha_threshold) ? 255 : 0;
+			image->pixel(col, row) = texel;
+		}
+	}
+}
+
+
+// Quantize every texel to 16 bits by converting it to Color16 and back.
+void nv::Quantize::RGB16( Image * image )
+{
+	nvCheck(image != NULL);
+
+	const uint width = image->width();
+	const uint height = image->height();
+
+	for (uint row = 0; row < height; row++)
+	{
+		for (uint col = 0; col < width; col++)
+		{
+			const Color32 original = image->pixel(col, row);
+
+			// Round trip through the 16 bit format; toColor32 expands the bits again.
+			const Color32 quantized = toColor32( toColor16(original) );
+
+			image->pixel(col, row) = quantized;
+		}
+	}
+}
+
+// Quantize the alpha channel to 4 bits, stored back as 8 bits.
+void nv::Quantize::Alpha4( Image * image )
+{
+	nvCheck(image != NULL);
+
+	const uint width = image->width();
+	const uint height = image->height();
+
+	for (uint row = 0; row < height; row++)
+	{
+		for (uint col = 0; col < width; col++)
+		{
+			Color32 texel = image->pixel(col, row);
+
+			// Keep the high nibble and replicate it into the low nibble (0xN? -> 0xNN).
+			const int highNibble = texel.a & 0xF0;
+			texel.a = highNibble | (highNibble >> 4);
+
+			image->pixel(col, row) = texel;
+		}
+	}
+}
+
+
+// Error diffusion. Floyd Steinberg.
+void nv::Quantize::FloydSteinberg_RGB16( Image * image )
+{
+	nvCheck(image != NULL);
+	
+	const uint w = image->width();
+	const uint h = image->height();
+	
+	// @@ Use fixed point?
+	Vector3 * row0 = new Vector3[w+2];
+	Vector3 * row1 = new Vector3[w+2];
+	memset(row0, 0, sizeof(Vector3)*(w+2));
+	memset(row1, 0, sizeof(Vector3)*(w+2));
+	
+	for(uint y = 0; y < h; y++) {
+		for(uint x = 0; x < w; x++) {
+			
+			Color32 pixel32 = image->pixel(x, y);
+			
+			// Add error.	// @@ We shouldn't clamp here!
+			pixel32.r = clamp(int(pixel32.r) + int(row0[1+x].x()), 0, 255);
+			pixel32.g = clamp(int(pixel32.g) + int(row0[1+x].y()), 0, 255);
+			pixel32.b = clamp(int(pixel32.b) + int(row0[1+x].z()), 0, 255);
+			
+			// Convert to 16 bit. @@ Use regular clamp?
+			Color32 pixel16 = toColor32( toColor16(pixel32) );
+			
+			// Store color.
+			image->pixel(x, y) = pixel16;
+			
+			// Compute new error.
+			Vector3 diff(float(pixel32.r - pixel16.r), float(pixel32.g - pixel16.g), float(pixel32.b - pixel16.b));
+			
+			// Propagate new error.
+			row0[1+x+1] += 7.0f / 16.0f * diff;
+			row1[1+x-1] += 3.0f / 16.0f * diff;
+			row1[1+x+0] += 5.0f / 16.0f * diff;
+			row1[1+x+1] += 1.0f / 16.0f * diff;
+		}
+		
+		swap(row0, row1);
+		memset(row1, 0, sizeof(Vector3)*(w+2));
+	}
+	
+	delete [] row0;
+	delete [] row1;
+}
+
+
+// Error diffusion. Floyd Steinberg.
+void nv::Quantize::FloydSteinberg_BinaryAlpha( Image * image, int alpha_threshold /*= 127*/ ) 
+{
+	nvCheck(image != NULL);
+	
+	const uint w = image->width();
+	const uint h = image->height();
+	
+	// @@ Use fixed point?
+	float * row0 = new float[(w+2)];
+	float * row1 = new float[(w+2)];
+	memset(row0, 0, sizeof(float)*(w+2));
+	memset(row1, 0, sizeof(float)*(w+2));
+	
+	for(uint y = 0; y < h; y++) {
+		for(uint x = 0; x < w; x++) {
+			
+			Color32 pixel = image->pixel(x, y);
+			
+			// Add error.
+			int alpha = int(pixel.a) + int(row0[1+x]);
+			
+			// Convert color.
+			if( alpha > alpha_threshold ) pixel.a = 255;
+			else pixel.a = 0;
+			
+			// Store color.
+			image->pixel(x, y) = pixel;
+			
+			// Compute new error.
+			float diff = float(alpha - pixel.a);
+			
+			// Propagate new error.
+			row0[1+x+1] += 7.0f / 16.0f * diff;
+			row1[1+x-1] += 3.0f / 16.0f * diff;
+			row1[1+x+0] += 5.0f / 16.0f * diff;
+			row1[1+x+1] += 1.0f / 16.0f * diff;
+		}
+		
+		swap(row0, row1);
+		memset(row1, 0, sizeof(float)*(w+2));
+	}
+	
+	delete [] row0;
+	delete [] row1;
+}
+
+
+// Quantize the alpha channel to 4 bits (stored back as 8 bits), diffusing the
+// quantization error to neighbouring texels with the Floyd Steinberg weights
+// (7, 3, 5, 1) / 16.
+void nv::Quantize::FloydSteinberg_Alpha4( Image * image )
+{
+	nvCheck(image != NULL);
+	
+	const uint w = image->width();
+	const uint h = image->height();
+	
+	// @@ Use fixed point?
+	// row0 holds the error for the current scanline, row1 for the next one.
+	// Both have one padding entry on each side so neighbours never go out of range.
+	float * row0 = new float[(w+2)];
+	float * row1 = new float[(w+2)];
+	memset(row0, 0, sizeof(float)*(w+2));
+	memset(row1, 0, sizeof(float)*(w+2));
+	
+	for(uint y = 0; y < h; y++) {
+		for(uint x = 0; x < w; x++) {
+			
+			Color32 pixel = image->pixel(x, y);
+			
+			// Add error.
+			const int alpha = int(pixel.a) + int(row0[1+x]);
+			
+			// The error-adjusted value can leave the 8 bit range; bring it back
+			// before extracting the high nibble.
+			const int alpha8 = clamp(alpha, 0, 255);
+			
+			// Convert to 4 bit using regular bit expansion. The error-adjusted
+			// alpha must be quantized here, not the original pixel alpha,
+			// otherwise the diffused error never influences the output.
+			pixel.a = (alpha8 & 0xF0) | ((alpha8 & 0xF0) >> 4);
+			
+			// Store color.
+			image->pixel(x, y) = pixel;
+			
+			// Compute new error (relative to the unclamped target value).
+			float diff = float(alpha - pixel.a);
+			
+			// Propagate new error.
+			row0[1+x+1] += 7.0f / 16.0f * diff;
+			row1[1+x-1] += 3.0f / 16.0f * diff;
+			row1[1+x+0] += 5.0f / 16.0f * diff;
+			row1[1+x+1] += 1.0f / 16.0f * diff;
+		}
+		
+		swap(row0, row1);
+		memset(row1, 0, sizeof(float)*(w+2));
+	}
+	
+	delete [] row0;
+	delete [] row1;
+}
+
--- a/src/nvimage/Quantize.h
+++ b/src/nvimage/Quantize.h
@ -0,0 +1,25 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_QUANTIZE_H
+#define NV_IMAGE_QUANTIZE_H
+
+namespace nv
+{
+	class Image;	// forward declaration; the functions below only take pointers to Image
+
+	namespace Quantize	// in-place image quantization routines
+	{
+		void RGB16(Image * img);	// round trips every texel through the 16 bit color format
+		void BinaryAlpha(Image * img, int alpha_threshold = 127);	// alpha > threshold becomes 255, otherwise 0
+		void Alpha4(Image * img);	// keeps the high nibble of alpha and replicates it into the low nibble
+		
+		void FloydSteinberg_RGB16(Image * img);	// RGB16 with Floyd Steinberg error diffusion
+		void FloydSteinberg_BinaryAlpha(Image * img, int alpha_threshold = 127);	// BinaryAlpha with Floyd Steinberg error diffusion
+		void FloydSteinberg_Alpha4(Image * img);	// Alpha4 with Floyd Steinberg error diffusion
+
+		// @@ Add palette quantization algorithms!
+	}
+}
+
+
+#endif // NV_IMAGE_QUANTIZE_H
--- a/src/nvimage/TgaFile.h
+++ b/src/nvimage/TgaFile.h
@ -0,0 +1,103 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_TGAFILE_H
+#define NV_IMAGE_TGAFILE_H
+
+#include <nvcore/Stream.h>
+
+namespace nv
+{
+	
+// TGA types. NOTE(review): presumably the values stored in TgaHeader::image_type -- confirm against the loader.
+enum TGAType {
+    TGA_TYPE_INDEXED		= 1,	// color mapped
+    TGA_TYPE_RGB			= 2,	// true color
+    TGA_TYPE_GREY			= 3,	// grey scale
+    TGA_TYPE_RLE_INDEXED	= 9,	// the three RLE values are the plain ones plus 8
+    TGA_TYPE_RLE_RGB		= 10,
+    TGA_TYPE_RLE_GREY		= 11
+};
+
+#define TGA_INTERLEAVE_MASK	0xc0	// selects bits 6-7; NOTE(review): presumably applied to TgaHeader::flags -- confirm
+#define TGA_INTERLEAVE_NONE	0x00
+#define TGA_INTERLEAVE_2WAY	0x40
+#define TGA_INTERLEAVE_4WAY	0x80
+
+#define TGA_ORIGIN_MASK		0x30	// selects bits 4-5; NOTE(review): presumably applied to TgaHeader::flags -- confirm
+#define TGA_ORIGIN_LEFT		0x00	// bit 4 clear
+#define TGA_ORIGIN_RIGHT	0x10	// bit 4 set
+#define TGA_ORIGIN_LOWER	0x00	// bit 5 clear
+#define TGA_ORIGIN_UPPER	0x20	// bit 5 set
+
+
+/// Tga Header. Fields are listed in the order in which operator<< serializes them.
+struct TgaHeader {
+	uint8	id_length;
+	uint8	colormap_type;
+	uint8	image_type;	// NOTE(review): presumably one of the TGAType values -- confirm
+	uint16	colormap_index;
+	uint16	colormap_length;
+	uint8	colormap_size;
+	uint16	x_origin;
+	uint16	y_origin;
+	uint16	width;
+	uint16	height;
+	uint8	pixel_size;	// bits per pixel; TgaFile::size() divides it by 8 to get bytes
+	uint8	flags;
+
+	enum { Size = 18 };		// sum of the field widths above in bytes; sizeof(TgaHeader) may be larger because of padding
+};
+
+
+/// Tga File: a header plus a heap buffer holding the raw pixel bytes.
+/// The buffer is owned by this object and released in the destructor.
+/// NOTE(review): copying a TgaFile copies the raw pointer, so both copies
+/// would delete the same buffer; avoid copying instances.
+struct TgaFile {
+
+	TgaFile() {
+		mem = NULL;
+	}
+	~TgaFile() {
+		free();
+	}
+
+	/// Number of bytes of pixel data described by the header.
+	uint size() const {
+		// Convert to uint before multiplying: the uint16 fields would otherwise
+		// be promoted to int and large dimensions would overflow a signed int.
+		return uint(head.width) * uint(head.height) * (uint(head.pixel_size) / 8);
+	}
+	/// Allocate the pixel buffer for the current header. Must not already be allocated.
+	void allocate() {
+		nvCheck( mem == NULL );
+		mem = new uint8[size()];
+	}
+	/// Release the pixel buffer; safe to call when nothing is allocated.
+	void free() {
+		delete [] mem;
+		mem = NULL;
+	}
+
+	TgaHeader head;
+	uint8 * mem;
+};
+
+
+/// Serialize a TGA header field by field, in declaration order.
+inline Stream & operator<< (Stream & s, TgaHeader & head)
+{
+	s << head.id_length;
+	s << head.colormap_type;
+	s << head.image_type;
+
+	s << head.colormap_index;
+	s << head.colormap_length;
+	s << head.colormap_size;
+
+	s << head.x_origin;
+	s << head.y_origin;
+	s << head.width;
+	s << head.height;
+
+	s << head.pixel_size;
+	s << head.flags;
+
+	return s;
+}
+
+/// Serialize a TGA file: the header first, then size() bytes of pixel data.
+inline Stream & operator<< (Stream & s, TgaFile & tga)
+{
+	s << tga.head;
+
+	// When reading, the buffer can only be sized once the header is known.
+	const bool reading = s.isLoading();
+	if (reading)
+	{
+		tga.allocate();
+	}
+
+	s.serialize( tga.mem, tga.size() );
+	return s;
+}
+
+} // nv namespace
+
+#endif // NV_IMAGE_TGAFILE_H
--- a/src/nvimage/nvimage.h
+++ b/src/nvimage/nvimage.h
@ -0,0 +1,22 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_H
+#define NV_IMAGE_H
+
+#include <nvcore/nvcore.h>
+
+// Function linkage: NVIMAGE_API and NVIMAGE_CLASS expand to the export or import decorations when NVIMAGE_SHARED is set, and to nothing otherwise.
+#if NVIMAGE_SHARED
+#ifdef NVIMAGE_EXPORTS	// defined while building the library itself
+#define NVIMAGE_API DLL_EXPORT
+#define NVIMAGE_CLASS DLL_EXPORT_CLASS
+#else	// a client of the shared library
+#define NVIMAGE_API DLL_IMPORT
+#define NVIMAGE_CLASS DLL_IMPORT
+#endif
+#else	// static build: no decoration
+#define NVIMAGE_API
+#define NVIMAGE_CLASS
+#endif
+
+#endif // NV_IMAGE_H
--- a/src/nvimage/nvtt/BlockDXT.cpp
+++ b/src/nvimage/nvtt/BlockDXT.cpp
@ -0,0 +1,553 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvimage/ColorBlock.h>
+#include "BlockDXT.h"
+
+using namespace nv;
+
+
+/*----------------------------------------------------------------------------
+	BlockDXT1
+----------------------------------------------------------------------------*/
+
+/// Compute the four palette entries of the block.
+/// Returns 4 when col0 > col1 (four opaque colors) and 3 otherwise (three
+/// opaque colors plus a transparent black entry).
+uint BlockDXT1::evaluatePalette(Color32 color_array[4]) const
+{
+	Color32 & c0 = color_array[0];
+	Color32 & c1 = color_array[1];
+	Color32 & c2 = color_array[2];
+	Color32 & c3 = color_array[3];
+
+	// Expand the endpoints to 8 bits per channel before interpolating, by
+	// replicating the top bits of each channel into the low bits.
+	// @@ A single masked shift on the packed value might be faster.
+	c0.r = (col0.r << 3) | (col0.r >> 2);
+	c0.g = (col0.g << 2) | (col0.g >> 4);
+	c0.b = (col0.b << 3) | (col0.b >> 2);
+	c0.a = 0xFF;
+
+	c1.r = (col1.r << 3) | (col1.r >> 2);
+	c1.g = (col1.g << 2) | (col1.g >> 4);
+	c1.b = (col1.b << 3) | (col1.b >> 2);
+	c1.a = 0xFF;
+
+	if( !(col0.u > col1.u) ) {
+		// Three-color block: one midpoint color.
+		c2.r = (c0.r + c1.r) / 2;
+		c2.g = (c0.g + c1.g) / 2;
+		c2.b = (c0.b + c1.b) / 2;
+		c2.a = 0xFF;
+
+		// The last entry is all zeros to match DXT specs.
+		c3.r = 0x00;
+		c3.g = 0x00;
+		c3.b = 0x00;
+		c3.a = 0x00;
+
+		return 3;
+	}
+
+	// Four-color block: two colors at 1/3 and 2/3 between the endpoints.
+	c2.r = (2 * c0.r + c1.r) / 3;
+	c2.g = (2 * c0.g + c1.g) / 3;
+	c2.b = (2 * c0.b + c1.b) / 3;
+	c2.a = 0xFF;
+
+	c3.r = (2 * c1.r + c0.r) / 3;
+	c3.g = (2 * c1.g + c0.g) / 3;
+	c3.b = (2 * c1.b + c0.b) / 3;
+	c3.a = 0xFF;
+
+	return 4;
+}
+
+// Evaluate the palette as a three-color block, whatever the endpoint order is.
+void BlockDXT1::evaluatePalette3(Color32 color_array[4]) const
+{
+	Color32 & c0 = color_array[0];
+	Color32 & c1 = color_array[1];
+	Color32 & mid = color_array[2];
+	Color32 & transparent = color_array[3];
+
+	// Expand the endpoints to 8 bits per channel.
+	c0.r = (col0.r << 3) | (col0.r >> 2);
+	c0.g = (col0.g << 2) | (col0.g >> 4);
+	c0.b = (col0.b << 3) | (col0.b >> 2);
+	c0.a = 0xFF;
+
+	c1.r = (col1.r << 3) | (col1.r >> 2);
+	c1.g = (col1.g << 2) | (col1.g >> 4);
+	c1.b = (col1.b << 3) | (col1.b >> 2);
+	c1.a = 0xFF;
+
+	// The third entry is the midpoint of the endpoints.
+	mid.r = (c0.r + c1.r) / 2;
+	mid.g = (c0.g + c1.g) / 2;
+	mid.b = (c0.b + c1.b) / 2;
+	mid.a = 0xFF;
+
+	// The fourth entry is all zeros to match DXT specs.
+	transparent.r = 0x00;
+	transparent.g = 0x00;
+	transparent.b = 0x00;
+	transparent.a = 0x00;
+}
+
+// Evaluate the palette as a four-color block, whatever the endpoint order is.
+void BlockDXT1::evaluatePalette4(Color32 color_array[4]) const
+{
+	Color32 & c0 = color_array[0];
+	Color32 & c1 = color_array[1];
+	Color32 & nearC0 = color_array[2];
+	Color32 & nearC1 = color_array[3];
+
+	// Expand the endpoints to 8 bits per channel.
+	c0.r = (col0.r << 3) | (col0.r >> 2);
+	c0.g = (col0.g << 2) | (col0.g >> 4);
+	c0.b = (col0.b << 3) | (col0.b >> 2);
+	c0.a = 0xFF;
+
+	c1.r = (col1.r << 3) | (col1.r >> 2);
+	c1.g = (col1.g << 2) | (col1.g >> 4);
+	c1.b = (col1.b << 3) | (col1.b >> 2);
+	c1.a = 0xFF;
+
+	// Two interpolated colors, each weighting one endpoint twice.
+	nearC0.r = (2 * c0.r + c1.r) / 3;
+	nearC0.g = (2 * c0.g + c1.g) / 3;
+	nearC0.b = (2 * c0.b + c1.b) / 3;
+	nearC0.a = 0xFF;
+
+	nearC1.r = (2 * c1.r + c0.r) / 3;
+	nearC1.g = (2 * c1.g + c0.g) / 3;
+	nearC1.b = (2 * c1.b + c0.b) / 3;
+	nearC1.a = 0xFF;
+}
+
+
+/* Jason Dorie's code.
+// ----------------------------------------------------------------------------
+// Build palette for a 3 color + transparent black block
+// ----------------------------------------------------------------------------
+void DXTCGen::BuildCodes3(cbVector *pVects, cbVector &v1, cbVector &v2)
+{
+	//pVects[0] = v1;
+	//pVects[2] = v2;
+	//pVects[1][0] = v1[0];
+	//pVects[1][1] = (BYTE)( ((long)v1[1] + (long)v2[1]) / 2 );
+	//pVects[1][2] = (BYTE)( ((long)v1[2] + (long)v2[2]) / 2 );
+	//pVects[1][3] = (BYTE)( ((long)v1[3] + (long)v2[3]) / 2 );
+
+	__asm {
+		mov			ecx, dword ptr pVects
+		mov			eax, dword ptr v1
+		mov			ebx, dword ptr v2
+
+		movd		mm0, [eax]
+		movd		mm1, [ebx]
+		pxor		mm2, mm2
+		nop
+
+		movd		[ecx], mm0
+		movd		[ecx+8], mm1
+
+		punpcklbw	mm0, mm2
+		punpcklbw	mm1, mm2
+
+		paddw		mm0, mm1
+		psrlw		mm0, 1
+
+		packuswb	mm0, mm0
+		movd		[ecx+4], mm0
+	}
+	// *(long *)&pVects[1] = r1;
+}
+
+__int64 ScaleOneThird = 0x5500550055005500;
+
+// ----------------------------------------------------------------------------
+// Build palette for a 4 color block
+// ----------------------------------------------------------------------------
+void DXTCGen::BuildCodes4(cbVector *pVects, cbVector &v1, cbVector &v2)
+{
+// 	pVects[0] = v1;
+// 	pVects[3] = v2;
+// 
+// 	pVects[1][0] = v1[0];
+// 	pVects[1][1] = (BYTE)( ((long)v1[1] * 2 + (long)v2[1]) / 3 );
+// 	pVects[1][2] = (BYTE)( ((long)v1[2] * 2 + (long)v2[2]) / 3 );
+// 	pVects[1][3] = (BYTE)( ((long)v1[3] * 2 + (long)v2[3]) / 3 );
+// 
+// 	pVects[2][0] = v1[0];
+// 	pVects[2][1] = (BYTE)( ((long)v2[1] * 2 + (long)v1[1]) / 3 );
+// 	pVects[2][2] = (BYTE)( ((long)v2[2] * 2 + (long)v1[2]) / 3 );
+// 	pVects[2][3] = (BYTE)( ((long)v2[3] * 2 + (long)v1[3]) / 3 );
+
+	__asm {
+		mov			ecx, dword ptr pVects
+		mov			eax, dword ptr v1
+		mov			ebx, dword ptr v2
+
+		movd		mm0, [eax]
+		movd		mm1, [ebx]
+
+		pxor		mm2, mm2
+		movd		[ecx], mm0
+		movd		[ecx+12], mm1
+
+		punpcklbw	mm0, mm2
+		punpcklbw	mm1, mm2
+		movq		mm3, mm0		// mm3 = v0
+
+		paddw		mm0, mm1		// mm0 = v0 + v1
+		paddw		mm3, mm3		// mm3 = v0*2
+
+		paddw		mm0, mm1		// mm0 = v0 + v1*2
+		paddw		mm1, mm3		// mm1 = v0*2 + v1
+
+		pmulhw		mm0, ScaleOneThird
+		pmulhw		mm1, ScaleOneThird
+		packuswb	mm1, mm0
+
+		movq		[ecx+4], mm1
+	}
+}
+*/
+
+/// Decode the block into 4x4 colors: every texel selects one palette entry
+/// through a 2 bit index, four indices per row byte.
+void BlockDXT1::decodeBlock(ColorBlock * block) const
+{
+	nvDebugCheck(block != NULL);
+
+	Color32 palette[4];
+	evaluatePalette(palette);
+
+	for (uint y = 0; y < 4; y++)
+	{
+		// Consume the row byte two bits at a time, lowest bits first.
+		uint rowBits = row[y];
+		for (uint x = 0; x < 4; x++)
+		{
+			block->color(x, y) = palette[rowBits & 3];
+			rowBits >>= 2;
+		}
+	}
+}
+
+/// Pack sixteen palette indices into the block, 2 bits each, with idx[0] in
+/// the lowest bits. Only the two low bits of every entry are used.
+/// @param idx array of 16 indices; must not be NULL.
+void BlockDXT1::setIndices(int * idx)
+{
+	nvDebugCheck(idx != NULL);
+
+	uint packed = 0;
+	for(uint i = 0; i < 16; i++) {
+		// Shift as unsigned: for i == 15 the value reaches bit 31, which is
+		// not representable when shifting a signed int.
+		packed |= (uint(idx[i]) & 3u) << (2 * i);
+	}
+	indices = packed;
+}
+
+
+/// Flip DXT1 block vertically.
+// Not marked inline: the definition exists only in this source file, and an
+// inline function must be defined in every translation unit that calls it.
+void BlockDXT1::flip4()
+{
+	swap(row[0], row[3]);
+	swap(row[1], row[2]);
+}
+
+/// Flip half DXT1 block vertically (only the first two rows are swapped).
+// Not marked inline: the definition exists only in this source file, and an
+// inline function must be defined in every translation unit that calls it.
+void BlockDXT1::flip2()
+{
+	swap(row[0], row[1]);
+}
+
+
+/*----------------------------------------------------------------------------
+	BlockDXT3
+----------------------------------------------------------------------------*/
+
+// Replicate a 4 bit value into both nibbles of a byte (0xN -> 0xNN).
+static inline uint8 expandAlpha4(uint a)
+{
+	return uint8((a << 4) | a);
+}
+
+/// Decode the block: colors come from the DXT1 color block, alpha from the
+/// sixteen explicit 4 bit values.
+void BlockDXT3::decodeBlock(ColorBlock * block) const
+{
+	nvDebugCheck(block != NULL);
+
+	// Decode color.
+	color.decodeBlock(block);
+
+	// Decode alpha, one explicit value per texel.
+	block->color(0x0).a = expandAlpha4(alpha.alpha0);
+	block->color(0x1).a = expandAlpha4(alpha.alpha1);
+	block->color(0x2).a = expandAlpha4(alpha.alpha2);
+	block->color(0x3).a = expandAlpha4(alpha.alpha3);
+	block->color(0x4).a = expandAlpha4(alpha.alpha4);
+	block->color(0x5).a = expandAlpha4(alpha.alpha5);
+	block->color(0x6).a = expandAlpha4(alpha.alpha6);
+	block->color(0x7).a = expandAlpha4(alpha.alpha7);
+	block->color(0x8).a = expandAlpha4(alpha.alpha8);
+	block->color(0x9).a = expandAlpha4(alpha.alpha9);
+	block->color(0xA).a = expandAlpha4(alpha.alphaA);
+	block->color(0xB).a = expandAlpha4(alpha.alphaB);
+	block->color(0xC).a = expandAlpha4(alpha.alphaC);
+	block->color(0xD).a = expandAlpha4(alpha.alphaD);
+	block->color(0xE).a = expandAlpha4(alpha.alphaE);
+	block->color(0xF).a = expandAlpha4(alpha.alphaF);
+}
+
+/// Flip DXT3 alpha block vertically: reverse the order of the four rows.
+void AlphaBlockDXT3::flip4()
+{
+	// Inner pair first, then the outer pair; the two swaps are independent.
+	swap(row[1], row[2]);
+	swap(row[0], row[3]);
+}
+
+/// Flip half DXT3 alpha block vertically: exchange the first two rows.
+void AlphaBlockDXT3::flip2()
+{
+	const uint16 first = row[0];
+	row[0] = row[1];
+	row[1] = first;
+}
+
+/// Flip DXT3 block vertically by flipping its color and alpha parts.
+void BlockDXT3::flip4()
+{
+	color.flip4();
+	alpha.flip4();
+}
+
+/// Flip half DXT3 block vertically by half-flipping its color and alpha parts.
+void BlockDXT3::flip2()
+{
+	color.flip2();
+	alpha.flip2();
+}
+
+
+/*----------------------------------------------------------------------------
+	BlockDXT5
+----------------------------------------------------------------------------*/
+
+/// Compute the eight alpha values of the block. The endpoint order selects
+/// the mode: alpha0 > alpha1 gives the 8-alpha palette, otherwise the 6-alpha one.
+void AlphaBlockDXT5::evaluatePalette(uint8 alpha[8]) const
+{
+	if (!(alpha0 > alpha1)) {
+		evaluatePalette6(alpha);
+		return;
+	}
+
+	evaluatePalette8(alpha);
+}
+
+/// 8-alpha palette: the two endpoints followed by six interpolated values.
+void AlphaBlockDXT5::evaluatePalette8(uint8 alpha[8]) const
+{
+	// Bit code 000 = alpha0, 001 = alpha1.
+	alpha[0] = alpha0;
+	alpha[1] = alpha1;
+
+	// Bit codes 010 .. 111 move from alpha0 towards alpha1 in steps of 1/7.
+	for (int step = 1; step < 7; step++) {
+		alpha[1 + step] = ((7 - step) * alpha0 + step * alpha1) / 7;
+	}
+}
+
+/// 6-alpha palette: the two endpoints, four interpolated values, then the
+/// constants 0x00 and 0xFF.
+void AlphaBlockDXT5::evaluatePalette6(uint8 alpha[8]) const
+{
+	// Bit code 000 = alpha0, 001 = alpha1.
+	alpha[0] = alpha0;
+	alpha[1] = alpha1;
+
+	// Bit codes 010 .. 101 move from alpha0 towards alpha1 in steps of 1/5.
+	for (int step = 1; step < 5; step++) {
+		alpha[1 + step] = ((5 - step) * alpha0 + step * alpha1) / 5;
+	}
+
+	alpha[6] = 0x00;	// Bit code 110
+	alpha[7] = 0xFF;	// Bit code 111
+}
+
+void AlphaBlockDXT5::indices(uint8 index_array[16]) const	// copies the sixteen 3 bit palette indices into index_array
+{
+	index_array[0x0] = bits0;	// one bit-field read per texel
+	index_array[0x1] = bits1;
+	index_array[0x2] = bits2;
+	index_array[0x3] = bits3;
+	index_array[0x4] = bits4;
+	index_array[0x5] = bits5;
+	index_array[0x6] = bits6;
+	index_array[0x7] = bits7;
+	index_array[0x8] = bits8;
+	index_array[0x9] = bits9;
+	index_array[0xA] = bitsA;
+	index_array[0xB] = bitsB;
+	index_array[0xC] = bitsC;
+	index_array[0xD] = bitsD;
+	index_array[0xE] = bitsE;
+	index_array[0xF] = bitsF;
+	
+	/*
+	Disabled alternative that extracts the indices with shifts instead of bit-fields.
+	// @@ misaligned reads might be very expensive on some hardware.
+	uint b = (uint &) bits[0];
+	for(int i = 0; i < 8; i++) {
+		index_array[i] = uint8(b & 0x07);
+		b >>= 3;
+	}
+	
+	b = (uint &) bits[3];
+	for(int i = 0; i < 8; i++) {
+		index_array[8+i] = uint8(b & 0x07);
+		b >>= 3;
+	}
+	*/
+}
+
+/// Return the 3 bit palette index of texel `index` (0..15).
+uint AlphaBlockDXT5::index(uint index) const
+{
+	nvDebugCheck(index < 16);
+
+	// The indices follow the two 8 bit endpoints, 3 bits per texel, so this
+	// reads the field the matching bits0..bitsF member names, assuming the
+	// compiler packs the bit-fields from the least significant bit up.
+	const int shift = 16 + 3 * index;
+	const uint64 shifted = this->u >> shift;
+	return shifted & 0x7;
+}
+
+/// Store the 3 bit palette index `value` (0..7) for texel `index` (0..15).
+void AlphaBlockDXT5::setIndex(uint index, uint value)
+{
+	nvDebugCheck(index < 16);
+	nvDebugCheck(value < 8);
+
+	// The indices follow the two 8 bit endpoints, 3 bits per texel.
+	const int shift = 16 + 3 * index;
+
+	// Clear the old field, then merge the new value in.
+	const uint64 cleared = this->u & ~(uint64(0x7) << shift);
+	this->u = cleared | (uint64(value) << shift);
+}
+
+/// Flip DXT5 alpha block vertically: the endpoints stay in place and the four
+/// 12 bit index rows are reversed.
+void AlphaBlockDXT5::flip4()
+{
+	uint64 * block = (uint64 *)this;
+	const uint64 source = *block;
+
+	// @@ The masks might have to be byte swapped.
+	uint64 flipped = (source & POSH_U64(0x000000000000FFFF));	// endpoints, bits 0-15
+	flipped |= (source & POSH_U64(0xFFF0000000000000)) >> 36;	// bits 52-63 -> bits 16-27
+	flipped |= (source & POSH_U64(0x000FFF0000000000)) >> 12;	// bits 40-51 -> bits 28-39
+	flipped |= (source & POSH_U64(0x000000FFF0000000)) << 12;	// bits 28-39 -> bits 40-51
+	flipped |= (source & POSH_U64(0x000000000FFF0000)) << 36;	// bits 16-27 -> bits 52-63
+
+	*block = flipped;
+}
+
+/// Flip half DXT5 alpha block vertically: exchange the first two index rows.
+/// The endpoints (bits 0-15) and the last two rows (bits 40-63) are kept.
+void AlphaBlockDXT5::flip2()
+{
+	// The index rows start at bit 16 (after alpha0 and alpha1), 12 bits per
+	// row, so rows 0 and 1 occupy bits 16-27 and 28-39. The masks must be
+	// applied to the whole 64 bit block; applying them to the first 32 bits
+	// would swap endpoint bits instead of index rows.
+	// @@ The masks might have to be byte swapped.
+	uint64 tmp = (u & POSH_U64(0xFFFFFF000000FFFF));
+	tmp |= (u & POSH_U64(0x000000000FFF0000)) << 12;	// row 0 -> row 1
+	tmp |= (u & POSH_U64(0x000000FFF0000000)) >> 12;	// row 1 -> row 0
+	
+	u = tmp;
+}
+
+/// Decode the block: colors come from the DXT1 color block, alpha from the
+/// interpolated alpha palette selected per texel.
+void BlockDXT5::decodeBlock(ColorBlock * block) const
+{
+	nvDebugCheck(block != NULL);
+
+	// Decode color.
+	color.decodeBlock(block);
+
+	// Gather the per-texel selectors and the alpha palette they refer to.
+	uint8 selectors[16];
+	alpha.indices(selectors);
+
+	uint8 palette[8];
+	alpha.evaluatePalette(palette);
+
+	for (uint texel = 0; texel < 16; texel++)
+	{
+		block->color(texel).a = palette[selectors[texel]];
+	}
+}
+
+/// Flip DXT5 block vertically by flipping its color and alpha parts.
+void BlockDXT5::flip4()
+{
+	color.flip4();
+	alpha.flip4();
+}
+
+/// Flip half DXT5 block vertically by half-flipping its color and alpha parts.
+void BlockDXT5::flip2()
+{
+	color.flip2();
+	alpha.flip2();
+}
+
+
+/// Decode 3DC block. Not implemented yet: the body is empty, so `block` is left unmodified.
+void Block3DC::decodeBlock(ColorBlock * block) const
+{
+	// @@ TBD
+}
+
+/// Flip 3DC block vertically by flipping both channel blocks.
+void Block3DC::flip4()
+{
+	x.flip4();
+	y.flip4();
+}
+
+/// Flip half 3DC block vertically by half-flipping both channel blocks.
+void Block3DC::flip2()
+{
+	x.flip2();
+	y.flip2();
+}
+
+
+
+
+
+	
+
+
+
+
+
--- a/src/nvimage/nvtt/BlockDXT.h
+++ b/src/nvimage/nvtt/BlockDXT.h
@ -0,0 +1,176 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_BLOCKDXT_H
+#define NV_TT_BLOCKDXT_H
+
+#include <nvmath/Color.h>
+#include "nvtt.h"
+
+namespace nv
+{
+	struct ColorBlock;
+
+	/// DXT1 block: two 16 bit endpoint colors followed by sixteen 2 bit palette indices.
+	struct BlockDXT1
+	{
+		Color16 col0;	// first endpoint
+		Color16 col1;	// second endpoint
+		union {
+			uint8 row[4];	// one byte per texel row, 2 bits per texel
+			uint indices;	// the same 32 bits viewed as a single value
+		};
+	
+		bool isFourColorMode() const;
+	
+		uint evaluatePalette(Color32 color_array[4]) const;	// fills the palette; returns 4 or 3 depending on the endpoint order
+		uint evaluatePaletteFast(Color32 color_array[4]) const;	// NOTE(review): no definition visible alongside the other members -- confirm it exists
+		void evaluatePalette3(Color32 color_array[4]) const;	// palette assuming a three-color block
+		void evaluatePalette4(Color32 color_array[4]) const;	// palette assuming a four-color block
+		
+		void decodeBlock(ColorBlock * block) const;	// writes the 4x4 decoded colors into block
+		
+		void setIndices(int * idx);	// packs 16 indices, using the two low bits of each
+
+		void flip4();	// reverses the four rows
+		void flip2();	// swaps the first two rows only
+	};
+	
+	/// Return true if the block uses four color mode, false otherwise.
+	/// Four color mode is selected only when col0 is strictly greater than
+	/// col1; equal endpoints select the three color mode, which is also the
+	/// rule evaluatePalette applies when it decodes the block.
+	inline bool BlockDXT1::isFourColorMode() const
+	{
+		return col0.u > col1.u;
+	}
+	
+	
+	
+	
+	/// DXT3 alpha block with explicit alpha: sixteen 4 bit values, one per texel (64 bits in total).
+	struct AlphaBlockDXT3
+	{
+		union {
+			struct {
+				uint alpha0 : 4;	// texel 0; the following fields are texels 1..15 in order
+				uint alpha1 : 4;
+				uint alpha2 : 4;
+				uint alpha3 : 4;
+				uint alpha4 : 4;
+				uint alpha5 : 4;
+				uint alpha6 : 4;
+				uint alpha7 : 4;
+				uint alpha8 : 4;
+				uint alpha9 : 4;
+				uint alphaA : 4;
+				uint alphaB : 4;
+				uint alphaC : 4;
+				uint alphaD : 4;
+				uint alphaE : 4;
+				uint alphaF : 4;
+			};
+			uint16 row[4];	// the same storage as four 16 bit rows; used by flip4 / flip2
+		};
+		
+		void flip4();	// reverses the four rows
+		void flip2();	// swaps the first two rows only
+	};
+	
+	
+	/// DXT3 block: an explicit alpha block followed by a DXT1 color block.
+	struct BlockDXT3
+	{
+		AlphaBlockDXT3 alpha;
+		BlockDXT1 color;
+		
+		void decodeBlock(ColorBlock * block) const;	// decodes color, then overwrites the alpha of every texel
+		
+		void flip4();	// flips both the alpha and the color part
+		void flip2();	// half-flips both the alpha and the color part
+	};
+	
+	
+	/// DXT5 alpha block: two 8 bit endpoints followed by sixteen 3 bit palette indices. The numbers after each bit-field are running bit counts: index bits used so far, then total bits used so far.
+	struct AlphaBlockDXT5
+	{
+		union {
+			struct {
+				uint64 alpha0 : 8;	// 8 (first endpoint)
+				uint64 alpha1 : 8;	// 16 (second endpoint)
+				uint64 bits0 : 3;	// 3 - 19
+				uint64 bits1 : 3; 	// 6 - 22
+				uint64 bits2 : 3; 	// 9 - 25
+				uint64 bits3 : 3;	// 12 - 28
+				uint64 bits4 : 3;	// 15 - 31
+				uint64 bits5 : 3;	// 18 - 34
+				uint64 bits6 : 3;	// 21 - 37
+				uint64 bits7 : 3;	// 24 - 40
+				uint64 bits8 : 3;	// 27 - 43
+				uint64 bits9 : 3; 	// 30 - 46
+				uint64 bitsA : 3; 	// 33 - 49
+				uint64 bitsB : 3;	// 36 - 52
+				uint64 bitsC : 3;	// 39 - 55
+				uint64 bitsD : 3;	// 42 - 58
+				uint64 bitsE : 3;	// 45 - 61
+				uint64 bitsF : 3;	// 48 - 64
+			};
+			uint64 u;	// the whole block as one value; index() and setIndex() shift into it
+		};
+		
+		void evaluatePalette(uint8 alpha[8]) const;	// picks the 8 or 6 alpha palette from the endpoint order
+		void evaluatePalette8(uint8 alpha[8]) const;	// endpoints plus six interpolated values
+		void evaluatePalette6(uint8 alpha[8]) const;	// endpoints, four interpolated values, 0x00 and 0xFF
+		void indices(uint8 index_array[16]) const;	// copies all sixteen indices
+
+		uint index(uint index) const;	// reads one index (0..15)
+		void setIndex(uint index, uint value);	// writes one index; value must be below 8
+		
+		void flip4();	// reverses the four index rows
+		void flip2();	// half flip; see the definition
+	};
+	
+	/// DXT5 block: an interpolated alpha block followed by a DXT1 color block.
+	struct BlockDXT5
+	{
+		AlphaBlockDXT5 alpha;
+		BlockDXT1 color;
+		
+		void decodeBlock(ColorBlock * block) const;	// decodes color, then overwrites the alpha of every texel
+		
+		void flip4();	// flips both the alpha and the color part
+		void flip2();	// half-flips both the alpha and the color part
+	};
+	
+	/// 3DC block: two DXT5 style alpha blocks, stored y first and then x.
+	struct Block3DC
+	{
+		AlphaBlockDXT5 y;
+		AlphaBlockDXT5 x;
+		
+		void decodeBlock(ColorBlock * block) const;	// currently an empty stub in the source file
+		
+		void flip4();	// flips both channel blocks
+		void flip2();	// half-flips both channel blocks
+	};
+
+} // nv namespace
+
+#endif // NV_TT_BLOCKDXT_H
--- a/src/nvimage/nvtt/CMakeLists.txt
+++ b/src/nvimage/nvtt/CMakeLists.txt
@ -0,0 +1,57 @@
+PROJECT(nvtt)
+
+ADD_SUBDIRECTORY(squish)
+
+# Sources of the nvtt library.
+SET(NVTT_SRCS
+	nvtt.h
+	CompressDXT.h
+	CompressDXT.cpp
+	CompressRGB.h
+	CompressRGB.cpp
+	FastCompressDXT.h
+	FastCompressDXT.cpp
+	BlockDXT.h
+	BlockDXT.cpp
+	dxtlib.cpp
+	dxtlib_compat.h
+	CompressionOptions.h
+	CompressionOptions.cpp
+	InputOptions.h
+	InputOptions.cpp
+	OutputOptions.cpp
+	cuda/CudaUtils.h
+	cuda/CudaUtils.cpp
+	cuda/CudaCompressDXT.h
+	cuda/CudaCompressDXT.cpp)
+
+# Optional CUDA support: add the compiled kernel, the CUDA library and its headers.
+IF(CUDA_FOUND)
+	ADD_DEFINITIONS(-DHAVE_CUDA)
+	WRAP_CUDA(CUDA_SRCS cuda/CompressKernel.cu)
+	SET(NVTT_SRCS ${NVTT_SRCS} ${CUDA_SRCS})
+	SET(LIBS ${LIBS} ${CUDA_LIBRARY})
+	INCLUDE_DIRECTORIES(${CUDA_INCLUDE_PATH})
+ENDIF(CUDA_FOUND)
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+ADD_DEFINITIONS(-DNVTT_EXPORTS)
+
+# Both variants build from NVTT_SRCS; the shared variant previously referenced
+# DXT_SRCS, which is never set, so the shared library had no sources.
+IF(NVTT_SHARED)
+	ADD_LIBRARY(nvtt SHARED ${NVTT_SRCS})
+ELSE(NVTT_SHARED)
+	ADD_LIBRARY(nvtt ${NVTT_SRCS})
+ENDIF(NVTT_SHARED)
+
+TARGET_LINK_LIBRARIES(nvtt ${LIBS} nvcore nvmath nvimage squish)
+
+
+# test executables
+ADD_EXECUTABLE(nvcompress compress.cpp)
+TARGET_LINK_LIBRARIES(nvcompress nvcore nvmath nvimage nvtt)
+
+INSTALL(TARGETS nvcompress DESTINATION bin)
+
+#ADD_EXECUTABLE(nvtextool nvdxt.cpp configdialog.cpp)
+
+
--- a/src/nvimage/nvtt/CompressDXT.cpp
+++ b/src/nvimage/nvtt/CompressDXT.cpp
@ -0,0 +1,535 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Memory.h>
+
+#include <nvimage/Image.h>
+#include <nvimage/ColorBlock.h>
+
+#include "nvtt.h"
+#include "CompressDXT.h"
+#include "FastCompressDXT.h"
+#include "BlockDXT.h"
+#include "CompressionOptions.h"
+
+// squish
+#include "squish/colourset.h"
+//#include "squish/clusterfit.h"
+#include "squish/fastclusterfit.h"
+#include "squish/weightedclusterfit.h"
+
+// s3_quant
+#if defined(HAVE_S3QUANT)
+#include "s3tc/s3_quant.h"
+#endif
+
+// ati tc
+#if defined(HAVE_ATITC)
+#include "atitc/ATI_Compress.h"
+#endif
+
+//#include <time.h>
+
+using namespace nv;
+using namespace nvtt;
+
+
+/// Fast DXT1 compressor.
+/// Steps over the image 4 pixels at a time in x and y, encodes each block with
+/// the bounding-box compressor, refines its endpoints and passes the result to
+/// the output handler when one is installed.
+void nv::fastCompressDXT1(const Image * image, const OutputOptions & outputOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+
+	ColorBlock pixels;
+	BlockDXT1 dxt;
+
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+			pixels.init(image, bx, by);
+
+			compressBlock_BoundsRange(pixels, &dxt);
+
+			// @@ Use iterative optimization.
+			optimizeEndPoints(pixels, &dxt);
+
+			// Without a handler the block is still computed, just not emitted.
+			if (outputOptions.outputHandler == NULL) continue;
+
+			outputOptions.outputHandler->writeData(&dxt, sizeof(dxt));
+		}
+	}
+}
+
+
+/// Fast DXT3 compressor.
+/// Steps over the image 4 pixels at a time in x and y, encodes each block with
+/// the BlockDXT3 bounds-range compressor and passes it to the output handler
+/// when one is installed.
+void nv::fastCompressDXT3(const Image * image, const nvtt::OutputOptions & outputOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+
+	ColorBlock pixels;
+	BlockDXT3 dxt;
+
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+			pixels.init(image, bx, by);
+			compressBlock_BoundsRange(pixels, &dxt);
+
+			// Without a handler the block is still computed, just not emitted.
+			if (outputOptions.outputHandler == NULL) continue;
+
+			outputOptions.outputHandler->writeData(&dxt, sizeof(dxt));
+		}
+	}
+}
+
+
+/// Fast DXT5 compressor.
+/// Steps over the image 4 pixels at a time in x and y, encodes each block with
+/// the BlockDXT5 bounds-range compressor and passes it to the output handler
+/// when one is installed.
+void nv::fastCompressDXT5(const Image * image, const nvtt::OutputOptions & outputOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+
+	ColorBlock pixels;
+	BlockDXT5 dxt;
+
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+			pixels.init(image, bx, by);
+			compressBlock_BoundsRange(pixels, &dxt);
+
+			// Without a handler the block is still computed, just not emitted.
+			if (outputOptions.outputHandler == NULL) continue;
+
+			outputOptions.outputHandler->writeData(&dxt, sizeof(dxt));
+		}
+	}
+}
+
+
+/// Fast DXT5n compressor.
+/// Same loop as the fast DXT5 path, except that every block is swizzled with
+/// ColorBlock::swizzleDXT5n() before it is encoded.
+void nv::fastCompressDXT5n(const Image * image, const nvtt::OutputOptions & outputOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+
+	ColorBlock pixels;
+	BlockDXT5 dxt;
+
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+			pixels.init(image, bx, by);
+
+			// copy X coordinate to green channel and Y coordinate to alpha channel.
+			pixels.swizzleDXT5n();
+
+			compressBlock_BoundsRange(pixels, &dxt);
+
+			// Without a handler the block is still computed, just not emitted.
+			if (outputOptions.outputHandler == NULL) continue;
+
+			outputOptions.outputHandler->writeData(&dxt, sizeof(dxt));
+		}
+	}
+}
+
+
+/// Fast BC4 compressor -- not implemented yet.
+/// The body is empty: both arguments are ignored and nothing is written to the
+/// output handler.
+void nv::fastCompressBC4(const Image * image, const nvtt::OutputOptions & outputOptions)
+{
+	// @@ TODO
+	// compress red channel (X)
+}
+
+
+/// Fast BC5 compressor -- not implemented yet.
+/// The body is empty: both arguments are ignored and nothing is written to the
+/// output handler.
+void nv::fastCompressBC5(const Image * image, const nvtt::OutputOptions & outputOptions)
+{
+	// @@ TODO
+	// compress red, green channels (X,Y)
+}
+
+
+/// Run squish::FastClusterFit::doPrecomputation() on the first call only;
+/// every later call returns immediately.
+void nv::doPrecomputation()
+{
+	static bool initialized = false;
+	if (initialized) return;
+
+	// The flag is raised before the work starts; there is no locking around it.
+	initialized = true;
+	squish::FastClusterFit::doPrecomputation();
+}
+
+
+/// DXT1 compressor built on squish::FastClusterFit.
+/// Steps over the image 4 pixels at a time in x and y, fits each block using
+/// the color weights from @a compressionOptions and passes it to the output
+/// handler when one is installed.
+void nv::compressDXT1(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+
+	ColorBlock pixels;
+	BlockDXT1 dxt;
+
+	// FastClusterFit is used below; make sure its one-time setup has run.
+	doPrecomputation();
+
+	const Vector3 & metric = compressionOptions.colorWeight;
+
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+
+			pixels.init(image, bx, by);
+
+			// Compress color.
+			squish::ColourSet colours((uint8 *)pixels.colors(), 0);
+			squish::FastClusterFit fit(&colours, squish::kDxt1);
+			//squish::WeightedClusterFit fit(&colours, squish::kDxt1);
+			//squish::ClusterFit fit(&colours, squish::kDxt1);
+			fit.setMetric(metric.x(), metric.y(), metric.z());
+			fit.Compress(&dxt);
+
+			// @@ Use iterative cluster fit algorithm to improve error in highest quality mode.
+
+			if (outputOptions.outputHandler == NULL) continue;
+
+			outputOptions.outputHandler->writeData(&dxt, sizeof(dxt));
+		}
+	}
+}
+
+
+/// DXT3 compressor.
+/// For every 4x4 step the explicit alpha part is encoded with compressBlock()
+/// and the color part with squish::WeightedClusterFit (colors weighted by
+/// alpha); the block then goes to the output handler when one is installed.
+void nv::compressDXT3(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+
+	ColorBlock pixels;
+	BlockDXT3 dxt;
+
+	const Vector3 & metric = compressionOptions.colorWeight;
+
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+
+			pixels.init(image, bx, by);
+
+			// Compress explicit alpha.
+			compressBlock(pixels, &dxt.alpha);
+
+			// Compress color.
+			squish::ColourSet colours((uint8 *)pixels.colors(), squish::kWeightColourByAlpha);
+			squish::WeightedClusterFit fit(&colours, 0);
+			fit.setMetric(metric.x(), metric.y(), metric.z());
+			fit.Compress(&dxt.color);
+
+			if (outputOptions.outputHandler == NULL) continue;
+
+			outputOptions.outputHandler->writeData(&dxt, sizeof(dxt));
+		}
+	}
+}
+
+/// DXT5 compressor.
+/// The alpha part uses the brute force encoder at Quality_Highest and the
+/// iterative encoder otherwise; the color part uses squish::WeightedClusterFit
+/// with colors weighted by alpha. Each block goes to the output handler when
+/// one is installed.
+void nv::compressDXT5(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+
+	ColorBlock pixels;
+	BlockDXT5 dxt;
+
+	const bool bruteForceAlpha = (compressionOptions.quality == Quality_Highest);
+	const Vector3 & metric = compressionOptions.colorWeight;
+
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+
+			pixels.init(image, bx, by);
+
+			// Compress alpha. The error returned by the encoders is not used here.
+			if (bruteForceAlpha)
+			{
+				compressBlock_BruteForce(pixels, &dxt.alpha);
+			}
+			else
+			{
+				compressBlock_Iterative(pixels, &dxt.alpha);
+			}
+
+			// Compress color.
+			squish::ColourSet colours((uint8 *)pixels.colors(), squish::kWeightColourByAlpha);
+			squish::WeightedClusterFit fit(&colours, 0);
+			fit.setMetric(metric.x(), metric.y(), metric.z());
+			fit.Compress(&dxt.color);
+
+			if (outputOptions.outputHandler == NULL) continue;
+
+			outputOptions.outputHandler->writeData(&dxt, sizeof(dxt));
+		}
+	}
+}
+
+
+/// DXT5n compressor.
+/// Each block is swizzled with ColorBlock::swizzleDXT5n(), its alpha part is
+/// encoded with the alpha block encoders and its color part with
+/// squish::FastClusterFit using a green-only metric (0, 1, 0).
+void nv::compressDXT5n(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+
+	ColorBlock pixels;
+	BlockDXT5 dxt;
+
+	// FastClusterFit is used below; make sure its one-time setup has run.
+	doPrecomputation();
+
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+
+			pixels.init(image, bx, by);
+
+			// copy X coordinate to green channel and Y coordinate to alpha channel.
+			pixels.swizzleDXT5n();
+
+			// Compress Y. The iterative pass always runs; at highest quality the
+			// brute force pass then runs on the same alpha block afterwards.
+			compressBlock_Iterative(pixels, &dxt.alpha);
+			if (compressionOptions.quality == Quality_Highest)
+			{
+				compressBlock_BruteForce(pixels, &dxt.alpha);
+			}
+
+			// Compress X.
+			squish::ColourSet colours((uint8 *)pixels.colors(), 0);
+			squish::FastClusterFit fit(&colours, 0);
+			fit.setMetric(0, 1, 0);
+			fit.Compress(&dxt.color);
+
+			if (outputOptions.outputHandler == NULL) continue;
+
+			outputOptions.outputHandler->writeData(&dxt, sizeof(dxt));
+		}
+	}
+}
+
+
+/// BC4 compressor.
+/// Every block is first encoded with the iterative alpha encoder. The brute
+/// force encoder replaces that result at Quality_Highest, or at
+/// Quality_Production when the iterative error exceeds the threshold derived
+/// from compressionOptions.errorThreshold. The accumulated error divided by
+/// the pixel count is printed to stdout at the end.
+// NOTE(review): which channel of the ColorBlock the AlphaBlockDXT5 encoders
+// read is not visible in this file -- confirm it is the intended one for BC4.
+void nv::compressBC4(const Image * image, const nvtt::OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+
+	// threshold should be from [0 - 1] but may also be higher...
+	const uint threshold = uint(compressionOptions.errorThreshold * 256);
+
+	const bool alwaysBruteForce = (compressionOptions.quality == Quality_Highest);
+	const bool bruteForceOnDemand = (compressionOptions.quality == Quality_Production);
+
+	ColorBlock pixels;
+	AlphaBlockDXT5 encoded;
+
+	uint errorSum = 0;
+
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+
+			pixels.init(image, bx, by);
+
+			//error = compressBlock_BoundsRange(rgba, &block);
+			uint err = compressBlock_Iterative(pixels, &encoded);
+
+			if (alwaysBruteForce || (bruteForceOnDemand && err > threshold))
+			{
+				// Try brute force algorithm.
+				err = compressBlock_BruteForce(pixels, &encoded);
+			}
+
+			errorSum += err;
+
+			if (outputOptions.outputHandler == NULL) continue;
+
+			outputOptions.outputHandler->writeData(&encoded, sizeof(encoded));
+		}
+	}
+
+	// @@ All the compressors should work like this.
+	// Effect of adjusting threshold: 
+	// (threshold: error - time)
+	// 0: 4.29 - 1.83
+	// 32: 4.32 - 1.77
+	// 48: 4.37 - 1.72
+	// 64: 4.43 - 1.45
+	// 74: 4.45 - 1.35
+	// 92: 4.54 - 1.15
+	// 128: 4.67 - 0.79
+	// 256: 4.92 - 0.20
+	// inf: 4.98 - 0.09
+
+	printf("Alpha error: %f\n", float(errorSum) / (width*height));
+}
+
+
+/// BC5 (3Dc) compressor.
+/// For every 4x4 step two ColorBlocks are built from the same pixels, one
+/// processed with splatX() and one with splatY(); each is encoded into the
+/// matching half of a Block3DC with the brute force encoder at
+/// Quality_Highest and the iterative encoder otherwise.
+void nv::compressBC5(const Image * image, const nvtt::OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+
+	ColorBlock xPixels;
+	ColorBlock yPixels;
+
+	Block3DC encoded;
+
+	const bool bruteForce = (compressionOptions.quality == Quality_Highest);
+
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+
+			xPixels.init(image, bx, by);
+			xPixels.splatX();
+
+			yPixels.init(image, bx, by);
+			yPixels.splatY();
+
+			// @@ Compute normal error, instead of separate xy errors.
+			// The per-channel errors returned by the encoders are not used yet.
+			if (bruteForce)
+			{
+				compressBlock_BruteForce(xPixels, &encoded.x);
+				compressBlock_BruteForce(yPixels, &encoded.y);
+			}
+			else
+			{
+				compressBlock_Iterative(xPixels, &encoded.x);
+				compressBlock_Iterative(yPixels, &encoded.y);
+			}
+
+			if (outputOptions.outputHandler == NULL) continue;
+
+			outputOptions.outputHandler->writeData(&encoded, sizeof(encoded));
+		}
+	}
+}
+
+
+#if defined(HAVE_S3QUANT)
+
+/// DXT1 compressor that delegates the endpoint search to the external
+/// s3_quant coder (CodeRGBBlock).
+/// Every block is coded twice, with inLevel 4 and inLevel 3; the candidate
+/// with the smaller blockError() is written to the output handler. The summed
+/// error divided by the number of 4x4 blocks is printed to stdout at the end.
+void nv::s3CompressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions)
+{
+	const uint w = image->width();
+	const uint h = image->height();
+	
+	float error = 0.0f;
+
+	BlockDXT1 dxtBlock3;
+	BlockDXT1 dxtBlock4;
+	ColorBlock block;
+
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			block.init(image, x, y);
+
+			// Init rgb block.
+			RGBBlock rgbBlock;
+			rgbBlock.n = 16;
+			for (uint i = 0; i < 16; i++) {
+				rgbBlock.colorChannel[i][0] = clamp(float(block.color(i).r) / 255.0f, 0.0f, 1.0f);
+				rgbBlock.colorChannel[i][1] = clamp(float(block.color(i).g) / 255.0f, 0.0f, 1.0f);
+				rgbBlock.colorChannel[i][2] = clamp(float(block.color(i).b) / 255.0f, 0.0f, 1.0f);
+			}
+			rgbBlock.weight[0] = 1.0f;
+			rgbBlock.weight[1] = 1.0f;
+			rgbBlock.weight[2] = 1.0f;
+
+			rgbBlock.inLevel = 4;
+			CodeRGBBlock(&rgbBlock);
+
+			// Copy results to DXT block.
+			dxtBlock4.col0.r = rgbBlock.endPoint[0][0];
+			dxtBlock4.col0.g = rgbBlock.endPoint[0][1];
+			dxtBlock4.col0.b = rgbBlock.endPoint[0][2];
+
+			dxtBlock4.col1.r = rgbBlock.endPoint[1][0];
+			dxtBlock4.col1.g = rgbBlock.endPoint[1][1];
+			dxtBlock4.col1.b = rgbBlock.endPoint[1][2];
+
+			dxtBlock4.setIndices(rgbBlock.index);
+
+			// Keep col0 >= col1 for this candidate. After swapping the endpoints,
+			// the XOR toggles the low bit of every 2-bit index (0<->1, 2<->3).
+			if (dxtBlock4.col0.u < dxtBlock4.col1.u) {
+				swap(dxtBlock4.col0.u, dxtBlock4.col1.u);
+				dxtBlock4.indices ^= 0x55555555;
+			}
+
+			uint error4 = blockError(block, dxtBlock4);
+
+			rgbBlock.inLevel = 3;
+
+			CodeRGBBlock(&rgbBlock);
+
+			// Copy results to DXT block.
+			dxtBlock3.col0.r = rgbBlock.endPoint[0][0];
+			dxtBlock3.col0.g = rgbBlock.endPoint[0][1];
+			dxtBlock3.col0.b = rgbBlock.endPoint[0][2];
+
+			dxtBlock3.col1.r = rgbBlock.endPoint[1][0];
+			dxtBlock3.col1.g = rgbBlock.endPoint[1][1];
+			dxtBlock3.col1.b = rgbBlock.endPoint[1][2];
+
+			dxtBlock3.setIndices(rgbBlock.index);
+
+			// Keep col0 <= col1 for this candidate. After swapping the endpoints,
+			// the XOR toggles the low bit only of indices whose high bit is clear
+			// (0<->1), leaving indices 2 and 3 untouched.
+			if (dxtBlock3.col0.u > dxtBlock3.col1.u) {
+				swap(dxtBlock3.col0.u, dxtBlock3.col1.u);
+				dxtBlock3.indices ^= (~dxtBlock3.indices  >> 1) & 0x55555555;
+			}
+
+			uint error3 = blockError(block, dxtBlock3);
+
+			// Emit whichever candidate has the smaller error (ties go to the 4-level one).
+			if (error3 < error4) {
+				error += error3;
+
+				if (outputOptions.outputHandler != NULL) {
+					outputOptions.outputHandler->writeData(&dxtBlock3, sizeof(dxtBlock3));
+				}
+			}
+			else {
+				error += error4;
+
+				if (outputOptions.outputHandler != NULL) {
+					outputOptions.outputHandler->writeData(&dxtBlock4, sizeof(dxtBlock4));
+				}
+			}
+		}
+	}
+
+	// Number of 4x4 blocks visited by the loops above. Each factor needs its own
+	// parentheses: "(w+3)/4 * (h+3)/4" groups as (((w+3)/4) * (h+3)) / 4.
+	const uint blockCount = ((w + 3) / 4) * ((h + 3) / 4);
+
+	printf("error = %f\n", error / blockCount);
+}
+
+#endif // defined(HAVE_S3QUANT)
+
+
+#if defined(HAVE_ATITC)
+
+/// DXT1 compressor that delegates the whole image to the external ATI_Compress
+/// library (ATI_TC_ConvertTexture).
+/// The source texture points straight at the image pixels; the destination
+/// buffer is allocated here, handed to the output handler when one is
+/// installed, and released before returning.
+void nv::atiCompressDXT1(const Image * image, const OutputOptions & outputOptions)
+{
+	// Init source texture
+	ATI_TC_Texture srcTexture;
+	srcTexture.dwSize = sizeof(srcTexture);
+	srcTexture.dwWidth = image->width();
+	srcTexture.dwHeight = image->height();
+	srcTexture.dwPitch = image->width() * 4;
+	srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
+	srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
+	srcTexture.pData = (ATI_TC_BYTE*) image->pixels();
+
+	// Init dest texture
+	ATI_TC_Texture destTexture;
+	destTexture.dwSize = sizeof(destTexture);
+	destTexture.dwWidth = image->width();
+	destTexture.dwHeight = image->height();
+	destTexture.dwPitch = 0;
+	destTexture.format = ATI_TC_FORMAT_DXT1;
+	destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
+	destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);
+
+	// Do not hand a null destination to the converter.
+	if (destTexture.pData == NULL) {
+		return;
+	}
+
+	// Compress
+	ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL);
+
+	if (outputOptions.outputHandler != NULL) {
+		outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
+	}
+
+	// The buffer is owned by this function; it used to leak on every call.
+	mem::free(destTexture.pData);
+}
+
+#endif // defined(HAVE_ATITC)
--- a/src/nvimage/nvtt/CompressDXT.h
+++ b/src/nvimage/nvtt/CompressDXT.h
@ -0,0 +1,65 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_COMPRESSDXT_H
+#define NV_TT_COMPRESSDXT_H
+
+#include <nvimage/nvimage.h>
+#include "nvtt.h"
+
+// Entry points of the DXT/BC compressors. Each takes the source image and the
+// output options that carry the handler receiving the compressed blocks.
+namespace nv
+{
+	class Image;
+	class FloatImage;
+
+	// Runs the one-time squish::FastClusterFit precomputation (first call only).
+	void doPrecomputation();
+	
+	// Fast compressors.
+	// NOTE: fastCompressBC4 and fastCompressBC5 are currently empty stubs.
+	void fastCompressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions);
+	void fastCompressDXT3(const Image * image, const nvtt::OutputOptions & outputOptions);
+	void fastCompressDXT5(const Image * image, const nvtt::OutputOptions & outputOptions);
+	void fastCompressDXT5n(const Image * image, const nvtt::OutputOptions & outputOptions);
+	void fastCompressBC4(const Image * image, const nvtt::OutputOptions & outputOptions);
+	void fastCompressBC5(const Image * image, const nvtt::OutputOptions & outputOptions);
+
+	// Normal compressors.
+	// These additionally receive the private compression options (quality,
+	// color weights, error threshold).
+	void compressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	void compressDXT3(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	void compressDXT5(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	void compressDXT5n(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	void compressBC4(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	void compressBC5(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	
+	// External compressors.
+	// Only declared when the matching third-party library is enabled at build time.
+#if defined(HAVE_S3QUANT)
+	void s3CompressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions);
+#endif
+	
+#if defined(HAVE_ATITC)
+	void atiCompressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions);
+#endif
+
+} // nv namespace
+
+
+#endif // NV_TT_COMPRESSDXT_H
--- a/src/nvimage/nvtt/CompressRGB.cpp
+++ b/src/nvimage/nvtt/CompressRGB.cpp
@ -0,0 +1,153 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+#include <nvcore/Debug.h>
+#include <nvimage/Image.h>
+
+#include "CompressRGB.h"
+#include "CompressionOptions.h"
+
+
+using namespace nv;
+using namespace nvtt;
+
+// File-local helpers for the pixel format converter.
+namespace 
+{
+
+	/// Size in bytes of one scanline of @a w pixels at @a bitsize bits per
+	/// pixel (rounded up to whole bytes per pixel), padded to a multiple of 4.
+	inline uint computePitch(uint w, uint bitsize)
+	{
+		uint p = w * ((bitsize + 7) / 8);
+
+		// Align to 32 bits.
+		return ((p + 3) / 4) * 4;
+	}
+
+	/// Placeholder: conversion to RGBA 8888 is not implemented, @a dst is left untouched.
+	/// @a src is const here to match the sibling converters.
+	static void convert_to_rgba8888(const void * src, void * dst, uint w)
+	{
+		// @@ TODO
+	}
+
+	/// Copy @a w pixels of 4 bytes each from @a src to @a dst without reordering.
+	static void convert_to_bgra8888(const void * src, void * dst, uint w)
+	{
+		memcpy(dst, src, 4 * w);
+	}
+
+	/// Placeholder: conversion to RGB 888 is not implemented, @a dst is left untouched.
+	static void convert_to_rgb888(const void * src, void * dst, uint w)
+	{
+		// @@ TODO
+	}
+
+	/// Reduce an @a inbits wide value to @a outbits by dropping its low bits.
+	/// Requires inbits > outbits.
+	static uint truncate(uint c, uint inbits, uint outbits)
+	{
+		nvDebugCheck(inbits > outbits);	
+		// The shifted value used to be discarded and the function fell off its
+		// end without returning, which is undefined behaviour.
+		return c >> (inbits - outbits);
+	}
+
+	/// Widen an @a inbits wide value to @a outbits by replicating its bit
+	/// pattern into the new low bits, so that all-ones stays all-ones.
+	/// Requires 0 < inbits <= outbits; returns 0 when inbits is 0.
+	static uint bitexpand(uint c, uint inbits, uint outbits)
+	{
+		nvDebugCheck(inbits > 0 && inbits <= outbits);
+
+		if (inbits == 0) {
+			return 0;
+		}
+
+		// Place the value in the top bits, then keep copying the filled part
+		// downwards; the filled width doubles on every step.
+		uint result = c << (outbits - inbits);
+		for (uint filled = inbits; filled < outbits; filled *= 2) {
+			result |= result >> filled;
+		}
+		return result;
+	}
+	
+	/// Split a contiguous bit mask into the position of its lowest set bit
+	/// (@a shift) and the length of the run of set bits starting there (@a size).
+	/// A zero mask yields shift = 0 and size = 0.
+	static void maskShiftAndSize(uint mask, uint & shift, uint & size)
+	{
+		shift = 0;
+		size = 0;	// Was never initialized, so callers got garbage + run length.
+
+		// An empty mask (e.g. a format without alpha) has no set bit to find;
+		// the scan below would never terminate.
+		if (mask == 0) {
+			return;
+		}
+
+		while((mask & 1) == 0) {
+			shift++;
+			mask >>= 1;
+		}
+		
+		while((mask & 1) == 1) {
+			size++;
+			mask >>= 1;
+		}
+	}
+	
+} // namespace
+
+
+// Pixel format converter.
+// Writes the image one scanline at a time to the output handler, `pitch` bytes
+// per line. Only a straight 4-bytes-per-pixel copy is implemented so far; the
+// channel masks are decoded but not applied yet.
+void nv::compressRGB(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	nvCheck(image != NULL);
+
+	const uint w = image->width();
+	const uint h = image->height();
+
+	// Decoded for the mask-driven conversion sketched below; unused for now.
+	uint rshift, rsize;
+	maskShiftAndSize(compressionOptions.rmask, rshift, rsize);
+	
+	uint gshift, gsize;
+	maskShiftAndSize(compressionOptions.gmask, gshift, gsize);
+	
+	uint bshift, bsize;
+	maskShiftAndSize(compressionOptions.bmask, bshift, bsize);
+	
+	uint ashift, asize;
+	maskShiftAndSize(compressionOptions.amask, ashift, asize);
+
+
+	// Determine pitch.
+	const uint pitch = computePitch(w, compressionOptions.bitcount);
+
+	// convert_to_bgra8888 always writes 4 bytes per pixel, whatever the bit
+	// count says, so the buffer must hold at least that much; sizing it by
+	// pitch alone overflowed the heap for bit counts below 32.
+	const uint rowBytes = 4 * w;
+	const uint bufferSize = (pitch > rowBytes) ? pitch : rowBytes;
+
+	void * dst = malloc(bufferSize);
+	nvCheck(dst != NULL);
+
+	// Clear once so that padding bytes beyond the copied pixels are never
+	// written out uninitialized.
+	memset(dst, 0, bufferSize);
+
+	for (uint y = 0; y < h; y++)
+	{
+		const Color32 * src = image->scanline(y);
+
+		convert_to_bgra8888(src, dst, w);
+
+		// Planned mask-driven packing:
+		//	uint c = 0;
+		//	c |= (src[i].r >> (8 - rsize)) << rshift;
+		//	c |= (src[i].g >> (8 - gsize)) << gshift;
+		//	c |= (src[i].b >> (8 - bsize)) << bshift;
+
+		/*
+		if (rmask == 0xFF000000 && gmask == 0xFF0000 && bmask == 0xFF00 && amask == 0xFF)
+		{
+			convert_to_rgba8888(src, dst, w);
+		}
+		else if (rmask == 0xFF0000 && gmask == 0xFF00 && bmask == 0xFF && amask == 0)
+		{
+			convert_to_rgb888(src, dst, w);
+		}
+		else
+		{
+			// @@ Not supported.
+		}
+		*/
+
+		if (outputOptions.outputHandler != NULL)
+		{
+			outputOptions.outputHandler->writeData(dst, pitch);
+		}
+	}
+
+	free(dst);
+}
+
+
--- a/src/nvimage/nvtt/CompressRGB.h
+++ b/src/nvimage/nvtt/CompressRGB.h
@ -0,0 +1,39 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_COMPRESSRGB_H
+#define NV_TT_COMPRESSRGB_H
+
+#include "nvtt.h"
+
+namespace nv
+{
+	class Image;
+
+	// Pixel format converter.
+	// Writes @a image scanline by scanline to the output handler carried by
+	// @a outputOptions, using the bit count and channel masks stored in
+	// @a compressionOptions to size each output line.
+	void compressRGB(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	
+} // nv namespace
+
+
+#endif // NV_TT_COMPRESSRGB_H
--- a/src/nvimage/nvtt/CompressionOptions.cpp
+++ b/src/nvimage/nvtt/CompressionOptions.cpp
@ -0,0 +1,113 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include "nvtt.h"
+#include "CompressionOptions.h"
+
+using namespace nv;
+using namespace nvtt;
+
+
+/// Constructor. Sets compression options to the default values.
+/// Allocates the private implementation object with `new` and binds the
+/// reference member `m` to it; the destructor deletes it again.
+CompressionOptions::CompressionOptions() : m(*new CompressionOptions::Private())
+{
+	reset();
+}
+
+
+/// Destructor.
+/// Releases the private implementation object allocated by the constructor.
+CompressionOptions::~CompressionOptions()
+{
+	// `m` is a reference to a heap object, hence the delete through its address.
+	delete &m;
+}
+
+
+/// Set default compression options.
+/// Defaults: DXT1 format, normal quality, error threshold 0.5, uniform color
+/// weights, hardware compression enabled and a 32 bit pixel format with the
+/// channel masks listed below.
+void CompressionOptions::reset()
+{
+	m.format = Format_DXT1;
+	m.quality = Quality_Normal;
+	// Same value as the default noted on setQuality(). Without this the
+	// threshold stayed uninitialized until setQuality() was called, although
+	// compressBC4 reads it.
+	m.errorThreshold = 0.5f;
+	m.colorWeight.set(1.0f, 1.0f, 1.0f);
+	m.useCuda = true;
+	m.bitcount = 32;
+	m.bmask = 0x000000FF;
+	m.gmask = 0x0000FF00;
+	m.rmask = 0x00FF0000;
+	m.amask = 0xFF000000;
+}
+
+
+/// Set desired compression format.
+/// @param format  Value stored as the format to compress to.
+void CompressionOptions::setFormat(Format format)
+{
+	m.format = format;
+}
+
+
+/// Set compression quality settings.
+/// @param quality         Quality level to store.
+/// @param errorThreshold  Threshold stored alongside it; the inline comment in
+///                        the signature records 0.5f as the declared default.
+void CompressionOptions::setQuality(Quality quality, float errorThreshold /*= 0.5f*/)
+{
+	m.quality = quality;
+	m.errorThreshold = errorThreshold;
+}
+
+
+/// Set the weights of each color channel. 
+/// The choice for these values is subjective. In many cases uniform color weights 
+/// (1.0, 1.0, 1.0) work very well. A popular choice is to use the NTSC luma encoding 
+/// weights (0.2126, 0.7152, 0.0722), but I think that blue contributes to our 
+/// perception more than 7%. A better choice in my opinion is (3, 4, 2). Ideally
+/// the compressor should use a non-linear colour metric as described here:
+/// http://www.compuphase.com/cmetric.htm
+///
+/// The weights are normalized by their sum before being stored, so only their
+/// ratio matters. A sum of zero is not handled and divides by zero.
+void CompressionOptions::setColorWeights(float red, float green, float blue)
+{
+	float total = red + green + blue;
+	// NOTE(review): the first stored component is derived from `blue` and the
+	// third ends up holding the red share -- presumably this matches the byte
+	// order of the colors passed to squish; confirm against the compressors.
+	float x = blue / total;
+	float y = green / total;
+
+	// Third component is the remainder, so the three stored weights sum to 1.
+	m.colorWeight.set(x, y, 1.0f - x - y);
+}
+
+
+/// Enable or disable hardware compression.
+/// @param enable  Stored in the `useCuda` flag of the private options.
+void CompressionOptions::enableHardwareCompression(bool enable)
+{
+	m.useCuda = enable;
+}
+
+
+/// Set color mask to describe the RGB/RGBA format.
+/// @param bitcount  Number of bits per pixel.
+/// @param rmask     Bit mask of the red channel.
+/// @param gmask     Bit mask of the green channel.
+/// @param bmask     Bit mask of the blue channel.
+/// @param amask     Bit mask of the alpha channel.
+/// The values are stored as given; no validation is performed here.
+void CompressionOptions::setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask)
+{
+	m.bitcount = bitcount;
+	m.rmask = rmask;
+	m.gmask = gmask;
+	m.bmask = bmask;
+	m.amask = amask;
+}
+
+/// Use external compressor.
+/// @param name  Name of the compressor; assigned to the nv::String member
+///              `externalCompressor` of the private options.
+void CompressionOptions::setExternalCompressor(const char * name)
+{
+	m.externalCompressor = name;
+}
--- a/src/nvimage/nvtt/CompressionOptions.h
+++ b/src/nvimage/nvtt/CompressionOptions.h
@ -0,0 +1,57 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_COMPRESSIONOPTIONS_H
+#define NV_TT_COMPRESSIONOPTIONS_H
+
+#include <nvcore/StrLib.h>
+#include <nvmath/Vector.h>
+#include "nvtt.h"
+
+namespace nvtt
+{
+
+	/// Private state behind the public CompressionOptions class.
+	/// The fields are filled in by the CompressionOptions setters and read
+	/// directly by the compressors.
+	struct CompressionOptions::Private
+	{
+		Format format;	// Target format, see setFormat().
+		
+		Quality quality;	// Quality level, see setQuality().
+		float errorThreshold;	// Set together with the quality by setQuality().
+		
+		nv::Vector3 colorWeight;	// Channel weights, see setColorWeights().
+		
+		// Pixel format description, see setPixelFormat().
+		uint bitcount;
+		uint rmask;
+		uint gmask;
+		uint bmask;
+		uint amask;
+		
+		bool useCuda;	// See enableHardwareCompression().
+
+		nv::String externalCompressor;	// See setExternalCompressor().
+	};
+
+} // nvtt namespace
+
+
+#endif // NV_TT_COMPRESSIONOPTIONS_H
--- a/src/nvimage/nvtt/FastCompressDXT.cpp
+++ b/src/nvimage/nvtt/FastCompressDXT.cpp
--- a/src/nvimage/nvtt/FastCompressDXT.h
+++ b/src/nvimage/nvtt/FastCompressDXT.h
@ -0,0 +1,81 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_FASTCOMPRESSDXT_H
+#define NV_TT_FASTCOMPRESSDXT_H
+
+#include <nvimage/nvimage.h>
+
+// Per-block compressors. Every function reads the source pixels from a
+// ColorBlock and writes the encoded result through the block pointer.
+namespace nv
+{
+	struct ColorBlock;
+	struct BlockDXT1;
+	struct BlockDXT3;
+	struct BlockDXT5;
+	struct AlphaBlockDXT3;
+	struct AlphaBlockDXT5;
+
+	// Color compression:
+
+	// NOTE(review): this comment used to repeat the LuminanceAxis one below
+	// word for word; judging by the name this variant presumably uses the
+	// extremes along the diameter of the color set -- confirm in FastCompressDXT.cpp.
+	void compressBlock_DiameterAxis(const ColorBlock & rgba, BlockDXT1 * block);
+
+	// Compressor that uses the extremes of the luminance axis.
+	void compressBlock_LuminanceAxis(const ColorBlock & rgba, BlockDXT1 * block);
+
+	// Compressor that uses bounding box.
+	void compressBlock_BoundsRange(const ColorBlock & rgba, BlockDXT1 * block);
+
+	// Compressor that uses the best fit axis.
+	void compressBlock_BestFitAxis(const ColorBlock & rgba, BlockDXT1 * block);
+
+
+	// Simple, but slow compressor that tests all color pairs.
+	void compressBlock_TestAllPairs(const ColorBlock & rgba, BlockDXT1 * block);
+	
+	// Brute force 6d search along the best fit axis.
+	void compressBlock_AnalyzeBestFitAxis(const ColorBlock & rgba, BlockDXT1 * block);
+
+	// Spatial greedy search.
+	void refineSolution_1dSearch(const ColorBlock & rgba, BlockDXT1 * block);
+	void refineSolution_3dSearch(const ColorBlock & rgba, BlockDXT1 * block);
+	void refineSolution_6dSearch(const ColorBlock & rgba, BlockDXT1 * block);
+	
+	// Minimize error of the endpoints.
+	void optimizeEndPoints(const ColorBlock & rgba, BlockDXT1 * block);
+	
+	// Error of an encoded block measured against the source pixels.
+	uint blockError(const ColorBlock & rgba, const BlockDXT1 & block);
+	uint blockError(const ColorBlock & rgba, const AlphaBlockDXT5 & block);
+
+	// Alpha compression:
+	void compressBlock(const ColorBlock & rgba, AlphaBlockDXT3 * block);
+	void compressBlock_BoundsRange(const ColorBlock & rgba, BlockDXT3 * block);
+	void compressBlock_BoundsRange(const ColorBlock & rgba, BlockDXT5 * block);
+
+	// The AlphaBlockDXT5 encoders return an error value that callers compare
+	// against a threshold and accumulate.
+	uint compressBlock_BoundsRange(const ColorBlock & rgba, AlphaBlockDXT5 * block);
+	uint compressBlock_BruteForce(const ColorBlock & rgba, AlphaBlockDXT5 * block);
+	uint compressBlock_Iterative(const ColorBlock & rgba, AlphaBlockDXT5 * block);
+
+} // nv namespace
+
+#endif // NV_TT_FASTCOMPRESSDXT_H
--- a/src/nvimage/nvtt/InputOptions.cpp
+++ b/src/nvimage/nvtt/InputOptions.cpp
@ -0,0 +1,250 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h> // memcpy
+
+#include <nvcore/Memory.h>
+
+#include "nvtt.h"
+#include "InputOptions.h"
+
+using namespace nv;
+using namespace nvtt;
+
+namespace
+{
+
+	// Returns the number of levels in a complete mipmap chain for an image of
+	// w x h x d texels, counting the base level itself.
+	//
+	// Each level halves every dimension (clamped to 1), and the chain only ends
+	// once ALL dimensions have reached 1. The previous '&&' test stopped as soon
+	// as a single dimension hit 1, so e.g. a 256x128 image reported 8 levels
+	// instead of 9 and a 256x1 image reported a single level.
+	static int countMipmaps(int w, int h, int d)
+	{
+		// Clamp degenerate extents so that the loop below always terminates.
+		w = max(1, w);
+		h = max(1, h);
+		d = max(1, d);
+
+		int mipmap = 0;
+
+		while (w != 1 || h != 1 || d != 1) {
+			w = max(1, w / 2);
+			h = max(1, h / 2);
+			d = max(1, d / 2);
+			mipmap++;
+		}
+
+		// +1 accounts for the base level.
+		return mipmap + 1;
+	}
+
+} // namespace
+
+
+/// Constructor.
+/// Allocates the private implementation object on the heap, binds the
+/// reference member `m` to it and then loads the default option values
+/// through reset().
+InputOptions::InputOptions() : m(*new InputOptions::Private())
+{ 
+	reset();
+}
+
+// Destructor. Delete images.
+// Releases the image array first and then the private implementation object
+// that was allocated with `new` in the constructor.
+InputOptions::~InputOptions()
+{
+	resetTextureLayout();
+	
+	// `m` is a reference to a heap object owned by this instance.
+	delete &m;
+}
+
+
+// Restore every input option to its default value.
+// The texture layout (image array and its counters) is not touched here.
+void InputOptions::reset()
+{
+	// Input description.
+	m.inputFormat = InputFormat_BGRA_8UB;
+	m.textureType = TextureType_2D;
+	m.wrapMode = WrapMode_Repeat;
+	m.alphaTransparency = true;
+
+	// Gamma conversion.
+	m.outputGamma = 2.2f;
+	m.inputGamma = 2.2f;
+
+	// Quantization is disabled by default.
+	m.alphaThreshold = 127;
+	m.binaryAlpha = false;
+	m.enableAlphaDithering = false;
+	m.enableColorDithering = false;
+
+	// Mipmap generation.
+	m.mipmapFilter = MipmapFilter_Box;
+	m.maxLevel = -1;
+	m.generateMipmaps = false;
+
+	// Normal map options.
+	m.convertToNormalMap = false;
+	m.normalizeMipmaps = false;
+	m.bumpFrequencyScale = Vector4(1.0f, 0.5f, 0.25f, 0.125f) / (1.0f + 0.5f + 0.25f + 0.125f);
+	m.heightFactors.set(0.0f, 0.0f, 0.0f, 1.0f);
+}
+
+
+// Setup the input image.
+//
+// Discards any previous layout and allocates one image descriptor per face and
+// mipmap level. Descriptors are stored face-major, i.e. the descriptor of
+// (face, mipLevel) lives at index `face * mipmapCount + mipLevel`. Pixel data
+// is not allocated here; every descriptor starts with a NULL `data` pointer.
+//
+// @param type  Texture type; cube maps get 6 faces, everything else 1.
+// @param w     Width of the base level (0 is treated as 1).
+// @param h     Height of the base level (0 is treated as 1).
+// @param d     Depth of the base level (0 is treated as 1).
+void InputOptions::setTextureLayout(TextureType type, int w, int h, int d /*= 1*/)
+{
+	// Validate arguments.
+	nvCheck(w >= 0);
+	nvCheck(h >= 0);
+	nvCheck(d >= 0);
+
+	// Correct arguments.
+	if (w == 0) w = 1;
+	if (h == 0) h = 1;
+	if (d == 0) d = 1;
+
+	// Delete previous images.
+	resetTextureLayout();
+	
+	m.textureType = type;
+	
+	// Allocate images.
+	m.mipmapCount = countMipmaps(w, h, d);
+	m.faceCount = (type == TextureType_Cube) ? 6 : 1;
+	m.imageCount = m.mipmapCount * m.faceCount;
+	
+	m.images = new Private::Image[m.imageCount];
+	
+	for (int f = 0; f < m.faceCount; f++)
+	{
+		// Every face starts again from the base level extents. Halving the
+		// arguments themselves left all faces after the first at 1x1x1.
+		int mipWidth = w;
+		int mipHeight = h;
+		int mipDepth = d;
+		
+		for (int mipLevel = 0; mipLevel < m.mipmapCount; mipLevel++)
+		{
+			Private::Image & img = m.images[f * m.mipmapCount + mipLevel];
+			img.width = mipWidth;
+			img.height = mipHeight;
+			img.depth = mipDepth;
+			img.mipLevel = mipLevel;
+			img.face = f;
+			
+			img.data = NULL;
+			
+			mipWidth = max(1, mipWidth / 2);
+			mipHeight = max(1, mipHeight / 2);
+			mipDepth = max(1, mipDepth / 2);
+		}
+	}
+}
+
+
+// Release the image array, if any, and zero the layout counters.
+// Does nothing when no layout has been set.
+void InputOptions::resetTextureLayout()
+{
+	if (m.images == NULL)
+	{
+		// Nothing allocated; the counters are left as they are.
+		return;
+	}
+
+	// Destroys every image descriptor of the array.
+	delete [] m.images;
+	m.images = NULL;
+
+	m.imageCount = 0;
+	m.mipmapCount = 0;
+	m.faceCount = 0;
+}
+
+
+// Copies the data to our internal structures.
+//
+// The target image is selected by (face, mipLevel) and must match the layout
+// established by setTextureLayout().
+//
+// @param data      Source pixels; width * height * 4 bytes are copied.
+// @param width     Expected width of the selected image.
+// @param height    Expected height of the selected image.
+// @param depth     Expected depth of the selected image; only 1 is supported.
+// @param face      Face index.
+// @param mipLevel  Mipmap level index.
+// @return true if the pixels were stored; false if there is no layout, the
+//         source pointer is NULL, the indices are out of range or the
+//         dimensions do not match the layout.
+bool InputOptions::setMipmapData(const void * data, int width, int height, int depth /*= 1*/, int face /*= 0*/, int mipLevel /*= 0*/)
+{
+	nvCheck(depth == 1);
+	
+	// No layout or nothing to copy from.
+	if (m.images == NULL || data == NULL)
+	{
+		return false;
+	}
+	
+	// Reject indices outside of the allocated array instead of reading past it.
+	if (face < 0 || face >= m.faceCount || mipLevel < 0 || mipLevel >= m.mipmapCount)
+	{
+		return false;
+	}
+	
+	Private::Image & img = m.images[face * m.mipmapCount + mipLevel];
+	
+	if (img.width != width || img.height != height || img.depth != depth || img.mipLevel != mipLevel || img.face != face)
+	{
+		// Invalid dimension or index.
+		return false;
+	}
+	
+	// Drop pixels stored by an earlier call so that they are not leaked.
+	delete img.data;
+	img.data = NULL;
+	
+	img.data = new nv::Image();
+	img.data->allocate(width, height);
+	memcpy(img.data->pixels(), data, size_t(width) * size_t(height) * 4);
+	
+	return true;
+}
+
+
+/// Describe the input: its pixel format and whether the alpha channel is
+/// used for transparency.
+void InputOptions::setFormat(InputFormat format, bool alphaTransparency)
+{
+	m.alphaTransparency = alphaTransparency;
+	m.inputFormat = format;
+}
+
+
+/// Store the gamma values of the input and of the output.
+void InputOptions::setGamma(float inputGamma, float outputGamma)
+{
+	m.outputGamma = outputGamma;
+	m.inputGamma = inputGamma;
+}
+
+
+/// Set texture wrapping mode.
+void InputOptions::setWrapMode(WrapMode mode)
+{
+	m.wrapMode = mode;
+}
+
+
+/// Configure mipmap generation: whether mipmaps are generated, the filter
+/// to use and the maximum level (stored as given; reset() uses -1).
+void InputOptions::setMipmapping(bool generateMipmaps, MipmapFilter filter/*= MipmapFilter_Kaiser*/, int maxLevel/*= -1*/)
+{
+	m.maxLevel = maxLevel;
+	m.mipmapFilter = filter;
+	m.generateMipmaps = generateMipmaps;
+}
+
+
+/// Set quantization options.
+/// @warning Do not enable dithering unless you know what you are doing.
+/// Quantization introduces errors. It is better to let the compressor
+/// quantize the result to minimize the error, instead of quantizing the data
+/// before handing it to the compressor.
+void InputOptions::setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold/*= 127*/)
+{
+	// Binary alpha and its reference value.
+	m.alphaThreshold = alphaThreshold;
+	m.binaryAlpha = binaryAlpha;
+
+	// Dithering switches.
+	m.enableAlphaDithering = alphaDithering;
+	m.enableColorDithering = colorDithering;
+}
+
+
+/// Enable or disable normal map conversion.
+/// Only stores the flag; the conversion itself is not performed here.
+void InputOptions::setConvertToNormalMap(bool convert)
+{
+	m.convertToNormalMap = convert;
+}
+
+/// Set height evaluation factors.
+/// Stores one scale per channel (red, green, blue, alpha) exactly as given;
+/// unlike setNormalFilter() the factors are not normalized.
+void InputOptions::setHeightEvaluation(float redScale, float greenScale, float blueScale, float alphaScale)
+{
+	// Do not normalize height factors.
+//	float total = redScale + greenScale + blueScale + alphaScale;
+	m.heightFactors = Vector4(redScale, greenScale, blueScale, alphaScale);
+}
+
+/// Set normal map conversion filter.
+/// The four weights are normalized so that they sum to one before being
+/// stored. When the weights sum to exactly zero the division would yield
+/// inf/NaN components, so in that case they are stored as given.
+void InputOptions::setNormalFilter(float small, float medium, float big, float large)
+{
+	const float total = small + medium + big + large;
+
+	Vector4 weights(small, medium, big, large);
+	if (total != 0.0f)
+	{
+		weights = weights / total;
+	}
+
+	m.bumpFrequencyScale = weights;
+}
+
+/// Enable or disable mipmap normalization.
+/// Only stores the flag; nothing is normalized here.
+void InputOptions::setNormalizeMipmaps(bool normalize)
+{
+	m.normalizeMipmaps = normalize;
+}
--- a/src/nvimage/nvtt/InputOptions.h
+++ b/src/nvimage/nvtt/InputOptions.h
@ -0,0 +1,91 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_INPUTOPTIONS_H
+#define NV_TT_INPUTOPTIONS_H
+
+#include <nvmath/Vector.h>
+#include <nvimage/Image.h>
+#include "nvtt.h"
+
+namespace nvtt
+{
+
+	// Private implementation of InputOptions: the texture layout plus all the
+	// user configurable input options.
+	struct InputOptions::Private
+	{
+		// Start with an empty layout. The counters are zeroed as well so that
+		// they never hold indeterminate values before a layout is set. The
+		// remaining options are assigned by InputOptions::reset().
+		Private() : faceCount(0), mipmapCount(0), imageCount(0), images(NULL) {}
+
+		WrapMode wrapMode;
+		TextureType textureType;
+		InputFormat inputFormat;
+		
+		// Texture layout; imageCount is faceCount * mipmapCount.
+		int faceCount;
+		int mipmapCount;
+		int imageCount;
+		
+		struct Image;
+		Image * images;				// array of imageCount elements, or NULL.
+
+		// Quantization.
+		bool enableColorDithering;
+		bool enableAlphaDithering;
+		bool binaryAlpha;
+		int alphaThreshold;			// reference value used for binary alpha quantization.
+
+		bool alphaTransparency;	// set to true if alpha is used for transparency.
+		
+		// Gamma conversion.
+		float inputGamma;
+		float outputGamma;
+		
+		// Mipmap generation options.
+		bool generateMipmaps;
+		int maxLevel;
+		MipmapFilter mipmapFilter;
+		
+		// Normal map options.
+		bool normalizeMipmaps;
+		bool convertToNormalMap;
+		nv::Vector4 heightFactors;
+		nv::Vector4 bumpFrequencyScale;
+	};
+
+	// Internal image structure.
+	// Describes one face/mipmap level of the input texture and owns its pixel
+	// data. NOTE(review): the struct has an owning raw pointer but relies on
+	// the implicit copy operations, so instances must not be copied.
+	struct InputOptions::Private::Image
+	{
+		// Initialize every member; in particular `data` must be NULL because
+		// the destructor deletes it unconditionally.
+		Image() : mipLevel(0), face(0), width(0), height(0), depth(0), data(NULL) {}
+		~Image() { delete data; }
+		
+		int mipLevel;
+		int face;
+		
+		int width;
+		int height;
+		int depth;
+		
+		nv::Image * data;	// owned; NULL until pixel data is provided.
+	};
+
+} // nvtt namespace
+
+#endif // NV_TT_INPUTOPTIONS_H
--- a/src/nvimage/nvtt/OutputOptions.cpp
+++ b/src/nvimage/nvtt/OutputOptions.cpp
@ -0,0 +1,32 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include "nvtt.h"
+
+using namespace nvtt;
+
+/// Set default output options.
+/// Currently a no-op: there are no output options to restore yet.
+void OutputOptions::reset()
+{
+	// endianness = native...
+}
--- a/src/nvimage/nvtt/cmdline.h
+++ b/src/nvimage/nvtt/cmdline.h
@ -0,0 +1,44 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+
+#ifndef CMDLINE_H
+#define CMDLINE_H
+
+#include <nvcore/Debug.h>
+
+#include <stdarg.h>
+
+// Message handler that prints every message to stderr.
+// It installs itself as the nv::debug message handler on construction and
+// resets the handler on destruction.
+struct MyMessageHandler : public nv::MessageHandler {
+	MyMessageHandler() {
+		nv::debug::setMessageHandler( this );
+	}
+	~MyMessageHandler() {
+		nv::debug::resetMessageHandler();
+	}
+
+	// Print the printf-style message `str` with the arguments in `arg`.
+	virtual void log( const char * str, va_list arg ) {
+		// Format from a private copy so that the caller's va_list is left
+		// untouched. The copy used to be made but the original was consumed.
+		va_list val;
+		va_copy(val, arg);
+		vfprintf(stderr, str, val);
+		va_end(val);
+	}
+};
+
+
+// Assert handler that reports the failed assertion on stderr and terminates
+// the process. It installs itself as the nv::debug assert handler on
+// construction and resets the handler on destruction.
+struct MyAssertHandler : public nv::AssertHandler {
+	MyAssertHandler() {
+		nv::debug::setAssertHandler( this );
+	}
+	~MyAssertHandler() {
+		nv::debug::resetAssertHandler();
+	}
+	
+	// Handler method, note that func might be NULL!
+	// `func` is not used by this implementation.
+	virtual int assert( const char *exp, const char *file, int line, const char *func ) {
+		fprintf(stderr, "Assertion failed: %s\nIn %s:%d\n", exp, file, line);
+		nv::debug::dumpInfo();
+		exit(1);
+		
+		// Never reached; keeps compilers that do not treat exit() as
+		// non-returning from complaining about a missing return value.
+		return 0;
+	}
+};
+
+
+#endif // CMDLINE_H
--- a/src/nvimage/nvtt/compress.cpp
+++ b/src/nvimage/nvtt/compress.cpp
@ -0,0 +1,354 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/StrLib.h>
+#include <nvcore/StdStream.h>
+
+#include <nvimage/Image.h>
+#include <nvimage/nvtt/nvtt.h>
+
+#include "cmdline.h"
+
+#include <time.h> // clock
+
+// Output handler that writes the compressed data to a file and prints the
+// progress as a percentage of the total set with setTotal().
+struct MyOutputHandler : public nvtt::OutputHandler
+{
+	MyOutputHandler() : total(0), progress(0), percentage(0), stream(NULL) {}
+	MyOutputHandler(const char * name) : total(0), progress(0), percentage(0), stream(new nv::StdOutputStream(name)) {}
+	virtual ~MyOutputHandler() { delete stream; }
+	
+	// Open `name` for writing and restart the progress counters.
+	// Returns false (after printing a message) if the stream reports an error.
+	bool open(const char * name)
+	{
+		// Release a stream opened earlier so that it is not leaked.
+		delete stream;
+		stream = new nv::StdOutputStream(name);
+		percentage = progress = 0;
+		if (stream->isError()) {
+			printf("Error opening '%s' for writting\n", name);
+			return false;
+		}
+		return true;
+	}
+	
+	// Set the number of bytes that are expected to be written in total.
+	virtual void setTotal(int t)
+	{
+		total = t;
+	}
+
+	virtual void mipmap(int size, int width, int height, int depth, int face, int miplevel)
+	{
+		// ignore.
+	}
+	
+	// Output data.
+	virtual void writeData(const void * data, int size)
+	{
+		nvDebugCheck(stream != NULL);
+		stream->serialize(const_cast<void *>(data), size);
+
+		progress += size;
+		
+		// Compute the percentage in double precision: '100 * progress'
+		// overflows an int once about 21 MB have been written. Without a
+		// known total there is nothing to divide by, so report 0%.
+		int p = 0;
+		if (total > 0)
+		{
+			p = int((100.0 * progress) / total);
+		}
+		
+		if (p != percentage)
+		{
+			percentage = p;
+			printf("\r%d%%", percentage);
+			fflush(stdout);
+		}
+	}
+	
+	int total;			// expected number of bytes.
+	int progress;		// bytes written so far.
+	int percentage;		// last percentage printed.
+	nv::StdOutputStream * stream;	// owned output stream, may be NULL.
+};
+
+// Error handler that breaks into the debugger (nvDebugBreak) when an error
+// is reported; the error code itself is ignored.
+struct MyErrorHandler : public nvtt::ErrorHandler
+{
+	virtual void error(nvtt::Error e)
+	{
+		nvDebugBreak();
+	}
+};
+
+
+
+
+// Configure the input options for converting a color image to a normal map:
+// no gamma change (1.0 in, 1.0 out), normalized mipmaps, conversion enabled
+// and the height taken as the average of the red, green and blue channels.
+void setColorToNormalMap(nvtt::InputOptions & inputOptions)
+{
+	const float third = 1.0f/3.0f;
+
+	inputOptions.setGamma(1.0f, 1.0f);
+	inputOptions.setNormalizeMipmaps(true);
+	inputOptions.setHeightEvaluation(third, third, third, 0.0f);
+	//inputOptions.setNormalFilter(1.0f, 0, 0, 0);
+	//inputOptions.setNormalFilter(0.0f, 0, 0, 1);
+	inputOptions.setConvertToNormalMap(true);
+}
+
+// Configure the input options for an image that already is a normal map:
+// no conversion, no gamma change (1.0 in, 1.0 out) and normalized mipmaps.
+void setNormalMap(nvtt::InputOptions & inputOptions)
+{
+	inputOptions.setNormalizeMipmaps(true);
+	inputOptions.setGamma(1.0f, 1.0f);
+	inputOptions.setConvertToNormalMap(false);
+}
+
+// Configure the input options for a regular color image: no normal map
+// conversion, gamma 2.2 for input and output, mipmaps not normalized.
+void setColorMap(nvtt::InputOptions & inputOptions)
+{
+	inputOptions.setNormalizeMipmaps(false);
+	inputOptions.setGamma(2.2f, 2.2f);
+	inputOptions.setConvertToNormalMap(false);
+}
+
+
+
+// Entry point of the nvcompress command line tool.
+// Parses the options, loads the input image, configures the input and
+// compression options accordingly and compresses the image to the output
+// file, printing the time taken. Returns 1 if no input file was given (after
+// printing the usage), if the image cannot be loaded or if the output file
+// cannot be opened; returns 0 otherwise.
+int main(int argc, char *argv[])
+{
+	// Install the handlers for the lifetime of main.
+	MyAssertHandler assertHandler;
+	MyMessageHandler messageHandler;
+
+	bool normal = false;
+	bool color2normal = false;
+	bool wrapRepeat = false;
+	bool noMipmaps = false;
+	bool fast = false;
+	bool nocuda = false;
+	nvtt::Format format = nvtt::Format_BC1;
+
+	const char * externalCompressor = NULL;
+
+	nv::Path input;
+	nv::Path output;
+
+
+	// Parse arguments.
+	// Arguments starting with '-' that match none of the options below are
+	// silently ignored. Parsing stops at the first argument that does not
+	// start with '-', which is taken as the input file.
+	for (int i = 1; i < argc; i++)
+	{
+		// Input options.
+		if (strcmp("-color", argv[i]) == 0)
+		{
+			// Default, nothing to do.
+		}
+		else if (strcmp("-normal", argv[i]) == 0)
+		{
+			normal = true;
+		}
+		else if (strcmp("-tonormal", argv[i]) == 0)
+		{
+			color2normal = true;
+		}
+		else if (strcmp("-clamp", argv[i]) == 0)
+		{
+			// Default, nothing to do.
+		}
+		else if (strcmp("-repeat", argv[i]) == 0)
+		{
+			wrapRepeat = true;
+		}
+		else if (strcmp("-nomips", argv[i]) == 0)
+		{
+			noMipmaps = true;
+		}
+
+		// Compression options.
+		else if (strcmp("-fast", argv[i]) == 0)
+		{
+			fast = true;
+		}
+		else if (strcmp("-nocuda", argv[i]) == 0)
+		{
+			nocuda = true;
+		}
+		else if (strcmp("-rgb", argv[i]) == 0)
+		{
+			format = nvtt::Format_RGB;
+		}
+		else if (strcmp("-bc1", argv[i]) == 0)
+		{
+			format = nvtt::Format_BC1;
+		}
+		else if (strcmp("-bc2", argv[i]) == 0)
+		{
+			format = nvtt::Format_BC2;
+		}
+		else if (strcmp("-bc3", argv[i]) == 0)
+		{
+			format = nvtt::Format_BC3;
+		}
+		else if (strcmp("-bc3n", argv[i]) == 0)
+		{
+			format = nvtt::Format_BC3n;
+		}
+		else if (strcmp("-bc4", argv[i]) == 0)
+		{
+			format = nvtt::Format_BC4;
+		}
+		else if (strcmp("-bc5", argv[i]) == 0)
+		{
+			format = nvtt::Format_BC5;
+		}
+
+		// Undocumented option. Mainly used for testing.
+		else if (strcmp("-ext", argv[i]) == 0)
+		{
+			// The next argument names the external compressor; "-ext" is
+			// ignored when no such argument follows.
+			if (i+1 < argc && argv[i+1][0] != '-') {
+				externalCompressor = argv[i+1];
+				printf("using %s\n", argv[i+1]);
+				i++;
+			}
+		}
+
+		else if (argv[i][0] != '-')
+		{
+			input = argv[i];
+
+			// An optional output name follows the input; otherwise the
+			// output is the input name with its extension replaced by ".dds".
+			if (i+1 < argc && argv[i+1][0] != '-') {
+				output = argv[i+1];
+			}
+			else
+			{
+				output.copy(input.str());
+				output.stripExtension();
+				output.append(".dds");
+			}
+
+			break;
+		}
+	}
+
+	// Without an input file print the usage and give up.
+	if (input.empty())
+	{
+		printf("NVIDIA Texture Tools - Copyright NVIDIA Corporation 2007\n\n");
+		
+		printf("usage: nvcompress [options] infile [outfile]\n\n");
+		
+		printf("Input options:\n");
+		printf("  -color   \tThe input image is a color map (default).\n");
+		printf("  -normal  \tThe input image is a normal map.\n");
+		printf("  -tonormal\tConvert input to normal map.\n");
+		printf("  -clamp   \tClamp wrapping mode (default).\n");
+		printf("  -repeat  \tRepeat wrapping mode.\n");
+		printf("  -nomips  \tDisable mipmap generation.\n\n");
+
+		printf("Compression options:\n");
+		printf("  -fast    \tFast compression.\n");
+		printf("  -nocuda  \tDo not use cuda compressor.\n");
+		printf("  -rgb     \tRGBA format\n");
+		printf("  -bc1     \tBC1 format (DXT1)\n");
+		printf("  -bc2     \tBC2 format (DXT3)\n");
+		printf("  -bc3     \tBC3 format (DXT5)\n");
+		printf("  -bc3n    \tBC3 normal map format (DXT5n/RXGB)\n");
+		printf("  -bc4     \tBC4 format (ATI1)\n");
+		printf("  -bc5     \tBC5 format (3Dc/ATI2)\n\n");
+		
+		return 1;
+	}
+
+	nv::Image image;
+	if (!image.load(input))
+	{
+		printf("The file '%s' is not a supported image type.\n", input.str());
+		return 1;
+	}
+
+
+	// The output stream is opened by the handler's constructor.
+	MyErrorHandler errorHandler;
+	MyOutputHandler outputHandler(output);
+	if (outputHandler.stream->isError())
+	{
+		printf("Error opening '%s' for writting\n", output.str());
+		return 1;
+	}
+
+	// Set input options.
+	nvtt::InputOptions inputOptions;
+	inputOptions.setTextureLayout(nvtt::TextureType_2D, image.width(), image.height());
+	inputOptions.setMipmapData(image.pixels(), image.width(), image.height());
+
+	// Mipmaps are enabled here; "-nomips" disables them again further down.
+	if (fast)
+	{
+		inputOptions.setMipmapping(true, nvtt::MipmapFilter_Box);
+	}
+	else
+	{
+		inputOptions.setMipmapping(true, nvtt::MipmapFilter_Kaiser);
+	}
+
+	if (wrapRepeat)
+	{
+		inputOptions.setWrapMode(nvtt::WrapMode_Repeat);
+	}
+	else
+	{
+		inputOptions.setWrapMode(nvtt::WrapMode_Clamp);
+	}
+
+	// "-normal" takes precedence over "-tonormal" when both are given.
+	if (normal)
+	{
+		setNormalMap(inputOptions);
+	}
+	else if (color2normal)
+	{
+		setColorToNormalMap(inputOptions);
+	}
+	else
+	{
+		setColorMap(inputOptions);
+	}
+	
+	if (noMipmaps)
+	{
+		inputOptions.setMipmapping(false);
+	}
+	
+
+	// Set compression options.
+	nvtt::CompressionOptions compressionOptions;
+	compressionOptions.setFormat(format);
+	if (fast)
+	{
+		compressionOptions.setQuality(nvtt::Quality_Fastest);
+	}
+	else
+	{
+		compressionOptions.setQuality(nvtt::Quality_Normal);
+		//compressionOptions.setQuality(nvtt::Quality_Production, 0.5f);
+		//compressionOptions.setQuality(nvtt::Quality_Highest);
+	}
+	compressionOptions.enableHardwareCompression(!nocuda);
+	compressionOptions.setColorWeights(1, 1, 1);
+
+	if (externalCompressor != NULL)
+	{
+		compressionOptions.setExternalCompressor(externalCompressor);
+	}
+
+	// The estimated output size is the total used for the progress display.
+	outputHandler.setTotal(nvtt::estimateSize(inputOptions, compressionOptions));
+
+	nvtt::OutputOptions outputOptions(&outputHandler, &errorHandler);
+	
+	// Measure the processor time used by the compression.
+	clock_t start = clock();
+
+	nvtt::compress(inputOptions, outputOptions, compressionOptions);
+
+	clock_t end = clock();
+	printf("\rtime taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
+	
+	return 0;
+}
+
--- a/src/nvimage/nvtt/cuda/CompressKernel.cu
+++ b/src/nvimage/nvtt/cuda/CompressKernel.cu
@ -0,0 +1,481 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include "CudaMath.h"
+
+#define THREAD_NUM 64		// Number of threads per block.
+
+#if __DEVICE_EMULATION__
+#define __debugsync() __syncthreads()
+#else
+#define __debugsync()
+#endif
+
+typedef unsigned short ushort;
+typedef unsigned int uint;
+
+// Device-side swap: exchanges the values of a and b through a temporary copy.
+template <class T> 
+__device__ inline void swap(T & a, T & b)
+{
+	T tmp = a;
+	a = b;
+	b = tmp;
+}
+
+// Per-channel weights applied to the squared error (see the dot() at the end
+// of evalPermutation3/4). The host overwrites them in compressKernel() with
+// cudaMemcpyToSymbol before each launch.
+__constant__ float3 kColorMetric = { 1.0f, 1.0f, 1.0f };
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Round color to RGB565 and expand
+////////////////////////////////////////////////////////////////////////////////
+// Quantizes `v` to 5/6/5 bits per channel and returns the quantized color
+// expanded back to floating point.
+//   v  Color to quantize; each channel is saturated before scaling.
+//   w  Receives the packed 16 bit color: x in bits 11-15, y in bits 5-10 and
+//      z in bits 0-4.
+inline __device__ float3 roundAndExpand(float3 v, ushort * w)
+{
+    // Scale to the integer range of each channel (31 or 63) and round.
+    v.x = rintf(__saturatef(v.x) * 31.0f);
+    v.y = rintf(__saturatef(v.y) * 63.0f);
+    v.z = rintf(__saturatef(v.z) * 31.0f);
+    *w = ((ushort)v.x << 11) | ((ushort)v.y << 5) | (ushort)v.z;
+    v.x *= 0.03227752766457f; // approximate integer bit expansion.
+    v.y *= 0.01583151765563f;
+    v.z *= 0.03227752766457f;
+    return v;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Evaluate permutations
+////////////////////////////////////////////////////////////////////////////////
+// Evaluates one index assignment ("permutation") for a block of 16 colors
+// using four interpolation weights per texel.
+//   colors       The 16 colors of the block.
+//   permutation  2 bits per color, color i uses bits 2*i and 2*i+1. The codes
+//                0, 1, 2 and 3 select beta = 0, 1, 1/3 and 2/3 respectively,
+//                with alpha = 1 - beta.
+//   start, end   Receive the packed 5-6-5 endpoints a and b.
+// Returns the weighted squared error of approximating every color by
+// alpha * a + beta * b, without the term that only depends on the colors
+// (it is the same for every permutation of a block).
+static __device__ float evalPermutation4(const float3 * colors, uint permutation, ushort * start, ushort * end)
+{
+    // Compute endpoints using least squares.
+    float alpha2_sum = 0.0f;
+    float beta2_sum = 0.0f;
+    float alphabeta_sum = 0.0f;
+    float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
+    float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f);
+
+    // Compute alpha & beta for this permutation.
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
+
+        // Bit 0 selects the endpoint, bit 1 moves the weight to 1/3 or 2/3.
+        float beta = (bits & 1);
+        if (bits & 2) beta = (1 + beta) / 3.0f;
+        float alpha = 1.0f - beta;
+    
+        alpha2_sum += alpha * alpha;
+        beta2_sum += beta * beta;
+        alphabeta_sum += alpha * beta;
+        alphax_sum += alpha * colors[i];
+        betax_sum += beta * colors[i];
+    }
+
+    // alpha2, beta2, alphabeta and factor could be precomputed for each permutation, but it's faster to recompute them.
+    // NOTE(review): the determinant is zero when every color uses the same
+    // weight, which makes `factor` infinite - confirm that such permutations
+    // are handled as intended.
+    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+
+    // Solution of the 2x2 normal equations.
+    float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
+    float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
+    
+    // Round a, b to the closest 5-6-5 color and expand...
+    a = roundAndExpand(a, start);
+    b = roundAndExpand(b, end);
+
+    // compute the error
+    float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+
+    // Weight the error of each channel.
+    return dot(e, kColorMetric);
+}
+
+
+// Evaluates one index assignment ("permutation") for a block of 16 colors
+// using three interpolation weights per texel.
+//   colors       The 16 colors of the block.
+//   permutation  2 bits per color, color i uses bits 2*i and 2*i+1. The codes
+//                0 and 1 select beta = 0 and 1; codes with bit 1 set select
+//                beta = 1/2. alpha is always 1 - beta.
+//   start, end   Receive the packed 5-6-5 endpoints a and b.
+// Returns the weighted squared error of approximating every color by
+// alpha * a + beta * b, without the term that only depends on the colors
+// (it is the same for every permutation of a block).
+static __device__ float evalPermutation3(const float3 * colors, uint permutation, ushort * start, ushort * end)
+{
+    // Compute endpoints using least squares.
+    float alpha2_sum = 0.0f;
+    float beta2_sum = 0.0f;
+    float alphabeta_sum = 0.0f;
+    float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
+    float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f);
+
+    // Compute alpha & beta for this permutation.
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
+
+        // Bit 0 selects the endpoint, bit 1 selects the midpoint instead.
+        float beta = (bits & 1);
+        if (bits & 2) beta = 0.5f;
+        float alpha = 1.0f - beta;
+    
+        alpha2_sum += alpha * alpha;
+        beta2_sum += beta * beta;
+        alphabeta_sum += alpha * beta;
+        alphax_sum += alpha * colors[i];
+        betax_sum += beta * colors[i];
+    }
+
+    // NOTE(review): the determinant is zero when every color uses the same
+    // weight, which makes `factor` infinite - confirm that such permutations
+    // are handled as intended.
+    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+
+    // Solution of the 2x2 normal equations.
+    float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
+    float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
+    
+    // Round a, b to the closest 5-6-5 color and expand...
+    a = roundAndExpand(a, start);
+    b = roundAndExpand(b, end);
+
+    // compute the error
+    float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+
+    // Weight the error of each channel.
+    return dot(e, kColorMetric);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Sort colors
+////////////////////////////////////////////////////////////////////////////////
+// Sorts the 16 colors of a block by increasing `values`.
+//   values  Sort key of every color. Reordered in place by the emulation
+//           path only; the parallel path leaves it untouched and looks the
+//           keys up through xrefs.
+//   colors  Colors to sort, reordered in place.
+//   xrefs   Receives, for every sorted position, the original index of the
+//           color that ended up there.
+// The parallel path indexes the arrays with threadIdx.x, so it expects to be
+// run by threads 0..15 only (the caller guards the call with idx < 16).
+__device__ void sortColors(float * values, float3 * colors, int * xrefs)
+{
+#if __DEVICE_EMULATION__
+
+    // A single thread sorts the whole block.
+    if (threadIdx.x == 0) 
+    {
+        for( int i = 0; i < 16; ++i )
+        {
+			xrefs[i] = i;
+		}
+        
+        // Use a sequential sort on emulation.
+        // (insertion sort of the keys, xrefs follows the keys)
+        for( int i = 0; i < 16; ++i )
+        {
+            for( int j = i; j > 0 && values[j] < values[j - 1]; --j )
+            {
+                swap( values[j], values[j - 1] );
+                swap( xrefs[j], xrefs[j - 1] );
+            //    swap( colors[j], colors[j - 1] );
+            }
+        }
+        
+        // Reorder the colors through a temporary copy.
+        float3 tmp[16];
+        for( int i = 0; i < 16; ++i ) 
+        {
+			tmp[i] = colors[i];
+		}
+        
+        for( int i = 0; i < 16; ++i )
+        {
+            int xid = xrefs[i];
+            colors[i] = tmp[xid];
+        }
+    }
+
+#else
+    int tid = threadIdx.x;
+
+	xrefs[tid] = tid;
+
+    // Parallel bitonic sort.
+    // NOTE(review): there is no barrier between the steps; this presumably
+    // relies on the participating threads executing in lockstep - confirm.
+    for (int k = 2; k <= 16; k *= 2)
+    {
+        // bitonic merge:
+        for (int j = k / 2; j>0; j /= 2)
+        {
+            int ixj = tid ^ j;
+            
+            // Only the lower thread of every pair compares and swaps.
+            if (ixj > tid) {
+                // @@ Optimize these branches.
+                if ((tid & k) == 0) {
+                    // Ascending half.
+                    if (values[xrefs[tid]] > values[xrefs[ixj]]) {
+                    //    swap(values[tid], values[ixj]);
+                        swap(colors[tid], colors[ixj]);
+                        swap(xrefs[tid], xrefs[ixj]);
+                    }
+                }
+                else {
+                    // Descending half.
+                    if (values[xrefs[tid]] < values[xrefs[ixj]]) {
+                    //    swap(values[tid], values[ixj]);
+                        swap(colors[tid], colors[ixj]);
+                        swap(xrefs[tid], xrefs[ixj]);
+                    }
+                }
+            }
+        }
+    }
+#endif
+
+    // It would be faster to avoid color swaps during the sort, but there
+    // are compiler bugs preventing that.
+    // (the disabled code below uses `tid`, which only exists in the parallel
+    // path above)
+#if 0
+	float3 tmp = colors[xrefs[tid]];
+    colors[tid] = tmp;
+#endif
+}
+
+// This sort is faster, but does not sort correctly elements with the same value.
+// Every thread counts how many of the 16 values are smaller than its own;
+// that count is the sorted position of its color. Equal values get the same
+// count and therefore collide on the same position.
+//   values  Sort key of every color (read only).
+//   colors  Colors to sort, reordered in place.
+//   cmp     Scratch array; receives the sorted position of every color.
+// NOTE(review): colors[tid] is read and colors[cmp[tid]] is written without
+// a barrier in between; this presumably relies on the participating threads
+// executing in lockstep - confirm.
+__device__ void sortColors2(float * values, float3 * colors, int * cmp)
+{
+	int tid = threadIdx.x;
+
+	// Rank of this thread's value, unrolled by hand over the 16 values.
+	cmp[tid] = (values[0] < values[tid]);
+	cmp[tid] += (values[1] < values[tid]);
+	cmp[tid] += (values[2] < values[tid]);
+	cmp[tid] += (values[3] < values[tid]);
+	cmp[tid] += (values[4] < values[tid]);
+	cmp[tid] += (values[5] < values[tid]);
+	cmp[tid] += (values[6] < values[tid]);
+	cmp[tid] += (values[7] < values[tid]);
+	cmp[tid] += (values[8] < values[tid]);
+	cmp[tid] += (values[9] < values[tid]);
+	cmp[tid] += (values[10] < values[tid]);
+	cmp[tid] += (values[11] < values[tid]);
+	cmp[tid] += (values[12] < values[tid]);
+	cmp[tid] += (values[13] < values[tid]);
+	cmp[tid] += (values[14] < values[tid]);
+	cmp[tid] += (values[15] < values[tid]);
+	
+	// Scatter the color to its sorted position.
+	float3 tmp = colors[tid];
+	colors[cmp[tid]] = tmp;
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Find index with minimum error
+////////////////////////////////////////////////////////////////////////////////
+// Parallel reduction that finds the smallest error of the thread block.
+//   errors   THREAD_NUM entries, one error per thread. Overwritten by the
+//            reduction; errors[0] ends up holding the minimum.
+//   indices  THREAD_NUM entries; indices[0] ends up holding the index that
+//            was stored alongside the minimum error.
+// Must be called by every thread of the block because it contains barriers.
+__device__ void minimizeError(float * errors, int * indices)
+{
+	const int idx = threadIdx.x;
+
+#if __DEVICE_EMULATION__
+
+	// Emulation: synchronize before every step of the reduction.
+	for(int d = THREAD_NUM/2; d > 0; d >>= 1)
+	{
+		__syncthreads();
+
+		if (idx < d)
+		{
+			float err0 = errors[idx];
+			float err1 = errors[idx + d];
+			
+			if (err1 < err0) {
+				errors[idx] = err1;
+				indices[idx] = indices[idx + d];
+			}
+		}
+	}
+
+#else
+
+	// Steps that span more than the last 64 entries need a barrier each.
+	// With THREAD_NUM == 64 this loop does not run at all.
+	for(int d = THREAD_NUM/2; d > 32; d >>= 1)
+	{
+		__syncthreads();
+
+		if (idx < d)
+		{
+			float err0 = errors[idx];
+			float err1 = errors[idx + d];
+			
+			if (err1 < err0) {
+				errors[idx] = err1;
+				indices[idx] = indices[idx + d];
+			}
+		}
+	}
+
+	// The first unrolled step reads entries written by threads 32..63, so
+	// all stores have to be complete before it starts. This barrier was
+	// missing, and the loop above provides none when THREAD_NUM == 64.
+	__syncthreads();
+
+	// unroll last 6 steps 
+	// Only threads 0..31 take part: thread 32 would access entry 64, which
+	// is past the end of the arrays when THREAD_NUM == 64.
+	// NOTE(review): the steps below are not separated by barriers and
+	// presumably rely on these 32 threads executing in lockstep - confirm
+	// for the targeted hardware and compiler.
+	if (idx < 32)
+	{
+		if (errors[idx + 32] < errors[idx]) {
+			errors[idx] = errors[idx + 32];
+			indices[idx] = indices[idx + 32];
+		}
+		if (errors[idx + 16] < errors[idx]) {
+			errors[idx] = errors[idx + 16];
+			indices[idx] = indices[idx + 16];
+		}
+		if (errors[idx + 8] < errors[idx]) {
+			errors[idx] = errors[idx + 8];
+			indices[idx] = indices[idx + 8];
+		}
+		if (errors[idx + 4] < errors[idx]) {
+			errors[idx] = errors[idx + 4];
+			indices[idx] = indices[idx + 4];
+		}
+		if (errors[idx + 2] < errors[idx]) {
+			errors[idx] = errors[idx + 2];
+			indices[idx] = indices[idx + 2];
+		}
+		if (errors[idx + 1] < errors[idx]) {
+			errors[idx] = errors[idx + 1];
+			indices[idx] = indices[idx + 1];
+		}
+	}
+#endif
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Compress color block
+////////////////////////////////////////////////////////////////////////////////
+// Compresses one 4x4 block of colors per thread block.
+//   permutations  Table of index assignments to evaluate, 2 bits per texel.
+//                 Entries 0..991 are evaluated with four weights and entries
+//                 0..159 with three weights.
+//   image         Input colors, 16 consecutive 32 bit values per block. The
+//                 bytes at bits 16-23, 8-15 and 0-7 are used as x, y and z.
+//   result        Output, two 32 bit words per block: the packed endpoints
+//                 followed by the 2 bit indices.
+// Must be launched with THREAD_NUM threads per block.
+__global__ void compress(const uint * permutations, const uint * image, uint * result)
+{
+	const int bid = blockIdx.x;
+	const int idx = threadIdx.x;
+	
+	__shared__ float3 colors[16];
+	__shared__ float dps[16];
+	__shared__ int xrefs[16];
+	
+	// Declared outside of the conditional below: the emulation build closes
+	// and reopens that scope, which used to leave `c` undeclared there.
+	uint c = 0;
+	
+	if (idx < 16)
+	{
+		// Read color.
+		c = image[(bid) * 16 + idx];
+	
+		// No need to synchronize, 16 < warp size.
+#if __DEVICE_EMULATION__
+		} __debugsync(); if (idx < 16) {
+#endif
+		
+		// Copy color to shared mem.
+		colors[idx].z = ((c >> 0) & 0xFF) * (1.0f / 255.0f);
+		colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
+		colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
+		
+#if __DEVICE_EMULATION__
+		} __debugsync(); if (idx < 16) {
+#endif
+
+		// Sort colors along the best fit line.
+		float3 axis = bestFitLine(colors);
+		
+		dps[idx] = dot(colors[idx], axis);
+		
+#if __DEVICE_EMULATION__
+		} __debugsync(); if (idx < 16) {
+#endif
+		
+		sortColors(dps, colors, xrefs);
+	}
+	
+	// Initialized so that a thread that never finds an error below FLT_MAX
+	// does not work with indeterminate values.
+	ushort bestStart = 0, bestEnd = 0;
+	uint bestPermutation = 0;
+	float bestError = FLT_MAX;
+	
+	__syncthreads();
+	
+	// Four weight permutations: every thread evaluates up to 16 of them,
+	// the last pass only covers threads 0..31.
+	for(int i = 0; i < 16; i++)
+	{
+		if (i == 15 && idx >= 32) break;
+		
+		ushort start, end;
+		uint permutation = permutations[idx + THREAD_NUM * i];
+		float error = evalPermutation4(colors, permutation, &start, &end);
+		
+		if (error < bestError)
+		{
+			bestError = error;
+			bestPermutation = permutation;
+			bestStart = start;
+			bestEnd = end;
+		}
+	}
+
+	// Order the endpoints so that start >= end, and swap the meaning of the
+	// indices accordingly.
+	if (bestStart < bestEnd)
+	{
+		swap(bestEnd, bestStart);
+		bestPermutation ^= 0x55555555;	// Flip indices.
+	}
+
+	// Three weight permutations: up to 3 per thread, the last pass only
+	// covers threads 0..31.
+	for(int i = 0; i < 3; i++)
+	{
+		if (i == 2 && idx >= 32) break;
+		
+		ushort start, end;
+		uint permutation = permutations[idx + THREAD_NUM * i];
+		float error = evalPermutation3(colors, permutation, &start, &end);
+		
+		if (error < bestError)
+		{
+			bestError = error;
+			bestPermutation = permutation;
+			bestStart = start;
+			bestEnd = end;
+			
+			// Here the endpoints are ordered the other way, start <= end.
+			// Only indices with the upper bit clear are swapped.
+			if (bestStart > bestEnd)
+			{
+				swap(bestEnd, bestStart);
+				bestPermutation ^= (~bestPermutation >> 1) & 0x55555555;	// Flip indices.
+			}
+		}
+	}
+	
+	// Both endpoints are the same, use index 0 for every texel.
+	if (bestStart == bestEnd)
+	{
+		bestPermutation = 0;
+	}
+	
+	__syncthreads();
+	
+	// Use a parallel reduction to find minimum error.
+	__shared__ float errors[THREAD_NUM];
+	__shared__ int indices[THREAD_NUM];
+	
+	errors[idx] = bestError;
+	indices[idx] = idx;
+	
+	minimizeError(errors, indices);
+	
+	__syncthreads();
+	
+	// Only write the result of the winner thread.
+	if (idx == indices[0])
+	{
+		// Reorder permutation.
+		// The indices refer to the sorted colors; xrefs maps every sorted
+		// position back to the original position of the texel.
+		uint perm = 0;
+		for(int i = 0; i < 16; i++)
+		{
+			int ref = xrefs[i];
+			perm |= ((bestPermutation >> (2 * i)) & 3) << (2 * ref);
+		}
+		
+		// Write endpoints. (bestStart, bestEnd)
+		// Shift as unsigned; shifting the promoted int could overflow.
+		result[2 * bid + 0] = ((uint)bestEnd << 16) | bestStart;
+		
+		// Write palette indices (permutation).
+		result[2 * bid + 1] = perm;
+	}
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Launch kernel
+////////////////////////////////////////////////////////////////////////////////
+// Host entry point: uploads the color weights and launches the compress
+// kernel with one thread block of THREAD_NUM threads per 4x4 color block.
+//   blockNum   Number of thread blocks to launch.
+//   d_data     Input colors, passed to the kernel as `image`.
+//   d_result   Output blocks, passed to the kernel as `result`.
+//   d_bitmaps  Permutation table, passed to the kernel as `permutations`.
+//   weights    Three color weights copied to kColorMetric.
+// NOTE(review): neither the copy nor the launch is checked for errors.
+extern "C" void compressKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps, float weights[3])
+{
+	// Set constants.
+	cudaMemcpyToSymbol(kColorMetric, weights, sizeof(float) * 3, 0);
+
+	compress<<<blockNum, THREAD_NUM>>>(d_bitmaps, d_data, d_result);
+}
+
--- a/src/nvimage/nvtt/cuda/CudaCompressDXT.cpp
+++ b/src/nvimage/nvtt/cuda/CudaCompressDXT.cpp
@ -0,0 +1,264 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Debug.h>
+#include <nvcore/Containers.h>
+#include <nvmath/Color.h>
+#include <nvimage/Image.h>
+#include <nvimage/nvtt/CompressionOptions.h>
+
+#include "CudaCompressDXT.h"
+#include "CudaUtils.h"
+
+#if defined HAVE_CUDA
+#include <cuda_runtime.h>
+#endif
+
+using namespace nv;
+using namespace nvtt;
+
+#if defined HAVE_CUDA
+
+extern "C" void compressKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps, float weights[3]);
+
+
+// Device copy of the 1024 precomputed index bitmaps (2 bits per texel).
+// Uploaded once by doPrecomputation() and not freed anywhere in this file.
+// Stays NULL until an upload has succeeded.
+static uint * d_bitmaps = NULL;
+
+/// Pack 16 two-bit indices into one word; entry p lands in bits [2p, 2p+1].
+static uint packIndices(const int indices[16])
+{
+	uint bitmap = 0;
+	for (int p = 0; p < 16; p++) {
+		// Shift as unsigned: 2 << 30 and 3 << 30 do not fit in a signed int.
+		bitmap |= uint(indices[p]) << (p * 2);
+	}
+	return bitmap;
+}
+
+/// Build the table of candidate index bitmaps and upload it to the device.
+///
+/// The table has 1024 entries: 151 bitmaps using the indices {0, 2, 1}
+/// (padded to 160), followed by 815 bitmaps that also use index 3 (padded to
+/// 1024). The function is a no-op once d_bitmaps is set. If the upload fails
+/// the error is logged and d_bitmaps is left NULL, so a later call retries.
+static void doPrecomputation()
+{
+	if (d_bitmaps != NULL) {
+		return;
+	}
+
+	uint bitmaps[1024];
+
+	// The indices array is deliberately carried over between iterations: each
+	// step only overwrites the tail that changes.
+	int indices[16];
+	int num = 0;
+
+	// Compute bitmaps with 3 clusters:
+
+	// first cluster [0,i) is at the start
+	for( int m = 0; m < 16; ++m )
+	{
+		indices[m] = 0;
+	}
+	const int imax = 15;
+	for( int i = imax; i >= 0; --i )
+	{
+		// second cluster [i,j) is half along
+		for( int m = i; m < 16; ++m )
+		{
+			indices[m] = 2;
+		}
+		const int jmax = ( i == 0 ) ? 15 : 16;
+		for( int j = jmax; j >= i; --j )
+		{
+			// last cluster [j,k) is at the end
+			if( j < 16 )
+			{
+				indices[j] = 1;
+			}
+
+			bitmaps[num] = packIndices(indices);
+			num++;
+		}
+	}
+	nvDebugCheck(num == 151);
+
+	// Align to 160.
+	for(int i = 0; i < 9; i++)
+	{
+		bitmaps[num] = 0x000AA555;
+		num++;
+	}
+	nvDebugCheck(num == 160);
+
+	// Append bitmaps with 4 clusters:
+
+	// first cluster [0,i) is at the start
+	for( int m = 0; m < 16; ++m )
+	{
+		indices[m] = 0;
+	}
+	for( int i = imax; i >= 0; --i )
+	{
+		// second cluster [i,j) is one third along
+		for( int m = i; m < 16; ++m )
+		{
+			indices[m] = 2;
+		}
+		const int jmax = ( i == 0 ) ? 15 : 16;
+		for( int j = jmax; j >= i; --j )
+		{
+			// third cluster [j,k) is two thirds along
+			for( int m = j; m < 16; ++m )
+			{
+				indices[m] = 3;
+			}
+
+			int kmax = ( j == 0 ) ? 15 : 16;
+			for( int k = kmax; k >= j; --k )
+			{
+				// last cluster [k,n) is at the end
+				if( k < 16 )
+				{
+					indices[k] = 1;
+				}
+
+				// Only keep bitmaps that really use the fourth index; the
+				// others were already generated by the 3 cluster pass.
+				bool hasThree = false;
+				for(int p = 0; p < 16; p++) {
+					if (indices[p] == 3) hasThree = true;
+				}
+
+				if (hasThree) {
+					bitmaps[num] = packIndices(indices);
+					num++;
+				}
+			}
+		}
+	}
+	nvDebugCheck(num == 975);
+
+	// Align to 1024.
+	for(int i = 0; i < 49; i++)
+	{
+		bitmaps[num] = 0x00AAFF55;
+		num++;
+	}
+
+	nvDebugCheck(num == 1024);
+
+	// Upload bitmaps.
+	uint * deviceTable = NULL;
+	cudaError_t err = cudaMalloc((void**) &deviceTable, 1024 * sizeof(uint));
+	if (err == cudaSuccess)
+	{
+		err = cudaMemcpy(deviceTable, bitmaps, 1024 * sizeof(uint), cudaMemcpyHostToDevice);
+		if (err != cudaSuccess)
+		{
+			// Do not keep a table with undefined contents around.
+			cudaFree(deviceTable);
+			deviceTable = NULL;
+		}
+	}
+
+	if (err != cudaSuccess)
+	{
+		nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
+	}
+
+	// Publish the table only when it has been uploaded completely.
+	d_bitmaps = deviceTable;
+}
+
+#endif
+
+
+/// Compress image using CUDA.
+///
+/// The image is rearranged into 4x4 blocks of 16 pixels (partial edge blocks
+/// repeat their pixels), uploaded to the device in batches of at most 32768
+/// blocks and compressed by compressKernel. Each batch of 8 byte blocks is
+/// passed to outputOptions.outputHandler when one is set.
+///
+/// Failures are reported through outputOptions.errorHandler (when set) and
+/// stop the compression; a batch that failed is not written to the output.
+/// When built without HAVE_CUDA the function only reports Error_CudaError.
+void nv::cudaCompressDXT1(const Image * image, const OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions)
+{
+	nvDebugCheck(cuda::isHardwarePresent());
+#if defined HAVE_CUDA
+
+	doPrecomputation();
+
+	// Image size in blocks.
+	const uint w = (image->width() + 3) / 4;
+	const uint h = (image->height() + 3) / 4;
+
+	const uint blockNum = w * h;
+	const uint imageSize = blockNum * 16 * sizeof(Color32);
+	const uint compressedSize = blockNum * 8;
+	const uint blockMax = 32768; // 65535
+
+	if (blockNum == 0)
+	{
+		// Empty image: nothing to compress and nothing to output.
+		return;
+	}
+
+	uint * blockLinearImage = (uint *) malloc(imageSize);
+	if (blockLinearImage == NULL)
+	{
+		nvDebug("Error: cannot allocate %u bytes for the block linear image\n", imageSize);
+
+		if (outputOptions.errorHandler != NULL)
+		{
+			outputOptions.errorHandler->error(nvtt::Error_Unknown);
+		}
+		return;
+	}
+
+	// Convert linear image to block linear.
+	for(uint by = 0; by < h; by++) {
+		for(uint bx = 0; bx < w; bx++) {
+			// Size of the valid region of this block; smaller than 4 on the
+			// right and bottom borders.
+			const uint bw = min(image->width() - bx * 4, 4U);
+			const uint bh = min(image->height() - by * 4, 4U);
+
+			for (uint i = 0; i < 16; i++) {
+				// Wrap coordinates so that partial blocks repeat valid pixels.
+				const int x = (i & 3) % bw;
+				const int y = (i / 4) % bh;
+				blockLinearImage[(by * w + bx) * 16 + i] = image->pixel(bx * 4 + x, by * 4 + y).u;
+			}
+		}
+	}
+
+	uint * d_data = NULL;
+	uint * d_result = NULL;
+	cudaError_t err = cudaSuccess;
+
+	// The kernel cannot run without the precomputed bitmap table.
+	bool ok = (d_bitmaps != NULL);
+	if (!ok)
+	{
+		nvDebug("CUDA Error: bitmap table is not available\n");
+	}
+
+	// Allocate image in device memory.
+	if (ok)
+	{
+		err = cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
+		ok = (err == cudaSuccess);
+	}
+
+	// Allocate result.
+	if (ok)
+	{
+		err = cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
+		ok = (err == cudaSuccess);
+	}
+
+	// The color weights do not change between batches.
+	float weights[3];
+	weights[0] = compressionOptions.colorWeight.x();
+	weights[1] = compressionOptions.colorWeight.y();
+	weights[2] = compressionOptions.colorWeight.z();
+
+	// TODO: Add support for multiple GPUs.
+	uint bn = 0;
+	while (ok && bn != blockNum)
+	{
+		const uint count = min(blockNum - bn, blockMax);
+
+		err = cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+
+		if (err == cudaSuccess)
+		{
+			// Launch kernel.
+			compressKernel(count, d_data, d_result, d_bitmaps, weights);
+			err = cudaGetLastError();
+		}
+
+		if (err == cudaSuccess)
+		{
+			// Copy result to host, overwrite swizzled image. The results go to
+			// the start of the buffer, which has already been consumed.
+			err = cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
+		}
+
+		if (err != cudaSuccess)
+		{
+			// Do not hand undefined data to the output handler.
+			ok = false;
+			break;
+		}
+
+		// Output result.
+		if (outputOptions.outputHandler != NULL)
+		{
+			outputOptions.outputHandler->writeData(blockLinearImage, count * 8);
+		}
+
+		bn += count;
+	}
+
+	if (!ok)
+	{
+		if (err != cudaSuccess)
+		{
+			nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
+		}
+
+		if (outputOptions.errorHandler != NULL)
+		{
+			outputOptions.errorHandler->error(nvtt::Error_CudaError);
+		}
+	}
+
+	// cudaFree accepts null pointers, so this is safe on every path.
+	free(blockLinearImage);
+	cudaFree(d_data);
+	cudaFree(d_result);
+
+#else
+	if (outputOptions.errorHandler != NULL)
+	{
+		outputOptions.errorHandler->error(Error_CudaError);
+	}
+#endif
+}
+
--- a/src/nvimage/nvtt/cuda/CudaCompressDXT.h
+++ b/src/nvimage/nvtt/cuda/CudaCompressDXT.h
@ -0,0 +1,39 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_CUDACOMPRESSDXT_H
+#define NV_TT_CUDACOMPRESSDXT_H
+
+#include <nvimage/nvimage.h>
+#include <nvimage/nvtt/nvtt.h>
+
+namespace nv
+{
+	class Image;
+
+	/// Compress the given image to DXT1 using CUDA.
+	/// Compressed blocks are passed to outputOptions.outputHandler, errors are
+	/// reported to outputOptions.errorHandler as Error_CudaError. When the
+	/// library is built without HAVE_CUDA the function only reports that error.
+	void cudaCompressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	
+} // nv namespace
+
+
+#endif // NV_TT_CUDACOMPRESSDXT_H
--- a/src/nvimage/nvtt/cuda/CudaMath.h
+++ b/src/nvimage/nvtt/cuda/CudaMath.h
@ -0,0 +1,214 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+// Math functions and operators to be used with vector types.
+
+#ifndef CUDAMATH_H
+#define CUDAMATH_H
+
+#include <float.h>
+#include <math.h>
+
+
+// Component-wise product of two vectors.
+inline __device__ __host__ float3 operator *(float3 a, float3 b)
+{
+    const float px = a.x*b.x;
+    const float py = a.y*b.y;
+    const float pz = a.z*b.z;
+    return make_float3(px, py, pz);
+}
+
+// Scale a vector by a scalar (scalar on the left).
+inline __device__ __host__ float3 operator *(float f, float3 v)
+{
+    const float sx = v.x*f;
+    const float sy = v.y*f;
+    const float sz = v.z*f;
+    return make_float3(sx, sy, sz);
+}
+
+// Scale a vector by a scalar (scalar on the right).
+inline __device__ __host__ float3 operator *(float3 v, float f)
+{
+    // Same computation as the scalar-on-the-left overload.
+    return f * v;
+}
+
+// Component-wise sum of two vectors.
+inline __device__ __host__ float3 operator +(float3 a, float3 b)
+{
+    const float sx = a.x+b.x;
+    const float sy = a.y+b.y;
+    const float sz = a.z+b.z;
+    return make_float3(sx, sy, sz);
+}
+
+// Add a to b in place. Note the operand order: the first argument is modified.
+inline __device__ __host__ void operator +=(float3 & b, float3 a)
+{
+    b = make_float3(b.x + a.x, b.y + a.y, b.z + a.z);
+}
+
+// Component-wise difference of two vectors.
+inline __device__ __host__ float3 operator -(float3 a, float3 b)
+{
+    const float dx = a.x-b.x;
+    const float dy = a.y-b.y;
+    const float dz = a.z-b.z;
+    return make_float3(dx, dy, dz);
+}
+
+// Subtract a from b in place. The first argument is the one modified.
+inline __device__ __host__ void operator -=(float3 & b, float3 a)
+{
+    b = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
+}
+
+// Divide a vector by a scalar. Implemented as a multiplication by the
+// reciprocal, so f == 0 is not handled specially.
+inline __device__ __host__ float3 operator /(float3 v, float f)
+{
+    const float reciprocal = 1.0f / f;
+    return v * reciprocal;
+}
+
+// Divide a vector by a scalar in place, again through the reciprocal.
+inline __device__ __host__ void operator /=(float3 & b, float f)
+{
+    const float reciprocal = 1.0f / f;
+    b = make_float3(b.x * reciprocal, b.y * reciprocal, b.z * reciprocal);
+}
+
+
+// Dot product of two 3 component vectors.
+inline __device__ __host__ float dot(float3 a, float3 b)
+{
+    float sum = a.x * b.x;
+    sum = sum + a.y * b.y;
+    sum = sum + a.z * b.z;
+    return sum;
+}
+
+// Dot product of two 4 component vectors.
+inline __device__ __host__ float dot(float4 a, float4 b)
+{
+    float sum = a.x * b.x;
+    sum = sum + a.y * b.y;
+    sum = sum + a.z * b.z;
+    sum = sum + a.w * b.w;
+    return sum;
+}
+
+// Clamp f to the range [a, b]: first limited from above by b, then from
+// below by a.
+inline __device__ __host__ float clamp(float f, float a, float b)
+{
+    const float limitedAbove = min(f, b);
+    return max(a, limitedAbove);
+}
+
+// Clamp every component of v to the scalar range [a, b].
+inline __device__ __host__ float3 clamp(float3 v, float a, float b)
+{
+    const float cx = clamp(v.x, a, b);
+    const float cy = clamp(v.y, a, b);
+    const float cz = clamp(v.z, a, b);
+    return make_float3(cx, cy, cz);
+}
+
+// Clamp every component of v to the matching component range [a, b].
+inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
+{
+    const float cx = clamp(v.x, a.x, b.x);
+    const float cy = clamp(v.y, a.y, b.y);
+    const float cz = clamp(v.z, a.z, b.z);
+    return make_float3(cx, cy, cz);
+}
+
+
+// Return v scaled to unit length.
+// The scale factor is the reciprocal of the Euclidean length, so the square
+// root of dot(v, v) is required. A zero vector is not handled specially and
+// results in a division by zero.
+inline __device__ __host__ float3 normalize(float3 v)
+{
+    float invLen = 1.0f / sqrtf(dot(v, v));
+    return make_float3(v.x * invLen, v.y * invLen, v.z * invLen);
+}
+
+
+
+
+// Use power method to find the first eigenvector.
+// http://www.miislita.com/information-retrieval-tutorial/matrix-tutorial-3-eigenvalues-eigenvectors.html
+//
+// matrix holds the six distinct entries of a symmetric 3x3 matrix, indexed as
+//   [0] [1] [2]
+//   [1] [3] [4]
+//   [2] [4] [5]
+// The returned vector is rescaled by its largest component on every step; it
+// is not normalized to unit length.
+inline __device__ __host__ float3 firstEigenVector( float matrix[6] )
+{
+    // 8 iterations seems to be more than enough.
+    const int iterationCount = 8;
+
+    float3 v = make_float3(1.0f, 1.0f, 1.0f);
+    for(int iteration = 0; iteration < iterationCount; iteration++) {
+        // Multiply the current estimate by the matrix.
+        const float mx = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
+        const float my = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
+        const float mz = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
+
+        // Rescale by the largest component.
+        const float largest = max(max(mx, my), mz);
+        float scale = 1.0f / largest;
+        // The zero guard is only compiled for device emulation builds.
+        #if __DEVICE_EMULATION__
+        if (largest == 0.0f) scale = 0.0f;
+        #endif
+        v = make_float3(mx*scale, my*scale, mz*scale);
+    }
+
+    return v;
+}
+
+// Compute the direction of the best fit line through 16 colors: builds the
+// (unnormalized) covariance sums of the colors around their mean and returns
+// the result of firstEigenVector() on them.
+//
+// Two implementations are selected at compile time:
+//  - with __DEVICE_EMULATION__ a single thread loops over all 16 colors;
+//  - otherwise the work is spread over threads indexed by threadIdx.x using
+//    shared memory.
+// NOTE(review): the shared arrays hold 16 entries, so the parallel path
+// presumably expects to run only for threadIdx.x < 16 -- confirm at the call
+// site.
+inline __device__ float3 bestFitLine(const float3 * colors)
+{
+#if __DEVICE_EMULATION__
+
+    // Compute covariance matrix of the given colors.
+    float3 center = make_float3(0.0f, 0.0f, 0.0f);
+    for (int i = 0; i < 16; i++)
+    {
+        center += colors[i];
+    }
+    center /= 16.0f;
+
+    // Six distinct entries of the symmetric matrix: xx, xy, xz, yy, yz, zz.
+    float covariance[6] = {0, 0, 0, 0, 0, 0};
+    for (int i = 0; i < 16; i++)
+    {
+        float3 a = colors[i] - center;
+        covariance[0] += a.x * a.x;
+        covariance[1] += a.x * a.y;
+        covariance[2] += a.x * a.z;
+        covariance[3] += a.y * a.y;
+        covariance[4] += a.y * a.z;
+        covariance[5] += a.z * a.z;
+    }
+
+#else
+
+    const int idx = threadIdx.x;
+
+    __shared__ float3 colorSum[16];
+    colorSum[idx] = colors[idx];
+
+    // Unrolled parallel reduction.
+    // NOTE(review): there is no __syncthreads() between the steps below, so
+    // this presumably relies on the participating threads executing in
+    // lockstep -- confirm.
+    if (idx < 8) {
+        colorSum[idx] += colorSum[idx + 8];
+        colorSum[idx] += colorSum[idx + 4];
+        colorSum[idx] += colorSum[idx + 2];
+        colorSum[idx] += colorSum[idx + 1];
+    }
+
+    // @@ Eliminate two-way bank conflicts here.
+    // @@ It seems that doing that and unrolling the reduction doesn't help...
+    __shared__ float covariance[16*6];
+    // colorSum[0] is used as the sum of all colors; each slot is then reused
+    // to hold the offset of this thread's color from the mean.
+    colorSum[idx] = colors[idx] - colorSum[0] / 16.0f;
+    
+    // Per-thread products, six consecutive floats per thread.
+    covariance[6 * idx + 0] = colorSum[idx].x * colorSum[idx].x;    // 0, 6, 12, 2, 8, 14, 4, 10, 0
+    covariance[6 * idx + 1] = colorSum[idx].x * colorSum[idx].y;
+    covariance[6 * idx + 2] = colorSum[idx].x * colorSum[idx].z;
+    covariance[6 * idx + 3] = colorSum[idx].y * colorSum[idx].y;
+    covariance[6 * idx + 4] = colorSum[idx].y * colorSum[idx].z;
+    covariance[6 * idx + 5] = colorSum[idx].z * colorSum[idx].z;
+
+    // Reduction that halves the number of active slots on every step, adding
+    // slot idx+d into slot idx.
+    for(int d = 8; d > 0; d >>= 1)
+    {
+        if (idx < d)
+        {
+            covariance[6 * idx + 0] += covariance[6 * (idx+d) + 0];
+            covariance[6 * idx + 1] += covariance[6 * (idx+d) + 1];
+            covariance[6 * idx + 2] += covariance[6 * (idx+d) + 2];
+            covariance[6 * idx + 3] += covariance[6 * (idx+d) + 3];
+            covariance[6 * idx + 4] += covariance[6 * (idx+d) + 4];
+            covariance[6 * idx + 5] += covariance[6 * (idx+d) + 5];
+        }
+    }
+
+#endif
+    
+    // Compute first eigen vector.
+    // In the parallel path this passes the start of the shared array, i.e. the
+    // six values stored in slot 0.
+    return firstEigenVector(covariance);
+}
+
+
+#endif // CUDAMATH_H
--- a/src/nvimage/nvtt/cuda/CudaUtils.cpp
+++ b/src/nvimage/nvtt/cuda/CudaUtils.cpp
@ -0,0 +1,109 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Debug.h>
+#include "CudaUtils.h"
+
+#if defined HAVE_CUDA
+#include <cuda_runtime.h>
+#endif
+
+using namespace nv;
+using namespace cuda;
+
+#if NV_OS_WIN32
+
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+
+/// Returns true when the reported Windows major version is 6 or higher.
+/// If the version cannot be queried the function returns false instead of
+/// reading an uninitialized structure.
+static bool isWindowsVista()
+{
+	OSVERSIONINFO osvi;
+	osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+
+	// GetVersionEx returns zero on failure and leaves the structure unfilled.
+	if (!::GetVersionEx(&osvi))
+	{
+		return false;
+	}
+
+	return osvi.dwMajorVersion >= 6;
+}
+
+
+// Signature of kernel32's IsWow64Process, which is resolved at run time.
+typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);
+
+/// Returns false only when IsWow64Process is available, succeeds and reports
+/// that the current process runs under WOW64. In every other case (function
+/// missing, call failed, or not WOW64) the result is true.
+static bool isWow32()
+{
+	HMODULE kernel32 = GetModuleHandle("kernel32");
+	LPFN_ISWOW64PROCESS queryWow64 = (LPFN_ISWOW64PROCESS)GetProcAddress(kernel32, "IsWow64Process");
+
+	if (NULL == queryWow64)
+	{
+		// Entry point not exported by this kernel32.
+		return true;
+	}
+
+	BOOL underWow64 = FALSE;
+	if (!queryWow64(GetCurrentProcess(), &underWow64))
+	{
+		// Assume 32 bits.
+		return true;
+	}
+
+	return !underWow64;
+}
+
+#endif
+
+
+/// Determine if CUDA is available.
+/// Returns false when built without HAVE_CUDA, on Windows when the major
+/// version is 6 or higher, or when no CUDA device is reported.
+bool nv::cuda::isHardwarePresent()
+{
+#if defined HAVE_CUDA
+#if NV_OS_WIN32
+	// The Windows helpers are only compiled under NV_OS_WIN32, so they must
+	// not be referenced on other platforms.
+	if (isWindowsVista()) return false;
+	//if (!isWow32()) return false;
+#endif
+	return deviceCount() > 0;
+#else
+	return false;
+#endif
+}
+
+/// Get number of CUDA enabled devices.
+/// Yields 0 when the query fails or when built without HAVE_CUDA.
+int nv::cuda::deviceCount()
+{
+#if defined HAVE_CUDA
+	int count = 0;
+	if (cudaGetDeviceCount(&count) == cudaSuccess)
+	{
+		return count;
+	}
+#endif
+	return 0;
+}
+
+/// Activate the given device.
+/// The index is checked against deviceCount() with nvCheck. Returns whether
+/// cudaSetDevice succeeded; always false when built without HAVE_CUDA.
+bool nv::cuda::setDevice(int i)
+{
+	nvCheck(i < deviceCount());
+#if defined HAVE_CUDA
+	return cudaSetDevice(i) == cudaSuccess;
+#else
+	return false;
+#endif
+}
--- a/src/nvimage/nvtt/cuda/CudaUtils.h
+++ b/src/nvimage/nvtt/cuda/CudaUtils.h
@ -0,0 +1,40 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_CUDAUTILS_H
+#define NV_TT_CUDAUTILS_H
+
+namespace nv
+{
+	
+	// Small helpers to query and select CUDA devices.
+	namespace cuda
+	{
+		/// True when CUDA can be used. Always false when the library is built
+		/// without HAVE_CUDA.
+		bool isHardwarePresent();
+
+		/// Number of CUDA devices, or 0 when the query fails or CUDA support
+		/// is not compiled in.
+		int deviceCount();
+
+		/// Select device i; returns whether the selection succeeded.
+		bool setDevice(int i);
+	};
+	
+} // nv namespace
+
+
+#endif // NV_TT_CUDAUTILS_H
--- a/src/nvimage/nvtt/dxtlib.cpp
+++ b/src/nvimage/nvtt/dxtlib.cpp
@ -0,0 +1,486 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Memory.h>
+#include <nvcore/Ptr.h>
+
+#include <nvimage/DirectDrawSurface.h>
+#include <nvimage/ColorBlock.h>
+#include <nvimage/Image.h>
+#include <nvimage/FloatImage.h>
+#include <nvimage/Filter.h>
+#include <nvimage/Quantize.h>
+#include <nvimage/NormalMap.h>
+
+#include "CompressDXT.h"
+#include "FastCompressDXT.h"
+#include "CompressRGB.h"
+#include "BlockDXT.h"
+#include "InputOptions.h"
+#include "CompressionOptions.h"
+#include "cuda/CudaUtils.h"
+#include "cuda/CudaCompressDXT.h"
+
+
+using namespace nv;
+using namespace nvtt;
+
+namespace
+{
+	
+	// Size in bytes of one compressed 4x4 block for the given format.
+	// Formats that are not handled here yield 0.
+	static int blockSize(Format format)
+	{
+		switch (format)
+		{
+			case Format_DXT1: /*case Format_DXT1a:*/
+			case Format_BC4:
+				return 8;
+
+			case Format_DXT3:
+			case Format_DXT5:
+			case Format_DXT5n:
+			case Format_BC5:
+				return 16;
+
+			default:
+				return 0;
+		}
+	}
+	
+	// Size in bytes of a w x h image stored in the given format. Compressed
+	// formats round both dimensions up to a whole number of 4x4 blocks.
+	static int computeImageSize(int w, int h, Format format)
+	{
+		if (format != Format_RGBA) {
+			const int blocksAcross = (w + 3) / 4;
+			const int blocksDown = (h + 3) / 4;
+			return blocksAcross * blocksDown * blockSize(format);
+		}
+
+		return w * h * sizeof(Color32);
+	}
+	
+} // namespace
+
+
+
+
+
+//
+// compress
+//
+
+// Write the 128 byte DDS header describing the first input image to the
+// output handler. Nothing is written when there is no output handler or when
+// outputOptions.outputHeader is false.
+static void outputHeader(const InputOptions::Private & inputOptions, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	// Output DDS header.
+	if (outputOptions.outputHandler != NULL && outputOptions.outputHeader)
+	{
+		DDSHeader header;
+		
+		// Only 1 face and 2d textures supported.
+		nvCheck(inputOptions.faceCount == 1);
+		
+		InputOptions::Private::Image * img = inputOptions.images;
+		nvCheck(img != NULL);
+		
+		header.setWidth(img->width);
+		header.setHeight(img->height);
+		
+		// compress() emits the levels 0..maxLevel, that is maxLevel + 1
+		// images, so the count stored in the header has to be clamped to
+		// maxLevel + 1 and not to maxLevel.
+		int mipmapCount = inputOptions.mipmapCount;
+		if (!inputOptions.generateMipmaps) mipmapCount = 0;
+		else if (inputOptions.maxLevel >= 0 && inputOptions.maxLevel + 1 < mipmapCount) mipmapCount = inputOptions.maxLevel + 1;
+		header.setMipmapCount(mipmapCount);
+
+		if (inputOptions.textureType == TextureType_2D) {
+			header.setTexture2D();
+		}
+		else if (inputOptions.textureType == TextureType_Cube) {
+			header.setTextureCube();
+		}		
+		/*else if (inputOptions.textureType == TextureType_3D) {
+			header.setTexture3D();
+			header.setDepth(img->depth);
+		}*/
+		
+		if (compressionOptions.format == Format_RGBA)
+		{
+			// Uncompressed: pitch of 4 bytes per pixel plus the channel masks.
+			header.setPitch(4 * img->width);
+			header.setPixelFormat(compressionOptions.bitcount, compressionOptions.rmask, compressionOptions.gmask, compressionOptions.bmask, compressionOptions.amask);
+		}
+		else
+		{
+			// Compressed: size of the top level plus the format's FourCC.
+			header.setLinearSize(computeImageSize(img->width, img->height, compressionOptions.format));
+			
+			if (compressionOptions.format == Format_DXT1 /*|| compressionOptions.format == Format_DXT1a*/) {
+				header.setFourCC('D', 'X', 'T', '1');
+			}
+			else if (compressionOptions.format == Format_DXT3) {
+				header.setFourCC('D', 'X', 'T', '3');
+			}
+			else if (compressionOptions.format == Format_DXT5) {
+				header.setFourCC('D', 'X', 'T', '5');
+			}
+			else if (compressionOptions.format == Format_DXT5n) {
+				header.setFourCC('R', 'X', 'G', 'B');
+			}
+			else if (compressionOptions.format == Format_BC4) {
+				header.setFourCC('A', 'T', 'I', '1');
+			}
+			else if (compressionOptions.format == Format_BC5) {
+				header.setFourCC('A', 'T', 'I', '2');
+			}
+		}
+		
+		// Swap bytes if necessary.
+		header.swapBytes();
+		
+		nvStaticCheck(sizeof(DDSHeader) == 128);
+		outputOptions.outputHandler->writeData(&header, 128);
+		
+		// Revert swap.
+		header.swapBytes();
+	}
+}
+
+
+// Compress one image with the compressor selected by the format, quality and
+// build options, sending the result to outputOptions. Formats without a
+// matching case are ignored. Always returns true.
+static bool compressMipmap(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	nvDebugCheck(image != NULL);
+
+	const bool fastest = (compressionOptions.quality == Quality_Fastest);
+
+	switch (compressionOptions.format)
+	{
+		case Format_RGBA: // Format_RGB has the same value.
+			compressRGB(image, outputOptions, compressionOptions);
+			break;
+
+		case Format_DXT1:
+			// External compressors take precedence when compiled in, then
+			// CUDA, then the CPU implementations.
+#if defined(HAVE_S3QUANT)
+			if (compressionOptions.externalCompressor == "s3")
+			{
+				s3CompressDXT1(image, outputOptions);
+			}
+			else
+#endif
+
+#if defined(HAVE_ATITC)
+			if (compressionOptions.externalCompressor == "ati")
+			{
+				printf("ATI\n");
+				atiCompressDXT1(image, outputOptions);
+			}
+			else
+#endif
+			if (compressionOptions.useCuda && nv::cuda::isHardwarePresent())
+			{
+				cudaCompressDXT1(image, outputOptions, compressionOptions);
+			}
+			else if (fastest)
+			{
+				fastCompressDXT1(image, outputOptions);
+			}
+			else
+			{
+				compressDXT1(image, outputOptions, compressionOptions);
+			}
+			break;
+
+		case Format_DXT3:
+			if (fastest)
+			{
+				fastCompressDXT3(image, outputOptions);
+			}
+			else
+			{
+				compressDXT3(image, outputOptions, compressionOptions);
+			}
+			break;
+
+		case Format_DXT5:
+			if (fastest)
+			{
+				fastCompressDXT5(image, outputOptions);
+			}
+			else
+			{
+				compressDXT5(image, outputOptions, compressionOptions);
+			}
+			break;
+
+		case Format_DXT5n:
+			if (fastest)
+			{
+				fastCompressDXT5n(image, outputOptions);
+			}
+			else
+			{
+				compressDXT5n(image, outputOptions, compressionOptions);
+			}
+			break;
+
+		case Format_BC4:
+			compressBC4(image, outputOptions, compressionOptions);
+			break;
+
+		case Format_BC5:
+			compressBC5(image, outputOptions, compressionOptions);
+			break;
+
+		default:
+			break;
+	}
+
+	return true;
+}
+
+
+// Convert input image to linear float image.
+// The returned image is allocated with new; the caller takes ownership.
+static FloatImage * toFloatImage(const Image * image, const InputOptions::Private & inputOptions)
+{
+	nvDebugCheck(image != NULL);
+
+	FloatImage * linearImage = new FloatImage(image);
+
+	// A gamma of exactly 1 means the input is used as it is.
+	const bool needsConversion = (inputOptions.inputGamma != 1.0f);
+	if (needsConversion)
+	{
+		// Convert to linear space.
+		linearImage->toLinear(0, 3, inputOptions.inputGamma);
+	}
+
+	return linearImage;
+}
+
+
+// Convert linear float image to output image, applying the output gamma.
+static Image * toFixedImage(const FloatImage * floatImage, const InputOptions::Private & inputOptions)
+{
+	nvDebugCheck(floatImage != NULL);
+
+	Image * fixedImage = floatImage->createImageGammaCorrect(inputOptions.outputGamma);
+	return fixedImage;
+}
+
+
+// Create mipmap from the given image.
+// The box filter uses the fast downsample path; the triangle filter uses a
+// kernel built with Kernel1(4), and any other filter setting falls back to a
+// Kaiser kernel built with Kernel1(10). The result is normalized when
+// inputOptions.normalizeMipmaps is set.
+static FloatImage * createMipmap(const FloatImage * floatImage, const InputOptions::Private & inputOptions)
+{
+	const FloatImage::WrapMode wrapMode = (FloatImage::WrapMode)inputOptions.wrapMode;
+
+	FloatImage * mipmap = NULL;
+
+	if (inputOptions.mipmapFilter == MipmapFilter_Box)
+	{
+		// Use fast downsample.
+		mipmap = floatImage->fastDownSample();
+	}
+	else if (inputOptions.mipmapFilter == MipmapFilter_Triangle)
+	{
+		Kernel1 triangle(4);
+		triangle.initFilter(Filter::Triangle);
+		mipmap = floatImage->downSample(triangle, wrapMode);
+	}
+	else /*if (inputOptions.mipmapFilter == MipmapFilter_Kaiser)*/
+	{
+		Kernel1 kaiser(10);
+		kaiser.initKaiser(8.0, 0.75f);
+		mipmap = floatImage->downSample(kaiser, wrapMode);
+	}
+
+	// Normalize mipmap.
+	if (inputOptions.normalizeMipmaps)
+	{
+		normalize(mipmap);
+	}
+
+	return mipmap;
+}
+
+
+// Quantize the input image to the precision of the output format.
+// Color dithering is applied for the formats DXT1 through DXT5 when enabled;
+// alpha is then either reduced to binary alpha or, for DXT3 with alpha
+// dithering enabled, quantized with Quantize::Alpha4.
+static void quantize(Image * img, const InputOptions::Private & inputOptions, Format format)
+{
+	// Color.
+	const bool isDXTColorFormat = (format >= Format_DXT1 && format <= Format_DXT5);
+	if (inputOptions.enableColorDithering && isDXTColorFormat)
+	{
+		Quantize::FloydSteinberg_RGB16(img);
+	}
+
+	// Alpha.
+	if (inputOptions.binaryAlpha)
+	{
+		if (inputOptions.enableAlphaDithering)
+		{
+			Quantize::FloydSteinberg_BinaryAlpha(img, inputOptions.alphaThreshold);
+		}
+		else
+		{
+			Quantize::BinaryAlpha(img, inputOptions.alphaThreshold);
+		}
+	}
+	else if (inputOptions.enableAlphaDithering && format == Format_DXT3)
+	{
+		Quantize::Alpha4(img);
+	}
+	/*else if (inputOptions.enableAlphaDithering && format == Format_DXT1a)
+	{
+		Quantize::BinaryAlpha(img, inputOptions.alphaThreshold);
+	}*/
+}
+
+
+/// Compress the input texture with the given compression options.
+///
+/// Writes the DDS header, then walks every face and mipmap level: levels with
+/// input data are used directly (or converted to a normal map), missing
+/// levels are generated by downsampling the previous one. Each level is
+/// quantized, compressed and sent to the output handler.
+///
+/// Returns false, after reporting Error_InvalidInput, when the first level of
+/// a face has no data; returns true otherwise.
+/// NOTE(review): the header is written before that check, so it has already
+/// been output when the function fails.
+bool nvtt::compress(const InputOptions & inputOptions, const OutputOptions & outputOptions, const CompressionOptions & compressionOptions)
+{
+	// Make sure enums match.
+	nvStaticCheck(FloatImage::WrapMode_Clamp == (FloatImage::WrapMode)WrapMode_Clamp);
+	nvStaticCheck(FloatImage::WrapMode_Mirror == (FloatImage::WrapMode)WrapMode_Mirror);
+	nvStaticCheck(FloatImage::WrapMode_Repeat == (FloatImage::WrapMode)WrapMode_Repeat);
+
+	// Output DDS header.
+	outputHeader(inputOptions.m, outputOptions, compressionOptions.m);
+
+	Format format = compressionOptions.m.format;
+
+	for (int f = 0; f < inputOptions.m.faceCount; f++)
+	{
+		// lastImage: most recent caller supplied level, used as the source for
+		// generated levels. floatImage: float version of the previous level,
+		// when one exists.
+		Image * lastImage = NULL;
+		AutoPtr<FloatImage> floatImage(NULL);
+		
+		for (int m = 0; m < inputOptions.m.mipmapCount; m++)
+		{
+			// Images are stored face after face, mipmapCount entries per face.
+			int idx = f * inputOptions.m.mipmapCount + m;
+			InputOptions::Private::Image & mipmap = inputOptions.m.images[idx];
+			
+			// Announce the level that is about to be written.
+			if (outputOptions.outputHandler)
+			{
+				int size = computeImageSize(mipmap.width, mipmap.height, format);
+				outputOptions.outputHandler->mipmap(size, mipmap.width, mipmap.height, mipmap.depth, mipmap.face, mipmap.mipLevel);
+			}
+			
+			Image * img; // Image to compress.
+			
+			if (mipmap.data != NULL) // Mipmap provided.
+			{
+				// Convert to normal map.
+				if (inputOptions.m.convertToNormalMap)
+				{
+					floatImage = createNormalMap(mipmap.data, (FloatImage::WrapMode)inputOptions.m.wrapMode, inputOptions.m.heightFactors, inputOptions.m.bumpFrequencyScale);
+				}
+				else
+				{
+					// Use the caller's image directly; img is set here and is
+					// not replaced below because floatImage is cleared.
+					lastImage = img = mipmap.data;
+					
+					// Delete float image.
+					floatImage = NULL;
+				}
+			}
+			else // Create mipmap from last.
+			{
+				if (m == 0) {
+					// First mipmap missing.
+					if (outputOptions.errorHandler != NULL) outputOptions.errorHandler->error(Error_InvalidInput);
+					return false;
+				}
+				
+				if (floatImage == NULL)
+				{
+					nvDebugCheck(lastImage != NULL);
+					floatImage = toFloatImage(lastImage, inputOptions.m);
+				}
+				
+				// Create mipmap.
+				floatImage = createMipmap(floatImage.ptr(), inputOptions.m);
+			}
+			
+			// A float image exists for normal maps and generated levels; it is
+			// converted to a new fixed point image that is deleted below.
+			if (floatImage != NULL)
+			{
+				// Convert to fixed.
+				img = toFixedImage(floatImage.ptr(), inputOptions.m);
+			}
+			
+			// NOTE(review): when img is the caller's mipmap.data this
+			// presumably modifies the input image in place -- confirm.
+			quantize(img, inputOptions.m, format);
+			
+			// The return value of compressMipmap is not inspected.
+			compressMipmap(img, outputOptions, compressionOptions.m);
+			
+			// Only images created in this iteration are owned here.
+			if (img != mipmap.data)
+			{
+				delete img;
+			}
+			
+			if (!inputOptions.m.generateMipmaps || (inputOptions.m.maxLevel >= 0 && m >= inputOptions.m.maxLevel)) {
+				// continue with next face.
+				break;
+			}
+		}
+	}
+
+	return true;
+}
+
+
+
+
+/// Estimate the size of compressing the input with the given options.
+/// Sums the image size of every level that would be emitted: per face, levels
+/// are counted until mipmap generation is disabled or maxLevel is reached.
+/// The DDS header is not included.
+int nvtt::estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions)
+{
+	const Format format = compressionOptions.m.format;
+
+	int total = 0;
+
+	for (int face = 0; face < inputOptions.m.faceCount; face++)
+	{
+		for (int level = 0; level < inputOptions.m.mipmapCount; level++)
+		{
+			const InputOptions::Private::Image & img = inputOptions.m.images[face * inputOptions.m.mipmapCount + level];
+
+			total += computeImageSize(img.width, img.height, format);
+
+			const bool reachedMaxLevel = (inputOptions.m.maxLevel >= 0 && level >= inputOptions.m.maxLevel);
+			if (!inputOptions.m.generateMipmaps || reachedMaxLevel) {
+				// continue with next face.
+				break;
+			}
+		}
+	}
+
+	return total;
+}
+
+
+/// Return a string for the given error.
+/// Values without a known description yield NULL.
+const char * nvtt::errorString(Error e)
+{
+	if (e == Error_InvalidInput) return "Invalid input";
+	if (e == Error_UserInterruption) return "User interruption";
+	if (e == Error_UnsupportedFeature) return "Unsupported feature";
+	if (e == Error_CudaError) return "CUDA error";
+	if (e == Error_Unknown) return "Unknown error";
+
+	return NULL;
+}
+
--- a/src/nvimage/nvtt/nvtt.h
+++ b/src/nvimage/nvtt/nvtt.h
@ -0,0 +1,242 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_H
+#define NV_TT_H
+
+#include <nvcore/nvcore.h>
+
+// Function linkage
+// NVTT_API decorates exported functions and NVTT_CLASS exported classes:
+//  - NVTT_SHARED non-zero and NVTT_EXPORTS defined: export decorations.
+//  - NVTT_SHARED non-zero only: import decorations.
+//  - otherwise: both macros expand to nothing.
+// The DLL_* macros are not defined in this header; they are expected to come from
+// <nvcore/nvcore.h>, the only header included above.
+#if NVTT_SHARED
+#ifdef NVTT_EXPORTS
+#define NVTT_API DLL_EXPORT
+#define NVTT_CLASS DLL_EXPORT_CLASS
+#else
+#define NVTT_API DLL_IMPORT
+#define NVTT_CLASS DLL_IMPORT
+#endif
+#else
+#define NVTT_API
+#define NVTT_CLASS
+#endif
+
+// Public interface.
+namespace nvtt
+{
+	/// Supported compression formats.
+	/// Several enumerators are aliases that share a value with another one (the DX10 and 
+	/// OpenGL names, and Format_RGBA). An alias does not start a new value, so code that 
+	/// switches on Format must list only one name per value.
+	enum Format
+	{
+		// No compression.
+		Format_RGB,
+		Format_RGBA = Format_RGB,	// Same value as Format_RGB.
+
+		// DX9 formats.
+		Format_DXT1,
+	//	Format_DXT1a,	// DXT1 with binary alpha.
+		Format_DXT3,
+		Format_DXT5,
+		Format_DXT5n,	// Compressed HILO: R=0, G=x, B=0, A=y
+		
+		// DX10 formats.
+		Format_BC1 = Format_DXT1,
+		Format_BC2 = Format_DXT3,
+		Format_BC3 = Format_DXT5,
+		Format_BC3n = Format_DXT5n,
+		Format_BC4,		// ATI1. Takes the value following Format_BC3n (Format_DXT5n).
+		Format_BC5,		// 3DC, ATI2
+
+		// OpenGL formats.
+		Format_LATC = Format_BC5,
+	};
+	
+	/// Quality modes.
+	/// Passed to CompressionOptions::setQuality(). Going by the names the enumerators run 
+	/// from the quickest to the most thorough setting; what each one does is decided by 
+	/// the compressors and is not visible in this header.
+	enum Quality
+	{
+		Quality_Fastest,
+		Quality_Normal,
+		Quality_Production,
+		Quality_Highest,
+	};
+
+	/// Compression options. This class describes the desired compression format and other compression settings.
+	/// The settings themselves are stored in the Private structure referenced by m, which is 
+	/// only declared here and defined inside the library.
+	class CompressionOptions
+	{
+	public:
+		NVTT_API CompressionOptions();
+		NVTT_API ~CompressionOptions();
+		
+		// NOTE(review): presumably restores the default settings, like InputOptions::reset() -- confirm in the implementation.
+		NVTT_API void reset();
+		
+		// Select the output format, see Format.
+		NVTT_API void setFormat(Format format);
+		// Select the quality mode, see Quality. errorThreshold is 0.5f when omitted.
+		NVTT_API void setQuality(Quality quality, float errorThreshold = 0.5f);
+		// Set one weight per color channel.
+		NVTT_API void setColorWeights(float red, float green, float blue);
+		NVTT_API void enableHardwareCompression(bool enable);
+		
+		NVTT_API void setExternalCompressor(const char * name);
+
+		// Set color mask to describe the RGB/RGBA format.
+		NVTT_API void setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask);
+
+		// The access specifier below is commented out, so these members are public;
+		// nvtt::estimateSize() reads m.format directly.
+	//private:
+		struct Private;
+		Private & m;
+	};
+
+
+	/// Wrap modes. // This matches FloatImage::WrapMode.
+	/// Passed to InputOptions::setWrapMode(). Because the values are meant to match 
+	/// FloatImage::WrapMode, the enumerators must not be reordered on their own.
+	enum WrapMode
+	{
+		WrapMode_Clamp,
+		WrapMode_Repeat,
+		WrapMode_Mirror,
+	};
+	
+	/// Texture types.
+	/// Passed to InputOptions::setTextureLayout(). The 3D entry is commented out and 
+	/// therefore not available.
+	enum TextureType
+	{
+		TextureType_2D,
+		TextureType_Cube,
+	//	TextureType_3D,
+	};
+	
+	/// Input formats.
+	/// Passed to InputOptions::setFormat(). Only one format is enabled; the other 
+	/// entries are commented out.
+	enum InputFormat
+	{
+		InputFormat_BGRA_8UB,
+	//	InputFormat_RGBE_8UB,
+	//	InputFormat_BGRA_32F,
+	};
+	
+	/// Mipmap downsampling filters.
+	/// Passed to InputOptions::setMipmapping(), whose default is MipmapFilter_Kaiser.
+	enum MipmapFilter
+	{
+		MipmapFilter_Box,		///< Box filter is quite good and very fast.
+		MipmapFilter_Triangle,	///< Triangle filter blurs the results too much, but that might be what you want.
+		MipmapFilter_Kaiser,	///< Kaiser-windowed Sinc filter is the best downsampling filter.
+	};
+	
+
+
+	/// Input options. Specify format and layout of the input texture.
+	/// The settings and the image data are stored in the Private structure referenced by m, 
+	/// which is only declared here and defined inside the library.
+	struct InputOptions
+	{
+		NVTT_API InputOptions();
+		NVTT_API ~InputOptions();
+		
+		// Set default options.
+		NVTT_API void reset();
+		
+		// Setup input layout. The depth d is 1 when omitted.
+		NVTT_API void setTextureLayout(TextureType type, int w, int h, int d = 1);
+		NVTT_API void resetTextureLayout();
+
+		// Set mipmap data. Copies the data.
+		// When omitted, d is 1 and both face and mipmap are 0. Returns a bool; the conditions
+		// under which it is false are not visible in this header.
+		NVTT_API bool setMipmapData(const void * data, int w, int h, int d = 1, int face = 0, int mipmap = 0);		
+
+		// Describe the format of the input.
+		NVTT_API void setFormat(InputFormat fmt, bool alphaTransparency);
+
+		// Set gamma settings.
+		NVTT_API void setGamma(float inputGamma, float outputGamma);
+
+		// Set texture wrapping mode.
+		NVTT_API void setWrapMode(WrapMode mode);
+
+		// Set mipmapping options.
+		// When omitted, the filter is MipmapFilter_Kaiser and maxLevel is -1. nvtt::estimateSize()
+		// only honours maxLevel when it is >= 0, so a negative value means no level limit there.
+		NVTT_API void setMipmapping(bool generateMipmaps, MipmapFilter filter = MipmapFilter_Kaiser, int maxLevel = -1);
+		
+		// Set quantization options. alphaThreshold is 127 when omitted.
+		NVTT_API void setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold = 127);
+
+		// Set normal map options.
+		NVTT_API void setConvertToNormalMap(bool convert);
+		NVTT_API void setHeightEvaluation(float redScale, float greenScale, float blueScale, float alphaScale);
+		// NOTE(review): some Windows SDK headers #define 'small'; confirm this declaration still
+		// compiles when such a header is included before this one.
+		NVTT_API void setNormalFilter(float small, float medium, float big, float large);
+		NVTT_API void setNormalizeMipmaps(bool b);
+
+		// The access specifier below is commented out, so these members are public;
+		// nvtt::estimateSize() reads faceCount, mipmapCount, images, generateMipmaps and
+		// maxLevel through m.
+	//private:
+		struct Private;
+		Private & m;
+	};
+	
+	
+	/// Output handler.
+	/// Abstract interface implemented by the user of the library and handed to it through 
+	/// OutputOptions::outputHandler. Both methods are pure virtual.
+	struct OutputHandler
+	{
+		/// Virtual so that implementations can be destroyed through a base pointer.
+		virtual ~OutputHandler() {}
+		
+		/// Indicate the start of a new compressed image that's part of the final texture.
+		virtual void mipmap(int size, int width, int height, int depth, int face, int miplevel) = 0;
+		
+		/// Output data. Compressed data is output as soon as it's generated to minimize memory allocations.
+		virtual void writeData(const void * data, int size) = 0;
+	};
+
+	/// Error codes.
+	/// Delivered through ErrorHandler::error(). nvtt::errorString() has one message per 
+	/// enumerator, so a new code added here needs a matching case there.
+	enum Error
+	{
+		Error_InvalidInput,
+		Error_UserInterruption,
+		Error_UnsupportedFeature,
+		Error_CudaError,
+		Error_Unknown,
+	};
+	
+	/// Error handler.
+	/// Abstract interface implemented by the user of the library and handed to it through 
+	/// OutputOptions::errorHandler.
+	struct ErrorHandler
+	{
+		/// Virtual so that implementations can be destroyed through a base pointer.
+		virtual ~ErrorHandler() {}
+		
+		// Signal error. nvtt::errorString() turns the code into a message.
+		virtual void error(Error e) = 0;
+	};
+
+
+	/// Output Options. This class holds pointers to the interfaces that are used to report the output of 
+	/// the compressor to the user.
+	/// The handlers are plain pointers; nothing in this struct deletes them.
+	struct OutputOptions
+	{
+		/// Create options without handlers. Every member is initialized here, including 
+		/// errorHandler, so that no pointer is left with an indeterminate value before 
+		/// reset() runs.
+		OutputOptions() : outputHandler(NULL), errorHandler(NULL), outputHeader(true) { reset(); }
+
+		/// Create options that report through the given handlers; either pointer may be NULL 
+		/// as far as this constructor is concerned.
+		OutputOptions(OutputHandler * oh, ErrorHandler * eh) : outputHandler(oh), errorHandler(eh), outputHeader(true) { reset(); }
+		
+		// Set default options. Defined inside the library; called by both constructors.
+		NVTT_API void reset();
+		
+		OutputHandler * outputHandler;	///< Handler given to the library for the compressed output.
+		ErrorHandler * errorHandler;	///< Handler given to the library for error reports.
+		bool outputHeader;				///< Set to true by both constructors.
+	};
+	
+	
+	// Main entrypoint of the compression library.
+	// Returns true when the whole input has been processed and false when it stopped early.
+	NVTT_API bool compress(const InputOptions & inputOptions, const OutputOptions & outputOptions, const CompressionOptions & compressionOptions);
+	
+	// Estimate the size of compressing the input with the given options.
+	// The value is the sum of the per-image sizes for the chosen format; nothing is compressed.
+	NVTT_API int estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions);
+	
+	// Return string for the given error.
+	// Returns NULL for a value that is not one of the Error enumerators.
+	NVTT_API const char * errorString(Error e);
+
+} // nvtt namespace
+
+#endif // NV_TT_H
--- a/src/nvimage/nvtt/squish/CMakeLists.txt
+++ b/src/nvimage/nvtt/squish/CMakeLists.txt
@ -0,0 +1,52 @@
+# Build script for the bundled squish library, its optional PNG tool and its test program.
+PROJECT(squish)
+ENABLE_TESTING()
+
+# The sources include their own headers with angle brackets (alpha.h uses <squish.h>),
+# so this directory has to be on the include path.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# Headers and the .inl file are listed next to the .cpp files.
+SET(SQUISH_SRCS
+	alpha.cpp
+	alpha.h
+	clusterfit.cpp
+	clusterfit.h
+	fastclusterfit.cpp
+	fastclusterfit.h
+	weightedclusterfit.cpp
+	weightedclusterfit.h
+	colourblock.cpp
+	colourblock.h
+	colourfit.cpp
+	colourfit.h
+	colourset.cpp
+	colourset.h
+	config.h
+	maths.cpp
+	maths.h
+	rangefit.cpp
+	rangefit.h
+	singlecolourfit.cpp
+	singlecolourfit.h
+	singlecolourlookup.inl
+	squish.cpp
+	squish.h
+	simd.h
+	simd_sse.h
+	simd_ve.h)
+
+ADD_LIBRARY(squish STATIC ${SQUISH_SRCS})
+
+# libpng
+# squishpng is only built when libpng has been found.
+FIND_PACKAGE(PNG)
+
+IF(PNG_FOUND)
+	INCLUDE_DIRECTORIES(${PNG_INCLUDE_DIR})
+	ADD_EXECUTABLE(squishpng extra/squishpng.cpp)
+	TARGET_LINK_LIBRARIES(squishpng squish ${PNG_LIBRARY})
+ENDIF(PNG_FOUND)
+
+# The squishgen tool is disabled.
+#ADD_EXECUTABLE(squishgen extra/squishgen.cpp)
+
+# Test program, registered with CTest under the name SQUISHTEST.
+ADD_EXECUTABLE(squishtest extra/squishtest.cpp)
+TARGET_LINK_LIBRARIES(squishtest squish)
+
+ADD_TEST(SQUISHTEST squishtest)
+
--- a/src/nvimage/nvtt/squish/ChangeLog
+++ b/src/nvimage/nvtt/squish/ChangeLog
@ -0,0 +1,38 @@
+
+1.7
+* Fixed floating-point equality issue in clusterfit sort (x86 affected only)
+* Implemented proper SSE(2) floor function for 50% speedup on SSE builds 
+* The range fit implementation now uses the correct colour metric
+
+1.6
+* Fixed bug in CompressImage where masked pixels were not skipped over
+* DXT3 and DXT5 alpha compression now properly use the mask to ignore pixels
+* Fixed major DXT1 bug that can generate unexpected transparent pixels
+
+1.5
+* Added CompressMasked function to handle incomplete DXT blocks more cleanly
+* Added kWeightColourByAlpha flag for better quality images when alpha blending
+
+1.4
+* Fixed stack overflow in rangefit
+
+1.3
+* Worked around SSE floor implementation bug, proper fix needed!
+* This release has visual studio and makefile builds that work
+
+1.2
+* Added provably optimal single colour compressor
+* Added extra/squishgen.cpp that generates single colour lookup tables
+
+1.1
+* Fixed a DXT1 colour output bug
+* Changed argument order for Decompress function to match Compress
+* Added GetStorageRequirements function
+* Added CompressImage function
+* Added DecompressImage function
+* Moved squishtool.cpp to extra/squishpng.cpp
+* Added extra/squishtest.cpp
+
+1.0
+* Initial release
+
--- a/src/nvimage/nvtt/squish/Doxyfile
+++ b/src/nvimage/nvtt/squish/Doxyfile
@ -0,0 +1,223 @@
+# Doxyfile 1.4.6
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+PROJECT_NAME           = squish
+PROJECT_NUMBER         = 1.1
+OUTPUT_DIRECTORY       = docs 
+CREATE_SUBDIRS         = NO
+OUTPUT_LANGUAGE        = English
+USE_WINDOWS_ENCODING   = NO
+BRIEF_MEMBER_DESC      = YES
+REPEAT_BRIEF           = YES
+ABBREVIATE_BRIEF       = 
+ALWAYS_DETAILED_SEC    = NO
+INLINE_INHERITED_MEMB  = NO
+FULL_PATH_NAMES        = YES
+STRIP_FROM_PATH        = 
+STRIP_FROM_INC_PATH    = 
+SHORT_NAMES            = NO
+JAVADOC_AUTOBRIEF      = NO
+MULTILINE_CPP_IS_BRIEF = NO
+DETAILS_AT_TOP         = NO
+INHERIT_DOCS           = YES
+SEPARATE_MEMBER_PAGES  = NO
+TAB_SIZE               = 4
+ALIASES                = 
+OPTIMIZE_OUTPUT_FOR_C  = NO
+OPTIMIZE_OUTPUT_JAVA   = NO
+BUILTIN_STL_SUPPORT    = NO
+DISTRIBUTE_GROUP_DOC   = NO
+SUBGROUPING            = YES
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+EXTRACT_ALL            = YES
+EXTRACT_PRIVATE        = NO
+EXTRACT_STATIC         = NO
+EXTRACT_LOCAL_CLASSES  = YES
+EXTRACT_LOCAL_METHODS  = NO
+HIDE_UNDOC_MEMBERS     = NO
+HIDE_UNDOC_CLASSES     = NO
+HIDE_FRIEND_COMPOUNDS  = NO
+HIDE_IN_BODY_DOCS      = NO
+INTERNAL_DOCS          = NO
+CASE_SENSE_NAMES       = NO
+HIDE_SCOPE_NAMES       = NO
+SHOW_INCLUDE_FILES     = YES
+INLINE_INFO            = YES
+SORT_MEMBER_DOCS       = YES
+SORT_BRIEF_DOCS        = NO
+SORT_BY_SCOPE_NAME     = NO
+GENERATE_TODOLIST      = YES
+GENERATE_TESTLIST      = YES
+GENERATE_BUGLIST       = YES
+GENERATE_DEPRECATEDLIST= YES
+ENABLED_SECTIONS       = 
+MAX_INITIALIZER_LINES  = 30
+SHOW_USED_FILES        = YES
+SHOW_DIRECTORIES       = NO
+FILE_VERSION_FILTER    = 
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+QUIET                  = YES
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = YES
+WARN_IF_DOC_ERROR      = YES
+WARN_NO_PARAMDOC       = NO
+WARN_FORMAT            = "$file:$line: $text"
+WARN_LOGFILE           = 
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+INPUT                  = squish.h
+FILE_PATTERNS          = 
+RECURSIVE              = NO
+EXCLUDE                = 
+EXCLUDE_SYMLINKS       = NO
+EXCLUDE_PATTERNS       = 
+EXAMPLE_PATH           = 
+EXAMPLE_PATTERNS       = 
+EXAMPLE_RECURSIVE      = NO
+IMAGE_PATH             = 
+INPUT_FILTER           = 
+FILTER_PATTERNS        = 
+FILTER_SOURCE_FILES    = NO
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+SOURCE_BROWSER         = NO
+INLINE_SOURCES         = NO
+STRIP_CODE_COMMENTS    = YES
+REFERENCED_BY_RELATION = YES
+REFERENCES_RELATION    = YES
+USE_HTAGS              = NO
+VERBATIM_HEADERS       = YES
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+ALPHABETICAL_INDEX     = NO
+COLS_IN_ALPHA_INDEX    = 5
+IGNORE_PREFIX          = 
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+GENERATE_HTML          = YES
+HTML_OUTPUT            = html
+HTML_FILE_EXTENSION    = .html
+HTML_HEADER            = 
+HTML_FOOTER            = 
+HTML_STYLESHEET        = 
+HTML_ALIGN_MEMBERS     = YES
+GENERATE_HTMLHELP      = NO
+CHM_FILE               = 
+HHC_LOCATION           = 
+GENERATE_CHI           = NO
+BINARY_TOC             = NO
+TOC_EXPAND             = NO
+DISABLE_INDEX          = NO
+ENUM_VALUES_PER_LINE   = 4
+GENERATE_TREEVIEW      = NO
+TREEVIEW_WIDTH         = 250
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+GENERATE_LATEX         = NO
+LATEX_OUTPUT           = latex
+LATEX_CMD_NAME         = latex
+MAKEINDEX_CMD_NAME     = makeindex
+COMPACT_LATEX          = NO
+PAPER_TYPE             = a4wide
+EXTRA_PACKAGES         = 
+LATEX_HEADER           = 
+PDF_HYPERLINKS         = NO
+USE_PDFLATEX           = NO
+LATEX_BATCHMODE        = NO
+LATEX_HIDE_INDICES     = NO
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+GENERATE_RTF           = NO
+RTF_OUTPUT             = rtf
+COMPACT_RTF            = NO
+RTF_HYPERLINKS         = NO
+RTF_STYLESHEET_FILE    = 
+RTF_EXTENSIONS_FILE    = 
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+GENERATE_MAN           = NO
+MAN_OUTPUT             = man
+MAN_EXTENSION          = .3
+MAN_LINKS              = NO
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+GENERATE_XML           = NO
+XML_OUTPUT             = xml
+XML_SCHEMA             = 
+XML_DTD                = 
+XML_PROGRAMLISTING     = YES
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+GENERATE_AUTOGEN_DEF   = NO
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+GENERATE_PERLMOD       = NO
+PERLMOD_LATEX          = NO
+PERLMOD_PRETTY         = YES
+PERLMOD_MAKEVAR_PREFIX = 
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor   
+#---------------------------------------------------------------------------
+ENABLE_PREPROCESSING   = YES
+MACRO_EXPANSION        = NO
+EXPAND_ONLY_PREDEF     = NO
+SEARCH_INCLUDES        = YES
+INCLUDE_PATH           = 
+INCLUDE_FILE_PATTERNS  = 
+PREDEFINED             = 
+EXPAND_AS_DEFINED      = 
+SKIP_FUNCTION_MACROS   = YES
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references   
+#---------------------------------------------------------------------------
+TAGFILES               = 
+GENERATE_TAGFILE       = 
+ALLEXTERNALS           = NO
+EXTERNAL_GROUPS        = YES
+PERL_PATH              = /usr/bin/perl
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool   
+#---------------------------------------------------------------------------
+CLASS_DIAGRAMS         = YES
+HIDE_UNDOC_RELATIONS   = YES
+HAVE_DOT               = YES
+CLASS_GRAPH            = YES
+COLLABORATION_GRAPH    = YES
+GROUP_GRAPHS           = YES
+UML_LOOK               = NO
+TEMPLATE_RELATIONS     = NO
+INCLUDE_GRAPH          = YES
+INCLUDED_BY_GRAPH      = YES
+CALL_GRAPH             = NO
+GRAPHICAL_HIERARCHY    = YES
+DIRECTORY_GRAPH        = YES
+DOT_IMAGE_FORMAT       = png
+DOT_PATH               = /Applications/Graphviz.app/Contents/MacOS
+DOTFILE_DIRS           = 
+MAX_DOT_GRAPH_WIDTH    = 1024
+MAX_DOT_GRAPH_HEIGHT   = 1024
+MAX_DOT_GRAPH_DEPTH    = 0
+DOT_TRANSPARENT        = NO
+DOT_MULTI_TARGETS      = NO
+GENERATE_LEGEND        = YES
+DOT_CLEANUP            = YES
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine   
+#---------------------------------------------------------------------------
+SEARCHENGINE           = NO
--- a/src/nvimage/nvtt/squish/Makefile
+++ b/src/nvimage/nvtt/squish/Makefile
@ -0,0 +1,31 @@
+
+# Settings such as INSTALL_DIR and the compiler flags are not set here; they are
+# expected to come from the config file.
+include config
+
+# Keep this list in sync with SQUISH_SRCS in CMakeLists.txt. fastclusterfit.cpp and
+# weightedclusterfit.cpp are compiled by the CMake build and were missing here, which
+# left their objects out of libsquish.a.
+SRC = alpha.cpp clusterfit.cpp fastclusterfit.cpp weightedclusterfit.cpp colourblock.cpp colourfit.cpp colourset.cpp maths.cpp rangefit.cpp singlecolourfit.cpp squish.cpp
+
+OBJ = $(SRC:%.cpp=%.o)
+
+LIB = libsquish.a
+
+# These targets are commands, not files: a stray file called "clean" or "install"
+# must not stop them from running.
+.PHONY : all install uninstall clean
+
+all : $(LIB)
+
+install : $(LIB)
+	install squish.h $(INSTALL_DIR)/include 
+	install libsquish.a $(INSTALL_DIR)/lib
+
+uninstall:
+	$(RM) $(INSTALL_DIR)/include/squish.h
+	$(RM) $(INSTALL_DIR)/lib/libsquish.a
+
+# $? holds only the objects newer than the archive, so just those members are replaced.
+$(LIB) : $(OBJ)
+	$(AR) cr $@ $?
+	ranlib $@
+
+%.o : %.cpp
+	$(CXX) $(CPPFLAGS) -I. $(CXXFLAGS) -o$@ -c $<
+
+clean :
+	$(RM) $(OBJ) $(LIB)
+
+
+
--- a/src/nvimage/nvtt/squish/README
+++ b/src/nvimage/nvtt/squish/README
@ -0,0 +1,35 @@
+LICENSE
+-------
+
+The squish library is distributed under the terms and conditions of the MIT
+license. This license is specified at the top of each source file and must be
+preserved in its entirety.
+
+BUILDING AND INSTALLING THE LIBRARY
+-----------------------------------
+
+If you are using Visual Studio 2003 or above under Windows then load the Visual
+Studio 2003 project in the vs7 folder. By default, the library is built using
+SSE optimisations. To change this, either change or remove SQUISH_USE_SSE=1
+from the preprocessor symbols.
+
+If you are using a Mac then load the Xcode 2.2 project in the distribution. By
+default, the library is built using Altivec optimisations. To change this
+either change or remove SQUISH_USE_ALTIVEC=1 from the preprocessor symbols. I
+guess I'll have to think about changing this for the new Intel Macs that are
+rolling out...
+
+If you are using unix then first edit the config file in the base directory of
+the distribution, enabling Altivec or SSE with the USE_ALTIVEC or USE_SSE
+variables, and editing the optimisation flags passed to the C++ compiler if
+necessary. Then make can be used to build the library, and make install (from
+the superuser account) can be used to install (into /usr/local by default).
+
+REPORTING BUGS OR FEATURE REQUESTS
+----------------------------------
+
+Feedback can be sent to Simon Brown (the developer) at si@sjbrown.co.uk
+
+New releases are announced on the squish library homepage at
+http://sjbrown.co.uk/?code=squish
+
--- a/src/nvimage/nvtt/squish/alpha.cpp
+++ b/src/nvimage/nvtt/squish/alpha.cpp
@ -0,0 +1,326 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "alpha.h"
+#include <algorithm>
+
+namespace squish {
+
+// Round a float to an integer and clamp the result between 0 and limit.
+static int FloatToInt( float a, int limit )
+{
+	// the cast truncates towards zero, so adding a half first rounds
+	// non-negative inputs to the nearest integer
+	int const rounded = static_cast< int >( a + 0.5f );
+
+	// negative results snap to zero
+	if( rounded < 0 )
+		return 0;
+
+	// results above the limit snap to the limit
+	return ( rounded > limit ) ? limit : rounded;
+}
+
+// Pack the alpha channel of 16 RGBA pixels into 8 bytes, 4 bits per pixel.
+void CompressAlphaDxt3( u8 const* rgba, void* block )
+{
+	u8* out = reinterpret_cast< u8* >( block );
+	
+	// every output byte holds the alpha of two consecutive pixels
+	for( int pair = 0; pair < 8; ++pair )
+	{
+		// alpha is the 4th byte of each 4-byte pixel
+		u8 const* pixels = rgba + 8*pair;
+
+		// rescale from the 0-255 range to the 0-15 range, with rounding
+		int const first = FloatToInt( ( float )pixels[3] * ( 15.0f/255.0f ), 15 );
+		int const second = FloatToInt( ( float )pixels[7] * ( 15.0f/255.0f ), 15 );
+
+		// first pixel in the low nibble, second pixel in the high nibble
+		out[pair] = ( u8 )( first | ( second << 4 ) );
+	}
+}
+
+// Expand 8 bytes of packed 4-bit alpha into the alpha channel of 16 RGBA pixels.
+// Only the alpha bytes of rgba are written.
+void DecompressAlphaDxt3( u8* rgba, void const* block )
+{
+	u8 const* in = reinterpret_cast< u8 const* >( block );
+	
+	// every input byte holds the alpha of two consecutive pixels
+	for( int pair = 0; pair < 8; ++pair )
+	{
+		// fetch the packed byte and locate the two pixels it describes
+		u8 const packed = in[pair];
+		u8* pixels = rgba + 8*pair;
+		
+		// split it into the low nibble and the (still shifted) high nibble
+		u8 const low = packed & 0x0f;
+		u8 const high = packed & 0xf0;
+
+		// copy each nibble into both halves of its output byte,
+		// so that 0x0 becomes 0x00 and 0xf becomes 0xff
+		pixels[3] = ( u8 )( low | ( low << 4 ) );
+		pixels[7] = ( u8 )( high | ( high >> 4 ) );
+	}
+}
+
+// Widen the range until max - min is at least steps, first by raising max
+// (capped at 255) and then, if that was not enough, by lowering min (capped at 0).
+static void FixRange( int& min, int& max, int steps )
+{
+	// try to grow upwards
+	if( max - min < steps )
+	{
+		int const raised = min + steps;
+		max = ( raised < 255 ) ? raised : 255;
+	}
+
+	// the cap got in the way, so grow downwards instead
+	if( max - min < steps )
+	{
+		int const lowered = max - steps;
+		min = ( lowered > 0 ) ? lowered : 0;
+	}
+}
+
+// Fit the alpha of each of the 16 pixels to the closest entry of an 8-entry codebook.
+// The chosen entry for pixel i is stored in indices[i]; the return value is the sum
+// of the squared differences between each alpha and its chosen code.
+static int FitCodes( u8 const* rgba, u8 const* codes, u8* indices )
+{
+	// fit each alpha value to the codebook
+	int err = 0;
+	for( int i = 0; i < 16; ++i )
+	{
+		int const value = rgba[4*i + 3];
+
+		// start the search from code 0 instead of an INT_MAX sentinel: this file 
+		// does not include <climits> itself, so the macro was only available when
+		// another header happened to pull it in
+		int least = value - ( int )codes[0];
+		least *= least;
+		int index = 0;
+
+		// a later code only wins with a strictly smaller error, so ties keep
+		// the lowest index
+		for( int j = 1; j < 8; ++j )
+		{
+			// get the squared error from this code
+			int dist = value - ( int )codes[j];
+			dist *= dist;
+			
+			// compare with the best so far
+			if( dist < least )
+			{
+				least = dist;
+				index = j;
+			}
+		}
+		
+		// save this index and accumulate the error
+		indices[i] = ( u8 )index;
+		err += least;
+	}
+	
+	// return the total error
+	return err;
+}
+
+// Write an 8-byte alpha block: the two endpoints followed by 16 indices of 3 bits each.
+static void WriteAlphaBlock( int alpha0, int alpha1, u8 const* indices, void* block )
+{
+	u8* out = reinterpret_cast< u8* >( block );
+	
+	// the endpoints take the first two bytes
+	out[0] = ( u8 )alpha0;
+	out[1] = ( u8 )alpha1;
+	
+	// the indices follow as two groups of 8 indices, 24 bits per group
+	for( int group = 0; group < 2; ++group )
+	{
+		u8 const* in = indices + 8*group;
+
+		// gather the group into one word, first index in the lowest 3 bits
+		int packed = 0;
+		for( int k = 7; k >= 0; --k )
+			packed = ( packed << 3 ) | in[k];
+			
+		// emit the word as 3 bytes, least significant byte first
+		u8* dest = out + 2 + 3*group;
+		dest[0] = ( u8 )( packed & 0xff );
+		dest[1] = ( u8 )( ( packed >> 8 ) & 0xff );
+		dest[2] = ( u8 )( ( packed >> 16 ) & 0xff );
+	}
+}
+
+// Write a block for the 5-alpha codebook. The decoder picks that codebook when the
+// first endpoint is <= the second, so the endpoints are exchanged if necessary and
+// the indices are renumbered to keep selecting the same values.
+static void WriteAlphaBlock5( int alpha0, int alpha1, u8 const* indices, void* block )
+{
+	// endpoints already in the required order: write the block as it is
+	if( alpha0 <= alpha1 )
+	{
+		WriteAlphaBlock( alpha0, alpha1, indices, block );
+		return;
+	}
+
+	// renumber the indices for the exchanged endpoints
+	u8 remapped[16];
+	for( int i = 0; i < 16; ++i )
+	{
+		u8 const index = indices[i];
+		switch( index )
+		{
+		// the two endpoints trade places
+		case 0:
+			remapped[i] = 1;
+			break;
+		case 1:
+			remapped[i] = 0;
+			break;
+
+		// the four interpolated values run in the opposite direction
+		case 2:
+		case 3:
+		case 4:
+		case 5:
+			remapped[i] = ( u8 )( 7 - index );
+			break;
+
+		// everything else (the fixed 0 and 255 codes) is unaffected
+		default:
+			remapped[i] = index;
+			break;
+		}
+	}
+	
+	// write the block with the endpoints exchanged
+	WriteAlphaBlock( alpha1, alpha0, remapped, block );
+}
+
+// Write a block for the 7-alpha codebook. The decoder picks that codebook when the
+// first endpoint is greater than the second, so the endpoints are exchanged when the
+// first is smaller and the indices are renumbered to keep selecting the same values.
+static void WriteAlphaBlock7( int alpha0, int alpha1, u8 const* indices, void* block )
+{
+	// first endpoint not smaller than the second: write the block as it is
+	if( !( alpha0 < alpha1 ) )
+	{
+		WriteAlphaBlock( alpha0, alpha1, indices, block );
+		return;
+	}
+
+	// renumber the indices for the exchanged endpoints
+	u8 remapped[16];
+	for( int i = 0; i < 16; ++i )
+	{
+		u8 const index = indices[i];
+		switch( index )
+		{
+		// the two endpoints trade places
+		case 0:
+			remapped[i] = 1;
+			break;
+		case 1:
+			remapped[i] = 0;
+			break;
+
+		// the interpolated values run in the opposite direction
+		default:
+			remapped[i] = ( u8 )( 9 - index );
+			break;
+		}
+	}
+	
+	// write the block with the endpoints exchanged
+	WriteAlphaBlock( alpha1, alpha0, remapped, block );
+}
+
+// Compress the alpha channel of 16 RGBA pixels into an 8-byte block. Both the 5-alpha
+// and the 7-alpha codebook are tried and the one with the smaller total squared error
+// is written; the 5-alpha codebook wins a tie.
+void CompressAlphaDxt5( u8 const* rgba, void* block )
+{
+	// find the alpha range for each codebook: the 7-alpha range covers every
+	// value, the 5-alpha range leaves out 0 and 255 because that codebook has 
+	// fixed entries for them
+	int lo5 = 255;
+	int hi5 = 0;
+	int lo7 = 255;
+	int hi7 = 0;
+	for( int pixel = 0; pixel < 16; ++pixel )
+	{
+		int const alpha = rgba[4*pixel + 3];
+
+		if( alpha < lo7 )
+			lo7 = alpha;
+		if( alpha > hi7 )
+			hi7 = alpha;
+
+		if( alpha != 0 && alpha < lo5 )
+			lo5 = alpha;
+		if( alpha != 255 && alpha > hi5 )
+			hi5 = alpha;
+	}
+	
+	// an inverted range means that no value qualified: collapse it onto its maximum
+	if( lo5 > hi5 )
+		lo5 = hi5;
+	if( lo7 > hi7 )
+		lo7 = hi7;
+		
+	// make each range at least as wide as its number of steps
+	FixRange( lo5, hi5, 5 );
+	FixRange( lo7, hi7, 7 );
+	
+	// 5-alpha codebook: two endpoints, four interpolated values, then 0 and 255
+	u8 book5[8];
+	book5[0] = ( u8 )lo5;
+	book5[1] = ( u8 )hi5;
+	for( int k = 2; k < 6; ++k )
+		book5[k] = ( u8 )( ( ( 6 - k )*lo5 + ( k - 1 )*hi5 )/5 );
+	book5[6] = 0;
+	book5[7] = 255;
+	
+	// 7-alpha codebook: two endpoints and six interpolated values
+	u8 book7[8];
+	book7[0] = ( u8 )lo7;
+	book7[1] = ( u8 )hi7;
+	for( int k = 2; k < 8; ++k )
+		book7[k] = ( u8 )( ( ( 8 - k )*lo7 + ( k - 1 )*hi7 )/7 );
+		
+	// fit the pixels to both codebooks
+	u8 fit5[16];
+	u8 fit7[16];
+	int const error5 = FitCodes( rgba, book5, fit5 );
+	int const error7 = FitCodes( rgba, book7, fit7 );
+	
+	// keep the 7-alpha result only when it is strictly better
+	if( error7 < error5 )
+		WriteAlphaBlock7( lo7, hi7, fit7, block );
+	else
+		WriteAlphaBlock5( lo5, hi5, fit5, block );
+}
+
+// Decode an 8-byte alpha block into the alpha channel of 16 RGBA pixels.
+// Only the alpha bytes of rgba are written.
+void DecompressAlphaDxt5( u8* rgba, void const* block )
+{
+	// the block starts with the two endpoints
+	u8 const* in = reinterpret_cast< u8 const* >( block );
+	int const first = in[0];
+	int const second = in[1];
+	
+	// the order of the endpoints selects the codebook
+	u8 book[8];
+	book[0] = ( u8 )first;
+	book[1] = ( u8 )second;
+	if( first > second )
+	{
+		// 7-alpha codebook: six interpolated values
+		for( int k = 2; k < 8; ++k )
+			book[k] = ( u8 )( ( ( 8 - k )*first + ( k - 1 )*second )/7 );
+	}
+	else
+	{
+		// 5-alpha codebook: four interpolated values, then 0 and 255
+		for( int k = 2; k < 6; ++k )
+			book[k] = ( u8 )( ( ( 6 - k )*first + ( k - 1 )*second )/5 );
+		book[6] = 0;
+		book[7] = 255;
+	}
+	
+	// read the two 24-bit index words, least significant byte first; the whole
+	// block is read before any output is written
+	int words[2];
+	for( int group = 0; group < 2; ++group )
+	{
+		u8 const* src = in + 2 + 3*group;
+		words[group] = src[0] | ( src[1] << 8 ) | ( src[2] << 16 );
+	}
+	
+	// each word holds 8 indices of 3 bits, first pixel in the lowest bits
+	for( int group = 0; group < 2; ++group )
+	{
+		u8* pixels = rgba + 32*group;
+		for( int k = 0; k < 8; ++k )
+			pixels[4*k + 3] = book[( words[group] >> 3*k ) & 0x7];
+	}
+}
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/alpha.h
+++ b/src/nvimage/nvtt/squish/alpha.h
@ -0,0 +1,41 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_ALPHA_H
+#define SQUISH_ALPHA_H
+
+#include <squish.h>
+
+namespace squish {
+
+// Both compressors read the alpha byte (offset 3) of 16 consecutive 4-byte pixels
+// from rgba and write 8 bytes to block.
+void CompressAlphaDxt3( u8 const* rgba, void* block );
+void CompressAlphaDxt5( u8 const* rgba, void* block );
+
+// Both decompressors read 8 bytes from block and write only the alpha byte of
+// 16 consecutive 4-byte pixels in rgba; the other channels are left untouched.
+void DecompressAlphaDxt3( u8* rgba, void const* block );
+void DecompressAlphaDxt5( u8* rgba, void const* block );
+
+} // namespace squish
+
+#endif // ndef SQUISH_ALPHA_H
--- a/src/nvimage/nvtt/squish/clusterfit.cpp
+++ b/src/nvimage/nvtt/squish/clusterfit.cpp
@ -0,0 +1,499 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "clusterfit.h"
+#include "colourset.h"
+#include "colourblock.h"
+#include <cfloat>
+
+namespace squish {
+
+// Prepare a cluster fit for the given colour set.
+//
+// The points are ordered by their projection onto the principal axis of the
+// weighted covariance matrix, and the weights plus the weighted and unweighted
+// points are cached in that order for Compress3/Compress4. The running best
+// error starts at FLT_MAX and the error metric starts out uniform.
+ClusterFit::ClusterFit( ColourSet const* colours, int flags ) 
+  : ColourFit( colours, flags )
+{
+	// initialise the best error
+#if SQUISH_USE_SIMD
+	m_besterror = VEC4_CONST( FLT_MAX );
+#else
+	m_besterror = FLT_MAX;
+#endif
+
+	// initialise the metric to uniform weights so that SolveLeastSquares never
+	// reads an indeterminate m_metric when the caller does not call setMetric().
+	// Other weightings (e.g. the perceptual 0.2126, 0.7152, 0.0722) are
+	// installed through setMetric().
+#if SQUISH_USE_SIMD
+	m_metric = VEC4_CONST( 1.0f );
+#else
+	m_metric = Vec3( 1.0f );
+#endif
+
+	// cache some values
+	int const count = m_colours->GetCount();
+	Vec3 const* values = m_colours->GetPoints();
+	
+	// get the covariance matrix
+	Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights() );
+	
+	// compute the principle component
+	Vec3 principle = ComputePrincipleComponent( covariance );
+
+	// build the list of values: dps[i] is the projection of point i onto the
+	// principal axis, m_order[i] starts out as the identity permutation
+	float dps[16];
+	for( int i = 0; i < count; ++i )
+	{
+		dps[i] = Dot( values[i], principle );
+		m_order[i] = i;
+	}
+	
+	// stable sort (insertion sort on the projections; the strict comparison
+	// keeps equal projections in their original relative order)
+	for( int i = 0; i < count; ++i )
+	{
+		for( int j = i; j > 0 && dps[j] < dps[j - 1]; --j )
+		{
+			std::swap( dps[j], dps[j - 1] );
+			std::swap( m_order[j], m_order[j - 1] );
+		}
+	}
+
+	// weight all the points
+#if SQUISH_USE_SIMD
+	Vec4 const* unweighted = m_colours->GetPointsSimd();
+	Vec4 const* weights = m_colours->GetWeightsSimd();
+	m_xxsum = VEC4_CONST( 0.0f );
+#else
+	Vec3 const* unweighted = m_colours->GetPoints();
+	float const* weights = m_colours->GetWeights();
+	m_xxsum = Vec3( 0.0f );
+#endif
+	for( int i = 0; i < count; ++i )
+	{
+		// slot i of every cache holds the i-th point in sorted order
+		int p = m_order[i];
+		m_unweighted[i] = unweighted[p];
+		m_weights[i] = weights[p];
+		m_weighted[i] = weights[p]*unweighted[p];
+
+		// per-channel sum of squared weighted points, the constant term of the
+		// least squares error
+		m_xxsum += m_weighted[i]*m_weighted[i];
+	}
+}
+
+
+// Install the per-channel weights that SolveLeastSquares applies to the
+// error of each candidate fit.
+void ClusterFit::setMetric(float r, float g, float b)
+{
+#if SQUISH_USE_SIMD
+	// the fourth lane is zeroed; only x, y and z enter the error sum
+	Vec4 const channelWeights( r, g, b, 0 );
+#else
+	Vec3 const channelWeights( r, g, b );
+#endif
+	m_metric = channelWeights;
+}
+
+float ClusterFit::bestError() const
+{
+#if SQUISH_USE_SIMD
+	return m_besterror.GetVec3().X();
+#else
+	return m_besterror;
+#endif
+}
+
+
+// Search the 3-colour clusterings of the ordered points and write the best
+// block found, provided it improves on m_besterror.
+//
+// The ordered points are split into three consecutive runs: [0,i) pinned to
+// the start endpoint (index 0), [i,j) at the midpoint (index 2) and [j,count)
+// pinned to the end endpoint (index 1). m_alpha[m]/m_beta[m] hold the weighted
+// contribution of point m to the start/end endpoint. They, and indices[], are
+// updated incrementally as the loops count down, so the statement order below
+// is significant.
+void ClusterFit::Compress3( void* block )
+{
+	// declare variables
+	int const count = m_colours->GetCount();
+#if SQUISH_USE_SIMD
+	Vec4 beststart = VEC4_CONST( 0.0f );
+	Vec4 bestend = VEC4_CONST( 0.0f );
+	Vec4 besterror = VEC4_CONST( FLT_MAX );
+	Vec4 const half = VEC4_CONST( 0.5f );
+	Vec4 const zero = VEC4_CONST( 0.0f );
+#else
+	Vec3 beststart( 0.0f );
+	Vec3 bestend( 0.0f );
+	float besterror = FLT_MAX;
+	float const half = 0.5f;
+	float const zero = 0.0f;
+#endif	
+
+	// check all possible clusters for this total order
+	u8 indices[16];
+	u8 bestindices[16];
+	
+	// first cluster [0,i) is at the start
+	for( int m = 0; m < count; ++m )
+	{
+		indices[m] = 0;
+		m_alpha[m] = m_weights[m];
+		m_beta[m] = zero;
+	}
+	for( int i = count; i >= 0; --i )
+	{
+		// second cluster [i,j) is half along
+		// (resets everything from i onwards, undoing the end-cluster
+		// assignments made by the previous pass of the j loop)
+		for( int m = i; m < count; ++m )
+		{
+			indices[m] = 2;
+			m_alpha[m] = m_beta[m] = half*m_weights[m];
+		}		
+		for( int j = count; j > i; --j )
+		{
+			// last cluster [j,k) is at the end
+			// (only point j changes per step; points above j were moved on
+			// earlier iterations of this loop)
+			if( j < count )
+			{
+				indices[j] = 1;
+				m_alpha[j] = zero;
+				m_beta[j] = m_weights[j];
+			}		
+			
+			// solve a least squares problem to place the endpoints
+#if SQUISH_USE_SIMD
+			Vec4 start, end;
+			Vec4 error = SolveLeastSquares( start, end );
+#else
+			Vec3 start, end;
+			float error = SolveLeastSquares( start, end );
+#endif
+
+			// keep the solution if it wins
+#if SQUISH_USE_SIMD
+			if( CompareAnyLessThan( error, besterror ) )
+#else
+			if( error < besterror )
+#endif
+			{
+				beststart = start;
+				bestend = end;
+				// copies all 16 slots, including any beyond count
+				for( int m = 0; m < 16; ++m )	// TODO: make this faster?
+					bestindices[m] = indices[m];
+				besterror = error;
+			}
+		}
+	}
+	
+	// save the block if necessary
+#if SQUISH_USE_SIMD
+	if( CompareAnyLessThan( besterror, m_besterror ) )
+#else
+	if( besterror < m_besterror )
+#endif
+	{
+		// remap the indices
+		// (first from sorted order back to colour-set order via m_order, then
+		// from colour-set order to the 16 block pixels via RemapIndices)
+		u8 unordered[16];
+		for( int i = 0; i < count; ++i )
+			unordered[m_order[i]] = bestindices[i];
+		m_colours->RemapIndices( unordered, bestindices );
+		
+		// save the block
+#if SQUISH_USE_SIMD
+		WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+#else
+		WriteColourBlock3( beststart, bestend, bestindices, block );
+#endif
+
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+//static int run = 0;
+//static bool debug = false;
+
+// Search the 4-colour clusterings of the ordered points and write the best
+// block found, provided it improves on m_besterror.
+//
+// The ordered points are split into four consecutive runs: [0,i) pinned to the
+// start endpoint (index 0), [i,j) one third along (index 2), [j,k) two thirds
+// along (index 3) and [k,count) pinned to the end endpoint (index 1).
+// m_alpha/m_beta and indices[] are updated incrementally as the loops count
+// down, so the statement order below is significant. The search starts from
+// m_besterror, so only candidates that beat the error already recorded are
+// kept.
+void ClusterFit::Compress4( void* block )
+{
+	//debug = (run == 1);
+	//run++;
+
+	// declare variables
+	int const count = m_colours->GetCount();
+#if SQUISH_USE_SIMD
+	Vec4 beststart = VEC4_CONST( 0.0f );
+	Vec4 bestend = VEC4_CONST( 0.0f );
+	Vec4 besterror = m_besterror;
+	Vec4 const twothirds = VEC4_CONST( 2.0f/3.0f );
+	Vec4 const onethird = VEC4_CONST( 1.0f/3.0f );
+	Vec4 const zero = VEC4_CONST( 0.0f );
+#else
+	Vec3 beststart( 0.0f );
+	Vec3 bestend( 0.0f );
+	float besterror = m_besterror;
+	float const twothirds = 2.0f/3.0f;
+	float const onethird = 1.0f/3.0f;
+	float const zero = 0.0f;
+#endif
+
+	// check all possible clusters for this total order
+	u8 indices[16];
+	u8 bestindices[16];
+	
+	// first cluster [0,i) is at the start
+	for( int m = 0; m < count; ++m )
+	{
+		indices[m] = 0;
+		m_alpha[m] = m_weights[m];
+		m_beta[m] = zero;
+	}
+	for( int i = count; i >= 0; --i )
+	{
+		// second cluster [i,j) is one third along
+		for( int m = i; m < count; ++m )
+		{
+			indices[m] = 2;
+			m_alpha[m] = twothirds*m_weights[m];
+			m_beta[m] = onethird*m_weights[m];
+		}		
+		for( int j = count; j >= i; --j )
+		{
+			// third cluster [j,k) is two thirds along
+			for( int m = j; m < count; ++m )
+			{
+				indices[m] = 3;
+				m_alpha[m] = onethird*m_weights[m];
+				m_beta[m] = twothirds*m_weights[m];
+			}		
+			for( int k = count; k >= j; --k )
+			{
+				// j == k == 0 (which forces i == 0) would put every point in
+				// the last cluster. NOTE(review): the reason for skipping this
+				// case is not stated in the code.
+				if (j + k == 0) continue;
+				
+				// last cluster [k,n) is at the end
+				if( k < count )
+				{
+					indices[k] = 1;
+					m_alpha[k] = zero;
+					m_beta[k] = m_weights[k];
+				}
+
+				/*unsigned int permutation = 0;
+				for(int p = 0; p < 16; p++) {
+					permutation |= indices[p] << (p * 2);
+				}
+				if (debug) printf("%X:\t", permutation);
+
+				if (debug && permutation == 0x55FFFFAA) __debugbreak();
+				*/
+
+				// solve a least squares problem to place the endpoints
+#if SQUISH_USE_SIMD
+				Vec4 start, end;
+				Vec4 error = SolveLeastSquares( start, end );
+#else
+				Vec3 start, end;
+				float error = SolveLeastSquares( start, end );
+#endif
+
+				// keep the solution if it wins
+#if SQUISH_USE_SIMD
+				if( CompareAnyLessThan( error, besterror ) )
+#else
+				if( error < besterror )
+#endif
+				{
+					beststart = start;
+					bestend = end;
+					// copies all 16 slots, including any beyond count
+					for( int m = 0; m < 16; ++m )	// TODO: make this faster?
+						bestindices[m] = indices[m];	
+					besterror = error;
+				}
+			}
+		}
+	}
+
+	// save the block if necessary
+#if SQUISH_USE_SIMD
+	if( CompareAnyLessThan( besterror, m_besterror ) )
+#else
+	if( besterror < m_besterror )
+#endif
+	{
+		// remap the indices
+		// (first from sorted order back to colour-set order via m_order, then
+		// from colour-set order to the 16 block pixels via RemapIndices)
+		u8 unordered[16];
+		for( int i = 0; i < count; ++i )
+			unordered[m_order[i]] = bestindices[i];
+		m_colours->RemapIndices( unordered, bestindices );
+		
+		// save the block
+#if SQUISH_USE_SIMD
+		WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+#else
+		WriteColourBlock4( beststart, bestend, bestindices, block );
+#endif
+
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+#if SQUISH_USE_SIMD
+// SIMD least squares solve for the current clustering.
+//
+// Finds the endpoints a, b that minimise, per channel, the sum over points of
+// ( alpha_i*a + beta_i*b - x_i )^2 where x_i is the weighted point and
+// alpha_i/beta_i come from m_alpha/m_beta. The endpoints are clamped to
+// [0, 1] and snapped to the 5:6:5 grid before the error is evaluated.
+//
+// start, end: receive the snapped endpoints.
+// Returns the metric-weighted error, summed over x, y and z.
+Vec4 ClusterFit::SolveLeastSquares( Vec4& start, Vec4& end ) const
+{
+	// accumulate all the quantities we need
+	int const count = m_colours->GetCount();
+	Vec4 alpha2_sum = VEC4_CONST( 0.0f );
+	Vec4 beta2_sum = VEC4_CONST( 0.0f );
+	Vec4 alphabeta_sum = VEC4_CONST( 0.0f );
+	Vec4 alphax_sum = VEC4_CONST( 0.0f );
+	Vec4 betax_sum = VEC4_CONST( 0.0f );
+	for( int i = 0; i < count; ++i )
+	{
+		Vec4 alpha = m_alpha[i];
+		Vec4 beta = m_beta[i];
+		Vec4 x = m_weighted[i];
+	
+		alpha2_sum = MultiplyAdd( alpha, alpha, alpha2_sum );
+		beta2_sum = MultiplyAdd( beta, beta, beta2_sum );
+		alphabeta_sum = MultiplyAdd( alpha, beta, alphabeta_sum );
+		alphax_sum = MultiplyAdd( alpha, x, alphax_sum );
+		betax_sum = MultiplyAdd( beta, x, betax_sum );	
+	}
+
+	// select the results: all three candidate solutions are computed without
+	// branching and the masks pick the one that applies
+	Vec4 const zero = VEC4_CONST( 0.0f );
+	Vec4 beta2_sum_zero = CompareEqual( beta2_sum, zero );
+	Vec4 alpha2_sum_zero = CompareEqual( alpha2_sum, zero );
+	
+	// single-endpoint solutions, for when the other endpoint has no points
+	Vec4 a1 = alphax_sum*Reciprocal( alpha2_sum );
+	Vec4 b1 = betax_sum*Reciprocal( beta2_sum );
+	
+	// general 2x2 solution; NegativeMultiplySubtract( p, q, r ) is presumably
+	// r - p*q, matching the scalar build of this function -- TODO confirm
+	Vec4 factor = Reciprocal( NegativeMultiplySubtract( 
+		alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum 
+	) );
+	Vec4 a2 = NegativeMultiplySubtract( 
+		betax_sum, alphabeta_sum, alphax_sum*beta2_sum
+	)*factor;
+	Vec4 b2 = NegativeMultiplySubtract(
+		alphax_sum, alphabeta_sum, betax_sum*alpha2_sum
+	)*factor;
+	
+	Vec4 a = Select( Select( a2, a1, beta2_sum_zero ), zero, alpha2_sum_zero );
+	Vec4 b = Select( Select( b2, b1, alpha2_sum_zero ), zero, beta2_sum_zero );
+
+	// clamp the output to [0, 1]
+	Vec4 const one = VEC4_CONST( 1.0f );
+	Vec4 const half = VEC4_CONST( 0.5f );
+	a = Min( one, Max( zero, a ) );
+	b = Min( one, Max( zero, b ) );
+
+	// clamp to the grid
+	Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+//	Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
+	Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f ); // IC: use approximate grid fitting.
+	a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp;
+	b = Truncate( MultiplyAdd( grid, b, half ) )*gridrcp;
+
+	// compute the error:
+	// a^2*sum(alpha^2) + b^2*sum(beta^2) + sum(x^2)
+	//   + 2*( a*b*sum(alpha*beta) - a*sum(alpha*x) - b*sum(beta*x) )
+	Vec4 const two = VEC4_CONST( 2.0f );
+	Vec4 e1 = MultiplyAdd( b*b, beta2_sum, m_xxsum );
+	Vec4 e2 = MultiplyAdd( a, alphax_sum, b*betax_sum );
+	Vec4 e3 = MultiplyAdd( a*a, alpha2_sum, e1 );
+	Vec4 e4 = MultiplyAdd( a*b*alphabeta_sum - e2, two, e3 );
+
+	// apply the metric to the error term
+	Vec4 e5 = e4*m_metric;
+	Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();
+	
+	// save the start and end
+	start = a;
+	end = b;
+	return error;
+}
+#else
+// Scalar least squares solve for the current clustering.
+//
+// Finds the endpoints a, b that minimise, per channel, the sum over points of
+// ( alpha_i*a + beta_i*b - x_i )^2 where x_i is the weighted point and
+// alpha_i/beta_i come from m_alpha/m_beta. The endpoints are clamped to
+// [0, 1] and snapped to the 5:6:5 grid before the error is evaluated.
+//
+// start, end: receive the snapped endpoints.
+// Returns the metric-weighted error. The constant m_xxsum term is left out
+// (it is commented out below), so the value is only meaningful relative to
+// other values from this function and may be negative.
+float ClusterFit::SolveLeastSquares( Vec3& start, Vec3& end ) const
+{
+	// accumulate all the quantities we need
+	int const count = m_colours->GetCount();
+	float alpha2_sum = 0.0f;
+	float beta2_sum = 0.0f;
+	float alphabeta_sum = 0.0f;
+	Vec3 alphax_sum( 0.0f );
+	Vec3 betax_sum( 0.0f );	
+	for( int i = 0; i < count; ++i )
+	{
+		float alpha = m_alpha[i];
+		float beta = m_beta[i];
+		Vec3 const& x = m_weighted[i];
+		
+		alpha2_sum += alpha*alpha;
+		beta2_sum += beta*beta;
+		alphabeta_sum += alpha*beta;
+		alphax_sum += alpha*x;
+		betax_sum += beta*x;
+	}
+
+	//if (debug) printf("%f %f %f", alpha2_sum, beta2_sum, alphabeta_sum);
+
+	// zero where non-determinate
+	// NOTE(review): if alpha2_sum and beta2_sum are both zero the first branch
+	// divides by zero, and the general branch divides by a determinant that is
+	// zero when the alpha and beta vectors are proportional; neither case is
+	// guarded here.
+	Vec3 a, b;
+	if( beta2_sum == 0.0f )
+	{
+		a = alphax_sum/alpha2_sum;
+		b = Vec3( 0.0f );
+	}
+	else if( alpha2_sum == 0.0f )
+	{
+		a = Vec3( 0.0f );
+		b = betax_sum/beta2_sum;
+	}
+	else
+	{
+		// solve the 2x2 normal equations by Cramer's rule
+		float factor = 1.0f/( alpha2_sum*beta2_sum - alphabeta_sum*alphabeta_sum );
+		
+		a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor;
+		b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
+	}
+	
+	// clamp the output to [0, 1]
+	Vec3 const one( 1.0f );
+	Vec3 const zero( 0.0f );
+	a = Min( one, Max( zero, a ) );
+	b = Min( one, Max( zero, b ) );
+
+	// clamp to the grid
+	// (the reciprocals in use differ slightly from the exact 1/31 and 1/63
+	// kept in the commented-out line)
+	Vec3 const grid( 31.0f, 63.0f, 31.0f );
+	//Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
+	Vec3 const gridrcp(0.03227752766457f, 0.01583151765563f, 0.03227752766457f); // IC: use approximate grid fitting.
+	Vec3 const half( 0.5f );
+	a = Floor( grid*a + half )*gridrcp;
+	b = Floor( grid*b + half )*gridrcp;
+
+	// compute the error
+	Vec3 e1 = a*a*alpha2_sum + b*b*beta2_sum /*+ m_xxsum*/
+		+ 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+
+	// apply the metric to the error term
+	float error = Dot( e1, m_metric );
+	
+	//if (debug) printf(" - %f\n", error);
+
+	// save the start and end
+	start = a;
+	end = b;
+	return error;
+}
+#endif
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/clusterfit.h
+++ b/src/nvimage/nvtt/squish/clusterfit.h
@ -0,0 +1,79 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_CLUSTERFIT_H
+#define SQUISH_CLUSTERFIT_H
+
+#include "squish.h"
+#include "maths.h"
+#include "simd.h"
+#include "colourfit.h"
+
+namespace squish {
+
+// Colour fitter that orders the block's points along their principal axis and
+// searches clusterings of that ordering, solving a least squares problem for
+// the endpoints of each candidate (see Compress3/Compress4).
+class ClusterFit : public ColourFit
+{
+public:
+	ClusterFit( ColourSet const* colours, int flags );
+	
+	// Set the per-channel weights applied to the fit error.
+	void setMetric(float r, float g, float b);
+
+	// Best error recorded so far; initialised from FLT_MAX by the constructor
+	// and updated whenever Compress3/Compress4 write a block.
+	float bestError() const;
+
+private:
+	virtual void Compress3( void* block );
+	virtual void Compress4( void* block );
+
+#if SQUISH_USE_SIMD
+	Vec4 SolveLeastSquares( Vec4& start, Vec4& end ) const;
+
+	Vec4 m_weighted[16];	// weight*point, in sorted order
+	Vec4 m_unweighted[16];	// points, in sorted order
+	Vec4 m_weights[16];	// weights, in sorted order
+	Vec4 m_metric;		// per-channel error weights
+	Vec4 m_alpha[16];	// contribution of each point to the start endpoint
+	Vec4 m_beta[16];	// contribution of each point to the end endpoint
+	Vec4 m_xxsum;		// sum of squared weighted points
+	Vec4 m_besterror;	// best error recorded so far
+#else
+	float SolveLeastSquares( Vec3& start, Vec3& end ) const;
+
+	Vec3 m_weighted[16];	// weight*point, in sorted order
+	Vec3 m_unweighted[16];	// points, in sorted order
+	float m_weights[16];	// weights, in sorted order
+	Vec3 m_metric;		// per-channel error weights
+	float m_alpha[16];	// contribution of each point to the start endpoint
+	float m_beta[16];	// contribution of each point to the end endpoint
+	Vec3 m_xxsum;		// sum of squared weighted points
+	float m_besterror;	// best error recorded so far
+#endif
+	int m_order[16];	// m_order[i] is the colour-set index of the i-th sorted point
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_CLUSTERFIT_H
--- a/src/nvimage/nvtt/squish/colourblock.cpp
+++ b/src/nvimage/nvtt/squish/colourblock.cpp
@ -0,0 +1,278 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "colourblock.h"
+
+namespace squish {
+
+// Round a to the nearest integer and clamp the result to [0, limit].
+static int FloatToInt( float a, int limit )
+{
+	// adding a half before the truncating conversion rounds to nearest
+	int const rounded = static_cast< int >( a + 0.5f );
+
+	// anything that landed below zero is pinned to zero first
+	if( rounded < 0 )
+		return 0;
+
+	// otherwise cap at the limit
+	return ( rounded > limit ) ? limit : rounded;
+}
+
+// Quantise a colour to a packed 16-bit 5:6:5 value: X goes to the top five
+// bits, Y to the middle six and Z to the bottom five.
+static int FloatTo565( Vec3::Arg colour )
+{
+	// scale each component to its field's range and round
+	int const top = FloatToInt( 31.0f*colour.X(), 31 );
+	int const middle = FloatToInt( 63.0f*colour.Y(), 63 );
+	int const bottom = FloatToInt( 31.0f*colour.Z(), 31 );
+
+	// assemble the fields
+	return ( top << 11 ) | ( middle << 5 ) | bottom;
+}
+
+// Serialise an 8-byte colour block: the two packed endpoints as little-endian
+// 16-bit values, then sixteen 2-bit indices packed four to a byte with the
+// first index of each group in the low bits.
+static void WriteColourBlock( int a, int b, u8* indices, void* block )
+{
+	u8* out = ( u8* )block;
+
+	// endpoints, low byte first
+	out[0] = ( u8 )( a & 0xff );
+	out[1] = ( u8 )( a >> 8 );
+	out[2] = ( u8 )( b & 0xff );
+	out[3] = ( u8 )( b >> 8 );
+
+	// indices, one output byte per group of four
+	u8 const* group = indices;
+	for( int row = 0; row < 4; ++row, group += 4 )
+		out[4 + row] = group[0] | ( group[1] << 2 ) | ( group[2] << 4 ) | ( group[3] << 6 );
+}
+
+// Write a 3-colour block. The packed endpoints are stored with a <= b; when
+// that requires exchanging them, index codes 0 and 1 are exchanged too so
+// every pixel still refers to the same colour.
+void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block )
+{
+	// quantise the endpoints
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
+
+	// decide whether the endpoints need exchanging
+	bool const flip = ( a > b );
+	if( flip )
+		std::swap( a, b );
+
+	// build the index table that matches the stored endpoint order
+	u8 remapped[16];
+	for( int i = 0; i < 16; ++i )
+	{
+		u8 const index = indices[i];
+		if( flip && index == 0 )
+			remapped[i] = 1;
+		else if( flip && index == 1 )
+			remapped[i] = 0;
+		else
+			remapped[i] = index;
+	}
+
+	// emit the block
+	WriteColourBlock( a, b, remapped, block );
+}
+
+// Write a 4-colour block. The packed endpoints are stored with a > b; when
+// that requires exchanging them the low bit of every index is flipped
+// (0<->1, 2<->3). If the endpoints quantise to the same value every index
+// is written as 0.
+void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block )
+{
+	// quantise the endpoints
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
+
+	// classify before any exchange takes place
+	bool const degenerate = ( a == b );
+	bool const flip = ( a < b );
+	if( flip )
+		std::swap( a, b );
+
+	// build the index table that matches the stored endpoint order
+	u8 remapped[16];
+	for( int i = 0; i < 16; ++i )
+	{
+		if( degenerate )
+			remapped[i] = 0;
+		else if( flip )
+			remapped[i] = ( indices[i] ^ 0x1 ) & 0x3;
+		else
+			remapped[i] = indices[i];
+	}
+
+	// emit the block
+	WriteColourBlock( a, b, remapped, block );
+}
+
+/*
+static void WriteColourBlock( int a, int b, uint indices, void* block )
+{
+	// get the block as bytes
+	u8* bytes = ( u8* )block;
+
+	// write the endpoints
+	bytes[0] = ( u8 )( a & 0xff );
+	bytes[1] = ( u8 )( a >> 8 );
+	bytes[2] = ( u8 )( b & 0xff );
+	bytes[3] = ( u8 )( b >> 8 );
+	
+	// write the indices @@ Not sure that's correct...
+	bytes[4] = ( u8 )((indices >> 24) & 0xff);
+	bytes[5] = ( u8 )((indices >> 16) & 0xff);
+	bytes[6] = ( u8 )((indices >> 8) & 0xff);
+	bytes[7] = ( u8 )((indices >> 0) & 0xff);
+}
+
+void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, uint indices, void* block )
+{
+	// get the packed values
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
+
+	// remap the indices
+	if( a > b )
+	{
+		// swap a and b
+		std::swap( a, b );
+		indices ^= (~indices >> 1) & 0x55555555;
+	}
+	else if ( a == b )
+	{
+		indices = 0;
+	}
+	
+	// write the block
+	WriteColourBlock( a, b, indices, block );
+}
+
+void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, uint indices, void* block )
+{
+	// get the packed values
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
+
+	// remap the indices
+	if( a < b )
+	{
+		// swap a and b
+		std::swap( a, b );
+		indices ^= 0x55555555;
+	}
+	else if( a == b )
+	{
+		indices = 0;
+	}
+	
+	// write the block
+	WriteColourBlock( a, b, indices, block );
+}
+*/
+
+// Decode a little-endian packed 5:6:5 colour into four bytes (red, green,
+// blue, 255) and return the 16-bit packed value.
+static int Unpack565( u8 const* packed, u8* colour )
+{
+	// reassemble the 16-bit value, low byte first
+	int const value = ( int )packed[0] | ( ( int )packed[1] << 8 );
+
+	// pull out the three fields
+	int const r5 = ( value >> 11 ) & 0x1f;
+	int const g6 = ( value >> 5 ) & 0x3f;
+	int const b5 = value & 0x1f;
+
+	// widen to 8 bits by replicating the top bits into the low bits
+	colour[0] = ( u8 )( ( r5 << 3 ) | ( r5 >> 2 ) );
+	colour[1] = ( u8 )( ( g6 << 2 ) | ( g6 >> 4 ) );
+	colour[2] = ( u8 )( ( b5 << 3 ) | ( b5 >> 2 ) );
+	colour[3] = 255;
+
+	return value;
+}
+
+// Decode an 8-byte colour block into 16 four-byte pixels.
+//
+// With isDxt1 set and the first packed endpoint <= the second, the block uses
+// the 3-colour mode: code 2 is the midpoint of the endpoints and code 3 is
+// all zeros (zero alpha). Otherwise codes 2 and 3 sit one third and two
+// thirds of the way between the endpoints and are opaque.
+void DecompressColour( u8* rgba, void const* block, bool isDxt1 )
+{
+	u8 const* bytes = reinterpret_cast< u8 const* >( block );
+
+	// codes[0..3] and codes[4..7] are the two endpoints
+	u8 codes[16];
+	int const a = Unpack565( bytes, codes );
+	int const b = Unpack565( bytes + 2, codes + 4 );
+	bool const threeColour = isDxt1 && a <= b;
+
+	// derive codes 2 and 3 channel by channel
+	for( int channel = 0; channel < 3; ++channel )
+	{
+		int const c = codes[channel];
+		int const d = codes[4 + channel];
+
+		if( threeColour )
+		{
+			codes[8 + channel] = ( u8 )( ( c + d )/2 );
+			codes[12 + channel] = 0;
+		}
+		else
+		{
+			codes[8 + channel] = ( u8 )( ( 2*c + d )/3 );
+			codes[12 + channel] = ( u8 )( ( c + 2*d )/3 );
+		}
+	}
+
+	// alpha of the derived codes
+	codes[8 + 3] = 255;
+	codes[12 + 3] = threeColour ? 0 : 255;
+
+	// expand the 2-bit indices: four per byte, first pixel in the low bits
+	u8 indices[16];
+	for( int i = 0; i < 16; ++i )
+		indices[i] = ( bytes[4 + ( i >> 2 )] >> ( 2*( i & 3 ) ) ) & 0x3;
+
+	// look each pixel up in the code table
+	for( int i = 0; i < 16; ++i )
+	{
+		u8 const* code = codes + 4*indices[i];
+		for( int j = 0; j < 4; ++j )
+			rgba[4*i + j] = code[j];
+	}
+}
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/colourblock.h
+++ b/src/nvimage/nvtt/squish/colourblock.h
@ -0,0 +1,43 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_COLOURBLOCK_H
+#define SQUISH_COLOURBLOCK_H
+
+#include "squish.h"
+#include "maths.h"
+
+namespace squish {
+
+// Quantise the endpoints to 5:6:5 and write an 8-byte colour block from them
+// and 16 per-pixel index codes. WriteColourBlock3 stores the packed endpoints
+// with first <= second, WriteColourBlock4 with first > second (or equal, with
+// all indices written as 0); both rewrite the indices when the endpoints have
+// to be exchanged.
+void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block );
+void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block );
+//void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, uint indices, void* block );
+//void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, uint indices, void* block );
+
+// Decode an 8-byte colour block into 16 four-byte pixels. isDxt1 enables the
+// 3-colour mode (with a zero-alpha fourth code) when the first packed
+// endpoint is <= the second.
+void DecompressColour( u8* rgba, void const* block, bool isDxt1 );
+
+} // namespace squish
+
+#endif // ndef SQUISH_COLOURBLOCK_H
--- a/src/nvimage/nvtt/squish/colourfit.cpp
+++ b/src/nvimage/nvtt/squish/colourfit.cpp
@ -0,0 +1,54 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "colourfit.h"
+#include "colourset.h"
+
+namespace squish {
+
+// Remember the colour set and the flags for use by Compress; the colour set
+// pointer is stored as given.
+ColourFit::ColourFit( ColourSet const* colours, int flags ) 
+  : m_colours( colours ), m_flags( flags )
+{
+	// nothing else to set up
+}
+
+// Compress the colour set into block, trying each block mode that applies.
+//
+// For DXT1 the 3-colour mode is always tried: it is the only mode in which
+// index 3 decodes as transparent, and ColourSet::RemapIndices assigns index 3
+// to the transparent pixels. The 4-colour mode has no transparent code, so it
+// is only tried when the set has no transparent pixels. Each CompressN is
+// expected to write the block only when it improves on the best error it has
+// recorded, so the lower-error mode ends up in the block.
+void ColourFit::Compress( void* block )
+{
+	bool const isDxt1 = ( ( m_flags & kDxt1 ) != 0 );
+	if( isDxt1 )
+	{
+		Compress3( block );
+		if( !m_colours->IsTransparent() )
+		{
+			Compress4( block );
+		}
+	}
+	else
+	{
+		Compress4( block );
+	}
+}
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/colourfit.h
+++ b/src/nvimage/nvtt/squish/colourfit.h
@ -0,0 +1,53 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_COLOURFIT_H
+#define SQUISH_COLOURFIT_H
+
+#include "squish.h"
+#include "maths.h"
+
+namespace squish {
+
+class ColourSet;
+
+// Abstract base for the colour fitters: holds the colour set and flags and
+// dispatches Compress to the 3-colour and/or 4-colour implementation supplied
+// by the derived class.
+class ColourFit
+{
+public:
+	ColourFit( ColourSet const* colours, int flags );
+
+	// virtual so that a fitter can be destroyed safely through a ColourFit
+	// pointer
+	virtual ~ColourFit() {}
+
+	// Compress the colour set into block using the mode(s) selected by the
+	// flags.
+	void Compress( void* block );
+
+protected:
+	virtual void Compress3( void* block ) = 0;
+	virtual void Compress4( void* block ) = 0;
+
+	ColourSet const* m_colours;	// not owned by this class
+	int m_flags;
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_COLOURFIT_H
--- a/src/nvimage/nvtt/squish/colourset.cpp
+++ b/src/nvimage/nvtt/squish/colourset.cpp
@ -0,0 +1,134 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "colourset.h"
+
+namespace squish {
+
+// Build the set of points to fit from a block of 16 four-byte pixels.
+//
+// With kDxt1 set, pixels whose fourth byte (alpha) is zero are left out of the
+// set, recorded as -1 in m_remap, and flag the set as transparent. Every other
+// pixel becomes one point with components scaled to [0,1]; its weight is
+// ( alpha + 1 )/256 when kWeightColourByAlpha is set and 1 otherwise.
+// m_remap[i] gives the point index for pixel i.
+//
+// NOTE(review): byte 2 of each pixel feeds X and byte 0 feeds Z, so the input
+// appears to be read in B,G,R,A order despite the parameter name -- confirm
+// against the callers.
+ColourSet::ColourSet( u8 const* rgba, int flags )
+  : m_count( 0 ), 
+	m_transparent( false )
+{
+	// check the compression mode for dxt1
+	bool isDxt1 = ( ( flags & kDxt1 ) != 0 );
+	bool weightByAlpha = ( ( flags & kWeightColourByAlpha ) != 0 );
+
+	// create the minimal set
+	for( int i = 0; i < 16; ++i )
+	{
+		// check for transparent pixels when using dxt1
+		if( isDxt1 && rgba[4*i + 3] == 0 )
+		{
+			m_remap[i] = -1;
+			m_transparent = true;
+			continue;
+		}
+		
+		// the active branch adds one point per pixel; the disabled branch
+		// below merges pixels with identical colour bytes into one point and
+		// accumulates their weights
+#if 1
+		// normalise coordinates to [0,1]
+		float x = ( float )rgba[4*i + 2] / 255.0f;
+		float y = ( float )rgba[4*i + 1] / 255.0f;
+		float z = ( float )rgba[4*i + 0] / 255.0f;
+		
+		// ensure there is always non-zero weight even for zero alpha
+		float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f;
+
+		// add the point
+		m_points[m_count] = Vec3( x, y, z );
+		m_weights[m_count] = ( weightByAlpha ? w : 1.0f );
+		m_remap[i] = m_count;
+		
+		// advance
+		++m_count;
+#else
+		// loop over previous points for a match
+		for( int j = 0;; ++j )
+		{
+			// allocate a new point
+			if( j == i )
+			{
+				// normalise coordinates to [0,1]
+				float x = ( float )rgba[4*i + 2] / 255.0f;
+				float y = ( float )rgba[4*i + 1] / 255.0f;
+				float z = ( float )rgba[4*i + 0] / 255.0f;
+				
+				// ensure there is always non-zero weight even for zero alpha
+				float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f;
+
+				// add the point
+				m_points[m_count] = Vec3( x, y, z );
+				m_weights[m_count] = ( weightByAlpha ? w : 1.0f );
+				m_remap[i] = m_count;
+				
+				// advance
+				++m_count;
+				break;
+			}
+		
+			// check for a match
+			bool match = ( rgba[4*i] == rgba[4*j] )
+				&& ( rgba[4*i + 1] == rgba[4*j + 1] )
+				&& ( rgba[4*i + 2] == rgba[4*j + 2] )
+				&& ( rgba[4*j + 3] != 0 || !isDxt1 );
+			if( match )
+			{
+				// get the index of the match
+				int index = m_remap[j];
+				
+				// ensure there is always non-zero weight even for zero alpha
+				float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f;
+
+				// map to this point and increase the weight
+				m_weights[index] += ( weightByAlpha ? w : 1.0f );
+				m_remap[i] = index;
+				break;
+			}
+		}
+#endif
+	}
+	
+#if SQUISH_USE_SIMD
+	// generate vector values
+	// (points get a fourth component of 1; weights are expanded with
+	// VEC4_CONST)
+	for( int i = 0; i < m_count; ++i )
+	{
+		m_points_simd[i] = Vec4(m_points[i].X(), m_points[i].Y(), m_points[i].Z(), 1);
+		m_weights_simd[i] = VEC4_CONST(m_weights[i]);
+	}
+#endif
+}
+
+// Expand per-point index codes (source, indexed by point) into per-pixel
+// index codes (target, 16 entries). Pixels that were left out of the set,
+// marked -1 in m_remap, receive code 3.
+void ColourSet::RemapIndices( u8 const* source, u8* target ) const
+{
+	for( int pixel = 0; pixel < 16; ++pixel )
+	{
+		int const point = m_remap[pixel];
+		if( point != -1 )
+			target[pixel] = source[point];
+		else
+			target[pixel] = 3;
+	}
+}
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/colourset.h
+++ b/src/nvimage/nvtt/squish/colourset.h
@ -0,0 +1,69 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_COLOURSET_H
+#define SQUISH_COLOURSET_H
+
+#include "squish.h"
+#include "maths.h"
+#include "simd.h"
+
+namespace squish {
+
+/*! @brief Represents a set of block colours
+
+	Stores up to 16 points with one weight each, plus a table that maps each
+	of the 16 block pixels to the index of its point (-1 when the pixel has
+	no point in the set).
+*/
+class ColourSet
+{
+public:
+	//! Builds the set from the block's RGBA bytes and the compression flags.
+	ColourSet( u8 const* rgba, int flags );
+
+	//! Number of points currently stored in the set.
+	int GetCount() const { return m_count; }
+	//! Pointer to the point array; the first GetCount() entries are valid.
+	Vec3 const* GetPoints() const { return m_points; }
+	//! Pointer to the weight array, parallel to GetPoints().
+	float const* GetWeights() const { return m_weights; }
+	//! Returns the transparency flag recorded by the constructor.
+	bool IsTransparent() const { return m_transparent; }
+
+	//! Expands per-point indices in source to per-pixel indices in target.
+	void RemapIndices( u8 const* source, u8* target ) const;
+
+private:
+	int m_count;			// number of valid entries in m_points/m_weights
+	Vec3 m_points[16];		// point positions
+	float m_weights[16];	// weight of each point
+	int m_remap[16];		// per-pixel index into m_points, or -1
+	bool m_transparent;		// value reported by IsTransparent()
+
+#if SQUISH_USE_SIMD
+public:
+	//! Vec4 copies of the points, available only in SIMD builds.
+	Vec4 const* GetPointsSimd() const { return m_points_simd; }
+	//! Vec4 copies of the weights, available only in SIMD builds.
+	Vec4 const* GetWeightsSimd() const { return m_weights_simd; }
+	
+private:
+	Vec4 m_points_simd[16];
+	Vec4 m_weights_simd[16];
+#endif
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_COLOURSET_H
--- a/src/nvimage/nvtt/squish/config
+++ b/src/nvimage/nvtt/squish/config
@ -0,0 +1,22 @@
+# config file used for the Makefile only
+# (variables assigned with ?= keep any value already supplied by the caller)
+
+# define to 1 to use altivec instructions
+USE_ALTIVEC ?= 0
+
+# define to 1 to use sse instructions
+USE_SSE ?= 0
+
+# default flags
+CXXFLAGS ?= -O2
+# altivec build: define SQUISH_USE_ALTIVEC=1 and pass -maltivec to the compiler
+ifeq ($(USE_ALTIVEC),1)
+CPPFLAGS += -DSQUISH_USE_ALTIVEC=1
+CXXFLAGS += -maltivec
+endif
+# sse build: define SQUISH_USE_SSE=1 and pass -msse to the compiler
+ifeq ($(USE_SSE),1)
+CPPFLAGS += -DSQUISH_USE_SSE=1
+CXXFLAGS += -msse
+endif
+
+# where should we install to
+INSTALL_DIR ?= /usr/local
+
--- a/src/nvimage/nvtt/squish/config.h
+++ b/src/nvimage/nvtt/squish/config.h
@ -0,0 +1,55 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_CONFIG_H
+#define SQUISH_CONFIG_H
+
+// Set to 1 when building squish to use altivec instructions.
+// When not set by the build, this is 1 if the compiler defines __VEC__ and 0
+// otherwise. The value is resolved to a literal here because a macro that
+// expands to "defined(...)" inside #if has undefined behaviour.
+#ifndef SQUISH_USE_ALTIVEC
+#	if defined(__VEC__)
+#		define SQUISH_USE_ALTIVEC 1
+#	else
+#		define SQUISH_USE_ALTIVEC 0
+#	endif
+#endif
+
+// Set to 1 when building squish to use sse instructions.
+// When not set by the build, this is 2 if __SSE2__ is defined, 1 if only
+// __SSE__ is defined, and 0 otherwise.
+#ifndef SQUISH_USE_SSE
+#	if defined(__SSE2__)
+#		define SQUISH_USE_SSE 2
+#	elif defined(__SSE__)
+#		define SQUISH_USE_SSE 1
+#	else
+#		define SQUISH_USE_SSE 0
+#	endif
+#endif
+
+// Internally set SQUISH_USE_SIMD when either altivec or sse is available.
+#if SQUISH_USE_ALTIVEC && SQUISH_USE_SSE
+#	error "Cannot enable both altivec and sse!"
+#endif
+#if SQUISH_USE_ALTIVEC || SQUISH_USE_SSE
+#	define SQUISH_USE_SIMD 1
+#else
+#	define SQUISH_USE_SIMD 0
+#endif
+
+#endif // ndef SQUISH_CONFIG_H
--- a/src/nvimage/nvtt/squish/extra/squishgen.cpp
+++ b/src/nvimage/nvtt/squish/extra/squishgen.cpp
@ -0,0 +1,158 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include <iostream>
+
+// One lookup entry: the pair of quantised endpoint values chosen for a target
+// value, and how far the value they produce is from that target.
+struct SourceBlock
+{
+	int start;	// first endpoint value (value1 in GenerateData)
+	int end;	// second endpoint value (value2 in GenerateData)
+	int error;	// 0 for an exact hit, otherwise distance to the nearest hit
+};
+
+// Lookup entries for one 8-bit target value, one per codebook index.
+struct TargetValue
+{
+	SourceBlock sources[4];
+};
+
+// Expands a 'bits'-wide endpoint value to 8 bits by shifting it up and
+// filling the low bits with its own high bits.
+static int ExpandTo8Bits( int value, int bits )
+{
+	return ( value << ( 8 - bits ) ) | ( value >> ( 2*bits - 8 ) );
+}
+
+// Copies the neighbour's endpoints into 'entry' when doing so yields a
+// smaller error than the entry currently has. Returns true on change.
+static bool AdoptIfCloser( SourceBlock& entry, SourceBlock const& neighbour )
+{
+	if( entry.error <= neighbour.error + 1 )
+		return false;
+
+	entry.start = neighbour.start;
+	entry.end = neighbour.end;
+	entry.error = neighbour.error + 1;
+	return true;
+}
+
+// Prints a SingleColourLookup table called 'name' to standard output.
+//
+// For every 8-bit target value and every codebook index below 'colours' the
+// table holds a pair of 'bits'-wide endpoints whose codebook entry at that
+// index is the target, or failing that the nearest value that can be hit,
+// together with the resulting error. Unused indices are printed as zeros.
+static void GenerateData( std::string const& name, int bits, int colours )
+{
+	TargetValue table[256];
+
+	// mark every slot in use as not yet reachable
+	for( int t = 0; t < 256; ++t )
+	{
+		for( int c = 0; c < colours; ++c )
+			table[t].sources[c].error = 255;
+	}
+
+	// visit every pair of quantised endpoints
+	int const endpointCount = ( 1 << bits );
+	for( int first = 0; first < endpointCount; ++first )
+	{
+		int const a = ExpandTo8Bits( first, bits );
+		for( int second = 0; second < endpointCount; ++second )
+		{
+			int const b = ExpandTo8Bits( second, bits );
+
+			// build the codebook these endpoints produce
+			int palette[4] = { a, b, 0, 0 };
+			if( colours == 3 )
+			{
+				palette[2] = ( a + b )/2;
+			}
+			else
+			{
+				palette[2] = ( 2*a + b )/3;
+				palette[3] = ( a + 2*b )/3;
+			}
+
+			// the first pair to hit a value exactly claims that slot
+			for( int c = 0; c < colours; ++c )
+			{
+				SourceBlock& slot = table[palette[c]].sources[c];
+				if( slot.error == 0 )
+					continue;
+
+				slot.start = first;
+				slot.end = second;
+				slot.error = 0;
+			}
+		}
+	}
+
+	// spread the exact hits to neighbouring targets until nothing changes
+	bool changed;
+	do
+	{
+		changed = false;
+		for( int c = 0; c < colours; ++c )
+		{
+			for( int t = 0; t < 256; ++t )
+			{
+				SourceBlock& entry = table[t].sources[c];
+
+				// try the target above first, then the one below
+				if( t != 255 && AdoptIfCloser( entry, table[t + 1].sources[c] ) )
+					changed = true;
+				if( t != 0 && AdoptIfCloser( entry, table[t - 1].sources[c] ) )
+					changed = true;
+			}
+		}
+	}
+	while( changed );
+
+	// emit the table as source code
+	std::cout << "\nstatic SingleColourLookup const " << name << "[] = \n{\n"; 
+	for( int t = 0; t < 256; ++t )
+	{
+		if( t != 0 )
+			std::cout << ",\n";
+
+		std::cout << "\t{ { ";
+		for( int c = 0; c < 4; ++c )
+		{
+			if( c != 0 )
+				std::cout << ", ";
+
+			if( c < colours )
+			{
+				SourceBlock const& slot = table[t].sources[c];
+				std::cout << "{ " << slot.start << ", " << slot.end << ", " << slot.error << " }";
+			}
+			else
+			{
+				std::cout << "{ 0, 0, 0 }";
+			}
+		}
+		std::cout << " } }";
+	}
+	std::cout << "\n};\n";
+}
+
+// Prints the four single-colour lookup tables to standard output.
+int main()
+{
+	struct Table
+	{
+		char const* name;
+		int bits;
+		int colours;
+	};
+
+	static Table const tables[] =
+	{
+		{ "lookup_5_3", 5, 3 },
+		{ "lookup_6_3", 6, 3 },
+		{ "lookup_5_4", 5, 4 },
+		{ "lookup_6_4", 6, 4 }
+	};
+
+	for( unsigned int i = 0; i < sizeof( tables )/sizeof( tables[0] ); ++i )
+		GenerateData( tables[i].name, tables[i].bits, tables[i].colours );
+
+	return 0;
+}
--- a/src/nvimage/nvtt/squish/extra/squishpng.cpp
+++ b/src/nvimage/nvtt/squish/extra/squishpng.cpp
@ -0,0 +1,603 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+/*! @file
+
+	@brief	Example program that converts between the PNG and DXT formats.
+	
+	This program requires libpng for PNG input and output, and is designed
+	to show how to prepare data for the squish library when it is not simply
+	a contiguous block of memory.
+*/
+
+#include <algorithm>
+#include <climits>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+#include <exception>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <squish.h>
+#include <png.h>
+
+#ifdef _MSC_VER
+#pragma warning( disable: 4511 4512 )
+#endif // def _MSC_VER
+
+using namespace squish;
+
+//! Simple exception class that carries a message string.
+class Error : public std::exception
+{
+public:
+	//! Stores a copy of the message to report.
+	Error( std::string const& excuse )
+	  : m_excuse( excuse )
+	{
+	}
+
+	~Error() throw()
+	{
+	}
+
+	//! Returns the stored message as a C string owned by this object.
+	virtual char const* what() const throw()
+	{
+		return m_excuse.c_str();
+	}
+
+private:
+	std::string m_excuse;
+};
+
+//! Base class to make derived classes non-copyable
+class NonCopyable
+{
+	// copy operations are declared private and never defined, so any
+	// attempt to copy a derived object fails to build
+	NonCopyable( NonCopyable const& );
+	NonCopyable& operator=( NonCopyable const& );
+
+public:
+	NonCopyable()
+	{
+	}
+};
+
+//! Memory object.
+//! Allocates an array of 'size' bytes and releases it on destruction.
+class Mem : NonCopyable
+{
+public:
+	explicit Mem( int size )
+	  : m_p( new u8[size] )
+	{
+	}
+
+	~Mem()
+	{
+		delete[] m_p;
+	}
+
+	//! Returns the start of the owned array.
+	u8* Get() const
+	{
+		return m_p;
+	}
+
+private:
+	u8* m_p;
+};
+
+//! File object.
+//! Takes ownership of a FILE* (which may be null) and closes it on destruction.
+class File : NonCopyable
+{
+public:
+	explicit File( FILE* fp )
+	  : m_fp( fp )
+	{
+	}
+
+	~File()
+	{
+		if( IsValid() )
+			fclose( m_fp );
+	}
+
+	//! True when the wrapped pointer is non-null.
+	bool IsValid() const
+	{
+		return m_fp != 0;
+	}
+
+	//! Returns the wrapped pointer without giving up ownership.
+	FILE* Get() const
+	{
+		return m_fp;
+	}
+
+private:
+	FILE* m_fp;
+};
+
+//! PNG read object.
+//! Creates a libpng read struct with two info structs and destroys all
+//! three on destruction.
+class PngReadStruct : NonCopyable
+{
+public:
+	PngReadStruct()
+	  : m_png( png_create_read_struct( PNG_LIBPNG_VER_STRING, 0, 0, 0 ) ),
+		m_info( 0 ),
+		m_end( 0 )
+	{
+		if( m_png == 0 )
+			throw Error( "failed to create png read struct" );
+
+		m_info = png_create_info_struct( m_png );
+		m_end = png_create_info_struct( m_png );
+		if( m_info != 0 && m_end != 0 )
+			return;
+
+		// the destructor will not run, so release whatever was created here
+		png_destroy_read_struct( &m_png, m_info ? &m_info : 0, m_end ? &m_end : 0 );
+		throw Error( "failed to create png info structs" );
+	}
+
+	~PngReadStruct()
+	{
+		png_destroy_read_struct( &m_png, &m_info, &m_end );
+	}
+
+	//! The libpng read struct.
+	png_structp GetPng() const
+	{
+		return m_png;
+	}
+
+	//! The first info struct (the end info struct is not exposed).
+	png_infop GetInfo() const
+	{
+		return m_info;
+	}
+
+private:
+	png_structp m_png;
+	png_infop m_info, m_end;
+};
+
+//! PNG write object.
+//! Creates a libpng write struct with one info struct and destroys both on
+//! destruction. The constructor throws Error if either cannot be created.
+class PngWriteStruct : NonCopyable
+{
+public:
+	PngWriteStruct()
+	  : m_png( 0 ), 
+		m_info( 0 )
+	{
+		m_png = png_create_write_struct( PNG_LIBPNG_VER_STRING, 0, 0, 0 );
+		if( !m_png )
+			throw Error( "failed to create png write struct" );	
+			
+		m_info = png_create_info_struct( m_png );
+		if( !m_info )
+		{
+			// there is no info struct to release on this path, only the
+			// write struct; the destructor will not run after a throw
+			png_destroy_write_struct( &m_png, 0 );
+			throw Error( "failed to create png info structs" );
+		}
+	}
+	
+	~PngWriteStruct()
+	{
+		png_destroy_write_struct( &m_png, &m_info );
+	}
+	
+	//! The libpng write struct.
+	png_structp GetPng() const { return m_png; }
+	//! The info struct associated with the write struct.
+	png_infop GetInfo() const { return m_info; }
+
+private:
+	png_structp m_png;
+	png_infop m_info;
+};
+
+//! PNG rows object.
+//! Owns a table of 'height' row pointers, each pointing at a separately
+//! allocated row of width*stride bytes. The constructor throws Error when
+//! an allocation fails instead of leaving null pointers in the table.
+class PngRows : NonCopyable
+{
+public:
+	PngRows( int width, int height, int stride ) : m_width( width ), m_height( height )
+	{
+		// a null result is only a failure when something was requested
+		m_rows = ( png_bytep* )malloc( m_height*sizeof( png_bytep ) );
+		if( !m_rows && m_height > 0 )
+			throw Error( "failed to allocate png row table" );
+
+		for( int i = 0; i < m_height; ++i )
+		{
+			m_rows[i] = ( png_bytep )malloc( m_width*stride );
+			if( !m_rows[i] && m_width*stride > 0 )
+			{
+				// the destructor does not run when the constructor throws,
+				// so release the rows allocated so far by hand
+				while( i-- > 0 )
+					free( m_rows[i] );
+				free( m_rows );
+				throw Error( "failed to allocate png row" );
+			}
+		}
+	}
+	
+	~PngRows() 
+	{
+		for( int i = 0; i < m_height; ++i )
+			free( m_rows[i] );
+		free( m_rows );
+	}
+	
+	//! Returns the row pointer table.
+	png_bytep* Get() const { return m_rows; }
+	
+private:
+	png_bytep* m_rows;
+	int m_width, m_height;
+};
+
+//! An 8-bit PNG image loaded fully into memory by the constructor.
+class PngImage
+{
+public:
+	//! Loads the named file; throws Error if it cannot be read as an 8-bit PNG.
+	explicit PngImage( std::string const& fileName );
+
+	//! Image width in pixels.
+	int GetWidth() const { return m_width; }
+	//! Image height in pixels.
+	int GetHeight() const { return m_height; }
+	//! Bytes per pixel within a row: 1 or 3 colour bytes plus 1 if alpha is present.
+	int GetStride() const { return m_stride; }
+	//! True when the PNG colour type has the colour bit set.
+	bool IsColour() const { return m_colour; }
+	//! True when the PNG colour type has the alpha bit set.
+	bool IsAlpha() const { return m_alpha; }
+	
+	//! Returns the pixel data of the given row; no bounds check is made.
+	u8 const* GetRow( int row ) const { return ( u8* )m_rows[row]; }
+
+private:
+	PngReadStruct m_png;
+
+	int m_width;
+	int m_height;
+	int m_stride;
+	bool m_colour;
+	bool m_alpha;
+	
+	// row table obtained from png_get_rows on m_png
+	png_bytep* m_rows;
+};
+
+//! Opens the named file, verifies the PNG signature, reads the whole image
+//! and records its dimensions, channel layout and row table.
+//!
+//! Throws Error if the file cannot be opened, is shorter than or does not
+//! match the 8-byte PNG signature, is not 8 bits per channel, or if libpng
+//! returns no rows.
+PngImage::PngImage( std::string const& fileName )
+{
+	// open the source file
+	File file( fopen( fileName.c_str(), "rb" ) );
+	if( !file.IsValid() )
+	{
+		std::ostringstream oss;
+		oss << "failed to open \"" << fileName << "\" for reading";
+		throw Error( oss.str() );
+	}
+	
+	// check the signature bytes; a short read would otherwise leave part of
+	// the buffer uninitialised before it is compared
+	png_byte header[8];
+	if( fread( header, 1, 8, file.Get() ) != 8 || png_sig_cmp( header, 0, 8 ) )
+	{
+		std::ostringstream oss;
+		oss << "\"" << fileName << "\" does not look like a png file";
+		throw Error( oss.str() );
+	}
+	
+	// read the image into memory, telling libpng the signature was consumed
+	png_init_io( m_png.GetPng(), file.Get() );
+	png_set_sig_bytes( m_png.GetPng(), 8 );
+	png_read_png( m_png.GetPng(), m_png.GetInfo(), PNG_TRANSFORM_EXPAND, 0 );
+
+	// get the image info
+	png_uint_32 width;
+	png_uint_32 height;
+	int bitDepth;
+	int colourType;
+	png_get_IHDR( m_png.GetPng(), m_png.GetInfo(), &width, &height, &bitDepth, &colourType, 0, 0, 0 );
+	
+	// check the image is 8 bit
+	if( bitDepth != 8 )
+	{
+		std::ostringstream oss;
+		oss << "cannot process " << bitDepth << "-bit image (bit depth must be 8)";
+		throw Error( oss.str() );
+	}
+	
+	// save the info
+	m_width = width;
+	m_height = height;
+	m_colour = ( ( colourType & PNG_COLOR_MASK_COLOR ) != 0 );
+	m_alpha = ( ( colourType & PNG_COLOR_MASK_ALPHA ) != 0 );
+	m_stride = ( m_colour ? 3 : 1 ) + ( m_alpha ? 1 : 0 );
+
+	// get the image rows
+	m_rows = png_get_rows( m_png.GetPng(), m_png.GetInfo() );
+	if( !m_rows )
+		throw Error( "failed to get image rows" );
+}
+
+//! Compresses the PNG at sourceFileName into raw DXT blocks written to
+//! targetFileName.
+//!
+//! The output is the width and the height as native ints followed by the
+//! block data, 8 bytes per block when kDxt1 is set in flags and 16 otherwise.
+//! Throws Error if the image dimensions are not multiples of 4, or if the
+//! target file cannot be opened or written.
+static void Compress( std::string const& sourceFileName, std::string const& targetFileName, int flags )
+{
+	// load the source image
+	PngImage sourceImage( sourceFileName );
+
+	// get the image info
+	int width = sourceImage.GetWidth();
+	int height = sourceImage.GetHeight();
+	int stride = sourceImage.GetStride();
+	bool colour = sourceImage.IsColour();
+	bool alpha = sourceImage.IsAlpha();
+
+	// check the image dimensions
+	if( ( width % 4 ) != 0 || ( height % 4 ) != 0 )
+	{
+		std::ostringstream oss;
+		oss << "cannot compress " << width << "x" << height
+			<< " image (dimensions must be multiples of 4)";
+		throw Error( oss.str() );
+	}
+	
+	// create the target data
+	int bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+	int targetDataSize = bytesPerBlock*width*height/16;
+	Mem targetData( targetDataSize );
+	
+	// loop over blocks and compress them
+	clock_t start = std::clock();
+	u8* targetBlock = targetData.Get();
+	for( int y = 0; y < height; y += 4 )
+	{
+		// process a row of blocks
+		for( int x = 0; x < width; x += 4 )
+		{
+			// gather the 4x4 block into a contiguous RGBA buffer
+			u8 sourceRgba[16*4];
+			for( int py = 0, i = 0; py < 4; ++py )
+			{
+				u8 const* row = sourceImage.GetRow( y + py ) + x*stride;
+				for( int px = 0; px < 4; ++px, ++i )
+				{
+					// get the pixel colour, replicating grey into all channels
+					if( colour )
+					{
+						for( int j = 0; j < 3; ++j )
+							sourceRgba[4*i + j] = *row++;
+					}
+					else
+					{
+						for( int j = 0; j < 3; ++j )
+							sourceRgba[4*i + j] = *row;
+						++row;
+					}
+					
+					// take alpha from the image, or use opaque when it has none
+					if( alpha )
+						sourceRgba[4*i + 3] = *row++;
+					else
+						sourceRgba[4*i + 3] = 255;
+				}
+			}
+			
+			// compress this block
+			Compress( sourceRgba, targetBlock, flags );
+			
+			// advance
+			targetBlock += bytesPerBlock;			
+		}
+	}
+	clock_t end = std::clock();
+	double duration = ( double )( end - start ) / CLOCKS_PER_SEC;
+	std::cout << "time taken: " << duration << " seconds" << std::endl;
+	
+	// open the target file
+	File targetFile( fopen( targetFileName.c_str(), "wb" ) );
+	if( !targetFile.IsValid() )
+	{
+		std::ostringstream oss;
+		oss << "failed to open \"" << targetFileName << "\" for writing";
+		throw Error( oss.str() );
+	}
+	
+	// write the header and then the data, reporting short writes
+	if( fwrite( &width, sizeof( int ), 1, targetFile.Get() ) != 1
+		|| fwrite( &height, sizeof( int ), 1, targetFile.Get() ) != 1
+		|| fwrite( targetData.Get(), 1, targetDataSize, targetFile.Get() ) != ( size_t )targetDataSize )
+	{
+		std::ostringstream oss;
+		oss << "failed to write \"" << targetFileName << "\"";
+		throw Error( oss.str() );
+	}
+}
+
+//! Decompresses the raw DXT file at sourceFileName into an RGBA PNG written
+//! to targetFileName.
+//!
+//! The source is expected to hold the width and the height as native ints
+//! followed by the block data, 8 bytes per block when kDxt1 is set in flags
+//! and 16 otherwise. The header is treated as untrusted: Throws Error if it
+//! is missing, if the dimensions are not positive multiples of 4 or are too
+//! large, if the block data is truncated, or if a file cannot be opened.
+static void Decompress( std::string const& sourceFileName, std::string const& targetFileName, int flags )
+{
+	// open the source file
+	File sourceFile( fopen( sourceFileName.c_str(), "rb" ) );
+	if( !sourceFile.IsValid() )
+	{
+		std::ostringstream oss;
+		oss << "failed to open \"" << sourceFileName << "\" for reading";
+		throw Error( oss.str() );
+	}
+	
+	// get the width and height
+	int width, height;
+	if( fread( &width, sizeof( int ), 1, sourceFile.Get() ) != 1
+		|| fread( &height, sizeof( int ), 1, sourceFile.Get() ) != 1 )
+	{
+		std::ostringstream oss;
+		oss << "failed to read the header of \"" << sourceFileName << "\"";
+		throw Error( oss.str() );
+	}
+	
+	// the block loops below write 4 rows and 4 columns at a time, so any
+	// other dimensions would run past the end of the target rows
+	if( width <= 0 || height <= 0 || ( width % 4 ) != 0 || ( height % 4 ) != 0 )
+	{
+		std::ostringstream oss;
+		oss << "cannot decompress " << width << "x" << height
+			<< " image (dimensions must be positive multiples of 4)";
+		throw Error( oss.str() );
+	}
+	
+	// each block decompresses to 64 bytes; make sure neither the compressed
+	// nor the decompressed sizes can overflow an int
+	int blocksWide = width/4;
+	int blocksHigh = height/4;
+	if( blocksWide > INT_MAX/64/blocksHigh )
+	{
+		std::ostringstream oss;
+		oss << "cannot decompress " << width << "x" << height
+			<< " image (image is too large)";
+		throw Error( oss.str() );
+	}
+	
+	// work out the data size
+	int bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+	int sourceDataSize = bytesPerBlock*blocksWide*blocksHigh;
+	Mem sourceData( sourceDataSize );
+	
+	// read the source data
+	if( fread( sourceData.Get(), 1, sourceDataSize, sourceFile.Get() ) != ( size_t )sourceDataSize )
+	{
+		std::ostringstream oss;
+		oss << "\"" << sourceFileName << "\" is truncated";
+		throw Error( oss.str() );
+	}
+		
+	// create the target rows
+	PngRows targetRows( width, height, 4 );
+	
+	// loop over blocks and decompress them
+	u8 const* sourceBlock = sourceData.Get();
+	for( int y = 0; y < height; y += 4 )
+	{
+		// process a row of blocks
+		for( int x = 0; x < width; x += 4 )
+		{
+			// decompress back
+			u8 targetRgba[16*4];
+			Decompress( targetRgba, sourceBlock, flags );
+			
+			// write the data into the target rows
+			for( int py = 0, i = 0; py < 4; ++py )
+			{
+				u8* row = ( u8* )targetRows.Get()[y + py] + x*4;
+				for( int px = 0; px < 4; ++px, ++i )
+				{	
+					for( int j = 0; j < 4; ++j )
+						*row++ = targetRgba[4*i + j];
+				}
+			}
+			
+			// advance
+			sourceBlock += bytesPerBlock;
+		}
+	}
+	
+	// create the target PNG
+	PngWriteStruct targetPng;
+
+	// set up the image
+	png_set_IHDR(
+		targetPng.GetPng(), targetPng.GetInfo(), width, height,
+		8, PNG_COLOR_TYPE_RGBA, PNG_INTERLACE_NONE,
+		PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT 
+	);
+	   
+	// open the target file
+	File targetFile( fopen( targetFileName.c_str(), "wb" ) );
+	if( !targetFile.IsValid() )
+	{
+		std::ostringstream oss;
+		oss << "failed to open \"" << targetFileName << "\" for writing";
+		throw Error( oss.str() );
+	}
+	
+	// write the image
+	png_set_rows( targetPng.GetPng(), targetPng.GetInfo(), targetRows.Get() );
+	png_init_io( targetPng.GetPng(), targetFile.Get() );
+	png_write_png( targetPng.GetPng(), targetPng.GetInfo(), PNG_TRANSFORM_IDENTITY, 0 );
+}
+
+//! Loads two PNG images of equal dimensions and prints their rms error.
+//!
+//! Only the channels both images have (the smaller of the two strides) are
+//! compared, and the squared error is averaged over the pixel count. Throws
+//! Error when the dimensions differ.
+static void Diff( std::string const& sourceFileName, std::string const& targetFileName )
+{
+	// load the images
+	PngImage sourceImage( sourceFileName );
+	PngImage targetImage( targetFileName );
+
+	// get the image info
+	int const width = sourceImage.GetWidth();
+	int const height = sourceImage.GetHeight();
+	int const sourceStride = sourceImage.GetStride();
+	int const targetStride = targetImage.GetStride();
+	int const channels = std::min( sourceStride, targetStride );
+
+	// check they match
+	bool const sameSize = ( width == targetImage.GetWidth() )
+		&& ( height == targetImage.GetHeight() );
+	if( !sameSize )
+		throw Error( "source and target dimensions do not match" );
+
+	// accumulate the squared differences a row at a time
+	double sum = 0.0;
+	for( int y = 0; y < height; ++y )
+	{
+		u8 const* sourcePixel = sourceImage.GetRow( y );
+		u8 const* targetPixel = targetImage.GetRow( y );
+		for( int x = 0; x < width; ++x )
+		{
+			for( int c = 0; c < channels; ++c )
+			{
+				int const delta = ( int )sourcePixel[c] - ( int )targetPixel[c];
+				sum += ( double )( delta*delta );
+			}
+
+			// step each image by its own pixel size
+			sourcePixel += sourceStride;
+			targetPixel += targetStride;
+		}
+	}
+	double const rms = std::sqrt( sum / ( width*height ) );
+
+	// print it out
+	std::cout << "rms error: " << rms << std::endl;
+}
+
+//! Operation selected on the command line.
+enum Mode
+{
+	kCompress, 		//!< -c: compress a png to raw dxt
+	kDecompress,	//!< -d: decompress raw dxt to a png
+	kDiff			//!< -e: print the rms error between two pngs
+};
+
+//! Entry point: parses the command line and runs the selected operation.
+//!
+//! Option letters may be combined in one word (for example -d3). A '-' inside
+//! an option word stops option parsing for the words that follow. The first
+//! two non-option words are the source and target file names; any further
+//! ones are reported and ignored. Returns 0 on success or after printing
+//! help, and -1 on a usage error or when an exception is caught.
+int main( int argc, char* argv[] )
+{
+	try
+	{
+		// parse the command-line
+		std::string sourceFileName;
+		std::string targetFileName;
+		Mode mode = kCompress;
+		int method = kDxt1;
+		int metric = kColourMetricPerceptual;
+		int fit = kColourClusterFit;
+		int extra = 0;
+		bool help = false;
+		bool arguments = true;
+		for( int i = 1; i < argc; ++i )
+		{
+			// check for options
+			char const* word = argv[i];
+			if( arguments && word[0] == '-' )
+			{
+				for( int j = 1; word[j] != '\0'; ++j )
+				{
+					switch( word[j] )
+					{
+					case 'h': help = true; break;
+					case 'c': mode = kCompress; break;
+					case 'd': mode = kDecompress; break;
+					case 'e': mode = kDiff; break;
+					case '1': method = kDxt1; break;
+					case '3': method = kDxt3; break;
+					case '5': method = kDxt5; break;
+					case 'u': metric = kColourMetricUniform; break;
+					case 'r': fit = kColourRangeFit; break;
+					case 'w': extra = kWeightColourByAlpha; break;
+					case '-': arguments = false; break;
+					default:
+						std::cerr << "unknown option '" << word[j] << "'" << std::endl;
+						return -1;
+					}
+				}
+			}
+			else
+			{
+				if( sourceFileName.empty() )
+					sourceFileName.assign( word );
+				else if( targetFileName.empty() )
+					targetFileName.assign( word );
+				else
+				{
+					std::cerr << "unexpected argument \"" << word << "\"" << std::endl;
+				}
+			}
+		}
+		
+		// check arguments
+		if( help )
+		{
+			// the synopsis lists every option letter the parser accepts
+			std::cout 
+				<< "SYNTAX" << std::endl
+				<< "\tsquishpng [-hcde135urw] <source> <target>" << std::endl 
+				<< "OPTIONS" << std::endl
+				<< "\t-h\tShow this help text" << std::endl
+				<< "\t-c\tCompress source png to target raw dxt (default)" << std::endl
+				<< "\t-135\tSpecifies whether to use DXT1 (default), DXT3 or DXT5 compression" << std::endl
+				<< "\t-u\tUse a uniform colour metric during colour compression" << std::endl
+				<< "\t-r\tUse the fast but inferior range-based colour compressor" << std::endl
+				<< "\t-w\tWeight colour values by alpha in the cluster colour compressor" << std::endl
+				<< "\t-d\tDecompress source raw dxt to target png" << std::endl
+				<< "\t-e\tDiff source and target png" << std::endl
+				;
+			
+			return 0;
+		}
+		if( sourceFileName.empty() )
+		{
+			std::cerr << "no source file given" << std::endl;
+			return -1;
+		}
+		if( targetFileName.empty() )
+		{
+			std::cerr << "no target file given" << std::endl;
+			return -1;
+		}
+
+		// do the work
+		switch( mode )
+		{
+		case kCompress:
+			Compress( sourceFileName, targetFileName, method | metric | fit | extra );
+			break;
+		
+		case kDecompress:
+			Decompress( sourceFileName, targetFileName, method );
+			break;
+			
+		case kDiff:
+			Diff( sourceFileName, targetFileName );
+			break;
+			
+		default:
+			std::cerr << "unknown mode" << std::endl;
+			throw std::exception();
+		}
+	}
+	catch( std::exception& excuse )
+	{
+		// complain
+		std::cerr << "squishpng error: " << excuse.what() << std::endl;
+		return -1;
+	}
+	
+	// done
+	return 0;
+}
--- a/src/nvimage/nvtt/squish/extra/squishtest.cpp
+++ b/src/nvimage/nvtt/squish/extra/squishtest.cpp
@ -0,0 +1,205 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+/*! @file
+
+	@brief	This program tests the error for 1 and 2-colour DXT compression.
+	
+	This tests the effectiveness of the DXT compression algorithm for all
+	possible 1 and 2-colour blocks of pixels.
+*/
+
+#include <squish.h>
+#include <iostream>
+#include <cmath>
+#include <cfloat>
+
+using namespace squish;
+
+// Returns the mean over 16 pixels of the summed squared differences of the
+// first three channels of each pixel; the fourth byte of each pixel is
+// ignored. Both buffers hold 4 bytes per pixel.
+double GetColourError( u8 const* a, u8 const* b )
+{
+	double sum = 0.0;
+	for( int pixel = 0; pixel < 16; ++pixel )
+	{
+		u8 const* pa = a + 4*pixel;
+		u8 const* pb = b + 4*pixel;
+		for( int channel = 0; channel < 3; ++channel )
+		{
+			int const delta = ( int )pa[channel] - ( int )pb[channel];
+			sum += ( double )( delta*delta );
+		}
+	}
+	return sum / 16.0;
+}
+
+// Compresses and decompresses solid blocks for every value 0..255 of each
+// colour channel in turn (the other channels are zero and alpha is 255) and
+// prints the minimum, maximum and overall rms error.
+void TestOneColour( int flags )
+{
+	u8 input[4*16];
+	u8 output[4*16];
+	u8 block[16];
+	
+	double avg = 0.0, min = DBL_MAX, max = -DBL_MAX;
+	int counter = 0;
+	
+	// start from opaque black
+	for( int i = 0; i < 16*4; ++i )
+		input[i] = ( ( i % 4 ) == 3 ) ? 255 : 0;
+
+	// test all single-channel colours
+	for( int channel = 0; channel < 3; ++channel )
+	{
+		// the bound is 256 so that the brightest value, 255, is tested too
+		for( int value = 0; value < 256; ++value )
+		{
+			// set the channel value
+			for( int i = 0; i < 16; ++i )
+				input[4*i + channel] = ( u8 )value;
+			
+			// compress and decompress
+			Compress( input, block, flags );
+			Decompress( output, block, flags );
+			
+			// test the results
+			double rm = GetColourError( input, output );
+			double rms = std::sqrt( rm );
+			
+			// accumulate stats
+			min = std::min( min, rms );
+			max = std::max( max, rms );
+			avg += rm;
+			++counter;
+		}
+		
+		// reset the channel value
+		for( int i = 0; i < 16; ++i )
+			input[4*i + channel] = 0;
+	}
+	
+	// finish stats
+	avg = std::sqrt( avg/counter );
+	
+	// show stats
+	std::cout << "one colour error (min, max, avg): " 
+		<< min << ", " << max << ", " << avg << std::endl;
+}
+
+// Compresses and decompresses 1000 solid blocks whose colour channels are
+// drawn from rand() (alpha is 255) and prints the minimum, maximum and
+// overall rms error.
+void TestOneColourRandom( int flags )
+{
+	u8 input[4*16];
+	u8 output[4*16];
+	u8 block[16];
+
+	double min = DBL_MAX;
+	double max = -DBL_MAX;
+	double avg = 0.0;
+	int counter = 0;
+
+	for( int test = 0; test < 1000; ++test )
+	{
+		// pick one random value per channel and fill the block with it
+		for( int channel = 0; channel < 3; ++channel )
+		{
+			u8 const value = ( u8 )( rand() & 0xff );
+			for( int pixel = 0; pixel < 16; ++pixel )
+				input[4*pixel + channel] = value;
+		}
+
+		// make every pixel opaque
+		for( int pixel = 0; pixel < 16; ++pixel )
+			input[4*pixel + 3] = 255;
+
+		// round-trip the block
+		Compress( input, block, flags );
+		Decompress( output, block, flags );
+
+		// measure the error
+		double const rm = GetColourError( input, output );
+		double const rms = std::sqrt( rm );
+
+		// accumulate stats
+		min = std::min( min, rms );
+		max = std::max( max, rms );
+		avg += rm;
+		++counter;
+	}
+
+	// finish stats
+	avg = std::sqrt( avg/counter );
+
+	// show stats
+	std::cout << "random one colour error (min, max, avg): " 
+		<< min << ", " << max << ", " << avg << std::endl;
+}
+
+// Compresses and decompresses blocks made of two values of a single colour
+// channel (the first 8 pixels hold one value and the last 8 the other) for
+// every pair value1 < value2 in 0..255 and every channel, then prints the
+// minimum, maximum and overall rms error.
+void TestTwoColour( int flags )
+{
+	u8 input[4*16];
+	u8 output[4*16];
+	u8 block[16];
+	
+	double avg = 0.0, min = DBL_MAX, max = -DBL_MAX;
+	int counter = 0;
+	
+	// start from opaque black
+	for( int i = 0; i < 16*4; ++i )
+		input[i] = ( ( i % 4 ) == 3 ) ? 255 : 0;
+
+	// test all pairs of single-channel colours
+	for( int channel = 0; channel < 3; ++channel )
+	{
+		// the bounds are 256 so that pairs involving 255 are tested too
+		for( int value1 = 0; value1 < 256; ++value1 )
+		{
+			for( int value2 = value1 + 1; value2 < 256; ++value2 )
+			{
+				// set the channel value
+				for( int i = 0; i < 16; ++i )
+					input[4*i + channel] = ( u8 )( ( i < 8 ) ? value1 : value2 );
+				
+				// compress and decompress
+				Compress( input, block, flags );
+				Decompress( output, block, flags );
+				
+				// test the results
+				double rm = GetColourError( input, output );
+				double rms = std::sqrt( rm );
+				
+				// accumulate stats
+				min = std::min( min, rms );
+				max = std::max( max, rms );
+				avg += rm;
+				++counter;
+			}
+		}
+				
+		// reset the channel value
+		for( int i = 0; i < 16; ++i )
+			input[4*i + channel] = 0;
+	}
+	
+	// finish stats
+	avg = std::sqrt( avg/counter );
+	
+	// show stats
+	std::cout << "two colour error (min, max, avg): " 
+		<< min << ", " << max << ", " << avg << std::endl;
+}
+
+// Runs the three error tests in turn; each prints its own statistics.
+int main()
+{
+	int const rangeFitFlags = kDxt1 | kColourRangeFit;
+	int const defaultFlags = kDxt1;
+
+	TestOneColourRandom( rangeFitFlags );
+	TestOneColour( defaultFlags );
+	TestTwoColour( defaultFlags );
+	return 0;
+}
--- a/Show More
+++ b/Show More