diff --git a/CMakeLists.txt b/CMakeLists.txt
index ab4dcb6..5e4bab9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,7 +31,7 @@ MESSAGE(STATUS "  Processor: ${NV_SYSTEM_PROCESSOR}")
 MESSAGE(STATUS "  Compiler Flags: ${CMAKE_CXX_FLAGS}")
 
 IF(CMAKE_BUILD_TYPE MATCHES "debug")
-	SET(CMAKE_DEBUG_POSTFIX "_d" CACHE STRING "Postfix for debug build libraries.")
+    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE STRING "Postfix for debug build libraries.")
     ADD_DEFINITIONS(-D_DEBUG=1)
 ENDIF()
 
diff --git a/LICENSE b/LICENSE
index bdd7e67..c1bfebc 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 NVIDIA Texture Tools is licensed under the MIT license.
 
-Copyright (c) 2009-2016 Ignacio Castano
+Copyright (c) 2009-2017 Ignacio Castaño
 Copyright (c) 2007-2009 NVIDIA Corporation
 
 Permission is hereby granted, free of charge, to any person
diff --git a/README.md b/README.md
index 65b30f4..6b14411 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ manipulation tools, designed to be integrated in game tools and asset
 processing pipelines.
 
 The primary features of the library are mipmap and normal map generation, format 
-conversion and DXT compression.
+conversion, and DXT compression.
 
 
 ### How to build (Windows)
@@ -42,5 +42,5 @@ src/nvtt/tools/compress.cpp
 
 Detailed documentation of the API can be found at:
 
-http://code.google.com/p/nvidia-texture-tools/wiki/ApiDocumentation
+https://github.com/castano/nvidia-texture-tools/wiki/ApiDocumentation
 
diff --git a/data/witness/run.sh b/data/witness/run.sh
index 1ca6e77..21189da 100644
--- a/data/witness/run.sh
+++ b/data/witness/run.sh
@@ -36,4 +36,6 @@ do
 	#./nvcompress -silent -alpha -nomips -bc6 $file.$EXT $file.bc6.dds
 	#./nvimgdiff -alpha $file.$EXT $file.bc6.dds
 
+    # ETC2-EAC. NOTE(review): input/output arguments follow the bc6 example above - confirm the output name.
+    ./nvcompress -silent -alpha -nomips -etc_rgbm $file.$EXT $file.etc_rgbm.dds
 done
diff --git a/extern/poshlib/posh.h b/extern/poshlib/posh.h
index 5382294..716607d 100644
--- a/extern/poshlib/posh.h
+++ b/extern/poshlib/posh.h
@@ -349,9 +349,18 @@ LLVM:
 #  define POSH_OS_STRING "UNICOS"
 #endif
 
-#if ( defined __MWERKS__ && defined __powerc && !defined macintosh ) || defined __APPLE_CC__ || defined macosx
-#  define POSH_OS_OSX 1
-#  define POSH_OS_STRING "MacOS X"
+//ACS if we're in xcode, look at the target conditionals to figure out if this is ios or osx
+#if defined __APPLE__
+#  include "TargetConditionals.h"
+#endif
+#if TARGET_OS_IPHONE
+#    define POSH_OS_IOS 1
+#    define POSH_OS_STRING "iOS"
+#else
+#  if ( defined __MWERKS__ && defined __powerc && !defined macintosh ) || defined __APPLE_CC__ || defined macosx
+#    define POSH_OS_OSX 1
+#    define POSH_OS_STRING "MacOS X"
+#  endif
 #endif
 
 #if defined __sun__ || defined sun || defined __sun || defined __solaris__
diff --git a/extern/rg_etc1_v104/rg_etc1.cpp b/extern/rg_etc1_v104/rg_etc1.cpp
index d9b9331..d6d6ddf 100644
--- a/extern/rg_etc1_v104/rg_etc1.cpp
+++ b/extern/rg_etc1_v104/rg_etc1.cpp
@@ -1808,7 +1808,7 @@ typedef unsigned long uint64;
          {
             if (block_inten[0] > m_pSorted_luma[n - 1])
             {
-               const uint min_error = labs(int(block_inten[0] - m_pSorted_luma[n - 1]));
+               const uint min_error = abs(int(block_inten[0] - m_pSorted_luma[n - 1]));
                if (min_error >= trial_solution.m_error)
                   continue;
             }
@@ -1822,7 +1822,7 @@ typedef unsigned long uint64;
          {
             if (m_pSorted_luma[0] > block_inten[3])
             {
-               const uint min_error = labs(int(m_pSorted_luma[0] - block_inten[3]));
+               const uint min_error = abs(int(m_pSorted_luma[0] - block_inten[3]));
                if (min_error >= trial_solution.m_error)
                   continue;
             }
@@ -1914,7 +1914,7 @@ done:
                   for (uint packed_c = 0; packed_c < limit; packed_c++)
                   {
                      int v = etc1_decode_value(diff, inten, selector, packed_c);
-                     uint err = labs(v - static_cast<int>(color));
+                     uint err = abs(v - static_cast<int>(color));
                      if (err < best_error)
                      {
                         best_error = err;
diff --git a/src/bc6h/CMakeLists.txt b/src/bc6h/CMakeLists.txt
index 635e0f3..5c01c6c 100644
--- a/src/bc6h/CMakeLists.txt
+++ b/src/bc6h/CMakeLists.txt
@@ -14,6 +14,7 @@ SET(BC6H_SRCS
 	zohtwo.cpp)
 
 ADD_LIBRARY(bc6h STATIC ${BC6H_SRCS})
+TARGET_LINK_LIBRARIES(bc6h nvcore nvmath)
 
 IF(NOT WIN32)
     IF(CMAKE_COMPILER_IS_GNUCXX)
diff --git a/src/bc6h/zoh_utils.cpp b/src/bc6h/zoh_utils.cpp
index 166d1f3..328477c 100644
--- a/src/bc6h/zoh_utils.cpp
+++ b/src/bc6h/zoh_utils.cpp
@@ -37,7 +37,7 @@ int Utils::lerp(int a, int b, int i, int denom)
 	case 3:		denom *= 5; i *= 5;	// fall through to case 15
 	case 15:	weights = denom15_weights_64; break;
 	case 7:		weights = denom7_weights_64; break;
-	default:	nvDebugCheck(0);
+	default:	nvUnreachable();
 	}
 
 	return (a*weights[denom-i] +b*weights[i] + round) >> shift;
diff --git a/src/bc6h/zohone.cpp b/src/bc6h/zohone.cpp
index 43a302c..df49cc8 100644
--- a/src/bc6h/zohone.cpp
+++ b/src/bc6h/zohone.cpp
@@ -584,7 +584,7 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e
 {
     Vector3 pixels[Tile::TILE_TOTAL];
     float importance[Tile::TILE_TOTAL];
-    float err = 0;
+    //float err = 0;
 
     for (int region=0; region<NREGIONS_ONE; ++region)
     {
diff --git a/src/bc6h/zohtwo.cpp b/src/bc6h/zohtwo.cpp
index c585ed3..1183016 100644
--- a/src/bc6h/zohtwo.cpp
+++ b/src/bc6h/zohtwo.cpp
@@ -672,7 +672,7 @@ static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_e
 {
     Vector3 pixels[Tile::TILE_TOTAL];
     float importance[Tile::TILE_TOTAL];
-    float err = 0;
+    //float err = 0;
 
     for (int region=0; region<NREGIONS_TWO; ++region)
     {
diff --git a/src/bc7/CMakeLists.txt b/src/bc7/CMakeLists.txt
index ab952f0..99df434 100644
--- a/src/bc7/CMakeLists.txt
+++ b/src/bc7/CMakeLists.txt
@@ -22,6 +22,7 @@ SET(BC7_SRCS
 	avpcl_utils.h)
 
 ADD_LIBRARY(bc7 STATIC ${BC7_SRCS})
+TARGET_LINK_LIBRARIES(bc7 nvcore nvmath)
 
 TARGET_LINK_LIBRARIES(bc7 nvmath)
 
diff --git a/src/bc7/avpcl_mode0.cpp b/src/bc7/avpcl_mode0.cpp
index 443b7c8..be5b439 100644
--- a/src/bc7/avpcl_mode0.cpp
+++ b/src/bc7/avpcl_mode0.cpp
@@ -243,7 +243,7 @@ static void write_header(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex,
 
 static void read_header(Bits &in, IntEndptsRGB_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
 {
-	int mode = AVPCL::getmode(in);
+	//int mode = AVPCL::getmode(in);
 
 	pat_index = 0;
 	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
@@ -580,7 +580,7 @@ static float exhaustive(const Vector4 colors[], const float importance[], int np
 	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
 
 	// now there's no need to swap the ordering of A and B
-	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+	//bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
 
 	int amin, bmin;
 
diff --git a/src/nvcore/Array.h b/src/nvcore/Array.h
index f3310e8..cef9b2c 100644
--- a/src/nvcore/Array.h
+++ b/src/nvcore/Array.h
@@ -148,7 +148,7 @@ namespace nv
         NV_FORCEINLINE bool isDone(const PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); return i == this->m_size; }
         NV_FORCEINLINE void advance(PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); i++; }
 
-#if NV_CC_MSVC
+#if NV_NEED_PSEUDOINDEX_WRAPPER
         NV_FORCEINLINE T & operator[]( const PseudoIndexWrapper & i ) {
             return m_buffer[i(this)];
         }
diff --git a/src/nvcore/DefsGnucDarwin.h b/src/nvcore/DefsGnucDarwin.h
index 75dc027..b0a58db 100644
--- a/src/nvcore/DefsGnucDarwin.h
+++ b/src/nvcore/DefsGnucDarwin.h
@@ -27,7 +27,7 @@
 #define NV_FASTCALL		__attribute__((fastcall))
 #define NV_FORCEINLINE	__attribute__((always_inline)) inline
 #define NV_DEPRECATED   __attribute__((deprecated))
-#define NV_THREAD_LOCAL //ACS: there's no "__thread" or equivalent on iOS/OSX
+#define NV_THREAD_LOCAL __thread
 
 #if __GNUC__ > 2
 #define NV_PURE     __attribute__((pure))
diff --git a/src/nvcore/FileSystem.cpp b/src/nvcore/FileSystem.cpp
index bf64c28..4336f5d 100644
--- a/src/nvcore/FileSystem.cpp
+++ b/src/nvcore/FileSystem.cpp
@@ -31,11 +31,6 @@ bool FileSystem::exists(const char * path)
     // PathFileExists requires linking to shlwapi.lib
     //return PathFileExists(path) != 0;
     return GetFileAttributesA(path) != INVALID_FILE_ATTRIBUTES;
-#elif NV_OS_ORBIS
-    const int BUFFER_SIZE = 2048;
-    char file_fullpath[BUFFER_SIZE];
-    snprintf(file_fullpath, BUFFER_SIZE, "/app0/%s", path);
-    return sceFiosExistsSync(NULL, file_fullpath);
 #else
 	if (FILE * fp = fopen(path, "r"))
 	{
@@ -78,3 +73,31 @@ bool FileSystem::removeFile(const char * path)
     // @@ Use unlink or remove?
     return remove(path) == 0;
 }
+
+
+#include "StdStream.h" // for fileOpen
+
+bool FileSystem::copyFile(const char * src, const char * dst) {
+    // Copies the contents of src into dst. Returns false if either file cannot be opened or a read/write fails.
+    FILE * fsrc = fileOpen(src, "rb");
+    if (fsrc == NULL) return false;
+    NV_ON_RETURN(fclose(fsrc));
+
+    FILE * fdst = fileOpen(dst, "wb");
+    if (fdst == NULL) return false;
+    NV_ON_RETURN(fclose(fdst));
+    
+    char buffer[1024];
+    size_t n;
+
+    while ((n = fread(buffer, sizeof(char), sizeof(buffer), fsrc)) > 0) {
+        if (fwrite(buffer, sizeof(char), n, fdst) != n) {
+            return false;
+        }
+    }
+    // fread returns 0 on both EOF and error, and fwrite may only have buffered the data: check both explicitly.
+    return ferror(fsrc) == 0 && fflush(fdst) == 0;
+}
+
+
+
diff --git a/src/nvcore/FileSystem.h b/src/nvcore/FileSystem.h
index f0f06aa..17379fb 100644
--- a/src/nvcore/FileSystem.h
+++ b/src/nvcore/FileSystem.h
@@ -15,7 +15,7 @@ namespace nv
         NVCORE_API bool createDirectory(const char * path);
         NVCORE_API bool changeDirectory(const char * path);
         NVCORE_API bool removeFile(const char * path);
-
+        NVCORE_API bool copyFile(const char * src, const char * dst);
     } // FileSystem namespace
 
 } // nv namespace
diff --git a/src/nvcore/ForEach.h b/src/nvcore/ForEach.h
index 078227f..d7a89cc 100644
--- a/src/nvcore/ForEach.h
+++ b/src/nvcore/ForEach.h
@@ -33,6 +33,8 @@ https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6709
 
 #else // If typeof not available:
 
+#define NV_NEED_PSEUDOINDEX_WRAPPER 1
+
 #include <new> // placement new
 
 struct PseudoIndexWrapper {
diff --git a/src/nvcore/Memory.cpp b/src/nvcore/Memory.cpp
index 644f40a..ab8f5d1 100644
--- a/src/nvcore/Memory.cpp
+++ b/src/nvcore/Memory.cpp
@@ -2,6 +2,7 @@
 
 #include "Memory.h"
 #include "Debug.h"
+#include "Utils.h"
 
 #include <stdlib.h>
 
@@ -56,6 +57,7 @@ void * realloc(void * ptr, size_t size)
 #endif
 }
 
+
 /* No need to override this unless we want line info.
 void * operator new (size_t size) throw()
 {
@@ -116,4 +118,32 @@ void operator delete(void* p, const std::nothrow_t&) throw()
 
 #endif // NV_OVERRIDE_ALLOC
 
+void * nv::aligned_malloc(size_t size, size_t alignment)
+{
+    // Allocates size bytes. alignment must be a power of two, multiple of sizeof(void*)
+    nvDebugCheck(isPowerOfTwo(alignment));
+    nvDebugCheck((alignment & (sizeof(void*) - 1)) == 0);
+
+#if NV_OS_WIN32 || NV_OS_DURANGO
+    return _aligned_malloc(size, alignment);
+#elif NV_OS_DARWIN && !NV_OS_IOS
+    void * ptr = NULL;
+    posix_memalign(&ptr, alignment, size);
+    return ptr;
+#elif NV_OS_LINUX
+    return memalign(alignment, size);
+#else // NV_OS_ORBIS || NV_OS_IOS
+    // @@ IC: iOS appears to be 16 byte aligned, should we check alignment and assert if we request a higher alignment factor?
+    return ::malloc(size);
+#endif
+}
+
+void nv::aligned_free(void * ptr)
+{
+#if NV_OS_WIN32 || NV_OS_DURANGO
+    _aligned_free(ptr);
+#else
+    ::free(ptr);
+#endif
+}
 
diff --git a/src/nvcore/Memory.h b/src/nvcore/Memory.h
index a7fe197..5739e49 100644
--- a/src/nvcore/Memory.h
+++ b/src/nvcore/Memory.h
@@ -7,10 +7,16 @@
 #include "nvcore.h"
 
 #include <stdlib.h> // malloc(), realloc() and free()
+#include <string.h> // memset
 //#include <stddef.h> // size_t
 
 //#include <new>	// new and delete
 
+#define TRACK_MEMORY_LEAKS 0
+#if TRACK_MEMORY_LEAKS
+#include <vld.h>
+#endif
+
 
 #if NV_CC_GNUC
 #   define NV_ALIGN_16 __attribute__ ((__aligned__ (16)))
@@ -41,6 +47,8 @@ extern "C" {
 #endif
 
 namespace nv {
+    NVCORE_API void * aligned_malloc(size_t size, size_t alignment);
+    NVCORE_API void aligned_free(void * );
 
     // C++ helpers.
     template <typename T> NV_FORCEINLINE T * malloc(size_t count) {
diff --git a/src/nvcore/Ptr.h b/src/nvcore/Ptr.h
index 7275c43..b1f7228 100644
--- a/src/nvcore/Ptr.h
+++ b/src/nvcore/Ptr.h
@@ -113,7 +113,7 @@ namespace nv
     public:
 
         // BaseClass must implement addRef() and release().
-        typedef SmartPtr<BaseClass>	ThisType;
+        typedef SmartPtr<BaseClass> ThisType;
 
         /// Default ctor.
         SmartPtr() : m_ptr(NULL) 
diff --git a/src/nvcore/StdStream.h b/src/nvcore/StdStream.h
index dbebff2..8dbdf99 100644
--- a/src/nvcore/StdStream.h
+++ b/src/nvcore/StdStream.h
@@ -213,9 +213,12 @@ namespace nv
 #elif NV_OS_LINUX
             return (uint)fread_unlocked(data, 1, len, m_fp);
 #elif NV_OS_DARWIN
-            // @@ No error checking, always returns len.
+            // This is rather lame. Not sure if it's faster than the locked version.
             for (uint i = 0; i < len; i++) {
                 ((char *)data)[i] = getc_unlocked(m_fp);
+                if (feof_unlocked(m_fp) != 0) {
+                    return i;
+                }
             }
             return len;
 #else
diff --git a/src/nvcore/StrLib.cpp b/src/nvcore/StrLib.cpp
index 72d6612..b285402 100644
--- a/src/nvcore/StrLib.cpp
+++ b/src/nvcore/StrLib.cpp
@@ -347,26 +347,36 @@ StringBuilder & StringBuilder::formatList( const char * fmt, va_list arg )
 }
 
 
-/** Append a string. */
-StringBuilder & StringBuilder::append( const char * s )
+// Append a character.
+StringBuilder & StringBuilder::append( char c )
 {
-	return append(s, U32(strlen( s )));
+    return append(&c, 1);
 }
 
+// Append a string.
+StringBuilder & StringBuilder::append( const char * s )
+{
+    return append(s, U32(strlen( s )));
+}
 
-/** Append a string. */
+// Append a string.
 StringBuilder & StringBuilder::append(const char * s, uint len)
 {
     nvDebugCheck(s != NULL);
 
-	uint offset = length();
-	const uint size = offset + len + 1;
-	reserve(size);
-	strCpy(m_str + offset, len + 1, s, len);
+    uint offset = length();
+    const uint size = offset + len + 1;
+    reserve(size);
+    strCpy(m_str + offset, len + 1, s, len);
 
     return *this;
 }
 
+StringBuilder & StringBuilder::append(const StringBuilder & str)
+{
+    return append(str.m_str, str.length());
+}
+
 
 /** Append a formatted string. */
 StringBuilder & StringBuilder::appendFormat( const char * fmt, ... )
@@ -516,6 +526,19 @@ StringBuilder & StringBuilder::copy( const StringBuilder & s )
     return *this;
 }
 
+void StringBuilder::removeChar(char c)
+{
+    char * src = (m_str != NULL) ? strchr(m_str, c) : NULL;   // A null builder has no buffer to search.
+    if (src != NULL && *src != '\0') {   // Removes the first occurrence only; c == '\0' matches the terminator and is ignored.
+        char * dst = src;
+        src++;
+        while (*src) {
+            *dst++ = *src++;
+        }
+        *dst = '\0';
+    }
+}
+
 bool StringBuilder::endsWith(const char * str) const
 {
     uint l = uint(strlen(str));
@@ -530,7 +553,7 @@ bool StringBuilder::beginsWith(const char * str) const
     return strncmp(m_str, str, l) == 0;
 }
 
-// Find given char starting from the end.
+// Find given char starting from the end. Why not use strrchr!?
 char * StringBuilder::reverseFind(char c)
 {
     int length = (int)strlen(m_str) - 1;
@@ -563,6 +586,19 @@ char * StringBuilder::release()
     return str;
 }
 
+// Take ownership of string.
+void StringBuilder::acquire(char * str)
+{
+    if (str) {
+        m_size = strLen(str) + 1;
+        m_str = str;
+    }
+    else {
+        m_size = 0;
+        m_str = NULL;
+    }
+}
+
 // Swap strings.
 void nv::swap(StringBuilder & a, StringBuilder & b) {
     swap(a.m_size, b.m_size);
@@ -585,19 +621,20 @@ const char * Path::extension() const
 
 
 /*static */void Path::translatePath(char * path, char pathSeparator/*= NV_PATH_SEPARATOR*/) {
-    nvCheck(path != NULL);
-
-    for (int i = 0;; i++) {
-        if (path[i] == '\0') break;
-        if (path[i] == '\\' || path[i] == '/') path[i] = pathSeparator;
+    if (path != NULL) {
+        for (int i = 0;; i++) {
+            if (path[i] == '\0') break;
+            if (path[i] == '\\' || path[i] == '/') path[i] = pathSeparator;
+        }
     }
 }
 
 /// Toggles path separators (ie. \\ into /).
 void Path::translatePath(char pathSeparator/*=NV_PATH_SEPARATOR*/)
 {
-    nvCheck(!isNull());
-    translatePath(m_str, pathSeparator);
+    if (!isNull()) {
+        translatePath(m_str, pathSeparator);
+    }
 }
 
 void Path::appendSeparator(char pathSeparator/*=NV_PATH_SEPARATOR*/)
diff --git a/src/nvcore/StrLib.h b/src/nvcore/StrLib.h
index f4f3ac3..c6ab71d 100644
--- a/src/nvcore/StrLib.h
+++ b/src/nvcore/StrLib.h
@@ -105,8 +105,10 @@ namespace nv
         StringBuilder & format( const char * format, ... ) __attribute__((format (printf, 2, 3)));
         StringBuilder & formatList( const char * format, va_list arg );
 
+        StringBuilder & append(char c);
         StringBuilder & append(const char * str);
-		StringBuilder & append(const char * str, uint len);
+        StringBuilder & append(const char * str, uint len);
+        StringBuilder & append(const StringBuilder & str);
         StringBuilder & appendFormat(const char * format, ...) __attribute__((format (printf, 2, 3)));
         StringBuilder & appendFormatList(const char * format, va_list arg);
 
@@ -122,6 +124,8 @@ namespace nv
 
         StringBuilder & toLower();
         StringBuilder & toUpper();
+        
+        void removeChar(char c);
 
         bool endsWith(const char * str) const;
         bool beginsWith(const char * str) const;
@@ -129,15 +133,16 @@ namespace nv
         char * reverseFind(char c);
 
         void reset();
-        bool isNull() const { return m_size == 0; }
+        NV_FORCEINLINE bool isNull() const { return m_size == 0; }
 
         // const char * accessors
         //operator const char * () const { return m_str; }
         //operator char * () { return m_str; }
-        const char * str() const { return m_str; }
-        char * str() { return m_str; }
+        NV_FORCEINLINE const char * str() const { return m_str; }
+        NV_FORCEINLINE char * str() { return m_str; }
 
-        char * release();
+        char * release();       // Release ownership of string.
+        void acquire(char *);   // Take ownership of string.
 
         /// Implement value semantics.
         StringBuilder & operator=( const StringBuilder & s ) {
@@ -280,25 +285,25 @@ namespace nv
         /// Equal operator.
         bool operator==( const String & str ) const
         {
-            return strMatch(str.data, data);
+            return strEqual(str.data, data);
         }
 
         /// Equal operator.
         bool operator==( const char * str ) const
         {
-            return strMatch(str, data);
+            return strEqual(str, data);
         }
 
         /// Not equal operator.
         bool operator!=( const String & str ) const
         {
-            return !strMatch(str.data, data);
+            return !strEqual(str.data, data);
         }
 
         /// Not equal operator.
         bool operator!=( const char * str ) const
         {
-            return !strMatch(str, data);
+            return !strEqual(str, data);
         }
 
         /// Returns true if this string is the null string.
diff --git a/src/nvcore/Stream.h b/src/nvcore/Stream.h
index 513cd0c..8e74380 100644
--- a/src/nvcore/Stream.h
+++ b/src/nvcore/Stream.h
@@ -76,13 +76,13 @@ namespace nv
         void advance(uint offset) { seek(tell() + offset); }
 
 
-        // friends	
+        // friends
         friend Stream & operator<<( Stream & s, bool & c ) {
 #if NV_OS_DARWIN && !NV_CC_CPP11
             nvStaticCheck(sizeof(bool) == 4);
             uint8 b = c ? 1 : 0;
             s.serialize( &b, 1 );
-            c = (b == 1);
+            c = (b != 0);
 #else
             nvStaticCheck(sizeof(bool) == 1);
             s.serialize( &c, 1 );
diff --git a/src/nvcore/Utils.h b/src/nvcore/Utils.h
index 778b252..2eb692c 100644
--- a/src/nvcore/Utils.h
+++ b/src/nvcore/Utils.h
@@ -39,6 +39,28 @@ namespace nv
 
     // These intentionally look like casts.
 
+    // uint64 casts:
+    template <typename T> inline uint64 U64(T x) { return x; }
+    //template <> inline uint64 U64<uint64>(uint64 x) { return x; }
+    template <> inline uint64 U64<int64>(int64 x) { nvDebugCheck(x >= 0); return (uint64)x; }
+    //template <> inline uint64 U32<uint32>(uint32 x) { return x; }
+    template <> inline uint64 U64<int32>(int32 x) { nvDebugCheck(x >= 0); return (uint64)x; }
+    //template <> inline uint64 U64<uint16>(uint16 x) { return x; }
+    template <> inline uint64 U64<int16>(int16 x) { nvDebugCheck(x >= 0); return (uint64)x; }
+    //template <> inline uint64 U64<uint8>(uint8 x) { return x; }
+    template <> inline uint64 U64<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint64)x; }
+
+    // int64 casts:
+    template <typename T> inline int64 I64(T x) { return x; }
+    template <> inline int64 I64<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT64_MAX); return (int64)x; }
+    //template <> inline uint64 U64<int64>(int64 x) { return x; }
+    //template <> inline uint64 U32<uint32>(uint32 x) { return x; }
+    //template <> inline uint64 U64<int32>(int32 x) { return x; }
+    //template <> inline uint64 U64<uint16>(uint16 x) { return x; }
+    //template <> inline uint64 U64<int16>(int16 x) { return x; }
+    //template <> inline uint64 U64<uint8>(uint8 x) { return x; }
+    //template <> inline uint64 U64<int8>(int8 x) { return x; }
+
     // uint32 casts:
     template <typename T> inline uint32 U32(T x) { return x; }
     template <> inline uint32 U32<uint64>(uint64 x) { nvDebugCheck(x <= NV_UINT32_MAX); return (uint32)x; }
@@ -50,6 +72,11 @@ namespace nv
     //template <> inline uint32 U32<uint8>(uint8 x) { return x; }
     template <> inline uint32 U32<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint32)x; }
 
+#if NV_OS_DARWIN
+    template <> inline uint32 U32<unsigned long>(unsigned long x) { nvDebugCheck(x <= NV_UINT32_MAX); return (uint32)x; }
+    template <> inline uint32 U32<long>(long x) { nvDebugCheck(x >= 0 && x <= NV_UINT32_MAX); return (uint32)x; }
+#endif
+
     // int32 casts:
     template <typename T> inline int32 I32(T x) { return x; }
     template <> inline int32 I32<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; }
@@ -182,7 +209,7 @@ namespace nv
     * @note isPowerOfTwo(x) == true -> nextPowerOfTwo(x) == x
     * @note nextPowerOfTwo(x) = 2 << log2(x-1)
     */
-    inline uint nextPowerOfTwo( uint x )
+    inline uint32 nextPowerOfTwo(uint32 x)
     {
         nvDebugCheck( x != 0 );
 #if 1	// On modern CPUs this is supposed to be as fast as using the bsr instruction.
@@ -202,8 +229,19 @@ namespace nv
 #endif
     }
 
-    /// Return true if @a n is a power of two.
-    inline bool isPowerOfTwo( uint n )
+    inline uint64 nextPowerOfTwo(uint64 x)
+    {
+        nvDebugCheck(x != 0);   // Precondition: 0 < x <= 2^63, otherwise the result is not representable.
+        uint64 p = 1;           // Must be 64 bits wide: a 32-bit accumulator wraps to 0 for x > 2^31 and the loop never ends.
+        while (x > p) {
+            p += p;
+        }
+        return p;
+    }
+
+    // @@ Should I just use a macro instead?
+    template <typename T>
+    inline bool isPowerOfTwo(T n)
     {
         return (n & (n-1)) == 0;
     }
diff --git a/src/nvcore/nvcore.h b/src/nvcore/nvcore.h
index b402bb2..ecbaa5b 100644
--- a/src/nvcore/nvcore.h
+++ b/src/nvcore/nvcore.h
@@ -56,6 +56,7 @@
 #   define NV_OS_MINGW 1
 #   define NV_OS_WIN32 1
 #elif defined POSH_OS_OSX
+#   define NV_OS_OSX 1      // IC: Adding this, because iOS defines NV_OS_DARWIN too.
 #   define NV_OS_DARWIN 1
 #   define NV_OS_UNIX 1
 #elif defined POSH_OS_IOS
@@ -78,9 +79,9 @@
 
 // Threading:
 // some platforms don't implement __thread or similar for thread-local-storage
-#if NV_OS_UNIX || NV_OS_ORBIS || NV_OS_IOS //ACStodoIOS darwin instead of ios?
+#if NV_OS_UNIX || NV_OS_ORBIS || NV_OS_IOS
 #   define NV_OS_USE_PTHREAD 1
-#   if NV_OS_DARWIN || NV_OS_IOS
+#   if 0  //Apple finally added TLS support to iOS!// NV_OS_IOS
 #       define NV_OS_HAS_TLS_QUALIFIER 0
 #   else
 #       define NV_OS_HAS_TLS_QUALIFIER 1
@@ -96,7 +97,7 @@
 // NV_CPU_X86_64
 // NV_CPU_PPC
 // NV_CPU_ARM
-// NV_CPU_AARCH64
+// NV_CPU_ARM_64
 
 #define NV_CPU_STRING   POSH_CPU_STRING
 
@@ -110,7 +111,7 @@
 #elif defined POSH_CPU_STRONGARM
 #   define NV_CPU_ARM 1
 #elif defined POSH_CPU_AARCH64
-#   define NV_CPU_AARCH64 1
+#   define NV_CPU_ARM_64 1
 #else
 #   error "Unsupported CPU"
 #endif
@@ -148,10 +149,16 @@
 #endif
 
 // Endiannes:
-#define NV_LITTLE_ENDIAN    POSH_LITTLE_ENDIAN
-#define NV_BIG_ENDIAN       POSH_BIG_ENDIAN
-#define NV_ENDIAN_STRING    POSH_ENDIAN_STRING
-
+// @@ POSH endian detection is broken for arm64 on iOS. They are bi-endian and iOS sets all their processors to little endian by default.
+#if NV_OS_IOS
+#   define NV_LITTLE_ENDIAN    1
+#   define NV_BIG_ENDIAN       0
+#   define NV_ENDIAN_STRING    "little"
+#else
+#   define NV_LITTLE_ENDIAN    POSH_LITTLE_ENDIAN
+#   define NV_BIG_ENDIAN       POSH_BIG_ENDIAN
+#   define NV_ENDIAN_STRING    POSH_ENDIAN_STRING
+#endif
 
 // Define the right printf prefix for size_t arguments:
 #if POSH_64BIT_POINTER
@@ -164,6 +171,28 @@
 // cmake config
 #include "nvconfig.h"
 
+#if NV_OS_DARWIN
+#include <stdint.h>
+//#include <inttypes.h>
+
+// Type definitions:
+typedef uint8_t     uint8;
+typedef int8_t      int8;
+
+typedef uint16_t    uint16;
+typedef int16_t     int16;
+
+typedef uint32_t    uint32;
+typedef int32_t     int32;
+
+typedef uint64_t    uint64;
+typedef int64_t     int64;
+
+// POSH gets this wrong due to __LP64__
+#undef POSH_I64_PRINTF_PREFIX
+#define POSH_I64_PRINTF_PREFIX "ll"
+
+#else
 
 // Type definitions:
 typedef posh_u8_t   uint8;
@@ -175,8 +204,23 @@ typedef posh_i16_t  int16;
 typedef posh_u32_t  uint32;
 typedef posh_i32_t  int32;
 
+//#if NV_OS_DARWIN
+// OSX-64 is supposed to be LP64 (longs and pointers are 64 bits), thus uint64 is defined as 
+// unsigned long. However, some OSX headers define it as unsigned long long, producing errors,
+// even though both types are 64 bit. Ideally posh should handle that, but it has not been
+// updated in ages, so here I'm just falling back to the standard C99 types defined in inttypes.h
+//#include <inttypes.h>
+//typedef posh_u64_t  uint64_t;
+//typedef posh_i64_t  int64_t;
+//#else
 typedef posh_u64_t  uint64;
 typedef posh_i64_t  int64;
+//#endif
+#if NV_OS_DARWIN
+// To avoid duplicate definitions.
+#define _UINT64
+#endif
+#endif
 
 // Aliases
 typedef uint32      uint;
@@ -246,8 +290,10 @@ NV_COMPILER_CHECK(sizeof(uint32) == 4);
 NV_COMPILER_CHECK(sizeof(int32) == 4);
 NV_COMPILER_CHECK(sizeof(uint32) == 4);
 
-
-#define NV_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
+#include <stddef.h> // for size_t
+template <typename T, size_t N> char (&ArraySizeHelper(T (&array)[N]))[N];
+#define NV_ARRAY_SIZE(x) sizeof(ArraySizeHelper(x))
+//#define NV_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
 
 #if 0 // Disabled in The Witness.
 #if NV_CC_MSVC
@@ -269,8 +315,38 @@ NV_COMPILER_CHECK(sizeof(uint32) == 4);
         NV_STRING_JOIN3(AtStartup_, __LINE__, Instance); \
     }
 
+namespace nv {
+    template <typename F>
+    struct ScopeExit {
+        ScopeExit(F f) : f(f) {}
+        ~ScopeExit() { f(); }
+        F f;
+    };
+
+    template <typename F>
+    ScopeExit<F> MakeScopeExit(F f) {
+        return ScopeExit<F>(f);
+    };
+}
+
+#define NV_ON_RETURN(code) \
+    auto NV_STRING_JOIN2(scope_exit_, __LINE__) = nv::MakeScopeExit([=](){code;})
+
+
 // Indicate the compiler that the parameter is not used to suppress compier warnings.
+#if NV_CC_MSVC
 #define NV_UNUSED(a) ((a)=(a))
+#else
+#define NV_UNUSED(a) ((void)(a))   // Portable; GCC only honors '#pragma unused' on Darwin targets.
+#endif
+
+#if NV_CC_GNUC || NV_CC_CLANG
+#define NV_LIKELY(x) __builtin_expect(!!(x), 1)
+#define NV_UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+#define NV_LIKELY(x) x
+#define NV_UNLIKELY(x) x
+#endif
 
 // Null index. @@ Move this somewhere else... it's only used by nvmesh.
 //const unsigned int NIL = unsigned int(~0);
diff --git a/src/nvimage/BlockDXT.cpp b/src/nvimage/BlockDXT.cpp
index 9d334c4..e98d8fa 100644
--- a/src/nvimage/BlockDXT.cpp
+++ b/src/nvimage/BlockDXT.cpp
@@ -632,44 +632,45 @@ void BlockCTX1::setIndices(int * idx)
 
 
 /// Decode BC6 block.
-void BlockBC6::decodeBlock(Vector3 colors[16]) const
-{
-	ZOH::Tile tile(4, 4);
-	ZOH::decompress((const char *)data, tile);
-
-	// Convert ZOH's tile struct to Vector3, and convert half to float.
-	for (uint y = 0; y < 4; ++y)
-	{
-		for (uint x = 0; x < 4; ++x)
-		{
-			uint16 rHalf = ZOH::Tile::float2half(tile.data[y][x].x);
-			uint16 gHalf = ZOH::Tile::float2half(tile.data[y][x].y);
-			uint16 bHalf = ZOH::Tile::float2half(tile.data[y][x].z);
-			colors[y * 4 + x].x = to_float(rHalf);
-			colors[y * 4 + x].y = to_float(gHalf);
-			colors[y * 4 + x].z = to_float(bHalf);
-		}
-	}
+void BlockBC6::decodeBlock(Vector4 colors[16]) const
+{
+    ZOH::Tile tile(4, 4);
+    ZOH::decompress((const char *)data, tile);
+
+    // Convert ZOH's tile struct to Vector3, and convert half to float.
+    for (uint y = 0; y < 4; ++y)
+    {
+        for (uint x = 0; x < 4; ++x)
+        {
+            uint16 rHalf = ZOH::Tile::float2half(tile.data[y][x].x);
+            uint16 gHalf = ZOH::Tile::float2half(tile.data[y][x].y);
+            uint16 bHalf = ZOH::Tile::float2half(tile.data[y][x].z);
+            colors[y * 4 + x].x = to_float(rHalf);
+            colors[y * 4 + x].y = to_float(gHalf);
+            colors[y * 4 + x].z = to_float(bHalf);
+            colors[y * 4 + x].w = 1.0f;
+        }
+    }
 }
 
 
 /// Decode BC7 block.
 void BlockBC7::decodeBlock(ColorBlock * block) const
 {
-	AVPCL::Tile tile(4, 4);
-	AVPCL::decompress((const char *)data, tile);
-
-	// Convert AVPCL's tile struct back to NVTT's.
-	for (uint y = 0; y < 4; ++y)
-	{
-		for (uint x = 0; x < 4; ++x)
-		{
-			Vector4 rgba = tile.data[y][x];
-			// Note: decoded rgba values are in [0, 255] range and should be an integer,
-			// because BC7 never uses more than 8 bits per channel.  So no need to round.
-			block->color(x, y).setRGBA(uint8(rgba.x), uint8(rgba.y), uint8(rgba.z), uint8(rgba.w));
-		}
-	}
+    AVPCL::Tile tile(4, 4);
+    AVPCL::decompress((const char *)data, tile);
+
+    // Convert AVPCL's tile struct back to NVTT's.
+    for (uint y = 0; y < 4; ++y)
+    {
+        for (uint x = 0; x < 4; ++x)
+        {
+            Vector4 rgba = tile.data[y][x];
+            // Note: decoded rgba values are in [0, 255] range and should be an integer,
+            // because BC7 never uses more than 8 bits per channel.  So no need to round.
+            block->color(x, y).setRGBA(uint8(rgba.x), uint8(rgba.y), uint8(rgba.z), uint8(rgba.w));
+        }
+    }
 }
 
 
diff --git a/src/nvimage/BlockDXT.h b/src/nvimage/BlockDXT.h
index 40f615f..c462761 100644
--- a/src/nvimage/BlockDXT.h
+++ b/src/nvimage/BlockDXT.h
@@ -36,6 +36,7 @@ namespace nv
     struct AlphaBlock4x4;
     class Stream;
     class Vector3;
+    class Vector4;
 
 
     /// DXT1 block.
@@ -220,7 +221,7 @@ namespace nv
 	struct BlockBC6
 	{
 		uint8 data[16];		// Not even going to try to write a union for this thing.
-		void decodeBlock(Vector3 colors[16]) const;
+        void decodeBlock(Vector4 colors[16]) const;
 	};
 
 	/// BC7 block.
diff --git a/src/nvimage/CMakeLists.txt b/src/nvimage/CMakeLists.txt
index dce627d..42f21f4 100644
--- a/src/nvimage/CMakeLists.txt
+++ b/src/nvimage/CMakeLists.txt
@@ -14,7 +14,8 @@ SET(IMAGE_SRCS
     NormalMap.h NormalMap.cpp
     PixelFormat.h
     PsdFile.h
-    TgaFile.h)
+    TgaFile.h
+    KtxFile.h KtxFile.cpp)
 
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 
diff --git a/src/nvimage/DirectDrawSurface.cpp b/src/nvimage/DirectDrawSurface.cpp
index 3be26b9..be8146e 100644
--- a/src/nvimage/DirectDrawSurface.cpp
+++ b/src/nvimage/DirectDrawSurface.cpp
@@ -454,7 +454,8 @@ namespace
 
         { D3DFMT_L8,             DXGI_FORMAT_R8_UNORM ,         { 8,  0xFF,       0,          0,          0 } },
         { D3DFMT_L16,            DXGI_FORMAT_R16_UNORM,         { 16, 0xFFFF,     0,          0,          0 } },
-        { D3DFMT_A8L8,           DXGI_FORMAT_R8G8_UNORM,        { 16, 0xFF,       0,          0,     0xFF00 } },
+        { D3DFMT_A8L8,           0,                             { 16, 0xFF,       0,          0,     0xFF00 } },
+        { 0,                     DXGI_FORMAT_R8G8_UNORM,        { 16, 0xFF,       0xFF00,     0,          0 } },
     };
 
     static const uint s_formatCount = NV_ARRAY_SIZE(s_formats);
@@ -635,7 +636,7 @@ void DDSHeader::setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3)
 {
     // set fourcc pixel format.
     this->pf.flags = DDPF_FOURCC;
-    this->pf.fourcc = MAKEFOURCC(c0, c1, c2, c3);
+    this->pf.fourcc = NV_MAKEFOURCC(c0, c1, c2, c3);
 
     this->pf.bitcount = 0;
     this->pf.rmask = 0;
@@ -659,7 +660,7 @@ void DDSHeader::setFormatCode(uint32 code)
 
 void DDSHeader::setSwizzleCode(uint8 c0, uint8 c1, uint8 c2, uint8 c3)
 {
-    this->pf.bitcount = MAKEFOURCC(c0, c1, c2, c3);
+    this->pf.bitcount = NV_MAKEFOURCC(c0, c1, c2, c3);
 }
 
 
@@ -1445,7 +1446,7 @@ void DirectDrawSurface::readBlock(ColorBlock * rgba)
     {
         BlockBC6 block;
         *stream << block;
-        Vector3 colors[16];
+        Vector4 colors[16];
         block.decodeBlock(colors);
 
         // Clamp to [0, 1] and round to 8-bit
@@ -1453,7 +1454,7 @@ void DirectDrawSurface::readBlock(ColorBlock * rgba)
         {
             for (int x = 0; x < 4; ++x)
             {
-                Vector3 px = colors[y*4 + x];
+                Vector4 px = colors[y*4 + x];
                 rgba->color(x, y).setRGBA(
                                     ftoi_round(clamp(px.x, 0.0f, 1.0f) * 255.0f),
                                     ftoi_round(clamp(px.y, 0.0f, 1.0f) * 255.0f),
@@ -1535,7 +1536,7 @@ uint DirectDrawSurface::surfaceSize(uint mipmap) const
     else {
         w = (w + 3) / 4;
         h = (h + 3) / 4;
-        d = d; // @@ How are 3D textures aligned?
+        //d = d; // @@ How are 3D textures aligned?
         return blockSize * w * h * d;
     }
 }
diff --git a/src/nvimage/DirectDrawSurface.h b/src/nvimage/DirectDrawSurface.h
index 5b48a8c..1049f06 100644
--- a/src/nvimage/DirectDrawSurface.h
+++ b/src/nvimage/DirectDrawSurface.h
@@ -27,11 +27,9 @@
 
 #include "nvimage.h"
 
-#if !defined(MAKEFOURCC)
-#define MAKEFOURCC(ch0, ch1, ch2, ch3) \
+#define NV_MAKEFOURCC(ch0, ch1, ch2, ch3) \
     (uint(uint8(ch0)) | (uint(uint8(ch1)) << 8) | \
     (uint(uint8(ch2)) << 16) | (uint(uint8(ch3)) << 24 ))
-#endif
 
 namespace nv
 {
@@ -101,19 +99,26 @@ namespace nv
 
     enum FOURCC
     {
-        FOURCC_NVTT = MAKEFOURCC('N', 'V', 'T', 'T'),
-        FOURCC_DDS = MAKEFOURCC('D', 'D', 'S', ' '),
-        FOURCC_DXT1 = MAKEFOURCC('D', 'X', 'T', '1'),
-        FOURCC_DXT2 = MAKEFOURCC('D', 'X', 'T', '2'),
-        FOURCC_DXT3 = MAKEFOURCC('D', 'X', 'T', '3'),
-        FOURCC_DXT4 = MAKEFOURCC('D', 'X', 'T', '4'),
-        FOURCC_DXT5 = MAKEFOURCC('D', 'X', 'T', '5'),
-        FOURCC_RXGB = MAKEFOURCC('R', 'X', 'G', 'B'),
-        FOURCC_ATI1 = MAKEFOURCC('A', 'T', 'I', '1'),
-        FOURCC_ATI2 = MAKEFOURCC('A', 'T', 'I', '2'),
-        FOURCC_A2XY = MAKEFOURCC('A', '2', 'X', 'Y'),
-        FOURCC_DX10 = MAKEFOURCC('D', 'X', '1', '0'),
-        FOURCC_UVER = MAKEFOURCC('U', 'V', 'E', 'R'),
+        FOURCC_NVTT = NV_MAKEFOURCC('N', 'V', 'T', 'T'),
+        FOURCC_DDS = NV_MAKEFOURCC('D', 'D', 'S', ' '),
+        FOURCC_DXT1 = NV_MAKEFOURCC('D', 'X', 'T', '1'),
+        FOURCC_DXT2 = NV_MAKEFOURCC('D', 'X', 'T', '2'),
+        FOURCC_DXT3 = NV_MAKEFOURCC('D', 'X', 'T', '3'),
+        FOURCC_DXT4 = NV_MAKEFOURCC('D', 'X', 'T', '4'),
+        FOURCC_DXT5 = NV_MAKEFOURCC('D', 'X', 'T', '5'),
+        FOURCC_RXGB = NV_MAKEFOURCC('R', 'X', 'G', 'B'),
+        FOURCC_ATI1 = NV_MAKEFOURCC('A', 'T', 'I', '1'),
+        FOURCC_ATI2 = NV_MAKEFOURCC('A', 'T', 'I', '2'),
+        FOURCC_A2XY = NV_MAKEFOURCC('A', '2', 'X', 'Y'),
+        FOURCC_DX10 = NV_MAKEFOURCC('D', 'X', '1', '0'),
+        FOURCC_UVER = NV_MAKEFOURCC('U', 'V', 'E', 'R'),
+        FOURCC_BC6H = NV_MAKEFOURCC('B', 'C', '6', 'H'),
+        FOURCC_BC7L = NV_MAKEFOURCC('B', 'C', '7', 'L'),
+        
+        FOURCC_PVR0 = NV_MAKEFOURCC('P', 'V', 'R', '0'),
+        FOURCC_PVR1 = NV_MAKEFOURCC('P', 'V', 'R', '1'),
+        FOURCC_PVR2 = NV_MAKEFOURCC('P', 'V', 'R', '2'),
+        FOURCC_PVR3 = NV_MAKEFOURCC('P', 'V', 'R', '3'),
     };
 
 
diff --git a/src/nvimage/ErrorMetric.cpp b/src/nvimage/ErrorMetric.cpp
index 3f10a72..3f632ef 100644
--- a/src/nvimage/ErrorMetric.cpp
+++ b/src/nvimage/ErrorMetric.cpp
@@ -1,460 +1,513 @@
-
-#include "ErrorMetric.h"
-#include "FloatImage.h"
-#include "Filter.h"
-
-#include "nvmath/Matrix.h"
-#include "nvmath/Vector.inl"
-
-#include <float.h> // FLT_MAX
-
-using namespace nv;
-
-float nv::rmsColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight)
-{
-    if (!sameLayout(img, ref)) {
-        return FLT_MAX;
-    }
-    nvDebugCheck(img->componentCount() == 4);
-    nvDebugCheck(ref->componentCount() == 4);
-
-    double mse = 0;
-
-    const uint count = img->pixelCount();
-    for (uint i = 0; i < count; i++)
-    {
-        float r0 = ref->pixel(i + count * 0);
-        float g0 = ref->pixel(i + count * 1);
-        float b0 = ref->pixel(i + count * 2);
-        float a0 = ref->pixel(i + count * 3);
-        float r1 = img->pixel(i + count * 0);
-        float g1 = img->pixel(i + count * 1);
-        float b1 = img->pixel(i + count * 2);
-        //float a1 = img->pixel(i + count * 3);
-
-        float r = r0 - r1;
-        float g = g0 - g1;
-        float b = b0 - b1;
-
-        float a = 1;
-        if (alphaWeight) a = a0 * a0; // @@ a0*a1 or a0*a0 ?
-
-        mse += (r * r) * a;
-        mse += (g * g) * a;
-        mse += (b * b) * a;
-    }
-
-    return float(sqrt(mse / count));
-}
-
-float nv::rmsAlphaError(const FloatImage * ref, const FloatImage * img)
-{
-    if (!sameLayout(img, ref)) {
-        return FLT_MAX;
-    }
-    nvDebugCheck(img->componentCount() == 4 && ref->componentCount() == 4);
-
-    double mse = 0;
-
-    const uint count = img->pixelCount();
-    for (uint i = 0; i < count; i++)
-    {
-        float a0 = img->pixel(i + count * 3);
-        float a1 = ref->pixel(i + count * 3);
-
-        float a = a0 - a1;
-
-        mse += a * a;
-    }
-
-    return float(sqrt(mse / count));
-}
-
-
-float nv::averageColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight)
-{
-    if (!sameLayout(img, ref)) {
-        return FLT_MAX;
-    }
-    nvDebugCheck(img->componentCount() == 4);
-    nvDebugCheck(ref->componentCount() == 4);
-
-    double mae = 0;
-
-    const uint count = img->pixelCount();
-    for (uint i = 0; i < count; i++)
-    {
-        float r0 = img->pixel(i + count * 0);
-        float g0 = img->pixel(i + count * 1);
-        float b0 = img->pixel(i + count * 2);
-        //float a0 = img->pixel(i + count * 3);
-        float r1 = ref->pixel(i + count * 0);
-        float g1 = ref->pixel(i + count * 1);
-        float b1 = ref->pixel(i + count * 2);
-        float a1 = ref->pixel(i + count * 3);
-
-        float r = fabs(r0 - r1);
-        float g = fabs(g0 - g1);
-        float b = fabs(b0 - b1);
-
-        float a = 1;
-        if (alphaWeight) a = a1;
-
-        mae += r * a;
-        mae += g * a;
-        mae += b * a;
-    }
-
-    return float(mae / count);
-}
-
-float nv::averageAlphaError(const FloatImage * ref, const FloatImage * img)
-{
-    if (img == NULL || ref == NULL || img->width() != ref->width() || img->height() != ref->height()) {
-        return FLT_MAX;
-    }
-    nvDebugCheck(img->componentCount() == 4 && ref->componentCount() == 4);
-
-    double mae = 0;
-
-    const uint count = img->width() * img->height();
-    for (uint i = 0; i < count; i++)
-    {
-        float a0 = img->pixel(i + count * 3);
-        float a1 = ref->pixel(i + count * 3);
-
-        float a = a0 - a1;
-
-        mae += fabs(a);
-    }
-
-    return float(mae / count);
-}
-
-
-// Color space conversions based on:
-// http://www.brucelindbloom.com/
-
-// Assumes input is in *linear* sRGB color space.
-static Vector3 rgbToXyz(Vector3::Arg c)
-{
-    Vector3 xyz;
-    xyz.x = 0.412453f * c.x + 0.357580f * c.y + 0.180423f * c.z;
-    xyz.y = 0.212671f * c.x + 0.715160f * c.y + 0.072169f * c.z;
-    xyz.z = 0.019334f * c.x + 0.119193f * c.y + 0.950227f * c.z;
-    return xyz;
-}
-
-static Vector3 xyzToRgb(Vector3::Arg c)
-{
-    Vector3 rgb;
-    rgb.x =  3.2404542f * c.x - 1.5371385f * c.y - 0.4985314f * c.z;
-    rgb.y = -0.9692660f * c.x + 1.8760108f * c.y + 0.0415560f * c.z;
-    rgb.z =  0.0556434f * c.x - 0.2040259f * c.y + 1.0572252f * c.z;
-    return rgb;
-}
-
-static float toLinear(float f)
-{
-    return powf(f, 2.2f);
-}
-
-static float toGamma(float f)
-{
-    // @@ Use sRGB space?
-    return powf(f, 1.0f/2.2f);
-}
-
-static Vector3 toLinear(Vector3::Arg c)
-{
-    return Vector3(toLinear(c.x), toLinear(c.y), toLinear(c.z));
-}
-
-static Vector3 toGamma(Vector3::Arg c)
-{
-    return Vector3(toGamma(c.x), toGamma(c.y), toGamma(c.z));
-}
-
-static float f(float t)
-{
-    const float epsilon = powf(6.0f/29.0f, 3);
-
-    if (t > epsilon) {
-        return powf(t, 1.0f/3.0f);
-    }
-    else {
-        return 1.0f/3.0f * powf(29.0f/6.0f, 2) * t + 4.0f / 29.0f;
-    }
-}
-
-static float finv(float t)
-{
-    if (t > 6.0f / 29.0f) {
-        return 3.0f * powf(6.0f / 29.0f, 2) * (t - 4.0f / 29.0f);
-    }
-    else {
-        return powf(t, 3.0f);
-    }
-}
-
-static Vector3 xyzToCieLab(Vector3::Arg c)
-{
-    // Normalized white point.
-    const float Xn = 0.950456f;
-    const float Yn = 1.0f;
-    const float Zn = 1.088754f;
-
-    float Xr = c.x / Xn;
-    float Yr = c.y / Yn;
-    float Zr = c.z / Zn;
-
-    float fx = f(Xr);
-    float fy = f(Yr);
-    float fz = f(Zr);
-
-    float L = 116 * fx - 16;
-    float a = 500 * (fx - fy);
-    float b = 200 * (fy - fz);
-
-    return Vector3(L, a, b);
-}
-
-static Vector3 rgbToCieLab(Vector3::Arg c)
-{
-    return xyzToCieLab(rgbToXyz(toLinear(c)));
-}
-
-// h is hue-angle in radians
-static Vector3 cieLabToLCh(Vector3::Arg c)
-{
-    return Vector3(c.x, sqrtf(c.y*c.y + c.z*c.z), atan2f(c.y, c.z));
-}
-
-static void rgbToCieLab(const FloatImage * rgbImage, FloatImage * LabImage)
-{
-    nvDebugCheck(rgbImage != NULL && LabImage != NULL);
-    nvDebugCheck(rgbImage->width() == LabImage->width() && rgbImage->height() == LabImage->height());
-    nvDebugCheck(rgbImage->componentCount() >= 3 && LabImage->componentCount() >= 3);
-
-    const uint w = rgbImage->width();
-    const uint h = LabImage->height();
-
-    const float * R = rgbImage->channel(0);
-    const float * G = rgbImage->channel(1);
-    const float * B = rgbImage->channel(2);
-
-    float * L = LabImage->channel(0);
-    float * a = LabImage->channel(1);
-    float * b = LabImage->channel(2);
-
-    const uint count = w*h;
-    for (uint i = 0; i < count; i++)
-    {
-        Vector3 Lab = rgbToCieLab(Vector3(R[i], G[i], B[i]));
-        L[i] = Lab.x;
-        a[i] = Lab.y;
-        b[i] = Lab.z;
-    }
-}
-
-
-// Assumes input images are in linear sRGB space.
-float nv::cieLabError(const FloatImage * img0, const FloatImage * img1)
-{
-    if (!sameLayout(img0, img1)) return FLT_MAX;
-    nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4);
-
-    const float * r0 = img0->channel(0);
-    const float * g0 = img0->channel(1);
-    const float * b0 = img0->channel(2);
-
-    const float * r1 = img1->channel(0);
-    const float * g1 = img1->channel(1);
-    const float * b1 = img1->channel(2);
-
-    double error = 0.0f;
-
-    const uint count = img0->pixelCount();
-    for (uint i = 0; i < count; i++)
-    {
-        Vector3 lab0 = rgbToCieLab(Vector3(r0[i], g0[i], b0[i]));
-        Vector3 lab1 = rgbToCieLab(Vector3(r1[i], g1[i], b1[i]));
-
-        // @@ Measure Delta E.
-        Vector3 delta = lab0 - lab1;
-        
-        error += length(delta);
-    }
-
-    return float(error / count);
-}
-
-// Assumes input images are in linear sRGB space.
-float nv::cieLab94Error(const FloatImage * img0, const FloatImage * img1)
-{
-    if (!sameLayout(img0, img1)) return FLT_MAX;
-    nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4);
-
-    const float kL = 1;
-    const float kC = 1;
-    const float kH = 1;
-    const float k1 = 0.045f;
-    const float k2 = 0.015f;
-
-    const float sL = 1;
-
-    const float * r0 = img0->channel(0);
-    const float * g0 = img0->channel(1);
-    const float * b0 = img0->channel(2);
-
-    const float * r1 = img1->channel(0);
-    const float * g1 = img1->channel(1);
-    const float * b1 = img1->channel(2);
-
-    double error = 0.0f;
-
-    const uint count = img0->pixelCount();
-    for (uint i = 0; i < count; ++i)
-    {
-        Vector3 lab0 = rgbToCieLab(Vector3(r0[i], g0[i], b0[i]));
-        Vector3 lch0 = cieLabToLCh(lab0);
-        Vector3 lab1 = rgbToCieLab(Vector3(r1[i], g1[i], b1[i]));
-        Vector3 lch1 = cieLabToLCh(lab1);
-
-        const float sC = 1 + k1*lch0.x;
-        const float sH = 1 + k2*lch0.x;
-
-        // @@ Measure Delta E using the 1994 definition
-        Vector3 labDelta = lab0 - lab1;
-        Vector3 lchDelta = lch0 - lch1;
-
-        double deltaLsq = powf(lchDelta.x / (kL*sL), 2);
-        double deltaCsq = powf(lchDelta.y / (kC*sC), 2);
-
-        // avoid possible sqrt of negative value by computing (deltaH/(kH*sH))^2
-        double deltaHsq = powf(labDelta.y, 2) + powf(labDelta.z, 2) - powf(lchDelta.y, 2);
-        deltaHsq /= powf(kH*sH, 2);
-
-        error += sqrt(deltaLsq + deltaCsq + deltaHsq);
-    }
-
-    return float(error / count);
-}
-
-float nv::spatialCieLabError(const FloatImage * img0, const FloatImage * img1)
-{
-    if (img0 == NULL || img1 == NULL || img0->width() != img1->width() || img0->height() != img1->height()) {
-        return FLT_MAX;
-    }
-    nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4);
-
-    uint w = img0->width();
-    uint h = img0->height();
-    uint d = img0->depth();
-
-    FloatImage lab0, lab1; // Original images in CIE-Lab space.
-    lab0.allocate(3, w, h, d);
-    lab1.allocate(3, w, h, d);
-
-    // Convert input images to CIE-Lab.
-    rgbToCieLab(img0, &lab0);
-    rgbToCieLab(img1, &lab1);
-
-    // @@ Convolve each channel by the corresponding filter.
-    /*
-    GaussianFilter LFilter(5);
-    GaussianFilter aFilter(5);
-    GaussianFilter bFilter(5);
-
-    lab0.convolve(0, LFilter);
-    lab0.convolve(1, aFilter);
-    lab0.convolve(2, bFilter);
-
-    lab1.convolve(0, LFilter);
-    lab1.convolve(1, aFilter);
-    lab1.convolve(2, bFilter);
-    */
-    // @@ Measure Delta E between lab0 and lab1.
-
-    return 0.0f;
-}
-
-
-// Assumes input images are normal maps.
-float nv::averageAngularError(const FloatImage * img0, const FloatImage * img1)
-{
-    if (img0 == NULL || img1 == NULL || img0->width() != img1->width() || img0->height() != img1->height()) {
-        return FLT_MAX;
-    }
-    nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4);
-
-    uint w = img0->width();
-    uint h = img0->height();
-
-    const float * x0 = img0->channel(0);
-    const float * y0 = img0->channel(1);
-    const float * z0 = img0->channel(2);
-
-    const float * x1 = img1->channel(0);
-    const float * y1 = img1->channel(1);
-    const float * z1 = img1->channel(2);
-
-    double error = 0.0f;
-
-    const uint count = w*h;
-    for (uint i = 0; i < count; i++)
-    {
-        Vector3 n0 = Vector3(x0[i], y0[i], z0[i]);
-        Vector3 n1 = Vector3(x1[i], y1[i], z1[i]);
-
-        n0 = 2.0f * n0 - Vector3(1);
-        n1 = 2.0f * n1 - Vector3(1);
-
-        n0 = normalizeSafe(n0, Vector3(0), 0.0f);
-        n1 = normalizeSafe(n1, Vector3(0), 0.0f);
-
-        error += acos(clamp(dot(n0, n1), -1.0f, 1.0f));
-    }
-
-    return float(error / count);
-}
-
-float nv::rmsAngularError(const FloatImage * img0, const FloatImage * img1)
-{
-    if (img0 == NULL || img1 == NULL || img0->width() != img1->width() || img0->height() != img1->height()) {
-        return FLT_MAX;
-    }
-    nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4);
-
-    uint w = img0->width();
-    uint h = img0->height();
-
-    const float * x0 = img0->channel(0);
-    const float * y0 = img0->channel(1);
-    const float * z0 = img0->channel(2);
-
-    const float * x1 = img1->channel(0);
-    const float * y1 = img1->channel(1);
-    const float * z1 = img1->channel(2);
-
-    double error = 0.0f;
-
-    const uint count = w*h;
-    for (uint i = 0; i < count; i++)
-    {
-        Vector3 n0 = Vector3(x0[i], y0[i], z0[i]);
-        Vector3 n1 = Vector3(x1[i], y1[i], z1[i]);
-
-        n0 = 2.0f * n0 - Vector3(1);
-        n1 = 2.0f * n1 - Vector3(1);
-
-        n0 = normalizeSafe(n0, Vector3(0), 0.0f);
-        n1 = normalizeSafe(n1, Vector3(0), 0.0f);
-
-        float angle = acosf(clamp(dot(n0, n1), -1.0f, 1.0f));
-        error += angle * angle;
-    }
-
-    return float(sqrt(error / count));
-}
-
+
+#include "ErrorMetric.h"
+#include "FloatImage.h"
+#include "Filter.h"
+
+#include "nvmath/Matrix.h"
+#include "nvmath/Vector.inl"
+
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+
+// Root-mean-square difference of the RGB channels of 'ref' and 'img'.
+// Yields FLT_MAX when sameLayout() rejects the pair. When 'alphaWeight' is set,
+// each squared difference is scaled by the squared alpha of the reference.
+float nv::rmsColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight)
+{
+    if (!sameLayout(img, ref)) return FLT_MAX;
+    nvDebugCheck(img->componentCount() == 4);
+    nvDebugCheck(ref->componentCount() == 4);
+
+    // Channel k of pixel p is read at linear index p + k * pixelTotal.
+    const uint pixelTotal = img->pixelCount();
+    const uint gOffset = pixelTotal * 1;
+    const uint bOffset = pixelTotal * 2;
+    const uint aOffset = pixelTotal * 3;
+
+    double squaredSum = 0;
+    for (uint p = 0; p != pixelTotal; ++p)
+    {
+        const float refR = ref->pixel(p);
+        const float refG = ref->pixel(p + gOffset);
+        const float refB = ref->pixel(p + bOffset);
+        const float refA = ref->pixel(p + aOffset);
+
+        const float dr = refR - img->pixel(p);
+        const float dg = refG - img->pixel(p + gOffset);
+        const float db = refB - img->pixel(p + bOffset);
+
+        // @@ Weight by ref*img alpha instead of ref*ref?
+        const float weight = alphaWeight ? refA * refA : 1.0f;
+        squaredSum += (dr * dr) * weight;
+        squaredSum += (dg * dg) * weight;
+        squaredSum += (db * db) * weight;
+    }
+
+    return float(sqrt(squaredSum / pixelTotal));
+}
+
+// Root-mean-square difference of the alpha channels (component 3) of two images.
+// Yields FLT_MAX when sameLayout() rejects the pair.
+float nv::rmsAlphaError(const FloatImage * ref, const FloatImage * img)
+{
+    if (!sameLayout(img, ref)) return FLT_MAX;
+    nvDebugCheck(img->componentCount() == 4 && ref->componentCount() == 4);
+
+    const uint pixelTotal = img->pixelCount();
+    const uint alphaBase = pixelTotal * 3; // linear index of the first alpha value
+
+    double squaredSum = 0;
+    for (uint p = 0; p != pixelTotal; ++p)
+    {
+        // Signed alpha difference at this pixel.
+        const uint at = p + alphaBase;
+        const float delta = img->pixel(at) - ref->pixel(at);
+
+        squaredSum += delta * delta;
+    }
+
+    return float(sqrt(squaredSum / pixelTotal));
+}
+
+
+// Mean absolute difference of the RGB channels of 'ref' and 'img'.
+// Yields FLT_MAX when sameLayout() rejects the pair. When 'alphaWeight' is set,
+// each absolute difference is scaled by the alpha of the reference.
+float nv::averageColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight)
+{
+    if (!sameLayout(img, ref)) return FLT_MAX;
+    nvDebugCheck(img->componentCount() == 4);
+    nvDebugCheck(ref->componentCount() == 4);
+
+    // Channel k of pixel p is read at linear index p + k * pixelTotal.
+    const uint pixelTotal = img->pixelCount();
+    const uint gOffset = pixelTotal * 1;
+    const uint bOffset = pixelTotal * 2;
+    const uint aOffset = pixelTotal * 3;
+
+    double absoluteSum = 0;
+    for (uint p = 0; p != pixelTotal; ++p)
+    {
+        const float imgR = img->pixel(p);
+        const float imgG = img->pixel(p + gOffset);
+        const float imgB = img->pixel(p + bOffset);
+
+        const float dr = fabs(imgR - ref->pixel(p));
+        const float dg = fabs(imgG - ref->pixel(p + gOffset));
+        const float db = fabs(imgB - ref->pixel(p + bOffset));
+
+        // Weight is the reference alpha, or 1 when weighting is disabled.
+        const float refA = ref->pixel(p + aOffset);
+        const float weight = alphaWeight ? refA : 1.0f;
+        absoluteSum += dr * weight;
+        absoluteSum += dg * weight;
+        absoluteSum += db * weight;
+    }
+
+    return float(absoluteSum / pixelTotal);
+}
+
+// Mean absolute difference of the alpha channels (component 3) of two images.
+// Yields FLT_MAX when either image is missing or their dimensions differ.
+float nv::averageAlphaError(const FloatImage * ref, const FloatImage * img)
+{
+    if (img == NULL || ref == NULL) return FLT_MAX;
+    if (img->width() != ref->width() || img->height() != ref->height() || img->depth() != ref->depth()) {
+        return FLT_MAX;
+    }
+    nvDebugCheck(img->componentCount() == 4 && ref->componentCount() == 4);
+
+    // Use the full pixel count (as rmsAlphaError does) so the alpha plane offset
+    // stays correct when the images have more than one depth slice.
+    const uint count = img->pixelCount();
+
+    double mae = 0;
+    for (uint i = 0; i < count; i++)
+    {
+        mae += fabs(img->pixel(i + count * 3) - ref->pixel(i + count * 3));
+    }
+
+    return float(mae / count);
+}
+
+
+// RMS error between 'ref' and 'img' where 'img' may have a different size: each
+// texel of 'ref' is compared against 'img' sampled with sampleLinear() and wrap
+// mode 'wm'. RGB terms honor 'alphaWeight'; the alpha term is never weighted.
+float nv::rmsBilinearColorError(const FloatImage * ref, const FloatImage * img, FloatImage::WrapMode wm, bool alphaWeight)
+{
+    if (ref == NULL || img == NULL) return FLT_MAX;
+    nvDebugCheck(img->componentCount() == 4);
+    nvDebugCheck(ref->componentCount() == 4);
+
+    const uint w0 = ref->width();
+    const uint h0 = ref->height();
+    const uint d0 = ref->depth();
+
+    double mse = 0;
+    for (uint z = 0; z < d0; z++) {
+        for (uint y = 0; y < h0; y++) {
+            for (uint x = 0; x < w0; x++) {
+                float r0 = ref->pixel(0, x, y, z);
+                float g0 = ref->pixel(1, x, y, z);
+                float b0 = ref->pixel(2, x, y, z);
+                float a0 = ref->pixel(3, x, y, z);
+
+                // NOTE(review): uses index / size, not (index + 0.5) / size - confirm vs sampleLinear().
+                float fx = float(x) / w0;
+                float fy = float(y) / h0;
+                float fz = float(z) / d0;
+
+                float r1 = img->sampleLinear(0, fx, fy, fz, wm);
+                float g1 = img->sampleLinear(1, fx, fy, fz, wm);
+                float b1 = img->sampleLinear(2, fx, fy, fz, wm);
+                float a1 = img->sampleLinear(3, fx, fy, fz, wm); // alpha is component 3
+
+                float dr = r0 - r1;
+                float dg = g0 - g1;
+                float db = b0 - b1;
+                float da = a0 - a1;
+
+                float w = 1;
+                if (alphaWeight) w = a0 * a0; // @@ a0*a1 or a0*a0 ?
+
+                mse += (dr * dr) * w;
+                mse += (dg * dg) * w;
+                mse += (db * db) * w;
+                mse += (da * da);
+            }
+        }
+    }
+
+    const uint count = w0 * h0 * d0;
+    return float(sqrt(mse / count));
+}
+
+
+// Color space conversions based on:
+// http://www.brucelindbloom.com/
+
+// Assumes input is in *linear* sRGB color space.
+static Vector3 rgbToXyz(Vector3::Arg c)
+{
+    // Each output is one row of the RGB -> XYZ matrix applied to (c.x, c.y, c.z).
+    const float X = 0.412453f * c.x + 0.357580f * c.y + 0.180423f * c.z;
+    const float Y = 0.212671f * c.x + 0.715160f * c.y + 0.072169f * c.z;
+    const float Z = 0.019334f * c.x + 0.119193f * c.y + 0.950227f * c.z;
+    return Vector3(X, Y, Z);
+}
+
+static Vector3 xyzToRgb(Vector3::Arg c)
+{
+    // Each output is one row of the XYZ -> RGB matrix applied to (c.x, c.y, c.z).
+    const float R =  3.2404542f * c.x - 1.5371385f * c.y - 0.4985314f * c.z;
+    const float G = -0.9692660f * c.x + 1.8760108f * c.y + 0.0415560f * c.z;
+    const float B =  0.0556434f * c.x - 0.2040259f * c.y + 1.0572252f * c.z;
+    return Vector3(R, G, B);
+}
+
+static float toLinear(float f)
+{
+    return powf(f, 2.2f); // plain 2.2 power law, not the piecewise sRGB curve
+}
+
+static float toGamma(float f)
+{
+    // @@ Use sRGB space? For now this is the inverse of toLinear(): exponent 1/2.2.
+    return powf(f, 1.0f/2.2f);
+}
+
+static Vector3 toLinear(Vector3::Arg c)
+{
+    return Vector3(toLinear(c.x), toLinear(c.y), toLinear(c.z)); // scalar toLinear() per component
+}
+
+static Vector3 toGamma(Vector3::Arg c)
+{
+    return Vector3(toGamma(c.x), toGamma(c.y), toGamma(c.z)); // scalar toGamma() per component
+}
+
+// CIE L*a*b* helper: cube root above (6/29)^3, linear ramp at or below it.
+static float f(float t)
+{
+    const float epsilon = powf(6.0f/29.0f, 3);
+
+    if (!(t > epsilon)) {
+        // Linear ramp: t / (3 * (6/29)^2) + 4/29.
+        return 1.0f/3.0f * powf(29.0f/6.0f, 2) * t + 4.0f / 29.0f;
+    }
+    return powf(t, 1.0f/3.0f);
+}
+
+// Inverse of f(): cubes values above the 6/29 knee and inverts the linear ramp
+// at or below it, so that finv(f(t)) recovers t.
+static float finv(float t)
+{
+    if (t > 6.0f / 29.0f) {
+        return powf(t, 3.0f);
+    }
+    return 3.0f * powf(6.0f / 29.0f, 2) * (t - 4.0f / 29.0f);
+}
+
+// Converts a CIE XYZ colour to CIE L*a*b*, relative to the white point
+// (Xn, Yn, Zn) below, using the companding helper f().
+static Vector3 xyzToCieLab(Vector3::Arg c)
+{
+    // Normalized white point.
+    const float Xn = 0.950456f;
+    const float Yn = 1.0f;
+    const float Zn = 1.088754f;
+
+    const float fx = f(c.x / Xn);
+    const float fy = f(c.y / Yn);
+    const float fz = f(c.z / Zn);
+
+    // Lightness depends on the Y term only: L* = 116 f(Y/Yn) - 16.
+    const float L = 116 * fy - 16;
+
+    // Opponent axes.
+    const float a = 500 * (fx - fy);
+    const float b = 200 * (fy - fz);
+    return Vector3(L, a, b);
+}
+
+static Vector3 rgbToCieLab(Vector3::Arg c)
+{
+    return xyzToCieLab(rgbToXyz(toLinear(c))); // toLinear() first, then XYZ, then L*a*b*
+}
+
+// Lab -> (L, C, h): C is the chroma |(a, b)|, h the hue angle atan2(b, a) in radians.
+static Vector3 cieLabToLCh(Vector3::Arg c)
+{
+    return Vector3(c.x, sqrtf(c.y*c.y + c.z*c.z), atan2f(c.z, c.y));
+}
+
+// Writes the L*a*b* conversion of the first three channels of 'rgbImage' into
+// the first three channels of 'LabImage'; both must share width and height.
+static void rgbToCieLab(const FloatImage * rgbImage, FloatImage * LabImage)
+{
+    nvDebugCheck(rgbImage != NULL && LabImage != NULL);
+    nvDebugCheck(rgbImage->width() == LabImage->width() && rgbImage->height() == LabImage->height());
+    nvDebugCheck(rgbImage->componentCount() >= 3 && LabImage->componentCount() >= 3);
+
+    // NOTE(review): visits width * height texels only; depth is ignored - confirm.
+    const uint texelCount = rgbImage->width() * LabImage->height();
+
+    const float * srcR = rgbImage->channel(0);
+    const float * srcG = rgbImage->channel(1);
+    const float * srcB = rgbImage->channel(2);
+    float * dstL = LabImage->channel(0);
+    float * dstA = LabImage->channel(1);
+    float * dstB = LabImage->channel(2);
+
+    for (uint t = 0; t != texelCount; ++t)
+    {
+        const Vector3 lab = rgbToCieLab(Vector3(srcR[t], srcG[t], srcB[t]));
+        dstL[t] = lab.x;
+        dstA[t] = lab.y;
+        dstB[t] = lab.z;
+    }
+}
+
+
+// Mean Euclidean distance in CIE L*a*b* between two 4-component images.
+// NOTE(review): pixels pass through toLinear() inside rgbToCieLab(), so the
+// inputs look gamma-encoded rather than linear - confirm against callers.
+float nv::cieLabError(const FloatImage * img0, const FloatImage * img1)
+{
+    if (!sameLayout(img0, img1)) {
+        return FLT_MAX;
+    }
+    nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4);
+
+    const float * redA = img0->channel(0);
+    const float * greenA = img0->channel(1);
+    const float * blueA = img0->channel(2);
+    const float * redB = img1->channel(0);
+    const float * greenB = img1->channel(1);
+    const float * blueB = img1->channel(2);
+
+    const uint pixelTotal = img0->pixelCount();
+
+    // @@ Measure Delta E.
+    double distanceSum = 0.0f;
+    for (uint p = 0; p != pixelTotal; ++p)
+    {
+        const Vector3 labA = rgbToCieLab(Vector3(redA[p], greenA[p], blueA[p]));
+        const Vector3 labB = rgbToCieLab(Vector3(redB[p], greenB[p], blueB[p]));
+        distanceSum += length(labA - labB);
+    }
+
+    return float(distanceSum / pixelTotal);
+}
+
+// Mean CIE94 colour difference between two 4-component images.
+// NOTE(review): rgbToCieLab() applies toLinear(), so inputs look gamma-encoded - confirm.
+float nv::cieLab94Error(const FloatImage * img0, const FloatImage * img1)
+{
+    if (!sameLayout(img0, img1)) return FLT_MAX;
+    nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4);
+
+    const float kL = 1;
+    const float kC = 1;
+    const float kH = 1;
+    const float k1 = 0.045f;
+    const float k2 = 0.015f;
+    const float sL = 1;
+
+    const float * r0 = img0->channel(0);
+    const float * g0 = img0->channel(1);
+    const float * b0 = img0->channel(2);
+
+    const float * r1 = img1->channel(0);
+    const float * g1 = img1->channel(1);
+    const float * b1 = img1->channel(2);
+
+    double error = 0.0f;
+    const uint count = img0->pixelCount();
+    for (uint i = 0; i < count; ++i)
+    {
+        Vector3 lab0 = rgbToCieLab(Vector3(r0[i], g0[i], b0[i]));
+        Vector3 lch0 = cieLabToLCh(lab0);
+        Vector3 lab1 = rgbToCieLab(Vector3(r1[i], g1[i], b1[i]));
+        Vector3 lch1 = cieLabToLCh(lab1);
+
+        // CIE94 weights chroma and hue by the reference chroma C1 (lch0.y), not by L.
+        const float sC = 1 + k1*lch0.y;
+        const float sH = 1 + k2*lch0.y;
+
+        // @@ Measure Delta E using the 1994 definition
+        Vector3 labDelta = lab0 - lab1;
+        Vector3 lchDelta = lch0 - lch1;
+
+        double deltaLsq = powf(lchDelta.x / (kL*sL), 2);
+        double deltaCsq = powf(lchDelta.y / (kC*sC), 2);
+        // avoid possible sqrt of negative value by computing (deltaH/(kH*sH))^2
+        double deltaHsq = powf(labDelta.y, 2) + powf(labDelta.z, 2) - powf(lchDelta.y, 2);
+        deltaHsq /= powf(kH*sH, 2);
+        if (deltaHsq < 0) deltaHsq = 0; // rounding can push the difference just below zero
+
+        error += sqrt(deltaLsq + deltaCsq + deltaHsq);
+    }
+
+    return float(error / count);
+}
+
+// Unfinished metric: both images are converted to CIE L*a*b*, but the filtering
+// and Delta E steps are still to be written, so valid inputs always yield 0.
+float nv::spatialCieLabError(const FloatImage * img0, const FloatImage * img1)
+{
+    if (img0 == NULL || img1 == NULL) return FLT_MAX;
+    if (img0->width() != img1->width() || img0->height() != img1->height()) return FLT_MAX;
+    nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4);
+
+    const uint width = img0->width();
+    const uint height = img0->height();
+    const uint depth = img0->depth();
+
+    // Three-channel L*a*b* copies of the inputs.
+    FloatImage labA;
+    FloatImage labB;
+    labA.allocate(3, width, height, depth);
+    labB.allocate(3, width, height, depth);
+    rgbToCieLab(img0, &labA);
+    rgbToCieLab(img1, &labB);
+
+    // @@ Convolve each channel by the corresponding filter.
+    /*
+    GaussianFilter LFilter(5);
+    GaussianFilter aFilter(5);
+    GaussianFilter bFilter(5);
+
+    labA.convolve(0, LFilter);
+    labA.convolve(1, aFilter);
+    labA.convolve(2, bFilter);
+
+    labB.convolve(0, LFilter);
+    labB.convolve(1, aFilter);
+    labB.convolve(2, bFilter);
+    */
+    // @@ Measure Delta E between labA and labB.
+    return 0.0f;
+}
+
+
+// Assumes input images are normal maps.
+// Mean angle (radians) between the normals decoded from two normal maps.
+// Components are remapped with 2*v - 1 and normalized with normalizeSafe();
+// FLT_MAX is returned for missing images or mismatched width/height.
+float nv::averageAngularError(const FloatImage * img0, const FloatImage * img1)
+{
+    if (img0 == NULL || img1 == NULL) return FLT_MAX;
+    if (img0->width() != img1->width() || img0->height() != img1->height()) {
+        return FLT_MAX;
+    }
+    nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4);
+
+    // Only width * height texels are visited.
+    const uint texelCount = img0->width() * img0->height();
+
+    const float * ax = img0->channel(0);
+    const float * ay = img0->channel(1);
+    const float * az = img0->channel(2);
+
+    const float * bx = img1->channel(0);
+    const float * by = img1->channel(1);
+    const float * bz = img1->channel(2);
+
+    double angleSum = 0.0f;
+    for (uint t = 0; t != texelCount; ++t)
+    {
+        // Remap both samples to signed vectors, then normalize them.
+        Vector3 na = 2.0f * Vector3(ax[t], ay[t], az[t]) - Vector3(1);
+        Vector3 nb = 2.0f * Vector3(bx[t], by[t], bz[t]) - Vector3(1);
+        na = normalizeSafe(na, Vector3(0), 0.0f);
+        nb = normalizeSafe(nb, Vector3(0), 0.0f);
+
+        // The clamp keeps the dot product inside acos' domain.
+        angleSum += acos(clamp(dot(na, nb), -1.0f, 1.0f));
+    }
+
+    return float(angleSum / texelCount);
+}
+
+// Root-mean-square angle (radians) between the normals decoded from two normal
+// maps. Components are remapped with 2*v - 1 and normalized with normalizeSafe();
+// FLT_MAX is returned for missing images or mismatched width/height.
+float nv::rmsAngularError(const FloatImage * img0, const FloatImage * img1)
+{
+    if (img0 == NULL || img1 == NULL) return FLT_MAX;
+    if (img0->width() != img1->width() || img0->height() != img1->height()) {
+        return FLT_MAX;
+    }
+    nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4);
+
+    // Only width * height texels are visited.
+    const uint texelCount = img0->width() * img0->height();
+
+    const float * ax = img0->channel(0);
+    const float * ay = img0->channel(1);
+    const float * az = img0->channel(2);
+
+    const float * bx = img1->channel(0);
+    const float * by = img1->channel(1);
+    const float * bz = img1->channel(2);
+
+    double squaredAngleSum = 0.0f;
+    for (uint t = 0; t != texelCount; ++t)
+    {
+        // Remap both samples to signed vectors, then normalize them.
+        Vector3 na = 2.0f * Vector3(ax[t], ay[t], az[t]) - Vector3(1);
+        Vector3 nb = 2.0f * Vector3(bx[t], by[t], bz[t]) - Vector3(1);
+        na = normalizeSafe(na, Vector3(0), 0.0f);
+        nb = normalizeSafe(nb, Vector3(0), 0.0f);
+
+        // The clamp keeps the dot product inside acosf's domain.
+        const float angle = acosf(clamp(dot(na, nb), -1.0f, 1.0f));
+        squaredAngleSum += angle * angle;
+    }
+
+    return float(sqrt(squaredAngleSum / texelCount));
+}
+
diff --git a/src/nvimage/ErrorMetric.h b/src/nvimage/ErrorMetric.h
index b875802..aa43d0c 100644
--- a/src/nvimage/ErrorMetric.h
+++ b/src/nvimage/ErrorMetric.h
@@ -1,5 +1,6 @@
 
 #include "nvimage.h"
+#include "FloatImage.h" // For FloatImage::WrapMode
 
 
 namespace nv
@@ -9,13 +10,15 @@ namespace nv
     float rmsColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight);
     float rmsAlphaError(const FloatImage * ref, const FloatImage * img);
 
+    float averageColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight);
+    float averageAlphaError(const FloatImage * ref, const FloatImage * img);
+
+    float rmsBilinearColorError(const FloatImage * ref, const FloatImage * img, FloatImage::WrapMode wm, bool alphaWeight);
+
     float cieLabError(const FloatImage * ref, const FloatImage * img);
     float cieLab94Error(const FloatImage * ref, const FloatImage * img);
     float spatialCieLabError(const FloatImage * ref, const FloatImage * img);
 
-    float averageColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight);
-    float averageAlphaError(const FloatImage * ref, const FloatImage * img);
-
     float averageAngularError(const FloatImage * img0, const FloatImage * img1);
     float rmsAngularError(const FloatImage * img0, const FloatImage * img1);
 
diff --git a/src/nvimage/FloatImage.cpp b/src/nvimage/FloatImage.cpp
index 43164f7..f611328 100644
--- a/src/nvimage/FloatImage.cpp
+++ b/src/nvimage/FloatImage.cpp
@@ -1,1471 +1,1499 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#include "FloatImage.h"
-#include "Filter.h"
-#include "Image.h"
-
-#include "nvmath/Color.h"
-#include "nvmath/Vector.inl"
-#include "nvmath/Matrix.inl"
-#include "nvmath/ftoi.h"
-#include "nvmath/Gamma.h"
-
-#include "nvcore/Utils.h" // max
-#include "nvcore/Ptr.h"
-#include "nvcore/Memory.h"
-#include "nvcore/Array.inl"
-
-#include <math.h>
-#include <string.h> // memset, memcpy
-
-
-using namespace nv;
-
-
-/// Ctor.
-FloatImage::FloatImage() : m_componentCount(0), m_width(0), m_height(0), m_depth(0),
-  m_pixelCount(0), m_floatCount(0), m_mem(NULL)
-{
-}
-
-/// Ctor. Init from image.
-FloatImage::FloatImage(const Image * img) : m_componentCount(0), m_width(0), m_height(0), m_depth(0),
-    m_pixelCount(0), m_floatCount(0), m_mem(NULL)
-{
-    initFrom(img);
-}
-
-/// Dtor.
-FloatImage::~FloatImage()
-{
-    free();
-}
-
-
-/// Init the floating point image from a regular image.
-void FloatImage::initFrom(const Image * img)
-{
-    nvCheck(img != NULL);
-
-    allocate(4, img->width(), img->height(), img->depth());
-
-    float * red_channel = channel(0);
-    float * green_channel = channel(1);
-    float * blue_channel = channel(2);
-    float * alpha_channel = channel(3);
-
-    const uint count = m_pixelCount;
-    for (uint i = 0; i < count; i++) {
-        Color32 pixel = img->pixel(i);
-        red_channel[i] = float(pixel.r) / 255.0f;
-        green_channel[i] = float(pixel.g) / 255.0f;
-        blue_channel[i] = float(pixel.b) / 255.0f;
-        alpha_channel[i] = float(pixel.a) / 255.0f;
-    }
-}
-
-/// Convert the floating point image to a regular image.
-Image * FloatImage::createImage(uint baseComponent/*= 0*/, uint num/*= 4*/) const
-{
-    nvCheck(num <= 4);
-    nvCheck(baseComponent + num <= m_componentCount);
-
-    AutoPtr<Image> img(new Image());
-    img->allocate(m_width, m_height, m_depth);
-
-    for (uint i = 0; i < m_pixelCount; i++) {
-
-        uint c;
-        uint8 rgba[4]= {0, 0, 0, 0xff};
-
-        for (c = 0; c < num; c++) {
-            float f = pixel(baseComponent + c, i);
-            rgba[c] = nv::clamp(int(255.0f * f), 0, 255);
-        }
-
-        img->pixel(i) = Color32(rgba[0], rgba[1], rgba[2], rgba[3]);
-    }
-
-    return img.release();
-}
-
-
-/// Convert the floating point image to a regular image. Correct gamma of rgb, but not alpha.
-Image * FloatImage::createImageGammaCorrect(float gamma/*= 2.2f*/) const
-{
-    nvCheck(m_componentCount == 4);
-
-    AutoPtr<Image> img(new Image());
-    img->allocate(m_width, m_height, m_depth);
-
-    const float * rChannel = this->channel(0);
-    const float * gChannel = this->channel(1);
-    const float * bChannel = this->channel(2);
-    const float * aChannel = this->channel(3);
-
-    const uint count = m_pixelCount;
-    for (uint i = 0; i < count; i++)
-    {
-        const uint8 r = nv::clamp(int(255.0f * pow(rChannel[i], 1.0f/gamma)), 0, 255);
-        const uint8 g = nv::clamp(int(255.0f * pow(gChannel[i], 1.0f/gamma)), 0, 255);
-        const uint8 b = nv::clamp(int(255.0f * pow(bChannel[i], 1.0f/gamma)), 0, 255);
-        const uint8 a = nv::clamp(int(255.0f * aChannel[i]), 0, 255);
-
-        img->pixel(i) = Color32(r, g, b, a);
-    }
-
-    return img.release();
-}
-
-/// Allocate a 2D float image of the given format and the given extents.
-void FloatImage::allocate(uint c, uint w, uint h, uint d)
-{
-    if (m_componentCount != c || m_width != w || m_height != h || m_depth != d)
-    {
-        free();
-
-        m_width = w;
-        m_height = h;
-        m_depth = d;
-        m_componentCount = c;
-        m_pixelCount = w * h * d;
-        m_floatCount = m_pixelCount * c;
-        m_mem = malloc<float>(m_floatCount);
-    }
-}
-
-/// Free the image, but don't clear the members.
-void FloatImage::free()
-{
-    ::free(m_mem);
-    m_mem = NULL;
-}
-
-void FloatImage::resizeChannelCount(uint c)
-{
-    if (m_componentCount != c) {
-        uint count = m_pixelCount * c;
-        m_mem = realloc<float>(m_mem, count);
-
-        if (c > m_componentCount) {
-            memset(m_mem + m_floatCount, 0, (count - m_floatCount) * sizeof(float));
-        }
-
-        m_componentCount = c;
-        m_floatCount = count;
-    }
-}
-
-void FloatImage::clear(float f/*=0.0f*/)
-{
-    for (uint i = 0; i < m_floatCount; i++) {
-        m_mem[i] = f;
-    }
-}
-
-void FloatImage::clear(uint c, float f/*= 0.0f*/)
-{
-    float * channel = this->channel(c);
-
-    const uint count = m_pixelCount;
-    for (uint i = 0; i < count; i++) {
-        channel[i] = f;
-    }
-}
-
-void FloatImage::copyChannel(uint src, uint dst)
-{
-    nvCheck(src < m_componentCount);
-    nvCheck(dst < m_componentCount);
-
-    const float * srcChannel = this->channel(src);
-    float * dstChannel = this->channel(dst);
-
-    memcpy(dstChannel, srcChannel, sizeof(float)*m_pixelCount);
-}
-
-void FloatImage::normalize(uint baseComponent)
-{
-    nvCheck(baseComponent + 3 <= m_componentCount);
-
-    float * xChannel = this->channel(baseComponent + 0);
-    float * yChannel = this->channel(baseComponent + 1);
-    float * zChannel = this->channel(baseComponent + 2);
-
-    const uint count = m_pixelCount;
-    for (uint i = 0; i < count; i++) {
-
-        Vector3 normal(xChannel[i], yChannel[i], zChannel[i]);
-        normal = normalizeSafe(normal, Vector3(0), 0.0f);
-
-        xChannel[i] = normal.x;
-        yChannel[i] = normal.y;
-        zChannel[i] = normal.z;
-    }
-}
-
-void FloatImage::packNormals(uint baseComponent)
-{
-    scaleBias(baseComponent, 3, 0.5f, 0.5f);
-}
-
-void FloatImage::expandNormals(uint baseComponent)
-{
-    scaleBias(baseComponent, 3, 2, -1.0);
-}
-
-void FloatImage::scaleBias(uint baseComponent, uint num, float scale, float bias)
-{
-    const uint size = m_pixelCount;
-
-    for (uint c = 0; c < num; c++) {
-        float * ptr = this->channel(baseComponent + c);
-
-        for (uint i = 0; i < size; i++) {
-            ptr[i] = scale * ptr[i] + bias;
-        }
-    }
-}
-
-/// Clamp the elements of the image.
-void FloatImage::clamp(uint baseComponent, uint num, float low, float high)
-{
-    const uint size = m_pixelCount;
-
-    for (uint c = 0; c < num; c++) {
-        float * ptr = this->channel(baseComponent + c);
-
-        for (uint i = 0; i < size; i++) {
-            ptr[i] = nv::clamp(ptr[i], low, high);
-        }
-    }
-}
-
-/// From gamma to linear space.
-void FloatImage::toLinear(uint baseComponent, uint num, float gamma /*= 2.2f*/)
-{
-    if (gamma == 2.2f) {
-        for (uint c = 0; c < num; c++) {
-            float * ptr = this->channel(baseComponent + c);
-
-            powf_11_5(ptr, ptr, m_pixelCount);
-        }
-    } else {
-        exponentiate(baseComponent, num, gamma);
-    }
-}
-
-/// From linear to gamma space.
-void FloatImage::toGamma(uint baseComponent, uint num, float gamma /*= 2.2f*/)
-{
-    if (gamma == 2.2f) {
-        for (uint c = 0; c < num; c++) {
-            float * ptr = this->channel(baseComponent + c);
-
-            powf_5_11(ptr, ptr, m_pixelCount);
-        }
-    } else {
-        exponentiate(baseComponent, num, 1.0f/gamma);
-    }
-}
-
-/// Exponentiate the elements of the image.
-void FloatImage::exponentiate(uint baseComponent, uint num, float power)
-{
-    const uint size = m_pixelCount;
-
-    for(uint c = 0; c < num; c++) {
-        float * ptr = this->channel(baseComponent + c);
-
-        for(uint i = 0; i < size; i++) {
-            ptr[i] = powf(max(0.0f, ptr[i]), power);
-        }
-    }
-}
-
-/// Apply linear transform.
-void FloatImage::transform(uint baseComponent, const Matrix & m, Vector4::Arg offset)
-{
-    nvCheck(baseComponent + 4 <= m_componentCount);
-
-    float * r = this->channel(baseComponent + 0);
-    float * g = this->channel(baseComponent + 1);
-    float * b = this->channel(baseComponent + 2);
-    float * a = this->channel(baseComponent + 3);
-
-    const uint size = m_pixelCount;
-    for (uint i = 0; i < size; i++)
-    {
-        Vector4 color = nv::transform(m, Vector4(*r, *g, *b, *a)) + offset;
-
-        *r++ = color.x;
-        *g++ = color.y;
-        *b++ = color.z;
-        *a++ = color.w;
-    }
-}
-
-void FloatImage::swizzle(uint baseComponent, uint r, uint g, uint b, uint a)
-{
-    nvCheck(baseComponent + 4 <= m_componentCount);
-    nvCheck(r < 7 && g < 7 && b < 7 && a < 7);
-
-    float consts[] = { 1.0f, 0.0f, -1.0f };
-    float * c[7];
-    c[0] = this->channel(baseComponent + 0);
-    c[1] = this->channel(baseComponent + 1);
-    c[2] = this->channel(baseComponent + 2);
-    c[3] = this->channel(baseComponent + 3);
-    c[4] = consts;
-    c[5] = consts + 1;
-    c[6] = consts + 2;
-
-    const uint size = m_pixelCount;
-    for (uint i = 0; i < size; i++)
-    {
-        float tmp[4] = { *c[r], *c[g], *c[b], *c[a] };
-
-        *c[0]++ = tmp[0];
-        *c[1]++ = tmp[1];
-        *c[2]++ = tmp[2];
-        *c[3]++ = tmp[3];
-    }
-}
-
-float FloatImage::sampleNearest(uint c, float x, float y, const WrapMode wm) const
-{
-    if( wm == WrapMode_Clamp ) return sampleNearestClamp(c, x, y);
-    else if( wm == WrapMode_Repeat ) return sampleNearestRepeat(c, x, y);
-    else /*if( wm == WrapMode_Mirror )*/ return sampleNearestMirror(c, x, y);
-}
-
-float FloatImage::sampleLinear(uint c, float x, float y, WrapMode wm) const
-{
-    if( wm == WrapMode_Clamp ) return sampleLinearClamp(c, x, y);
-    else if( wm == WrapMode_Repeat ) return sampleLinearRepeat(c, x, y);
-    else /*if( wm == WrapMode_Mirror )*/ return sampleLinearMirror(c, x, y);
-}
-
-float FloatImage::sampleNearest(uint c, float x, float y, float z, WrapMode wm) const
-{
-    if( wm == WrapMode_Clamp ) return sampleNearestClamp(c, x, y, z);
-    else if( wm == WrapMode_Repeat ) return sampleNearestRepeat(c, x, y, z);
-    else /*if( wm == WrapMode_Mirror )*/ return sampleNearestMirror(c, x, y, z);
-}
-
-float FloatImage::sampleLinear(uint c, float x, float y, float z, WrapMode wm) const
-{
-    if( wm == WrapMode_Clamp ) return sampleLinearClamp(c, x, y, z);
-    else if( wm == WrapMode_Repeat ) return sampleLinearRepeat(c, x, y, z);
-    else /*if( wm == WrapMode_Mirror )*/ return sampleLinearMirror(c, x, y, z);
-}
-
-float FloatImage::sampleNearestClamp(uint c, float x, float y) const
-{
-    int ix = wrapClamp(iround(x * m_width), m_width);
-    int iy = wrapClamp(iround(y * m_height), m_height);
-    return pixel(c, ix, iy, 0);
-}
-
-float FloatImage::sampleNearestRepeat(uint c, float x, float y) const
-{
-    int ix = wrapRepeat(iround(x * m_width), m_width);
-    int iy = wrapRepeat(iround(y * m_height), m_height);
-    return pixel(c, ix, iy, 0);
-}
-
-float FloatImage::sampleNearestMirror(uint c, float x, float y) const
-{
-    int ix = wrapMirror(iround(x * m_width), m_width);
-    int iy = wrapMirror(iround(y * m_height), m_height);
-    return pixel(c, ix, iy, 0);
-}
-
-float FloatImage::sampleNearestClamp(uint c, float x, float y, float z) const
-{
-    int ix = wrapClamp(iround(x * m_width), m_width);
-    int iy = wrapClamp(iround(y * m_height), m_height);
-    int iz = wrapClamp(iround(z * m_depth), m_depth);
-    return pixel(c, ix, iy, iz);
-}
-
-float FloatImage::sampleNearestRepeat(uint c, float x, float y, float z) const
-{
-    int ix = wrapRepeat(iround(x * m_width), m_width);
-    int iy = wrapRepeat(iround(y * m_height), m_height);
-    int iz = wrapRepeat(iround(z * m_depth), m_depth);
-    return pixel(c, ix, iy, iz);
-}
-
-float FloatImage::sampleNearestMirror(uint c, float x, float y, float z) const
-{
-    int ix = wrapMirror(iround(x * m_width), m_width);
-    int iy = wrapMirror(iround(y * m_height), m_height);
-    int iz = wrapMirror(iround(z * m_depth), m_depth);
-    return pixel(c, ix, iy, iz);
-}
-
-
-float FloatImage::sampleLinearClamp(uint c, float x, float y) const
-{
-    const int w = m_width;
-    const int h = m_height;
-
-    x *= w;
-    y *= h;
-
-    const float fracX = frac(x);
-    const float fracY = frac(y);
-
-    const int ix0 = ::clamp(ifloor(x), 0, w-1);
-    const int iy0 = ::clamp(ifloor(y), 0, h-1);
-    const int ix1 = ::clamp(ifloor(x)+1, 0, w-1);
-    const int iy1 = ::clamp(ifloor(y)+1, 0, h-1);
-
-    return bilerp(c, ix0, iy0, ix1, iy1, fracX, fracY);
-}
-
-float FloatImage::sampleLinearRepeat(uint c, float x, float y) const
-{
-    const int w = m_width;
-    const int h = m_height;
-
-    const float fracX = frac(x * w);
-    const float fracY = frac(y * h);
-
-    // @@ Using floor in some places, but round in others?
-    int ix0 = ifloor(frac(x) * w);
-    int iy0 = ifloor(frac(y) * h);
-    int ix1 = ifloor(frac(x + 1.0f/w) * w);
-    int iy1 = ifloor(frac(y + 1.0f/h) * h);
-
-    return bilerp(c, ix0, iy0, ix1, iy1, fracX, fracY);
-}
-
-float FloatImage::sampleLinearMirror(uint c, float x, float y) const
-{
-    const int w = m_width;
-    const int h = m_height;
-
-    x *= w;
-    y *= h;
-
-    const float fracX = frac(x);
-    const float fracY = frac(y);
-
-    int ix0 = wrapMirror(iround(x), w);
-    int iy0 = wrapMirror(iround(y), h);
-    int ix1 = wrapMirror(iround(x) + 1, w);
-    int iy1 = wrapMirror(iround(y) + 1, h);
-
-    return bilerp(c, ix0, iy0, ix1, iy1, fracX, fracY);
-}
-
-float FloatImage::sampleLinearClamp(uint c, float x, float y, float z) const
-{
-    const int w = m_width;
-    const int h = m_height;
-    const int d = m_depth;
-
-    x *= w;
-    y *= h;
-    z *= d;
-
-    const float fracX = frac(x);
-    const float fracY = frac(y);
-    const float fracZ = frac(z);
-
-    // @@ Using floor in some places, but round in others?
-    const int ix0 = ::clamp(ifloor(x), 0, w-1);
-    const int iy0 = ::clamp(ifloor(y), 0, h-1);
-    const int iz0 = ::clamp(ifloor(z), 0, h-1);
-    const int ix1 = ::clamp(ifloor(x)+1, 0, w-1);
-    const int iy1 = ::clamp(ifloor(y)+1, 0, h-1);
-    const int iz1 = ::clamp(ifloor(z)+1, 0, h-1);
-
-    return trilerp(c, ix0, iy0, iz0, ix1, iy1, iz1, fracX, fracY, fracZ);
-}
-
-float FloatImage::sampleLinearRepeat(uint c, float x, float y, float z) const
-{
-    const int w = m_width;
-    const int h = m_height;
-    const int d = m_depth;
-
-    const float fracX = frac(x * w);
-    const float fracY = frac(y * h);
-    const float fracZ = frac(z * d);
-
-    int ix0 = ifloor(frac(x) * w);
-    int iy0 = ifloor(frac(y) * h);
-    int iz0 = ifloor(frac(z) * d);
-    int ix1 = ifloor(frac(x + 1.0f/w) * w);
-    int iy1 = ifloor(frac(y + 1.0f/h) * h);
-    int iz1 = ifloor(frac(z + 1.0f/d) * d);
-
-    return trilerp(c, ix0, iy0, iz0, ix1, iy1, iz1, fracX, fracY, fracZ);
-}
-
-float FloatImage::sampleLinearMirror(uint c, float x, float y, float z) const
-{
-    const int w = m_width;
-    const int h = m_height;
-    const int d = m_depth;
-
-    x *= w;
-    y *= h;
-    z *= d;
-
-    int ix0 = wrapMirror(iround(x), w);
-    int iy0 = wrapMirror(iround(y), h);
-    int iz0 = wrapMirror(iround(z), d);
-    int ix1 = wrapMirror(iround(x) + 1, w);
-    int iy1 = wrapMirror(iround(y) + 1, h);
-    int iz1 = wrapMirror(iround(z) + 1, d);
-
-    const float fracX = frac(x);
-    const float fracY = frac(y);
-    const float fracZ = frac(z);
-
-    return trilerp(c, ix0, iy0, iz0, ix1, iy1, iz1, fracX, fracY, fracZ);
-}
-
-
-/// Fast downsampling using box filter. 
-///
-/// The extents of the image are divided by two and rounded down.
-///
-/// When the size of the image is odd, this uses a polyphase box filter as explained in:
-/// http://developer.nvidia.com/object/np2_mipmapping.html
-///
-FloatImage * FloatImage::fastDownSample() const
-{
-    nvDebugCheck(m_depth == 1);
-    nvDebugCheck(m_width != 1 || m_height != 1);
-
-    AutoPtr<FloatImage> dst_image( new FloatImage() );
-
-    const uint w = max(1, m_width / 2);
-    const uint h = max(1, m_height / 2);
-    dst_image->allocate(m_componentCount, w, h);
-
-    // 1D box filter.
-    if (m_width == 1 || m_height == 1)
-    {
-        const uint n = w * h;
-
-        if ((m_width * m_height) & 1)
-        {
-            const float scale = 1.0f / (2 * n + 1);
-
-            for(uint c = 0; c < m_componentCount; c++)
-            {
-                const float * src = this->channel(c);
-                float * dst = dst_image->channel(c);
-
-                for(uint x = 0; x < n; x++)
-                {
-                    const float w0 = float(n - x);
-                    const float w1 = float(n - 0);
-                    const float w2 = float(1 + x);
-
-                    *dst++ = scale * (w0 * src[0] + w1 * src[1] + w2 * src[2]);
-                    src += 2;
-                }
-            }
-        }
-        else
-        {
-            for(uint c = 0; c < m_componentCount; c++)
-            {
-                const float * src = this->channel(c);
-                float * dst = dst_image->channel(c);
-
-                for(uint x = 0; x < n; x++)
-                {
-                    *dst = 0.5f * (src[0] + src[1]);
-                    dst++;
-                    src += 2;
-                }
-            }
-        }
-    }
-
-    // Regular box filter.
-    else if ((m_width & 1) == 0 && (m_height & 1) == 0)
-    {
-        for(uint c = 0; c < m_componentCount; c++)
-        {
-            const float * src = this->channel(c);
-            float * dst = dst_image->channel(c);
-
-            for(uint y = 0; y < h; y++)
-            {
-                for(uint x = 0; x < w; x++)
-                {
-                    *dst = 0.25f * (src[0] + src[1] + src[m_width] + src[m_width + 1]);
-                    dst++;
-                    src += 2;
-                }
-
-                src += m_width;
-            }
-        }
-    }
-
-    // Polyphase filters.
-    else if (m_width & 1 && m_height & 1)
-    {
-        nvDebugCheck(m_width == 2 * w + 1);
-        nvDebugCheck(m_height == 2 * h + 1);
-
-        const float scale = 1.0f / (m_width * m_height);
-
-        for(uint c = 0; c < m_componentCount; c++)
-        {
-            const float * src = this->channel(c);
-            float * dst = dst_image->channel(c);
-
-            for(uint y = 0; y < h; y++)
-            {
-                const float v0 = float(h - y);
-                const float v1 = float(h - 0);
-                const float v2 = float(1 + y);
-
-                for (uint x = 0; x < w; x++)
-                {
-                    const float w0 = float(w - x);
-                    const float w1 = float(w - 0);
-                    const float w2 = float(1 + x);
-
-                    float f = 0.0f;
-                    f += v0 * (w0 * src[0 * m_width + 2 * x] + w1 * src[0 * m_width + 2 * x + 1] + w2 * src[0 * m_width + 2 * x + 2]);
-                    f += v1 * (w0 * src[1 * m_width + 2 * x] + w1 * src[1 * m_width + 2 * x + 1] + w2 * src[1 * m_width + 2 * x + 2]);
-                    f += v2 * (w0 * src[2 * m_width + 2 * x] + w1 * src[2 * m_width + 2 * x + 1] + w2 * src[2 * m_width + 2 * x + 2]);
-
-                    *dst = f * scale;
-                    dst++;
-                }
-
-                src += 2 * m_width;
-            }
-        }
-    }
-    else if (m_width & 1)
-    {
-        nvDebugCheck(m_width == 2 * w + 1);
-        const float scale = 1.0f / (2 * m_width);
-
-        for(uint c = 0; c < m_componentCount; c++)
-        {
-            const float * src = this->channel(c);
-            float * dst = dst_image->channel(c);
-
-            for(uint y = 0; y < h; y++)
-            {
-                for (uint x = 0; x < w; x++)
-                {
-                    const float w0 = float(w - x);
-                    const float w1 = float(w - 0);
-                    const float w2 = float(1 + x);
-
-                    float f = 0.0f;
-                    f += w0 * (src[2 * x + 0] + src[m_width + 2 * x + 0]);
-                    f += w1 * (src[2 * x + 1] + src[m_width + 2 * x + 1]);
-                    f += w2 * (src[2 * x + 2] + src[m_width + 2 * x + 2]);
-
-                    *dst = f * scale;
-                    dst++;
-                }
-
-                src += 2 * m_width;
-            }
-        }
-    }
-    else if (m_height & 1)
-    {
-        nvDebugCheck(m_height == 2 * h + 1);
-
-        const float scale = 1.0f / (2 * m_height);
-
-        for(uint c = 0; c < m_componentCount; c++)
-        {
-            const float * src = this->channel(c);
-            float * dst = dst_image->channel(c);
-
-            for(uint y = 0; y < h; y++)
-            {
-                const float v0 = float(h - y);
-                const float v1 = float(h - 0);
-                const float v2 = float(1 + y);
-
-                for (uint x = 0; x < w; x++)
-                {
-                    float f = 0.0f;
-                    f += v0 * (src[0 * m_width + 2 * x] + src[0 * m_width + 2 * x + 1]);
-                    f += v1 * (src[1 * m_width + 2 * x] + src[1 * m_width + 2 * x + 1]);
-                    f += v2 * (src[2 * m_width + 2 * x] + src[2 * m_width + 2 * x + 1]);
-
-                    *dst = f * scale;
-                    dst++;
-                }
-
-                src += 2 * m_width;
-            }
-        }
-    }
-
-    return dst_image.release();
-}
-
-/// Downsample applying a 1D kernel separately in each dimension.
-FloatImage * FloatImage::downSample(const Filter & filter, WrapMode wm) const
-{
-    const uint w = max(1, m_width / 2);
-    const uint h = max(1, m_height / 2);
-    const uint d = max(1, m_depth / 2);
-
-    return resize(filter, w, h, d, wm);
-}
-
-/// Downsample applying a 1D kernel separately in each dimension.
-FloatImage * FloatImage::downSample(const Filter & filter, WrapMode wm, uint alpha) const
-{
-    const uint w = max(1, m_width / 2);
-    const uint h = max(1, m_height / 2);
-    const uint d = max(1, m_depth / 2);
-
-    return resize(filter, w, h, d, wm, alpha);
-}
-
-
-/// Downsample applying a 1D kernel separately in each dimension.
-FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, WrapMode wm) const
-{
-    // @@ Use monophase filters when frac(m_width / w) == 0
-
-    AutoPtr<FloatImage> tmp_image( new FloatImage() );
-    AutoPtr<FloatImage> dst_image( new FloatImage() );	
-
-    PolyphaseKernel xkernel(filter, m_width, w, 32);
-    PolyphaseKernel ykernel(filter, m_height, h, 32);
-
-    // @@ Select fastest filtering order:
-    //if (w * m_height <= h * m_width)
-    {
-        tmp_image->allocate(m_componentCount, w, m_height);
-        dst_image->allocate(m_componentCount, w, h);
-
-        // @@ We could avoid this allocation, write directly to dst_plane.
-        Array<float> tmp_column(h);
-        tmp_column.resize(h);
-
-        for (uint c = 0; c < m_componentCount; c++)
-        {
-            for (uint z = 0; z < m_depth; z++)
-            {
-                float * tmp_plane = tmp_image->plane(c, z);
-
-                for (uint y = 0; y < m_height; y++) {
-                    this->applyKernelX(xkernel, y, z, c, wm, tmp_plane + y * w);
-                }
-
-                float * dst_plane = dst_image->plane(c, z);
-
-                for (uint x = 0; x < w; x++) {
-                    tmp_image->applyKernelY(ykernel, x, z, c, wm, tmp_column.buffer());
-
-                    // @@ We could avoid this copy, write directly to dst_plane.
-                    for (uint y = 0; y < h; y++) {
-                        dst_plane[y * w + x] = tmp_column[y];
-                    }
-                }
-            }
-        }
-    }
-
-    return dst_image.release();
-}
-
-/// Downsample applying a 1D kernel separately in each dimension. (for 3d textures)
-FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, uint d, WrapMode wm) const
-{
-    // @@ Use monophase filters when frac(m_width / w) == 0
-
-    // Use the existing 2d version if we are not resizing in the Z axis:
-    if (m_depth == d) {
-        return resize(filter, w, h, wm);
-    }
-
-    AutoPtr<FloatImage> tmp_image( new FloatImage() );
-    AutoPtr<FloatImage> tmp_image2( new FloatImage() );
-    AutoPtr<FloatImage> dst_image( new FloatImage() );
-
-    PolyphaseKernel xkernel(filter, m_width, w, 32);
-    PolyphaseKernel ykernel(filter, m_height, h, 32);
-    PolyphaseKernel zkernel(filter, m_depth, d, 32);
-
-    tmp_image->allocate(m_componentCount, w, m_height, m_depth);
-    tmp_image2->allocate(m_componentCount, w, m_height, d);
-    dst_image->allocate(m_componentCount, w, h, d);
-
-    Array<float> tmp_column(h);
-    tmp_column.resize(h);
-
-    for (uint c = 0; c < m_componentCount; c++)
-    {
-        float * tmp_channel = tmp_image->channel(c);
-
-        // split width in half
-        for (uint z = 0; z < m_depth; z++ ) {
-            for (uint y = 0; y < m_height; y++) {
-                this->applyKernelX(xkernel, y, z, c, wm, tmp_channel + z * m_height * w + y * w);
-            }
-        }
-
-        // split depth in half
-        float * tmp2_channel = tmp_image2->channel(c);
-        for (uint y = 0; y < m_height; y++) {
-            for (uint x = 0; x < w; x++) {
-                tmp_image->applyKernelZ(zkernel, x, y, c, wm, tmp_column.buffer() );
-
-                for (uint z = 0; z < d; z++) {
-                    tmp2_channel[z * m_height * w + y * w + x] = tmp_column[z];
-                }
-            }
-        }
-
-        // split height in half
-        float * dst_channel = dst_image->channel(c);
-
-        for (uint z = 0; z < d; z++ ) {
-            for (uint x = 0; x < w; x++) {
-                tmp_image2->applyKernelY(ykernel, x, z, c, wm, tmp_column.buffer());
-
-                for (uint y = 0; y < h; y++) {
-                    dst_channel[z * h * w + y * w + x] = tmp_column[y];
-                }
-            }
-        }
-    }
-
-    return dst_image.release();
-}
-
-
-/// Downsample applying a 1D kernel separately in each dimension.
-FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, WrapMode wm, uint alpha) const
-{
-    nvCheck(alpha < m_componentCount);
-
-    AutoPtr<FloatImage> tmp_image( new FloatImage() );
-    AutoPtr<FloatImage> dst_image( new FloatImage() );	
-
-    PolyphaseKernel xkernel(filter, m_width, w, 32);
-    PolyphaseKernel ykernel(filter, m_height, h, 32);
-
-    {
-        tmp_image->allocate(m_componentCount, w, m_height);
-        dst_image->allocate(m_componentCount, w, h);
-
-        Array<float> tmp_column(h);
-        tmp_column.resize(h);
-
-        for (uint i = 0; i < m_componentCount; i++)
-        {
-            // Process alpha channel first.
-            uint c;
-            if (i == 0) c = alpha;
-            else if (i > alpha) c = i;
-            else c = i - 1;
-
-            for (uint z = 0; z < m_depth; z++)
-            {
-                float * tmp_plane = tmp_image->plane(c, z);
-
-                for (uint y = 0; y < m_height; y++) {
-                    this->applyKernelX(xkernel, y, z, c, wm, tmp_plane + y * w);
-                }
-
-                float * dst_plane = dst_image->plane(c, z);
-
-                for (uint x = 0; x < w; x++) {
-                    tmp_image->applyKernelY(ykernel, x, z, c, wm, tmp_column.buffer());
-
-                    // @@ Avoid this copy, write directly to dst_plane.
-                    for (uint y = 0; y < h; y++) {
-                        dst_plane[y * w + x] = tmp_column[y];
-                    }
-                }
-            }
-        }
-    }
-
-    return dst_image.release();
-}
-
-
-/// Downsample applying a 1D kernel separately in each dimension. (for 3d textures)
-FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, uint d, WrapMode wm, uint alpha) const
-{
-    nvCheck(alpha < m_componentCount);
-
-    // use the existing 2d version if we are a 2d image:
-    if (m_depth == d) {
-        return resize( filter, w, h, wm, alpha );
-    }
-
-    AutoPtr<FloatImage> tmp_image( new FloatImage() );
-    AutoPtr<FloatImage> tmp_image2( new FloatImage() );
-    AutoPtr<FloatImage> dst_image( new FloatImage() );
-
-    PolyphaseKernel xkernel(filter, m_width, w, 32);
-    PolyphaseKernel ykernel(filter, m_height, h, 32);
-    PolyphaseKernel zkernel(filter, m_depth, d, 32);
-
-    tmp_image->allocate(m_componentCount, w, m_height, m_depth);
-    tmp_image2->allocate(m_componentCount, w, m_height, d);
-    dst_image->allocate(m_componentCount, w, h, d);
-
-    Array<float> tmp_column(h);
-    tmp_column.resize(h);
-
-    for (uint i = 0; i < m_componentCount; i++)
-    {
-        // Process alpha channel first.
-        uint c;
-        if (i == 0) c = alpha;
-        else if (i > alpha) c = i;
-        else c = i - 1;
-
-        float * tmp_channel = tmp_image->channel(c);
-
-        for (uint z = 0; z < m_depth; z++ ) {
-            for (uint y = 0; y < m_height; y++) {
-                this->applyKernelX(xkernel, y, z, c, wm, tmp_channel + z * m_height * w + y * w);
-            }
-        }
-
-        float * tmp2_channel = tmp_image2->channel(c);
-        for (uint y = 0; y < m_height; y++) {
-            for (uint x = 0; x < w; x++) {
-                tmp_image->applyKernelZ(zkernel, x, y, c, wm, tmp_column.buffer() );
-
-                for (uint z = 0; z < d; z++) {
-                    tmp2_channel[z * m_height * w + y * w + x] = tmp_column[z];
-                }
-            }
-        }
-
-        float * dst_channel = dst_image->channel(c);
-
-        for (uint z = 0; z < d; z++ ) {
-            for (uint x = 0; x < w; x++) {
-                tmp_image2->applyKernelY(ykernel, x, z, c, wm, tmp_column.buffer());
-
-                for (uint y = 0; y < h; y++) {
-                    dst_channel[z * h * w + y * w + x] = tmp_column[y];
-                }
-            }
-        }
-    }
-
-    return dst_image.release();
-}
-
-
-void FloatImage::convolve(const Kernel2 & k, uint c, WrapMode wm)
-{
-    AutoPtr<FloatImage> tmpImage(clone());
-
-    uint w = m_width;
-    uint h = m_height;
-    uint d = m_depth;
-
-    for (uint z = 0; z < d; z++)
-    {
-        for (uint y = 0; y < h; y++)
-        {
-            for (uint x = 0; x < w; x++)
-            {
-                pixel(c, x, y, 0) = tmpImage->applyKernelXY(&k, x, y, z, c, wm);
-            }
-        }
-    }
-}
-
-
-/// Apply 2D kernel at the given coordinates and return result.
-float FloatImage::applyKernelXY(const Kernel2 * k, int x, int y, int z, uint c, WrapMode wm) const
-{
-    nvDebugCheck(k != NULL);
-
-    const uint kernelWindow = k->windowSize();
-    const int kernelOffset = int(kernelWindow / 2);
-
-    const float * channel = this->plane(c, z);
-
-    float sum = 0.0f;
-    for (uint i = 0; i < kernelWindow; i++)
-    {
-        int src_y = int(y + i) - kernelOffset;
-
-        for (uint e = 0; e < kernelWindow; e++)
-        {
-            int src_x = int(x + e) - kernelOffset;
-
-            int idx = this->index(src_x, src_y, z, wm);
-
-            sum += k->valueAt(e, i) * channel[idx];
-        }
-    }
-
-    return sum;
-}
-
-
-/// Apply 1D horizontal kernel at the given coordinates and return result.
-float FloatImage::applyKernelX(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const
-{
-    nvDebugCheck(k != NULL);
-
-    const uint kernelWindow = k->windowSize();
-    const int kernelOffset = int(kernelWindow / 2);
-
-    const float * channel = this->channel(c);
-
-    float sum = 0.0f;
-    for (uint i = 0; i < kernelWindow; i++)
-    {
-        const int src_x = int(x + i) - kernelOffset;
-        const int idx = this->index(src_x, y, z, wm);
-
-        sum += k->valueAt(i) * channel[idx];
-    }
-
-    return sum;
-}
-
-/// Apply 1D vertical kernel at the given coordinates and return result.
-float FloatImage::applyKernelY(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const
-{
-    nvDebugCheck(k != NULL);
-
-    const uint kernelWindow = k->windowSize();
-    const int kernelOffset = int(kernelWindow / 2);
-
-    const float * channel = this->channel(c);
-
-    float sum = 0.0f;
-    for (uint i = 0; i < kernelWindow; i++)
-    {
-        const int src_y = int(y + i) - kernelOffset;
-        const int idx = this->index(x, src_y, z, wm);
-
-        sum += k->valueAt(i) * channel[idx];
-    }
-
-    return sum;
-}
-
-/// Apply 1D kernel in the z direction at the given coordinates and return result.
-float FloatImage::applyKernelZ(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const
-{
-    nvDebugCheck(k != NULL);
-
-    const uint kernelWindow = k->windowSize();
-    const int kernelOffset = int(kernelWindow / 2);
-
-    const float * channel = this->channel(c);
-
-    float sum = 0.0f;
-    for (uint i = 0; i < kernelWindow; i++)
-    {
-        const int src_z = int(z + i) - kernelOffset;
-        const int idx = this->index(x, y, src_z, wm);
-
-        sum += k->valueAt(i) * channel[idx];
-    }
-
-    return sum;
-}
-
-
-/// Apply 1D horizontal kernel at the given coordinates and return result.
-void FloatImage::applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, WrapMode wm, float * __restrict output) const
-{
-    const uint length = k.length();
-    const float scale = float(length) / float(m_width);
-    const float iscale = 1.0f / scale;
-
-    const float width = k.width();
-    const int windowSize = k.windowSize();
-
-    const float * channel = this->channel(c);
-
-    for (uint i = 0; i < length; i++)
-    {
-        const float center = (0.5f + i) * iscale;
-
-        const int left = (int)floorf(center - width);
-        const int right = (int)ceilf(center + width);
-        nvDebugCheck(right - left <= windowSize);
-
-        float sum = 0;
-        for (int j = 0; j < windowSize; ++j)
-        {
-            const int idx = this->index(left + j, y, z, wm);
-
-            sum += k.valueAt(i, j) * channel[idx];
-        }
-
-        output[i] = sum;
-    }
-}
-
-/// Apply 1D vertical kernel at the given coordinates and return result.
-void FloatImage::applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, WrapMode wm, float * __restrict output) const
-{
-    const uint length = k.length();
-    const float scale = float(length) / float(m_height);
-    const float iscale = 1.0f / scale;
-
-    const float width = k.width();
-    const int windowSize = k.windowSize();
-
-    const float * channel = this->channel(c);
-
-    for (uint i = 0; i < length; i++)
-    {
-        const float center = (0.5f + i) * iscale;
-
-        const int left = (int)floorf(center - width);
-        const int right = (int)ceilf(center + width);
-        nvCheck(right - left <= windowSize);
-
-        float sum = 0;
-        for (int j = 0; j < windowSize; ++j)
-        {
-            const int idx = this->index(x, j+left, z, wm);
-
-            sum += k.valueAt(i, j) * channel[idx];
-        }
-
-        output[i] = sum;
-    }
-}
-
-/// Apply 1D kernel in the Z direction at the given coordinates and return result.
-void FloatImage::applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, WrapMode wm, float * __restrict output) const
-{
-    const uint length = k.length();
-    const float scale = float(length) / float(m_height);
-    const float iscale = 1.0f / scale;
-
-    const float width = k.width();
-    const int windowSize = k.windowSize();
-
-    const float * channel = this->channel(c);
-
-    for (uint i = 0; i < length; i++)
-    {
-        const float center = (0.5f + i) * iscale;
-
-        const int left = (int)floorf(center - width);
-        const int right = (int)ceilf(center + width);
-        nvCheck(right - left <= windowSize);
-
-        float sum = 0;
-        for (int j = 0; j < windowSize; ++j)
-        {
-            const int idx = this->index(x, y, j+left, wm);
-
-            sum += k.valueAt(i, j) * channel[idx];
-        }
-
-        output[i] = sum;
-    }
-}
-
-
-/// Apply 1D horizontal kernel at the given coordinates and return result.
-void FloatImage::applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, uint a, WrapMode wm, float * __restrict output) const
-{
-    const uint length = k.length();
-    const float scale = float(length) / float(m_width);
-    const float iscale = 1.0f / scale;
-
-    const float width = k.width();
-    const int windowSize = k.windowSize();
-
-    const float * channel = this->channel(c);
-    const float * alpha = this->channel(a);
-
-    for (uint i = 0; i < length; i++)
-    {
-        const float center = (0.5f + i) * iscale;
-
-        const int left = (int)floorf(center - width);
-        const int right = (int)ceilf(center + width);
-        nvDebugCheck(right - left <= windowSize);
-
-        float norm = 0.0f;
-        float sum = 0;
-        for (int j = 0; j < windowSize; ++j)
-        {
-            const int idx = this->index(left + j, y, z, wm);
-
-            float w = k.valueAt(i, j) * (alpha[idx] + (1.0f / 256.0f));
-            norm += w;
-            sum += w * channel[idx];
-        }
-
-        output[i] = sum / norm;
-    }
-}
-
-/// Apply 1D vertical kernel at the given coordinates and return result.
-void FloatImage::applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, uint a, WrapMode wm, float * __restrict output) const
-{
-    const uint length = k.length();
-    const float scale = float(length) / float(m_height);
-    const float iscale = 1.0f / scale;
-
-    const float width = k.width();
-    const int windowSize = k.windowSize();
-
-    const float * channel = this->channel(c);
-    const float * alpha = this->channel(a);
-
-    for (uint i = 0; i < length; i++)
-    {
-        const float center = (0.5f + i) * iscale;
-
-        const int left = (int)floorf(center - width);
-        const int right = (int)ceilf(center + width);
-        nvCheck(right - left <= windowSize);
-
-        float norm = 0;
-        float sum = 0;
-        for (int j = 0; j < windowSize; ++j)
-        {
-            const int idx = this->index(x, j+left, z, wm);
-
-            float w = k.valueAt(i, j) * (alpha[idx] + (1.0f / 256.0f));
-            norm += w;
-            sum += w * channel[idx];
-        }
-
-        output[i] = sum / norm;
-    }
-}
-
-/// Apply 1D horizontal kernel at the given coordinates and return result.
-void FloatImage::applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, uint a, WrapMode wm, float * __restrict output) const
-{
-    const uint length = k.length();
-    const float scale = float(length) / float(m_width);
-    const float iscale = 1.0f / scale;
-
-    const float width = k.width();
-    const int windowSize = k.windowSize();
-
-    const float * channel = this->channel(c);
-    const float * alpha = this->channel(a);
-
-    for (uint i = 0; i < length; i++)
-    {
-        const float center = (0.5f + i) * iscale;
-
-        const int left = (int)floorf(center - width);
-        const int right = (int)ceilf(center + width);
-        nvDebugCheck(right - left <= windowSize);
-
-        float norm = 0.0f;
-        float sum = 0;
-        for (int j = 0; j < windowSize; ++j)
-        {
-            const int idx = this->index(x, y, left + j, wm);
-
-            float w = k.valueAt(i, j) * (alpha[idx] + (1.0f / 256.0f));
-            norm += w;
-            sum += w * channel[idx];
-        }
-
-        output[i] = sum / norm;
-    }
-}
-
-
-void FloatImage::flipX()
-{
-    const uint w = m_width;
-    const uint h = m_height;
-    const uint d = m_depth;
-    const uint w2 = w / 2;
-
-    for (uint c = 0; c < m_componentCount; c++) {
-        for (uint z = 0; z < d; z++) {
-            for (uint y = 0; y < h; y++) {
-                float * line = scanline(c, y, z);
-                for (uint x = 0; x < w2; x++) {
-                    swap(line[x], line[w - 1 - x]);
-                }
-            }
-        }
-    }
-}
-
-void FloatImage::flipY()
-{
-    const uint w = m_width;
-    const uint h = m_height;
-    const uint d = m_depth;
-    const uint h2 = h / 2;
-
-    for (uint c = 0; c < m_componentCount; c++) {
-        for (uint z = 0; z < d; z++) {
-            for (uint y = 0; y < h2; y++) {
-                float * src = scanline(c, y, z);
-                float * dst = scanline(c, h - 1 - y, z);
-                for (uint x = 0; x < w; x++) {
-                    swap(src[x], dst[x]);
-                }
-            }
-        }
-    }
-}
-
-void FloatImage::flipZ()
-{
-    const uint w = m_width;
-    const uint h = m_height;
-    const uint d = m_depth;
-    const uint d2 = d / 2;
-
-    for (uint c = 0; c < m_componentCount; c++) {
-        for (uint z = 0; z < d2; z++) {
-            float * src = plane(c, z);
-            float * dst = plane(c, d - 1 - z);
-            for (uint i = 0; i < w*h; i++) {
-                swap(src[i], dst[i]);
-            }
-        }
-    }
-}
-
-
-
-float FloatImage::alphaTestCoverage(float alphaRef, int alphaChannel, float alphaScale/*=1*/) const
-{
-    const uint w = m_width;
-    const uint h = m_height;
-
-    float coverage = 0.0f;
-
-#if 0
-    const float * alpha = channel(alphaChannel);
-
-    const uint count = m_pixelCount;
-    for (uint i = 0; i < count; i++) {
-        if (alpha[i] > alphaRef) coverage += 1.0f; // @@ gt or lt?
-    }
-    
-    return coverage / float(w * h);
-#else
-    const uint n = 8;
-
-    // If we want subsampling:
-    for (uint y = 0; y < h-1; y++) {
-        for (uint x = 0; x < w-1; x++) {
-
-            float alpha00 = nv::saturate(pixel(alphaChannel, x+0, y+0, 0) * alphaScale);
-            float alpha10 = nv::saturate(pixel(alphaChannel, x+1, y+0, 0) * alphaScale);
-            float alpha01 = nv::saturate(pixel(alphaChannel, x+0, y+1, 0) * alphaScale);
-            float alpha11 = nv::saturate(pixel(alphaChannel, x+1, y+1, 0) * alphaScale);
-
-            for (uint sy = 0; sy < n; sy++) {
-                float fy = (sy + 0.5f) / n;
-                for (uint sx = 0; sx < n; sx++) {
-                    float fx = (sx + 0.5f) / n;
-                    float alpha = alpha00 * (1 - fx) * (1 - fy) + alpha10 * fx * (1 - fy) + alpha01 * (1 - fx) * fy + alpha11 * fx * fy;
-                    if (alpha > alphaRef) coverage += 1.0f;
-                }
-            }
-        }
-    }
-
-    return coverage / float(w * h * n * n);
-#endif
-}
-
-void FloatImage::scaleAlphaToCoverage(float desiredCoverage, float alphaRef, int alphaChannel)
-{
-#if 0
-    float minAlphaRef = 0.0f;
-    float maxAlphaRef = 1.0f;
-    float midAlphaRef = 0.5f;
-
-    // Determine desired scale using a binary search. Hardcoded to 8 steps max.
-    for (int i = 0; i < 10; i++) {
-        float currentCoverage = alphaTestCoverage(midAlphaRef, alphaChannel);
-
-        if (currentCoverage > desiredCoverage) {
-            minAlphaRef = midAlphaRef;
-        }
-        else if (currentCoverage < desiredCoverage) {
-            maxAlphaRef = midAlphaRef;
-        }
-        else {
-            break;
-        }
-
-        midAlphaRef = (minAlphaRef + maxAlphaRef) * 0.5f;
-    }
-
-    float alphaScale = alphaRef / midAlphaRef;
-
-    // Scale alpha channel.
-    scaleBias(alphaChannel, 1, alphaScale, 0.0f);
-    clamp(alphaChannel, 1, 0.0f, 1.0f); 
-#else
-    float minAlphaScale = 0.0f;
-    float maxAlphaScale = 4.0f;
-    float alphaScale = 1.0f;
-
-    // Determine desired scale using a binary search. Hardcoded to 8 steps max.
-    for (int i = 0; i < 10; i++) {
-        float currentCoverage = alphaTestCoverage(alphaRef, alphaChannel, alphaScale);
-
-        if (currentCoverage < desiredCoverage) {
-            minAlphaScale = alphaScale;
-        }
-        else if (currentCoverage > desiredCoverage) {
-            maxAlphaScale = alphaScale;
-        }
-        else {
-            break;
-        }
-
-        alphaScale = (minAlphaScale + maxAlphaScale) * 0.5f;
-    }
-
-    // Scale alpha channel.
-    scaleBias(alphaChannel, 1, alphaScale, 0.0f);
-    clamp(alphaChannel, 1, 0.0f, 1.0f); 
-#endif
-#if _DEBUG
-    alphaTestCoverage(alphaRef, alphaChannel);
-#endif
-}
-
-FloatImage* FloatImage::clone() const
-{
-    FloatImage* copy = new FloatImage();
-
-    copy->allocate(m_componentCount, m_width, m_height, m_depth);
-    memcpy(copy->m_mem, m_mem, m_floatCount * sizeof(float));
-
-    return copy;
-}
-
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "FloatImage.h"
+#include "Filter.h"
+#include "Image.h"
+
+#include "nvthread/ParallelFor.h"
+
+#include "nvmath/Color.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/ftoi.h"
+#include "nvmath/Gamma.h"
+
+#include "nvcore/Utils.h" // max
+#include "nvcore/Ptr.h"
+#include "nvcore/Memory.h"
+#include "nvcore/Array.inl"
+
+#include <math.h>
+#include <string.h> // memset, memcpy
+
+
+using namespace nv;
+
+
+/// Default constructor: every extent and count is zero and no pixel storage is allocated.
+FloatImage::FloatImage() : m_componentCount(0), m_width(0), m_height(0), m_depth(0),
+  m_pixelCount(0), m_floatCount(0), m_mem(NULL)
+{
+}
+
+FloatImage::FloatImage(const FloatImage & img) : m_componentCount(0), m_width(0), m_height(0), m_depth(0),
+    m_pixelCount(0), m_floatCount(0), m_mem(NULL)
+{
+    allocate(img.m_componentCount, img.m_width, img.m_height, img.m_depth); // Copy constructor: start empty, then size the buffer like img.
+    memcpy(m_mem, img.m_mem, m_floatCount * sizeof(float)); // Copy all m_floatCount floats from img.
+}
+
+/// Construct from a regular Image: members start zeroed and the conversion is delegated to initFrom().
+FloatImage::FloatImage(const Image * img) : m_componentCount(0), m_width(0), m_height(0), m_depth(0),
+    m_pixelCount(0), m_floatCount(0), m_mem(NULL)
+{
+    initFrom(img);
+}
+
+/// Destructor: releases the pixel buffer through free().
+FloatImage::~FloatImage()
+{
+    free();
+}
+
+/// Init the floating point image from a regular image: 3 channels, or 4 when the format is Image::Format_ARGB, each component scaled by 1/255.
+void FloatImage::initFrom(const Image * img)
+{
+    nvCheck(img != NULL);
+
+    uint channel_count = 3;
+    if (img->format() == Image::Format_ARGB) channel_count = 4;
+
+    allocate(channel_count, img->width(), img->height(), img->depth());
+
+    float * red_channel = channel(0);
+    float * green_channel = channel(1);
+    float * blue_channel = channel(2);
+    float * alpha_channel = (channel_count == 4) ? channel(3) : NULL; // Only set when a fourth channel was allocated.
+
+    float scale = 1.0f / 255.0f; // Multiplier applied to every Color32 component below.
+
+    const uint count = m_pixelCount;
+    for (uint i = 0; i < count; i++) {
+    //parallel_for(count, 128, [&](int i) {
+        Color32 pixel = img->pixel(i);
+        red_channel[i] = float(pixel.r) * scale;
+        green_channel[i] = float(pixel.g) * scale;
+        blue_channel[i] = float(pixel.b) * scale;
+        if (channel_count == 4) alpha_channel[i] = float(pixel.a) * scale; // alpha_channel is NULL for 3-channel images, hence the guard.
+    }//);
+}
+
+/// Convert 'num' channels starting at 'baseComponent' to a regular image; components that are not written keep the defaults (0, 0, 0, 0xff).
+Image * FloatImage::createImage(uint baseComponent/*= 0*/, uint num/*= 4*/) const
+{
+    nvCheck(num <= 4);
+    nvCheck(baseComponent + num <= m_componentCount);
+
+    AutoPtr<Image> img(new Image());
+    img->allocate(m_width, m_height, m_depth);
+
+    for (uint i = 0; i < m_pixelCount; i++) {
+
+        uint c;
+        uint8 rgba[4]= {0, 0, 0, 0xff}; // Defaults for the components beyond 'num'.
+
+        for (c = 0; c < num; c++) {
+            float f = pixel(baseComponent + c, i);
+            rgba[c] = nv::clamp(int(255.0f * f), 0, 255); // Scale by 255, truncate with int(), then clamp to [0, 255].
+        }
+
+        img->pixel(i) = Color32(rgba[0], rgba[1], rgba[2], rgba[3]);
+    }
+
+    return img.release(); // Hand the raw pointer out of the AutoPtr to the caller.
+}
+
+
+/// Convert the floating point image to a regular image. Raises rgb to the power 1/gamma; alpha is converted without gamma. Requires exactly 4 channels.
+Image * FloatImage::createImageGammaCorrect(float gamma/*= 2.2f*/) const
+{
+    nvCheck(m_componentCount == 4);
+
+    AutoPtr<Image> img(new Image());
+    img->allocate(m_width, m_height, m_depth);
+
+    const float * rChannel = this->channel(0);
+    const float * gChannel = this->channel(1);
+    const float * bChannel = this->channel(2);
+    const float * aChannel = this->channel(3);
+    // Negative samples are clamped to 0 before pow(), as exponentiate() does: a negative base yields NaN and int(NaN) is undefined.
+    const uint count = m_pixelCount;
+    for (uint i = 0; i < count; i++)
+    {
+        const uint8 r = nv::clamp(int(255.0f * pow(max(0.0f, rChannel[i]), 1.0f/gamma)), 0, 255);
+        const uint8 g = nv::clamp(int(255.0f * pow(max(0.0f, gChannel[i]), 1.0f/gamma)), 0, 255);
+        const uint8 b = nv::clamp(int(255.0f * pow(max(0.0f, bChannel[i]), 1.0f/gamma)), 0, 255);
+        const uint8 a = nv::clamp(int(255.0f * aChannel[i]), 0, 255);
+
+        img->pixel(i) = Color32(r, g, b, a);
+    }
+
+    return img.release();
+}
+
+/// Allocate storage for an image with c channels and extents w x h x d. Existing contents are discarded whenever the buffer is reallocated.
+void FloatImage::allocate(uint c, uint w, uint h, uint d)
+{
+    // Reallocate when the layout changes, or when free() released the buffer but left the extents in place.
+    if (m_mem == NULL || m_componentCount != c || m_width != w || m_height != h || m_depth != d)
+    {
+        free();
+        m_width = w;
+        m_height = h;
+        m_depth = d;
+        m_componentCount = c;
+        m_pixelCount = w * h * d;
+        m_floatCount = m_pixelCount * c;
+        m_mem = malloc<float>(m_floatCount);
+    }
+}
+
+/// Release the pixel buffer and set m_mem to NULL. Extents and counts are left unchanged.
+void FloatImage::free()
+{
+    ::free(m_mem);
+    m_mem = NULL;
+}
+
+void FloatImage::resizeChannelCount(uint c) // Resize the buffer to m_pixelCount * c floats; does nothing when c already matches.
+{
+    if (m_componentCount != c) {
+        uint count = m_pixelCount * c;
+        m_mem = realloc<float>(m_mem, count);
+
+        if (c > m_componentCount) {
+            memset(m_mem + m_floatCount, 0, (count - m_floatCount) * sizeof(float)); // Zero only the floats added past the old end of the buffer.
+        }
+
+        m_componentCount = c;
+        m_floatCount = count;
+    }
+}
+
+void FloatImage::clear(float f/*=0.0f*/) // Set all m_floatCount floats of the buffer to f.
+{
+    for (uint i = 0; i < m_floatCount; i++) {
+        m_mem[i] = f;
+    }
+}
+
+void FloatImage::clear(uint c, float f/*= 0.0f*/) // Set the m_pixelCount floats of channel c to f.
+{
+    float * channel = this->channel(c);
+
+    const uint count = m_pixelCount;
+    for (uint i = 0; i < count; i++) {
+        channel[i] = f;
+    }
+}
+
+void FloatImage::copyChannel(uint src, uint dst) // Overwrite channel dst with the contents of channel src.
+{
+    nvCheck(src < m_componentCount);
+    nvCheck(dst < m_componentCount);
+
+    const float * srcChannel = this->channel(src);
+    float * dstChannel = this->channel(dst);
+
+    memcpy(dstChannel, srcChannel, sizeof(float)*m_pixelCount); // NOTE(review): src == dst passes identical pointers to memcpy - confirm callers never do that.
+}
+
+void FloatImage::normalize(uint baseComponent) // Normalize the per-pixel 3-vector held in channels baseComponent .. baseComponent + 2.
+{
+    nvCheck(baseComponent + 3 <= m_componentCount);
+
+    float * xChannel = this->channel(baseComponent + 0);
+    float * yChannel = this->channel(baseComponent + 1);
+    float * zChannel = this->channel(baseComponent + 2);
+
+    const uint count = m_pixelCount;
+    for (uint i = 0; i < count; i++) {
+
+        Vector3 normal(xChannel[i], yChannel[i], zChannel[i]);
+        normal = normalizeSafe(normal, Vector3(0), 0.0f); // NOTE(review): Vector3(0) is presumably the fallback for degenerate vectors - confirm against normalizeSafe.
+
+        xChannel[i] = normal.x;
+        yChannel[i] = normal.y;
+        zChannel[i] = normal.z;
+    }
+}
+
+void FloatImage::packNormals(uint baseComponent) // Remap three channels with v = 0.5 * v + 0.5.
+{
+    scaleBias(baseComponent, 3, 0.5f, 0.5f);
+}
+
+void FloatImage::expandNormals(uint baseComponent) // Remap three channels with v = 2 * v - 1, the inverse of packNormals().
+{
+    scaleBias(baseComponent, 3, 2, -1.0);
+}
+
+void FloatImage::scaleBias(uint baseComponent, uint num, float scale, float bias) // Apply v = scale * v + bias to 'num' channels starting at baseComponent.
+{
+    const uint size = m_pixelCount;
+
+    for (uint c = 0; c < num; c++) {
+        float * ptr = this->channel(baseComponent + c);
+
+        for (uint i = 0; i < size; i++) {
+            ptr[i] = scale * ptr[i] + bias;
+        }
+    }
+}
+
+/// Clamp every value of 'num' channels, starting at baseComponent, to the range [low, high].
+void FloatImage::clamp(uint baseComponent, uint num, float low, float high)
+{
+    const uint size = m_pixelCount;
+
+    for (uint c = 0; c < num; c++) {
+        float * ptr = this->channel(baseComponent + c);
+
+        for (uint i = 0; i < size; i++) {
+            ptr[i] = nv::clamp(ptr[i], low, high); // nv:: qualification selects the scalar helper rather than this member function.
+        }
+    }
+}
+
+/// From gamma to linear space, in place, for 'num' channels starting at baseComponent. gamma == 2.2f takes the powf_11_5 path; other values use exponentiate().
+void FloatImage::toLinear(uint baseComponent, uint num, float gamma /*= 2.2f*/)
+{
+    if (gamma == 2.2f) {
+        for (uint c = 0; c < num; c++) {
+            float * ptr = this->channel(baseComponent + c);
+
+            powf_11_5(ptr, ptr, m_pixelCount); // NOTE(review): presumably computes x^(11/5) = x^2.2 in bulk - confirm in nvmath/Gamma.h.
+        }
+    } else {
+        exponentiate(baseComponent, num, gamma);
+    }
+}
+
+/// From linear to gamma space, in place, for 'num' channels starting at baseComponent. gamma == 2.2f takes the powf_5_11 path; other values use exponentiate() with 1/gamma.
+void FloatImage::toGamma(uint baseComponent, uint num, float gamma /*= 2.2f*/)
+{
+    if (gamma == 2.2f) {
+        for (uint c = 0; c < num; c++) {
+            float * ptr = this->channel(baseComponent + c);
+
+            powf_5_11(ptr, ptr, m_pixelCount); // NOTE(review): presumably computes x^(5/11) = x^(1/2.2) in bulk - confirm in nvmath/Gamma.h.
+        }
+    } else {
+        exponentiate(baseComponent, num, 1.0f/gamma);
+    }
+}
+
+/// Raise every value of 'num' channels, starting at baseComponent, to 'power'. Negative values are treated as 0.
+void FloatImage::exponentiate(uint baseComponent, uint num, float power)
+{
+    const uint size = m_pixelCount;
+
+    for(uint c = 0; c < num; c++) {
+        float * ptr = this->channel(baseComponent + c);
+
+        for(uint i = 0; i < size; i++) {
+            ptr[i] = powf(max(0.0f, ptr[i]), power); // Clamp to 0 first so powf never sees a negative base.
+        }
+    }
+}
+
+/// Apply linear transform: each pixel's four channels from baseComponent are replaced by nv::transform(m, v) + offset.
+void FloatImage::transform(uint baseComponent, const Matrix & m, Vector4::Arg offset)
+{
+    nvCheck(baseComponent + 4 <= m_componentCount);
+
+    float * r = this->channel(baseComponent + 0);
+    float * g = this->channel(baseComponent + 1);
+    float * b = this->channel(baseComponent + 2);
+    float * a = this->channel(baseComponent + 3);
+
+    const uint size = m_pixelCount;
+    for (uint i = 0; i < size; i++)
+    {
+        Vector4 color = nv::transform(m, Vector4(*r, *g, *b, *a)) + offset;
+
+        *r++ = color.x; // Write back, then advance each channel pointer to the next pixel.
+        *g++ = color.y;
+        *b++ = color.z;
+        *a++ = color.w;
+    }
+}
+
+void FloatImage::swizzle(uint baseComponent, uint r, uint g, uint b, uint a) // Reorder four channels in place; selectors 0-3 pick a source channel, 4/5/6 pick the constants 1, 0, -1.
+{
+    nvCheck(baseComponent + 4 <= m_componentCount);
+    nvCheck(r < 7 && g < 7 && b < 7 && a < 7);
+
+    float consts[] = { 1.0f, 0.0f, -1.0f };
+    float * c[7];
+    c[0] = this->channel(baseComponent + 0);
+    c[1] = this->channel(baseComponent + 1);
+    c[2] = this->channel(baseComponent + 2);
+    c[3] = this->channel(baseComponent + 3);
+    c[4] = consts;
+    c[5] = consts + 1;
+    c[6] = consts + 2;
+
+    const uint size = m_pixelCount;
+    for (uint i = 0; i < size; i++)
+    {
+        float tmp[4] = { *c[r], *c[g], *c[b], *c[a] }; // Read all four sources before writing, since sources and destinations overlap.
+
+        *c[0]++ = tmp[0]; // Only the channel pointers c[0..3] advance; the constant pointers c[4..6] stay put.
+        *c[1]++ = tmp[1];
+        *c[2]++ = tmp[2];
+        *c[3]++ = tmp[3];
+    }
+}
+
+float FloatImage::sampleNearest(uint c, float x, float y, const WrapMode wm) const // 2D nearest sample of channel c, dispatched on the wrap mode.
+{
+    if( wm == WrapMode_Clamp ) return sampleNearestClamp(c, x, y);
+    else if( wm == WrapMode_Repeat ) return sampleNearestRepeat(c, x, y);
+    else /*if( wm == WrapMode_Mirror )*/ return sampleNearestMirror(c, x, y); // Any mode other than clamp or repeat is handled as mirror.
+}
+
+float FloatImage::sampleLinear(uint c, float x, float y, WrapMode wm) const // 2D bilinear sample of channel c, dispatched on the wrap mode.
+{
+    if( wm == WrapMode_Clamp ) return sampleLinearClamp(c, x, y);
+    else if( wm == WrapMode_Repeat ) return sampleLinearRepeat(c, x, y);
+    else /*if( wm == WrapMode_Mirror )*/ return sampleLinearMirror(c, x, y); // Any mode other than clamp or repeat is handled as mirror.
+}
+
+float FloatImage::sampleNearest(uint c, float x, float y, float z, WrapMode wm) const // 3D nearest sample of channel c, dispatched on the wrap mode.
+{
+    if( wm == WrapMode_Clamp ) return sampleNearestClamp(c, x, y, z);
+    else if( wm == WrapMode_Repeat ) return sampleNearestRepeat(c, x, y, z);
+    else /*if( wm == WrapMode_Mirror )*/ return sampleNearestMirror(c, x, y, z); // Any mode other than clamp or repeat is handled as mirror.
+}
+
+float FloatImage::sampleLinear(uint c, float x, float y, float z, WrapMode wm) const // 3D trilinear sample of channel c, dispatched on the wrap mode.
+{
+    if( wm == WrapMode_Clamp ) return sampleLinearClamp(c, x, y, z);
+    else if( wm == WrapMode_Repeat ) return sampleLinearRepeat(c, x, y, z);
+    else /*if( wm == WrapMode_Mirror )*/ return sampleLinearMirror(c, x, y, z); // Any mode other than clamp or repeat is handled as mirror.
+}
+
+float FloatImage::sampleNearestClamp(uint c, float x, float y) const // 2D nearest lookup with wrapClamp addressing.
+{
+    int ix = wrapClamp(iround(x * m_width), m_width); // Scale by the extent, round to a texel index, then wrap into range.
+    int iy = wrapClamp(iround(y * m_height), m_height);
+    return pixel(c, ix, iy, 0); // The 2D variant always reads slice z = 0.
+}
+
+float FloatImage::sampleNearestRepeat(uint c, float x, float y) const // 2D nearest lookup with wrapRepeat addressing.
+{
+    int ix = wrapRepeat(iround(x * m_width), m_width); // Scale by the extent, round to a texel index, then wrap into range.
+    int iy = wrapRepeat(iround(y * m_height), m_height);
+    return pixel(c, ix, iy, 0); // The 2D variant always reads slice z = 0.
+}
+
+float FloatImage::sampleNearestMirror(uint c, float x, float y) const // 2D nearest lookup with wrapMirror addressing.
+{
+    int ix = wrapMirror(iround(x * m_width), m_width); // Scale by the extent, round to a texel index, then wrap into range.
+    int iy = wrapMirror(iround(y * m_height), m_height);
+    return pixel(c, ix, iy, 0); // The 2D variant always reads slice z = 0.
+}
+
+float FloatImage::sampleNearestClamp(uint c, float x, float y, float z) const // 3D nearest lookup with wrapClamp addressing.
+{
+    int ix = wrapClamp(iround(x * m_width), m_width); // Scale by the extent, round to a texel index, then wrap into range.
+    int iy = wrapClamp(iround(y * m_height), m_height);
+    int iz = wrapClamp(iround(z * m_depth), m_depth);
+    return pixel(c, ix, iy, iz);
+}
+
+float FloatImage::sampleNearestRepeat(uint c, float x, float y, float z) const // 3D nearest lookup with wrapRepeat addressing.
+{
+    int ix = wrapRepeat(iround(x * m_width), m_width); // Scale by the extent, round to a texel index, then wrap into range.
+    int iy = wrapRepeat(iround(y * m_height), m_height);
+    int iz = wrapRepeat(iround(z * m_depth), m_depth);
+    return pixel(c, ix, iy, iz);
+}
+
+float FloatImage::sampleNearestMirror(uint c, float x, float y, float z) const // 3D nearest lookup with wrapMirror addressing.
+{
+    int ix = wrapMirror(iround(x * m_width), m_width); // Scale by the extent, round to a texel index, then wrap into range.
+    int iy = wrapMirror(iround(y * m_height), m_height);
+    int iz = wrapMirror(iround(z * m_depth), m_depth);
+    return pixel(c, ix, iy, iz);
+}
+
+
+float FloatImage::sampleLinearClamp(uint c, float x, float y) const // 2D bilinear sample; texel indices are clamped to the image.
+{
+    const int w = m_width;
+    const int h = m_height;
+
+    x *= w; // Scale the coordinates by the image extents.
+    y *= h;
+
+    const float fracX = frac(x); // Interpolation weights handed to bilerp().
+    const float fracY = frac(y);
+
+    const int ix0 = ::clamp(ifloor(x), 0, w-1); // Base texel and its +1 neighbour, each clamped to [0, extent - 1].
+    const int iy0 = ::clamp(ifloor(y), 0, h-1);
+    const int ix1 = ::clamp(ifloor(x)+1, 0, w-1);
+    const int iy1 = ::clamp(ifloor(y)+1, 0, h-1);
+
+    return bilerp(c, ix0, iy0, ix1, iy1, fracX, fracY);
+}
+
+float FloatImage::sampleLinearRepeat(uint c, float x, float y) const // 2D bilinear sample; coordinates are wrapped with frac() before indexing.
+{
+    const int w = m_width;
+    const int h = m_height;
+
+    const float fracX = frac(x * w); // Interpolation weights handed to bilerp().
+    const float fracY = frac(y * h);
+
+    // @@ Using floor in some places, but round in others?
+    int ix0 = ifloor(frac(x) * w); // frac() wraps the unscaled coordinate before it is turned into an index.
+    int iy0 = ifloor(frac(y) * h);
+    int ix1 = ifloor(frac(x + 1.0f/w) * w); // Neighbour: shift by 1/extent, then wrap the same way.
+    int iy1 = ifloor(frac(y + 1.0f/h) * h);
+
+    return bilerp(c, ix0, iy0, ix1, iy1, fracX, fracY);
+}
+
+float FloatImage::sampleLinearMirror(uint c, float x, float y) const // 2D bilinear sample; texel indices are wrapped with wrapMirror.
+{
+    const int w = m_width;
+    const int h = m_height;
+
+    x *= w; // Scale the coordinates by the image extents.
+    y *= h;
+
+    const float fracX = frac(x); // Interpolation weights handed to bilerp().
+    const float fracY = frac(y);
+    // Base texel uses floor (as in sampleLinearClamp) so it pairs with the frac() weights; rounding made the result jump at every half texel.
+    int ix0 = wrapMirror(ifloor(x), w);
+    int iy0 = wrapMirror(ifloor(y), h);
+    int ix1 = wrapMirror(ifloor(x) + 1, w);
+    int iy1 = wrapMirror(ifloor(y) + 1, h);
+
+    return bilerp(c, ix0, iy0, ix1, iy1, fracX, fracY);
+}
+
+float FloatImage::sampleLinearClamp(uint c, float x, float y, float z) const // 3D trilinear sample; texel indices are clamped to the image.
+{
+    const int w = m_width;
+    const int h = m_height;
+    const int d = m_depth;
+
+    x *= w; // Scale the coordinates by the image extents.
+    y *= h;
+    z *= d;
+
+    const float fracX = frac(x); // Interpolation weights handed to trilerp().
+    const float fracY = frac(y);
+    const float fracZ = frac(z);
+
+    //x -= fracX;
+    //y -= fracY;
+    //z -= fracZ;
+
+    // @@ Using floor in some places, but round in others?
+    const int ix0 = ::clamp(ifloor(x), 0, w-1); // Base texel and its +1 neighbour, each clamped to [0, extent - 1].
+    const int iy0 = ::clamp(ifloor(y), 0, h-1);
+    const int iz0 = ::clamp(ifloor(z), 0, d-1);
+    const int ix1 = ::clamp(ifloor(x)+1, 0, w-1);
+    const int iy1 = ::clamp(ifloor(y)+1, 0, h-1);
+    const int iz1 = ::clamp(ifloor(z)+1, 0, d-1);
+
+    return trilerp(c, ix0, iy0, iz0, ix1, iy1, iz1, fracX, fracY, fracZ);
+}
+
+float FloatImage::sampleLinearRepeat(uint c, float x, float y, float z) const // 3D trilinear sample; coordinates are wrapped with frac() before indexing.
+{
+    const int w = m_width;
+    const int h = m_height;
+    const int d = m_depth;
+
+    const float fracX = frac(x * w); // Interpolation weights handed to trilerp().
+    const float fracY = frac(y * h);
+    const float fracZ = frac(z * d);
+
+    int ix0 = ifloor(frac(x) * w); // frac() wraps the unscaled coordinate before it is turned into an index.
+    int iy0 = ifloor(frac(y) * h);
+    int iz0 = ifloor(frac(z) * d);
+    int ix1 = ifloor(frac(x + 1.0f/w) * w); // Neighbour: shift by 1/extent, then wrap the same way.
+    int iy1 = ifloor(frac(y + 1.0f/h) * h);
+    int iz1 = ifloor(frac(z + 1.0f/d) * d);
+
+    return trilerp(c, ix0, iy0, iz0, ix1, iy1, iz1, fracX, fracY, fracZ);
+}
+
+float FloatImage::sampleLinearMirror(uint c, float x, float y, float z) const // 3D trilinear sample; texel indices are wrapped with wrapMirror.
+{
+    const int w = m_width;
+    const int h = m_height;
+    const int d = m_depth;
+
+    x *= w; // Scale the coordinates by the image extents.
+    y *= h;
+    z *= d;
+    // Base texel uses floor (as in sampleLinearClamp) so it pairs with the frac() weights; rounding made the result jump at every half texel.
+    int ix0 = wrapMirror(ifloor(x), w);
+    int iy0 = wrapMirror(ifloor(y), h);
+    int iz0 = wrapMirror(ifloor(z), d);
+    int ix1 = wrapMirror(ifloor(x) + 1, w);
+    int iy1 = wrapMirror(ifloor(y) + 1, h);
+    int iz1 = wrapMirror(ifloor(z) + 1, d);
+
+    const float fracX = frac(x); // Interpolation weights handed to trilerp().
+    const float fracY = frac(y);
+    const float fracZ = frac(z);
+
+    return trilerp(c, ix0, iy0, iz0, ix1, iy1, iz1, fracX, fracY, fracZ);
+}
+
+
+/// Fast 2D downsampling using a box filter. Returns a newly allocated image.
+///
+/// The extents of the image are divided by two and rounded down (but never below 1).
+///
+/// When the size of the image is odd, this uses a polyphase box filter as explained in:
+/// http://developer.nvidia.com/object/np2_mipmapping.html
+/// Debug builds check that the depth is 1 and that the image is not 1x1.
+FloatImage * FloatImage::fastDownSample() const
+{
+    nvDebugCheck(m_depth == 1);
+    nvDebugCheck(m_width != 1 || m_height != 1);
+
+    AutoPtr<FloatImage> dst_image( new FloatImage() );
+
+    const uint w = max(1, m_width / 2);
+    const uint h = max(1, m_height / 2);
+    dst_image->allocate(m_componentCount, w, h);
+
+    // 1D box filter: the image is a single row or a single column.
+    if (m_width == 1 || m_height == 1)
+    {
+        const uint n = w * h;   // Number of output texels (one of w, h is 1 here).
+
+        if ((m_width * m_height) & 1)
+        {
+            const float scale = 1.0f / (2 * n + 1);   // The three weights below always sum to 2n + 1.
+
+            for(uint c = 0; c < m_componentCount; c++)
+            {
+                const float * src = this->channel(c);
+                float * dst = dst_image->channel(c);
+
+                for(uint x = 0; x < n; x++)
+                {
+                    const float w0 = float(n - x);
+                    const float w1 = float(n - 0);
+                    const float w2 = float(1 + x);
+
+                    *dst++ = scale * (w0 * src[0] + w1 * src[1] + w2 * src[2]);
+                    src += 2;   // Consecutive windows overlap by one source texel.
+                }
+            }
+        }
+        else
+        {
+            for(uint c = 0; c < m_componentCount; c++)
+            {
+                const float * src = this->channel(c);
+                float * dst = dst_image->channel(c);
+
+                for(uint x = 0; x < n; x++)
+                {
+                    *dst = 0.5f * (src[0] + src[1]);
+                    dst++;
+                    src += 2;
+                }
+            }
+        }
+    }
+
+    // Regular box filter: both extents are even, so every output texel averages a 2x2 block.
+    else if ((m_width & 1) == 0 && (m_height & 1) == 0)
+    {
+        for(uint c = 0; c < m_componentCount; c++)
+        {
+            const float * src = this->channel(c);
+            float * dst = dst_image->channel(c);
+
+            for(uint y = 0; y < h; y++)
+            {
+                for(uint x = 0; x < w; x++)
+                {
+                    *dst = 0.25f * (src[0] + src[1] + src[m_width] + src[m_width + 1]);
+                    dst++;
+                    src += 2;
+                }
+
+                src += m_width;   // The inner loop consumed one source row; skip the second row of the blocks.
+            }
+        }
+    }
+
+    // Polyphase filters: both extents are odd, so every output texel weights a 3x3 window.
+    else if (m_width & 1 && m_height & 1)
+    {
+        nvDebugCheck(m_width == 2 * w + 1);
+        nvDebugCheck(m_height == 2 * h + 1);
+
+        const float scale = 1.0f / (m_width * m_height);   // Row weights sum to m_height, column weights to m_width.
+
+        for(uint c = 0; c < m_componentCount; c++)
+        {
+            const float * src = this->channel(c);
+            float * dst = dst_image->channel(c);
+
+            for(uint y = 0; y < h; y++)
+            {
+                const float v0 = float(h - y);
+                const float v1 = float(h - 0);
+                const float v2 = float(1 + y);
+
+                for (uint x = 0; x < w; x++)
+                {
+                    const float w0 = float(w - x);
+                    const float w1 = float(w - 0);
+                    const float w2 = float(1 + x);
+
+                    float f = 0.0f;
+                    f += v0 * (w0 * src[0 * m_width + 2 * x] + w1 * src[0 * m_width + 2 * x + 1] + w2 * src[0 * m_width + 2 * x + 2]);
+                    f += v1 * (w0 * src[1 * m_width + 2 * x] + w1 * src[1 * m_width + 2 * x + 1] + w2 * src[1 * m_width + 2 * x + 2]);
+                    f += v2 * (w0 * src[2 * m_width + 2 * x] + w1 * src[2 * m_width + 2 * x + 1] + w2 * src[2 * m_width + 2 * x + 2]);
+
+                    *dst = f * scale;
+                    dst++;
+                }
+
+                src += 2 * m_width;   // Each output row starts two source rows further down.
+            }
+        }
+    }
+    else if (m_width & 1)   // Odd width, even height: 3 weighted columns by 2 rows.
+    {
+        nvDebugCheck(m_width == 2 * w + 1);
+        const float scale = 1.0f / (2 * m_width);   // Column weights sum to m_width, and two rows are added.
+
+        for(uint c = 0; c < m_componentCount; c++)
+        {
+            const float * src = this->channel(c);
+            float * dst = dst_image->channel(c);
+
+            for(uint y = 0; y < h; y++)
+            {
+                for (uint x = 0; x < w; x++)
+                {
+                    const float w0 = float(w - x);
+                    const float w1 = float(w - 0);
+                    const float w2 = float(1 + x);
+
+                    float f = 0.0f;
+                    f += w0 * (src[2 * x + 0] + src[m_width + 2 * x + 0]);
+                    f += w1 * (src[2 * x + 1] + src[m_width + 2 * x + 1]);
+                    f += w2 * (src[2 * x + 2] + src[m_width + 2 * x + 2]);
+
+                    *dst = f * scale;
+                    dst++;
+                }
+
+                src += 2 * m_width;
+            }
+        }
+    }
+    else if (m_height & 1)   // Even width, odd height: 2 columns by 3 weighted rows.
+    {
+        nvDebugCheck(m_height == 2 * h + 1);
+
+        const float scale = 1.0f / (2 * m_height);   // Row weights sum to m_height, and two columns are added.
+
+        for(uint c = 0; c < m_componentCount; c++)
+        {
+            const float * src = this->channel(c);
+            float * dst = dst_image->channel(c);
+
+            for(uint y = 0; y < h; y++)
+            {
+                const float v0 = float(h - y);
+                const float v1 = float(h - 0);
+                const float v2 = float(1 + y);
+
+                for (uint x = 0; x < w; x++)
+                {
+                    float f = 0.0f;
+                    f += v0 * (src[0 * m_width + 2 * x] + src[0 * m_width + 2 * x + 1]);
+                    f += v1 * (src[1 * m_width + 2 * x] + src[1 * m_width + 2 * x + 1]);
+                    f += v2 * (src[2 * m_width + 2 * x] + src[2 * m_width + 2 * x + 1]);
+
+                    *dst = f * scale;
+                    dst++;
+                }
+
+                src += 2 * m_width;
+            }
+        }
+    }
+
+    return dst_image.release();   // Ownership of the new image passes to the caller.
+}
+
+/// Halve every extent (never below 1) and resample with the given separable filter.
+FloatImage * FloatImage::downSample(const Filter & filter, WrapMode wm) const
+{
+    // Integer halving rounds down; max() keeps each extent at 1 or more.
+    const uint halfWidth = max(1, m_width / 2);
+    const uint halfHeight = max(1, m_height / 2);
+    const uint halfDepth = max(1, m_depth / 2);
+    return resize(filter, halfWidth, halfHeight, halfDepth, wm);
+}
+
+/// Halve every extent (never below 1) and resample, forwarding the alpha channel index to resize().
+FloatImage * FloatImage::downSample(const Filter & filter, WrapMode wm, uint alpha) const
+{
+    // Integer halving rounds down; max() keeps each extent at 1 or more.
+    const uint halfWidth = max(1, m_width / 2);
+    const uint halfHeight = max(1, m_height / 2);
+    const uint halfDepth = max(1, m_depth / 2);
+    return resize(filter, halfWidth, halfHeight, halfDepth, wm, alpha);
+}
+
+
+/// Resize to w x h applying a 1D polyphase kernel separately in X and then in Y. The depth is kept.
+FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, WrapMode wm) const
+{
+    // @@ Use monophase filters when frac(m_width / w) == 0
+
+    AutoPtr<FloatImage> tmp_image( new FloatImage() );
+    AutoPtr<FloatImage> dst_image( new FloatImage() );
+
+    PolyphaseKernel xkernel(filter, m_width, w, 32);
+    PolyphaseKernel ykernel(filter, m_height, h, 32);
+
+    // @@ Select fastest filtering order:
+    //if (w * m_height <= h * m_width)
+    {
+        // One plane per slice: the loops below touch plane(c, z) for every z < m_depth.
+        tmp_image->allocate(m_componentCount, w, m_height, m_depth);
+        dst_image->allocate(m_componentCount, w, h, m_depth);
+        // @@ We could avoid this allocation, write directly to dst_plane.
+        //Array<float> tmp_column(h);
+        //tmp_column.resize(h);
+
+        for (uint c = 0; c < m_componentCount; c++)
+        {
+            for (uint z = 0; z < m_depth; z++)
+            {
+                float * tmp_plane = tmp_image->plane(c, z);
+
+                // X pass: resample every source row from m_width to w texels.
+                for (uint y = 0; y < m_height; y++) {
+                    this->applyKernelX(xkernel, y, z, c, wm, tmp_plane + y * w);
+                }//);
+
+                float * dst_plane = dst_image->plane(c, z);
+
+                for (uint x = 0; x < w; x++) {
+                    // Y pass: resample column x, writing with a stride of w straight into dst_plane.
+                    tmp_image->applyKernelY(ykernel, x, z, c, wm, dst_plane + x, w);
+
+                    // @@ We could avoid this copy, write directly to dst_plane.
+                    /*for (uint y = 0; y < h; y++) {
+                        dst_plane[y * w + x] = tmp_column[y];
+                    }*/
+                }//);
+            }
+        }
+    }
+
+    return dst_image.release();   // Ownership of the new image passes to the caller.
+}
+
+/// Resize to w x h x d applying a 1D polyphase kernel separately along X, then Z, then Y. (for 3d textures)
+FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, uint d, WrapMode wm) const
+{
+    // @@ Use monophase filters when frac(m_width / w) == 0
+
+    // Use the existing 2d version if we are not resizing in the Z axis:
+    if (m_depth == d) {
+        return resize(filter, w, h, wm);
+    }
+
+    AutoPtr<FloatImage> tmp_image( new FloatImage() );
+    AutoPtr<FloatImage> tmp_image2( new FloatImage() );
+    AutoPtr<FloatImage> dst_image( new FloatImage() );
+
+    PolyphaseKernel xkernel(filter, m_width, w, 32);
+    PolyphaseKernel ykernel(filter, m_height, h, 32);
+    PolyphaseKernel zkernel(filter, m_depth, d, 32);
+
+    tmp_image->allocate(m_componentCount, w, m_height, m_depth);
+    tmp_image2->allocate(m_componentCount, w, m_height, d);
+    dst_image->allocate(m_componentCount, w, h, d);
+
+    Array<float> tmp_column(h);
+    tmp_column.resize(max(h, d));   // Receives d values in the Z pass and h values in the Y pass.
+
+    for (uint c = 0; c < m_componentCount; c++)
+    {
+        float * tmp_channel = tmp_image->channel(c);
+
+        // X pass: resample every row of every slice from m_width to w texels.
+        for (uint z = 0; z < m_depth; z++ ) {
+            for (uint y = 0; y < m_height; y++) {
+                this->applyKernelX(xkernel, y, z, c, wm, tmp_channel + z * m_height * w + y * w);
+            }
+        }
+
+        // Z pass: resample every (x, y) column of slices from m_depth to d.
+        float * tmp2_channel = tmp_image2->channel(c);
+        for (uint y = 0; y < m_height; y++) {
+            for (uint x = 0; x < w; x++) {
+                tmp_image->applyKernelZ(zkernel, x, y, c, wm, tmp_column.buffer() );
+
+                for (uint z = 0; z < d; z++) {
+                    tmp2_channel[z * m_height * w + y * w + x] = tmp_column[z];
+                }
+            }
+        }
+
+        // Y pass: resample every column of every slice from m_height to h texels.
+        float * dst_channel = dst_image->channel(c);
+
+        for (uint z = 0; z < d; z++ ) {
+            for (uint x = 0; x < w; x++) {
+                tmp_image2->applyKernelY(ykernel, x, z, c, wm, tmp_column.buffer(), 1);
+
+                for (uint y = 0; y < h; y++) {
+                    dst_channel[z * h * w + y * w + x] = tmp_column[y];
+                }
+            }
+        }
+    }
+
+    return dst_image.release();   // Ownership of the new image passes to the caller.
+}
+
+
+/// Resize to w x h applying a 1D polyphase kernel in X and then in Y, processing channel `alpha` first.
+FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, WrapMode wm, uint alpha) const
+{
+    nvCheck(alpha < m_componentCount);
+
+    AutoPtr<FloatImage> tmp_image( new FloatImage() );
+    AutoPtr<FloatImage> dst_image( new FloatImage() );
+
+    PolyphaseKernel xkernel(filter, m_width, w, 32);
+    PolyphaseKernel ykernel(filter, m_height, h, 32);
+
+    {
+        // One plane per slice: the loops below touch plane(c, z) for every z < m_depth.
+        tmp_image->allocate(m_componentCount, w, m_height, m_depth);
+        dst_image->allocate(m_componentCount, w, h, m_depth);
+        Array<float> tmp_column(h);
+        tmp_column.resize(h);
+
+        for (uint i = 0; i < m_componentCount; i++)
+        {
+            // Process alpha channel first; here `alpha` only changes the order in which channels are visited.
+            uint c;
+            if (i == 0) c = alpha;
+            else if (i > alpha) c = i;
+            else c = i - 1;
+
+            for (uint z = 0; z < m_depth; z++)
+            {
+                float * tmp_plane = tmp_image->plane(c, z);
+
+                for (uint y = 0; y < m_height; y++) {   // X pass: m_width -> w texels per row.
+                    this->applyKernelX(xkernel, y, z, c, wm, tmp_plane + y * w);
+                }
+
+                float * dst_plane = dst_image->plane(c, z);
+
+                for (uint x = 0; x < w; x++) {   // Y pass: m_height -> h texels per column.
+                    tmp_image->applyKernelY(ykernel, x, z, c, wm, tmp_column.buffer(), 1);
+
+                    // @@ Avoid this copy, write directly to dst_plane.
+                    for (uint y = 0; y < h; y++) {
+                        dst_plane[y * w + x] = tmp_column[y];
+                    }
+                }
+            }
+        }
+    }
+
+    return dst_image.release();   // Ownership of the new image passes to the caller.
+}
+
+
+/// Resize to w x h x d applying a 1D polyphase kernel along X, then Z, then Y, processing channel `alpha` first.
+FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, uint d, WrapMode wm, uint alpha) const
+{
+    nvCheck(alpha < m_componentCount);
+
+    // Use the existing 2d version if we are not resizing in the Z axis:
+    if (m_depth == d) {
+        return resize( filter, w, h, wm, alpha );
+    }
+
+    AutoPtr<FloatImage> tmp_image( new FloatImage() );
+    AutoPtr<FloatImage> tmp_image2( new FloatImage() );
+    AutoPtr<FloatImage> dst_image( new FloatImage() );
+
+    PolyphaseKernel xkernel(filter, m_width, w, 32);
+    PolyphaseKernel ykernel(filter, m_height, h, 32);
+    PolyphaseKernel zkernel(filter, m_depth, d, 32);
+
+    tmp_image->allocate(m_componentCount, w, m_height, m_depth);
+    tmp_image2->allocate(m_componentCount, w, m_height, d);
+    dst_image->allocate(m_componentCount, w, h, d);
+
+    Array<float> tmp_column(h);
+    tmp_column.resize(max(h, d));   // Receives d values in the Z pass and h values in the Y pass.
+
+    for (uint i = 0; i < m_componentCount; i++)
+    {
+        // Process alpha channel first; here `alpha` only changes the order in which channels are visited.
+        uint c;
+        if (i == 0) c = alpha;
+        else if (i > alpha) c = i;
+        else c = i - 1;
+
+        float * tmp_channel = tmp_image->channel(c);
+
+        for (uint z = 0; z < m_depth; z++ ) {   // X pass: m_width -> w texels per row.
+            for (uint y = 0; y < m_height; y++) {
+                this->applyKernelX(xkernel, y, z, c, wm, tmp_channel + z * m_height * w + y * w);
+            }
+        }
+
+        float * tmp2_channel = tmp_image2->channel(c);
+        for (uint y = 0; y < m_height; y++) {   // Z pass: m_depth -> d slices per (x, y).
+            for (uint x = 0; x < w; x++) {
+                tmp_image->applyKernelZ(zkernel, x, y, c, wm, tmp_column.buffer() );
+
+                for (uint z = 0; z < d; z++) {
+                    tmp2_channel[z * m_height * w + y * w + x] = tmp_column[z];
+                }
+            }
+        }
+
+        float * dst_channel = dst_image->channel(c);
+
+        for (uint z = 0; z < d; z++ ) {   // Y pass: m_height -> h texels per column.
+            for (uint x = 0; x < w; x++) {
+                tmp_image2->applyKernelY(ykernel, x, z, c, wm, tmp_column.buffer(), 1);
+
+                for (uint y = 0; y < h; y++) {
+                    dst_channel[z * h * w + y * w + x] = tmp_column[y];
+                }
+            }
+        }
+    }
+
+    return dst_image.release();   // Ownership of the new image passes to the caller.
+}
+
+/// Convolve channel c in place with the 2D kernel k, one slice at a time.
+void FloatImage::convolve(const Kernel2 & k, uint c, WrapMode wm)
+{
+    AutoPtr<FloatImage> tmpImage(clone());   // Read from an unmodified copy while writing in place.
+
+    uint w = m_width;
+    uint h = m_height;
+    uint d = m_depth;
+
+    for (uint z = 0; z < d; z++)
+    {
+        for (uint y = 0; y < h; y++)
+        {
+            for (uint x = 0; x < w; x++)
+            {
+                pixel(c, x, y, z) = tmpImage->applyKernelXY(&k, x, y, z, c, wm);   // Store into the slice being filtered, not slice 0.
+            }
+        }
+    }
+}
+
+
+/// Apply 2D kernel centered at (x, y) in slice z of channel c and return the weighted sum.
+float FloatImage::applyKernelXY(const Kernel2 * k, int x, int y, int z, uint c, WrapMode wm) const
+{
+    nvDebugCheck(k != NULL);
+
+    const uint kernelWindow = k->windowSize();
+    const int kernelOffset = int(kernelWindow / 2);
+
+    const float * channel = this->channel(c);   // index() below is given z, so start from the channel base like the 1D kernels do.
+
+    float sum = 0.0f;
+    for (uint i = 0; i < kernelWindow; i++)
+    {
+        int src_y = int(y + i) - kernelOffset;
+
+        for (uint e = 0; e < kernelWindow; e++)
+        {
+            int src_x = int(x + e) - kernelOffset;
+
+            int idx = this->index(src_x, src_y, z, wm);   // Out-of-range coordinates are handed to index() together with the wrap mode.
+
+            sum += k->valueAt(e, i) * channel[idx];
+        }
+    }
+
+    return sum;
+}
+
+
+/// Convolve a horizontal 1D kernel centered at (x, y, z) in channel c and return the sum.
+float FloatImage::applyKernelX(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const
+{
+    nvDebugCheck(k != NULL);
+
+    const float * const texels = this->channel(c);
+    const uint tapCount = k->windowSize();
+
+    // The first tap sits half a window before x.
+    const int halfWindow = int(tapCount / 2);
+
+    float accum = 0.0f;
+    for (uint tap = 0; tap != tapCount; ++tap)
+    {
+        // The column may fall outside the image; index() receives the wrap mode.
+        const int column = int(x + tap) - halfWindow;
+        accum += k->valueAt(tap) * texels[this->index(column, y, z, wm)];
+    }
+
+    return accum;
+}
+
+/// Convolve a vertical 1D kernel centered at (x, y, z) in channel c and return the sum.
+float FloatImage::applyKernelY(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const
+{
+    nvDebugCheck(k != NULL);
+
+    const float * const texels = this->channel(c);
+    const uint tapCount = k->windowSize();
+
+    // The first tap sits half a window before y.
+    const int halfWindow = int(tapCount / 2);
+
+    float accum = 0.0f;
+    for (uint tap = 0; tap != tapCount; ++tap)
+    {
+        // The row may fall outside the image; index() receives the wrap mode.
+        const int row = int(y + tap) - halfWindow;
+        accum += k->valueAt(tap) * texels[this->index(x, row, z, wm)];
+    }
+
+    return accum;
+}
+
+/// Convolve a 1D kernel along Z centered at (x, y, z) in channel c and return the sum.
+float FloatImage::applyKernelZ(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const
+{
+    nvDebugCheck(k != NULL);
+
+    const float * const texels = this->channel(c);
+    const uint tapCount = k->windowSize();
+
+    // The first tap sits half a window before z.
+    const int halfWindow = int(tapCount / 2);
+
+    float accum = 0.0f;
+    for (uint tap = 0; tap != tapCount; ++tap)
+    {
+        // The slice may fall outside the image; index() receives the wrap mode.
+        const int slice = int(z + tap) - halfWindow;
+        accum += k->valueAt(tap) * texels[this->index(x, y, slice, wm)];
+    }
+
+    return accum;
+}
+
+
+/// Apply 1D horizontal polyphase kernel to row (y, z) of channel c, writing k.length() values to output.
+void FloatImage::applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, WrapMode wm, float * __restrict output) const
+{
+    const uint length = k.length();   // Number of output samples.
+    const float scale = float(length) / float(m_width);
+    const float iscale = 1.0f / scale;   // Source texels per output sample.
+
+    const float width = k.width();
+    const int windowSize = k.windowSize();
+
+    const float * channel = this->channel(c);
+
+    for (uint i = 0; i < length; i++)
+    {
+        const float center = (0.5f + i) * iscale;   // Center of output sample i in source coordinates.
+
+        const int left = (int)floorf(center - width);   // First source column of the window.
+        const int right = (int)ceilf(center + width);   // Only used by the debug check below.
+        nvDebugCheck(right - left <= windowSize);
+
+        float sum = 0;
+        for (int j = 0; j < windowSize; ++j)
+        {
+            const int idx = this->index(left + j, y, z, wm);
+
+            sum += k.valueAt(i, j) * channel[idx];
+        }
+
+        output[i] = sum;
+    }
+}
+
+/// Apply 1D vertical polyphase kernel to column (x, z) of channel c, writing k.length() values output_stride floats apart.
+void FloatImage::applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, WrapMode wm, float * __restrict output, int output_stride) const
+{
+    const uint length = k.length();   // Number of output samples.
+    const float scale = float(length) / float(m_height);
+    const float iscale = 1.0f / scale;   // Source texels per output sample.
+
+    const float width = k.width();
+    const int windowSize = k.windowSize();
+
+    const float * channel = this->channel(c);
+
+    for (uint i = 0; i < length; i++)
+    {
+        const float center = (0.5f + i) * iscale;   // Center of output sample i in source coordinates.
+
+        const int left = (int)floorf(center - width);   // First source row of the window.
+        const int right = (int)ceilf(center + width);   // Only used by the check below.
+        nvCheck(right - left <= windowSize);
+
+        float sum = 0;
+        for (int j = 0; j < windowSize; ++j)
+        {
+            const int idx = this->index(x, j+left, z, wm);
+
+            sum += k.valueAt(i, j) * channel[idx];
+        }
+
+        output[i * output_stride] = sum;   // A stride of 1 fills a packed column buffer.
+    }
+}
+
+/// Apply 1D polyphase kernel along Z at (x, y) of channel c, writing k.length() values to output.
+void FloatImage::applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, WrapMode wm, float * __restrict output) const
+{
+    const uint length = k.length();   // Number of output samples.
+    const float scale = float(length) / float(m_depth);   // The Z kernel resamples the depth, so scale by m_depth.
+    const float iscale = 1.0f / scale;   // Source slices per output sample.
+
+    const float width = k.width();
+    const int windowSize = k.windowSize();
+
+    const float * channel = this->channel(c);
+
+    for (uint i = 0; i < length; i++)
+    {
+        const float center = (0.5f + i) * iscale;   // Center of output sample i in source coordinates.
+
+        const int left = (int)floorf(center - width);   // First source slice of the window.
+        const int right = (int)ceilf(center + width);   // Only used by the check below.
+        nvCheck(right - left <= windowSize);
+
+        float sum = 0;
+        for (int j = 0; j < windowSize; ++j)
+        {
+            const int idx = this->index(x, y, j+left, wm);
+
+            sum += k.valueAt(i, j) * channel[idx];
+        }
+
+        output[i] = sum;
+    }
+}
+
+
+/// Apply 1D horizontal polyphase kernel to row (y, z) of channel c, weighting each tap by channel a (alpha).
+void FloatImage::applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, uint a, WrapMode wm, float * __restrict output) const
+{
+    const uint length = k.length();   // Number of output samples.
+    const float scale = float(length) / float(m_width);
+    const float iscale = 1.0f / scale;   // Source texels per output sample.
+
+    const float width = k.width();
+    const int windowSize = k.windowSize();
+
+    const float * channel = this->channel(c);
+    const float * alpha = this->channel(a);
+
+    for (uint i = 0; i < length; i++)
+    {
+        const float center = (0.5f + i) * iscale;   // Center of output sample i in source coordinates.
+
+        const int left = (int)floorf(center - width);   // First source column of the window.
+        const int right = (int)ceilf(center + width);   // Only used by the debug check below.
+        nvDebugCheck(right - left <= windowSize);
+
+        float norm = 0.0f;   // Sum of the alpha-weighted kernel taps.
+        float sum = 0;
+        for (int j = 0; j < windowSize; ++j)
+        {
+            const int idx = this->index(left + j, y, z, wm);
+
+            float w = k.valueAt(i, j) * (alpha[idx] + (1.0f / 256.0f));   // The 1/256 bias keeps a tap's weight nonzero where alpha is 0.
+            norm += w;
+            sum += w * channel[idx];
+        }
+
+        output[i] = sum / norm;   // Renormalize by the total weight.
+    }
+}
+
+/// Apply 1D vertical polyphase kernel to column (x, z) of channel c, weighting each tap by channel a (alpha).
+void FloatImage::applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, uint a, WrapMode wm, float * __restrict output, int output_stride) const
+{
+    const uint length = k.length();   // Number of output samples.
+    const float scale = float(length) / float(m_height);
+    const float iscale = 1.0f / scale;   // Source texels per output sample.
+
+    const float width = k.width();
+    const int windowSize = k.windowSize();
+
+    const float * channel = this->channel(c);
+    const float * alpha = this->channel(a);
+
+    for (uint i = 0; i < length; i++)
+    {
+        const float center = (0.5f + i) * iscale;   // Center of output sample i in source coordinates.
+
+        const int left = (int)floorf(center - width);   // First source row of the window.
+        const int right = (int)ceilf(center + width);   // Only used by the check below.
+        nvCheck(right - left <= windowSize);
+
+        float norm = 0;   // Sum of the alpha-weighted kernel taps.
+        float sum = 0;
+        for (int j = 0; j < windowSize; ++j)
+        {
+            const int idx = this->index(x, j+left, z, wm);
+
+            float w = k.valueAt(i, j) * (alpha[idx] + (1.0f / 256.0f));   // The 1/256 bias keeps a tap's weight nonzero where alpha is 0.
+            norm += w;
+            sum += w * channel[idx];
+        }
+
+        output[i * output_stride] = sum / norm;   // Renormalize by the total weight.
+    }
+}
+
+/// Apply 1D polyphase kernel along Z at (x, y) of channel c, weighting each tap by channel a (alpha).
+void FloatImage::applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, uint a, WrapMode wm, float * __restrict output) const
+{
+    const uint length = k.length();   // Number of output samples.
+    const float scale = float(length) / float(m_depth);   // This kernel walks the Z axis, so scale by m_depth.
+    const float iscale = 1.0f / scale;   // Source slices per output sample.
+
+    const float width = k.width();
+    const int windowSize = k.windowSize();
+
+    const float * channel = this->channel(c);
+    const float * alpha = this->channel(a);
+
+    for (uint i = 0; i < length; i++)
+    {
+        const float center = (0.5f + i) * iscale;   // Center of output sample i in source coordinates.
+
+        const int left = (int)floorf(center - width);   // First source slice of the window.
+        const int right = (int)ceilf(center + width);   // Only used by the debug check below.
+        nvDebugCheck(right - left <= windowSize);
+
+        float norm = 0.0f;   // Sum of the alpha-weighted kernel taps.
+        float sum = 0;
+        for (int j = 0; j < windowSize; ++j)
+        {
+            const int idx = this->index(x, y, left + j, wm);
+
+            float w = k.valueAt(i, j) * (alpha[idx] + (1.0f / 256.0f));   // The 1/256 bias keeps a tap's weight nonzero where alpha is 0.
+            norm += w;
+            sum += w * channel[idx];
+        }
+
+        output[i] = sum / norm;   // Renormalize by the total weight.
+    }
+}
+
+
+void FloatImage::flipX()
+{
+    const uint width = m_width;
+    const uint halfWidth = width / 2;
+
+    // Mirror each scanline in place by swapping texels pairwise from both ends.
+    // The middle column of an odd-width image stays where it is.
+    for (uint channelIdx = 0; channelIdx < m_componentCount; ++channelIdx) {
+        for (uint slice = 0; slice < m_depth; ++slice) {
+            for (uint row = 0; row < m_height; ++row) {
+                float * const texels = scanline(channelIdx, row, slice);
+                for (uint left = 0, right = width - 1; left < halfWidth; ++left, --right) {
+                    swap(texels[left], texels[right]);
+                }
+            }
+        }
+    }
+}
+
+void FloatImage::flipY()
+{
+    const uint height = m_height;
+    const uint halfHeight = height / 2;
+
+    // Exchange row r with row (height - 1 - r), texel by texel, in every channel and slice.
+    // The middle row of an odd-height image stays where it is.
+    for (uint channelIdx = 0; channelIdx < m_componentCount; ++channelIdx) {
+        for (uint slice = 0; slice < m_depth; ++slice) {
+            for (uint row = 0; row < halfHeight; ++row) {
+                float * const upper = scanline(channelIdx, row, slice);
+                float * const lower = scanline(channelIdx, height - 1 - row, slice);
+                for (uint col = 0; col < m_width; ++col) {
+                    swap(upper[col], lower[col]);
+                }
+            }
+        }
+    }
+}
+
+void FloatImage::flipZ()
+{
+    const uint depth = m_depth;
+    const uint halfDepth = depth / 2;
+    const uint texelsPerSlice = uint(m_width) * uint(m_height);
+
+    // Exchange slice s with slice (depth - 1 - s), texel by texel, in every channel.
+    for (uint channelIdx = 0; channelIdx < m_componentCount; ++channelIdx) {
+        for (uint slice = 0; slice < halfDepth; ++slice) {
+            float * const front = plane(channelIdx, slice);
+            float * const back = plane(channelIdx, depth - 1 - slice);
+            for (uint t = 0; t < texelsPerSlice; ++t) {
+                swap(front[t], back[t]);
+            }
+        }
+    }
+}
+
+
+
+float FloatImage::alphaTestCoverage(float alphaRef, int alphaChannel, float alphaScale/*=1*/) const
+{
+    const uint w = m_width;
+    const uint h = m_height;
+    // Counts the interpolated sub-samples whose scaled, saturated alpha exceeds alphaRef.
+    float coverage = 0.0f;
+
+#if 0
+    const float * alpha = channel(alphaChannel);
+
+    const uint count = m_pixelCount;
+    for (uint i = 0; i < count; i++) {
+        if (alpha[i] > alphaRef) coverage += 1.0f; // @@ gt or lt?
+    }
+    
+    return coverage / float(w * h);
+#else
+    const uint n = 8;   // Sub-samples per axis inside each 2x2 texel neighborhood.
+
+    // Visit every 2x2 neighborhood of slice 0 and test n*n bilinearly interpolated alpha samples in it:
+    for (uint y = 0; y < h-1; y++) {
+        for (uint x = 0; x < w-1; x++) {
+
+            float alpha00 = nv::saturate(pixel(alphaChannel, x+0, y+0, 0) * alphaScale);
+            float alpha10 = nv::saturate(pixel(alphaChannel, x+1, y+0, 0) * alphaScale);
+            float alpha01 = nv::saturate(pixel(alphaChannel, x+0, y+1, 0) * alphaScale);
+            float alpha11 = nv::saturate(pixel(alphaChannel, x+1, y+1, 0) * alphaScale);
+
+            for (uint sy = 0; sy < n; sy++) {
+                float fy = (sy + 0.5f) / n;   // Sub-sample centers at (k + 0.5) / n.
+                for (uint sx = 0; sx < n; sx++) {
+                    float fx = (sx + 0.5f) / n;
+                    float alpha = alpha00 * (1 - fx) * (1 - fy) + alpha10 * fx * (1 - fy) + alpha01 * (1 - fx) * fy + alpha11 * fx * fy;
+                    if (alpha > alphaRef) coverage += 1.0f;
+                }
+            }
+        }
+    }
+    // NOTE(review): (w-1)*(h-1) neighborhoods are sampled but the count is normalized by w*h; confirm this is intended.
+    return coverage / float(w * h * n * n);
+#endif
+}
+
+void FloatImage::scaleAlphaToCoverage(float desiredCoverage, float alphaRef, int alphaChannel)
+{
+#if 0
+    float minAlphaRef = 0.0f;
+    float maxAlphaRef = 1.0f;
+    float midAlphaRef = 0.5f;
+
+    // Determine desired scale using a binary search. Hardcoded to 10 steps max.
+    for (int i = 0; i < 10; i++) {
+        float currentCoverage = alphaTestCoverage(midAlphaRef, alphaChannel);
+
+        if (currentCoverage > desiredCoverage) {
+            minAlphaRef = midAlphaRef;
+        }
+        else if (currentCoverage < desiredCoverage) {
+            maxAlphaRef = midAlphaRef;
+        }
+        else {
+            break;
+        }
+
+        midAlphaRef = (minAlphaRef + maxAlphaRef) * 0.5f;
+    }
+
+    float alphaScale = alphaRef / midAlphaRef;
+
+    // Scale alpha channel.
+    scaleBias(alphaChannel, 1, alphaScale, 0.0f);
+    clamp(alphaChannel, 1, 0.0f, 1.0f); 
+#else
+    float minAlphaScale = 0.0f;
+    float maxAlphaScale = 4.0f;
+    float alphaScale = 1.0f;
+    float bestAlphaScale = 1.0f;
+    float bestError = NV_FLOAT_MAX;
+    // Search for the alpha scale whose alpha-test coverage at alphaRef is closest to desiredCoverage,
+    // by bisection over [0, 4] starting at 1. Hardcoded to 10 steps max.
+    for (int i = 0; i < 10; i++) {
+        float currentCoverage = alphaTestCoverage(alphaRef, alphaChannel, alphaScale);
+
+        float error = fabsf(currentCoverage - desiredCoverage);
+        if (error < bestError) {   // Remember the best scale seen so far.
+            bestError = error;
+            bestAlphaScale = alphaScale;
+        }
+
+        if (currentCoverage < desiredCoverage) {   // Too little coverage: raise the lower bound.
+            minAlphaScale = alphaScale;
+        }
+        else if (currentCoverage > desiredCoverage) {   // Too much coverage: lower the upper bound.
+            maxAlphaScale = alphaScale;
+        }
+        else {
+            break;
+        }
+
+        alphaScale = (minAlphaScale + maxAlphaScale) * 0.5f;
+    }
+
+    // Scale the alpha channel by the best scale found, then clamp it to [0, 1].
+    scaleBias(alphaChannel, 1, bestAlphaScale, 0.0f);
+    clamp(alphaChannel, 1, 0.0f, 1.0f); 
+#endif
+#if _DEBUG
+    alphaTestCoverage(alphaRef, alphaChannel);   // Result is discarded; presumably kept for inspection in a debugger.
+#endif
+}
+
+FloatImage* FloatImage::clone() const
+{
+    // Build an image with identical dimensions, then copy the whole float buffer.
+    FloatImage* const duplicate = new FloatImage();
+    duplicate->allocate(m_componentCount, m_width, m_height, m_depth);
+
+    memcpy(duplicate->m_mem, m_mem, sizeof(float) * m_floatCount);
+    return duplicate;
+}
+
diff --git a/src/nvimage/FloatImage.h b/src/nvimage/FloatImage.h
index 10a236f..42cd86a 100644
--- a/src/nvimage/FloatImage.h
+++ b/src/nvimage/FloatImage.h
@@ -35,6 +35,7 @@ namespace nv
         };
 
         NVIMAGE_API FloatImage();
+        NVIMAGE_API FloatImage(const FloatImage & img);
         NVIMAGE_API FloatImage(const Image * img);
         NVIMAGE_API virtual ~FloatImage();
 
@@ -92,10 +93,10 @@ namespace nv
         NVIMAGE_API float applyKernelY(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const;
         NVIMAGE_API float applyKernelZ(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const;
         NVIMAGE_API void applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, WrapMode wm, float * output) const;
-        NVIMAGE_API void applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, WrapMode wm, float * output) const;
+        NVIMAGE_API void applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, WrapMode wm, float * output, int output_stride) const;
         NVIMAGE_API void applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, WrapMode wm, float * output) const;
         NVIMAGE_API void applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, uint a, WrapMode wm, float * output) const;
-        NVIMAGE_API void applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, uint a, WrapMode wm, float * output) const;
+        NVIMAGE_API void applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, uint a, WrapMode wm, float * output, int output_stride) const;
         NVIMAGE_API void applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, uint a, WrapMode wm, float * output) const;
 
 
diff --git a/src/nvimage/Image.cpp b/src/nvimage/Image.cpp
index 3d99108..0ac84a6 100644
--- a/src/nvimage/Image.cpp
+++ b/src/nvimage/Image.cpp
@@ -42,13 +42,21 @@ const Image & Image::operator=(const Image & img)
 
 void Image::allocate(uint w, uint h, uint d/*= 1*/)
 {
-    free();
     m_width = w;
     m_height = h;
 	m_depth = d;
     m_data = realloc<Color32>(m_data, w * h * d);
 }
 
+void Image::acquire(Color32 * data, uint w, uint h, uint d/*= 1*/)
+{
+    free();   // Runs before m_data is overwritten below; presumably releases the current buffer.
+    m_width = w;
+    m_height = h;
+    m_depth = d;
+    m_data = data;   // The caller's pointer is stored as-is; the pixels are not copied.
+}
+
 void Image::resize(uint w, uint h, uint d/*= 1*/) {
 
     Image img;
diff --git a/src/nvimage/Image.h b/src/nvimage/Image.h
index 643fd9d..e8f6fa6 100644
--- a/src/nvimage/Image.h
+++ b/src/nvimage/Image.h
@@ -34,6 +34,7 @@ namespace nv
 
 
         void allocate(uint w, uint h, uint d = 1);
+        void acquire(Color32 * data, uint w, uint h, uint d = 1);
         bool load(const char * name);
 
         void resize(uint w, uint h, uint d = 1);
diff --git a/src/nvimage/ImageIO.cpp b/src/nvimage/ImageIO.cpp
index 03c0410..047cf39 100644
--- a/src/nvimage/ImageIO.cpp
+++ b/src/nvimage/ImageIO.cpp
@@ -8,6 +8,8 @@
 #include "DirectDrawSurface.h"
 #include "PixelFormat.h"
 
+#include "nvthread/ParallelFor.h"
+
 #include "nvmath/Color.h"
 #include "nvmath/Half.h"
 
@@ -19,31 +21,31 @@
 #include "nvcore/TextWriter.h"
 
 // Extern
-#if defined(HAVE_FREEIMAGE)
+#if defined(NV_HAVE_FREEIMAGE)
 #   include <FreeImage.h>
 // If FreeImage available, do not use individual libraries, since that produces link conflicts in some platforms.
-#   undef HAVE_JPEG
-#   undef HAVE_PNG
-#   undef HAVE_TIFF
-#   undef HAVE_OPENEXR
+#   undef NV_HAVE_JPEG
+#   undef NV_HAVE_PNG
+#   undef NV_HAVE_TIFF
+#   undef NV_HAVE_OPENEXR
 #endif
 
-#if defined(HAVE_JPEG)
+#if defined(NV_HAVE_JPEG)
 extern "C" {
 #   include <jpeglib.h>
 }
 #endif
 
-#if defined(HAVE_PNG)
+#if defined(NV_HAVE_PNG)
 #   include <png.h>
 #endif
 
-#if defined(HAVE_TIFF)
+#if defined(NV_HAVE_TIFF)
 #   define _TIFF_DATA_TYPEDEFS_
 #   include <tiffio.h>
 #endif
 
-#if defined(HAVE_OPENEXR)
+#if defined(NV_HAVE_OPENEXR)
 #   include <ImfIO.h>
 #   include <ImathBox.h>
 #   include <ImfChannelList.h>
@@ -52,7 +54,7 @@ extern "C" {
 #   include <ImfArray.h>
 #endif
 
-#if defined(HAVE_STBIMAGE)
+#if defined(NV_HAVE_STBIMAGE)
 #   define STBI_NO_STDIO
 #   include <stb_image.h>
 #endif
@@ -303,6 +305,51 @@ static bool saveTGA(Stream & s, const Image * img)
     return true;
 }
 
+// BMP writer. The BmpFileHeader / BmpInfoHeader layouts it serializes are declared in TgaFile.h.
+
+// Save the image as an uncompressed 24-bit BMP. Alpha is dropped; always returns true.
+static bool saveBMP(Stream & s, const Image * img)
+{
+    const int w = img->width();
+    const int h = img->height();
+    const int pitch = (3 * w + 3) & ~3;     // BMP scanlines are padded to a multiple of 4 bytes.
+    const int image_size = pitch * h;
+
+    BmpFileHeader header;
+    zero(header);
+    header.type = BM_TYPE;
+    header.size = BITMAPFILEHEADER_SIZE + BITMAPINFOHEADER_SIZE + image_size;
+    header.offBits = BITMAPFILEHEADER_SIZE + BITMAPINFOHEADER_SIZE;
+
+    BmpInfoHeader info;
+    zero(info);
+    info.size = BITMAPINFOHEADER_SIZE;
+    info.width = w;
+    info.height = h;                        // Positive height: rows are stored bottom-up.
+    info.planes = 1;
+    info.bitCount = 24;
+    info.sizeImage = image_size;
+    info.xPelsPerMeter = 2000;
+    info.yPelsPerMeter = 2000;
+
+    s << header;
+    s << info;
+
+    nv::Array<uint8> data;
+    data.resize(pitch);
+    for (int i = 3 * w; i < pitch; i++) data[i] = 0;    // Zero the padding bytes once; pixels never overwrite them.
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+            const Color32 c = img->pixel(x, h - y - 1);    // Read rows bottom-up; BMP stores B, G, R.
+            data[x * 3 + 0] = c.b;
+            data[x * 3 + 1] = c.g;
+            data[x * 3 + 2] = c.r;
+        }
+        s.serialize(data.buffer(), data.size());
+    }
+    return true;
+}
+
 /*static Image * loadPPM(Stream & s)
 {
     // @@
@@ -324,7 +371,10 @@ static bool savePPM(Stream & s, const Image * img)
     writer.writeString("255\n");
     for (uint i = 0; i < w * h; i++) {
         Color32 c = img->pixel(i);
-        s << (uint8_t&)c.r << (uint8_t&)c.g << (uint8_t&)c.b;
+        uint8 r = c.r;  // current version of apple's llvm compiling for arm64 doesn't like taking the address of a bit-field. Workaround by using the stack
+        uint8 g = c.g;
+        uint8 b = c.b;
+        s << r << g << b;
     }
 
     return true;
@@ -653,7 +703,7 @@ static bool saveFloatDDS(Stream & s, const FloatImage * img, uint base_component
 }
 
 
-#if defined(HAVE_PNG)
+#if defined(NV_HAVE_PNG)
 
 static void user_read_data(png_structp png_ptr, png_bytep data, png_size_t length)
 {
@@ -902,9 +952,9 @@ static bool savePNG(Stream & s, const Image * img, const char ** tags/*=NULL*/)
     return true;
 }
 
-#endif // defined(HAVE_PNG)
+#endif // defined(NV_HAVE_PNG)
 
-#if defined(HAVE_JPEG)
+#if defined(NV_HAVE_JPEG)
 
 static void init_source (j_decompress_ptr /*cinfo*/){
 }
@@ -1011,9 +1061,9 @@ static Image * loadJPG(Stream & s)
     return img.release();
 }
 
-#endif // defined(HAVE_JPEG)
+#endif // defined(NV_HAVE_JPEG)
 
-#if defined(HAVE_TIFF)
+#if defined(NV_HAVE_TIFF)
 
 /*
 static tsize_t tiffReadWriteProc(thandle_t h, tdata_t ptr, tsize_t size)
@@ -1207,9 +1257,9 @@ static bool saveFloatTIFF(const char * fileName, const FloatImage * fimage, uint
     return true;
 }
 
-#endif // defined(HAVE_TIFF)
+#endif // defined(NV_HAVE_TIFF)
 
-#if defined(HAVE_OPENEXR)
+#if defined(NV_HAVE_OPENEXR)
 
 namespace
 {
@@ -1348,10 +1398,10 @@ static bool saveFloatEXR(const char * fileName, const FloatImage * fimage, uint
     return true;
 }
 
-#endif // defined(HAVE_OPENEXR)
+#endif // defined(NV_HAVE_OPENEXR)
 
 
-#if defined(HAVE_FREEIMAGE)
+#if defined(NV_HAVE_FREEIMAGE)
 
 static unsigned DLL_CALLCONV ReadProc(void *buffer, unsigned size, unsigned count, fi_handle handle)
 {
@@ -1688,10 +1738,10 @@ bool nv::ImageIO::saveFloatFreeImage(FREE_IMAGE_FORMAT fif, Stream & s, const Fl
     return result;
 }
 
-#endif // defined(HAVE_FREEIMAGE)
+#endif // defined(NV_HAVE_FREEIMAGE)
 
 
-#if defined(HAVE_STBIMAGE)
+#if defined(NV_HAVE_STBIMAGE)
 
 static Image * loadSTB(Stream & s)
 {
@@ -1704,28 +1754,22 @@ static Image * loadSTB(Stream & s)
     int w, h, n;
     uint8 * data = stbi_load_from_memory(buffer, size, &w, &h, &n, 4);
 
+    // @@ Hack: STB is returning n=4, because we request 4 components, even when input only has 3.
+    n = 3;
+
     delete [] buffer;
 
     if (data != NULL) {
         Image * img = new Image;
-        img->allocate(w, h);
+        img->acquire((Color32 *)data, w, h);
         img->setFormat(n == 4 ? Image::Format_ARGB : Image::Format_RGB);
 
-        for (int y = 0; y < h; ++y)
-        {
-            nv::Color32* dest = img->scanline(y);
-            uint8* src = data + y * w * 4;
-
-            for (int x = 0; x < w; ++x)
-            {
-                dest[x].r = src[x * 4 + 0];
-                dest[x].g = src[x * 4 + 1];
-                dest[x].b = src[x * 4 + 2];
-                dest[x].a = src[x * 4 + 3];
-            }
-        }
-        
-        free(data);
+        int count = w * h;
+        for (int i = 0; i < count; ++i) {
+        //parallel_for(count, 128, [&](int i) {
+            Color32 & pixel = img->pixel(i);
+            swap(pixel.r, pixel.b);
+        }//);
 
         return img;
     }
@@ -1766,7 +1810,7 @@ static FloatImage * loadFloatSTB(Stream & s)
     return NULL;
 }
 
-#endif // defined(HAVE_STBIMAGE)
+#endif // defined(NV_HAVE_STBIMAGE)
 
 
 
@@ -1804,32 +1848,33 @@ Image * nv::ImageIO::load(const char * fileName, Stream & s)
         return loadPPM(s);
     }*/
 
-#if defined(HAVE_JPEG)
+#if defined(NV_HAVE_JPEG)
     if (strCaseDiff(extension, ".jpg") == 0 || strCaseDiff(extension, ".jpeg") == 0) {
         return loadJPG(s);
     }
 #endif
 
-#if defined(HAVE_PNG)
+#if defined(NV_HAVE_PNG)
     if (strCaseDiff(extension, ".png") == 0) {
         return loadPNG(s);
     }
 #endif
 
-#if defined(HAVE_FREEIMAGE)
+#if defined(NV_HAVE_FREEIMAGE)
     FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName);
     if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsReading(fif)) {
         return loadFreeImage(fif, s);
     }
 #endif
 
-#if defined(HAVE_STBIMAGE)
+#if defined(NV_HAVE_STBIMAGE)
     return loadSTB(s);
 #endif
 
     return NULL;
 }
 
+
 bool nv::ImageIO::save(const char * fileName, Stream & s, const Image * img, const char ** tags/*=NULL*/)
 {
     nvDebugCheck(fileName != NULL);
@@ -1838,6 +1883,10 @@ bool nv::ImageIO::save(const char * fileName, Stream & s, const Image * img, con
 
     const char * extension = Path::extension(fileName);
 
+    if (strCaseDiff(extension, ".bmp") == 0) {
+        return saveBMP(s, img);
+    }
+
     if (strCaseDiff(extension, ".tga") == 0) {
         return saveTGA(s, img);
     }
@@ -1846,13 +1895,13 @@ bool nv::ImageIO::save(const char * fileName, Stream & s, const Image * img, con
         return savePPM(s, img);
     }
 
-#if defined(HAVE_PNG)
+#if defined(NV_HAVE_PNG)
     if (strCaseDiff(extension, ".png") == 0) {
         return savePNG(s, img, tags);
     }
 #endif
 
-#if defined(HAVE_FREEIMAGE)
+#if defined(NV_HAVE_FREEIMAGE)
     FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName);
     if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsWriting(fif)) {
         return saveFreeImage(fif, s, img, tags);
@@ -1899,27 +1948,27 @@ FloatImage * nv::ImageIO::loadFloat(const char * fileName, Stream & s)
         return loadFloatPFM(s);
     }*/
 
-#if defined(HAVE_TIFF)
+#if defined(NV_HAVE_TIFF)
     #pragma NV_MESSAGE("TODO: Load TIFF from stream.")
     if (strCaseDiff(extension, ".tif") == 0 || strCaseDiff(extension, ".tiff") == 0) {
         return loadFloatTIFF(fileName, s);
     }
 #endif
 
-#if defined(HAVE_OPENEXR)
+#if defined(NV_HAVE_OPENEXR)
     #pragma NV_MESSAGE("TODO: Load EXR from stream.")
     if (strCaseDiff(extension, ".exr") == 0) {
         return loadFloatEXR(fileName, s);
     }
 #endif
 
-#if defined(HAVE_STBIMAGE)
+#if defined(NV_HAVE_STBIMAGE)
     if (strCaseDiff(extension, ".hdr") == 0) {
         return loadFloatSTB(s);
     }
 #endif
 
-#if defined(HAVE_FREEIMAGE)
+#if defined(NV_HAVE_FREEIMAGE)
     FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName);
     if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsReading(fif)) {
         return loadFloatFreeImage(fif, s);
@@ -1961,7 +2010,7 @@ bool nv::ImageIO::saveFloat(const char * fileName, Stream & s, const FloatImage
         return saveFloatPFM(s, fimage, baseComponent, componentCount);
     }*/
 
-#if defined(HAVE_FREEIMAGE)
+#if defined(NV_HAVE_FREEIMAGE)
     FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName);
     if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsWriting(fif)) {
         return saveFloatFreeImage(fif, s, fimage, baseComponent, componentCount);
@@ -2005,14 +2054,15 @@ bool nv::ImageIO::saveFloat(const char * fileName, const FloatImage * fimage, ui
     }
 
     const char * extension = Path::extension(fileName);
+    NV_UNUSED(extension);
 
-#if defined(HAVE_OPENEXR)
+#if defined(NV_HAVE_OPENEXR)
     if (strCaseDiff(extension, ".exr") == 0) {
         return saveFloatEXR(fileName, fimage, baseComponent, componentCount);
     }
 #endif
 
-#if defined(HAVE_TIFF)
+#if defined(NV_HAVE_TIFF)
     if (strCaseDiff(extension, ".tif") == 0 || strCaseDiff(extension, ".tiff") == 0) {
         return saveFloatTIFF(fileName, fimage, baseComponent, componentCount);
     }
diff --git a/src/nvimage/KtxFile.cpp b/src/nvimage/KtxFile.cpp
index de075bd..033ad75 100644
--- a/src/nvimage/KtxFile.cpp
+++ b/src/nvimage/KtxFile.cpp
@@ -1,6 +1,7 @@
 // This code is in the public domain -- Ignacio Casta�o <castano@gmail.com>
 
 #include "KtxFile.h"
+#include "nvcore/StdStream.h"
 
 using namespace nv;
 
@@ -10,6 +11,8 @@ static const uint8 fileIdentifier[12] = {
     0x0D, 0x0A, 0x1A, 0x0A
 };
 
+namespace nv
+{
 
 KtxHeader::KtxHeader() {
     memcpy(identifier, fileIdentifier, 12);
@@ -19,8 +22,8 @@ KtxHeader::KtxHeader() {
     glType = 0;
     glTypeSize = 1;
     glFormat = 0;
-    glInternalFormat = KTX_RGBA;
-    glBaseInternalFormat = KTX_RGBA;
+    glInternalFormat = KTX_INTERNAL_COMPRESSED_SRGB_S3TC_DXT1;
+    glBaseInternalFormat = KTX_BASE_INTERNAL_RGB;
     pixelWidth = 0;
     pixelHeight = 0;
     pixelDepth = 0;
@@ -31,9 +34,9 @@ KtxHeader::KtxHeader() {
 }
 
 
-Stream & operator<< (Stream & s, DDSHeader & header) {
+Stream & operator<< (Stream & s, KtxHeader & header) {
     s.serialize(header.identifier, 12);
-    s << header.endiannes << header.glType << header.glTypeSize << header.glFormat << header.glInternalFormat << header.glBaseInternalFormat;
+    s << header.endianness << header.glType << header.glTypeSize << header.glFormat << header.glInternalFormat << header.glBaseInternalFormat;
     s << header.pixelWidth << header.pixelHeight << header.pixelDepth;
     s << header.numberOfArrayElements << header.numberOfFaces << header.numberOfMipmapLevels;
     s << header.bytesOfKeyValueData;
@@ -41,7 +44,7 @@ Stream & operator<< (Stream & s, DDSHeader & header) {
 }
 
 
-KtxFile::KtxFile() {
+/*KtxFile::KtxFile() {
 }
 KtxFile::~KtxFile() {
 }
@@ -49,7 +52,7 @@ KtxFile::~KtxFile() {
 void KtxFile::addKeyValue(const char * key, const char * value) {
     keyArray.append(key);
     valueArray.append(value);
-    bytesOfKeyValueData += strlen(key) + 1 + strlen(value) + 1;
+    header.bytesOfKeyValueData += strlen(key) + 1 + strlen(value) + 1;
 }
 
 
@@ -77,7 +80,8 @@ Stream & operator<< (Stream & s, KtxFile & file) {
     }
 
     return s;
-}
+}*/
 
+} // nv
 
 
diff --git a/src/nvimage/KtxFile.h b/src/nvimage/KtxFile.h
index 9f89590..b1b3674 100644
--- a/src/nvimage/KtxFile.h
+++ b/src/nvimage/KtxFile.h
@@ -6,6 +6,7 @@
 
 #include "nvimage.h"
 #include "nvcore/StrLib.h"
+#include "nvcore/Array.h"
 
 // KTX File format specification:
 // http://www.khronos.org/opengles/sdk/tools/KTX/file_format_spec/#key
@@ -14,22 +15,99 @@ namespace nv
 {
     class Stream;
 
-    // GL types (Table 3.2)
-    const uint KTX_UNSIGNED_BYTE;
-    const uint KTX_UNSIGNED_SHORT_5_6_5;
-    // ...
-
-    // GL formats (Table 3.3)
-    // ...
-
-    // GL internal formats (Table 3.12, 3.13)
-    // ...
-
-    // GL base internal format. (Table 3.11)
-    const uint KTX_RGB;
-    const uint KTX_RGBA;
-    const uint KTX_ALPHA;
-    // ...
+    // GL types
+    const uint KTX_UNSIGNED_BYTE = 0x1401;
+    const uint KTX_BYTE = 0x1400;
+    const uint KTX_UNSIGNED_SHORT = 0x1403;
+    const uint KTX_SHORT = 0x1402;
+    const uint KTX_UNSIGNED_INT = 0x1405;
+    const uint KTX_INT = 0x1404;
+    const uint KTX_FLOAT = 0x1406;
+    const uint KTX_UNSIGNED_BYTE_3_3_2 = 0x8032;
+    const uint KTX_UNSIGNED_BYTE_2_3_3_REV = 0x8362;
+    const uint KTX_UNSIGNED_SHORT_5_6_5 = 0x8363;
+    const uint KTX_UNSIGNED_SHORT_5_6_5_REV = 0x8364;
+    const uint KTX_UNSIGNED_SHORT_4_4_4_4 = 0x8033;
+    const uint KTX_UNSIGNED_SHORT_4_4_4_4_REV = 0x8365;
+    const uint KTX_UNSIGNED_SHORT_5_5_5_1 = 0x8034;
+    const uint KTX_UNSIGNED_SHORT_1_5_5_5_REV = 0x8366;
+    const uint KTX_UNSIGNED_INT_8_8_8_8 = 0x8035;
+    const uint KTX_UNSIGNED_INT_8_8_8_8_REV = 0x8367;
+    const uint KTX_UNSIGNED_INT_10_10_10_2 = 0x8036;
+    const uint KTX_UNSIGNED_INT_2_10_10_10_REV = 0x8368;
+
+    // GL formats
+    const uint KTX_FORMAT_RED = 0x1903;
+    const uint KTX_FORMAT_RG = 0x8227;
+    const uint KTX_FORMAT_RGB = 0x1907;
+    const uint KTX_FORMAT_BGR = 0x80E0;
+    const uint KTX_FORMAT_RGBA = 0x1908;
+    const uint KTX_FORMAT_BGRA = 0x80E1;
+    const uint KTX_FORMAT_RED_INTEGER = 0x8D94;
+    const uint KTX_FORMAT_RG_INTEGER = 0x8228;
+    const uint KTX_FORMAT_RGB_INTEGER = 0x8D98;
+    const uint KTX_FORMAT_BGR_INTEGER = 0x8D9A;
+    const uint KTX_FORMAT_RGBA_INTEGER = 0x8D99;
+    const uint KTX_FORMAT_BGRA_INTEGER = 0x8D9B;
+    const uint KTX_FORMAT_STENCIL_INDEX = 0x1901;
+    const uint KTX_FORMAT_DEPTH_COMPONENT = 0x1902;
+    const uint KTX_FORMAT_DEPTH_STENCIL = 0x84F9;
+
+    // GL internal formats
+    // BC1
+    const uint KTX_INTERNAL_COMPRESSED_RGB_S3TC_DXT1 = 0x83F0;
+    const uint KTX_INTERNAL_COMPRESSED_SRGB_S3TC_DXT1 = 0x8C4C;
+    // BC1a
+    const uint KTX_INTERNAL_COMPRESSED_RGBA_S3TC_DXT1 = 0x83F1;
+    const uint KTX_INTERNAL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1 = 0x8C4D;
+    // BC2
+    const uint KTX_INTERNAL_COMPRESSED_RGBA_S3TC_DXT3 = 0x83F2;
+    const uint KTX_INTERNAL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3 = 0x8C4E;
+    // BC3
+    const uint KTX_INTERNAL_COMPRESSED_RGBA_S3TC_DXT5 = 0x83F3;
+    const uint KTX_INTERNAL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5 = 0x8C4F;
+    // BC4
+    const uint KTX_INTERNAL_COMPRESSED_RED_RGTC1 = 0x8DBB;
+    const uint KTX_INTERNAL_COMPRESSED_SIGNED_RED_RGTC1 = 0x8DBC;
+    // BC5
+    const uint KTX_INTERNAL_COMPRESSED_RG_RGTC2 = 0x8DBD;
+    const uint KTX_INTERNAL_COMPRESSED_SIGNED_RG_RGTC2 = 0x8DBE;
+    // BC6
+    const uint KTX_INTERNAL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT = 0x8E8F;
+    const uint KTX_INTERNAL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT = 0x8E8E;
+    // BC7
+    const uint KTX_INTERNAL_COMPRESSED_RGBA_BPTC_UNORM = 0x8E8C;
+    const uint KTX_INTERNAL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM = 0x8E8D;
+
+    // ETC
+    const uint KTX_INTERNAL_COMPRESSED_RGB_ETC1 = 0x8D64;
+    const uint KTX_INTERNAL_COMPRESSED_SRGB_ETC1 = 0x8D64;  // NOTE(review): same value as the linear ETC1 constant above, so sRGB is not distinguishable here -- confirm the intended enum.
+
+    // ETC2
+    const uint KTX_INTERNAL_COMPRESSED_RED_EAC = 0x9270;
+    const uint KTX_INTERNAL_COMPRESSED_SIGNED_RED_EAC = 0x9271;
+
+    const uint KTX_INTERNAL_COMPRESSED_RG_EAC = 0x9272;
+    const uint KTX_INTERNAL_COMPRESSED_SIGNED_RG_EAC = 0x9273;
+
+    const uint KTX_INTERNAL_COMPRESSED_RGB_ETC2 = 0x9274;
+    const uint KTX_INTERNAL_COMPRESSED_SRGB_ETC2 = 0x9275;
+
+    const uint KTX_INTERNAL_COMPRESSED_RGB_PUNCHTHROUGH_ALPHA_ETC2 = 0x9276;
+    const uint KTX_INTERNAL_COMPRESSED_SRGB_PUNCHTHROUGH_ALPHA_ETC2 = 0x9277;
+
+    const uint KTX_INTERNAL_COMPRESSED_RGBA_ETC2_EAC = 0x9278;
+    const uint KTX_INTERNAL_COMPRESSED_SRGB_ALPHA_ETC2_EAC = 0x9279;
+
+
+    // GL base internal formats
+    const uint KTX_BASE_INTERNAL_DEPTH_COMPONENT = 0x1902;
+    const uint KTX_BASE_INTERNAL_DEPTH_STENCIL = 0x84F9;
+    const uint KTX_BASE_INTERNAL_RED = 0x1903;
+    const uint KTX_BASE_INTERNAL_RG = 0x8227;
+    const uint KTX_BASE_INTERNAL_RGB = 0x1907;
+    const uint KTX_BASE_INTERNAL_RGBA = 0x1908;
+    const uint KTX_BASE_INTERNAL_STENCIL_INDEX = 0x1901;
 
 
     struct KtxHeader {
@@ -52,10 +130,10 @@ namespace nv
 
     };
 
-    NVIMAGE_API Stream & operator<< (Stream & s, DDSHeader & header);
+    NVIMAGE_API Stream & operator<< (Stream & s, KtxHeader & header);
 
 
-    struct KtxFile {
+/*    struct KtxFile {
         KtxFile();
         ~KtxFile();
 
@@ -66,10 +144,9 @@ namespace nv
 
         Array<String> keyArray;
         Array<String> valueArray;
-
     };
 
-    NVIMAGE_API Stream & operator<< (Stream & s, KtxFile & file);
+    NVIMAGE_API Stream & operator<< (Stream & s, KtxFile & file);*/
 
 
     /*
diff --git a/src/nvimage/NormalMap.cpp b/src/nvimage/NormalMap.cpp
index 559e4f8..754166a 100644
--- a/src/nvimage/NormalMap.cpp
+++ b/src/nvimage/NormalMap.cpp
@@ -1,208 +1,208 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#include "NormalMap.h"
-#include "Filter.h"
-#include "FloatImage.h"
-#include "Image.h"
-
-#include "nvmath/Color.inl"
-#include "nvmath/Vector.h"
-
-#include "nvcore/Ptr.h"
-
-#include <string.h> // memcpy
-
-
-using namespace nv;
-
-// Create normal map using the given kernels.
-static FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, const Kernel2 * kdu, const Kernel2 * kdv)
-{
-    nvDebugCheck(kdu != NULL);
-    nvDebugCheck(kdv != NULL);
-    nvDebugCheck(img != NULL);
-
-    const uint w = img->width();
-    const uint h = img->height();
-
-    AutoPtr<FloatImage> fimage(new FloatImage());
-    fimage->allocate(4, w, h);
-
-    // Compute height and store in alpha channel:
-    float * alphaChannel = fimage->channel(3);
-    for(uint i = 0; i < w * h; i++)
-    {
-        Vector4 color = toVector4(img->pixel(i));
-        alphaChannel[i] = dot(color, heightWeights);
-    }
-
-    float heightScale = 1.0f / 16.0f;	// @@ Use a user defined factor.
-
-    for(uint y = 0; y < h; y++)
-    {
-        for(uint x = 0; x < w; x++)
-        {
-            const float du = fimage->applyKernelXY(kdu, x, y, 0, 3, wm);
-            const float dv = fimage->applyKernelXY(kdv, x, y, 0, 3, wm);
-
-            Vector3 n = normalize(Vector3(du, dv, heightScale));
-
-            fimage->pixel(0, x, y, 0) = 0.5f * n.x + 0.5f;
-            fimage->pixel(1, x, y, 0) = 0.5f * n.y + 0.5f;
-            fimage->pixel(2, x, y, 0) = 0.5f * n.z + 0.5f;
-        }
-    }
-
-    return fimage.release();
-}
-
-
-// Create normal map using the given kernels.
-static FloatImage * createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, const Kernel2 * kdu, const Kernel2 * kdv)
-{
-    nvDebugCheck(kdu != NULL);
-    nvDebugCheck(kdv != NULL);
-    nvDebugCheck(img != NULL);
-
-#pragma NV_MESSAGE("FIXME: Height scale parameter should go away. It should be a sensible value that produces good results when the heightmap is in the [0, 1] range.")
-    const float heightScale = 1.0f / 16.0f;
-
-    const uint w = img->width();
-    const uint h = img->height();
-
-    AutoPtr<FloatImage> img_out(new FloatImage());
-    img_out->allocate(4, w, h);
-
-    for (uint y = 0; y < h; y++)
-    {
-        for (uint x = 0; x < w; x++)
-        {
-            const float du = img->applyKernelXY(kdu, x, y, 0, 3, wm);
-            const float dv = img->applyKernelXY(kdv, x, y, 0, 3, wm);
-
-            Vector3 n = normalize(Vector3(du, dv, heightScale));
-
-            img_out->pixel(0, x, y, 0) = n.x;
-            img_out->pixel(1, x, y, 0) = n.y;
-            img_out->pixel(2, x, y, 0) = n.z;
-        }
-    }
-
-    // Copy alpha channel.
-    /*for (uint y = 0; y < h; y++)
-    {
-        for (uint x = 0; x < w; x++)
-        {
-            
-            img_out->pixel(3, x, y, 0) = img->pixel(3, x, y, 0);
-        }
-    }*/
-    memcpy(img_out->channel(3), img->channel(3), w * h * sizeof(float));
-
-    return img_out.release();
-}
-
-
-/// Create normal map using the given filter.
-FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter /*= Sobel3x3*/)
-{
-    nvDebugCheck(img != NULL);
-
-    // Init the kernels.
-    Kernel2 * kdu = NULL;
-    Kernel2 * kdv = NULL;
-
-    switch(filter)
-    {
-        case NormalMapFilter_Sobel3x3:
-            kdu = new Kernel2(3);
-            break;
-        case NormalMapFilter_Sobel5x5:
-            kdu = new Kernel2(5);
-            break;
-        case NormalMapFilter_Sobel7x7:
-            kdu = new Kernel2(7);
-            break;
-        case NormalMapFilter_Sobel9x9:
-            kdu = new Kernel2(9);
-            break;
-        default:
-            nvDebugCheck(false);
-    };
-
-    kdu->initSobel();
-    kdu->normalize();
-
-    kdv = new Kernel2(*kdu);
-    kdv->transpose();
-
-    return ::createNormalMap(img, wm, heightWeights, kdu, kdv);
-}
-
-
-/// Create normal map combining multiple sobel filters.
-FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights)
-{
-    nvDebugCheck(img != NULL);
-
-    Kernel2 * kdu = NULL;
-    Kernel2 * kdv = NULL;
-
-    kdu = new Kernel2(9);
-    kdu->initBlendedSobel(filterWeights);
-    kdu->normalize();
-
-    kdv = new Kernel2(*kdu);
-    kdv->transpose();
-
-    return ::createNormalMap(img, wm, heightWeights, kdu, kdv);
-}
-
-
-FloatImage * nv::createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, Vector4::Arg filterWeights)
-{
-    nvDebugCheck(img != NULL);
-
-    Kernel2 * kdu = NULL;
-    Kernel2 * kdv = NULL;
-
-    kdu = new Kernel2(9);
-    kdu->initBlendedSobel(filterWeights);
-    kdu->normalize();
-
-    kdv = new Kernel2(*kdu);
-    kdv->transpose();
-
-    return ::createNormalMap(img, wm, kdu, kdv);
-}
-
-
-/// Normalize the given image in place.
-void nv::normalizeNormalMap(FloatImage * img)
-{
-    nvDebugCheck(img != NULL);
-
-    img->normalize(0);
-}
-
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include "NormalMap.h"
+#include "Filter.h"
+#include "FloatImage.h"
+#include "Image.h"
+
+#include "nvmath/Color.inl"
+#include "nvmath/Vector.h"
+
+#include "nvcore/Ptr.h"
+
+#include <string.h> // memcpy
+
+
+using namespace nv;
+
+// Create normal map using the given kernels.
+static FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, const Kernel2 * kdu, const Kernel2 * kdv)
+{
+    nvDebugCheck(kdu != NULL);
+    nvDebugCheck(kdv != NULL);
+    nvDebugCheck(img != NULL);
+
+    const uint w = img->width();
+    const uint h = img->height();
+
+    AutoPtr<FloatImage> fimage(new FloatImage());
+    fimage->allocate(4, w, h);
+
+    // Compute height and store in alpha channel:
+    float * alphaChannel = fimage->channel(3);
+    for(uint i = 0; i < w * h; i++)
+    {
+        Vector4 color = toVector4(img->pixel(i));
+        alphaChannel[i] = dot(color, heightWeights);
+    }
+
+    float heightScale = 1.0f / 16.0f;	// @@ Use a user defined factor.
+
+    for(uint y = 0; y < h; y++)
+    {
+        for(uint x = 0; x < w; x++)
+        {
+            const float du = fimage->applyKernelXY(kdu, x, y, 0, 3, wm);
+            const float dv = fimage->applyKernelXY(kdv, x, y, 0, 3, wm);
+
+            Vector3 n = normalize(Vector3(du, dv, heightScale));
+
+            fimage->pixel(0, x, y, 0) = 0.5f * n.x + 0.5f;
+            fimage->pixel(1, x, y, 0) = 0.5f * n.y + 0.5f;
+            fimage->pixel(2, x, y, 0) = 0.5f * n.z + 0.5f;
+        }
+    }
+
+    return fimage.release();
+}
+
+
+// Create normal map using the given kernels.
+static FloatImage * createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, const Kernel2 * kdu, const Kernel2 * kdv)
+{
+    nvDebugCheck(kdu != NULL);
+    nvDebugCheck(kdv != NULL);
+    nvDebugCheck(img != NULL);
+
+#pragma NV_MESSAGE("FIXME: Height scale parameter should go away. It should be a sensible value that produces good results when the heightmap is in the [0, 1] range.")
+    const float heightScale = 1.0f / 16.0f;
+
+    const uint w = img->width();
+    const uint h = img->height();
+
+    AutoPtr<FloatImage> img_out(new FloatImage());
+    img_out->allocate(4, w, h);
+
+    for (uint y = 0; y < h; y++)
+    {
+        for (uint x = 0; x < w; x++)
+        {
+            const float du = img->applyKernelXY(kdu, x, y, 0, 3, wm);
+            const float dv = img->applyKernelXY(kdv, x, y, 0, 3, wm);
+
+            Vector3 n = normalize(Vector3(du, dv, heightScale));
+
+            img_out->pixel(0, x, y, 0) = n.x;
+            img_out->pixel(1, x, y, 0) = n.y;
+            img_out->pixel(2, x, y, 0) = n.z;
+        }
+    }
+
+    // Copy alpha channel.
+    /*for (uint y = 0; y < h; y++)
+    {
+        for (uint x = 0; x < w; x++)
+        {
+            
+            img_out->pixel(3, x, y, 0) = img->pixel(3, x, y, 0);
+        }
+    }*/
+    memcpy(img_out->channel(3), img->channel(3), w * h * sizeof(float));
+
+    return img_out.release();
+}
+
+
+/// Create normal map using the given filter.
+FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter /*= Sobel3x3*/)
+{
+    nvDebugCheck(img != NULL);
+
+    // Init the kernels.
+    Kernel2 * kdu = NULL;
+    Kernel2 * kdv = NULL;
+
+    switch(filter)
+    {
+        case NormalMapFilter_Sobel3x3:
+            kdu = new Kernel2(3);
+            break;
+        case NormalMapFilter_Sobel5x5:
+            kdu = new Kernel2(5);
+            break;
+        case NormalMapFilter_Sobel7x7:
+            kdu = new Kernel2(7);
+            break;
+        case NormalMapFilter_Sobel9x9:
+            kdu = new Kernel2(9);
+            break;
+        default:
+            nvDebugCheck(false);
+    };
+
+    kdu->initSobel();
+    kdu->normalize();
+
+    kdv = new Kernel2(*kdu);
+    kdv->transpose();
+
+    return ::createNormalMap(img, wm, heightWeights, kdu, kdv);
+}
+
+
+/// Create normal map combining multiple sobel filters.
+FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights)
+{
+    nvDebugCheck(img != NULL);
+
+    Kernel2 * kdu = NULL;
+    Kernel2 * kdv = NULL;
+
+    kdu = new Kernel2(9);
+    kdu->initBlendedSobel(filterWeights);
+    kdu->normalize();
+
+    kdv = new Kernel2(*kdu);
+    kdv->transpose();
+
+    return ::createNormalMap(img, wm, heightWeights, kdu, kdv);
+}
+
+
+FloatImage * nv::createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, Vector4::Arg filterWeights)
+{
+    nvDebugCheck(img != NULL);
+
+    Kernel2 * kdu = NULL;
+    Kernel2 * kdv = NULL;
+
+    kdu = new Kernel2(9);
+    kdu->initBlendedSobel(filterWeights);
+    kdu->normalize();
+
+    kdv = new Kernel2(*kdu);
+    kdv->transpose();
+
+    return ::createNormalMap(img, wm, kdu, kdv);
+}
+
+
+/// Normalize the given image in place.
+void nv::normalizeNormalMap(FloatImage * img)
+{
+    nvDebugCheck(img != NULL);
+
+    img->normalize(0);
+}
+
diff --git a/src/nvimage/TgaFile.h b/src/nvimage/TgaFile.h
index bce2fc1..22310a7 100644
--- a/src/nvimage/TgaFile.h
+++ b/src/nvimage/TgaFile.h
@@ -101,6 +101,48 @@ inline Stream & operator<< (Stream & s, TgaFile & tga)
 	return s;
 }
 
+
+
+// @@ Move to BMP file?
+
+const int BITMAPFILEHEADER_SIZE = 14;   // Sum of the BmpFileHeader field widths (2 + 4 + 2 + 2 + 4), i.e. without struct padding.
+const int BITMAPINFOHEADER_SIZE = 40;   // Sum of the BmpInfoHeader field widths (3 * 4 + 2 * 2 + 6 * 4).
+const int BM_TYPE = ((unsigned int)'M') << 8 | ((unsigned int)'B');   // 'B' in the low byte, 'M' in the high byte.
+
+// BMP file header. Streamed field by field by the operator<< below rather than as a raw struct.
+struct BmpFileHeader {
+    uint16 type;        // saveBMP (ImageIO.cpp) stores BM_TYPE here.
+    uint32 size;        // saveBMP stores both header sizes plus the pixel data size.
+    uint16 reserved1;
+    uint16 reserved2;
+    uint32 offBits;     // saveBMP stores the combined size of the two headers; pixel rows are written right after them.
+};
+
+struct BmpInfoHeader {      // Second BMP header; saveBMP (ImageIO.cpp) writes it right after BmpFileHeader.
+    uint32 size;            // saveBMP stores BITMAPINFOHEADER_SIZE.
+    uint32 width;
+    uint32 height;
+    uint16 planes;          // saveBMP stores 1.
+    uint16 bitCount;        // saveBMP stores 24.
+    uint32 compression;     // Not set by saveBMP after zero(info).
+    uint32 sizeImage;       // saveBMP stores the pixel data size in bytes.
+    uint32 xPelsPerMeter;   // saveBMP stores 2000.
+    uint32 yPelsPerMeter;   // saveBMP stores 2000.
+    uint32 clrUsed;         // Not set by saveBMP after zero(info).
+    uint32 clrImportant;    // Not set by saveBMP after zero(info).
+};
+
+inline Stream & operator<< (Stream & s, BmpFileHeader & bmp) {   // Streams the fields one at a time, in declaration order.
+    return s << bmp.type << bmp.size << bmp.reserved1 << bmp.reserved2 << bmp.offBits;
+}
+
+inline Stream & operator<< (Stream & s, BmpInfoHeader & bmp) {   // Streams every field in declaration order.
+    s << bmp.size << bmp.width << bmp.height << bmp.planes << bmp.bitCount;
+    s << bmp.compression << bmp.sizeImage << bmp.xPelsPerMeter << bmp.yPelsPerMeter;
+    return s << bmp.clrUsed << bmp.clrImportant;
+}
+
+
 } // nv namespace
 
 #endif // NV_IMAGE_TGAFILE_H
diff --git a/src/nvmath/CMakeLists.txt b/src/nvmath/CMakeLists.txt
index abeb05f..c59cfeb 100644
--- a/src/nvmath/CMakeLists.txt
+++ b/src/nvmath/CMakeLists.txt
@@ -7,7 +7,7 @@ SET(MATH_SRCS
     Fitting.h Fitting.cpp
     Gamma.h Gamma.cpp
     Half.h Half.cpp
-    Matrix.h
+    Matrix.h Matrix.inl Matrix.cpp
     Plane.h Plane.inl Plane.cpp
     SphericalHarmonic.h SphericalHarmonic.cpp
     SimdVector.h SimdVector_SSE.h SimdVector_VE.h
diff --git a/src/nvmath/Color.inl b/src/nvmath/Color.inl
index 2b87ee4..d871704 100644
--- a/src/nvmath/Color.inl
+++ b/src/nvmath/Color.inl
@@ -157,6 +157,12 @@ namespace nv
         return Vector4(c.r * scale, c.g * scale, c.b * scale, c.a * scale);
     }
 
+    inline Vector3 toVector3(Color32 c)
+    {
+        const float inv255 = 1.0f / 255.0f;     // Rescales each 8-bit channel by 1/255; alpha is not used.
+        return Vector3(inv255 * c.r, inv255 * c.g, inv255 * c.b);
+    }
+
 
     inline float perceptualColorDistance(Vector3::Arg c0, Vector3::Arg c1)
     {
diff --git a/src/nvmath/Matrix.cpp b/src/nvmath/Matrix.cpp
index 29bd19f..d171d13 100644
--- a/src/nvmath/Matrix.cpp
+++ b/src/nvmath/Matrix.cpp
@@ -1,441 +1,487 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#include "Matrix.inl"
-#include "Vector.inl"
-
-#include "nvcore/Array.inl"
-
-#include <float.h>
-
-#if !NV_CC_MSVC && !NV_OS_ORBIS
-#include <alloca.h>
-#endif
-
-using namespace nv;
-
-
-// Given a matrix a[1..n][1..n], this routine replaces it by the LU decomposition of a rowwise
-// permutation of itself. a and n are input. a is output, arranged as in equation (2.3.14) above;
-// indx[1..n] is an output vector that records the row permutation effected by the partial
-// pivoting; d is output as -1 depending on whether the number of row interchanges was even
-// or odd, respectively. This routine is used in combination with lubksb to solve linear equations
-// or invert a matrix.
-static bool ludcmp(float **a, int n, int *indx, float *d)
-{
-    const float TINY = 1.0e-20f;
-
-    float * vv = (float*)alloca(sizeof(float) * n);    // vv stores the implicit scaling of each row.
-
-    *d = 1.0; // No row interchanges yet.
-    for (int i = 0; i < n; i++) { // Loop over rows to get the implicit scaling information.
-    
-        float big = 0.0;
-        for (int j = 0; j < n; j++) {
-            big = max(big, fabsf(a[i][j]));
-        }
-        if (big == 0) {
-            return false;   // Singular matrix
-        }
-        
-        // No nonzero largest element.
-        vv[i] = 1.0f / big; // Save the scaling.
-    }
-
-    for (int j = 0; j < n; j++) {       // This is the loop over columns of Crout's method.
-        for (int i = 0; i < j; i++) {   // This is equation (2.3.12) except for i = j.
-            float sum = a[i][j];
-            for (int k = 0; k < i; k++) sum -= a[i][k]*a[k][j];
-            a[i][j] = sum;
-        }
-
-        int imax = -1;
-        float big = 0.0;                // Initialize for the search for largest pivot element.
-        for (int i = j; i < n; i++) {   // This is i = j of equation (2.3.12) and i = j+ 1 : : : N
-            float sum = a[i][j];              // of equation (2.3.13).
-            for (int k = 0; k < j; k++) {
-                sum -= a[i][k]*a[k][j];
-            }
-            a[i][j]=sum;
-
-            float dum = vv[i]*fabs(sum);
-            if (dum >= big) {
-                // Is the figure of merit for the pivot better than the best so far?
-                big = dum;
-                imax = i;
-            }
-        }
-        nvDebugCheck(imax != -1);
-
-        if (j != imax) {                // Do we need to interchange rows?
-            for (int k = 0; k < n; k++) {   // Yes, do so...
-                swap(a[imax][k], a[j][k]);
-            }
-            *d = -(*d); // ...and change the parity of d.
-            vv[imax]=vv[j]; // Also interchange the scale factor.
-        }
-
-        indx[j]=imax;
-        if (a[j][j] == 0.0) a[j][j] = TINY;
-        
-        // If the pivot element is zero the matrix is singular (at least to the precision of the
-        // algorithm). For some applications on singular matrices, it is desirable to substitute
-        // TINY for zero.
-        if (j != n-1) { // Now, finally, divide by the pivot element.
-            float dum = 1.0f / a[j][j];
-            for (int i = j+1; i < n; i++) a[i][j] *= dum;
-        }
-    } // Go back for the next column in the reduction.
-
-    return true;
-}
-
-
-// Solves the set of n linear equations Ax = b. Here a[1..n][1..n] is input, not as the matrix
-// A but rather as its LU decomposition, determined by the routine ludcmp. indx[1..n] is input
-// as the permutation vector returned by ludcmp. b[1..n] is input as the right-hand side vector
-// B, and returns with the solution vector X. a, n, and indx are not modified by this routine
-// and can be left in place for successive calls with different right-hand sides b. This routine takes
-// into account the possibility that b will begin with many zero elements, so it is efficient for use
-// in matrix inversion.
-static void lubksb(float **a, int n, int *indx, float b[])
-{
-    int ii = 0;
-    for (int i=0; i<n; i++) {   // When ii is set to a positive value, it will become 
-        int ip = indx[i];       // the index of the first nonvanishing element of b. We now 
-        float sum = b[ip];      // do the forward substitution, equation (2.3.6). The 
-        b[ip] = b[i];           // only new wrinkle is to unscramble the permutation as we go.
-        if (ii != 0) {
-            for (int j = ii-1; j < i; j++) sum -= a[i][j]*b[j];
-        }
-        else if (sum != 0.0f) {
-            ii = i+1;             // A nonzero element was encountered, so from now on we 
-        }
-        b[i] = sum;             // will have to do the sums in the loop above.
-    }
-    for (int i=n-1; i>=0; i--) {  // Now we do the backsubstitution, equation (2.3.7).
-        float sum = b[i];
-        for (int j = i+1; j < n; j++) {
-            sum -= a[i][j]*b[j];
-        }
-        b[i] = sum/a[i][i];     // Store a component of the solution vector X.
-    } // All done!
-}
-
-
-bool nv::solveLU(const Matrix & A, const Vector4 & b, Vector4 * x)
-{
-    nvDebugCheck(x != NULL);
-
-    float m[4][4];
-    float *a[4] = {m[0], m[1], m[2], m[3]};
-    int idx[4];
-    float d;
-
-    for (int y = 0; y < 4; y++) {
-        for (int x = 0; x < 4; x++) {
-            a[x][y] = A(x, y);
-        }
-    }
-
-    // Create LU decomposition.
-    if (!ludcmp(a, 4, idx, &d)) {
-        // Singular matrix.
-        return false;
-    }
-
-    // Init solution.
-    *x = b;
-
-    // Do back substitution.
-    lubksb(a, 4, idx, x->component);
-
-    return true;
-}
-
-// @@ Not tested.
-Matrix nv::inverseLU(const Matrix & A)
-{
-    Vector4 Ai[4];
-
-    solveLU(A, Vector4(1, 0, 0, 0), &Ai[0]);
-    solveLU(A, Vector4(0, 1, 0, 0), &Ai[1]);
-    solveLU(A, Vector4(0, 0, 1, 0), &Ai[2]);
-    solveLU(A, Vector4(0, 0, 0, 1), &Ai[3]);
-
-    return Matrix(Ai[0], Ai[1], Ai[2], Ai[3]);
-}
-
-
-
-bool nv::solveLU(const Matrix3 & A, const Vector3 & b, Vector3 * x)
-{
-    nvDebugCheck(x != NULL);
-
-    float m[3][3];
-    float *a[3] = {m[0], m[1], m[2]};
-    int idx[3];
-    float d;
-
-    for (int y = 0; y < 3; y++) {
-        for (int x = 0; x < 3; x++) {
-            a[x][y] = A(x, y);
-        }
-    }
-
-    // Create LU decomposition.
-    if (!ludcmp(a, 3, idx, &d)) {
-        // Singular matrix.
-        return false;
-    }
-
-    // Init solution.
-    *x = b;
-
-    // Do back substitution.
-    lubksb(a, 3, idx, x->component);
-
-    return true;
-}
-
-
-bool nv::solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x)
-{
-    nvDebugCheck(x != NULL);
-
-    *x = transform(inverseCramer(A), b);
-    
-    return true; // @@ Return false if determinant(A) == 0 !
-}
-
-bool nv::solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x)
-{
-    nvDebugCheck(x != NULL);
-
-    const float det = A.determinant();
-    if (equal(det, 0.0f)) {   // @@ Use input epsilon.
-        return false;
-    }
-
-    Matrix3 Ai = inverseCramer(A);
-
-    *x = transform(Ai, b);
-    
-    return true;
-}
-
-
-
-// Inverse using gaussian elimination. From Jon's code.
-Matrix nv::inverse(const Matrix & m) {
-
-    Matrix A = m;
-    Matrix B(identity);
-
-    int i, j, k;
-    float max, t, det, pivot;
-
-    det = 1.0;
-    for (i=0; i<4; i++) {               /* eliminate in column i, below diag */
-        max = -1.;
-        for (k=i; k<4; k++)             /* find pivot for column i */
-            if (fabs(A(k, i)) > max) {
-                max = fabs(A(k, i));
-                j = k;
-            }
-        if (max<=0.) return B;         /* if no nonzero pivot, PUNT */
-        if (j!=i) {                     /* swap rows i and j */
-            for (k=i; k<4; k++)
-                swap(A(i, k), A(j, k));
-            for (k=0; k<4; k++)
-                swap(B(i, k), B(j, k));
-            det = -det;
-        }
-        pivot = A(i, i);
-        det *= pivot;
-        for (k=i+1; k<4; k++)           /* only do elems to right of pivot */
-            A(i, k) /= pivot;
-        for (k=0; k<4; k++)
-            B(i, k) /= pivot;
-        /* we know that A(i, i) will be set to 1, so don't bother to do it */
-
-        for (j=i+1; j<4; j++) {         /* eliminate in rows below i */
-            t = A(j, i);                /* we're gonna zero this guy */
-            for (k=i+1; k<4; k++)       /* subtract scaled row i from row j */
-                A(j, k) -= A(i, k)*t;   /* (ignore k<=i, we know they're 0) */
-            for (k=0; k<4; k++)
-                B(j, k) -= B(i, k)*t;
-        }
-    }
-
-    /*---------- backward elimination ----------*/
-
-    for (i=4-1; i>0; i--) {             /* eliminate in column i, above diag */
-        for (j=0; j<i; j++) {           /* eliminate in rows above i */
-            t = A(j, i);                /* we're gonna zero this guy */
-            for (k=0; k<4; k++)         /* subtract scaled row i from row j */
-                B(j, k) -= B(i, k)*t;
-        }
-    }
-
-    return B;
-}
-
-
-Matrix3 nv::inverse(const Matrix3 & m) {
-
-    Matrix3 A = m;
-    Matrix3 B(identity);
-
-    int i, j, k;
-    float max, t, det, pivot;
-
-    det = 1.0;
-    for (i=0; i<3; i++) {               /* eliminate in column i, below diag */
-        max = -1.;
-        for (k=i; k<3; k++)             /* find pivot for column i */
-            if (fabs(A(k, i)) > max) {
-                max = fabs(A(k, i));
-                j = k;
-            }
-        if (max<=0.) return B;         /* if no nonzero pivot, PUNT */
-        if (j!=i) {                     /* swap rows i and j */
-            for (k=i; k<3; k++)
-                swap(A(i, k), A(j, k));
-            for (k=0; k<3; k++)
-                swap(B(i, k), B(j, k));
-            det = -det;
-        }
-        pivot = A(i, i);
-        det *= pivot;
-        for (k=i+1; k<3; k++)           /* only do elems to right of pivot */
-            A(i, k) /= pivot;
-        for (k=0; k<3; k++)
-            B(i, k) /= pivot;
-        /* we know that A(i, i) will be set to 1, so don't bother to do it */
-
-        for (j=i+1; j<3; j++) {         /* eliminate in rows below i */
-            t = A(j, i);                /* we're gonna zero this guy */
-            for (k=i+1; k<3; k++)       /* subtract scaled row i from row j */
-                A(j, k) -= A(i, k)*t;   /* (ignore k<=i, we know they're 0) */
-            for (k=0; k<3; k++)
-                B(j, k) -= B(i, k)*t;
-        }
-    }
-
-    /*---------- backward elimination ----------*/
-
-    for (i=3-1; i>0; i--) {             /* eliminate in column i, above diag */
-        for (j=0; j<i; j++) {           /* eliminate in rows above i */
-            t = A(j, i);                /* we're gonna zero this guy */
-            for (k=0; k<3; k++)         /* subtract scaled row i from row j */
-                B(j, k) -= B(i, k)*t;
-        }
-    }
-
-    return B;
-}
-
-
-
-
-
-#if 0 
-
-// Copyright (C) 1999-2004 Michael Garland.
-// 
-// Permission is hereby granted, free of charge, to any person obtaining a
-// copy of this software and associated documentation files (the
-// "Software"), to deal in the Software without restriction, including
-// without limitation the rights to use, copy, modify, merge, publish,
-// distribute, and/or sell copies of the Software, and to permit persons
-// to whom the Software is furnished to do so, provided that the above
-// copyright notice(s) and this permission notice appear in all copies of
-// the Software and that both the above copyright notice(s) and this
-// permission notice appear in supporting documentation.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
-// OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-// HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
-// INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
-// FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
-// NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
-// WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-// 
-// Except as contained in this notice, the name of a copyright holder
-// shall not be used in advertising or otherwise to promote the sale, use
-// or other dealings in this Software without prior written authorization
-// of the copyright holder.
-
-
-// Matrix inversion code for 4x4 matrices using Gaussian elimination
-// with partial pivoting.  This is a specialized version of a
-// procedure originally due to Paul Heckbert <ph@cs.cmu.edu>.
-//
-// Returns determinant of A, and B=inverse(A)
-// If matrix A is singular, returns 0 and leaves trash in B.
-//
-#define SWAP(a, b, t)   {t = a; a = b; b = t;}
-double invert(Mat4& B, const Mat4& m)
-{
-    Mat4 A = m;
-    int i, j, k;
-    double max, t, det, pivot;
-
-    /*---------- forward elimination ----------*/
-
-    for (i=0; i<4; i++)                 /* put identity matrix in B */
-        for (j=0; j<4; j++)
-            B(i, j) = (double)(i==j);
-
-    det = 1.0;
-    for (i=0; i<4; i++) {               /* eliminate in column i, below diag */
-        max = -1.;
-        for (k=i; k<4; k++)             /* find pivot for column i */
-            if (fabs(A(k, i)) > max) {
-                max = fabs(A(k, i));
-                j = k;
-            }
-        if (max<=0.) return 0.;         /* if no nonzero pivot, PUNT */
-        if (j!=i) {                     /* swap rows i and j */
-            for (k=i; k<4; k++)
-                SWAP(A(i, k), A(j, k), t);
-            for (k=0; k<4; k++)
-                SWAP(B(i, k), B(j, k), t);
-            det = -det;
-        }
-        pivot = A(i, i);
-        det *= pivot;
-        for (k=i+1; k<4; k++)           /* only do elems to right of pivot */
-            A(i, k) /= pivot;
-        for (k=0; k<4; k++)
-            B(i, k) /= pivot;
-        /* we know that A(i, i) will be set to 1, so don't bother to do it */
-
-        for (j=i+1; j<4; j++) {         /* eliminate in rows below i */
-            t = A(j, i);                /* we're gonna zero this guy */
-            for (k=i+1; k<4; k++)       /* subtract scaled row i from row j */
-                A(j, k) -= A(i, k)*t;   /* (ignore k<=i, we know they're 0) */
-            for (k=0; k<4; k++)
-                B(j, k) -= B(i, k)*t;
-        }
-    }
-
-    /*---------- backward elimination ----------*/
-
-    for (i=4-1; i>0; i--) {             /* eliminate in column i, above diag */
-        for (j=0; j<i; j++) {           /* eliminate in rows above i */
-            t = A(j, i);                /* we're gonna zero this guy */
-            for (k=0; k<4; k++)         /* subtract scaled row i from row j */
-                B(j, k) -= B(i, k)*t;
-        }
-    }
-
-    return det;
-}
-
-#endif // 0
-
-
-
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "Matrix.inl"
+#include "Vector.inl"
+
+#include "nvcore/Array.inl"
+
+#include <float.h>
+
+#if !NV_CC_MSVC && !NV_OS_ORBIS
+#include <alloca.h>
+#endif
+
+using namespace nv;
+
+
+// Given a matrix a[0..n-1][0..n-1], this routine replaces it by the LU decomposition of a rowwise
+// permutation of itself. a and n are input. a is output, arranged as in equation (2.3.14) of the text
+// these comments were taken from (presumably Numerical Recipes, section 2.3 -- TODO confirm);
+// indx[0..n-1] is an output vector that records the row permutation effected by the partial pivoting;
+// d is output as +1 or -1 depending on whether the number of row interchanges was even or odd.
+// Returns false if some row is entirely zero. Used together with lubksb to solve linear equations.
+static bool ludcmp(float **a, int n, int *indx, float *d)
+{
+    const float TINY = 1.0e-20f;
+
+    float * vv = (float*)alloca(sizeof(float) * n);    // vv stores the implicit scaling of each row.
+
+    *d = 1.0; // No row interchanges yet.
+    for (int i = 0; i < n; i++) { // Loop over rows to get the implicit scaling information.
+    
+        float big = 0.0;
+        for (int j = 0; j < n; j++) {
+            big = max(big, fabsf(a[i][j]));
+        }
+        if (big == 0) {
+            return false;   // Singular matrix: row i has no nonzero element.
+        }
+        
+        // big != 0 from here on.
+        vv[i] = 1.0f / big; // Save the scaling.
+    }
+
+    for (int j = 0; j < n; j++) {       // This is the loop over columns of Crout's method.
+        for (int i = 0; i < j; i++) {   // This is equation (2.3.12) except for i = j.
+            float sum = a[i][j];
+            for (int k = 0; k < i; k++) sum -= a[i][k]*a[k][j];
+            a[i][j] = sum;
+        }
+
+        int imax = -1;
+        float big = 0.0;                // Initialize for the search for largest pivot element.
+        for (int i = j; i < n; i++) {   // This is i = j of equation (2.3.12) and i = j+1 ... n-1
+            float sum = a[i][j];              // of equation (2.3.13).
+            for (int k = 0; k < j; k++) {
+                sum -= a[i][k]*a[k][j];
+            }
+            a[i][j]=sum;
+
+            float dum = vv[i]*fabs(sum);
+            if (dum >= big) {
+                // Is the figure of merit for the pivot better than the best so far?
+                big = dum;
+                imax = i;
+            }
+        }
+        nvDebugCheck(imax != -1);
+
+        if (j != imax) {                // Do we need to interchange rows?
+            for (int k = 0; k < n; k++) {   // Yes, do so...
+                swap(a[imax][k], a[j][k]);
+            }
+            *d = -(*d); // ...and change the parity of d.
+            vv[imax]=vv[j]; // Carry row j's scale factor to its new slot (vv[j] is not read again).
+        }
+
+        indx[j]=imax;
+        if (a[j][j] == 0.0) a[j][j] = TINY;
+        
+        // If the pivot element is zero the matrix is singular (at least to the precision of the
+        // algorithm). For some applications on singular matrices, it is desirable to substitute
+        // TINY for zero.
+        if (j != n-1) { // Now, finally, divide by the pivot element.
+            float dum = 1.0f / a[j][j];
+            for (int i = j+1; i < n; i++) a[i][j] *= dum;
+        }
+    } // Go back for the next column in the reduction.
+
+    return true;
+}
+
+
+// Solves the set of n linear equations Ax = b. Here a[0..n-1][0..n-1] is input, not as the matrix
+// A but rather as its LU decomposition, determined by the routine ludcmp. indx[0..n-1] is input
+// as the permutation vector returned by ludcmp. b[0..n-1] is input as the right-hand side vector
+// B, and returns with the solution vector X. a, n, and indx are not modified by this routine
+// and can be left in place for successive calls with different right-hand sides b. This routine takes
+// into account the possibility that b will begin with many zero elements, so it is efficient for use
+// in matrix inversion.
+static void lubksb(float **a, int n, int *indx, float b[])
+{
+    int ii = 0;                 // 0 means no nonzero element of b has been seen yet.
+    for (int i=0; i<n; i++) {   // Once ii is positive, ii-1 is the index of the first
+        int ip = indx[i];       // nonvanishing element of b. We now do the forward
+        float sum = b[ip];      // substitution, equation (2.3.6). The only new wrinkle
+        b[ip] = b[i];           // is to unscramble the permutation as we go.
+        if (ii != 0) {
+            for (int j = ii-1; j < i; j++) sum -= a[i][j]*b[j];
+        }
+        else if (sum != 0.0f) {
+            ii = i+1;             // A nonzero element was encountered, so from now on we
+        }
+        b[i] = sum;             // will have to do the sums in the loop above.
+    }
+    for (int i=n-1; i>=0; i--) {  // Now we do the backsubstitution, equation (2.3.7).
+        float sum = b[i];
+        for (int j = i+1; j < n; j++) {
+            sum -= a[i][j]*b[j];
+        }
+        b[i] = sum/a[i][i];     // Store a component of the solution vector X.
+    } // All done!
+}
+
+
+bool nv::solveLU(const Matrix & A, const Vector4 & b, Vector4 * x)
+{
+    nvDebugCheck(x != NULL);
+
+    float m[4][4];
+    float *a[4] = {m[0], m[1], m[2], m[3]};
+    int idx[4];
+    float d;
+
+    for (int y = 0; y < 4; y++) {
+        for (int x = 0; x < 4; x++) {
+            a[x][y] = A(x, y);
+        }
+    }
+
+    // Create LU decomposition.
+    if (!ludcmp(a, 4, idx, &d)) {
+        // Singular matrix.
+        return false;
+    }
+
+    // Init solution.
+    *x = b;
+
+    // Do back substitution.
+    lubksb(a, 4, idx, x->component);
+
+    return true;
+}
+
+// @@ Not tested. The solveLU return values are ignored, so the result is not meaningful when A is singular.
+Matrix nv::inverseLU(const Matrix & A)
+{
+    Vector4 Ai[4];
+
+    solveLU(A, Vector4(1, 0, 0, 0), &Ai[0]);
+    solveLU(A, Vector4(0, 1, 0, 0), &Ai[1]);
+    solveLU(A, Vector4(0, 0, 1, 0), &Ai[2]);
+    solveLU(A, Vector4(0, 0, 0, 1), &Ai[3]);
+
+    return Matrix(Ai[0], Ai[1], Ai[2], Ai[3]);
+}
+
+
+
+bool nv::solveLU(const Matrix3 & A, const Vector3 & b, Vector3 * x)
+{
+    nvDebugCheck(x != NULL);
+
+    float m[3][3];
+    float *a[3] = {m[0], m[1], m[2]};
+    int idx[3];
+    float d;
+
+    for (int y = 0; y < 3; y++) {
+        for (int x = 0; x < 3; x++) {
+            a[x][y] = A(x, y);
+        }
+    }
+
+    // Create LU decomposition.
+    if (!ludcmp(a, 3, idx, &d)) {
+        // Singular matrix.
+        return false;
+    }
+
+    // Init solution.
+    *x = b;
+
+    // Do back substitution.
+    lubksb(a, 3, idx, x->component);
+
+    return true;
+}
+
+bool nv::solveLU(const Matrix2 & A, const Vector2 & b, Vector2 * x)
+{
+    nvDebugCheck(x != NULL);
+    
+    float m[2][2];
+    float *a[2] = {m[0], m[1]};
+    int idx[2];
+    float d;
+    
+    for (int y = 0; y < 2; y++) {
+        for (int x = 0; x < 2; x++) {
+            a[x][y] = A(x, y);
+        }
+    }
+    
+    // Create LU decomposition.
+    if (!ludcmp(a, 2, idx, &d)) {
+        // Singular matrix.
+        return false;
+    }
+    
+    // Init solution.
+    *x = b;
+    
+    // Do back substitution.
+    lubksb(a, 2, idx, x->component);
+    
+    return true;
+}
+
+
+bool nv::solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x)
+{
+    nvDebugCheck(x != NULL);
+
+    *x = transform(inverseCramer(A), b);
+    
+    return true; // @@ Return false if determinant(A) == 0 !
+}
+
+bool nv::solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x)
+{
+    nvDebugCheck(x != NULL);
+
+    const float det = A.determinant();
+    if (equal(det, 0.0f)) {   // @@ Use input epsilon.
+        return false;
+    }
+
+    Matrix3 Ai = inverseCramer(A);
+
+    *x = transform(Ai, b);
+    
+    return true;
+}
+
+bool nv::solveCramer(const Matrix2 & A, const Vector2 & b, Vector2 * x)
+{
+    nvDebugCheck(x != NULL);
+    
+    const float det = A.determinant();
+    if (equal(det, 0.0f)) {   // @@ Use input epsilon.
+        return false;
+    }
+    
+    Matrix2 Ai = inverseCramer(A);
+    
+    *x = transform(Ai, b);
+    
+    return true;
+}
+
+
+
+// Inverse using Gaussian elimination with partial pivoting (from Jon's code). If no nonzero pivot is found, returns B as reduced so far.
+Matrix nv::inverse(const Matrix & m) {
+
+    Matrix A = m;
+    Matrix B(identity);
+
+    int i, j, k;
+    float max, t, det, pivot;
+
+    det = 1.0;
+    for (i=0; i<4; i++) {               /* eliminate in column i, below diag */
+        max = -1.;
+        for (k=i; k<4; k++)             /* find pivot for column i */
+            if (fabs(A(k, i)) > max) {
+                max = fabs(A(k, i));
+                j = k;
+            }
+        if (max<=0.) return B;         /* if no nonzero pivot, PUNT */
+        if (j!=i) {                     /* swap rows i and j */
+            for (k=i; k<4; k++)
+                swap(A(i, k), A(j, k));
+            for (k=0; k<4; k++)
+                swap(B(i, k), B(j, k));
+            det = -det;
+        }
+        pivot = A(i, i);
+        det *= pivot;
+        for (k=i+1; k<4; k++)           /* only do elems to right of pivot */
+            A(i, k) /= pivot;
+        for (k=0; k<4; k++)
+            B(i, k) /= pivot;
+        /* we know that A(i, i) will be set to 1, so don't bother to do it */
+
+        for (j=i+1; j<4; j++) {         /* eliminate in rows below i */
+            t = A(j, i);                /* we're gonna zero this guy */
+            for (k=i+1; k<4; k++)       /* subtract scaled row i from row j */
+                A(j, k) -= A(i, k)*t;   /* (ignore k<=i, we know they're 0) */
+            for (k=0; k<4; k++)
+                B(j, k) -= B(i, k)*t;
+        }
+    }
+
+    /*---------- backward elimination ----------*/
+
+    for (i=4-1; i>0; i--) {             /* eliminate in column i, above diag */
+        for (j=0; j<i; j++) {           /* eliminate in rows above i */
+            t = A(j, i);                /* we're gonna zero this guy */
+            for (k=0; k<4; k++)         /* subtract scaled row i from row j */
+                B(j, k) -= B(i, k)*t;
+        }
+    }
+
+    return B;
+}
+
+
+Matrix3 nv::inverse(const Matrix3 & m) {
+
+    Matrix3 A = m;
+    Matrix3 B(identity);
+
+    int i, j, k;
+    float max, t, det, pivot;
+
+    det = 1.0;
+    for (i=0; i<3; i++) {               /* eliminate in column i, below diag */
+        max = -1.;
+        for (k=i; k<3; k++)             /* find pivot for column i */
+            if (fabs(A(k, i)) > max) {
+                max = fabs(A(k, i));
+                j = k;
+            }
+        if (max<=0.) return B;         /* if no nonzero pivot, PUNT */
+        if (j!=i) {                     /* swap rows i and j */
+            for (k=i; k<3; k++)
+                swap(A(i, k), A(j, k));
+            for (k=0; k<3; k++)
+                swap(B(i, k), B(j, k));
+            det = -det;
+        }
+        pivot = A(i, i);
+        det *= pivot;
+        for (k=i+1; k<3; k++)           /* only do elems to right of pivot */
+            A(i, k) /= pivot;
+        for (k=0; k<3; k++)
+            B(i, k) /= pivot;
+        /* we know that A(i, i) will be set to 1, so don't bother to do it */
+
+        for (j=i+1; j<3; j++) {         /* eliminate in rows below i */
+            t = A(j, i);                /* we're gonna zero this guy */
+            for (k=i+1; k<3; k++)       /* subtract scaled row i from row j */
+                A(j, k) -= A(i, k)*t;   /* (ignore k<=i, we know they're 0) */
+            for (k=0; k<3; k++)
+                B(j, k) -= B(i, k)*t;
+        }
+    }
+
+    /*---------- backward elimination ----------*/
+
+    for (i=3-1; i>0; i--) {             /* eliminate in column i, above diag */
+        for (j=0; j<i; j++) {           /* eliminate in rows above i */
+            t = A(j, i);                /* we're gonna zero this guy */
+            for (k=0; k<3; k++)         /* subtract scaled row i from row j */
+                B(j, k) -= B(i, k)*t;
+        }
+    }
+
+    return B;
+}
+
+
+
+
+
+#if 0 
+
+// Copyright (C) 1999-2004 Michael Garland.
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, and/or sell copies of the Software, and to permit persons
+// to whom the Software is furnished to do so, provided that the above
+// copyright notice(s) and this permission notice appear in all copies of
+// the Software and that both the above copyright notice(s) and this
+// permission notice appear in supporting documentation.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+// OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+// HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
+// INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
+// FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+// NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
+// WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// 
+// Except as contained in this notice, the name of a copyright holder
+// shall not be used in advertising or otherwise to promote the sale, use
+// or other dealings in this Software without prior written authorization
+// of the copyright holder.
+
+
+// Matrix inversion code for 4x4 matrices using Gaussian elimination
+// with partial pivoting.  This is a specialized version of a
+// procedure originally due to Paul Heckbert <ph@cs.cmu.edu>.
+//
+// Returns determinant of A, and B=inverse(A)
+// If matrix A is singular, returns 0 and leaves trash in B.
+//
+#define SWAP(a, b, t)   {t = a; a = b; b = t;}
+double invert(Mat4& B, const Mat4& m)
+{
+    Mat4 A = m;
+    int i, j, k;
+    double max, t, det, pivot;
+
+    /*---------- forward elimination ----------*/
+
+    for (i=0; i<4; i++)                 /* put identity matrix in B */
+        for (j=0; j<4; j++)
+            B(i, j) = (double)(i==j);
+
+    det = 1.0;
+    for (i=0; i<4; i++) {               /* eliminate in column i, below diag */
+        max = -1.;
+        for (k=i; k<4; k++)             /* find pivot for column i */
+            if (fabs(A(k, i)) > max) {
+                max = fabs(A(k, i));
+                j = k;
+            }
+        if (max<=0.) return 0.;         /* if no nonzero pivot, PUNT */
+        if (j!=i) {                     /* swap rows i and j */
+            for (k=i; k<4; k++)
+                SWAP(A(i, k), A(j, k), t);
+            for (k=0; k<4; k++)
+                SWAP(B(i, k), B(j, k), t);
+            det = -det;
+        }
+        pivot = A(i, i);
+        det *= pivot;
+        for (k=i+1; k<4; k++)           /* only do elems to right of pivot */
+            A(i, k) /= pivot;
+        for (k=0; k<4; k++)
+            B(i, k) /= pivot;
+        /* we know that A(i, i) will be set to 1, so don't bother to do it */
+
+        for (j=i+1; j<4; j++) {         /* eliminate in rows below i */
+            t = A(j, i);                /* we're gonna zero this guy */
+            for (k=i+1; k<4; k++)       /* subtract scaled row i from row j */
+                A(j, k) -= A(i, k)*t;   /* (ignore k<=i, we know they're 0) */
+            for (k=0; k<4; k++)
+                B(j, k) -= B(i, k)*t;
+        }
+    }
+
+    /*---------- backward elimination ----------*/
+
+    for (i=4-1; i>0; i--) {             /* eliminate in column i, above diag */
+        for (j=0; j<i; j++) {           /* eliminate in rows above i */
+            t = A(j, i);                /* we're gonna zero this guy */
+            for (k=0; k<4; k++)         /* subtract scaled row i from row j */
+                B(j, k) -= B(i, k)*t;
+        }
+    }
+
+    return det;
+}
+
+#endif // 0
+
+
+
diff --git a/src/nvmath/Matrix.h b/src/nvmath/Matrix.h
index 2f14f0c..d1171a1 100644
--- a/src/nvmath/Matrix.h
+++ b/src/nvmath/Matrix.h
@@ -14,6 +14,46 @@ namespace nv
 {
     enum identity_t { identity };
 
+    // 2x2 matrix of floats. Element (row, col) is stored at m_data[col * 2 + row].
+    class NVMATH_CLASS Matrix2
+    {
+    public:
+        Matrix2(); // Leaves the elements uninitialized.
+        explicit Matrix2(float f); // Sets all four elements to f.
+        explicit Matrix2(identity_t); // Identity matrix.
+        Matrix2(const Matrix2 & m);
+        Matrix2(Vector2::Arg v0, Vector2::Arg v1); // v0 and v1 become columns 0 and 1.
+        Matrix2(float a, float b, float c, float d); // Storage order: (0,0), (1,0), (0,1), (1,1).
+        
+        float data(uint idx) const; // Raw access by storage index, 0..3.
+        float & data(uint idx);
+        float get(uint row, uint col) const; // Element access, row and col in 0..1.
+        float operator()(uint row, uint col) const;
+        float & operator()(uint row, uint col);
+        
+        Vector2 row(uint i) const;
+        Vector2 column(uint i) const;
+        
+        void operator*=(float s); // Scalar and element-wise in-place arithmetic.
+        void operator/=(float s);
+        void operator+=(const Matrix2 & m);
+        void operator-=(const Matrix2 & m);
+        
+        void scale(float s); // Multiplies every element by s.
+        void scale(Vector2::Arg s); // Multiplies column 0 by s.x and column 1 by s.y.
+        float determinant() const;
+        
+    private:
+        float m_data[4]; // Indexed as col * 2 + row by get() and operator().
+    };
+    
+    // Solve equation system using LU decomposition and back-substitution.
+    extern bool solveLU(const Matrix2 & m, const Vector2 & b, Vector2 * x);
+    
+    // Solve equation system using Cramer's inverse.
+    extern bool solveCramer(const Matrix2 & A, const Vector2 & b, Vector2 * x);
+    
+    
     // 3x3 matrix.
     class NVMATH_CLASS Matrix3
     {
@@ -52,6 +92,8 @@ namespace nv
     // Solve equation system using Cramer's inverse.
     extern bool solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x);
 
+    extern Matrix3 inverse(const Matrix3 & m);
+    
 
     // 4x4 matrix.
     class NVMATH_CLASS Matrix
@@ -106,7 +148,6 @@ namespace nv
 
     // Compute inverse using Gaussian elimination and partial pivoting.
     extern Matrix inverse(const Matrix & m);
-    extern Matrix3 inverse(const Matrix3 & m);
 
 } // nv namespace
 
diff --git a/src/nvmath/Matrix.inl b/src/nvmath/Matrix.inl
index 34fdc70..f742656 100644
--- a/src/nvmath/Matrix.inl
+++ b/src/nvmath/Matrix.inl
@@ -8,6 +8,199 @@
 
 namespace nv
 {
+    inline Matrix2::Matrix2() {} // Default constructor: m_data is left uninitialized.
+    
+    // Fill constructor: every element is set to f.
+    inline Matrix2::Matrix2(float f)
+    {
+        m_data[0] = f; m_data[1] = f;
+        m_data[2] = f; m_data[3] = f;
+    }
+    
+    // Identity constructor: ones on the diagonal, zeros elsewhere.
+    inline Matrix2::Matrix2(identity_t)
+    {
+        m_data[0] = 1.0f;   // element (0,0)
+        m_data[1] = 0.0f;   // element (1,0)
+        m_data[2] = 0.0f;   // element (0,1)
+        m_data[3] = 1.0f;   // element (1,1)
+    }
+    
+    // Copy constructor: duplicates the four elements of m.
+    inline Matrix2::Matrix2(const Matrix2 & m)
+    {
+        m_data[0] = m.m_data[0]; m_data[1] = m.m_data[1];
+        m_data[2] = m.m_data[2]; m_data[3] = m.m_data[3];
+    }
+    
+    inline Matrix2::Matrix2(Vector2::Arg v0, Vector2::Arg v1)
+    {
+        m_data[0] = v0.x; m_data[1] = v0.y; // v0 becomes column 0, i.e. what column(0) returns.
+        m_data[2] = v1.x; m_data[3] = v1.y; // v1 becomes column 1.
+    }
+    
+    inline Matrix2::Matrix2(float a, float b, float c, float d)
+    {
+        m_data[0] = a; m_data[1] = b; // a is element (0,0), b is element (1,0).
+        m_data[2] = c; m_data[3] = d; // c is element (0,1), d is element (1,1).
+    }
+    
+    inline float Matrix2::data(uint idx) const // Reads the element at storage index idx.
+    {
+        nvDebugCheck(idx < 4); // m_data holds exactly four floats.
+        return m_data[idx];
+    }
+    inline float & Matrix2::data(uint idx) // Writable reference to the element at storage index idx.
+    {
+        nvDebugCheck(idx < 4);
+        return m_data[idx];
+    }
+    inline float Matrix2::get(uint row, uint col) const // Reads element (row, col).
+    {
+        nvDebugCheck(row < 2 && col < 2);
+        return m_data[col * 2 + row]; // The two elements of a column are adjacent in m_data.
+    }
+    inline float Matrix2::operator()(uint row, uint col) const // Same lookup as get(row, col).
+    {
+        nvDebugCheck(row < 2 && col < 2);
+        return m_data[col * 2 + row];
+    }
+    inline float & Matrix2::operator()(uint row, uint col) // Writable reference to element (row, col).
+    {
+        nvDebugCheck(row < 2 && col < 2);
+        return m_data[col * 2 + row];
+    }
+    
+    inline Vector2 Matrix2::row(uint i) const // Returns row i as a vector.
+    {
+        nvDebugCheck(i < 2);
+        return Vector2(get(i, 0), get(i, 1)); // Elements (i,0) and (i,1).
+    }
+    inline Vector2 Matrix2::column(uint i) const // Returns column i as a vector.
+    {
+        nvDebugCheck(i < 2);
+        return Vector2(get(0, i), get(1, i)); // Elements (0,i) and (1,i).
+    }
+    
+    // Multiplies every element by the scalar s, in place.
+    inline void Matrix2::operator*=(float s)
+    {
+        m_data[0] *= s; m_data[1] *= s;
+        m_data[2] *= s; m_data[3] *= s;
+    }
+    
+    // Divides every element by s, using one reciprocal and four multiplies.
+    inline void Matrix2::operator/=(float s)
+    {
+        const float reciprocal = 1.0f / s;
+        m_data[0] *= reciprocal; m_data[1] *= reciprocal;
+        m_data[2] *= reciprocal; m_data[3] *= reciprocal;
+    }
+    
+    // Adds m to this matrix element by element.
+    inline void Matrix2::operator+=(const Matrix2 & m)
+    {
+        m_data[0] += m.m_data[0]; m_data[1] += m.m_data[1];
+        m_data[2] += m.m_data[2]; m_data[3] += m.m_data[3];
+    }
+    
+    // Subtracts m from this matrix element by element.
+    inline void Matrix2::operator-=(const Matrix2 & m)
+    {
+        m_data[0] -= m.m_data[0]; m_data[1] -= m.m_data[1];
+        m_data[2] -= m.m_data[2]; m_data[3] -= m.m_data[3];
+    }
+    
+    inline Matrix2 operator+(const Matrix2 & a, const Matrix2 & b)
+    {
+        Matrix2 sum(a);     // start from a copy of the left operand
+        sum += b;
+        return sum;
+    }
+    
+    inline Matrix2 operator-(const Matrix2 & a, const Matrix2 & b)
+    {
+        Matrix2 difference(a);  // start from a copy of the left operand
+        difference -= b;
+        return difference;
+    }
+    
+    inline Matrix2 operator*(const Matrix2 & a, float s)
+    {
+        Matrix2 scaled(a);  // copy the matrix, then scale the copy in place
+        scaled *= s;
+        return scaled;
+    }
+    
+    inline Matrix2 operator*(float s, const Matrix2 & a)
+    {
+        Matrix2 scaled(a);  // scalar-on-the-left form: copy the matrix, then scale the copy
+        scaled *= s;
+        return scaled;
+    }
+    
+    inline Matrix2 operator/(const Matrix2 & a, float s)
+    {
+        Matrix2 quotient(a);    // copy the matrix, then divide the copy by s in place
+        quotient /= s;
+        return quotient;
+    }
+    
+    // Matrix product a * b: element (r, c) is the dot product of row r of a
+    // with column c of b.
+    inline Matrix2 mul(const Matrix2 & a, const Matrix2 & b)
+    {
+        Matrix2 product;
+        for (int r = 0; r < 2; r++) {
+            for (int c = 0; c < 2; c++) {
+                product(r, c) = a(r,0) * b(0,c) + a(r,1) * b(1,c);
+            }
+        }
+        return product;
+    }
+    
+    inline Matrix2 operator*(const Matrix2 & a, const Matrix2 & b) // Matrix product.
+    {
+        return mul(a, b); // Operator form simply forwards to mul().
+    }
+    
+    // Transform the given 2d vector with the given matrix (computes m * p, with p as a column vector).
+    inline Vector2 transform(const Matrix2 & m, const Vector2 & p)
+    {
+        return Vector2(p.x * m(0,0) + p.y * m(0,1),  // row 0 of m dotted with p
+                       p.x * m(1,0) + p.y * m(1,1)); // row 1 of m dotted with p
+    }
+    
+    // Uniform scale: multiplies every element by s, in place.
+    inline void Matrix2::scale(float s)
+    {
+        m_data[0] *= s; m_data[1] *= s;
+        m_data[2] *= s; m_data[3] *= s;
+    }
+    
+    inline void Matrix2::scale(Vector2::Arg s)
+    {
+        m_data[0] *= s.x; m_data[1] *= s.x; // column 0 (elements (0,0) and (1,0)) scaled by s.x
+        m_data[2] *= s.y; m_data[3] *= s.y; // column 1 (elements (0,1) and (1,1)) scaled by s.y
+    }
+    
+    inline float Matrix2::determinant() const
+    {
+        return m_data[0] * m_data[3] - m_data[2] * m_data[1]; // m(0,0)*m(1,1) - m(0,1)*m(1,0)
+    }
+    
+    // Inverse using Cramer's rule: adjugate(m) / det(m). Returns the zero matrix when equal() reports det as zero.
+    inline Matrix2 inverseCramer(const Matrix2 & m)
+    {
+        const float det = m.determinant();
+        if (equal(det, 0.0f, 0.0f)) {
+            return Matrix2(0);
+        }
+        // Adjugate: swap the diagonal and negate the off-diagonal. Arguments are in storage order (0,0), (1,0), (0,1), (1,1).
+        return Matrix2(m(1,1), -m(1,0), -m(0,1), m(0,0)) * (1/det);
+    }
+    
+    
     inline Matrix3::Matrix3() {}
     
     inline Matrix3::Matrix3(float f)
@@ -16,7 +209,7 @@ namespace nv
             m_data[i] = f;
         }
     }
-
+    
     inline Matrix3::Matrix3(identity_t)
     {
         for(int i = 0; i < 3; i++) {
@@ -794,7 +987,7 @@ v1 = FXVector3.Cross(v3, v2);
 v1.Normalize();
 
 Matrix R = Matrix::Identity;
-R[0, 0] = v3.X;�// Not sure this is in the correct order...
+R[0, 0] = v3.X;	// Not sure this is in the correct order...
 R[1, 0] = v3.Y;
 R[2, 0] = v3.Z;
 R[0, 1] = v1.X;
diff --git a/src/nvmath/Plane.h b/src/nvmath/Plane.h
index 17e366a..dc468b2 100644
--- a/src/nvmath/Plane.h
+++ b/src/nvmath/Plane.h
@@ -7,10 +7,6 @@
 #include "nvmath.h"
 #include "Vector.h"
 
-#if NV_USE_ALTIVEC
-#undef vector
-#endif
-
 namespace nv
 {
     class Matrix;
@@ -29,6 +25,7 @@ namespace nv
 
         Vector3 vector() const;
         float offset() const;
+        Vector3 normal() const;
 
         void operator*=(float s);
 
diff --git a/src/nvmath/Plane.inl b/src/nvmath/Plane.inl
index 73bf712..2277e38 100644
--- a/src/nvmath/Plane.inl
+++ b/src/nvmath/Plane.inl
@@ -24,6 +24,7 @@ namespace nv
 
     inline Vector3 Plane::vector() const { return v.xyz(); }
     inline float Plane::offset() const { return v.w; }
+    inline Vector3 Plane::normal() const { return normalize(vector(), 0.0f); } // Normalized xyz part of the plane; 0.0f is passed as normalize()'s epsilon.
 
     // Normalize plane.
     inline Plane normalize(const Plane & plane, float epsilon = NV_EPSILON)
diff --git a/src/nvmath/Vector.inl b/src/nvmath/Vector.inl
index 80c8e6f..5807912 100644
--- a/src/nvmath/Vector.inl
+++ b/src/nvmath/Vector.inl
@@ -1,919 +1,919 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#pragma once
-#ifndef NV_MATH_VECTOR_INL
-#define NV_MATH_VECTOR_INL
-
-#include "Vector.h"
-#include "nvcore/Utils.h" // min, max
-#include "nvcore/Hash.h" // hash
-
-namespace nv
-{
-
-    // Helpers to convert vector types. Assume T has x,y members and 2 argument constructor.
-    //template <typename T> T to(Vector2::Arg v) { return T(v.x, v.y); }
-
-    // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor.
-    //template <typename T> T to(Vector3::Arg v) { return T(v.x, v.y, v.z); }
-
-    // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor.
-    //template <typename T> T to(Vector4::Arg v) { return T(v.x, v.y, v.z, v.w); }
-
-
-    // Vector2
-    inline Vector2::Vector2() {}
-    inline Vector2::Vector2(float f) : x(f), y(f) {}
-    inline Vector2::Vector2(float x, float y) : x(x), y(y) {}
-    inline Vector2::Vector2(Vector2::Arg v) : x(v.x), y(v.y) {}
-
-    inline const Vector2 & Vector2::operator=(Vector2::Arg v)
-    {
-        x = v.x;
-        y = v.y;
-        return *this;
-    }
-
-    inline const float * Vector2::ptr() const
-    {
-        return &x;
-    }
-
-    inline void Vector2::set(float x, float y)
-    {
-        this->x = x;
-        this->y = y;
-    }
-
-    inline Vector2 Vector2::operator-() const
-    {
-        return Vector2(-x, -y);
-    }
-
-    inline void Vector2::operator+=(Vector2::Arg v)
-    {
-        x += v.x;
-        y += v.y;
-    }
-
-    inline void Vector2::operator-=(Vector2::Arg v)
-    {
-        x -= v.x;
-        y -= v.y;
-    }
-
-    inline void Vector2::operator*=(float s)
-    {
-        x *= s;
-        y *= s;
-    }
-
-    inline void Vector2::operator*=(Vector2::Arg v)
-    {
-        x *= v.x;
-        y *= v.y;
-    }
-
-    inline bool operator==(Vector2::Arg a, Vector2::Arg b)
-    {
-        return a.x == b.x && a.y == b.y; 
-    }
-    inline bool operator!=(Vector2::Arg a, Vector2::Arg b)
-    {
-        return a.x != b.x || a.y != b.y; 
-    }
-
-
-    // Vector3
-    inline Vector3::Vector3() {}
-    inline Vector3::Vector3(float f) : x(f), y(f), z(f) {}
-    inline Vector3::Vector3(float x, float y, float z) : x(x), y(y), z(z) {}
-    inline Vector3::Vector3(Vector2::Arg v, float z) : x(v.x), y(v.y), z(z) {}
-    inline Vector3::Vector3(Vector3::Arg v) : x(v.x), y(v.y), z(v.z) {}
-
-    inline const Vector3 & Vector3::operator=(Vector3::Arg v)
-    {
-        x = v.x;
-        y = v.y;
-        z = v.z;
-        return *this;
-    }
-
-
-    inline Vector2 Vector3::xy() const
-    {
-        return Vector2(x, y);
-    }
-
-    inline const float * Vector3::ptr() const
-    {
-        return &x;
-    }
-
-    inline void Vector3::set(float x, float y, float z)
-    {
-        this->x = x;
-        this->y = y;
-        this->z = z;
-    }
-
-    inline Vector3 Vector3::operator-() const
-    {
-        return Vector3(-x, -y, -z);
-    }
-
-    inline void Vector3::operator+=(Vector3::Arg v)
-    {
-        x += v.x;
-        y += v.y;
-        z += v.z;
-    }
-
-    inline void Vector3::operator-=(Vector3::Arg v)
-    {
-        x -= v.x;
-        y -= v.y;
-        z -= v.z;
-    }
-
-    inline void Vector3::operator*=(float s)
-    {
-        x *= s;
-        y *= s;
-        z *= s;
-    }
-
-    inline void Vector3::operator/=(float s)
-    {
-        float is = 1.0f / s;
-        x *= is;
-        y *= is;
-        z *= is;
-    }
-
-    inline void Vector3::operator*=(Vector3::Arg v)
-    {
-        x *= v.x;
-        y *= v.y;
-        z *= v.z;
-    }
-
-    inline void Vector3::operator/=(Vector3::Arg v)
-    {
-        x /= v.x;
-        y /= v.y;
-        z /= v.z;
-    }
-
-    inline bool operator==(Vector3::Arg a, Vector3::Arg b)
-    {
-        return a.x == b.x && a.y == b.y && a.z == b.z; 
-    }
-    inline bool operator!=(Vector3::Arg a, Vector3::Arg b)
-    {
-        return a.x != b.x || a.y != b.y || a.z != b.z; 
-    }
-
-
-    // Vector4
-    inline Vector4::Vector4() {}
-    inline Vector4::Vector4(float f) : x(f), y(f), z(f), w(f) {}
-    inline Vector4::Vector4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
-    inline Vector4::Vector4(Vector2::Arg v, float z, float w) : x(v.x), y(v.y), z(z), w(w) {}
-    inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {}
-    inline Vector4::Vector4(Vector3::Arg v, float w) : x(v.x), y(v.y), z(v.z), w(w) {}
-    inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
-
-    inline const Vector4 & Vector4::operator=(const Vector4 & v)
-    {
-        x = v.x;
-        y = v.y;
-        z = v.z;
-        w = v.w;
-        return *this;
-    }
-
-    inline Vector2 Vector4::xy() const
-    {
-        return Vector2(x, y);
-    }
-
-    inline Vector2 Vector4::zw() const
-    {
-        return Vector2(z, w);
-    }
-
-    inline Vector3 Vector4::xyz() const
-    {
-        return Vector3(x, y, z);
-    }
-
-    inline const float * Vector4::ptr() const
-    {
-        return &x;
-    }
-
-    inline void Vector4::set(float x, float y, float z, float w)
-    {
-        this->x = x;
-        this->y = y;
-        this->z = z;
-        this->w = w;
-    }
-
-    inline Vector4 Vector4::operator-() const
-    {
-        return Vector4(-x, -y, -z, -w);
-    }
-
-    inline void Vector4::operator+=(Vector4::Arg v)
-    {
-        x += v.x;
-        y += v.y;
-        z += v.z;
-        w += v.w;
-    }
-
-    inline void Vector4::operator-=(Vector4::Arg v)
-    {
-        x -= v.x;
-        y -= v.y;
-        z -= v.z;
-        w -= v.w;
-    }
-
-    inline void Vector4::operator*=(float s)
-    {
-        x *= s;
-        y *= s;
-        z *= s;
-        w *= s;
-    }
-
-    inline void Vector4::operator/=(float s)
-    {
-        x /= s;
-        y /= s;
-        z /= s;
-        w /= s;
-    }
-
-    inline void Vector4::operator*=(Vector4::Arg v)
-    {
-        x *= v.x;
-        y *= v.y;
-        z *= v.z;
-        w *= v.w;
-    }
-
-    inline void Vector4::operator/=(Vector4::Arg v)
-    {
-        x /= v.x;
-        y /= v.y;
-        z /= v.z;
-        w /= v.w;
-    }
-
-    inline bool operator==(Vector4::Arg a, Vector4::Arg b)
-    {
-        return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; 
-    }
-    inline bool operator!=(Vector4::Arg a, Vector4::Arg b)
-    {
-        return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; 
-    }
-
-
-
-    // Functions
-
-
-    // Vector2
-
-    inline Vector2 add(Vector2::Arg a, Vector2::Arg b)
-    {
-        return Vector2(a.x + b.x, a.y + b.y);
-    }
-    inline Vector2 operator+(Vector2::Arg a, Vector2::Arg b)
-    {
-        return add(a, b);
-    }
-
-    inline Vector2 sub(Vector2::Arg a, Vector2::Arg b)
-    {
-        return Vector2(a.x - b.x, a.y - b.y);
-    }
-    inline Vector2 operator-(Vector2::Arg a, Vector2::Arg b)
-    {
-        return sub(a, b);
-    }
-
-    inline Vector2 scale(Vector2::Arg v, float s)
-    {
-        return Vector2(v.x * s, v.y * s);
-    }
-
-    inline Vector2 scale(Vector2::Arg v, Vector2::Arg s)
-    {
-        return Vector2(v.x * s.x, v.y * s.y);
-    }
-
-    inline Vector2 operator*(Vector2::Arg v, float s)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector2 operator*(Vector2::Arg v1, Vector2::Arg v2)
-    {
-        return Vector2(v1.x*v2.x, v1.y*v2.y);
-    }
-
-    inline Vector2 operator*(float s, Vector2::Arg v)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector2 operator/(Vector2::Arg v, float s)
-    {
-        return scale(v, 1.0f/s);
-    }
-
-    inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t)
-    {
-        const float s = 1.0f - t;
-        return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y);
-    }
-
-    inline float dot(Vector2::Arg a, Vector2::Arg b)
-    {
-        return a.x * b.x + a.y * b.y;
-    }
-
-    inline float lengthSquared(Vector2::Arg v)
-    {
-        return v.x * v.x + v.y * v.y;
-    }
-
-    inline float length(Vector2::Arg v)
-    {
-        return sqrtf(lengthSquared(v));
-    }
-
-    inline float distance(Vector2::Arg a, Vector2::Arg b)
-    {
-        return length(a - b);
-    }
-
-    inline float inverseLength(Vector2::Arg v)
-    {
-        return 1.0f / sqrtf(lengthSquared(v));
-    }
-
-    inline bool isNormalized(Vector2::Arg v, float epsilon = NV_NORMAL_EPSILON)
-    {
-        return equal(length(v), 1, epsilon);
-    }
-
-    inline Vector2 normalize(Vector2::Arg v, float epsilon = NV_EPSILON)
-    {
-        float l = length(v);
-        nvDebugCheck(!isZero(l, epsilon));
-        Vector2 n = scale(v, 1.0f / l);
-        nvDebugCheck(isNormalized(n));
-        return n;
-    }
-
-    inline Vector2 normalizeSafe(Vector2::Arg v, Vector2::Arg fallback, float epsilon = NV_EPSILON)
-    {
-        float l = length(v);
-        if (isZero(l, epsilon)) {
-            return fallback;
-        }
-        return scale(v, 1.0f / l);
-    }
-
-    // Safe, branchless normalization from Andy Firth. All error checking ommitted.
-    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
-    inline Vector2 normalizeFast(Vector2::Arg v)
-    {
-        const float very_small_float = 1.0e-037f;
-        float l = very_small_float + length(v);
-        return scale(v, 1.0f / l);
-    }
-
-    inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON)
-    {
-        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon);
-    }
-
-    inline Vector2 min(Vector2::Arg a, Vector2::Arg b)
-    {
-        return Vector2(min(a.x, b.x), min(a.y, b.y));
-    }
-
-    inline Vector2 max(Vector2::Arg a, Vector2::Arg b)
-    {
-        return Vector2(max(a.x, b.x), max(a.y, b.y));
-    }
-
-    inline Vector2 clamp(Vector2::Arg v, float min, float max)
-    {
-        return Vector2(clamp(v.x, min, max), clamp(v.y, min, max));
-    }
-
-    inline Vector2 saturate(Vector2::Arg v)
-    {
-        return Vector2(saturate(v.x), saturate(v.y));
-    }
-
-    inline bool isFinite(Vector2::Arg v)
-    {
-        return isFinite(v.x) && isFinite(v.y);
-    }
-
-    inline Vector2 validate(Vector2::Arg v, Vector2::Arg fallback = Vector2(0.0f))
-    {
-        if (!isFinite(v)) return fallback;
-        Vector2 vf = v;
-        nv::floatCleanup(vf.component, 2);
-        return vf;
-    }
-
-    // Note, this is the area scaled by 2!
-    inline float triangleArea(Vector2::Arg v0, Vector2::Arg v1)
-    {
-	    return (v0.x * v1.y - v0.y * v1.x); // * 0.5f;
-    }
-    inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
-    {
-        // IC: While it may be appealing to use the following expression:
-        //return (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y); // * 0.5f;
-
-        // That's actually a terrible idea. Small triangles far from the origin can end up producing fairly large floating point 
-        // numbers and the results becomes very unstable and dependent on the order of the factors.
-
-        // Instead, it's preferable to substract the vertices first, and multiply the resulting small values together. The result
-        // in this case is always much more accurate (as long as the triangle is small) and less dependent of the location of 
-        // the triangle.
-
-        //return ((a.x - c.x) * (b.y - c.y) - (a.y - c.y) * (b.x - c.x)); // * 0.5f;
-        return triangleArea(a-c, b-c);
-    }
-
-
-    template <>
-    inline uint hash(const Vector2 & v, uint h)
-    {
-        return sdbmFloatHash(v.component, 2, h);
-    }
-
-
-
-    // Vector3
-
-    inline Vector3 add(Vector3::Arg a, Vector3::Arg b)
-    {
-        return Vector3(a.x + b.x, a.y + b.y, a.z + b.z);
-    }
-    inline Vector3 add(Vector3::Arg a, float b)
-    {
-        return Vector3(a.x + b, a.y + b, a.z + b);
-    }
-    inline Vector3 operator+(Vector3::Arg a, Vector3::Arg b)
-    {
-        return add(a, b);
-    }
-    inline Vector3 operator+(Vector3::Arg a, float b)
-    {
-        return add(a, b);
-    }
-
-    inline Vector3 sub(Vector3::Arg a, Vector3::Arg b)
-    {
-        return Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
-    }
-    inline Vector3 sub(Vector3::Arg a, float b)
-    {
-        return Vector3(a.x - b, a.y - b, a.z - b);
-    }
-    inline Vector3 operator-(Vector3::Arg a, Vector3::Arg b)
-    {
-        return sub(a, b);
-    }
-    inline Vector3 operator-(Vector3::Arg a, float b)
-    {
-        return sub(a, b);
-    }
-
-    inline Vector3 cross(Vector3::Arg a, Vector3::Arg b)
-    {
-        return Vector3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
-    }
-
-    inline Vector3 scale(Vector3::Arg v, float s)
-    {
-        return Vector3(v.x * s, v.y * s, v.z * s);
-    }
-
-    inline Vector3 scale(Vector3::Arg v, Vector3::Arg s)
-    {
-        return Vector3(v.x * s.x, v.y * s.y, v.z * s.z);
-    }
-
-    inline Vector3 operator*(Vector3::Arg v, float s)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector3 operator*(float s, Vector3::Arg v)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector3 operator*(Vector3::Arg v, Vector3::Arg s)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector3 operator/(Vector3::Arg v, float s)
-    {
-        return scale(v, 1.0f/s);
-    }
-
-    /*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, float s)
-    {
-        return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s);
-    }*/
-
-    inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, float t)
-    {
-        const float s = 1.0f - t;
-        return Vector3(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z);
-    }
-
-    inline float dot(Vector3::Arg a, Vector3::Arg b)
-    {
-        return a.x * b.x + a.y * b.y + a.z * b.z;
-    }
-
-    inline float lengthSquared(Vector3::Arg v)
-    {
-        return v.x * v.x + v.y * v.y + v.z * v.z;
-    }
-
-    inline float length(Vector3::Arg v)
-    {
-        return sqrtf(lengthSquared(v));
-    }
-
-    inline float distance(Vector3::Arg a, Vector3::Arg b)
-    {
-        return length(a - b);
-    }
-
-    inline float distanceSquared(Vector3::Arg a, Vector3::Arg b)
-    {
-        return lengthSquared(a - b);
-    }
-
-    inline float inverseLength(Vector3::Arg v)
-    {
-        return 1.0f / sqrtf(lengthSquared(v));
-    }
-
-    inline bool isNormalized(Vector3::Arg v, float epsilon = NV_NORMAL_EPSILON)
-    {
-        return equal(length(v), 1, epsilon);
-    }
-
-    inline Vector3 normalize(Vector3::Arg v, float epsilon = NV_EPSILON)
-    {
-        float l = length(v);
-        nvDebugCheck(!isZero(l, epsilon));
-        Vector3 n = scale(v, 1.0f / l);
-        nvDebugCheck(isNormalized(n));
-        return n;
-    }
-
-    inline Vector3 normalizeSafe(Vector3::Arg v, Vector3::Arg fallback, float epsilon = NV_EPSILON)
-    {
-        float l = length(v);
-        if (isZero(l, epsilon)) {
-            return fallback;
-        }
-        return scale(v, 1.0f / l);
-    }
-
-    // Safe, branchless normalization from Andy Firth. All error checking ommitted.
-    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
-    inline Vector3 normalizeFast(Vector3::Arg v)
-    {
-        const float very_small_float = 1.0e-037f;
-        float l = very_small_float + length(v);
-        return scale(v, 1.0f / l);
-    }
-
-    inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON)
-    {
-        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon);
-    }
-
-    inline Vector3 min(Vector3::Arg a, Vector3::Arg b)
-    {
-        return Vector3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-    }
-
-    inline Vector3 max(Vector3::Arg a, Vector3::Arg b)
-    {
-        return Vector3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-    }
-
-    inline Vector3 clamp(Vector3::Arg v, float min, float max)
-    {
-        return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max));
-    }
-
-    inline Vector3 saturate(Vector3::Arg v)
-    {
-        return Vector3(saturate(v.x), saturate(v.y), saturate(v.z));
-    }
-
-    inline Vector3 floor(Vector3::Arg v)
-    {
-        return Vector3(floorf(v.x), floorf(v.y), floorf(v.z));
-    }
-
-    inline Vector3 ceil(Vector3::Arg v)
-    {
-        return Vector3(ceilf(v.x), ceilf(v.y), ceilf(v.z));
-    }
-
-    inline bool isFinite(Vector3::Arg v)
-    {
-        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z);
-    }
-
-    inline Vector3 validate(Vector3::Arg v, Vector3::Arg fallback = Vector3(0.0f))
-    {
-        if (!isFinite(v)) return fallback;
-        Vector3 vf = v;
-        nv::floatCleanup(vf.component, 3);
-        return vf;
-    }
-
-    inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n)
-    {
-	    return v - (2 * dot(v, n)) * n;
-    }
-
-    template <>
-    inline uint hash(const Vector3 & v, uint h)
-    {
-        return sdbmFloatHash(v.component, 3, h);
-    }
-
-
-    // Vector4
-
-    inline Vector4 add(Vector4::Arg a, Vector4::Arg b)
-    {
-        return Vector4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-    }
-    inline Vector4 operator+(Vector4::Arg a, Vector4::Arg b)
-    {
-        return add(a, b);
-    }
-
-    inline Vector4 sub(Vector4::Arg a, Vector4::Arg b)
-    {
-        return Vector4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-    }
-    inline Vector4 operator-(Vector4::Arg a, Vector4::Arg b)
-    {
-        return sub(a, b);
-    }
-
-    inline Vector4 scale(Vector4::Arg v, float s)
-    {
-        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
-    }
-
-    inline Vector4 scale(Vector4::Arg v, Vector4::Arg s)
-    {
-        return Vector4(v.x * s.x, v.y * s.y, v.z * s.z, v.w * s.w);
-    }
-
-    inline Vector4 operator*(Vector4::Arg v, float s)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector4 operator*(float s, Vector4::Arg v)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector4 operator*(Vector4::Arg v, Vector4::Arg s)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector4 operator/(Vector4::Arg v, float s)
-    {
-        return scale(v, 1.0f/s);
-    }
-
-    /*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
-    {
-        return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s);
-    }*/
-
-    inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t)
-    {
-        const float s = 1.0f - t;
-        return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w);
-    }
-
-    inline float dot(Vector4::Arg a, Vector4::Arg b)
-    {
-        return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
-    }
-
-    inline float lengthSquared(Vector4::Arg v)
-    {
-        return v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w;
-    }
-
-    inline float length(Vector4::Arg v)
-    {
-        return sqrtf(lengthSquared(v));
-    }
-
-    inline float inverseLength(Vector4::Arg v)
-    {
-        return 1.0f / sqrtf(lengthSquared(v));
-    }
-
-    inline bool isNormalized(Vector4::Arg v, float epsilon = NV_NORMAL_EPSILON)
-    {
-        return equal(length(v), 1, epsilon);
-    }
-
-    inline Vector4 normalize(Vector4::Arg v, float epsilon = NV_EPSILON)
-    {
-        float l = length(v);
-        nvDebugCheck(!isZero(l, epsilon));
-        Vector4 n = scale(v, 1.0f / l);
-        nvDebugCheck(isNormalized(n));
-        return n;
-    }
-
-    inline Vector4 normalizeSafe(Vector4::Arg v, Vector4::Arg fallback, float epsilon = NV_EPSILON)
-    {
-        float l = length(v);
-        if (isZero(l, epsilon)) {
-            return fallback;
-        }
-        return scale(v, 1.0f / l);
-    }
-
-    // Safe, branchless normalization from Andy Firth. All error checking ommitted.
-    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
-    inline Vector4 normalizeFast(Vector4::Arg v)
-    {
-        const float very_small_float = 1.0e-037f;
-        float l = very_small_float + length(v);
-        return scale(v, 1.0f / l);
-    }
-
-    inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON)
-    {
-        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon);
-    }
-
-    inline Vector4 min(Vector4::Arg a, Vector4::Arg b)
-    {
-        return Vector4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-    }
-
-    inline Vector4 max(Vector4::Arg a, Vector4::Arg b)
-    {
-        return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
-    }
-
-    inline Vector4 clamp(Vector4::Arg v, float min, float max)
-    {
-        return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max));
-    }
-
-    inline Vector4 saturate(Vector4::Arg v)
-    {
-        return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w));
-    }
-
-    inline bool isFinite(Vector4::Arg v)
-    {
-        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w);
-    }
-
-    inline Vector4 validate(Vector4::Arg v, Vector4::Arg fallback = Vector4(0.0f))
-    {
-        if (!isFinite(v)) return fallback;
-        Vector4 vf = v;
-        nv::floatCleanup(vf.component, 4);
-        return vf;
-    }
-
-    template <>
-    inline uint hash(const Vector4 & v, uint h)
-    {
-        return sdbmFloatHash(v.component, 4, h);
-    }
-
-
-#if NV_OS_IOS // LLVM is not happy with implicit conversion of immediate constants to float
-
-    //int:
-
-    inline Vector2 scale(Vector2::Arg v, int s)
-    {
-        return Vector2(v.x * s, v.y * s);
-    }
-
-    inline Vector2 operator*(Vector2::Arg v, int s)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector2 operator*(int s, Vector2::Arg v)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector2 operator/(Vector2::Arg v, int s)
-    {
-        return scale(v, 1.0f/s);
-    }
-
-    inline Vector3 scale(Vector3::Arg v, int s)
-    {
-        return Vector3(v.x * s, v.y * s, v.z * s);
-    }
-
-    inline Vector3 operator*(Vector3::Arg v, int s)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector3 operator*(int s, Vector3::Arg v)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector3 operator/(Vector3::Arg v, int s)
-    {
-        return scale(v, 1.0f/s);
-    }
-
-    inline Vector4 scale(Vector4::Arg v, int s)
-    {
-        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
-    }
-
-    inline Vector4 operator*(Vector4::Arg v, int s)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector4 operator*(int s, Vector4::Arg v)
-    {
-        return scale(v, s);
-    }
-
-    inline Vector4 operator/(Vector4::Arg v, int s)
-    {
-        return scale(v, 1.0f/s);
-    }
-
-    //double:
-
-    inline Vector3 operator*(Vector3::Arg v, double s)
-    {
-        return scale(v, (float)s);
-    }
-
-    inline Vector3 operator*(double s, Vector3::Arg v)
-    {
-        return scale(v, (float)s);
-    }
-
-    inline Vector3 operator/(Vector3::Arg v, double s)
-    {
-        return scale(v, 1.f/((float)s));
-    }    
-        
-#endif //NV_OS_IOS
-
-} // nv namespace
-
-#endif // NV_MATH_VECTOR_INL
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_VECTOR_INL
+#define NV_MATH_VECTOR_INL
+
+#include "Vector.h"
+#include "nvcore/Utils.h" // min, max
+#include "nvcore/Hash.h" // hash
+
+namespace nv
+{
+
+    // Helpers to convert vector types. Assume T has x,y members and 2 argument constructor.
+    //template <typename T> T to(Vector2::Arg v) { return T(v.x, v.y); }
+
+    // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor.
+    //template <typename T> T to(Vector3::Arg v) { return T(v.x, v.y, v.z); }
+
+    // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor.
+    //template <typename T> T to(Vector4::Arg v) { return T(v.x, v.y, v.z, v.w); }
+
+
+    // Vector2
+    inline Vector2::Vector2() {}
+    inline Vector2::Vector2(float f) : x(f), y(f) {}
+    inline Vector2::Vector2(float x, float y) : x(x), y(y) {}
+    inline Vector2::Vector2(Vector2::Arg v) : x(v.x), y(v.y) {}
+
+    inline const Vector2 & Vector2::operator=(Vector2::Arg v)
+    {
+        x = v.x;
+        y = v.y;
+        return *this;
+    }
+
+    inline const float * Vector2::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector2::set(float x, float y)
+    {
+        this->x = x;
+        this->y = y;
+    }
+
+    inline Vector2 Vector2::operator-() const
+    {
+        return Vector2(-x, -y);
+    }
+
+    inline void Vector2::operator+=(Vector2::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+    }
+
+    inline void Vector2::operator-=(Vector2::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+    }
+
+    inline void Vector2::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+    }
+
+    inline void Vector2::operator*=(Vector2::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+    }
+
+    inline bool operator==(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x == b.x && a.y == b.y; 
+    }
+    inline bool operator!=(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x != b.x || a.y != b.y; 
+    }
+
+
+    // Vector3
+    inline Vector3::Vector3() {}
+    inline Vector3::Vector3(float f) : x(f), y(f), z(f) {}
+    inline Vector3::Vector3(float x, float y, float z) : x(x), y(y), z(z) {}
+    inline Vector3::Vector3(Vector2::Arg v, float z) : x(v.x), y(v.y), z(z) {}
+    inline Vector3::Vector3(Vector3::Arg v) : x(v.x), y(v.y), z(v.z) {}
+
+    inline const Vector3 & Vector3::operator=(Vector3::Arg v)
+    {
+        x = v.x;
+        y = v.y;
+        z = v.z;
+        return *this;
+    }
+
+
+    inline Vector2 Vector3::xy() const
+    {
+        return Vector2(x, y);
+    }
+
+    inline const float * Vector3::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector3::set(float x, float y, float z)
+    {
+        this->x = x;
+        this->y = y;
+        this->z = z;
+    }
+
+    inline Vector3 Vector3::operator-() const
+    {
+        return Vector3(-x, -y, -z);
+    }
+
+    inline void Vector3::operator+=(Vector3::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+        z += v.z;
+    }
+
+    inline void Vector3::operator-=(Vector3::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+        z -= v.z;
+    }
+
+    inline void Vector3::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+        z *= s;
+    }
+
+    inline void Vector3::operator/=(float s)
+    {
+        float is = 1.0f / s;
+        x *= is;
+        y *= is;
+        z *= is;
+    }
+
+    inline void Vector3::operator*=(Vector3::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+        z *= v.z;
+    }
+
+    inline void Vector3::operator/=(Vector3::Arg v)
+    {
+        x /= v.x;
+        y /= v.y;
+        z /= v.z;
+    }
+
+    inline bool operator==(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x == b.x && a.y == b.y && a.z == b.z; 
+    }
+    inline bool operator!=(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x != b.x || a.y != b.y || a.z != b.z; 
+    }
+
+
+    // Vector4
+    inline Vector4::Vector4() {}
+    inline Vector4::Vector4(float f) : x(f), y(f), z(f), w(f) {}
+    inline Vector4::Vector4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
+    inline Vector4::Vector4(Vector2::Arg v, float z, float w) : x(v.x), y(v.y), z(z), w(w) {}
+    inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {}
+    inline Vector4::Vector4(Vector3::Arg v, float w) : x(v.x), y(v.y), z(v.z), w(w) {}
+    inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
+
+    inline const Vector4 & Vector4::operator=(const Vector4 & v)
+    {
+        x = v.x;
+        y = v.y;
+        z = v.z;
+        w = v.w;
+        return *this;
+    }
+
+    inline Vector2 Vector4::xy() const
+    {
+        return Vector2(x, y);
+    }
+
+    inline Vector2 Vector4::zw() const
+    {
+        return Vector2(z, w);
+    }
+
+    inline Vector3 Vector4::xyz() const
+    {
+        return Vector3(x, y, z);
+    }
+
+    inline const float * Vector4::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector4::set(float x, float y, float z, float w)
+    {
+        this->x = x;
+        this->y = y;
+        this->z = z;
+        this->w = w;
+    }
+
+    inline Vector4 Vector4::operator-() const
+    {
+        return Vector4(-x, -y, -z, -w);
+    }
+
+    inline void Vector4::operator+=(Vector4::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+        z += v.z;
+        w += v.w;
+    }
+
+    inline void Vector4::operator-=(Vector4::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+        z -= v.z;
+        w -= v.w;
+    }
+
+    inline void Vector4::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+        z *= s;
+        w *= s;
+    }
+
+    inline void Vector4::operator/=(float s)
+    {
+        x /= s;
+        y /= s;
+        z /= s;
+        w /= s;
+    }
+
+    inline void Vector4::operator*=(Vector4::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+        z *= v.z;
+        w *= v.w;
+    }
+
+    inline void Vector4::operator/=(Vector4::Arg v)
+    {
+        x /= v.x;
+        y /= v.y;
+        z /= v.z;
+        w /= v.w;
+    }
+
+    inline bool operator==(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; 
+    }
+    inline bool operator!=(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; 
+    }
+
+
+
+    // Functions
+
+
+    // Vector2
+
+    inline Vector2 add(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(a.x + b.x, a.y + b.y);
+    }
+    inline Vector2 operator+(Vector2::Arg a, Vector2::Arg b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector2 sub(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(a.x - b.x, a.y - b.y);
+    }
+    inline Vector2 operator-(Vector2::Arg a, Vector2::Arg b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector2 scale(Vector2::Arg v, float s)
+    {
+        return Vector2(v.x * s, v.y * s);
+    }
+
+    inline Vector2 scale(Vector2::Arg v, Vector2::Arg s)
+    {
+        return Vector2(v.x * s.x, v.y * s.y);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v1, Vector2::Arg v2)
+    {
+        return Vector2(v1.x*v2.x, v1.y*v2.y);
+    }
+
+    inline Vector2 operator*(float s, Vector2::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator/(Vector2::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y);
+    }
+
+    inline float dot(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x * b.x + a.y * b.y;
+    }
+
+    inline float lengthSquared(Vector2::Arg v)
+    {
+        return v.x * v.x + v.y * v.y;
+    }
+
+    inline float length(Vector2::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float distance(Vector2::Arg a, Vector2::Arg b)
+    {
+        return length(a - b);
+    }
+
+    inline float inverseLength(Vector2::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector2::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector2 normalize(Vector2::Arg v, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        nvDebugCheck(!isZero(l, epsilon));
+        Vector2 n = scale(v, 1.0f / l);
+        nvDebugCheck(isNormalized(n));
+        return n;
+    }
+
+    inline Vector2 normalizeSafe(Vector2::Arg v, Vector2::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        if (isZero(l, epsilon)) {
+            return fallback;
+        }
+        return scale(v, 1.0f / l);
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector2 normalizeFast(Vector2::Arg v)
+    {
+        const float very_small_float = 1.0e-037f;
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
+    inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon);
+    }
+
+    inline Vector2 min(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(min(a.x, b.x), min(a.y, b.y));
+    }
+
+    inline Vector2 max(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(max(a.x, b.x), max(a.y, b.y));
+    }
+
+    inline Vector2 clamp(Vector2::Arg v, float min, float max)
+    {
+        return Vector2(clamp(v.x, min, max), clamp(v.y, min, max));
+    }
+
+    inline Vector2 saturate(Vector2::Arg v)
+    {
+        return Vector2(saturate(v.x), saturate(v.y));
+    }
+
+    inline bool isFinite(Vector2::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y);
+    }
+
+    inline Vector2 validate(Vector2::Arg v, Vector2::Arg fallback = Vector2(0.0f))
+    {
+        if (!isFinite(v)) return fallback;
+        Vector2 vf = v;
+        nv::floatCleanup(vf.component, 2);
+        return vf;
+    }
+
+    // Note, this is the area scaled by 2!
+    inline float triangleArea(Vector2::Arg v0, Vector2::Arg v1)
+    {
+	    return (v0.x * v1.y - v0.y * v1.x); // * 0.5f;
+    }
+    inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
+    {
+        // IC: While it may be appealing to use the following expression:
+        //return (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y); // * 0.5f;
+
+        // That's actually a terrible idea. Small triangles far from the origin can end up producing fairly large floating point 
+        // numbers and the results become very unstable and dependent on the order of the factors.
+
+        // Instead, it's preferable to subtract the vertices first, and multiply the resulting small values together. The result
+        // in this case is always much more accurate (as long as the triangle is small) and less dependent on the location of 
+        // the triangle.
+
+        //return ((a.x - c.x) * (b.y - c.y) - (a.y - c.y) * (b.x - c.x)); // * 0.5f;
+        return triangleArea(a-c, b-c);
+    }
+
+
+    template <>
+    inline uint hash(const Vector2 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 2, h);
+    }
+
+
+
+    // Vector3
+
+    inline Vector3 add(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+    inline Vector3 add(Vector3::Arg a, float b)
+    {
+        return Vector3(a.x + b, a.y + b, a.z + b);
+    }
+    inline Vector3 operator+(Vector3::Arg a, Vector3::Arg b)
+    {
+        return add(a, b);
+    }
+    inline Vector3 operator+(Vector3::Arg a, float b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector3 sub(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
+    }
+    inline Vector3 sub(Vector3::Arg a, float b)
+    {
+        return Vector3(a.x - b, a.y - b, a.z - b);
+    }
+    inline Vector3 operator-(Vector3::Arg a, Vector3::Arg b)
+    {
+        return sub(a, b);
+    }
+    inline Vector3 operator-(Vector3::Arg a, float b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector3 cross(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, float s)
+    {
+        return Vector3(v.x * s, v.y * s, v.z * s);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, Vector3::Arg s)
+    {
+        return Vector3(v.x * s.x, v.y * s.y, v.z * s.z);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(float s, Vector3::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, Vector3::Arg s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    /*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, float s)
+    {
+        return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s);
+    }*/
+
+    inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector3(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z);
+    }
+
+    inline float dot(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z;
+    }
+
+    inline float lengthSquared(Vector3::Arg v)
+    {
+        return v.x * v.x + v.y * v.y + v.z * v.z;
+    }
+
+    inline float length(Vector3::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float distance(Vector3::Arg a, Vector3::Arg b)
+    {
+        return length(a - b);
+    }
+
+    inline float distanceSquared(Vector3::Arg a, Vector3::Arg b)
+    {
+        return lengthSquared(a - b);
+    }
+
+    inline float inverseLength(Vector3::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector3::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector3 normalize(Vector3::Arg v, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        nvDebugCheck(!isZero(l, epsilon));
+        Vector3 n = scale(v, 1.0f / l);
+        nvDebugCheck(isNormalized(n));
+        return n;
+    }
+
+    inline Vector3 normalizeSafe(Vector3::Arg v, Vector3::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        if (isZero(l, epsilon)) {
+            return fallback;
+        }
+        return scale(v, 1.0f / l);
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector3 normalizeFast(Vector3::Arg v)
+    {
+        const float very_small_float = 1.0e-037f;
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
+    inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon);
+    }
+
+    inline Vector3 min(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+    }
+
+    inline Vector3 max(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+    }
+
+    inline Vector3 clamp(Vector3::Arg v, float min, float max)
+    {
+        return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max));
+    }
+
+    inline Vector3 saturate(Vector3::Arg v)
+    {
+        return Vector3(saturate(v.x), saturate(v.y), saturate(v.z));
+    }
+
+    inline Vector3 floor(Vector3::Arg v)
+    {
+        return Vector3(floorf(v.x), floorf(v.y), floorf(v.z));
+    }
+
+    inline Vector3 ceil(Vector3::Arg v)
+    {
+        return Vector3(ceilf(v.x), ceilf(v.y), ceilf(v.z));
+    }
+
+    inline bool isFinite(Vector3::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z);
+    }
+
+    inline Vector3 validate(Vector3::Arg v, Vector3::Arg fallback = Vector3(0.0f))
+    {
+        if (!isFinite(v)) return fallback;
+        Vector3 vf = v;
+        nv::floatCleanup(vf.component, 3);
+        return vf;
+    }
+
+    inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n)
+    {
+	    return v - (2 * dot(v, n)) * n;
+    }
+
+    template <>
+    inline uint hash(const Vector3 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 3, h);
+    }
+
+
+    // Vector4
+
+    inline Vector4 add(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+    }
+    inline Vector4 operator+(Vector4::Arg a, Vector4::Arg b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector4 sub(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+    }
+    inline Vector4 operator-(Vector4::Arg a, Vector4::Arg b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, float s)
+    {
+        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, Vector4::Arg s)
+    {
+        return Vector4(v.x * s.x, v.y * s.y, v.z * s.z, v.w * s.w);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(float s, Vector4::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, Vector4::Arg s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator/(Vector4::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    /*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
+    {
+        return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s);
+    }*/
+
+    inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w);
+    }
+
+    inline float dot(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+    }
+
+    inline float lengthSquared(Vector4::Arg v)
+    {
+        return v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w;
+    }
+
+    inline float length(Vector4::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float inverseLength(Vector4::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector4::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector4 normalize(Vector4::Arg v, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        nvDebugCheck(!isZero(l, epsilon));
+        Vector4 n = scale(v, 1.0f / l);
+        nvDebugCheck(isNormalized(n));
+        return n;
+    }
+
+    inline Vector4 normalizeSafe(Vector4::Arg v, Vector4::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        if (isZero(l, epsilon)) {
+            return fallback;
+        }
+        return scale(v, 1.0f / l);
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector4 normalizeFast(Vector4::Arg v)
+    {
+        const float very_small_float = 1.0e-037f;
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
+    inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon);
+    }
+
+    inline Vector4 min(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+    }
+
+    inline Vector4 max(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+    }
+
+    inline Vector4 clamp(Vector4::Arg v, float min, float max)
+    {
+        return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max));
+    }
+
+    inline Vector4 saturate(Vector4::Arg v)
+    {
+        return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w));
+    }
+
+    inline bool isFinite(Vector4::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w);
+    }
+
+    inline Vector4 validate(Vector4::Arg v, Vector4::Arg fallback = Vector4(0.0f))
+    {
+        if (!isFinite(v)) return fallback;
+        Vector4 vf = v;
+        nv::floatCleanup(vf.component, 4);
+        return vf;
+    }
+
+    template <>
+    inline uint hash(const Vector4 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 4, h);
+    }
+
+
+#if NV_OS_IOS // LLVM is not happy with implicit conversion of immediate constants to float
+
+    //int:
+
+    inline Vector2 scale(Vector2::Arg v, int s)
+    {
+        return Vector2(v.x * s, v.y * s);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator*(int s, Vector2::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator/(Vector2::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, int s)
+    {
+        return Vector3(v.x * s, v.y * s, v.z * s);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(int s, Vector3::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, int s)
+    {
+        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(int s, Vector4::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator/(Vector4::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    //double:
+
+    inline Vector3 operator*(Vector3::Arg v, double s)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator*(double s, Vector3::Arg v)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, double s)
+    {
+        return scale(v, 1.f/((float)s));
+    }    
+        
+#endif //NV_OS_IOS
+
+} // nv namespace
+
+#endif // NV_MATH_VECTOR_INL
diff --git a/src/nvmath/nvmath.h b/src/nvmath/nvmath.h
index 6016f28..d15a506 100644
--- a/src/nvmath/nvmath.h
+++ b/src/nvmath/nvmath.h
@@ -194,15 +194,20 @@ namespace nv
 #endif
     }
 
-    inline uint log2(uint i)
+    inline uint log2(uint32 i)
     {
-        uint value = 0;
-        while( i >>= 1 ) {
-            value++;
-        }
+        uint32 value = 0;
+        while( i >>= 1 ) value++;
         return value;
     }
 
+    inline uint log2(uint64 i)
+    {
+        uint64 value = 0;
+        while (i >>= 1) value++;
+        return U32(value);
+    }
+
     inline float lerp(float f0, float f1, float t)
     {
         const float s = 1.0f - t;
diff --git a/src/nvthread/Atomic.h b/src/nvthread/Atomic.h
index b8eaedb..970fec7 100644
--- a/src/nvthread/Atomic.h
+++ b/src/nvthread/Atomic.h
@@ -106,6 +106,11 @@ namespace nv {
 #error "Atomics not implemented."
 #endif
     }
+    
+    inline void storeRelease(volatile float * ptr, float value)
+    {
+        storeRelease((uint32 *)ptr, *(uint32 *)&value);
+    }
 
 
     template <typename T>
diff --git a/src/nvthread/Event.cpp b/src/nvthread/Event.cpp
index 92903a8..b6d8c15 100644
--- a/src/nvthread/Event.cpp
+++ b/src/nvthread/Event.cpp
@@ -17,7 +17,7 @@ struct Event::Private {
 };
 
 Event::Event() : m(new Private) {
-    m->handle = CreateEvent(NULL, FALSE, FALSE, NULL);
+    m->handle = CreateEvent(/*lpEventAttributes=*/NULL, /*bManualReset=*/FALSE, /*bInitialState=*/FALSE, /*lpName=*/NULL);
 }
 
 Event::~Event() {
diff --git a/src/nvthread/Mutex.cpp b/src/nvthread/Mutex.cpp
index 9d4aa66..fe7fe68 100644
--- a/src/nvthread/Mutex.cpp
+++ b/src/nvthread/Mutex.cpp
@@ -13,7 +13,9 @@
 
 #endif // NV_OS
 
-#if NV_USE_TELEMETRY
+#if NV_USE_TELEMETRY3
+#include <rad_tm.h>
+#elif NV_USE_TELEMETRY
 #include <telemetry.h>
 extern HTELEMETRY tmContext;
 #endif
@@ -45,14 +47,19 @@ Mutex::~Mutex ()
 
 void Mutex::lock()
 {
-#if NV_USE_TELEMETRY
+#if NV_USE_TELEMETRY3
+    tmStartWaitForLock(0, 0, this, m->name);
+#elif NV_USE_TELEMETRY
     TmU64 matcher;
     tmTryLockEx(tmContext, &matcher, 100/*0.1 ms*/, __FILE__, __LINE__, this, "blocked");
 #endif
     
     EnterCriticalSection(&m->mutex);
 
-#if NV_USE_TELEMETRY
+#if NV_USE_TELEMETRY3
+    tmEndWaitForLock(0);
+    tmAcquiredLock(0, 0, this, m->name);
+#elif NV_USE_TELEMETRY
     tmEndTryLockEx(tmContext, matcher, __FILE__, __LINE__, this, TMLR_SUCCESS);
     tmSetLockState(tmContext, this, TMLS_LOCKED, "acquired");
 #endif
@@ -60,7 +67,18 @@ void Mutex::lock()
 
 bool Mutex::tryLock()
 {
-#if NV_USE_TELEMETRY
+#if NV_USE_TELEMETRY3
+    tmStartWaitForLock(0, 0, this, m->name);
+    if (TryEnterCriticalSection(&m->mutex) != 0) {
+        tmEndWaitForLock(0);
+        tmAcquiredLock(0, 0, this, m->name);
+        return true;
+    }
+    else {
+        tmEndWaitForLock(0);
+        return false;
+    }
+#elif NV_USE_TELEMETRY
     TmU64 matcher;
     tmTryLockEx(tmContext, &matcher, 100/*0.1 ms*/, __FILE__, __LINE__, this, "blocked");
     if (TryEnterCriticalSection(&m->mutex) != 0) {
@@ -79,7 +97,9 @@ bool Mutex::tryLock()
 
 void Mutex::unlock()
 {
-#if NV_USE_TELEMETRY
+#if NV_USE_TELEMETRY3
+    tmReleasedLock(0, this);
+#elif NV_USE_TELEMETRY
     tmSetLockState(tmContext, this, TMLS_RELEASED, "released");
 #endif
 
@@ -90,13 +110,17 @@ void Mutex::unlock()
 
 struct Mutex::Private {
     pthread_mutex_t mutex;
+    pthread_mutexattr_t attr;
     const char * name;
 };
 
 
 Mutex::Mutex (const char * name) : m(new Private)
 {
-    int result = pthread_mutex_init(&m->mutex, NULL);
+    pthread_mutexattr_init(&m->attr);
+    pthread_mutexattr_settype(&m->attr, PTHREAD_MUTEX_RECURSIVE);
+    int result = pthread_mutex_init(&m->mutex, &m->attr);
+    //m->mutex = PTHREAD_MUTEX_INITIALIZER;
     m->name = name;
     nvDebugCheck(result == 0);
 }
@@ -105,6 +129,8 @@ Mutex::~Mutex ()
 {
     int result = pthread_mutex_destroy(&m->mutex);
     nvDebugCheck(result == 0);
+    result = pthread_mutexattr_destroy(&m->attr);
+    nvDebugCheck(result == 0);
 }
 
 void Mutex::lock()
diff --git a/src/nvthread/Thread.cpp b/src/nvthread/Thread.cpp
index 869d7e1..b72ba5a 100644
--- a/src/nvthread/Thread.cpp
+++ b/src/nvthread/Thread.cpp
@@ -9,7 +9,11 @@
     #include <unistd.h> // usleep
 #endif
 
-#if NV_USE_TELEMETRY
+#include "nvcore/StrLib.h"
+
+#if NV_USE_TELEMETRY3
+#include <rad_tm.h>
+#elif NV_USE_TELEMETRY
 #include <telemetry.h>
 extern HTELEMETRY tmContext;
 #endif
@@ -118,16 +122,12 @@ void Thread::start(ThreadFunc * func, void * arg)
     nvDebugCheck(p->thread != NULL);
     if (p->name != NULL) {
         setThreadName(threadId, p->name);
-    #if NV_USE_TELEMETRY
+    #if NV_USE_TELEMETRY3
+        tmThreadName(0, threadId, p->name);
+    #elif NV_USE_TELEMETRY
         tmThreadName(tmContext, threadId, p->name);
     #endif
     }
-#elif NV_OS_ORBIS
-    int ret = scePthreadCreate(&p->thread, NULL, threadFunc, p.ptr(), p->name ? p->name : "nv::Thread");
-    nvDebugCheck(ret == 0);
-	// use any non-system core
-	scePthreadSetaffinity(p->thread, 0x3F);
-    scePthreadSetprio(p->thread, (SCE_KERNEL_PRIO_FIFO_DEFAULT + SCE_KERNEL_PRIO_FIFO_HIGHEST) / 2);
 #elif NV_OS_USE_PTHREAD
     int result = pthread_create(&p->thread, NULL, threadFunc, p.ptr());
     nvDebugCheck(result == 0);
diff --git a/src/nvthread/ThreadPool.cpp b/src/nvthread/ThreadPool.cpp
index 8373870..40f39bb 100644
--- a/src/nvthread/ThreadPool.cpp
+++ b/src/nvthread/ThreadPool.cpp
@@ -8,7 +8,9 @@
 #include "nvcore/Utils.h"
 #include "nvcore/StrLib.h"
 
-#if NV_USE_TELEMETRY
+#if NV_USE_TELEMETRY3
+#include <rad_tm.h>
+#elif NV_USE_TELEMETRY
 #include <telemetry.h>
 extern HTELEMETRY tmContext;
 #endif
@@ -84,7 +86,9 @@ AutoPtr<ThreadPool> s_pool;
         }
         
         {
-#if NV_USE_TELEMETRY
+#if NV_USE_TELEMETRY3
+            tmZone(0, TMZF_NONE, "worker");
+#elif NV_USE_TELEMETRY
             tmZoneFiltered(tmContext, 20, TMZF_NONE, "worker");
 #endif
             func(s_pool->arg, s_pool->useCallingThread + i);
@@ -116,11 +120,11 @@ ThreadPool::ThreadPool(uint workerCount/*=processorCount()*/, bool useThreadAffi
         lockThreadToProcessor(0);   // Calling thread always locked to processor 0.
     }
 
+    StringBuilder name;
     for (uint i = 0; i < threadCount; i++) {
-        StringBuilder name;
         name.format("worker %d", i);
         workers[i].setName(name.release());     // @Leak
-        workers[i].start(workerFunc, (void *)i);
+        workers[i].start(workerFunc, (void *)(uintptr_t)i);
     }
 
     allIdle = true;
@@ -141,9 +145,6 @@ ThreadPool::~ThreadPool()
 
 void ThreadPool::run(ThreadTask * func, void * arg)
 {
-    // Wait until threads are idle.
-    wait();
-
     start(func, arg);
 
     if (useCallingThread) {
diff --git a/src/nvthread/nvthread.cpp b/src/nvthread/nvthread.cpp
index 38b5a86..b727f2e 100644
--- a/src/nvthread/nvthread.cpp
+++ b/src/nvthread/nvthread.cpp
@@ -85,7 +85,9 @@ uint nv::processorCount() {
 
     return count;
 #elif NV_OS_ORBIS
-    return 6;
+	return 6;
+#elif NV_OS_DURANGO
+	return 6;
 #elif NV_OS_XBOX
     return 3; // or 6?
 #elif NV_OS_LINUX || NV_OS_NETBSD // Linux, Solaris, & AIX
diff --git a/src/nvtt/BlockCompressor.cpp b/src/nvtt/BlockCompressor.cpp
index 89e15fb..5b703ac 100644
--- a/src/nvtt/BlockCompressor.cpp
+++ b/src/nvtt/BlockCompressor.cpp
@@ -25,6 +25,7 @@
 #include "BlockCompressor.h"
 #include "OutputOptions.h"
 #include "TaskDispatcher.h"
+#include "CompressionOptions.h"
 
 #include "nvimage/Image.h"
 #include "nvimage/ColorBlock.h"
@@ -33,6 +34,7 @@
 #include "nvmath/Vector.inl"
 
 #include "nvcore/Memory.h"
+#include "nvcore/Array.inl"
 
 #include <new> // placement new
 
@@ -40,85 +42,13 @@
 using namespace nv;
 using namespace nvtt;
 
-/*
-// OpenMP
-#if defined(HAVE_OPENMP)
-#include <omp.h>
-#endif
-
-void ColorBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, const float * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-    const uint bs = blockSize();
-    const uint bw = (w + 3) / 4;
-    const uint bh = (h + 3) / 4;
-
-#if defined(HAVE_OPENMP)
-    bool singleThreaded = false;
-#else
-    bool singleThreaded = true;
-#endif
-
-    // Use a single thread to compress small textures.
-    if (bw * bh < 16) singleThreaded = true;
-
-    if (singleThreaded)
-    {
-        nvDebugCheck(bs <= 16);
-        uint8 mem[16]; // @@ Output one row at a time!
-
-        for (int y = 0; y < int(h); y += 4) {
-            for (uint x = 0; x < w; x += 4) {
-
-                ColorBlock rgba;
-                rgba.init(w, h, data, x, y);
-
-                compressBlock(rgba, alphaMode, compressionOptions, mem);
-
-                if (outputOptions.outputHandler != NULL) {
-                    outputOptions.outputHandler->writeData(mem, bs);
-                }
-            }
-        }
-    }
-#if defined(HAVE_OPENMP)
-    else
-    {
-        const uint size = bs * bw * bh;
-        uint8 * mem = new uint8[size];
-
-        #pragma omp parallel
-        {
-            #pragma omp for
-            for (int i = 0; i < int(bw*bh); i++)
-            {
-                const uint x = i % bw;
-                const uint y = i / bw;
-
-		ColorBlock rgba;
-		rgba.init(w, h, data, 4*x, 4*y);
-
-		uint8 * ptr = mem + (y * bw + x) * bs;
-		compressBlock(rgba, alphaMode, compressionOptions, ptr);
-	    } // omp for
-	} // omp parallel
-
-	if (outputOptions.outputHandler != NULL) {
-	    outputOptions.outputHandler->writeData(mem, size);
-	}
-
-        delete [] mem;
-    }
-#endif
-}
-*/
-
 
 struct CompressorContext
 {
-    nvtt::AlphaMode alphaMode;
+    AlphaMode alphaMode;
     uint w, h, d;
     const float * data;
-    const nvtt::CompressionOptions::Private * compressionOptions;
+    const CompressionOptions::Private * compressionOptions;
 
     uint bw, bh, bs;
     uint8 * mem;
@@ -144,7 +74,7 @@ void ColorBlockCompressorTask(void * data, int i)
     }
 }
 
-void ColorBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+void ColorBlockCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d, const float * data, TaskDispatcher * dispatcher, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
 {
     nvDebugCheck(d == 1);
 
@@ -182,66 +112,6 @@ void ColorBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, u
     delete [] context.mem;
 }
 
-
-#if 0
-// Each task compresses one block.
-void ColorSetCompressorTask(void * data, int i)
-{
-    CompressorContext * d = (CompressorContext *) data;
-
-    uint x = i % d->bw;
-    uint y = i / d->bw;
-
-    //for (uint x = 0; x < d->bw; x++)
-    {
-        ColorSet set;
-        set.setColors(d->data, d->w, d->h, x * 4, y * 4);
-
-        uint8 * ptr = d->mem + (y * d->bw + x) * d->bs;
-        ((ColorSetCompressor *)d->compressor)->compressBlock(set, d->alphaMode, *d->compressionOptions, ptr);
-    }
-}
-
-
-void ColorSetCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
-{
-    nvDebugCheck(d == 1);
-
-    CompressorContext context;
-    context.alphaMode = alphaMode;
-    context.w = w;
-    context.h = h;
-    context.data = data;
-    context.compressionOptions = &compressionOptions;
-
-    context.bs = blockSize();
-    context.bw = (w + 3) / 4;
-    context.bh = (h + 3) / 4;
-
-    context.compressor = this;
-
-    SequentialTaskDispatcher sequential;
-
-    // Use a single thread to compress small textures.
-    if (context.bh < 4) dispatcher = &sequential;
-
-#if _DEBUG
-    dispatcher = &sequential;
-#endif
-
-    const uint count = context.bw * context.bh;
-    const uint size = context.bs * count;
-    context.mem = new uint8[size];
-
-    dispatcher->dispatch(ColorSetCompressorTask, &context, count);
-
-    outputOptions.writeData(context.mem, size);
-
-    delete [] context.mem;
-}
-#endif // 0
-
-
 // Each task compresses one block.
 void FloatColorCompressorTask(void * data, int i)
 {
@@ -262,8 +132,8 @@ void FloatColorCompressorTask(void * data, int i)
     Vector4 colors[16];
     float weights[16];
 
-    const uint block_w = min(d->w - block_x * 4U, 4U);
-    const uint block_h = min(d->h - block_y * 4U, 4U);
+    const uint block_w = min(d->w - block_x * 4, 4U);
+    const uint block_h = min(d->h - block_y * 4, 4U);
 
     uint x, y;
     for (y = 0; y < block_h; y++) {
@@ -274,7 +144,7 @@ void FloatColorCompressorTask(void * data, int i)
             colors[dst_idx].y = g[src_idx];
             colors[dst_idx].z = b[src_idx];
             colors[dst_idx].w = a[src_idx];
-            weights[dst_idx] = (d->alphaMode == nvtt::AlphaMode_Transparency) ? a[src_idx] : 1.0f;
+            weights[dst_idx] = (d->alphaMode == AlphaMode_Transparency) ? saturate(a[src_idx]) : 1.0f;
         }
         for (; x < 4; x++) {
             uint dst_idx = 4 * y + x;
@@ -289,14 +159,14 @@ void FloatColorCompressorTask(void * data, int i)
             weights[dst_idx] = 0.0f;
         }
     }
-
+    
     // Compress block.
     uint8 * output = d->mem + (block_y * d->bw + block_x) * d->bs;
     ((FloatColorCompressor *)d->compressor)->compressBlock(colors, weights, *d->compressionOptions, output);
 }
 
 
-void FloatColorCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+void FloatColorCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d, const float * data, TaskDispatcher * dispatcher, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
 {
     nvDebugCheck(d == 1);   // @@ Add support for compressed 3D textures.
 
@@ -308,7 +178,7 @@ void FloatColorCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d,
     context.data = data;
     context.compressionOptions = &compressionOptions;
 
-    context.bs = blockSize();
+    context.bs = blockSize(compressionOptions);
     context.bw = (w + 3) / 4;
     context.bh = (h + 3) / 4;
 
@@ -333,3 +203,466 @@ void FloatColorCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d,
 
     delete [] context.mem;
 }
+
+
+// BC1
+#include "CompressorDXT1.h"
+
+void FastCompressorDXT1::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
+{   // Forwards one block of 16 colors/weights to compress_dxt1_fast; `output` is treated as a BlockDXT1.
+    compress_dxt1_fast(colors, weights, compressionOptions.colorWeight.xyz(), (BlockDXT1 *)output);   // Only the xyz part of colorWeight is passed on.
+}
+void CompressorDXT1::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
+{   // Forwards one block of 16 colors/weights to compress_dxt1 with three-color mode enabled; `output` is treated as a BlockDXT1.
+    compress_dxt1(colors, weights, compressionOptions.colorWeight.xyz(), /*three_color_mode*/true, (BlockDXT1 *)output);
+}
+
+
+// @@ BC1a
+
+// @@ BC2
+
+// @@ BC3
+
+
+// BC3_RGBM
+#include "CompressorDXT5_RGBM.h"
+
+void CompressorBC3_RGBM::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
+{   // Forwards one block to compress_dxt5_rgbm using the configured rgbmThreshold; `output` is treated as a BlockDXT5.
+    compress_dxt5_rgbm(colors, weights, compressionOptions.rgbmThreshold, (BlockDXT5 *)output);
+}
+
+
+// ETC
+#include "CompressorETC.h"
+
+void CompressorETC1::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
+{   // Forwards one block of 16 colors/weights to compress_etc1, passing the xyz part of colorWeight.
+    compress_etc1(colors, weights, compressionOptions.colorWeight.xyz(), output);
+}
+void CompressorETC2_R::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
+{   // Forwards one block to compress_eac in 11-bit mode. NOTE(review): input_channel is 1 here - confirm that is the intended source channel.
+    // @@ Change radius based on quality. (search_radius is fixed at 1; compressionOptions is not consulted.)
+    compress_eac(colors, weights, /*input_channel=*/1, /*search_radius=*/1, /*use_11bit_mode=*/true, output);
+}
+void CompressorETC2_RG::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
+{   // NOTE(review): not implemented - the only call is commented out, so `output` is never written by this function.
+    //compress_eac_rg(colors, weights, 1, 2, output);
+}
+void CompressorETC2_RGB::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
+{   // Forwards one block of 16 colors/weights to compress_etc2, passing the xyz part of colorWeight.
+    // @@ Tweak quality options. (compressionOptions.quality is not consulted here.)
+    compress_etc2(colors, weights, compressionOptions.colorWeight.xyz(), output);
+}
+void CompressorETC2_RGBA::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
+{   // Forwards one block of 16 colors/weights to compress_etc2_eac, passing the xyz part of colorWeight.
+    // @@ Tweak quality options.
+    // @@ Change radius based on quality. (No radius or quality argument is passed at the moment.)
+    compress_etc2_eac(colors, weights, compressionOptions.colorWeight.xyz(), output);
+}
+/*void CompressorETC2_RG::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
+{
+    // @@ Change radius based on quality.
+    compress_eac_rg(colors, weights, compressionOptions.colorWeight.xyz(), output);
+}*/
+void CompressorETC2_RGBM::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
+{   // Forwards one block of 16 colors/weights to compress_etc2_rgbm using the configured rgbmThreshold.
+    compress_etc2_rgbm(colors, weights, compressionOptions.rgbmThreshold, output);
+}
+
+
+
+// External compressors.
+
+#if defined(HAVE_ATITC)
+
+typedef int BOOL;
+typedef _W64 unsigned long ULONG_PTR;
+typedef ULONG_PTR DWORD_PTR;
+#include "atitc/ATI_Compress.h"
+
+void AtiCompressorDXT1::compress(InputFormat inputFormat, AlphaMode alphaMode, uint w, uint h, uint d, void * data, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+{
+    nvDebugCheck(d == 1);   // Only a single 2D image is handled; alphaMode and compressionOptions are not used below.
+
+    // Init source texture: describes the caller's pixels; `data` is referenced, not copied.
+    ATI_TC_Texture srcTexture;
+    srcTexture.dwSize = sizeof(srcTexture);
+    srcTexture.dwWidth = w;
+    srcTexture.dwHeight = h;
+    if (inputFormat == InputFormat_BGRA_8UB)
+    {
+        srcTexture.dwPitch = w * 4;
+        srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
+    }
+    else
+    {
+        // @@ Floating point input is not swizzled.
+        srcTexture.dwPitch = w * 16;
+        srcTexture.format = ATI_TC_FORMAT_ARGB_32F;
+    }
+    srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
+    srcTexture.pData = (ATI_TC_BYTE*) data;
+
+    // Init dest texture: a DXT1 image of the same dimensions in a temporary buffer.
+    ATI_TC_Texture destTexture;
+    destTexture.dwSize = sizeof(destTexture);
+    destTexture.dwWidth = w;
+    destTexture.dwHeight = h;
+    destTexture.dwPitch = 0;
+    destTexture.format = ATI_TC_FORMAT_DXT1;
+    destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
+    destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);   // Freed at the end of this function; NOTE(review): the result is not checked for NULL.
+
+    ATI_TC_CompressOptions options;
+    options.dwSize = sizeof(options);
+    options.bUseChannelWeighting = false;
+    options.bUseAdaptiveWeighting = false;
+    options.bDXT1UseAlpha = false;
+    options.nCompressionSpeed = ATI_TC_Speed_Normal;
+    options.bDisableMultiThreading = false;
+    //options.bDisableMultiThreading = true;
+
+    // Compress. NOTE(review): the return value is ignored, so the buffer is written out regardless of the outcome - confirm that is intended.
+    ATI_TC_ConvertTexture(&srcTexture, &destTexture, &options, NULL, NULL, NULL);
+
+    if (outputOptions.outputHandler != NULL) {
+            outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
+    }
+
+    mem::free(destTexture.pData);
+}
+
+void AtiCompressorDXT5::compress(InputFormat inputFormat, AlphaMode alphaMode, uint w, uint h, uint d, void * data, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+{
+    nvDebugCheck(d == 1);   // Only a single 2D image is handled; alphaMode and compressionOptions are not used below.
+
+    // Init source texture: describes the caller's pixels; `data` is referenced, not copied.
+    ATI_TC_Texture srcTexture;
+    srcTexture.dwSize = sizeof(srcTexture);
+    srcTexture.dwWidth = w;
+    srcTexture.dwHeight = h;
+    if (inputFormat == InputFormat_BGRA_8UB)
+    {
+        srcTexture.dwPitch = w * 4;
+        srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
+    }
+    else
+    {
+        srcTexture.dwPitch = w * 16;
+        srcTexture.format = ATI_TC_FORMAT_ARGB_32F;
+    }
+    srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
+    srcTexture.pData = (ATI_TC_BYTE*) data;
+
+    // Init dest texture: a DXT5 image of the same dimensions in a temporary buffer.
+    ATI_TC_Texture destTexture;
+    destTexture.dwSize = sizeof(destTexture);
+    destTexture.dwWidth = w;
+    destTexture.dwHeight = h;
+    destTexture.dwPitch = 0;
+    destTexture.format = ATI_TC_FORMAT_DXT5;
+    destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
+    destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);   // Freed at the end of this function; NOTE(review): the result is not checked for NULL.
+
+    // Compress with no explicit options (NULL). NOTE(review): the return value is ignored - confirm that is intended.
+    ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL);
+
+    if (outputOptions.outputHandler != NULL) {
+        outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
+    }
+
+    mem::free(destTexture.pData);
+}
+
+#endif // defined(HAVE_ATITC)
+
+#if defined(HAVE_SQUISH)
+
+//#include "squish/squish.h"
+#include "squish-1.10/squish.h"
+
+void SquishCompressorDXT1::compress(InputFormat inputFormat, AlphaMode alphaMode, uint w, uint h, uint d, void * data, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+{
+    nvDebugCheck(d == 1);
+    nvDebugCheck(false);   // Stub: the implementation below is commented out, so nothing is compressed or written.
+
+#pragma message(NV_FILE_LINE "TODO: Convert input to fixed point ABGR format instead of ARGB")
+    /*
+    Image img(*image);
+    int count = img.width() * img.height();
+    for (int i = 0; i < count; i++)
+    {
+            Color32 c = img.pixel(i);
+            img.pixel(i) = Color32(c.b, c.g, c.r, c.a);
+    }
+
+    int size = squish::GetStorageRequirements(img.width(), img.height(), squish::kDxt1);
+    void * blocks = mem::malloc(size);
+
+    squish::CompressImage((const squish::u8 *)img.pixels(), img.width(), img.height(), blocks, squish::kDxt1 | squish::kColourClusterFit);
+
+    if (outputOptions.outputHandler != NULL) {
+            outputOptions.outputHandler->writeData(blocks, size);
+    }
+
+    mem::free(blocks);
+    */
+}
+
+#endif // defined(HAVE_SQUISH)
+
+
+#if defined(HAVE_D3DX)
+
+void D3DXCompressorDXT1::compress(InputFormat inputFormat, AlphaMode alphaMode, uint w, uint h, uint d, void * data, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+{
+    // Compresses one 2D image to DXT1 through D3DX on the reference device and hands the
+    // blocks to the output handler. Every COM object acquired here is released on all paths.
+    nvDebugCheck(d == 1);
+
+    IDirect3D9 * d3d = Direct3DCreate9(D3D_SDK_VERSION);
+    if (d3d == NULL) return;
+
+    D3DPRESENT_PARAMETERS presentParams;
+    ZeroMemory(&presentParams, sizeof(presentParams));
+    presentParams.Windowed = TRUE;
+    presentParams.SwapEffect = D3DSWAPEFFECT_COPY;
+    presentParams.BackBufferWidth = 8;
+    presentParams.BackBufferHeight = 8;
+    presentParams.BackBufferFormat = D3DFMT_UNKNOWN;
+
+    IDirect3DDevice9 * device = NULL;
+    IDirect3DTexture9 * texture = NULL;
+    IDirect3DSurface9 * surface = NULL;
+
+    // Each step only runs if the previous one succeeded, so a NULL interface is never dereferenced.
+    HRESULT err = d3d->CreateDevice(D3DADAPTER_DEFAULT, D3DDEVTYPE_REF, GetDesktopWindow(), D3DCREATE_SOFTWARE_VERTEXPROCESSING, &presentParams, &device);
+    if (SUCCEEDED(err)) err = D3DXCreateTexture(device, w, h, 1, 0, D3DFMT_DXT1, D3DPOOL_SYSTEMMEM, &texture);
+    if (SUCCEEDED(err)) err = texture->GetSurfaceLevel(0, &surface);
+
+    if (SUCCEEDED(err))
+    {
+        RECT srcRect;
+        srcRect.left = 0;
+        srcRect.top = 0;
+        srcRect.bottom = h;
+        srcRect.right = w;
+
+        // Source pitch is w * 4 bytes for 8-bit BGRA input and w * 16 bytes for float input.
+        const bool isByteInput = (inputFormat == InputFormat_BGRA_8UB);
+        err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, isByteInput ? D3DFMT_A8R8G8B8 : D3DFMT_A32B32G32R32F, isByteInput ? w * 4 : w * 16, NULL, &srcRect, D3DX_DEFAULT, 0);
+    }
+
+    if (SUCCEEDED(err))
+    {
+        D3DLOCKED_RECT lockedRect;
+        ZeroMemory(&lockedRect, sizeof(lockedRect));
+        // Only read the locked bits if the lock itself succeeded.
+        if (SUCCEEDED(surface->LockRect(&lockedRect, NULL, D3DLOCK_READONLY)))
+        {
+            if (outputOptions.outputHandler != NULL) {
+                int size = lockedRect.Pitch * ((h + 3) / 4);
+                outputOptions.outputHandler->writeData(lockedRect.pBits, size);
+            }
+            surface->UnlockRect();
+        }
+    }
+
+    if (surface != NULL) surface->Release();
+    if (texture != NULL) texture->Release();    // The texture was never released before.
+    if (device != NULL) device->Release();
+    d3d->Release();
+}
+
+#endif // defined(HAVE_D3DX)
+
+
+#if defined(HAVE_STB)
+
+#define STB_DEFINE
+#include "stb/stb_dxt.h"
+
+void StbCompressorDXT1::compressBlock(ColorBlock & rgba, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
+{   // Compresses one color block with stb_compress_dxt_block. The input block is modified in place by the swizzle.
+    rgba.swizzle(2, 1, 0, 3); // Swap R and B
+    stb_compress_dxt_block((unsigned char *)output, (unsigned char *)rgba.colors(), 0, 0);   // alphaMode and compressionOptions are not used.
+}
+
+#endif // defined(HAVE_STB)
+
+
+#if defined(HAVE_ETCLIB)
+#include "Etc.h"
+
+void EtcLibCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d, const float * data, TaskDispatcher * dispatcher, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+{
+    //nvCheck(d == 1);  // Encode one layer at a time?
+
+    Etc::Image::Format format;   // Etc-side format selected from compressionOptions.format below.
+    if (compressionOptions.format == Format_ETC1) {
+        format = Etc::Image::Format::ETC1;
+    }
+    else if (compressionOptions.format == Format_ETC2_R) {
+        format = Etc::Image::Format::R11;
+    }
+    else if (compressionOptions.format == Format_ETC2_RG) {
+        format = Etc::Image::Format::RG11;
+    }
+    else if (compressionOptions.format == Format_ETC2_RGB) {
+        format = Etc::Image::Format::RGB8;
+        //format = Etc::Image::Format::SRGB8;
+    }
+    else if (compressionOptions.format == Format_ETC2_RGBA) {
+        format = Etc::Image::Format::RGBA8;
+        //format = Etc::Image::Format::SRGBA8;
+    }
+    else if (compressionOptions.format == Format_ETC2_RGB_A1) {
+        format = Etc::Image::Format::RGB8A1;
+        //format = Etc::Image::Format::SRGB8A1;
+    }
+    else {
+        nvCheck(false);   // Any other format is rejected and nothing is written.
+        return;
+    }
+
+    Etc::ErrorMetric error_metric = Etc::ErrorMetric::RGBA;
+
+    // @@ Use normal compression metric for normals?
+    //if (compressionOptions.)
+
+    // @@ Adjust based on quality.
+    int effort = ETCCOMP_DEFAULT_EFFORT_LEVEL;
+
+    // @@ What are the defaults?
+    uint jobs = 4;
+    uint max_jobs = 4;
+
+    uint8 * out_data = NULL;
+    uint out_size = 0;
+    uint out_width = 0;
+    uint out_height = 0;
+    int out_time = 0;
+
+    // Swizzle color data: four consecutive planes of `count` floats become interleaved 4-float texels.
+    nv::Array<float> tmp;
+    uint count = w * h;   // d is not taken into account here.
+    tmp.resize(4 * count);
+    for (uint i = 0; i < count; i++) {
+        tmp[4*i+0] = data[count*0 + i];
+        tmp[4*i+1] = data[count*1 + i];
+        tmp[4*i+2] = data[count*2 + i];
+        tmp[4*i+3] = data[count*3 + i];
+    }
+
+    Etc::Encode(tmp.buffer(), w, h, format, error_metric, effort, jobs, max_jobs, &out_data, &out_size, &out_width, &out_height, &out_time);   // The dispatcher argument is not used; Etc gets fixed job counts.
+
+    if (outputOptions.outputHandler != NULL) {
+        outputOptions.outputHandler->writeData(out_data, I32(out_size));
+    }
+}   // NOTE(review): out_data is never released in this function - confirm who owns the buffer returned by Etc::Encode.
+
+#endif
+
+#if defined(HAVE_RGETC)
+#include "rg_etc1.h"
+
+NV_AT_STARTUP(rg_etc1::pack_etc1_block_init());
+
+void RgEtcCompressor::compressBlock(ColorBlock & rgba, AlphaMode alphaMode, const CompressionOptions::Private & compressionOptions, void * output)
+{   // Packs one color block as ETC1 with rg_etc1. The input block is modified in place by the swizzle.
+    rg_etc1::etc1_pack_params pack_params;
+
+    pack_params.m_quality = rg_etc1::cMediumQuality;   // Default; Fastest maps to low, Production and Highest to high, Normal to medium.
+    if (compressionOptions.quality == Quality_Fastest) pack_params.m_quality = rg_etc1::cLowQuality;
+    else if (compressionOptions.quality == Quality_Production) pack_params.m_quality = rg_etc1::cHighQuality;
+    else if (compressionOptions.quality == Quality_Highest) pack_params.m_quality = rg_etc1::cHighQuality;
+    else if (compressionOptions.quality == Quality_Normal) pack_params.m_quality = rg_etc1::cMediumQuality;
+
+    rgba.swizzle(2, 1, 0, 3);   // Swap the first and third channels before packing.
+    rg_etc1::pack_etc1_block(output, (uint *)rgba.colors(), pack_params);
+
+    //Vector4 result[16];
+    //nv::decompress_etc(output, result);
+
+}
+
+#endif
+
+
+#if defined(HAVE_PVRTEXTOOL)
+
+#include <PVRTextureUtilities.h> // for CPVRTexture, CPVRTextureHeader, PixelType, Transcode
+
+#include "nvmath/Color.inl"
+
+void CompressorPVR::compress(AlphaMode alphaMode, uint w, uint h, uint d, const float * data, TaskDispatcher * dispatcher, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+{
+    EPVRTColourSpace color_space = ePVRTCSpacelRGB;
+    // The source texture is packed 8-bit RGB (no alpha channel), built below from the planar float input.
+    //pvrtexture::PixelType src_pixel_type = pvrtexture::PixelType('b','g','r','a',8,8,8,8);
+    pvrtexture::PixelType src_pixel_type = pvrtexture::PixelType('r','g','b',0,8,8,8,0);
+    pvrtexture::CPVRTextureHeader header(src_pixel_type.PixelTypeID, w, h, d, 1/*num mips*/, 1/*num array*/, 1/*num faces*/, color_space, ePVRTVarTypeUnsignedByteNorm);
+
+    /*
+    uint count = w * h * d;
+    Array<Color32> tmp;
+    tmp.resize(count);
+
+    for (uint i = 0; i < count; i++) {
+        tmp[i] = toColor32(Vector4(data[0*count + i], data[1*count + i], data[2*count + i], data[3*count + i]));
+    }
+    */
+    // Repack the first three float planes as interleaved 8-bit RGB.
+    uint count = w * h * d;
+    Array<uint8> tmp;
+    tmp.resize(3 * count);
+    // Saturate before scaling: converting an out-of-range float to uint8 is undefined behavior.
+    for (uint i = 0; i < count; i++) {
+        tmp[3*i+0] = uint8(saturate(data[0*count + i]) * 255.0f);
+        tmp[3*i+1] = uint8(saturate(data[1*count + i]) * 255.0f);
+        tmp[3*i+2] = uint8(saturate(data[2*count + i]) * 255.0f);
+    }
+
+    pvrtexture::CPVRTexture texture(header, tmp.buffer());
+
+    pvrtexture::PixelType dst_pixel_type = pvrtexture::PixelType(ePVRTPF_PVRTCI_2bpp_RGB);   // Default when the format matches none of the cases below.
+
+    if (compressionOptions.format == Format_PVR_2BPP_RGB) dst_pixel_type = pvrtexture::PixelType(ePVRTPF_PVRTCI_2bpp_RGB);
+    else if (compressionOptions.format == Format_PVR_4BPP_RGB) dst_pixel_type = pvrtexture::PixelType(ePVRTPF_PVRTCI_4bpp_RGB);
+    else if (compressionOptions.format == Format_PVR_2BPP_RGBA) dst_pixel_type = pvrtexture::PixelType(ePVRTPF_PVRTCI_2bpp_RGBA);
+    else if (compressionOptions.format == Format_PVR_4BPP_RGBA) dst_pixel_type = pvrtexture::PixelType(ePVRTPF_PVRTCI_4bpp_RGBA);
+
+    bool success = pvrtexture::Transcode(texture, dst_pixel_type, ePVRTVarTypeUnsignedByteNorm, color_space, pvrtexture::ePVRTCNormal, false);
+    // Nothing is written if transcoding fails.
+    if (success) {
+        uint size = 0;
+        if (compressionOptions.format == Format_PVR_2BPP_RGB || compressionOptions.format == Format_PVR_2BPP_RGBA) {
+            // 2 bpp: 8x4 texel blocks, at least 2 blocks in each direction.
+            const uint bpp = 2u;
+            const uint block_size = 8u * 4u;
+            const uint size_factor = (block_size*bpp) >> 3u;
+            const uint block_width = nv::max((w>>3u), 2u);
+            const uint block_height = nv::max((h>>2u), 2u);
+            size = d * block_width * block_height * size_factor;
+        }
+        else {
+            // 4 bpp: 4x4 texel blocks, at least 2 blocks in each direction.
+            const uint bpp = 4u;
+            const uint block_size = 4u * 4u;
+            const uint size_factor = (block_size*bpp) >> 3u;
+            const uint block_width = nv::max((w>>2u), 2u);
+            const uint block_height = nv::max((h>>2u), 2u);
+            size = d * block_width * block_height * size_factor;
+        }
+
+        if (outputOptions.outputHandler != NULL) {
+            outputOptions.outputHandler->writeData(texture.getDataPtr(), I32(size));
+        }
+    }
+}
+
+#endif
+
+
diff --git a/src/nvtt/BlockCompressor.h b/src/nvtt/BlockCompressor.h
index 7514bde..63a9b7c 100644
--- a/src/nvtt/BlockCompressor.h
+++ b/src/nvtt/BlockCompressor.h
@@ -27,7 +27,6 @@
 
 #include "Compressor.h"
 
-
 namespace nv
 {
     struct ColorBlock;
@@ -45,9 +44,149 @@ namespace nv
     {
         virtual void compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * rgba, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 
-        virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
-        virtual uint blockSize() const = 0;
+        virtual void compressBlock(Vector4 colors[16], float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+        virtual uint blockSize(const nvtt::CompressionOptions::Private & compressionOptions) const = 0;
+    };
+
+
+    // BC1
+    struct FastCompressorDXT1 : public FloatColorCompressor
+    {   // Per-block compressor whose compressBlock is defined in BlockCompressor.cpp.
+        virtual void compressBlock(Vector4 colors[16], float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize(const nvtt::CompressionOptions::Private &) const { return 8; }   // Constant, independent of the options.
+    };
+    struct CompressorDXT1 : public FloatColorCompressor
+    {   // Per-block compressor whose compressBlock is defined in BlockCompressor.cpp.
+        virtual void compressBlock(Vector4 colors[16], float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize(const nvtt::CompressionOptions::Private &) const { return 8; }   // Constant, independent of the options.
+    };
+
+    // BC3
+    struct CompressorBC3_RGBM : public FloatColorCompressor
+    {   // Per-block compressor whose compressBlock is defined in BlockCompressor.cpp.
+        virtual void compressBlock(Vector4 colors[16], float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize(const nvtt::CompressionOptions::Private &) const { return 16; }   // Constant, independent of the options.
+    };
+
+
+    // ETC
+    struct CompressorETC1 : public FloatColorCompressor
+    {   // Per-block compressor whose compressBlock is defined in BlockCompressor.cpp.
+        virtual void compressBlock(Vector4 colors[16], float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize(const nvtt::CompressionOptions::Private &) const { return 8; }   // Constant, independent of the options.
+    };
+    struct CompressorETC2_R : public FloatColorCompressor
+    {   // Per-block compressor whose compressBlock is defined in BlockCompressor.cpp.
+        virtual void compressBlock(Vector4 colors[16], float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize(const nvtt::CompressionOptions::Private & ) const { return 8; }   // Constant, independent of the options.
+    };
+    struct CompressorETC2_RG : public FloatColorCompressor
+    {   // NOTE(review): compressBlock in BlockCompressor.cpp currently has an empty body (its only call is commented out).
+        virtual void compressBlock(Vector4 colors[16], float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize(const nvtt::CompressionOptions::Private & ) const { return 16; }   // Constant, independent of the options.
+    };
+    struct CompressorETC2_RGB : public FloatColorCompressor
+    {   // Per-block compressor whose compressBlock is defined in BlockCompressor.cpp.
+        virtual void compressBlock(Vector4 colors[16], float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize(const nvtt::CompressionOptions::Private & ) const { return 8; }   // Constant, independent of the options.
+    };
+    struct CompressorETC2_RGBA : public FloatColorCompressor
+    {   // Per-block compressor whose compressBlock is defined in BlockCompressor.cpp.
+        virtual void compressBlock(Vector4 colors[16], float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize(const nvtt::CompressionOptions::Private & ) const { return 16; }   // Constant, independent of the options.
+    };
+    struct CompressorETC2_RGBM : public FloatColorCompressor
+    {   // Per-block compressor whose compressBlock is defined in BlockCompressor.cpp.
+        virtual void compressBlock(Vector4 colors[16], float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize(const nvtt::CompressionOptions::Private &) const { return 16; }   // Constant, independent of the options.
+    };
+    
+    
+    // External compressors.
+#if defined(HAVE_ATITC)
+    struct AtiCompressorDXT1 : public CompressorInterface
+    {   // Whole-image compressor, only declared when HAVE_ATITC is defined. NOTE(review): this parameter list differs from FloatColorCompressor::compress - confirm it matches CompressorInterface.
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
+
+    struct AtiCompressorDXT5 : public CompressorInterface
+    {   // Whole-image compressor, only declared when HAVE_ATITC is defined. NOTE(review): this parameter list differs from FloatColorCompressor::compress - confirm it matches CompressorInterface.
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
+#endif
+
+#if defined(HAVE_SQUISH)
+    struct SquishCompressorDXT1 : public CompressorInterface
+    {   // Only declared when HAVE_SQUISH is defined. NOTE(review): the definition in BlockCompressor.cpp is a stub whose body is commented out.
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
+#endif
+
+#if defined(HAVE_D3DX)
+    struct D3DXCompressorDXT1 : public CompressorInterface
+    {   // Whole-image compressor, only declared when HAVE_D3DX is defined. NOTE(review): this parameter list differs from FloatColorCompressor::compress - confirm it matches CompressorInterface.
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
+#endif
+
+#if defined(HAVE_STB)
+    struct StbCompressorDXT1 : public ColorBlockCompressor
+    {   // Per-block compressor working on a ColorBlock; only declared when HAVE_STB is defined.
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 8; }   // Constant block size.
+    };
+#endif
+
+#if NV_USE_CRUNCH
+    struct CrunchCompressorETC1 : public CompressorInterface
+    {   // Whole-image compressor, only declared when NV_USE_CRUNCH is set.
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
+#endif
+
+#if NV_USE_INTEL_ISPC_TC
+    struct IspcCompressorBC1 : public CompressorInterface
+    {   // Whole-image compressor, only declared when NV_USE_INTEL_ISPC_TC is set.
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
+
+    struct IspcCompressorBC3 : public CompressorInterface
+    {   // Whole-image compressor, only declared when NV_USE_INTEL_ISPC_TC is set.
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
+
+    struct IspcCompressorBC7 : public CompressorInterface
+    {   // Whole-image compressor, only declared when NV_USE_INTEL_ISPC_TC is set.
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
+
+    struct IspcCompressorETC1 : public CompressorInterface
+    {   // Whole-image compressor, only declared when NV_USE_INTEL_ISPC_TC is set.
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
+#endif
+
+#if defined(HAVE_ETCLIB)
+    struct EtcLibCompressor : public CompressorInterface
+    {   // Whole-image compressor taking planar float input; only declared when HAVE_ETCLIB is defined.
+        virtual void compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
+#endif
+
+#if defined(HAVE_RGETC)
+    struct RgEtcCompressor : public ColorBlockCompressor
+    {   // Per-block compressor working on a ColorBlock; only declared when HAVE_RGETC is defined.
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 8; }   // Constant block size.
+    };
+#endif
+
+#if defined(HAVE_PVRTEXTOOL)
+    struct CompressorPVR : public CompressorInterface
+    {   // Whole-image compressor taking planar float input; only declared when HAVE_PVRTEXTOOL is defined.
+        virtual void compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
     };
+#endif
 
 } // nv namespace
 
diff --git a/src/nvtt/CMakeLists.txt b/src/nvtt/CMakeLists.txt
index 9f1e394..445aa1e 100644
--- a/src/nvtt/CMakeLists.txt
+++ b/src/nvtt/CMakeLists.txt
@@ -13,6 +13,7 @@ SET(NVTT_SRCS
     CompressorDX11.h CompressorDX11.cpp
     CompressorDXT1.h CompressorDXT1.cpp
     CompressorDXT5_RGBM.h CompressorDXT5_RGBM.cpp
+    CompressorETC.h CompressorETC.cpp
     CompressorRGB.h CompressorRGB.cpp
     Context.h Context.cpp
     QuickCompressDXT.h QuickCompressDXT.cpp
@@ -38,6 +39,7 @@ IF (CUDA_FOUND)
 ENDIF (CUDA_FOUND)
 
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${NV_SOURCE_DIR}/extern/rg_etc1_v104)
 
 ADD_DEFINITIONS(-DNVTT_EXPORTS)
 
@@ -47,7 +49,7 @@ ELSE(NVTT_SHARED)
     ADD_LIBRARY(nvtt ${NVTT_SRCS})
 ENDIF(NVTT_SHARED)
 
-TARGET_LINK_LIBRARIES(nvtt ${LIBS} nvcore nvimage nvthread squish bc6h bc7 nvmath)
+TARGET_LINK_LIBRARIES(nvtt ${LIBS} nvcore nvimage nvthread squish bc6h bc7 nvmath rg_etc1)
 
 INSTALL(TARGETS nvtt 
     RUNTIME DESTINATION bin
diff --git a/src/nvtt/ClusterFit.cpp b/src/nvtt/ClusterFit.cpp
index b3b2f1a..39bbbc4 100644
--- a/src/nvtt/ClusterFit.cpp
+++ b/src/nvtt/ClusterFit.cpp
@@ -38,79 +38,6 @@ ClusterFit::ClusterFit()
 {
 }
 
-#if 0 // @@ Deprecate. Do not use color set directly.
-void ClusterFit::setColorSet(const ColorSet * set) 
-{
-    // initialise the best error
-#if NVTT_USE_SIMD
-    m_besterror = SimdVector( FLT_MAX );
-    Vector3 metric = m_metric.toVector3();
-#else
-    m_besterror = FLT_MAX;
-    Vector3 metric = m_metric;
-#endif
-
-    // cache some values
-    m_count = set->colorCount;
-
-    Vector3 values[16];
-    for (uint i = 0; i < m_count; i++)
-    {
-        values[i] = set->colors[i].xyz();
-    }
-
-    Vector3 principal = Fit::computePrincipalComponent_PowerMethod(m_count, values, set->weights, metric);
-    //Vector3 principal = Fit::computePrincipalComponent_EigenSolver(m_count, values, set->weights, metric);
-
-    // build the list of values
-    int order[16];
-    float dps[16];
-    for (uint i = 0; i < m_count; ++i)
-    {
-        dps[i] = dot(values[i], principal);
-        order[i] = i;
-    }
-
-    // stable sort
-    for (uint i = 0; i < m_count; ++i)
-    {
-        for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j)
-        {
-            swap(dps[j], dps[j - 1]);
-            swap(order[j], order[j - 1]);
-        }
-    }
-
-    // weight all the points
-#if NVTT_USE_SIMD
-    m_xxsum = SimdVector( 0.0f );
-    m_xsum = SimdVector( 0.0f );
-#else
-    m_xxsum = Vector3(0.0f);
-    m_xsum = Vector3(0.0f);
-    m_wsum = 0.0f;
-#endif
-	
-    for (uint i = 0; i < m_count; ++i)
-    {
-        int p = order[i];
-#if NVTT_USE_SIMD
-        NV_ALIGN_16 Vector4 tmp(values[p], 1);
-        m_weighted[i] = SimdVector(tmp.component) * SimdVector(set->weights[p]);
-        m_xxsum += m_weighted[i] * m_weighted[i];
-        m_xsum += m_weighted[i];
-#else
-        m_weighted[i] = values[p] * set->weights[p];
-        m_xxsum += m_weighted[i] * m_weighted[i];
-        m_xsum += m_weighted[i];
-        m_weights[i] = set->weights[p];
-        m_wsum += m_weights[i];
-#endif
-    }
-}
-#endif // 0
-
-
 void ClusterFit::setColorSet(const Vector3 * colors, const float * weights, int count)
 {
     // initialise the best error
@@ -412,13 +339,13 @@ bool ClusterFit::compress4( Vector3 * start, Vector3 * end )
 #else
 
 inline Vector3 round565(const Vector3 & v) {
-	uint r = ftoi_trunc(v.x * 31.0f);
+    uint r = ftoi_trunc(v.x * 31.0f);
     float r0 = float(((r+0) << 3) | ((r+0) >> 2));
     float r1 = float(((r+1) << 3) | ((r+1) >> 2));
     if (fabs(v.x - r1) < fabs(v.x - r0)) r = min(r+1, 31U);
-	r = (r << 3) | (r >> 2);
+    r = (r << 3) | (r >> 2);
 
-	uint g = ftoi_trunc(v.y * 63.0f);
+    uint g = ftoi_trunc(v.y * 63.0f);
     float g0 = float(((g+0) << 2) | ((g+0) >> 4));
     float g1 = float(((g+1) << 2) | ((g+1) >> 4));
     if (fabs(v.y - g1) < fabs(v.y - g0)) g = min(g+1, 63U);
@@ -428,8 +355,8 @@ inline Vector3 round565(const Vector3 & v) {
     float b0 = float(((b+0) << 3) | ((b+0) >> 2));
     float b1 = float(((b+1) << 3) | ((b+1) >> 2));
     if (fabs(v.z - b1) < fabs(v.z - b0)) b = min(b+1, 31U);
-	
-	b = (b << 3) | (b >> 2);
+
+    b = (b << 3) | (b >> 2);
 
     return Vector3(float(r)/255, float(g)/255, float(b)/255);
 }
diff --git a/src/nvtt/CompressionOptions.cpp b/src/nvtt/CompressionOptions.cpp
index a899a67..502d1e3 100644
--- a/src/nvtt/CompressionOptions.cpp
+++ b/src/nvtt/CompressionOptions.cpp
@@ -50,7 +50,8 @@ void CompressionOptions::reset()
     m.format = Format_DXT1;
     m.quality = Quality_Normal;
     m.colorWeight.set(1.0f, 1.0f, 1.0f, 1.0f);
-
+    m.rgbmThreshold = 0.15f;
+    
     m.bitcount = 32;
     m.bmask = 0x000000FF;
     m.gmask = 0x0000FF00;
@@ -102,6 +103,11 @@ void CompressionOptions::setColorWeights(float red, float green, float blue, flo
     m.colorWeight.set(red, green, blue, alpha);
 }
 
+void CompressionOptions::setRGBMThreshold(float min_m) // Overrides the RGBM threshold that reset() initialises to 0.15f.
+{
+    m.rgbmThreshold = min_m; // Stored as given; no range validation is done here.
+}
+
 
 /// Set color mask to describe the RGB/RGBA format.
 void CompressionOptions::setPixelFormat(uint bitCount, uint rmask, uint gmask, uint bmask, uint amask)
@@ -162,7 +168,7 @@ void CompressionOptions::setPixelType(PixelType pixelType)
 /// Set pitch alignment in bytes.
 void CompressionOptions::setPitchAlignment(int pitchAlignment)
 {
-    nvDebugCheck(pitchAlignment > 0 && isPowerOfTwo(pitchAlignment));
+    nvDebugCheck(pitchAlignment > 0 && isPowerOfTwo(U32(pitchAlignment)));
     m.pitchAlignment = pitchAlignment;
 }
 
@@ -194,6 +200,10 @@ void CompressionOptions::setTargetDecoder(Decoder decoder)
 }
 
 
+Format CompressionOptions::format() const // Read-only accessor for the selected format.
+{
+    return m.format; // reset() initialises this to Format_DXT1.
+}
 
 // Translate to and from D3D formats.
 unsigned int CompressionOptions::d3d9Format() const
@@ -246,10 +256,20 @@ unsigned int CompressionOptions::d3d9Format() const
             FOURCC_ATI2,    // Format_BC5
             FOURCC_DXT1,    // Format_DXT1n
 		    0,              // Format_CTX1
-            MAKEFOURCC('B', 'C', '6', 'H'),     // Format_BC6
-            MAKEFOURCC('B', 'C', '7', 'L'),     // Format_BC7
-            //FOURCC_ATI2,    // Format_BC5_Luma
-            FOURCC_DXT5,    // Format_BC3_RGBM
+            FOURCC_BC6H,     // Format_BC6
+            FOURCC_BC7L,     // Format_BC7
+            FOURCC_DXT5,                        // Format_BC3_RGBM
+            NV_MAKEFOURCC('E', 'T', 'C', '1'),  // Format_ETC1
+            0,                                  // Format_ETC2_R
+            0,                                  // Format_ETC2_RG
+            NV_MAKEFOURCC('E', 'T', 'C', '2'),  // Format_ETC2_RGB
+            0,                                  // Format_ETC2_RGBA
+            0,                                  // Format_ETC2_RGB_A1
+            0,                                  // Format_ETC2_RGBM
+            FOURCC_PVR0,                        // Format_PVR_2BPP_RGB
+            FOURCC_PVR1,                        // Format_PVR_4BPP_RGB
+            FOURCC_PVR2,                        // Format_PVR_2BPP_RGBA
+            FOURCC_PVR3,                        // Format_PVR_4BPP_RGBA
         };
 
         NV_COMPILER_CHECK(NV_ARRAY_SIZE(d3d9_formats) == Format_Count);
@@ -258,12 +278,80 @@ unsigned int CompressionOptions::d3d9Format() const
     }
 }
 
-/*
-bool CompressionOptions::setDirect3D9Format(unsigned int format)
+unsigned int CompressionOptions::dxgiFormat() const // Returns the DXGI format code for the current options, or 0 when there is none. @@ Add srgb flag.
 {
+    if (m.format == Format_RGB) {
+        if (m.pixelType == PixelType_UnsignedNorm) {
+            // Start from the explicitly configured bit count and channel masks.
+            uint bitcount = m.bitcount;
+            uint rmask = m.rmask;
+            uint gmask = m.gmask;
+            uint bmask = m.bmask;
+            uint amask = m.amask;
+            // A zero bit count means only channel sizes were given: build masks with R in the top bits and A in the lowest.
+            if (bitcount == 0) {
+                bitcount = m.rsize + m.gsize + m.bsize + m.asize;
+                rmask = ((1U << m.rsize) - 1) << (m.asize + m.bsize + m.gsize); // 1U: avoid signed overflow for wide channels.
+                gmask = ((1U << m.gsize) - 1) << (m.asize + m.bsize);
+                bmask = ((1U << m.bsize) - 1) << m.asize;
+                amask = ((1U << m.asize) - 1) << 0;
+            }
+            // Layouts of up to 32 bits are looked up by mask; wider ones are matched by channel size.
+            if (bitcount <= 32) {
+                return nv::findDXGIFormat(bitcount, rmask, gmask, bmask, amask);
+            }
+            else {
+                if (m.rsize == 16 && m.gsize == 16 && m.bsize == 0 && m.asize == 0) return DXGI_FORMAT_R16G16_UNORM;
+                if (m.rsize == 16 && m.gsize == 16 && m.bsize == 16 && m.asize == 16) return DXGI_FORMAT_R16G16B16A16_UNORM;
+            }
+        }
+        else if (m.pixelType == PixelType_Float) {
+            if (m.rsize == 16 && m.gsize == 0 && m.bsize == 0 && m.asize == 0) return DXGI_FORMAT_R16_FLOAT;
+            if (m.rsize == 32 && m.gsize == 0 && m.bsize == 0 && m.asize == 0) return DXGI_FORMAT_R32_FLOAT;
+            if (m.rsize == 16 && m.gsize == 16 && m.bsize == 0 && m.asize == 0) return DXGI_FORMAT_R16G16_FLOAT;
+            if (m.rsize == 32 && m.gsize == 32 && m.bsize == 0 && m.asize == 0) return DXGI_FORMAT_R32G32_FLOAT;
+            if (m.rsize == 16 && m.gsize == 16 && m.bsize == 16 && m.asize == 16) return DXGI_FORMAT_R16G16B16A16_FLOAT;
+            if (m.rsize == 32 && m.gsize == 32 && m.bsize == 32 && m.asize == 32) return DXGI_FORMAT_R32G32B32A32_FLOAT;
+        }
+        // No DXGI equivalent was found for this uncompressed layout.
+        return 0;
+    }
+    else {
+        static const uint dxgi_formats[] = {
+            0,                          // Format_RGB,
+            DXGI_FORMAT_BC1_UNORM,      // Format_DXT1
+            DXGI_FORMAT_BC1_UNORM,      // Format_DXT1a
+            DXGI_FORMAT_BC2_UNORM,      // Format_DXT3
+            DXGI_FORMAT_BC3_UNORM,      // Format_DXT5
+            DXGI_FORMAT_BC3_UNORM,      // Format_DXT5n
+            DXGI_FORMAT_BC4_UNORM,      // Format_BC4
+            DXGI_FORMAT_BC5_UNORM,      // Format_BC5
+            DXGI_FORMAT_BC1_UNORM,      // Format_DXT1n
+            0,                          // Format_CTX1
+            DXGI_FORMAT_BC6H_UF16,      // Format_BC6
+            DXGI_FORMAT_BC7_UNORM,      // Format_BC7
+            DXGI_FORMAT_BC3_UNORM,      // Format_BC3_RGBM (DXT5 blocks, same as FOURCC_DXT5 in d3d9Format)
+            0,                          // Format_ETC1
+            0,                          // Format_ETC2_R
+            0,                          // Format_ETC2_RG
+            0,                          // Format_ETC2_RGB
+            0,                          // Format_ETC2_RGBA
+            0,                          // Format_ETC2_RGB_A1
+            0,                          // Format_ETC2_RGBM
+            0,                          // Format_PVR_2BPP_RGB
+            0,                          // Format_PVR_4BPP_RGB
+            0,                          // Format_PVR_2BPP_RGBA
+            0,                          // Format_PVR_4BPP_RGBA
+        };
+        // The table is indexed by m.format, so it needs exactly one entry per Format value.
+        NV_COMPILER_CHECK(NV_ARRAY_SIZE(dxgi_formats) == Format_Count);
+        
+        return dxgi_formats[m.format];
+    }
 }
 
-unsigned int CompressionOptions::dxgiFormat() const
+/*
+bool CompressionOptions::setDirect3D9Format(unsigned int format)
 {
 }
 
diff --git a/src/nvtt/CompressionOptions.h b/src/nvtt/CompressionOptions.h
index 7612f8f..98d801a 100644
--- a/src/nvtt/CompressionOptions.h
+++ b/src/nvtt/CompressionOptions.h
@@ -39,7 +39,8 @@ namespace nvtt
         Quality quality;
 
         nv::Vector4 colorWeight;
-
+        float rgbmThreshold;
+        
         // Pixel format description.
         uint bitcount;
         uint rmask;
diff --git a/src/nvtt/Compressor.h b/src/nvtt/Compressor.h
index f55d94c..54b9cc9 100644
--- a/src/nvtt/Compressor.h
+++ b/src/nvtt/Compressor.h
@@ -30,6 +30,7 @@
 
 namespace nv
 {
+
     struct CompressorInterface
     {
         virtual ~CompressorInterface() {}
diff --git a/src/nvtt/CompressorDX11.cpp b/src/nvtt/CompressorDX11.cpp
index a349ffa..d64e662 100644
--- a/src/nvtt/CompressorDX11.cpp
+++ b/src/nvtt/CompressorDX11.cpp
@@ -39,7 +39,7 @@ using namespace nv;
 using namespace nvtt;
 
 
-void CompressorBC6::compressBlock(const Vector4 colors[16], const float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
+void CompressorBC6::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
 {
     // !!!UNDONE: support channel weights
     // !!!UNDONE: set flags once, not per block (this is especially sketchy since block compression is multithreaded...)
@@ -77,7 +77,7 @@ void CompressorBC6::compressBlock(const Vector4 colors[16], const float weights[
     ZOH::compress(zohTile, (char *)output);
 }
 
-void CompressorBC7::compressBlock(const Vector4 colors[16], const float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
+void CompressorBC7::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
 {
     // !!!UNDONE: support channel weights
     // !!!UNDONE: set flags once, not per block (this is especially sketchy since block compression is multithreaded...)
diff --git a/src/nvtt/CompressorDX11.h b/src/nvtt/CompressorDX11.h
index 7afaacb..39247f3 100644
--- a/src/nvtt/CompressorDX11.h
+++ b/src/nvtt/CompressorDX11.h
@@ -30,14 +30,14 @@ namespace nv
 {
     struct CompressorBC6 : public FloatColorCompressor
     {
-        virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-        virtual uint blockSize() const { return 16; }
+        virtual void compressBlock(Vector4 colors[16], float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize(const nvtt::CompressionOptions::Private & ) const { return 16; }
     };
 
     struct CompressorBC7 : public FloatColorCompressor
     {
-        virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-        virtual uint blockSize() const { return 16; }
+        virtual void compressBlock(Vector4 colors[16], float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize(const nvtt::CompressionOptions::Private & ) const { return 16; }
     };
 	
 } // nv namespace
diff --git a/src/nvtt/CompressorDX9.cpp b/src/nvtt/CompressorDX9.cpp
index 9cfd7da..c05c762 100644
--- a/src/nvtt/CompressorDX9.cpp
+++ b/src/nvtt/CompressorDX9.cpp
@@ -28,7 +28,7 @@
 #include "CompressionOptions.h"
 #include "OutputOptions.h"
 #include "ClusterFit.h"
-#include "CompressorDXT1.h"
+//#include "CompressorDXT1.h"
 #include "CompressorDXT5_RGBM.h"
 
 // squish
@@ -48,45 +48,11 @@
 
 #include <new> // placement new
 
-// s3_quant
-#if defined(HAVE_S3QUANT)
-#include "s3tc/s3_quant.h"
-#endif
-
-// ati tc
-#if defined(HAVE_ATITC)
-typedef int BOOL;
-typedef _W64 unsigned long ULONG_PTR;
-typedef ULONG_PTR DWORD_PTR;
-#include "atitc/ATI_Compress.h"
-#endif
-
-// squish
-#if defined(HAVE_SQUISH)
-//#include "squish/squish.h"
-#include "squish-1.10/squish.h"
-#endif
-
-// d3dx
-#if defined(HAVE_D3DX)
-#include <d3dx9.h>
-#endif
-
-// stb
-#if defined(HAVE_STB)
-#define STB_DEFINE
-#include "stb/stb_dxt.h"
-#endif
 
 using namespace nv;
 using namespace nvtt;
 
 
-void FastCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-    BlockDXT1 * block = new(output) BlockDXT1;
-    QuickCompress::compressDXT1(rgba, block);
-}
 
 void FastCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
@@ -115,39 +81,13 @@ void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alpha
 }
 
 
-#if 1
-
-void CompressorDXT1::compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-    compress_dxt1(colors, weights, compressionOptions.colorWeight.xyz(), /*three_color_mode*/true, (BlockDXT1 *)output);
-}
-
-#else
-void CompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-    nvsquish::WeightedClusterFit fit;
-    fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
-
-    if (rgba.isSingleColor())
-    {
-        BlockDXT1 * block = new(output) BlockDXT1;
-        OptimalCompress::compressDXT1(rgba.color(0), block);
-    }
-    else
-    {
-        nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0);
-        fit.SetColourSet(&colours, nvsquish::kDxt1);
-        fit.Compress(output);
-    }
-}
-#endif
 
 void CompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
     uint alphaMask = 0;
     for (uint i = 0; i < 16; i++)
     {
-        if (rgba.color(i).a == 0) alphaMask |= (3 << (i * 2)); // Set two bits for each color.
+        if (rgba.color(i).a == 0) alphaMask |= (3U << (i * 2U)); // Set two bits for each color.
     }
 
     const bool isSingleColor = rgba.isSingleColor();
@@ -284,216 +224,6 @@ void CompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode
 }
 
 
-void CompressorBC3_RGBM::compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-    float min_m = 0.25f; // @@ Get from compression options.
-    compress_dxt5_rgbm(colors, weights, min_m, (BlockDXT5 *)output);
-}
-
-
-#if defined(HAVE_ATITC)
-
-void AtiCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-    nvDebugCheck(d == 1);
-
-    // Init source texture
-    ATI_TC_Texture srcTexture;
-    srcTexture.dwSize = sizeof(srcTexture);
-    srcTexture.dwWidth = w;
-    srcTexture.dwHeight = h;
-    if (inputFormat == nvtt::InputFormat_BGRA_8UB)
-    {
-        srcTexture.dwPitch = w * 4;
-        srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
-    }
-    else
-    {
-        // @@ Floating point input is not swizzled.
-        srcTexture.dwPitch = w * 16;
-        srcTexture.format = ATI_TC_FORMAT_ARGB_32F;
-    }
-    srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
-    srcTexture.pData = (ATI_TC_BYTE*) data;
-
-    // Init dest texture
-    ATI_TC_Texture destTexture;
-    destTexture.dwSize = sizeof(destTexture);
-    destTexture.dwWidth = w;
-    destTexture.dwHeight = h;
-    destTexture.dwPitch = 0;
-    destTexture.format = ATI_TC_FORMAT_DXT1;
-    destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
-    destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);
-
-    ATI_TC_CompressOptions options;
-    options.dwSize = sizeof(options);
-    options.bUseChannelWeighting = false;
-    options.bUseAdaptiveWeighting = false;
-    options.bDXT1UseAlpha = false;
-    options.nCompressionSpeed = ATI_TC_Speed_Normal;
-    options.bDisableMultiThreading = false;
-    //options.bDisableMultiThreading = true;
-
-    // Compress
-    ATI_TC_ConvertTexture(&srcTexture, &destTexture, &options, NULL, NULL, NULL);
-
-    if (outputOptions.outputHandler != NULL) {
-            outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
-    }
 
-    mem::free(destTexture.pData);
-}
-
-void AtiCompressorDXT5::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-    nvDebugCheck(d == 1);
-
-    // Init source texture
-    ATI_TC_Texture srcTexture;
-    srcTexture.dwSize = sizeof(srcTexture);
-    srcTexture.dwWidth = w;
-    srcTexture.dwHeight = h;
-    if (inputFormat == nvtt::InputFormat_BGRA_8UB)
-    {
-        srcTexture.dwPitch = w * 4;
-        srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
-    }
-    else
-    {
-        srcTexture.dwPitch = w * 16;
-        srcTexture.format = ATI_TC_FORMAT_ARGB_32F;
-    }
-    srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
-    srcTexture.pData = (ATI_TC_BYTE*) data;
-
-    // Init dest texture
-    ATI_TC_Texture destTexture;
-    destTexture.dwSize = sizeof(destTexture);
-    destTexture.dwWidth = w;
-    destTexture.dwHeight = h;
-    destTexture.dwPitch = 0;
-    destTexture.format = ATI_TC_FORMAT_DXT5;
-    destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
-    destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);
-
-    // Compress
-    ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL);
-
-    if (outputOptions.outputHandler != NULL) {
-        outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
-    }
-
-    mem::free(destTexture.pData);
-}
-
-#endif // defined(HAVE_ATITC)
-
-#if defined(HAVE_SQUISH)
-
-void SquishCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-    nvDebugCheck(d == 1);
-    nvDebugCheck(false);
-
-#pragma message(NV_FILE_LINE "TODO: Convert input to fixed point ABGR format instead of ARGB")
-    /*
-    Image img(*image);
-    int count = img.width() * img.height();
-    for (int i = 0; i < count; i++)
-    {
-            Color32 c = img.pixel(i);
-            img.pixel(i) = Color32(c.b, c.g, c.r, c.a);
-    }
-
-    int size = squish::GetStorageRequirements(img.width(), img.height(), squish::kDxt1);
-    void * blocks = mem::malloc(size);
-
-    squish::CompressImage((const squish::u8 *)img.pixels(), img.width(), img.height(), blocks, squish::kDxt1 | squish::kColourClusterFit);
-
-    if (outputOptions.outputHandler != NULL) {
-            outputOptions.outputHandler->writeData(blocks, size);
-    }
-
-    mem::free(blocks);
-    */
-}
-
-#endif // defined(HAVE_SQUISH)
-
-
-#if defined(HAVE_D3DX)
-
-void D3DXCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-    nvDebugCheck(d == 1);
-
-    IDirect3D9 * d3d = Direct3DCreate9(D3D_SDK_VERSION);
-
-    D3DPRESENT_PARAMETERS presentParams;
-    ZeroMemory(&presentParams, sizeof(presentParams));
-    presentParams.Windowed = TRUE;
-    presentParams.SwapEffect = D3DSWAPEFFECT_COPY;
-    presentParams.BackBufferWidth = 8;
-    presentParams.BackBufferHeight = 8;
-    presentParams.BackBufferFormat = D3DFMT_UNKNOWN;
-
-    HRESULT err;
-
-    IDirect3DDevice9 * device = NULL;
-    err = d3d->CreateDevice(D3DADAPTER_DEFAULT, D3DDEVTYPE_REF, GetDesktopWindow(), D3DCREATE_SOFTWARE_VERTEXPROCESSING, &presentParams, &device);
-
-    IDirect3DTexture9 * texture = NULL;
-    err = D3DXCreateTexture(device, w, h, 1, 0, D3DFMT_DXT1, D3DPOOL_SYSTEMMEM, &texture);
-
-    IDirect3DSurface9 * surface = NULL;
-    err = texture->GetSurfaceLevel(0, &surface);
-
-    RECT rect;
-    rect.left = 0;
-    rect.top = 0;
-    rect.bottom = h;
-    rect.right = w;
-
-    if (inputFormat == nvtt::InputFormat_BGRA_8UB)
-    {
-        err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A8R8G8B8, w * 4, NULL, &rect, D3DX_DEFAULT, 0);
-    }
-    else
-    {
-        err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A32B32G32R32F, w * 16, NULL, &rect, D3DX_DEFAULT, 0);
-    }
-
-    if (err != D3DERR_INVALIDCALL && err != D3DXERR_INVALIDDATA)
-    {
-        D3DLOCKED_RECT rect;
-        ZeroMemory(&rect, sizeof(rect));
-
-        err = surface->LockRect(&rect, NULL, D3DLOCK_READONLY);
-
-	    if (outputOptions.outputHandler != NULL) {
-	        int size = rect.Pitch * ((h + 3) / 4);
-	        outputOptions.outputHandler->writeData(rect.pBits, size);
-	    }
-
-        err = surface->UnlockRect();
-    }
-
-    surface->Release();
-    device->Release();
-    d3d->Release();
-}
-
-#endif // defined(HAVE_D3DX)
-
-
-#if defined(HAVE_STB)
-
-void StbCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
-{
-    rgba.swizzle(2, 1, 0, 3); // Swap R and B
-    stb_compress_dxt_block((unsigned char *)output, (unsigned char *)rgba.colors(), 0, 0);
-}
 
 
-#endif // defined(HAVE_STB)
diff --git a/src/nvtt/CompressorDX9.h b/src/nvtt/CompressorDX9.h
index 8a298c6..7cb13a3 100644
--- a/src/nvtt/CompressorDX9.h
+++ b/src/nvtt/CompressorDX9.h
@@ -32,12 +32,6 @@ namespace nv
     struct ColorBlock;
 
     // Fast CPU compressors.
-    struct FastCompressorDXT1 : public ColorBlockCompressor
-    {
-        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-        virtual uint blockSize() const { return 8; }
-    };
-
     struct FastCompressorDXT1a : public ColorBlockCompressor
     {
         virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
@@ -64,19 +58,6 @@ namespace nv
 
 
     // Normal CPU compressors.
-#if 1
-    struct CompressorDXT1 : public FloatColorCompressor
-    {
-        virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-        virtual uint blockSize() const { return 8; }
-    };
-#else
-    struct CompressorDXT1 : public ColorBlockCompressor
-    {
-        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-        virtual uint blockSize() const { return 8; }
-    };
-#endif
 
     struct CompressorDXT1a : public ColorBlockCompressor
     {
@@ -108,47 +89,9 @@ namespace nv
         virtual uint blockSize() const { return 16; }
     };
 
-    struct CompressorBC3_RGBM : public FloatColorCompressor
-    {
-        virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-        virtual uint blockSize() const { return 16; }
-    };
-
-
-    // External compressors.
-#if defined(HAVE_ATITC)
-    struct AtiCompressorDXT1 : public CompressorInterface
-    {
-        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-    };
 
-    struct AtiCompressorDXT5 : public CompressorInterface
-    {
-        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-    };
-#endif
 
-#if defined(HAVE_SQUISH)
-    struct SquishCompressorDXT1 : public CompressorInterface
-    {
-        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-    };
-#endif
 
-#if defined(HAVE_D3DX)
-    struct D3DXCompressorDXT1 : public CompressorInterface
-    {
-        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-    };
-#endif
-
-#if defined(HAVE_STB)
-    struct StbCompressorDXT1 : public ColorBlockCompressor
-    {
-        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
-        virtual uint blockSize() const { return 8; }
-    };
-#endif
 
 } // nv namespace
 
diff --git a/src/nvtt/CompressorDXT1.cpp b/src/nvtt/CompressorDXT1.cpp
index 08134f8..8a09669 100644
--- a/src/nvtt/CompressorDXT1.cpp
+++ b/src/nvtt/CompressorDXT1.cpp
@@ -218,13 +218,13 @@ static int evaluate_mse(const Color32 & p, const Color32 & c) {
     return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b));
 }
 
-static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, const Vector3 & w) {
+/*static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, const Vector3 & w) {
     float e0 = evaluate_mse(palette[0], c, w);
     float e1 = evaluate_mse(palette[1], c, w);
     float e2 = evaluate_mse(palette[2], c, w);
     float e3 = evaluate_mse(palette[3], c, w);
     return min(min(e0, e1), min(e2, e3));
-}
+}*/
 
 static int evaluate_mse(const Color32 palette[4], const Color32 & c) {
     int e0 = evaluate_mse(palette[0], c);
@@ -245,12 +245,12 @@ static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) {
 // Returns weighted MSE error in [0-255] range.
 static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) {
     
-	float total = 0.0f;
-	for (int i = 0; i < count; i++) {
+    float total = 0.0f;
+    for (int i = 0; i < count; i++) {
         total += weights[i] * evaluate_mse(palette, colors[i]);
-	}
+    }
 
-	return total;
+    return total;
 }
 
 #if 0
@@ -337,7 +337,7 @@ static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) {
     }
 }
 
-static void evaluate_palette3(Color16 c0, Color16 c1, Vector3 palette[4]) {
+/*static void evaluate_palette3(Color16 c0, Color16 c1, Vector3 palette[4]) {
     nvDebugCheck(c0.u > c1.u);
 
     Color32 palette32[4];
@@ -346,7 +346,7 @@ static void evaluate_palette3(Color16 c0, Color16 c1, Vector3 palette[4]) {
     for (int i = 0; i < 4; i++) {
         palette[i] = color_to_vector3(palette32[i]);
     }
-}
+}*/
 
 
 
@@ -355,38 +355,38 @@ static void evaluate_palette3(Color16 c0, Color16 c1, Vector3 palette[4]) {
 static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
     
     uint indices = 0;
-	for (int i = 0; i < 16; i++) {
-		float d0 = evaluate_mse(palette[0], input_colors[i].xyz(), color_weights);
-		float d1 = evaluate_mse(palette[1], input_colors[i].xyz(), color_weights);
-		float d2 = evaluate_mse(palette[2], input_colors[i].xyz(), color_weights);
-		float d3 = evaluate_mse(palette[3], input_colors[i].xyz(), color_weights);
-		
-		uint b0 = d0 > d3;
-		uint b1 = d1 > d2;
-		uint b2 = d0 > d2;
-		uint b3 = d1 > d3;
-		uint b4 = d2 > d3;
-		
-		uint x0 = b1 & b2;
-		uint x1 = b0 & b3;
-		uint x2 = b0 & b4;
-		
-		indices |= (x2 | ((x0 | x1) << 1)) << (2 * i);
-	}
+    for (int i = 0; i < 16; i++) {
+        float d0 = evaluate_mse(palette[0], input_colors[i].xyz(), color_weights);
+        float d1 = evaluate_mse(palette[1], input_colors[i].xyz(), color_weights);
+        float d2 = evaluate_mse(palette[2], input_colors[i].xyz(), color_weights);
+        float d3 = evaluate_mse(palette[3], input_colors[i].xyz(), color_weights);
+
+        uint b0 = d0 > d3;
+        uint b1 = d1 > d2;
+        uint b2 = d0 > d2;
+        uint b3 = d1 > d3;
+        uint b4 = d2 > d3;
+
+        uint x0 = b1 & b2;
+        uint x1 = b0 & b3;
+        uint x2 = b0 & b4;
+
+        indices |= (x2 | ((x0 | x1) << 1)) << (2 * i);
+    }
 
-	return indices;
+    return indices;
 }
 
 
 static uint compute_indices(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
     
     uint indices = 0;
-	for (int i = 0; i < 16; i++) {
-		float d0 = evaluate_mse(palette[0], input_colors[i].xyz(), color_weights);
-		float d1 = evaluate_mse(palette[1], input_colors[i].xyz(), color_weights);
-		float d2 = evaluate_mse(palette[2], input_colors[i].xyz(), color_weights);
-		float d3 = evaluate_mse(palette[3], input_colors[i].xyz(), color_weights);
-		
+    for (int i = 0; i < 16; i++) {
+        float d0 = evaluate_mse(palette[0], input_colors[i].xyz(), color_weights);
+        float d1 = evaluate_mse(palette[1], input_colors[i].xyz(), color_weights);
+        float d2 = evaluate_mse(palette[2], input_colors[i].xyz(), color_weights);
+        float d3 = evaluate_mse(palette[3], input_colors[i].xyz(), color_weights);
+
         uint index;
         if (d0 < d1 && d0 < d2 && d0 < d3) index = 0;
         else if (d1 < d2 && d1 < d3) index = 1;
@@ -491,7 +491,8 @@ float nv::compress_dxt1_single_color(const Vector3 * colors, const float * weigh
 
     // Decompress block color.
     Color32 palette[4];
-    output->evaluatePalette(palette, /*d3d9=*/false);
+    evaluate_palette(output->col0, output->col1, palette);
+    //output->evaluatePalette(palette, /*d3d9=*/false);
 
     Vector3 block_color = color_to_vector3(palette[output->indices & 0x3]);
 
@@ -668,7 +669,7 @@ float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weight
 
     // This is too expensive, even with a low threshold.
     // If high quality:
-    if (0) {
+    if (/* DISABLES CODE */ (0)) {
         BlockDXT1 exhaustive_output;
         float exhaustive_error = compress_dxt1_bounding_box_exhaustive(input_colors, colors, weights, count, color_weights, three_color_mode, 1400, &exhaustive_output);
 
@@ -720,7 +721,7 @@ float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weight
 
 
 // Least squares fitting of color end points for the given indices. @@ Take weights into account.
-static bool optimize_end_points4(uint indices, const Vector3 * colors, const Vector3 * weights, int count, Vector3 * a, Vector3 * b)
+static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
 {
     float alpha2_sum = 0.0f;
     float beta2_sum = 0.0f;
@@ -739,8 +740,8 @@ static bool optimize_end_points4(uint indices, const Vector3 * colors, const Vec
         alpha2_sum += alpha * alpha;
         beta2_sum += beta * beta;
         alphabeta_sum += alpha * beta;
-        alphax_sum += alpha * colors[i];
-        betax_sum += beta * colors[i];
+        alphax_sum += alpha * colors[i].xyz();
+        betax_sum += beta * colors[i].xyz();
     }
 
     float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
@@ -756,7 +757,7 @@ static bool optimize_end_points4(uint indices, const Vector3 * colors, const Vec
 
 
 // Least squares fitting of color end points for the given indices. @@ This does not support black/transparent index. @@ Take weights into account.
-static bool optimize_end_points3(uint indices, const Vector3 * colors, const Vector3 * weights, int count, Vector3 * a, Vector3 * b)
+static bool optimize_end_points3(uint indices, const Vector3 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
 {
     float alpha2_sum = 0.0f;
     float beta2_sum = 0.0f;
@@ -794,6 +795,90 @@ static bool optimize_end_points3(uint indices, const Vector3 * colors, const Vec
 
 
 
+// Bounding box of `colors` in color space: *c0 receives the running maximum and *c1 the running minimum (c0 = max, c1 = min).
+inline static void fit_colors_bbox(const Vector3 * colors, int count, Vector3 * restrict c0, Vector3 * restrict c1)
+{
+    *c0 = Vector3(0);      // Seed of the maximum; with count == 0 the seeds are what the caller gets back.
+    *c1 = Vector3(255);    // Seed of the minimum. NOTE(review): inset_bbox's 8/255 bias looks like [0, 1] units -- confirm 255 is intended.
+
+    for (int i = 0; i < count; i++) {
+        *c0 = max(*c0, colors[i]);  // Presumably component-wise on Vector3 -- max/min are defined outside this hunk.
+        *c1 = min(*c1, colors[i]);
+    }
+}
+
+// Picks the bounding-box diagonal that follows the colors: swaps x and/or y between *c0 and *c1, using z as the reference axis.
+inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * restrict c0, Vector3 * restrict c1)
+{
+    Vector3 center = (*c0 + *c1) * 0.5f;  // Midpoint of the incoming box corners.
+    Vector2 covariance = Vector2(0);      // Sums of x*z and y*z, measured relative to the center.
+    for (int i = 0; i < count; i++) {
+        Vector3 t = colors[i] - center;
+        covariance += t.xy() * t.z;
+    }
+
+    float x0 = c0->x;
+    float y0 = c0->y;
+    float x1 = c1->x;
+    float y1 = c1->y;
+
+    if (covariance.x < 0) {  // Negative x/z sum: exchange the x ends of the diagonal.
+        swap(x0, x1);
+    }
+    if (covariance.y < 0) {  // Same test for y.
+        swap(y0, y1);
+    }
+
+    c0->set(x0, y0, c0->z);  // z components are written back unchanged.
+    c1->set(x1, y1, c1->z);
+}
+
+// Moves the box corners toward each other by 1/16 of the box extent, less a bias of (8/255)/16, and clamps the result to [0, 1].
+inline static void inset_bbox(Vector3 * restrict c0, Vector3 * restrict c1)
+{
+    Vector3 inset = (*c0 - *c1) / 16.0f - (8.0f / 255.0f) / 16.0f;  // Negative when the extent is below 8/255: the corners then move apart.
+    *c0 = clamp(*c0 - inset, 0.0f, 1.0f);  // The bias is in [0, 1] units, so clamp to that range; the former 255 bound let a corner exceed 1.
+    *c1 = clamp(*c1 + inset, 0.0f, 1.0f);
+}
+
+
+// Fast DXT1 encoder: tries a single-color block, then bounding-box end points followed by one least-squares refinement. Returns the block error.
+float nv::compress_dxt1_fast(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output)
+{
+    Vector3 colors[16];
+    float weights[16];
+    int count = reduce_colors(input_colors, input_weights, colors, weights);
+
+    if (count == 0) {
+        // Output trivial block.
+        output->col0.u = 0;
+        output->col1.u = 0;
+        output->indices = 0;
+        return 0;
+    }
+
+    float error = FLT_MAX;  // Placeholder; overwritten by the call on the next line.
+    error = compress_dxt1_single_color(colors, weights, count, color_weights, output);
+
+    if (error == 0.0f || count == 1) {
+        // Early out.
+        return error;
+    }
+
+    // Quick end point selection.
+    Vector3 c0, c1;
+    fit_colors_bbox(colors, count, &c0, &c1);
+    select_diagonal(colors, count, &c0, &c1);
+    inset_bbox(&c0, &c1);
+    output_block4(input_colors, color_weights, c0, c1, output);  // NOTE(review): presumably overwrites the single-color block without checking it beats `error`.
+
+    // Refine color for the selected indices.
+    if (optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {  // Fits all 16 input colors, not the reduced set; no weights are passed.
+        output_block4(input_colors, color_weights, c0, c1, output);
+    }
+
+    return evaluate_mse(input_colors, input_weights, color_weights, output);
+}
 
 
 
diff --git a/src/nvtt/CompressorDXT1.h b/src/nvtt/CompressorDXT1.h
index c7e51d7..ac5bdb5 100644
--- a/src/nvtt/CompressorDXT1.h
+++ b/src/nvtt/CompressorDXT1.h
@@ -13,11 +13,14 @@ namespace nv {
     float compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output);
 
     float compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output);
-    float compress_dxt1_least_squares_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output);
+    //float compress_dxt1_least_squares_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output);
     float compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, int search_limit, BlockDXT1 * output);
     void compress_dxt1_cluster_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output);
 
-
+    // Cluster fit end point selection.
     float compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output);
 
+    // Quick end point selection followed by least squares refinement.
+    float compress_dxt1_fast(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output);
+
 }
diff --git a/src/nvtt/CompressorDXT5_RGBM.cpp b/src/nvtt/CompressorDXT5_RGBM.cpp
index 99dd412..3274f2c 100644
--- a/src/nvtt/CompressorDXT5_RGBM.cpp
+++ b/src/nvtt/CompressorDXT5_RGBM.cpp
@@ -3,6 +3,7 @@
 
 #include "OptimalCompressDXT.h"
 #include "QuickCompressDXT.h"
+#include "CompressorETC.h"
 
 #include "nvimage/ColorBlock.h"
 #include "nvimage/BlockDXT.h"
@@ -17,38 +18,45 @@
 
 using namespace nv;
 
-//static uint atomic_counter = 0;
-
-
-float nv::compress_dxt5_rgbm(const Vector4 input_colors[16], const float input_weights[16], float min_m, BlockDXT5 * output) {
 
-    // Convert to RGBM.
-    Vector4 input_colors_rgbm[16]; // @@ Write over input_colors?
-    float rgb_weights[16];
 
+static void convert_to_rgbm(const Vector4 input_colors[16], const float input_weights[16], float min_m, Vector4 rgbm_colors[16], float rgb_weights[16]) {
+    
     float weight_sum = 0;
-
+    
     for (uint i = 0; i < 16; i++) {
         const Vector4 & c = input_colors[i];
-
+        
         float R = saturate(c.x);
         float G = saturate(c.y);
         float B = saturate(c.z);
-
+        
         float M = max(max(R, G), max(B, min_m));
         float r = R / M;
         float g = G / M;
         float b = B / M;
         float a = (M - min_m) / (1 - min_m);
-
-        input_colors_rgbm[i] = Vector4(r, g, b, a);
+        
+        rgbm_colors[i] = Vector4(r, g, b, a);
         rgb_weights[i] = input_weights[i] * M;
         weight_sum += input_weights[i];
     }
-
+    
     if (weight_sum == 0) {
         for (uint i = 0; i < 16; i++) rgb_weights[i] = 1;
     }
+    
+}
+
+
+//static uint atomic_counter = 0;
+
+float nv::compress_dxt5_rgbm(const Vector4 input_colors[16], const float input_weights[16], float min_m, BlockDXT5 * output) {
+
+    // Convert to RGBM.
+    Vector4 input_colors_rgbm[16]; // @@ Write over input_colors?
+    float rgb_weights[16];
+    convert_to_rgbm(input_colors, input_weights, min_m, input_colors_rgbm, rgb_weights);
 
     // Compress RGB.
     compress_dxt1(input_colors_rgbm, rgb_weights, Vector3(1), /*three_color_mode=*/false, &output->color);
@@ -138,291 +146,61 @@ float nv::compress_dxt5_rgbm(const Vector4 input_colors[16], const float input_w
 }
 
 
-
-
-#if 0
-
-    BlockDXT5 * block = new(output)BlockDXT5;
-
-    // Decompress the color block and find the M values that reproduce the input most closely. This should compensate for some of the DXT errors.
-
-    // Compress the resulting M values optimally.
-
-    // Repeat this several times until compression error does not improve?
-
-    //Vector3 rgb_block[16];
-    //float m_block[16];
-
-
-    // Init RGB/M block.
-#if 0
-    nvsquish::WeightedClusterFit fit;
-
-    ColorBlock rgba;
-    for (int i = 0; i < 16; i++) {
-        const Vector4 & c = src.color(i);
-        float R = saturate(c.x);
-        float G = saturate(c.y);
-        float B = saturate(c.z);
-
-        float M = max(max(R, G), max(B, min_m));
-        float r = R / M;
-        float g = G / M;
-        float b = B / M;
-        float a = c.w;
-
-        rgba.color(i) = toColor32(Vector4(r, g, b, a));
-    }
-
-    if (rgba.isSingleColor())
-    {
-        OptimalCompress::compressDXT1(rgba.color(0), &block->color);
-    }
-    else
-    {
-        nvsquish::WeightedClusterFit fit;
-        fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
-
-        int flags = 0;
-        if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
-
-        nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-        fit.SetColourSet(&colours, 0);
-        fit.Compress(&block->color);
-    }
-#endif
-#if 1
-    ColorSet rgb;
-    rgb.allocate(4, 4);
-
-    for (uint i = 0; i < 16; i++) {
-        const Vector4 & c = colors[i];
-
-        float R = saturate(c.x);
-        float G = saturate(c.y);
-        float B = saturate(c.z);
-
-        float M = max(max(R, G), max(B, min_m));
-        float r = R / M;
-        float g = G / M;
-        float b = B / M;
-        float a = c.w;
-
-        rgb.colors[i] = Vector4(r, g, b, a);
-        rgb.indices[i] = i;
-        rgb.weights[i] = max(weights[i], 0.001f);// weights[i];   // IC: For some reason 0 weights are causing problems, even if we eliminate the corresponding colors from the set.
-    }
-
-    rgb.createMinimalSet(/*ignoreTransparent=*/true);
-
-    if (rgb.isSingleColor(/*ignoreAlpha=*/true)) {
-        OptimalCompress::compressDXT1(toColor32(rgb.color(0)), &block->color);
-    }
-    else {
-        ClusterFit fit;
-        fit.setColorWeights(compressionOptions.colorWeight);
-        fit.setColorSet(&rgb);
-
-        Vector3 start, end;
-        fit.compress4(&start, &end);
-
-        QuickCompress::outputBlock4(rgb, start, end, &block->color);
-    }
-#endif
-
+float nv::compress_etc2_rgbm(Vector4 input_colors[16], float input_weights[16], float min_m, void * output) {
+    
+    // Convert to RGBM.
+    Vector4 rgbm_colors[16];
+    float rgb_weights[16];
+    convert_to_rgbm(input_colors, input_weights, min_m, rgbm_colors, rgb_weights);
+    
+    void * etc_output = (uint8 *)output + 8;
+    void * eac_output = output;
+    
+    // Compress RGB.
+    compress_etc2(rgbm_colors, rgb_weights, Vector3(1), etc_output);
+    
     // Decompress RGB/M block.
-    nv::ColorBlock RGB;
-    block->color.decodeBlock(&RGB);
+    decompress_etc(etc_output, rgbm_colors);
     
-#if 1
-    AlphaBlock4x4 M;
+    // Compute M values to compensate for RGB's error.
     for (int i = 0; i < 16; i++) {
-        const Vector4 & c = colors[i];
+        const Vector4 & c = input_colors[i];
+        
         float R = saturate(c.x);
         float G = saturate(c.y);
         float B = saturate(c.z);
-
-        float r = RGB.color(i).r / 255.0f;
-        float g = RGB.color(i).g / 255.0f;
-        float b = RGB.color(i).b / 255.0f;
-
-        float m = (R / r + G / g + B / b) / 3.0f;
-        //float m = max((R / r + G / g + B / b) / 3.0f, min_m);
-        //float m = max(max(R / r, G / g), max(B / b, min_m));
-        //float m = max(max(R, G), max(B, min_m));
+        
+        float rm = rgbm_colors[i].x;
+        float gm = rgbm_colors[i].y;
+        float bm = rgbm_colors[i].z;
+        
+        // compute m such that m * (r/M, g/M, b/M) == RGB
+        
+        // Three equations, one unknown:
+        //  m * r/M == R
+        //  m * g/M == G
+        //  m * b/M == B
+        
+        // Solve in the least squares sense!
+        
+        // m (rm gm bm) (rm gm bm)^T == (rm gm bm) (R G B)^T
+        
+        // m == dot(rgb, RGB) / dot(rgb, rgb)
+        
+        float m = dot(Vector3(rm, gm, bm), Vector3(R, G, B)) / dot(Vector3(rm, gm, bm), Vector3(rm, gm, bm));
+        if (!isFinite(m)) {
+            m = 1;
+        }
 
         m = (m - min_m) / (1 - min_m);
-
-        M.alpha[i] = U8(ftoi_round(saturate(m) * 255.0f));
-        M.weights[i] = weights[i];
-    }
-
-    // Compress M.
-    if (compressionOptions.quality == Quality_Fastest) {
-        QuickCompress::compressDXT5A(M, &block->alpha);
-    }
-    else {
-        OptimalCompress::compressDXT5A(M, &block->alpha);
-    }
-#else
-    OptimalCompress::compressDXT5A_RGBM(src, RGB, &block->alpha);
-#endif
-
-#if 0
-    // Decompress M.
-    block->alpha.decodeBlock(&M);
-
-    rgb.allocate(src.w, src.h);     // @@ Handle smaller blocks.
-
-    for (uint i = 0; i < src.colorCount; i++) {
-        const Vector4 & c = src.color(i);
-
-        float R = saturate(c.x);
-        float G = saturate(c.y);
-        float B = saturate(c.z);
-
-        //float m = max(max(R, G), max(B, min_m));
-        float m = float(M.alpha[i]) / 255.0f * (1 - min_m) + min_m;
-        float r = R / m;
-        float g = G / m;
-        float b = B / m;
-        float a = c.w;
-
-        rgb.colors[i] = Vector4(r, g, b, a);
-        rgb.indices[i] = i;
-        rgb.weights[i] = max(c.w, 0.001f);// src.weights[i];   // IC: For some reason 0 weights are causing problems, even if we eliminate the corresponding colors from the set.
-    }
-
-    rgb.createMinimalSet(/*ignoreTransparent=*/true);
-
-    if (rgb.isSingleColor(/*ignoreAlpha=*/true)) {
-        OptimalCompress::compressDXT1(toColor32(rgb.color(0)), &block->color);
-    }
-    else {
-        ClusterFit fit;
-        fit.setMetric(compressionOptions.colorWeight);
-        fit.setColourSet(&rgb);
-
-        Vector3 start, end;
-        fit.compress4(&start, &end);
-
-        QuickCompress::outputBlock4(rgb, start, end, &block->color);
+        
+        // Store M in alpha channel.
+        rgbm_colors[i].w = saturate(m);     // @@ What if we don't saturate?
     }
-#endif
-
-#if 0
-    block->color.decodeBlock(&RGB);
-
-    //AlphaBlock4x4 M;
-    //M.initWeights(src);
     
-    for (int i = 0; i < 16; i++) {
-        const Vector4 & c = src.color(i);
-        float R = saturate(c.x);
-        float G = saturate(c.y);
-        float B = saturate(c.z);
-
-        float r = RGB.color(i).r / 255.0f;
-        float g = RGB.color(i).g / 255.0f;
-        float b = RGB.color(i).b / 255.0f;
-
-        float m = (R / r + G / g + B / b) / 3.0f;
-        //float m = max((R / r + G / g + B / b) / 3.0f, min_m);
-        //float m = max(max(R / r, G / g), max(B / b, min_m));
-        //float m = max(max(R, G), max(B, min_m));
-
-        m = (m - min_m) / (1 - min_m);
-
-        M.alpha[i] = U8(ftoi_round(saturate(m) * 255.0f));
-        M.weights[i] = src.weights[i];
-    }
-
     // Compress M.
-    if (compressionOptions.quality == Quality_Fastest) {
-        QuickCompress::compressDXT5A(M, &block->alpha);
-    }
-    else {
-        OptimalCompress::compressDXT5A(M, &block->alpha);
-    }
-#endif
-
-
-
-#if 0
-    src.fromRGBM(M, min_m);
+    compress_eac(rgbm_colors, input_weights, /*input_channel=*/3, /*search_radius=*/1, /*11bit_mode*/false, eac_output);
 
-    src.createMinimalSet(/*ignoreTransparent=*/true);
-
-    if (src.isSingleColor(/*ignoreAlpha=*/true)) {
-        OptimalCompress::compressDXT1(src.color(0), &block->color);
-    }
-    else {
-        // @@ Use our improved compressor.
-        ClusterFit fit;
-        fit.setMetric(compressionOptions.colorWeight);
-        fit.setColourSet(&src);
-
-        Vector3 start, end;
-        fit.compress4(&start, &end);
-
-        if (fit.compress3(&start, &end)) {
-            QuickCompress::outputBlock3(src, start, end, block->color);
-        }
-        else {
-            QuickCompress::outputBlock4(src, start, end, block->color);
-        }
-    }
-#endif // 0
-
-    // @@ Decompress color and compute M that best approximates src with these colors? Then compress M again?
-
-
-
-    // RGBM encoding.
-    // Maximize precision.
-    // - Number of possible grey levels:
-    //   - Naive:  2^3 = 8
-    //   - Better: 2^3 + 2^2 = 12
-    //   - How to choose min_m? 
-    //     - Ideal = Adaptive per block, don't know where to store.
-    //     - Adaptive per lightmap. How to compute optimal?
-    //     - Fixed: 0.25 in our case. Lightmaps scaled to a fixed [0, 1] range.
-
-    // - Optimal compressor: Interpolation artifacts.
-
-    // - Color transform. 
-    //    - Measure error in post-tone-mapping color space. 
-    //    - Assume a simple tone mapping operator. We know minimum and maximum exposure, but don't know exact exposure in game.
-    //    - Guess based on average lighmap color? Use fixed exposure, in scaled lightmap space.
-
-    // - Enhanced DXT compressor.
-    //    - Typical RGBM encoding as follows:
-    //      rgb -> M = max(rgb), RGB=rgb/M -> RGBM
-    //    - If we add a compression step (M' = M) and M' < M, then rgb may be greater than 1.
-    //      - We could ensure that M' >= M during compression.
-    //      - We could clamp RGB anyway.
-    //      - We could add a fixed scale value to take into account compression errors and avoid clamping.
-
-
-    
-
-
-    // Compress color.
-    /*if (rgba.isSingleColor())
-    {
-        OptimalCompress::compressDXT1(rgba.color(0), &block->color);
-    }
-    else
-    {
-        nvsquish::WeightedClusterFit fit;
-        fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z);
-
-        int flags = 0;
-        if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
-
-        nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-        fit.SetColourSet(&colours, 0);
-        fit.Compress(&block->color);
-    }*/
+    return 0; // @@ Compute error.
+}
 
-#endif // 0
\ No newline at end of file
diff --git a/src/nvtt/CompressorDXT5_RGBM.h b/src/nvtt/CompressorDXT5_RGBM.h
index 88cf646..3fdffe5 100644
--- a/src/nvtt/CompressorDXT5_RGBM.h
+++ b/src/nvtt/CompressorDXT5_RGBM.h
@@ -5,5 +5,5 @@ namespace nv {
     class Vector4;
 
     float compress_dxt5_rgbm(const Vector4 input_colors[16], const float input_weights[16], float min_m, BlockDXT5 * output);
-
+    float compress_etc2_rgbm(Vector4 input_colors[16], float input_weights[16], float min_m, void * output);
 }
diff --git a/src/nvtt/CompressorETC.cpp b/src/nvtt/CompressorETC.cpp
new file mode 100644
index 0000000..4fb8e1c
--- /dev/null
+++ b/src/nvtt/CompressorETC.cpp
@@ -0,0 +1,2307 @@
+
+#include "CompressorETC.h"
+
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Color.inl"
+#include "nvcore/Utils.h"    // clamp
+
+#define HAVE_RGETC NV_OS_OSX
+#define HAVE_ETCPACK 0 // Only enable in OSX for debugging.
+
+#if HAVE_RGETC
+#include "rg_etc1.h"
+#endif
+
+#if HAVE_ETCPACK
+// From etcpack.cxx
+extern void decompressBlockETC2(unsigned int block_part1, unsigned int block_part2, uint8 *img, int width, int height, int startx, int starty);
+extern void decompressBlockAlpha(uint8* data, uint8* img, int width, int height, int ix, int iy);
+extern void decompressBlockAlpha16bit(uint8* data, uint8* img, int width, int height, int ix, int iy);
+extern int formatSigned;
+#endif
+
+#define assert nvCheck
+
+using namespace nv;
+
+// TODO:
+// - Accurate rounding of signed 3-bit components.
+// - Range based table selection.
+// - Slower try all options table selection?
+// - Trivial selector assignment.
+// * Base point optimization.
+// * Brute force base point optimization.
+// - Enumerate and evaluate all clusters.
+// - Brute force planar mode endpoint refinement. For each color try two rounding directions (8 tests).
+// - T & H modes decompression.
+
+union BlockETC {  // One 64-bit ETC color block: raw integer views overlaid with the bit-field layout of each encoding mode.
+    // Definitions from EtcLib/EtcBlock4x4EncodingBits.h. NOTE(review): bit-field allocation order is implementation-defined; the byte annotations presumably assume LSB-first packing.
+    struct Individual {     // Two base colors with 4 bits per channel each.
+        uint red2 : 4;      // byte 0
+        uint red1 : 4;
+        uint green2 : 4;    // byte 1
+        uint green1 : 4;
+        uint blue2 : 4;     // byte 2
+        uint blue1 : 4;
+        uint flip : 1;      // byte 3
+        uint diff : 1;
+        uint cw2 : 3;
+        uint cw1 : 3;
+        uint selectors;     // bytes 4-7
+    };
+    NV_COMPILER_CHECK(sizeof(BlockETC::Individual) == 64/8);
+
+    struct Differential {   // 5-bit base color (red1/green1/blue1) plus 3-bit dred2/dgreen2/dblue2 fields.
+        uint dred2 : 3;     // byte 0
+        uint red1 : 5;
+        uint dgreen2 : 3;   // byte 1
+        uint green1 : 5;
+        uint dblue2 : 3;    // byte 2
+        uint blue1 : 5;
+        uint flip : 1;      // byte 3
+        uint diff : 1;
+        uint cw2 : 3;
+        uint cw1 : 3;
+        uint selectors;     // bytes 4-7
+    };
+    NV_COMPILER_CHECK(sizeof(Differential) == 64/8);
+
+    struct T {              // Layout written by pack_etc2_block for ETC_Data::Mode_T.
+        uint red1b : 2;     // byte 0
+        uint detect2 : 1;
+        uint red1a : 2;
+        uint detect1 : 3;
+        uint blue1 : 4;     // byte 1
+        uint green1 : 4;
+        uint green2 : 4;    // byte 2
+        uint red2 : 4;
+        uint db : 1;        // byte 3
+        uint diff : 1;
+        uint da : 2;
+        uint blue2 : 4;
+        uint selectors;     // bytes 4-7
+    };
+    NV_COMPILER_CHECK(sizeof(T) == 64/8);
+
+    struct H {              // Presumably the layout for ETC_Data::Mode_H.
+        uint green1a : 3;   // byte 0
+        uint red1 : 4;
+        uint detect1 : 1;
+        uint blue1b : 2;    // byte 1
+        uint detect3 : 1;
+        uint blue1a : 1;
+        uint green1b : 1;
+        uint detect2 : 3;
+        uint green2a : 3;   // byte 2
+        uint red2 : 4;
+        uint blue1c : 1;
+        uint db : 1;        // byte 3
+        uint diff : 1;
+        uint da : 1;
+        uint blue2 : 4;
+        uint green2b : 1;
+        uint selectors;     // bytes 4-7
+    };
+    NV_COMPILER_CHECK(sizeof(H) == 64/8);
+
+    struct Planar {         // Origin/horiz/vert colors with channels split across bytes; presumably the ETC_Data::Mode_Planar layout.
+        uint originGreen1 : 1;  // byte 0
+        uint originRed : 6;
+        uint detect1 : 1;
+        uint originBlue1 : 1;   // byte 1
+        uint originGreen2 : 6;
+        uint detect2 : 1;
+        uint originBlue3 : 2;   // byte 2
+        uint detect4 : 1;
+        uint originBlue2 : 2;
+        uint detect3 : 3;
+        uint horizRed2 : 1;     // byte 3
+        uint diff : 1;
+        uint horizRed1 : 5;
+        uint originBlue4 : 1;
+        uint horizBlue1: 1;     // byte 4
+        uint horizGreen : 7;
+        uint vertRed1 : 3;      // byte 5
+        uint horizBlue2 : 5;
+        uint vertGreen1 : 5;    // byte 6
+        uint vertRed2 : 3;
+        uint vertBlue : 6;      // byte 7
+        uint vertGreen2 : 2;
+    };
+    NV_COMPILER_CHECK(sizeof(Planar) == 64/8);
+
+    // Raw and per-mode views of the same eight bytes.
+    uint64 data64;
+    uint32 data32[2];
+    uint8 data8[8];
+    Individual individual;
+    Differential differential;
+    T t;
+    H h;
+    Planar planar;
+};
+NV_COMPILER_CHECK(sizeof(BlockETC) == 64/8);
+
+
+
+static const int etc_intensity_modifiers[8][4] = {  // Eight rows of four symmetric, ascending offsets; presumably indexed [table][selector].
+    { -8,  -2,   2,   8 },
+    { -17,  -5,  5,  17 },
+    { -29,  -9,   9,  29 },
+    {  -42, -13, 13,  42 },
+    { -60, -18, 18,  60 },
+    { -80, -24, 24,  80 },
+    { -106, -33, 33, 106 },
+    { -183, -47, 47, 183 }
+};
+
+static const int etc_intensity_range[8] = {  // Entry i is the span of row i of etc_intensity_modifiers (largest minus smallest).
+    16, 34, 58, 84, 120, 160, 212, 366
+};
+
+static const int etc_th_distances[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };  // Eight distances; presumably picked by the 3-bit table index of the T/H modes.
+
+static const uint8 etc_selector_scramble[] = { 3, 2, 0, 1 };    // Inverse permutation of the table below: unscramble[scramble[i]] == i.
+static const uint8 etc_selector_unscramble[] = { 2, 3, 1, 0 };
+
+
+static float midpoints4[16];  // midpoints4[i]: halfway between 4-bit levels i and i+1 once expanded to 8 bits, as a [0, 1] float.
+NV_AT_STARTUP(
+    for (int i = 0; i < 15; i++) {
+        float f0 = float(((i+0) << 4) | (i+0)) / 255.0f;  // 4 -> 8 bit expansion replicates the nibble (17 * i); the former `>> 4` term was always 0.
+        float f1 = float(((i+1) << 4) | (i+1)) / 255.0f;
+        midpoints4[i] = (f0 + f1) * 0.5f;
+    }
+    midpoints4[15] = 1.0f;  // No level above 15, so the last entry is pinned to the top of the range.
+);
+
+static const float midpoints5[32] = {  // 5-bit counterpart of midpoints4: entries match the midpoints of levels expanded as (i << 3) | (i >> 2), over 255; last entry is 1.
+    0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
+    0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f
+};
+
+//static const float midpoints6[64];
+//static const float midpoints7[128];
+
+
+
+// ETC2 Modes:
+// - ETC1:
+//  - two partitions (flip modes): 2*(4x2, 2x4)
+//  - two base colors stored as 444+444 or 555+333
+//  - two 3 bit intensity modifiers
+// - T Mode. 2 colors 444, 3 bit intensity modifiers, 2 bit indices.
+// - H Mode. 2 colors 444, 3 bit intensity modifiers, 2 bit indices.
+// - Planar mode: 3 colors 676
+
+struct ETC_Data {  // Unpacked description of one ETC block; pack_etc2_block converts it into a BlockETC.
+    enum Mode {
+        Mode_ETC1,
+        Mode_T,
+        Mode_H,
+        Mode_Planar,
+    } mode;  // Tells which member of the union below is meaningful.
+
+    // @@ It may make more sense to store bit-expanded or even float colors here.
+    union {
+        struct {
+            uint16 color0;      // 444 or 555
+            uint16 color1;      // 444 or 333
+            uint8 table0;       // 3 bits
+            uint8 table1;       // 3 bits
+            bool flip;          // partition mode
+            bool diff;          // color encoding
+        } etc;
+        struct {
+            uint16 color0;      // 444
+            uint16 color1;      // 444
+            uint8 table;        // 3 bits
+        } t, h;                 // T and H modes share one parameter struct type.
+        struct {
+            uint8 ro, go, bo;   // 676
+            uint8 rh, gh, bh;   // 676
+            uint8 rv, gv, bv;   // 676
+        } planar;
+    };
+
+    uint8 selector[16];         // 2 bit indices (32 bits)
+};
+
+struct ETC_Solution {  // A candidate ETC encoding together with its error.
+    float error = NV_FLOAT_MAX;  // Defaults to the largest float until a real error is stored.
+    ETC_Data data;
+};
+
+
+struct ETC_Options {  // Encoder switches taken by compress_etc (declared right after this struct); the initializers are the defaults.
+    //bool fast_flip_mode_selection = false;
+    bool use_rg_etc = true;         // NOTE(review): presumably only has an effect when HAVE_RGETC is set -- confirm.
+    bool enable_etc2 = true;
+    bool use_planar = true;
+    bool use_t_mode = true;
+    bool use_h_mode = true;
+    bool onebit_alpha = false;
+    Vector3 color_weights = Vector3(1);  // Unit weights by default.
+    
+    //int8 eac_search_radius = 1;  // [0-3]
+    //bool eac_11bit_mode = false;
+};
+
+/*static*/ float compress_etc(Vector4 input_colors[16], float input_weights[16], const ETC_Options & options, BlockETC * output);
+
+
+struct BlockEAC {  // 64-bit EAC block as bit-fields: 8 + 4 + 4 header bits followed by 6 x 8 = 48 selector bits.
+    uint base : 8;
+    uint table : 4;
+    uint multiplier : 4;
+    uint selectors0 : 8;
+    uint selectors1 : 8;
+    
+    uint selectors2 : 8;
+    uint selectors3 : 8;
+    uint selectors4 : 8;
+    uint selectors5 : 8;
+};
+NV_COMPILER_CHECK(sizeof(BlockEAC) == 64/8);
+
+struct BlockETC_EAC {  // 16-byte combined block: the EAC block comes first, the ETC color block second.
+    BlockEAC eac;
+    BlockETC etc;
+};
+NV_COMPILER_CHECK(sizeof(BlockETC_EAC) == 128/8);
+
+// EAC:
+// 8 bit base code word
+// 4 bit multiplier
+// 4 bit table index
+// 16 * 3 bit indices.
+
+struct EAC_Data {  // Unpacked EAC parameters, one byte per field; the comments give the width each field has in the packed block.
+    uint8 alpha;        // 8 bits
+    uint8 multiplier;   // 4 bits
+    uint8 table_index;  // 4 bits
+    uint8 selector[16]; // 3 bit indices
+};
+
+struct EAC_Solution {  // A candidate EAC encoding together with its error.
+    float error = NV_FLOAT_MAX;  // Defaults to the largest float until a real error is stored.
+    EAC_Data data;
+};
+
+struct EAC_Options {  // EAC encoder settings; the initializers are the defaults.
+    int search_radius = 1;      // 0 = fast, 1 = medium, 2 = slow
+    bool use_11bit_mode = false;
+};
+
+
+static const int eac_intensity_modifiers[16][8] = {  // 16 tables; each row lists four negative modifiers, then four non-negative ones. Presumably indexed [table_index][selector].
+    {-3, -6, -9, -15, 2, 5, 8, 14}, // 0
+    {-3, -7,-10, -13, 2, 6, 9, 12}, // 1
+    {-2, -5, -8, -13, 1, 4, 7, 12}, // 2
+    {-2, -4, -6, -13, 1, 3, 5, 12}, // 3
+    {-3, -6, -8, -12, 2, 5, 7, 11}, // 4
+    {-3, -7, -9, -11, 2, 6, 8, 10}, // 5
+    {-4, -7, -8, -11, 3, 6, 7, 10}, // 6
+    {-3, -5, -8, -11, 2, 4, 7, 10}, // 7
+    {-2, -6, -8, -10, 1, 5, 7, 9 }, // 8
+    {-2, -5, -8, -10, 1, 4, 7, 9 }, // 9
+    {-2, -4, -8, -10, 1, 3, 7, 9 }, // 10
+    {-2, -5, -7, -10, 1, 4, 6, 9 }, // 11
+    {-3, -4, -7, -10, 2, 3, 6, 9 }, // 12
+    {-1, -2, -3, -10, 0, 1, 2, 9 }, // 13
+    {-4, -6, -8,  -9, 3, 5, 7, 8 }, // 14
+    {-3, -5, -7,  -9, 2, 4, 6, 8 }  // 15
+};
+
+
+
+
+// Packs the unpacked encoding 'data' into the bit layout of a BlockETC.
+//
+// ETC1 blocks store their base colors directly (444+444 or 555+333). T, H and
+// planar blocks set the diff bit and force one channel of the overlapping
+// differential layout out of the [0, 31] range; the 'detect' bits are chosen
+// here so that unpack_etc2_block() recognizes the same mode again. Most shifts
+// below are left unmasked and rely on the destination bit field keeping only
+// its low bits.
+static void pack_etc2_block(const ETC_Data & data, BlockETC * output_block) {
+    BlockETC block;
+
+    // Set in H mode when the two base colors are exchanged to encode the table
+    // LSB; the selector MSBs are flipped further down to compensate.
+    bool swap_colors = false;
+
+    if (data.mode == ETC_Data::Mode_ETC1) {
+        // These are the same for individual and differential blocks.
+        block.individual.diff = data.etc.diff;
+        block.individual.flip = data.etc.flip;
+        block.individual.cw1 = data.etc.table0;
+        block.individual.cw2 = data.etc.table1;
+
+        if (data.etc.diff) {
+            // color0 is a 555 base color, color1 a 333 delta.
+            block.differential.red1 = data.etc.color0 >> 10;
+            block.differential.dred2 = data.etc.color1 >> 6;
+            block.differential.green1 = (data.etc.color0 >> 5) & 0x1F;
+            block.differential.dgreen2 = (data.etc.color1 >> 3) & 0x7;
+            block.differential.blue1 = data.etc.color0 & 0x1F;
+            block.differential.dblue2 = data.etc.color1 & 0x7;
+        }
+        else {
+            // Both colors are stored as 444.
+            block.individual.red1 = data.etc.color0 >> 8;
+            block.individual.red2 = data.etc.color1 >> 8;
+            block.individual.green1 = (data.etc.color0 >> 4) & 0xF;
+            block.individual.green2 = (data.etc.color1 >> 4) & 0xF;
+            block.individual.blue1 = data.etc.color0 & 0xF;
+            block.individual.blue2 = data.etc.color1 & 0xF;
+        }
+    }
+    else if (data.mode == ETC_Data::Mode_T) {
+        block.t.red1a = (data.t.color0 >> 8) >> 2;
+        block.t.red1b = (data.t.color0 >> 8);
+        block.t.green1 = (data.t.color0 >> 4);
+        block.t.blue1 = data.t.color0;
+
+        block.t.red2 = (data.t.color1 >> 8);
+        block.t.green2 = (data.t.color1 >> 4);
+        block.t.blue2 = data.t.color1;
+
+        block.t.da = data.t.table >> 1;
+        block.t.db = data.t.table;
+
+        block.t.diff = 1;
+
+        // Create an invalid R differential to trigger T mode. The detect bits
+        // are cleared before the differential view is read: 'block' may still
+        // hold indeterminate bits, and with them at zero the view yields
+        // red1 == red1a and dred2 == red1b (no sign adjustment needed).
+        block.t.detect1 = 0;
+        block.t.detect2 = 0;
+        int r = block.differential.red1 + block.differential.dred2;
+
+        if (r >= 4) {
+            // R = 28 + red1a, dR = +red1b: the sum exceeds 31.
+            block.t.detect1 = 7;
+            block.t.detect2 = 0;
+        }
+        else {
+            // R = red1a, dR = red1b - 4: the sum is negative.
+            block.t.detect1 = 0;
+            block.t.detect2 = 1;
+        }
+    }
+    else if (data.mode == ETC_Data::Mode_H) {
+        // The LSB of the table index is not stored; unpack_etc2_block() derives
+        // it from the order of the two colors (color0 >= color1 sets it).
+        // NOTE(review): equal colors with a clear LSB cannot be represented.
+        bool table_lsb = data.h.table & 1;
+        swap_colors = (data.h.color0 < data.h.color1) ^ !table_lsb;
+
+        uint16 color0 = data.h.color0;
+        uint16 color1 = data.h.color1;
+
+        if (swap_colors) {
+            swap(color0, color1);
+        }
+
+        // Pack the (possibly exchanged) colors, not the originals.
+        block.h.red1 = (color0 >> 8);
+        block.h.green1a = (color0 >> 4) >> 1;
+        block.h.green1b = (color0 >> 4);
+        block.h.blue1a = color0 >> 3;
+        block.h.blue1b = color0 >> 1;
+        block.h.blue1c = color0;
+
+        block.h.red2 = (color1 >> 8);
+        block.h.green2a = (color1 >> 4) >> 1;
+        block.h.green2b = (color1 >> 4);
+        block.h.blue2 = color1;     // Blue is the low nibble of the 444 color.
+
+        block.h.da = data.h.table >> 2;
+        block.h.db = data.h.table >> 1;
+
+        block.h.diff = 1;
+
+        // create a valid R differential and an invalid G differential to trigger H mode
+        block.h.detect1 = 0;
+        block.h.detect2 = 0;
+        block.h.detect3 = 0;
+
+        int dr = block.differential.dred2;
+        int dg = block.differential.dgreen2;
+        if (dr >= 4) dr -= 8;
+        if (dg >= 4) dg -= 8;
+        int r = block.differential.red1 + dr;
+        int g = block.differential.green1 + dg;
+
+        if (r < 0 || r > 31) {
+            block.h.detect1 = 1;
+        }
+        if (g >= 4) {
+            block.h.detect2 = 7;
+            block.h.detect3 = 0;
+        }
+        else {
+            block.h.detect2 = 0;
+            block.h.detect3 = 1;
+        }
+    }
+
+    if (data.mode == ETC_Data::Mode_Planar) {
+        // From ETCLib:
+        block.planar.originRed = data.planar.ro;
+        block.planar.originGreen1 = data.planar.go >> 6;
+        block.planar.originGreen2 = data.planar.go;
+        block.planar.originBlue1 = data.planar.bo >> 5;
+        block.planar.originBlue2 = data.planar.bo >> 3;
+        block.planar.originBlue3 = data.planar.bo >> 1;
+        block.planar.originBlue4 = data.planar.bo;
+
+        block.planar.horizRed1 = data.planar.rh >> 1;
+        block.planar.horizRed2 = data.planar.rh;
+        block.planar.horizGreen = data.planar.gh;
+        block.planar.horizBlue1 = data.planar.bh >> 5;
+        block.planar.horizBlue2 = data.planar.bh;
+
+        block.planar.vertRed1 = data.planar.rv >> 3;
+        block.planar.vertRed2 = data.planar.rv;
+        block.planar.vertGreen1 = data.planar.gv >> 2;
+        block.planar.vertGreen2 = data.planar.gv;
+        block.planar.vertBlue = data.planar.bv;
+
+        block.planar.diff = 1;
+
+        // create valid RG differentials and an invalid B differential to trigger planar mode
+        block.planar.detect1 = 0;
+        block.planar.detect2 = 0;
+        block.planar.detect3 = 0;
+        block.planar.detect4 = 0;
+
+        // @@ Clean this up.
+        int dr = block.differential.dred2;
+        int dg = block.differential.dgreen2;
+        int db = block.differential.dblue2;
+        if (dr >= 4) dr -= 8;
+        if (dg >= 4) dg -= 8;
+        if (db >= 4) db -= 8;
+        int r = block.differential.red1 + dr;
+        int g = block.differential.green1 + dg;
+        int b = block.differential.blue1 + db;
+
+        if (r < 0 || r > 31) {
+            block.planar.detect1 = 1;
+        }
+        if (g < 0 || g > 31) {
+            block.planar.detect2 = 1;
+        }
+        if (b >= 4) {
+            block.planar.detect3 = 7;
+            block.planar.detect4 = 0;
+        }
+        else {
+            block.planar.detect3 = 0;
+            block.planar.detect4 = 1;
+        }
+    }
+    else {
+        // Planar blocks carry no selectors; every other mode stores one 2-bit
+        // selector per texel, split into an MSB plane and an LSB plane.
+        block.individual.selectors = 0;
+        for (int i = 0; i < 16; i++) {
+            uint selector = data.selector[i];
+            if (data.mode == ETC_Data::Mode_ETC1) {
+                // Only ETC1 selectors are scrambled; unpack_etc2_block() reads
+                // T & H selectors as they are stored.
+                selector = etc_selector_scramble[selector];
+            }
+            block.individual.selectors |= (selector >> 1) << (i ^ 8);
+            block.individual.selectors |= (selector & 1) << ((16 + i) ^ 8);
+        }
+
+        if (swap_colors) {
+            // The MSB plane lives in the low 16 bits; flipping it retargets
+            // every selector to the other (exchanged) base color.
+            block.individual.selectors ^= 0x0000FFFF;
+        }
+    }
+
+    // @@ output_block is big endian, byte swap:
+    *output_block = block;
+}
+
+static void unpack_etc2_block(const BlockETC * input_block, ETC_Data * data) {  // Decodes the bit fields of *input_block into *data and determines the block mode.
+
+    // @@ input_block is big endian, byte swap first:
+    BlockETC block = *input_block;
+
+    // Assume ETC1 for now.
+    data->mode = ETC_Data::Mode_ETC1;
+
+    // These are the same for individual and differential blocks.
+    data->etc.diff = block.individual.diff != 0;
+    data->etc.flip = block.individual.flip != 0;
+    data->etc.table0 = block.individual.cw1;
+    data->etc.table1 = block.individual.cw2;
+
+    // Decode colors.
+    if (data->etc.diff) {   // Differential layout: 555 base color in color0, 333 delta in color1.
+        data->etc.color0 = U16((block.differential.red1 << 10) | (block.differential.green1 << 5) | block.differential.blue1);
+        data->etc.color1 = U16((block.differential.dred2 << 6) | (block.differential.dgreen2 << 3) | block.differential.dblue2);
+
+        // @@ Clean this up.
+        int dr = block.differential.dred2;
+        int dg = block.differential.dgreen2;
+        int db = block.differential.dblue2;
+        if (dr >= 4) dr -= 8;   // Sign-extend the 3-bit deltas to [-4, 3].
+        if (dg >= 4) dg -= 8;
+        if (db >= 4) db -= 8;
+        int r = block.differential.red1 + dr;
+        int g = block.differential.green1 + dg;
+        int b = block.differential.blue1 + db;
+
+        // Detect ETC2 modes (invalid combinations).
+        if (r < 0 || r > 31) {  // The first out-of-range channel decides the mode: R -> T, G -> H, B -> planar.
+            data->mode = ETC_Data::Mode_T;
+        }
+        else if (g < 0 || g > 31) {
+            data->mode = ETC_Data::Mode_H;
+        }
+        else if (b < 0 || b > 31) {
+            data->mode = ETC_Data::Mode_Planar;
+        }
+    }
+    else {  // Individual layout: two 444 colors.
+        data->etc.color0 = U16((block.individual.red1 << 8) | (block.individual.green1 << 4) | block.individual.blue1);
+        data->etc.color1 = U16((block.individual.red2 << 8) | (block.individual.green2 << 4) | block.individual.blue2);
+    }
+
+    if (data->mode == ETC_Data::Mode_T) {   // Reassemble both colors as 444 from the T-mode fields.
+        uint16 r0 = U16((block.t.red1a << 2) | block.t.red1b);
+        uint16 g0 = U16(block.t.green1);
+        uint16 b0 = U16(block.t.blue1);
+        data->t.color0 = U16(r0 << 8) | U16(g0 << 4) | b0;
+
+        uint16 r1 = U16(block.t.red2);
+        uint16 g1 = U16(block.t.green2);
+        uint16 b1 = U16(block.t.blue2);
+        data->t.color1 = U16(r1 << 8) | U16(g1 << 4) | b1;
+
+        data->t.table = U8((block.t.da << 1) | block.t.db);
+    }
+    else if (data->mode == ETC_Data::Mode_H) {  // Reassemble both colors as 444 from the H-mode fields.
+        uint16 r0 = U16(block.h.red1);
+        uint16 g0 = U16((block.h.green1a << 1) | block.h.green1b);
+        uint16 b0 = U16((block.h.blue1a << 3) | (block.h.blue1b << 1) | block.h.blue1c);
+        data->h.color0 = U16(r0 << 8) | U16(g0 << 4) | b0;
+
+        uint16 r1 = U16(block.h.red2);
+        uint16 g1 = U16((block.h.green2a << 1) | block.h.green2b);
+        uint16 b1 = U16(block.h.blue2);
+        data->h.color1 = U16(r1 << 8) | U16(g1 << 4) | b1;
+
+        data->h.table = U8((block.h.da << 2) | (block.h.db << 1));
+
+        if (data->h.color0 >= data->h.color1) {     // The table LSB is not stored; it is implied by the order of the two colors.
+            data->h.table++;
+        }
+    }
+
+    if (data->mode == ETC_Data::Mode_Planar) {  // Planar blocks have three colors (origin, horizontal, vertical) and no selectors.
+        data->planar.ro = U8(block.planar.originRed);
+        data->planar.go = U8((block.planar.originGreen1 << 6) + block.planar.originGreen2);
+        data->planar.bo = U8((block.planar.originBlue1 << 5) + (block.planar.originBlue2 << 3) + (block.planar.originBlue3 << 1) + block.planar.originBlue4);
+
+        data->planar.rh = U8((block.planar.horizRed1 << 1) + block.planar.horizRed2);
+        data->planar.gh = U8(block.planar.horizGreen);
+        data->planar.bh = U8((block.planar.horizBlue1 << 5) + block.planar.horizBlue2);
+
+        data->planar.rv = U8((block.planar.vertRed1 << 3) + block.planar.vertRed2);
+        data->planar.gv = U8((block.planar.vertGreen1 << 2) + block.planar.vertGreen2);
+        data->planar.bv = U8(block.planar.vertBlue);
+    }
+    else {
+        // Note, selectors are arranged in columns, keep that order.
+        unsigned char * selectors = (uint8 *)&block.individual.selectors;   // NOTE(review): the byte indexing below depends on host byte order -- confirm.
+        for (int i = 0; i < 16; i++) {
+            int byte_msb = (1 - (i / 8));   // Bytes 1 and 0 hold the selector MSBs of texels 0-7 and 8-15.
+            int byte_lsb = (3 - (i / 8));   // Bytes 3 and 2 hold the selector LSBs of texels 0-7 and 8-15.
+            int shift = (i & 7);
+
+            uint msb = (selectors[byte_msb] >> shift) & 1;
+            uint lsb = (selectors[byte_lsb] >> shift) & 1;
+
+            uint index = (msb << 1) | lsb;
+
+            if (data->mode == ETC_Data::Mode_ETC1) {
+                data->selector[i] = etc_selector_unscramble[index];
+            }
+            else {
+                // No scrambling in T & H modes.
+                data->selector[i] = index;
+            }
+        }
+    }
+}
+
+// Writes the unpacked EAC parameters in 'data' into the bit fields of *output_block.
+static void pack_eac_block(const EAC_Data & data, BlockEAC * output_block) {
+
+    // Concatenate the 16 selectors, three bits apart, with the first selector
+    // in the most significant position (bit 45 upwards).
+    uint64 packed = 0;
+    for (uint texel = 0; texel < 16; texel++) {
+        packed = (packed << 3) | uint64(data.selector[texel]);
+    }
+
+    output_block->base = data.alpha;
+    output_block->table = data.table_index;
+    output_block->multiplier = data.multiplier;
+
+    // Split the selector word into bytes, most significant byte first.
+    output_block->selectors0 = packed >> 40;
+    output_block->selectors1 = packed >> 32;
+    output_block->selectors2 = packed >> 24;
+    output_block->selectors3 = packed >> 16;
+    output_block->selectors4 = packed >> 8;
+    output_block->selectors5 = packed >> 0;
+}
+
+// Reads the bit fields of *input_block into the unpacked EAC parameters in *data.
+static void unpack_eac_block(const BlockEAC * input_block, EAC_Data * data) {
+
+    data->alpha = input_block->base;
+    data->table_index = input_block->table;
+    data->multiplier = input_block->multiplier;
+
+    // Rebuild the 48-bit selector word, most significant byte first.
+    uint64 remaining =
+        (uint64(input_block->selectors0) << 40) |
+        (uint64(input_block->selectors1) << 32) |
+        (uint64(input_block->selectors2) << 24) |
+        (uint64(input_block->selectors3) << 16) |
+        (uint64(input_block->selectors4) << 8) |
+        (uint64(input_block->selectors5) << 0);
+
+    // The last selector sits in the lowest three bits, so peel them off back to front.
+    for (int texel = 15; texel >= 0; texel--) {
+        data->selector[texel] = remaining & 0x7;
+        remaining >>= 3;
+    }
+}
+
+
+
+// Expands an 'nin'-bit value to 'nout' bits: the value is shifted to the top
+// and its most significant bits are repeated in the gap below.
+// Requires nin < nout <= 2 * nin, i.e. the gap must not be wider than the
+// input. 'bits' is not masked, so it must fit in 'nin' bits.
+inline int bitexpand(uint32 bits, uint nin, uint nout) {
+    assert(nout > nin);
+    assert(nout <= 2U * nin);   // Otherwise the unsigned right-shift count below wraps around.
+    return (bits << uint(nout - nin)) | (bits >> uint(2U * nin - nout));
+}
+
+// Integer unpacking for the decoder: splits a packed 4:4:4 color into 8-bit channels.
+static void unpack_color_444(uint32 packed_color, int * r, int * g, int * b) {
+    const int red4   = (packed_color >> 8) & 0xF;
+    const int green4 = (packed_color >> 4) & 0xF;
+    const int blue4  = packed_color & 0xF;
+
+    // Repeating the nibble expands 4 bits to 8; n * 17 equals (n << 4) | n.
+    *r = red4 * 17;
+    *g = green4 * 17;
+    *b = blue4 * 17;
+}
+
+// Float variant: unpacks a 4:4:4 color and scales the 8-bit channels by 1/255.
+static Vector3 unpack_color_444(uint32 packed_color) {
+    int red, green, blue;
+    unpack_color_444(packed_color, &red, &green, &blue);
+
+    const Vector3 color8(float(red), float(green), float(blue));
+    return color8 * 1.0f / 255.0f;
+}
+
+// Splits a packed 5:5:5 color into 8-bit channels.
+static void unpack_color_555(uint32 packed_color, int * r, int * g, int * b) {
+    const int red5   = (packed_color >> 10) & 0x1F;
+    const int green5 = (packed_color >> 5) & 0x1F;
+    const int blue5  = packed_color & 0x1F;
+
+    // 5 -> 8 bit expansion: the top three bits are repeated below the value.
+    // The two terms never share a bit, so adding equals OR-ing them.
+    *r = (red5 << 3) + (red5 >> 2);
+    *g = (green5 << 3) + (green5 >> 2);
+    *b = (blue5 << 3) + (blue5 >> 2);
+}
+
+// Float variant: unpacks a 5:5:5 color and scales the 8-bit channels by 1/255.
+static Vector3 unpack_color_555(uint32 packed_color) {
+    int red, green, blue;
+    unpack_color_555(packed_color, &red, &green, &blue);
+
+    const Vector3 color8(float(red), float(green), float(blue));
+    return color8 * 1.0f / 255.0f;
+}
+
+// Splits a packed 3:3:3 delta into signed components in [-4, 3]; no bit expansion is applied.
+static void unpack_delta_333(uint32 packed_delta, int * r, int * g, int * b) {
+    const int raw_r = (packed_delta >> 6) & 7;
+    const int raw_g = (packed_delta >> 3) & 7;
+    const int raw_b = packed_delta & 7;
+
+    // Values 4..7 are the two's-complement encodings of -4..-1.
+    *r = (raw_r >= 4) ? raw_r - 8 : raw_r;
+    *g = (raw_g >= 4) ? raw_g - 8 : raw_g;
+    *b = (raw_b >= 4) ? raw_b - 8 : raw_b;
+}
+
+// Adds a signed 3:3:3 delta to a 5:5:5 base color and expands the result to
+// 8-bit channels. Returns false when any 5-bit sum leaves [0, 31]; in that case
+// the sums are clamped before expansion.
+static bool unpack_color_555(uint32 packed_color, uint32 packed_delta, int * r, int * g, int * b) {
+    int delta_r, delta_g, delta_b;
+    unpack_delta_333(packed_delta, &delta_r, &delta_g, &delta_b);
+
+    int red5   = int((packed_color >> 10U) & 0x1F) + delta_r;
+    int green5 = int((packed_color >> 5U) & 0x1F) + delta_g;
+    int blue5  = int(packed_color & 0x1F) + delta_b;
+
+    const bool in_range = (red5 >= 0 && red5 <= 31) &&
+                          (green5 >= 0 && green5 <= 31) &&
+                          (blue5 >= 0 && blue5 <= 31);
+    if (!in_range) {
+        red5   = clamp(red5, 0, 31);
+        green5 = clamp(green5, 0, 31);
+        blue5  = clamp(blue5, 0, 31);
+    }
+
+    // 5 -> 8 bit expansion.
+    *r = (red5 << 3) | (red5 >> 2);
+    *g = (green5 << 3) | (green5 >> 2);
+    *b = (blue5 << 3) | (blue5 >> 2);
+
+    return in_range;
+}
+
+// Float variant of the base + delta unpacking; asserts that the sum stayed in range.
+static Vector3 unpack_color_555(uint32 packed_color, uint32 packed_delta) {
+    int red, green, blue;
+    bool success = unpack_color_555(packed_color, packed_delta, &red, &green, &blue);
+    assert(success);
+
+    const Vector3 color8(float(red), float(green), float(blue));
+    return color8 * 1.0f / 255.0f;
+}
+
+
+// Splits a packed 6:7:6 color into 8-bit channels using bitexpand().
+static void unpack_color_676(uint32 packed_color, int * r, int * g, int * b) {
+    const int blue6  = packed_color & 0x3F;
+    const int green7 = (packed_color >> 6) & 0x7F;
+    const int red6   = (packed_color >> 13) & 0x3F;
+
+    *r = bitexpand(red6, 6, 8);     // red << 2 | red >> 4
+    *g = bitexpand(green7, 7, 8);   // green << 1 | green >> 6
+    *b = bitexpand(blue6, 6, 8);    // blue << 2 | blue >> 4
+}
+
+
+// Quantizes a color to packed 4:4:4, one nibble per channel.
+static uint32 pack_color_444(Vector3 color) {
+
+    // Truncate to 4 bits, then step up by one when the input lies above the
+    // midpoints4 entry for the truncated code (rounding that follows the 444
+    // bit-expansion).
+    auto quantize = [](float value) -> uint {
+        uint code = U32(ftoi_trunc(clamp(value * 15.0f, 0.0f, 15.0f)));
+        if (value > midpoints4[code]) {
+            code++;
+        }
+        return code;
+    };
+
+    const uint red = quantize(color.x);
+    const uint green = quantize(color.y);
+    const uint blue = quantize(color.z);
+
+    return (red << 8) | (green << 4) | blue;
+}
+
+// Quantizes a color to packed 5:5:5, five bits per channel.
+static uint32 pack_color_555(Vector3 color) {
+
+    // Truncate to 5 bits, then step up by one when the input lies above the
+    // midpoints5 entry for the truncated code (rounding that follows the 555
+    // bit-expansion).
+    auto quantize = [](float value) -> uint {
+        uint code = U32(ftoi_trunc(clamp(value * 31.0f, 0.0f, 31.0f)));
+        if (value > midpoints5[code]) {
+            code++;
+        }
+        return code;
+    };
+
+    const uint red = quantize(color.x);
+    const uint green = quantize(color.y);
+    const uint blue = quantize(color.z);
+
+    return (red << 10) | (green << 5) | blue;
+}
+
+// Quantizes a color difference to a packed 3:3:3 delta of signed 3-bit components.
+static uint32 pack_delta_333(Vector3 delta) {
+    // @@ Accurate rounding of signed 3-bit components.
+
+    // Scale to 5-bit units, round, limit to [-4, 3] and store negatives in
+    // two's-complement form (4..7).
+    auto encode = [](float component) -> int {
+        int code = ftoi_round(clamp(component * 31.0f, -4.0f, 3.0f));
+        return (code < 0) ? code + 8 : code;
+    };
+
+    const int red = encode(delta.x);
+    const int green = encode(delta.y);
+    const int blue = encode(delta.z);
+
+    return static_cast<uint16>((red << 6) | (green << 3) | blue);
+}
+
+// Quantizes f (scaled by 63, so expected in [0, 1]) to a 6-bit code. The
+// truncated code is stepped up by one when f lies above the midpoint between
+// the 8-bit expansions of that code and the next one.
+static uint8 pack_float_6(float f) {
+
+    // Truncate.
+    uint u = U32(ftoi_trunc(clamp(f * 63.0f, 0.0f, 63.0f)));
+
+    // Round exactly according to 6 bit-expansion.
+    //u += (f > midpoints6[u]);
+
+    // bitexpand() yields values on a 0..255 scale while f is normalized, so the
+    // midpoint is divided by 255 before the comparison. The top code has no
+    // successor and must never be stepped past 63.
+    if (u < 63U) {
+        float midpoint = 0.5f * (bitexpand(u, 6, 8) + bitexpand(u + 1, 6, 8)) / 255.0f;    // @@ Precompute.
+        u += (f > midpoint);
+    }
+
+    return U8(u);
+}
+
+// Quantizes f (scaled by 127, so expected in [0, 1]) to a 7-bit code. The
+// truncated code is stepped up by one when f lies above the midpoint between
+// the 8-bit expansions of that code and the next one.
+static uint8 pack_float_7(float f) {
+
+    // Truncate.
+    uint u = U32(ftoi_trunc(clamp(f * 127.0f, 0.0f, 127.0f)));
+
+    // Round exactly according to 7 bit-expansion.
+    //u += (f > midpoints7[u]);
+
+    // bitexpand() yields values on a 0..255 scale while f is normalized, so the
+    // midpoint is divided by 255 before the comparison. The top code has no
+    // successor and must never be stepped past 127.
+    if (u < 127U) {
+        float midpoint = 0.5f * (bitexpand(u, 7, 8) + bitexpand(u + 1, 7, 8)) / 255.0f;   // @@ Precompute.
+        u += (f > midpoint);
+    }
+
+    return U8(u);
+}
+
+// Quantizes f to 6 bits with an explicit rounding direction: round_dir adds
+// one code step before truncation (i.e. rounds up), otherwise f is truncated.
+static uint8 pack_float_6(float f, bool round_dir) {
+    const float scaled = f * 63.0f + round_dir;
+    return U8(U32(ftoi_trunc(clamp(scaled, 0.0f, 63.0f))));
+}
+
+// Quantizes f to 7 bits with an explicit rounding direction: round_dir adds
+// one code step before truncation (i.e. rounds up), otherwise f is truncated.
+static uint8 pack_float_7(float f, bool round_dir) {
+    const float scaled = f * 127.0f + round_dir;
+    return U8(U32(ftoi_trunc(clamp(scaled, 0.0f, 127.0f))));
+}
+
+
+
+
+// Returns the weighted average RGB of one half of a row-major 4x4 block.
+//   flip      - true: halves are rows 0-1 / rows 2-3; false: columns 0-1 / columns 2-3.
+//   partition - zero selects the first half, non-zero the second.
+// When every weight in the half is zero the result is the zero vector.
+Vector3 get_partition_color_average(const Vector4 input_colors[16], const float input_weights[16], bool flip, int partition) {
+    Vector3 sum_c(0);
+    float sum_w = 0;
+
+    if (flip) {
+        // Horizontal partition: eight consecutive texels.
+        int offset = partition ? 8 : 0;
+
+        for (int i = 0; i < 8; i++) {
+            sum_c += input_colors[i+offset].xyz() * input_weights[i+offset];
+            sum_w += input_weights[i+offset];
+        }
+    }
+    else {
+        // Vertical partition: two adjacent columns of every row, i.e. texels
+        // 4*y + x. This matches the indexing used by select_table_index().
+        int first_column = partition ? 2 : 0;
+
+        for (int y = 0; y < 4; y++) {
+            for (int x = first_column; x < first_column + 2; x++) {
+                int idx = 4 * y + x;
+                sum_c += input_colors[idx].xyz() * input_weights[idx];
+                sum_w += input_weights[idx];
+            }
+        }
+    }
+    if (sum_w == 0) {
+        // Avoid dividing by zero; sum_c is zero as well in this case.
+        sum_w = 1;
+    }
+
+    return sum_c * 1.0f / sum_w;
+}
+
+// Approximates the color of a partition by the plain mean of its eight colors.
+Vector3 base_color_average(const Vector3 colors[8]) {
+    Vector3 total(0);
+
+    for (const Vector3 * color = colors; color != colors + 8; ++color) {
+        total += *color;
+    }
+
+    return total * 1.0f / 8.0f;
+}
+// Weighted variant: returns sum(colors[i] * weights[i]) / sum(weights[i]).
+// When all weights are zero the result is the zero vector, the same
+// convention as get_partition_color_average().
+Vector3 base_color_average(const Vector3 colors[8], const float weights[8]) {
+    Vector3 sum_c(0);
+    float sum_w = 0;
+
+    for (uint i = 0; i < 8; i++) {
+        sum_c += colors[i] * weights[i];
+        sum_w += weights[i];
+    }
+
+    if (sum_w == 0) {
+        // Avoid dividing by zero; sum_c is zero as well in this case.
+        sum_w = 1;
+    }
+
+    return sum_c * 1.0f / sum_w;
+}
+
+#if 0
+// Compute base color using least squares.
+Vector3 base_color_least_squares(const Vector3 colors[8], int table_index, int indices[8]) {    // Disabled (inside #if 0). Returns dot(C, I) / dot(I, I) over the eight colors.
+
+    // Compute dot(C, I) and dot(I, I)
+    Vector3 CI(0);
+    float II = 0;
+
+    for (int i = 0; i < 8; i++) {
+        Vector3 C = colors[i];
+        float I = etc_intensity_modifiers[table_index][indices[i]];     // Intensity modifier selected for this texel.
+        CI += C * I;
+        II += I * I;
+    }
+
+    return CI / II;
+}
+
+// @@ Do weighted least squares!
+Vector3 base_color_least_squares(const Vector3 colors[8], const float weights[8], int table_index, int indices[8]) {    // Disabled (inside #if 0). Variant that takes per-texel weights.
+
+    // Compute dot(C, I) and dot(I, I)
+    Vector3 CI(0);
+    float II = 0;
+
+    for (int i = 0; i < 8; i++) {
+        Vector3 C = colors[i];
+        float w = weights[i];
+        float I = etc_intensity_modifiers[table_index][indices[i]];
+        CI += C * I * w;
+        II += I * I;    // NOTE(review): the weight is applied to CI but not to II.
+    }
+
+    return CI / II;
+}
+
+// Is this any faster than the above?
+Vector3 base_color_least_squares(const Vector3 colors[8], int table_index, int c0, int c1, int c2) {    // Disabled (inside #if 0). c0/c1/c2 are the sizes of the first three selector runs; the rest of the 8 colors use modifier 3.
+
+    // Compute dot(C, I) and dot(I, I)
+    Vector3 CI(0);
+
+    float I0 = etc_intensity_modifiers[table_index][0];
+    float I1 = etc_intensity_modifiers[table_index][1];
+    float I2 = etc_intensity_modifiers[table_index][2];
+    float I3 = etc_intensity_modifiers[table_index][3];
+
+    float II = 0;
+    II += c0 * I0 * I0;
+    II += c1 * I1 * I1;
+    II += c2 * I2 * I2;
+    II += (8-c0-c1-c2) * I3 * I3;
+
+    int i = 0;  // Shared by the four loops below so that each run continues where the previous one stopped.
+    for (; i < c0; i++)         CI += colors[i] * I0;
+    for (; i < c0+c1; i++)      CI += colors[i] * I1;
+    for (; i < c0+c1+c2; i++)   CI += colors[i] * I2;
+    for (; i < 8; i++)          CI += colors[i] * I3;
+
+    return CI / II;
+}
+
+static void selectors_for_clusters(int c0, int c1, int c2, int selector[8]) {   // Disabled (inside #if 0). Fills selector[] with c0 zeros, c1 ones, c2 twos and threes for the remainder.
+    int i = 0;  // Shared by the four loops below.
+    for (; i < c0; i++)         selector[i] = 0;
+    for (; i < c0+c1; i++)      selector[i] = 1;
+    for (; i < c0+c1+c2; i++)   selector[i] = 2;
+    for (; i < 8; i++)          selector[i] = 3;
+}
+
+static int cluster_count(int count = 8) {   // Disabled (inside #if 0). Counts the (c0, c1, c2) triples with c0 + c1 + c2 <= count.
+    int total = 0;
+
+    for (uint c0 = 0; c0 <= count; c0++) {
+        for (uint c1 = 0; c1 <= count-c0; c1++) {
+            for (uint c2 = 0; c2 <= count-c0-c1; c2++) {
+                total++;
+            }
+        }
+    }
+
+    // total is the number of possible cluster combinations.
+    return total;
+}
+
+// Does each partition have its own table index? Or is it shared for both?
+
+
+void test_all_total_orders(const Vector4 colors[8], const float weights[8], int table_index) {  // Disabled (inside #if 0). Unimplemented stub: the body holds only notes and commented-out code.
+
+    // @@ compute average luminance of each partition.
+
+
+    // @@ sort colors by the luminance differences respect to partition average.
+
+    // @@ compute luminance range, pick table index based on that. Try nearest indices also?
+
+    // For each cluster combination:
+/*
+    for (uint c0 = 0; c0 <= count; c0++) {
+        for (uint c1 = 0; c1 <= count-c0; c1++) {
+            for (uint c2 = 0; c2 <= count-c0-c1; c2++) {
+
+                // compute selectors.
+                int selector[8];
+                selectors_for_clusters(c0, c1, c2, selector);
+
+                // compute base colors that minimize error in each partition.
+
+                // determine error for these quantized base colors. Record best cluster combination.
+
+            }
+        }
+    }
+*/
+}
+
+void test_all_total_orders(const Vector4 input_colors[16], const float input_weights[16], uint count, bool flip, int table_index) {     // Disabled (inside #if 0). Unimplemented stub: both calls below are commented out.
+
+    // Slow method is to test both flip modes.
+    //test_all_total_orders(input_colors, input_weights, /*flip=*/false, int table_index);
+    //test_all_total_orders(input_colors, input_weights, /*flip=*/true, int table_index);
+}
+
+
+// @@ How do we compute the error for a given base color?
+
+// Compute indices using range fitting / quantization of input colors?
+
+// Compute indices using range fitting.
+
+void test_all_clusters() {  // Disabled (inside #if 0). Skeleton of the cluster enumeration; the innermost loop is empty.
+    int count = 8; // Could be smaller.
+
+    for (uint c0 = 0; c0 <= count; c0++) {
+        Vector3 x1(0.0f);   // Declared but never updated in this skeleton.
+        float w1 = 0.0f;
+
+        for (uint c1 = 0; c1 <= count-c0; c1++) {
+            Vector3 x2(0.0f);   // Declared but never updated in this skeleton.
+            float w2 = 0.0f;
+
+            for (uint c2 = 0; c2 <= count-c0-c1; c2++) {
+            }
+        }
+    }
+
+}
+
+#endif
+
+
+
+
+
+
+
+
+// Builds an opaque Color32 from integer channels, limiting each to [0, 255].
+static Color32 saturate_color(int R, int G, int B) {
+    Color32 result;
+    result.a = 255;
+    result.b = U8(clamp(B, 0, 255));
+    result.g = U8(clamp(G, 0, 255));
+    result.r = U8(clamp(R, 0, 255));
+    return result;
+}
+
+// Builds the four-entry palette of a sub-block whose base color is stored as
+// 5:5:5: the expanded base color offset by each modifier of table 'table_idx'.
+static void get_diff_subblock_palette(uint16 packed_color, uint table_idx, Color32 palette[4]) {
+    assert(table_idx < 8);
+
+    int base_r, base_g, base_b;
+    unpack_color_555(packed_color, &base_r, &base_g, &base_b);
+
+    const int * modifiers = etc_intensity_modifiers[table_idx];
+    for (int entry = 0; entry < 4; entry++) {
+        const int offset = modifiers[entry];
+        palette[entry] = saturate_color(base_r + offset, base_g + offset, base_b + offset);
+    }
+}
+
+// Builds the four-entry palette of a sub-block whose base color is the 5:5:5
+// color plus a 3:3:3 delta. Returns false when that sum was out of range and
+// had to be clamped; the palette is filled in either way.
+static bool get_diff_subblock_palette(uint16 packed_color, uint16 packed_delta, uint table_idx, Color32 palette[4]) {
+    assert(table_idx < 8);
+
+    int base_r, base_g, base_b;
+    const bool in_range = unpack_color_555(packed_color, packed_delta, &base_r, &base_g, &base_b);
+
+    const int * modifiers = etc_intensity_modifiers[table_idx];
+    for (int entry = 0; entry < 4; entry++) {
+        const int offset = modifiers[entry];
+        palette[entry] = saturate_color(base_r + offset, base_g + offset, base_b + offset);
+    }
+
+    return in_range;
+}
+
+// Builds the four-entry palette of a sub-block whose base color is stored as
+// 4:4:4: the expanded base color offset by each modifier of table 'table_idx'.
+static void get_abs_subblock_palette(uint16 packed_color, uint table_idx, Color32 palette[4]) {
+    assert(table_idx < 8);
+
+    int base_r, base_g, base_b;
+    unpack_color_444(packed_color, &base_r, &base_g, &base_b);
+
+    const int * modifiers = etc_intensity_modifiers[table_idx];
+    for (int entry = 0; entry < 4; entry++) {
+        const int offset = modifiers[entry];
+        palette[entry] = saturate_color(base_r + offset, base_g + offset, base_b + offset);
+    }
+}
+
+// Returns the selector of texel (x, y); selectors are stored column by column.
+static int get_selector(const ETC_Data & data, int x, int y) {
+    const int column_major_index = 4 * x + y;
+    return data.selector[column_major_index];
+}
+// Returns which ETC1 sub-block (0 or 1) texel (x, y) belongs to: split by row
+// when the flip bit is set, by column otherwise.
+static int get_partition(const ETC_Data & data, int x, int y) {
+    assert(data.mode == ETC_Data::Mode_ETC1);
+    if (data.etc.flip) {
+        return y > 1;
+    }
+    return x > 1;
+}
+
+// Decodes an ETC1-mode block into 16 row-major colors.
+static void decode_etc1(const ETC_Data & data, Vector4 colors[16]) {
+    assert(data.mode == ETC_Data::Mode_ETC1);
+
+    // One four-entry palette per sub-block.
+    Color32 palette[2][4];
+
+    if (!data.etc.diff) {
+        // Individual mode: two independent 444 base colors.
+        get_abs_subblock_palette(data.etc.color0, data.etc.table0, palette[0]);
+        get_abs_subblock_palette(data.etc.color1, data.etc.table1, palette[1]);
+    }
+    else {
+        // Differential mode: 555 base color, and the same base plus a 333 delta.
+        get_diff_subblock_palette(data.etc.color0, data.etc.table0, palette[0]);
+        get_diff_subblock_palette(data.etc.color0, data.etc.color1, data.etc.table1, palette[1]);
+    }
+
+    for (int texel = 0; texel < 16; texel++) {
+        const int x = texel % 4;
+        const int y = texel / 4;
+        const int partition = get_partition(data, x, y);
+        const int selector = get_selector(data, x, y);
+        colors[texel] = toVector4(palette[partition][selector]);
+    }
+}
+
+// Decodes a T-mode block into 16 row-major colors. The palette is color0 as
+// is, plus color1 offset by +d, 0 and -d, where d comes from etc_th_distances.
+static void decode_etc2_t(const ETC_Data & data, Vector4 output_colors[16]) {
+    assert(data.mode == ETC_Data::Mode_T);
+
+    const int distance = etc_th_distances[data.t.table];
+
+    int r0, g0, b0;
+    unpack_color_444(data.t.color0, &r0, &g0, &b0);
+
+    int r1, g1, b1;
+    unpack_color_444(data.t.color1, &r1, &g1, &b1);
+
+    Color32 palette[4];
+    palette[0] = saturate_color(r0, g0, b0);
+    palette[1] = saturate_color(r1 + distance, g1 + distance, b1 + distance);
+    palette[2] = saturate_color(r1, g1, b1);
+    palette[3] = saturate_color(r1 - distance, g1 - distance, b1 - distance);
+
+    for (int texel = 0; texel < 16; texel++) {
+        const int x = texel % 4;
+        const int y = texel / 4;
+        output_colors[texel] = toVector4(palette[get_selector(data, x, y)]);
+    }
+}
+
+// Decodes an H-mode block into 16 row-major colors. The palette is color0
+// offset by +d and -d followed by color1 offset by +d and -d, where d comes
+// from etc_th_distances.
+static void decode_etc2_h(const ETC_Data & data, Vector4 output_colors[16]) {
+    assert(data.mode == ETC_Data::Mode_H);
+
+    int r, g, b;
+    Color32 palette[4];
+
+    // Read the H-mode fields (the ones pack_etc2_block() and
+    // unpack_etc2_block() use for this mode), not the T-mode ones.
+    int d = etc_th_distances[data.h.table];
+
+    unpack_color_444(data.h.color0, &r, &g, &b);
+    palette[0] = saturate_color(r + d, g + d, b + d);
+    palette[1] = saturate_color(r - d, g - d, b - d);
+
+    unpack_color_444(data.h.color1, &r, &g, &b);
+    palette[2] = saturate_color(r + d, g + d, b + d);
+    palette[3] = saturate_color(r - d, g - d, b - d);
+
+    for (int y = 0; y < 4; y++) {
+        for (int x = 0; x < 4; x++) {
+            output_colors[y*4+x] = toVector4(palette[get_selector(data, x, y)]);
+        }
+    }
+}
+
+// Decodes a planar-mode block into 16 row-major colors by interpolating
+// between the origin, horizontal and vertical colors. Alpha is set to 1.
+static void decode_etc2_planar(const ETC_Data & data, Vector4 output_colors[16]) {
+    assert(data.mode == ETC_Data::Mode_Planar);
+
+    // Unpack the three 6:7:6 colors to 8 bits per channel (index 0 = R, 1 = G, 2 = B).
+    const int origin[3] = {
+        bitexpand(data.planar.ro, 6, 8),    // r << 2 | r >> 4
+        bitexpand(data.planar.go, 7, 8),    // g << 1 | g >> 6
+        bitexpand(data.planar.bo, 6, 8)
+    };
+    const int horizontal[3] = {
+        bitexpand(data.planar.rh, 6, 8),
+        bitexpand(data.planar.gh, 7, 8),
+        bitexpand(data.planar.bh, 6, 8)
+    };
+    const int vertical[3] = {
+        bitexpand(data.planar.rv, 6, 8),
+        bitexpand(data.planar.gv, 7, 8),
+        bitexpand(data.planar.bv, 6, 8)
+    };
+
+    for (int y = 0; y < 4; y++) {
+        for (int x = 0; x < 4; x++) {
+            float channel[3];
+            for (int c = 0; c < 3; c++) {
+                // Plane evaluated in quarter steps, with +2 for rounding before the shift.
+                const int value = (4 * origin[c] + x * (horizontal[c] - origin[c]) + y * (vertical[c] - origin[c]) + 2) >> 2;
+                channel[c] = saturate(float(value) / 255.0f);
+            }
+
+            Vector4 & texel = output_colors[4 * y + x];
+            texel.x = channel[0];
+            texel.y = channel[1];
+            texel.z = channel[2];
+            texel.w = 1;
+        }
+    }
+}
+
+// Decodes 'data' with the decoder that matches its mode; any mode other than
+// ETC1, T or H is treated as planar.
+static void decode_etc2(const ETC_Data & data, Vector4 colors[16]) {
+
+    switch (data.mode) {
+        case ETC_Data::Mode_ETC1:
+            decode_etc1(data, colors);
+            break;
+        case ETC_Data::Mode_T:
+            decode_etc2_t(data, colors);
+            break;
+        case ETC_Data::Mode_H:
+            decode_etc2_h(data, colors);
+            break;
+        default: /* ETC_Data::Mode_Planar */
+            decode_etc2_planar(data, colors);
+            break;
+    }
+}
+
+
+// Decodes one EAC value in 11-bit mode and returns it normalized to [0, 1].
+static float get_alpha11(int base, int table, int mul, int index) {
+    const int modifier = eac_intensity_modifiers[table][index];
+
+    // The modifier is scaled by 8 * mul; a zero multiplier leaves it unscaled.
+    const int scaled_modifier = (mul != 0) ? modifier * 8 * mul : modifier;
+
+    // 11-bit value: base * 8 + 4 plus the scaled modifier, limited to [0, 2047].
+    int value11 = base * 8 + 4 + scaled_modifier;
+    if (value11 > 2047) {
+        value11 = 2047;
+    }
+    else if (value11 < 0) {
+        value11 = 0;
+    }
+
+    // Extend to 16 bits by repeating the top bits, then normalize.
+    const uint16 value16 = (value11 << 5) + (value11 >> 6);
+    return float(value16) / 65535.0f;
+}
+
+// Decodes one EAC value in 8-bit mode: base plus modifier * mul, limited to
+// [0, 255] and normalized to [0, 1].
+static float get_alpha8(int base, int table, int mul, int index) {
+    const int modifier = eac_intensity_modifiers[table][index];
+    const int value8 = clamp(base + modifier * mul, 0, 255);
+    return value8 / 255.0f;
+}
+
+
+
+
+// Decodes an EAC block in 8-bit mode into one channel (alpha by default) of
+// 16 row-major colors. Selectors are stored column by column.
+static void decode_eac_8(const EAC_Data & data, Vector4 output_colors[16], int output_channel = 3) {
+    for (int y = 0; y < 4; y++) {
+        for (int x = 0; x < 4; x++) {
+            const int selector = data.selector[4 * x + y];
+            const float value = get_alpha8(data.alpha, data.table_index, data.multiplier, selector);
+            output_colors[4 * y + x].component[output_channel] = value;
+        }
+    }
+}
+
+// Decodes an EAC block in 11-bit mode into one channel (the first by default)
+// of 16 row-major colors. Selectors are stored column by column.
+static void decode_eac_11(const EAC_Data & data, Vector4 output_colors[16], int output_channel = 0) {
+    for (int y = 0; y < 4; y++) {
+        for (int x = 0; x < 4; x++) {
+            const int selector = data.selector[4 * x + y];
+            const float value = get_alpha11(data.alpha, data.table_index, data.multiplier, selector);
+            output_colors[4 * y + x].component[output_channel] = value;
+        }
+    }
+}
+
+
+
+
+// Squared length of the difference between p and c after scaling each channel by w.
+static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
+    const Vector3 weighted_diff = (p - c) * w;
+    return dot(weighted_diff, weighted_diff);
+}
+
+// Decodes 'data' and returns the sum over all 16 texels of the per-texel
+// weight times the channel-weighted squared RGB error against the input.
+static float evaluate_rgb_mse(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, const ETC_Data & data) {
+    Vector4 decoded[16];
+    decode_etc2(data, decoded);
+
+    float total = 0;
+    for (int texel = 0; texel < 16; texel++) {
+        const float texel_error = evaluate_mse(input_colors[texel].xyz(), decoded[texel].xyz(), options.color_weights);
+        total += input_weights[texel] * texel_error;
+    }
+    return total;
+}
+
+
+// Picks the intensity table whose range (etc_intensity_range[i]) is closest to the largest absolute
+// luminance difference, scaled by 255, between base_color and the texels of one half of the block.
+// partition selects columns 0-1 (zero) or 2-3 (non-zero); flip transposes the texel addressing.
+// input_weights is currently unused. Returns 0-7, or -1 if no table compares below NV_FLOAT_MAX.
+static int select_table_index(const Vector3 & base_color, const Vector4 input_colors[16], const float input_weights[16], bool flip, int partition) {
+    const Vector3 luma_weights(1.0f/3);
+    const float base_luma = dot(base_color, luma_weights);
+
+    // The half covers columns [first, last).
+    const int first = partition ? 2 : 0;
+    const int last = first + 2;
+
+    float largest_delta = -NV_FLOAT_MAX;
+    for (int row = 0; row < 4; row++) {
+        for (int col = first; col < last; col++) {
+            const int texel = flip ? col*4 + row : row*4 + col;
+            const float delta = base_luma - dot(input_colors[texel].xyz(), luma_weights);
+            largest_delta = max(largest_delta, fabsf(delta));
+        }
+    }
+
+    int chosen = -1;
+    float smallest_gap = NV_FLOAT_MAX;
+    for (int table = 0; table < 8; table++) {
+        const float gap = fabsf(etc_intensity_range[table] - 255 * largest_delta);
+        if (gap < smallest_gap) { smallest_gap = gap; chosen = table; }
+    }
+    return chosen;
+}
+
+static float update_selectors(const Vector4 input_colors[16], const float input_weights[16], ETC_Data & data, const ETC_Options & options) {
+
+    Color32 palette[2][4];
+
+    if (data.etc.diff) {
+        // Decode colors in 555+333 mode.
+        get_diff_subblock_palette(data.etc.color0, data.etc.table0, palette[0]);
+        get_diff_subblock_palette(data.etc.color0, data.etc.color1, data.etc.table1, palette[1]);
+    }
+    else {
+        // Decode colors in 444,444 mode.
+        get_abs_subblock_palette(data.etc.color0, data.etc.table0, palette[0]);
+        get_abs_subblock_palette(data.etc.color1, data.etc.table1, palette[1]);
+    }
+
+    float total_error = 0;
+
+    for (int y = 0; y < 4; y++) {
+        for (int x = 0; x < 4; x++) {
+            int i = y*4 + x;
+
+            float best_error = NV_FLOAT_MAX;
+            int best_p = 0;
+
+            for (int p = 0; p < 4; p++) {
+                float error = evaluate_mse(toVector3(palette[get_partition(data, x, y)][p]), input_colors[i].xyz(), options.color_weights);
+                if (error < best_error) {
+                    best_error = error;
+                    best_p = p;
+                }
+            }
+
+            int s = x*4 + y;
+            data.selector[s] = U8(best_p);
+
+            total_error += best_error * input_weights[i];
+        }
+    }
+
+    return total_error;
+}
+
+
+// Gathers the 8 texels (RGB and weight) of one half of a 4x4 block.
+// partition picks columns 0-1 (zero) or 2-3 (non-zero); flip transposes the texel addressing.
+static void partition_input_block(const Vector4 input_colors[16], const float input_weights[16], bool flip, int partition, Vector3 output_colors[8], float output_weights[8]) {
+    const int first_column = partition ? 2 : 0;
+
+    for (int out = 0; out < 8; out++) {
+        const int row = out / 2;
+        const int column = first_column + out % 2;
+        const int texel = flip ? column*4 + row : row*4 + column;
+
+        output_colors[out] = input_colors[texel].xyz();
+        output_weights[out] = input_weights[texel];
+    }
+}
+
+
+struct ETC_SubBlock {   // Candidate encoding of one 8-texel sub-block.
+    Vector3 color;      // Base color; palette entries are color + etc_intensity_modifiers[table][k] / 255.
+    bool delta;         // Not read by the code in view; presumably marks differential encoding - TODO confirm.
+    int table;          // Row of etc_intensity_modifiers.
+    int indices[8];     // Palette entry chosen for each of the 8 texels.
+};
+
+static float evaluate_rgb_mse(const Vector3 colors[8], const float weights[8], const ETC_Options & options, ETC_SubBlock * sub_block) {
+
+    // Evaluate sub block palette.
+    Vector3 palette[4];
+    palette[0] = sub_block->color + Vector3(etc_intensity_modifiers[sub_block->table][0] / 255.0f);
+    palette[1] = sub_block->color + Vector3(etc_intensity_modifiers[sub_block->table][1] / 255.0f);
+    palette[2] = sub_block->color + Vector3(etc_intensity_modifiers[sub_block->table][2] / 255.0f);
+    palette[3] = sub_block->color + Vector3(etc_intensity_modifiers[sub_block->table][3] / 255.0f);
+
+    float mse = 0;
+    for (int i = 0; i < 8; i++) {
+        mse += evaluate_mse(colors[i], palette[sub_block->indices[i]], options.color_weights) * weights[i];
+    }
+
+    return mse;
+}
+
+// For a fixed table and index selection, computes the base color that minimizes the weighted squared
+// error sum(w_i * (color + D_i - colors[i])^2), where D_i is the intensity modifier picked by indices[i].
+// The channels are independent, so each one is the weighted mean of (colors[i] - D_i).
+// The result is not quantized or clamped. If the weights sum to zero the base color is left untouched.
+static void optimize_base_color(const Vector3 colors[8], const float weights[8], ETC_SubBlock * sub_block) {
+    float D_sum = 0;
+    float R_sum = 0;
+    float G_sum = 0;
+    float B_sum = 0;
+    float W_sum = 0;
+
+    for (int i = 0; i < 8; i++) {
+        float Di = etc_intensity_modifiers[sub_block->table][sub_block->indices[i]] / 255.0f;  // @@ precompute?
+
+        D_sum += Di * weights[i];
+        R_sum += colors[i].x * weights[i];
+        G_sum += colors[i].y * weights[i];
+        B_sum += colors[i].z * weights[i];
+        W_sum += weights[i];
+    }
+
+    // Avoid dividing by zero when every texel has zero weight.
+    if (W_sum <= 0) return;
+
+    // Each channel must use its own accumulator.
+    sub_block->color.x = (R_sum - D_sum) / W_sum;
+    sub_block->color.y = (G_sum - D_sum) / W_sum;
+    sub_block->color.z = (B_sum - D_sum) / W_sum;
+
+    // @@ Estimate error (without quantization)
+    // @@ Repeat for all tables?
+    // @@ Given a new center, compute new indices, then update center?
+}
+
+
+
+// Compacts the color list in place: entries with zero weight are dropped and exact duplicates are
+// merged into their first occurrence, whose weight becomes the sum of the merged weights.
+// Returns the number of entries kept; they occupy the first slots of both arrays.
+static int reduce_colors(Vector3 * colors, float * weights, int count) {
+    int kept = 0;
+
+    for (int src = 0; src < count; src++) {
+        // Zero weight marks an unused entry or one already merged into an earlier color.
+        if (weights[src] != 0.0f) {
+            const Vector3 current = colors[src];
+            float merged_weight = weights[src];
+
+            for (int other = src + 1; other < count; other++) {
+                if (current == colors[other]) {       // @@ Compare within threshold?
+                    merged_weight += weights[other];
+                    weights[other] = 0.0f;
+                }
+            }
+
+            colors[kept] = current;
+            weights[kept] = merged_weight;
+            kept++;
+        }
+    }
+
+    return kept;
+}
+
+// Stable in-place insertion sort of up to 8 colors by luminance (sum of the three channels), ascending.
+// The weights are permuted together with the colors, so weights[i] stays paired with colors[i].
+// count must not exceed 8, the size of the key buffer.
+static void sort_colors(Vector3 * colors, float * weights, int count) {
+    assert(count <= 8);
+
+    // Cache the sort keys so they are computed once per color.
+    float lum[8];
+    for (int i = 0; i < count; ++i) {
+        lum[i] = colors[i].x + colors[i].y + colors[i].z;
+    }
+
+    // Strict '<' never moves an element past an equal key, which keeps the sort stable.
+    for (int i = 0; i < count; ++i) {
+        for (int j = i; j > 0 && lum[j] < lum[j - 1]; --j) {
+            swap(lum[j], lum[j - 1]);
+            swap(colors[j], colors[j - 1]);
+            swap(weights[j], weights[j - 1]);
+        }
+    }
+}
+
+
+
+/*
+float optimize_center(float colors[4][10], uniform int p, uniform int table_level)
+{
+    float best_center = 0;
+    for (uniform int q = 0; q < 4; q++)
+    {
+        best_center += (colors[q][7 + p] - get_etc1_dY(table_level, q)) * colors[q][3];
+    }
+    best_center /= 8;
+
+    float best_err = 0;
+    for (uniform int q = 0; q < 4; q++)
+    {
+        float dY = get_etc1_dY(table_level, q);
+        best_err += sq(clamp(best_center + dY, 0, 255) - colors[q][7 + p]) * colors[q][3];
+    }
+
+    for (uniform int branch = 0; branch < 4; branch++)
+    {
+        float new_center = 0;
+        float sum = 0;
+        for (uniform int q = 0; q < 4; q++)
+        {
+            if (branch <= 1 && q <= branch) continue;
+            if (branch >= 2 && q >= branch) continue;
+            new_center += (colors[q][7 + p] - get_etc1_dY(table_level, q)) * colors[q][3];
+            sum += colors[q][3];
+        }
+
+        new_center /= sum;
+
+        float err = 0;
+        for (uniform int q = 0; q < 4; q++)
+        {
+            float dY = get_etc1_dY(table_level, q);
+            err += sq(clamp(new_center + dY, 0, 255) - colors[q][7 + p]) * colors[q][3];
+        }
+
+        if (err < best_err)
+        {
+            best_err = err;
+            best_center = new_center;
+        }
+    }
+
+    return best_center;
+}
+*/
+
+
+
+
+// Unfinished experimental ETC1 compressor. For each flip orientation it gathers and de-duplicates the
+// texels of the first sub-block, but sub_block, best_flip and count are never used afterwards.
+// NOTE(review): result->data is not written here, so update_selectors() runs on whatever it already holds.
+static void compress_etc1_test(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {
+    Vector3 colors[8];
+    float weights[8];
+    //int xrefs[8];
+    ETC_SubBlock sub_block[2];
+
+    bool best_flip = false;
+    for (int flip = 0; flip <= 1; flip++) {
+        partition_input_block(input_colors, input_weights, !!flip, /*partition=*/0, colors, weights);
+
+        int count = reduce_colors(colors, weights, 8);
+
+        //sort_colors(colors, weights);
+
+        // @@ sort colors along luminance axis.
+
+        //sub_block[0].color
+
+        partition_input_block(input_colors, input_weights, !!flip, /*partition=*/1, colors, weights);
+    }
+
+    //pack_colors(sub_block[0].color, sub_block[1].color, &result->data);
+
+    result->error = update_selectors(input_colors, input_weights, result->data, options);
+
+}
+
+/*void pack_colors(const Vector3 & color0, const Vector3 & color1, const ETC_Options & options, ETC_Data * data) {
+
+    uint16 abs_c0 = U16(pack_color_444(color0));
+    uint16 abs_c1 = U16(pack_color_444(color1));
+    Vector3 abs_vc0 = unpack_color_444(abs_c0);
+    Vector3 abs_vc1 = unpack_color_444(abs_c1);
+    float abs_error = evaluate_mse(color0, abs_vc0, options.color_weights) + evaluate_mse(color1, abs_vc1, options.color_weights);
+
+    uint16 diff_c0 = U16(pack_color_555(color0));
+    Vector3 diff_vc0 = unpack_color_555(diff_c0);
+    uint16 diff_d1 = U16(pack_delta_333(color1 - diff_vc0));
+    Vector3 diff_vc1 = unpack_color_555(diff_c0, diff_d1);
+    float diff_error = evaluate_mse(color0, diff_vc0, options.color_weights) + evaluate_mse(color1, diff_vc1, options.color_weights);
+
+    if (diff_error < abs_error) {
+        data->etc.color0 = diff_c0;
+        data->etc.color1 = diff_c1;
+        return diff_error;
+    }
+    else {
+        if (abs_error < best_error) {
+            best_error = abs_error;
+            best_diff = false;
+            best_flip = flip;
+            best_c0 = abs_c0;
+            best_c1 = abs_c1;
+            best_vc0 = abs_vc0;
+            best_vc1 = abs_vc1;
+        }
+    }
+}*/
+
+// Fast ETC1 encoder based on sub-block averages.
+// For both flip orientations the weighted average color of each sub-block is quantized in individual
+// (444 + 444) and differential (555 + 333 delta) form, and the variant whose decoded base colors are
+// closest to the averages is kept. Table indices and selectors are then derived from the chosen base
+// colors, and result->error is measured on the fully decoded block.
+static void compress_etc1_range_fit(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {
+
+    float best_error = NV_FLOAT_MAX;
+    bool best_diff = false;
+    bool best_flip = false;
+    uint16 best_c0 = 0;
+    uint16 best_c1 = 0;
+    Vector3 best_vc0;
+    Vector3 best_vc1;
+
+    for (int orientation = 0; orientation < 2; orientation++) {
+        const bool flip = orientation != 0;
+        Vector3 avg0 = get_partition_color_average(input_colors, input_weights, flip, /*partition=*/0);
+        Vector3 avg1 = get_partition_color_average(input_colors, input_weights, flip, /*partition=*/1);
+
+        // Individual mode: both colors are quantized independently.
+        uint16 abs_c0 = U16(pack_color_444(avg0));
+        uint16 abs_c1 = U16(pack_color_444(avg1));
+        Vector3 abs_vc0 = unpack_color_444(abs_c0);
+        Vector3 abs_vc1 = unpack_color_444(abs_c1);
+        const float abs_error = evaluate_mse(avg0, abs_vc0, options.color_weights) + evaluate_mse(avg1, abs_vc1, options.color_weights);
+
+        // Differential mode: the second color is stored as a delta from the decoded first color.
+        uint16 diff_c0 = U16(pack_color_555(avg0));
+        Vector3 diff_vc0 = unpack_color_555(diff_c0);
+        uint16 diff_d1 = U16(pack_delta_333(avg1 - diff_vc0));
+        Vector3 diff_vc1 = unpack_color_555(diff_c0, diff_d1);
+        const float diff_error = evaluate_mse(avg0, diff_vc0, options.color_weights) + evaluate_mse(avg1, diff_vc1, options.color_weights);
+
+        // Keep the better of the two modes if it improves on the best candidate so far.
+        const bool use_diff = diff_error < abs_error;
+        const float candidate_error = use_diff ? diff_error : abs_error;
+        if (candidate_error < best_error) {
+            best_error = candidate_error;
+            best_diff = use_diff;
+            best_flip = flip;
+            best_c0 = use_diff ? diff_c0 : abs_c0;
+            best_c1 = use_diff ? diff_d1 : abs_c1;
+            best_vc0 = use_diff ? diff_vc0 : abs_vc0;
+            best_vc1 = use_diff ? diff_vc1 : abs_vc1;
+        }
+    }
+
+    // Commit the winning base colors and pick a table for each sub-block around them.
+    ETC_Data & block = result->data;
+    block.mode = ETC_Data::Mode_ETC1;
+    block.etc.flip = best_flip;
+    block.etc.diff = best_diff;
+    block.etc.table0 = select_table_index(best_vc0, input_colors, input_weights, best_flip, /*partition=*/0);
+    block.etc.table1 = select_table_index(best_vc1, input_colors, input_weights, best_flip, /*partition=*/1);
+    block.etc.color0 = best_c0;
+    block.etc.color1 = best_c1;
+
+    // update_selectors() fills block.selector; the error it returns is superseded by the error of
+    // the decoded block computed right after.
+    update_selectors(input_colors, input_weights, block, options);
+    result->error = evaluate_rgb_mse(input_colors, input_weights, options, block);
+}
+
+#if HAVE_RGETC
+#include "nvimage/ColorBlock.h"
+
+// Encodes the block with the rg_etc1 library and converts the result into an ETC_Solution.
+// input_weights are not passed to the packer; they only enter the error reported in result->error.
+void compress_etc1_rg(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {
+    rg_etc1::etc1_pack_params params;
+    params.m_quality = rg_etc1::cMediumQuality;  // @@ Select quality based on compression options.
+
+    // Convert to Color32 and reorder the channels with swizzle(2, 1, 0, 3) before handing them to rg_etc1.
+    ColorBlock pixels;
+    for (uint texel = 0; texel < 16; texel++) {
+        pixels.color(texel) = toColor32(input_colors[texel]);
+    }
+    pixels.swizzle(2, 1, 0, 3);
+
+    BlockETC packed;
+    rg_etc1::pack_etc1_block((void *)&packed, (const uint *)pixels.colors(), params);
+    unpack_etc2_block(&packed, &result->data);
+
+    result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
+}
+#endif
+
+// Encodes the block in planar mode as a single solid color: the weighted average of the input colors,
+// quantized to 6/7/6 bits and used for all three planar endpoints (o, h and v).
+static void compress_etc2_planar_solid(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {
+
+    // Weighted average color. The weight sum is used as a divisor without a zero check.
+    Vector3 average(0);
+    float weight_sum = 0;
+    for (int texel = 0; texel < 16; texel++) {
+        average += input_colors[texel].xyz() * input_weights[texel];
+        weight_sum += input_weights[texel];
+    }
+    average /= weight_sum;
+
+    result->data.mode = ETC_Data::Mode_Planar;
+    // Convert colors to 676
+    result->data.planar.ro = pack_float_6(average.x);
+    result->data.planar.go = pack_float_7(average.y);
+    result->data.planar.bo = pack_float_6(average.z);
+
+    // The h and v endpoints replicate the o endpoint.
+    result->data.planar.rh = result->data.planar.ro;
+    result->data.planar.rv = result->data.planar.ro;
+    result->data.planar.gh = result->data.planar.go;
+    result->data.planar.gv = result->data.planar.go;
+    result->data.planar.bh = result->data.planar.bo;
+    result->data.planar.bv = result->data.planar.bo;
+
+    result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
+}
+
+// Least squares optimization of planar endpoints.
+//
+// Fits the three ETC2 planar endpoints (origin, horizontal, vertical) to the input colors with a
+// weighted least squares solve, quantizes them to 6/7/6 bits and then refines the quantization of
+// each color channel separately by testing both values of the second pack_float_6/7 argument
+// (presumably the rounding direction - TODO confirm) for every endpoint.
+// The block is always encoded with ETC_Data::Mode_Planar.
+//
+//  input_colors  - the 16 texels of the block, row by row (index 4*y + x).
+//  input_weights - importance of every texel; it scales that texel's contribution to the fit.
+//  options       - passed to evaluate_rgb_mse() when measuring the error of a candidate.
+//  result        - receives the planar block and its weighted error. When the linear system cannot
+//                  be solved, result->error is set to NV_FLOAT_MAX and result->data is not written.
+static void compress_etc2_planar_lsqr(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {
+
+    // Isn't this a simple least squares problem?
+    // - Yes, but that doesn't take clamping and quantization into account.
+    // - Solve the least squares problem, then refine endpoints?
+
+    // For every pixel, decoder does:
+    // int r = (4 * ro + x * (rh - ro) + y * (rv - ro) + 2) >> 2;
+
+    // R(x,y) = (4 * ro + x * (rh - ro) + y * (rv - ro) + 2) / 4;
+
+    // R(x,y) = ro * (1 - x/4 - y/4) + rh * x/4 + rv * y/4 + 1/2;
+
+    // a = x/4
+    // b = y/4
+    // c = 1 - a - b
+    // R(x,y) = ro * c + rh * a + rv * b + 1/2;
+
+    // Minimizing sum(w * (R(x,y) - C)^2) over the endpoints (rh, rv, ro) of one color channel leads to
+    // the weighted normal equations
+    //     (sum w * B * Bt) * (rh, rv, ro) = sum w * B * (C - 1/2),    with B = (a, b, c)
+    // where the sums run over the 16 texels and C is the input value of the channel.
+    // The matrix is shared by the three channels. Every texel weight multiplies both sides of the
+    // system exactly once, and c is always derived from the unweighted a and b.
+    // With uniform unit weights this is the ordinary (unweighted) least squares fit.
+
+    // Basis of every texel: A[3*i + 0] = a, A[3*i + 1] = b, A[3*i + 2] = c.
+    float A[3 * 16];
+
+    for (int y = 0; y < 4; y++) {
+        for (int x = 0; x < 4; x++) {
+            float a = float(x) / 4;
+            float b = float(y) / 4;
+            float c = 1 - a - b;
+
+            int i = y*4 + x;
+            A[3 * i + 0] = a;
+            A[3 * i + 1] = b;
+            A[3 * i + 2] = c;
+        }
+    }
+
+    // Left side: At*W*A. This matrix is always the same! But not when using arbitrary weights!
+    // m(x, y) = sum over the texels of w * B[x] * B[y].
+    // The matrix is symmetric, so the order of the two indices does not matter.
+    Matrix3 m(0);
+
+    for (int y = 0; y < 3; y++) {
+        for (int x = 0; x < 3; x++) {
+            float d = 0;
+            for (int i = 0; i < 16; i++) {
+                d += input_weights[i] * A[3*i+x] * A[3*i+y];
+            }
+            m(x, y) = d;
+        }
+    }
+
+    // Compute right side: At*W*C.
+    // NOTE(review): the 0.5 / 255 offset treats the "+ 2" of the decoder as a bias, but combined with
+    // ">> 2" it rounds to nearest. Kept as is; confirm whether the offset is intended.
+    Vector3 Ca(0), Cb(0), Cc(0);
+
+    for (int y = 0; y < 4; y++) {
+        for (int x = 0; x < 4; x++) {
+            int i = y*4 + x;
+            float w = input_weights[i];
+            float a = A[3 * i + 0];
+            float b = A[3 * i + 1];
+            float c = A[3 * i + 2];
+
+            Vector3 C = input_colors[i].xyz() - Vector3(0.5f / 255);
+
+            // Weighted contribution of this texel to each of the three equations.
+            Ca += C * (w * a);
+            Cb += C * (w * b);
+            Cc += C * (w * c);
+        }
+    }
+
+    // Now we have 3 equations (one for each color component).
+
+    Vector3 R(Ca.x, Cb.x, Cc.x);
+    Vector3 G(Ca.y, Cb.y, Cc.y);
+    Vector3 B(Ca.z, Cb.z, Cc.z);
+
+    Vector3 r, g, b;
+
+    if (!solveLU(m, R, &r)) {
+        result->error = NV_FLOAT_MAX;
+        return;
+    }
+    if (!solveLU(m, G, &g)) {
+        result->error = NV_FLOAT_MAX;
+        return;
+    }
+    if (!solveLU(m, B, &b)) {
+        result->error = NV_FLOAT_MAX;
+        return;
+    }
+
+    Vector3 Ch(r.x, g.x, b.x);  // Solutions are ordered like the basis: x pairs with a (horizontal),
+    Vector3 Cv(r.y, g.y, b.y);  // y with b (vertical)
+    Vector3 Co(r.z, g.z, b.z);  // and z with c (origin).
+
+    // Convert colors to 676
+    result->data.mode = ETC_Data::Mode_Planar;
+    result->data.planar.ro = pack_float_6(Co.x);
+    result->data.planar.go = pack_float_7(Co.y);
+    result->data.planar.bo = pack_float_6(Co.z);
+
+    result->data.planar.rh = pack_float_6(Ch.x);
+    result->data.planar.gh = pack_float_7(Ch.y);
+    result->data.planar.bh = pack_float_6(Ch.z);
+
+    result->data.planar.rv = pack_float_6(Cv.x);
+    result->data.planar.gv = pack_float_7(Cv.y);
+    result->data.planar.bv = pack_float_6(Cv.z);
+
+    // Evaluate error.
+    result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
+
+    bool refine_endpoints = true;
+    if (refine_endpoints) {
+        ETC_Solution best = *result;
+
+        // @@ The per-component errors are not correlated, test 8 combinations 3 times.
+        for (int i = 0; i < 8; i++) {
+            result->data.planar.ro = pack_float_6(Co.x, (i & 1) != 0);
+            result->data.planar.rh = pack_float_6(Ch.x, (i & 2) != 0);
+            result->data.planar.rv = pack_float_6(Cv.x, (i & 4) != 0);
+
+            result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
+            if (result->error < best.error) {
+                best = *result;
+            }
+        }
+
+        *result = best;
+
+        for (int i = 0; i < 8; i++) {
+            result->data.planar.go = pack_float_7(Co.y, (i & 1) != 0);
+            result->data.planar.gh = pack_float_7(Ch.y, (i & 2) != 0);
+            result->data.planar.gv = pack_float_7(Cv.y, (i & 4) != 0);
+
+            result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
+            if (result->error < best.error) {
+                best = *result;
+            }
+        }
+
+        *result = best;
+
+        for (int i = 0; i < 8; i++) {
+            result->data.planar.bo = pack_float_6(Co.z, (i & 1) != 0);
+            result->data.planar.bh = pack_float_6(Ch.z, (i & 2) != 0);
+            result->data.planar.bv = pack_float_6(Cv.z, (i & 4) != 0);
+
+            result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
+            if (result->error < best.error) {
+                best = *result;
+            }
+        }
+
+        *result = best;
+    }
+}
+
+
+// Applies saturate() to all 16 input colors in place.
+// @@ Sanitize input_weights?
+//  - Avoid blocks with all zero weight.
+//  - Normalize weights to avoid too small values?
+//  - Remove NaNs, infinites, etc.
+static void process_input_colors(Vector4 input_colors[16]) {
+    for (Vector4 * color = input_colors; color != input_colors + 16; ++color) {
+        *color = saturate(*color);
+    }
+}
+
+// Applies saturate() in place to the selected channel of all 16 colors; other channels are untouched.
+static void process_input_alphas(Vector4 input_colors[16], int input_channel) {
+    for (int texel = 0; texel < 16; ++texel)
+        input_colors[texel].component[input_channel] = saturate(input_colors[texel].component[input_channel]);
+}
+
+// Sanitizes the 16 texel weights in place:
+//  - if no weight exceeds min_weight the block is degenerate and every weight becomes 1;
+//  - otherwise negative weights are clamped to zero, weights below min_weight are flushed to zero and
+//    all weights are divided by the largest one.
+static void process_input_weights(float input_weights[16]) {
+    const float min_weight = 0.0001f;
+
+    float largest = 0.0f;
+    for (int i = 0; i < 16; i++) {
+        largest = nv::max(largest, input_weights[i]);
+    }
+
+    if (largest <= min_weight) {
+        // Handle degenerate case.
+        for (int i = 0; i < 16; i++) input_weights[i] = 1;
+        return;
+    }
+
+    for (int i = 0; i < 16; i++) {
+        // Clamp to positive, then flush tiny values to zero.
+        float w = nv::max(input_weights[i], 0.0f);
+        if (w < min_weight) w = 0.0f;
+
+        // Normalize.
+        input_weights[i] = w / largest;
+    }
+}
+
+
+
+
+static float compress_etc_a1(Vector4 input_colors[16], float input_weights[16], const ETC_Options & options, void * output) {
+    assert(options.onebit_alpha == true);
+
+    // Classify block.
+    bool transparent_block = true;
+    bool opaque_block = true;
+
+    for (int i = 0; i < 16; i++) {
+        if (input_colors[i].w != 0) transparent_block = false;
+        if (input_colors[i].w != 1) opaque_block = false;
+    }
+
+    if (transparent_block) {
+        // @@ Encode trivial transparent block.
+        return 0;
+    }
+
+    if (opaque_block) {
+        // @@ Encode block with opaque bit set. @@ Isn't this like the standard encoder?
+    }
+    
+    // @@ Encode mixed block.
+    nvCheck(false); // Not implemented!
+    
+    //uint8 color_rgb[16*3];
+    //uint8 alpha[16];
+    //uint etc_word1, etc_word2;
+    //compressBlockDifferentialWithAlpha(bool isTransparent, uint8* img, uint8* alphaimg, uint8* imgdec, 4, 4, 0, 0, &etc_word1, &etc_word2);
+    
+    return NV_FLOAT_MAX;
+}
+
+//uint etc_blocks = 0;
+//uint planar_blocks = 0;
+//#include "nvthread/Atomic.h"
+
+// Compresses one block to ETC1/ETC2 RGB (no one-bit alpha) and writes a BlockETC to output.
+// Starts from the range-fit ETC1 solution and replaces it with any enabled alternative that reports a
+// strictly lower error: rg_etc1 (when built with HAVE_RGETC) and the ETC2 planar least squares fit.
+// Returns the error of the solution that was packed.
+static float compress_etc(Vector4 input_colors[16], float input_weights[16], const ETC_Options & options, void * output) {
+    assert(options.onebit_alpha == false);
+
+    // Baseline solution.
+    ETC_Solution best;
+    compress_etc1_range_fit(input_colors, input_weights, options, &best);
+
+    // Optional rg_etc1 encoder.
+    if (options.use_rg_etc) {
+#if HAVE_RGETC
+        ETC_Solution candidate;
+        compress_etc1_rg(input_colors, input_weights, options, &candidate);
+        if (candidate.error < best.error) {
+            best = candidate;
+        }
+#else
+        // @@ Print warning?
+#endif
+    }
+
+    if (options.enable_etc2) {
+        if (options.use_planar) {
+            ETC_Solution candidate;
+            compress_etc2_planar_lsqr(input_colors, input_weights, options, &candidate);
+            if (candidate.error < best.error) {
+                best = candidate;
+            }
+        }
+        if (options.use_t_mode) {
+            // @@
+        }
+        if (options.use_h_mode) {
+            // @@
+        }
+    }
+
+    pack_etc2_block(best.data, (BlockETC *)output);
+    return best.error;
+}
+
+
+// Range search EAC compressor, slightly modified from ETCLib.
+//
+// For each of the 16 modifier tables, tries the base values and multipliers within
+// options.search_radius of an estimate derived from the value range of the input channel, picks the
+// best selector per texel and keeps the combination with the lowest weighted squared error.
+// options.use_11bit_mode selects get_alpha11() or get_alpha8() as the decoder used for the comparison.
+// The winning block is packed into output (a BlockEAC) and its error is returned.
+float compress_eac_range_search(Vector4 input_colors[16], float input_weights[16], int input_channel, const EAC_Options & options, void * output) {
+
+    const uint table_count = 16;
+    const uint selector_count = 8;      // 3 bits per selector.
+    const uint low_selector = 3;        // Table column taken as the minimum of the table.
+    const uint high_selector = 7;       // Table column taken as the maximum of the table.
+
+    // Find the value range of the channel.
+    float lowest = 1.0f;
+    float highest = 0.0f;
+    for (uint i = 0; i < 16; i++) {
+        const float value = input_colors[i].component[input_channel];
+        lowest = nv::min(lowest, value);
+        highest = nv::max(highest, value);
+    }
+    const float value_range = highest - lowest;
+
+    EAC_Solution best;
+    best.error = NV_FLOAT_MAX;
+
+    // try each modifier table entry
+    for (uint table = 0; table < table_count; table++) {
+        // Map the range of the table onto the value range to estimate the base value.
+        const float table_center = (float)-eac_intensity_modifiers[table][low_selector];
+        const float table_range = (float)eac_intensity_modifiers[table][high_selector] - eac_intensity_modifiers[table][low_selector];
+        const float center_ratio = table_center / table_range;
+
+        const int base_estimate = ftoi_round(255.0f * (lowest + center_ratio * value_range));
+        const int first_base = max(0, base_estimate - options.search_radius);
+        const int last_base = min(base_estimate + options.search_radius, 255);
+
+        // Search the bases around the estimate.
+        for (int base = first_base; base <= last_base; base++) {
+            // Multipliers around the ratio between value range and table range, limited to [1, 15].
+            const int multiplier_estimate = ftoi_round(255 * value_range / table_range);
+            const int first_multiplier = clamp(multiplier_estimate - options.search_radius, 1, 15);
+            const int last_multiplier = clamp(multiplier_estimate + options.search_radius, 1, 15);
+
+            for (int multiplier = first_multiplier; multiplier <= last_multiplier; multiplier++) {
+                // find best selector for each pixel
+                float candidate_error = 0;
+                uint candidate_selector[16];
+                for (uint i = 0; i < 16; i++) {
+                    const float target = input_colors[i].component[input_channel];
+
+                    float texel_error = NV_FLOAT_MAX;
+                    for (uint s = 0; s < selector_count; s++) {
+                        const float decoded = options.use_11bit_mode ? get_alpha11(base, table, multiplier, s) : get_alpha8(base, table, multiplier, s);
+                        const float delta = decoded - target;
+                        const float squared = delta * delta;
+                        if (squared < texel_error) {
+                            texel_error = squared;
+                            candidate_selector[i] = s;
+                        }
+                    }
+
+                    // Stop as soon as this candidate is already worse than the best one.
+                    candidate_error += texel_error * input_weights[i];
+                    if (candidate_error > best.error) {
+                        break;  // Don't waste more time.
+                    }
+                }
+
+                if (candidate_error < best.error) {
+                    best.error = candidate_error;
+
+                    best.data.alpha = base;
+                    best.data.multiplier = multiplier;
+                    best.data.table_index = table;
+                    for (uint i = 0; i < 16; i++) {
+                        // Flip selectors.
+                        best.data.selector[i] = candidate_selector[4*(i%4) + i/4];
+                    }
+                }
+            }
+        }
+    }
+
+    pack_eac_block(best.data, (BlockEAC *)output);
+
+    return best.error;
+}
+
+
+
+
+// Public API:
+
+// Decodes one ETC block (read as a BlockETC) into 16 colors using unpack_etc2_block() and decode_etc2().
+// Only the first preprocessor branch is compiled; the others are alternative decoders kept for reference.
+void nv::decompress_etc(const void * input_block, Vector4 output_colors[16]) {
+#if 1 // Our code
+    ETC_Data data;
+    unpack_etc2_block((const BlockETC *)input_block, &data);
+    decode_etc2(data, output_colors);
+
+#elif HAVE_RGETC && 0
+    Color32 colors[16];
+    rg_etc1::unpack_etc1_block(input_block, &colors->u);
+
+    for (int i = 0; i < 16; i++) {
+        output_colors[i].x = colors[i].b * (1.0f / 255.0f);
+        output_colors[i].y = colors[i].g * (1.0f / 255.0f);
+        output_colors[i].z = colors[i].r * (1.0f / 255.0f);
+        output_colors[i].w = colors[i].a * (1.0f / 255.0f);
+    }
+
+#elif HAVE_ETCPACK // Use etcpack for reference.
+    const BlockETC * block = (const BlockETC *)input_block;
+
+    uint8 colors[3*16];
+    uint part1 = POSH_SwapU32(block->data32[0]);
+    uint part2 = POSH_SwapU32(block->data32[1]);
+    decompressBlockETC2(part1, part2, colors, 4, 4, 0, 0);
+
+    for (int i = 0; i < 16; i++) {
+        output_colors[i].x = colors[3*i+0] * (1.0f / 255.0f);
+        output_colors[i].y = colors[3*i+1] * (1.0f / 255.0f);
+        output_colors[i].z = colors[3*i+2] * (1.0f / 255.0f);
+        output_colors[i].w = 1.0f;
+    }
+#endif
+}
+
+// Decodes one EAC block (read as a BlockEAC) with the 11-bit decoder and writes the result to the
+// given channel (0-3) of the 16 output colors; the compiled branch does not touch the other channels.
+void nv::decompress_eac(const void * input_block, Vector4 output_colors[16], int output_channel) {
+    nvCheck(output_channel >= 0 && output_channel < 4);
+#if 1
+    EAC_Data data;
+    unpack_eac_block((const BlockEAC *)input_block, &data);
+    decode_eac_11(data, output_colors, output_channel);
+    
+#elif HAVE_ETCPACK
+    // Use etcpack for reference.
+    formatSigned = 0;
+    uint16 alphas[16];
+    decompressBlockAlpha16bit((uint8*)input_block, (uint8*)alphas, 4, 4, 0, 0);
+
+    for (int i = 0; i < 16; i++) {
+        uint16 alpha = POSH_SwapU16(alphas[i]);
+        output_colors[i].component[output_channel] = alpha * (1.0f / 65535.0f);
+    }
+#endif
+}
+
+// Decodes an ETC2 + EAC block (read as a BlockETC_EAC): the ETC2 part is decoded into output_colors
+// and the EAC part is then decoded with the 8-bit decoder into channel 3. The input is only read.
+void nv::decompress_etc_eac(const void * input, Vector4 output_colors[16]) {
+#if 1
+    const BlockETC_EAC * input_block = (const BlockETC_EAC *)input;  // Keep the const qualifier of the input.
+    ETC_Data etc;
+    unpack_etc2_block(&input_block->etc, &etc);
+    decode_etc2(etc, output_colors);
+
+    EAC_Data eac;
+    unpack_eac_block(&input_block->eac, &eac);
+    decode_eac_8(eac, output_colors, 3);
+
+#elif HAVE_ETCPACK
+    // Use etcpack for reference.
+    uint8 colors[4*16];
+    decompressBlockAlpha((uint8*)input, colors, 4, 4, 0, 0);
+    for (int i = 0; i < 16; i++) {
+        output_colors[i].x = colors[4*i+0] * (1.0f / 255.0f);
+        output_colors[i].y = colors[4*i+1] * (1.0f / 255.0f);
+        output_colors[i].z = colors[4*i+2] * (1.0f / 255.0f);
+        output_colors[i].w = colors[4*i+3] * (1.0f / 255.0f);
+    }
+#endif
+}
+
+float nv::compress_etc1(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output) {
+    
+    process_input_colors(input_colors);
+    
+    // @@ Use same options for all blocks?
+    ETC_Options options;
+    options.use_rg_etc = true;
+    options.enable_etc2 = false;
+    options.use_t_mode = false;
+    options.use_h_mode = false;
+    options.use_planar = false;
+    options.color_weights = color_weights;
+
+    return compress_etc(input_colors, input_weights, options, output);
+}
+
+// Compresses a block to ETC2 RGB (ETC1 modes plus planar) after sanitizing colors and weights in place.
+float nv::compress_etc2(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output) {
+    process_input_colors(input_colors);
+    process_input_weights(input_weights);
+
+    ETC_Options options;
+    options.use_rg_etc = true;
+    options.enable_etc2 = true;
+    options.use_t_mode = false; // @@ Not implemented.
+    options.use_h_mode = false; // @@ Not implemented.
+    options.use_planar = true;
+    options.onebit_alpha = false; // compress_etc() asserts on this field.
+    options.color_weights = color_weights;
+    return compress_etc(input_colors, input_weights, options, output);
+}
+
+// One-bit-alpha ETC2 entry point: sanitizes colors and weights in place and forwards to compress_etc_a1()
+// with planar mode enabled and the T and H modes disabled.
+float nv::compress_etc2_a1(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output) {
+    process_input_colors(input_colors);
+    process_input_weights(input_weights);
+
+    ETC_Options etc_options;
+    etc_options.onebit_alpha = true;
+    etc_options.color_weights = color_weights;
+    etc_options.use_rg_etc = true;
+    etc_options.enable_etc2 = true;
+    etc_options.use_planar = true;
+    etc_options.use_t_mode = false; // @@ Not implemented.
+    etc_options.use_h_mode = false; // @@ Not implemented.
+    return compress_etc_a1(input_colors, input_weights, etc_options, output);
+}
+
+
+// Compresses one channel (0-3) of the block to EAC. The channel is saturated and the weights are sanitized
+// in place before running the range search with the given radius and bit depth mode.
+float nv::compress_eac(Vector4 input_colors[16], float input_weights[16], int input_channel, int search_radius, bool use_11bit_mode, void * output) {
+    nvCheck(input_channel >= 0 && input_channel < 4);
+    process_input_alphas(input_colors, input_channel);
+    process_input_weights(input_weights);
+
+    EAC_Options eac_options;
+    eac_options.use_11bit_mode = use_11bit_mode;
+    eac_options.search_radius = search_radius;
+    return compress_eac_range_search(input_colors, input_weights, input_channel, eac_options, output);
+}
+
+// ETC2 color plus 8-bit EAC for channel 3 (search radius 1) into a BlockETC_EAC; returns the sum of both errors.
+float nv::compress_etc2_eac(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output) {
+    BlockETC_EAC * block = (BlockETC_EAC *)output;
+    const float color_error = compress_etc2(input_colors, input_weights, color_weights, &block->etc);
+    return color_error + compress_eac(input_colors, input_weights, /*input_channel=*/3, /*search_radius=*/1, /*use_11bit_mode=*/false, &block->eac);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/nvtt/CompressorETC.h b/src/nvtt/CompressorETC.h
new file mode 100644
index 0000000..3101007
--- /dev/null
+++ b/src/nvtt/CompressorETC.h
@@ -0,0 +1,20 @@
+#pragma once
+#include "nvcore/nvcore.h"
+
+namespace nv {
+
+    class Vector3;
+    class Vector4;
+
+    // Decoders: read one compressed block from input_block and fill output_colors.
+    void decompress_etc(const void * input_block, Vector4 output_colors[16]);
+    void decompress_eac(const void * input_block, Vector4 output_colors[16], int output_channel);
+    void decompress_etc_eac(const void * input_block, Vector4 output_colors[16]);
+
+    // Encoders: write one compressed block to output; the returned float is the encoder's error value.
+    float compress_etc1(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output);
+    float compress_etc2(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output);
+    float compress_etc2_a1(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output);
+    float compress_eac(Vector4 input_colors[16], float input_weights[16], int input_channel, int search_radius, bool use_11bit_mode, void * output);
+    float compress_etc2_eac(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output);
+}
diff --git a/src/nvtt/CompressorRGB.cpp b/src/nvtt/CompressorRGB.cpp
index 7578883..861a882 100644
--- a/src/nvtt/CompressorRGB.cpp
+++ b/src/nvtt/CompressorRGB.cpp
@@ -250,6 +250,8 @@ namespace
 
         // Compute shared exponent.
         int exp_shared_p = max(-B-1, ftoi_floor(log2f(max_c))) + 1 + B;
+        nvDebugCheck(exp_shared_p <= Emax);
+        nvDebugCheck(exp_shared_p >= 0);
 
         int max_s = ftoi_round(max_c / (1 << (exp_shared_p - B - N)));
 
@@ -279,7 +281,7 @@ namespace
     {
         float v = max3(r, g, b);
 
-        uint rgbe;
+        uint rgbe = 0;
 
         if (v < 1e-32) {
             rgbe = 0;
@@ -534,6 +536,7 @@ void PixelFormatConverter::compress(nvtt::AlphaMode /*alphaMode*/, uint w, uint
                     }
                     else if (compressionOptions.pixelType == nvtt::PixelType_SignedNorm) {
                         // @@
+                        ir = ig = ib = ia = 0;
                     }
                     else if (compressionOptions.pixelType == nvtt::PixelType_UnsignedInt) {
                         ir = iround(clamp(r, 0.0f, 65535.0f));
@@ -543,6 +546,11 @@ void PixelFormatConverter::compress(nvtt::AlphaMode /*alphaMode*/, uint w, uint
                     }
                     else if (compressionOptions.pixelType == nvtt::PixelType_SignedInt) {
                         // @@
+                        ir = ig = ib = ia = 0;
+                    }
+                    else {
+                        // @@
+                        ir = ig = ib = ia = 0;
                     }
                     
                     uint p = 0;
diff --git a/src/nvtt/Context.cpp b/src/nvtt/Context.cpp
index 6aacc90..37c201b 100644
--- a/src/nvtt/Context.cpp
+++ b/src/nvtt/Context.cpp
@@ -39,6 +39,7 @@
 #include "cuda/CudaCompressorDXT.h"
 
 #include "nvimage/DirectDrawSurface.h"
+#include "nvimage/KtxFile.h"
 #include "nvimage/ColorBlock.h"
 #include "nvimage/BlockDXT.h"
 #include "nvimage/Image.h"
@@ -51,6 +52,7 @@
 
 #include "nvcore/Memory.h"
 #include "nvcore/Ptr.h"
+#include "nvcore/Array.inl"
 
 using namespace nv;
 using namespace nvtt;
@@ -222,11 +224,6 @@ bool Compressor::Private::compress(const InputOptions::Private & inputOptions, c
         return false;
     }
 
-    nvtt::Surface img;
-    img.setWrapMode(inputOptions.wrapMode);
-    img.setAlphaMode(inputOptions.alphaMode);
-    img.setNormalMap(inputOptions.isNormalMap);
-
     const int faceCount = inputOptions.faceCount;
     int width = inputOptions.width;
     int height = inputOptions.height;
@@ -244,97 +241,230 @@ bool Compressor::Private::compress(const InputOptions::Private & inputOptions, c
         if (inputOptions.maxLevel > 0) mipmapCount = min(mipmapCount, inputOptions.maxLevel);
     }
 
-    if (!outputHeader(inputOptions.textureType, width, height, depth, arraySize, mipmapCount, img.isNormalMap(), compressionOptions, outputOptions)) {
+    if (!outputHeader(inputOptions.textureType, width, height, depth, arraySize, mipmapCount, inputOptions.isNormalMap, compressionOptions, outputOptions)) {
         return false;
     }
 
 
-    // Output images.
-    for (int f = 0; f < faceCount; f++)
+    if (outputOptions.container != Container_KTX)
     {
+        nvtt::Surface img;
+        img.setWrapMode(inputOptions.wrapMode);
+        img.setAlphaMode(inputOptions.alphaMode);
+        img.setNormalMap(inputOptions.isNormalMap);
+
+        // Output each face from the largest mipmap to the smallest.
+        for (int f = 0; f < faceCount; f++)
+        {
+            int w = width;
+            int h = height;
+            int d = depth;
+            bool canUseSourceImagesForThisFace = canUseSourceImages;
+
+            img.setImage(inputOptions.inputFormat, inputOptions.width, inputOptions.height, inputOptions.depth, inputOptions.images[f]);
+
+            // To normal map.
+            if (inputOptions.convertToNormalMap) {
+                img.toGreyScale(inputOptions.heightFactors.x, inputOptions.heightFactors.y, inputOptions.heightFactors.z, inputOptions.heightFactors.w);
+                img.toNormalMap(inputOptions.bumpFrequencyScale.x, inputOptions.bumpFrequencyScale.y, inputOptions.bumpFrequencyScale.z, inputOptions.bumpFrequencyScale.w);
+            }
+
+            // To linear space.
+            if (!img.isNormalMap()) {
+                img.toLinear(inputOptions.inputGamma);
+            }
+
+            // Resize input.
+            img.resize(w, h, d, ResizeFilter_Box);
+
+            nvtt::Surface tmp = img;
+            if (!img.isNormalMap()) {
+                tmp.toGamma(inputOptions.outputGamma);
+            }
+
+            quantize(tmp, compressionOptions);
+            compress(tmp, f, 0, compressionOptions, outputOptions);
+
+            for (int m = 1; m < mipmapCount; m++) {
+                w = max(1, w/2);
+                h = max(1, h/2);
+                d = max(1, d/2);
+
+                int idx = m * faceCount + f;
+
+                bool useSourceImages = false;
+                if (canUseSourceImagesForThisFace) {
+                    if (inputOptions.images[idx] == NULL) { // One face is missing in this mipmap level.
+                        canUseSourceImagesForThisFace = false; // If one level is missing, ignore the following source images.
+                    }
+                    else {
+                        useSourceImages = true;
+                    }
+                }
+
+                if (useSourceImages) {
+                    img.setImage(inputOptions.inputFormat, w, h, d, inputOptions.images[idx]);
+
+                    // For already generated mipmaps, we need to convert to linear.
+                    if (!img.isNormalMap()) {
+                        img.toLinear(inputOptions.inputGamma);
+                    }
+                }
+                else {
+                    if (inputOptions.mipmapFilter == MipmapFilter_Kaiser) {
+                        float params[2] = { inputOptions.kaiserAlpha, inputOptions.kaiserStretch };
+                        img.buildNextMipmap(MipmapFilter_Kaiser, inputOptions.kaiserWidth, params);
+                    }
+                    else {
+                        img.buildNextMipmap(inputOptions.mipmapFilter);
+                    }
+                }
+                nvDebugCheck(img.width() == w);
+                nvDebugCheck(img.height() == h);
+                nvDebugCheck(img.depth() == d);
+
+                if (img.isNormalMap()) {
+                    if (inputOptions.normalizeMipmaps) {
+                        img.expandNormals();
+                        img.normalizeNormalMap();
+                        img.packNormals();
+                    }
+                    tmp = img;
+                }
+                else {
+                    tmp = img;
+                    tmp.toGamma(inputOptions.outputGamma);
+                }
+
+                quantize(tmp, compressionOptions);
+                compress(tmp, f, m, compressionOptions, outputOptions);
+            }
+        }
+    }
+    else
+    {
+        // KTX files expect face mipmaps to be interleaved.
+        Array<nvtt::Surface> images(faceCount);
+        Array<bool> mipChainBroken(faceCount);
+
         int w = width;
         int h = height;
         int d = depth;
-        bool canUseSourceImagesForThisFace = canUseSourceImages;
 
-        img.setImage(inputOptions.inputFormat, inputOptions.width, inputOptions.height, inputOptions.depth, inputOptions.images[f]);
+        // KTX imageSize for level 0, covering every depth slice: https://www.khronos.org/opengles/sdk/tools/KTX/file_format_spec/#2.16
+        uint imageSize = estimateSize(w, h, d, 1, compressionOptions) * faceCount; // NOTE(review): the spec defines imageSize per face for non-array cubemaps - confirm the faceCount factor.
+        outputOptions.writeData(&imageSize, sizeof(uint32));
 
-        // To normal map.
-        if (inputOptions.convertToNormalMap) {
-            img.toGreyScale(inputOptions.heightFactors.x, inputOptions.heightFactors.y, inputOptions.heightFactors.z, inputOptions.heightFactors.w);
-            img.toNormalMap(inputOptions.bumpFrequencyScale.x, inputOptions.bumpFrequencyScale.y, inputOptions.bumpFrequencyScale.z, inputOptions.bumpFrequencyScale.w);
-            img.packNormals();
-        }
+        for (int f = 0; f < faceCount; f++)
+        {
+            nvtt::Surface s;
+            s.setWrapMode(inputOptions.wrapMode);
+            s.setAlphaMode(inputOptions.alphaMode);
+            s.setNormalMap(inputOptions.isNormalMap);
 
-        // To linear space.
-        if (!img.isNormalMap()) {
-            img.toLinear(inputOptions.inputGamma);
-        }
+            s.setImage(inputOptions.inputFormat, inputOptions.width, inputOptions.height, inputOptions.depth, inputOptions.images[f]);
 
-        // Resize input.
-        img.resize(w, h, d, ResizeFilter_Box);
+            // To normal map.
+            if (inputOptions.convertToNormalMap) {
+                s.toGreyScale(inputOptions.heightFactors.x, inputOptions.heightFactors.y, inputOptions.heightFactors.z, inputOptions.heightFactors.w);
+                s.toNormalMap(inputOptions.bumpFrequencyScale.x, inputOptions.bumpFrequencyScale.y, inputOptions.bumpFrequencyScale.z, inputOptions.bumpFrequencyScale.w);
+            }
 
-        nvtt::Surface tmp = img;
-        if (!img.isNormalMap()) {
-            tmp.toGamma(inputOptions.outputGamma);
-        }
+            // To linear space.
+            if (!s.isNormalMap()) {
+                s.toLinear(inputOptions.inputGamma);
+            }
 
-        quantize(tmp, compressionOptions);
-        compress(tmp, f, 0, compressionOptions, outputOptions);
+            // Resize input.
+            s.resize(w, h, d, ResizeFilter_Box);
+
+            nvtt::Surface tmp = s;
+            if (!s.isNormalMap()) {
+                tmp.toGamma(inputOptions.outputGamma);
+            }
+
+            quantize(tmp, compressionOptions);
+            compress(tmp, f, 0, compressionOptions, outputOptions);
 
-        for (int m = 1; m < mipmapCount; m++) {
+            images.push_back(s);
+            mipChainBroken.push_back(!canUseSourceImages); // Mirror the non-KTX path: never read source mipmaps when they cannot be used.
+        }
+
+        static const unsigned char padding[3] = {0, 0, 0};
+        for (int m = 1; m < mipmapCount; m++)
+        {
             w = max(1, w/2);
             h = max(1, h/2);
             d = max(1, d/2);
 
-            int idx = m * faceCount + f;
+            // https://www.khronos.org/opengles/sdk/tools/KTX/file_format_spec/#2.16
+            imageSize = estimateSize(w, h, d, 1, compressionOptions) * faceCount;
 
-            bool useSourceImages = false;
-            if (canUseSourceImagesForThisFace) {
-                if (inputOptions.images[idx] == NULL) { // One face is missing in this mipmap level.
-                    canUseSourceImagesForThisFace = false; // If one level is missing, ignore the following source images.
-                }
-                else {
-                    useSourceImages = true;
+            outputOptions.writeData(&imageSize, sizeof(uint32));
+
+            nvtt::Surface tmp;
+
+            for (int f = 0; f < faceCount; f++)
+            {
+                nvtt::Surface& img = images[f];
+                int idx = m * faceCount + f;
+
+                bool useSourceImages = false;
+                if (!mipChainBroken[f]) {
+                    if (inputOptions.images[idx] == NULL) { // One face is missing in this mipmap level.
+                        mipChainBroken[f] = true; // Once a level is missing, ignore the following source images for this face.
+                    }
+                    else {
+                        useSourceImages = true;
+                    }
                 }
-            }
 
-            if (useSourceImages) {
-                img.setImage(inputOptions.inputFormat, w, h, d, inputOptions.images[idx]);
+                if (useSourceImages) {
+                    img.setImage(inputOptions.inputFormat, w, h, d, inputOptions.images[idx]);
 
-                // For already generated mipmaps, we need to convert to linear.
-                if (!img.isNormalMap()) {
-                    img.toLinear(inputOptions.inputGamma);
+                    // For already generated mipmaps, we need to convert to linear.
+                    if (!img.isNormalMap()) {
+                        img.toLinear(inputOptions.inputGamma);
+                    }
                 }
-            }
-            else {
-                if (inputOptions.mipmapFilter == MipmapFilter_Kaiser) {
-                    float params[2] = { inputOptions.kaiserAlpha, inputOptions.kaiserStretch };
-                    img.buildNextMipmap(MipmapFilter_Kaiser, inputOptions.kaiserWidth, params);
+                else {
+                    if (inputOptions.mipmapFilter == MipmapFilter_Kaiser) {
+                        float params[2] = { inputOptions.kaiserAlpha, inputOptions.kaiserStretch }; // Same order as the non-KTX path: alpha first, then stretch.
+                        img.buildNextMipmap(MipmapFilter_Kaiser, inputOptions.kaiserWidth, params);
+                    }
+                    else {
+                        img.buildNextMipmap(inputOptions.mipmapFilter);
+                    }
+                }
+                nvDebugCheck(img.width() == w);
+                nvDebugCheck(img.height() == h);
+                nvDebugCheck(img.depth() == d);
+
+                if (img.isNormalMap()) {
+                    if (inputOptions.normalizeMipmaps) {
+                        img.normalizeNormalMap();
+                    }
+                    tmp = img;
                 }
                 else {
-                    img.buildNextMipmap(inputOptions.mipmapFilter);
+                    tmp = img;
+                    tmp.toGamma(inputOptions.outputGamma);
                 }
-            }
-            nvDebugCheck(img.width() == w);
-            nvDebugCheck(img.height() == h);
-            nvDebugCheck(img.depth() == d);
 
-            if (img.isNormalMap()) {
-                if (inputOptions.normalizeMipmaps) {
-                    img.expandNormals();
-                    img.normalizeNormalMap();
-                    img.packNormals();
+                quantize(tmp, compressionOptions);
+                compress(tmp, f, m, compressionOptions, outputOptions);
+
+                //cube padding
+                if (faceCount == 6 && arraySize == 1)
+                {
+                    //TODO calc offset for uncompressed images
                 }
-                tmp = img;
-            }
-            else {
-                tmp = img;
-                tmp.toGamma(inputOptions.outputGamma);
             }
 
-            quantize(tmp, compressionOptions);
-            compress(tmp, f, m, compressionOptions, outputOptions);
+            int mipPadding = 3 - ((imageSize + 3) % 4);
+            if (mipPadding != 0) {
+                outputOptions.writeData(&padding, mipPadding);
+            }
         }
     }
 
@@ -673,6 +803,131 @@ bool Compressor::Private::outputHeader(nvtt::TextureType textureType, int w, int
 
         return writeSucceed;
     }
+    else if (outputOptions.container == Container_KTX) 
+    {
+        KtxHeader header;
+        // TODO cube arrays
+        if (textureType == TextureType_2D) {
+            nvCheck(arraySize == 1);
+            header.numberOfArrayElements = 0;
+            header.numberOfFaces = 1;
+            header.pixelDepth = 0;
+        }
+        else if (textureType == TextureType_Cube) {
+            nvCheck(arraySize == 1);
+            header.numberOfArrayElements = 0;
+            header.numberOfFaces = 6;
+            header.pixelDepth = 0;
+        }
+        else if (textureType == TextureType_3D) {
+            nvCheck(arraySize == 1);
+            header.numberOfArrayElements = 0;
+            header.numberOfFaces = 1;
+            header.pixelDepth = d;
+        }
+        else if (textureType == TextureType_Array) {
+            header.numberOfArrayElements = arraySize;
+            header.numberOfFaces = 1;
+            header.pixelDepth = 0; // Is it?
+        }
+
+        header.pixelWidth = w;
+        header.pixelHeight = h;
+        header.numberOfMipmapLevels = mipmapCount;
+
+        bool supported = true;
+
+        // TODO non-compressed formats. NOTE(review): Format_RGBA sets no gl* fields below yet 'supported' stays true, so the header is still written - confirm KtxHeader's defaults are valid for this case.
+        if (compressionOptions.format == Format_RGBA)
+        {
+            //header.glType = ?;
+            //header.glTypeSize = ?;
+            //header.glFormat = ?;
+        }
+        else
+        {
+            header.glType = 0;
+            header.glTypeSize = 1;
+            header.glFormat = 0;
+            
+            if (compressionOptions.format == Format_DXT1) {
+                header.glInternalFormat = outputOptions.srgb ? KTX_INTERNAL_COMPRESSED_SRGB_S3TC_DXT1 : KTX_INTERNAL_COMPRESSED_RGB_S3TC_DXT1;
+                header.glBaseInternalFormat = KTX_BASE_INTERNAL_RGB;
+            }
+            else if (compressionOptions.format == Format_DXT1a) {
+                header.glInternalFormat = outputOptions.srgb ? KTX_INTERNAL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1 : KTX_INTERNAL_COMPRESSED_RGBA_S3TC_DXT1;
+                header.glBaseInternalFormat = KTX_BASE_INTERNAL_RGBA;
+            }
+            else if (compressionOptions.format == Format_DXT3) {
+                header.glInternalFormat = outputOptions.srgb ? KTX_INTERNAL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3 : KTX_INTERNAL_COMPRESSED_RGBA_S3TC_DXT3;
+                header.glBaseInternalFormat = KTX_BASE_INTERNAL_RGBA;
+            }
+            else if (compressionOptions.format == Format_DXT5 || compressionOptions.format == Format_BC3_RGBM) {
+                header.glInternalFormat = outputOptions.srgb ? KTX_INTERNAL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5 : KTX_INTERNAL_COMPRESSED_RGBA_S3TC_DXT5;
+                header.glBaseInternalFormat = KTX_BASE_INTERNAL_RGBA;
+            }
+            else if (compressionOptions.format == Format_BC4) {
+                header.glInternalFormat = KTX_INTERNAL_COMPRESSED_RED_RGTC1; // KTX_INTERNAL_COMPRESSED_SIGNED_RED_RGTC1 ?
+                header.glBaseInternalFormat = KTX_BASE_INTERNAL_RED;
+            }
+            else if (compressionOptions.format == Format_BC5) {
+                header.glInternalFormat = KTX_INTERNAL_COMPRESSED_RG_RGTC2; // KTX_INTERNAL_COMPRESSED_SIGNED_RG_RGTC2 ?
+                header.glBaseInternalFormat = KTX_BASE_INTERNAL_RG;
+            }
+            else if (compressionOptions.format == Format_BC6) {
+                if (compressionOptions.pixelType == PixelType_Float) header.glInternalFormat = KTX_INTERNAL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT;
+                else /*if (compressionOptions.pixelType == PixelType_UnsignedFloat)*/ header.glInternalFormat = KTX_INTERNAL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT; // By default we assume unsigned.
+                header.glBaseInternalFormat = KTX_BASE_INTERNAL_RGB;
+            }
+            else if (compressionOptions.format == Format_BC7) {
+                header.glInternalFormat = outputOptions.srgb ? KTX_INTERNAL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM : KTX_INTERNAL_COMPRESSED_RGBA_BPTC_UNORM;
+                header.glBaseInternalFormat = KTX_BASE_INTERNAL_RGBA;
+            }
+            else if (compressionOptions.format == Format_ETC1) {
+                header.glInternalFormat = outputOptions.srgb ? KTX_INTERNAL_COMPRESSED_SRGB_ETC1 : KTX_INTERNAL_COMPRESSED_RGB_ETC1;
+                header.glBaseInternalFormat = KTX_BASE_INTERNAL_RGB;
+            }
+            else if (compressionOptions.format == Format_ETC2_R) {
+                header.glInternalFormat = KTX_INTERNAL_COMPRESSED_RED_EAC;
+                header.glBaseInternalFormat = KTX_BASE_INTERNAL_RED;
+            }
+            else if (compressionOptions.format == Format_ETC2_RG) {
+                header.glInternalFormat = KTX_INTERNAL_COMPRESSED_RG_EAC;
+                header.glBaseInternalFormat = KTX_BASE_INTERNAL_RG;
+            }
+            else if (compressionOptions.format == Format_ETC2_RGB) {
+                header.glInternalFormat = outputOptions.srgb ? KTX_INTERNAL_COMPRESSED_SRGB_ETC2 : KTX_INTERNAL_COMPRESSED_RGB_ETC2;
+                header.glBaseInternalFormat = KTX_BASE_INTERNAL_RGB;
+            }
+            else if (compressionOptions.format == Format_ETC2_RGBA) {
+                header.glInternalFormat = outputOptions.srgb ? KTX_INTERNAL_COMPRESSED_SRGB_ALPHA_ETC2_EAC : KTX_INTERNAL_COMPRESSED_RGBA_ETC2_EAC;
+                header.glBaseInternalFormat = KTX_BASE_INTERNAL_RGBA;
+            }
+            else {
+                supported = false;
+            }
+
+            //TODO compressionOptions.format == Format_DXT1n, Format_DXT5n ? There seems to be no way to indicate a normal map using ktx. Maybe via key value data?
+        }
+        
+        if (!supported)
+        {
+            // This container does not support the requested format.
+            outputOptions.error(Error_UnsupportedOutputFormat);
+            return false;
+        }
+
+        const uint headerSize = 64;
+        nvStaticCheck(sizeof(KtxHeader) == 64);
+
+        bool writeSucceed = outputOptions.writeData(&header, headerSize);
+        if (!writeSucceed)
+        {
+            outputOptions.error(Error_FileWrite);
+        }
+
+        return writeSucceed;
+    }
 
     return true;
 }
@@ -788,15 +1043,34 @@ CompressorInterface * Compressor::Private::chooseCpuCompressor(const Compression
     {
         return new CompressorBC7;
     }
-    /*else if (compressionOptions.format == Format_BC5_Luma)
-    {
-        return new ProductionCompressorBC5_Luma;
-    }*/
     else if (compressionOptions.format == Format_BC3_RGBM)
     {
         return new CompressorBC3_RGBM;
     }
-
+    else if (compressionOptions.format >= Format_ETC1 && compressionOptions.format <= Format_ETC2_RGB_A1)
+    {
+#if defined(HAVE_RGETC)
+        if (compressionOptions.format == Format_ETC1 && compressionOptions.externalCompressor == "rg_etc") return new RgEtcCompressor;
+#endif
+#if defined(HAVE_ETCLIB)
+        if (compressionOptions.externalCompressor == "etclib") return new EtcLibCompressor;
+#endif
+        if (compressionOptions.format == Format_ETC1) return new CompressorETC1;
+        else if (compressionOptions.format == Format_ETC2_R) return new CompressorETC2_R;
+        //else if (compressionOptions.format == Format_ETC2_RG) return new CompressorETC2_RG;
+        else if (compressionOptions.format == Format_ETC2_RGB) return new CompressorETC2_RGB;
+        else if (compressionOptions.format == Format_ETC2_RGBA) return new CompressorETC2_RGBA;
+    }
+    else if (compressionOptions.format == Format_ETC2_RGBM)
+    {
+        return new CompressorETC2_RGBM;
+    }
+    else if (compressionOptions.format >= Format_PVR_2BPP_RGB && compressionOptions.format <= Format_PVR_4BPP_RGBA)
+    {
+#if defined(HAVE_PVRTEXTOOL)
+        return new CompressorPVR;
+#endif
+    }
     return NULL;
 }
 
@@ -860,3 +1134,24 @@ CompressorInterface * Compressor::Private::chooseGpuCompressor(const Compression
 
     return NULL;
 }
+
+// Sums computeImageSize() over mipmapCount levels, halving each extent (never below 1) between levels.
+int Compressor::Private::estimateSize(int w, int h, int d, int mipmapCount, const CompressionOptions::Private & compressionOptions) const
+{
+    int total = 0;
+    int level = 0;
+
+    while (level < mipmapCount)
+    {
+        // Size of the current level, using the format, bit count and pitch alignment from the options.
+        total += computeImageSize(w, h, d, compressionOptions.bitcount, compressionOptions.pitchAlignment, compressionOptions.format);
+
+        // Extents of the next mipmap:
+        w = max(1, w / 2);
+        h = max(1, h / 2);
+        d = max(1, d / 2);
+        ++level;
+    }
+    return total;
+}
diff --git a/src/nvtt/Context.h b/src/nvtt/Context.h
index c497bb1..de801c0 100644
--- a/src/nvtt/Context.h
+++ b/src/nvtt/Context.h
@@ -56,6 +56,7 @@ namespace nvtt
         nv::CompressorInterface * chooseCpuCompressor(const CompressionOptions::Private & compressionOptions) const;
         nv::CompressorInterface * chooseGpuCompressor(const CompressionOptions::Private & compressionOptions) const;
 
+        int estimateSize(int w, int h, int d, int mipmapCount, const CompressionOptions::Private & compressionOptions) const;
 
         bool cudaSupported;
         bool cudaEnabled;
diff --git a/src/nvtt/OutputOptions.h b/src/nvtt/OutputOptions.h
index 5b9a750..df6080c 100644
--- a/src/nvtt/OutputOptions.h
+++ b/src/nvtt/OutputOptions.h
@@ -34,61 +34,61 @@
 namespace nvtt
 {
 
-	struct DefaultOutputHandler : public nvtt::OutputHandler
-	{
-		DefaultOutputHandler(const char * fileName) : stream(fileName) {}
+    struct DefaultOutputHandler : public nvtt::OutputHandler
+    {
+        DefaultOutputHandler(const char * fileName) : stream(fileName) {}
         DefaultOutputHandler(FILE * fp) : stream(fp, false) {}
-		
-		virtual ~DefaultOutputHandler() {}
-		
-		virtual void beginImage(int size, int width, int height, int depth, int face, int miplevel)
-		{
-			// ignore.
-		}
-		
-		// Output data.
-		virtual bool writeData(const void * data, int size)
-		{
-			stream.serialize(const_cast<void *>(data), size);
-
-			//return !stream.isError();
-			return true;
-		}
-
-		virtual void endImage()
-		{
-			// ignore.
-		}
-
-		nv::StdOutputStream stream;
-	};
-
-
-	struct OutputOptions::Private
-	{
-		nv::Path fileName;
+
+        virtual ~DefaultOutputHandler() {}
+
+        virtual void beginImage(int size, int width, int height, int depth, int face, int miplevel)
+        {
+            // ignore.
+        }
+
+        // Output data: hands 'data' and 'size' to stream.serialize().
+        virtual bool writeData(const void * data, int size)
+        {
+            stream.serialize(const_cast<void *>(data), size);
+            // Always reports success; the stream error check below is disabled.
+            //return !stream.isError();
+            return true;
+        }
+
+        virtual void endImage()
+        {
+            // ignore.
+        }
+
+        nv::StdOutputStream stream;
+    };
+
+
+    struct OutputOptions::Private
+    {
+        nv::Path fileName;
         FILE * fileHandle;
-		
-		OutputHandler * outputHandler;
-		ErrorHandler * errorHandler;
 
-		bool outputHeader;
-		Container container;
+        OutputHandler * outputHandler;
+        ErrorHandler * errorHandler;
+
+        bool outputHeader;
+        Container container;
         int version;
         bool srgb;
         bool deleteOutputHandler;
 
         void * wrapperProxy;    // For the C/C# wrapper.
-		
-		bool hasValidOutputHandler() const;
 
-		void beginImage(int size, int width, int height, int depth, int face, int miplevel) const;
-		bool writeData(const void * data, int size) const;
+        bool hasValidOutputHandler() const;
+
+        void beginImage(int size, int width, int height, int depth, int face, int miplevel) const;
+        bool writeData(const void * data, int size) const;
         void endImage() const;
-		void error(Error e) const;
-	};
+        void error(Error e) const;
+    };
+
 
-	
 } // nvtt namespace
 
 
diff --git a/src/nvtt/QuickCompressDXT.h b/src/nvtt/QuickCompressDXT.h
index dbfc824..f5d952f 100644
--- a/src/nvtt/QuickCompressDXT.h
+++ b/src/nvtt/QuickCompressDXT.h
@@ -39,21 +39,21 @@ namespace nv
 	struct AlphaBlockDXT5;
     class Vector3;
 
-	namespace QuickCompress
-	{
-		void compressDXT1(const ColorBlock & src, BlockDXT1 * dst);
-		void compressDXT1a(const ColorBlock & src, BlockDXT1 * dst);
-		
-		void compressDXT3(const ColorBlock & src, BlockDXT3 * dst);
-		
-		void compressDXT5A(const ColorBlock & src, AlphaBlockDXT5 * dst, int iterationCount=8);
-		void compressDXT5A(const AlphaBlock4x4 & src, AlphaBlockDXT5 * dst, int iterationCount=8);
-
-		void compressDXT5(const ColorBlock & src, BlockDXT5 * dst, int iterationCount=8);
+    namespace QuickCompress
+    {
+        void compressDXT1(const ColorBlock & src, BlockDXT1 * dst);
+        void compressDXT1a(const ColorBlock & src, BlockDXT1 * dst);
+
+        void compressDXT3(const ColorBlock & src, BlockDXT3 * dst);
+
+        void compressDXT5A(const ColorBlock & src, AlphaBlockDXT5 * dst, int iterationCount=8);
+        void compressDXT5A(const AlphaBlock4x4 & src, AlphaBlockDXT5 * dst, int iterationCount=8);
+
+        void compressDXT5(const ColorBlock & src, BlockDXT5 * dst, int iterationCount=8);
 
         void outputBlock4(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block);
         void outputBlock3(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block);
-	}
+    }
 } // nv namespace
 
 #endif // NV_TT_QUICKCOMPRESSDXT_H
diff --git a/src/nvtt/Surface.cpp b/src/nvtt/Surface.cpp
index 869f0f1..41421ca 100644
--- a/src/nvtt/Surface.cpp
+++ b/src/nvtt/Surface.cpp
@@ -23,12 +23,14 @@
 // OTHER DEALINGS IN THE SOFTWARE.
 
 #include "Surface.h"
+#include "CompressorETC.h" // for ETC decoder.
 
 #include "nvmath/Vector.inl"
 #include "nvmath/Matrix.inl"
 #include "nvmath/Color.h"
 #include "nvmath/Half.h"
 #include "nvmath/ftoi.h"
+#include "nvmath/PackedFloat.h"
 
 #include "nvimage/Filter.h"
 #include "nvimage/ImageIO.h"
@@ -39,8 +41,13 @@
 #include "nvimage/ErrorMetric.h"
 #include "nvimage/DirectDrawSurface.h"
 
+#include "nvthread/ParallelFor.h"
+
+#include "nvcore/Array.inl"
+
 #include <float.h>
 #include <string.h> // memset, memcpy
+//#include <stdio.h> // printf?
 
 #if NV_CC_GNUC
 #include <math.h> // exp2f and log2f
@@ -123,6 +130,18 @@ namespace
         else if (format == Format_BC7) {
             return 16;
         }
+        else if (format == Format_ETC1 || format == Format_ETC2_R || format == Format_ETC2_RGB) {
+            return 8;
+        }
+        else if (format == Format_ETC2_RG || format == Format_ETC2_RGBA || format == Format_ETC2_RGBM) {
+            return 16;
+        }
+        else if (format == Format_PVR_2BPP_RGB || format == Format_PVR_2BPP_RGBA) {
+            return 4;
+        }
+        else if (format == Format_PVR_4BPP_RGB || format == Format_PVR_4BPP_RGBA) {
+            return 8;
+        }
         return 0;
     }
 
@@ -197,7 +216,7 @@ uint nv::computeImageSize(uint w, uint h, uint d, uint bitCount, uint pitchAlign
     }
 }
 
-void nv::getTargetExtent(int * width, int * height, int * depth, int maxExtent, RoundMode roundMode, TextureType textureType) {
+void nv::getTargetExtent(int * width, int * height, int * depth, int maxExtent, RoundMode roundMode, TextureType textureType, nvtt::ShapeRestriction shapeRestriction /*= nvtt::ShapeRestriction_None*/) {
     nvDebugCheck(width != NULL && *width > 0);
     nvDebugCheck(height != NULL && *height > 0);
     nvDebugCheck(depth != NULL && *depth > 0);
@@ -234,21 +253,21 @@ void nv::getTargetExtent(int * width, int * height, int * depth, int maxExtent,
     // Round to power of two.
     if (roundMode == RoundMode_ToNextPowerOfTwo)
     {
-        w = nextPowerOfTwo(w);
-        h = nextPowerOfTwo(h);
-        d = nextPowerOfTwo(d);
+        w = nextPowerOfTwo(U32(w));
+        h = nextPowerOfTwo(U32(h));
+        d = nextPowerOfTwo(U32(d));
     }
     else if (roundMode == RoundMode_ToNearestPowerOfTwo)
     {
-        w = nearestPowerOfTwo(w);
-        h = nearestPowerOfTwo(h);
-        d = nearestPowerOfTwo(d);
+        w = nearestPowerOfTwo(U32(w));
+        h = nearestPowerOfTwo(U32(h));
+        d = nearestPowerOfTwo(U32(d));
     }
     else if (roundMode == RoundMode_ToPreviousPowerOfTwo)
     {
-        w = previousPowerOfTwo(w);
-        h = previousPowerOfTwo(h);
-        d = previousPowerOfTwo(d);
+        w = previousPowerOfTwo(U32(w));
+        h = previousPowerOfTwo(U32(h));
+        d = previousPowerOfTwo(U32(d));
     }
     else if (roundMode == RoundMode_ToNextMultipleOfFour)
     {
@@ -269,6 +288,38 @@ void nv::getTargetExtent(int * width, int * height, int * depth, int maxExtent,
         d = previousMultipleOfFour(d);
     }
 
+    if(shapeRestriction == ShapeRestriction_Square)
+    {
+        if (textureType == TextureType_2D)
+        {
+            int md = nv::min(w,h);
+            w = md;
+            h = md;
+            d = 1;
+        }
+        else if (textureType == TextureType_3D)
+        {
+            int md = nv::min(nv::min(w,h),d);
+            w = md;
+            h = md;
+            d = md;
+        }
+        else if (textureType == TextureType_Cube) 
+        {
+            int md = nv::min(w, h);
+            w = md;
+            h = md;
+            d = 1;
+        }
+    }
+    else 
+    {
+        if (textureType == TextureType_2D || textureType == TextureType_Cube)
+        {
+            d = 1;
+        }
+    }
+
     *width = w;
     *height = h;
     *depth = d;
@@ -509,8 +560,8 @@ void Surface::range(int channel, float * rangeMin, float * rangeMax, int alpha_c
         }
     }
 
-    *rangeMin = range.x;
-    *rangeMax = range.y;
+    if (rangeMin) *rangeMin = range.x;
+    if (rangeMax) *rangeMax = range.y;
 }
 
 bool Surface::load(const char * fileName, bool * hasAlpha/*= NULL*/)
@@ -583,7 +634,7 @@ bool Surface::load(const char * fileName, bool * hasAlpha/*= NULL*/)
     }
 
     // @@ Have loadFloat allocate the image with the desired number of channels.
-    img->resizeChannelCount(4);
+    //img->resizeChannelCount(4);
 
     delete m->image;
     m->image = img.release();
@@ -601,7 +652,8 @@ bool Surface::save(const char * fileName, bool hasAlpha/*=0*/, bool hdr/*=0*/) c
         return ImageIO::saveFloat(fileName, m->image, 0, 4);
     }
     else {
-        AutoPtr<Image> image(m->image->createImage(0, 4));
+        uint c = min<uint>(m->image->componentCount(), 4);
+        AutoPtr<Image> image(m->image->createImage(0, c));
         nvCheck(image != NULL);
 
         if (hasAlpha) {
@@ -829,16 +881,35 @@ bool Surface::setImage(InputFormat format, int w, int h, int d, const void * r,
     return true;
 }
 
+#if defined(HAVE_PVRTEXTOOL)
+#include <PVRTDecompress.h>
+#endif
+
 // @@ Add support for compressed 3D textures.
 bool Surface::setImage2D(Format format, Decoder decoder, int w, int h, const void * data)
 {
     if (format != nvtt::Format_BC1 &&
         format != nvtt::Format_BC2 &&
         format != nvtt::Format_BC3 &&
+        format != nvtt::Format_BC3n &&
+        format != nvtt::Format_BC3_RGBM &&
         format != nvtt::Format_BC4 &&
         format != nvtt::Format_BC5 &&
         format != nvtt::Format_BC6 &&
-        format != nvtt::Format_BC7)
+        format != nvtt::Format_BC7 &&
+        format != nvtt::Format_ETC1 &&
+        format != nvtt::Format_ETC2_R &&
+        format != nvtt::Format_ETC2_RG &&
+        format != nvtt::Format_ETC2_RGB &&
+        format != nvtt::Format_ETC2_RGBA &&
+        format != nvtt::Format_ETC2_RGBM
+    #if defined(HAVE_PVRTEXTOOL)
+        && format != nvtt::Format_PVR_2BPP_RGB
+        && format != nvtt::Format_PVR_4BPP_RGB
+        && format != nvtt::Format_PVR_2BPP_RGBA
+        && format != nvtt::Format_PVR_4BPP_RGBA
+    #endif
+        )
     {
         return false;
     }
@@ -851,7 +922,7 @@ bool Surface::setImage2D(Format format, Decoder decoder, int w, int h, const voi
     m->image->allocate(4, w, h, 1);
     m->type = TextureType_2D;
 
-    const int bw = (w + 3) / 4;
+    const int bw = (w + 3) / 4;     // @@ Not if PVR 2bpp!
     const int bh = (h + 3) / 4;
 
     const uint bs = blockSize(format);
@@ -859,130 +930,166 @@ bool Surface::setImage2D(Format format, Decoder decoder, int w, int h, const voi
     const uint8 * ptr = (const uint8 *)data;
 
     TRY {
-		if (format == nvtt::Format_BC6)
-		{
-			// BC6 format - decode directly to float
-
-			for (int y = 0; y < bh; y++)
-			{
-				for (int x = 0; x < bw; x++)
-				{
-                    Vector3 colors[16];
-                    const BlockBC6 * block = (const BlockBC6 *)ptr;
-					block->decodeBlock(colors);
-
-					for (int yy = 0; yy < 4; yy++)
-					{
-						for (int xx = 0; xx < 4; xx++)
-						{
-							Vector3 rgb = colors[yy*4 + xx];
-
-							if (x * 4 + xx < w && y * 4 + yy < h)
-							{
-								m->image->pixel(0, x*4 + xx, y*4 + yy, 0) = rgb.x;
-								m->image->pixel(1, x*4 + xx, y*4 + yy, 0) = rgb.y;
-								m->image->pixel(2, x*4 + xx, y*4 + yy, 0) = rgb.z;
-								m->image->pixel(3, x*4 + xx, y*4 + yy, 0) = 1.0f;
-							}
-						}
-					}
-
-					ptr += bs;
-				}
-			}
-		}
-		else
-		{
-			// Non-BC6 - decode to 8-bit, then convert to float
-
-			for (int y = 0; y < bh; y++)
-			{
-				for (int x = 0; x < bw; x++)
-				{
-					ColorBlock colors;
-
-					if (format == nvtt::Format_BC1)
-					{
-						const BlockDXT1 * block = (const BlockDXT1 *)ptr;
-
-						if (decoder == Decoder_D3D10) {
-							block->decodeBlock(&colors, false);
-						}
-						else if (decoder == Decoder_D3D9) {
-							block->decodeBlock(&colors, false);
-						}
-						else if (decoder == Decoder_NV5x) {
-							block->decodeBlockNV5x(&colors);
-						}
-					}
-					else if (format == nvtt::Format_BC2)
-					{
-						const BlockDXT3 * block = (const BlockDXT3 *)ptr;
-
-						if (decoder == Decoder_D3D10) {
-							block->decodeBlock(&colors, false);
-						}
-						else if (decoder == Decoder_D3D9) {
-							block->decodeBlock(&colors, false);
-						}
-						else if (decoder == Decoder_NV5x) {
-							block->decodeBlockNV5x(&colors);
-						}
-					}
-					else if (format == nvtt::Format_BC3)
-					{
-						const BlockDXT5 * block = (const BlockDXT5 *)ptr;
-
-						if (decoder == Decoder_D3D10) {
-							block->decodeBlock(&colors, false);
-						}
-						else if (decoder == Decoder_D3D9) {
-							block->decodeBlock(&colors, false);
-						}
-						else if (decoder == Decoder_NV5x) {
-							block->decodeBlockNV5x(&colors);
-						}
-					}
-					else if (format == nvtt::Format_BC4)
-					{
-						const BlockATI1 * block = (const BlockATI1 *)ptr;
-						block->decodeBlock(&colors, decoder == Decoder_D3D9);
-					}
-					else if (format == nvtt::Format_BC5)
-					{
-						const BlockATI2 * block = (const BlockATI2 *)ptr;
-						block->decodeBlock(&colors, decoder == Decoder_D3D9);
-					}
-					else if (format == nvtt::Format_BC7)
-					{
-						const BlockBC7 * block = (const BlockBC7 *)ptr;
-						block->decodeBlock(&colors);
-					}
-					else
-					{
-						nvDebugCheck(false);
-					}
-
-					for (int yy = 0; yy < 4; yy++)
-					{
-						for (int xx = 0; xx < 4; xx++)
-						{
-							Color32 c = colors.color(xx, yy);
-
-							if (x * 4 + xx < w && y * 4 + yy < h)
-							{
-								m->image->pixel(0, x*4 + xx, y*4 + yy, 0) = float(c.r) * 1.0f/255.0f;
-								m->image->pixel(1, x*4 + xx, y*4 + yy, 0) = float(c.g) * 1.0f/255.0f;
-								m->image->pixel(2, x*4 + xx, y*4 + yy, 0) = float(c.b) * 1.0f/255.0f;
-								m->image->pixel(3, x*4 + xx, y*4 + yy, 0) = float(c.a) * 1.0f/255.0f;
-							}
-						}
-					}
-
-					ptr += bs;
-				}
-			}
-		}
+#if defined(HAVE_PVRTEXTOOL)
+        if (format >= nvtt::Format_PVR_2BPP_RGB && format <= nvtt::Format_PVR_4BPP_RGBA)
+        {
+            bool two_bit_mode = (format == nvtt::Format_PVR_2BPP_RGB || format == nvtt::Format_PVR_2BPP_RGBA);
+
+            uint8 * output = new uint8[4 * w * h];
+
+            PVRTDecompressPVRTC(ptr, two_bit_mode, w, h, output);
+
+            for (int y = 0; y < h; y++) {
+                for (int x = 0; x < w; x++) {
+                    m->image->pixel(0, x, y, 0) = output[4*(y*w + x) + 0] / 255.0f;
+                    m->image->pixel(1, x, y, 0) = output[4*(y*w + x) + 1] / 255.0f;
+                    m->image->pixel(2, x, y, 0) = output[4*(y*w + x) + 2] / 255.0f;
+                    m->image->pixel(3, x, y, 0) = output[4*(y*w + x) + 3] / 255.0f;
+                }
+            }
+
+            delete [] output;
+        }
+        else
+#endif
+        if (format == nvtt::Format_BC6 || (format >= nvtt::Format_ETC1 && format <= nvtt::Format_ETC2_RGBM))
+        {
+            // Some formats we decode directly to float: each compressed block yields 16 RGBA
+            // values that are copied straight into the image.
+
+            for (int y = 0; y < bh; y++) {
+                for (int x = 0; x < bw; x++) {
+                    // Decoded texels of the current block, indexed as yy*4 + xx.
+                    Vector4 colors[16];
+
+                    if (format == nvtt::Format_BC6) {
+                        const BlockBC6 * block = (const BlockBC6 *)ptr;
+                        block->decodeBlock(colors);
+                    }
+                    else if (format == nvtt::Format_ETC1 || format == nvtt::Format_ETC2_RGB) {
+                        nv::decompress_etc(ptr, colors);
+                    }
+                    else if (format == nvtt::Format_ETC2_RGBA || format == nvtt::Format_ETC2_RGBM) {
+                        nv::decompress_etc_eac(ptr, colors);
+                    }
+                    else {
+                        // @@ Not implemented: Format_ETC2_R and Format_ETC2_RG (nv::decompress_eac)
+                        // and Format_ETC2_RGB_A1 (nv::decompress_etc). No decoder fills 'colors'
+                        // for these formats, so report failure instead of copying an unwritten
+                        // array into the image. The image has already been allocated at this
+                        // point, as is also the case when the CATCH handler below returns false.
+                        return false;
+                    }
+
+                    // Copy the block, skipping texels that fall outside the w x h image.
+                    for (int yy = 0; yy < 4; yy++) {
+                        for (int xx = 0; xx < 4; xx++) {
+                            Vector4 c = colors[yy*4 + xx];
+
+                            if (x * 4 + xx < w && y * 4 + yy < h) {
+                                m->image->pixel(0, x*4 + xx, y*4 + yy, 0) = c.x;
+                                m->image->pixel(1, x*4 + xx, y*4 + yy, 0) = c.y;
+                                m->image->pixel(2, x*4 + xx, y*4 + yy, 0) = c.z;
+                                m->image->pixel(3, x*4 + xx, y*4 + yy, 0) = c.w;
+                            }
+                        }
+                    }
+
+                    // Advance to the next compressed block.
+                    ptr += bs;
+                }
+            }
+        }
+        else
+        {
+            // Others, we decode to 8-bit, then convert to float
+
+            for (int y = 0; y < bh; y++) {
+                for (int x = 0; x < bw; x++) {
+                    ColorBlock colors;
+
+                    if (format == nvtt::Format_BC1)
+                    {
+                        const BlockDXT1 * block = (const BlockDXT1 *)ptr;
+
+                        if (decoder == Decoder_D3D10) {
+                            block->decodeBlock(&colors, false);
+                        }
+                        else if (decoder == Decoder_D3D9) {
+                            block->decodeBlock(&colors, false);
+                        }
+                        else if (decoder == Decoder_NV5x) {
+                            block->decodeBlockNV5x(&colors);
+                        }
+                    }
+                    else if (format == nvtt::Format_BC2)
+                    {
+                        const BlockDXT3 * block = (const BlockDXT3 *)ptr;
+
+                        if (decoder == Decoder_D3D10) {
+                            block->decodeBlock(&colors, false);
+                        }
+                        else if (decoder == Decoder_D3D9) {
+                            block->decodeBlock(&colors, false);
+                        }
+                        else if (decoder == Decoder_NV5x) {
+                            block->decodeBlockNV5x(&colors);
+                        }
+                    }
+                    else if (format == nvtt::Format_BC3 || format == nvtt::Format_BC3n || format == nvtt::Format_BC3_RGBM)
+                    {
+                        const BlockDXT5 * block = (const BlockDXT5 *)ptr;
+
+                        if (decoder == Decoder_D3D10) {
+                            block->decodeBlock(&colors, false);
+                        }
+                        else if (decoder == Decoder_D3D9) {
+                            block->decodeBlock(&colors, false);
+                        }
+                        else if (decoder == Decoder_NV5x) {
+                            block->decodeBlockNV5x(&colors);
+                        }
+                    }
+                    else if (format == nvtt::Format_BC4)
+                    {
+                        const BlockATI1 * block = (const BlockATI1 *)ptr;
+                        block->decodeBlock(&colors, decoder == Decoder_D3D9);
+                    }
+                    else if (format == nvtt::Format_BC5)
+                    {
+                        const BlockATI2 * block = (const BlockATI2 *)ptr;
+                        block->decodeBlock(&colors, decoder == Decoder_D3D9);
+                    }
+                    else if (format == nvtt::Format_BC7)
+                    {
+                        const BlockBC7 * block = (const BlockBC7 *)ptr;
+                        block->decodeBlock(&colors);
+                    }
+                    else
+                    {
+                        nvDebugCheck(false);
+                    }
+
+                    for (int yy = 0; yy < 4; yy++)
+                    {
+                        for (int xx = 0; xx < 4; xx++)
+                        {
+                            Color32 c = colors.color(xx, yy);
+
+                            if (x * 4 + xx < w && y * 4 + yy < h)
+                            {
+                                m->image->pixel(0, x*4 + xx, y*4 + yy, 0) = float(c.r) * 1.0f/255.0f;
+                                m->image->pixel(1, x*4 + xx, y*4 + yy, 0) = float(c.g) * 1.0f/255.0f;
+                                m->image->pixel(2, x*4 + xx, y*4 + yy, 0) = float(c.b) * 1.0f/255.0f;
+                                m->image->pixel(3, x*4 + xx, y*4 + yy, 0) = float(c.a) * 1.0f/255.0f;
+                            }
+                        }
+                    }
+
+                    ptr += bs;
+                }
+            }
+        }
     }
     CATCH {
         return false;
@@ -1092,7 +1199,7 @@ void Surface::resize(int w, int h, int d, ResizeFilter filter, float filterWidth
     m->image = img;
 }
 
-void Surface::resize_make_square(int maxExtent, RoundMode roundMode, ResizeFilter filter)
+void Surface::resizeMakeSquare(int maxExtent, RoundMode roundMode, ResizeFilter filter)
 {
     if (isNull()) return;
 
@@ -1104,27 +1211,17 @@ void Surface::resize_make_square(int maxExtent, RoundMode roundMode, ResizeFilte
     int h = m->image->height();
     int d = m->image->depth();
 
-    getTargetExtent(&w, &h, &d, maxExtent, roundMode, m->type);
+    getTargetExtent(&w, &h, &d, maxExtent, roundMode, m->type, nvtt::ShapeRestriction_Square);
 
     if (m->type == TextureType_2D) 
     {
         nvDebugCheck(d==1);
-        int md = nv::min(w,h);
-        w = md;
-        h = md;
     }
     else if (m->type == TextureType_Cube)
     {
         nvDebugCheck(d==1);
         nvDebugCheck(w==h);
     }
-    else if (m->type == TextureType_3D)
-    {
-        int md = nv::min(nv::min(w,h),d);
-        w = md;
-        h = md;
-        d = md;
-    }
 
     resize(w, h, d, filter, filterWidth, params);
 }
@@ -1151,6 +1248,63 @@ void Surface::resize(int maxExtent, RoundMode roundMode, ResizeFilter filter, fl
     resize(w, h, d, filter, filterWidth, params);
 }
 
+// Error metric used by Surface::autoResize: forwards both images to nv::rmsBilinearColorError with the original's wrap mode, passing whether original.alphaMode() is AlphaMode_Transparency as the last argument.
+float rmsBilinearError(nvtt::Surface original, nvtt::Surface resized) {
+    return nv::rmsBilinearColorError(original.m->image, resized.m->image, (FloatImage::WrapMode)original.wrapMode(), original.alphaMode() == AlphaMode_Transparency);
+}
+
+// Repeatedly halves this surface, keeping each smaller version while rmsBilinearError(original, resized) stays strictly below errorTolerance. NOTE(review): 'mode' is not used by this implementation.
+void Surface::autoResize(float errorTolerance, RoundMode mode, ResizeFilter filter)
+{
+    Surface original = *this;
+    Surface resized = original;
+
+    int w = width();
+    int h = height();
+    int d = depth();
+    // First candidate: half of the current extents, rounding up (an extent of 1 stays 1).
+    w = (w + 1) / 2;
+    h = (h + 1) / 2;
+    d = (d + 1) / 2;
+
+    while (w >= 4 && h >= 4 && d >= 1) {
+        // Candidates narrower or shorter than 4 texels are never tried.
+        // Resize always from original? This is more expensive, but should produce higher quality.
+        //resized = original;
+        // As written, 'resized' is shrunk cumulatively: each pass resizes the previous result.
+        resized.resize(w, h, d, filter);
+
+#if 0
+        // Scale back up to original size. @@ Upscaling not implemented!
+        Surface restored = resized;
+        restored.resize(original.width(), original.height(), original.depth(), ResizeFilter_Triangle);
+
+        float error;
+        if (isNormalMap()) {
+            error = nvtt::angularError(original, restored);
+        }
+        else {
+            error = nvtt::rmsError(original, restored);
+        }
+#else
+        float error = rmsBilinearError(original, resized);
+#endif
+        // Commit this size only if it is within tolerance; otherwise keep the last accepted one and stop.
+        if (error < errorTolerance) {
+            *this = resized;
+            nvDebug("image resized %dx%d -> %dx%d (error=%f)\n", original.width(), original.height(), w, h, error);
+        }
+        else {
+            nvDebug("image can't be resized further (error=%f)\n", error);
+            break;
+        }
+        // Next candidate: halve again, rounding up.
+        w = (w + 1) / 2;
+        h = (h + 1) / 2;
+        d = (d + 1) / 2;
+    }
+}
+
 bool Surface::canMakeNextMipmap(int min_size /*= 1*/)
 {
     if (isNull()) return false;
@@ -1196,7 +1350,7 @@ bool Surface::buildNextMipmap(MipmapFilter filter, float filterWidth, const floa
         {
             nvDebugCheck(filter == MipmapFilter_Kaiser);
             KaiserFilter filter(filterWidth);
-            if (params != NULL) filter.setParameters(params[0], params[1]);
+            if (params != NULL) filter.setParameters(/*alpha=*/params[0], /*stretch=*/params[1]);
             img = img->downSample(filter, wrapMode, 3);
         }
     }
@@ -1357,8 +1511,9 @@ void Surface::toSrgb()
     for (uint c = 0; c < 3; c++) {
         float * channel = img->channel(c);
         for (uint i = 0; i < count; i++) {
+        //parallel_for(count, 128, [=](int i) {
             channel[i] = ::toSrgb(channel[i]);
-        }
+        }//);
     }
 }
 
@@ -1382,8 +1537,9 @@ void Surface::toLinearFromSrgb()
     for (uint c = 0; c < 3; c++) {
         float * channel = img->channel(c);
         for (uint i = 0; i < count; i++) {
+        //parallel_for(count, 128, [=](int i) {
             channel[i] = ::fromSrgb(channel[i]);
-        }
+        }//);
     }
 }
 
@@ -2827,6 +2983,78 @@ Surface Surface::createSubImage(int x0, int x1, int y0, int y1, int z0, int z1)
     return s;
 }
 
+// Returns a new w x h surface: for each texel, warp_function maps (x/w, y/h, 0) in place and this surface is sampled with sampleLinearClamp at the resulting (fx, fy). Returns a default-constructed surface if this one is null or warp_function is NULL.
+Surface Surface::warp(int w, int h, WarpFunction * warp_function) const
+{
+    Surface s;
+    if (isNull() || warp_function == NULL) return s;    // Nothing to sample from, or no mapping to apply.
+
+    FloatImage * img = s.m->image = new FloatImage;
+    const int C = m->image->componentCount();
+    img->allocate(C, w, h, 1);
+
+#define USE_PARALLEL_FOR 0
+#if USE_PARALLEL_FOR
+    nv::parallel_for(h, 1, [=](int y) {
+#else
+    for (int y = 0; y < h; y++) {
+#endif
+        for (int x = 0; x < w; x++) {
+            float fx = (float(x) + 0.0f) / w;
+            float fy = (float(y) + 0.0f) / h;
+            float fz = 0;
+
+            warp_function(fx, fy, fz);
+
+            for (int c = 0; c < C; c++) {
+                img->pixel(c, x, y, 0) = m->image->sampleLinearClamp(c, fx, fy);
+            }
+        }
+    }
+#if USE_PARALLEL_FOR
+    );
+#endif
+
+    return s;
+}
+
+// Volume variant: returns a new w x h x d surface where warp_function maps (x/w, y/h, z/d) in place and this surface is sampled with sampleLinearClamp at the result. Returns a default-constructed surface if this one is null or warp_function is NULL.
+Surface Surface::warp(int w, int h, int d, WarpFunction * warp_function) const
+{
+    Surface s;
+    if (isNull() || warp_function == NULL) return s;    // Nothing to sample from, or no mapping to apply.
+
+    FloatImage * img = s.m->image = new FloatImage;
+    const int C = m->image->componentCount();
+    img->allocate(C, w, h, d);
+    for (int z = 0; z < d; z++) {
+#define USE_PARALLEL_FOR 0
+#if USE_PARALLEL_FOR
+        nv::parallel_for(h, 1, [=](int y) {
+#else
+        for (int y = 0; y < h; y++) {
+#endif
+            for (int x = 0; x < w; x++) {
+                float fx = (float(x) + 0.0f) / w;
+                float fy = (float(y) + 0.0f) / h;
+                float fz = (float(z) + 0.0f) / d;
+
+                warp_function(fx, fy, fz);
+
+                for (int c = 0; c < C; c++) {
+                    img->pixel(c, x, y, z) = m->image->sampleLinearClamp(c, fx, fy, fz);    // @@ 2D only.
+                }
+            }
+        }
+#if USE_PARALLEL_FOR
+        );
+#endif
+    }
+
+    return s;
+}
+
+
 bool Surface::copyChannel(const Surface & srcImage, int srcChannel)
 {
     return copyChannel(srcImage, srcChannel, srcChannel);
@@ -2953,7 +3181,7 @@ void Surface::setAtlasBorder(int aw, int ah, float r, float g, float b, float a)
         }
 
         // Vertical lines:
-        for (uint i = 0, x = 0; i < uint(ah); i++, x += tile_width)
+        for (uint i = 0, x = 0; i < uint(aw); i++, x += tile_width)
         {
             for (uint y = 0; y < h; y++)
             {
@@ -3083,9 +3311,9 @@ Surface nvtt::histogram(const Surface & img, int width, int height)
     return histogram(img, /*minRange*/0, maxRange, width, height);
 }
 
-#include "nvcore/Array.inl"
-#include "nvmath/PackedFloat.h"
-#include <stdio.h>
+//#include "nvcore/Array.inl"
+//#include "nvmath/PackedFloat.h"
+//#include <stdio.h>
 
 nvtt::Surface nvtt::histogram(const Surface & img, float minRange, float maxRange, int width, int height)
 {
@@ -3234,7 +3462,7 @@ nvtt::Surface nvtt::histogram(const Surface & img, float minRange, float maxRang
         maxh = nv::max(maxh, nv::max3(buckets[i].x, buckets[i].y, buckets[i].z));
     }
 
-    printf("maxh = %f\n", maxh);
+    //printf("maxh = %f\n", maxh);
     //maxh = 80;
     maxh = 256;
 
diff --git a/src/nvtt/Surface.h b/src/nvtt/Surface.h
index 419a0e1..73f7574 100644
--- a/src/nvtt/Surface.h
+++ b/src/nvtt/Surface.h
@@ -83,7 +83,7 @@ namespace nv {
     uint countMipmaps(uint w, uint h, uint d);
     uint countMipmapsWithMinSize(uint w, uint h, uint d, uint min_size);
     uint computeImageSize(uint w, uint h, uint d, uint bitCount, uint alignmentInBytes, nvtt::Format format);
-    void getTargetExtent(int * w, int * h, int * d, int maxExtent, nvtt::RoundMode roundMode, nvtt::TextureType textureType);
+    void getTargetExtent(int * w, int * h, int * d, int maxExtent, nvtt::RoundMode roundMode, nvtt::TextureType textureType, nvtt::ShapeRestriction shapeRestriction = nvtt::ShapeRestriction_None);
 }
 
 
diff --git a/src/nvtt/TaskDispatcher.h b/src/nvtt/TaskDispatcher.h
index c7224d0..168ba56 100644
--- a/src/nvtt/TaskDispatcher.h
+++ b/src/nvtt/TaskDispatcher.h
@@ -10,8 +10,8 @@
 // Gran Central Dispatch (GCD/libdispatch)
 // http://developer.apple.com/mac/library/documentation/Performance/Reference/GCD_libdispatch_Ref/Reference/reference.html
 #if NV_OS_DARWIN && defined(HAVE_DISPATCH_H)
-#define HAVE_GCD 1
-#include <dispatch/dispatch.h>
+//#define HAVE_GCD 1
+//#include <dispatch/dispatch.h>
 #endif
 
 // Parallel Patterns Library (PPL) is part of Microsoft's concurrency runtime: 
@@ -64,7 +64,7 @@ namespace nvtt {
 
 #endif
 
-#if NV_OS_DARWIN && defined(HAVE_DISPATCH_H)
+#if HAVE_GCD
 
     // Task dispatcher using Apple's Grand Central Dispatch.
     struct AppleTaskDispatcher : public TaskDispatcher
diff --git a/src/nvtt/nvtt.cpp b/src/nvtt/nvtt.cpp
index b85d52e..bfdb0d1 100644
--- a/src/nvtt/nvtt.cpp
+++ b/src/nvtt/nvtt.cpp
@@ -47,9 +47,9 @@ const char * nvtt::errorString(Error e)
             return "Error writing through output handler";
         case Error_UnsupportedOutputFormat:
             return "The container file does not support the selected output format";
+        default:
+            return "Invalid error";
     }
-
-    return "Invalid error";
 }
 
 // Return NVTT version.
diff --git a/src/nvtt/nvtt.h b/src/nvtt/nvtt.h
index b695490..d86a503 100644
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@@ -105,7 +105,21 @@ namespace nvtt
         Format_BC6,
         Format_BC7,
 
-        Format_BC3_RGBM,    // 
+        Format_BC3_RGBM,
+
+        Format_ETC1,
+        Format_ETC2_R,
+        Format_ETC2_RG,
+        Format_ETC2_RGB,
+        Format_ETC2_RGBA,
+        Format_ETC2_RGB_A1,
+
+        Format_ETC2_RGBM,
+
+        Format_PVR_2BPP_RGB,     // Using PVR textools.
+        Format_PVR_4BPP_RGB,
+        Format_PVR_2BPP_RGBA,
+        Format_PVR_4BPP_RGBA,
 
         Format_Count
     };
@@ -155,6 +169,7 @@ namespace nvtt
         NVTT_API void setFormat(Format format);
         NVTT_API void setQuality(Quality quality);
         NVTT_API void setColorWeights(float red, float green, float blue, float alpha = 1.0f);
+        NVTT_API void setRGBMThreshold(float min_m);
 
         NVTT_API void setExternalCompressor(const char * name);
 
@@ -173,9 +188,10 @@ namespace nvtt
         NVTT_API void setTargetDecoder(Decoder decoder);
 
         // Translate to and from D3D formats.
+        NVTT_API Format format() const;
         NVTT_API unsigned int d3d9Format() const;
+        NVTT_API unsigned int dxgiFormat() const;
         //NVTT_API bool setD3D9Format(unsigned int format);
-        //NVTT_API unsigned int dxgiFormat() const;
         //NVTT_API bool setDxgiFormat(unsigned int format);
     };
 
@@ -253,6 +269,14 @@ namespace nvtt
         AlphaMode_Premultiplied,
     };
 
+    // Extents shape restrictions, applied by nv::getTargetExtent after the extents have been rounded.
+    enum ShapeRestriction
+    {
+        ShapeRestriction_None,      // Width, height and depth are not forced to match each other.
+        ShapeRestriction_Square,    // Width and height (and depth for 3D textures) are set to the smallest of them.
+    };
+
+
     // Input options. Specify format and layout of the input texture. (Deprecated in NVTT 2.1)
     struct InputOptions
     {
@@ -344,7 +368,7 @@ namespace nvtt
     {
         Container_DDS,
         Container_DDS10,
-        // Container_KTX,   // Khronos Texture: http://www.khronos.org/opengles/sdk/tools/KTX/
+        Container_KTX,   // Khronos Texture: http://www.khronos.org/opengles/sdk/tools/KTX/
         // Container_VTF,   // Valve Texture Format: http://developer.valvesoftware.com/wiki/Valve_Texture_Format
     };
 
@@ -439,6 +463,9 @@ namespace nvtt
         ToneMapper_Lightmap,
     };
 
+    // Callback for Surface::warp: receives the normalized destination coordinates (x/w, y/h, z/d; d is 0 in the 2D overload) by reference and may overwrite them; the source is then sampled at the resulting values.
+    typedef void WarpFunction(float & x, float & y, float & d);
+
 
     // A surface is one level of a 2D or 3D texture. (New in NVTT 2.1)
     // @@ It would be nice to add support for texture borders for correct resizing of tiled textures and constrained DXT compression.
@@ -486,7 +513,8 @@ namespace nvtt
         NVTT_API void resize(int w, int h, int d, ResizeFilter filter, float filterWidth, const float * params = 0);
         NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter);
         NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter, float filterWidth, const float * params = 0);
-        NVTT_API void resize_make_square(int maxExtent, RoundMode roundMode, ResizeFilter filter);
+        NVTT_API void resizeMakeSquare(int maxExtent, RoundMode roundMode, ResizeFilter filter);
+        NVTT_API void autoResize(float errorTolerance, RoundMode mode, ResizeFilter filter);
 
         NVTT_API bool buildNextMipmap(MipmapFilter filter, int min_size = 1);
         NVTT_API bool buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params = 0, int min_size = 1);
@@ -554,6 +582,10 @@ namespace nvtt
         NVTT_API void flipZ();
         NVTT_API Surface createSubImage(int x0, int x1, int y0, int y1, int z0, int z1) const;
 
+        NVTT_API Surface warp(int w, int h, WarpFunction * f) const;
+        NVTT_API Surface warp(int w, int h, int d, WarpFunction * f) const;
+
+
         // Copy image data.
         NVTT_API bool copyChannel(const Surface & srcImage, int srcChannel);
         NVTT_API bool copyChannel(const Surface & srcImage, int srcChannel, int dstChannel);
diff --git a/src/nvtt/tests/testsuite.cpp b/src/nvtt/tests/testsuite.cpp
index aceac51..652dcfc 100644
--- a/src/nvtt/tests/testsuite.cpp
+++ b/src/nvtt/tests/testsuite.cpp
@@ -146,9 +146,16 @@ static const char * s_witnessImageSet[] = {
 };
 
 static const char * s_witnessLmapImageSet[] = {
-    "specruin.dds",
-    "cottage.dds",
+    "hallway.dds",
+    "windmill.dds",
+    "tunnel.dds",
+    "theater.dds",
     "tower.dds",
+    "hub.dds",
+    "mine.dds",
+    "archway.dds",
+    "hut.dds",
+    "shaft.dds",
 };
 
 static const char * s_normalMapImageSet[] = {
@@ -187,8 +194,14 @@ enum Mode {
     Mode_BC5_Normal_Paraboloid,
     Mode_BC5_Normal_Quartic,
     //Mode_BC5_Normal_DualParaboloid,
-	Mode_BC6,
-	Mode_BC7,
+    Mode_BC6,
+    Mode_BC7,
+    Mode_ETC1_IC,
+    Mode_ETC1_EtcLib,
+    Mode_ETC2_EtcLib,
+    Mode_ETC1_RgEtc,
+    Mode_ETC2_RGBM,
+    Mode_PVR,
     Mode_Count
 };
 static const char * s_modeNames[] = {
@@ -207,8 +220,14 @@ static const char * s_modeNames[] = {
     "BC5-Normal-Paraboloid",        // Mode_BC5_Normal_Paraboloid,
     "BC5-Normal-Quartic",           // Mode_BC5_Normal_Quartic,
     //"BC5-Normal-DualParaboloid",    // Mode_BC5_Normal_DualParaboloid,
-	"BC6",			// Mode_BC6,
-	"BC7",			// Mode_BC7,
+    "BC6",          // Mode_BC6,
+    "BC7",          // Mode_BC7,
+    "ETC1-IC",
+    "ETC1-EtcLib",
+    "ETC2-EtcLib",
+    "ETC1-RgEtc",
+    "ETC2-RGBM",
+    "PVR",
 };
 nvStaticCheck(NV_ARRAY_SIZE(s_modeNames) == Mode_Count);
 
@@ -218,14 +237,16 @@ struct Test {
     Mode modes[6];
 };
 static Test s_imageTests[] = {
-    {"Color", 3, {Mode_BC1, Mode_BC3_YCoCg, Mode_BC3_RGBM, /*Mode_BC3_LUVW*/}},
-    {"Alpha", 3, {Mode_BC1_Alpha, Mode_BC2_Alpha, Mode_BC3_Alpha}},
-    //{"Normal", 3, {Mode_BC1_Normal, Mode_BC3_Normal, Mode_BC5_Normal}},
-    {"Normal", 4, {Mode_BC5_Normal, Mode_BC5_Normal_Stereographic, Mode_BC5_Normal_Paraboloid, Mode_BC5_Normal_Quartic}},
-    {"Lightmap", 4, {Mode_BC1, Mode_BC3_YCoCg, Mode_BC3_RGBM, Mode_BC3_RGBS}},
-	{"HDR", 2, {Mode_BC3_RGBM, Mode_BC6}},
-	{"BC6", 1, {Mode_BC6}},
-	{"BC7", 1, {Mode_BC7}},
+/*0*/   {"Color", 3, {Mode_BC1, Mode_BC3_YCoCg, Mode_BC3_RGBM, /*Mode_BC3_LUVW*/}},
+/*1*/   {"Alpha", 3, {Mode_BC1_Alpha, Mode_BC2_Alpha, Mode_BC3_Alpha}},
+/*2*/   {"Normal", 4, {Mode_BC5_Normal, Mode_BC5_Normal_Stereographic, Mode_BC5_Normal_Paraboloid, Mode_BC5_Normal_Quartic}},
+/*3*/   {"Lightmap", 4, {Mode_BC1, Mode_BC3_YCoCg, Mode_BC3_RGBM, Mode_BC3_RGBS}},
+/*4*/   {"HDR", 3, {Mode_ETC2_RGBM, Mode_BC3_RGBM, Mode_BC6}},
+/*5*/   {"BC6", 1, {Mode_BC6}},
+/*6*/   {"BC7", 1, {Mode_BC7}},
+/*7*/   {"ETC", 3, {Mode_ETC1_IC, Mode_ETC1_RgEtc, Mode_ETC2_EtcLib}},
+/*8*/   {"Color Mobile", 4, {Mode_PVR, Mode_ETC1_IC, Mode_ETC2_EtcLib, Mode_BC1}},
+/*9*/   //{"ETC-Lightmap", 2, {Mode_BC3_RGBM, Mode_ETC_RGBM}},
 };
 const int s_imageTestCount = ARRAY_SIZE(s_imageTests);
 
@@ -404,10 +425,10 @@ int main(int argc, char *argv[])
                 i++;
             }
         }
-		else
-		{
-			printf("Warning: unrecognized option \"%s\"\n", argv[i]);
-		}
+        else
+        {
+            printf("Warning: unrecognized option \"%s\"\n", argv[i]);
+        }
     }
 
     // Validate inputs.
@@ -462,7 +483,8 @@ int main(int argc, char *argv[])
     }
     else
     {
-        compressionOptions.setQuality(nvtt::Quality_Production);
+        compressionOptions.setQuality(nvtt::Quality_Normal);
+        //compressionOptions.setQuality(nvtt::Quality_Production);
     }
     //compressionOptions.setExternalCompressor("ati");
     //compressionOptions.setExternalCompressor("squish");
@@ -515,13 +537,13 @@ int main(int argc, char *argv[])
 
     // Labels on the left side.
     if (errorMode == ErrorMode_RMSE) {
-        graphWriter << "&chxr=0,1," << set.fileCount << ",1|1,0,0.05,0.01";
+        graphWriter << "&chxr=0,1," << set.fileCount << ",1|1,0,0.03,0.01";
     }
     else if (errorMode == ErrorMode_CieLab) {
        graphWriter << "&chxr=0,1," << set.fileCount << ",1|1,4,22,1";
     }
     else if (errorMode == ErrorMode_AngularRMSE) {
-        graphWriter << "&chxr=0,1," << set.fileCount << ",1|1,0,0.05,0.01";
+        graphWriter << "&chxr=0,1," << set.fileCount << ",1|1,0,0.2,0.02";      // 0.05,0.01
     }
 
     // Labels at the bottom.
@@ -581,7 +603,6 @@ int main(int argc, char *argv[])
     else if (errorMode == ErrorMode_AngularRMSE) {
         graphWriter << "&chtt=" << set.name << "%20-%20" << test.name << "%20-%20Angular RMSE";
     }
-    
 
 
     Timer timer;
@@ -590,7 +611,7 @@ int main(int argc, char *argv[])
 
     nvtt::Surface img;
 
-    printf("Running Test: %s with Set: %s\n", test.name, set.name);
+    printf("Running test '%s' with set '%s'\n", test.name, set.name);
 
     graphWriter << "&chd=t:";
 
@@ -602,10 +623,11 @@ int main(int argc, char *argv[])
         Mode mode = test.modes[t];
 
         nvtt::Format format;
+        const char * compressor_name = NULL;
         if (mode == Mode_BC1 || mode == Mode_BC1_Alpha || mode == Mode_BC1_Normal || mode == Mode_BC3_RGBS) {
             format = nvtt::Format_BC1;
         }
-        else if (mode == Mode_BC3_Alpha || mode == Mode_BC3_YCoCg || mode == Mode_BC3_RGBM || mode == Mode_BC3_LUVW) {
+        else if (mode == Mode_BC3_Alpha || mode == Mode_BC3_YCoCg || mode == Mode_BC3_LUVW) {
             format = nvtt::Format_BC3;
         }
         else if (mode == Mode_BC3_Normal) {
@@ -614,20 +636,51 @@ int main(int argc, char *argv[])
         else if (mode == Mode_BC5_Normal || mode == Mode_BC5_Normal_Stereographic || mode == Mode_BC5_Normal_Paraboloid || mode == Mode_BC5_Normal_Quartic) {
             format = nvtt::Format_BC5;
         }
-		else if (mode == Mode_BC6)
-		{
-			format = nvtt::Format_BC6;
-		}
-		else if (mode == Mode_BC7)
-		{
-			format = nvtt::Format_BC7;
-		}
-		else
-		{
-			nvDebugCheck(false);
-		}
+        else if (mode == Mode_BC3_RGBM) {
+            format = nvtt::Format_BC3_RGBM;
+        }
+        else if (mode == Mode_BC6)
+        {
+            format = nvtt::Format_BC6;
+        }
+        else if (mode == Mode_BC7)
+        {
+            format = nvtt::Format_BC7;
+        }
+        else if (mode == Mode_ETC1_IC)
+        {
+            format = nvtt::Format_ETC1;
+        }
+        else if (mode == Mode_ETC1_EtcLib)
+        {
+            format = nvtt::Format_ETC1;
+            compressor_name = "etclib";
+        }
+        else if (mode == Mode_ETC2_EtcLib)
+        {
+            format = nvtt::Format_ETC2_RGB;
+            compressor_name = "etclib";
+        }
+        else if (mode == Mode_ETC1_RgEtc)
+        {
+            format = nvtt::Format_ETC1;
+            compressor_name = "rg_etc";
+        }
+        else if (mode == Mode_ETC2_RGBM)
+        {
+            format = nvtt::Format_ETC2_RGBM;
+        }
+        else if (mode == Mode_PVR)
+        {
+            format = nvtt::Format_PVR_4BPP_RGB;
+        }
+        else
+        {
+            nvUnreachable();
+        }
         
         compressionOptions.setFormat(format);
+        if (compressor_name) compressionOptions.setExternalCompressor(compressor_name);
 
         if (set.type == ImageType_RGBA) {
             img.setAlphaMode(nvtt::AlphaMode_Transparency);
@@ -653,6 +706,7 @@ int main(int argc, char *argv[])
                 printf("Input image '%s' not found.\n", set.fileNames[i]);
                 return EXIT_FAILURE;
             }
+            float color_range = 0.0f;
 
             if (img.isNormalMap()) {
                 img.normalizeNormalMap();
@@ -693,16 +747,34 @@ int main(int argc, char *argv[])
                 tmp.clamp(2);
                 tmp.clamp(3);
             }
-            else if (mode == Mode_BC3_RGBM) {
-                tmp.setAlphaMode(nvtt::AlphaMode_None);
-                if (set.type == ImageType_HDR) {
-					// Transform to gamma-2.0 space before applying RGBM - helps a lot with banding in the darks.
-					tmp.toGamma(2.0f);
-                    tmp.toRGBM(3.0f);	// range of 3.0 in gamma-2.0 space == range of 9.0 in linear space
+            else if (mode == Mode_BC3_RGBM || mode == Mode_ETC2_RGBM) {
+                float r, g, b;
+                tmp.range(0, NULL, &r);
+                tmp.range(1, NULL, &g);
+                tmp.range(2, NULL, &b);
+                color_range = max3(r, g, b);
+                printf("color range = %f\n", color_range);
+
+                tmp.setAlphaMode(nvtt::AlphaMode_Transparency);
+
+                const float max_color_range = 16.0f;
+
+                // Keep the range in (0, max_color_range]; it is used as a divisor (1.0f / color_range) below.
+                if (color_range > max_color_range) color_range = max_color_range;
+                else if (color_range <= 0.0f) color_range = 1.0f;
-                else {
-                    tmp.toRGBM();
+
+                for (int i = 0; i < 3; i++) {
+                    tmp.scaleBias(i, 1.0f / color_range, 0.0f);
                 }
+                tmp.toneMap(nvtt::ToneMapper_Linear, /*parameters=*/NULL); // Clamp without changing the hue.
+
+                // Clamp alpha.
+                tmp.clamp(3);
+
+                // To gamma.
+                tmp.toGamma(2);
+
+                compressionOptions.setRGBMThreshold(0.2f);
             }
             else if (mode == Mode_BC3_LUVW) {
                 tmp.setAlphaMode(nvtt::AlphaMode_None);
@@ -781,14 +853,25 @@ int main(int argc, char *argv[])
                     }*/
                 }
             }
-            else if (mode == Mode_BC3_RGBM) {
-                if (set.type == ImageType_HDR) {
-                    img_out.fromRGBM(3.0f);
-					img_out.toLinear(2.0f);
+            else if (mode == Mode_BC3_RGBM || mode == Mode_ETC2_RGBM) {
+                /*if (set.type == ImageType_HDR) {
+                    //img_out.fromRGBM(3.0f);
+                    img_out.fromRGBM(range);
+                    img_out.toLinear(2.0f);
                 }
                 else {
                     img_out.fromRGBM();
+                }*/
+
+                img_out.fromRGBM(1.0f, 0.2f);
+                img_out.toLinear(2);
+
+                for (int i = 0; i < 3; i++) {
+                    img_out.scaleBias(i, color_range, 0.0f);
                 }
+
+                img_out.copyChannel(img, 3);          // Copy alpha channel from source.
+                img_out.setAlphaMode(nvtt::AlphaMode_Transparency);
             }
             else if (mode == Mode_BC3_LUVW) {
                 if (set.type == ImageType_HDR) {
diff --git a/src/nvtt/tools/cmdline.h b/src/nvtt/tools/cmdline.h
index 7617ae7..f46930d 100644
--- a/src/nvtt/tools/cmdline.h
+++ b/src/nvtt/tools/cmdline.h
@@ -61,6 +61,9 @@ struct MyAssertHandler : public nv::AssertHandler {
     virtual int assertion( const char *exp, const char *file, int line, const char *func, const char *msg, va_list arg ) {
         fprintf(stderr, "Assertion failed: %s\nIn %s:%d\n", exp, file, line);
         nv::debug::dumpInfo();
+        if (nv::debug::isDebuggerPresent()) {
+            return NV_ABORT_DEBUG;
+        }
         exit(1);
     }
 };
diff --git a/src/nvtt/tools/compress.cpp b/src/nvtt/tools/compress.cpp
index 412ba5a..36142f1 100644
--- a/src/nvtt/tools/compress.cpp
+++ b/src/nvtt/tools/compress.cpp
@@ -154,11 +154,13 @@ int main(int argc, char *argv[])
     bool loadAsFloat = false;
     bool rgbm = false;
     bool rangescale = false;
+    bool srgb = false;
 
     const char * externalCompressor = NULL;
 
     bool silent = false;
     bool dds10 = false;
+    bool ktx = false;
 
     nv::Path input;
     nv::Path output;
@@ -285,6 +287,31 @@ int main(int argc, char *argv[])
             format = nvtt::Format_BC3_RGBM;
             rgbm = true;
         }
+        else if (strcmp("-etc1", argv[i]) == 0)
+        {
+            format = nvtt::Format_ETC1;
+        }
+        else if (strcmp("-etc2", argv[i]) == 0 || strcmp("-etc2_rgb", argv[i]) == 0)
+        {
+            format = nvtt::Format_ETC2_RGB;
+        }
+        else if (strcmp("-etc2_eac", argv[i]) == 0 || strcmp("-etc2_rgba", argv[i]) == 0)
+        {
+            format = nvtt::Format_ETC2_RGBA;
+        }
+        else if (strcmp("-eac", argv[i]) == 0 || strcmp("-etc2_r", argv[i]) == 0)
+        {
+            format = nvtt::Format_ETC2_R;
+        }
+        else if (strcmp("-etc2_rg", argv[i]) == 0)
+        {
+            format = nvtt::Format_ETC2_RG;  // NOTE(review): was Format_ETC2_R, same as "-etc2_r"; confirm the RG enumerator in nvtt.h.
+        }
+        else if (strcmp("-etc2_rgbm", argv[i]) == 0)
+        {
+            format = nvtt::Format_ETC2_RGBM;
+            rgbm = true;
+        }
 
         // Undocumented option. Mainly used for testing.
         else if (strcmp("-ext", argv[i]) == 0)
@@ -309,7 +336,15 @@ int main(int argc, char *argv[])
         {
             dds10 = true;
         }
-
+        else if (strcmp("-ktx", argv[i]) == 0)
+        {
+            ktx = true;
+        }
+        else if (strcmp("-srgb", argv[i]) == 0)
+        {
+            srgb = true;
+        }
+        
         else if (argv[i][0] != '-')
         {
             input = argv[i];
@@ -321,15 +356,23 @@ int main(int argc, char *argv[])
             {
                 output.copy(input.str());
                 output.stripExtension();
-                output.append(".dds");
+                
+                if (ktx)
+                {
+                    output.append(".ktx");
+                }
+                else
+                {
+                    output.append(".dds");
+                }
             }
 
             break;
         }
-		else
-		{
-			printf("Warning: unrecognized option \"%s\"\n", argv[i]);
-		}
+        else
+        {
+            printf("Warning: unrecognized option \"%s\"\n", argv[i]);
+        }
     }
 
     const uint version = nvtt::version();
@@ -380,7 +423,9 @@ int main(int argc, char *argv[])
 
         printf("Output options:\n");
         printf("  -silent  \tDo not output progress messages\n");
-        printf("  -dds10   \tUse DirectX 10 DDS format (enabled by default for BC6/7)\n\n");
+        printf("  -dds10   \tUse DirectX 10 DDS format (enabled by default for BC6/7, unless ktx is being used)\n");
+        printf("  -ktx     \tUse KTX container format\n");
+        printf("  -srgb    \tIf the requested format allows it, output will be in sRGB color space\n\n");
 
         return EXIT_FAILURE;
     }
@@ -398,7 +443,7 @@ int main(int argc, char *argv[])
     bool useSurface = false;    // @@ use Surface API in all cases!
     nvtt::Surface image;
 
-    if (format == nvtt::Format_BC3_RGBM || rgbm) {
+    if (format == nvtt::Format_BC3_RGBM || format == nvtt::Format_ETC2_RGBM || rgbm) {
         useSurface = true;
 
         if (!image.load(input.str())) {
@@ -440,7 +485,7 @@ int main(int argc, char *argv[])
         // To gamma.
         image.toGamma(2);
 
-        if (format != nvtt::Format_BC3_RGBM) {
+        if (format != nvtt::Format_BC3_RGBM && format != nvtt::Format_ETC2_RGBM) {  // Convert by hand only when neither RGBM format was requested.
             image.setAlphaMode(nvtt::AlphaMode_None);
             image.toRGBM(1, 0.15f);
         }
@@ -494,7 +539,7 @@ int main(int argc, char *argv[])
                 nvDebugCheck(dds.isTextureArray());
                 inputOptions.setTextureLayout(nvtt::TextureType_Array, dds.width(), dds.height(), 1, dds.arraySize());
                 faceCount = dds.arraySize();
-                dds10 = true;
+                dds10 = ktx ? false : true;
             }
 
             uint mipmapCount = dds.mipmapCount();
@@ -569,11 +614,12 @@ int main(int argc, char *argv[])
             inputOptions.setAlphaMode(nvtt::AlphaMode_None);
         }
 
+        // IC: Do not enforce D3D9 restrictions anymore.
         // Block compressed textures with mipmaps must be powers of two.
-        if (!noMipmaps && format != nvtt::Format_RGB)
+        /*if (!noMipmaps && format != nvtt::Format_RGB)
         {
             inputOptions.setRoundMode(nvtt::RoundMode_ToPreviousPowerOfTwo);
-        }
+        }*/
 
         if (normal)
         {
@@ -720,15 +766,27 @@ int main(int argc, char *argv[])
     outputOptions.setOutputHandler(&outputHandler);
     outputOptions.setErrorHandler(&errorHandler);
 
-	// Automatically use dds10 if compressing to BC6 or BC7
-	if (format == nvtt::Format_BC6 || format == nvtt::Format_BC7)
-	{
-		dds10 = true;
-	}
-
-    if (dds10)
+    if (ktx)
     {
-        outputOptions.setContainer(nvtt::Container_DDS10);
+        outputOptions.setContainer(nvtt::Container_KTX);
+    }
+    else
+    {
+        // Automatically use dds10 if compressing to BC6 or BC7
+        if (format == nvtt::Format_BC6 || format == nvtt::Format_BC7) {
+            dds10 = true;
+        }
+
+        if (dds10) {
+            outputOptions.setContainer(nvtt::Container_DDS10);
+        }
+        else {
+            outputOptions.setContainer(nvtt::Container_DDS);
+        }
+    }
+    
+    if (srgb) {
+        outputOptions.setSrgbFlag(true);
     }
 
     // printf("Press ENTER.\n");
diff --git a/src/nvtt/tools/thumbnailer.cpp b/src/nvtt/tools/thumbnailer.cpp
index 1c8ab01..b9fb4cf 100644
--- a/src/nvtt/tools/thumbnailer.cpp
+++ b/src/nvtt/tools/thumbnailer.cpp
@@ -99,8 +99,8 @@ int main(int argc, char *argv[])
                 return 1;
             }
 
-	    break;
-	}
+            break;
+        }
     }
 
     if (input.isNull() || output.isNull())
@@ -136,21 +136,21 @@ int main(int argc, char *argv[])
         nv::FloatImage fimage(&image);
         fimage.toLinear(0, 3, gamma);
 
-	uint thumbW, thumbH;
-	if (image.width() > image.height())
-	{
-	    thumbW = size;
-	    thumbH = uint ((float (image.height()) / float (image.width())) * size);
-	}
-	else
-	{
-	    thumbW = uint ((float (image.width()) / float (image.height())) * size);
-	    thumbH = size;
-	}
-	nv::AutoPtr<nv::FloatImage> fresult(fimage.resize(nv::BoxFilter(), thumbW, thumbH, nv::FloatImage::WrapMode_Clamp));
-
-	nv::AutoPtr<nv::Image> result(fresult->createImageGammaCorrect(gamma));
-	result->setFormat(nv::Image::Format_ARGB);
+        uint thumbW, thumbH;
+        if (image.width() > image.height())
+        {
+            thumbW = size;
+            thumbH = uint ((float (image.height()) / float (image.width())) * size);
+        }
+        else
+        {
+            thumbW = uint ((float (image.width()) / float (image.height())) * size);
+            thumbH = size;
+        }
+        nv::AutoPtr<nv::FloatImage> fresult(fimage.resize(nv::BoxFilter(), thumbW, thumbH, nv::FloatImage::WrapMode_Clamp));
+
+        nv::AutoPtr<nv::Image> result(fresult->createImageGammaCorrect(gamma));
+        result->setFormat(nv::Image::Format_ARGB);
 
         nv::StdOutputStream stream(output.str());
         nv::ImageIO::save(output.str(), stream, result.ptr(), metaData.buffer());
@@ -160,7 +160,7 @@ int main(int argc, char *argv[])
         nv::StdOutputStream stream(output.str());
         nv::ImageIO::save(output.str(), stream, &image, metaData.buffer());
     }
-	
+
     return 0;
 }