diff --git a/src/nvcore/Debug.cpp b/src/nvcore/Debug.cpp
index f980c07..616b84c 100644
--- a/src/nvcore/Debug.cpp
+++ b/src/nvcore/Debug.cpp
@@ -453,7 +453,7 @@ namespace
         {
             MSG msg;
             while( PeekMessage( &msg, NULL, 0, 0, PM_REMOVE ) ) {
-                if( msg.message == WM_QUIT ) break;
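+                //if( msg.message == WM_QUIT ) break; // NOTE(review): check disabled, so WM_QUIT no longer stops this drain loop and is translated/dispatched like any other message -- confirm this is intended.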
                 TranslateMessage( &msg );
                 DispatchMessage( &msg );
             }
@@ -467,12 +467,11 @@ namespace
             StringBuilder error_string;
             if( func != NULL ) {
                 error_string.format( "*** Assertion failed: %s\n    On file: %s\n    On function: %s\n    On line: %d\n ", exp, file, func, line );
-                nvDebug( error_string.str() );
             }
             else {
                 error_string.format( "*** Assertion failed: %s\n    On file: %s\n    On line: %d\n ", exp, file, line );
-                nvDebug( error_string.str() );
             }
+            nvDebug( error_string.str() );
 
             if (debug::isDebuggerPresent()) {
                 return NV_ABORT_DEBUG;
diff --git a/src/nvcore/Utils.h b/src/nvcore/Utils.h
index dd7fd5b..4498f34 100644
--- a/src/nvcore/Utils.h
+++ b/src/nvcore/Utils.h
@@ -70,15 +70,11 @@ namespace nv
         b = temp;
     }
 
-    /// Return the maximum of the two arguments.
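+    /// Return the maximum of the two arguments. The second argument is returned unless it compares less than the first, so for floating point values a NaN first argument yields the second, and a NaN second argument is returned as is.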
     template <typename T> 
     inline const T & max(const T & a, const T & b)
     {
-        //return std::max(a, b);
-        if( a < b ) {
-            return b; 
-        }
-        return a;
+        return (b < a) ? a : b;
     }
 
     /// Return the maximum of the three arguments.
@@ -92,11 +88,7 @@ namespace nv
     template <typename T> 
     inline const T & min(const T & a, const T & b)
     {
-        //return std::min(a, b);
-        if( b < a ) {
-            return b; 
-        }
-        return a;
+        return (a < b) ? a : b;
     }
 
     /// Return the maximum of the three arguments.
diff --git a/src/nvimage/BlockDXT.cpp b/src/nvimage/BlockDXT.cpp
index c0dc2f9..24336d7 100644
--- a/src/nvimage/BlockDXT.cpp
+++ b/src/nvimage/BlockDXT.cpp
@@ -1,673 +1,673 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#include "BlockDXT.h"
-#include "ColorBlock.h"
-
-#include "nvcore/Stream.h"
-#include "nvcore/Utils.h" // swap
-
-
-using namespace nv;
-
-
-/*----------------------------------------------------------------------------
-BlockDXT1
-----------------------------------------------------------------------------*/
-
-uint BlockDXT1::evaluatePalette(Color32 color_array[4], bool d3d9/*= false*/) const
-{
-    // Does bit expansion before interpolation.
-    color_array[0].b = (col0.b << 3) | (col0.b >> 2);
-    color_array[0].g = (col0.g << 2) | (col0.g >> 4);
-    color_array[0].r = (col0.r << 3) | (col0.r >> 2);
-    color_array[0].a = 0xFF;
-
-    // @@ Same as above, but faster?
-    //	Color32 c;
-    //	c.u = ((col0.u << 3) & 0xf8) | ((col0.u << 5) & 0xfc00) | ((col0.u << 8) & 0xf80000);
-    //	c.u |= (c.u >> 5) & 0x070007;
-    //	c.u |= (c.u >> 6) & 0x000300;
-    //	color_array[0].u = c.u;
-
-    color_array[1].r = (col1.r << 3) | (col1.r >> 2);
-    color_array[1].g = (col1.g << 2) | (col1.g >> 4);
-    color_array[1].b = (col1.b << 3) | (col1.b >> 2);
-    color_array[1].a = 0xFF;
-
-    // @@ Same as above, but faster?
-    //	c.u = ((col1.u << 3) & 0xf8) | ((col1.u << 5) & 0xfc00) | ((col1.u << 8) & 0xf80000);
-    //	c.u |= (c.u >> 5) & 0x070007;
-    //	c.u |= (c.u >> 6) & 0x000300;
-    //	color_array[1].u = c.u;
-
-    if( col0.u > col1.u ) {
-        int bias = 0;
-        if (d3d9) bias = 1;
-
-        // Four-color block: derive the other two colors.
-        color_array[2].r = (2 * color_array[0].r + color_array[1].r + bias) / 3;
-        color_array[2].g = (2 * color_array[0].g + color_array[1].g + bias) / 3;
-        color_array[2].b = (2 * color_array[0].b + color_array[1].b + bias) / 3;
-        color_array[2].a = 0xFF;
-
-        color_array[3].r = (2 * color_array[1].r + color_array[0].r + bias) / 3;
-        color_array[3].g = (2 * color_array[1].g + color_array[0].g + bias) / 3;
-        color_array[3].b = (2 * color_array[1].b + color_array[0].b + bias) / 3;
-        color_array[3].a = 0xFF;
-
-        return 4;
-    }
-    else {
-        // Three-color block: derive the other color.
-        color_array[2].r = (color_array[0].r + color_array[1].r) / 2;
-        color_array[2].g = (color_array[0].g + color_array[1].g) / 2;
-        color_array[2].b = (color_array[0].b + color_array[1].b) / 2;
-        color_array[2].a = 0xFF;
-
-        // Set all components to 0 to match DXT specs.
-        color_array[3].r = 0x00; // color_array[2].r;
-        color_array[3].g = 0x00; // color_array[2].g;
-        color_array[3].b = 0x00; // color_array[2].b;
-        color_array[3].a = 0x00;
-
-        return 3;
-    }
-}
-
-
-uint BlockDXT1::evaluatePaletteNV5x(Color32 color_array[4]) const
-{
-    // Does bit expansion before interpolation.
-    color_array[0].b = (3 * col0.b * 22) / 8;
-    color_array[0].g = (col0.g << 2) | (col0.g >> 4);
-    color_array[0].r = (3 * col0.r * 22) / 8;
-    color_array[0].a = 0xFF;
-
-    color_array[1].r = (3 * col1.r * 22) / 8;
-    color_array[1].g = (col1.g << 2) | (col1.g >> 4);
-    color_array[1].b = (3 * col1.b * 22) / 8;
-    color_array[1].a = 0xFF;
-
-    int gdiff = color_array[1].g - color_array[0].g;
-
-    if( col0.u > col1.u ) {
-        // Four-color block: derive the other two colors.
-        color_array[2].r = ((2 * col0.r + col1.r) * 22) / 8;
-        color_array[2].g = (256 * color_array[0].g + gdiff / 4 + 128 + gdiff * 80) / 256;
-        color_array[2].b = ((2 * col0.b + col1.b) * 22) / 8;
-        color_array[2].a = 0xFF;
-
-        color_array[3].r = ((2 * col1.r + col0.r) * 22) / 8;
-        color_array[3].g = (256 * color_array[1].g - gdiff / 4 + 128 - gdiff * 80) / 256;
-        color_array[3].b = ((2 * col1.b + col0.b) * 22) / 8;
-        color_array[3].a = 0xFF;
-
-        return 4;
-    }
-    else {
-        // Three-color block: derive the other color.
-        color_array[2].r = ((col0.r + col1.r) * 33) / 8;
-        color_array[2].g = (256 * color_array[0].g + gdiff / 4 + 128 + gdiff * 128) / 256;
-        color_array[2].b = ((col0.b + col1.b) * 33) / 8;
-        color_array[2].a = 0xFF;
-
-        // Set all components to 0 to match DXT specs.
-        color_array[3].r = 0x00; // color_array[2].r;
-        color_array[3].g = 0x00; // color_array[2].g;
-        color_array[3].b = 0x00; // color_array[2].b;
-        color_array[3].a = 0x00;
-
-        return 3;
-    }
-}
-
-// Evaluate palette assuming 3 color block.
-void BlockDXT1::evaluatePalette3(Color32 color_array[4], bool d3d9) const
-{
-    color_array[0].b = (col0.b << 3) | (col0.b >> 2);
-    color_array[0].g = (col0.g << 2) | (col0.g >> 4);
-    color_array[0].r = (col0.r << 3) | (col0.r >> 2);
-    color_array[0].a = 0xFF;
-
-    color_array[1].r = (col1.r << 3) | (col1.r >> 2);
-    color_array[1].g = (col1.g << 2) | (col1.g >> 4);
-    color_array[1].b = (col1.b << 3) | (col1.b >> 2);
-    color_array[1].a = 0xFF;
-
-    // Three-color block: derive the other color.
-    color_array[2].r = (color_array[0].r + color_array[1].r) / 2;
-    color_array[2].g = (color_array[0].g + color_array[1].g) / 2;
-    color_array[2].b = (color_array[0].b + color_array[1].b) / 2;
-    color_array[2].a = 0xFF;
-
-    // Set all components to 0 to match DXT specs.
-    color_array[3].r = 0x00; // color_array[2].r;
-    color_array[3].g = 0x00; // color_array[2].g;
-    color_array[3].b = 0x00; // color_array[2].b;
-    color_array[3].a = 0x00;
-}
-
-// Evaluate palette assuming 4 color block.
-void BlockDXT1::evaluatePalette4(Color32 color_array[4], bool d3d9) const
-{
-    color_array[0].b = (col0.b << 3) | (col0.b >> 2);
-    color_array[0].g = (col0.g << 2) | (col0.g >> 4);
-    color_array[0].r = (col0.r << 3) | (col0.r >> 2);
-    color_array[0].a = 0xFF;
-
-    color_array[1].r = (col1.r << 3) | (col1.r >> 2);
-    color_array[1].g = (col1.g << 2) | (col1.g >> 4);
-    color_array[1].b = (col1.b << 3) | (col1.b >> 2);
-    color_array[1].a = 0xFF;
-
-    int bias = 0;
-    if (d3d9) bias = 1;
-
-    // Four-color block: derive the other two colors.
-    color_array[2].r = (2 * color_array[0].r + color_array[1].r + bias) / 3;
-    color_array[2].g = (2 * color_array[0].g + color_array[1].g + bias) / 3;
-    color_array[2].b = (2 * color_array[0].b + color_array[1].b + bias) / 3;
-    color_array[2].a = 0xFF;
-
-    color_array[3].r = (2 * color_array[1].r + color_array[0].r + bias) / 3;
-    color_array[3].g = (2 * color_array[1].g + color_array[0].g + bias) / 3;
-    color_array[3].b = (2 * color_array[1].b + color_array[0].b + bias) / 3;
-    color_array[3].a = 0xFF;
-}
-
-
-void BlockDXT1::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
-{
-    nvDebugCheck(block != NULL);
-
-    // Decode color block.
-    Color32 color_array[4];
-    evaluatePalette(color_array, d3d9);
-
-    // Write color block.
-    for( uint j = 0; j < 4; j++ ) {
-        for( uint i = 0; i < 4; i++ ) {
-            uint idx = (row[j] >> (2 * i)) & 3;
-            block->color(i, j) = color_array[idx];
-        }
-    }	
-}
-
-void BlockDXT1::decodeBlockNV5x(ColorBlock * block) const
-{
-    nvDebugCheck(block != NULL);
-
-    // Decode color block.
-    Color32 color_array[4];
-    evaluatePaletteNV5x(color_array);
-
-    // Write color block.
-    for( uint j = 0; j < 4; j++ ) {
-        for( uint i = 0; i < 4; i++ ) {
-            uint idx = (row[j] >> (2 * i)) & 3;
-            block->color(i, j) = color_array[idx];
-        }
-    }
-}
-
-void BlockDXT1::setIndices(int * idx)
-{
-    indices = 0;
-    for(uint i = 0; i < 16; i++) {
-        indices |= (idx[i] & 3) << (2 * i);
-    }
-}
-
-
-/// Flip DXT1 block vertically.
-inline void BlockDXT1::flip4()
-{
-    swap(row[0], row[3]);
-    swap(row[1], row[2]);
-}
-
-/// Flip half DXT1 block vertically.
-inline void BlockDXT1::flip2()
-{
-    swap(row[0], row[1]);
-}
-
-
-/*----------------------------------------------------------------------------
-BlockDXT3
-----------------------------------------------------------------------------*/
-
-void BlockDXT3::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
-{
-    nvDebugCheck(block != NULL);
-
-    // Decode color.
-    color.decodeBlock(block, d3d9);
-
-    // Decode alpha.
-    alpha.decodeBlock(block, d3d9);
-}
-
-void BlockDXT3::decodeBlockNV5x(ColorBlock * block) const
-{
-    nvDebugCheck(block != NULL);
-
-    color.decodeBlockNV5x(block);
-    alpha.decodeBlock(block);
-}
-
-void AlphaBlockDXT3::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
-{
-    nvDebugCheck(block != NULL);
-
-    block->color(0x0).a = (alpha0 << 4) | alpha0;
-    block->color(0x1).a = (alpha1 << 4) | alpha1;
-    block->color(0x2).a = (alpha2 << 4) | alpha2;
-    block->color(0x3).a = (alpha3 << 4) | alpha3;
-    block->color(0x4).a = (alpha4 << 4) | alpha4;
-    block->color(0x5).a = (alpha5 << 4) | alpha5;
-    block->color(0x6).a = (alpha6 << 4) | alpha6;
-    block->color(0x7).a = (alpha7 << 4) | alpha7;
-    block->color(0x8).a = (alpha8 << 4) | alpha8;
-    block->color(0x9).a = (alpha9 << 4) | alpha9;
-    block->color(0xA).a = (alphaA << 4) | alphaA;
-    block->color(0xB).a = (alphaB << 4) | alphaB;
-    block->color(0xC).a = (alphaC << 4) | alphaC;
-    block->color(0xD).a = (alphaD << 4) | alphaD;
-    block->color(0xE).a = (alphaE << 4) | alphaE;
-    block->color(0xF).a = (alphaF << 4) | alphaF;
-}
-
-/// Flip DXT3 alpha block vertically.
-void AlphaBlockDXT3::flip4()
-{
-    swap(row[0], row[3]);
-    swap(row[1], row[2]);
-}
-
-/// Flip half DXT3 alpha block vertically.
-void AlphaBlockDXT3::flip2()
-{
-    swap(row[0], row[1]);
-}
-
-/// Flip DXT3 block vertically.
-void BlockDXT3::flip4()
-{
-    alpha.flip4();
-    color.flip4();
-}
-
-/// Flip half DXT3 block vertically.
-void BlockDXT3::flip2()
-{
-    alpha.flip2();
-    color.flip2();
-}
-
-
-/*----------------------------------------------------------------------------
-BlockDXT5
-----------------------------------------------------------------------------*/
-
-void AlphaBlockDXT5::evaluatePalette(uint8 alpha[8], bool d3d9) const
-{
-    if (alpha0 > alpha1) {
-        evaluatePalette8(alpha, d3d9);
-    }
-    else {
-        evaluatePalette6(alpha, d3d9);
-    }
-}
-
-void AlphaBlockDXT5::evaluatePalette8(uint8 alpha[8], bool d3d9) const
-{
-    int bias = 0;
-    if (d3d9) bias = 3;
-
-    // 8-alpha block:  derive the other six alphas.
-    // Bit code 000 = alpha0, 001 = alpha1, others are interpolated.
-    alpha[0] = alpha0;
-    alpha[1] = alpha1;
-    alpha[2] = (6 * alpha[0] + 1 * alpha[1] + bias) / 7;    // bit code 010
-    alpha[3] = (5 * alpha[0] + 2 * alpha[1] + bias) / 7;    // bit code 011
-    alpha[4] = (4 * alpha[0] + 3 * alpha[1] + bias) / 7;    // bit code 100
-    alpha[5] = (3 * alpha[0] + 4 * alpha[1] + bias) / 7;    // bit code 101
-    alpha[6] = (2 * alpha[0] + 5 * alpha[1] + bias) / 7;    // bit code 110
-    alpha[7] = (1 * alpha[0] + 6 * alpha[1] + bias) / 7;    // bit code 111
-}
-
-void AlphaBlockDXT5::evaluatePalette6(uint8 alpha[8], bool d3d9) const
-{
-    int bias = 0;
-    if (d3d9) bias = 2;
-
-    // 6-alpha block.
-    // Bit code 000 = alpha0, 001 = alpha1, others are interpolated.
-    alpha[0] = alpha0;
-    alpha[1] = alpha1;
-    alpha[2] = (4 * alpha[0] + 1 * alpha[1] + bias) / 5;    // Bit code 010
-    alpha[3] = (3 * alpha[0] + 2 * alpha[1] + bias) / 5;    // Bit code 011
-    alpha[4] = (2 * alpha[0] + 3 * alpha[1] + bias) / 5;    // Bit code 100
-    alpha[5] = (1 * alpha[0] + 4 * alpha[1] + bias) / 5;    // Bit code 101
-    alpha[6] = 0x00;                                        // Bit code 110
-    alpha[7] = 0xFF;                                        // Bit code 111
-}
-
-void AlphaBlockDXT5::indices(uint8 index_array[16]) const
-{
-    index_array[0x0] = bits0;
-    index_array[0x1] = bits1;
-    index_array[0x2] = bits2;
-    index_array[0x3] = bits3;
-    index_array[0x4] = bits4;
-    index_array[0x5] = bits5;
-    index_array[0x6] = bits6;
-    index_array[0x7] = bits7;
-    index_array[0x8] = bits8;
-    index_array[0x9] = bits9;
-    index_array[0xA] = bitsA;
-    index_array[0xB] = bitsB;
-    index_array[0xC] = bitsC;
-    index_array[0xD] = bitsD;
-    index_array[0xE] = bitsE;
-    index_array[0xF] = bitsF;
-}
-
-uint AlphaBlockDXT5::index(uint index) const
-{
-    nvDebugCheck(index < 16);
-
-    int offset = (3 * index + 16);
-    return uint((this->u >> offset) & 0x7);
-}
-
-void AlphaBlockDXT5::setIndex(uint index, uint value)
-{
-    nvDebugCheck(index < 16);
-    nvDebugCheck(value < 8);
-
-    int offset = (3 * index + 16);
-    uint64 mask = uint64(0x7) << offset;
-    this->u = (this->u & ~mask) | (uint64(value) << offset);
-}
-
-void AlphaBlockDXT5::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
-{
-    nvDebugCheck(block != NULL);
-
-    uint8 alpha_array[8];
-    evaluatePalette(alpha_array, d3d9);
-
-    uint8 index_array[16];
-    indices(index_array);
-
-    for(uint i = 0; i < 16; i++) {
-        block->color(i).a = alpha_array[index_array[i]];
-    }
-}
-
-void AlphaBlockDXT5::flip4()
-{
-    uint64 * b = (uint64 *)this;
-
-    // @@ The masks might have to be byte swapped.
-    uint64 tmp = (*b & POSH_U64(0x000000000000FFFF));
-    tmp |= (*b & POSH_U64(0x000000000FFF0000)) << 36;
-    tmp |= (*b & POSH_U64(0x000000FFF0000000)) << 12;
-    tmp |= (*b & POSH_U64(0x000FFF0000000000)) >> 12;
-    tmp |= (*b & POSH_U64(0xFFF0000000000000)) >> 36;
-
-    *b = tmp;
-}
-
-void AlphaBlockDXT5::flip2()
-{
-    uint * b = (uint *)this;
-
-    // @@ The masks might have to be byte swapped.
-    uint tmp = (*b & 0xFF000000);
-    tmp |=  (*b & 0x00000FFF) << 12;
-    tmp |= (*b & 0x00FFF000) >> 12;
-
-    *b = tmp;
-}
-
-void BlockDXT5::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
-{
-    nvDebugCheck(block != NULL);
-
-    // Decode color.
-    color.decodeBlock(block, d3d9);
-
-    // Decode alpha.
-    alpha.decodeBlock(block, d3d9);
-}
-
-void BlockDXT5::decodeBlockNV5x(ColorBlock * block) const
-{
-    nvDebugCheck(block != NULL);
-
-    // Decode color.
-    color.decodeBlockNV5x(block);
-
-    // Decode alpha.
-    alpha.decodeBlock(block);
-}
-
-/// Flip DXT5 block vertically.
-void BlockDXT5::flip4()
-{
-    alpha.flip4();
-    color.flip4();
-}
-
-/// Flip half DXT5 block vertically.
-void BlockDXT5::flip2()
-{
-    alpha.flip2();
-    color.flip2();
-}
-
-
-/// Decode ATI1 block.
-void BlockATI1::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
-{
-    uint8 alpha_array[8];
-    alpha.evaluatePalette(alpha_array, d3d9);
-
-    uint8 index_array[16];
-    alpha.indices(index_array);
-
-    for(uint i = 0; i < 16; i++) {
-        Color32 & c = block->color(i);
-        c.b = c.g = c.r = alpha_array[index_array[i]];
-        c.a = 255;
-    }
-}
-
-/// Flip ATI1 block vertically.
-void BlockATI1::flip4()
-{
-    alpha.flip4();
-}
-
-/// Flip half ATI1 block vertically.
-void BlockATI1::flip2()
-{
-    alpha.flip2();
-}
-
-
-/// Decode ATI2 block.
-void BlockATI2::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
-{
-    uint8 alpha_array[8];
-    uint8 index_array[16];
-
-    x.evaluatePalette(alpha_array, d3d9);
-    x.indices(index_array);
-
-    for(uint i = 0; i < 16; i++) {
-        Color32 & c = block->color(i);
-        c.r = alpha_array[index_array[i]];
-    }
-
-    y.evaluatePalette(alpha_array, d3d9);
-    y.indices(index_array);
-
-    for(uint i = 0; i < 16; i++) {
-        Color32 & c = block->color(i);
-        c.g = alpha_array[index_array[i]];
-        c.b = 0;
-        c.a = 255;
-    }
-}
-
-/// Flip ATI2 block vertically.
-void BlockATI2::flip4()
-{
-    x.flip4();
-    y.flip4();
-}
-
-/// Flip half ATI2 block vertically.
-void BlockATI2::flip2()
-{
-    x.flip2();
-    y.flip2();
-}
-
-
-void BlockCTX1::evaluatePalette(Color32 color_array[4]) const
-{
-    // Does bit expansion before interpolation.
-    color_array[0].b = 0x00;
-    color_array[0].g = col0[1];
-    color_array[0].r = col0[0];
-    color_array[0].a = 0xFF;
-
-    color_array[1].r = 0x00;
-    color_array[1].g = col0[1];
-    color_array[1].b = col1[0];
-    color_array[1].a = 0xFF;
-
-    color_array[2].r = 0x00;
-    color_array[2].g = (2 * color_array[0].g + color_array[1].g) / 3;
-    color_array[2].b = (2 * color_array[0].b + color_array[1].b) / 3;
-    color_array[2].a = 0xFF;
-
-    color_array[3].r = 0x00;
-    color_array[3].g = (2 * color_array[1].g + color_array[0].g) / 3;
-    color_array[3].b = (2 * color_array[1].b + color_array[0].b) / 3;
-    color_array[3].a = 0xFF;
-}
-
-void BlockCTX1::decodeBlock(ColorBlock * block) const
-{
-    nvDebugCheck(block != NULL);
-
-    // Decode color block.
-    Color32 color_array[4];
-    evaluatePalette(color_array);
-
-    // Write color block.
-    for( uint j = 0; j < 4; j++ ) {
-        for( uint i = 0; i < 4; i++ ) {
-            uint idx = (row[j] >> (2 * i)) & 3;
-            block->color(i, j) = color_array[idx];
-        }
-    }	
-}
-
-void BlockCTX1::setIndices(int * idx)
-{
-    indices = 0;
-    for(uint i = 0; i < 16; i++) {
-        indices |= (idx[i] & 3) << (2 * i);
-    }
-}
-
-
-/// Flip CTX1 block vertically.
-inline void BlockCTX1::flip4()
-{
-    swap(row[0], row[3]);
-    swap(row[1], row[2]);
-}
-
-/// Flip half CTX1 block vertically.
-inline void BlockCTX1::flip2()
-{
-    swap(row[0], row[1]);
-}
-
-
-
-
-Stream & nv::operator<<(Stream & stream, BlockDXT1 & block)
-{
-    stream << block.col0.u << block.col1.u;
-    stream.serialize(&block.indices, sizeof(block.indices));
-    return stream;
-}
-
-Stream & nv::operator<<(Stream & stream, AlphaBlockDXT3 & block)
-{
-    stream.serialize(&block, sizeof(block));
-    return stream;
-}
-
-Stream & nv::operator<<(Stream & stream, BlockDXT3 & block)
-{
-    return stream << block.alpha << block.color;
-}
-
-Stream & nv::operator<<(Stream & stream, AlphaBlockDXT5 & block)
-{
-    stream.serialize(&block, sizeof(block));
-    return stream;
-}
-
-Stream & nv::operator<<(Stream & stream, BlockDXT5 & block)
-{
-    return stream << block.alpha << block.color;
-}
-
-Stream & nv::operator<<(Stream & stream, BlockATI1 & block)
-{
-    return stream << block.alpha;
-}
-
-Stream & nv::operator<<(Stream & stream, BlockATI2 & block)
-{
-    return stream << block.x << block.y;
-}
-
-Stream & nv::operator<<(Stream & stream, BlockCTX1 & block)
-{
-    stream.serialize(&block, sizeof(block));
-    return stream;
-}
-
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include "BlockDXT.h"
+#include "ColorBlock.h"
+
+#include "nvcore/Stream.h"
+#include "nvcore/Utils.h" // swap
+
+
+using namespace nv;
+
+
+/*----------------------------------------------------------------------------
+BlockDXT1
+----------------------------------------------------------------------------*/
+
+uint BlockDXT1::evaluatePalette(Color32 color_array[4], bool d3d9/*= false*/) const
+{
+    // Does bit expansion before interpolation.
+    color_array[0].b = (col0.b << 3) | (col0.b >> 2);
+    color_array[0].g = (col0.g << 2) | (col0.g >> 4);
+    color_array[0].r = (col0.r << 3) | (col0.r >> 2);
+    color_array[0].a = 0xFF;
+
+    // @@ Same as above, but faster?
+    //	Color32 c;
+    //	c.u = ((col0.u << 3) & 0xf8) | ((col0.u << 5) & 0xfc00) | ((col0.u << 8) & 0xf80000);
+    //	c.u |= (c.u >> 5) & 0x070007;
+    //	c.u |= (c.u >> 6) & 0x000300;
+    //	color_array[0].u = c.u;
+
+    color_array[1].r = (col1.r << 3) | (col1.r >> 2);
+    color_array[1].g = (col1.g << 2) | (col1.g >> 4);
+    color_array[1].b = (col1.b << 3) | (col1.b >> 2);
+    color_array[1].a = 0xFF;
+
+    // @@ Same as above, but faster?
+    //	c.u = ((col1.u << 3) & 0xf8) | ((col1.u << 5) & 0xfc00) | ((col1.u << 8) & 0xf80000);
+    //	c.u |= (c.u >> 5) & 0x070007;
+    //	c.u |= (c.u >> 6) & 0x000300;
+    //	color_array[1].u = c.u;
+
+    if( col0.u > col1.u ) {
+        int bias = 0;
+        if (d3d9) bias = 1;
+
+        // Four-color block: derive the other two colors.
+        color_array[2].r = (2 * color_array[0].r + color_array[1].r + bias) / 3;
+        color_array[2].g = (2 * color_array[0].g + color_array[1].g + bias) / 3;
+        color_array[2].b = (2 * color_array[0].b + color_array[1].b + bias) / 3;
+        color_array[2].a = 0xFF;
+
+        color_array[3].r = (2 * color_array[1].r + color_array[0].r + bias) / 3;
+        color_array[3].g = (2 * color_array[1].g + color_array[0].g + bias) / 3;
+        color_array[3].b = (2 * color_array[1].b + color_array[0].b + bias) / 3;
+        color_array[3].a = 0xFF;
+
+        return 4;
+    }
+    else {
+        // Three-color block: derive the other color.
+        color_array[2].r = (color_array[0].r + color_array[1].r) / 2;
+        color_array[2].g = (color_array[0].g + color_array[1].g) / 2;
+        color_array[2].b = (color_array[0].b + color_array[1].b) / 2;
+        color_array[2].a = 0xFF;
+
+        // Set all components to 0 to match DXT specs.
+        color_array[3].r = 0x00; // color_array[2].r;
+        color_array[3].g = 0x00; // color_array[2].g;
+        color_array[3].b = 0x00; // color_array[2].b;
+        color_array[3].a = 0x00;
+
+        return 3;
+    }
+}
+
+
+uint BlockDXT1::evaluatePaletteNV5x(Color32 color_array[4]) const
+{
+    // Does bit expansion before interpolation.
+    color_array[0].b = (3 * col0.b * 22) / 8;
+    color_array[0].g = (col0.g << 2) | (col0.g >> 4);
+    color_array[0].r = (3 * col0.r * 22) / 8;
+    color_array[0].a = 0xFF;
+
+    color_array[1].r = (3 * col1.r * 22) / 8;
+    color_array[1].g = (col1.g << 2) | (col1.g >> 4);
+    color_array[1].b = (3 * col1.b * 22) / 8;
+    color_array[1].a = 0xFF;
+
+    int gdiff = color_array[1].g - color_array[0].g;
+
+    if( col0.u > col1.u ) {
+        // Four-color block: derive the other two colors.
+        color_array[2].r = ((2 * col0.r + col1.r) * 22) / 8;
+        color_array[2].g = (256 * color_array[0].g + gdiff / 4 + 128 + gdiff * 80) / 256;
+        color_array[2].b = ((2 * col0.b + col1.b) * 22) / 8;
+        color_array[2].a = 0xFF;
+
+        color_array[3].r = ((2 * col1.r + col0.r) * 22) / 8;
+        color_array[3].g = (256 * color_array[1].g - gdiff / 4 + 128 - gdiff * 80) / 256;
+        color_array[3].b = ((2 * col1.b + col0.b) * 22) / 8;
+        color_array[3].a = 0xFF;
+
+        return 4;
+    }
+    else {
+        // Three-color block: derive the other color.
+        color_array[2].r = ((col0.r + col1.r) * 33) / 8;
+        color_array[2].g = (256 * color_array[0].g + gdiff / 4 + 128 + gdiff * 128) / 256;
+        color_array[2].b = ((col0.b + col1.b) * 33) / 8;
+        color_array[2].a = 0xFF;
+
+        // Set all components to 0 to match DXT specs.
+        color_array[3].r = 0x00; // color_array[2].r;
+        color_array[3].g = 0x00; // color_array[2].g;
+        color_array[3].b = 0x00; // color_array[2].b;
+        color_array[3].a = 0x00;
+
+        return 3;
+    }
+}
+
+// Evaluate palette assuming 3 color block.
+void BlockDXT1::evaluatePalette3(Color32 color_array[4], bool d3d9) const
+{
+    color_array[0].b = (col0.b << 3) | (col0.b >> 2);
+    color_array[0].g = (col0.g << 2) | (col0.g >> 4);
+    color_array[0].r = (col0.r << 3) | (col0.r >> 2);
+    color_array[0].a = 0xFF;
+
+    color_array[1].r = (col1.r << 3) | (col1.r >> 2);
+    color_array[1].g = (col1.g << 2) | (col1.g >> 4);
+    color_array[1].b = (col1.b << 3) | (col1.b >> 2);
+    color_array[1].a = 0xFF;
+
+    // Three-color block: derive the other color.
+    color_array[2].r = (color_array[0].r + color_array[1].r) / 2;
+    color_array[2].g = (color_array[0].g + color_array[1].g) / 2;
+    color_array[2].b = (color_array[0].b + color_array[1].b) / 2;
+    color_array[2].a = 0xFF;
+
+    // Set all components to 0 to match DXT specs.
+    color_array[3].r = 0x00; // color_array[2].r;
+    color_array[3].g = 0x00; // color_array[2].g;
+    color_array[3].b = 0x00; // color_array[2].b;
+    color_array[3].a = 0x00;
+}
+
+// Evaluate palette assuming 4 color block.
+void BlockDXT1::evaluatePalette4(Color32 color_array[4], bool d3d9) const
+{
+    color_array[0].b = (col0.b << 3) | (col0.b >> 2);
+    color_array[0].g = (col0.g << 2) | (col0.g >> 4);
+    color_array[0].r = (col0.r << 3) | (col0.r >> 2);
+    color_array[0].a = 0xFF;
+
+    color_array[1].r = (col1.r << 3) | (col1.r >> 2);
+    color_array[1].g = (col1.g << 2) | (col1.g >> 4);
+    color_array[1].b = (col1.b << 3) | (col1.b >> 2);
+    color_array[1].a = 0xFF;
+
+    int bias = 0;
+    if (d3d9) bias = 1;
+
+    // Four-color block: derive the other two colors.
+    color_array[2].r = (2 * color_array[0].r + color_array[1].r + bias) / 3;
+    color_array[2].g = (2 * color_array[0].g + color_array[1].g + bias) / 3;
+    color_array[2].b = (2 * color_array[0].b + color_array[1].b + bias) / 3;
+    color_array[2].a = 0xFF;
+
+    color_array[3].r = (2 * color_array[1].r + color_array[0].r + bias) / 3;
+    color_array[3].g = (2 * color_array[1].g + color_array[0].g + bias) / 3;
+    color_array[3].b = (2 * color_array[1].b + color_array[0].b + bias) / 3;
+    color_array[3].a = 0xFF;
+}
+
+
+void BlockDXT1::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
+{
+    nvDebugCheck(block != NULL);
+
+    // Decode color block.
+    Color32 color_array[4];
+    evaluatePalette(color_array, d3d9);
+
+    // Write color block.
+    for( uint j = 0; j < 4; j++ ) {
+        for( uint i = 0; i < 4; i++ ) {
+            uint idx = (row[j] >> (2 * i)) & 3;
+            block->color(i, j) = color_array[idx];
+        }
+    }	
+}
+
+void BlockDXT1::decodeBlockNV5x(ColorBlock * block) const
+{
+    nvDebugCheck(block != NULL);
+
+    // Decode color block.
+    Color32 color_array[4];
+    evaluatePaletteNV5x(color_array);
+
+    // Write color block.
+    for( uint j = 0; j < 4; j++ ) {
+        for( uint i = 0; i < 4; i++ ) {
+            uint idx = (row[j] >> (2 * i)) & 3;
+            block->color(i, j) = color_array[idx];
+        }
+    }
+}
+
+void BlockDXT1::setIndices(int * idx)
+{
+    indices = 0;
+    for(uint i = 0; i < 16; i++) {
+        indices |= (idx[i] & 3) << (2 * i);
+    }
+}
+
+
+/// Flip DXT1 block vertically.
+inline void BlockDXT1::flip4()
+{
+    swap(row[0], row[3]);
+    swap(row[1], row[2]);
+}
+
+/// Flip half DXT1 block vertically.
+inline void BlockDXT1::flip2()
+{
+    swap(row[0], row[1]);
+}
+
+
+/*----------------------------------------------------------------------------
+BlockDXT3
+----------------------------------------------------------------------------*/
+
+void BlockDXT3::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
+{
+    nvDebugCheck(block != NULL);
+
+    // Decode color.
+    color.decodeBlock(block, d3d9);
+
+    // Decode alpha.
+    alpha.decodeBlock(block, d3d9);
+}
+
+void BlockDXT3::decodeBlockNV5x(ColorBlock * block) const
+{
+    nvDebugCheck(block != NULL);
+
+    color.decodeBlockNV5x(block);
+    alpha.decodeBlock(block);
+}
+
+void AlphaBlockDXT3::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
+{
+    nvDebugCheck(block != NULL);
+    // Widen each 4-bit alpha to 8 bits by repeating the nibble (0xN -> 0xNN); d3d9 is not used here.
+    block->color(0x0).a = (alpha0 << 4) | alpha0;
+    block->color(0x1).a = (alpha1 << 4) | alpha1;
+    block->color(0x2).a = (alpha2 << 4) | alpha2;
+    block->color(0x3).a = (alpha3 << 4) | alpha3;
+    block->color(0x4).a = (alpha4 << 4) | alpha4;
+    block->color(0x5).a = (alpha5 << 4) | alpha5;
+    block->color(0x6).a = (alpha6 << 4) | alpha6;
+    block->color(0x7).a = (alpha7 << 4) | alpha7;
+    block->color(0x8).a = (alpha8 << 4) | alpha8;
+    block->color(0x9).a = (alpha9 << 4) | alpha9;
+    block->color(0xA).a = (alphaA << 4) | alphaA;
+    block->color(0xB).a = (alphaB << 4) | alphaB;
+    block->color(0xC).a = (alphaC << 4) | alphaC;
+    block->color(0xD).a = (alphaD << 4) | alphaD;
+    block->color(0xE).a = (alphaE << 4) | alphaE;
+    block->color(0xF).a = (alphaF << 4) | alphaF;
+}
+
+/// Mirror the DXT3 alpha block vertically: its four 16-bit rows end up in reverse order.
+void AlphaBlockDXT3::flip4()
+{
+    swap(row[1], row[2]);   // Inner pair.
+    swap(row[0], row[3]);   // Outer pair.
+}
+
+/// Flip half of the DXT3 alpha block vertically: only rows 0 and 1 trade places.
+void AlphaBlockDXT3::flip2()
+{
+    swap(row[1], row[0]);
+}
+
+/// Mirror the DXT3 block vertically by flipping its color and alpha parts independently.
+void BlockDXT3::flip4()
+{
+    color.flip4();
+    alpha.flip4();
+}
+
+/// Flip half of the DXT3 block vertically; color and alpha parts are handled independently.
+void BlockDXT3::flip2()
+{
+    color.flip2();
+    alpha.flip2();
+}
+
+
+/*----------------------------------------------------------------------------
+BlockDXT5
+----------------------------------------------------------------------------*/
+
+void AlphaBlockDXT5::evaluatePalette(uint8 alpha[8], bool d3d9) const
+{
+    // alpha0 > alpha1 selects the 8-value palette; otherwise the 6-value one (plus 0x00 and 0xFF).
+    if (alpha0 <= alpha1) {
+        evaluatePalette6(alpha, d3d9);
+        return;
+    }
+    evaluatePalette8(alpha, d3d9);
+}
+
+void AlphaBlockDXT5::evaluatePalette8(uint8 alpha[8], bool d3d9) const
+{
+    // 8-alpha block: entries 0 and 1 are the stored endpoints, entries 2..7
+    // (bit codes 010..111) are the blends (w0 * alpha0 + w1 * alpha1 + bias) / 7
+    // with weights running from 6:1 down to 1:6. bias is 3 when d3d9 is set, else 0.
+    const int bias = d3d9 ? 3 : 0;
+    const int first = alpha0;
+    const int second = alpha1;
+
+    alpha[0] = uint8(first);
+    alpha[1] = uint8(second);
+    for (int k = 1; k <= 6; k++) {
+        // k is the weight of the second endpoint; the first endpoint gets 7 - k.
+        alpha[k + 1] = uint8(((7 - k) * first + k * second + bias) / 7);
+    }
+}
+
+void AlphaBlockDXT5::evaluatePalette6(uint8 alpha[8], bool d3d9) const
+{
+    // 6-alpha block: entries 0 and 1 are the stored endpoints, entries 2..5
+    // (bit codes 010..101) are the blends (w0 * alpha0 + w1 * alpha1 + bias) / 5
+    // with weights running from 4:1 down to 1:4. bias is 2 when d3d9 is set, else 0.
+    const int bias = d3d9 ? 2 : 0;
+    const int first = alpha0;
+    const int second = alpha1;
+    alpha[0] = uint8(first);
+    alpha[1] = uint8(second);
+    for (int k = 1; k <= 4; k++) {
+        alpha[k + 1] = uint8(((5 - k) * first + k * second + bias) / 5);
+    }
+    alpha[6] = 0x00;    // Bit code 110
+    alpha[7] = 0xFF;    // Bit code 111
+}
+
+void AlphaBlockDXT5::indices(uint8 index_array[16]) const   // Copies bits0..bitsF, in order, into index_array[0..15].
+{
+    index_array[0x0] = bits0;   // Each bitsN is a 3-bit field, so every stored value is in 0..7.
+    index_array[0x1] = bits1;
+    index_array[0x2] = bits2;
+    index_array[0x3] = bits3;
+    index_array[0x4] = bits4;
+    index_array[0x5] = bits5;
+    index_array[0x6] = bits6;
+    index_array[0x7] = bits7;
+    index_array[0x8] = bits8;
+    index_array[0x9] = bits9;
+    index_array[0xA] = bitsA;
+    index_array[0xB] = bitsB;
+    index_array[0xC] = bitsC;
+    index_array[0xD] = bitsD;
+    index_array[0xE] = bitsE;
+    index_array[0xF] = bitsF;
+}
+
+uint AlphaBlockDXT5::index(uint index) const
+{
+    nvDebugCheck(index < 16);
+    // Selectors are 3 bits wide and follow the 16 bits of u that hold the two endpoints.
+    const int shift = 16 + 3 * index;
+    return uint((this->u >> shift) & 0x7);
+}
+
+void AlphaBlockDXT5::setIndex(uint index, uint value)
+{
+    nvDebugCheck(index < 16);
+    nvDebugCheck(value < 8);
+    // Clear the 3-bit field of this selector inside u, then store the new value into it.
+    const int shift = 16 + 3 * index;
+    this->u &= ~(uint64(0x7) << shift);
+    this->u |= uint64(value) << shift;
+}
+
+void AlphaBlockDXT5::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
+{
+    nvDebugCheck(block != NULL);
+    // Unpack the sixteen per-texel selectors.
+    uint8 selectors[16];
+    indices(selectors);
+
+    // Build the 8-entry alpha palette and look every texel up; only .a is written.
+    uint8 palette[8];
+    evaluatePalette(palette, d3d9);
+    for (uint texel = 0; texel < 16; ++texel) {
+        block->color(texel).a = palette[selectors[texel]];
+    }
+}
+
+void AlphaBlockDXT5::flip4()
+{
+    // Bits 0-15 of u (the two endpoints) stay in place; the four 12-bit
+    // index rows above them are written back in reverse order.
+    // @@ The masks might have to be byte swapped.
+    const uint64 bits = this->u;
+    uint64 flipped = bits & POSH_U64(0x000000000000FFFF);
+    flipped |= (bits & POSH_U64(0xFFF0000000000000)) >> 36;
+    flipped |= (bits & POSH_U64(0x000FFF0000000000)) >> 12;
+    flipped |= (bits & POSH_U64(0x000000FFF0000000)) << 12;
+    flipped |= (bits & POSH_U64(0x000000000FFF0000)) << 36;
+    this->u = flipped;
+}
+
+void AlphaBlockDXT5::flip2()
+{
+    // Exchange index rows 0 and 1 (12 bits each, at bits 16-27 and 28-39 of u).
+    // The two endpoints in bits 0-15 and rows 2-3 in bits 40-63 are kept as they are.
+    // @@ The masks might have to be byte swapped.
+    uint64 tmp = (this->u & POSH_U64(0xFFFFFF000000FFFF));
+    tmp |= (this->u & POSH_U64(0x000000000FFF0000)) << 12;
+    tmp |= (this->u & POSH_U64(0x000000FFF0000000)) >> 12;
+
+    this->u = tmp;
+}
+
+void BlockDXT5::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
+{
+    nvDebugCheck(block != NULL);
+
+    // Decode color first: it assigns whole texels, alpha channel included.
+    color.decodeBlock(block, d3d9);
+
+    // Decode alpha last: it rewrites only the .a channel of each texel.
+    alpha.decodeBlock(block, d3d9);
+}
+
+void BlockDXT5::decodeBlockNV5x(ColorBlock * block) const
+{
+    nvDebugCheck(block != NULL);
+
+    // Decode color with the NV5x palette evaluation (assigns whole texels).
+    color.decodeBlockNV5x(block);
+
+    // Decode alpha afterwards; d3d9 is left at its default (false).
+    alpha.decodeBlock(block);
+}
+
+/// Mirror the DXT5 block vertically by flipping its color and alpha parts independently.
+void BlockDXT5::flip4()
+{
+    color.flip4();
+    alpha.flip4();
+}
+
+/// Flip half of the DXT5 block vertically; color and alpha parts are handled independently.
+void BlockDXT5::flip2()
+{
+    color.flip2();
+    alpha.flip2();
+}
+
+
+/// Decode ATI1 block: r, g and b of every texel get the same decoded value, alpha is set to 255.
+void BlockATI1::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
+{
+    uint8 selectors[16];
+    alpha.indices(selectors);
+
+    uint8 palette[8];
+    alpha.evaluatePalette(palette, d3d9);
+    for (uint texel = 0; texel < 16; ++texel) {
+        const uint8 level = palette[selectors[texel]];
+        Color32 & c = block->color(texel);
+        c.r = c.g = c.b = level;
+        c.a = 255;
+    }
+}
+
+/// Flip ATI1 block vertically; the work is done by the flip4() of its single alpha block.
+void BlockATI1::flip4()
+{
+    alpha.flip4();
+}
+
+/// Flip half ATI1 block vertically; the work is done by the flip2() of its single alpha block.
+void BlockATI1::flip2()
+{
+    alpha.flip2();
+}
+
+
+/// Decode ATI2 block: x goes to red, y to green; blue is set to 0 and alpha to 255.
+void BlockATI2::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const
+{
+    // Palette and per-texel selectors of the x component.
+    uint8 x_palette[8];
+    uint8 x_selectors[16];
+    x.evaluatePalette(x_palette, d3d9);
+    x.indices(x_selectors);
+
+    // Palette and per-texel selectors of the y component.
+    uint8 y_palette[8];
+    uint8 y_selectors[16];
+    y.evaluatePalette(y_palette, d3d9);
+    y.indices(y_selectors);
+
+    // The two components do not depend on each other, so one pass over the texels is enough.
+    for (uint texel = 0; texel < 16; ++texel) {
+        Color32 & c = block->color(texel);
+        c.r = x_palette[x_selectors[texel]];
+        c.g = y_palette[y_selectors[texel]];
+        c.b = 0;
+        c.a = 255;
+    }
+}
+
+/// Mirror the ATI2 block vertically by flipping its y and x parts independently.
+void BlockATI2::flip4()
+{
+    y.flip4();
+    x.flip4();
+}
+
+/// Flip half of the ATI2 block vertically; the y and x parts are handled independently.
+void BlockATI2::flip2()
+{
+    y.flip2();
+    x.flip2();
+}
+
+
+void BlockCTX1::evaluatePalette(Color32 color_array[4]) const
+{
+    // Entries 0/1 are the endpoints col0/col1: component [0] -> red, [1] -> green; blue 0, alpha 0xFF.
+    color_array[0].r = col0[0];
+    color_array[0].g = col0[1];
+    color_array[0].b = 0x00;
+    color_array[0].a = 0xFF;
+
+    color_array[1].r = col1[0];
+    color_array[1].g = col1[1];
+    color_array[1].b = 0x00;
+    color_array[1].a = 0xFF;
+    // Entries 2 and 3 blend the endpoints 2:1 and 1:2 per component (integer division by 3).
+    color_array[2].r = (2 * color_array[0].r + color_array[1].r) / 3;
+    color_array[2].g = (2 * color_array[0].g + color_array[1].g) / 3;
+    color_array[2].b = 0x00;
+    color_array[2].a = 0xFF;
+
+    color_array[3].r = (2 * color_array[1].r + color_array[0].r) / 3;
+    color_array[3].g = (2 * color_array[1].g + color_array[0].g) / 3;
+    color_array[3].b = 0x00;
+    color_array[3].a = 0xFF;
+}
+
+void BlockCTX1::decodeBlock(ColorBlock * block) const
+{
+    // Expands this block into the 4x4 texels of 'block', which must not be NULL.
+    nvDebugCheck(block != NULL);
+    // Build the four-entry palette.
+    Color32 palette[4];
+    evaluatePalette(palette);
+
+    // Each row byte packs four 2-bit palette selectors, lowest bits first.
+    for (uint y = 0; y < 4; y++) {
+        for (uint x = 0; x < 4; x++) {
+            const uint selector = (row[y] >> (2 * x)) & 3;
+            block->color(x, y) = palette[selector];
+        }
+    }
+}
+
+void BlockCTX1::setIndices(int * idx)
+{
+    indices = 0; // Rebuilt below from the low 2 bits of idx[0..15]; entry i lands in bits 2i..2i+1.
+    for(uint i = 0; i < 16; i++) {
+        indices |= uint(idx[i] & 3) << (2 * i); // Shift as unsigned: i == 15 reaches bit 31.
+    }
+}
+
+
+/// Flip CTX1 block vertically. Not 'inline': the header only declares it, so the symbol must be emitted here.
+void BlockCTX1::flip4()
+{
+    swap(row[0], row[3]);   // Outer rows trade places...
+    swap(row[1], row[2]);   // ...and so do the inner rows.
+}
+
+/// Flip half CTX1 block vertically (rows 0 and 1 only). Not 'inline': the header only declares it.
+void BlockCTX1::flip2()
+{
+    swap(row[0], row[1]);
+}
+
+
+
+
+Stream & nv::operator<<(Stream & stream, BlockDXT1 & block)
+{
+    stream << block.col0.u << block.col1.u;   // Endpoints go through their .u members, col0 before col1.
+    stream.serialize(&block.indices, sizeof(block.indices));   // Selectors go through as one sizeof(indices) chunk.
+    return stream;
+}
+
+Stream & nv::operator<<(Stream & stream, AlphaBlockDXT3 & block)
+{
+    stream.serialize(&block, sizeof(block));   // The whole block is handed to serialize() as one chunk of bytes.
+    return stream;
+}
+
+Stream & nv::operator<<(Stream & stream, BlockDXT3 & block)
+{
+    return stream << block.alpha << block.color;   // Same order as the struct members: alpha, then color.
+}
+
+Stream & nv::operator<<(Stream & stream, AlphaBlockDXT5 & block)
+{
+    stream.serialize(&block, sizeof(block));   // The whole block is handed to serialize() as one chunk of bytes.
+    return stream;
+}
+
+Stream & nv::operator<<(Stream & stream, BlockDXT5 & block)
+{
+    return stream << block.alpha << block.color;   // Same order as the struct members: alpha, then color.
+}
+
+Stream & nv::operator<<(Stream & stream, BlockATI1 & block)
+{
+    return stream << block.alpha;   // An ATI1 block is just its one AlphaBlockDXT5 member.
+}
+
+Stream & nv::operator<<(Stream & stream, BlockATI2 & block)
+{
+    return stream << block.x << block.y;   // Same order as the struct members: x, then y.
+}
+
+Stream & nv::operator<<(Stream & stream, BlockCTX1 & block)
+{
+    stream.serialize(&block, sizeof(block));   // The whole block is handed to serialize() as one chunk of bytes.
+    return stream;
+}
+
diff --git a/src/nvimage/BlockDXT.h b/src/nvimage/BlockDXT.h
index b2b3de8..df0541b 100644
--- a/src/nvimage/BlockDXT.h
+++ b/src/nvimage/BlockDXT.h
@@ -1,228 +1,228 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#pragma once
-#ifndef NV_IMAGE_BLOCKDXT_H
-#define NV_IMAGE_BLOCKDXT_H
-
-#include "nvimage.h"
-
-#include "nvmath/Color.h"
-
-namespace nv
-{
-    struct ColorBlock;
-    class Stream;
-
-
-    /// DXT1 block.
-    struct BlockDXT1
-    {
-        Color16 col0;
-        Color16 col1;
-        union {
-            uint8 row[4];
-            uint indices;
-        };
-
-        bool isFourColorMode() const;
-
-        uint evaluatePalette(Color32 color_array[4], bool d3d9) const;
-        uint evaluatePaletteNV5x(Color32 color_array[4]) const;
-
-        void evaluatePalette3(Color32 color_array[4], bool d3d9) const;
-        void evaluatePalette4(Color32 color_array[4], bool d3d9) const;
-
-        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;
-        void decodeBlockNV5x(ColorBlock * block) const;
-
-        void setIndices(int * idx);
-
-        void flip4();
-        void flip2();
-    };
-
-    /// Return true if the block uses four color mode, false otherwise.
-    inline bool BlockDXT1::isFourColorMode() const
-    {
-        return col0.u > col1.u;
-    }
-
-
-    /// DXT3 alpha block with explicit alpha.
-    struct AlphaBlockDXT3
-    {
-        union {
-            struct {
-                uint alpha0 : 4;
-                uint alpha1 : 4;
-                uint alpha2 : 4;
-                uint alpha3 : 4;
-                uint alpha4 : 4;
-                uint alpha5 : 4;
-                uint alpha6 : 4;
-                uint alpha7 : 4;
-                uint alpha8 : 4;
-                uint alpha9 : 4;
-                uint alphaA : 4;
-                uint alphaB : 4;
-                uint alphaC : 4;
-                uint alphaD : 4;
-                uint alphaE : 4;
-                uint alphaF : 4;
-            };
-            uint16 row[4];
-        };
-
-        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;
-
-        void flip4();
-        void flip2();
-    };
-
-
-    /// DXT3 block.
-    struct BlockDXT3
-    {
-        AlphaBlockDXT3 alpha;
-        BlockDXT1 color;
-
-        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;
-        void decodeBlockNV5x(ColorBlock * block) const;
-
-        void flip4();
-        void flip2();
-    };
-
-
-    /// DXT5 alpha block.
-    struct AlphaBlockDXT5
-    {
-        union {
-            struct {
-                uint64 alpha0 : 8;	// 8
-                uint64 alpha1 : 8;	// 16
-                uint64 bits0 : 3;	// 3 - 19
-                uint64 bits1 : 3; 	// 6 - 22
-                uint64 bits2 : 3; 	// 9 - 25
-                uint64 bits3 : 3;	// 12 - 28
-                uint64 bits4 : 3;	// 15 - 31
-                uint64 bits5 : 3;	// 18 - 34
-                uint64 bits6 : 3;	// 21 - 37
-                uint64 bits7 : 3;	// 24 - 40
-                uint64 bits8 : 3;	// 27 - 43
-                uint64 bits9 : 3; 	// 30 - 46
-                uint64 bitsA : 3; 	// 33 - 49
-                uint64 bitsB : 3;	// 36 - 52
-                uint64 bitsC : 3;	// 39 - 55
-                uint64 bitsD : 3;	// 42 - 58
-                uint64 bitsE : 3;	// 45 - 61
-                uint64 bitsF : 3;	// 48 - 64
-            };
-            uint64 u;
-        };
-
-        void evaluatePalette(uint8 alpha[8], bool d3d9) const;
-        void evaluatePalette8(uint8 alpha[8], bool d3d9) const;
-        void evaluatePalette6(uint8 alpha[8], bool d3d9) const;
-        void indices(uint8 index_array[16]) const;
-
-        uint index(uint index) const;
-        void setIndex(uint index, uint value);
-
-        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;
-
-        void flip4();
-        void flip2();
-    };
-
-
-    /// DXT5 block.
-    struct BlockDXT5
-    {
-        AlphaBlockDXT5 alpha;
-        BlockDXT1 color;
-
-        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;
-        void decodeBlockNV5x(ColorBlock * block) const;
-
-        void flip4();
-        void flip2();
-    };
-
-    /// ATI1 block.
-    struct BlockATI1
-    {
-        AlphaBlockDXT5 alpha;
-
-        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;
-
-        void flip4();
-        void flip2();
-    };
-
-    /// ATI2 block.
-    struct BlockATI2
-    {
-        AlphaBlockDXT5 x;
-        AlphaBlockDXT5 y;
-
-        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;
-
-        void flip4();
-        void flip2();
-    };
-
-    /// CTX1 block.
-    struct BlockCTX1
-    {
-        uint8 col0[2];
-        uint8 col1[2];
-        union {
-            uint8 row[4];
-            uint indices;
-        };
-
-        void evaluatePalette(Color32 color_array[4]) const;
-        void setIndices(int * idx);
-
-        void decodeBlock(ColorBlock * block) const;
-
-        void flip4();
-        void flip2();
-    };
-
-
-    // Serialization functions.
-    NVIMAGE_API Stream & operator<<(Stream & stream, BlockDXT1 & block);
-    NVIMAGE_API Stream & operator<<(Stream & stream, AlphaBlockDXT3 & block);
-    NVIMAGE_API Stream & operator<<(Stream & stream, BlockDXT3 & block);
-    NVIMAGE_API Stream & operator<<(Stream & stream, AlphaBlockDXT5 & block);
-    NVIMAGE_API Stream & operator<<(Stream & stream, BlockDXT5 & block);
-    NVIMAGE_API Stream & operator<<(Stream & stream, BlockATI1 & block);
-    NVIMAGE_API Stream & operator<<(Stream & stream, BlockATI2 & block);
-    NVIMAGE_API Stream & operator<<(Stream & stream, BlockCTX1 & block);
-
-} // nv namespace
-
-#endif // NV_IMAGE_BLOCKDXT_H
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#pragma once
+#ifndef NV_IMAGE_BLOCKDXT_H
+#define NV_IMAGE_BLOCKDXT_H
+
+#include "nvimage.h"
+
+#include "nvmath/Color.h"
+
+namespace nv
+{
+    struct ColorBlock;
+    class Stream;
+
+
+    /// DXT1 block: two endpoint colors followed by sixteen 2-bit palette selectors.
+    struct BlockDXT1
+    {
+        Color16 col0;   // First endpoint color.
+        Color16 col1;   // Second endpoint color.
+        union {
+            uint8 row[4];   // One byte per texel row, holding four 2-bit selectors.
+            uint indices;   // The same storage viewed as a single uint.
+        };
+
+        bool isFourColorMode() const;
+
+        uint evaluatePalette(Color32 color_array[4], bool d3d9) const;
+        uint evaluatePaletteNV5x(Color32 color_array[4]) const;
+
+        void evaluatePalette3(Color32 color_array[4], bool d3d9) const;
+        void evaluatePalette4(Color32 color_array[4], bool d3d9) const;
+
+        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;   // Writes all 16 texels of block.
+        void decodeBlockNV5x(ColorBlock * block) const;   // Same, with the evaluatePaletteNV5x palette.
+
+        void setIndices(int * idx);   // Packs the low 2 bits of idx[0..15] into indices.
+
+        void flip4();   // Reverses the order of the four rows.
+        void flip2();   // Exchanges rows 0 and 1 only.
+    };
+
+    /// Four color mode is in effect exactly when col1 compares below col0 (by their .u values).
+    inline bool BlockDXT1::isFourColorMode() const
+    {
+        return col1.u < col0.u;
+    }
+
+
+    /// DXT3 alpha block with explicit alpha: sixteen 4-bit values, one per texel.
+    struct AlphaBlockDXT3
+    {
+        union {
+            struct {
+                uint alpha0 : 4;   // alphaN is the 4-bit alpha that decodeBlock writes to texel N.
+                uint alpha1 : 4;
+                uint alpha2 : 4;
+                uint alpha3 : 4;
+                uint alpha4 : 4;
+                uint alpha5 : 4;
+                uint alpha6 : 4;
+                uint alpha7 : 4;
+                uint alpha8 : 4;
+                uint alpha9 : 4;
+                uint alphaA : 4;
+                uint alphaB : 4;
+                uint alphaC : 4;
+                uint alphaD : 4;
+                uint alphaE : 4;
+                uint alphaF : 4;
+            };
+            uint16 row[4];   // The same storage as four 16-bit rows; used by flip4()/flip2().
+        };
+
+        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;   // Writes only the .a channel.
+
+        void flip4();   // Reverses the order of the four rows.
+        void flip2();   // Exchanges rows 0 and 1 only.
+    };
+
+
+    /// DXT3 block: an explicit-alpha block followed by a DXT1 color block.
+    struct BlockDXT3
+    {
+        AlphaBlockDXT3 alpha;   // Per-texel 4-bit alpha.
+        BlockDXT1 color;        // Color part.
+
+        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;   // Color first, then alpha.
+        void decodeBlockNV5x(ColorBlock * block) const;   // Same, with the NV5x color decode.
+
+        void flip4();   // Flips both parts.
+        void flip2();   // Half flip of both parts.
+    };
+
+
+    /// DXT5 alpha block: two 8-bit endpoints followed by sixteen 3-bit palette selectors.
+    struct AlphaBlockDXT5
+    {
+        union {
+            struct {
+                uint64 alpha0 : 8;	// 8   (running total of bits declared so far)
+                uint64 alpha1 : 8;	// 16
+                uint64 bits0 : 3;	// 3 - 19   (selector bits so far - total bits so far)
+                uint64 bits1 : 3; 	// 6 - 22
+                uint64 bits2 : 3; 	// 9 - 25
+                uint64 bits3 : 3;	// 12 - 28
+                uint64 bits4 : 3;	// 15 - 31
+                uint64 bits5 : 3;	// 18 - 34
+                uint64 bits6 : 3;	// 21 - 37
+                uint64 bits7 : 3;	// 24 - 40
+                uint64 bits8 : 3;	// 27 - 43
+                uint64 bits9 : 3; 	// 30 - 46
+                uint64 bitsA : 3; 	// 33 - 49
+                uint64 bitsB : 3;	// 36 - 52
+                uint64 bitsC : 3;	// 39 - 55
+                uint64 bitsD : 3;	// 42 - 58
+                uint64 bitsE : 3;	// 45 - 61
+                uint64 bitsF : 3;	// 48 - 64
+            };
+            uint64 u;   // The same storage as one 64-bit value; index() and setIndex() use this view.
+        };
+
+        void evaluatePalette(uint8 alpha[8], bool d3d9) const;    // Picks the 8- or 6-value variant below.
+        void evaluatePalette8(uint8 alpha[8], bool d3d9) const;   // Endpoints plus six blended values.
+        void evaluatePalette6(uint8 alpha[8], bool d3d9) const;   // Endpoints, four blended values, 0x00, 0xFF.
+        void indices(uint8 index_array[16]) const;   // Copies bits0..bitsF into index_array.
+
+        uint index(uint index) const;            // Reads one 3-bit selector (index < 16).
+        void setIndex(uint index, uint value);   // Writes one 3-bit selector (index < 16, value < 8).
+
+        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;   // Writes only the .a channel.
+
+        void flip4();
+        void flip2();
+    };
+
+
+    /// DXT5 block: a DXT5 alpha block followed by a DXT1 color block.
+    struct BlockDXT5
+    {
+        AlphaBlockDXT5 alpha;   // Palette-based alpha part.
+        BlockDXT1 color;        // Color part.
+
+        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;   // Color first, then alpha.
+        void decodeBlockNV5x(ColorBlock * block) const;   // Same, with the NV5x color decode.
+
+        void flip4();   // Flips both parts.
+        void flip2();   // Half flip of both parts.
+    };
+
+    /// ATI1 block: a single channel stored as one DXT5-style alpha block.
+    struct BlockATI1
+    {
+        AlphaBlockDXT5 alpha;   // The one stored channel.
+
+        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;   // Value goes to r, g and b; a = 255.
+
+        void flip4();   // Forwards to alpha.flip4().
+        void flip2();   // Forwards to alpha.flip2().
+    };
+
+    /// ATI2 block: two channels, each stored as one DXT5-style alpha block.
+    struct BlockATI2
+    {
+        AlphaBlockDXT5 x;   // Decoded into the red channel.
+        AlphaBlockDXT5 y;   // Decoded into the green channel.
+
+        void decodeBlock(ColorBlock * block, bool d3d9 = false) const;   // Also sets b = 0 and a = 255.
+
+        void flip4();   // Flips both channels.
+        void flip2();   // Half flip of both channels.
+    };
+
+    /// CTX1 block: two endpoints of two 8-bit components each, then sixteen 2-bit selectors.
+    struct BlockCTX1
+    {
+        uint8 col0[2];   // First endpoint, two components.
+        uint8 col1[2];   // Second endpoint, two components.
+        union {
+            uint8 row[4];   // One byte per texel row, holding four 2-bit selectors.
+            uint indices;   // The same storage viewed as a single uint.
+        };
+
+        void evaluatePalette(Color32 color_array[4]) const;   // Fills the four palette entries.
+        void setIndices(int * idx);   // Packs the low 2 bits of idx[0..15] into indices.
+
+        void decodeBlock(ColorBlock * block) const;   // Writes all 16 texels of block.
+
+        void flip4();   // Reverses the order of the four rows.
+        void flip2();   // Exchanges rows 0 and 1 only.
+    };
+
+
+    // Serialization functions, one per block type; each returns the stream it was given so calls can be chained.
+    NVIMAGE_API Stream & operator<<(Stream & stream, BlockDXT1 & block);
+    NVIMAGE_API Stream & operator<<(Stream & stream, AlphaBlockDXT3 & block);
+    NVIMAGE_API Stream & operator<<(Stream & stream, BlockDXT3 & block);
+    NVIMAGE_API Stream & operator<<(Stream & stream, AlphaBlockDXT5 & block);
+    NVIMAGE_API Stream & operator<<(Stream & stream, BlockDXT5 & block);
+    NVIMAGE_API Stream & operator<<(Stream & stream, BlockATI1 & block);
+    NVIMAGE_API Stream & operator<<(Stream & stream, BlockATI2 & block);
+    NVIMAGE_API Stream & operator<<(Stream & stream, BlockCTX1 & block);
+
+} // nv namespace
+
+#endif // NV_IMAGE_BLOCKDXT_H
diff --git a/src/nvimage/ColorBlock.cpp b/src/nvimage/ColorBlock.cpp
index 2da7752..2087e85 100644
--- a/src/nvimage/ColorBlock.cpp
+++ b/src/nvimage/ColorBlock.cpp
@@ -1,635 +1,635 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#include "ColorBlock.h"
-#include "Image.h"
-#include "FloatImage.h"
-
-#include "nvmath/Box.h"
-#include "nvmath/Vector.inl"
-#include "nvcore/Utils.h" // swap
-
-#include <string.h> // memcpy
-
-using namespace nv;
-
-namespace {
-
-    // Get approximate luminance.
-    inline static uint colorLuminance(Color32 c)
-    {
-        return c.r + c.g + c.b;
-    }
-
-    // Get the euclidean distance between the given colors.
-    inline static uint colorDistance(Color32 c0, Color32 c1)
-    {
-        return (c0.r - c1.r) * (c0.r - c1.r) + (c0.g - c1.g) * (c0.g - c1.g) + (c0.b - c1.b) * (c0.b - c1.b);
-    }
-
-} // namespace`
-
-
-/// Default constructor.
-ColorBlock::ColorBlock()
-{
-}
-
-/// Init the color block from an array of colors.
-ColorBlock::ColorBlock(const uint * linearImage)
-{
-    for(uint i = 0; i < 16; i++) {
-        color(i) = Color32(linearImage[i]);
-    }
-}
-
-/// Init the color block with the contents of the given block.
-ColorBlock::ColorBlock(const ColorBlock & block)
-{
-    for(uint i = 0; i < 16; i++) {
-        color(i) = block.color(i);
-    }
-}
-
-
-/// Initialize this color block.
-ColorBlock::ColorBlock(const Image * img, uint x, uint y)
-{
-    init(img, x, y);
-}
-
-void ColorBlock::init(const Image * img, uint x, uint y)
-{
-    init(img->width(), img->height(), (const uint *)img->pixels(), x, y);
-}
-
-void ColorBlock::init(uint w, uint h, const uint * data, uint x, uint y)
-{
-    nvDebugCheck(data != NULL);
-
-    const uint bw = min(w - x, 4U);
-    const uint bh = min(h - y, 4U);
-    nvDebugCheck(bw != 0 && bh != 0);
-
-    // Blocks that are smaller than 4x4 are handled by repeating the pixels.
-    // @@ Thats only correct when block size is 1, 2 or 4, but not with 3. :(
-    // @@ Ideally we should zero the weights of the pixels out of range.
-
-    for (uint i = 0; i < 4; i++)
-    {
-        const int by = i % bh;
-
-        for (uint e = 0; e < 4; e++)
-        {
-            const int bx = e % bw;
-            const uint idx = (y + by) * w + x + bx;
-
-            color(e, i).u = data[idx];
-        }
-    }
-}
-
-void ColorBlock::init(uint w, uint h, const float * data, uint x, uint y)
-{
-    nvDebugCheck(data != NULL);
-
-    const uint bw = min(w - x, 4U);
-    const uint bh = min(h - y, 4U);
-    nvDebugCheck(bw != 0 && bh != 0);
-
-    // Blocks that are smaller than 4x4 are handled by repeating the pixels.
-    // @@ Thats only correct when block size is 1, 2 or 4, but not with 3. :(
-    // @@ Ideally we should zero the weights of the pixels out of range.
-
-    uint srcPlane = w * h;
-
-    for (uint i = 0; i < 4; i++)
-    {
-        const uint by = i % bh;
-
-        for (uint e = 0; e < 4; e++)
-        {
-            const uint bx = e % bw;
-            const uint idx = ((y + by) * w + x + bx);
-
-            Color32 & c = color(e, i);
-            c.r = uint8(255 * clamp(data[idx + 0 * srcPlane], 0.0f, 1.0f)); // @@ Is this the right way to quantize floats to bytes?
-            c.g = uint8(255 * clamp(data[idx + 1 * srcPlane], 0.0f, 1.0f));
-            c.b = uint8(255 * clamp(data[idx + 2 * srcPlane], 0.0f, 1.0f));
-            c.a = uint8(255 * clamp(data[idx + 3 * srcPlane], 0.0f, 1.0f));
-        }
-    }
-}
-
-static inline uint8 component(Color32 c, uint i)
-{
-    if (i == 0) return c.r;
-    if (i == 1) return c.g;
-    if (i == 2) return c.b;
-    if (i == 3) return c.a;
-    if (i == 4) return 0xFF;
-    return 0;
-}
-
-void ColorBlock::swizzle(uint x, uint y, uint z, uint w)
-{
-    for (int i = 0; i < 16; i++)
-    {
-        Color32 c = m_color[i];
-        m_color[i].r = component(c, x);
-        m_color[i].g = component(c, y);
-        m_color[i].b = component(c, z);
-        m_color[i].a = component(c, w);
-    }
-}
-
-
-/// Returns true if the block has a single color.
-bool ColorBlock::isSingleColor(Color32 mask/*= Color32(0xFF, 0xFF, 0xFF, 0x00)*/) const
-{
-    uint u = m_color[0].u & mask.u;
-
-    for (int i = 1; i < 16; i++)
-    {
-        if (u != (m_color[i].u & mask.u))
-        {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-/*
-/// Returns true if the block has a single color, ignoring transparent pixels.
-bool ColorBlock::isSingleColorNoAlpha() const
-{
-    Color32 c;
-    int i;
-    for(i = 0; i < 16; i++)
-    {
-        if (m_color[i].a != 0) c = m_color[i];
-    }
-
-    Color32 mask(0xFF, 0xFF, 0xFF, 0x00);
-    uint u = c.u & mask.u;
-
-    for(; i < 16; i++)
-    {
-        if (u != (m_color[i].u & mask.u))
-        {
-            return false;
-        }
-    }
-
-    return true;
-}
-*/
-
-/// Count number of unique colors in this color block.
-/*uint ColorBlock::countUniqueColors() const
-{
-    uint count = 0;
-
-    // @@ This does not have to be o(n^2)
-    for(int i = 0; i < 16; i++)
-    {
-        bool unique = true;
-        for(int j = 0; j < i; j++) {
-            if( m_color[i] != m_color[j] ) {
-                unique = false;
-            }
-        }
-
-        if( unique ) {
-            count++;
-        }
-    }
-
-    return count;
-}*/
-
-/*/// Get average color of the block.
-Color32 ColorBlock::averageColor() const
-{
-    uint r, g, b, a;
-    r = g = b = a = 0;
-
-    for(uint i = 0; i < 16; i++) {
-        r += m_color[i].r;
-        g += m_color[i].g;
-        b += m_color[i].b;
-        a += m_color[i].a;
-    }
-
-    return Color32(uint8(r / 16), uint8(g / 16), uint8(b / 16), uint8(a / 16));
-}*/
-
-/// Return true if the block is not fully opaque.
-bool ColorBlock::hasAlpha() const
-{
-    for (uint i = 0; i < 16; i++)
-    {
-        if (m_color[i].a != 255) return true;
-    }
-    return false;
-}
-
-#if 0
-
-/// Get diameter color range.
-void ColorBlock::diameterRange(Color32 * start, Color32 * end) const
-{
-    nvDebugCheck(start != NULL);
-    nvDebugCheck(end != NULL);
-
-    Color32 c0, c1;
-    uint best_dist = 0;
-
-    for(int i = 0; i < 16; i++) {
-        for (int j = i+1; j < 16; j++) {
-            uint dist = colorDistance(m_color[i], m_color[j]);
-            if( dist > best_dist ) {
-                best_dist = dist;
-                c0 = m_color[i];
-                c1 = m_color[j];
-            }
-        }
-    }
-
-    *start = c0;
-    *end = c1;
-}
-
-/// Get luminance color range.
-void ColorBlock::luminanceRange(Color32 * start, Color32 * end) const
-{
-    nvDebugCheck(start != NULL);
-    nvDebugCheck(end != NULL);
-
-    Color32 minColor, maxColor;
-    uint minLuminance, maxLuminance;
-
-    maxLuminance = minLuminance = colorLuminance(m_color[0]);
-
-    for(uint i = 1; i < 16; i++)
-    {
-        uint luminance = colorLuminance(m_color[i]);
-
-        if (luminance > maxLuminance) {
-            maxLuminance = luminance;
-            maxColor = m_color[i];
-        }
-        else if (luminance < minLuminance) {
-            minLuminance = luminance;
-            minColor = m_color[i];
-        }
-    }
-
-    *start = minColor;
-    *end = maxColor;
-}
-
-/// Get color range based on the bounding box. 
-void ColorBlock::boundsRange(Color32 * start, Color32 * end) const
-{
-    nvDebugCheck(start != NULL);
-    nvDebugCheck(end != NULL);
-
-    Color32 minColor(255, 255, 255);
-    Color32 maxColor(0, 0, 0);
-
-    for(uint i = 0; i < 16; i++)
-    {
-        if (m_color[i].r < minColor.r) { minColor.r = m_color[i].r; }
-        if (m_color[i].g < minColor.g) { minColor.g = m_color[i].g; }
-        if (m_color[i].b < minColor.b) { minColor.b = m_color[i].b; }
-        if (m_color[i].r > maxColor.r) { maxColor.r = m_color[i].r; }
-        if (m_color[i].g > maxColor.g) { maxColor.g = m_color[i].g; }
-        if (m_color[i].b > maxColor.b) { maxColor.b = m_color[i].b; }
-    }
-
-    // Offset range by 1/16 of the extents
-    Color32 inset;
-    inset.r = (maxColor.r - minColor.r) >> 4;
-    inset.g = (maxColor.g - minColor.g) >> 4;
-    inset.b = (maxColor.b - minColor.b) >> 4;
-
-    minColor.r = (minColor.r + inset.r <= 255) ? minColor.r + inset.r : 255;
-    minColor.g = (minColor.g + inset.g <= 255) ? minColor.g + inset.g : 255;
-    minColor.b = (minColor.b + inset.b <= 255) ? minColor.b + inset.b : 255;
-
-    maxColor.r = (maxColor.r >= inset.r) ? maxColor.r - inset.r : 0;
-    maxColor.g = (maxColor.g >= inset.g) ? maxColor.g - inset.g : 0;
-    maxColor.b = (maxColor.b >= inset.b) ? maxColor.b - inset.b : 0;
-
-    *start = minColor;
-    *end = maxColor;
-}
-
-/// Get color range based on the bounding box. 
-void ColorBlock::boundsRangeAlpha(Color32 * start, Color32 * end) const
-{
-    nvDebugCheck(start != NULL);
-    nvDebugCheck(end != NULL);
-
-    Color32 minColor(255, 255, 255, 255);
-    Color32 maxColor(0, 0, 0, 0);
-
-    for(uint i = 0; i < 16; i++)
-    {
-        if (m_color[i].r < minColor.r) { minColor.r = m_color[i].r; }
-        if (m_color[i].g < minColor.g) { minColor.g = m_color[i].g; }
-        if (m_color[i].b < minColor.b) { minColor.b = m_color[i].b; }
-        if (m_color[i].a < minColor.a) { minColor.a = m_color[i].a; }
-        if (m_color[i].r > maxColor.r) { maxColor.r = m_color[i].r; }
-        if (m_color[i].g > maxColor.g) { maxColor.g = m_color[i].g; }
-        if (m_color[i].b > maxColor.b) { maxColor.b = m_color[i].b; }
-        if (m_color[i].a > maxColor.a) { maxColor.a = m_color[i].a; }
-    }
-
-    // Offset range by 1/16 of the extents
-    Color32 inset;
-    inset.r = (maxColor.r - minColor.r) >> 4;
-    inset.g = (maxColor.g - minColor.g) >> 4;
-    inset.b = (maxColor.b - minColor.b) >> 4;
-    inset.a = (maxColor.a - minColor.a) >> 4;
-
-    minColor.r = (minColor.r + inset.r <= 255) ? minColor.r + inset.r : 255;
-    minColor.g = (minColor.g + inset.g <= 255) ? minColor.g + inset.g : 255;
-    minColor.b = (minColor.b + inset.b <= 255) ? minColor.b + inset.b : 255;
-    minColor.a = (minColor.a + inset.a <= 255) ? minColor.a + inset.a : 255;
-
-    maxColor.r = (maxColor.r >= inset.r) ? maxColor.r - inset.r : 0;
-    maxColor.g = (maxColor.g >= inset.g) ? maxColor.g - inset.g : 0;
-    maxColor.b = (maxColor.b >= inset.b) ? maxColor.b - inset.b : 0;
-    maxColor.a = (maxColor.a >= inset.a) ? maxColor.a - inset.a : 0;
-
-    *start = minColor;
-    *end = maxColor;
-}
-#endif
-
-/*/// Sort colors by abosolute value in their 16 bit representation.
-void ColorBlock::sortColorsByAbsoluteValue()
-{
-    // Dummy selection sort.
-    for( uint a = 0; a < 16; a++ ) {
-        uint max = a;
-        Color16 cmax(m_color[a]);
-
-        for( uint b = a+1; b < 16; b++ ) {
-            Color16 cb(m_color[b]);
-
-            if( cb.u > cmax.u ) {
-                max = b;
-                cmax = cb;
-            }
-        }
-        swap( m_color[a], m_color[max] );
-    }
-}*/
-
-
-/*/// Find extreme colors in the given axis.
-void ColorBlock::computeRange(Vector3::Arg axis, Color32 * start, Color32 * end) const
-{
-    nvDebugCheck(start != NULL);
-    nvDebugCheck(end != NULL);
-
-    int mini, maxi;
-    mini = maxi = 0;
-
-    float min, max;	
-    min = max = dot(Vector3(m_color[0].r, m_color[0].g, m_color[0].b), axis);
-
-    for(uint i = 1; i < 16; i++)
-    {
-        const Vector3 vec(m_color[i].r, m_color[i].g, m_color[i].b);
-
-        float val = dot(vec, axis);
-        if( val < min ) {
-            mini = i;
-            min = val;
-        }
-        else if( val > max ) {
-            maxi = i;
-            max = val;
-        }
-    }
-
-    *start = m_color[mini];
-    *end = m_color[maxi];
-}*/
-
-
-/*/// Sort colors in the given axis.
-void ColorBlock::sortColors(const Vector3 & axis)
-{
-    float luma_array[16];
-
-    for(uint i = 0; i < 16; i++) {
-        const Vector3 vec(m_color[i].r, m_color[i].g, m_color[i].b);
-        luma_array[i] = dot(vec, axis);
-    }
-
-    // Dummy selection sort.
-    for( uint a = 0; a < 16; a++ ) {
-        uint min = a;
-        for( uint b = a+1; b < 16; b++ ) {
-            if( luma_array[b] < luma_array[min] ) {
-                min = b;
-            }
-        }
-        swap( luma_array[a], luma_array[min] );
-        swap( m_color[a], m_color[min] );
-    }
-}*/
-
-
-/*/// Get the volume of the color block.
-float ColorBlock::volume() const
-{
-    Box bounds;
-    bounds.clearBounds();
-
-    for(int i = 0; i < 16; i++) {
-        const Vector3 point(m_color[i].r, m_color[i].g, m_color[i].b);
-        bounds.addPointToBounds(point);
-    }
-
-    return bounds.volume();
-}*/
-
-
-void ColorSet::setColors(const float * data, uint img_w, uint img_h, uint img_x, uint img_y)
-{
-    nvDebugCheck(img_x < img_w && img_y < img_h);
-
-    w = min(4U, img_w - img_x);
-    h = min(4U, img_h - img_y);
-    nvDebugCheck(w != 0 && h != 0);
-
-    count = w * h;
-
-    const float * r = data + img_w * img_h * 0;
-    const float * g = data + img_w * img_h * 1;
-    const float * b = data + img_w * img_h * 2;
-    const float * a = data + img_w * img_h * 3;
-
-    // Set colors.
-    for (uint y = 0, i = 0; y < h; y++)
-    {
-        for (uint x = 0; x < w; x++, i++)
-        {
-            uint idx = x + img_x + (y + img_y) * img_w;
-            colors[i].x = r[idx];
-            colors[i].y = g[idx];
-            colors[i].z = b[idx];
-            colors[i].w = a[idx];
-        }
-    }
-}
-
-void ColorSet::setAlphaWeights()
-{
-    for (uint i = 0; i < count; i++)
-    {
-        weights[i] = max(colors[i].w, 0.001f); // Avoid division by zero.
-    }
-}
-
-void ColorSet::setUniformWeights()
-{
-    for (uint i = 0; i < count; i++)
-    {
-        weights[i] = 1.0f;
-    }
-}
-
-
-void ColorSet::createMinimalSet(bool ignoreTransparent)
-{
-    nvDebugCheck(count == w*h); // Do not call this method multiple times.
-
-    Vector4 C[16];
-    float W[16];
-    memcpy(C, colors, sizeof(Vector4)*count);
-    memcpy(W, weights, sizeof(float)*count);
-
-    uint n = 0;
-    for (uint y = 0, i = 0; y < h; y++)
-    {
-        for (uint x = 0; x < w; x++, i++)
-        {
-            if (ignoreTransparent && C[i].w == 0) {
-                continue;
-            }
-
-            uint idx = y * 4 + x;
-
-            // loop over previous points for a match
-            for (int j = 0; ; j++)
-            {
-                // allocate a new point
-                if (j == i)
-                {
-                    colors[n] = C[i];
-                    weights[n] = W[i];
-                    remap[idx] = n;
-                    n++;
-                    break;
-                }
-
-                // check for a match
-                bool colorMatch = (C[i].x == C[j].x) && (C[i].w == C[j].w) && (C[i].z == C[j].z);
-                //bool alphaMatch = (C[i].w == C[j].w);
-
-                if (colorMatch)
-                {
-                    // get the index of the match
-                    int index = remap[j];
-
-                    // map to this point and increase the weight
-                    weights[index] += W[i];
-                    remap[idx] = index;
-                    break;
-                }
-            }
-        }
-    }
-
-    count = n;
-
-    // Avoid empty blocks.
-    if (count == 0) {
-        count = 1;
-        //colors[0] = C[0];
-        //weights[0] = W[0];
-        memset(remap, 0, sizeof(int)*16);
-    }
-}
-
-
-// Fill blocks that are smaller than (4,4) by wrapping indices.
-void ColorSet::wrapIndices()
-{
-    for (uint y = h; y < 4; y++)
-    {
-        uint base = (y % h) * w;
-        for (uint x = w; x < 4; x++)
-        {
-            remap[y*4+3] = remap[base + (x % w)];
-        }
-    }
-}
-
-bool ColorSet::isSingleColor(bool ignoreAlpha) const
-{
-    Vector4 v = colors[0];
-    if (ignoreAlpha) v.w = 1.0f;
-
-    for (uint i = 1; i < count; i++)
-    {
-        Vector4 c = colors[i];
-        if (ignoreAlpha) c.w = 1.0f;
-
-        if (v != c) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-
-// 0=r, 1=g, 2=b, 3=a, 4=0xFF, 5=0
-static inline float component(Vector4::Arg c, uint i)
-{
-    if (i == 0) return c.x;
-    if (i == 1) return c.y;
-    if (i == 2) return c.z;
-    if (i == 3) return c.w;
-    if (i == 4) return 0xFF;
-    return 0;
-}
-
-void ColorSet::swizzle(uint x, uint y, uint z, uint w)
-{
-    for (uint i = 0; i < count; i++)
-    {
-        Vector4 c = colors[i];
-        colors[i].x = component(c, x);
-        colors[i].y = component(c, y);
-        colors[i].z = component(c, z);
-        colors[i].w = component(c, w);
-    }
-}
-
-bool ColorSet::hasAlpha() const
-{
-    for (uint i = 0; i < count; i++)
-    {
-        if (colors[i].w != 0.0f) return true;
-    }
-    return false;
-}
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "ColorBlock.h"
+#include "Image.h"
+#include "FloatImage.h"
+
+#include "nvmath/Box.h"
+#include "nvmath/Vector.inl"
+#include "nvcore/Utils.h" // swap
+
+#include <string.h> // memcpy
+
+using namespace nv;
+
+namespace {
+
+    // Get approximate luminance.
+    inline static uint colorLuminance(Color32 c)
+    {
+        return c.r + c.g + c.b;
+    }
+
+    // Get the euclidean distance between the given colors.
+    inline static uint colorDistance(Color32 c0, Color32 c1)
+    {
+        return (c0.r - c1.r) * (c0.r - c1.r) + (c0.g - c1.g) * (c0.g - c1.g) + (c0.b - c1.b) * (c0.b - c1.b);
+    }
+
+} // namespace
+
+
+/// Default constructor.
+ColorBlock::ColorBlock()
+{
+}
+
+/// Init the color block from an array of colors.
+ColorBlock::ColorBlock(const uint * linearImage)
+{
+    for(uint i = 0; i < 16; i++) {
+        color(i) = Color32(linearImage[i]);
+    }
+}
+
+/// Init the color block with the contents of the given block.
+ColorBlock::ColorBlock(const ColorBlock & block)
+{
+    for(uint i = 0; i < 16; i++) {
+        color(i) = block.color(i);
+    }
+}
+
+
+/// Initialize this color block.
+ColorBlock::ColorBlock(const Image * img, uint x, uint y)
+{
+    init(img, x, y);
+}
+
+void ColorBlock::init(const Image * img, uint x, uint y)
+{
+    init(img->width(), img->height(), (const uint *)img->pixels(), x, y);
+}
+
+void ColorBlock::init(uint w, uint h, const uint * data, uint x, uint y)
+{
+    nvDebugCheck(data != NULL);
+
+    const uint bw = min(w - x, 4U);
+    const uint bh = min(h - y, 4U);
+    nvDebugCheck(bw != 0 && bh != 0);
+
+    // Blocks that are smaller than 4x4 are handled by repeating the pixels.
+    // @@ That's only correct when block size is 1, 2 or 4, but not with 3. :(
+    // @@ Ideally we should zero the weights of the pixels out of range.
+
+    for (uint i = 0; i < 4; i++)
+    {
+        const int by = i % bh;
+
+        for (uint e = 0; e < 4; e++)
+        {
+            const int bx = e % bw;
+            const uint idx = (y + by) * w + x + bx;
+
+            color(e, i).u = data[idx];
+        }
+    }
+}
+
+void ColorBlock::init(uint w, uint h, const float * data, uint x, uint y)
+{
+    nvDebugCheck(data != NULL);
+
+    const uint bw = min(w - x, 4U);
+    const uint bh = min(h - y, 4U);
+    nvDebugCheck(bw != 0 && bh != 0);
+
+    // Blocks that are smaller than 4x4 are handled by repeating the pixels.
+    // @@ That's only correct when block size is 1, 2 or 4, but not with 3. :(
+    // @@ Ideally we should zero the weights of the pixels out of range.
+
+    uint srcPlane = w * h;
+
+    for (uint i = 0; i < 4; i++)
+    {
+        const uint by = i % bh;
+
+        for (uint e = 0; e < 4; e++)
+        {
+            const uint bx = e % bw;
+            const uint idx = ((y + by) * w + x + bx);
+
+            Color32 & c = color(e, i);
+            c.r = uint8(255 * clamp(data[idx + 0 * srcPlane], 0.0f, 1.0f)); // @@ Is this the right way to quantize floats to bytes?
+            c.g = uint8(255 * clamp(data[idx + 1 * srcPlane], 0.0f, 1.0f));
+            c.b = uint8(255 * clamp(data[idx + 2 * srcPlane], 0.0f, 1.0f));
+            c.a = uint8(255 * clamp(data[idx + 3 * srcPlane], 0.0f, 1.0f));
+        }
+    }
+}
+
+static inline uint8 component(Color32 c, uint i)
+{
+    if (i == 0) return c.r;
+    if (i == 1) return c.g;
+    if (i == 2) return c.b;
+    if (i == 3) return c.a;
+    if (i == 4) return 0xFF;
+    return 0;
+}
+
+void ColorBlock::swizzle(uint x, uint y, uint z, uint w)
+{
+    for (int i = 0; i < 16; i++)
+    {
+        Color32 c = m_color[i];
+        m_color[i].r = component(c, x);
+        m_color[i].g = component(c, y);
+        m_color[i].b = component(c, z);
+        m_color[i].a = component(c, w);
+    }
+}
+
+
+/// Returns true if the block has a single color.
+bool ColorBlock::isSingleColor(Color32 mask/*= Color32(0xFF, 0xFF, 0xFF, 0x00)*/) const
+{
+    uint u = m_color[0].u & mask.u;
+
+    for (int i = 1; i < 16; i++)
+    {
+        if (u != (m_color[i].u & mask.u))
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/*
+/// Returns true if the block has a single color, ignoring transparent pixels.
+bool ColorBlock::isSingleColorNoAlpha() const
+{
+    Color32 c;
+    int i;
+    for(i = 0; i < 16; i++)
+    {
+        if (m_color[i].a != 0) c = m_color[i];
+    }
+
+    Color32 mask(0xFF, 0xFF, 0xFF, 0x00);
+    uint u = c.u & mask.u;
+
+    for(; i < 16; i++)
+    {
+        if (u != (m_color[i].u & mask.u))
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+*/
+
+/// Count number of unique colors in this color block.
+/*uint ColorBlock::countUniqueColors() const
+{
+    uint count = 0;
+
+    // @@ This does not have to be o(n^2)
+    for(int i = 0; i < 16; i++)
+    {
+        bool unique = true;
+        for(int j = 0; j < i; j++) {
+            if( m_color[i] != m_color[j] ) {
+                unique = false;
+            }
+        }
+
+        if( unique ) {
+            count++;
+        }
+    }
+
+    return count;
+}*/
+
+/*/// Get average color of the block.
+Color32 ColorBlock::averageColor() const
+{
+    uint r, g, b, a;
+    r = g = b = a = 0;
+
+    for(uint i = 0; i < 16; i++) {
+        r += m_color[i].r;
+        g += m_color[i].g;
+        b += m_color[i].b;
+        a += m_color[i].a;
+    }
+
+    return Color32(uint8(r / 16), uint8(g / 16), uint8(b / 16), uint8(a / 16));
+}*/
+
+/// Return true if the block is not fully opaque.
+bool ColorBlock::hasAlpha() const
+{
+    for (uint i = 0; i < 16; i++)
+    {
+        if (m_color[i].a != 255) return true;
+    }
+    return false;
+}
+
+#if 0
+
+/// Get diameter color range.
+void ColorBlock::diameterRange(Color32 * start, Color32 * end) const
+{
+    nvDebugCheck(start != NULL);
+    nvDebugCheck(end != NULL);
+
+    Color32 c0, c1;
+    uint best_dist = 0;
+
+    for(int i = 0; i < 16; i++) {
+        for (int j = i+1; j < 16; j++) {
+            uint dist = colorDistance(m_color[i], m_color[j]);
+            if( dist > best_dist ) {
+                best_dist = dist;
+                c0 = m_color[i];
+                c1 = m_color[j];
+            }
+        }
+    }
+
+    *start = c0;
+    *end = c1;
+}
+
+/// Get luminance color range.
+void ColorBlock::luminanceRange(Color32 * start, Color32 * end) const
+{
+    nvDebugCheck(start != NULL);
+    nvDebugCheck(end != NULL);
+
+    Color32 minColor, maxColor;
+    uint minLuminance, maxLuminance;
+
+    maxLuminance = minLuminance = colorLuminance(m_color[0]);
+
+    for(uint i = 1; i < 16; i++)
+    {
+        uint luminance = colorLuminance(m_color[i]);
+
+        if (luminance > maxLuminance) {
+            maxLuminance = luminance;
+            maxColor = m_color[i];
+        }
+        else if (luminance < minLuminance) {
+            minLuminance = luminance;
+            minColor = m_color[i];
+        }
+    }
+
+    *start = minColor;
+    *end = maxColor;
+}
+
+/// Get color range based on the bounding box. 
+void ColorBlock::boundsRange(Color32 * start, Color32 * end) const
+{
+    nvDebugCheck(start != NULL);
+    nvDebugCheck(end != NULL);
+
+    Color32 minColor(255, 255, 255);
+    Color32 maxColor(0, 0, 0);
+
+    for(uint i = 0; i < 16; i++)
+    {
+        if (m_color[i].r < minColor.r) { minColor.r = m_color[i].r; }
+        if (m_color[i].g < minColor.g) { minColor.g = m_color[i].g; }
+        if (m_color[i].b < minColor.b) { minColor.b = m_color[i].b; }
+        if (m_color[i].r > maxColor.r) { maxColor.r = m_color[i].r; }
+        if (m_color[i].g > maxColor.g) { maxColor.g = m_color[i].g; }
+        if (m_color[i].b > maxColor.b) { maxColor.b = m_color[i].b; }
+    }
+
+    // Offset range by 1/16 of the extents
+    Color32 inset;
+    inset.r = (maxColor.r - minColor.r) >> 4;
+    inset.g = (maxColor.g - minColor.g) >> 4;
+    inset.b = (maxColor.b - minColor.b) >> 4;
+
+    minColor.r = (minColor.r + inset.r <= 255) ? minColor.r + inset.r : 255;
+    minColor.g = (minColor.g + inset.g <= 255) ? minColor.g + inset.g : 255;
+    minColor.b = (minColor.b + inset.b <= 255) ? minColor.b + inset.b : 255;
+
+    maxColor.r = (maxColor.r >= inset.r) ? maxColor.r - inset.r : 0;
+    maxColor.g = (maxColor.g >= inset.g) ? maxColor.g - inset.g : 0;
+    maxColor.b = (maxColor.b >= inset.b) ? maxColor.b - inset.b : 0;
+
+    *start = minColor;
+    *end = maxColor;
+}
+
+/// Get color range based on the bounding box. 
+void ColorBlock::boundsRangeAlpha(Color32 * start, Color32 * end) const
+{
+    nvDebugCheck(start != NULL);
+    nvDebugCheck(end != NULL);
+
+    Color32 minColor(255, 255, 255, 255);
+    Color32 maxColor(0, 0, 0, 0);
+
+    for(uint i = 0; i < 16; i++)
+    {
+        if (m_color[i].r < minColor.r) { minColor.r = m_color[i].r; }
+        if (m_color[i].g < minColor.g) { minColor.g = m_color[i].g; }
+        if (m_color[i].b < minColor.b) { minColor.b = m_color[i].b; }
+        if (m_color[i].a < minColor.a) { minColor.a = m_color[i].a; }
+        if (m_color[i].r > maxColor.r) { maxColor.r = m_color[i].r; }
+        if (m_color[i].g > maxColor.g) { maxColor.g = m_color[i].g; }
+        if (m_color[i].b > maxColor.b) { maxColor.b = m_color[i].b; }
+        if (m_color[i].a > maxColor.a) { maxColor.a = m_color[i].a; }
+    }
+
+    // Offset range by 1/16 of the extents
+    Color32 inset;
+    inset.r = (maxColor.r - minColor.r) >> 4;
+    inset.g = (maxColor.g - minColor.g) >> 4;
+    inset.b = (maxColor.b - minColor.b) >> 4;
+    inset.a = (maxColor.a - minColor.a) >> 4;
+
+    minColor.r = (minColor.r + inset.r <= 255) ? minColor.r + inset.r : 255;
+    minColor.g = (minColor.g + inset.g <= 255) ? minColor.g + inset.g : 255;
+    minColor.b = (minColor.b + inset.b <= 255) ? minColor.b + inset.b : 255;
+    minColor.a = (minColor.a + inset.a <= 255) ? minColor.a + inset.a : 255;
+
+    maxColor.r = (maxColor.r >= inset.r) ? maxColor.r - inset.r : 0;
+    maxColor.g = (maxColor.g >= inset.g) ? maxColor.g - inset.g : 0;
+    maxColor.b = (maxColor.b >= inset.b) ? maxColor.b - inset.b : 0;
+    maxColor.a = (maxColor.a >= inset.a) ? maxColor.a - inset.a : 0;
+
+    *start = minColor;
+    *end = maxColor;
+}
+#endif
+
+/*/// Sort colors by absolute value in their 16 bit representation.
+void ColorBlock::sortColorsByAbsoluteValue()
+{
+    // Dummy selection sort.
+    for( uint a = 0; a < 16; a++ ) {
+        uint max = a;
+        Color16 cmax(m_color[a]);
+
+        for( uint b = a+1; b < 16; b++ ) {
+            Color16 cb(m_color[b]);
+
+            if( cb.u > cmax.u ) {
+                max = b;
+                cmax = cb;
+            }
+        }
+        swap( m_color[a], m_color[max] );
+    }
+}*/
+
+
+/*/// Find extreme colors in the given axis.
+void ColorBlock::computeRange(Vector3::Arg axis, Color32 * start, Color32 * end) const
+{
+    nvDebugCheck(start != NULL);
+    nvDebugCheck(end != NULL);
+
+    int mini, maxi;
+    mini = maxi = 0;
+
+    float min, max;	
+    min = max = dot(Vector3(m_color[0].r, m_color[0].g, m_color[0].b), axis);
+
+    for(uint i = 1; i < 16; i++)
+    {
+        const Vector3 vec(m_color[i].r, m_color[i].g, m_color[i].b);
+
+        float val = dot(vec, axis);
+        if( val < min ) {
+            mini = i;
+            min = val;
+        }
+        else if( val > max ) {
+            maxi = i;
+            max = val;
+        }
+    }
+
+    *start = m_color[mini];
+    *end = m_color[maxi];
+}*/
+
+
+/*/// Sort colors in the given axis.
+void ColorBlock::sortColors(const Vector3 & axis)
+{
+    float luma_array[16];
+
+    for(uint i = 0; i < 16; i++) {
+        const Vector3 vec(m_color[i].r, m_color[i].g, m_color[i].b);
+        luma_array[i] = dot(vec, axis);
+    }
+
+    // Dummy selection sort.
+    for( uint a = 0; a < 16; a++ ) {
+        uint min = a;
+        for( uint b = a+1; b < 16; b++ ) {
+            if( luma_array[b] < luma_array[min] ) {
+                min = b;
+            }
+        }
+        swap( luma_array[a], luma_array[min] );
+        swap( m_color[a], m_color[min] );
+    }
+}*/
+
+
+/*/// Get the volume of the color block.
+float ColorBlock::volume() const
+{
+    Box bounds;
+    bounds.clearBounds();
+
+    for(int i = 0; i < 16; i++) {
+        const Vector3 point(m_color[i].r, m_color[i].g, m_color[i].b);
+        bounds.addPointToBounds(point);
+    }
+
+    return bounds.volume();
+}*/
+
+
+void ColorSet::setColors(const float * data, uint img_w, uint img_h, uint img_x, uint img_y)
+{
+    nvDebugCheck(img_x < img_w && img_y < img_h);
+
+    w = min(4U, img_w - img_x);
+    h = min(4U, img_h - img_y);
+    nvDebugCheck(w != 0 && h != 0);
+
+    count = w * h;
+
+    const float * r = data + img_w * img_h * 0;
+    const float * g = data + img_w * img_h * 1;
+    const float * b = data + img_w * img_h * 2;
+    const float * a = data + img_w * img_h * 3;
+
+    // Set colors.
+    for (uint y = 0, i = 0; y < h; y++)
+    {
+        for (uint x = 0; x < w; x++, i++)
+        {
+            uint idx = x + img_x + (y + img_y) * img_w;
+            colors[i].x = r[idx];
+            colors[i].y = g[idx];
+            colors[i].z = b[idx];
+            colors[i].w = a[idx];
+        }
+    }
+}
+
+void ColorSet::setAlphaWeights()
+{
+    for (uint i = 0; i < count; i++)
+    {
+        weights[i] = max(colors[i].w, 0.001f); // Avoid division by zero.
+    }
+}
+
+void ColorSet::setUniformWeights()
+{
+    for (uint i = 0; i < count; i++)
+    {
+        weights[i] = 1.0f;
+    }
+}
+
+
+// Collapse duplicate colors so 'colors'/'weights' hold only unique entries and
+// 'remap' (4x4, row stride 4) maps each block pixel to its unique entry.
+void ColorSet::createMinimalSet(bool ignoreTransparent)
+{
+    nvDebugCheck(count == w*h); // Do not call this method multiple times.
+
+    // Work from a copy, because 'colors' and 'weights' are rewritten in place.
+    Vector4 C[16];
+    float W[16];
+    memcpy(C, colors, sizeof(Vector4)*count);
+    memcpy(W, weights, sizeof(float)*count);
+
+    uint n = 0;
+    for (uint y = 0, i = 0; y < h; y++)
+    {
+        for (uint x = 0; x < w; x++, i++)
+        {
+            // Transparent pixels are skipped; their remap entries are not written here.
+            if (ignoreTransparent && C[i].w == 0) {
+                continue;
+            }
+
+            uint idx = y * 4 + x;
+
+            // loop over previous points for a match
+            for (uint j = 0; ; j++)
+            {
+                // allocate a new point
+                if (j == i)
+                {
+                    colors[n] = C[i];
+                    weights[n] = W[i];
+                    remap[idx] = n;
+                    n++;
+                    break;
+                }
+
+                // check for a match on all four channels (green included)
+                bool colorMatch = (C[i].x == C[j].x) && (C[i].y == C[j].y) && (C[i].z == C[j].z) && (C[i].w == C[j].w);
+                if (colorMatch)
+                {
+                    // C is packed with stride w, remap with stride 4: convert j before the lookup.
+                    int index = remap[(j / w) * 4 + (j % w)];
+
+                    // map to this point and increase the weight
+                    weights[index] += W[i];
+                    remap[idx] = index;
+                    break;
+                }
+            }
+        }
+    }
+
+    count = n;
+
+    // Avoid empty blocks: keep entry 0 as it was and point every pixel at it.
+    if (count == 0) {
+        count = 1;
+        memset(remap, 0, sizeof(int)*16);
+    }
+}
+
+
+// Fill blocks that are smaller than (4,4) by wrapping indices (remap has a row stride of 4).
+void ColorSet::wrapIndices()
+{
+    for (uint y = 0; y < 4; y++)
+    {
+        for (uint x = 0; x < 4; x++)
+        {
+            // Only cells outside the valid w x h area are filled, from the wrapped source cell.
+            if (x >= w || y >= h) remap[y*4+x] = remap[(y % h)*4 + (x % w)];
+        }
+    }
+}
+
+bool ColorSet::isSingleColor(bool ignoreAlpha) const
+{
+    Vector4 v = colors[0];
+    if (ignoreAlpha) v.w = 1.0f;
+
+    for (uint i = 1; i < count; i++)
+    {
+        Vector4 c = colors[i];
+        if (ignoreAlpha) c.w = 1.0f;
+
+        if (v != c) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+
+// 0=r, 1=g, 2=b, 3=a, 4=0xFF, 5=0
+static inline float component(Vector4::Arg c, uint i)
+{
+    if (i == 0) return c.x;
+    if (i == 1) return c.y;
+    if (i == 2) return c.z;
+    if (i == 3) return c.w;
+    if (i == 4) return 0xFF;
+    return 0;
+}
+
+void ColorSet::swizzle(uint x, uint y, uint z, uint w)
+{
+    for (uint i = 0; i < count; i++)
+    {
+        Vector4 c = colors[i];
+        colors[i].x = component(c, x);
+        colors[i].y = component(c, y);
+        colors[i].z = component(c, z);
+        colors[i].w = component(c, w);
+    }
+}
+
+bool ColorSet::hasAlpha() const
+{
+    for (uint i = 0; i < count; i++)
+    {
+        if (colors[i].w != 0.0f) return true;
+    }
+    return false;
+}
diff --git a/src/nvimage/ColorBlock.h b/src/nvimage/ColorBlock.h
index 054bb61..ea0aaec 100644
--- a/src/nvimage/ColorBlock.h
+++ b/src/nvimage/ColorBlock.h
@@ -1,117 +1,117 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#pragma once
-#ifndef NV_IMAGE_COLORBLOCK_H
-#define NV_IMAGE_COLORBLOCK_H
-
-#include "nvmath/Color.h"
-#include "nvmath/Vector.h"
-
-namespace nv
-{
-    class Image;
-    class FloatImage;
-
-    /// Uncompressed 4x4 color block.
-    struct ColorBlock
-    {
-        ColorBlock();
-        ColorBlock(const uint * linearImage);
-        ColorBlock(const ColorBlock & block);
-        ColorBlock(const Image * img, uint x, uint y);
-
-        void init(const Image * img, uint x, uint y);
-        void init(uint w, uint h, const uint * data, uint x, uint y);
-        void init(uint w, uint h, const float * data, uint x, uint y);
-
-        void swizzle(uint x, uint y, uint z, uint w); // 0=r, 1=g, 2=b, 3=a, 4=0xFF, 5=0
-
-        bool isSingleColor(Color32 mask = Color32(0xFF, 0xFF, 0xFF, 0x00)) const;
-        bool hasAlpha() const;
-
-
-        // Accessors
-        const Color32 * colors() const;
-
-        Color32 color(uint i) const;
-        Color32 & color(uint i);
-
-        Color32 color(uint x, uint y) const;
-        Color32 & color(uint x, uint y);
-
-    private:
-
-        Color32 m_color[4*4];
-
-    };
-
-
-    /// Get pointer to block colors.
-    inline const Color32 * ColorBlock::colors() const
-    {
-        return m_color;
-    }
-
-    /// Get block color.
-    inline Color32 ColorBlock::color(uint i) const
-    {
-        nvDebugCheck(i < 16);
-        return m_color[i];
-    }
-
-    /// Get block color.
-    inline Color32 & ColorBlock::color(uint i)
-    {
-        nvDebugCheck(i < 16);
-        return m_color[i];
-    }
-
-    /// Get block color.
-    inline Color32 ColorBlock::color(uint x, uint y) const
-    {
-        nvDebugCheck(x < 4 && y < 4);
-        return m_color[y * 4 + x];
-    }
-
-    /// Get block color.
-    inline Color32 & ColorBlock::color(uint x, uint y)
-    {
-        nvDebugCheck(x < 4 && y < 4);
-        return m_color[y * 4 + x];
-    }
-
-
-    struct ColorSet
-    {
-        void setColors(const float * data, uint img_w, uint img_h, uint img_x, uint img_y);
-
-        void setAlphaWeights();
-        void setUniformWeights();
-
-        void createMinimalSet(bool ignoreTransparent);
-        void wrapIndices();
-
-        void swizzle(uint x, uint y, uint z, uint w); // 0=r, 1=g, 2=b, 3=a, 4=0xFF, 5=0
-
-        bool isSingleColor(bool ignoreAlpha) const;
-        bool hasAlpha() const;
-
-        // These methods require indices to be set:
-        Vector4 color(uint x, uint y) const { nvDebugCheck(x < w && y < h); return colors[remap[y * 4 + x]]; }
-        Vector4 & color(uint x, uint y) { nvDebugCheck(x < w && y < h); return colors[remap[y * 4 + x]]; }
-
-        Vector4 color(uint i) const { nvDebugCheck(i < 16); return colors[remap[i]]; }
-        Vector4 & color(uint i) { nvDebugCheck(i < 16); return colors[remap[i]]; }
-
-
-        uint count;
-        uint w, h;
-
-        Vector4 colors[16];
-        float weights[16];
-        int remap[16];
-    };
-
-} // nv namespace
-
-#endif // NV_IMAGE_COLORBLOCK_H
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_IMAGE_COLORBLOCK_H
+#define NV_IMAGE_COLORBLOCK_H
+
+#include "nvmath/Color.h"
+#include "nvmath/Vector.h"
+
+namespace nv
+{
+    class Image;
+    class FloatImage;
+
+    /// Uncompressed 4x4 color block.
+    struct ColorBlock
+    {
+        ColorBlock();
+        ColorBlock(const uint * linearImage);
+        ColorBlock(const ColorBlock & block);
+        ColorBlock(const Image * img, uint x, uint y);
+
+        void init(const Image * img, uint x, uint y);
+        void init(uint w, uint h, const uint * data, uint x, uint y);
+        void init(uint w, uint h, const float * data, uint x, uint y);
+
+        void swizzle(uint x, uint y, uint z, uint w); // 0=r, 1=g, 2=b, 3=a, 4=0xFF, 5=0
+
+        bool isSingleColor(Color32 mask = Color32(0xFF, 0xFF, 0xFF, 0x00)) const;
+        bool hasAlpha() const;
+
+
+        // Accessors
+        const Color32 * colors() const;
+
+        Color32 color(uint i) const;
+        Color32 & color(uint i);
+
+        Color32 color(uint x, uint y) const;
+        Color32 & color(uint x, uint y);
+
+    private:
+
+        Color32 m_color[4*4];
+
+    };
+
+
+    /// Get pointer to block colors.
+    inline const Color32 * ColorBlock::colors() const
+    {
+        return m_color;
+    }
+
+    /// Get block color.
+    inline Color32 ColorBlock::color(uint i) const
+    {
+        nvDebugCheck(i < 16);
+        return m_color[i];
+    }
+
+    /// Get block color.
+    inline Color32 & ColorBlock::color(uint i)
+    {
+        nvDebugCheck(i < 16);
+        return m_color[i];
+    }
+
+    /// Get block color.
+    inline Color32 ColorBlock::color(uint x, uint y) const
+    {
+        nvDebugCheck(x < 4 && y < 4);
+        return m_color[y * 4 + x];
+    }
+
+    /// Get block color.
+    inline Color32 & ColorBlock::color(uint x, uint y)
+    {
+        nvDebugCheck(x < 4 && y < 4);
+        return m_color[y * 4 + x];
+    }
+
+
+    struct ColorSet
+    {
+        void setColors(const float * data, uint img_w, uint img_h, uint img_x, uint img_y);
+
+        void setAlphaWeights();
+        void setUniformWeights();
+
+        void createMinimalSet(bool ignoreTransparent);
+        void wrapIndices();
+
+        void swizzle(uint x, uint y, uint z, uint w); // 0=r, 1=g, 2=b, 3=a, 4=0xFF, 5=0
+
+        bool isSingleColor(bool ignoreAlpha) const;
+        bool hasAlpha() const;
+
+        // These methods require indices to be set:
+        Vector4 color(uint x, uint y) const { nvDebugCheck(x < w && y < h); return colors[remap[y * 4 + x]]; }
+        Vector4 & color(uint x, uint y) { nvDebugCheck(x < w && y < h); return colors[remap[y * 4 + x]]; }
+
+        Vector4 color(uint i) const { nvDebugCheck(i < 16); return colors[remap[i]]; }
+        Vector4 & color(uint i) { nvDebugCheck(i < 16); return colors[remap[i]]; }
+
+
+        uint count;
+        uint w, h;
+
+        Vector4 colors[16];
+        float weights[16];
+        int remap[16];
+    };
+
+} // nv namespace
+
+#endif // NV_IMAGE_COLORBLOCK_H
diff --git a/src/nvimage/ColorSpace.cpp b/src/nvimage/ColorSpace.cpp
index f6ac4ce..11bdae0 100644
--- a/src/nvimage/ColorSpace.cpp
+++ b/src/nvimage/ColorSpace.cpp
@@ -1,10 +1,10 @@
 // This code is in the public domain -- jim@tilander.org
-
+
 #include <nvcore/nvcore.h>
-
-#include <nvmath/Color.h>
-#include <nvimage/Image.h>
-
+
+#include <nvmath/Color.h>
+#include <nvimage/Image.h>
+
 #include "ColorSpace.h"
 
 namespace nv
@@ -67,4 +67,4 @@ namespace nv
 			}
 		}
 	}
-}
+}
diff --git a/src/nvimage/DirectDrawSurface.cpp b/src/nvimage/DirectDrawSurface.cpp
index aa8ff4d..c9faa76 100644
--- a/src/nvimage/DirectDrawSurface.cpp
+++ b/src/nvimage/DirectDrawSurface.cpp
@@ -1,1620 +1,1620 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#include "DirectDrawSurface.h"
-#include "ColorBlock.h"
-#include "Image.h"
-#include "BlockDXT.h"
-#include "PixelFormat.h"
-
-#include "nvcore/Debug.h"
-#include "nvcore/Utils.h" // max
-#include "nvcore/StdStream.h"
-
-#include <string.h> // memset
-
-
-using namespace nv;
-
-
-const uint nv::FOURCC_NVTT = MAKEFOURCC('N', 'V', 'T', 'T');
-
-const uint nv::FOURCC_DDS = MAKEFOURCC('D', 'D', 'S', ' ');
-const uint nv::FOURCC_DXT1 = MAKEFOURCC('D', 'X', 'T', '1');
-const uint nv::FOURCC_DXT2 = MAKEFOURCC('D', 'X', 'T', '2');
-const uint nv::FOURCC_DXT3 = MAKEFOURCC('D', 'X', 'T', '3');
-const uint nv::FOURCC_DXT4 = MAKEFOURCC('D', 'X', 'T', '4');
-const uint nv::FOURCC_DXT5 = MAKEFOURCC('D', 'X', 'T', '5');
-const uint nv::FOURCC_RXGB = MAKEFOURCC('R', 'X', 'G', 'B');
-const uint nv::FOURCC_ATI1 = MAKEFOURCC('A', 'T', 'I', '1');
-const uint nv::FOURCC_ATI2 = MAKEFOURCC('A', 'T', 'I', '2');
-
-
-
-namespace
-{
-
-    static const uint FOURCC_A2XY = MAKEFOURCC('A', '2', 'X', 'Y');
-
-    static const uint FOURCC_DX10 = MAKEFOURCC('D', 'X', '1', '0');
-
-    static const uint FOURCC_UVER = MAKEFOURCC('U', 'V', 'E', 'R');
-
-
-
-    static const uint DDSD_CAPS = 0x00000001U;
-    static const uint DDSD_PIXELFORMAT = 0x00001000U;
-    static const uint DDSD_WIDTH = 0x00000004U;
-    static const uint DDSD_HEIGHT = 0x00000002U;
-    static const uint DDSD_PITCH = 0x00000008U;
-    static const uint DDSD_MIPMAPCOUNT = 0x00020000U;
-    static const uint DDSD_LINEARSIZE = 0x00080000U;
-    static const uint DDSD_DEPTH = 0x00800000U;
-
-    static const uint DDSCAPS_COMPLEX = 0x00000008U;
-    static const uint DDSCAPS_TEXTURE = 0x00001000U;
-    static const uint DDSCAPS_MIPMAP = 0x00400000U;
-    static const uint DDSCAPS2_VOLUME = 0x00200000U;
-    static const uint DDSCAPS2_CUBEMAP = 0x00000200U;
-
-    static const uint DDSCAPS2_CUBEMAP_POSITIVEX = 0x00000400U;
-    static const uint DDSCAPS2_CUBEMAP_NEGATIVEX = 0x00000800U;
-    static const uint DDSCAPS2_CUBEMAP_POSITIVEY = 0x00001000U;
-    static const uint DDSCAPS2_CUBEMAP_NEGATIVEY = 0x00002000U;
-    static const uint DDSCAPS2_CUBEMAP_POSITIVEZ = 0x00004000U;
-    static const uint DDSCAPS2_CUBEMAP_NEGATIVEZ = 0x00008000U;
-    static const uint DDSCAPS2_CUBEMAP_ALL_FACES = 0x0000FC00U;
-
-
-    const char * getDxgiFormatString(DXGI_FORMAT dxgiFormat)
-    {
-#define CASE(format) case DXGI_FORMAT_##format: return #format
-        switch(dxgiFormat)
-        {
-            CASE(UNKNOWN);
-
-            CASE(R32G32B32A32_TYPELESS);
-            CASE(R32G32B32A32_FLOAT);
-            CASE(R32G32B32A32_UINT);
-            CASE(R32G32B32A32_SINT);
-
-            CASE(R32G32B32_TYPELESS);
-            CASE(R32G32B32_FLOAT);
-            CASE(R32G32B32_UINT);
-            CASE(R32G32B32_SINT);
-
-            CASE(R16G16B16A16_TYPELESS);
-            CASE(R16G16B16A16_FLOAT);
-            CASE(R16G16B16A16_UNORM);
-            CASE(R16G16B16A16_UINT);
-            CASE(R16G16B16A16_SNORM);
-            CASE(R16G16B16A16_SINT);
-
-            CASE(R32G32_TYPELESS);
-            CASE(R32G32_FLOAT);
-            CASE(R32G32_UINT);
-            CASE(R32G32_SINT);
-
-            CASE(R32G8X24_TYPELESS);
-            CASE(D32_FLOAT_S8X24_UINT);
-            CASE(R32_FLOAT_X8X24_TYPELESS);
-            CASE(X32_TYPELESS_G8X24_UINT);
-
-            CASE(R10G10B10A2_TYPELESS);
-            CASE(R10G10B10A2_UNORM);
-            CASE(R10G10B10A2_UINT);
-
-            CASE(R11G11B10_FLOAT);
-
-            CASE(R8G8B8A8_TYPELESS);
-            CASE(R8G8B8A8_UNORM);
-            CASE(R8G8B8A8_UNORM_SRGB);
-            CASE(R8G8B8A8_UINT);
-            CASE(R8G8B8A8_SNORM);
-            CASE(R8G8B8A8_SINT);
-
-            CASE(R16G16_TYPELESS);
-            CASE(R16G16_FLOAT);
-            CASE(R16G16_UNORM);
-            CASE(R16G16_UINT);
-            CASE(R16G16_SNORM);
-            CASE(R16G16_SINT);
-
-            CASE(R32_TYPELESS);
-            CASE(D32_FLOAT);
-            CASE(R32_FLOAT);
-            CASE(R32_UINT);
-            CASE(R32_SINT);
-
-            CASE(R24G8_TYPELESS);
-            CASE(D24_UNORM_S8_UINT);
-            CASE(R24_UNORM_X8_TYPELESS);
-            CASE(X24_TYPELESS_G8_UINT);
-
-            CASE(R8G8_TYPELESS);
-            CASE(R8G8_UNORM);
-            CASE(R8G8_UINT);
-            CASE(R8G8_SNORM);
-            CASE(R8G8_SINT);
-
-            CASE(R16_TYPELESS);
-            CASE(R16_FLOAT);
-            CASE(D16_UNORM);
-            CASE(R16_UNORM);
-            CASE(R16_UINT);
-            CASE(R16_SNORM);
-            CASE(R16_SINT);
-
-            CASE(R8_TYPELESS);
-            CASE(R8_UNORM);
-            CASE(R8_UINT);
-            CASE(R8_SNORM);
-            CASE(R8_SINT);
-            CASE(A8_UNORM);
-
-            CASE(R1_UNORM);
-
-            CASE(R9G9B9E5_SHAREDEXP);
-
-            CASE(R8G8_B8G8_UNORM);
-            CASE(G8R8_G8B8_UNORM);
-
-            CASE(BC1_TYPELESS);
-            CASE(BC1_UNORM);
-            CASE(BC1_UNORM_SRGB);
-
-            CASE(BC2_TYPELESS);
-            CASE(BC2_UNORM);
-            CASE(BC2_UNORM_SRGB);
-
-            CASE(BC3_TYPELESS);
-            CASE(BC3_UNORM);
-            CASE(BC3_UNORM_SRGB);
-
-            CASE(BC4_TYPELESS);
-            CASE(BC4_UNORM);
-            CASE(BC4_SNORM);
-
-            CASE(BC5_TYPELESS);
-            CASE(BC5_UNORM);
-            CASE(BC5_SNORM);
-
-            CASE(B5G6R5_UNORM);
-            CASE(B5G5R5A1_UNORM);
-            CASE(B8G8R8A8_UNORM);
-            CASE(B8G8R8X8_UNORM);
-
-        default: 
-            return "UNKNOWN";
-        }
-#undef CASE
-    }
-
-    const char * getD3d10ResourceDimensionString(D3D10_RESOURCE_DIMENSION resourceDimension)
-    {
-        switch(resourceDimension)
-        {
-            default:
-            case D3D10_RESOURCE_DIMENSION_UNKNOWN: return "UNKNOWN";
-            case D3D10_RESOURCE_DIMENSION_BUFFER: return "BUFFER";
-            case D3D10_RESOURCE_DIMENSION_TEXTURE1D: return "TEXTURE1D";
-            case D3D10_RESOURCE_DIMENSION_TEXTURE2D: return "TEXTURE2D";
-            case D3D10_RESOURCE_DIMENSION_TEXTURE3D: return "TEXTURE3D";
-        }
-    }
-
-    static uint pixelSize(D3DFORMAT format) {
-        if (format == D3DFMT_R16F) return 8*2;
-        if (format == D3DFMT_G16R16F) return 8*4;
-        if (format == D3DFMT_A16B16G16R16F) return 8*8;
-        if (format == D3DFMT_R32F) return 8*4;
-        if (format == D3DFMT_G32R32F) return 8*8;
-        if (format == D3DFMT_A32B32G32R32F) return 8*16;
-
-        if (format == D3DFMT_R8G8B8) return 8*3;
-        if (format == D3DFMT_A8R8G8B8) return 8*4;
-        if (format == D3DFMT_X8R8G8B8) return 8*4;
-        if (format == D3DFMT_R5G6B5) return 8*2;
-        if (format == D3DFMT_X1R5G5B5) return 8*2;
-        if (format == D3DFMT_A1R5G5B5) return 8*2;
-        if (format == D3DFMT_A4R4G4B4) return 8*2;
-        if (format == D3DFMT_R3G3B2) return 8*1;
-        if (format == D3DFMT_A8) return 8*1;
-        if (format == D3DFMT_A8R3G3B2) return 8*2;
-        if (format == D3DFMT_X4R4G4B4) return 8*2;
-        if (format == D3DFMT_A2B10G10R10) return 8*4;
-        if (format == D3DFMT_A8B8G8R8) return 8*4;
-        if (format == D3DFMT_X8B8G8R8) return 8*4;
-        if (format == D3DFMT_G16R16) return 8*4;
-        if (format == D3DFMT_A2R10G10B10) return 8*4;
-        if (format == D3DFMT_A2B10G10R10) return 8*4;
-
-        if (format == D3DFMT_L8) return 8*1;
-        if (format == D3DFMT_L16) return 8*2;
-
-        return 0;
-    }
-
-    static uint pixelSize(DXGI_FORMAT format) {
-        switch(format) {
-            case DXGI_FORMAT_R32G32B32A32_TYPELESS:
-            case DXGI_FORMAT_R32G32B32A32_FLOAT:
-            case DXGI_FORMAT_R32G32B32A32_UINT:
-            case DXGI_FORMAT_R32G32B32A32_SINT:
-                return 8*16;
-
-            case DXGI_FORMAT_R32G32B32_TYPELESS:
-            case DXGI_FORMAT_R32G32B32_FLOAT:
-            case DXGI_FORMAT_R32G32B32_UINT:
-            case DXGI_FORMAT_R32G32B32_SINT:
-                return 8*12;
-
-            case DXGI_FORMAT_R16G16B16A16_TYPELESS:
-            case DXGI_FORMAT_R16G16B16A16_FLOAT:
-            case DXGI_FORMAT_R16G16B16A16_UNORM:
-            case DXGI_FORMAT_R16G16B16A16_UINT:
-            case DXGI_FORMAT_R16G16B16A16_SNORM:
-            case DXGI_FORMAT_R16G16B16A16_SINT:
-            
-            case DXGI_FORMAT_R32G32_TYPELESS:
-            case DXGI_FORMAT_R32G32_FLOAT:
-            case DXGI_FORMAT_R32G32_UINT:
-            case DXGI_FORMAT_R32G32_SINT:
-
-            case DXGI_FORMAT_R32G8X24_TYPELESS:
-            case DXGI_FORMAT_D32_FLOAT_S8X24_UINT:
-            case DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS:
-            case DXGI_FORMAT_X32_TYPELESS_G8X24_UINT:
-                return 8*8;
-
-            case DXGI_FORMAT_R10G10B10A2_TYPELESS:
-            case DXGI_FORMAT_R10G10B10A2_UNORM:
-            case DXGI_FORMAT_R10G10B10A2_UINT:
-
-            case DXGI_FORMAT_R11G11B10_FLOAT:
-
-            case DXGI_FORMAT_R8G8B8A8_TYPELESS:
-            case DXGI_FORMAT_R8G8B8A8_UNORM:
-            case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
-            case DXGI_FORMAT_R8G8B8A8_UINT:
-            case DXGI_FORMAT_R8G8B8A8_SNORM:
-            case DXGI_FORMAT_R8G8B8A8_SINT:
-
-            case DXGI_FORMAT_R16G16_TYPELESS:
-            case DXGI_FORMAT_R16G16_FLOAT:
-            case DXGI_FORMAT_R16G16_UNORM:
-            case DXGI_FORMAT_R16G16_UINT:
-            case DXGI_FORMAT_R16G16_SNORM:
-            case DXGI_FORMAT_R16G16_SINT:
-
-            case DXGI_FORMAT_R32_TYPELESS:
-            case DXGI_FORMAT_D32_FLOAT:
-            case DXGI_FORMAT_R32_FLOAT:
-            case DXGI_FORMAT_R32_UINT:
-            case DXGI_FORMAT_R32_SINT:
-
-            case DXGI_FORMAT_R24G8_TYPELESS:
-            case DXGI_FORMAT_D24_UNORM_S8_UINT:
-            case DXGI_FORMAT_R24_UNORM_X8_TYPELESS:
-            case DXGI_FORMAT_X24_TYPELESS_G8_UINT:
-                return 8*4;
-
-            case DXGI_FORMAT_R8G8_TYPELESS:
-            case DXGI_FORMAT_R8G8_UNORM:
-            case DXGI_FORMAT_R8G8_UINT:
-            case DXGI_FORMAT_R8G8_SNORM:
-            case DXGI_FORMAT_R8G8_SINT:
-
-            case DXGI_FORMAT_R16_TYPELESS:
-            case DXGI_FORMAT_R16_FLOAT:
-            case DXGI_FORMAT_D16_UNORM:
-            case DXGI_FORMAT_R16_UNORM:
-            case DXGI_FORMAT_R16_UINT:
-            case DXGI_FORMAT_R16_SNORM:
-            case DXGI_FORMAT_R16_SINT:
-                return 8*2;
-
-            case DXGI_FORMAT_R8_TYPELESS:
-            case DXGI_FORMAT_R8_UNORM:
-            case DXGI_FORMAT_R8_UINT:
-            case DXGI_FORMAT_R8_SNORM:
-            case DXGI_FORMAT_R8_SINT:
-            case DXGI_FORMAT_A8_UNORM:
-                return 8*1;
-
-            case DXGI_FORMAT_R1_UNORM:
-                return 1;
-
-            case DXGI_FORMAT_R9G9B9E5_SHAREDEXP:
-                return 8*4;
-
-            case DXGI_FORMAT_R8G8_B8G8_UNORM:
-            case DXGI_FORMAT_G8R8_G8B8_UNORM:
-                return 8*4;
-
-            case DXGI_FORMAT_B5G6R5_UNORM:
-            case DXGI_FORMAT_B5G5R5A1_UNORM:
-                return 8*2;
-            
-            case DXGI_FORMAT_B8G8R8A8_UNORM:
-            case DXGI_FORMAT_B8G8R8X8_UNORM:
-                return 8*4;
-
-            case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM:
-            case DXGI_FORMAT_B8G8R8A8_TYPELESS:
-            case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
-            case DXGI_FORMAT_B8G8R8X8_TYPELESS:
-            case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
-                return 8*4;
-        }
-
-        return 0;
-    }
-
-} // namespace
-
-namespace nv
-{
-    static Stream & operator<< (Stream & s, DDSPixelFormat & pf)
-    {
-        nvStaticCheck(sizeof(DDSPixelFormat) == 32);
-        s << pf.size;
-        s << pf.flags;
-        s << pf.fourcc;
-        s << pf.bitcount;
-        s.serialize(&pf.rmask, sizeof(pf.rmask));
-        s.serialize(&pf.gmask, sizeof(pf.gmask));
-        s.serialize(&pf.bmask, sizeof(pf.bmask));
-        s.serialize(&pf.amask, sizeof(pf.amask));
-        // s << pf.rmask;
-        // s << pf.gmask;
-        // s << pf.bmask;
-        // s << pf.amask;
-        return s;
-    }
-
-    static Stream & operator<< (Stream & s, DDSCaps & caps)
-    {
-        nvStaticCheck(sizeof(DDSCaps) == 16);
-        s << caps.caps1;
-        s << caps.caps2;
-        s << caps.caps3;
-        s << caps.caps4;
-        return s;
-    }
-
-    static Stream & operator<< (Stream & s, DDSHeader10 & header)
-    {
-        nvStaticCheck(sizeof(DDSHeader10) == 20);
-        s << header.dxgiFormat;
-        s << header.resourceDimension;
-        s << header.miscFlag;
-        s << header.arraySize;
-        s << header.reserved;
-        return s;
-    }
-
-    Stream & operator<< (Stream & s, DDSHeader & header)
-    {
-        nvStaticCheck(sizeof(DDSHeader) == 148);
-        s << header.fourcc;
-        s << header.size;
-        s << header.flags;
-        s << header.height;
-        s << header.width;
-        s << header.pitch;
-        s << header.depth;
-        s << header.mipmapcount;
-        for (int i = 0; i < 11; i++) {
-            s << header.reserved[i];
-        }
-        s << header.pf;
-        s << header.caps;
-        s << header.notused;
-
-        if (header.hasDX10Header())
-        {
-            s << header.header10;
-        }
-
-        return s;
-    }
-
-} // nv namespace
-
-namespace
-{
-    struct FormatDescriptor
-    {
-        uint format;
-        uint bitcount;
-        uint rmask;
-        uint gmask;
-        uint bmask;
-        uint amask;
-    };
-
-    static const FormatDescriptor s_d3d9Formats[] =
-    {
-        { D3DFMT_R8G8B8,         24, 0xFF0000,   0xFF00,	    0xFF,       0 },
-        { D3DFMT_A8R8G8B8,       32, 0xFF0000,   0xFF00,     0xFF,       0xFF000000 },  // DXGI_FORMAT_B8G8R8A8_UNORM
-        { D3DFMT_X8R8G8B8,       32, 0xFF0000,   0xFF00,     0xFF,       0 },           // DXGI_FORMAT_B8G8R8X8_UNORM
-        { D3DFMT_R5G6B5,         16, 0xF800,     0x7E0,      0x1F,       0 },           // DXGI_FORMAT_B5G6R5_UNORM
-        { D3DFMT_X1R5G5B5,       16, 0x7C00,     0x3E0,      0x1F,       0 },
-        { D3DFMT_A1R5G5B5,       16, 0x7C00,     0x3E0,      0x1F,       0x8000 },      // DXGI_FORMAT_B5G5R5A1_UNORM
-        { D3DFMT_A4R4G4B4,       16, 0xF00,      0xF0,       0xF,        0xF000 },
-        { D3DFMT_R3G3B2,         8,  0xE0,       0x1C,       0x3,	    0 },
-        { D3DFMT_A8,             8,  0,          0,          0,		    8 },            // DXGI_FORMAT_A8_UNORM
-        { D3DFMT_A8R3G3B2,       16, 0xE0,       0x1C,       0x3,        0xFF00 },
-        { D3DFMT_X4R4G4B4,       16, 0xF00,      0xF0,       0xF,        0 },
-        { D3DFMT_A2B10G10R10,    32, 0x3FF,      0xFFC00,    0x3FF00000, 0xC0000000 },  // DXGI_FORMAT_R10G10B10A2
-        { D3DFMT_A8B8G8R8,       32, 0xFF,       0xFF00,     0xFF0000,   0xFF000000 },  // DXGI_FORMAT_R8G8B8A8_UNORM
-        { D3DFMT_X8B8G8R8,       32, 0xFF,       0xFF00,     0xFF0000,   0 },
-        { D3DFMT_G16R16,         32, 0xFFFF,     0xFFFF0000, 0,          0 },           // DXGI_FORMAT_R16G16_UNORM
-        { D3DFMT_A2R10G10B10,    32, 0x3FF00000, 0xFFC00,    0x3FF,      0xC0000000 },
-        { D3DFMT_A2B10G10R10,    32, 0x3FF,      0xFFC00,    0x3FF00000, 0xC0000000 },
-
-        { D3DFMT_L8,             8,  8,          0,          0,          0 },           // DXGI_FORMAT_R8_UNORM 
-        { D3DFMT_L16,            16, 16,         0,          0,          0 },           // DXGI_FORMAT_R16_UNORM
-    };
-
-    static const uint s_d3d9FormatCount = NV_ARRAY_SIZE(s_d3d9Formats);
-
-} // namespace
-
-uint nv::findD3D9Format(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask)
-{
-    for (int i = 0; i < s_d3d9FormatCount; i++)
-    {
-        if (s_d3d9Formats[i].bitcount == bitcount &&
-            s_d3d9Formats[i].rmask == rmask &&
-            s_d3d9Formats[i].gmask == gmask &&
-            s_d3d9Formats[i].bmask == bmask &&
-            s_d3d9Formats[i].amask == amask)
-        {
-            return s_d3d9Formats[i].format;
-        }
-    }
-
-    return 0;
-}
-
-
-DDSHeader::DDSHeader()
-{
-    this->fourcc = FOURCC_DDS;
-    this->size = 124;
-    this->flags  = (DDSD_CAPS|DDSD_PIXELFORMAT);
-    this->height = 0;
-    this->width = 0;
-    this->pitch = 0;
-    this->depth = 0;
-    this->mipmapcount = 0;
-    memset(this->reserved, 0, sizeof(this->reserved));
-
-    // Store version information on the reserved header attributes.
-    this->reserved[9] = FOURCC_NVTT;
-    this->reserved[10] = (2 << 16) | (1 << 8) | (0); // major.minor.revision
-
-    this->pf.size = 32;
-    this->pf.flags = 0;
-    this->pf.fourcc = 0;
-    this->pf.bitcount = 0;
-    this->pf.rmask = 0;
-    this->pf.gmask = 0;
-    this->pf.bmask = 0;
-    this->pf.amask = 0;
-    this->caps.caps1 = DDSCAPS_TEXTURE;
-    this->caps.caps2 = 0;
-    this->caps.caps3 = 0;
-    this->caps.caps4 = 0;
-    this->notused = 0;
-
-    this->header10.dxgiFormat = DXGI_FORMAT_UNKNOWN;
-    this->header10.resourceDimension = D3D10_RESOURCE_DIMENSION_UNKNOWN;
-    this->header10.miscFlag = 0;
-    this->header10.arraySize = 0;
-    this->header10.reserved = 0;
-}
-
-void DDSHeader::setWidth(uint w)
-{
-    this->flags |= DDSD_WIDTH;
-    this->width = w;
-}
-
-void DDSHeader::setHeight(uint h)
-{
-    this->flags |= DDSD_HEIGHT;
-    this->height = h;
-}
-
-void DDSHeader::setDepth(uint d)
-{
-    this->flags |= DDSD_DEPTH;
-    this->depth = d;
-}
-
-void DDSHeader::setMipmapCount(uint count)
-{
-    if (count == 0 || count == 1)
-    {
-        this->flags &= ~DDSD_MIPMAPCOUNT;
-        this->mipmapcount = 1;
-
-        if (this->caps.caps2 == 0) {
-            this->caps.caps1 = DDSCAPS_TEXTURE;
-        }
-        else {
-            this->caps.caps1 = DDSCAPS_TEXTURE | DDSCAPS_COMPLEX;
-        }
-    }
-    else
-    {
-        this->flags |= DDSD_MIPMAPCOUNT;
-        this->mipmapcount = count;
-
-        this->caps.caps1 |= DDSCAPS_COMPLEX | DDSCAPS_MIPMAP;
-    }
-}
-
-void DDSHeader::setTexture2D()
-{
-    this->header10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D;
-    this->header10.arraySize = 1;
-}
-
-void DDSHeader::setTexture3D()
-{
-    this->caps.caps2 = DDSCAPS2_VOLUME;
-
-    this->header10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE3D;
-    this->header10.arraySize = 1;
-}
-
-void DDSHeader::setTextureCube()
-{
-    this->caps.caps1 |= DDSCAPS_COMPLEX;
-    this->caps.caps2 = DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_ALL_FACES;
-
-    this->header10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D;
-    this->header10.arraySize = 6;
-}
-
-void DDSHeader::setLinearSize(uint size)
-{
-    this->flags &= ~DDSD_PITCH;
-    this->flags |= DDSD_LINEARSIZE;
-    this->pitch = size;
-}
-
-void DDSHeader::setPitch(uint pitch)
-{
-    this->flags &= ~DDSD_LINEARSIZE;
-    this->flags |= DDSD_PITCH;
-    this->pitch = pitch;
-}
-
-void DDSHeader::setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3)
-{
-    // set fourcc pixel format.
-    this->pf.flags = DDPF_FOURCC;
-    this->pf.fourcc = MAKEFOURCC(c0, c1, c2, c3);
-
-    this->pf.bitcount = 0;
-    this->pf.rmask = 0;
-    this->pf.gmask = 0;
-    this->pf.bmask = 0;
-    this->pf.amask = 0;
-}
-
-void DDSHeader::setFormatCode(uint32 code)
-{
-    // set fourcc pixel format.
-    this->pf.flags = DDPF_FOURCC;
-    this->pf.fourcc = code;
-
-    this->pf.bitcount = 0;
-    this->pf.rmask = 0;
-    this->pf.gmask = 0;
-    this->pf.bmask = 0;
-    this->pf.amask = 0;
-}
-
-void DDSHeader::setSwizzleCode(uint8 c0, uint8 c1, uint8 c2, uint8 c3)
-{
-    this->pf.bitcount = MAKEFOURCC(c0, c1, c2, c3);
-}
-
-
-void DDSHeader::setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask)
-{
-    // Make sure the masks are correct.
-    nvCheck((rmask & gmask) == 0);
-    nvCheck((rmask & bmask) == 0);
-    nvCheck((rmask & amask) == 0);
-    nvCheck((gmask & bmask) == 0);
-    nvCheck((gmask & amask) == 0);
-    nvCheck((bmask & amask) == 0);
-
-    if (rmask != 0 || gmask != 0 || bmask != 0)
-    {
-        if (gmask == 0 && bmask == 0)
-        {
-            this->pf.flags = DDPF_LUMINANCE;
-        }
-        else
-        {
-            this->pf.flags = DDPF_RGB;
-        }
-
-        if (amask != 0) {
-            this->pf.flags |= DDPF_ALPHAPIXELS;
-        }
-    }
-    else if (amask != 0)
-    {
-        this->pf.flags |= DDPF_ALPHA;
-    }
-
-    if (bitcount == 0)
-    {
-        // Compute bit count from the masks.
-        uint total = rmask | gmask | bmask | amask;
-        while(total != 0) {
-            bitcount++;
-            total >>= 1;
-        }
-    }
-
-    // D3DX functions do not like this:
-    this->pf.fourcc = 0; //findD3D9Format(bitcount, rmask, gmask, bmask, amask);
-    /*if (this->pf.fourcc) {
-        this->pf.flags |= DDPF_FOURCC;
-    }*/
-
-    nvCheck(bitcount > 0 && bitcount <= 32);
-    this->pf.bitcount = bitcount;
-    this->pf.rmask = rmask;
-    this->pf.gmask = gmask;
-    this->pf.bmask = bmask;
-    this->pf.amask = amask;
-}
-
-void DDSHeader::setDX10Format(uint format)
-{
-    this->pf.flags = DDPF_FOURCC;
-    this->pf.fourcc = FOURCC_DX10;
-    this->header10.dxgiFormat = format;
-}
-
-void DDSHeader::setNormalFlag(bool b)
-{
-    if (b) this->pf.flags |= DDPF_NORMAL;
-    else this->pf.flags &= ~DDPF_NORMAL;
-}
-
-void DDSHeader::setSrgbFlag(bool b)
-{
-    if (b) this->pf.flags |= DDPF_SRGB;
-    else this->pf.flags &= ~DDPF_SRGB;
-}
-
-void DDSHeader::setHasAlphaFlag(bool b)
-{
-    if (b) this->pf.flags |= DDPF_ALPHAPIXELS;
-    else this->pf.flags &= ~DDPF_ALPHAPIXELS;
-}
-
-void DDSHeader::setUserVersion(int version)
-{
-    this->reserved[7] = FOURCC_UVER;
-    this->reserved[8] = version;
-}
-
-void DDSHeader::swapBytes()
-{
-    this->fourcc = POSH_LittleU32(this->fourcc);
-    this->size = POSH_LittleU32(this->size);
-    this->flags = POSH_LittleU32(this->flags);
-    this->height = POSH_LittleU32(this->height);
-    this->width = POSH_LittleU32(this->width);
-    this->pitch = POSH_LittleU32(this->pitch);
-    this->depth = POSH_LittleU32(this->depth);
-    this->mipmapcount = POSH_LittleU32(this->mipmapcount);
-
-    for(int i = 0; i < 11; i++) {
-        this->reserved[i] = POSH_LittleU32(this->reserved[i]);
-    }
-
-    this->pf.size = POSH_LittleU32(this->pf.size);
-    this->pf.flags = POSH_LittleU32(this->pf.flags);
-    this->pf.fourcc = POSH_LittleU32(this->pf.fourcc);
-    this->pf.bitcount = POSH_LittleU32(this->pf.bitcount);
-    this->pf.rmask = POSH_LittleU32(this->pf.rmask);
-    this->pf.gmask = POSH_LittleU32(this->pf.gmask);
-    this->pf.bmask = POSH_LittleU32(this->pf.bmask);
-    this->pf.amask = POSH_LittleU32(this->pf.amask);
-    this->caps.caps1 = POSH_LittleU32(this->caps.caps1);
-    this->caps.caps2 = POSH_LittleU32(this->caps.caps2);
-    this->caps.caps3 = POSH_LittleU32(this->caps.caps3);
-    this->caps.caps4 = POSH_LittleU32(this->caps.caps4);
-    this->notused = POSH_LittleU32(this->notused);
-
-    this->header10.dxgiFormat = POSH_LittleU32(this->header10.dxgiFormat);
-    this->header10.resourceDimension = POSH_LittleU32(this->header10.resourceDimension);
-    this->header10.miscFlag = POSH_LittleU32(this->header10.miscFlag);
-    this->header10.arraySize = POSH_LittleU32(this->header10.arraySize);
-    this->header10.reserved = POSH_LittleU32(this->header10.reserved);
-}
-
-bool DDSHeader::hasDX10Header() const
-{
-    //if (pf.flags & DDPF_FOURCC) {
-        return this->pf.fourcc == FOURCC_DX10;
-    //}
-    //return false;
-}
-
-uint DDSHeader::signature() const
-{
-    return this->reserved[9];
-}
-
-uint DDSHeader::toolVersion() const
-{
-    return this->reserved[10];
-}
-
-uint DDSHeader::userVersion() const
-{
-    if (this->reserved[7] == FOURCC_UVER) {
-        return this->reserved[8];
-    }
-    return 0;
-}
-
-bool DDSHeader::isNormalMap() const
-{
-    return (pf.flags & DDPF_NORMAL) != 0;
-}
-
-bool DDSHeader::isSrgb() const
-{
-    return (pf.flags & DDPF_SRGB) != 0;
-}
-
-bool DDSHeader::hasAlpha() const
-{
-    return (pf.flags & DDPF_ALPHAPIXELS) != 0;
-}
-
-uint DDSHeader::d3d9Format() const
-{
-    if (pf.flags & DDPF_FOURCC) {
-        return pf.fourcc;
-    }
-    else {
-        return findD3D9Format(pf.bitcount, pf.rmask, pf.gmask, pf.bmask, pf.amask);
-    }
-}
-
-uint DDSHeader::pixelSize() const
-{
-    if (hasDX10Header()) {
-        return ::pixelSize((DXGI_FORMAT)header10.dxgiFormat);
-    }
-    else {
-        if (flags & DDPF_FOURCC) {
-            return ::pixelSize((D3DFORMAT)pf.fourcc);
-        }
-        else {
-            nvDebugCheck((pf.flags & DDPF_RGB) || (pf.flags & DDPF_LUMINANCE));
-            return pf.bitcount;
-        }
-    }
-}
-
-uint DDSHeader::blockSize() const
-{
-    switch(pf.fourcc) 
-    {
-    case FOURCC_DXT1:
-    case FOURCC_ATI1:
-        return 8;
-    case FOURCC_DXT2:
-    case FOURCC_DXT3:
-    case FOURCC_DXT4:
-    case FOURCC_DXT5:
-    case FOURCC_RXGB:
-    case FOURCC_ATI2:
-        return 16;
-    case FOURCC_DX10:
-        switch(header10.dxgiFormat)
-        {
-        case DXGI_FORMAT_BC1_TYPELESS:
-        case DXGI_FORMAT_BC1_UNORM:
-        case DXGI_FORMAT_BC1_UNORM_SRGB:
-        case DXGI_FORMAT_BC4_TYPELESS:
-        case DXGI_FORMAT_BC4_UNORM:
-        case DXGI_FORMAT_BC4_SNORM:
-            return 8;
-        case DXGI_FORMAT_BC2_TYPELESS:
-        case DXGI_FORMAT_BC2_UNORM:
-        case DXGI_FORMAT_BC2_UNORM_SRGB:
-        case DXGI_FORMAT_BC3_TYPELESS:
-        case DXGI_FORMAT_BC3_UNORM:
-        case DXGI_FORMAT_BC3_UNORM_SRGB:
-        case DXGI_FORMAT_BC5_TYPELESS:
-        case DXGI_FORMAT_BC5_UNORM:
-        case DXGI_FORMAT_BC5_SNORM:
-        case DXGI_FORMAT_BC6H_TYPELESS:
-        case DXGI_FORMAT_BC6H_SF16:
-        case DXGI_FORMAT_BC6H_UF16:
-        case DXGI_FORMAT_BC7_TYPELESS:
-        case DXGI_FORMAT_BC7_UNORM:
-        case DXGI_FORMAT_BC7_UNORM_SRGB:
-            return 16;
-        };
-    };
-
-    // Not a block image.
-    return 0;
-}
-
-bool DDSHeader::isBlockFormat() const
-{
-    return blockSize() != 0;
-}
-
-
-
-
-
-DirectDrawSurface::DirectDrawSurface() : stream(NULL)
-{
-}
-
-DirectDrawSurface::DirectDrawSurface(const char * name) : stream(NULL)
-{
-    load(name);
-}
-
-DirectDrawSurface::DirectDrawSurface(Stream * s) : stream(NULL)
-{
-    load(s);
-}
-
-DirectDrawSurface::~DirectDrawSurface()
-{
-    delete stream;
-}
-
-bool DirectDrawSurface::load(const char * filename)
-{
-    return load(new StdInputStream(filename));
-}
-
-bool DirectDrawSurface::load(Stream * stream)
-{
-    delete this->stream;
-    this->stream = stream;
-
-    if (!stream->isError())
-    {
-        (*stream) << header;
-        return true;
-    }
-
-    return false;
-}
-
-bool DirectDrawSurface::isValid() const
-{
-    if (stream == NULL || stream->isError())
-    {
-        return false;
-    }
-
-    if (header.fourcc != FOURCC_DDS || header.size != 124)
-    {
-        return false;
-    }
-
-    const uint required = (DDSD_WIDTH|DDSD_HEIGHT/*|DDSD_CAPS|DDSD_PIXELFORMAT*/);
-    if( (header.flags & required) != required ) {
-        return false;
-    }
-
-    if (header.pf.size != 32) {
-        return false;
-    }
-
-    if( !(header.caps.caps1 & DDSCAPS_TEXTURE) ) {
-        return false;
-    }
-
-    return true;
-}
-
-bool DirectDrawSurface::isSupported() const
-{
-    nvDebugCheck(isValid());
-
-    if (header.hasDX10Header())
-    {
-        if (header.header10.dxgiFormat == DXGI_FORMAT_BC1_UNORM ||
-            header.header10.dxgiFormat == DXGI_FORMAT_BC2_UNORM ||
-            header.header10.dxgiFormat == DXGI_FORMAT_BC3_UNORM ||
-            header.header10.dxgiFormat == DXGI_FORMAT_BC4_UNORM ||
-            header.header10.dxgiFormat == DXGI_FORMAT_BC5_UNORM)
-        {
-            return true;
-        }
-
-        return false;
-    }
-    else
-    {
-        if (header.pf.flags & DDPF_FOURCC)
-        {
-            if (header.pf.fourcc != FOURCC_DXT1 &&
-                header.pf.fourcc != FOURCC_DXT2 &&
-                header.pf.fourcc != FOURCC_DXT3 &&
-                header.pf.fourcc != FOURCC_DXT4 &&
-                header.pf.fourcc != FOURCC_DXT5 &&
-                header.pf.fourcc != FOURCC_RXGB &&
-                header.pf.fourcc != FOURCC_ATI1 &&
-                header.pf.fourcc != FOURCC_ATI2)
-            {
-                // Unknown fourcc code.
-                return false;
-            }
-        }
-        else if ((header.pf.flags & DDPF_RGB) || (header.pf.flags & DDPF_LUMINANCE))
-        {
-            // All RGB and luminance formats are supported now.
-        }
-        else
-        {
-            return false;
-        }
-
-        if (isTextureCube()) {
-            if (header.width != header.height) return false;
-
-            if ((header.caps.caps2 & DDSCAPS2_CUBEMAP_ALL_FACES) != DDSCAPS2_CUBEMAP_ALL_FACES)
-            {
-                // Cubemaps must contain all faces.
-                return false;
-            }
-        }
-    }
-
-    return true;
-}
-
-bool DirectDrawSurface::hasAlpha() const
-{
-    if (header.hasDX10Header())
-    {
-#pragma NV_MESSAGE("TODO: Update hasAlpha to handle all DX10 formats.")
-        return 
-            header.header10.dxgiFormat == DXGI_FORMAT_BC1_UNORM ||
-            header.header10.dxgiFormat == DXGI_FORMAT_BC2_UNORM ||
-            header.header10.dxgiFormat == DXGI_FORMAT_BC3_UNORM;
-    }
-    else
-    {
-        if (header.pf.flags & DDPF_RGB) 
-        {
-            return header.pf.amask != 0;
-        }
-        else if (header.pf.flags & DDPF_FOURCC)
-        {
-            if (header.pf.fourcc == FOURCC_RXGB ||
-                header.pf.fourcc == FOURCC_ATI1 ||
-                header.pf.fourcc == FOURCC_ATI2 ||
-                header.pf.flags & DDPF_NORMAL)
-            {
-                return false;
-            }
-            else
-            {
-                // @@ Here we could check the ALPHA_PIXELS flag, but nobody sets it. (except us?)
-                return true;
-            }
-        }
-
-        return false;
-    }
-}
-
-uint DirectDrawSurface::mipmapCount() const
-{
-    nvDebugCheck(isValid());
-    if (header.flags & DDSD_MIPMAPCOUNT) return header.mipmapcount;
-    else return 1;
-}
-
-
-uint DirectDrawSurface::width() const
-{
-    nvDebugCheck(isValid());
-    if (header.flags & DDSD_WIDTH) return header.width;
-    else return 1;
-}
-
-uint DirectDrawSurface::height() const
-{
-    nvDebugCheck(isValid());
-    if (header.flags & DDSD_HEIGHT) return header.height;
-    else return 1;
-}
-
-uint DirectDrawSurface::depth() const
-{
-    nvDebugCheck(isValid());
-    if (header.flags & DDSD_DEPTH) return header.depth;
-    else return 1;
-}
-
-bool DirectDrawSurface::isTexture1D() const
-{
-    nvDebugCheck(isValid());
-    if (header.hasDX10Header())
-    {
-        return header.header10.resourceDimension == D3D10_RESOURCE_DIMENSION_TEXTURE1D;
-    }
-    return false;
-}
-
-bool DirectDrawSurface::isTexture2D() const
-{
-    nvDebugCheck(isValid());
-    if (header.hasDX10Header())
-    {
-        return header.header10.resourceDimension == D3D10_RESOURCE_DIMENSION_TEXTURE2D;
-    }
-    else
-    {
-        return !isTexture3D() && !isTextureCube();
-    }
-}
-
-bool DirectDrawSurface::isTexture3D() const
-{
-    nvDebugCheck(isValid());
-    if (header.hasDX10Header())
-    {
-        return header.header10.resourceDimension == D3D10_RESOURCE_DIMENSION_TEXTURE3D;
-    }
-    else
-    {
-        return (header.caps.caps2 & DDSCAPS2_VOLUME) != 0;
-    }
-}
-
-bool DirectDrawSurface::isTextureCube() const
-{
-    nvDebugCheck(isValid());
-    return (header.caps.caps2 & DDSCAPS2_CUBEMAP) != 0;
-}
-
-void DirectDrawSurface::setNormalFlag(bool b)
-{
-    nvDebugCheck(isValid());
-    header.setNormalFlag(b);
-}
-
-void DirectDrawSurface::setHasAlphaFlag(bool b)
-{
-    nvDebugCheck(isValid());
-    header.setHasAlphaFlag(b);
-}
-
-void DirectDrawSurface::setUserVersion(int version)
-{
-    nvDebugCheck(isValid());
-    header.setUserVersion(version);
-}
-
-void DirectDrawSurface::mipmap(Image * img, uint face, uint mipmap)
-{
-    nvDebugCheck(isValid());
-
-    stream->seek(offset(face, mipmap));
-
-    uint w = width();
-    uint h = height();
-	uint d = depth();
-
-    // Compute width and height.
-    for (uint m = 0; m < mipmap; m++)
-    {
-        w = max(1U, w / 2);
-        h = max(1U, h / 2);
-		d = max(1U, d / 2);
-    }
-
-    img->allocate(w, h, d);
-
-    if (hasAlpha())
-    {
-        img->setFormat(Image::Format_ARGB);
-    }
-    else
-    {
-        img->setFormat(Image::Format_RGB);
-    }
-
-    if (header.hasDX10Header())
-    {
-        // So far only block formats supported.
-        readBlockImage(img);
-    }
-    else
-    {
-        if (header.pf.flags & DDPF_RGB) 
-        {
-            readLinearImage(img);
-        }
-        else if (header.pf.flags & DDPF_FOURCC)
-        {
-            readBlockImage(img);
-        }
-    }
-}
-
-/*void * DirectDrawSurface::readData(uint * sizePtr)
-{
-    uint header_size = 128; // sizeof(DDSHeader);
-
-    if (header.hasDX10Header())
-    {
-        header_size += 20; // sizeof(DDSHeader10);
-    }
-
-    stream->seek(header_size);
-
-    int size = stream->size() - header_size;
-    *sizePtr = size;
-
-    void * data = new unsigned char [size];
-    
-    size = stream->serialize(data, size);
-    nvDebugCheck(size == *sizePtr);
-
-    return data;
-}*/
-
-/*uint DirectDrawSurface::surfaceSize(uint mipmap) const
-{
-    uint w = header.width();
-    uint h = header.height();
-    uint d = header.depth();
-    for (int m = 0; m < mipmap; m++) {
-        w = (w + 1) / 2;
-        h = (h + 1) / 2;
-        d = (d + 1) / 2;
-    }
-    
-    bool isBlockFormat;
-    uint blockOrPixelSize;
-
-    if (header.hasDX10Header()) {
-        blockOrPixelSize = blockSize(header10.dxgiFormat);
-        isBlockFormat = (blockOrPixelSize != 0);
-        if (isBlockFormat) {
-            blockOrPixelSize = pixelSize(header10.dxgiFormat);
-        }
-    }
-    else {
-        header.pf.flags 
-    }
-
-    if (isBlockFormat) {
-        w = (w + 3) / 4;
-        h = (h + 3) / 4;
-        d = (d + 3) / 4; // @@ Is it necessary to align the depths?
-    }
-
-    uint blockOrPixelCount = w * h * d;
-
-    return blockCount = blockOrPixelSize;
-}*/
-
-bool DirectDrawSurface::readSurface(uint face, uint mipmap, void * data, uint size)
-{
-    if (size != surfaceSize(mipmap)) return false;
-
-    stream->seek(offset(face, mipmap));
-    if (stream->isError()) return false;
-
-    return stream->serialize(data, size) == size;
-}
-
-
-void DirectDrawSurface::readLinearImage(Image * img)
-{
-    nvDebugCheck(stream != NULL);
-    nvDebugCheck(img != NULL);
-
-    const uint w = img->width();
-    const uint h = img->height();
-
-    uint rshift, rsize;
-    PixelFormat::maskShiftAndSize(header.pf.rmask, &rshift, &rsize);
-
-    uint gshift, gsize;
-    PixelFormat::maskShiftAndSize(header.pf.gmask, &gshift, &gsize);
-
-    uint bshift, bsize;
-    PixelFormat::maskShiftAndSize(header.pf.bmask, &bshift, &bsize);
-
-    uint ashift, asize;
-    PixelFormat::maskShiftAndSize(header.pf.amask, &ashift, &asize);
-
-    uint byteCount = (header.pf.bitcount + 7) / 8;
-
-#pragma NV_MESSAGE("TODO: Support floating point linear images and other FOURCC codes.")
-
-    // Read linear RGB images.
-    for (uint y = 0; y < h; y++)
-    {
-        for (uint x = 0; x < w; x++)
-        {
-            uint c = 0;
-            stream->serialize(&c, byteCount);
-
-            Color32 pixel(0, 0, 0, 0xFF);
-            pixel.r = PixelFormat::convert((c & header.pf.rmask) >> rshift, rsize, 8);
-            pixel.g = PixelFormat::convert((c & header.pf.gmask) >> gshift, gsize, 8);
-            pixel.b = PixelFormat::convert((c & header.pf.bmask) >> bshift, bsize, 8);
-            pixel.a = PixelFormat::convert((c & header.pf.amask) >> ashift, asize, 8);
-
-            img->pixel(x, y) = pixel;
-        }
-    }
-}
-
-void DirectDrawSurface::readBlockImage(Image * img)
-{
-    nvDebugCheck(stream != NULL);
-    nvDebugCheck(img != NULL);
-
-    const uint w = img->width();
-    const uint h = img->height();
-
-    const uint bw = (w + 3) / 4;
-    const uint bh = (h + 3) / 4;
-
-    for (uint by = 0; by < bh; by++)
-    {
-        for (uint bx = 0; bx < bw; bx++)
-        {
-            ColorBlock block;
-
-            // Read color block.
-            readBlock(&block);
-
-            // Write color block.
-            for (uint y = 0; y < min(4U, h-4*by); y++)
-            {
-                for (uint x = 0; x < min(4U, w-4*bx); x++)
-                {
-                    img->pixel(4*bx+x, 4*by+y) = block.color(x, y);
-                }
-            }
-        }
-    }
-}
-
-static Color32 buildNormal(uint8 x, uint8 y)
-{
-    float nx = 2 * (x / 255.0f) - 1;
-    float ny = 2 * (y / 255.0f) - 1;
-    float nz = 0.0f;
-    if (1 - nx*nx - ny*ny > 0) nz = sqrtf(1 - nx*nx - ny*ny);
-    uint8 z = clamp(int(255.0f * (nz + 1) / 2.0f), 0, 255);
-
-    return Color32(x, y, z);
-}
-
-
-void DirectDrawSurface::readBlock(ColorBlock * rgba)
-{
-    nvDebugCheck(stream != NULL);
-    nvDebugCheck(rgba != NULL);
-
-    uint fourcc = header.pf.fourcc;
-
-    // Map DX10 block formats to fourcc codes.
-    if (header.hasDX10Header())
-    {
-        if (header.header10.dxgiFormat == DXGI_FORMAT_BC1_UNORM) fourcc = FOURCC_DXT1;
-        if (header.header10.dxgiFormat == DXGI_FORMAT_BC2_UNORM) fourcc = FOURCC_DXT3;
-        if (header.header10.dxgiFormat == DXGI_FORMAT_BC3_UNORM) fourcc = FOURCC_DXT5;
-        if (header.header10.dxgiFormat == DXGI_FORMAT_BC4_UNORM) fourcc = FOURCC_ATI1;
-        if (header.header10.dxgiFormat == DXGI_FORMAT_BC5_UNORM) fourcc = FOURCC_ATI2;
-    }
-
-
-    if (fourcc == FOURCC_DXT1)
-    {
-        BlockDXT1 block;
-        *stream << block;
-        block.decodeBlock(rgba);
-    }
-    else if (fourcc == FOURCC_DXT2 || fourcc == FOURCC_DXT3)
-    {
-        BlockDXT3 block;
-        *stream << block;
-        block.decodeBlock(rgba);
-    }
-    else if (fourcc == FOURCC_DXT4 || fourcc == FOURCC_DXT5 || fourcc == FOURCC_RXGB)
-    {
-        BlockDXT5 block;
-        *stream << block;
-        block.decodeBlock(rgba);
-
-        if (fourcc == FOURCC_RXGB)
-        {
-            // Swap R & A.
-            for (int i = 0; i < 16; i++)
-            {
-                Color32 & c = rgba->color(i);
-                uint tmp = c.r;
-                c.r = c.a;
-                c.a = tmp;
-            }
-        }
-    }
-    else if (fourcc == FOURCC_ATI1)
-    {
-        BlockATI1 block;
-        *stream << block;
-        block.decodeBlock(rgba);
-    }
-    else if (fourcc == FOURCC_ATI2)
-    {
-        BlockATI2 block;
-        *stream << block;
-        block.decodeBlock(rgba);
-    }
-
-    // If normal flag set, convert to normal.
-    if (header.pf.flags & DDPF_NORMAL)
-    {
-        if (fourcc == FOURCC_ATI2)
-        {
-            for (int i = 0; i < 16; i++)
-            {
-                Color32 & c = rgba->color(i);
-                c = buildNormal(c.r, c.g);
-            }
-        }
-        else if (fourcc == FOURCC_DXT5)
-        {
-            for (int i = 0; i < 16; i++)
-            {
-                Color32 & c = rgba->color(i);
-                c = buildNormal(c.a, c.g);
-            }
-        }
-    }
-}
-
-
-static uint mipmapExtent(uint mipmap, uint x)
-{
-    for (uint m = 0; m < mipmap; m++) {
-        x = max(1U, x / 2);
-    }
-    return x;
-}
-
-uint DirectDrawSurface::surfaceWidth(uint mipmap) const
-{
-    return mipmapExtent(mipmap, width());
-}
-
-uint DirectDrawSurface::surfaceHeight(uint mipmap) const
-{
-    return mipmapExtent(mipmap, height());
-}
-
-uint DirectDrawSurface::surfaceDepth(uint mipmap) const
-{
-    return mipmapExtent(mipmap, depth());
-}
-
-uint DirectDrawSurface::surfaceSize(uint mipmap) const
-{
-    uint w = surfaceWidth(mipmap);
-    uint h = surfaceHeight(mipmap);
-    uint d = surfaceDepth(mipmap);
-
-    uint blockSize = header.blockSize();
-
-    if (blockSize == 0) {
-        uint bitCount = header.pixelSize();
-        uint pitch = computeBytePitch(w, bitCount, 1); // Asuming 1 byte alignment, which is the same D3DX expects.
-        return pitch * h * d;
-    }
-    else {
-        w = (w + 3) / 4;
-        h = (h + 3) / 4;
-        d = d; // @@ How are 3D textures aligned?
-        return blockSize * w * h * d;
-    }
-}
-
-uint DirectDrawSurface::faceSize() const
-{
-    const uint count = mipmapCount();
-    uint size = 0;
-
-    for (uint m = 0; m < count; m++)
-    {
-        size += surfaceSize(m);
-    }
-
-    return size;
-}
-
-uint DirectDrawSurface::offset(const uint face, const uint mipmap)
-{
-    uint size = 128; // sizeof(DDSHeader);
-
-    if (header.hasDX10Header())
-    {
-        size += 20; // sizeof(DDSHeader10);
-    }
-
-    if (face != 0)
-    {
-        size += face * faceSize();
-    }
-
-    for (uint m = 0; m < mipmap; m++)
-    {
-        size += surfaceSize(m);
-    }
-
-    return size;
-}
-
-
-void DirectDrawSurface::printInfo() const
-{
-    printf("Flags: 0x%.8X\n", header.flags);
-    if (header.flags & DDSD_CAPS) printf("\tDDSD_CAPS\n");
-    if (header.flags & DDSD_PIXELFORMAT) printf("\tDDSD_PIXELFORMAT\n");
-    if (header.flags & DDSD_WIDTH) printf("\tDDSD_WIDTH\n");
-    if (header.flags & DDSD_HEIGHT) printf("\tDDSD_HEIGHT\n");
-    if (header.flags & DDSD_DEPTH) printf("\tDDSD_DEPTH\n");
-    if (header.flags & DDSD_PITCH) printf("\tDDSD_PITCH\n");
-    if (header.flags & DDSD_LINEARSIZE) printf("\tDDSD_LINEARSIZE\n");
-    if (header.flags & DDSD_MIPMAPCOUNT) printf("\tDDSD_MIPMAPCOUNT\n");
-
-    printf("Height: %d\n", header.height);
-    printf("Width: %d\n", header.width);
-    printf("Depth: %d\n", header.depth);
-    if (header.flags & DDSD_PITCH) printf("Pitch: %d\n", header.pitch);
-    else if (header.flags & DDSD_LINEARSIZE) printf("Linear size: %d\n", header.pitch);
-    printf("Mipmap count: %d\n", header.mipmapcount);
-
-    printf("Pixel Format:\n");
-    printf("\tFlags: 0x%.8X\n", header.pf.flags);
-    if (header.pf.flags & DDPF_RGB) printf("\t\tDDPF_RGB\n");
-    if (header.pf.flags & DDPF_LUMINANCE) printf("\t\tDDPF_LUMINANCE\n");
-    if (header.pf.flags & DDPF_FOURCC) printf("\t\tDDPF_FOURCC\n");
-    if (header.pf.flags & DDPF_ALPHAPIXELS) printf("\t\tDDPF_ALPHAPIXELS\n");
-    if (header.pf.flags & DDPF_ALPHA) printf("\t\tDDPF_ALPHA\n");
-    if (header.pf.flags & DDPF_PALETTEINDEXED1) printf("\t\tDDPF_PALETTEINDEXED1\n");
-    if (header.pf.flags & DDPF_PALETTEINDEXED2) printf("\t\tDDPF_PALETTEINDEXED2\n");
-    if (header.pf.flags & DDPF_PALETTEINDEXED4) printf("\t\tDDPF_PALETTEINDEXED4\n");
-    if (header.pf.flags & DDPF_PALETTEINDEXED8) printf("\t\tDDPF_PALETTEINDEXED8\n");
-    if (header.pf.flags & DDPF_ALPHAPREMULT) printf("\t\tDDPF_ALPHAPREMULT\n");
-    if (header.pf.flags & DDPF_NORMAL) printf("\t\tDDPF_NORMAL\n");
-
-    if (header.pf.fourcc != 0) { 
-        // Display fourcc code even when DDPF_FOURCC flag not set.
-        printf("\tFourCC: '%c%c%c%c' (0x%.8X)\n",
-            ((header.pf.fourcc >> 0) & 0xFF),
-            ((header.pf.fourcc >> 8) & 0xFF),
-            ((header.pf.fourcc >> 16) & 0xFF),
-            ((header.pf.fourcc >> 24) & 0xFF), 
-            header.pf.fourcc);
-    }
-
-    if ((header.pf.flags & DDPF_FOURCC) && (header.pf.bitcount != 0))
-    {
-        printf("\tSwizzle: '%c%c%c%c' (0x%.8X)\n", 
-            (header.pf.bitcount >> 0) & 0xFF,
-            (header.pf.bitcount >> 8) & 0xFF,
-            (header.pf.bitcount >> 16) & 0xFF,
-            (header.pf.bitcount >> 24) & 0xFF,
-            header.pf.bitcount);
-    }
-    else
-    {
-        printf("\tBit count: %d\n", header.pf.bitcount);
-    }
-
-    printf("\tRed mask:   0x%.8X\n", header.pf.rmask);
-    printf("\tGreen mask: 0x%.8X\n", header.pf.gmask);
-    printf("\tBlue mask:  0x%.8X\n", header.pf.bmask);
-    printf("\tAlpha mask: 0x%.8X\n", header.pf.amask);
-
-    printf("Caps:\n");
-    printf("\tCaps 1: 0x%.8X\n", header.caps.caps1);
-    if (header.caps.caps1 & DDSCAPS_COMPLEX) printf("\t\tDDSCAPS_COMPLEX\n");
-    if (header.caps.caps1 & DDSCAPS_TEXTURE) printf("\t\tDDSCAPS_TEXTURE\n");
-    if (header.caps.caps1 & DDSCAPS_MIPMAP) printf("\t\tDDSCAPS_MIPMAP\n");
-
-    printf("\tCaps 2: 0x%.8X\n", header.caps.caps2);
-    if (header.caps.caps2 & DDSCAPS2_VOLUME) printf("\t\tDDSCAPS2_VOLUME\n");
-    else if (header.caps.caps2 & DDSCAPS2_CUBEMAP)
-    {
-        printf("\t\tDDSCAPS2_CUBEMAP\n");
-        if ((header.caps.caps2 & DDSCAPS2_CUBEMAP_ALL_FACES) == DDSCAPS2_CUBEMAP_ALL_FACES) printf("\t\tDDSCAPS2_CUBEMAP_ALL_FACES\n");
-        else {
-            if (header.caps.caps2 & DDSCAPS2_CUBEMAP_POSITIVEX) printf("\t\tDDSCAPS2_CUBEMAP_POSITIVEX\n");
-            if (header.caps.caps2 & DDSCAPS2_CUBEMAP_NEGATIVEX) printf("\t\tDDSCAPS2_CUBEMAP_NEGATIVEX\n");
-            if (header.caps.caps2 & DDSCAPS2_CUBEMAP_POSITIVEY) printf("\t\tDDSCAPS2_CUBEMAP_POSITIVEY\n");
-            if (header.caps.caps2 & DDSCAPS2_CUBEMAP_NEGATIVEY) printf("\t\tDDSCAPS2_CUBEMAP_NEGATIVEY\n");
-            if (header.caps.caps2 & DDSCAPS2_CUBEMAP_POSITIVEZ) printf("\t\tDDSCAPS2_CUBEMAP_POSITIVEZ\n");
-            if (header.caps.caps2 & DDSCAPS2_CUBEMAP_NEGATIVEZ) printf("\t\tDDSCAPS2_CUBEMAP_NEGATIVEZ\n");
-        }
-    }
-
-    printf("\tCaps 3: 0x%.8X\n", header.caps.caps3);
-    printf("\tCaps 4: 0x%.8X\n", header.caps.caps4);
-
-    if (header.hasDX10Header())
-    {
-        printf("DX10 Header:\n");
-        printf("\tDXGI Format: %u (%s)\n", header.header10.dxgiFormat, getDxgiFormatString((DXGI_FORMAT)header.header10.dxgiFormat));
-        printf("\tResource dimension: %u (%s)\n", header.header10.resourceDimension, getD3d10ResourceDimensionString((D3D10_RESOURCE_DIMENSION)header.header10.resourceDimension));
-        printf("\tMisc flag: %u\n", header.header10.miscFlag);
-        printf("\tArray size: %u\n", header.header10.arraySize);
-    }
-
-    if (header.reserved[9] == FOURCC_NVTT)
-    {
-        int major = (header.reserved[10] >> 16) & 0xFF;
-        int minor = (header.reserved[10] >> 8) & 0xFF;
-        int revision= header.reserved[10] & 0xFF;
-
-        printf("Version:\n");
-        printf("\tNVIDIA Texture Tools %d.%d.%d\n", major, minor, revision);
-    }
-
-    if (header.reserved[7] == FOURCC_UVER)
-    {
-        printf("User Version: %d\n", header.reserved[8]);
-    }
-}
-
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include "DirectDrawSurface.h"
+#include "ColorBlock.h"
+#include "Image.h"
+#include "BlockDXT.h"
+#include "PixelFormat.h"
+
+#include "nvcore/Debug.h"
+#include "nvcore/Utils.h" // max
+#include "nvcore/StdStream.h"
+
+#include <string.h> // memset
+
+
+using namespace nv;
+
+
+const uint nv::FOURCC_NVTT = MAKEFOURCC('N', 'V', 'T', 'T'); // Tool tag; printInfo() looks for it in header.reserved[9] before decoding the version in reserved[10].
+
+const uint nv::FOURCC_DDS = MAKEFOURCC('D', 'D', 'S', ' '); // NOTE(review): presumably the file magic stored in header.fourcc -- confirm against isValid().
+const uint nv::FOURCC_DXT1 = MAKEFOURCC('D', 'X', 'T', '1'); // readBlock() decodes this code as BlockDXT1.
+const uint nv::FOURCC_DXT2 = MAKEFOURCC('D', 'X', 'T', '2'); // readBlock() decodes DXT2 and DXT3 alike, as BlockDXT3.
+const uint nv::FOURCC_DXT3 = MAKEFOURCC('D', 'X', 'T', '3');
+const uint nv::FOURCC_DXT4 = MAKEFOURCC('D', 'X', 'T', '4'); // readBlock() decodes DXT4, DXT5 and RXGB alike, as BlockDXT5.
+const uint nv::FOURCC_DXT5 = MAKEFOURCC('D', 'X', 'T', '5');
+const uint nv::FOURCC_RXGB = MAKEFOURCC('R', 'X', 'G', 'B'); // DXT5 data whose R and A channels readBlock() swaps after decoding.
+const uint nv::FOURCC_ATI1 = MAKEFOURCC('A', 'T', 'I', '1'); // readBlock() decodes this code as BlockATI1.
+const uint nv::FOURCC_ATI2 = MAKEFOURCC('A', 'T', 'I', '2'); // readBlock() decodes this code as BlockATI2.
+
+
+
+namespace
+{
+
+    static const uint FOURCC_A2XY = MAKEFOURCC('A', '2', 'X', 'Y');
+
+    static const uint FOURCC_DX10 = MAKEFOURCC('D', 'X', '1', '0'); // NOTE(review): presumably the pf.fourcc value that signals a DX10 extension header -- confirm in hasDX10Header().
+
+    static const uint FOURCC_UVER = MAKEFOURCC('U', 'V', 'E', 'R'); // printInfo() looks for it in header.reserved[7] and then prints reserved[8] as the user version.
+
+
+
+    static const uint DDSD_CAPS = 0x00000001U; // DDSD_*: bits tested against header.flags.
+    static const uint DDSD_PIXELFORMAT = 0x00001000U;
+    static const uint DDSD_WIDTH = 0x00000004U;
+    static const uint DDSD_HEIGHT = 0x00000002U;
+    static const uint DDSD_PITCH = 0x00000008U; // When set, printInfo() labels header.pitch as "Pitch".
+    static const uint DDSD_MIPMAPCOUNT = 0x00020000U;
+    static const uint DDSD_LINEARSIZE = 0x00080000U; // When set (and DDSD_PITCH is not), printInfo() labels header.pitch as "Linear size".
+    static const uint DDSD_DEPTH = 0x00800000U;
+
+    static const uint DDSCAPS_COMPLEX = 0x00000008U; // DDSCAPS_*: bits tested against header.caps.caps1.
+    static const uint DDSCAPS_TEXTURE = 0x00001000U;
+    static const uint DDSCAPS_MIPMAP = 0x00400000U;
+    static const uint DDSCAPS2_VOLUME = 0x00200000U; // DDSCAPS2_*: bits tested against header.caps.caps2.
+    static const uint DDSCAPS2_CUBEMAP = 0x00000200U;
+
+    static const uint DDSCAPS2_CUBEMAP_POSITIVEX = 0x00000400U; // One bit per cube face.
+    static const uint DDSCAPS2_CUBEMAP_NEGATIVEX = 0x00000800U;
+    static const uint DDSCAPS2_CUBEMAP_POSITIVEY = 0x00001000U;
+    static const uint DDSCAPS2_CUBEMAP_NEGATIVEY = 0x00002000U;
+    static const uint DDSCAPS2_CUBEMAP_POSITIVEZ = 0x00004000U;
+    static const uint DDSCAPS2_CUBEMAP_NEGATIVEZ = 0x00008000U;
+    static const uint DDSCAPS2_CUBEMAP_ALL_FACES = 0x0000FC00U; // Bitwise OR of the six face bits above.
+
+
+    const char * getDxgiFormatString(DXGI_FORMAT dxgiFormat) // Returns the enumerator name of dxgiFormat without its "DXGI_FORMAT_" prefix.
+    {
+#define CASE(format) case DXGI_FORMAT_##format: return #format
+        switch(dxgiFormat) // Each CASE(x) below expands to: case DXGI_FORMAT_x: return "x"
+        {
+            CASE(UNKNOWN);
+
+            CASE(R32G32B32A32_TYPELESS);
+            CASE(R32G32B32A32_FLOAT);
+            CASE(R32G32B32A32_UINT);
+            CASE(R32G32B32A32_SINT);
+
+            CASE(R32G32B32_TYPELESS);
+            CASE(R32G32B32_FLOAT);
+            CASE(R32G32B32_UINT);
+            CASE(R32G32B32_SINT);
+
+            CASE(R16G16B16A16_TYPELESS);
+            CASE(R16G16B16A16_FLOAT);
+            CASE(R16G16B16A16_UNORM);
+            CASE(R16G16B16A16_UINT);
+            CASE(R16G16B16A16_SNORM);
+            CASE(R16G16B16A16_SINT);
+
+            CASE(R32G32_TYPELESS);
+            CASE(R32G32_FLOAT);
+            CASE(R32G32_UINT);
+            CASE(R32G32_SINT);
+
+            CASE(R32G8X24_TYPELESS);
+            CASE(D32_FLOAT_S8X24_UINT);
+            CASE(R32_FLOAT_X8X24_TYPELESS);
+            CASE(X32_TYPELESS_G8X24_UINT);
+
+            CASE(R10G10B10A2_TYPELESS);
+            CASE(R10G10B10A2_UNORM);
+            CASE(R10G10B10A2_UINT);
+
+            CASE(R11G11B10_FLOAT);
+
+            CASE(R8G8B8A8_TYPELESS);
+            CASE(R8G8B8A8_UNORM);
+            CASE(R8G8B8A8_UNORM_SRGB);
+            CASE(R8G8B8A8_UINT);
+            CASE(R8G8B8A8_SNORM);
+            CASE(R8G8B8A8_SINT);
+
+            CASE(R16G16_TYPELESS);
+            CASE(R16G16_FLOAT);
+            CASE(R16G16_UNORM);
+            CASE(R16G16_UINT);
+            CASE(R16G16_SNORM);
+            CASE(R16G16_SINT);
+
+            CASE(R32_TYPELESS);
+            CASE(D32_FLOAT);
+            CASE(R32_FLOAT);
+            CASE(R32_UINT);
+            CASE(R32_SINT);
+
+            CASE(R24G8_TYPELESS);
+            CASE(D24_UNORM_S8_UINT);
+            CASE(R24_UNORM_X8_TYPELESS);
+            CASE(X24_TYPELESS_G8_UINT);
+
+            CASE(R8G8_TYPELESS);
+            CASE(R8G8_UNORM);
+            CASE(R8G8_UINT);
+            CASE(R8G8_SNORM);
+            CASE(R8G8_SINT);
+
+            CASE(R16_TYPELESS);
+            CASE(R16_FLOAT);
+            CASE(D16_UNORM);
+            CASE(R16_UNORM);
+            CASE(R16_UINT);
+            CASE(R16_SNORM);
+            CASE(R16_SINT);
+
+            CASE(R8_TYPELESS);
+            CASE(R8_UNORM);
+            CASE(R8_UINT);
+            CASE(R8_SNORM);
+            CASE(R8_SINT);
+            CASE(A8_UNORM);
+
+            CASE(R1_UNORM);
+
+            CASE(R9G9B9E5_SHAREDEXP);
+
+            CASE(R8G8_B8G8_UNORM);
+            CASE(G8R8_G8B8_UNORM);
+
+            CASE(BC1_TYPELESS);
+            CASE(BC1_UNORM);
+            CASE(BC1_UNORM_SRGB);
+
+            CASE(BC2_TYPELESS);
+            CASE(BC2_UNORM);
+            CASE(BC2_UNORM_SRGB);
+
+            CASE(BC3_TYPELESS);
+            CASE(BC3_UNORM);
+            CASE(BC3_UNORM_SRGB);
+
+            CASE(BC4_TYPELESS);
+            CASE(BC4_UNORM);
+            CASE(BC4_SNORM);
+
+            CASE(BC5_TYPELESS);
+            CASE(BC5_UNORM);
+            CASE(BC5_SNORM);
+
+            CASE(B5G6R5_UNORM);
+            CASE(B5G5R5A1_UNORM);
+            CASE(B8G8R8A8_UNORM);
+            CASE(B8G8R8X8_UNORM); // NOTE(review): pixelSize(DXGI_FORMAT) also sizes R10G10B10_XR_BIAS_A2_UNORM and the B8G8R8A8/B8G8R8X8 TYPELESS and SRGB variants, which have no name entry here.
+
+        default: 
+            return "UNKNOWN"; // Any value without a CASE entry is reported with the same string as DXGI_FORMAT_UNKNOWN.
+        }
+#undef CASE
+    }
+
+    const char * getD3d10ResourceDimensionString(D3D10_RESOURCE_DIMENSION resourceDimension) // Returns the enumerator name without its "D3D10_RESOURCE_DIMENSION_" prefix.
+    {
+        switch(resourceDimension)
+        {
+            default: // Values with no case of their own share the UNKNOWN string with the case directly below.
+            case D3D10_RESOURCE_DIMENSION_UNKNOWN: return "UNKNOWN";
+            case D3D10_RESOURCE_DIMENSION_BUFFER: return "BUFFER";
+            case D3D10_RESOURCE_DIMENSION_TEXTURE1D: return "TEXTURE1D";
+            case D3D10_RESOURCE_DIMENSION_TEXTURE2D: return "TEXTURE2D";
+            case D3D10_RESOURCE_DIMENSION_TEXTURE3D: return "TEXTURE3D";
+        }
+    }
+
+    static uint pixelSize(D3DFORMAT format) { // Returns bits per pixel (written as 8 * bytes) for a D3D9 format, or 0 when the format is not listed.
+        if (format == D3DFMT_R16F) return 8*2; // Floating point formats.
+        if (format == D3DFMT_G16R16F) return 8*4;
+        if (format == D3DFMT_A16B16G16R16F) return 8*8;
+        if (format == D3DFMT_R32F) return 8*4;
+        if (format == D3DFMT_G32R32F) return 8*8;
+        if (format == D3DFMT_A32B32G32R32F) return 8*16;
+
+        if (format == D3DFMT_R8G8B8) return 8*3; // Integer color and alpha formats.
+        if (format == D3DFMT_A8R8G8B8) return 8*4;
+        if (format == D3DFMT_X8R8G8B8) return 8*4;
+        if (format == D3DFMT_R5G6B5) return 8*2;
+        if (format == D3DFMT_X1R5G5B5) return 8*2;
+        if (format == D3DFMT_A1R5G5B5) return 8*2;
+        if (format == D3DFMT_A4R4G4B4) return 8*2;
+        if (format == D3DFMT_R3G3B2) return 8*1;
+        if (format == D3DFMT_A8) return 8*1;
+        if (format == D3DFMT_A8R3G3B2) return 8*2;
+        if (format == D3DFMT_X4R4G4B4) return 8*2;
+        if (format == D3DFMT_A2B10G10R10) return 8*4;
+        if (format == D3DFMT_A8B8G8R8) return 8*4;
+        if (format == D3DFMT_X8B8G8R8) return 8*4;
+        if (format == D3DFMT_G16R16) return 8*4;
+        if (format == D3DFMT_A2R10G10B10) return 8*4;
+        if (format == D3DFMT_A2B10G10R10) return 8*4; // NOTE(review): repeats the D3DFMT_A2B10G10R10 test a few lines up, so this line can never be reached.
+
+        if (format == D3DFMT_L8) return 8*1; // Luminance formats.
+        if (format == D3DFMT_L16) return 8*2;
+
+        return 0; // Format not listed above.
+    }
+
+    static uint pixelSize(DXGI_FORMAT format) { // Returns bits per pixel for a DXGI format; formats with no case here (the BC* formats among them) yield 0.
+        switch(format) {
+            case DXGI_FORMAT_R32G32B32A32_TYPELESS:
+            case DXGI_FORMAT_R32G32B32A32_FLOAT:
+            case DXGI_FORMAT_R32G32B32A32_UINT:
+            case DXGI_FORMAT_R32G32B32A32_SINT:
+                return 8*16;
+
+            case DXGI_FORMAT_R32G32B32_TYPELESS:
+            case DXGI_FORMAT_R32G32B32_FLOAT:
+            case DXGI_FORMAT_R32G32B32_UINT:
+            case DXGI_FORMAT_R32G32B32_SINT:
+                return 8*12;
+
+            case DXGI_FORMAT_R16G16B16A16_TYPELESS:
+            case DXGI_FORMAT_R16G16B16A16_FLOAT:
+            case DXGI_FORMAT_R16G16B16A16_UNORM:
+            case DXGI_FORMAT_R16G16B16A16_UINT:
+            case DXGI_FORMAT_R16G16B16A16_SNORM:
+            case DXGI_FORMAT_R16G16B16A16_SINT:
+            
+            case DXGI_FORMAT_R32G32_TYPELESS:
+            case DXGI_FORMAT_R32G32_FLOAT:
+            case DXGI_FORMAT_R32G32_UINT:
+            case DXGI_FORMAT_R32G32_SINT:
+
+            case DXGI_FORMAT_R32G8X24_TYPELESS:
+            case DXGI_FORMAT_D32_FLOAT_S8X24_UINT:
+            case DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS:
+            case DXGI_FORMAT_X32_TYPELESS_G8X24_UINT:
+                return 8*8; // Shared by every case label since the previous return.
+
+            case DXGI_FORMAT_R10G10B10A2_TYPELESS:
+            case DXGI_FORMAT_R10G10B10A2_UNORM:
+            case DXGI_FORMAT_R10G10B10A2_UINT:
+
+            case DXGI_FORMAT_R11G11B10_FLOAT:
+
+            case DXGI_FORMAT_R8G8B8A8_TYPELESS:
+            case DXGI_FORMAT_R8G8B8A8_UNORM:
+            case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
+            case DXGI_FORMAT_R8G8B8A8_UINT:
+            case DXGI_FORMAT_R8G8B8A8_SNORM:
+            case DXGI_FORMAT_R8G8B8A8_SINT:
+
+            case DXGI_FORMAT_R16G16_TYPELESS:
+            case DXGI_FORMAT_R16G16_FLOAT:
+            case DXGI_FORMAT_R16G16_UNORM:
+            case DXGI_FORMAT_R16G16_UINT:
+            case DXGI_FORMAT_R16G16_SNORM:
+            case DXGI_FORMAT_R16G16_SINT:
+
+            case DXGI_FORMAT_R32_TYPELESS:
+            case DXGI_FORMAT_D32_FLOAT:
+            case DXGI_FORMAT_R32_FLOAT:
+            case DXGI_FORMAT_R32_UINT:
+            case DXGI_FORMAT_R32_SINT:
+
+            case DXGI_FORMAT_R24G8_TYPELESS:
+            case DXGI_FORMAT_D24_UNORM_S8_UINT:
+            case DXGI_FORMAT_R24_UNORM_X8_TYPELESS:
+            case DXGI_FORMAT_X24_TYPELESS_G8_UINT:
+                return 8*4; // Shared by every case label since the previous return.
+
+            case DXGI_FORMAT_R8G8_TYPELESS:
+            case DXGI_FORMAT_R8G8_UNORM:
+            case DXGI_FORMAT_R8G8_UINT:
+            case DXGI_FORMAT_R8G8_SNORM:
+            case DXGI_FORMAT_R8G8_SINT:
+
+            case DXGI_FORMAT_R16_TYPELESS:
+            case DXGI_FORMAT_R16_FLOAT:
+            case DXGI_FORMAT_D16_UNORM:
+            case DXGI_FORMAT_R16_UNORM:
+            case DXGI_FORMAT_R16_UINT:
+            case DXGI_FORMAT_R16_SNORM:
+            case DXGI_FORMAT_R16_SINT:
+                return 8*2; // Shared by every case label since the previous return.
+
+            case DXGI_FORMAT_R8_TYPELESS:
+            case DXGI_FORMAT_R8_UNORM:
+            case DXGI_FORMAT_R8_UINT:
+            case DXGI_FORMAT_R8_SNORM:
+            case DXGI_FORMAT_R8_SINT:
+            case DXGI_FORMAT_A8_UNORM:
+                return 8*1;
+
+            case DXGI_FORMAT_R1_UNORM:
+                return 1; // The only entry that is not a multiple of 8 bits.
+
+            case DXGI_FORMAT_R9G9B9E5_SHAREDEXP:
+                return 8*4;
+
+            case DXGI_FORMAT_R8G8_B8G8_UNORM:
+            case DXGI_FORMAT_G8R8_G8B8_UNORM:
+                return 8*4; // NOTE(review): these look like packed two-pixel formats; confirm whether 32 is meant per pixel or per pixel pair.
+
+            case DXGI_FORMAT_B5G6R5_UNORM:
+            case DXGI_FORMAT_B5G5R5A1_UNORM:
+                return 8*2;
+            
+            case DXGI_FORMAT_B8G8R8A8_UNORM:
+            case DXGI_FORMAT_B8G8R8X8_UNORM:
+                return 8*4;
+
+            case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM:
+            case DXGI_FORMAT_B8G8R8A8_TYPELESS:
+            case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
+            case DXGI_FORMAT_B8G8R8X8_TYPELESS:
+            case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
+                return 8*4;
+        }
+
+        return 0; // No case matched.
+    }
+
+} // namespace
+
+namespace nv
+{
+    static Stream & operator<< (Stream & s, DDSPixelFormat & pf) // Streams the eight DDSPixelFormat fields through s in the order listed below; returns s.
+    {
+        nvStaticCheck(sizeof(DDSPixelFormat) == 32); // Pins the struct size to the 32 bytes streamed below.
+        s << pf.size;
+        s << pf.flags;
+        s << pf.fourcc;
+        s << pf.bitcount;
+        s.serialize(&pf.rmask, sizeof(pf.rmask)); // NOTE(review): the four masks go through raw serialize() instead of operator<< (see the disabled lines below); the reason is not visible here -- confirm before unifying.
+        s.serialize(&pf.gmask, sizeof(pf.gmask));
+        s.serialize(&pf.bmask, sizeof(pf.bmask));
+        s.serialize(&pf.amask, sizeof(pf.amask));
+        // s << pf.rmask;
+        // s << pf.gmask;
+        // s << pf.bmask;
+        // s << pf.amask;
+        return s;
+    }
+
+    static Stream & operator<< (Stream & s, DDSCaps & caps) // Streams caps1..caps4 through s in order; returns s.
+    {
+        nvStaticCheck(sizeof(DDSCaps) == 16); // Pins the struct size to the 16 bytes streamed below.
+        s << caps.caps1;
+        s << caps.caps2;
+        s << caps.caps3;
+        s << caps.caps4;
+        return s;
+    }
+
+    static Stream & operator<< (Stream & s, DDSHeader10 & header) // Streams the five DX10 extension fields through s in order; returns s.
+    {
+        nvStaticCheck(sizeof(DDSHeader10) == 20); // Pins the struct size to the 20 bytes streamed below.
+        s << header.dxgiFormat;
+        s << header.resourceDimension;
+        s << header.miscFlag;
+        s << header.arraySize;
+        s << header.reserved;
+        return s;
+    }
+
+    Stream & operator<< (Stream & s, DDSHeader & header) // Streams the whole DDS header through s, including the DX10 extension when present; returns s.
+    {
+        nvStaticCheck(sizeof(DDSHeader) == 148); // 148 = the 128 bytes streamed unconditionally below + the 20-byte DDSHeader10 member.
+        s << header.fourcc;
+        s << header.size;
+        s << header.flags;
+        s << header.height;
+        s << header.width;
+        s << header.pitch;
+        s << header.depth;
+        s << header.mipmapcount;
+        for (int i = 0; i < 11; i++) { // The eleven reserved words are streamed one at a time.
+            s << header.reserved[i];
+        }
+        s << header.pf;
+        s << header.caps;
+        s << header.notused;
+
+        if (header.hasDX10Header()) // The extension is streamed only when the fields handled above say it is present.
+        {
+            s << header.header10;
+        }
+
+        return s;
+    }
+
+} // nv namespace
+
+namespace
+{
+    struct FormatDescriptor // One row of the s_d3d9Formats table: a format id plus the pixel layout that identifies it.
+    {
+        uint format;   // D3DFMT_* value.
+        uint bitcount; // Bits per pixel.
+        uint rmask;    // Red channel bit mask; 0 when the channel is absent.
+        uint gmask;    // Green channel bit mask.
+        uint bmask;    // Blue channel bit mask.
+        uint amask;    // Alpha channel bit mask.
+    };
+
+    static const FormatDescriptor s_d3d9Formats[] = // Pixel layouts of the uncompressed D3D9 formats; searched by nv::findD3D9Format().
+    {   // format,               bitcount, rmask, gmask,      bmask,      amask -- every mask column is a bit mask, never a bit width.
+        { D3DFMT_R8G8B8,         24, 0xFF0000,   0xFF00,     0xFF,       0 },
+        { D3DFMT_A8R8G8B8,       32, 0xFF0000,   0xFF00,     0xFF,       0xFF000000 },  // DXGI_FORMAT_B8G8R8A8_UNORM
+        { D3DFMT_X8R8G8B8,       32, 0xFF0000,   0xFF00,     0xFF,       0 },           // DXGI_FORMAT_B8G8R8X8_UNORM
+        { D3DFMT_R5G6B5,         16, 0xF800,     0x7E0,      0x1F,       0 },           // DXGI_FORMAT_B5G6R5_UNORM
+        { D3DFMT_X1R5G5B5,       16, 0x7C00,     0x3E0,      0x1F,       0 },
+        { D3DFMT_A1R5G5B5,       16, 0x7C00,     0x3E0,      0x1F,       0x8000 },      // DXGI_FORMAT_B5G5R5A1_UNORM
+        { D3DFMT_A4R4G4B4,       16, 0xF00,      0xF0,       0xF,        0xF000 },
+        { D3DFMT_R3G3B2,         8,  0xE0,       0x1C,       0x3,        0 },
+        { D3DFMT_A8,             8,  0,          0,          0,          0xFF },        // DXGI_FORMAT_A8_UNORM; was 8 (a bit width), which no real A8 mask equals
+        { D3DFMT_A8R3G3B2,       16, 0xE0,       0x1C,       0x3,        0xFF00 },
+        { D3DFMT_X4R4G4B4,       16, 0xF00,      0xF0,       0xF,        0 },
+        { D3DFMT_A2B10G10R10,    32, 0x3FF,      0xFFC00,    0x3FF00000, 0xC0000000 },  // DXGI_FORMAT_R10G10B10A2
+        { D3DFMT_A8B8G8R8,       32, 0xFF,       0xFF00,     0xFF0000,   0xFF000000 },  // DXGI_FORMAT_R8G8B8A8_UNORM
+        { D3DFMT_X8B8G8R8,       32, 0xFF,       0xFF00,     0xFF0000,   0 },
+        { D3DFMT_G16R16,         32, 0xFFFF,     0xFFFF0000, 0,          0 },           // DXGI_FORMAT_R16G16_UNORM
+        { D3DFMT_A2R10G10B10,    32, 0x3FF00000, 0xFFC00,    0x3FF,      0xC0000000 },
+        { D3DFMT_A2B10G10R10,    32, 0x3FF,      0xFFC00,    0x3FF00000, 0xC0000000 },  // NOTE(review): identical to the earlier A2B10G10R10 row; harmless but redundant
+
+        { D3DFMT_L8,             8,  0xFF,       0,          0,          0 },           // DXGI_FORMAT_R8_UNORM; was 8 (a bit width)
+        { D3DFMT_L16,            16, 0xFFFF,     0,          0,          0 },           // DXGI_FORMAT_R16_UNORM; was 16 (a bit width)
+    };
+
+    static const uint s_d3d9FormatCount = NV_ARRAY_SIZE(s_d3d9Formats); // Number of rows in the table above.
+
+} // namespace
+
+/// Return the format code of the first table entry whose bit count and four
+/// channel masks all equal the arguments, or 0 when no entry matches.
+uint nv::findD3D9Format(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask)
+{
+    for (uint index = 0; index != s_d3d9FormatCount; ++index)
+    {
+        const FormatDescriptor & candidate = s_d3d9Formats[index];
+
+        if (candidate.bitcount != bitcount) continue;
+        if (candidate.rmask != rmask) continue;
+        if (candidate.gmask != gmask) continue;
+        if (candidate.bmask != bmask) continue;
+        if (candidate.amask != amask) continue;
+
+        return candidate.format;
+    }
+
+    // No matching descriptor.
+    return 0;
+}
+
+
+/// Build an empty header: magic and structure sizes filled in, only the
+/// CAPS/PIXELFORMAT flags set, every dimension and mask zeroed.
+DDSHeader::DDSHeader()
+{
+    // Main header.
+    fourcc = FOURCC_DDS;
+    size = 124;
+    flags = (DDSD_CAPS|DDSD_PIXELFORMAT);
+    height = 0;
+    width = 0;
+    pitch = 0;
+    depth = 0;
+    mipmapcount = 0;
+    notused = 0;
+
+    // Reserved words: all zero except the tool signature and version.
+    memset(reserved, 0, sizeof(reserved));
+    reserved[9] = FOURCC_NVTT;
+    reserved[10] = (2 << 16) | (1 << 8) | (0); // major.minor.revision
+
+    // Pixel format.
+    pf.size = 32;
+    pf.flags = 0;
+    pf.fourcc = 0;
+    pf.bitcount = 0;
+    pf.rmask = 0;
+    pf.gmask = 0;
+    pf.bmask = 0;
+    pf.amask = 0;
+
+    // Capabilities.
+    caps.caps1 = DDSCAPS_TEXTURE;
+    caps.caps2 = 0;
+    caps.caps3 = 0;
+    caps.caps4 = 0;
+
+    // DX10 extension.
+    header10.dxgiFormat = DXGI_FORMAT_UNKNOWN;
+    header10.resourceDimension = D3D10_RESOURCE_DIMENSION_UNKNOWN;
+    header10.miscFlag = 0;
+    header10.arraySize = 0;
+    header10.reserved = 0;
+}
+
+/// Store the width and mark the width field as present.
+void DDSHeader::setWidth(uint w)
+{
+    width = w;
+    flags |= DDSD_WIDTH;
+}
+
+/// Store the height and mark the height field as present.
+void DDSHeader::setHeight(uint h)
+{
+    height = h;
+    flags |= DDSD_HEIGHT;
+}
+
+/// Store the depth and mark the depth field as present.
+void DDSHeader::setDepth(uint d)
+{
+    depth = d;
+    flags |= DDSD_DEPTH;
+}
+
+/// Set the number of mipmap levels.
+/// A count of 0 or 1 clears the mipmap-count flag, stores 1 and resets caps1;
+/// any larger count sets the flag and adds the COMPLEX and MIPMAP caps.
+void DDSHeader::setMipmapCount(uint count)
+{
+    if (count > 1)
+    {
+        flags |= DDSD_MIPMAPCOUNT;
+        mipmapcount = count;
+        caps.caps1 |= DDSCAPS_COMPLEX | DDSCAPS_MIPMAP;
+        return;
+    }
+
+    // Single-level surface.
+    flags &= ~DDSD_MIPMAPCOUNT;
+    mipmapcount = 1;
+
+    // caps1 is rebuilt from scratch; COMPLEX is kept only when caps2 is non-zero.
+    if (caps.caps2 != 0) {
+        caps.caps1 = DDSCAPS_TEXTURE | DDSCAPS_COMPLEX;
+    }
+    else {
+        caps.caps1 = DDSCAPS_TEXTURE;
+    }
+}
+
+/// Mark the DX10 header as describing a single 2D texture.
+void DDSHeader::setTexture2D()
+{
+    header10.arraySize = 1;
+    header10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D;
+}
+
+/// Mark the header as a volume texture: caps2 is overwritten with the VOLUME
+/// cap and the DX10 header describes a single 3D texture.
+void DDSHeader::setTexture3D()
+{
+    header10.arraySize = 1;
+    header10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE3D;
+
+    caps.caps2 = DDSCAPS2_VOLUME;
+}
+
+/// Mark the header as a cube map with all six faces: adds the COMPLEX cap,
+/// overwrites caps2, and describes a 2D texture array of size 6 in the DX10 header.
+void DDSHeader::setTextureCube()
+{
+    header10.arraySize = 6;
+    header10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D;
+
+    caps.caps2 = DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_ALL_FACES;
+    caps.caps1 |= DDSCAPS_COMPLEX;
+}
+
+/// Store @a size in the pitch field and flag it as a linear size
+/// (the PITCH flag is cleared, the LINEARSIZE flag is set).
+void DDSHeader::setLinearSize(uint size)
+{
+    // Note: 'size' here is the parameter, which hides the header's own size member.
+    this->pitch = size;
+
+    uint updated = this->flags;
+    updated &= ~DDSD_PITCH;
+    updated |= DDSD_LINEARSIZE;
+    this->flags = updated;
+}
+
+/// Store @a pitch in the pitch field and flag it as a pitch
+/// (the LINEARSIZE flag is cleared, the PITCH flag is set).
+void DDSHeader::setPitch(uint pitch)
+{
+    // The parameter hides the member of the same name, hence the explicit this->.
+    this->pitch = pitch;
+
+    uint updated = this->flags;
+    updated &= ~DDSD_LINEARSIZE;
+    updated |= DDSD_PITCH;
+    this->flags = updated;
+}
+
+/// Select a fourcc pixel format built from four characters.
+/// Pixel format flags become DDPF_FOURCC; bit count and all masks are zeroed.
+void DDSHeader::setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3)
+{
+    const uint code = MAKEFOURCC(c0, c1, c2, c3);
+
+    // Clear the uncompressed-format description first, then set the fourcc.
+    pf.amask = 0;
+    pf.bmask = 0;
+    pf.gmask = 0;
+    pf.rmask = 0;
+    pf.bitcount = 0;
+
+    pf.fourcc = code;
+    pf.flags = DDPF_FOURCC;
+}
+
+/// Select a fourcc pixel format from an already packed 32-bit code.
+/// Pixel format flags become DDPF_FOURCC; bit count and all masks are zeroed.
+void DDSHeader::setFormatCode(uint32 code)
+{
+    // Clear the uncompressed-format description first, then set the fourcc.
+    pf.amask = 0;
+    pf.bmask = 0;
+    pf.gmask = 0;
+    pf.rmask = 0;
+    pf.bitcount = 0;
+
+    pf.fourcc = code;
+    pf.flags = DDPF_FOURCC;
+}
+
+/// Store a four-character swizzle code in the pixel format's bit count field.
+void DDSHeader::setSwizzleCode(uint8 c0, uint8 c1, uint8 c2, uint8 c3)
+{
+    const uint swizzle = MAKEFOURCC(c0, c1, c2, c3);
+    pf.bitcount = swizzle;
+}
+
+
+/// Describe an uncompressed pixel format by bit count and channel masks.
+///
+/// The pixel format flags are derived from the masks:
+///  - red only (green and blue masks zero): DDPF_LUMINANCE
+///  - any green or blue mask:               DDPF_RGB
+///  - colour plus alpha mask:               additionally DDPF_ALPHAPIXELS
+///  - alpha mask only:                      DDPF_ALPHA
+/// When all masks are zero the flags are left untouched.
+///
+/// @param bitcount  Bits per pixel; when 0 it is computed as the position of
+///                  the highest bit set in any mask.
+/// @param rmask, gmask, bmask, amask  Channel masks; they must not overlap.
+///
+/// The fourcc is always cleared.
+void DDSHeader::setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask)
+{
+    // Make sure the masks are correct.
+    nvCheck((rmask & gmask) == 0);
+    nvCheck((rmask & bmask) == 0);
+    nvCheck((rmask & amask) == 0);
+    nvCheck((gmask & bmask) == 0);
+    nvCheck((gmask & amask) == 0);
+    nvCheck((bmask & amask) == 0);
+
+    if (rmask != 0 || gmask != 0 || bmask != 0)
+    {
+        if (gmask == 0 && bmask == 0)
+        {
+            this->pf.flags = DDPF_LUMINANCE;
+        }
+        else
+        {
+            this->pf.flags = DDPF_RGB;
+        }
+
+        if (amask != 0) {
+            this->pf.flags |= DDPF_ALPHAPIXELS;
+        }
+    }
+    else if (amask != 0)
+    {
+        // Assign, like the colour branches above, so that flags left over from
+        // a previous format (e.g. DDPF_FOURCC or DDPF_RGB) do not survive while
+        // the fourcc is being cleared below.
+        this->pf.flags = DDPF_ALPHA;
+    }
+
+    if (bitcount == 0)
+    {
+        // Compute bit count from the masks.
+        uint total = rmask | gmask | bmask | amask;
+        while(total != 0) {
+            bitcount++;
+            total >>= 1;
+        }
+    }
+
+    // D3DX functions do not like this:
+    this->pf.fourcc = 0; //findD3D9Format(bitcount, rmask, gmask, bmask, amask);
+    /*if (this->pf.fourcc) {
+        this->pf.flags |= DDPF_FOURCC;
+    }*/
+
+    nvCheck(bitcount > 0 && bitcount <= 32);
+    this->pf.bitcount = bitcount;
+    this->pf.rmask = rmask;
+    this->pf.gmask = gmask;
+    this->pf.bmask = bmask;
+    this->pf.amask = amask;
+}
+
+/// Select a DXGI format: the pixel format becomes the DX10 fourcc and the
+/// format value is stored in the DX10 extension header.
+void DDSHeader::setDX10Format(uint format)
+{
+    header10.dxgiFormat = format;
+
+    pf.fourcc = FOURCC_DX10;
+    pf.flags = DDPF_FOURCC;
+}
+
+/// Set or clear the DDPF_NORMAL bit of the pixel format flags.
+void DDSHeader::setNormalFlag(bool b)
+{
+    if (!b) {
+        pf.flags &= ~DDPF_NORMAL;
+        return;
+    }
+    pf.flags |= DDPF_NORMAL;
+}
+
+/// Set or clear the DDPF_SRGB bit of the pixel format flags.
+void DDSHeader::setSrgbFlag(bool b)
+{
+    if (!b) {
+        pf.flags &= ~DDPF_SRGB;
+        return;
+    }
+    pf.flags |= DDPF_SRGB;
+}
+
+/// Set or clear the DDPF_ALPHAPIXELS bit of the pixel format flags.
+void DDSHeader::setHasAlphaFlag(bool b)
+{
+    if (!b) {
+        pf.flags &= ~DDPF_ALPHAPIXELS;
+        return;
+    }
+    pf.flags |= DDPF_ALPHAPIXELS;
+}
+
+/// Record a user version: reserved[7] receives the UVER tag and reserved[8]
+/// the version number.
+void DDSHeader::setUserVersion(int version)
+{
+    reserved[8] = version;
+    reserved[7] = FOURCC_UVER;
+}
+
+/// Pass every 32-bit field of the header, including the DX10 extension,
+/// through POSH_LittleU32 and store the result back in place.
+void DDSHeader::swapBytes()
+{
+    // Main header.
+    fourcc = POSH_LittleU32(fourcc);
+    size = POSH_LittleU32(size);
+    flags = POSH_LittleU32(flags);
+    height = POSH_LittleU32(height);
+    width = POSH_LittleU32(width);
+    pitch = POSH_LittleU32(pitch);
+    depth = POSH_LittleU32(depth);
+    mipmapcount = POSH_LittleU32(mipmapcount);
+
+    for (int r = 0; r != 11; ++r) {
+        reserved[r] = POSH_LittleU32(reserved[r]);
+    }
+
+    // Pixel format.
+    pf.size = POSH_LittleU32(pf.size);
+    pf.flags = POSH_LittleU32(pf.flags);
+    pf.fourcc = POSH_LittleU32(pf.fourcc);
+    pf.bitcount = POSH_LittleU32(pf.bitcount);
+    pf.rmask = POSH_LittleU32(pf.rmask);
+    pf.gmask = POSH_LittleU32(pf.gmask);
+    pf.bmask = POSH_LittleU32(pf.bmask);
+    pf.amask = POSH_LittleU32(pf.amask);
+
+    // Capabilities.
+    caps.caps1 = POSH_LittleU32(caps.caps1);
+    caps.caps2 = POSH_LittleU32(caps.caps2);
+    caps.caps3 = POSH_LittleU32(caps.caps3);
+    caps.caps4 = POSH_LittleU32(caps.caps4);
+    notused = POSH_LittleU32(notused);
+
+    // DX10 extension.
+    header10.dxgiFormat = POSH_LittleU32(header10.dxgiFormat);
+    header10.resourceDimension = POSH_LittleU32(header10.resourceDimension);
+    header10.miscFlag = POSH_LittleU32(header10.miscFlag);
+    header10.arraySize = POSH_LittleU32(header10.arraySize);
+    header10.reserved = POSH_LittleU32(header10.reserved);
+}
+
+/// True when the pixel format fourcc is the DX10 marker.
+/// Only the fourcc value is compared; the DDPF_FOURCC flag is not consulted.
+bool DDSHeader::hasDX10Header() const
+{
+    const bool isDX10 = (pf.fourcc == FOURCC_DX10);
+    return isDX10;
+}
+
+/// Return the tool signature word, stored in reserved[9].
+uint DDSHeader::signature() const
+{
+    const uint tag = reserved[9];
+    return tag;
+}
+
+/// Return the packed tool version word, stored in reserved[10].
+uint DDSHeader::toolVersion() const
+{
+    const uint packedVersion = reserved[10];
+    return packedVersion;
+}
+
+/// Return the user version stored in reserved[8], or 0 when reserved[7] does
+/// not carry the UVER tag.
+uint DDSHeader::userVersion() const
+{
+    const bool tagged = (reserved[7] == FOURCC_UVER);
+    return tagged ? reserved[8] : 0;
+}
+
+/// True when the DDPF_NORMAL bit is set in the pixel format flags.
+bool DDSHeader::isNormalMap() const
+{
+    const uint bit = pf.flags & DDPF_NORMAL;
+    return bit != 0;
+}
+
+/// True when the DDPF_SRGB bit is set in the pixel format flags.
+bool DDSHeader::isSrgb() const
+{
+    const uint bit = pf.flags & DDPF_SRGB;
+    return bit != 0;
+}
+
+/// True when the DDPF_ALPHAPIXELS bit is set in the pixel format flags.
+bool DDSHeader::hasAlpha() const
+{
+    const uint bit = pf.flags & DDPF_ALPHAPIXELS;
+    return bit != 0;
+}
+
+/// Return the D3D9 format code: the fourcc itself when the DDPF_FOURCC flag
+/// is set, otherwise the result of a table lookup by bit count and masks.
+uint DDSHeader::d3d9Format() const
+{
+    const bool usesFourCC = (pf.flags & DDPF_FOURCC) != 0;
+    if (!usesFourCC) {
+        return findD3D9Format(pf.bitcount, pf.rmask, pf.gmask, pf.bmask, pf.amask);
+    }
+    return pf.fourcc;
+}
+
+/// Return the pixel size of the format described by this header.
+///
+///  - DX10 header present: size reported by ::pixelSize for the DXGI format.
+///  - DDPF_FOURCC set in the pixel format: size reported by ::pixelSize for
+///    the fourcc interpreted as a D3DFORMAT.
+///  - otherwise: the pixel format's bit count (RGB or luminance expected).
+uint DDSHeader::pixelSize() const
+{
+    if (hasDX10Header()) {
+        return ::pixelSize((DXGI_FORMAT)header10.dxgiFormat);
+    }
+    else {
+        // DDPF_FOURCC is a pixel-format flag, so it must be tested against
+        // pf.flags (as d3d9Format() does), not against the DDSD_* header flags.
+        if (pf.flags & DDPF_FOURCC) {
+            return ::pixelSize((D3DFORMAT)pf.fourcc);
+        }
+        else {
+            nvDebugCheck((pf.flags & DDPF_RGB) || (pf.flags & DDPF_LUMINANCE));
+            return pf.bitcount;
+        }
+    }
+}
+
+/// Return the size in bytes of one compressed block (8 or 16) for the
+/// recognised block formats, or 0 when the format is not a block format.
+uint DDSHeader::blockSize() const
+{
+    uint bytes = 0; // Not a block image unless a case below says otherwise.
+
+    switch (pf.fourcc)
+    {
+    case FOURCC_DXT1:
+    case FOURCC_ATI1:
+        bytes = 8;
+        break;
+
+    case FOURCC_DXT2:
+    case FOURCC_DXT3:
+    case FOURCC_DXT4:
+    case FOURCC_DXT5:
+    case FOURCC_RXGB:
+    case FOURCC_ATI2:
+        bytes = 16;
+        break;
+
+    case FOURCC_DX10:
+        // The actual format lives in the DX10 extension header.
+        switch (header10.dxgiFormat)
+        {
+        case DXGI_FORMAT_BC1_TYPELESS:
+        case DXGI_FORMAT_BC1_UNORM:
+        case DXGI_FORMAT_BC1_UNORM_SRGB:
+        case DXGI_FORMAT_BC4_TYPELESS:
+        case DXGI_FORMAT_BC4_UNORM:
+        case DXGI_FORMAT_BC4_SNORM:
+            bytes = 8;
+            break;
+
+        case DXGI_FORMAT_BC2_TYPELESS:
+        case DXGI_FORMAT_BC2_UNORM:
+        case DXGI_FORMAT_BC2_UNORM_SRGB:
+        case DXGI_FORMAT_BC3_TYPELESS:
+        case DXGI_FORMAT_BC3_UNORM:
+        case DXGI_FORMAT_BC3_UNORM_SRGB:
+        case DXGI_FORMAT_BC5_TYPELESS:
+        case DXGI_FORMAT_BC5_UNORM:
+        case DXGI_FORMAT_BC5_SNORM:
+        case DXGI_FORMAT_BC6H_TYPELESS:
+        case DXGI_FORMAT_BC6H_SF16:
+        case DXGI_FORMAT_BC6H_UF16:
+        case DXGI_FORMAT_BC7_TYPELESS:
+        case DXGI_FORMAT_BC7_UNORM:
+        case DXGI_FORMAT_BC7_UNORM_SRGB:
+            bytes = 16;
+            break;
+        }
+        break;
+    }
+
+    return bytes;
+}
+
+/// True when blockSize() reports a non-zero block size.
+bool DDSHeader::isBlockFormat() const
+{
+    const uint bytesPerBlock = blockSize();
+    return bytesPerBlock != 0;
+}
+
+
+
+
+
+/// Create a surface with no stream attached.
+DirectDrawSurface::DirectDrawSurface()
+    : stream(NULL)
+{
+}
+
+/// Create a surface and immediately load it from the named file.
+/// The result of load() is not reported here.
+DirectDrawSurface::DirectDrawSurface(const char * name)
+    : stream(NULL)
+{
+    this->load(name);
+}
+
+/// Create a surface and immediately load it from @a s.
+/// The result of load() is not reported here.
+DirectDrawSurface::DirectDrawSurface(Stream * s)
+    : stream(NULL)
+{
+    this->load(s);
+}
+
+/// Delete the attached stream, if any.
+DirectDrawSurface::~DirectDrawSurface()
+{
+    delete this->stream;
+}
+
+/// Open @a filename as an input stream and load the header from it.
+/// The newly created stream is handed to load(Stream *).
+bool DirectDrawSurface::load(const char * filename)
+{
+    Stream * const input = new StdInputStream(filename);
+    return load(input);
+}
+
+/// Attach @a stream to this surface and read the DDS header from it.
+///
+/// Any previously attached stream is deleted and replaced by @a stream;
+/// the surface deletes the attached stream in its destructor.
+///
+/// @param stream  Stream to read from; may be NULL.
+/// @return true when the header was read from the stream, false when the
+///         stream is NULL or already in an error state. A true result does
+///         not imply the header is valid; use isValid() for that.
+bool DirectDrawSurface::load(Stream * stream)
+{
+    // Re-loading the stream we already own must not delete it before use.
+    if (this->stream != stream)
+    {
+        delete this->stream;
+        this->stream = stream;
+    }
+
+    // A NULL stream cannot be queried or read.
+    if (stream == NULL || stream->isError())
+    {
+        return false;
+    }
+
+    (*stream) << header;
+    return true;
+}
+
+/// Check that a usable stream is attached and that the header carries the
+/// DDS magic, the expected structure sizes, the width and height flags, and
+/// the TEXTURE cap.
+bool DirectDrawSurface::isValid() const
+{
+    // Stream must exist and be healthy.
+    if (stream == NULL) return false;
+    if (stream->isError()) return false;
+
+    // Magic number and header size.
+    if (header.fourcc != FOURCC_DDS) return false;
+    if (header.size != 124) return false;
+
+    // Width and height must be flagged as present.
+    const uint required = (DDSD_WIDTH|DDSD_HEIGHT/*|DDSD_CAPS|DDSD_PIXELFORMAT*/);
+    const bool hasRequired = (header.flags & required) == required;
+    if (!hasRequired) return false;
+
+    // Pixel format structure size.
+    if (header.pf.size != 32) return false;
+
+    // Must be a texture.
+    return (header.caps.caps1 & DDSCAPS_TEXTURE) != 0;
+}
+
+/// Report whether this surface uses a format this class can decode.
+///
+/// With a DX10 header only the BC1..BC5 UNORM formats are accepted. Otherwise
+/// the fourcc must be one of the known block formats, or the pixel format must
+/// be RGB or luminance; cube maps must additionally be square and contain all
+/// six faces.
+bool DirectDrawSurface::isSupported() const
+{
+    nvDebugCheck(isValid());
+
+    if (header.hasDX10Header())
+    {
+        switch (header.header10.dxgiFormat)
+        {
+        case DXGI_FORMAT_BC1_UNORM:
+        case DXGI_FORMAT_BC2_UNORM:
+        case DXGI_FORMAT_BC3_UNORM:
+        case DXGI_FORMAT_BC4_UNORM:
+        case DXGI_FORMAT_BC5_UNORM:
+            return true;
+        default:
+            return false;
+        }
+    }
+
+    const uint pfFlags = header.pf.flags;
+
+    if (pfFlags & DDPF_FOURCC)
+    {
+        const uint code = header.pf.fourcc;
+        const bool knownCode =
+            code == FOURCC_DXT1 ||
+            code == FOURCC_DXT2 ||
+            code == FOURCC_DXT3 ||
+            code == FOURCC_DXT4 ||
+            code == FOURCC_DXT5 ||
+            code == FOURCC_RXGB ||
+            code == FOURCC_ATI1 ||
+            code == FOURCC_ATI2;
+
+        if (!knownCode)
+        {
+            // Unknown fourcc code.
+            return false;
+        }
+    }
+    else
+    {
+        // All RGB and luminance formats are supported now; anything else is not.
+        const bool isRgb = (pfFlags & DDPF_RGB) != 0;
+        const bool isLuminance = (pfFlags & DDPF_LUMINANCE) != 0;
+        if (!isRgb && !isLuminance)
+        {
+            return false;
+        }
+    }
+
+    if (isTextureCube())
+    {
+        if (header.width != header.height)
+        {
+            return false;
+        }
+
+        // Cubemaps must contain all faces.
+        if ((header.caps.caps2 & DDSCAPS2_CUBEMAP_ALL_FACES) != DDSCAPS2_CUBEMAP_ALL_FACES)
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/// Report whether decoded images of this surface carry alpha.
+///
+/// DX10: true for BC1/BC2/BC3 UNORM only. RGB: true when the alpha mask is
+/// non-zero. FourCC: false for RXGB, ATI1, ATI2 or when the normal flag is
+/// set, true otherwise. Everything else: false.
+bool DirectDrawSurface::hasAlpha() const
+{
+    if (header.hasDX10Header())
+    {
+#pragma NV_MESSAGE("TODO: Update hasAlpha to handle all DX10 formats.")
+        switch (header.header10.dxgiFormat)
+        {
+        case DXGI_FORMAT_BC1_UNORM:
+        case DXGI_FORMAT_BC2_UNORM:
+        case DXGI_FORMAT_BC3_UNORM:
+            return true;
+        default:
+            return false;
+        }
+    }
+
+    if (header.pf.flags & DDPF_RGB)
+    {
+        return header.pf.amask != 0;
+    }
+
+    if (!(header.pf.flags & DDPF_FOURCC))
+    {
+        return false;
+    }
+
+    const bool opaqueFormat =
+        header.pf.fourcc == FOURCC_RXGB ||
+        header.pf.fourcc == FOURCC_ATI1 ||
+        header.pf.fourcc == FOURCC_ATI2 ||
+        (header.pf.flags & DDPF_NORMAL) != 0;
+
+    // @@ Here we could check the ALPHA_PIXELS flag, but nobody sets it. (except us?)
+    return !opaqueFormat;
+}
+
+/// Return the header's mipmap count when it is flagged as present, else 1.
+uint DirectDrawSurface::mipmapCount() const
+{
+    nvDebugCheck(isValid());
+    const bool present = (header.flags & DDSD_MIPMAPCOUNT) != 0;
+    return present ? header.mipmapcount : 1;
+}
+
+
+/// Return the header's width when it is flagged as present, else 1.
+uint DirectDrawSurface::width() const
+{
+    nvDebugCheck(isValid());
+    const bool present = (header.flags & DDSD_WIDTH) != 0;
+    return present ? header.width : 1;
+}
+
+/// Return the header's height when it is flagged as present, else 1.
+uint DirectDrawSurface::height() const
+{
+    nvDebugCheck(isValid());
+    const bool present = (header.flags & DDSD_HEIGHT) != 0;
+    return present ? header.height : 1;
+}
+
+/// Return the header's depth when it is flagged as present, else 1.
+uint DirectDrawSurface::depth() const
+{
+    nvDebugCheck(isValid());
+    const bool present = (header.flags & DDSD_DEPTH) != 0;
+    return present ? header.depth : 1;
+}
+
+/// True only when a DX10 header is present and declares a 1D texture.
+bool DirectDrawSurface::isTexture1D() const
+{
+    nvDebugCheck(isValid());
+
+    if (!header.hasDX10Header())
+    {
+        return false;
+    }
+
+    return header.header10.resourceDimension == D3D10_RESOURCE_DIMENSION_TEXTURE1D;
+}
+
+/// With a DX10 header: true when it declares a 2D texture.
+/// Without one: true when the surface is neither a volume nor a cube map.
+bool DirectDrawSurface::isTexture2D() const
+{
+    nvDebugCheck(isValid());
+
+    if (!header.hasDX10Header())
+    {
+        if (isTexture3D()) return false;
+        return !isTextureCube();
+    }
+
+    return header.header10.resourceDimension == D3D10_RESOURCE_DIMENSION_TEXTURE2D;
+}
+
+/// With a DX10 header: true when it declares a 3D texture.
+/// Without one: true when the VOLUME cap is set in caps2.
+bool DirectDrawSurface::isTexture3D() const
+{
+    nvDebugCheck(isValid());
+
+    if (!header.hasDX10Header())
+    {
+        const uint volumeBit = header.caps.caps2 & DDSCAPS2_VOLUME;
+        return volumeBit != 0;
+    }
+
+    return header.header10.resourceDimension == D3D10_RESOURCE_DIMENSION_TEXTURE3D;
+}
+
+/// True when the CUBEMAP cap is set in caps2.
+bool DirectDrawSurface::isTextureCube() const
+{
+    nvDebugCheck(isValid());
+    const uint cubeBit = header.caps.caps2 & DDSCAPS2_CUBEMAP;
+    return cubeBit != 0;
+}
+
+/// Forward the normal-map flag to the header of a valid surface.
+void DirectDrawSurface::setNormalFlag(bool b)
+{
+    nvDebugCheck(isValid());
+
+    this->header.setNormalFlag(b);
+}
+
+/// Forward the has-alpha flag to the header of a valid surface.
+void DirectDrawSurface::setHasAlphaFlag(bool b)
+{
+    nvDebugCheck(isValid());
+
+    this->header.setHasAlphaFlag(b);
+}
+
+/// Forward the user version to the header of a valid surface.
+void DirectDrawSurface::setUserVersion(int version)
+{
+    nvDebugCheck(isValid());
+
+    this->header.setUserVersion(version);
+}
+
+/// Decode one mipmap level of one face into @a img.
+///
+/// The stream is positioned at the surface, the image is allocated with the
+/// extents of that level and given an ARGB or RGB format depending on
+/// hasAlpha(), and the pixels are then read with the block or linear reader.
+/// Non-DX10 surfaces that are neither RGB nor fourcc are allocated but not read.
+void DirectDrawSurface::mipmap(Image * img, uint face, uint mipmap)
+{
+    nvDebugCheck(isValid());
+
+    stream->seek(offset(face, mipmap));
+
+    // Halve each extent once per level, never going below one.
+    uint levelWidth = width();
+    uint levelHeight = height();
+    uint levelDepth = depth();
+
+    for (uint remaining = mipmap; remaining > 0; remaining--)
+    {
+        levelWidth = max(1U, levelWidth / 2);
+        levelHeight = max(1U, levelHeight / 2);
+        levelDepth = max(1U, levelDepth / 2);
+    }
+
+    img->allocate(levelWidth, levelHeight, levelDepth);
+    img->setFormat(hasAlpha() ? Image::Format_ARGB : Image::Format_RGB);
+
+    if (header.hasDX10Header())
+    {
+        // So far only block formats supported.
+        readBlockImage(img);
+        return;
+    }
+
+    const uint pfFlags = header.pf.flags;
+    if (pfFlags & DDPF_RGB)
+    {
+        readLinearImage(img);
+    }
+    else if (pfFlags & DDPF_FOURCC)
+    {
+        readBlockImage(img);
+    }
+}
+
+/*void * DirectDrawSurface::readData(uint * sizePtr)
+{
+    uint header_size = 128; // sizeof(DDSHeader);
+
+    if (header.hasDX10Header())
+    {
+        header_size += 20; // sizeof(DDSHeader10);
+    }
+
+    stream->seek(header_size);
+
+    int size = stream->size() - header_size;
+    *sizePtr = size;
+
+    void * data = new unsigned char [size];
+    
+    size = stream->serialize(data, size);
+    nvDebugCheck(size == *sizePtr);
+
+    return data;
+}*/
+
+/*uint DirectDrawSurface::surfaceSize(uint mipmap) const
+{
+    uint w = header.width();
+    uint h = header.height();
+    uint d = header.depth();
+    for (int m = 0; m < mipmap; m++) {
+        w = (w + 1) / 2;
+        h = (h + 1) / 2;
+        d = (d + 1) / 2;
+    }
+    
+    bool isBlockFormat;
+    uint blockOrPixelSize;
+
+    if (header.hasDX10Header()) {
+        blockOrPixelSize = blockSize(header10.dxgiFormat);
+        isBlockFormat = (blockOrPixelSize != 0);
+        if (isBlockFormat) {
+            blockOrPixelSize = pixelSize(header10.dxgiFormat);
+        }
+    }
+    else {
+        header.pf.flags 
+    }
+
+    if (isBlockFormat) {
+        w = (w + 3) / 4;
+        h = (h + 3) / 4;
+        d = (d + 3) / 4; // @@ Is it necessary to align the depths?
+    }
+
+    uint blockOrPixelCount = w * h * d;
+
+    return blockCount = blockOrPixelSize;
+}*/
+
+/// Read the raw bytes of one surface into @a data.
+/// Fails when @a size differs from surfaceSize(mipmap), when seeking leaves
+/// the stream in an error state, or when fewer than @a size bytes are read.
+bool DirectDrawSurface::readSurface(uint face, uint mipmap, void * data, uint size)
+{
+    const uint expected = surfaceSize(mipmap);
+    if (expected != size)
+    {
+        return false;
+    }
+
+    stream->seek(offset(face, mipmap));
+    if (stream->isError())
+    {
+        return false;
+    }
+
+    const bool complete = (stream->serialize(data, size) == size);
+    return complete;
+}
+
+
+/// Read an uncompressed image from the current stream position into @a img.
+///
+/// Each pixel occupies ceil(pf.bitcount / 8) bytes in the stream. The channels
+/// are extracted with the header's masks and converted to 8 bits each.
+/// Pixels are read row by row for the width and height of @a img.
+///
+/// If the header's bit count needs more bytes than fit in one uint, nothing is
+/// read and @a img is left untouched.
+void DirectDrawSurface::readLinearImage(Image * img)
+{
+    nvDebugCheck(stream != NULL);
+    nvDebugCheck(img != NULL);
+
+    const uint w = img->width();
+    const uint h = img->height();
+
+    uint rshift, rsize;
+    PixelFormat::maskShiftAndSize(header.pf.rmask, &rshift, &rsize);
+
+    uint gshift, gsize;
+    PixelFormat::maskShiftAndSize(header.pf.gmask, &gshift, &gsize);
+
+    uint bshift, bsize;
+    PixelFormat::maskShiftAndSize(header.pf.bmask, &bshift, &bsize);
+
+    uint ashift, asize;
+    PixelFormat::maskShiftAndSize(header.pf.amask, &ashift, &asize);
+
+    const uint byteCount = (header.pf.bitcount + 7) / 8;
+
+    // pf.bitcount comes straight from the file. Each pixel is read into a
+    // single uint below, so a larger pixel would write past that variable.
+    nvDebugCheck(byteCount <= sizeof(uint));
+    if (byteCount > sizeof(uint))
+    {
+        return;
+    }
+
+#pragma NV_MESSAGE("TODO: Support floating point linear images and other FOURCC codes.")
+
+    // Read linear RGB images.
+    for (uint y = 0; y < h; y++)
+    {
+        for (uint x = 0; x < w; x++)
+        {
+            uint c = 0;
+            stream->serialize(&c, byteCount);
+
+            Color32 pixel(0, 0, 0, 0xFF);
+            pixel.r = PixelFormat::convert((c & header.pf.rmask) >> rshift, rsize, 8);
+            pixel.g = PixelFormat::convert((c & header.pf.gmask) >> gshift, gsize, 8);
+            pixel.b = PixelFormat::convert((c & header.pf.bmask) >> bshift, bsize, 8);
+            pixel.a = PixelFormat::convert((c & header.pf.amask) >> ashift, asize, 8);
+
+            img->pixel(x, y) = pixel;
+        }
+    }
+}
+
+/// Read a block-compressed image from the current stream position into @a img.
+/// Blocks are 4x4 and are read row by row; blocks on the right and bottom
+/// edges are clipped to the image extents when copied.
+void DirectDrawSurface::readBlockImage(Image * img)
+{
+    nvDebugCheck(stream != NULL);
+    nvDebugCheck(img != NULL);
+
+    const uint imageWidth = img->width();
+    const uint imageHeight = img->height();
+
+    const uint blocksAcross = (imageWidth + 3) / 4;
+    const uint blocksDown = (imageHeight + 3) / 4;
+
+    for (uint blockY = 0; blockY < blocksDown; blockY++)
+    {
+        // Rows of this block row that fall inside the image.
+        const uint rows = min(4U, imageHeight - 4*blockY);
+
+        for (uint blockX = 0; blockX < blocksAcross; blockX++)
+        {
+            // Columns of this block that fall inside the image.
+            const uint cols = min(4U, imageWidth - 4*blockX);
+
+            // Read color block.
+            ColorBlock decoded;
+            readBlock(&decoded);
+
+            // Write color block.
+            for (uint row = 0; row < rows; row++)
+            {
+                for (uint col = 0; col < cols; col++)
+                {
+                    img->pixel(4*blockX + col, 4*blockY + row) = decoded.color(col, row);
+                }
+            }
+        }
+    }
+}
+
+/// Build a color from two 8-bit normal components, reconstructing the third.
+/// x and y are mapped to [-1, 1]; z is sqrt(1 - x^2 - y^2), or 0 when that
+/// quantity is not positive, then mapped back to [0, 255].
+static Color32 buildNormal(uint8 x, uint8 y)
+{
+    const float nx = 2 * (x / 255.0f) - 1;
+    const float ny = 2 * (y / 255.0f) - 1;
+
+    const float zSquared = 1 - nx*nx - ny*ny;
+    const float nz = (zSquared > 0) ? sqrtf(zSquared) : 0.0f;
+
+    const uint8 z = clamp(int(255.0f * (nz + 1) / 2.0f), 0, 255);
+    return Color32(x, y, z);
+}
+
+
+/// Read one compressed block from the stream and decode it into @a rgba.
+///
+/// The block layout is chosen from the fourcc (DX10 BC1..BC5 UNORM formats are
+/// first mapped to their fourcc equivalents). RXGB blocks have their red and
+/// alpha channels swapped after decoding. When the normal flag is set, ATI2
+/// and DXT5 blocks are converted to full normals. Unrecognised formats read
+/// nothing from the stream.
+void DirectDrawSurface::readBlock(ColorBlock * rgba)
+{
+    nvDebugCheck(stream != NULL);
+    nvDebugCheck(rgba != NULL);
+
+    uint format = header.pf.fourcc;
+
+    // Map DX10 block formats to fourcc codes.
+    if (header.hasDX10Header())
+    {
+        switch (header.header10.dxgiFormat)
+        {
+        case DXGI_FORMAT_BC1_UNORM: format = FOURCC_DXT1; break;
+        case DXGI_FORMAT_BC2_UNORM: format = FOURCC_DXT3; break;
+        case DXGI_FORMAT_BC3_UNORM: format = FOURCC_DXT5; break;
+        case DXGI_FORMAT_BC4_UNORM: format = FOURCC_ATI1; break;
+        case DXGI_FORMAT_BC5_UNORM: format = FOURCC_ATI2; break;
+        default: break;
+        }
+    }
+
+    const bool isRxgb = (format == FOURCC_RXGB);
+
+    if (format == FOURCC_DXT1)
+    {
+        BlockDXT1 dxt1;
+        *stream << dxt1;
+        dxt1.decodeBlock(rgba);
+    }
+    else if (format == FOURCC_DXT2 || format == FOURCC_DXT3)
+    {
+        BlockDXT3 dxt3;
+        *stream << dxt3;
+        dxt3.decodeBlock(rgba);
+    }
+    else if (format == FOURCC_DXT4 || format == FOURCC_DXT5 || isRxgb)
+    {
+        BlockDXT5 dxt5;
+        *stream << dxt5;
+        dxt5.decodeBlock(rgba);
+
+        if (isRxgb)
+        {
+            // Swap R & A.
+            for (int texel = 0; texel < 16; texel++)
+            {
+                Color32 & color = rgba->color(texel);
+                const uint8 red = color.r;
+                color.r = color.a;
+                color.a = red;
+            }
+        }
+    }
+    else if (format == FOURCC_ATI1)
+    {
+        BlockATI1 ati1;
+        *stream << ati1;
+        ati1.decodeBlock(rgba);
+    }
+    else if (format == FOURCC_ATI2)
+    {
+        BlockATI2 ati2;
+        *stream << ati2;
+        ati2.decodeBlock(rgba);
+    }
+
+    // If normal flag set, convert to normal.
+    if (!(header.pf.flags & DDPF_NORMAL))
+    {
+        return;
+    }
+
+    // ATI2 keeps the two components in R and G, DXT5 in A and G.
+    const bool fromRG = (format == FOURCC_ATI2);
+    const bool fromAG = (format == FOURCC_DXT5);
+
+    if (fromRG || fromAG)
+    {
+        for (int texel = 0; texel < 16; texel++)
+        {
+            Color32 & color = rgba->color(texel);
+            color = fromRG ? buildNormal(color.r, color.g) : buildNormal(color.a, color.g);
+        }
+    }
+}
+
+
+/// Return extent @a x halved @a mipmap times, never going below 1 once halved.
+static uint mipmapExtent(uint mipmap, uint x)
+{
+    uint extent = x;
+    for (uint remaining = mipmap; remaining > 0; remaining--)
+    {
+        extent = max(1U, extent / 2);
+    }
+    return extent;
+}
+
+/// Width of the given mipmap level.
+uint DirectDrawSurface::surfaceWidth(uint mipmap) const
+{
+    const uint baseWidth = width();
+    return mipmapExtent(mipmap, baseWidth);
+}
+
+/// Height of the given mipmap level.
+uint DirectDrawSurface::surfaceHeight(uint mipmap) const
+{
+    const uint baseHeight = height();
+    return mipmapExtent(mipmap, baseHeight);
+}
+
+/// Depth of the given mipmap level.
+uint DirectDrawSurface::surfaceDepth(uint mipmap) const
+{
+    const uint baseDepth = depth();
+    return mipmapExtent(mipmap, baseDepth);
+}
+
+/// Size in bytes of one surface at the given mipmap level.
+/// Block formats: block size times the number of 4x4 blocks per slice times
+/// the depth. Other formats: byte pitch (1-byte alignment) times height times depth.
+uint DirectDrawSurface::surfaceSize(uint mipmap) const
+{
+    const uint levelWidth = surfaceWidth(mipmap);
+    const uint levelHeight = surfaceHeight(mipmap);
+    const uint levelDepth = surfaceDepth(mipmap);
+
+    const uint bytesPerBlock = header.blockSize();
+
+    if (bytesPerBlock != 0)
+    {
+        const uint blocksAcross = (levelWidth + 3) / 4;
+        const uint blocksDown = (levelHeight + 3) / 4;
+        // @@ How are 3D textures aligned? The depth is used unchanged.
+        return bytesPerBlock * blocksAcross * blocksDown * levelDepth;
+    }
+
+    const uint bitsPerPixel = header.pixelSize();
+    const uint rowBytes = computeBytePitch(levelWidth, bitsPerPixel, 1); // Assuming 1 byte alignment, which is the same D3DX expects.
+    return rowBytes * levelHeight * levelDepth;
+}
+
+/// Size in bytes of one face: the sum of the sizes of all its mipmap levels.
+uint DirectDrawSurface::faceSize() const
+{
+    const uint levels = mipmapCount();
+
+    uint total = 0;
+    for (uint level = 0; level != levels; ++level)
+    {
+        total += surfaceSize(level);
+    }
+    return total;
+}
+
+/// Byte offset in the stream of the given mipmap level of the given face:
+/// header size (plus the DX10 extension when present), plus all preceding
+/// faces, plus all preceding mipmap levels of this face.
+uint DirectDrawSurface::offset(const uint face, const uint mipmap)
+{
+    uint position = 128; // sizeof(DDSHeader);
+
+    if (header.hasDX10Header())
+    {
+        position += 20; // sizeof(DDSHeader10);
+    }
+
+    // Skip the faces that precede the requested one.
+    if (face != 0)
+    {
+        position += face * faceSize();
+    }
+
+    // Skip the larger mipmap levels of the requested face.
+    for (uint level = 0; level != mipmap; ++level)
+    {
+        position += surfaceSize(level);
+    }
+
+    return position;
+}
+
+
+/// Print a human-readable dump of the header to standard output: header
+/// flags, dimensions, pixel format, caps, the DX10 extension when present,
+/// and the tool / user version words stored in the reserved fields.
+void DirectDrawSurface::printInfo() const
+{
+    // Header flags, raw value followed by one line per recognised bit.
+    printf("Flags: 0x%.8X\n", header.flags);
+    if (header.flags & DDSD_CAPS) printf("\tDDSD_CAPS\n");
+    if (header.flags & DDSD_PIXELFORMAT) printf("\tDDSD_PIXELFORMAT\n");
+    if (header.flags & DDSD_WIDTH) printf("\tDDSD_WIDTH\n");
+    if (header.flags & DDSD_HEIGHT) printf("\tDDSD_HEIGHT\n");
+    if (header.flags & DDSD_DEPTH) printf("\tDDSD_DEPTH\n");
+    if (header.flags & DDSD_PITCH) printf("\tDDSD_PITCH\n");
+    if (header.flags & DDSD_LINEARSIZE) printf("\tDDSD_LINEARSIZE\n");
+    if (header.flags & DDSD_MIPMAPCOUNT) printf("\tDDSD_MIPMAPCOUNT\n");
+
+    // Dimensions. The raw header fields are printed, whether or not the
+    // corresponding flag is set; the pitch field is labelled according to
+    // which of the PITCH / LINEARSIZE flags is present.
+    printf("Height: %d\n", header.height);
+    printf("Width: %d\n", header.width);
+    printf("Depth: %d\n", header.depth);
+    if (header.flags & DDSD_PITCH) printf("Pitch: %d\n", header.pitch);
+    else if (header.flags & DDSD_LINEARSIZE) printf("Linear size: %d\n", header.pitch);
+    printf("Mipmap count: %d\n", header.mipmapcount);
+
+    // Pixel format flags.
+    printf("Pixel Format:\n");
+    printf("\tFlags: 0x%.8X\n", header.pf.flags);
+    if (header.pf.flags & DDPF_RGB) printf("\t\tDDPF_RGB\n");
+    if (header.pf.flags & DDPF_LUMINANCE) printf("\t\tDDPF_LUMINANCE\n");
+    if (header.pf.flags & DDPF_FOURCC) printf("\t\tDDPF_FOURCC\n");
+    if (header.pf.flags & DDPF_ALPHAPIXELS) printf("\t\tDDPF_ALPHAPIXELS\n");
+    if (header.pf.flags & DDPF_ALPHA) printf("\t\tDDPF_ALPHA\n");
+    if (header.pf.flags & DDPF_PALETTEINDEXED1) printf("\t\tDDPF_PALETTEINDEXED1\n");
+    if (header.pf.flags & DDPF_PALETTEINDEXED2) printf("\t\tDDPF_PALETTEINDEXED2\n");
+    if (header.pf.flags & DDPF_PALETTEINDEXED4) printf("\t\tDDPF_PALETTEINDEXED4\n");
+    if (header.pf.flags & DDPF_PALETTEINDEXED8) printf("\t\tDDPF_PALETTEINDEXED8\n");
+    if (header.pf.flags & DDPF_ALPHAPREMULT) printf("\t\tDDPF_ALPHAPREMULT\n");
+    if (header.pf.flags & DDPF_NORMAL) printf("\t\tDDPF_NORMAL\n");
+
+    if (header.pf.fourcc != 0) { 
+        // Display fourcc code even when DDPF_FOURCC flag not set.
+        // The four characters are printed lowest byte first.
+        printf("\tFourCC: '%c%c%c%c' (0x%.8X)\n",
+            ((header.pf.fourcc >> 0) & 0xFF),
+            ((header.pf.fourcc >> 8) & 0xFF),
+            ((header.pf.fourcc >> 16) & 0xFF),
+            ((header.pf.fourcc >> 24) & 0xFF), 
+            header.pf.fourcc);
+    }
+
+    // For fourcc formats a non-zero bit count field is shown as a
+    // four-character swizzle code; otherwise it is shown as a bit count.
+    if ((header.pf.flags & DDPF_FOURCC) && (header.pf.bitcount != 0))
+    {
+        printf("\tSwizzle: '%c%c%c%c' (0x%.8X)\n", 
+            (header.pf.bitcount >> 0) & 0xFF,
+            (header.pf.bitcount >> 8) & 0xFF,
+            (header.pf.bitcount >> 16) & 0xFF,
+            (header.pf.bitcount >> 24) & 0xFF,
+            header.pf.bitcount);
+    }
+    else
+    {
+        printf("\tBit count: %d\n", header.pf.bitcount);
+    }
+
+    // Channel masks.
+    printf("\tRed mask:   0x%.8X\n", header.pf.rmask);
+    printf("\tGreen mask: 0x%.8X\n", header.pf.gmask);
+    printf("\tBlue mask:  0x%.8X\n", header.pf.bmask);
+    printf("\tAlpha mask: 0x%.8X\n", header.pf.amask);
+
+    // Capability words.
+    printf("Caps:\n");
+    printf("\tCaps 1: 0x%.8X\n", header.caps.caps1);
+    if (header.caps.caps1 & DDSCAPS_COMPLEX) printf("\t\tDDSCAPS_COMPLEX\n");
+    if (header.caps.caps1 & DDSCAPS_TEXTURE) printf("\t\tDDSCAPS_TEXTURE\n");
+    if (header.caps.caps1 & DDSCAPS_MIPMAP) printf("\t\tDDSCAPS_MIPMAP\n");
+
+    // Volume and cube map are reported as mutually exclusive; for cube maps
+    // either ALL_FACES or the individual faces present are listed.
+    printf("\tCaps 2: 0x%.8X\n", header.caps.caps2);
+    if (header.caps.caps2 & DDSCAPS2_VOLUME) printf("\t\tDDSCAPS2_VOLUME\n");
+    else if (header.caps.caps2 & DDSCAPS2_CUBEMAP)
+    {
+        printf("\t\tDDSCAPS2_CUBEMAP\n");
+        if ((header.caps.caps2 & DDSCAPS2_CUBEMAP_ALL_FACES) == DDSCAPS2_CUBEMAP_ALL_FACES) printf("\t\tDDSCAPS2_CUBEMAP_ALL_FACES\n");
+        else {
+            if (header.caps.caps2 & DDSCAPS2_CUBEMAP_POSITIVEX) printf("\t\tDDSCAPS2_CUBEMAP_POSITIVEX\n");
+            if (header.caps.caps2 & DDSCAPS2_CUBEMAP_NEGATIVEX) printf("\t\tDDSCAPS2_CUBEMAP_NEGATIVEX\n");
+            if (header.caps.caps2 & DDSCAPS2_CUBEMAP_POSITIVEY) printf("\t\tDDSCAPS2_CUBEMAP_POSITIVEY\n");
+            if (header.caps.caps2 & DDSCAPS2_CUBEMAP_NEGATIVEY) printf("\t\tDDSCAPS2_CUBEMAP_NEGATIVEY\n");
+            if (header.caps.caps2 & DDSCAPS2_CUBEMAP_POSITIVEZ) printf("\t\tDDSCAPS2_CUBEMAP_POSITIVEZ\n");
+            if (header.caps.caps2 & DDSCAPS2_CUBEMAP_NEGATIVEZ) printf("\t\tDDSCAPS2_CUBEMAP_NEGATIVEZ\n");
+        }
+    }
+
+    printf("\tCaps 3: 0x%.8X\n", header.caps.caps3);
+    printf("\tCaps 4: 0x%.8X\n", header.caps.caps4);
+
+    // DX10 extension header, only when the fourcc announces one.
+    if (header.hasDX10Header())
+    {
+        printf("DX10 Header:\n");
+        printf("\tDXGI Format: %u (%s)\n", header.header10.dxgiFormat, getDxgiFormatString((DXGI_FORMAT)header.header10.dxgiFormat));
+        printf("\tResource dimension: %u (%s)\n", header.header10.resourceDimension, getD3d10ResourceDimensionString((D3D10_RESOURCE_DIMENSION)header.header10.resourceDimension));
+        printf("\tMisc flag: %u\n", header.header10.miscFlag);
+        printf("\tArray size: %u\n", header.header10.arraySize);
+    }
+
+    // Tool version: reserved[10] packs major, minor and revision into
+    // bits 16-23, 8-15 and 0-7 respectively.
+    if (header.reserved[9] == FOURCC_NVTT)
+    {
+        int major = (header.reserved[10] >> 16) & 0xFF;
+        int minor = (header.reserved[10] >> 8) & 0xFF;
+        int revision= header.reserved[10] & 0xFF;
+
+        printf("Version:\n");
+        printf("\tNVIDIA Texture Tools %d.%d.%d\n", major, minor, revision);
+    }
+
+    // User version, present only when reserved[7] carries the UVER tag.
+    if (header.reserved[7] == FOURCC_UVER)
+    {
+        printf("User Version: %d\n", header.reserved[8]);
+    }
+}
+
diff --git a/src/nvimage/DirectDrawSurface.h b/src/nvimage/DirectDrawSurface.h
index ed02b8a..5a8c62b 100644
--- a/src/nvimage/DirectDrawSurface.h
+++ b/src/nvimage/DirectDrawSurface.h
@@ -1,406 +1,406 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#pragma once
-#ifndef NV_IMAGE_DIRECTDRAWSURFACE_H
-#define NV_IMAGE_DIRECTDRAWSURFACE_H
-
-#include "nvimage.h"
-
-#if !defined(MAKEFOURCC)
-#define MAKEFOURCC(ch0, ch1, ch2, ch3) \
-    (uint(uint8(ch0)) | (uint(uint8(ch1)) << 8) | \
-    (uint(uint8(ch2)) << 16) | (uint(uint8(ch3)) << 24 ))
-#endif
-
-namespace nv
-{
-    class Image;
-    class Stream;
-    struct ColorBlock;
-
-    extern const uint FOURCC_NVTT;
-    extern const uint FOURCC_DDS;
-    extern const uint FOURCC_DXT1;
-    extern const uint FOURCC_DXT2;
-    extern const uint FOURCC_DXT3;
-    extern const uint FOURCC_DXT4;
-    extern const uint FOURCC_DXT5;
-    extern const uint FOURCC_RXGB;
-    extern const uint FOURCC_ATI1;
-    extern const uint FOURCC_ATI2;
-
-    enum DDPF
-    {
-        DDPF_ALPHAPIXELS = 0x00000001U,
-        DDPF_ALPHA = 0x00000002U,
-        DDPF_FOURCC = 0x00000004U,
-        DDPF_RGB = 0x00000040U,
-        DDPF_PALETTEINDEXED1 = 0x00000800U,
-        DDPF_PALETTEINDEXED2 = 0x00001000U,
-        DDPF_PALETTEINDEXED4 = 0x00000008U,
-        DDPF_PALETTEINDEXED8 = 0x00000020U,
-        DDPF_LUMINANCE = 0x00020000U,
-        DDPF_ALPHAPREMULT = 0x00008000U,
-
-        // Custom NVTT flags.
-        DDPF_NORMAL = 0x80000000U,
-        DDPF_SRGB = 0x40000000U,
-    };
-
-
-    enum D3DFORMAT
-    {
-        // 32 bit RGB formats.
-        D3DFMT_R8G8B8 = 20,
-        D3DFMT_A8R8G8B8 = 21,
-        D3DFMT_X8R8G8B8 = 22,
-        D3DFMT_R5G6B5 = 23,
-        D3DFMT_X1R5G5B5 = 24,
-        D3DFMT_A1R5G5B5 = 25,
-        D3DFMT_A4R4G4B4 = 26,
-        D3DFMT_R3G3B2 = 27,
-        D3DFMT_A8 = 28,
-        D3DFMT_A8R3G3B2 = 29,
-        D3DFMT_X4R4G4B4 = 30,
-        D3DFMT_A2B10G10R10 = 31,
-        D3DFMT_A8B8G8R8 = 32,
-        D3DFMT_X8B8G8R8 = 33,
-        D3DFMT_G16R16 = 34,
-        D3DFMT_A2R10G10B10 = 35,
-
-        D3DFMT_A16B16G16R16 = 36,
-
-        // Palette formats.
-        D3DFMT_A8P8 = 40,
-        D3DFMT_P8 = 41,
-
-        // Luminance formats.
-        D3DFMT_L8 = 50,
-        D3DFMT_A8L8 = 51,
-        D3DFMT_A4L4 = 52,
-        D3DFMT_L16 = 81,
-
-        // Floating point formats
-        D3DFMT_R16F = 111,
-        D3DFMT_G16R16F = 112,
-        D3DFMT_A16B16G16R16F = 113,
-        D3DFMT_R32F = 114,
-        D3DFMT_G32R32F = 115,
-        D3DFMT_A32B32G32R32F = 116,
-    };
-
-
-    // D3D1x resource dimensions.
-    enum D3D10_RESOURCE_DIMENSION
-    {
-        D3D10_RESOURCE_DIMENSION_UNKNOWN = 0,
-        D3D10_RESOURCE_DIMENSION_BUFFER = 1,
-        D3D10_RESOURCE_DIMENSION_TEXTURE1D = 2,
-        D3D10_RESOURCE_DIMENSION_TEXTURE2D = 3,
-        D3D10_RESOURCE_DIMENSION_TEXTURE3D = 4,
-    };
-
-    // DXGI formats.
-    enum DXGI_FORMAT
-    {
-        DXGI_FORMAT_UNKNOWN = 0,
-
-        DXGI_FORMAT_R32G32B32A32_TYPELESS = 1,
-        DXGI_FORMAT_R32G32B32A32_FLOAT = 2,
-        DXGI_FORMAT_R32G32B32A32_UINT = 3,
-        DXGI_FORMAT_R32G32B32A32_SINT = 4,
-
-        DXGI_FORMAT_R32G32B32_TYPELESS = 5,
-        DXGI_FORMAT_R32G32B32_FLOAT = 6,
-        DXGI_FORMAT_R32G32B32_UINT = 7,
-        DXGI_FORMAT_R32G32B32_SINT = 8,
-
-        DXGI_FORMAT_R16G16B16A16_TYPELESS = 9,
-        DXGI_FORMAT_R16G16B16A16_FLOAT = 10,
-        DXGI_FORMAT_R16G16B16A16_UNORM = 11,
-        DXGI_FORMAT_R16G16B16A16_UINT = 12,
-        DXGI_FORMAT_R16G16B16A16_SNORM = 13,
-        DXGI_FORMAT_R16G16B16A16_SINT = 14,
-
-        DXGI_FORMAT_R32G32_TYPELESS = 15,
-        DXGI_FORMAT_R32G32_FLOAT = 16,
-        DXGI_FORMAT_R32G32_UINT = 17,
-        DXGI_FORMAT_R32G32_SINT = 18,
-
-        DXGI_FORMAT_R32G8X24_TYPELESS = 19,
-        DXGI_FORMAT_D32_FLOAT_S8X24_UINT = 20,
-        DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS = 21,
-        DXGI_FORMAT_X32_TYPELESS_G8X24_UINT = 22,
-
-        DXGI_FORMAT_R10G10B10A2_TYPELESS = 23,
-        DXGI_FORMAT_R10G10B10A2_UNORM = 24,
-        DXGI_FORMAT_R10G10B10A2_UINT = 25,
-
-        DXGI_FORMAT_R11G11B10_FLOAT = 26,
-
-        DXGI_FORMAT_R8G8B8A8_TYPELESS = 27,
-        DXGI_FORMAT_R8G8B8A8_UNORM = 28,
-        DXGI_FORMAT_R8G8B8A8_UNORM_SRGB = 29,
-        DXGI_FORMAT_R8G8B8A8_UINT = 30,
-        DXGI_FORMAT_R8G8B8A8_SNORM = 31,
-        DXGI_FORMAT_R8G8B8A8_SINT = 32,
-
-        DXGI_FORMAT_R16G16_TYPELESS = 33,
-        DXGI_FORMAT_R16G16_FLOAT = 34,
-        DXGI_FORMAT_R16G16_UNORM = 35,
-        DXGI_FORMAT_R16G16_UINT = 36,
-        DXGI_FORMAT_R16G16_SNORM = 37,
-        DXGI_FORMAT_R16G16_SINT = 38,
-
-        DXGI_FORMAT_R32_TYPELESS = 39,
-        DXGI_FORMAT_D32_FLOAT = 40,
-        DXGI_FORMAT_R32_FLOAT = 41,
-        DXGI_FORMAT_R32_UINT = 42,
-        DXGI_FORMAT_R32_SINT = 43,
-
-        DXGI_FORMAT_R24G8_TYPELESS = 44,
-        DXGI_FORMAT_D24_UNORM_S8_UINT = 45,
-        DXGI_FORMAT_R24_UNORM_X8_TYPELESS = 46,
-        DXGI_FORMAT_X24_TYPELESS_G8_UINT = 47,
-
-        DXGI_FORMAT_R8G8_TYPELESS = 48,
-        DXGI_FORMAT_R8G8_UNORM = 49,
-        DXGI_FORMAT_R8G8_UINT = 50,
-        DXGI_FORMAT_R8G8_SNORM = 51,
-        DXGI_FORMAT_R8G8_SINT = 52,
-
-        DXGI_FORMAT_R16_TYPELESS = 53,
-        DXGI_FORMAT_R16_FLOAT = 54,
-        DXGI_FORMAT_D16_UNORM = 55,
-        DXGI_FORMAT_R16_UNORM = 56,
-        DXGI_FORMAT_R16_UINT = 57,
-        DXGI_FORMAT_R16_SNORM = 58,
-        DXGI_FORMAT_R16_SINT = 59,
-
-        DXGI_FORMAT_R8_TYPELESS = 60,
-        DXGI_FORMAT_R8_UNORM = 61,
-        DXGI_FORMAT_R8_UINT = 62,
-        DXGI_FORMAT_R8_SNORM = 63,
-        DXGI_FORMAT_R8_SINT = 64,
-        DXGI_FORMAT_A8_UNORM = 65,
-
-        DXGI_FORMAT_R1_UNORM = 66,
-
-        DXGI_FORMAT_R9G9B9E5_SHAREDEXP = 67,
-
-        DXGI_FORMAT_R8G8_B8G8_UNORM = 68,
-        DXGI_FORMAT_G8R8_G8B8_UNORM = 69,
-
-        DXGI_FORMAT_BC1_TYPELESS = 70,
-        DXGI_FORMAT_BC1_UNORM = 71,
-        DXGI_FORMAT_BC1_UNORM_SRGB = 72,
-
-        DXGI_FORMAT_BC2_TYPELESS = 73,
-        DXGI_FORMAT_BC2_UNORM = 74,
-        DXGI_FORMAT_BC2_UNORM_SRGB = 75,
-
-        DXGI_FORMAT_BC3_TYPELESS = 76,
-        DXGI_FORMAT_BC3_UNORM = 77,
-        DXGI_FORMAT_BC3_UNORM_SRGB = 78,
-
-        DXGI_FORMAT_BC4_TYPELESS = 79,
-        DXGI_FORMAT_BC4_UNORM = 80,
-        DXGI_FORMAT_BC4_SNORM = 81,
-
-        DXGI_FORMAT_BC5_TYPELESS = 82,
-        DXGI_FORMAT_BC5_UNORM = 83,
-        DXGI_FORMAT_BC5_SNORM = 84,
-
-        DXGI_FORMAT_B5G6R5_UNORM = 85,
-        DXGI_FORMAT_B5G5R5A1_UNORM = 86,
-        DXGI_FORMAT_B8G8R8A8_UNORM = 87,
-        DXGI_FORMAT_B8G8R8X8_UNORM = 88,
-
-        DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM = 89,
-        DXGI_FORMAT_B8G8R8A8_TYPELESS = 90,
-        DXGI_FORMAT_B8G8R8A8_UNORM_SRGB = 91,
-        DXGI_FORMAT_B8G8R8X8_TYPELESS = 92,
-        DXGI_FORMAT_B8G8R8X8_UNORM_SRGB = 93,
-
-        DXGI_FORMAT_BC6H_TYPELESS = 94,
-        DXGI_FORMAT_BC6H_UF16 = 95,
-        DXGI_FORMAT_BC6H_SF16 = 96,
-
-        DXGI_FORMAT_BC7_TYPELESS = 97,
-        DXGI_FORMAT_BC7_UNORM = 98,
-        DXGI_FORMAT_BC7_UNORM_SRGB = 99,
-    };
-
-
-
-    extern uint findD3D9Format(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask);
-
-    struct NVIMAGE_CLASS DDSPixelFormat
-    {
-        uint size;
-        uint flags;
-        uint fourcc;
-        uint bitcount;
-        uint rmask;
-        uint gmask;
-        uint bmask;
-        uint amask;
-    };
-
-    struct NVIMAGE_CLASS DDSCaps
-    {
-        uint caps1;
-        uint caps2;
-        uint caps3;
-        uint caps4;
-    };
-
-    /// DDS file header for DX10.
-    struct NVIMAGE_CLASS DDSHeader10
-    {
-        uint dxgiFormat;
-        uint resourceDimension;
-        uint miscFlag;
-        uint arraySize;
-        uint reserved;
-    };
-
-    /// DDS file header.
-    struct NVIMAGE_CLASS DDSHeader
-    {
-        uint fourcc;
-        uint size;
-        uint flags;
-        uint height;
-        uint width;
-        uint pitch;
-        uint depth;
-        uint mipmapcount;
-        uint reserved[11];
-        DDSPixelFormat pf;
-        DDSCaps caps;
-        uint notused;
-        DDSHeader10 header10;
-
-
-        // Helper methods.
-        DDSHeader();
-
-        void setWidth(uint w);
-        void setHeight(uint h);
-        void setDepth(uint d);
-        void setMipmapCount(uint count);
-        void setTexture2D();
-        void setTexture3D();
-        void setTextureCube();
-        void setLinearSize(uint size);
-        void setPitch(uint pitch);
-        void setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3);
-        void setFormatCode(uint code);
-        void setSwizzleCode(uint8 c0, uint8 c1, uint8 c2, uint8 c3);
-        void setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask);
-        void setDX10Format(uint format);
-        void setNormalFlag(bool b);
-        void setSrgbFlag(bool b);
-        void setHasAlphaFlag(bool b);
-        void setUserVersion(int version);
-
-        void swapBytes();
-
-        bool hasDX10Header() const;
-        uint signature() const;
-        uint toolVersion() const;
-        uint userVersion() const;
-        bool isNormalMap() const;
-        bool isSrgb() const;
-        bool hasAlpha() const;
-        uint d3d9Format() const;
-        uint pixelSize() const; // In bits!
-        uint blockSize() const; // In bytes!
-        bool isBlockFormat() const;
-    };
-
-    NVIMAGE_API Stream & operator<< (Stream & s, DDSHeader & header);
-
-
-    /// DirectDraw Surface. (DDS)
-    class NVIMAGE_CLASS DirectDrawSurface
-    {
-    public:
-        DirectDrawSurface();
-        DirectDrawSurface(const char * file);
-        DirectDrawSurface(Stream * stream);
-        ~DirectDrawSurface();
-
-        bool load(const char * filename);
-        bool load(Stream * stream);
-
-        bool isValid() const;
-        bool isSupported() const;
-
-        bool hasAlpha() const;
-
-        uint mipmapCount() const;
-        uint width() const;
-        uint height() const;
-        uint depth() const;
-        bool isTexture1D() const;
-        bool isTexture2D() const;
-        bool isTexture3D() const;
-        bool isTextureCube() const;
-
-        void setNormalFlag(bool b);
-        void setHasAlphaFlag(bool b);
-        void setUserVersion(int version);
-
-        void mipmap(Image * img, uint f, uint m);
-
-        uint surfaceWidth(uint mipmap) const;
-        uint surfaceHeight(uint mipmap) const;
-        uint surfaceDepth(uint mipmap) const;
-        uint surfaceSize(uint mipmap) const;
-        bool readSurface(uint face, uint mipmap, void * data, uint size);
-
-        void printInfo() const;
-
-        // Only initialized after loading.
-        DDSHeader header;
-
-    private:
-
-        uint faceSize() const;
-        uint offset(uint face, uint mipmap);
-
-        void readLinearImage(Image * img);
-        void readBlockImage(Image * img);
-        void readBlock(ColorBlock * rgba);
-
-
-    private:
-        Stream * stream;
-    };
-
-} // nv namespace
-
-#endif // NV_IMAGE_DIRECTDRAWSURFACE_H
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#pragma once
+#ifndef NV_IMAGE_DIRECTDRAWSURFACE_H
+#define NV_IMAGE_DIRECTDRAWSURFACE_H
+
+#include "nvimage.h"
+
+#if !defined(MAKEFOURCC) // Fallback definition, used only when no earlier header already supplied MAKEFOURCC.
+#define MAKEFOURCC(ch0, ch1, ch2, ch3) \
+    (uint(uint8(ch0)) | (uint(uint8(ch1)) << 8) | \
+    (uint(uint8(ch2)) << 16) | (uint(uint8(ch3)) << 24 ))
+#endif // MAKEFOURCC packs four 8-bit codes into one uint: ch0 in bits 0-7, ch1 in 8-15, ch2 in 16-23, ch3 in 24-31.
+
+namespace nv
+{
+    class Image;
+    class Stream;
+    struct ColorBlock;
+
+    extern const uint FOURCC_NVTT; // The DDS info dump compares DDSHeader::reserved[9] against this tag before printing the tool version.
+    extern const uint FOURCC_DDS;
+    extern const uint FOURCC_DXT1;
+    extern const uint FOURCC_DXT2;
+    extern const uint FOURCC_DXT3;
+    extern const uint FOURCC_DXT4;
+    extern const uint FOURCC_DXT5;
+    extern const uint FOURCC_RXGB;
+    extern const uint FOURCC_ATI1;
+    extern const uint FOURCC_ATI2; // All ten are extern declarations only; their values are defined outside this header.
+
+    enum DDPF // Flag bits: every enumerator is a distinct single bit, so values can be combined with bitwise OR.
+    {
+        DDPF_ALPHAPIXELS = 0x00000001U, // bit 0
+        DDPF_ALPHA = 0x00000002U, // bit 1
+        DDPF_FOURCC = 0x00000004U, // bit 2
+        DDPF_RGB = 0x00000040U, // bit 6
+        DDPF_PALETTEINDEXED1 = 0x00000800U, // bit 11
+        DDPF_PALETTEINDEXED2 = 0x00001000U, // bit 12
+        DDPF_PALETTEINDEXED4 = 0x00000008U, // bit 3 (enumerators are not listed in ascending bit order)
+        DDPF_PALETTEINDEXED8 = 0x00000020U, // bit 5
+        DDPF_LUMINANCE = 0x00020000U, // bit 17
+        DDPF_ALPHAPREMULT = 0x00008000U, // bit 15
+
+        // Custom NVTT flags: they occupy the two highest bits of a 32-bit word.
+        DDPF_NORMAL = 0x80000000U, // bit 31
+        DDPF_SRGB = 0x40000000U, // bit 30
+    };
+
+
+    enum D3DFORMAT // Numeric format codes; the values are sparse (20-36, 40-41, 50-52, 81, 111-116). NOTE(review): they appear to mirror Direct3D 9's D3DFORMAT - confirm.
+    {
+        // RGB / alpha formats. NOTE(review): originally labelled "32 bit", but names such as R5G6B5, A8 and A16B16G16R16 suggest mixed bit depths - confirm.
+        D3DFMT_R8G8B8 = 20,
+        D3DFMT_A8R8G8B8 = 21,
+        D3DFMT_X8R8G8B8 = 22,
+        D3DFMT_R5G6B5 = 23,
+        D3DFMT_X1R5G5B5 = 24,
+        D3DFMT_A1R5G5B5 = 25,
+        D3DFMT_A4R4G4B4 = 26,
+        D3DFMT_R3G3B2 = 27,
+        D3DFMT_A8 = 28,
+        D3DFMT_A8R3G3B2 = 29,
+        D3DFMT_X4R4G4B4 = 30,
+        D3DFMT_A2B10G10R10 = 31,
+        D3DFMT_A8B8G8R8 = 32,
+        D3DFMT_X8B8G8R8 = 33,
+        D3DFMT_G16R16 = 34,
+        D3DFMT_A2R10G10B10 = 35,
+
+        D3DFMT_A16B16G16R16 = 36,
+
+        // Palette formats.
+        D3DFMT_A8P8 = 40,
+        D3DFMT_P8 = 41,
+
+        // Luminance formats.
+        D3DFMT_L8 = 50,
+        D3DFMT_A8L8 = 51,
+        D3DFMT_A4L4 = 52,
+        D3DFMT_L16 = 81, // Value is not contiguous with the other luminance codes.
+
+        // Floating point formats.
+        D3DFMT_R16F = 111,
+        D3DFMT_G16R16F = 112,
+        D3DFMT_A16B16G16R16F = 113,
+        D3DFMT_R32F = 114,
+        D3DFMT_G32R32F = 115,
+        D3DFMT_A32B32G32R32F = 116,
+    };
+
+
+    // D3D1x resource dimensions. The DDS info dump casts DDSHeader10::resourceDimension to this type to look up a display name.
+    enum D3D10_RESOURCE_DIMENSION
+    {
+        D3D10_RESOURCE_DIMENSION_UNKNOWN = 0,
+        D3D10_RESOURCE_DIMENSION_BUFFER = 1,
+        D3D10_RESOURCE_DIMENSION_TEXTURE1D = 2,
+        D3D10_RESOURCE_DIMENSION_TEXTURE2D = 3,
+        D3D10_RESOURCE_DIMENSION_TEXTURE3D = 4,
+    };
+
+    // DXGI formats. Values run contiguously from 0 (UNKNOWN) to 99; the DDS info dump casts DDSHeader10::dxgiFormat to this type to look up a display name.
+    enum DXGI_FORMAT
+    {
+        DXGI_FORMAT_UNKNOWN = 0,
+
+        DXGI_FORMAT_R32G32B32A32_TYPELESS = 1,
+        DXGI_FORMAT_R32G32B32A32_FLOAT = 2,
+        DXGI_FORMAT_R32G32B32A32_UINT = 3,
+        DXGI_FORMAT_R32G32B32A32_SINT = 4,
+
+        DXGI_FORMAT_R32G32B32_TYPELESS = 5,
+        DXGI_FORMAT_R32G32B32_FLOAT = 6,
+        DXGI_FORMAT_R32G32B32_UINT = 7,
+        DXGI_FORMAT_R32G32B32_SINT = 8,
+
+        DXGI_FORMAT_R16G16B16A16_TYPELESS = 9,
+        DXGI_FORMAT_R16G16B16A16_FLOAT = 10,
+        DXGI_FORMAT_R16G16B16A16_UNORM = 11,
+        DXGI_FORMAT_R16G16B16A16_UINT = 12,
+        DXGI_FORMAT_R16G16B16A16_SNORM = 13,
+        DXGI_FORMAT_R16G16B16A16_SINT = 14,
+
+        DXGI_FORMAT_R32G32_TYPELESS = 15,
+        DXGI_FORMAT_R32G32_FLOAT = 16,
+        DXGI_FORMAT_R32G32_UINT = 17,
+        DXGI_FORMAT_R32G32_SINT = 18,
+
+        DXGI_FORMAT_R32G8X24_TYPELESS = 19,
+        DXGI_FORMAT_D32_FLOAT_S8X24_UINT = 20,
+        DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS = 21,
+        DXGI_FORMAT_X32_TYPELESS_G8X24_UINT = 22,
+
+        DXGI_FORMAT_R10G10B10A2_TYPELESS = 23,
+        DXGI_FORMAT_R10G10B10A2_UNORM = 24,
+        DXGI_FORMAT_R10G10B10A2_UINT = 25,
+
+        DXGI_FORMAT_R11G11B10_FLOAT = 26,
+
+        DXGI_FORMAT_R8G8B8A8_TYPELESS = 27,
+        DXGI_FORMAT_R8G8B8A8_UNORM = 28,
+        DXGI_FORMAT_R8G8B8A8_UNORM_SRGB = 29,
+        DXGI_FORMAT_R8G8B8A8_UINT = 30,
+        DXGI_FORMAT_R8G8B8A8_SNORM = 31,
+        DXGI_FORMAT_R8G8B8A8_SINT = 32,
+
+        DXGI_FORMAT_R16G16_TYPELESS = 33,
+        DXGI_FORMAT_R16G16_FLOAT = 34,
+        DXGI_FORMAT_R16G16_UNORM = 35,
+        DXGI_FORMAT_R16G16_UINT = 36,
+        DXGI_FORMAT_R16G16_SNORM = 37,
+        DXGI_FORMAT_R16G16_SINT = 38,
+
+        DXGI_FORMAT_R32_TYPELESS = 39,
+        DXGI_FORMAT_D32_FLOAT = 40,
+        DXGI_FORMAT_R32_FLOAT = 41,
+        DXGI_FORMAT_R32_UINT = 42,
+        DXGI_FORMAT_R32_SINT = 43,
+
+        DXGI_FORMAT_R24G8_TYPELESS = 44,
+        DXGI_FORMAT_D24_UNORM_S8_UINT = 45,
+        DXGI_FORMAT_R24_UNORM_X8_TYPELESS = 46,
+        DXGI_FORMAT_X24_TYPELESS_G8_UINT = 47,
+
+        DXGI_FORMAT_R8G8_TYPELESS = 48,
+        DXGI_FORMAT_R8G8_UNORM = 49,
+        DXGI_FORMAT_R8G8_UINT = 50,
+        DXGI_FORMAT_R8G8_SNORM = 51,
+        DXGI_FORMAT_R8G8_SINT = 52,
+
+        DXGI_FORMAT_R16_TYPELESS = 53,
+        DXGI_FORMAT_R16_FLOAT = 54,
+        DXGI_FORMAT_D16_UNORM = 55,
+        DXGI_FORMAT_R16_UNORM = 56,
+        DXGI_FORMAT_R16_UINT = 57,
+        DXGI_FORMAT_R16_SNORM = 58,
+        DXGI_FORMAT_R16_SINT = 59,
+
+        DXGI_FORMAT_R8_TYPELESS = 60,
+        DXGI_FORMAT_R8_UNORM = 61,
+        DXGI_FORMAT_R8_UINT = 62,
+        DXGI_FORMAT_R8_SNORM = 63,
+        DXGI_FORMAT_R8_SINT = 64,
+        DXGI_FORMAT_A8_UNORM = 65,
+
+        DXGI_FORMAT_R1_UNORM = 66,
+
+        DXGI_FORMAT_R9G9B9E5_SHAREDEXP = 67,
+
+        DXGI_FORMAT_R8G8_B8G8_UNORM = 68,
+        DXGI_FORMAT_G8R8_G8B8_UNORM = 69,
+
+        DXGI_FORMAT_BC1_TYPELESS = 70,
+        DXGI_FORMAT_BC1_UNORM = 71,
+        DXGI_FORMAT_BC1_UNORM_SRGB = 72,
+
+        DXGI_FORMAT_BC2_TYPELESS = 73,
+        DXGI_FORMAT_BC2_UNORM = 74,
+        DXGI_FORMAT_BC2_UNORM_SRGB = 75,
+
+        DXGI_FORMAT_BC3_TYPELESS = 76,
+        DXGI_FORMAT_BC3_UNORM = 77,
+        DXGI_FORMAT_BC3_UNORM_SRGB = 78,
+
+        DXGI_FORMAT_BC4_TYPELESS = 79,
+        DXGI_FORMAT_BC4_UNORM = 80,
+        DXGI_FORMAT_BC4_SNORM = 81,
+
+        DXGI_FORMAT_BC5_TYPELESS = 82,
+        DXGI_FORMAT_BC5_UNORM = 83,
+        DXGI_FORMAT_BC5_SNORM = 84,
+
+        DXGI_FORMAT_B5G6R5_UNORM = 85,
+        DXGI_FORMAT_B5G5R5A1_UNORM = 86,
+        DXGI_FORMAT_B8G8R8A8_UNORM = 87,
+        DXGI_FORMAT_B8G8R8X8_UNORM = 88,
+
+        DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM = 89,
+        DXGI_FORMAT_B8G8R8A8_TYPELESS = 90,
+        DXGI_FORMAT_B8G8R8A8_UNORM_SRGB = 91,
+        DXGI_FORMAT_B8G8R8X8_TYPELESS = 92,
+        DXGI_FORMAT_B8G8R8X8_UNORM_SRGB = 93,
+
+        DXGI_FORMAT_BC6H_TYPELESS = 94,
+        DXGI_FORMAT_BC6H_UF16 = 95,
+        DXGI_FORMAT_BC6H_SF16 = 96,
+
+        DXGI_FORMAT_BC7_TYPELESS = 97,
+        DXGI_FORMAT_BC7_UNORM = 98,
+        DXGI_FORMAT_BC7_UNORM_SRGB = 99,
+    };
+
+
+
+    extern uint findD3D9Format(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask); // Parameter names match DDSPixelFormat's bitcount/rmask/gmask/bmask/amask fields. NOTE(review): presumably returns a D3DFORMAT code - confirm in the implementation.
+
+    struct NVIMAGE_CLASS DDSPixelFormat // Eight consecutive uint fields; embedded in DDSHeader as member 'pf'.
+    {
+        uint size;
+        uint flags; // NOTE(review): presumably a combination of DDPF bits - confirm against the implementation.
+        uint fourcc;
+        uint bitcount; // bitcount and the four masks below share names with the parameters of findD3D9Format() and DDSHeader::setPixelFormat().
+        uint rmask; // NOTE(review): presumably per-channel bit masks (red, green, blue, alpha) - confirm.
+        uint gmask;
+        uint bmask;
+        uint amask;
+    };
+
+    struct NVIMAGE_CLASS DDSCaps // Four capability words; embedded in DDSHeader as member 'caps'.
+    {
+        uint caps1;
+        uint caps2; // The DDS info dump tests this word against the DDSCAPS2_CUBEMAP_* face bits.
+        uint caps3; // Printed by the info dump as a raw 8-digit hex value.
+        uint caps4; // Printed by the info dump as a raw 8-digit hex value.
+    };
+
+    /// DDS file header for DX10. The DDS info dump only prints these fields when DDSHeader::hasDX10Header() returns true.
+    struct NVIMAGE_CLASS DDSHeader10
+    {
+        uint dxgiFormat; // Cast to DXGI_FORMAT by the info dump to obtain a display name.
+        uint resourceDimension; // Cast to D3D10_RESOURCE_DIMENSION by the info dump to obtain a display name.
+        uint miscFlag; // Printed by the info dump as an unsigned integer.
+        uint arraySize; // Printed by the info dump as an unsigned integer.
+        uint reserved;
+    };
+
+    /// DDS file header: plain uint fields plus nested DDSPixelFormat, DDSCaps and DDSHeader10 blocks, followed by helper methods.
+    struct NVIMAGE_CLASS DDSHeader
+    {
+        uint fourcc;
+        uint size;
+        uint flags;
+        uint height;
+        uint width;
+        uint pitch; // NOTE(review): both setPitch() and setLinearSize() exist; presumably both write this field - confirm.
+        uint depth;
+        uint mipmapcount;
+        uint reserved[11]; // Info dump usage: [7]==FOURCC_UVER -> [8] is the user version; [9]==FOURCC_NVTT -> [10] is the tool version packed as (major<<16)|(minor<<8)|revision.
+        DDSPixelFormat pf;
+        DDSCaps caps;
+        uint notused;
+        DDSHeader10 header10; // Only printed by the info dump when hasDX10Header() returns true.
+
+
+        // Helper methods: constructor and setters.
+        DDSHeader();
+
+        void setWidth(uint w);
+        void setHeight(uint h);
+        void setDepth(uint d);
+        void setMipmapCount(uint count);
+        void setTexture2D();
+        void setTexture3D();
+        void setTextureCube();
+        void setLinearSize(uint size);
+        void setPitch(uint pitch);
+        void setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3);
+        void setFormatCode(uint code);
+        void setSwizzleCode(uint8 c0, uint8 c1, uint8 c2, uint8 c3);
+        void setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask);
+        void setDX10Format(uint format);
+        void setNormalFlag(bool b);
+        void setSrgbFlag(bool b);
+        void setHasAlphaFlag(bool b);
+        void setUserVersion(int version); // Takes int, while userVersion() below returns uint.
+
+        void swapBytes();
+
+        bool hasDX10Header() const; // Const queries start here; none of them modify the header.
+        uint signature() const;
+        uint toolVersion() const;
+        uint userVersion() const;
+        bool isNormalMap() const;
+        bool isSrgb() const;
+        bool hasAlpha() const;
+        uint d3d9Format() const;
+        uint pixelSize() const; // In bits!
+        uint blockSize() const; // In bytes!
+        bool isBlockFormat() const;
+    };
+
+    NVIMAGE_API Stream & operator<< (Stream & s, DDSHeader & header); // Takes the header by non-const reference. NOTE(review): presumably one operator serves both reading and writing - confirm against Stream.
+
+
+    /// DirectDraw Surface. (DDS) Exposes a public DDSHeader plus queries and surface readers, backed by a private Stream pointer.
+    class NVIMAGE_CLASS DirectDrawSurface
+    {
+    public:
+        DirectDrawSurface();
+        DirectDrawSurface(const char * file); // Not marked explicit, so a const char * converts implicitly.
+        DirectDrawSurface(Stream * stream); // Not marked explicit, so a Stream * converts implicitly.
+        ~DirectDrawSurface();
+
+        bool load(const char * filename); // NOTE(review): presumably returns true on success - confirm in the implementation.
+        bool load(Stream * stream);
+
+        bool isValid() const;
+        bool isSupported() const;
+
+        bool hasAlpha() const;
+
+        uint mipmapCount() const;
+        uint width() const;
+        uint height() const;
+        uint depth() const;
+        bool isTexture1D() const;
+        bool isTexture2D() const;
+        bool isTexture3D() const;
+        bool isTextureCube() const;
+
+        void setNormalFlag(bool b); // These three setters share names with DDSHeader setters; NOTE(review): presumably they forward to 'header' - confirm.
+        void setHasAlphaFlag(bool b);
+        void setUserVersion(int version);
+
+        void mipmap(Image * img, uint f, uint m); // NOTE(review): f and m presumably mean face and mipmap index, as in readSurface() - confirm.
+
+        uint surfaceWidth(uint mipmap) const;
+        uint surfaceHeight(uint mipmap) const;
+        uint surfaceDepth(uint mipmap) const;
+        uint surfaceSize(uint mipmap) const;
+        bool readSurface(uint face, uint mipmap, void * data, uint size); // Writes into a caller-supplied buffer; NOTE(review): 'size' is presumably the buffer size in bytes - confirm.
+
+        void printInfo() const;
+
+        // Only initialized after loading.
+        DDSHeader header;
+
+    private:
+
+        uint faceSize() const;
+        uint offset(uint face, uint mipmap);
+
+        void readLinearImage(Image * img);
+        void readBlockImage(Image * img);
+        void readBlock(ColorBlock * rgba);
+
+
+    private:
+        Stream * stream; // NOTE(review): ownership is not visible from this header - confirm whether the destructor releases it.
+    };
+
+} // nv namespace
+
+#endif // NV_IMAGE_DIRECTDRAWSURFACE_H
diff --git a/src/nvimage/Filter.cpp b/src/nvimage/Filter.cpp
index db06d80..a89e54b 100644
--- a/src/nvimage/Filter.cpp
+++ b/src/nvimage/Filter.cpp
@@ -1,627 +1,627 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-/** @file Filter.cpp
- * @brief Image filters.
- *
- * Jonathan Blow articles:
- * http://number-none.com/product/Mipmapping, Part 1/index.html
- * http://number-none.com/product/Mipmapping, Part 2/index.html
- *
- * References from Thacher Ulrich:
- * See _Graphics Gems III_ "General Filtered Image Rescaling", Dale A. Schumacher
- * http://tog.acm.org/GraphicsGems/gemsiii/filter.c
- *
- * References from Paul Heckbert:
- * A.V. Oppenheim, R.W. Schafer, Digital Signal Processing, Prentice-Hall, 1975
- *
- * R.W. Hamming, Digital Filters, Prentice-Hall, Englewood Cliffs, NJ, 1983
- *
- * W.K. Pratt, Digital Image Processing, John Wiley and Sons, 1978
- *
- * H.S. Hou, H.C. Andrews, "Cubic Splines for Image Interpolation and
- *	Digital Filtering", IEEE Trans. Acoustics, Speech, and Signal Proc.,
- *	vol. ASSP-26, no. 6, Dec. 1978, pp. 508-517
- *
- * Paul Heckbert's zoom library.
- * http://www.xmission.com/~legalize/zoom.html
- * 
- * Reconstruction Filters in Computer Graphics
- * http://www.mentallandscape.com/Papers_siggraph88.pdf
- *
- * More references:
- * http://www.worldserver.com/turk/computergraphics/ResamplingFilters.pdf
- * http://www.dspguide.com/ch16.htm
- */
-
-#include "Filter.h"
-
-#include "nvmath/Vector.h" // Vector4
-#include "nvcore/Utils.h" // swap
-
-#include <string.h> // memset
-
-using namespace nv;
-
-namespace
-{
-    // Sinc function.
-    inline static float sincf(const float x)
-    {
-        if (fabs(x) < NV_EPSILON) {
-            //return 1.0;
-            return 1.0f + x*x*(-1.0f/6.0f + x*x*1.0f/120.0f);
-        }
-        else {
-            return sin(x) / x;
-        }
-    }
-
-    // Bessel function of the first kind from Jon Blow's article.
-    // http://mathworld.wolfram.com/BesselFunctionoftheFirstKind.html
-    // http://en.wikipedia.org/wiki/Bessel_function
-    inline static float bessel0(float x)
-    {
-        const float EPSILON_RATIO = 1e-6f;
-        float xh, sum, pow, ds;
-        int k;
-
-        xh = 0.5f * x;
-        sum = 1.0f;
-        pow = 1.0f;
-        k = 0;
-        ds = 1.0;
-        while (ds > sum * EPSILON_RATIO) {
-            ++k;
-            pow = pow * (xh / k);
-            ds = pow * pow;
-            sum = sum + ds;
-        }
-
-        return sum;
-    }
-
-    /*// Alternative bessel function from Paul Heckbert.
-    static float _bessel0(float x)
-    {
-        const float EPSILON_RATIO = 1E-6;
-        float sum = 1.0f;
-        float y = x * x / 4.0f;
-        float t = y;
-        for(int i = 2; t > EPSILON_RATIO; i++) {
-            sum += t;
-            t *= y / float(i * i);
-        }
-        return sum;
-    }*/
-
-} // namespace
-
-
-Filter::Filter(float width) : m_width(width)
-{
-}
-
-/*virtual*/ Filter::~Filter()
-{
-}
-
-float Filter::sampleDelta(float x, float scale) const
-{
-    return evaluate((x + 0.5f)* scale);
-}
-
-float Filter::sampleBox(float x, float scale, int samples) const
-{
-    double sum = 0;
-    float isamples = 1.0f / float(samples);
-
-    for(int s = 0; s < samples; s++)
-    {
-        float p = (x + (float(s) + 0.5f) * isamples) * scale;
-        float value = evaluate(p);
-
-        //printf("%f: %.8f (%X)\n", p, value, *(uint32 *)&value);
-
-        sum += value;
-    }
-
-    return float(sum * isamples);
-}
-
-float Filter::sampleTriangle(float x, float scale, int samples) const
-{
-    double sum = 0;
-    float isamples = 1.0f / float(samples);
-
-    for(int s = 0; s < samples; s++)
-    {
-        float offset = (2 * float(s) + 1.0f) * isamples;		
-        float p = (x + offset - 0.5f) * scale;
-        float value = evaluate(p);
-
-        float weight = offset;
-        if (weight > 1.0f) weight = 2.0f - weight;
-
-        sum += value * weight;
-    }
-
-    return float(2 * sum * isamples);
-}
-
-
-
-
-
-BoxFilter::BoxFilter() : Filter(0.5f) {}
-BoxFilter::BoxFilter(float width) : Filter(width) {}
-
-float BoxFilter::evaluate(float x) const
-{
-    if (fabs(x) <= m_width) return 1.0f;
-    else return 0.0f;
-}
-
-
-TriangleFilter::TriangleFilter() : Filter(1.0f) {}
-TriangleFilter::TriangleFilter(float width) : Filter(width) {}
-
-float TriangleFilter::evaluate(float x) const
-{
-    x = fabs(x);
-    if( x < m_width ) return m_width - x;
-    return 0.0f;
-}
-
-
-QuadraticFilter::QuadraticFilter() : Filter(1.5f) {}
-
-float QuadraticFilter::evaluate(float x) const
-{
-    x = fabs(x);
-    if( x < 0.5f ) return 0.75f - x * x;
-    if( x < 1.5f ) { 
-        float t = x - 1.5f;
-        return 0.5f * t * t;
-    }
-    return 0.0f;
-}
-
-
-CubicFilter::CubicFilter() : Filter(1.0f) {}
-
-float CubicFilter::evaluate(float x) const
-{
-    // f(t) = 2|t|^3 - 3|t|^2 + 1, -1 <= t <= 1
-    x = fabs(x);
-    if( x < 1.0f ) return((2.0f * x - 3.0f) * x * x + 1.0f);
-    return 0.0f;
-}
-
-
-BSplineFilter::BSplineFilter() : Filter(2.0f) {}
-
-float BSplineFilter::evaluate(float x) const
-{
-    x = fabs(x);
-    if( x < 1.0f ) return (4.0f + x * x * (-6.0f + x * 3.0f)) / 6.0f;
-    if( x < 2.0f ) { 
-        float t = 2.0f - x;
-        return t * t * t / 6.0f;
-    }
-    return 0.0f;
-}
-
-
-MitchellFilter::MitchellFilter() : Filter(2.0f) { setParameters(1.0f/3.0f, 1.0f/3.0f); }
-
-float MitchellFilter::evaluate(float x) const
-{
-    x = fabs(x);
-    if( x < 1.0f ) return p0 + x * x * (p2 + x * p3);
-    if( x < 2.0f ) return q0 + x * (q1 + x * (q2 + x * q3));
-    return 0.0f;
-}
-
-void MitchellFilter::setParameters(float b, float c)
-{
-    p0 = (6.0f -  2.0f * b) / 6.0f;
-    p2 = (-18.0f + 12.0f * b + 6.0f * c) / 6.0f;
-    p3 = (12.0f - 9.0f * b - 6.0f * c) / 6.0f;
-    q0 = (8.0f * b + 24.0f * c) / 6.0f;
-    q1 = (-12.0f * b - 48.0f * c) / 6.0f;
-    q2 = (6.0f * b + 30.0f * c) / 6.0f;
-    q3 = (-b - 6.0f * c) / 6.0f;
-}
-
-
-LanczosFilter::LanczosFilter() : Filter(3.0f) {}
-
-float LanczosFilter::evaluate(float x) const
-{
-    x = fabs(x);
-    if( x < 3.0f ) return sincf(PI * x) * sincf(PI * x / 3.0f);
-    return 0.0f;
-}
-
-
-SincFilter::SincFilter(float w) : Filter(w) {}
-
-float SincFilter::evaluate(float x) const
-{
-    return sincf(PI * x);
-}
-
-
-KaiserFilter::KaiserFilter(float w) : Filter(w) { setParameters(4.0f, 1.0f); }
-
-float KaiserFilter::evaluate(float x) const
-{
-    const float sinc_value = sincf(PI * x * stretch);
-    const float t = x / m_width;
-    if ((1 - t * t) >= 0) return sinc_value * bessel0(alpha * sqrtf(1 - t * t)) / bessel0(alpha);
-    else return 0;
-}
-
-void KaiserFilter::setParameters(float alpha, float stretch)
-{
-    this->alpha = alpha;
-    this->stretch = stretch;
-}
-
-GaussianFilter::GaussianFilter(float w) : Filter(w) { setParameters(1); }
-
-float GaussianFilter::evaluate(float x) const
-{
-    // variance = sigma^2
-    return (1.0f / sqrtf(2 * PI * variance)) * expf(-x*x / (2 * variance));
-}
-
-void GaussianFilter::setParameters(float variance)
-{
-    this->variance = variance;
-}
-
-
-
-Kernel1::Kernel1(const Filter & f, int iscale, int samples/*= 32*/)
-{
-    nvDebugCheck(iscale > 1);
-    nvDebugCheck(samples > 0);
-
-    const float scale = 1.0f / iscale;
-
-    m_width = f.width() * iscale;
-    m_windowSize = (int)ceilf(2 * m_width);
-    m_data = new float[m_windowSize];
-
-    const float offset = float(m_windowSize) / 2;
-
-    float total = 0.0f;
-    for (int i = 0; i < m_windowSize; i++)
-    {
-        const float sample = f.sampleBox(i - offset, scale, samples);
-        m_data[i] = sample;
-        total += sample;
-    }
-
-    const float inv = 1.0f / total;
-    for (int i = 0; i < m_windowSize; i++)
-    {
-        m_data[i] *= inv;
-    }
-}
-
-Kernel1::~Kernel1()
-{
-    delete m_data;
-}
-
-// Print the kernel for debugging purposes.
-void Kernel1::debugPrint()
-{
-    for (int i = 0; i < m_windowSize; i++) {
-        nvDebug("%d: %f\n", i, m_data[i]);
-    }
-}
-
-
-
-Kernel2::Kernel2(uint ws) : m_windowSize(ws)
-{
-    m_data = new float[m_windowSize * m_windowSize];
-}
-
+// This code is in the public domain -- castanyo@yahoo.es
+
+/** @file Filter.cpp
+ * @brief Image filters.
+ *
+ * Jonathan Blow articles:
+ * http://number-none.com/product/Mipmapping, Part 1/index.html
+ * http://number-none.com/product/Mipmapping, Part 2/index.html
+ *
+ * References from Thatcher Ulrich:
+ * See _Graphics Gems III_ "General Filtered Image Rescaling", Dale A. Schumacher
+ * http://tog.acm.org/GraphicsGems/gemsiii/filter.c
+ *
+ * References from Paul Heckbert:
+ * A.V. Oppenheim, R.W. Schafer, Digital Signal Processing, Prentice-Hall, 1975
+ *
+ * R.W. Hamming, Digital Filters, Prentice-Hall, Englewood Cliffs, NJ, 1983
+ *
+ * W.K. Pratt, Digital Image Processing, John Wiley and Sons, 1978
+ *
+ * H.S. Hou, H.C. Andrews, "Cubic Splines for Image Interpolation and
+ *	Digital Filtering", IEEE Trans. Acoustics, Speech, and Signal Proc.,
+ *	vol. ASSP-26, no. 6, Dec. 1978, pp. 508-517
+ *
+ * Paul Heckbert's zoom library.
+ * http://www.xmission.com/~legalize/zoom.html
+ * 
+ * Reconstruction Filters in Computer Graphics
+ * http://www.mentallandscape.com/Papers_siggraph88.pdf
+ *
+ * More references:
+ * http://www.worldserver.com/turk/computergraphics/ResamplingFilters.pdf
+ * http://www.dspguide.com/ch16.htm
+ */
+
+#include "Filter.h"
+
+#include "nvmath/Vector.h" // Vector4
+#include "nvcore/Utils.h" // swap
+
+#include <string.h> // memset
+
+using namespace nv;
+
+namespace
+{
+    // Sinc function.
+    inline static float sincf(const float x)
+    {
+        if (fabs(x) < NV_EPSILON) {
+            //return 1.0;
+            return 1.0f + x*x*(-1.0f/6.0f + x*x*1.0f/120.0f);
+        }
+        else {
+            return sin(x) / x;
+        }
+    }
+
+    // Zeroth-order modified Bessel function of the first kind, I0(x), from Jon Blow's article.
+    // http://mathworld.wolfram.com/BesselFunctionoftheFirstKind.html
+    // http://en.wikipedia.org/wiki/Bessel_function
+    inline static float bessel0(float x)
+    {
+        const float EPSILON_RATIO = 1e-6f;
+        float xh, sum, pow, ds;
+        int k;
+
+        xh = 0.5f * x;
+        sum = 1.0f;
+        pow = 1.0f;
+        k = 0;
+        ds = 1.0;
+        while (ds > sum * EPSILON_RATIO) { // stop once the newest term is negligible relative to the running sum
+            ++k;
+            pow = pow * (xh / k); // pow == (x/2)^k / k!
+            ds = pow * pow; // k-th series term: ((x/2)^k / k!)^2
+            sum = sum + ds;
+        }
+
+        return sum;
+    }
+
+    /*// Alternative bessel function from Paul Heckbert.
+    static float _bessel0(float x)
+    {
+        const float EPSILON_RATIO = 1E-6;
+        float sum = 1.0f;
+        float y = x * x / 4.0f;
+        float t = y;
+        for(int i = 2; t > EPSILON_RATIO; i++) {
+            sum += t;
+            t *= y / float(i * i);
+        }
+        return sum;
+    }*/
+
+} // namespace
+
+
+Filter::Filter(float width) : m_width(width)
+{
+}
+
+/*virtual*/ Filter::~Filter()
+{
+}
+
+float Filter::sampleDelta(float x, float scale) const
+{
+    return evaluate((x + 0.5f)* scale);
+}
+
+float Filter::sampleBox(float x, float scale, int samples) const
+{
+    double sum = 0;
+    float isamples = 1.0f / float(samples);
+
+    for(int s = 0; s < samples; s++)
+    {
+        float p = (x + (float(s) + 0.5f) * isamples) * scale;
+        float value = evaluate(p);
+
+        //printf("%f: %.8f (%X)\n", p, value, *(uint32 *)&value);
+
+        sum += value;
+    }
+
+    return float(sum * isamples);
+}
+
+float Filter::sampleTriangle(float x, float scale, int samples) const
+{
+    double sum = 0;
+    float isamples = 1.0f / float(samples);
+
+    for(int s = 0; s < samples; s++)
+    {
+        float offset = (2 * float(s) + 1.0f) * isamples;		
+        float p = (x + offset - 0.5f) * scale;
+        float value = evaluate(p);
+
+        float weight = offset;
+        if (weight > 1.0f) weight = 2.0f - weight;
+
+        sum += value * weight;
+    }
+
+    return float(2 * sum * isamples);
+}
+
+
+
+
+
+BoxFilter::BoxFilter() : Filter(0.5f) {}
+BoxFilter::BoxFilter(float width) : Filter(width) {}
+
+float BoxFilter::evaluate(float x) const
+{
+    if (fabs(x) <= m_width) return 1.0f;
+    else return 0.0f;
+}
+
+
+TriangleFilter::TriangleFilter() : Filter(1.0f) {}
+TriangleFilter::TriangleFilter(float width) : Filter(width) {}
+
+float TriangleFilter::evaluate(float x) const
+{
+    x = fabs(x);
+    if( x < m_width ) return m_width - x;
+    return 0.0f;
+}
+
+
+QuadraticFilter::QuadraticFilter() : Filter(1.5f) {}
+
+float QuadraticFilter::evaluate(float x) const
+{
+    x = fabs(x);
+    if( x < 0.5f ) return 0.75f - x * x;
+    if( x < 1.5f ) { 
+        float t = x - 1.5f;
+        return 0.5f * t * t;
+    }
+    return 0.0f;
+}
+
+
+CubicFilter::CubicFilter() : Filter(1.0f) {}
+
+float CubicFilter::evaluate(float x) const
+{
+    // f(t) = 2|t|^3 - 3|t|^2 + 1, -1 <= t <= 1
+    x = fabs(x);
+    if( x < 1.0f ) return((2.0f * x - 3.0f) * x * x + 1.0f);
+    return 0.0f;
+}
+
+
+BSplineFilter::BSplineFilter() : Filter(2.0f) {}
+
+float BSplineFilter::evaluate(float x) const
+{
+    x = fabs(x);
+    if( x < 1.0f ) return (4.0f + x * x * (-6.0f + x * 3.0f)) / 6.0f;
+    if( x < 2.0f ) { 
+        float t = 2.0f - x;
+        return t * t * t / 6.0f;
+    }
+    return 0.0f;
+}
+
+
+MitchellFilter::MitchellFilter() : Filter(2.0f) { setParameters(1.0f/3.0f, 1.0f/3.0f); }
+
+float MitchellFilter::evaluate(float x) const
+{
+    x = fabs(x);
+    if( x < 1.0f ) return p0 + x * x * (p2 + x * p3);
+    if( x < 2.0f ) return q0 + x * (q1 + x * (q2 + x * q3));
+    return 0.0f;
+}
+
+void MitchellFilter::setParameters(float b, float c)
+{
+    p0 = (6.0f -  2.0f * b) / 6.0f;
+    p2 = (-18.0f + 12.0f * b + 6.0f * c) / 6.0f;
+    p3 = (12.0f - 9.0f * b - 6.0f * c) / 6.0f;
+    q0 = (8.0f * b + 24.0f * c) / 6.0f;
+    q1 = (-12.0f * b - 48.0f * c) / 6.0f;
+    q2 = (6.0f * b + 30.0f * c) / 6.0f;
+    q3 = (-b - 6.0f * c) / 6.0f;
+}
+
+
+LanczosFilter::LanczosFilter() : Filter(3.0f) {}
+
+float LanczosFilter::evaluate(float x) const
+{
+    x = fabs(x);
+    if( x < 3.0f ) return sincf(PI * x) * sincf(PI * x / 3.0f);
+    return 0.0f;
+}
+
+
+SincFilter::SincFilter(float w) : Filter(w) {}
+
+float SincFilter::evaluate(float x) const
+{
+    return sincf(PI * x);
+}
+
+
+KaiserFilter::KaiserFilter(float w) : Filter(w) { setParameters(4.0f, 1.0f); }
+
+float KaiserFilter::evaluate(float x) const
+{
+    const float sinc_value = sincf(PI * x * stretch);
+    const float t = x / m_width;
+    if ((1 - t * t) >= 0) return sinc_value * bessel0(alpha * sqrtf(1 - t * t)) / bessel0(alpha);
+    else return 0;
+}
+
+void KaiserFilter::setParameters(float alpha, float stretch)
+{
+    this->alpha = alpha;
+    this->stretch = stretch;
+}
+
+GaussianFilter::GaussianFilter(float w) : Filter(w) { setParameters(1); }
+
+float GaussianFilter::evaluate(float x) const
+{
+    // variance = sigma^2
+    return (1.0f / sqrtf(2 * PI * variance)) * expf(-x*x / (2 * variance));
+}
+
+void GaussianFilter::setParameters(float variance)
+{
+    this->variance = variance;
+}
+
+
+
+Kernel1::Kernel1(const Filter & f, int iscale, int samples/*= 32*/)
+{
+    nvDebugCheck(iscale > 1);
+    nvDebugCheck(samples > 0);
+
+    const float scale = 1.0f / iscale;
+
+    m_width = f.width() * iscale;
+    m_windowSize = (int)ceilf(2 * m_width);
+    m_data = new float[m_windowSize];
+
+    const float offset = float(m_windowSize) / 2;
+
+    float total = 0.0f;
+    for (int i = 0; i < m_windowSize; i++)
+    {
+        const float sample = f.sampleBox(i - offset, scale, samples);
+        m_data[i] = sample;
+        total += sample;
+    }
+
+    const float inv = 1.0f / total;
+    for (int i = 0; i < m_windowSize; i++)
+    {
+        m_data[i] *= inv;
+    }
+}
+
+Kernel1::~Kernel1()
+{
+    delete [] m_data; // m_data is allocated with new float[...], so it must be released with the array form of delete.
+}
+
+// Print the kernel for debugging purposes.
+void Kernel1::debugPrint()
+{
+    for (int i = 0; i < m_windowSize; i++) {
+        nvDebug("%d: %f\n", i, m_data[i]);
+    }
+}
+
+
+
+Kernel2::Kernel2(uint ws) : m_windowSize(ws)
+{
+    m_data = new float[m_windowSize * m_windowSize];
+}
+
 Kernel2::Kernel2(uint ws, const float * data) : m_windowSize(ws)
 {
     m_data = new float[m_windowSize * m_windowSize];
 
     memcpy(m_data, data, sizeof(float) * m_windowSize * m_windowSize);
-}
-
-Kernel2::Kernel2(const Kernel2 & k) : m_windowSize(k.m_windowSize)
-{
-    m_data = new float[m_windowSize * m_windowSize];
-    for (uint i = 0; i < m_windowSize * m_windowSize; i++) {
-        m_data[i] = k.m_data[i];
-    }
-}
-
-
-Kernel2::~Kernel2()
-{
-    delete m_data;
-}
-
-// Normalize the filter.
-void Kernel2::normalize()
-{
-    float total = 0.0f;
-    for(uint i = 0; i < m_windowSize*m_windowSize; i++) {
-        total += fabs(m_data[i]);
-    }
-
-    float inv = 1.0f / total;
-    for(uint i = 0; i < m_windowSize*m_windowSize; i++) {
-        m_data[i] *= inv;
-    }
-}
-
-// Transpose the kernel.
-void Kernel2::transpose()
-{
-    for(uint i = 0; i < m_windowSize; i++) {
-        for(uint j = i+1; j < m_windowSize; j++) {
-            swap(m_data[i*m_windowSize + j], m_data[j*m_windowSize + i]);
-        }
-    }
-}
-
-// Init laplacian filter, usually used for sharpening.
-void Kernel2::initLaplacian()
-{
-    nvDebugCheck(m_windowSize == 3);
-    //	m_data[0] = -1; m_data[1] = -1; m_data[2] = -1;
-    //	m_data[3] = -1; m_data[4] = +8; m_data[5] = -1;
-    //	m_data[6] = -1; m_data[7] = -1; m_data[8] = -1;	
-
-    m_data[0] = +0; m_data[1] = -1; m_data[2] = +0;
-    m_data[3] = -1; m_data[4] = +4; m_data[5] = -1;
-    m_data[6] = +0; m_data[7] = -1; m_data[8] = +0;	
-
-    //	m_data[0] = +1; m_data[1] = -2; m_data[2] = +1;
-    //	m_data[3] = -2; m_data[4] = +4; m_data[5] = -2;
-    //	m_data[6] = +1; m_data[7] = -2; m_data[8] = +1;	
-}
-
-
-// Init simple edge detection filter.
-void Kernel2::initEdgeDetection()
-{
-    nvCheck(m_windowSize == 3);
-    m_data[0] = 0; m_data[1] = 0; m_data[2] = 0;
-    m_data[3] =-1; m_data[4] = 0; m_data[5] = 1;
-    m_data[6] = 0; m_data[7] = 0; m_data[8] = 0;
-}
-
-// Init sobel filter.
-void Kernel2::initSobel()
-{
-    if (m_windowSize == 3)
-    {
-        m_data[0] = -1; m_data[1] = 0; m_data[2] = 1;
-        m_data[3] = -2; m_data[4] = 0; m_data[5] = 2;
-        m_data[6] = -1; m_data[7] = 0; m_data[8] = 1;
-    }
-    else if (m_windowSize == 5)
-    {
-        float elements[] = {
-            -1, -2, 0, 2, 1,
-            -2, -3, 0, 3, 2,
-            -3, -4, 0, 4, 3,
-            -2, -3, 0, 3, 2,
-            -1, -2, 0, 2, 1
-        };
-
-        for (int i = 0; i < 5*5; i++) {
-            m_data[i] = elements[i];
-        }
-    }
-    else if (m_windowSize == 7)
-    {
-        float elements[] = {
-            -1, -2, -3, 0, 3, 2, 1,
-            -2, -3, -4, 0, 4, 3, 2,
-            -3, -4, -5, 0, 5, 4, 3,
-            -4, -5, -6, 0, 6, 5, 4,
-            -3, -4, -5, 0, 5, 4, 3,
-            -2, -3, -4, 0, 4, 3, 2,
-            -1, -2, -3, 0, 3, 2, 1
-        };
-
-        for (int i = 0; i < 7*7; i++) {
-            m_data[i] = elements[i];
-        }
-    }
-    else if (m_windowSize == 9)
-    {
-        float elements[] = {
-            -1, -2, -3, -4, 0, 4, 3, 2, 1,
-            -2, -3, -4, -5, 0, 5, 4, 3, 2,
-            -3, -4, -5, -6, 0, 6, 5, 4, 3,
-            -4, -5, -6, -7, 0, 7, 6, 5, 4,
-            -5, -6, -7, -8, 0, 8, 7, 6, 5,
-            -4, -5, -6, -7, 0, 7, 6, 5, 4,
-            -3, -4, -5, -6, 0, 6, 5, 4, 3,
-            -2, -3, -4, -5, 0, 5, 4, 3, 2,
-            -1, -2, -3, -4, 0, 4, 3, 2, 1
-        };
-
-        for (int i = 0; i < 9*9; i++) {
-            m_data[i] = elements[i];
-        }
-    }
-}
-
-// Init prewitt filter.
-void Kernel2::initPrewitt()
-{
-    if (m_windowSize == 3)
-    {
-        m_data[0] = -1; m_data[1] = 0; m_data[2] = -1;
-        m_data[3] = -1; m_data[4] = 0; m_data[5] = -1;
-        m_data[6] = -1; m_data[7] = 0; m_data[8] = -1;
-    }
-    else if (m_windowSize == 5)
-    {
-        // @@ Is this correct?
-        float elements[] = {
-            -2, -1, 0, 1, 2,
-            -2, -1, 0, 1, 2,
-            -2, -1, 0, 1, 2,
-            -2, -1, 0, 1, 2,
-            -2, -1, 0, 1, 2
-        };
-
-        for (int i = 0; i < 5*5; i++) {
-            m_data[i] = elements[i];
-        }
-    }
-}
-
-// Init blended sobel filter.
-void Kernel2::initBlendedSobel(const Vector4 & scale)
-{
-    nvCheck(m_windowSize == 9);
-
-    {
-        const float elements[] = {
-            -1, -2, -3, -4, 0, 4, 3, 2, 1,
-            -2, -3, -4, -5, 0, 5, 4, 3, 2,
-            -3, -4, -5, -6, 0, 6, 5, 4, 3,
-            -4, -5, -6, -7, 0, 7, 6, 5, 4,
-            -5, -6, -7, -8, 0, 8, 7, 6, 5,
-            -4, -5, -6, -7, 0, 7, 6, 5, 4,
-            -3, -4, -5, -6, 0, 6, 5, 4, 3,
-            -2, -3, -4, -5, 0, 5, 4, 3, 2,
-            -1, -2, -3, -4, 0, 4, 3, 2, 1
-        };
-
-        for (int i = 0; i < 9*9; i++) {
-            m_data[i] = elements[i] * scale.w;
-        }
-    }
-    {
-        const float elements[] = {
-            -1, -2, -3, 0, 3, 2, 1,
-            -2, -3, -4, 0, 4, 3, 2,
-            -3, -4, -5, 0, 5, 4, 3,
-            -4, -5, -6, 0, 6, 5, 4,
-            -3, -4, -5, 0, 5, 4, 3,
-            -2, -3, -4, 0, 4, 3, 2,
-            -1, -2, -3, 0, 3, 2, 1,
-        };
-
-        for (int i = 0; i < 7; i++) {
-            for (int e = 0; e < 7; e++) {
-                m_data[(i + 1) * 9 + e + 1] += elements[i * 7 + e] * scale.z;
-            }
-        }
-    }
-    {
-        const float elements[] = {
-            -1, -2, 0, 2, 1,
-            -2, -3, 0, 3, 2,
-            -3, -4, 0, 4, 3,
-            -2, -3, 0, 3, 2,
-            -1, -2, 0, 2, 1
-        };
-
-        for (int i = 0; i < 5; i++) {
-            for (int e = 0; e < 5; e++) {
-                m_data[(i + 2) * 9 + e + 2] += elements[i * 5 + e] * scale.y;
-            }
-        }
-    }
-    {
-        const float elements[] = {
-            -1, 0, 1,
-            -2, 0, 2,
-            -1, 0, 1,
-        };
-
-        for (int i = 0; i < 3; i++) {
-            for (int e = 0; e < 3; e++) {
-                m_data[(i + 3) * 9 + e + 3] += elements[i * 3 + e] * scale.x;
-            }
-        }
-    }
-}
-
-
-PolyphaseKernel::PolyphaseKernel(const Filter & f, uint srcLength, uint dstLength, int samples/*= 32*/)
-{
-    nvDebugCheck(samples > 0);
-
-    float scale = float(dstLength) / float(srcLength);
-    const float iscale = 1.0f / scale;
-
-    if (scale > 1) {
-        // Upsampling.
-        samples = 1;
-        scale = 1;
-    }
-
-    m_length = dstLength;
-    m_width = f.width() * iscale;
-    m_windowSize = (int)ceilf(m_width * 2) + 1;
-
-    m_data = new float[m_windowSize * m_length];
-    memset(m_data, 0, sizeof(float) * m_windowSize * m_length);
-
-    for (uint i = 0; i < m_length; i++)
-    {
-        const float center = (0.5f + i) * iscale;
-
-        const int left = (int)floorf(center - m_width);
-        const int right = (int)ceilf(center + m_width);
-        nvDebugCheck(right - left <= m_windowSize);
-
-        float total = 0.0f;
-        for (int j = 0; j < m_windowSize; j++)
-        {
-            const float sample = f.sampleBox(left + j - center, scale, samples);
-
-            //printf("%f %X\n", sample, *(uint32 *)&sample);
-
-            m_data[i * m_windowSize + j] = sample;
-            total += sample;
-        }
-
-        // normalize weights.
-        for (int j = 0; j < m_windowSize; j++)
-        {
-            m_data[i * m_windowSize + j] /= total;
-        }
-    }
-}
-
-PolyphaseKernel::~PolyphaseKernel()
-{
-    delete [] m_data;
-}
-
-
-// Print the kernel for debugging purposes.
-void PolyphaseKernel::debugPrint() const
-{
-    for (uint i = 0; i < m_length; i++)
-    {
-        nvDebug("%d: ", i);
-        for (int j = 0; j < m_windowSize; j++)
-        {
-            nvDebug(" %6.4f", m_data[i * m_windowSize + j]);
-        }
-        nvDebug("\n");
-    }
-}
-
+}
+
+Kernel2::Kernel2(const Kernel2 & k) : m_windowSize(k.m_windowSize)
+{
+    m_data = new float[m_windowSize * m_windowSize];
+    for (uint i = 0; i < m_windowSize * m_windowSize; i++) {
+        m_data[i] = k.m_data[i];
+    }
+}
+
+
+Kernel2::~Kernel2()
+{
+    delete [] m_data; // every constructor allocates m_data with new float[...], so the array form of delete is required.
+}
+
+// Normalize the filter.
+void Kernel2::normalize()
+{
+    float total = 0.0f;
+    for(uint i = 0; i < m_windowSize*m_windowSize; i++) {
+        total += fabs(m_data[i]);
+    }
+
+    float inv = 1.0f / total;
+    for(uint i = 0; i < m_windowSize*m_windowSize; i++) {
+        m_data[i] *= inv;
+    }
+}
+
+// Transpose the kernel.
+void Kernel2::transpose()
+{
+    for(uint i = 0; i < m_windowSize; i++) {
+        for(uint j = i+1; j < m_windowSize; j++) {
+            swap(m_data[i*m_windowSize + j], m_data[j*m_windowSize + i]);
+        }
+    }
+}
+
+// Init laplacian filter, usually used for sharpening.
+void Kernel2::initLaplacian()
+{
+    nvDebugCheck(m_windowSize == 3);
+    //	m_data[0] = -1; m_data[1] = -1; m_data[2] = -1;
+    //	m_data[3] = -1; m_data[4] = +8; m_data[5] = -1;
+    //	m_data[6] = -1; m_data[7] = -1; m_data[8] = -1;	
+
+    m_data[0] = +0; m_data[1] = -1; m_data[2] = +0;
+    m_data[3] = -1; m_data[4] = +4; m_data[5] = -1;
+    m_data[6] = +0; m_data[7] = -1; m_data[8] = +0;	
+
+    //	m_data[0] = +1; m_data[1] = -2; m_data[2] = +1;
+    //	m_data[3] = -2; m_data[4] = +4; m_data[5] = -2;
+    //	m_data[6] = +1; m_data[7] = -2; m_data[8] = +1;	
+}
+
+
+// Init simple edge detection filter.
+void Kernel2::initEdgeDetection()
+{
+    nvCheck(m_windowSize == 3);
+    m_data[0] = 0; m_data[1] = 0; m_data[2] = 0;
+    m_data[3] =-1; m_data[4] = 0; m_data[5] = 1;
+    m_data[6] = 0; m_data[7] = 0; m_data[8] = 0;
+}
+
+// Init sobel filter.
+void Kernel2::initSobel()
+{
+    if (m_windowSize == 3)
+    {
+        m_data[0] = -1; m_data[1] = 0; m_data[2] = 1;
+        m_data[3] = -2; m_data[4] = 0; m_data[5] = 2;
+        m_data[6] = -1; m_data[7] = 0; m_data[8] = 1;
+    }
+    else if (m_windowSize == 5)
+    {
+        float elements[] = {
+            -1, -2, 0, 2, 1,
+            -2, -3, 0, 3, 2,
+            -3, -4, 0, 4, 3,
+            -2, -3, 0, 3, 2,
+            -1, -2, 0, 2, 1
+        };
+
+        for (int i = 0; i < 5*5; i++) {
+            m_data[i] = elements[i];
+        }
+    }
+    else if (m_windowSize == 7)
+    {
+        float elements[] = {
+            -1, -2, -3, 0, 3, 2, 1,
+            -2, -3, -4, 0, 4, 3, 2,
+            -3, -4, -5, 0, 5, 4, 3,
+            -4, -5, -6, 0, 6, 5, 4,
+            -3, -4, -5, 0, 5, 4, 3,
+            -2, -3, -4, 0, 4, 3, 2,
+            -1, -2, -3, 0, 3, 2, 1
+        };
+
+        for (int i = 0; i < 7*7; i++) {
+            m_data[i] = elements[i];
+        }
+    }
+    else if (m_windowSize == 9)
+    {
+        float elements[] = {
+            -1, -2, -3, -4, 0, 4, 3, 2, 1,
+            -2, -3, -4, -5, 0, 5, 4, 3, 2,
+            -3, -4, -5, -6, 0, 6, 5, 4, 3,
+            -4, -5, -6, -7, 0, 7, 6, 5, 4,
+            -5, -6, -7, -8, 0, 8, 7, 6, 5,
+            -4, -5, -6, -7, 0, 7, 6, 5, 4,
+            -3, -4, -5, -6, 0, 6, 5, 4, 3,
+            -2, -3, -4, -5, 0, 5, 4, 3, 2,
+            -1, -2, -3, -4, 0, 4, 3, 2, 1
+        };
+
+        for (int i = 0; i < 9*9; i++) {
+            m_data[i] = elements[i];
+        }
+    }
+}
+
+// Init Prewitt filter (horizontal gradient: negative weights on the left, positive on the right). Only 3x3 and 5x5 windows are filled in.
+void Kernel2::initPrewitt()
+{
+    if (m_windowSize == 3)
+    {
+        m_data[0] = -1; m_data[1] = 0; m_data[2] = +1; // the right column must be positive:
+        m_data[3] = -1; m_data[4] = 0; m_data[5] = +1; // with -1 on both sides the kernel adds the two
+        m_data[6] = -1; m_data[7] = 0; m_data[8] = +1; // neighbours instead of taking their difference.
+    }
+    else if (m_windowSize == 5)
+    {
+        // @@ Is this correct?
+        float elements[] = {
+            -2, -1, 0, 1, 2,
+            -2, -1, 0, 1, 2,
+            -2, -1, 0, 1, 2,
+            -2, -1, 0, 1, 2,
+            -2, -1, 0, 1, 2
+        };
+
+        for (int i = 0; i < 5*5; i++) {
+            m_data[i] = elements[i];
+        }
+    }
+}
+
+// Init blended sobel filter.
+void Kernel2::initBlendedSobel(const Vector4 & scale)
+{
+    nvCheck(m_windowSize == 9);
+
+    {
+        const float elements[] = {
+            -1, -2, -3, -4, 0, 4, 3, 2, 1,
+            -2, -3, -4, -5, 0, 5, 4, 3, 2,
+            -3, -4, -5, -6, 0, 6, 5, 4, 3,
+            -4, -5, -6, -7, 0, 7, 6, 5, 4,
+            -5, -6, -7, -8, 0, 8, 7, 6, 5,
+            -4, -5, -6, -7, 0, 7, 6, 5, 4,
+            -3, -4, -5, -6, 0, 6, 5, 4, 3,
+            -2, -3, -4, -5, 0, 5, 4, 3, 2,
+            -1, -2, -3, -4, 0, 4, 3, 2, 1
+        };
+
+        for (int i = 0; i < 9*9; i++) {
+            m_data[i] = elements[i] * scale.w;
+        }
+    }
+    {
+        const float elements[] = {
+            -1, -2, -3, 0, 3, 2, 1,
+            -2, -3, -4, 0, 4, 3, 2,
+            -3, -4, -5, 0, 5, 4, 3,
+            -4, -5, -6, 0, 6, 5, 4,
+            -3, -4, -5, 0, 5, 4, 3,
+            -2, -3, -4, 0, 4, 3, 2,
+            -1, -2, -3, 0, 3, 2, 1,
+        };
+
+        for (int i = 0; i < 7; i++) {
+            for (int e = 0; e < 7; e++) {
+                m_data[(i + 1) * 9 + e + 1] += elements[i * 7 + e] * scale.z;
+            }
+        }
+    }
+    {
+        const float elements[] = {
+            -1, -2, 0, 2, 1,
+            -2, -3, 0, 3, 2,
+            -3, -4, 0, 4, 3,
+            -2, -3, 0, 3, 2,
+            -1, -2, 0, 2, 1
+        };
+
+        for (int i = 0; i < 5; i++) {
+            for (int e = 0; e < 5; e++) {
+                m_data[(i + 2) * 9 + e + 2] += elements[i * 5 + e] * scale.y;
+            }
+        }
+    }
+    {
+        const float elements[] = {
+            -1, 0, 1,
+            -2, 0, 2,
+            -1, 0, 1,
+        };
+
+        for (int i = 0; i < 3; i++) {
+            for (int e = 0; e < 3; e++) {
+                m_data[(i + 3) * 9 + e + 3] += elements[i * 3 + e] * scale.x;
+            }
+        }
+    }
+}
+
+
+PolyphaseKernel::PolyphaseKernel(const Filter & f, uint srcLength, uint dstLength, int samples/*= 32*/)
+{
+    nvDebugCheck(samples > 0);
+
+    float scale = float(dstLength) / float(srcLength);
+    const float iscale = 1.0f / scale;
+
+    if (scale > 1) {
+        // Upsampling.
+        samples = 1;
+        scale = 1;
+    }
+
+    m_length = dstLength;
+    m_width = f.width() * iscale;
+    m_windowSize = (int)ceilf(m_width * 2) + 1;
+
+    m_data = new float[m_windowSize * m_length];
+    memset(m_data, 0, sizeof(float) * m_windowSize * m_length);
+
+    for (uint i = 0; i < m_length; i++)
+    {
+        const float center = (0.5f + i) * iscale;
+
+        const int left = (int)floorf(center - m_width);
+        const int right = (int)ceilf(center + m_width);
+        nvDebugCheck(right - left <= m_windowSize);
+
+        float total = 0.0f;
+        for (int j = 0; j < m_windowSize; j++)
+        {
+            const float sample = f.sampleBox(left + j - center, scale, samples);
+
+            //printf("%f %X\n", sample, *(uint32 *)&sample);
+
+            m_data[i * m_windowSize + j] = sample;
+            total += sample;
+        }
+
+        // normalize weights.
+        for (int j = 0; j < m_windowSize; j++)
+        {
+            m_data[i * m_windowSize + j] /= total;
+        }
+    }
+}
+
+PolyphaseKernel::~PolyphaseKernel()
+{
+    delete [] m_data;
+}
+
+
+// Print the kernel for debugging purposes.
+void PolyphaseKernel::debugPrint() const
+{
+    for (uint i = 0; i < m_length; i++)
+    {
+        nvDebug("%d: ", i);
+        for (int j = 0; j < m_windowSize; j++)
+        {
+            nvDebug(" %6.4f", m_data[i * m_windowSize + j]);
+        }
+        nvDebug("\n");
+    }
+}
+
diff --git a/src/nvimage/Filter.h b/src/nvimage/Filter.h
index 6ced86b..ab814f1 100644
--- a/src/nvimage/Filter.h
+++ b/src/nvimage/Filter.h
@@ -1,234 +1,234 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#pragma once
-#ifndef NV_IMAGE_FILTER_H
-#define NV_IMAGE_FILTER_H
-
-#include "nvimage.h"
-#include "nvcore/Debug.h"
-
-namespace nv
-{
-    class Vector4;
-
-    /// Base filter class.
-    class NVIMAGE_CLASS Filter
-    {
-    public:
-        Filter(float width);
-        virtual ~Filter();
-
-        float width() const { return m_width; }
-        float sampleDelta(float x, float scale) const;
-        float sampleBox(float x, float scale, int samples) const;
-        float sampleTriangle(float x, float scale, int samples) const;
-
-        virtual float evaluate(float x) const = 0;
-
-    protected:
-        const float m_width;
-    };
-
-    // Box filter.
-    class NVIMAGE_CLASS BoxFilter : public Filter
-    {
-    public:
-        BoxFilter();
-        BoxFilter(float width);
-        virtual float evaluate(float x) const;
-    };
-
-    // Triangle (bilinear/tent) filter.
-    class NVIMAGE_CLASS TriangleFilter : public Filter
-    {
-    public:
-        TriangleFilter();
-        TriangleFilter(float width);
-        virtual float evaluate(float x) const;
-    };
-
-    // Quadratic (bell) filter.
-    class NVIMAGE_CLASS QuadraticFilter : public Filter
-    {
-    public:
-        QuadraticFilter();
-        virtual float evaluate(float x) const;
-    };
-
-    // Cubic filter from Thatcher Ulrich.
-    class NVIMAGE_CLASS CubicFilter : public Filter
-    {
-    public:
-        CubicFilter();
-        virtual float evaluate(float x) const;
-    };
-
-    // Cubic b-spline filter from Paul Heckbert.
-    class NVIMAGE_CLASS BSplineFilter : public Filter
-    {
-    public:
-        BSplineFilter();
-        virtual float evaluate(float x) const;
-    };
-
-    /// Mitchell & Netravali's two-param cubic
-    /// @see "Reconstruction Filters in Computer Graphics", SIGGRAPH 88
-    class NVIMAGE_CLASS MitchellFilter : public Filter
-    {
-    public:
-        MitchellFilter();
-        virtual float evaluate(float x) const;
-
-        void setParameters(float b, float c);
-
-    private:
-        float p0, p2, p3;
-        float q0, q1, q2, q3;
-    };
-
-    // Lanczos3 filter.
-    class NVIMAGE_CLASS LanczosFilter : public Filter
-    {
-    public:
-        LanczosFilter();
-        virtual float evaluate(float x) const;
-    };
-
-    // Sinc filter.
-    class NVIMAGE_CLASS SincFilter : public Filter
-    {
-    public:
-        SincFilter(float w);
-        virtual float evaluate(float x) const;
-    };
-
-    // Kaiser filter.
-    class NVIMAGE_CLASS KaiserFilter : public Filter
-    {
-    public:
-        KaiserFilter(float w);
-        virtual float evaluate(float x) const;
-
-        void setParameters(float a, float stretch);
-
-    private:
-        float alpha;
-        float stretch;
-    };
-
-    // Gaussian filter.
-    class GaussianFilter : public Filter
-    {
-    public:
-        GaussianFilter(float w);
-        virtual float evaluate(float x) const;
-
-        void setParameters(float variance);
-
-    private:
-        float variance;
-    };
-
-
-
-    /// A 1D kernel. Used to precompute filter weights.
-    class NVIMAGE_CLASS Kernel1
-    {
-        NV_FORBID_COPY(Kernel1);
-    public:
-        Kernel1(const Filter & f, int iscale, int samples = 32);
-        ~Kernel1();
-
-        float valueAt(uint x) const {
-            nvDebugCheck(x < (uint)m_windowSize);
-            return m_data[x];
-        }
-
-        int windowSize() const {
-            return m_windowSize;
-        }
-
-        float width() const {
-            return m_width;
-        }
-
-        void debugPrint();
-
-    private:
-        int m_windowSize;
-        float m_width;
-        float * m_data;
-    };
-
-
-    /// A 2D kernel.
-    class NVIMAGE_CLASS Kernel2 
-    {
-    public:
-        Kernel2(uint width);
-        Kernel2(uint width, const float * data);
-        Kernel2(const Kernel2 & k);
-        ~Kernel2();
-
-        void normalize();
-        void transpose();
-
-        float valueAt(uint x, uint y) const {
-            return m_data[y * m_windowSize + x];
-        }
-
-        uint windowSize() const {
-            return m_windowSize;
-        }
-
-        void initLaplacian();
-        void initEdgeDetection();
-        void initSobel();
-        void initPrewitt();
-
-        void initBlendedSobel(const Vector4 & scale);
-
-    private:
-        const uint m_windowSize;
-        float * m_data;
-    };
-
-
-    /// A 1D polyphase kernel
-    class NVIMAGE_CLASS PolyphaseKernel
-    {
-        NV_FORBID_COPY(PolyphaseKernel);
-    public:
-        PolyphaseKernel(const Filter & f, uint srcLength, uint dstLength, int samples = 32);
-        ~PolyphaseKernel();
-
-        int windowSize() const {
-            return m_windowSize;
-        }
-
-        uint length() const {
-            return m_length;
-        }
-
-        float width() const {
-            return m_width;
-        }
-
-        float valueAt(uint column, uint x) const {
-            nvDebugCheck(column < m_length);
-            nvDebugCheck(x < (uint)m_windowSize);
-            return m_data[column * m_windowSize + x];
-        }
-
-        void debugPrint() const;
-
-    private:
-        int m_windowSize;
-        uint m_length;
-        float m_width;
-        float * m_data;
-    };
-
-} // nv namespace
-
-#endif // NV_IMAGE_FILTER_H
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_IMAGE_FILTER_H
+#define NV_IMAGE_FILTER_H
+
+#include "nvimage.h"
+#include "nvcore/Debug.h"
+
+namespace nv
+{
+    class Vector4;
+
+    /// Base filter class.
+    class NVIMAGE_CLASS Filter
+    {
+    public:
+        Filter(float width);
+        virtual ~Filter();
+
+        float width() const { return m_width; }
+        float sampleDelta(float x, float scale) const;
+        float sampleBox(float x, float scale, int samples) const;
+        float sampleTriangle(float x, float scale, int samples) const;
+
+        virtual float evaluate(float x) const = 0;
+
+    protected:
+        const float m_width;
+    };
+
+    // Box filter.
+    class NVIMAGE_CLASS BoxFilter : public Filter
+    {
+    public:
+        BoxFilter();
+        BoxFilter(float width);
+        virtual float evaluate(float x) const;
+    };
+
+    // Triangle (bilinear/tent) filter.
+    class NVIMAGE_CLASS TriangleFilter : public Filter
+    {
+    public:
+        TriangleFilter();
+        TriangleFilter(float width);
+        virtual float evaluate(float x) const;
+    };
+
+    // Quadratic (bell) filter.
+    class NVIMAGE_CLASS QuadraticFilter : public Filter
+    {
+    public:
+        QuadraticFilter();
+        virtual float evaluate(float x) const;
+    };
+
+    // Cubic filter from Thatcher Ulrich.
+    class NVIMAGE_CLASS CubicFilter : public Filter
+    {
+    public:
+        CubicFilter();
+        virtual float evaluate(float x) const;
+    };
+
+    // Cubic b-spline filter from Paul Heckbert.
+    class NVIMAGE_CLASS BSplineFilter : public Filter
+    {
+    public:
+        BSplineFilter();
+        virtual float evaluate(float x) const;
+    };
+
+    /// Mitchell & Netravali's two-param cubic
+    /// @see "Reconstruction Filters in Computer Graphics", SIGGRAPH 88
+    class NVIMAGE_CLASS MitchellFilter : public Filter
+    {
+    public:
+        MitchellFilter();
+        virtual float evaluate(float x) const;
+
+        void setParameters(float b, float c);
+
+    private:
+        float p0, p2, p3;
+        float q0, q1, q2, q3;
+    };
+
+    // Lanczos3 filter.
+    class NVIMAGE_CLASS LanczosFilter : public Filter
+    {
+    public:
+        LanczosFilter();
+        virtual float evaluate(float x) const;
+    };
+
+    // Sinc filter.
+    class NVIMAGE_CLASS SincFilter : public Filter
+    {
+    public:
+        SincFilter(float w);
+        virtual float evaluate(float x) const;
+    };
+
+    // Kaiser filter.
+    class NVIMAGE_CLASS KaiserFilter : public Filter
+    {
+    public:
+        KaiserFilter(float w);
+        virtual float evaluate(float x) const;
+
+        void setParameters(float a, float stretch);
+
+    private:
+        float alpha;
+        float stretch;
+    };
+
+    // Gaussian filter. Exported with NVIMAGE_CLASS like every other filter class in this header.
+    class NVIMAGE_CLASS GaussianFilter : public Filter
+    {
+    public:
+        GaussianFilter(float w);                    // w is forwarded as the filter width
+        virtual float evaluate(float x) const;      // implements Filter::evaluate
+
+        void setParameters(float variance);
+
+    private:
+        float variance;
+    };
+
+
+
+    /// A 1D kernel. Used to precompute filter weights.
+    class NVIMAGE_CLASS Kernel1
+    {
+        NV_FORBID_COPY(Kernel1);
+    public:
+        Kernel1(const Filter & f, int iscale, int samples = 32);
+        ~Kernel1();
+
+        float valueAt(uint x) const {
+            nvDebugCheck(x < (uint)m_windowSize);
+            return m_data[x];
+        }
+
+        int windowSize() const {
+            return m_windowSize;
+        }
+
+        float width() const {
+            return m_width;
+        }
+
+        void debugPrint();
+
+    private:
+        int m_windowSize;
+        float m_width;
+        float * m_data;
+    };
+
+
+    /// A 2D kernel. Weights are addressed as m_data[y * windowSize + x].
+    class NVIMAGE_CLASS Kernel2
+    {
+    public:
+        Kernel2(uint width);
+        Kernel2(uint width, const float * data);
+        Kernel2(const Kernel2 & k);
+        ~Kernel2();
+
+        void normalize();
+        void transpose();
+
+        float valueAt(uint x, uint y) const {
+            nvDebugCheck(x < m_windowSize && y < m_windowSize); // same debug-only bounds check as Kernel1 and PolyphaseKernel
+            return m_data[y * m_windowSize + x];
+        }
+
+        uint windowSize() const {
+            return m_windowSize;
+        }
+
+        void initLaplacian();
+        void initEdgeDetection();
+        void initSobel();
+        void initPrewitt();
+
+        void initBlendedSobel(const Vector4 & scale);
+
+    private:
+        const uint m_windowSize;    // row stride used by valueAt; fixed at construction
+        float * m_data;
+    };
+
+    /// A 1D polyphase kernel
+    class NVIMAGE_CLASS PolyphaseKernel
+    {
+        NV_FORBID_COPY(PolyphaseKernel);
+    public:
+        PolyphaseKernel(const Filter & f, uint srcLength, uint dstLength, int samples = 32);
+        ~PolyphaseKernel();
+
+        int windowSize() const {
+            return m_windowSize;
+        }
+
+        uint length() const {
+            return m_length;
+        }
+
+        float width() const {
+            return m_width;
+        }
+
+        float valueAt(uint column, uint x) const {
+            nvDebugCheck(column < m_length);
+            nvDebugCheck(x < (uint)m_windowSize);
+            return m_data[column * m_windowSize + x];
+        }
+
+        void debugPrint() const;
+
+    private:
+        int m_windowSize;
+        uint m_length;
+        float m_width;
+        float * m_data;
+    };
+
+} // nv namespace
+
+#endif // NV_IMAGE_FILTER_H
diff --git a/src/nvimage/FloatImage.h b/src/nvimage/FloatImage.h
index d618be0..39085c8 100644
--- a/src/nvimage/FloatImage.h
+++ b/src/nvimage/FloatImage.h
@@ -235,7 +235,7 @@ namespace nv
         nvDebugCheck(x < m_width);
         nvDebugCheck(y < m_height);
         nvDebugCheck(z < m_depth);
-        return m_mem[((c * m_depth + z) * m_height + y) * m_width + x];
+        return m_mem[c * m_pixelCount + index(x, y, z)];
     }
 
     /// Get pixel component.
@@ -246,7 +246,7 @@ namespace nv
         nvDebugCheck(x < m_width);
         nvDebugCheck(y < m_height);
         nvDebugCheck(z < m_depth);
-        return m_mem[((c * m_depth + z) * m_height + y) * m_width + x];
+        return m_mem[c * m_pixelCount + index(x, y, z)];
     }
 
     /// Get pixel component.
@@ -255,7 +255,7 @@ namespace nv
         nvDebugCheck(m_mem != NULL);
         nvDebugCheck(c < m_componentCount);
         nvDebugCheck(idx < m_pixelCount);
-        return m_mem[c * m_height * m_width + idx];
+        return m_mem[c * m_pixelCount + idx];
     }
 
     /// Get pixel component.
@@ -264,7 +264,7 @@ namespace nv
         nvDebugCheck(m_mem != NULL);
         nvDebugCheck(c < m_componentCount);
         nvDebugCheck(idx < m_pixelCount);
-        return m_mem[c * m_height * m_width + idx];
+        return m_mem[c * m_pixelCount + idx];
     }
 
     /// Get pixel component.
@@ -288,7 +288,9 @@ namespace nv
         nvDebugCheck(x < m_width);
         nvDebugCheck(y < m_height);
         nvDebugCheck(z < m_depth);
-        return (z * m_height + y) * m_width + x;
+        uint idx = (z * m_height + y) * m_width + x;
+        nvDebugCheck(idx < m_pixelCount);
+        return idx;
     }
 
 
diff --git a/src/nvimage/Image.cpp b/src/nvimage/Image.cpp
index 006c324..495110f 100644
--- a/src/nvimage/Image.cpp
+++ b/src/nvimage/Image.cpp
@@ -1,160 +1,160 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#include "Image.h"
-#include "ImageIO.h"
-
-#include "nvmath/Color.h"
-
-#include "nvcore/Debug.h"
-#include "nvcore/Ptr.h"
-#include "nvcore/Utils.h" // swap
-
-
-using namespace nv;
-
-Image::Image() : m_width(0), m_height(0), m_format(Format_RGB), m_data(NULL)
-{
-}
-
-Image::Image(const Image & img) : m_data(NULL)
-{
-	allocate(img.m_width, img.m_height, img.m_depth);
-    m_format = img.m_format;
-    memcpy(m_data, img.m_data, sizeof(Color32) * m_width * m_height * m_depth);
-}
-
-Image::~Image()
-{
-    free();
-}
-
-const Image & Image::operator=(const Image & img)
-{
-    allocate(img.m_width, img.m_height, m_depth);
-    m_format = img.m_format;
-    memcpy(m_data, img.m_data, sizeof(Color32) * m_width * m_height * m_depth);
-    return *this;
-}
-
-
-void Image::allocate(uint w, uint h, uint d)
-{
-    free();
-    m_width = w;
-    m_height = h;
-	m_depth = d;
-    m_data = realloc<Color32>(m_data, w * h * d);
-}
-
-bool Image::load(const char * name)
-{
-    free();
-
-    AutoPtr<Image> img(ImageIO::load(name));
-    if (img == NULL) {
-        return false;
-    }
-
-    swap(m_width, img->m_width);
-    swap(m_height, img->m_height);
-	swap(m_depth, img->m_depth);
-    swap(m_format, img->m_format);
-    swap(m_data, img->m_data);
-
-    return true;
-}
-
-void Image::wrap(void * data, uint w, uint h, uint d)
-{
-    free();
-    m_data = (Color32 *)data;
-    m_width = w;
-    m_height = h;
-	m_depth = d;
-}
-
-void Image::unwrap()
-{
-    m_data = NULL;
-    m_width = 0;
-    m_height = 0;
-	m_depth = 0;
-}
-
-
-void Image::free()
-{
-    ::free(m_data);
-    m_data = NULL;
-}
-
-
-uint Image::width() const
-{
-    return m_width;
-}
-
-uint Image::height() const
-{
-    return m_height;
-}
-
-uint Image::depth() const
-{
-	return m_depth;
-}
-
-const Color32 * Image::scanline(uint h) const
-{
-    nvDebugCheck(h < m_height);
-    return m_data + h * m_width;
-}
-
-Color32 * Image::scanline(uint h)
-{
-    nvDebugCheck(h < m_height);
-    return m_data + h * m_width;
-}
-
-const Color32 * Image::pixels() const
-{
-    return m_data;
-}
-
-Color32 * Image::pixels()
-{
-    return m_data;
-}
-
-const Color32 & Image::pixel(uint idx) const
-{
-    nvDebugCheck(idx < m_width * m_height * m_depth);
-    return m_data[idx];
-}
-
-Color32 & Image::pixel(uint idx)
-{
-    nvDebugCheck(idx < m_width * m_height * m_depth);
-    return m_data[idx];
-}
-
-
-Image::Format Image::format() const
-{
-    return m_format;
-}
-
-void Image::setFormat(Image::Format f)
-{
-    m_format = f;
-}
-
-void Image::fill(Color32 c)
-{
-    const uint size = m_width * m_height * m_depth;
-    for (uint i = 0; i < size; ++i)
-    {
-        m_data[i] = c;
-    }
-}
-
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "Image.h"
+#include "ImageIO.h"
+
+#include "nvmath/Color.h"
+
+#include "nvcore/Debug.h"
+#include "nvcore/Ptr.h"
+#include "nvcore/Utils.h" // swap
+
+
+using namespace nv;
+
+Image::Image() : m_width(0), m_height(0), m_depth(0), m_format(Format_RGB), m_data(NULL)
+{   // Empty image: all three dimensions start at zero (m_depth was previously left uninitialized) and no pixel buffer is held.
+}
+
+Image::Image(const Image & img) : m_data(NULL)
+{
+	allocate(img.m_width, img.m_height, img.m_depth);
+    m_format = img.m_format;
+    memcpy(m_data, img.m_data, sizeof(Color32) * m_width * m_height * m_depth);
+}
+
+Image::~Image()
+{
+    free();
+}
+
+const Image & Image::operator=(const Image & img)
+{
+    if (this == &img) return *this; // self-assignment: allocate() would free the very pixels we are about to copy
+    allocate(img.m_width, img.m_height, img.m_depth); // size from the source, depth included, so the copy below matches its buffer
+    m_format = img.m_format;
+    memcpy(m_data, img.m_data, sizeof(Color32) * m_width * m_height * m_depth);
+    return *this;
+}
+
+void Image::allocate(uint w, uint h, uint d)
+{
+    free();
+    m_width = w;
+    m_height = h;
+	m_depth = d;
+    m_data = realloc<Color32>(m_data, w * h * d);
+}
+
+bool Image::load(const char * name)
+{
+    free();
+
+    AutoPtr<Image> img(ImageIO::load(name));
+    if (img == NULL) {
+        return false;
+    }
+
+    swap(m_width, img->m_width);
+    swap(m_height, img->m_height);
+	swap(m_depth, img->m_depth);
+    swap(m_format, img->m_format);
+    swap(m_data, img->m_data);
+
+    return true;
+}
+
+void Image::wrap(void * data, uint w, uint h, uint d)
+{
+    free();
+    m_data = (Color32 *)data;
+    m_width = w;
+    m_height = h;
+	m_depth = d;
+}
+
+void Image::unwrap()
+{
+    m_data = NULL;
+    m_width = 0;
+    m_height = 0;
+	m_depth = 0;
+}
+
+
+void Image::free()
+{
+    ::free(m_data);
+    m_data = NULL;
+}
+
+
+uint Image::width() const
+{
+    return m_width;
+}
+
+uint Image::height() const
+{
+    return m_height;
+}
+
+uint Image::depth() const
+{
+	return m_depth;
+}
+
+const Color32 * Image::scanline(uint h) const
+{
+    nvDebugCheck(h < m_height);
+    return m_data + h * m_width;
+}
+
+Color32 * Image::scanline(uint h)
+{
+    nvDebugCheck(h < m_height);
+    return m_data + h * m_width;
+}
+
+const Color32 * Image::pixels() const
+{
+    return m_data;
+}
+
+Color32 * Image::pixels()
+{
+    return m_data;
+}
+
+const Color32 & Image::pixel(uint idx) const
+{
+    nvDebugCheck(idx < m_width * m_height * m_depth);
+    return m_data[idx];
+}
+
+Color32 & Image::pixel(uint idx)
+{
+    nvDebugCheck(idx < m_width * m_height * m_depth);
+    return m_data[idx];
+}
+
+
+Image::Format Image::format() const
+{
+    return m_format;
+}
+
+void Image::setFormat(Image::Format f)
+{
+    m_format = f;
+}
+
+void Image::fill(Color32 c)
+{
+    const uint size = m_width * m_height * m_depth;
+    for (uint i = 0; i < size; ++i)
+    {
+        m_data[i] = c;
+    }
+}
+
diff --git a/src/nvimage/Image.h b/src/nvimage/Image.h
index 9161e57..729ccd4 100644
--- a/src/nvimage/Image.h
+++ b/src/nvimage/Image.h
@@ -1,86 +1,86 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#pragma once
-#ifndef NV_IMAGE_IMAGE_H
-#define NV_IMAGE_IMAGE_H
-
-#include "nvimage.h"
-#include "nvcore/Debug.h"
-
-namespace nv
-{
-    class Color32;
-
-    /// 32 bit RGBA image.
-    class NVIMAGE_CLASS Image
-    {
-    public:
-
-        enum Format 
-        {
-            Format_RGB,
-            Format_ARGB,
-        };
-
-        Image();
-        Image(const Image & img);
-        ~Image();
-
-        const Image & operator=(const Image & img);
-
-
-        void allocate(uint w, uint h, uint d = 1);
-        bool load(const char * name);
-
-        void wrap(void * data, uint w, uint h, uint d = 1);
-        void unwrap();
-
-        uint width() const;
-        uint height() const;
-        uint depth() const;
-
-        const Color32 * scanline(uint h) const;
-        Color32 * scanline(uint h);
-
-        const Color32 * pixels() const;
-        Color32 * pixels();
-
-        const Color32 & pixel(uint idx) const;
-        Color32 & pixel(uint idx);
-
-        const Color32 & pixel(uint x, uint y) const;
-        Color32 & pixel(uint x, uint y);
-
-        Format format() const;
-        void setFormat(Format f);
-
-        void fill(Color32 c);
-
-    private:
-        void free();
-
-    private:
-        uint m_width;
-        uint m_height;
-        uint m_depth;
-        Format m_format;
-        Color32 * m_data;
-    };
-
-
-    inline const Color32 & Image::pixel(uint x, uint y) const
-    {
-        nvDebugCheck(x < m_width && y < m_height);
-        return pixel(y * m_width + x);
-    }
-
-    inline Color32 & Image::pixel(uint x, uint y)
-    {
-        nvDebugCheck(x < m_width && y < m_height);
-        return pixel(y * m_width + x);
-    }
-
-} // nv namespace
-
-
-#endif // NV_IMAGE_IMAGE_H
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_IMAGE_IMAGE_H
+#define NV_IMAGE_IMAGE_H
+
+#include "nvimage.h"
+#include "nvcore/Debug.h"
+
+namespace nv
+{
+    class Color32;
+
+    /// 32 bit RGBA image.
+    class NVIMAGE_CLASS Image
+    {
+    public:
+
+        enum Format 
+        {
+            Format_RGB,
+            Format_ARGB,
+        };
+
+        Image();
+        Image(const Image & img);
+        ~Image();
+
+        const Image & operator=(const Image & img);
+
+
+        void allocate(uint w, uint h, uint d = 1);
+        bool load(const char * name);
+
+        void wrap(void * data, uint w, uint h, uint d = 1);
+        void unwrap();
+
+        uint width() const;
+        uint height() const;
+        uint depth() const;
+
+        const Color32 * scanline(uint h) const;
+        Color32 * scanline(uint h);
+
+        const Color32 * pixels() const;
+        Color32 * pixels();
+
+        const Color32 & pixel(uint idx) const;
+        Color32 & pixel(uint idx);
+
+        const Color32 & pixel(uint x, uint y) const;
+        Color32 & pixel(uint x, uint y);
+
+        Format format() const;
+        void setFormat(Format f);
+
+        void fill(Color32 c);
+
+    private:
+        void free();
+
+    private:
+        uint m_width;
+        uint m_height;
+        uint m_depth;
+        Format m_format;
+        Color32 * m_data;
+    };
+
+
+    inline const Color32 & Image::pixel(uint x, uint y) const
+    {
+        nvDebugCheck(x < m_width && y < m_height);
+        return pixel(y * m_width + x);
+    }
+
+    inline Color32 & Image::pixel(uint x, uint y)
+    {
+        nvDebugCheck(x < m_width && y < m_height);
+        return pixel(y * m_width + x);
+    }
+
+} // nv namespace
+
+
+#endif // NV_IMAGE_IMAGE_H
diff --git a/src/nvimage/ImageIO.cpp b/src/nvimage/ImageIO.cpp
index f0d5b32..9a81c00 100644
--- a/src/nvimage/ImageIO.cpp
+++ b/src/nvimage/ImageIO.cpp
@@ -1,1943 +1,1943 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#include "ImageIO.h"
-#include "Image.h"
-#include "FloatImage.h"
-#include "TgaFile.h"
-#include "PsdFile.h"
-#include "DirectDrawSurface.h"
-#include "PixelFormat.h"
-
-#include "nvmath/Color.h"
-#include "nvmath/Half.h"
-
-#include "nvcore/Ptr.h"
-#include "nvcore/Utils.h"
-#include "nvcore/Array.h"
-#include "nvcore/StrLib.h"
-#include "nvcore/StdStream.h"
-#include "nvcore/TextWriter.h"
-
-// Extern
-#if defined(HAVE_FREEIMAGE)
-#   include <FreeImage.h>
-// If FreeImage available, do not use individual libraries, since that produces link conflicts in some platforms.
-#   undef HAVE_JPEG
-#   undef HAVE_PNG
-#   undef HAVE_TIFF
-#   undef HAVE_OPENEXR
-#endif
-
-#if defined(HAVE_JPEG)
-extern "C" {
-#   include <jpeglib.h>
-}
-#endif
-
-#if defined(HAVE_PNG)
-#   include <png.h>
-#endif
-
-#if defined(HAVE_TIFF)
-#   define _TIFF_DATA_TYPEDEFS_
-#   include <tiffio.h>
-#endif
-
-#if defined(HAVE_OPENEXR)
-#   include <ImfIO.h>
-#   include <ImathBox.h>
-#   include <ImfChannelList.h>
-#   include <ImfInputFile.h>
-#   include <ImfOutputFile.h>
-#   include <ImfArray.h>
-#endif
-
-#if defined(HAVE_STBIMAGE)
-#   define STBI_NO_STDIO
-#   include <stb_image.h>
-#endif
-
-
-using namespace nv;
-
-
-
-struct Color555 {
-    uint16 b : 5;
-    uint16 g : 5;
-    uint16 r : 5;
-};
-
-// Load TGA image.
-static Image * loadTGA(Stream & s)
-{
-    nvCheck(!s.isError());
-    nvCheck(s.isLoading());
-
-    TgaHeader tga;
-    s << tga;
-    s.seek(TgaHeader::Size + tga.id_length);
-
-    // Get header info.
-    bool rle = false;
-    bool pal = false;
-    bool rgb = false;
-    bool grey = false;
-
-    switch( tga.image_type ) {
-        case TGA_TYPE_RLE_INDEXED:
-            rle = true;
-            // no break is intended!
-        case TGA_TYPE_INDEXED:
-            if( tga.colormap_type!=1 || tga.colormap_size!=24 || tga.colormap_length>256 ) {
-                nvDebug( "*** loadTGA: Error, only 24bit paletted images are supported.\n" );
-                return NULL;
-            }
-            pal = true;
-            break;
-
-	case TGA_TYPE_RLE_RGB:
-	    rle = true;
-	    // no break is intended!
-	case TGA_TYPE_RGB:
-	    rgb = true;
-	    break;
-
-	case TGA_TYPE_RLE_GREY:
-	    rle = true;
-	    // no break is intended!
-	case TGA_TYPE_GREY:
-	    grey = true;
-	    break;
-
-	default:
-	    nvDebug( "*** loadTGA: Error, unsupported image type.\n" );
-	    return NULL;
-    }
-
-    const uint pixel_size = (tga.pixel_size/8);
-    nvDebugCheck(pixel_size <= 4);
-
-    const uint size = tga.width * tga.height * pixel_size;
-
-
-    // Read palette
-    uint8 palette[768];
-    if( pal ) {
-        nvDebugCheck(tga.colormap_length <= 256);
-        s.serialize(palette, 3 * tga.colormap_length);
-    }
-
-    // Decode image.
-    uint8 * mem = new uint8[size];
-    if( rle ) {
-        // Decompress image in src.
-        uint8 * dst = mem;
-        int num = size;
-
-	while (num > 0) {
-	    // Get packet header
-	    uint8 c;
-	    s << c;
-
-	    uint count = (c & 0x7f) + 1;
-	    num -= count * pixel_size;
-
-	    if (c & 0x80) {
-		// RLE pixels.
-		uint8 pixel[4];	// uint8 pixel[pixel_size];
-		s.serialize( pixel, pixel_size );
-		do {
-		    memcpy(dst, pixel, pixel_size);
-		    dst += pixel_size;
-		} while (--count);
-	    }
-	    else {
-		// Raw pixels.
-		count *= pixel_size;
-		//file->Read8(dst, count);
-		s.serialize(dst, count);
-		dst += count;
-	    }
-	}
-    }
-    else {
-        s.serialize(mem, size);
-    }
-
-    // Allocate image.
-    AutoPtr<Image> img(new Image());
-    img->allocate(tga.width, tga.height);
-
-    int lstep;
-    Color32 * dst;
-    if( tga.flags & TGA_ORIGIN_UPPER ) {
-        lstep = tga.width;
-        dst = img->pixels();
-    }
-    else {
-        lstep = - tga.width;
-        dst = img->pixels() + (tga.height-1) * tga.width;
-    }
-
-    // Write image.
-    uint8 * src = mem;
-    if( pal ) {
-        for( int y = 0; y < tga.height; y++ ) {
-            for( int x = 0; x < tga.width; x++ ) {
-                uint8 idx = *src++;
-                dst[x].setBGRA(palette[3*idx+0], palette[3*idx+1], palette[3*idx+2], 0xFF);
-            }
-            dst += lstep;
-        }
-    }
-    else if( grey ) {
-        img->setFormat(Image::Format_ARGB);
-
-        for( int y = 0; y < tga.height; y++ ) {
-            for( int x = 0; x < tga.width; x++ ) {
-                dst[x].setBGRA(*src, *src, *src, *src);
-                src++;
-            }
-            dst += lstep;
-        }
-    }
-    else {
-
-        if( tga.pixel_size == 16 ) {
-            for( int y = 0; y < tga.height; y++ ) {
-                for( int x = 0; x < tga.width; x++ ) {
-                    Color555 c = *reinterpret_cast<Color555 *>(src);
-                    uint8 b = (c.b << 3) | (c.b >> 2);
-                    uint8 g = (c.g << 3) | (c.g >> 2);
-                    uint8 r = (c.r << 3) | (c.r >> 2);
-                    dst[x].setBGRA(b, g, r, 0xFF);
-                    src += 2;
-                }
-                dst += lstep;
-            }
-        }
-        else if( tga.pixel_size == 24 ) {
-            for( int y = 0; y < tga.height; y++ ) {
-                for( int x = 0; x < tga.width; x++ ) {
-                    dst[x].setBGRA(src[0], src[1], src[2], 0xFF);
-                    src += 3;
-                }
-                dst += lstep;
-            }
-        }
-        else if( tga.pixel_size == 32 ) {
-            img->setFormat(Image::Format_ARGB);
-
-            for( int y = 0; y < tga.height; y++ ) {
-                for( int x = 0; x < tga.width; x++ ) {
-                    dst[x].setBGRA(src[0], src[1], src[2], src[3]);
-                    src += 4;
-                }
-                dst += lstep;
-            }
-        }
-    }
-
-    // free uncompressed data.
-    delete [] mem;
-
-    return img.release();
-}
-
-// Save TGA image.
-static bool saveTGA(Stream & s, const Image * img)
-{
-    nvCheck(!s.isError());
-    nvCheck(img != NULL);
-    nvCheck(img->pixels() != NULL);
-
-    TgaFile tga;
-    tga.head.id_length = 0;
-    tga.head.colormap_type = 0;
-    tga.head.image_type = TGA_TYPE_RGB;
-
-    tga.head.colormap_index = 0;
-    tga.head.colormap_length = 0;
-    tga.head.colormap_size = 0;
-
-    tga.head.x_origin = 0;
-    tga.head.y_origin = 0;
-    tga.head.width = img->width();
-    tga.head.height = img->height();
-    if(img->format() == Image::Format_ARGB) {
-        tga.head.pixel_size = 32;
-        tga.head.flags = TGA_ORIGIN_UPPER | TGA_HAS_ALPHA;
-    }
-    else {
-        tga.head.pixel_size = 24;
-        tga.head.flags = TGA_ORIGIN_UPPER;
-    }
-
-    // @@ Serialize directly.
-    tga.allocate();
-
-    const uint n = img->width() * img->height();
-    if(img->format() == Image::Format_ARGB) {
-        for(uint i = 0; i < n; i++) {
-            Color32 color = img->pixel(i);
-            tga.mem[4 * i + 0] = color.b;
-            tga.mem[4 * i + 1] = color.g;
-            tga.mem[4 * i + 2] = color.r;
-            tga.mem[4 * i + 3] = color.a;
-        }
-    }
-    else {
-        for(uint i = 0; i < n; i++) {
-            Color32 color = img->pixel(i);
-            tga.mem[3 * i + 0] = color.b;
-            tga.mem[3 * i + 1] = color.g;
-            tga.mem[3 * i + 2] = color.r;
-        }
-    }
-
-    s << tga;
-
-    tga.free();
-
-    return true;
-}
-
-/*static Image * loadPPM(Stream & s)
-{
-    // @@
-    return NULL;
-}*/
-
-// Save PPM image.
-static bool savePPM(Stream & s, const Image * img)
-{
-    //if (img->depth() != 1) return false;
-    //if (img->format() == Image::Format_ARGB) return false;
-
-    uint w = img->width();
-    uint h = img->height();
-
-    TextWriter writer(&s);
-    writer.write("P6\n");
-    writer.write("%d %d\n", w, h);
-    writer.write("255\n");
-    for (uint i = 0; i < w * h; i++) {
-        Color32 c = img->pixel(i);
-        s << c.r << c.g << c.b;
-    }
-
-    return true;
-}
-
-
-/*static FloatImage * loadFloatPFM(Stream & s)
-{
-    return NULL;
-}*/
-
-/*static bool saveFloatPFM(Stream & s, const FloatImage * img, uint base_channel, uint channel_count)
-{
-    return false;
-}*/
-
-// Load PSD image.
-static Image * loadPSD(Stream & s)
-{
-    nvCheck(!s.isError());
-    nvCheck(s.isLoading());
-
-    s.setByteOrder(Stream::BigEndian);
-
-    PsdHeader header;
-    s << header;
-
-    if (!header.isValid())
-    {
-        printf("invalid header!\n");
-        return NULL;
-    }
-
-    if (!header.isSupported())
-    {
-        printf("unsupported file!\n");
-        return NULL;
-    }
-
-    int tmp;
-
-    // Skip mode data.
-    s << tmp;
-    s.seek(s.tell() + tmp);
-
-    // Skip image resources.
-    s << tmp;
-    s.seek(s.tell() + tmp);
-
-    // Skip the reserved data.
-    s << tmp;
-    s.seek(s.tell() + tmp);
-
-    // Find out if the data is compressed.
-    // Known values:
-    //   0: no compression
-    //   1: RLE compressed
-    uint16 compression;
-    s << compression;
-
-    if (compression > 1) {
-        // Unknown compression type.
-        return NULL;
-    }
-
-    uint channel_num = header.channel_count;
-
-    AutoPtr<Image> img(new Image());
-    img->allocate(header.width, header.height);
-
-    if (channel_num < 4)
-    {
-        // Clear the image.
-        img->fill(Color32(0, 0, 0, 0xFF));
-    }
-    else
-    {
-        // Enable alpha.
-        img->setFormat(Image::Format_ARGB);
-
-        // Ignore remaining channels.
-        channel_num = 4;
-    }
-
-
-    const uint pixel_count = header.height * header.width;
-
-    static const uint components[4] = {2, 1, 0, 3};
-
-    if (compression)
-    {
-        s.seek(s.tell() + header.height * header.channel_count * sizeof(uint16));
-
-        // Read RLE data.
-        for (uint channel = 0; channel < channel_num; channel++)
-        {
-            uint8 * ptr = (uint8 *)img->pixels() + components[channel];
-
-            uint count = 0;
-            while( count < pixel_count )
-            {
-                if (s.isAtEnd()) return NULL;
-
-                uint8 c;
-                s << c;
-
-                uint len = c;
-                if (len < 128)
-                {
-                    // Copy next len+1 bytes literally.
-                    len++;
-                    count += len;
-                    if (count > pixel_count) return NULL;
-
-                    while (len != 0)
-                    {
-                        s << *ptr;
-                        ptr += 4;
-                        len--;
-                    }
-                }
-                else if (len > 128)
-                {
-                    // Next -len+1 bytes in the dest are replicated from next source byte.
-                    // (Interpret len as a negative 8-bit int.)
-                    len ^= 0xFF;
-                    len += 2;
-                    count += len;
-                    if (s.isAtEnd() || count > pixel_count) return NULL;
-
-                    uint8 val;
-                    s << val;
-                    while( len != 0 ) {
-                        *ptr = val;
-                        ptr += 4;
-                        len--;
-                    }
-                }
-                else if( len == 128 ) {
-                    // No-op.
-                }
-            }
-        }
-    }
-    else
-    {
-        // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...)
-        // where each channel consists of an 8-bit value for each pixel in the image.
-
-        // Read the data by channel.
-        for (uint channel = 0; channel < channel_num; channel++)
-        {
-            uint8 * ptr = (uint8 *)img->pixels() + components[channel];
-
-            // Read the data.
-            uint count = pixel_count;
-            while (count != 0)
-            {
-                s << *ptr;
-                ptr += 4;
-                count--;
-            }
-        }
-    }
-
-    return img.release();
-}
-
-static FloatImage * loadFloatDDS(Stream & s)
-{
-    nvCheck(s.isLoading());
-    nvCheck(!s.isError());
-
-    DDSHeader header;
-    s << header;
-
-    static const uint D3DFMT_A16B16G16R16F = 113;
-
-    // @@ We only support RGBA16F for now.
-    if (header.pf.fourcc == D3DFMT_A16B16G16R16F) {
-        const int size = header.width * header.height;
-        uint16 * const data = new uint16[size * 4];
-
-        s.serialize(data, size * 4 * sizeof(uint16));
-
-        FloatImage * img = new FloatImage;
-        img->allocate(4, header.width, header.height);
-
-        uint32 * r = (uint32 *)img->channel(0);
-        uint32 * g = (uint32 *)img->channel(1);
-        uint32 * b = (uint32 *)img->channel(2);
-        uint32 * a = (uint32 *)img->channel(3);
-
-        uint16 * ptr = data;
-        for (int i = 0; i < size; i++) {
-            *r++ = half_to_float( *ptr++ );
-            *g++ = half_to_float( *ptr++ );
-            *b++ = half_to_float( *ptr++ );
-            *a++ = half_to_float( *ptr++ );
-        }
-
-        delete [] data;
-
-        return img;
-    }
-
-    return NULL;
-}
-
-static bool saveFloatDDS(Stream & s, const FloatImage * img, uint base_component, uint num_components)
-{
-    nvCheck(s.isSaving());
-    nvCheck(!s.isError());
-
-    if (num_components != 4) return false;
-
-    static const uint D3DFMT_A16B16G16R16F = 113;
-
-    DDSHeader header;
-    header.setTexture2D();
-    header.setWidth(img->width());
-    header.setHeight(img->height());
-    header.setFormatCode(D3DFMT_A16B16G16R16F);
-    // ...
-
-    s << header;
-
-    uint32 * r = (uint32 *)img->channel(base_component + 0);
-    uint32 * g = (uint32 *)img->channel(base_component + 1);
-    uint32 * b = (uint32 *)img->channel(base_component + 2);
-    uint32 * a = (uint32 *)img->channel(base_component + 3);
-
-    const uint size = img->width() * img->height();
-    for (uint i = 0; i < size; i++) {
-        uint16 R = half_from_float( *r++ );
-        uint16 G = half_from_float( *g++ );
-        uint16 B = half_from_float( *b++ );
-        uint16 A = half_from_float( *a++ );
-
-        s.serialize(&R, sizeof(uint16));
-        s.serialize(&G, sizeof(uint16));
-        s.serialize(&B, sizeof(uint16));
-        s.serialize(&A, sizeof(uint16));
-    }
-
-    return true;
-}
-
-
-#if defined(HAVE_PNG)
-
-static void user_read_data(png_structp png_ptr, png_bytep data, png_size_t length)
-{
-    nvDebugCheck(png_ptr != NULL);
-
-    Stream * s = (Stream *)png_get_io_ptr(png_ptr);
-    s->serialize(data, (int)length);
-
-    if (s->isError()) {
-        png_error(png_ptr, "Read Error");
-    }
-}
-
-
-static Image * loadPNG(Stream & s)
-{
-    nvCheck(!s.isError());
-
-    // Set up a read buffer and check the library version
-    png_structp png_ptr;
-    png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
-    if (png_ptr == NULL) {
-        //	nvDebug( "*** LoadPNG: Error allocating read buffer in file '%s'.\n", name );
-        return false;
-    }
-
-    // Allocate/initialize a memory block for the image information
-    png_infop info_ptr = png_create_info_struct(png_ptr);
-    if (info_ptr == NULL) {
-        png_destroy_read_struct(&png_ptr, NULL, NULL);
-        //	nvDebug( "*** LoadPNG: Error allocating image information for '%s'.\n", name );
-        return false;
-    }
-
-    // Set up the error handling
-    if (setjmp(png_jmpbuf(png_ptr))) {
-        png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
-        //	nvDebug( "*** LoadPNG: Error reading png file '%s'.\n", name );
-        return false;
-    }
-
-    // Set up the I/O functions.
-    png_set_read_fn(png_ptr, (void*)&s, user_read_data);
-
-
-    // Retrieve the image header information
-    png_uint_32 width, height;
-    int bit_depth, color_type, interlace_type;
-    png_read_info(png_ptr, info_ptr);
-    png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type, &interlace_type, NULL, NULL);
-
-
-    if (color_type == PNG_COLOR_TYPE_PALETTE && bit_depth <= 8) {
-        // Convert indexed images to RGB.
-        png_set_expand(png_ptr);
-    }
-    else if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) {
-        // Convert grayscale to RGB.
-        png_set_expand(png_ptr);
-    }
-    else if (png_get_valid(png_ptr, info_ptr, PNG_INFO_tRNS)) {
-        // Expand images with transparency to full alpha channels
-        // so the data will be available as RGBA quartets.
-        png_set_expand(png_ptr);
-    }
-    else if (bit_depth < 8) {
-        // If we have < 8 scale it up to 8.
-        //png_set_expand(png_ptr);
-        png_set_packing(png_ptr);
-    }
-
-    // Reduce bit depth.
-    if (bit_depth == 16) {
-        png_set_strip_16(png_ptr);
-    }
-
-    // Represent gray as RGB
-    if (color_type == PNG_COLOR_TYPE_GRAY || color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
-        png_set_gray_to_rgb(png_ptr);
-    }
-
-    // Convert to RGBA filling alpha with 0xFF.
-    if (!(color_type & PNG_COLOR_MASK_ALPHA)) {
-        png_set_filler(png_ptr, 0xFF, PNG_FILLER_AFTER);
-    }
-
-    // @todo Choose gamma according to the platform?
-    double screen_gamma = 2.2;
-    int intent;
-    if (png_get_sRGB(png_ptr, info_ptr, &intent)) {
-        png_set_gamma(png_ptr, screen_gamma, 0.45455);
-    }
-    else {
-        double image_gamma;
-        if (png_get_gAMA(png_ptr, info_ptr, &image_gamma)) {
-            png_set_gamma(png_ptr, screen_gamma, image_gamma);
-        }
-        else {
-            png_set_gamma(png_ptr, screen_gamma, 0.45455);
-        }
-    }
-
-    // Perform the selected transforms.
-    png_read_update_info(png_ptr, info_ptr);
-
-    png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type, &interlace_type, NULL, NULL);
-
-    AutoPtr<Image> img(new Image());
-    img->allocate(width, height);
-
-    // Set internal format flags.
-    if(color_type & PNG_COLOR_MASK_COLOR) {
-        //img->flags |= PI_IF_HAS_COLOR;
-    }
-    if(color_type & PNG_COLOR_MASK_ALPHA) {
-        //img->flags |= PI_IF_HAS_ALPHA;
-        img->setFormat(Image::Format_ARGB);
-    }
-
-    // Read the image
-    uint8 * pixels = (uint8 *)img->pixels();
-    png_bytep * row_data = new png_bytep[sizeof(png_byte) * height];
-    for (uint i = 0; i < height; i++) {
-        row_data[i] = &(pixels[width * 4 * i]);
-    }
-
-    png_read_image(png_ptr, row_data);
-    delete [] row_data;
-
-    // Finish things up
-    png_read_end(png_ptr, info_ptr);
-    png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
-
-    // RGBA to BGRA.
-    uint num = width * height;
-    for(uint i = 0; i < num; i++)
-    {
-        Color32 c = img->pixel(i);
-        img->pixel(i) = Color32(c.b, c.g, c.r, c.a);
-    }
-
-    // Compute alpha channel if needed.
-    /*if( img->flags & PI_IU_BUMPMAP || img->flags & PI_IU_ALPHAMAP ) {
-        if( img->flags & PI_IF_HAS_COLOR && !(img->flags & PI_IF_HAS_ALPHA)) {
-            img->ComputeAlphaFromColor();
-        }
-    }*/
-
-    return img.release();
-}
-
-static void user_write_data(png_structp png_ptr, png_bytep data, png_size_t length)
-{
-    nvDebugCheck(png_ptr != NULL);
-
-    Stream * s = (Stream *)png_get_io_ptr(png_ptr);
-    s->serialize(data, (int)length);
-
-    if (s->isError()) {
-        png_error(png_ptr, "Write Error");
-    }
-}
-
-static void user_write_flush(png_structp png_ptr) { }
-
-static bool savePNG(Stream & s, const Image * img, const char ** tags/*=NULL*/)
-{
-    nvCheck(!s.isError());
-    nvCheck(img != NULL);
-    nvCheck(img->pixels() != NULL);
-
-    // Set up a write buffer and check the library version
-    png_structp png_ptr;
-    png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
-    if (png_ptr == NULL) {
-        return false;
-    }
-
-    // Allocate/initialize a memory block for the image information
-    png_infop info_ptr = png_create_info_struct(png_ptr);
-    if (info_ptr == NULL) {
-        png_destroy_write_struct(&png_ptr, NULL);
-        return false;
-    }
-
-    // Set up the error handling
-    if (setjmp(png_jmpbuf(png_ptr))) {
-        png_destroy_write_struct(&png_ptr, &info_ptr);
-        return false;
-    }
-
-    // Set up the I/O functions.
-    png_set_write_fn(png_ptr, (void*)&s, user_write_data, user_write_flush);
-
-    // Set image header information
-    int color_type = PNG_COLOR_TYPE_RGBA;
-    switch(img->format())
-    {
-        case Image::Format_RGB:		color_type = PNG_COLOR_TYPE_RGB; break;
-        case Image::Format_ARGB:	color_type = PNG_COLOR_TYPE_RGBA; break;
-    }
-    png_set_IHDR(png_ptr, info_ptr, img->width(), img->height(),
-        8, color_type, PNG_INTERLACE_NONE,
-        PNG_COMPRESSION_TYPE_DEFAULT,
-        PNG_FILTER_TYPE_DEFAULT);
-
-    // Set image data
-    png_bytep * row_data = new png_bytep[sizeof(png_byte) * img->height()];
-    for (uint i = 0; i < img->height(); i++) {
-        row_data[i] = (png_byte*)img->scanline (i);
-        if (img->format() == Image::Format_RGB) row_data[i]--; // This is a bit of a hack, libpng expects images in ARGB format not BGRA, it supports BGR swapping, but not alpha swapping.
-    }
-    png_set_rows(png_ptr, info_ptr, row_data);
-
-    png_text * text = NULL;
-    if (tags != NULL)
-    {
-        int count = 0;
-        while(tags[2 * count] != NULL) count++;
-
-        text = new png_text[count];
-        memset(text, 0, count * sizeof(png_text);
-
-        for (int i = 0; i < count; i++) {
-            text[i].compression = PNG_TEXT_COMPRESSION_NONE;
-            text[i].key = tags[2 * i + 0];
-            text[i].text = tags[2 * i + 1];
-        }
-
-        png_set_text(png_ptr, info_ptr, text, count);
-    }
-
-    png_write_png(png_ptr, info_ptr,
-        // component order is BGR(A)
-        PNG_TRANSFORM_BGR |
-        // Strip alpha byte for RGB images
-        (img->format() == Image::Format_RGB ? PNG_TRANSFORM_STRIP_FILLER : 0)
-        , NULL);
-
-    // Finish things up
-    png_destroy_write_struct(&png_ptr, &info_ptr);
-
-    delete [] row_data;
-    delete [] text;
-
-    return true;
-}
-
-#endif // defined(HAVE_PNG)
-
-#if defined(HAVE_JPEG)
-
-static void init_source (j_decompress_ptr /*cinfo*/){
-}
-
-static boolean fill_input_buffer (j_decompress_ptr cinfo) {
-    struct jpeg_source_mgr * src = cinfo->src;
-    static JOCTET FakeEOI[] = { 0xFF, JPEG_EOI };
-
-    // Generate warning
-    nvDebug("jpeglib: Premature end of file\n");
-
-    // Insert a fake EOI marker
-    src->next_input_byte = FakeEOI;
-    src->bytes_in_buffer = 2;
-
-    return TRUE;
-}
-
-static void skip_input_data (j_decompress_ptr cinfo, long num_bytes) {
-    struct jpeg_source_mgr * src = cinfo->src;
-
-    if(num_bytes >= (long)src->bytes_in_buffer) {
-        fill_input_buffer(cinfo);
-        return;
-    }
-
-    src->bytes_in_buffer -= num_bytes;
-    src->next_input_byte += num_bytes;
-}
-
-static void term_source (j_decompress_ptr /*cinfo*/){
-    // no work necessary here
-}
-
-
-static Image * loadJPG(Stream & s)
-{
-    nvCheck(!s.isError());
-
-    // Read the entire file.
-    Array<uint8> byte_array;
-    byte_array.resize(s.size());
-    s.serialize(byte_array.buffer(), s.size());
-
-    jpeg_decompress_struct cinfo;
-    jpeg_error_mgr jerr;
-
-    cinfo.err = jpeg_std_error(&jerr);
-    jpeg_create_decompress(&cinfo);
-
-    cinfo.src = (struct jpeg_source_mgr *) (*cinfo.mem->alloc_small)
-                ((j_common_ptr) &cinfo, JPOOL_PERMANENT, sizeof(struct jpeg_source_mgr));
-    cinfo.src->init_source = init_source;
-    cinfo.src->fill_input_buffer = fill_input_buffer;
-    cinfo.src->skip_input_data = skip_input_data;
-    cinfo.src->resync_to_restart = jpeg_resync_to_restart;	// use default method
-    cinfo.src->term_source = term_source;
-    cinfo.src->bytes_in_buffer = byte_array.size();
-    cinfo.src->next_input_byte = byte_array.buffer();
-
-    jpeg_read_header(&cinfo, TRUE);
-    jpeg_start_decompress(&cinfo);
-
-    /*
-    cinfo.do_fancy_upsampling = FALSE;	// fast decompression
-    cinfo.dct_method = JDCT_FLOAT;			// Choose floating point DCT method.
-    */
-
-    uint8 * tmp_buffer = new uint8 [cinfo.output_width * cinfo.output_height * cinfo.num_components];
-    uint8 * scanline = tmp_buffer;
-
-    while( cinfo.output_scanline < cinfo.output_height ){
-        int num_scanlines = jpeg_read_scanlines (&cinfo, &scanline, 1);
-        scanline += num_scanlines * cinfo.output_width * cinfo.num_components;
-    }
-
-    jpeg_finish_decompress(&cinfo);
-
-    AutoPtr<Image> img(new Image());
-    img->allocate(cinfo.output_width, cinfo.output_height);
-
-    Color32 * dst = img->pixels();
-    const int size = img->height() * img->width();
-    const uint8 * src = tmp_buffer;
-
-    if( cinfo.num_components == 3 ) {
-        img->setFormat(Image::Format_RGB);
-        for( int i = 0; i < size; i++ ) {
-            *dst++ = Color32(src[0], src[1], src[2]);
-            src += 3;
-        }
-    }
-    else {
-        img->setFormat(Image::Format_ARGB);
-        for( int i = 0; i < size; i++ ) {
-            *dst++ = Color32(*src, *src, *src, *src);
-            src++;
-        }
-    }
-
-    delete [] tmp_buffer;
-    jpeg_destroy_decompress (&cinfo);
-
-    return img.release();
-}
-
-#endif // defined(HAVE_JPEG)
-
-#if defined(HAVE_TIFF)
-
-/*
-static tsize_t tiffReadWriteProc(thandle_t h, tdata_t ptr, tsize_t size)
-{
-    Stream * s = (Stream *)h;
-    nvDebugCheck(s != NULL);
-
-    s->serialize(ptr, size);
-
-    return size;
-}
-
-static toff_t tiffSeekProc(thandle_t h, toff_t offset, int whence)
-{
-    Stream * s = (Stream *)h;
-    nvDebugCheck(s != NULL);
-
-    if (!s->isSeekable())
-    {
-        return (toff_t)-1;
-    }
-
-    if (whence == SEEK_SET)
-    {
-        s->seek(offset);
-    }
-    else if (whence == SEEK_CUR)
-    {
-        s->seek(s->tell() + offset);
-    }
-    else if (whence == SEEK_END)
-    {
-        s->seek(s->size() + offset);
-    }
-
-    return s->tell();
-}
-
-static int tiffCloseProc(thandle_t)
-{
-    return 0;
-}
-
-static toff_t tiffSizeProc(thandle_t h)
-{
-    Stream * s = (Stream *)h;
-    nvDebugCheck(s != NULL);
-    return s->size();
-}
-
-static int tiffMapFileProc(thandle_t, tdata_t*, toff_t*)
-{
-    // @@ TODO, Implement these functions.
-    return -1;
-}
-
-static void tiffUnmapFileProc(thandle_t, tdata_t, toff_t)
-{
-    // @@ TODO, Implement these functions.
-}
-*/
-
-static FloatImage * loadFloatTIFF(const char * fileName, Stream & s)
-{
-    nvCheck(!s.isError());
-
-    TIFF * tif = TIFFOpen(fileName, "r");
-    //TIFF * tif = TIFFClientOpen(fileName, "r", &s, tiffReadWriteProc, tiffReadWriteProc, tiffSeekProc, tiffCloseProc, tiffSizeProc, tiffMapFileProc, tiffUnmapFileProc);
-
-    if (!tif)
-    {
-        nvDebug("Can't open '%s' for reading\n", fileName);
-        return NULL;
-    }
-
-    ::uint16 spp, bpp, format;
-    ::uint32 width, height;
-    TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height);
-    TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width);
-    TIFFGetField(tif, TIFFTAG_BITSPERSAMPLE, &bpp);
-    TIFFGetField(tif, TIFFTAG_SAMPLESPERPIXEL, &spp);
-    TIFFGetField(tif, TIFFTAG_SAMPLEFORMAT, &format);
-
-    if (bpp != 8 && bpp != 16 && bpp != 32) {
-        nvDebug("Can't load '%s', only 1 sample per pixel supported\n", fileName);
-        TIFFClose(tif);
-        return NULL;
-    }
-
-    AutoPtr<FloatImage> fimage(new FloatImage());
-    fimage->allocate(spp, width, height);
-
-    int linesize = TIFFScanlineSize(tif);
-    tdata_t buf = malloc<uint8>(linesize);
-
-    for (uint y = 0; y < height; y++)
-    {
-        TIFFReadScanline(tif, buf, y, 0);
-
-	for (uint c=0; c<spp; c++ )
-	{
-	    float * dst = fimage->scanline(y, c);
-
-	    for(uint x = 0; x < width; x++)
-	    {
-		if (bpp == 8)
-		{
-			dst[x] = float(((::uint8 *)buf)[x*spp+c]) / float(0xFF);
-		}
-		else if (bpp == 16)
-		{
-			dst[x] = float(((::uint16 *)buf)[x*spp+c]) / float(0xFFFF);
-		}
-		else if (bpp == 32)
-		{
-		    if (format==SAMPLEFORMAT_IEEEFP)
-		    {
-			dst[x] = float(((float *)buf)[x*spp+c]);
-		    }
-		    else
-		    {
-			dst[x] = float(((::uint32 *)buf)[x*spp+c] >> 8) / float(0xFFFFFF);
-		    }
-		}
-	    }
-	}
-    }
-
-    free(buf);
-
-    TIFFClose(tif);
-
-    return fimage.release();
-}
-
-static bool saveFloatTIFF(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components)
-{
-    nvCheck(fileName != NULL);
-    nvCheck(fimage != NULL);
-    nvCheck(base_component + num_components <= fimage->componentCount());
-
-    const int iW = fimage->width();
-    const int iH = fimage->height();
-    const int iC = num_components;
-
-    TIFF * image = TIFFOpen(fileName, "w");
-
-    // Open the TIFF file
-    if (image == NULL)
-    {
-        nvDebug("Could not open '%s' for writing\n", fileName);
-        return false;
-    }
-
-    TIFFSetField(image, TIFFTAG_IMAGEWIDTH,  iW);
-    TIFFSetField(image, TIFFTAG_IMAGELENGTH, iH);
-    TIFFSetField(image, TIFFTAG_SAMPLESPERPIXEL, iC);
-    TIFFSetField(image, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_IEEEFP);
-    TIFFSetField(image, TIFFTAG_BITSPERSAMPLE, 32);
-
-    uint32 rowsperstrip = TIFFDefaultStripSize(image, (uint32)-1);
-
-    TIFFSetField(image, TIFFTAG_ROWSPERSTRIP, rowsperstrip);
-    TIFFSetField(image, TIFFTAG_COMPRESSION, COMPRESSION_PACKBITS);
-    if (num_components == 3)
-    {
-        // Set this so that it can be visualized with pfstools.
-        TIFFSetField(image, TIFFTAG_PHOTOMETRIC, PHOTOMETRIC_RGB);
-    }
-    TIFFSetField(image, TIFFTAG_ORIENTATION, ORIENTATION_TOPLEFT);
-    TIFFSetField(image, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG);
-
-    float * scanline = new float[iW * iC];
-    for (int y = 0; y < iH; y++)
-    {
-        for (int c = 0; c < iC; c++)
-        {
-            const float * src = fimage->scanline(y, base_component + c);
-            for (int x = 0; x < iW; x++) scanline[x * iC + c] = src[x];
-        }
-        if (TIFFWriteScanline(image, scanline, y, 0)==-1)
-        {
-            nvDebug("Error writing scanline %d\n", y);
-            return false;
-        }
-    }
-    delete [] scanline;
-
-    // Close the file
-    TIFFClose(image);
-    return true;
-}
-
-#endif // defined(HAVE_TIFF)
-
-#if defined(HAVE_OPENEXR)
-
-namespace
-{
-    class ExrStream : public Imf::IStream
-    {
-    public:
-        ExrStream(const char * name, Stream & s) : Imf::IStream(name), m_stream(s)
-        {
-            nvDebugCheck(s.isLoading());
-        }
-
-	virtual bool read(char c[], int n)
-	{
-	    m_stream.serialize(c, n);
-
-	    if (m_stream.isError())
-	    {
-		throw Iex::InputExc("I/O error.");
-	    }
-
-	    return m_stream.isAtEnd();
-	}
-
-	virtual Imf::Int64 tellg()
-	{
-	    return m_stream.tell();
-	}
-
-	virtual void seekg(Imf::Int64 pos)
-	{
-	    nvDebugCheck(pos >= 0 && pos < UINT_MAX);
-	    m_stream.seek((uint)pos);
-	}
-
-	virtual void clear()
-	{
-	    m_stream.clearError();
-	}
-
-    private:
-        Stream & m_stream;
-    };
-
-    static int channelIndexFromName(const char* name)
-    {
-        char c = tolower(name[0]);
-        switch (c)
-        {
-        default:
-        case 'r':
-            return 0;
-        case 'g':
-            return 1;
-        case 'b':
-            return 2;
-        case 'a':
-            return 3;
-        }
-    }
-
-} // namespace
-
-static FloatImage * loadFloatEXR(const char * fileName, Stream & s)
-{
-    nvCheck(s.isLoading());
-    nvCheck(!s.isError());
-
-    ExrStream stream(fileName, s);
-    Imf::InputFile inputFile(stream);
-
-    Imath::Box2i box = inputFile.header().dataWindow();
-
-    int width = box.max.x - box.min.y + 1;
-    int height = box.max.x - box.min.y + 1;
-
-    const Imf::ChannelList & channels = inputFile.header().channels();
-
-    // Count channels.
-    uint channelCount= 0;
-    for (Imf::ChannelList::ConstIterator it = channels.begin(); it != channels.end(); ++it)
-    {
-        channelCount++;
-    }
-
-    // Allocate FloatImage.
-    AutoPtr<FloatImage> fimage(new FloatImage());
-    fimage->allocate(channelCount, width, height);
-
-    // Describe image's layout with a framebuffer.
-    Imf::FrameBuffer frameBuffer;
-    uint i = 0;
-    for (Imf::ChannelList::ConstIterator it = channels.begin(); it != channels.end(); ++it, ++i)
-    {
-        int channelIndex = channelIndexFromName(it.name());
-        frameBuffer.insert(it.name(), Imf::Slice(Imf::FLOAT, (char *)fimage->channel(channelIndex), sizeof(float), sizeof(float) * width));
-    }
-
-    // Read it.
-    inputFile.setFrameBuffer (frameBuffer);
-    inputFile.readPixels (box.min.y, box.max.y);
-
-    return fimage.release();
-}
-
-static bool saveFloatEXR(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components)
-{
-    nvCheck(fileName != NULL);
-    nvCheck(fimage != NULL);
-    nvCheck(base_component + num_components <= fimage->componentCount());
-    nvCheck(num_components > 0 && num_components <= 4);
-
-    const int w = fimage->width();
-    const int h = fimage->height();
-
-    const char * channelNames[] = {"R", "G", "B", "A"};
-
-    Imf::Header header (w, h);
-
-    for (uint c = 0; c < num_components; c++)
-    {
-        header.channels().insert(channelNames[c], Imf::Channel(Imf::FLOAT));
-    }
-
-    Imf::OutputFile file(fileName, header);
-    Imf::FrameBuffer frameBuffer;
-
-    for (uint c = 0; c < num_components; c++)
-    {
-        char * channel = (char *) fimage->channel(base_component + c);
-        frameBuffer.insert(channelNames[c], Imf::Slice(Imf::FLOAT, channel, sizeof(float), sizeof(float) * w));
-    }
-
-    file.setFrameBuffer(frameBuffer);
-    file.writePixels(h);
-
-    return true;
-}
-
-#endif // defined(HAVE_OPENEXR)
-
-
-#if defined(HAVE_FREEIMAGE)
-
-static unsigned DLL_CALLCONV ReadProc(void *buffer, unsigned size, unsigned count, fi_handle handle)
-{
-    Stream * s = (Stream *) handle;
-    s->serialize(buffer, size * count);
-    return count;
-}
-
-static unsigned DLL_CALLCONV WriteProc(void *buffer, unsigned size, unsigned count, fi_handle handle)
-{
-    Stream * s = (Stream *) handle;
-    s->serialize(buffer, size * count);
-    return count;
-}
-
-static int DLL_CALLCONV SeekProc(fi_handle handle, long offset, int origin)
-{
-    Stream * s = (Stream *) handle;
-
-    switch(origin) {
-        case SEEK_SET :
-            s->seek(offset);
-            break;
-        case SEEK_END :
-            s->seek(s->size() + offset);
-            break;
-        case SEEK_CUR :
-            s->seek(s->tell() + offset);
-            break;
-        default :
-            return 1;
-    }
-
-    return 0;
-}
-
-static long DLL_CALLCONV TellProc(fi_handle handle)
-{
-    Stream * s = (Stream *) handle;
-    return s->tell();
-}
-
-
-Image * nv::ImageIO::loadFreeImage(FREE_IMAGE_FORMAT fif, Stream & s)
-{
-    nvCheck(!s.isError());
-
-    FreeImageIO io;
-    io.read_proc = ReadProc;
-    io.write_proc = NULL;
-    io.seek_proc = SeekProc;
-    io.tell_proc = TellProc;
-
-    FIBITMAP * bitmap = FreeImage_LoadFromHandle(fif, &io, (fi_handle)&s, 0);
-
-    if (bitmap == NULL)
-    {
-        return NULL;
-    }
-
-    const int w = FreeImage_GetWidth(bitmap);
-    const int h = FreeImage_GetHeight(bitmap);
-
-    if (FreeImage_GetImageType(bitmap) != FIT_BITMAP)
-    {
-        // @@ Use tone mapping?
-        FIBITMAP * tmp = FreeImage_ConvertToType(bitmap, FIT_BITMAP, true);
-        FreeImage_Unload(bitmap);
-        bitmap = tmp;
-    }
-
-    nvDebugCheck(FreeImage_GetImageType(bitmap) == FIT_BITMAP);
-    if (FreeImage_GetBPP(bitmap) != 32)
-    {
-        FIBITMAP * tmp = FreeImage_ConvertTo32Bits(bitmap);
-        FreeImage_Unload(bitmap);
-        bitmap = tmp;
-    }
-
-
-    Image * image = new Image();
-    image->allocate(w, h, 1); // freeimage can only load 2d images:
-
-    // Copy the image over to our internal format, FreeImage has the scanlines bottom to top though.
-    for (int y=0; y < h; y++)
-    {
-        const void * src = FreeImage_GetScanLine(bitmap, h - y - 1);
-        void * dst = image->scanline(y);
-
-        memcpy(dst, src, 4 * w);
-    }
-
-    FreeImage_Unload(bitmap);
-
-    return image;
-}
-
-FloatImage * nv::ImageIO::loadFloatFreeImage(FREE_IMAGE_FORMAT fif, Stream & s)
-{
-    nvCheck(!s.isError());
-
-    FreeImageIO io;
-    io.read_proc = ReadProc;
-    io.write_proc = NULL;
-    io.seek_proc = SeekProc;
-    io.tell_proc = TellProc;
-
-    FIBITMAP * bitmap = FreeImage_LoadFromHandle(fif, &io, (fi_handle)&s, 0);
-
-    if (bitmap == NULL)
-    {
-        return NULL;
-    }
-
-    const int w = FreeImage_GetWidth(bitmap);
-    const int h = FreeImage_GetHeight(bitmap);
-
-    FREE_IMAGE_TYPE fit = FreeImage_GetImageType(bitmap);
-
-    FloatImage * floatImage = new FloatImage();
-
-    switch (fit)
-    {
-        case FIT_BITMAP:
-            floatImage->allocate(4, w, h);
-            {
-                FIBITMAP * tmp = FreeImage_ConvertTo32Bits(bitmap);
-
-                uint bitcount = FreeImage_GetBPP(bitmap);
-                uint byteCount = bitcount / 8;
-
-                for (int y=0; y < h; y++)
-                {
-                    const Color32 * src = (const Color32 *)FreeImage_GetScanLine(bitmap, h - y - 1 );
-
-                    float * r = floatImage->scanline(y, 0);
-                    float * g = floatImage->scanline(y, 1);
-                    float * b = floatImage->scanline(y, 2);
-                    float * a = floatImage->scanline(y, 3);
-
-                    for (int x=0; x < w; x++)
-                    {
-                        r[x] = float(src[x].r) / 255.0f;
-                        g[x] = float(src[x].g) / 255.0f;
-                        b[x] = float(src[x].b) / 255.0f;
-                        a[x] = float(src[x].a) / 255.0f;
-                    }
-
-                    src += byteCount;
-                }
-
-                FreeImage_Unload(tmp);
-            }
-            break;
-        case FIT_FLOAT:
-            floatImage->allocate(1, w, h);
-
-            for (int y=0; y < h; y++)
-            {
-                const float * src = (const float *)FreeImage_GetScanLine(bitmap, h - y - 1 );
-                float * dst = floatImage->scanline(y, 0);
-
-                for (int x=0; x < w; x++)
-                {
-                    dst[x] = src[x];
-                }
-            }
-            break;
-        case FIT_UINT16:
-            floatImage->allocate(1, w, h);
-
-            for (int y=0; y < h; y++)
-            {
-                const uint16 * src = (const uint16 *)FreeImage_GetScanLine(bitmap, h - y - 1 );
-                float * dst = floatImage->scanline(y, 0);
-
-                for (int x=0; x < w; x++)
-                {
-                    dst[x] = float(src[x]) / 65535;
-                }
-            }
-            break;
-        case FIT_COMPLEX:
-            floatImage->allocate(2, w, h);
-
-            for (int y=0; y < h; y++)
-            {
-                const FICOMPLEX * src = (const FICOMPLEX *)FreeImage_GetScanLine(bitmap, h - y - 1 );
-
-                float * dst_real = floatImage->scanline(y, 0);
-                float * dst_imag = floatImage->scanline(y, 1);
-
-                for (int x=0; x < w; x++)
-                {
-                    dst_real[x] = (float)src[x].r;
-                    dst_imag[x] = (float)src[x].i;
-                }
-            }
-            break;
-        case FIT_RGBF:
-            floatImage->allocate(3, w, h);
-
-            for (int y=0; y < h; y++)
-            {
-                const FIRGBF * src = (const FIRGBF *)FreeImage_GetScanLine(bitmap, h - y - 1 );
-
-                float * dst_red = floatImage->scanline(y, 0);
-                float * dst_green = floatImage->scanline(y, 1);
-                float * dst_blue = floatImage->scanline(y, 2);
-
-                for (int x=0; x < w; x++)
-                {
-                    dst_red[x] = src[x].red;
-                    dst_green[x] = src[x].green;
-                    dst_blue[x] = src[x].blue;
-                }
-            }
-            break;
-        case FIT_RGBAF:
-            floatImage->allocate(4, w, h);
-
-            for (int y=0; y < h; y++)
-            {
-                const FIRGBAF * src = (const FIRGBAF *)FreeImage_GetScanLine(bitmap, h - y - 1 );
-
-                float * dst_red = floatImage->scanline(y, 0);
-                float * dst_green = floatImage->scanline(y, 1);
-                float * dst_blue = floatImage->scanline(y, 2);
-                float * dst_alpha = floatImage->scanline(y, 3);
-
-                for (int x=0; x < w; x++)
-                {
-                    dst_red[x] = src[x].red;
-                    dst_green[x] = src[x].green;
-                    dst_blue[x] = src[x].blue;
-                    dst_alpha[x] = src[x].alpha;
-                }
-            }
-            break;
-        default:
-            delete floatImage;
-            floatImage = NULL;
-    }
-
-    FreeImage_Unload(bitmap);
-
-    return floatImage;
-}
-
-bool nv::ImageIO::saveFreeImage(FREE_IMAGE_FORMAT fif, Stream & s, const Image * img, const char ** tags)
-{
-    nvCheck(!s.isError());
-
-    FreeImageIO io;
-    io.read_proc = NULL;
-    io.write_proc = WriteProc;
-    io.seek_proc = SeekProc;
-    io.tell_proc = TellProc;
-
-    const uint w = img->width();
-    const uint h = img->height();
-
-    FIBITMAP * bitmap = FreeImage_Allocate(w, h, 32);
-
-    for (uint i = 0; i < h; i++)
-    {
-        uint8 * scanline = FreeImage_GetScanLine(bitmap, i);
-        memcpy(scanline, img->scanline(h - i - 1), w * sizeof(Color32));
-    }
-
-    if (tags != NULL)
-    {
-    #pragma NV_MESSAGE("TODO: Save image metadata")
-        //FreeImage_SetMetadata(
-    }
-
-    bool result = FreeImage_SaveToHandle(fif, bitmap, &io, (fi_handle)&s, 0) != 0;
-
-    FreeImage_Unload(bitmap);
-
-    return result;
-}
-
-bool nv::ImageIO::saveFloatFreeImage(FREE_IMAGE_FORMAT fif, Stream & s, const FloatImage * img, uint baseComponent, uint componentCount)
-{
-    nvCheck(!s.isError());
-
-    FreeImageIO io;
-    io.read_proc = NULL;
-    io.write_proc = WriteProc;
-    io.seek_proc = SeekProc;
-    io.tell_proc = TellProc;
-
-    const uint w = img->width();
-    const uint h = img->height();
-
-    FREE_IMAGE_TYPE type;
-    if (componentCount == 1)
-    {
-        type = FIT_FLOAT;
-    }
-    else if (componentCount == 3)
-    {
-        type = FIT_RGBF;
-    }
-    else if (componentCount == 4)
-    {
-        type = FIT_RGBAF;
-    }
-    else {
-        return false;
-    }
-
-
-    FIBITMAP * bitmap = FreeImage_AllocateT(type, w, h);
-
-    for (uint y = 0; y < h; y++)
-    {
-        float * scanline = (float *)FreeImage_GetScanLine(bitmap, y);
-
-        for (uint x = 0; x < w; x++)
-        {
-            for (uint c = 0; c < componentCount; c++)
-            {
-                scanline[x * componentCount + c] = img->pixel(x, y, baseComponent + c);
-            }
-        }
-    }
-
-    bool result = FreeImage_SaveToHandle(fif, bitmap, &io, (fi_handle)&s, 0) != 0;
-
-    FreeImage_Unload(bitmap);
-
-    return result;
-}
-
-#endif // defined(HAVE_FREEIMAGE)
-
-
-#if defined(HAVE_STBIMAGE)
-
-static Image * loadSTB(Stream & s)
-{
-    // @@ Assumes stream cursor is at the beginning and that image occupies the whole stream.
-    const int size = s.size();
-    uint8 * buffer = new uint8[size];
-
-    s.serialize(buffer, size);
-
-    int w, h, n;
-    uint8 * data = stbi_load_from_memory(buffer, size, &w, &h, &n, 4);
-
-    delete buffer;
-
-    if (data != NULL) {
-        Image * img = new Image;
-        img->allocate(w, h);
-        img->setFormat(n == 4 ? Image::Format_ARGB : Image::Format_RGB);
-
-        for (int y = 0; y < h; ++y)
-        {
-            nv::Color32* dest = img->scanline(y);
-            uint8* src = data + y * w * 4;
-
-            for (int x = 0; x < w; ++x)
-            {
-                dest[x].r = src[x * 4 + 0];
-                dest[x].g = src[x * 4 + 1];
-                dest[x].b = src[x * 4 + 2];
-                dest[x].a = src[x * 4 + 3];
-            }
-        }
-        
-        free(data);
-
-        return img;
-    }
-
-    return NULL;
-}
-
-static FloatImage * loadFloatSTB(Stream & s)
-{
-    // @@ Assumes stream cursor is at the beginning and that image occupies the whole stream.
-    const int size = s.size();
-    uint8 * buffer = new uint8[size];
-
-    s.serialize(buffer, size);
-
-    int w, h, n;
-    float * data = stbi_loadf_from_memory(buffer, size, &w, &h, &n, 0);
-
-    delete buffer;
-
-    // Copy to image.
-    if (data != NULL) {
-        FloatImage * img = new FloatImage;
-        img->allocate(n, w, h);
-
-        const int count = w * h;
-
-        for (int c = 0; c < n; c++) {
-            float * dst = img->channel(c);
-
-            for (int i = 0; i < count; i++) {
-                dst[i] = data[i*n + c];
-            }
-        }
-        return img;
-    }
-
-    return NULL;
-}
-
-#endif // defined(HAVE_STBIMAGE)
-
-
-
-
-
-Image * nv::ImageIO::load(const char * fileName)
-{
-    nvDebugCheck(fileName != NULL);
-
-    StdInputStream stream(fileName);
-
-    if (stream.isError()) {
-        return NULL;
-    }
-
-    return ImageIO::load(fileName, stream);
-}
-
-Image * nv::ImageIO::load(const char * fileName, Stream & s)
-{
-    nvDebugCheck(fileName != NULL);
-    nvDebugCheck(s.isLoading());
-
-    const char * extension = Path::extension(fileName);
-
-    if (strCaseCmp(extension, ".tga") == 0) {
-        return loadTGA(s);
-    }
-
-    if (strCaseCmp(extension, ".psd") == 0) {
-        return loadPSD(s);
-    }
-
-    /*if (strCaseCmp(extension, ".ppm") == 0) {
-        return loadPPM(s);
-    }*/
-
-#if defined(HAVE_JPEG)
-    if (strCaseCmp(extension, ".jpg") == 0 || strCaseCmp(extension, ".jpeg") == 0) {
-        return loadJPG(s);
-    }
-#endif
-
-#if defined(HAVE_PNG)
-    if (strCaseCmp(extension, ".png") == 0) {
-        return loadPNG(s);
-    }
-#endif
-
-#if defined(HAVE_FREEIMAGE)
-    FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName);
-    if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsReading(fif)) {
-        return loadFreeImage(fif, s);
-    }
-#endif
-
-#if defined(HAVE_STBIMAGE)
-    return loadSTB(s);
-#endif
-
-    return NULL;
-}
-
-bool nv::ImageIO::save(const char * fileName, Stream & s, const Image * img, const char ** tags/*=NULL*/)
-{
-    nvDebugCheck(fileName != NULL);
-    nvDebugCheck(s.isSaving());
-    nvDebugCheck(img != NULL);
-
-    const char * extension = Path::extension(fileName);
-
-    if (strCaseCmp(extension, ".tga") == 0) {
-        return saveTGA(s, img);
-    }
-
-    if (strCaseCmp(extension, ".ppm") == 0) {
-        return savePPM(s, img);
-    }
-
-#if defined(HAVE_PNG)
-    if (strCaseCmp(extension, ".png") == 0) {
-        return savePNG(s, img, tags);
-    }
-#endif
-
-#if defined(HAVE_FREEIMAGE)
-    FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName);
-    if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsWriting(fif)) {
-        return saveFreeImage(fif, s, img, tags);
-    }
-#endif
-
-    return false;
-}
-
-bool nv::ImageIO::save(const char * fileName, const Image * img, const char ** tags/*=NULL*/)
-{
-    nvDebugCheck(fileName != NULL);
-    nvDebugCheck(img != NULL);
-
-    StdOutputStream stream(fileName);
-    if (stream.isError())
-    {
-        return false;
-    }
-
-    return ImageIO::save(fileName, stream, img, tags);
-}
-
-FloatImage * nv::ImageIO::loadFloat(const char * fileName)
-{
-    nvDebugCheck(fileName != NULL);
-
-    StdInputStream stream(fileName);
-
-    if (stream.isError()) {
-        return NULL;
-    }
-
-    return loadFloat(fileName, stream);
-}
-
-FloatImage * nv::ImageIO::loadFloat(const char * fileName, Stream & s)
-{
-    nvDebugCheck(fileName != NULL);
-
-    const char * extension = Path::extension(fileName);
-
-    /*if (strCaseCmp(extension, ".pfm") == 0) {
-        return loadFloatPFM(s);
-    }*/
-
-#if defined(HAVE_TIFF)
-    #pragma NV_MESSAGE("TODO: Load TIFF from stream.")
-    if (strCaseCmp(extension, ".tif") == 0 || strCaseCmp(extension, ".tiff") == 0) {
-        return loadFloatTIFF(fileName, s);
-    }
-#endif
-
-#if defined(HAVE_OPENEXR)
-    #pragma NV_MESSAGE("TODO: Load EXR from stream.")
-    if (strCaseCmp(extension, ".exr") == 0) {
-        return loadFloatEXR(fileName, s);
-    }
-#endif
-
-#if defined(HAVE_FREEIMAGE)
-    FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName);
-    if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsReading(fif)) {
-        return loadFloatFreeImage(fif, s);
-    }
-#endif
-
-    if (strCaseCmp(extension, ".dds") == 0) {
-        const uint spos = s.tell(); // Save stream position.
-        FloatImage * floatImage = loadFloatDDS(s);
-        if (floatImage != NULL) return floatImage;
-        else s.seek(spos);
-    }
-
-    // Try to load as an RGBA8 image and convert to float.
-    AutoPtr<Image> img(load(fileName, s));
-    if (img != NULL) {
-        return new FloatImage(img.ptr());
-    }
-
-    return NULL;
-}
-
-bool nv::ImageIO::saveFloat(const char * fileName, Stream & s, const FloatImage * fimage, uint baseComponent, uint componentCount)
-{
-    if (componentCount == 0) {
-        componentCount = fimage->componentCount() - baseComponent;
-    }
-    if (baseComponent + componentCount < fimage->componentCount()) {
-        return false;
-    }
-
-    const char * extension = Path::extension(fileName);
-
-    if (strCaseCmp(extension, ".dds") == 0) {
-        return saveFloatDDS(s, fimage, baseComponent, componentCount);
-    }
-
-    /*if (strCaseCmp(extension, ".pfm") == 0) {
-        return saveFloatPFM(s, fimage, baseComponent, componentCount);
-    }*/
-
-#if defined(HAVE_FREEIMAGE)
-    FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName);
-    if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsWriting(fif)) {
-        return saveFloatFreeImage(fif, s, fimage, baseComponent, componentCount);
-    }
-#endif
-
-    // If everything else fails, save as LDR.
-    if (componentCount <= 4)
-    {
-        AutoPtr<Image> image(fimage->createImage(baseComponent, componentCount));
-        nvCheck(image != NULL);
-
-        if (componentCount == 1)
-        {
-            Color32 * c = image->pixels();
-            const uint count = image->width() * image->height();
-            for (uint i = 0; i < count; i++)
-            {
-                c[i].b = c[i].g = c[i].r;
-            }
-        }
-
-        if (componentCount == 4)
-        {
-            image->setFormat(Image::Format_ARGB);
-        }
-
-        return ImageIO::save(fileName, s, image.ptr());
-    }
-
-    return false;
-}
-
-bool nv::ImageIO::saveFloat(const char * fileName, const FloatImage * fimage, uint baseComponent, uint componentCount)
-{
-    if (componentCount == 0) {
-        componentCount = fimage->componentCount() - baseComponent;
-    }
-    if (baseComponent + componentCount < fimage->componentCount()) {
-        return false;
-    }
-
-    const char * extension = Path::extension(fileName);
-
-#if defined(HAVE_OPENEXR)
-    if (strCaseCmp(extension, ".exr") == 0) {
-        return saveFloatEXR(fileName, fimage, baseComponent, componentCount);
-    }
-#endif
-
-#if defined(HAVE_TIFF)
-    if (strCaseCmp(extension, ".tif") == 0 || strCaseCmp(extension, ".tiff") == 0) {
-        return saveFloatTIFF(fileName, fimage, baseComponent, componentCount);
-    }
-#endif
-
-    StdOutputStream stream(fileName);
-
-    if (stream.isError()) {
-        return false;
-    }
-
-    return saveFloat(fileName, stream, fimage, baseComponent, componentCount);
-}
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "ImageIO.h"
+#include "Image.h"
+#include "FloatImage.h"
+#include "TgaFile.h"
+#include "PsdFile.h"
+#include "DirectDrawSurface.h"
+#include "PixelFormat.h"
+
+#include "nvmath/Color.h"
+#include "nvmath/Half.h"
+
+#include "nvcore/Ptr.h"
+#include "nvcore/Utils.h"
+#include "nvcore/Array.h"
+#include "nvcore/StrLib.h"
+#include "nvcore/StdStream.h"
+#include "nvcore/TextWriter.h"
+
+// Extern
+#if defined(HAVE_FREEIMAGE)
+#   include <FreeImage.h>
+// If FreeImage available, do not use individual libraries, since that produces link conflicts in some platforms.
+#   undef HAVE_JPEG
+#   undef HAVE_PNG
+#   undef HAVE_TIFF
+#   undef HAVE_OPENEXR
+#endif
+
+#if defined(HAVE_JPEG)
+extern "C" {
+#   include <jpeglib.h>
+}
+#endif
+
+#if defined(HAVE_PNG)
+#   include <png.h>
+#endif
+
+#if defined(HAVE_TIFF)
+#   define _TIFF_DATA_TYPEDEFS_
+#   include <tiffio.h>
+#endif
+
+#if defined(HAVE_OPENEXR)
+#   include <ImfIO.h>
+#   include <ImathBox.h>
+#   include <ImfChannelList.h>
+#   include <ImfInputFile.h>
+#   include <ImfOutputFile.h>
+#   include <ImfArray.h>
+#endif
+
+#if defined(HAVE_STBIMAGE)
+#   define STBI_NO_STDIO
+#   include <stb_image.h>
+#endif
+
+
+using namespace nv;
+
+
+
+struct Color555 {    // One 16-bit TGA pixel as reinterpreted by loadTGA: three 5-bit channels, the remaining bit is not declared.
+    uint16 b : 5;    // Blue, declared first. NOTE(review): bitfield placement is implementation-defined; presumably the low bits.
+    uint16 g : 5;    // Green.
+    uint16 r : 5;    // Red.
+};
+
+// Load a TGA image from 's' (24-bit paletted, greyscale, or 16/24/32-bit RGB; raw or RLE). Returns a new Image owned by the caller, or NULL when the image type or pixel size is not supported.
+static Image * loadTGA(Stream & s)
+{
+    nvCheck(!s.isError());
+    nvCheck(s.isLoading());
+
+    TgaHeader tga;
+    s << tga;
+    s.seek(TgaHeader::Size + tga.id_length);    // Absolute seek past the header and the image id field.
+
+    // Get header info.
+    bool rle = false;
+    bool pal = false;
+    bool rgb = false;
+    bool grey = false;
+
+    switch( tga.image_type ) {
+        case TGA_TYPE_RLE_INDEXED:
+            rle = true;
+            // no break is intended!
+        case TGA_TYPE_INDEXED:
+            if( tga.colormap_type!=1 || tga.colormap_size!=24 || tga.colormap_length>256 ) {
+                nvDebug( "*** loadTGA: Error, only 24bit paletted images are supported.\n" );
+                return NULL;
+            }
+            pal = true;
+            break;
+
+        case TGA_TYPE_RLE_RGB:
+            rle = true;
+            // no break is intended!
+        case TGA_TYPE_RGB:
+            rgb = true;
+            break;
+
+        case TGA_TYPE_RLE_GREY:
+            rle = true;
+            // no break is intended!
+        case TGA_TYPE_GREY:
+            grey = true;
+            break;
+
+        default:
+            nvDebug( "*** loadTGA: Error, unsupported image type.\n" );
+            return NULL;
+    }
+
+    const uint pixel_size = (tga.pixel_size/8);
+    if (pixel_size == 0 || pixel_size > 4) { nvDebug( "*** loadTGA: Error, unsupported pixel size.\n" ); return NULL; }    // Runtime check: also protects 'pixel[4]' below in release builds.
+
+    const uint size = tga.width * tga.height * pixel_size;
+
+
+    // Read palette
+    uint8 palette[768];
+    if( pal ) {
+        nvDebugCheck(tga.colormap_length <= 256);
+        s.serialize(palette, 3 * tga.colormap_length);
+    }
+
+    // Decode image.
+    uint8 * mem = new uint8[size];
+    if( rle ) {
+        // Decompress image in src.
+        uint8 * dst = mem;
+        int num = size;
+
+        while (num > 0) {
+            // Get packet header
+            uint8 c;
+            s << c;
+
+            uint count = (c & 0x7f) + 1;
+            if (count * pixel_size > uint(num)) count = uint(num) / pixel_size;    // Clamp so a corrupt packet cannot write past the end of 'mem'.
+            num -= count * pixel_size;
+
+            if (c & 0x80) {
+                // RLE pixels.
+                uint8 pixel[4];    // uint8 pixel[pixel_size];
+                s.serialize( pixel, pixel_size );
+                do {
+                    memcpy(dst, pixel, pixel_size);
+                    dst += pixel_size;
+                } while (--count);
+            }
+            else {
+                // Raw pixels.
+                count *= pixel_size;
+                s.serialize(dst, count);
+                dst += count;
+            }
+        }
+    }
+    else {
+        s.serialize(mem, size);
+    }
+
+    // Allocate image.
+    AutoPtr<Image> img(new Image());
+    img->allocate(tga.width, tga.height);
+
+    int lstep;
+    Color32 * dst;
+    if( tga.flags & TGA_ORIGIN_UPPER ) {
+        lstep = tga.width;
+        dst = img->pixels();
+    }
+    else {
+        lstep = - tga.width;    // File rows run bottom-up: start at the last image row and step backwards.
+        dst = img->pixels() + (tga.height-1) * tga.width;
+    }
+
+    // Write image.
+    uint8 * src = mem;
+    if( pal ) {
+        for( int y = 0; y < tga.height; y++ ) {
+            for( int x = 0; x < tga.width; x++ ) {
+                uint8 idx = *src++;
+                dst[x].setBGRA(palette[3*idx+0], palette[3*idx+1], palette[3*idx+2], 0xFF);
+            }
+            dst += lstep;
+        }
+    }
+    else if( grey ) {
+        img->setFormat(Image::Format_ARGB);
+
+        for( int y = 0; y < tga.height; y++ ) {
+            for( int x = 0; x < tga.width; x++ ) {
+                dst[x].setBGRA(*src, *src, *src, *src);    // The grey value is replicated into alpha as well.
+                src++;
+            }
+            dst += lstep;
+        }
+    }
+    else {
+
+        if( tga.pixel_size == 16 ) {
+            for( int y = 0; y < tga.height; y++ ) {
+                for( int x = 0; x < tga.width; x++ ) {
+                    Color555 c = *reinterpret_cast<Color555 *>(src);
+                    uint8 b = (c.b << 3) | (c.b >> 2);    // Widen 5 bits to 8 by replicating the top bits.
+                    uint8 g = (c.g << 3) | (c.g >> 2);
+                    uint8 r = (c.r << 3) | (c.r >> 2);
+                    dst[x].setBGRA(b, g, r, 0xFF);
+                    src += 2;
+                }
+                dst += lstep;
+            }
+        }
+        else if( tga.pixel_size == 24 ) {
+            for( int y = 0; y < tga.height; y++ ) {
+                for( int x = 0; x < tga.width; x++ ) {
+                    dst[x].setBGRA(src[0], src[1], src[2], 0xFF);
+                    src += 3;
+                }
+                dst += lstep;
+            }
+        }
+        else if( tga.pixel_size == 32 ) {
+            img->setFormat(Image::Format_ARGB);
+
+            for( int y = 0; y < tga.height; y++ ) {
+                for( int x = 0; x < tga.width; x++ ) {
+                    dst[x].setBGRA(src[0], src[1], src[2], src[3]);
+                    src += 4;
+                }
+                dst += lstep;
+            }
+        }
+    }
+
+    // free uncompressed data.
+    delete [] mem;
+
+    return img.release();
+}
+
+// Save 'img' to 's' as a true color TGA with the upper-origin flag set: 32 bits per pixel with alpha when the image format is ARGB, 24 bits otherwise. Always returns true.
+static bool saveTGA(Stream & s, const Image * img)
+{
+    // Preconditions.
+    nvCheck(!s.isError());
+    nvCheck(img != NULL);
+    nvCheck(img->pixels() != NULL);
+
+    const bool hasAlpha = (img->format() == Image::Format_ARGB);
+    const uint bytesPerPixel = hasAlpha ? 4 : 3;
+
+    // No image id and no color map.
+    TgaFile file;
+    file.head.id_length = 0;
+    file.head.colormap_type = 0;
+    file.head.image_type = TGA_TYPE_RGB;
+
+    file.head.colormap_index = 0;
+    file.head.colormap_length = 0;
+    file.head.colormap_size = 0;
+
+    // Placement and dimensions.
+    file.head.x_origin = 0;
+    file.head.y_origin = 0;
+    file.head.width = img->width();
+    file.head.height = img->height();
+
+    // Pixel depth and descriptor flags.
+    file.head.pixel_size = hasAlpha ? 32 : 24;
+    file.head.flags = hasAlpha ? (TGA_ORIGIN_UPPER | TGA_HAS_ALPHA) : TGA_ORIGIN_UPPER;
+
+    // @@ Serialize directly.
+    file.allocate();
+
+    // Copy the pixels into the file buffer in B, G, R(, A) byte order.
+    const uint pixelCount = img->width() * img->height();
+    for (uint i = 0; i < pixelCount; i++)
+    {
+        Color32 color = img->pixel(i);
+        const uint base = bytesPerPixel * i;
+
+        file.mem[base + 0] = color.b;
+        file.mem[base + 1] = color.g;
+        file.mem[base + 2] = color.r;
+        if (hasAlpha)
+        {
+            file.mem[base + 3] = color.a;
+        }
+    }
+
+    // Write header and pixel data, then release the temporary buffer.
+    s << file;
+
+    file.free();
+
+    return true;
+}
+
+/*static Image * loadPPM(Stream & s)
+{
+    // @@
+    return NULL;
+}*/
+
+// Save 'img' to 's' as a "P6" PPM: a text header followed by the r, g, b bytes of every pixel (alpha is not written). Always returns true.
+static bool savePPM(Stream & s, const Image * img)
+{
+    //if (img->depth() != 1) return false;
+    //if (img->format() == Image::Format_ARGB) return false;
+
+    const uint width = img->width();
+    const uint height = img->height();
+    const uint pixelCount = width * height;
+
+    TextWriter header(&s);
+    header.write("P6\n");
+    header.write("%d %d\n", width, height);
+    header.write("255\n");
+    for (uint index = 0; index != pixelCount; ++index) {
+        Color32 color = img->pixel(index);
+        s << color.r << color.g << color.b;
+    }
+    return true;
+}
+
+
+/*static FloatImage * loadFloatPFM(Stream & s)
+{
+    return NULL;
+}*/
+
+/*static bool saveFloatPFM(Stream & s, const FloatImage * img, uint base_channel, uint channel_count)
+{
+    return false;
+}*/
+
+// Load a PSD image from 's' (one byte per channel sample, raw or RLE compressed), keeping at most the first four channels. Returns a new Image owned by the caller, or NULL on an invalid/unsupported header, unknown compression or malformed RLE data.
+static Image * loadPSD(Stream & s)
+{
+    nvCheck(!s.isError());
+    nvCheck(s.isLoading());
+
+    s.setByteOrder(Stream::BigEndian);    // NOTE(review): the stream's byte order is not restored before returning.
+
+    PsdHeader header;
+    s << header;
+
+    if (!header.isValid())
+    {
+        nvDebug("invalid header!\n");
+        return NULL;
+    }
+
+    if (!header.isSupported())
+    {
+        nvDebug("unsupported file!\n");
+        return NULL;
+    }
+
+    int tmp;
+
+    // Skip mode data.
+    s << tmp;
+    s.seek(s.tell() + tmp);
+
+    // Skip image resources.
+    s << tmp;
+    s.seek(s.tell() + tmp);
+
+    // Skip the reserved data.
+    s << tmp;
+    s.seek(s.tell() + tmp);
+
+    // Find out if the data is compressed.
+    // Known values:
+    //   0: no compression
+    //   1: RLE compressed
+    uint16 compression;
+    s << compression;
+
+    if (compression > 1) {
+        // Unknown compression type.
+        return NULL;
+    }
+
+    uint channel_num = header.channel_count;
+
+    AutoPtr<Image> img(new Image());
+    img->allocate(header.width, header.height);
+
+    if (channel_num < 4)
+    {
+        // Clear the image.
+        img->fill(Color32(0, 0, 0, 0xFF));
+    }
+    else
+    {
+        // Enable alpha.
+        img->setFormat(Image::Format_ARGB);
+
+        // Ignore remaining channels.
+        channel_num = 4;
+    }
+
+
+    const uint pixel_count = header.height * header.width;
+
+    static const uint components[4] = {2, 1, 0, 3};    // Byte offset inside each 4-byte pixel for file channel 0..3.
+
+    if (compression)
+    {
+        s.seek(s.tell() + header.height * header.channel_count * sizeof(uint16));    // Skip one uint16 per scanline and channel (presumably the compressed row sizes).
+
+        // Read RLE data.
+        for (uint channel = 0; channel < channel_num; channel++)
+        {
+            uint8 * ptr = (uint8 *)img->pixels() + components[channel];
+
+            uint count = 0;
+            while( count < pixel_count )
+            {
+                if (s.isAtEnd()) return NULL;
+
+                uint8 c;
+                s << c;
+
+                uint len = c;
+                if (len < 128)
+                {
+                    // Copy next len+1 bytes literally.
+                    len++;
+                    count += len;
+                    if (count > pixel_count) return NULL;
+
+                    while (len != 0)
+                    {
+                        s << *ptr;
+                        ptr += 4;
+                        len--;
+                    }
+                }
+                else if (len > 128)
+                {
+                    // Next -len+1 bytes in the dest are replicated from next source byte.
+                    // (Interpret len as a negative 8-bit int.)
+                    len ^= 0xFF;
+                    len += 2;
+                    count += len;
+                    if (s.isAtEnd() || count > pixel_count) return NULL;
+
+                    uint8 val;
+                    s << val;
+                    while( len != 0 ) {
+                        *ptr = val;
+                        ptr += 4;
+                        len--;
+                    }
+                }
+                else if( len == 128 ) {
+                    // No-op.
+                }
+            }
+        }
+    }
+    else
+    {
+        // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...)
+        // where each channel consists of an 8-bit value for each pixel in the image.
+
+        // Read the data by channel.
+        for (uint channel = 0; channel < channel_num; channel++)
+        {
+            uint8 * ptr = (uint8 *)img->pixels() + components[channel];
+
+            // Read the data.
+            uint count = pixel_count;
+            while (count != 0)
+            {
+                s << *ptr;
+                ptr += 4;
+                count--;
+            }
+        }
+    }
+
+    return img.release();
+}
+
+// Load an RGBA16F DDS image from 's' into a new 4-channel FloatImage that the caller owns; returns NULL for any other pixel format.
+static FloatImage * loadFloatDDS(Stream & s)
+{
+    nvCheck(s.isLoading());
+    nvCheck(!s.isError());
+
+    static const uint D3DFMT_A16B16G16R16F = 113;
+
+    DDSHeader header;
+    s << header;
+
+    // @@ We only support RGBA16F for now.
+    if (header.pf.fourcc != D3DFMT_A16B16G16R16F) {
+        return NULL;
+    }
+
+    // Read every interleaved 16-bit component in a single call.
+    const int texelCount = header.width * header.height;
+    uint16 * const halfs = new uint16[texelCount * 4];
+    s.serialize(halfs, texelCount * 4 * sizeof(uint16));
+
+    FloatImage * result = new FloatImage;
+    result->allocate(4, header.width, header.height);
+
+    // The channels are filled through uint32 pointers with the values returned by half_to_float.
+    uint32 * red = (uint32 *)result->channel(0);
+    uint32 * green = (uint32 *)result->channel(1);
+    uint32 * blue = (uint32 *)result->channel(2);
+    uint32 * alpha = (uint32 *)result->channel(3);
+    for (int i = 0; i < texelCount; i++) {
+        uint16 * texel = halfs + 4 * i;
+        red[i] = half_to_float( texel[0] );
+        green[i] = half_to_float( texel[1] );
+        blue[i] = half_to_float( texel[2] );
+        alpha[i] = half_to_float( texel[3] );
+    }
+
+    delete [] halfs;
+    return result;
+}
+
+// Save four consecutive channels of 'img', starting at 'base_component', to 's' as an RGBA16F DDS; returns false unless 'num_components' is 4.
+static bool saveFloatDDS(Stream & s, const FloatImage * img, uint base_component, uint num_components)
+{
+    nvCheck(s.isSaving());
+    nvCheck(!s.isError());
+
+    static const uint D3DFMT_A16B16G16R16F = 113;
+
+    // Only four-component output is implemented.
+    if (num_components != 4) return false;
+
+    // Header.
+    DDSHeader header;
+    header.setTexture2D();
+    header.setWidth(img->width());
+    header.setHeight(img->height());
+    header.setFormatCode(D3DFMT_A16B16G16R16F);
+    // ...
+
+    s << header;
+
+    // Source channels, read through uint32 pointers and handed to half_from_float as is.
+    uint32 * channels[4];
+    for (uint c = 0; c < 4; c++) {
+        channels[c] = (uint32 *)img->channel(base_component + c);
+    }
+
+    // Interleave the channels: one 16-bit value per component, texel after texel.
+    const uint texelCount = img->width() * img->height();
+    for (uint i = 0; i < texelCount; i++) {
+        for (uint c = 0; c < 4; c++) {
+            uint16 half = half_from_float( channels[c][i] );
+            s.serialize(&half, sizeof(uint16));
+        }
+    }
+
+    return true;
+}
+
+
+#if defined(HAVE_PNG)
+
+// libpng read callback: fills 'data' with 'length' bytes from the Stream installed as libpng's io pointer and raises a png error if the stream reports an error.
+static void user_read_data(png_structp png_ptr, png_bytep data, png_size_t length)
+{
+    nvDebugCheck(png_ptr != NULL);
+
+    Stream * const stream = static_cast<Stream *>(png_get_io_ptr(png_ptr));
+    stream->serialize(data, static_cast<int>(length));
+
+    const bool failed = stream->isError();
+    if (failed) png_error(png_ptr, "Read Error");
+}
+
+
+// Load a PNG image from 's' as 8-bit BGRA (16-bit samples are stripped to 8, palette/grey data is expanded, missing alpha is filled with 0xFF). Returns a new Image owned by the caller, or NULL on failure.
+static Image * loadPNG(Stream & s)
+{
+    nvCheck(!s.isError());
+
+    // Set up a read buffer and check the library version
+    png_structp png_ptr;
+    png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+    if (png_ptr == NULL) {
+        // nvDebug( "*** LoadPNG: Error allocating read buffer in file '%s'.\n", name );
+        return NULL;
+    }
+
+    // Allocate/initialize a memory block for the image information
+    png_infop info_ptr = png_create_info_struct(png_ptr);
+    if (info_ptr == NULL) {
+        png_destroy_read_struct(&png_ptr, NULL, NULL);
+        // nvDebug( "*** LoadPNG: Error allocating image information for '%s'.\n", name );
+        return NULL;
+    }
+
+    // Set up the error handling: libpng errors jump back here.
+    if (setjmp(png_jmpbuf(png_ptr))) {
+        png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+        // nvDebug( "*** LoadPNG: Error reading png file '%s'.\n", name );
+        return NULL;
+    }
+
+    // Set up the I/O functions.
+    png_set_read_fn(png_ptr, (void*)&s, user_read_data);
+
+    // Retrieve the image header information
+    png_uint_32 width, height;
+    int bit_depth, color_type, interlace_type;
+    png_read_info(png_ptr, info_ptr);
+    png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type, &interlace_type, NULL, NULL);
+
+
+    if (color_type == PNG_COLOR_TYPE_PALETTE && bit_depth <= 8) {
+        // Convert indexed images to RGB.
+        png_set_expand(png_ptr);
+    }
+    else if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) {
+        // Expand low bit depth grayscale to 8 bits.
+        png_set_expand(png_ptr);
+    }
+    else if (png_get_valid(png_ptr, info_ptr, PNG_INFO_tRNS)) {
+        // Expand images with transparency to full alpha channels
+        // so the data will be available as RGBA quartets.
+        png_set_expand(png_ptr);
+    }
+    else if (bit_depth < 8) {
+        // If we have < 8 scale it up to 8.
+        //png_set_expand(png_ptr);
+        png_set_packing(png_ptr);
+    }
+
+    // Reduce bit depth.
+    if (bit_depth == 16) {
+        png_set_strip_16(png_ptr);
+    }
+
+    // Represent gray as RGB
+    if (color_type == PNG_COLOR_TYPE_GRAY || color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
+        png_set_gray_to_rgb(png_ptr);
+    }
+
+    // Convert to RGBA filling alpha with 0xFF.
+    if (!(color_type & PNG_COLOR_MASK_ALPHA)) {
+        png_set_filler(png_ptr, 0xFF, PNG_FILLER_AFTER);
+    }
+
+    // @todo Choose gamma according to the platform?
+    double screen_gamma = 2.2;
+    int intent;
+    if (png_get_sRGB(png_ptr, info_ptr, &intent)) {
+        png_set_gamma(png_ptr, screen_gamma, 0.45455);
+    }
+    else {
+        double image_gamma;
+        if (png_get_gAMA(png_ptr, info_ptr, &image_gamma)) {
+            png_set_gamma(png_ptr, screen_gamma, image_gamma);
+        }
+        else {
+            png_set_gamma(png_ptr, screen_gamma, 0.45455);
+        }
+    }
+
+    // Perform the selected transforms.
+    png_read_update_info(png_ptr, info_ptr);
+
+    png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type, &interlace_type, NULL, NULL);
+
+    AutoPtr<Image> img(new Image());
+    img->allocate(width, height);
+
+    // Set internal format flags.
+    if(color_type & PNG_COLOR_MASK_COLOR) {
+        //img->flags |= PI_IF_HAS_COLOR;
+    }
+    if(color_type & PNG_COLOR_MASK_ALPHA) {
+        //img->flags |= PI_IF_HAS_ALPHA;
+        img->setFormat(Image::Format_ARGB);
+    }
+
+    // Read the image: one row pointer per scanline, each row being 4 bytes per pixel.
+    uint8 * pixels = (uint8 *)img->pixels();
+    png_bytep * row_data = new png_bytep[height];
+    for (uint i = 0; i < height; i++) {
+        row_data[i] = &(pixels[width * 4 * i]);
+    }
+
+    png_read_image(png_ptr, row_data);    // NOTE(review): if libpng jumps back to the setjmp branch from here, that branch returns without deleting row_data.
+    delete [] row_data;
+
+    // Finish things up
+    png_read_end(png_ptr, info_ptr);
+    png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+
+    // RGBA to BGRA.
+    uint num = width * height;
+    for(uint i = 0; i < num; i++)
+    {
+        Color32 c = img->pixel(i);
+        img->pixel(i) = Color32(c.b, c.g, c.r, c.a);
+    }
+
+    // Compute alpha channel if needed.
+    /*if( img->flags & PI_IU_BUMPMAP || img->flags & PI_IU_ALPHAMAP ) {
+        if( img->flags & PI_IF_HAS_COLOR && !(img->flags & PI_IF_HAS_ALPHA)) {
+            img->ComputeAlphaFromColor();
+        }
+    }*/
+
+    return img.release();
+}
+
+// libpng write callback: passes 'length' bytes of 'data' to the Stream installed as libpng's io pointer and raises a png error if the stream reports an error.
+static void user_write_data(png_structp png_ptr, png_bytep data, png_size_t length)
+{
+    nvDebugCheck(png_ptr != NULL);
+
+    Stream * const stream = static_cast<Stream *>(png_get_io_ptr(png_ptr));
+    stream->serialize(data, static_cast<int>(length));
+
+    const bool failed = stream->isError();
+    if (failed) png_error(png_ptr, "Write Error");
+}
+
+static void user_write_flush(png_structp /*png_ptr*/) { /* libpng flush callback: intentionally does nothing. */ }
+
+// Save 'img' to 's' as an 8-bit RGB or RGBA PNG. 'tags', when not NULL, is a list of key/text string pairs terminated by a NULL key, written as uncompressed text entries. Returns false if libpng fails.
+static bool savePNG(Stream & s, const Image * img, const char ** tags/*=NULL*/)
+{
+    nvCheck(!s.isError());
+    nvCheck(img != NULL);
+    nvCheck(img->pixels() != NULL);
+
+    // Set up a write buffer and check the library version
+    png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+    if (png_ptr == NULL) {
+        return false;
+    }
+
+    // Allocate/initialize a memory block for the image information
+    png_infop info_ptr = png_create_info_struct(png_ptr);
+    if (info_ptr == NULL) {
+        png_destroy_write_struct(&png_ptr, NULL);
+        return false;
+    }
+
+    // Set up the error handling. NOTE(review): row_data and text allocated below are not freed on this path.
+    if (setjmp(png_jmpbuf(png_ptr))) {
+        png_destroy_write_struct(&png_ptr, &info_ptr);
+        return false;
+    }
+
+    // Set up the I/O functions.
+    png_set_write_fn(png_ptr, (void*)&s, user_write_data, user_write_flush);
+
+    // Set image header information
+    int color_type = PNG_COLOR_TYPE_RGBA;
+    switch(img->format())
+    {
+        case Image::Format_RGB:     color_type = PNG_COLOR_TYPE_RGB; break;
+        case Image::Format_ARGB:    color_type = PNG_COLOR_TYPE_RGBA; break;
+    }
+    png_set_IHDR(png_ptr, info_ptr, img->width(), img->height(),
+        8, color_type, PNG_INTERLACE_NONE,
+        PNG_COMPRESSION_TYPE_DEFAULT,
+        PNG_FILTER_TYPE_DEFAULT);
+
+    // Set image data: one row pointer per scanline.
+    png_bytep * row_data = new png_bytep[img->height()];
+    for (uint i = 0; i < img->height(); i++) {
+        row_data[i] = (png_byte*)img->scanline (i);
+        if (img->format() == Image::Format_RGB) row_data[i]--; // This is a bit of a hack, libpng expects images in ARGB format not BGRA, it supports BGR swapping, but not alpha swapping.
+    }
+    png_set_rows(png_ptr, info_ptr, row_data);
+
+    png_text * text = NULL;
+    if (tags != NULL)
+    {
+        int count = 0;
+        while(tags[2 * count] != NULL) count++;
+
+        text = new png_text[count];
+        memset(text, 0, count * sizeof(png_text));
+
+        for (int i = 0; i < count; i++) {
+            text[i].compression = PNG_TEXT_COMPRESSION_NONE;
+            text[i].key = const_cast<png_charp>(tags[2 * i + 0]);    // png_text stores non-const char pointers.
+            text[i].text = const_cast<png_charp>(tags[2 * i + 1]);
+        }
+
+        png_set_text(png_ptr, info_ptr, text, count);
+    }
+
+    png_write_png(png_ptr, info_ptr,
+        // component order is BGR(A)
+        PNG_TRANSFORM_BGR |
+        // Strip alpha byte for RGB images
+        (img->format() == Image::Format_RGB ? PNG_TRANSFORM_STRIP_FILLER : 0)
+        , NULL);
+
+    // Finish things up
+    png_destroy_write_struct(&png_ptr, &info_ptr);
+
+    delete [] row_data;
+    delete [] text;
+
+    return true;
+}
+
+#endif // defined(HAVE_PNG)
+
+#if defined(HAVE_JPEG)
+
+static void init_source (j_decompress_ptr /*cinfo*/)
+{ /* libjpeg source manager callback: intentionally empty. */ }
+
+// libjpeg source manager callback: logs a premature end of file warning and points the source at a fake two-byte EOI marker. Always returns TRUE.
+static boolean fill_input_buffer (j_decompress_ptr cinfo)
+{
+    static JOCTET endOfImage[] = { 0xFF, JPEG_EOI };
+
+    nvDebug("jpeglib: Premature end of file\n");
+
+    jpeg_source_mgr * const source = cinfo->src;
+    source->next_input_byte = endOfImage;
+    source->bytes_in_buffer = 2;
+
+    return TRUE;
+}
+
+// libjpeg source manager callback: advances the input position by 'num_bytes'; falls back to fill_input_buffer when that would consume the whole buffer or more.
+static void skip_input_data (j_decompress_ptr cinfo, long num_bytes)
+{
+    jpeg_source_mgr * const source = cinfo->src;
+    const long available = (long)source->bytes_in_buffer;
+    if (num_bytes < available) {
+        source->bytes_in_buffer -= num_bytes;
+        source->next_input_byte += num_bytes;
+    }
+    else fill_input_buffer(cinfo);
+}
+
+// libjpeg source manager callback: no work necessary here.
+static void term_source (j_decompress_ptr /*cinfo*/)
+{ }
+
+
+// Load a JPEG image by reading s.size() bytes from 's' into memory and decoding them with libjpeg. Three-component output is stored as RGB; anything else is read as one byte per pixel replicated into all four channels. The caller owns the returned Image.
+static Image * loadJPG(Stream & s)
+{
+    nvCheck(!s.isError());
+
+    // Read the entire file.
+    Array<uint8> byte_array;
+    byte_array.resize(s.size());
+    s.serialize(byte_array.buffer(), s.size());
+
+    jpeg_decompress_struct cinfo;
+    jpeg_error_mgr jerr;
+
+    cinfo.err = jpeg_std_error(&jerr);
+    jpeg_create_decompress(&cinfo);
+
+    cinfo.src = (struct jpeg_source_mgr *) (*cinfo.mem->alloc_small)
+                ((j_common_ptr) &cinfo, JPOOL_PERMANENT, sizeof(struct jpeg_source_mgr));
+    cinfo.src->init_source = init_source;
+    cinfo.src->fill_input_buffer = fill_input_buffer;
+    cinfo.src->skip_input_data = skip_input_data;
+    cinfo.src->resync_to_restart = jpeg_resync_to_restart;    // use default method
+    cinfo.src->term_source = term_source;
+    cinfo.src->bytes_in_buffer = byte_array.size();
+    cinfo.src->next_input_byte = byte_array.buffer();
+
+    jpeg_read_header(&cinfo, TRUE);
+    jpeg_start_decompress(&cinfo);
+
+    //cinfo.do_fancy_upsampling = FALSE;    // fast decompression
+    //cinfo.dct_method = JDCT_FLOAT;        // Choose floating point DCT method.
+
+    // Scanlines are decoded into a temporary buffer; its stride uses the number of components libjpeg outputs per pixel.
+    uint8 * tmp_buffer = new uint8 [cinfo.output_width * cinfo.output_height * cinfo.output_components];
+    uint8 * scanline = tmp_buffer;
+
+    while( cinfo.output_scanline < cinfo.output_height ){
+        int num_scanlines = jpeg_read_scanlines (&cinfo, &scanline, 1);
+        scanline += num_scanlines * cinfo.output_width * cinfo.output_components;
+    }
+
+    jpeg_finish_decompress(&cinfo);
+
+    AutoPtr<Image> img(new Image());
+    img->allocate(cinfo.output_width, cinfo.output_height);
+
+    Color32 * dst = img->pixels();
+    const int size = img->height() * img->width();
+    const uint8 * src = tmp_buffer;
+
+    if( cinfo.output_components == 3 ) {
+        img->setFormat(Image::Format_RGB);
+        for( int i = 0; i < size; i++ ) {
+            *dst++ = Color32(src[0], src[1], src[2]);
+            src += 3;
+        }
+    }
+    else {
+        img->setFormat(Image::Format_ARGB);
+        for( int i = 0; i < size; i++ ) {
+            *dst++ = Color32(*src, *src, *src, *src);
+            src++;
+        }
+    }
+
+    delete [] tmp_buffer;
+    jpeg_destroy_decompress (&cinfo);
+
+    return img.release();
+}
+
+#endif // defined(HAVE_JPEG)
+
+#if defined(HAVE_TIFF)
+
+/*
+static tsize_t tiffReadWriteProc(thandle_t h, tdata_t ptr, tsize_t size)
+{
+    Stream * s = (Stream *)h;
+    nvDebugCheck(s != NULL);
+
+    s->serialize(ptr, size);
+
+    return size;
+}
+
+static toff_t tiffSeekProc(thandle_t h, toff_t offset, int whence)
+{
+    Stream * s = (Stream *)h;
+    nvDebugCheck(s != NULL);
+
+    if (!s->isSeekable())
+    {
+        return (toff_t)-1;
+    }
+
+    if (whence == SEEK_SET)
+    {
+        s->seek(offset);
+    }
+    else if (whence == SEEK_CUR)
+    {
+        s->seek(s->tell() + offset);
+    }
+    else if (whence == SEEK_END)
+    {
+        s->seek(s->size() + offset);
+    }
+
+    return s->tell();
+}
+
+static int tiffCloseProc(thandle_t)
+{
+    return 0;
+}
+
+static toff_t tiffSizeProc(thandle_t h)
+{
+    Stream * s = (Stream *)h;
+    nvDebugCheck(s != NULL);
+    return s->size();
+}
+
+static int tiffMapFileProc(thandle_t, tdata_t*, toff_t*)
+{
+    // @@ TODO, Implement these functions.
+    return -1;
+}
+
+static void tiffUnmapFileProc(thandle_t, tdata_t, toff_t)
+{
+    // @@ TODO, Implement these functions.
+}
+*/
+
+// Load a TIFF with 8, 16 or 32 bits per sample into a new FloatImage owned by the caller, with one channel per sample. The file is reopened by 'fileName'; 's' is only checked for errors. Returns NULL if the file cannot be opened or the bit depth is unsupported.
+static FloatImage * loadFloatTIFF(const char * fileName, Stream & s)
+{
+    nvCheck(!s.isError());
+
+    TIFF * tif = TIFFOpen(fileName, "r");
+    //TIFF * tif = TIFFClientOpen(fileName, "r", &s, tiffReadWriteProc, tiffReadWriteProc, tiffSeekProc, tiffCloseProc, tiffSizeProc, tiffMapFileProc, tiffUnmapFileProc);
+
+    if (!tif)
+    {
+        nvDebug("Can't open '%s' for reading\n", fileName);
+        return NULL;
+    }
+
+    ::uint16 spp = 1, bpp = 1, format = SAMPLEFORMAT_UINT;    // TIFF defaults, kept when TIFFGetField finds no such tag.
+    ::uint32 width = 0, height = 0;
+    TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height);
+    TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width);
+    TIFFGetField(tif, TIFFTAG_BITSPERSAMPLE, &bpp);
+    TIFFGetField(tif, TIFFTAG_SAMPLESPERPIXEL, &spp);
+    TIFFGetField(tif, TIFFTAG_SAMPLEFORMAT, &format);
+
+    if (bpp != 8 && bpp != 16 && bpp != 32) {
+        nvDebug("Can't load '%s', only 8, 16 and 32 bits per sample supported\n", fileName);
+        TIFFClose(tif);
+        return NULL;
+    }
+
+    AutoPtr<FloatImage> fimage(new FloatImage());
+    fimage->allocate(spp, width, height);
+
+    int linesize = TIFFScanlineSize(tif);
+    tdata_t buf = malloc<uint8>(linesize);
+
+    for (uint y = 0; y < height; y++)
+    {
+        TIFFReadScanline(tif, buf, y, 0);
+
+        for (uint c=0; c<spp; c++ )
+        {
+            float * dst = fimage->scanline(y, c);
+
+            for(uint x = 0; x < width; x++)
+            {
+                if (bpp == 8)
+                {
+                    dst[x] = float(((::uint8 *)buf)[x*spp+c]) / float(0xFF);
+                }
+                else if (bpp == 16)
+                {
+                    dst[x] = float(((::uint16 *)buf)[x*spp+c]) / float(0xFFFF);
+                }
+                else if (bpp == 32)
+                {
+                    if (format==SAMPLEFORMAT_IEEEFP)
+                    {
+                        dst[x] = float(((float *)buf)[x*spp+c]);
+                    }
+                    else
+                    {
+                        dst[x] = float(((::uint32 *)buf)[x*spp+c] >> 8) / float(0xFFFFFF);    // Keep the top 24 bits of the integer sample.
+                    }
+                }
+            }
+        }
+    }
+
+    free(buf);
+    TIFFClose(tif);
+
+    return fimage.release();
+}
+
+/// Save a range of FloatImage components as a 32 bit floating point TIFF file.
+///
+/// The components are interleaved into contiguous scanlines and written with PackBits
+/// compression. The temporary scanline buffer and the TIFF handle are released on every
+/// path, including a failed scanline write.
+///
+/// @param fileName       Path of the file to create.
+/// @param fimage         Source image.
+/// @param base_component Index of the first component to write.
+/// @param num_components Number of consecutive components to write.
+/// @return true when every scanline was written, false otherwise.
+static bool saveFloatTIFF(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components)
+{
+    nvCheck(fileName != NULL);
+    nvCheck(fimage != NULL);
+    nvCheck(base_component + num_components <= fimage->componentCount());
+
+    const int iW = fimage->width();
+    const int iH = fimage->height();
+    const int iC = num_components;
+
+    TIFF * image = TIFFOpen(fileName, "w");
+
+    // Open the TIFF file
+    if (image == NULL)
+    {
+        nvDebug("Could not open '%s' for writing\n", fileName);
+        return false;
+    }
+
+    TIFFSetField(image, TIFFTAG_IMAGEWIDTH,  iW);
+    TIFFSetField(image, TIFFTAG_IMAGELENGTH, iH);
+    TIFFSetField(image, TIFFTAG_SAMPLESPERPIXEL, iC);
+    TIFFSetField(image, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_IEEEFP);
+    TIFFSetField(image, TIFFTAG_BITSPERSAMPLE, 32);
+
+    uint32 rowsperstrip = TIFFDefaultStripSize(image, (uint32)-1);
+
+    TIFFSetField(image, TIFFTAG_ROWSPERSTRIP, rowsperstrip);
+    TIFFSetField(image, TIFFTAG_COMPRESSION, COMPRESSION_PACKBITS);
+    if (num_components == 3)
+    {
+        // Set this so that it can be visualized with pfstools.
+        TIFFSetField(image, TIFFTAG_PHOTOMETRIC, PHOTOMETRIC_RGB);
+    }
+    TIFFSetField(image, TIFFTAG_ORIENTATION, ORIENTATION_TOPLEFT);
+    TIFFSetField(image, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG);
+
+    bool succeeded = true;
+
+    float * scanline = new float[iW * iC];
+    for (int y = 0; y < iH && succeeded; y++)
+    {
+        // Interleave the planar source components into one contiguous scanline.
+        for (int c = 0; c < iC; c++)
+        {
+            const float * src = fimage->scanline(y, base_component + c);
+            for (int x = 0; x < iW; x++) scanline[x * iC + c] = src[x];
+        }
+        if (TIFFWriteScanline(image, scanline, y, 0) == -1)
+        {
+            nvDebug("Error writing scanline %d\n", y);
+            succeeded = false; // Leave the loop, but still run the cleanup below.
+        }
+    }
+    delete [] scanline;
+
+    // Close the file
+    TIFFClose(image);
+    return succeeded;
+}
+
+#endif // defined(HAVE_TIFF)
+
+#if defined(HAVE_OPENEXR)
+
+namespace
+{
+    /// Adapter that lets OpenEXR read from an nv::Stream through the Imf::IStream interface.
+    class ExrStream : public Imf::IStream
+    {
+    public:
+        /// @param name Name handed to Imf::IStream.
+        /// @param s    Stream to read from; it must be a loading stream.
+        ExrStream(const char * name, Stream & s) : Imf::IStream(name), m_stream(s)
+        {
+            nvDebugCheck(s.isLoading());
+        }
+
+        /// Read n bytes into c. Throws Iex::InputExc when the stream reports an error.
+        /// Imf::IStream expects false once the last byte of the file has been read and
+        /// true while more data remains, so the end-of-stream state is negated.
+        virtual bool read(char c[], int n)
+        {
+            m_stream.serialize(c, n);
+
+            if (m_stream.isError())
+            {
+                throw Iex::InputExc("I/O error.");
+            }
+
+            return !m_stream.isAtEnd();
+        }
+
+        /// Current read position of the wrapped stream.
+        virtual Imf::Int64 tellg()
+        {
+            return m_stream.tell();
+        }
+
+        /// Move the read position; the wrapped stream is addressed with a uint.
+        virtual void seekg(Imf::Int64 pos)
+        {
+            nvDebugCheck(pos >= 0 && pos < UINT_MAX);
+            m_stream.seek((uint)pos);
+        }
+
+        /// Reset the error state of the wrapped stream.
+        virtual void clear()
+        {
+            m_stream.clearError();
+        }
+
+    private:
+        Stream & m_stream;
+    };
+
+    /// Map an EXR channel name to a component index using its first letter, ignoring case:
+    /// 'r' -> 0, 'g' -> 1, 'b' -> 2, 'a' -> 3. Any other name maps to 0.
+    static int channelIndexFromName(const char* name)
+    {
+        // tolower is only defined for values representable as unsigned char (or EOF), so
+        // convert first instead of passing a possibly negative plain char.
+        const int c = tolower((unsigned char)name[0]);
+        switch (c)
+        {
+        default:
+        case 'r':
+            return 0;
+        case 'g':
+            return 1;
+        case 'b':
+            return 2;
+        case 'a':
+            return 3;
+        }
+    }
+
+} // namespace
+
+/// Load an OpenEXR image from a stream into a FloatImage.
+///
+/// One component is allocated per channel found in the file. Each channel is read as
+/// 32 bit float into the component chosen by channelIndexFromName.
+///
+/// @param fileName Name given to the OpenEXR stream wrapper.
+/// @param s        Loading stream positioned at the start of the EXR data.
+/// @return A newly allocated image owned by the caller.
+static FloatImage * loadFloatEXR(const char * fileName, Stream & s)
+{
+    nvCheck(s.isLoading());
+    nvCheck(!s.isError());
+
+    ExrStream stream(fileName, s);
+    Imf::InputFile inputFile(stream);
+
+    const Imath::Box2i box = inputFile.header().dataWindow();
+
+    // The data window bounds are inclusive on both ends.
+    const int width = box.max.x - box.min.x + 1;
+    const int height = box.max.y - box.min.y + 1;
+
+    const Imf::ChannelList & channels = inputFile.header().channels();
+
+    // Count channels.
+    uint channelCount = 0;
+    for (Imf::ChannelList::ConstIterator it = channels.begin(); it != channels.end(); ++it)
+    {
+        channelCount++;
+    }
+
+    // Allocate FloatImage.
+    AutoPtr<FloatImage> fimage(new FloatImage());
+    fimage->allocate(channelCount, width, height);
+
+    // Describe image's layout with a framebuffer.
+    // NOTE(review): channelIndexFromName can return an index that is not below channelCount
+    // (for example a file that only has an "A" channel); confirm how that should be handled.
+    Imf::FrameBuffer frameBuffer;
+    for (Imf::ChannelList::ConstIterator it = channels.begin(); it != channels.end(); ++it)
+    {
+        const int channelIndex = channelIndexFromName(it.name());
+
+        // OpenEXR addresses pixel (x, y) at base + x * xStride + y * yStride using data window
+        // coordinates, so the base is shifted to make the window origin land on element 0.
+        float * pixels = fimage->channel(channelIndex);
+        char * base = (char *)(pixels - box.min.x - box.min.y * width);
+
+        frameBuffer.insert(it.name(), Imf::Slice(Imf::FLOAT, base, sizeof(float), sizeof(float) * width));
+    }
+
+    // Read it.
+    inputFile.setFrameBuffer (frameBuffer);
+    inputFile.readPixels (box.min.y, box.max.y);
+
+    return fimage.release();
+}
+
+/// Write up to four FloatImage components to an OpenEXR file as 32 bit float channels
+/// named "R", "G", "B" and "A", in that order.
+///
+/// @param fileName       Path of the file to create.
+/// @param fimage         Source image.
+/// @param base_component Index of the first component to write.
+/// @param num_components Number of consecutive components to write, between 1 and 4.
+/// @return Always true.
+static bool saveFloatEXR(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components)
+{
+    nvCheck(fileName != NULL);
+    nvCheck(fimage != NULL);
+    nvCheck(base_component + num_components <= fimage->componentCount());
+    nvCheck(num_components > 0 && num_components <= 4);
+
+    const char * channelNames[] = {"R", "G", "B", "A"};
+
+    const int imageWidth = fimage->width();
+    const int imageHeight = fimage->height();
+
+    // Declare one float channel per saved component.
+    Imf::Header header (imageWidth, imageHeight);
+    for (uint k = 0; k != num_components; ++k)
+    {
+        header.channels().insert(channelNames[k], Imf::Channel(Imf::FLOAT));
+    }
+
+    Imf::OutputFile file(fileName, header);
+
+    // Point every channel at the matching plane of the source image.
+    Imf::FrameBuffer frameBuffer;
+    const size_t pixelStride = sizeof(float);
+    const size_t rowStride = sizeof(float) * imageWidth;
+    for (uint k = 0; k != num_components; ++k)
+    {
+        char * plane = (char *) fimage->channel(base_component + k);
+        frameBuffer.insert(channelNames[k], Imf::Slice(Imf::FLOAT, plane, pixelStride, rowStride));
+    }
+
+    file.setFrameBuffer(frameBuffer);
+    file.writePixels(imageHeight);
+
+    return true;
+}
+
+#endif // defined(HAVE_OPENEXR)
+
+
+#if defined(HAVE_FREEIMAGE)
+
+/// FreeImage read callback: serializes size * count bytes from the Stream behind the
+/// handle into buffer and reports count items, whatever the stream state.
+static unsigned DLL_CALLCONV ReadProc(void *buffer, unsigned size, unsigned count, fi_handle handle)
+{
+    Stream * stream = (Stream *) handle;
+    const unsigned byteCount = size * count;
+
+    stream->serialize(buffer, byteCount);
+
+    return count;
+}
+
+/// FreeImage write callback: serializes size * count bytes from buffer through the Stream
+/// behind the handle and reports count items, whatever the stream state.
+static unsigned DLL_CALLCONV WriteProc(void *buffer, unsigned size, unsigned count, fi_handle handle)
+{
+    Stream * stream = (Stream *) handle;
+    const unsigned byteCount = size * count;
+
+    stream->serialize(buffer, byteCount);
+
+    return count;
+}
+
+/// FreeImage seek callback: moves the Stream behind the handle relative to its start,
+/// its end or its current position.
+///
+/// @return 0 after seeking, 1 when origin is not SEEK_SET, SEEK_END or SEEK_CUR.
+static int DLL_CALLCONV SeekProc(fi_handle handle, long offset, int origin)
+{
+    Stream * stream = (Stream *) handle;
+
+    if (origin == SEEK_SET)
+    {
+        stream->seek(offset);
+    }
+    else if (origin == SEEK_END)
+    {
+        stream->seek(stream->size() + offset);
+    }
+    else if (origin == SEEK_CUR)
+    {
+        stream->seek(stream->tell() + offset);
+    }
+    else
+    {
+        // Unknown origin.
+        return 1;
+    }
+
+    return 0;
+}
+
+/// FreeImage tell callback: reports the current position of the Stream behind the handle.
+static long DLL_CALLCONV TellProc(fi_handle handle)
+{
+    return ((Stream *) handle)->tell();
+}
+
+
+/// Load an image through FreeImage and convert it to a 32 bit Image.
+///
+/// Non-bitmap types are converted to FIT_BITMAP and anything that is not 32 bits per pixel
+/// is converted to 32 bits. The rows are copied in reverse order because FreeImage stores
+/// its scanlines bottom to top.
+///
+/// @param fif FreeImage format to decode.
+/// @param s   Stream to read from.
+/// @return A newly allocated image owned by the caller, or NULL when loading or one of the
+///         conversions fails.
+Image * nv::ImageIO::loadFreeImage(FREE_IMAGE_FORMAT fif, Stream & s)
+{
+    nvCheck(!s.isError());
+
+    FreeImageIO io;
+    io.read_proc = ReadProc;
+    io.write_proc = NULL;
+    io.seek_proc = SeekProc;
+    io.tell_proc = TellProc;
+
+    FIBITMAP * bitmap = FreeImage_LoadFromHandle(fif, &io, (fi_handle)&s, 0);
+
+    if (bitmap == NULL)
+    {
+        return NULL;
+    }
+
+    const int w = FreeImage_GetWidth(bitmap);
+    const int h = FreeImage_GetHeight(bitmap);
+
+    if (FreeImage_GetImageType(bitmap) != FIT_BITMAP)
+    {
+        // @@ Use tone mapping?
+        FIBITMAP * tmp = FreeImage_ConvertToType(bitmap, FIT_BITMAP, true);
+        FreeImage_Unload(bitmap);
+        bitmap = tmp;
+
+        // The conversion can fail; without a bitmap there are no scanlines to copy.
+        if (bitmap == NULL)
+        {
+            return NULL;
+        }
+    }
+
+    nvDebugCheck(FreeImage_GetImageType(bitmap) == FIT_BITMAP);
+    if (FreeImage_GetBPP(bitmap) != 32)
+    {
+        FIBITMAP * tmp = FreeImage_ConvertTo32Bits(bitmap);
+        FreeImage_Unload(bitmap);
+        bitmap = tmp;
+
+        if (bitmap == NULL)
+        {
+            return NULL;
+        }
+    }
+
+
+    Image * image = new Image();
+    image->allocate(w, h, 1); // freeimage can only load 2d images:
+
+    // Copy the image over to our internal format, FreeImage has the scanlines bottom to top though.
+    for (int y=0; y < h; y++)
+    {
+        const void * src = FreeImage_GetScanLine(bitmap, h - y - 1);
+        void * dst = image->scanline(y);
+
+        memcpy(dst, src, 4 * w);
+    }
+
+    FreeImage_Unload(bitmap);
+
+    return image;
+}
+
+/// Load an image through FreeImage into a FloatImage.
+///
+/// Supported FreeImage types and the resulting component count:
+///   FIT_BITMAP  -> 4 (converted to 32 bits first, each channel divided by 255)
+///   FIT_FLOAT   -> 1
+///   FIT_UINT16  -> 1 (divided by 65535)
+///   FIT_COMPLEX -> 2 (real, imaginary)
+///   FIT_RGBF    -> 3
+///   FIT_RGBAF   -> 4
+/// Rows are read in reverse order because FreeImage stores its scanlines bottom to top.
+///
+/// @param fif FreeImage format to decode.
+/// @param s   Stream to read from.
+/// @return A newly allocated image owned by the caller, or NULL when loading fails or the
+///         image type is not supported.
+FloatImage * nv::ImageIO::loadFloatFreeImage(FREE_IMAGE_FORMAT fif, Stream & s)
+{
+    nvCheck(!s.isError());
+
+    FreeImageIO io;
+    io.read_proc = ReadProc;
+    io.write_proc = NULL;
+    io.seek_proc = SeekProc;
+    io.tell_proc = TellProc;
+
+    FIBITMAP * bitmap = FreeImage_LoadFromHandle(fif, &io, (fi_handle)&s, 0);
+
+    if (bitmap == NULL)
+    {
+        return NULL;
+    }
+
+    const int w = FreeImage_GetWidth(bitmap);
+    const int h = FreeImage_GetHeight(bitmap);
+
+    FREE_IMAGE_TYPE fit = FreeImage_GetImageType(bitmap);
+
+    FloatImage * floatImage = new FloatImage();
+
+    switch (fit)
+    {
+        case FIT_BITMAP:
+            {
+                // Bitmaps come in several bit depths; normalize to 32 bits so that every
+                // pixel can be read as a Color32.
+                FIBITMAP * tmp = FreeImage_ConvertTo32Bits(bitmap);
+
+                if (tmp == NULL)
+                {
+                    delete floatImage;
+                    floatImage = NULL;
+                    break;
+                }
+
+                floatImage->allocate(4, w, h);
+
+                for (int y=0; y < h; y++)
+                {
+                    // Read from the converted copy, not from the original bitmap.
+                    const Color32 * src = (const Color32 *)FreeImage_GetScanLine(tmp, h - y - 1 );
+
+                    float * r = floatImage->scanline(y, 0);
+                    float * g = floatImage->scanline(y, 1);
+                    float * b = floatImage->scanline(y, 2);
+                    float * a = floatImage->scanline(y, 3);
+
+                    for (int x=0; x < w; x++)
+                    {
+                        r[x] = float(src[x].r) / 255.0f;
+                        g[x] = float(src[x].g) / 255.0f;
+                        b[x] = float(src[x].b) / 255.0f;
+                        a[x] = float(src[x].a) / 255.0f;
+                    }
+                }
+
+                FreeImage_Unload(tmp);
+            }
+            break;
+        case FIT_FLOAT:
+            floatImage->allocate(1, w, h);
+
+            for (int y=0; y < h; y++)
+            {
+                const float * src = (const float *)FreeImage_GetScanLine(bitmap, h - y - 1 );
+                float * dst = floatImage->scanline(y, 0);
+
+                for (int x=0; x < w; x++)
+                {
+                    dst[x] = src[x];
+                }
+            }
+            break;
+        case FIT_UINT16:
+            floatImage->allocate(1, w, h);
+
+            for (int y=0; y < h; y++)
+            {
+                const uint16 * src = (const uint16 *)FreeImage_GetScanLine(bitmap, h - y - 1 );
+                float * dst = floatImage->scanline(y, 0);
+
+                for (int x=0; x < w; x++)
+                {
+                    dst[x] = float(src[x]) / 65535;
+                }
+            }
+            break;
+        case FIT_COMPLEX:
+            floatImage->allocate(2, w, h);
+
+            for (int y=0; y < h; y++)
+            {
+                const FICOMPLEX * src = (const FICOMPLEX *)FreeImage_GetScanLine(bitmap, h - y - 1 );
+
+                float * dst_real = floatImage->scanline(y, 0);
+                float * dst_imag = floatImage->scanline(y, 1);
+
+                for (int x=0; x < w; x++)
+                {
+                    dst_real[x] = (float)src[x].r;
+                    dst_imag[x] = (float)src[x].i;
+                }
+            }
+            break;
+        case FIT_RGBF:
+            floatImage->allocate(3, w, h);
+
+            for (int y=0; y < h; y++)
+            {
+                const FIRGBF * src = (const FIRGBF *)FreeImage_GetScanLine(bitmap, h - y - 1 );
+
+                float * dst_red = floatImage->scanline(y, 0);
+                float * dst_green = floatImage->scanline(y, 1);
+                float * dst_blue = floatImage->scanline(y, 2);
+
+                for (int x=0; x < w; x++)
+                {
+                    dst_red[x] = src[x].red;
+                    dst_green[x] = src[x].green;
+                    dst_blue[x] = src[x].blue;
+                }
+            }
+            break;
+        case FIT_RGBAF:
+            floatImage->allocate(4, w, h);
+
+            for (int y=0; y < h; y++)
+            {
+                const FIRGBAF * src = (const FIRGBAF *)FreeImage_GetScanLine(bitmap, h - y - 1 );
+
+                float * dst_red = floatImage->scanline(y, 0);
+                float * dst_green = floatImage->scanline(y, 1);
+                float * dst_blue = floatImage->scanline(y, 2);
+                float * dst_alpha = floatImage->scanline(y, 3);
+
+                for (int x=0; x < w; x++)
+                {
+                    dst_red[x] = src[x].red;
+                    dst_green[x] = src[x].green;
+                    dst_blue[x] = src[x].blue;
+                    dst_alpha[x] = src[x].alpha;
+                }
+            }
+            break;
+        default:
+            // Unsupported pixel type.
+            delete floatImage;
+            floatImage = NULL;
+    }
+
+    FreeImage_Unload(bitmap);
+
+    return floatImage;
+}
+
+/// Save an Image through FreeImage as a 32 bit bitmap.
+///
+/// Rows are copied in reverse order because FreeImage stores its scanlines bottom to top.
+///
+/// @param fif  FreeImage format to encode.
+/// @param s    Stream to write to.
+/// @param img  Image to save.
+/// @param tags Optional metadata; not written yet.
+/// @return true when FreeImage reports success, false otherwise (including a failed
+///         bitmap allocation).
+bool nv::ImageIO::saveFreeImage(FREE_IMAGE_FORMAT fif, Stream & s, const Image * img, const char ** tags)
+{
+    nvCheck(!s.isError());
+
+    FreeImageIO io;
+    io.read_proc = NULL;
+    io.write_proc = WriteProc;
+    io.seek_proc = SeekProc;
+    io.tell_proc = TellProc;
+
+    const uint w = img->width();
+    const uint h = img->height();
+
+    FIBITMAP * bitmap = FreeImage_Allocate(w, h, 32);
+
+    // Without a bitmap there are no scanlines to copy into.
+    if (bitmap == NULL)
+    {
+        return false;
+    }
+
+    for (uint i = 0; i < h; i++)
+    {
+        uint8 * scanline = FreeImage_GetScanLine(bitmap, i);
+        memcpy(scanline, img->scanline(h - i - 1), w * sizeof(Color32));
+    }
+
+    if (tags != NULL)
+    {
+    #pragma NV_MESSAGE("TODO: Save image metadata")
+        //FreeImage_SetMetadata(
+    }
+
+    bool result = FreeImage_SaveToHandle(fif, bitmap, &io, (fi_handle)&s, 0) != 0;
+
+    FreeImage_Unload(bitmap);
+
+    return result;
+}
+
+/// Save a range of FloatImage components through FreeImage as a floating point bitmap.
+///
+/// One component is saved as FIT_FLOAT, three as FIT_RGBF and four as FIT_RGBAF; any other
+/// count is rejected. Rows are written in reverse order because FreeImage stores its
+/// scanlines bottom to top, matching what loadFloatFreeImage does when reading.
+///
+/// @param fif            FreeImage format to encode.
+/// @param s              Stream to write to.
+/// @param img            Source image.
+/// @param baseComponent  Index of the first component to write.
+/// @param componentCount Number of consecutive components to write (1, 3 or 4).
+/// @return true when FreeImage reports success, false otherwise.
+bool nv::ImageIO::saveFloatFreeImage(FREE_IMAGE_FORMAT fif, Stream & s, const FloatImage * img, uint baseComponent, uint componentCount)
+{
+    nvCheck(!s.isError());
+
+    FreeImageIO io;
+    io.read_proc = NULL;
+    io.write_proc = WriteProc;
+    io.seek_proc = SeekProc;
+    io.tell_proc = TellProc;
+
+    const uint w = img->width();
+    const uint h = img->height();
+
+    FREE_IMAGE_TYPE type;
+    if (componentCount == 1)
+    {
+        type = FIT_FLOAT;
+    }
+    else if (componentCount == 3)
+    {
+        type = FIT_RGBF;
+    }
+    else if (componentCount == 4)
+    {
+        type = FIT_RGBAF;
+    }
+    else {
+        return false;
+    }
+
+
+    FIBITMAP * bitmap = FreeImage_AllocateT(type, w, h);
+
+    // Without a bitmap there are no scanlines to fill.
+    if (bitmap == NULL)
+    {
+        return false;
+    }
+
+    for (uint y = 0; y < h; y++)
+    {
+        // Row y of the image goes to scanline h - y - 1 of the bottom-up bitmap.
+        float * scanline = (float *)FreeImage_GetScanLine(bitmap, h - y - 1);
+
+        for (uint x = 0; x < w; x++)
+        {
+            for (uint c = 0; c < componentCount; c++)
+            {
+                scanline[x * componentCount + c] = img->pixel(x, y, baseComponent + c);
+            }
+        }
+    }
+
+    bool result = FreeImage_SaveToHandle(fif, bitmap, &io, (fi_handle)&s, 0) != 0;
+
+    FreeImage_Unload(bitmap);
+
+    return result;
+}
+
+#endif // defined(HAVE_FREEIMAGE)
+
+
+#if defined(HAVE_STBIMAGE)
+
+/// Load an image with stb_image, always decoding to four 8 bit channels.
+///
+/// The whole stream is read into a temporary buffer first. The image format is set to
+/// ARGB when the file had four channels and to RGB otherwise.
+///
+/// @param s Stream to read; see the assumptions noted below.
+/// @return A newly allocated image owned by the caller, or NULL when decoding fails.
+static Image * loadSTB(Stream & s)
+{
+    // @@ Assumes stream cursor is at the beginning and that image occupies the whole stream.
+    const int size = s.size();
+    uint8 * buffer = new uint8[size];
+
+    s.serialize(buffer, size);
+
+    int w, h, n;
+    uint8 * data = stbi_load_from_memory(buffer, size, &w, &h, &n, 4);
+
+    // The buffer came from new[], so it has to be released with delete [].
+    delete [] buffer;
+
+    if (data != NULL) {
+        Image * img = new Image;
+        img->allocate(w, h);
+        img->setFormat(n == 4 ? Image::Format_ARGB : Image::Format_RGB);
+
+        for (int y = 0; y < h; ++y)
+        {
+            nv::Color32* dest = img->scanline(y);
+            uint8* src = data + y * w * 4;
+
+            // stb_image returns the channels in R, G, B, A order.
+            for (int x = 0; x < w; ++x)
+            {
+                dest[x].r = src[x * 4 + 0];
+                dest[x].g = src[x * 4 + 1];
+                dest[x].b = src[x * 4 + 2];
+                dest[x].a = src[x * 4 + 3];
+            }
+        }
+
+        free(data);
+
+        return img;
+    }
+
+    return NULL;
+}
+
+/// Load an image with stb_image as floating point data, keeping the channel count of the
+/// file, and split the interleaved pixels into one FloatImage plane per channel.
+///
+/// @param s Stream to read; see the assumptions noted below.
+/// @return A newly allocated image owned by the caller, or NULL when decoding fails.
+static FloatImage * loadFloatSTB(Stream & s)
+{
+    // @@ Assumes stream cursor is at the beginning and that image occupies the whole stream.
+    const int size = s.size();
+    uint8 * buffer = new uint8[size];
+
+    s.serialize(buffer, size);
+
+    int w, h, n;
+    float * data = stbi_loadf_from_memory(buffer, size, &w, &h, &n, 0);
+
+    // The buffer came from new[], so it has to be released with delete [].
+    delete [] buffer;
+
+    // Copy to image.
+    if (data != NULL) {
+        FloatImage * img = new FloatImage;
+        img->allocate(n, w, h);
+
+        const int count = w * h;
+
+        for (int c = 0; c < n; c++) {
+            float * dst = img->channel(c);
+
+            for (int i = 0; i < count; i++) {
+                dst[i] = data[i*n + c];
+            }
+        }
+
+        // Release the decoded pixels, as loadSTB does; they were leaked before.
+        free(data);
+
+        return img;
+    }
+
+    return NULL;
+}
+
+#endif // defined(HAVE_STBIMAGE)
+
+
+
+
+
+/// Open fileName for reading and load it as an Image.
+///
+/// @return The loaded image, or NULL when the file stream is in an error state or the
+///         stream based loader returns NULL.
+Image * nv::ImageIO::load(const char * fileName)
+{
+    nvDebugCheck(fileName != NULL);
+
+    StdInputStream input(fileName);
+
+    return input.isError() ? NULL : ImageIO::load(fileName, input);
+}
+
+/// Load an Image from a stream, choosing the decoder from the extension of fileName.
+///
+/// TGA and PSD are always tried by extension; JPEG, PNG, FreeImage and stb_image are used
+/// when the matching HAVE_* macro is defined, in that order.
+///
+/// @param fileName Name used to select the decoder.
+/// @param s        Loading stream to decode.
+/// @return Whatever the selected decoder returns, or NULL when none was selected.
+Image * nv::ImageIO::load(const char * fileName, Stream & s)
+{
+    nvDebugCheck(fileName != NULL);
+    nvDebugCheck(s.isLoading());
+
+    const char * ext = Path::extension(fileName);
+
+    // Decoders that are always available.
+    if (strCaseCmp(ext, ".tga") == 0) return loadTGA(s);
+    if (strCaseCmp(ext, ".psd") == 0) return loadPSD(s);
+
+    /*if (strCaseCmp(ext, ".ppm") == 0) {
+        return loadPPM(s);
+    }*/
+
+#if defined(HAVE_JPEG)
+    if (strCaseCmp(ext, ".jpg") == 0 || strCaseCmp(ext, ".jpeg") == 0) return loadJPG(s);
+#endif
+
+#if defined(HAVE_PNG)
+    if (strCaseCmp(ext, ".png") == 0) return loadPNG(s);
+#endif
+
+#if defined(HAVE_FREEIMAGE)
+    // Let FreeImage handle any other format it recognizes by file name and can read.
+    const FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(fileName);
+    if (format != FIF_UNKNOWN && FreeImage_FIFSupportsReading(format)) return loadFreeImage(format, s);
+#endif
+
+#if defined(HAVE_STBIMAGE)
+    // Last resort: stb_image.
+    return loadSTB(s);
+#endif
+
+    return NULL;
+}
+
+/// Save an Image to a stream, choosing the encoder from the extension of fileName.
+///
+/// TGA and PPM are always available; PNG and FreeImage are used when the matching HAVE_*
+/// macro is defined. The tags are only forwarded to the PNG and FreeImage encoders.
+///
+/// @return The result of the selected encoder, or false when no encoder matches.
+bool nv::ImageIO::save(const char * fileName, Stream & s, const Image * img, const char ** tags/*=NULL*/)
+{
+    nvDebugCheck(fileName != NULL);
+    nvDebugCheck(s.isSaving());
+    nvDebugCheck(img != NULL);
+
+    const char * ext = Path::extension(fileName);
+
+    // Encoders that are always available.
+    if (strCaseCmp(ext, ".tga") == 0) return saveTGA(s, img);
+    if (strCaseCmp(ext, ".ppm") == 0) return savePPM(s, img);
+
+#if defined(HAVE_PNG)
+    if (strCaseCmp(ext, ".png") == 0) return savePNG(s, img, tags);
+#endif
+
+#if defined(HAVE_FREEIMAGE)
+    // Let FreeImage handle any other format it recognizes by file name and can write.
+    const FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(fileName);
+    if (format != FIF_UNKNOWN && FreeImage_FIFSupportsWriting(format)) return saveFreeImage(format, s, img, tags);
+#endif
+
+    return false;
+}
+
+/// Open fileName for writing and save the image to it.
+///
+/// @return false when the file stream is in an error state, otherwise the result of the
+///         stream based save.
+bool nv::ImageIO::save(const char * fileName, const Image * img, const char ** tags/*=NULL*/)
+{
+    nvDebugCheck(fileName != NULL);
+    nvDebugCheck(img != NULL);
+
+    StdOutputStream output(fileName);
+
+    return output.isError() ? false : ImageIO::save(fileName, output, img, tags);
+}
+
+/// Open fileName for reading and load it as a FloatImage.
+///
+/// @return The loaded image, or NULL when the file stream is in an error state or the
+///         stream based loader returns NULL.
+FloatImage * nv::ImageIO::loadFloat(const char * fileName)
+{
+    nvDebugCheck(fileName != NULL);
+
+    StdInputStream input(fileName);
+
+    return input.isError() ? NULL : loadFloat(fileName, input);
+}
+
+/// Load a FloatImage from a stream, choosing the decoder from the extension of fileName.
+///
+/// TIFF, OpenEXR and FreeImage are tried first when the matching HAVE_* macro is defined,
+/// then the float DDS loader. When none of them produces an image the file is loaded as
+/// an ordinary Image and converted to float.
+///
+/// @param fileName Name used to select the decoder; the TIFF loader also receives it.
+/// @param s        Stream to decode.
+/// @return A newly allocated image owned by the caller, or NULL on failure.
+FloatImage * nv::ImageIO::loadFloat(const char * fileName, Stream & s)
+{
+    nvDebugCheck(fileName != NULL);
+
+    const char * ext = Path::extension(fileName);
+
+    /*if (strCaseCmp(ext, ".pfm") == 0) {
+        return loadFloatPFM(s);
+    }*/
+
+#if defined(HAVE_TIFF)
+    #pragma NV_MESSAGE("TODO: Load TIFF from stream.")
+    if (strCaseCmp(ext, ".tif") == 0 || strCaseCmp(ext, ".tiff") == 0) return loadFloatTIFF(fileName, s);
+#endif
+
+#if defined(HAVE_OPENEXR)
+    #pragma NV_MESSAGE("TODO: Load EXR from stream.")
+    if (strCaseCmp(ext, ".exr") == 0) return loadFloatEXR(fileName, s);
+#endif
+
+#if defined(HAVE_FREEIMAGE)
+    const FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(fileName);
+    if (format != FIF_UNKNOWN && FreeImage_FIFSupportsReading(format)) return loadFloatFreeImage(format, s);
+#endif
+
+    if (strCaseCmp(ext, ".dds") == 0) {
+        const uint startPosition = s.tell(); // Save stream position.
+
+        FloatImage * ddsImage = loadFloatDDS(s);
+        if (ddsImage != NULL) {
+            return ddsImage;
+        }
+
+        // Rewind so that the fallback below sees the stream from the same position.
+        s.seek(startPosition);
+    }
+
+    // Try to load as an RGBA8 image and convert to float.
+    AutoPtr<Image> ldrImage(load(fileName, s));
+    if (ldrImage != NULL) {
+        return new FloatImage(ldrImage.ptr());
+    }
+
+    return NULL;
+}
+
+/// Save a range of FloatImage components to a stream, choosing the encoder from the
+/// extension of fileName.
+///
+/// DDS is tried first, then FreeImage when HAVE_FREEIMAGE is defined. Otherwise up to four
+/// components are converted to an Image and saved with ImageIO::save.
+///
+/// @param fileName       Name used to select the encoder.
+/// @param s              Stream to write to.
+/// @param fimage         Source image.
+/// @param baseComponent  Index of the first component to save.
+/// @param componentCount Number of components to save; 0 means every component from
+///                       baseComponent to the last one.
+/// @return false when the requested range does not fit in the image or no encoder
+///         succeeds, otherwise the result of the selected encoder.
+bool nv::ImageIO::saveFloat(const char * fileName, Stream & s, const FloatImage * fimage, uint baseComponent, uint componentCount)
+{
+    const uint totalCount = fimage->componentCount();
+
+    // Reject ranges that run past the last component. The comparisons are arranged so
+    // that the unsigned subtraction below cannot wrap around.
+    if (baseComponent > totalCount) {
+        return false;
+    }
+    if (componentCount == 0) {
+        componentCount = totalCount - baseComponent;
+    }
+    if (componentCount > totalCount - baseComponent) {
+        return false;
+    }
+
+    const char * extension = Path::extension(fileName);
+
+    if (strCaseCmp(extension, ".dds") == 0) {
+        return saveFloatDDS(s, fimage, baseComponent, componentCount);
+    }
+
+    /*if (strCaseCmp(extension, ".pfm") == 0) {
+        return saveFloatPFM(s, fimage, baseComponent, componentCount);
+    }*/
+
+#if defined(HAVE_FREEIMAGE)
+    FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName);
+    if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsWriting(fif)) {
+        return saveFloatFreeImage(fif, s, fimage, baseComponent, componentCount);
+    }
+#endif
+
+    // If everything else fails, save as LDR.
+    if (componentCount <= 4)
+    {
+        AutoPtr<Image> image(fimage->createImage(baseComponent, componentCount));
+        nvCheck(image != NULL);
+
+        if (componentCount == 1)
+        {
+            // Replicate the single component into green and blue to get a gray image.
+            Color32 * c = image->pixels();
+            const uint count = image->width() * image->height();
+            for (uint i = 0; i < count; i++)
+            {
+                c[i].b = c[i].g = c[i].r;
+            }
+        }
+
+        if (componentCount == 4)
+        {
+            image->setFormat(Image::Format_ARGB);
+        }
+
+        return ImageIO::save(fileName, s, image.ptr());
+    }
+
+    return false;
+}
+
+/// Save a range of FloatImage components to a file, choosing the encoder from the
+/// extension of fileName.
+///
+/// OpenEXR and TIFF write the file themselves when the matching HAVE_* macro is defined;
+/// every other format goes through the stream based saveFloat.
+///
+/// @param fileName       Path of the file to create.
+/// @param fimage         Source image.
+/// @param baseComponent  Index of the first component to save.
+/// @param componentCount Number of components to save; 0 means every component from
+///                       baseComponent to the last one.
+/// @return false when the requested range does not fit in the image, the file cannot be
+///         opened or no encoder succeeds, otherwise the result of the selected encoder.
+bool nv::ImageIO::saveFloat(const char * fileName, const FloatImage * fimage, uint baseComponent, uint componentCount)
+{
+    const uint totalCount = fimage->componentCount();
+
+    // Reject ranges that run past the last component. The comparisons are arranged so
+    // that the unsigned subtraction below cannot wrap around.
+    if (baseComponent > totalCount) {
+        return false;
+    }
+    if (componentCount == 0) {
+        componentCount = totalCount - baseComponent;
+    }
+    if (componentCount > totalCount - baseComponent) {
+        return false;
+    }
+
+    const char * extension = Path::extension(fileName);
+
+#if defined(HAVE_OPENEXR)
+    if (strCaseCmp(extension, ".exr") == 0) {
+        return saveFloatEXR(fileName, fimage, baseComponent, componentCount);
+    }
+#endif
+
+#if defined(HAVE_TIFF)
+    if (strCaseCmp(extension, ".tif") == 0 || strCaseCmp(extension, ".tiff") == 0) {
+        return saveFloatTIFF(fileName, fimage, baseComponent, componentCount);
+    }
+#endif
+
+    StdOutputStream stream(fileName);
+
+    if (stream.isError()) {
+        return false;
+    }
+
+    return saveFloat(fileName, stream, fimage, baseComponent, componentCount);
+}
diff --git a/src/nvimage/ImageIO.h b/src/nvimage/ImageIO.h
index ee3bfb0..25490ab 100644
--- a/src/nvimage/ImageIO.h
+++ b/src/nvimage/ImageIO.h
@@ -1,37 +1,37 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#pragma once
-#ifndef NV_IMAGE_IMAGEIO_H
-#define NV_IMAGE_IMAGEIO_H
-
-#include "nvimage.h"
-
-#include "nvcore/StrLib.h"
-
-
-namespace nv
-{
-    class Image;
-    class FloatImage;
-    class Stream;
-
-    namespace ImageIO
-    {
-        NVIMAGE_API Image * load(const char * fileName);
-        NVIMAGE_API Image * load(const char * fileName, Stream & s);
-
-        NVIMAGE_API FloatImage * loadFloat(const char * fileName);
-        NVIMAGE_API FloatImage * loadFloat(const char * fileName, Stream & s);
-
-        NVIMAGE_API bool save(const char * fileName, const Image * img, const char ** tags=NULL); // NULL terminated list.
-        NVIMAGE_API bool save(const char * fileName, Stream & s, const Image * img, const char ** tags=NULL);
-
-        NVIMAGE_API bool saveFloat(const char * fileName, const FloatImage * fimage, uint baseComponent, uint componentCount);
-        NVIMAGE_API bool saveFloat(const char * fileName, Stream & s, const FloatImage * fimage, uint baseComponent, uint componentCount);
-
-    } // ImageIO namespace
-
-} // nv namespace
-
-
-#endif // NV_IMAGE_IMAGEIO_H
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_IMAGE_IMAGEIO_H
+#define NV_IMAGE_IMAGEIO_H
+
+#include "nvimage.h"
+
+#include "nvcore/StrLib.h"
+
+
+namespace nv
+{
+    class Image;
+    class FloatImage;
+    class Stream;
+
+    namespace ImageIO // Image loading and saving; the format is chosen from the extension of fileName.
+    {
+        NVIMAGE_API Image * load(const char * fileName); // Opens fileName for reading; NULL on failure.
+        NVIMAGE_API Image * load(const char * fileName, Stream & s); // Decodes from s; fileName selects the decoder.
+
+        NVIMAGE_API FloatImage * loadFloat(const char * fileName); // Opens fileName for reading; NULL on failure.
+        NVIMAGE_API FloatImage * loadFloat(const char * fileName, Stream & s); // fileName selects the decoder; the TIFF loader also opens it by name.
+
+        NVIMAGE_API bool save(const char * fileName, const Image * img, const char ** tags=NULL); // NULL terminated list.
+        NVIMAGE_API bool save(const char * fileName, Stream & s, const Image * img, const char ** tags=NULL);
+
+        NVIMAGE_API bool saveFloat(const char * fileName, const FloatImage * fimage, uint baseComponent, uint componentCount); // componentCount == 0 saves every component from baseComponent on.
+        NVIMAGE_API bool saveFloat(const char * fileName, Stream & s, const FloatImage * fimage, uint baseComponent, uint componentCount); // componentCount == 0 saves every component from baseComponent on.
+
+    } // ImageIO namespace
+
+} // nv namespace
+
+
+#endif // NV_IMAGE_IMAGEIO_H
diff --git a/src/nvimage/NormalMap.cpp b/src/nvimage/NormalMap.cpp
index e0b1092..404186d 100644
--- a/src/nvimage/NormalMap.cpp
+++ b/src/nvimage/NormalMap.cpp
@@ -27,6 +27,7 @@
 #include "Image.h"
 
 #include "nvmath/Color.inl"
+#include "nvmath/Vector.h"
 
 #include "nvcore/Ptr.h"
 
diff --git a/src/nvimage/NormalMap.h b/src/nvimage/NormalMap.h
index 3f13d42..fc484c9 100644
--- a/src/nvimage/NormalMap.h
+++ b/src/nvimage/NormalMap.h
@@ -1,59 +1,59 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#pragma once
-#ifndef NV_IMAGE_NORMALMAP_H
-#define NV_IMAGE_NORMALMAP_H
-
-#include "nvimage.h"
-#include "FloatImage.h"
-
-#include "nvmath/Vector.h"
-
-
-namespace nv
-{
-	class Image;
-
-	enum NormalMapFilter
-	{
-		NormalMapFilter_Sobel3x3,	// fine detail
-		NormalMapFilter_Sobel5x5,	// medium detail
-		NormalMapFilter_Sobel7x7,	// large detail
-		NormalMapFilter_Sobel9x9,	// very large
-	};
-
-	// @@ These two functions should be deprecated:
-	FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter = NormalMapFilter_Sobel3x3);
-	FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights);
-
-	FloatImage * createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, Vector4::Arg filterWeights);
-
-	void normalizeNormalMap(FloatImage * img);
-
-	// @@ Add generation of DU/DV maps.
-
-
-} // nv namespace
-
-#endif // NV_IMAGE_NORMALMAP_H
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#pragma once
+#ifndef NV_IMAGE_NORMALMAP_H
+#define NV_IMAGE_NORMALMAP_H
+
+#include "nvimage.h"
+#include "FloatImage.h"
+
+#include "nvmath/Vector.h"
+
+
+namespace nv
+{
+	class Image;
+
+	enum NormalMapFilter	// Filter choices accepted by the first createNormalMap overload.
+	{
+		NormalMapFilter_Sobel3x3,	// fine detail
+		NormalMapFilter_Sobel5x5,	// medium detail
+		NormalMapFilter_Sobel7x7,	// large detail
+		NormalMapFilter_Sobel9x9,	// very large
+	};
+
+	// @@ These two functions (the overloads that take an Image) should be deprecated:
+	FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter = NormalMapFilter_Sobel3x3);
+	FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights);
+
+	FloatImage * createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, Vector4::Arg filterWeights);	// FloatImage input; takes no heightWeights.
+
+	void normalizeNormalMap(FloatImage * img);	// NOTE(review): presumably modifies img in place, since it returns void; confirm in NormalMap.cpp.
+
+	// @@ Add generation of DU/DV maps.
+
+
+} // nv namespace
+
+#endif // NV_IMAGE_NORMALMAP_H
diff --git a/src/nvimage/PixelFormat.h b/src/nvimage/PixelFormat.h
index 78c0a68..8610c6e 100644
--- a/src/nvimage/PixelFormat.h
+++ b/src/nvimage/PixelFormat.h
@@ -1,118 +1,118 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#pragma once
-#ifndef NV_IMAGE_PIXELFORMAT_H
-#define NV_IMAGE_PIXELFORMAT_H
-
-#include "nvimage.h"
-
-
-namespace nv
-{
-    namespace PixelFormat
-    {
-
-        // Convert component @a c having @a inbits to the returned value having @a outbits.
-        inline uint convert(uint c, uint inbits, uint outbits)
-        {
-            if (inbits == 0)
-            {
-            	return 0;
-            }
-            else if (inbits >= outbits)
-            {
-            	// truncate
-            	return c >> (inbits - outbits);
-            }
-            else
-            {
-            	// bitexpand
-                return (c << (outbits - inbits)) | convert(c, inbits, outbits - inbits);
-            }
-        }
-
-        // Get pixel component shift and size given its mask.
-        inline void maskShiftAndSize(uint mask, uint * shift, uint * size)
-        {
-            if (!mask)
-            {
-                *shift = 0;
-                *size = 0;
-                return;
-            }
-        
-            *shift = 0;
-            while((mask & 1) == 0) {
-                ++(*shift);
-                mask >>= 1;
-            }
-        
-            *size = 0;
-            while((mask & 1) == 1) {
-                ++(*size);
-                mask >>= 1;
-            }
-        }
-
-        inline float quantizeCeil(float f, int inbits, int outbits)
-        {
-            nvDebugCheck(f >= 0.0f && f <= 1.0f);
-            //uint i = f * (float(1 << inbits) - 1);
-            //i = convert(i, inbits, outbits);
-            //float result = float(i) / (float(1 << outbits) - 1);
-            //nvCheck(result >= f);
-            float result;
-            int offset = 0;
-            do {
-                uint i = offset + uint(f * (float(1 << inbits) - 1));
-                i = convert(i, inbits, outbits);
-                result = float(i) / (float(1 << outbits) - 1);
-                offset++;
-            } while (result < f);
-
-            return result;
-        }
-
-        /*
-        inline float quantizeRound(float f, int bits)
-        {
-            nvDebugCheck(f >= 0.0f && f <= 1.0f);
-            float scale = float(1 << bits);
-            return fround(f * scale) / scale;
-        }
-
-        inline float quantizeFloor(float f, int bits)
-        {
-            nvDebugCheck(f >= 0.0f && f <= 1.0f);
-            float scale = float(1 << bits);
-            return floor(f * scale) / scale;
-        }
-        */
-
-    } // PixelFormat namespace
-
-} // nv namespace
-
-
-#endif // NV_IMAGE_PIXELFORMAT_H
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#pragma once
+#ifndef NV_IMAGE_PIXELFORMAT_H
+#define NV_IMAGE_PIXELFORMAT_H
+
+#include "nvimage.h"
+
+
+namespace nv
+{
+    namespace PixelFormat
+    {
+
+        // Rescale component @a c from a width of @a inbits bits to a width of @a outbits bits.
+        // Narrowing drops the low bits; widening repeats the source bit pattern below the shifted value.
+        inline uint convert(uint c, uint inbits, uint outbits)
+        {
+            // A zero-width source carries no information (and would never advance the loop below).
+            if (inbits == 0) return 0;
+
+            uint expanded = 0;
+            uint remaining = outbits;
+            // bitexpand: emit whole copies of c from the top down while more than inbits bits remain.
+            while (remaining > inbits) {
+                remaining -= inbits;
+                expanded |= c << remaining;
+            }
+
+            // truncate: the last (possibly partial) copy keeps only the top 'remaining' bits of c.
+            return expanded | (c >> (inbits - remaining));
+        }
+
+        // Decompose a component bit mask into the position of its lowest set bit (@a shift) and the
+        // length of the run of set bits that starts there (@a size). Both are 0 for an empty mask.
+        inline void maskShiftAndSize(uint mask, uint * shift, uint * size)
+        {
+            uint lowBit = 0;
+            uint runLength = 0;
+            // The empty-mask test also guarantees that the first loop terminates.
+            if (mask != 0)
+            {
+                // Skip the zero bits below the component.
+                for (; (mask & 1) == 0; mask >>= 1) {
+                    ++lowBit;
+                }
+                // Count the contiguous set bits that follow.
+                for (; (mask & 1) != 0; mask >>= 1) {
+                    ++runLength;
+                }
+            }
+
+            *shift = lowBit;
+            *size = runLength;
+        }
+
+        inline float quantizeCeil(float f, int inbits, int outbits)
+        {
+            nvDebugCheck(f >= 0.0f && f <= 1.0f);
+
+            // Largest code representable at each precision.
+            const float inScale = float(1 << inbits) - 1;
+            const float outScale = float(1 << outbits) - 1;
+
+            // Quantize f at inbits, convert to outbits, and bump the input code until the result is not below f.
+            const uint base = uint(f * inScale);
+            float result;
+            for (int offset = 0; ; offset++) {
+                const uint code = convert(offset + base, inbits, outbits);
+                result = float(code) / outScale;
+                if (!(result < f)) break;
+            }
+            return result;
+        }
+
+        /*
+        inline float quantizeRound(float f, int bits)
+        {
+            nvDebugCheck(f >= 0.0f && f <= 1.0f);
+            float scale = float(1 << bits);
+            return fround(f * scale) / scale;
+        }
+
+        inline float quantizeFloor(float f, int bits)
+        {
+            nvDebugCheck(f >= 0.0f && f <= 1.0f);
+            float scale = float(1 << bits);
+            return floor(f * scale) / scale;
+        }
+        */
+
+    } // PixelFormat namespace
+
+} // nv namespace
+
+
+#endif // NV_IMAGE_PIXELFORMAT_H
diff --git a/src/nvimage/PsdFile.h b/src/nvimage/PsdFile.h
index 3f242c8..b4c625a 100644
--- a/src/nvimage/PsdFile.h
+++ b/src/nvimage/PsdFile.h
@@ -1,71 +1,71 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#pragma once
-#ifndef NV_IMAGE_PSDFILE_H
-#define NV_IMAGE_PSDFILE_H
-
-#include "nvcore/Stream.h"
-
-namespace nv
-{
-    enum PsdColorMode
-    {
-        PsdColorMode_Bitmap = 0,
-        PsdColorMode_GrayScale = 1,
-        PsdColorMode_Indexed = 2,
-        PsdColorMode_RGB = 3,
-        PsdColorMode_CMYK = 4,
-        PsdColorMode_MultiChannel = 7,
-        PsdColorMode_DuoTone = 8,
-        PsdColorMode_LabColor = 9
-    };
-
-    /// PSD header.
-    struct PsdHeader
-    {
-        uint32 signature;
-        uint16 version;
-        uint8 reserved[6];
-        uint16 channel_count;
-        uint32 height;
-        uint32 width;
-        uint16 depth;
-        uint16 color_mode;
-
-        bool isValid() const
-        {
-                return signature == 0x38425053;	// '8BPS'
-        }
-
-        bool isSupported() const
-        {
-            if (version != 1) {
-                nvDebug("*** bad version number %u\n", version);
-                return false;
-            }
-            if (channel_count > 4) {
-                return false;
-            }
-            if (depth != 8) { // @@ Add support for 16 bit depths.
-                return false;
-            }
-            if (color_mode != PsdColorMode_RGB) {
-                return false;
-            }
-            return true;
-        }
-    };
-
-
-    inline Stream & operator<< (Stream & s, PsdHeader & head)
-    {
-        s << head.signature << head.version;
-        for (int i = 0; i < 6; i++) {
-            s << head.reserved[i];
-        }
-        return s << head.channel_count << head.height << head.width << head.depth << head.color_mode;
-    }
-
-} // nv namespace
-
-#endif // NV_IMAGE_PSDFILE_H
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_IMAGE_PSDFILE_H
+#define NV_IMAGE_PSDFILE_H
+
+#include "nvcore/Stream.h"
+
+namespace nv
+{
+    enum PsdColorMode    // Compared against PsdHeader::color_mode; only PsdColorMode_RGB passes PsdHeader::isSupported().
+    {
+        PsdColorMode_Bitmap = 0,
+        PsdColorMode_GrayScale = 1,
+        PsdColorMode_Indexed = 2,
+        PsdColorMode_RGB = 3,
+        PsdColorMode_CMYK = 4,
+        PsdColorMode_MultiChannel = 7,
+        PsdColorMode_DuoTone = 8,
+        PsdColorMode_LabColor = 9
+    };
+
+    /// PSD header, with the checks used to decide whether a file can be handled.
+    struct PsdHeader
+    {
+        uint32 signature;       ///< Expected to be '8BPS', see isValid().
+        uint16 version;         ///< Only version 1 is accepted.
+        uint8 reserved[6];
+        uint16 channel_count;   ///< At most 4 channels are accepted.
+        uint32 height;
+        uint32 width;
+        uint16 depth;           ///< Bit depth; only 8 is accepted.
+        uint16 color_mode;      ///< Compared against PsdColorMode; only RGB is accepted.
+
+        /// True when the signature matches the PSD magic number.
+        bool isValid() const
+        {
+            const uint32 kMagic = 0x38425053;	// '8BPS'
+            return signature == kMagic;
+        }
+
+        /// True for version 1 headers with at most 4 channels, a depth of 8 and
+        /// RGB color mode. Only a version mismatch is reported through nvDebug.
+        bool isSupported() const
+        {
+            if (version != 1) {
+                nvDebug("*** bad version number %u\n", version);
+                return false;
+            }
+
+            // @@ Add support for 16 bit depths.
+            const bool channelsOk = (channel_count <= 4);
+            const bool depthOk = (depth == 8);
+            const bool modeOk = (color_mode == PsdColorMode_RGB);
+            return channelsOk && depthOk && modeOk;
+        }
+    };
+
+    // Streams every PsdHeader field through @a s in declaration order; the reserved bytes go one at a time.
+    inline Stream & operator<< (Stream & s, PsdHeader & head)
+    {
+        s << head.signature << head.version;
+        for (int i = 0; i < 6; i++) {
+            s << head.reserved[i];
+        }
+        return s << head.channel_count << head.height << head.width << head.depth << head.color_mode;
+    }
+
+} // nv namespace
+
+#endif // NV_IMAGE_PSDFILE_H
diff --git a/src/nvimage/Quantize.cpp b/src/nvimage/Quantize.cpp
index 64168c8..889a8c2 100644
--- a/src/nvimage/Quantize.cpp
+++ b/src/nvimage/Quantize.cpp
@@ -1,222 +1,222 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-/*
-http://www.visgraf.impa.br/Courses/ip00/proj/Dithering1/floyd_steinberg_dithering.html
-http://www.gamedev.net/reference/articles/article341.asp
-
-@@ Look at LPS: http://www.cs.rit.edu/~pga/pics2000/i.html
- 
-This is a really nice guide to dithering algorithms:
-http://www.efg2.com/Lab/Library/ImageProcessing/DHALF.TXT
-
-@@ This code needs to be reviewed, I'm not sure it's correct.
-*/
-
-#include "Quantize.h"
-#include "Image.h"
-#include "PixelFormat.h"
-
-#include "nvmath/Color.h"
-#include "nvmath/Vector.inl"
-
-#include "nvcore/Utils.h" // swap
-
-#include <string.h> // memset
-
-
-using namespace nv;
-
-
-// Simple quantization.
-void nv::Quantize::BinaryAlpha( Image * image, int alpha_threshold /*= 127*/ )
-{
-	nvCheck(image != NULL);
-	
-	const uint w = image->width();
-	const uint h = image->height();
-	
-	for(uint y = 0; y < h; y++) {
-		for(uint x = 0; x < w; x++) {
-			
-			Color32 pixel = image->pixel(x, y);
-			
-			// Convert color.
-			if( pixel.a > alpha_threshold ) pixel.a = 255;
-			else pixel.a = 0;
-			
-			// Store color.
-			image->pixel(x, y) = pixel;
-		}
-	}
-}
-
-
-// Simple quantization.
-void nv::Quantize::RGB16( Image * image )
-{
-	Truncate(image, 5, 6, 5, 8);
-}
-
-// Alpha quantization.
-void nv::Quantize::Alpha4( Image * image )
-{
-	Truncate(image, 8, 8, 8, 4);
-}
-
-
-// Error diffusion. Floyd Steinberg.
-void nv::Quantize::FloydSteinberg_RGB16( Image * image )
-{
-	FloydSteinberg(image, 5, 6, 5, 8);
-}
-
-
-// Error diffusion. Floyd Steinberg.
-void nv::Quantize::FloydSteinberg_BinaryAlpha( Image * image, int alpha_threshold /*= 127*/ ) 
-{
-	nvCheck(image != NULL);
-	
-	const uint w = image->width();
-	const uint h = image->height();
-	
-	// @@ Use fixed point?
-	float * row0 = new float[(w+2)];
-	float * row1 = new float[(w+2)];
-	memset(row0, 0, sizeof(float)*(w+2));
-	memset(row1, 0, sizeof(float)*(w+2));
-	
-	for(uint y = 0; y < h; y++) {
-		for(uint x = 0; x < w; x++) {
-			
-			Color32 pixel = image->pixel(x, y);
-			
-			// Add error.
-			int alpha = int(pixel.a) + int(row0[1+x]);
-			
-			// Convert color.
-			if( alpha > alpha_threshold ) pixel.a = 255;
-			else pixel.a = 0;
-			
-			// Store color.
-			image->pixel(x, y) = pixel;
-			
-			// Compute new error.
-			float diff = float(alpha - pixel.a);
-			
-			// Propagate new error.
-			row0[1+x+1] += 7.0f / 16.0f * diff;
-			row1[1+x-1] += 3.0f / 16.0f * diff;
-			row1[1+x+0] += 5.0f / 16.0f * diff;
-			row1[1+x+1] += 1.0f / 16.0f * diff;
-		}
-		
-		swap(row0, row1);
-		memset(row1, 0, sizeof(float)*(w+2));
-	}
-	
-	delete [] row0;
-	delete [] row1;
-}
-
-
-// Error diffusion. Floyd Steinberg.
-void nv::Quantize::FloydSteinberg_Alpha4( Image * image )
-{
-	FloydSteinberg(image, 8, 8, 8, 4);
-}
-
-
-void nv::Quantize::Truncate(Image * image, uint rsize, uint gsize, uint bsize, uint asize)
-{
-	nvCheck(image != NULL);
-	
-	const uint w = image->width();
-	const uint h = image->height();
-	
-	for(uint y = 0; y < h; y++) {
-		for(uint x = 0; x < w; x++) {
-			
-			Color32 pixel = image->pixel(x, y);
-
-			// Convert to our desired size, and reconstruct.
-			pixel.r = PixelFormat::convert(pixel.r, 8, rsize);
-			pixel.r = PixelFormat::convert(pixel.r, rsize, 8);
-
-			pixel.g = PixelFormat::convert(pixel.g, 8, gsize);
-			pixel.g = PixelFormat::convert(pixel.g, gsize, 8);
-
-			pixel.b = PixelFormat::convert(pixel.b, 8, bsize);
-			pixel.b = PixelFormat::convert(pixel.b, bsize, 8);
-
-			pixel.a = PixelFormat::convert(pixel.a, 8, asize);
-			pixel.a = PixelFormat::convert(pixel.a, asize, 8);
-
-			// Store color.
-			image->pixel(x, y) = pixel;
-		}
-	}
-}
-
-
-// Error diffusion. Floyd Steinberg.
-void nv::Quantize::FloydSteinberg(Image * image, uint rsize, uint gsize, uint bsize, uint asize)
-{
-	nvCheck(image != NULL);
-	
-	const uint w = image->width();
-	const uint h = image->height();
-	
-	Vector4 * row0 = new Vector4[w+2];
-	Vector4 * row1 = new Vector4[w+2];
-	memset(row0, 0, sizeof(Vector4)*(w+2));
-	memset(row1, 0, sizeof(Vector4)*(w+2));
-	
-	for (uint y = 0; y < h; y++) {
-		for (uint x = 0; x < w; x++) {
-			
-			Color32 pixel = image->pixel(x, y);
-
-			// Add error.
-			pixel.r = clamp(int(pixel.r) + int(row0[1+x].x), 0, 255);
-			pixel.g = clamp(int(pixel.g) + int(row0[1+x].y), 0, 255);
-			pixel.b = clamp(int(pixel.b) + int(row0[1+x].z), 0, 255);
-			pixel.a = clamp(int(pixel.a) + int(row0[1+x].w), 0, 255);
-			
-			int r = pixel.r;
-			int g = pixel.g;
-			int b = pixel.b;
-			int a = pixel.a;
-
-			// Convert to our desired size, and reconstruct.
-			r = PixelFormat::convert(r, 8, rsize);
-			r = PixelFormat::convert(r, rsize, 8);
-
-			g = PixelFormat::convert(g, 8, gsize);
-			g = PixelFormat::convert(g, gsize, 8);
-
-			b = PixelFormat::convert(b, 8, bsize);
-			b = PixelFormat::convert(b, bsize, 8);
-
-			a = PixelFormat::convert(a, 8, asize);
-			a = PixelFormat::convert(a, asize, 8);
-
-			// Store color.
-			image->pixel(x, y) = Color32(r, g, b, a);
-			
-			// Compute new error.
-			Vector4 diff(float(int(pixel.r) - r), float(int(pixel.g) - g), float(int(pixel.b) - b), float(int(pixel.a) - a));
-			
-			// Propagate new error.
-			row0[1+x+1] += 7.0f / 16.0f * diff;
-			row1[1+x-1] += 3.0f / 16.0f * diff;
-			row1[1+x+0] += 5.0f / 16.0f * diff;
-			row1[1+x+1] += 1.0f / 16.0f * diff;
-		}
-		
-		swap(row0, row1);
-		memset(row1, 0, sizeof(Vector4)*(w+2));
-	}
-	
-	delete [] row0;
-	delete [] row1;
-}
+// This code is in the public domain -- castanyo@yahoo.es
+
+/*
+http://www.visgraf.impa.br/Courses/ip00/proj/Dithering1/floyd_steinberg_dithering.html
+http://www.gamedev.net/reference/articles/article341.asp
+
+@@ Look at LPS: http://www.cs.rit.edu/~pga/pics2000/i.html
+ 
+This is a really nice guide to dithering algorithms:
+http://www.efg2.com/Lab/Library/ImageProcessing/DHALF.TXT
+
+@@ This code needs to be reviewed, I'm not sure it's correct.
+*/
+
+#include "Quantize.h"
+#include "Image.h"
+#include "PixelFormat.h"
+
+#include "nvmath/Color.h"
+#include "nvmath/Vector.inl"
+
+#include "nvcore/Utils.h" // swap
+
+#include <string.h> // memset
+
+
+using namespace nv;
+
+
+// Threshold the alpha channel: alpha above @a alpha_threshold becomes 255, everything else becomes 0.
+// The color channels are written back unchanged.
+void nv::Quantize::BinaryAlpha( Image * image, int alpha_threshold /*= 127*/ )
+{
+	nvCheck(image != NULL);
+
+	const uint width = image->width();
+	const uint height = image->height();
+
+	for (uint row = 0; row < height; row++)
+	{
+		for (uint col = 0; col < width; col++)
+		{
+			// Work on a copy of the pixel; only its alpha is changed.
+			Color32 texel = image->pixel(col, row);
+			const bool opaque = (texel.a > alpha_threshold);
+			texel.a = opaque ? 255 : 0;
+			// Store color.
+			image->pixel(col, row) = texel;
+		}
+	}
+}
+
+
+// Truncate the color channels to 5:6:5 bits (16 bit RGB), keeping all 8 bits of alpha. No dithering.
+void nv::Quantize::RGB16( Image * image )
+{
+	Truncate(image, 5, 6, 5, 8);
+}
+
+// Truncate alpha to 4 bits, leaving the color channels at 8 bits each. No dithering.
+void nv::Quantize::Alpha4( Image * image )
+{
+	Truncate(image, 8, 8, 8, 4);
+}
+
+
+// Floyd Steinberg error diffusion to 5:6:5 bit color, keeping all 8 bits of alpha.
+void nv::Quantize::FloydSteinberg_RGB16( Image * image )
+{
+	FloydSteinberg(image, 5, 6, 5, 8);
+}
+
+
+// Binarize alpha against @a alpha_threshold with Floyd Steinberg error diffusion: the error made
+// at each pixel is spread over the pixel to its right and the three pixels below it.
+void nv::Quantize::FloydSteinberg_BinaryAlpha( Image * image, int alpha_threshold /*= 127*/ ) 
+{
+	nvCheck(image != NULL);
+
+	const uint width = image->width();
+	const uint height = image->height();
+
+	// Error accumulators for the current and the next scanline. Each has one padding entry on
+	// either side so that the first and last columns need no special case. @@ Use fixed point?
+	const uint entryCount = width + 2;
+	float * current = new float[entryCount];
+	float * next = new float[entryCount];
+	memset(current, 0, sizeof(float) * entryCount);
+	memset(next, 0, sizeof(float) * entryCount);
+
+	for (uint row = 0; row < height; row++)
+	{
+		for (uint col = 0; col < width; col++)
+		{
+			const uint slot = col + 1;	// Accumulator index of this column.
+			Color32 texel = image->pixel(col, row);
+
+			// Alpha including the error diffused into this pixel so far.
+			const int wanted = int(texel.a) + int(current[slot]);
+			// Pick the binary value and store it.
+			texel.a = (wanted > alpha_threshold) ? 255 : 0;
+			image->pixel(col, row) = texel;
+
+			// Spread the quantization error with the 7/16, 3/16, 5/16 and 1/16 weights.
+			const float error = float(wanted - texel.a);
+			current[slot + 1] += 7.0f / 16.0f * error;
+			next[slot - 1] += 3.0f / 16.0f * error;
+			next[slot] += 5.0f / 16.0f * error;
+			next[slot + 1] += 1.0f / 16.0f * error;
+		}
+
+		// The next scanline becomes the current one; start a fresh accumulator below it.
+		swap(current, next);
+		memset(next, 0, sizeof(float) * entryCount);
+	}
+
+	delete [] current;
+	delete [] next;
+}
+
+
+// Floyd Steinberg error diffusion of alpha to 4 bits, leaving the color channels at 8 bits each.
+void nv::Quantize::FloydSteinberg_Alpha4( Image * image )
+{
+	FloydSteinberg(image, 8, 8, 8, 4);
+}
+
+
+void nv::Quantize::Truncate(Image * image, uint rsize, uint gsize, uint bsize, uint asize)
+{
+	// Reduces every channel to the requested number of bits and expands it back to 8 bits,
+	// so the image only holds values that are representable at the lower precision.
+	nvCheck(image != NULL);
+
+	const uint width = image->width();
+	const uint height = image->height();
+
+	for (uint row = 0; row < height; row++)
+	{
+		for (uint col = 0; col < width; col++)
+		{
+			Color32 texel = image->pixel(col, row);
+
+			// Convert to our desired size (stored back into the channel itself)...
+			texel.r = PixelFormat::convert(texel.r, 8, rsize);
+			texel.g = PixelFormat::convert(texel.g, 8, gsize);
+			texel.b = PixelFormat::convert(texel.b, 8, bsize);
+			texel.a = PixelFormat::convert(texel.a, 8, asize);
+			// ...and reconstruct the 8 bit value.
+			texel.r = PixelFormat::convert(texel.r, rsize, 8);
+			texel.g = PixelFormat::convert(texel.g, gsize, 8);
+			texel.b = PixelFormat::convert(texel.b, bsize, 8);
+			texel.a = PixelFormat::convert(texel.a, asize, 8);
+
+			image->pixel(col, row) = texel;
+		}
+	}
+}
+
+
+// Quantize every channel to the given number of bits with Floyd Steinberg error diffusion: the
+// error made at each pixel is spread over the pixel to its right and the three pixels below it.
+void nv::Quantize::FloydSteinberg(Image * image, uint rsize, uint gsize, uint bsize, uint asize)
+{
+	nvCheck(image != NULL);
+
+	const uint width = image->width();
+	const uint height = image->height();
+
+	// Per-channel error accumulators for the current and the next scanline. Each has one padding
+	// entry on either side so that the first and last columns need no special case.
+	const uint entryCount = width + 2;
+	Vector4 * current = new Vector4[entryCount];
+	Vector4 * next = new Vector4[entryCount];
+	memset(current, 0, sizeof(Vector4) * entryCount);
+	memset(next, 0, sizeof(Vector4) * entryCount);
+
+	for (uint row = 0; row < height; row++)
+	{
+		for (uint col = 0; col < width; col++)
+		{
+			const uint slot = col + 1;	// Accumulator index of this column.
+			const Vector4 & carried = current[slot];
+			Color32 texel = image->pixel(col, row);
+
+			// Add the error diffused into this pixel so far, clamped to the 8 bit range.
+			texel.r = clamp(int(texel.r) + int(carried.x), 0, 255);
+			texel.g = clamp(int(texel.g) + int(carried.y), 0, 255);
+			texel.b = clamp(int(texel.b) + int(carried.z), 0, 255);
+			texel.a = clamp(int(texel.a) + int(carried.w), 0, 255);
+
+			// Round trip every channel through its target precision.
+			const int r = PixelFormat::convert(PixelFormat::convert(texel.r, 8, rsize), rsize, 8);
+			const int g = PixelFormat::convert(PixelFormat::convert(texel.g, 8, gsize), gsize, 8);
+			const int b = PixelFormat::convert(PixelFormat::convert(texel.b, 8, bsize), bsize, 8);
+			const int a = PixelFormat::convert(PixelFormat::convert(texel.a, 8, asize), asize, 8);
+
+			// Store color.
+			image->pixel(col, row) = Color32(r, g, b, a);
+
+			// Error between the value we wanted and the value we stored.
+			Vector4 error(
+				float(int(texel.r) - r),
+				float(int(texel.g) - g),
+				float(int(texel.b) - b),
+				float(int(texel.a) - a));
+
+			// Spread it with the 7/16, 3/16, 5/16 and 1/16 weights.
+			current[slot + 1] += 7.0f / 16.0f * error;
+			next[slot - 1] += 3.0f / 16.0f * error;
+			next[slot] += 5.0f / 16.0f * error;
+			next[slot + 1] += 1.0f / 16.0f * error;
+		}
+
+		// The next scanline becomes the current one; start a fresh accumulator below it.
+		swap(current, next);
+		memset(next, 0, sizeof(Vector4) * entryCount);
+	}
+
+	delete [] current;
+	delete [] next;
+}
diff --git a/src/nvimage/Quantize.h b/src/nvimage/Quantize.h
index 2278d45..2a6a26d 100644
--- a/src/nvimage/Quantize.h
+++ b/src/nvimage/Quantize.h
@@ -1,32 +1,32 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#pragma once
-#ifndef NV_IMAGE_QUANTIZE_H
-#define NV_IMAGE_QUANTIZE_H
-
-#include "nvimage.h"
-
-
-namespace nv
-{
-	class Image;
-
-	namespace Quantize
-	{
-		void RGB16(Image * img);
-		void BinaryAlpha(Image * img, int alpha_threshold = 127);
-		void Alpha4(Image * img);
-		
-		void FloydSteinberg_RGB16(Image * img);
-		void FloydSteinberg_BinaryAlpha(Image * img, int alpha_threshold = 127);
-		void FloydSteinberg_Alpha4(Image * img);
-
-		void Truncate(Image * image, uint rsize, uint gsize, uint bsize, uint asize);
-		void FloydSteinberg(Image * image, uint rsize, uint gsize, uint bsize, uint asize);
-
-		// @@ Add palette quantization algorithms!
-	}
-}
-
-
-#endif // NV_IMAGE_QUANTIZE_H
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_IMAGE_QUANTIZE_H
+#define NV_IMAGE_QUANTIZE_H
+
+#include "nvimage.h"
+
+
+namespace nv
+{
+	class Image;
+
+	namespace Quantize
+	{
+		void RGB16(Image * img);
+		void BinaryAlpha(Image * img, int alpha_threshold = 127);
+		void Alpha4(Image * img);
+		// Same targets as the three functions above, using Floyd Steinberg error diffusion.
+		void FloydSteinberg_RGB16(Image * img);
+		void FloydSteinberg_BinaryAlpha(Image * img, int alpha_threshold = 127);
+		void FloydSteinberg_Alpha4(Image * img);
+		// General versions: each size is the number of bits kept for that channel.
+		void Truncate(Image * image, uint rsize, uint gsize, uint bsize, uint asize);
+		void FloydSteinberg(Image * image, uint rsize, uint gsize, uint bsize, uint asize);
+
+		// @@ Add palette quantization algorithms!
+	}
+}
+
+
+#endif // NV_IMAGE_QUANTIZE_H
diff --git a/src/nvimage/TgaFile.h b/src/nvimage/TgaFile.h
index ed562b6..bce2fc1 100644
--- a/src/nvimage/TgaFile.h
+++ b/src/nvimage/TgaFile.h
@@ -1,106 +1,106 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#pragma once
-#ifndef NV_IMAGE_TGAFILE_H
-#define NV_IMAGE_TGAFILE_H
-
-#include "nvcore/Stream.h"
-
-namespace nv
-{
-	
-// TGA types
-enum TGAType {
-    TGA_TYPE_INDEXED		= 1,
-    TGA_TYPE_RGB			= 2,
-    TGA_TYPE_GREY			= 3,
-    TGA_TYPE_RLE_INDEXED	= 9,
-    TGA_TYPE_RLE_RGB		= 10,
-    TGA_TYPE_RLE_GREY		= 11
-};
-
-#define TGA_INTERLEAVE_MASK	0xc0
-#define TGA_INTERLEAVE_NONE	0x00
-#define TGA_INTERLEAVE_2WAY	0x40
-#define TGA_INTERLEAVE_4WAY	0x80
-
-#define TGA_ORIGIN_MASK		0x30
-#define TGA_ORIGIN_LEFT		0x00
-#define TGA_ORIGIN_RIGHT	0x10
-#define TGA_ORIGIN_LOWER	0x00
-#define TGA_ORIGIN_UPPER	0x20
-
-#define TGA_HAS_ALPHA		0x0F
-
-
-/// Tga Header.
-struct TgaHeader {
-	uint8	id_length;
-	uint8	colormap_type;
-	uint8	image_type;
-	uint16	colormap_index;
-	uint16	colormap_length;
-	uint8	colormap_size;
-	uint16	x_origin;
-	uint16	y_origin;
-	uint16	width;
-	uint16	height;
-	uint8	pixel_size;
-	uint8	flags;
-
-	enum { Size = 18 };		//const static int SIZE = 18;
-};
-
-
-/// Tga File.
-struct TgaFile {
-
-	TgaFile() {
-		mem = NULL;
-	}
-	~TgaFile() {
-		free();
-	}
-
-	uint size() const {
-		return head.width * head.height * (head.pixel_size / 8);
-	}
-	void allocate() {
-		nvCheck( mem == NULL );
-		mem = new uint8[size()];
-	}
-	void free() {
-		delete [] mem;
-		mem = NULL;
-	}
-
-	TgaHeader head;
-	uint8 * mem;
-};
-
-
-inline Stream & operator<< (Stream & s, TgaHeader & head)
-{
-	s << head.id_length << head.colormap_type << head.image_type;
-	s << head.colormap_index << head.colormap_length << head.colormap_size;
-	s << head.x_origin << head.y_origin << head.width << head.height;
-	s << head.pixel_size << head.flags;
-	return s;
-}
-
-inline Stream & operator<< (Stream & s, TgaFile & tga)
-{
-	s << tga.head;
-
-	if( s.isLoading() ) {
-		tga.allocate();
-	}
-
-	s.serialize( tga.mem, tga.size() );
-
-	return s;
-}
-
-} // nv namespace
-
-#endif // NV_IMAGE_TGAFILE_H
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_IMAGE_TGAFILE_H
+#define NV_IMAGE_TGAFILE_H
+
+#include "nvcore/Stream.h"
+
+namespace nv
+{
+	
+// TGA image types; presumably the values stored in TgaHeader::image_type - TODO confirm against the loader.
+enum TGAType {
+    TGA_TYPE_INDEXED		= 1,
+    TGA_TYPE_RGB			= 2,
+    TGA_TYPE_GREY			= 3,
+    TGA_TYPE_RLE_INDEXED	= 9,
+    TGA_TYPE_RLE_RGB		= 10,
+    TGA_TYPE_RLE_GREY		= 11
+};
+
+#define TGA_INTERLEAVE_MASK	0xc0	// Top two bits; presumably of TgaHeader::flags - TODO confirm against the loader.
+#define TGA_INTERLEAVE_NONE	0x00
+#define TGA_INTERLEAVE_2WAY	0x40
+#define TGA_INTERLEAVE_4WAY	0x80
+
+#define TGA_ORIGIN_MASK		0x30	// Bits 4 and 5: LEFT/RIGHT use bit 4, LOWER/UPPER use bit 5.
+#define TGA_ORIGIN_LEFT		0x00
+#define TGA_ORIGIN_RIGHT	0x10
+#define TGA_ORIGIN_LOWER	0x00
+#define TGA_ORIGIN_UPPER	0x20
+
+#define TGA_HAS_ALPHA		0x0F	// Low four bits; presumably the alpha bit count of the TGA descriptor - TODO confirm.
+
+
+/// Tga Header. The fields are declared in the order in which operator<< streams them.
+struct TgaHeader {
+	uint8	id_length;
+	uint8	colormap_type;
+	uint8	image_type;
+	uint16	colormap_index;
+	uint16	colormap_length;
+	uint8	colormap_size;
+	uint16	x_origin;
+	uint16	y_origin;
+	uint16	width;
+	uint16	height;
+	uint8	pixel_size;
+	uint8	flags;
+	// Sum of the field sizes above (6 x uint8 + 6 x uint16), assuming uint8/uint16 are 1 and 2 bytes wide.
+	enum { Size = 18 };		//const static int SIZE = 18;
+};
+
+
+/// Tga File: a header plus the pixel bytes it describes. Owns @a mem, so instances must not be copied.
+struct TgaFile {
+
+	TgaFile() : mem(NULL) {}	// head stays uninitialized until a header is streamed in.
+	~TgaFile() { free(); }
+
+	/// Size in bytes of the pixel data; a pixel_size that is not a multiple of 8 rounds down.
+	uint size() const {
+		// Convert before multiplying: the uint16 fields promote to int, which 65535 * 65535 overflows.
+		return uint(head.width) * uint(head.height) * (uint(head.pixel_size) / 8);
+	}
+	/// Allocates size() bytes; only valid while no buffer is held.
+	void allocate() {
+		nvCheck( mem == NULL );
+		mem = new uint8[size()];
+	}
+	/// Releases the pixel buffer, if any, and resets the pointer so repeated calls are harmless.
+	void free() {
+		delete [] mem;
+		mem = NULL;
+	}
+
+	TgaHeader head;	///< Supplies the width, height and pixel_size that size() is computed from.
+	uint8 * mem;	///< Pixel bytes; NULL until allocate() is called.
+};
+
+// Streams every TgaHeader field through @a s in declaration order.
+inline Stream & operator<< (Stream & s, TgaHeader & head)
+{
+	s << head.id_length << head.colormap_type << head.image_type;
+	s << head.colormap_index << head.colormap_length << head.colormap_size;
+	s << head.x_origin << head.y_origin << head.width << head.height;
+	s << head.pixel_size << head.flags;
+	return s;
+}
+// Streams the header, then the pixel bytes. When loading, the buffer is allocated from the header just read.
+inline Stream & operator<< (Stream & s, TgaFile & tga)
+{
+	s << tga.head;
+	// NOTE(review): nothing is skipped between the header and the pixels (no image id or color map data).
+	if( s.isLoading() ) {
+		tga.allocate();
+	}
+
+	s.serialize( tga.mem, tga.size() );
+
+	return s;
+}
+
+} // nv namespace
+
+#endif // NV_IMAGE_TGAFILE_H
diff --git a/src/nvmath/Half.cpp b/src/nvmath/Half.cpp
index b76794e..b0bd2a8 100644
--- a/src/nvmath/Half.cpp
+++ b/src/nvmath/Half.cpp
@@ -1,612 +1,612 @@
-// Branch-free implementation of half-precision (16 bit) floating point
-// Copyright 2006 Mike Acton <macton@gmail.com>
-// 
-// Permission is hereby granted, free of charge, to any person obtaining a 
-// copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense, 
-// and/or sell copies of the Software, and to permit persons to whom the 
-// Software is furnished to do so, subject to the following conditions:
-// 
-// The above copyright notice and this permission notice shall be included 
-// in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE
-//
-// Half-precision floating point format
-// ------------------------------------
-//
-//   | Field    | Last | First | Note
-//   |----------|------|-------|----------
-//   | Sign     | 15   | 15    |
-//   | Exponent | 14   | 10    | Bias = 15
-//   | Mantissa | 9    | 0     |
-//
-// Compiling
-// ---------
-//
-//  Preferred compile flags for GCC: 
-//     -O3 -fstrict-aliasing -std=c99 -pedantic -Wall -Wstrict-aliasing
-//
-//     This file is a C99 source file, intended to be compiled with a C99 
-//     compliant compiler. However, for the moment it remains combatible
-//     with C++98. Therefore if you are using a compiler that poorly implements
-//     C standards (e.g. MSVC), it may be compiled as C++. This is not
-//     guaranteed for future versions. 
-//
-// Features
-// --------
-//
-//  * QNaN + <x>  = QNaN
-//  * <x>  + +INF = +INF
-//  * <x>  - -INF = -INF
-//  * INF  - INF  = SNaN
-//  * Denormalized values
-//  * Difference of ZEROs is always +ZERO
-//  * Sum round with guard + round + sticky bit (grs)
-//  * And of course... no branching
-// 
-// Precision of Sum
-// ----------------
-//
-//  (SUM)        uint16 z = half_add( x, y );
-//  (DIFFERENCE) uint16 z = half_add( x, -y );
-//
-//     Will have exactly (0 ulps difference) the same result as:
-//     (For 32 bit IEEE 784 floating point and same rounding mode)
-//
-//     union FLOAT_32
-//     {
-//       float    f32;
-//       uint32 u32;
-//     };
-//
-//     union FLOAT_32 fx = { .u32 = half_to_float( x ) };
-//     union FLOAT_32 fy = { .u32 = half_to_float( y ) };
-//     union FLOAT_32 fz = { .f32 = fx.f32 + fy.f32    };
-//     uint16       z  = float_to_half( fz );
-//
-
-#include "Half.h"
-#include <stdio.h>
-
-// Load immediate
-static inline uint32 _uint32_li( uint32 a )
-{
-    return (a);
-}
-
-// Decrement
-static inline uint32 _uint32_dec( uint32 a )
-{
-    return (a - 1);
-}
-
-// Increment
-static inline uint32 _uint32_inc( uint32 a )
-{
-  return (a + 1);
-}
-
-// Complement
-static inline uint32 _uint32_not( uint32 a )
-{
-    return (~a);
-}
-
-// Negate
-static inline uint32 _uint32_neg( uint32 a )
-{
-#pragma warning(disable : 4146)     // unary minus operator applied to unsigned type, result still unsigned
-    return (-a);
-#pragma warning(default : 4146)
-}
-
-// Extend sign
-static inline uint32 _uint32_ext( uint32 a )
-{
-    return (((int32)a)>>31);
-}
-
-// And
-static inline uint32 _uint32_and( uint32 a, uint32 b )
-{
-    return (a & b);
-}
-
-// And with Complement
-static inline uint32 _uint32_andc( uint32 a, uint32 b )
-{
-    return (a & ~b);
-}
-
-// Or
-static inline uint32 _uint32_or( uint32 a, uint32 b )
-{
-    return (a | b);
-}
-
-// Shift Right Logical
-static inline uint32 _uint32_srl( uint32 a, int sa )
-{
-    return (a >> sa);
-}
-
-// Shift Left Logical
-static inline uint32 _uint32_sll( uint32 a, int sa )
-{
-    return (a << sa);
-}
-
-// Add
-static inline uint32 _uint32_add( uint32 a, uint32 b )
-{
-    return (a + b);
-}
-
-// Subtract
-static inline uint32 _uint32_sub( uint32 a, uint32 b )
-{
-    return (a - b);
-}
-
-// Select on Sign bit
-static inline uint32 _uint32_sels( uint32 test, uint32 a, uint32 b )
-{
-    const uint32 mask   = _uint32_ext( test );
-    const uint32 sel_a  = _uint32_and(  a,     mask  );
-    const uint32 sel_b  = _uint32_andc( b,     mask  );
-    const uint32 result = _uint32_or(   sel_a, sel_b );
-
-    return (result);
-}
-
-// Load Immediate
-static inline uint16 _uint16_li( uint16 a )
-{
-    return (a);
-}
-
-// Extend sign
-static inline uint16 _uint16_ext( uint16 a )
-{
-    return (((int16)a)>>15);
-}
-
-// Negate
-static inline uint16 _uint16_neg( uint16 a )
-{
-    return (-a);
-}
-
-// Complement
-static inline uint16 _uint16_not( uint16 a )
-{
-    return (~a);
-}
-
-// Decrement
-static inline uint16 _uint16_dec( uint16 a )
-{
-    return (a - 1);
-}
-
-// Shift Left Logical
-static inline uint16 _uint16_sll( uint16 a, int sa )
-{
-    return (a << sa);
-}
-
-// Shift Right Logical
-static inline uint16 _uint16_srl( uint16 a, int sa )
-{
-    return (a >> sa);
-}
-
-// Add
-static inline uint16 _uint16_add( uint16 a, uint16 b )
-{
-    return (a + b);
-}
-
-// Subtract
-static inline uint16 _uint16_sub( uint16 a, uint16 b )
-{
-    return (a - b);
-}
-
-// And
-static inline uint16 _uint16_and( uint16 a, uint16 b )
-{
-    return (a & b);
-}
-
-// Or
-static inline uint16 _uint16_or( uint16 a, uint16 b )
-{
-    return (a | b);
-}
-
-// Exclusive Or
-static inline uint16 _uint16_xor( uint16 a, uint16 b )
-{
-    return (a ^ b);
-}
-
-// And with Complement
-static inline uint16 _uint16_andc( uint16 a, uint16 b )
-{
-    return (a & ~b);
-}
-
-// And then Shift Right Logical
-static inline uint16 _uint16_andsrl( uint16 a, uint16 b, int sa )
-{
-    return ((a & b) >> sa);
-}
-
-// Shift Right Logical then Mask
-static inline uint16 _uint16_srlm( uint16 a, int sa, uint16 mask )
-{
-    return ((a >> sa) & mask);
-}
-
-// Add then Mask
-static inline uint16 _uint16_addm( uint16 a, uint16 b, uint16 mask )
-{
-    return ((a + b) & mask);
-}
-
-
-// Select on Sign bit
-static inline uint16 _uint16_sels( uint16 test, uint16 a, uint16 b )
-{
-    const uint16 mask   = _uint16_ext( test );
-    const uint16 sel_a  = _uint16_and(  a,     mask  );
-    const uint16 sel_b  = _uint16_andc( b,     mask  );
-    const uint16 result = _uint16_or(   sel_a, sel_b );
-
-    return (result);
-}
-
-#if NV_OS_XBOX
-#include <PPCIntrinsics.h>
-#elif NV_CC_MSVC
-
-#include <intrin.h>
-#pragma intrinsic(_BitScanReverse)
-
-uint32 _uint32_nlz( uint32 x ) {
-    unsigned long index;
-    _BitScanReverse(&index, x);
-    return 31 - index;
-}
-#endif
-
-
-// Count Leading Zeros
-static inline uint32 _uint32_cntlz( uint32 x )
-{
-#if NV_CC_GCC
-    /* On PowerPC, this will map to insn: cntlzw */
-    /* On Pentium, this will map to insn: clz    */
-    uint32 is_x_nez_msb = _uint32_neg( x );
-    uint32 nlz          = __builtin_clz( x );
-    uint32 result       = _uint32_sels( is_x_nez_msb, nlz, 0x00000020 );
-    return (result);
-#elif NV_OS_XBOX
-    // Xbox PPC has this as an intrinsic.
-    return _CountLeadingZeros(x);
-#elif NV_CC_MSVC
-    uint32 is_x_nez_msb = _uint32_neg( x );
-    uint32 nlz          = _uint32_nlz( x );
-    uint32 result       = _uint32_sels( is_x_nez_msb, nlz, 0x00000020 );
-    return (result);
-#else
-    const uint32 x0  = _uint32_srl(  x,  1 );
-    const uint32 x1  = _uint32_or(   x,  x0 );
-    const uint32 x2  = _uint32_srl(  x1, 2 );
-    const uint32 x3  = _uint32_or(   x1, x2 );
-    const uint32 x4  = _uint32_srl(  x3, 4 );
-    const uint32 x5  = _uint32_or(   x3, x4 );
-    const uint32 x6  = _uint32_srl(  x5, 8 );
-    const uint32 x7  = _uint32_or(   x5, x6 );
-    const uint32 x8  = _uint32_srl(  x7, 16 );
-    const uint32 x9  = _uint32_or(   x7, x8 );
-    const uint32 xA  = _uint32_not(  x9 );
-    const uint32 xB  = _uint32_srl(  xA, 1 );
-    const uint32 xC  = _uint32_and(  xB, 0x55555555 );
-    const uint32 xD  = _uint32_sub(  xA, xC );
-    const uint32 xE  = _uint32_and(  xD, 0x33333333 );
-    const uint32 xF  = _uint32_srl(  xD, 2 );
-    const uint32 x10 = _uint32_and(  xF, 0x33333333 );
-    const uint32 x11 = _uint32_add(  xE, x10 );
-    const uint32 x12 = _uint32_srl(  x11, 4 );
-    const uint32 x13 = _uint32_add(  x11, x12 );
-    const uint32 x14 = _uint32_and(  x13, 0x0f0f0f0f );
-    const uint32 x15 = _uint32_srl(  x14, 8 );
-    const uint32 x16 = _uint32_add(  x14, x15 );
-    const uint32 x17 = _uint32_srl(  x16, 16 );
-    const uint32 x18 = _uint32_add(  x16, x17 );
-    const uint32 x19 = _uint32_and(  x18, 0x0000003f );
-    return ( x19 );
-#endif
-}
-
-// Count Leading Zeros
-static inline uint16 _uint16_cntlz( uint16 x )
-{
-#ifdef __GNUC__
-    /* On PowerPC, this will map to insn: cntlzw */
-    /* On Pentium, this will map to insn: clz    */
-    uint16 nlz32 = (uint16)_uint32_cntlz( (uint32)x );
-    uint32 nlz   = _uint32_sub( nlz32, 16 );
-    return (nlz);
-#elif _NV_OS_XBOX_
-    uint16 nlz32 = (uint16)_CountLeadingZeros( (uint32)x );
-    return _uint32_sub( nlz32, 16);
-#else
-    const uint16 x0  = _uint16_srl(  x,  1 );
-    const uint16 x1  = _uint16_or(   x,  x0 );
-    const uint16 x2  = _uint16_srl(  x1, 2 );
-    const uint16 x3  = _uint16_or(   x1, x2 );
-    const uint16 x4  = _uint16_srl(  x3, 4 );
-    const uint16 x5  = _uint16_or(   x3, x4 );
-    const uint16 x6  = _uint16_srl(  x5, 8 );
-    const uint16 x7  = _uint16_or(   x5, x6 );
-    const uint16 x8  = _uint16_not(  x7 );
-    const uint16 x9  = _uint16_srlm( x8, 1, 0x5555 );
-    const uint16 xA  = _uint16_sub(  x8, x9 );
-    const uint16 xB  = _uint16_and(  xA, 0x3333 );
-    const uint16 xC  = _uint16_srlm( xA, 2, 0x3333 );
-    const uint16 xD  = _uint16_add(  xB, xC );
-    const uint16 xE  = _uint16_srl(  xD, 4 );
-    const uint16 xF  = _uint16_addm( xD, xE, 0x0f0f );
-    const uint16 x10 = _uint16_srl(  xF, 8 );
-    const uint16 x11 = _uint16_addm( xF, x10, 0x001f );
-    return ( x11 );
-#endif
-}
-
-uint16
-nv::half_from_float( uint32 f )
-{
-    const uint32 one                        = _uint32_li( 0x00000001 );
-    const uint32 f_s_mask                   = _uint32_li( 0x80000000 );
-    const uint32 f_e_mask                   = _uint32_li( 0x7f800000 );
-    const uint32 f_m_mask                   = _uint32_li( 0x007fffff );
-    const uint32 f_m_hidden_bit             = _uint32_li( 0x00800000 );
-    const uint32 f_m_round_bit              = _uint32_li( 0x00001000 );
-    const uint32 f_snan_mask                = _uint32_li( 0x7fc00000 );
-    const uint32 f_e_pos                    = _uint32_li( 0x00000017 );
-    const uint32 h_e_pos                    = _uint32_li( 0x0000000a );
-    const uint32 h_e_mask                   = _uint32_li( 0x00007c00 );
-    const uint32 h_snan_mask                = _uint32_li( 0x00007e00 );
-    const uint32 h_e_mask_value             = _uint32_li( 0x0000001f );
-    const uint32 f_h_s_pos_offset           = _uint32_li( 0x00000010 );
-    const uint32 f_h_bias_offset            = _uint32_li( 0x00000070 );
-    const uint32 f_h_m_pos_offset           = _uint32_li( 0x0000000d );
-    const uint32 h_nan_min                  = _uint32_li( 0x00007c01 );
-    const uint32 f_h_e_biased_flag          = _uint32_li( 0x0000008f );
-    const uint32 f_s                        = _uint32_and( f,               f_s_mask         );
-    const uint32 f_e                        = _uint32_and( f,               f_e_mask         );
-    const uint16 h_s                        = _uint32_srl( f_s,             f_h_s_pos_offset );
-    const uint32 f_m                        = _uint32_and( f,               f_m_mask         );
-    const uint16 f_e_amount                 = _uint32_srl( f_e,             f_e_pos          );
-    const uint32 f_e_half_bias              = _uint32_sub( f_e_amount,      f_h_bias_offset  );
-    const uint32 f_snan                     = _uint32_and( f,               f_snan_mask      );
-    const uint32 f_m_round_mask             = _uint32_and( f_m,             f_m_round_bit    );
-    const uint32 f_m_round_offset           = _uint32_sll( f_m_round_mask,  one              );
-    const uint32 f_m_rounded                = _uint32_add( f_m,             f_m_round_offset );
-    const uint32 f_m_denorm_sa              = _uint32_sub( one,             f_e_half_bias    );
-    const uint32 f_m_with_hidden            = _uint32_or(  f_m_rounded,     f_m_hidden_bit   );
-    const uint32 f_m_denorm                 = _uint32_srl( f_m_with_hidden, f_m_denorm_sa    );
-    const uint32 h_m_denorm                 = _uint32_srl( f_m_denorm,      f_h_m_pos_offset );
-    const uint32 f_m_rounded_overflow       = _uint32_and( f_m_rounded,     f_m_hidden_bit   );
-    const uint32 m_nan                      = _uint32_srl( f_m,             f_h_m_pos_offset );
-    const uint32 h_em_nan                   = _uint32_or(  h_e_mask,        m_nan            );
-    const uint32 h_e_norm_overflow_offset   = _uint32_inc( f_e_half_bias );
-    const uint32 h_e_norm_overflow          = _uint32_sll( h_e_norm_overflow_offset, h_e_pos          );
-    const uint32 h_e_norm                   = _uint32_sll( f_e_half_bias,            h_e_pos          );
-    const uint32 h_m_norm                   = _uint32_srl( f_m_rounded,              f_h_m_pos_offset );
-    const uint32 h_em_norm                  = _uint32_or(  h_e_norm,                 h_m_norm         );
-    const uint32 is_h_ndenorm_msb           = _uint32_sub( f_h_bias_offset,   f_e_amount    );
-    const uint32 is_f_e_flagged_msb         = _uint32_sub( f_h_e_biased_flag, f_e_half_bias );
-    const uint32 is_h_denorm_msb            = _uint32_not( is_h_ndenorm_msb );
-    const uint32 is_f_m_eqz_msb             = _uint32_dec( f_m   );
-    const uint32 is_h_nan_eqz_msb           = _uint32_dec( m_nan );
-    const uint32 is_f_inf_msb               = _uint32_and( is_f_e_flagged_msb, is_f_m_eqz_msb   );
-    const uint32 is_f_nan_underflow_msb     = _uint32_and( is_f_e_flagged_msb, is_h_nan_eqz_msb );
-    const uint32 is_e_overflow_msb          = _uint32_sub( h_e_mask_value,     f_e_half_bias    );
-    const uint32 is_h_inf_msb               = _uint32_or(  is_e_overflow_msb,  is_f_inf_msb     );
-    const uint32 is_f_nsnan_msb             = _uint32_sub( f_snan,             f_snan_mask      );
-    const uint32 is_m_norm_overflow_msb     = _uint32_neg( f_m_rounded_overflow );
-    const uint32 is_f_snan_msb              = _uint32_not( is_f_nsnan_msb );
-    const uint32 h_em_overflow_result       = _uint32_sels( is_m_norm_overflow_msb, h_e_norm_overflow, h_em_norm                 );
-    const uint32 h_em_nan_result            = _uint32_sels( is_f_e_flagged_msb,     h_em_nan,          h_em_overflow_result      );
-    const uint32 h_em_nan_underflow_result  = _uint32_sels( is_f_nan_underflow_msb, h_nan_min,         h_em_nan_result           );
-    const uint32 h_em_inf_result            = _uint32_sels( is_h_inf_msb,           h_e_mask,          h_em_nan_underflow_result );
-    const uint32 h_em_denorm_result         = _uint32_sels( is_h_denorm_msb,        h_m_denorm,        h_em_inf_result           );
-    const uint32 h_em_snan_result           = _uint32_sels( is_f_snan_msb,          h_snan_mask,       h_em_denorm_result        );
-    const uint32 h_result                   = _uint32_or( h_s, h_em_snan_result );
-
-    return (uint16)(h_result);
-}
-
-uint32 
-nv::half_to_float( uint16 h )
-{
-    const uint32 h_e_mask              = _uint32_li( 0x00007c00 );
-    const uint32 h_m_mask              = _uint32_li( 0x000003ff );
-    const uint32 h_s_mask              = _uint32_li( 0x00008000 );
-    const uint32 h_f_s_pos_offset      = _uint32_li( 0x00000010 );
-    const uint32 h_f_e_pos_offset      = _uint32_li( 0x0000000d );
-    const uint32 h_f_bias_offset       = _uint32_li( 0x0001c000 );
-    const uint32 f_e_mask              = _uint32_li( 0x7f800000 );
-    const uint32 f_m_mask              = _uint32_li( 0x007fffff );
-    const uint32 h_f_e_denorm_bias     = _uint32_li( 0x0000007e );
-    const uint32 h_f_m_denorm_sa_bias  = _uint32_li( 0x00000008 );
-    const uint32 f_e_pos               = _uint32_li( 0x00000017 );
-    const uint32 h_e_mask_minus_one    = _uint32_li( 0x00007bff );
-    const uint32 h_e                   = _uint32_and( h, h_e_mask );
-    const uint32 h_m                   = _uint32_and( h, h_m_mask );
-    const uint32 h_s                   = _uint32_and( h, h_s_mask );
-    const uint32 h_e_f_bias            = _uint32_add( h_e, h_f_bias_offset );
-    const uint32 h_m_nlz               = _uint32_cntlz( h_m );
-    const uint32 f_s                   = _uint32_sll( h_s,        h_f_s_pos_offset );
-    const uint32 f_e                   = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
-    const uint32 f_m                   = _uint32_sll( h_m,        h_f_e_pos_offset );
-    const uint32 f_em                  = _uint32_or(  f_e,        f_m              );
-    const uint32 h_f_m_sa              = _uint32_sub( h_m_nlz,             h_f_m_denorm_sa_bias );
-    const uint32 f_e_denorm_unpacked   = _uint32_sub( h_f_e_denorm_bias,   h_f_m_sa             );
-    const uint32 h_f_m                 = _uint32_sll( h_m,                 h_f_m_sa             );
-    const uint32 f_m_denorm            = _uint32_and( h_f_m,               f_m_mask             );
-    const uint32 f_e_denorm            = _uint32_sll( f_e_denorm_unpacked, f_e_pos              );
-    const uint32 f_em_denorm           = _uint32_or(  f_e_denorm,          f_m_denorm           );
-    const uint32 f_em_nan              = _uint32_or(  f_e_mask,            f_m                  );
-    const uint32 is_e_eqz_msb          = _uint32_dec(  h_e );
-    const uint32 is_m_nez_msb          = _uint32_neg(  h_m );
-    const uint32 is_e_flagged_msb      = _uint32_sub(  h_e_mask_minus_one, h_e );
-    const uint32 is_zero_msb           = _uint32_andc( is_e_eqz_msb,       is_m_nez_msb );
-    const uint32 is_inf_msb            = _uint32_andc( is_e_flagged_msb,   is_m_nez_msb );
-    const uint32 is_denorm_msb         = _uint32_and(  is_m_nez_msb,       is_e_eqz_msb );
-    const uint32 is_nan_msb            = _uint32_and(  is_e_flagged_msb,   is_m_nez_msb ); 
-    const uint32 is_zero               = _uint32_ext(  is_zero_msb );
-    const uint32 f_zero_result         = _uint32_andc( f_em, is_zero );
-    const uint32 f_denorm_result       = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result );
-    const uint32 f_inf_result          = _uint32_sels( is_inf_msb,    f_e_mask,    f_denorm_result );
-    const uint32 f_nan_result          = _uint32_sels( is_nan_msb,    f_em_nan,    f_inf_result    );
-    const uint32 f_result              = _uint32_or( f_s, f_nan_result );
-
-    return (f_result);
-}
-
-
-// @@ These tables could be smaller.
-static uint32 mantissa_table[2048];
-static uint32 exponent_table[64];
-static uint32 offset_table[64];
-
-void nv::half_init_tables()
-{
-    // Init mantissa table.
-	mantissa_table[0] = 0;
-
-	for (int i = 1; i < 1024; i++) {
-		uint m = i << 13;
-		uint e = 0;
-
-		while ((m & 0x00800000) == 0) {
-			e -= 0x00800000;
-			m <<= 1;
-		}
-		m &= ~0x00800000;
-		e += 0x38800000;
-		mantissa_table[i] = m | e;
-	}
-
-    for (int i = 1024; i < 2048; i++) {
-		mantissa_table[i] = 0x38000000 + ((i - 1024) << 13);
-    }
-
-
-    // Init exponent table.
-	exponent_table[0] = 0;
-
-    for (int i = 1; i < 31; i++) {
-		exponent_table[i] = (i << 23);
-    }
-
-	exponent_table[31] = 0x47800000;
-	exponent_table[32] = 0x80000000;
-
-    for (int i = 33; i < 63; i++) {
-		exponent_table[i] = 0x80000000 + ((i - 32) << 23);
-    }
-
-	exponent_table[63] = 0xC7800000;
-
-
-    // Init offset table.
-	offset_table[0] = 0;
-
-    for (int i = 1; i < 32; i++) {
-		offset_table[i] = 1024;
-    }
-
-	offset_table[32] = 0;
-
-    for (int i = 33; i < 64; i++) {
-		offset_table[i] = 1024;
-    }
-
-    /*for (int i = 0; i < 64; i++) {
-        offset_table[i] = ((i & 31) != 0) * 1024;
-    }*/
-}
-
-// Fast half to float conversion based on:
-// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
-uint32 nv::fast_half_to_float(uint16 h)
-{
-	uint exp = h >> 10;
-	return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
-}
-
-
-#if 0
-// Inaccurate conversion suggested at the ffmpeg mailing list:
-// http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html
-uint32 nv::fast_half_to_float(uint16 v)
-{
-    if (v & 0x8000) return 0;
-    uint exp = v >> 10;
-    if (!exp) return (v>>9)&1;
-    if (exp >= 15) return 0xffff;
-    v <<= 6;
-    return (v+(1<<16)) >> (15-exp);
-}
-
-#endif
-
-#if 0
-
-// Some more from a gamedev thread:
-// http://www.devmaster.net/forums/showthread.php?t=10924
-
-// I believe it does not handle specials either.
-
-// Mike Acton's code should be fairly easy to vectorize and that would handle all cases too, the table method might still be faster, though.
-
-
-static __declspec(align(16)) unsigned half_sign[4]	  = {0x00008000, 0x00008000, 0x00008000, 0x00008000};
-static __declspec(align(16)) unsigned half_exponent[4]	  = {0x00007C00, 0x00007C00, 0x00007C00, 0x00007C00};
-static __declspec(align(16)) unsigned half_mantissa[4]	  = {0x000003FF, 0x000003FF, 0x000003FF, 0x000003FF};
-static __declspec(align(16)) unsigned half_bias_offset[4] = {0x0001C000, 0x0001C000, 0x0001C000, 0x0001C000};
-
-__asm
-{
-	movaps	xmm1, xmm0  // Input in xmm0
-	movaps	xmm2, xmm0
-
-	andps	xmm0, half_sign
-	andps	xmm1, half_exponent
-	andps	xmm2, half_mantissa
-	paddd	xmm1, half_bias_offset
-
-	pslld	xmm0, 16
-	pslld	xmm1, 13
-	pslld	xmm2, 13
-
-	orps	xmm1, xmm2
-	orps	xmm0, xmm1  // Result in xmm0
-}
-
-
+// Branch-free implementation of half-precision (16 bit) floating point
+// Copyright 2006 Mike Acton <macton@gmail.com>
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a 
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+// and/or sell copies of the Software, and to permit persons to whom the 
+// Software is furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included 
+// in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE
+//
+// Half-precision floating point format
+// ------------------------------------
+//
+//   | Field    | Last | First | Note
+//   |----------|------|-------|----------
+//   | Sign     | 15   | 15    |
+//   | Exponent | 14   | 10    | Bias = 15
+//   | Mantissa | 9    | 0     |
+//
+// Compiling
+// ---------
+//
+//  Preferred compile flags for GCC: 
+//     -O3 -fstrict-aliasing -std=c99 -pedantic -Wall -Wstrict-aliasing
+//
+//     This file is a C99 source file, intended to be compiled with a C99 
+//     compliant compiler. However, for the moment it remains compatible
+//     with C++98. Therefore if you are using a compiler that poorly implements
+//     C standards (e.g. MSVC), it may be compiled as C++. This is not
+//     guaranteed for future versions. 
+//
+// Features
+// --------
+//
+//  * QNaN + <x>  = QNaN
+//  * <x>  + +INF = +INF
+//  * <x>  - -INF = -INF
+//  * INF  - INF  = SNaN
+//  * Denormalized values
+//  * Difference of ZEROs is always +ZERO
+//  * Sum round with guard + round + sticky bit (grs)
+//  * And of course... no branching
+// 
+// Precision of Sum
+// ----------------
+//
+//  (SUM)        uint16 z = half_add( x, y );
+//  (DIFFERENCE) uint16 z = half_add( x, -y );
+//
+//     Will have exactly (0 ulps difference) the same result as:
+//     (For 32 bit IEEE 754 floating point and same rounding mode)
+//
+//     union FLOAT_32
+//     {
+//       float    f32;
+//       uint32 u32;
+//     };
+//
+//     union FLOAT_32 fx = { .u32 = half_to_float( x ) };
+//     union FLOAT_32 fy = { .u32 = half_to_float( y ) };
+//     union FLOAT_32 fz = { .f32 = fx.f32 + fy.f32    };
+//     uint16       z  = float_to_half( fz );
+//
+
+#include "Half.h"
+#include <stdio.h>
+
+// Load immediate
+static inline uint32 _uint32_li( uint32 a )
+{
+    return (a);
+}
+
+// Decrement
+static inline uint32 _uint32_dec( uint32 a )
+{
+    return (a - 1);
+}
+
+// Increment
+static inline uint32 _uint32_inc( uint32 a )
+{
+  return (a + 1);
+}
+
+// Complement
+static inline uint32 _uint32_not( uint32 a )
+{
+    return (~a);
+}
+
+// Negate (two's complement): returns 0 - a, wrapping modulo 2^32.
+static inline uint32 _uint32_neg( uint32 a )
+{
+    // Subtracting from zero yields the same value as unary minus without triggering MSVC C4146,
+    // so no '#pragma warning' is needed (the old 'default' pragma also reset the project's setting).
+    return (0u - a);
+}
+
+// Extend sign: all ones if bit 31 of a is set, otherwise 0. Relies on >> of a negative int32 being an arithmetic shift.
+static inline uint32 _uint32_ext( uint32 a )
+{
+    return (((int32)a)>>31);
+}
+
+// And
+static inline uint32 _uint32_and( uint32 a, uint32 b )
+{
+    return (a & b);
+}
+
+// And with Complement
+static inline uint32 _uint32_andc( uint32 a, uint32 b )
+{
+    return (a & ~b);
+}
+
+// Or
+static inline uint32 _uint32_or( uint32 a, uint32 b )
+{
+    return (a | b);
+}
+
+// Shift Right Logical
+static inline uint32 _uint32_srl( uint32 a, int sa )
+{
+    return (a >> sa);
+}
+
+// Shift Left Logical
+static inline uint32 _uint32_sll( uint32 a, int sa )
+{
+    return (a << sa);
+}
+
+// Add
+static inline uint32 _uint32_add( uint32 a, uint32 b )
+{
+    return (a + b);
+}
+
+// Subtract
+static inline uint32 _uint32_sub( uint32 a, uint32 b )
+{
+    return (a - b);
+}
+
+// Select on sign bit: returns a if bit 31 of test is set, otherwise b. Done with a mask instead of a branch.
+static inline uint32 _uint32_sels( uint32 test, uint32 a, uint32 b )
+{
+    const uint32 mask   = _uint32_ext( test );             // all ones or zero
+    const uint32 sel_a  = _uint32_and(  a,     mask  );    // a when selected, else 0
+    const uint32 sel_b  = _uint32_andc( b,     mask  );    // b when a is not selected, else 0
+    const uint32 result = _uint32_or(   sel_a, sel_b );
+
+    return (result);
+}
+
+// Load Immediate
+static inline uint16 _uint16_li( uint16 a )
+{
+    return (a);
+}
+
+// Extend sign: 0xffff if bit 15 of a is set, otherwise 0. Relies on >> of a negative value being an arithmetic shift.
+static inline uint16 _uint16_ext( uint16 a )
+{
+    return (((int16)a)>>15);
+}
+
+// Negate
+static inline uint16 _uint16_neg( uint16 a )
+{
+    return (-a);
+}
+
+// Complement
+static inline uint16 _uint16_not( uint16 a )
+{
+    return (~a);
+}
+
+// Decrement
+static inline uint16 _uint16_dec( uint16 a )
+{
+    return (a - 1);
+}
+
+// Shift Left Logical
+static inline uint16 _uint16_sll( uint16 a, int sa )
+{
+    return (a << sa);
+}
+
+// Shift Right Logical
+static inline uint16 _uint16_srl( uint16 a, int sa )
+{
+    return (a >> sa);
+}
+
+// Add
+static inline uint16 _uint16_add( uint16 a, uint16 b )
+{
+    return (a + b);
+}
+
+// Subtract
+static inline uint16 _uint16_sub( uint16 a, uint16 b )
+{
+    return (a - b);
+}
+
+// And
+static inline uint16 _uint16_and( uint16 a, uint16 b )
+{
+    return (a & b);
+}
+
+// Or
+static inline uint16 _uint16_or( uint16 a, uint16 b )
+{
+    return (a | b);
+}
+
+// Exclusive Or
+static inline uint16 _uint16_xor( uint16 a, uint16 b )
+{
+    return (a ^ b);
+}
+
+// And with Complement
+static inline uint16 _uint16_andc( uint16 a, uint16 b )
+{
+    return (a & ~b);
+}
+
+// And then Shift Right Logical
+static inline uint16 _uint16_andsrl( uint16 a, uint16 b, int sa )
+{
+    return ((a & b) >> sa);
+}
+
+// Shift Right Logical then Mask
+static inline uint16 _uint16_srlm( uint16 a, int sa, uint16 mask )
+{
+    return ((a >> sa) & mask);
+}
+
+// Add then Mask
+static inline uint16 _uint16_addm( uint16 a, uint16 b, uint16 mask )
+{
+    return ((a + b) & mask);
+}
+
+
+// Select on sign bit: returns a if bit 15 of test is set, otherwise b. Done with a mask instead of a branch.
+static inline uint16 _uint16_sels( uint16 test, uint16 a, uint16 b )
+{
+    const uint16 mask   = _uint16_ext( test );             // 0xffff or zero
+    const uint16 sel_a  = _uint16_and(  a,     mask  );    // a when selected, else 0
+    const uint16 sel_b  = _uint16_andc( b,     mask  );    // b when a is not selected, else 0
+    const uint16 result = _uint16_or(   sel_a, sel_b );
+
+    return (result);
+}
+
+#if NV_OS_XBOX
+#include <PPCIntrinsics.h>
+#elif NV_CC_MSVC
+
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+
+static inline uint32 _uint32_nlz( uint32 x ) {	// Leading-zero count for x != 0 (MSVC); file-local like the other helpers.
+    unsigned long index;
+    _BitScanReverse(&index, x);	// index is not set when x == 0; _uint32_cntlz replaces the result with 32 in that case
+    return 31 - index;
+}
+#endif
+
+
+// Count leading zeros of a 32-bit value; returns 32 for x == 0.
+static inline uint32 _uint32_cntlz( uint32 x )
+{
+#if NV_CC_GCC
+    /* Maps to cntlzw on PowerPC and bsr/lzcnt on x86; __builtin_clz( 0 ) is undefined, so 0 is patched to 32 below. */
+    /* (x | -x) has its MSB set for every x != 0; -x alone has it clear for x > 0x80000000 and would yield 32. */
+    uint32 is_x_nez_msb = _uint32_or( x, _uint32_neg( x ) );
+    uint32 nlz          = __builtin_clz( x );
+    uint32 result       = _uint32_sels( is_x_nez_msb, nlz, 0x00000020 );
+    return (result);
+#elif NV_OS_XBOX
+    // Xbox PPC has this as an intrinsic.
+    return _CountLeadingZeros(x);
+#elif NV_CC_MSVC
+    uint32 is_x_nez_msb = _uint32_or( x, _uint32_neg( x ) );    // MSB set iff x != 0 (see the GCC branch)
+    uint32 nlz          = _uint32_nlz( x );
+    uint32 result       = _uint32_sels( is_x_nez_msb, nlz, 0x00000020 );
+    return (result);
+#else
+    const uint32 x0  = _uint32_srl(  x,  1 );
+    const uint32 x1  = _uint32_or(   x,  x0 );
+    const uint32 x2  = _uint32_srl(  x1, 2 );
+    const uint32 x3  = _uint32_or(   x1, x2 );
+    const uint32 x4  = _uint32_srl(  x3, 4 );
+    const uint32 x5  = _uint32_or(   x3, x4 );
+    const uint32 x6  = _uint32_srl(  x5, 8 );
+    const uint32 x7  = _uint32_or(   x5, x6 );
+    const uint32 x8  = _uint32_srl(  x7, 16 );
+    const uint32 x9  = _uint32_or(   x7, x8 );
+    const uint32 xA  = _uint32_not(  x9 );
+    const uint32 xB  = _uint32_srl(  xA, 1 );
+    const uint32 xC  = _uint32_and(  xB, 0x55555555 );
+    const uint32 xD  = _uint32_sub(  xA, xC );
+    const uint32 xE  = _uint32_and(  xD, 0x33333333 );
+    const uint32 xF  = _uint32_srl(  xD, 2 );
+    const uint32 x10 = _uint32_and(  xF, 0x33333333 );
+    const uint32 x11 = _uint32_add(  xE, x10 );
+    const uint32 x12 = _uint32_srl(  x11, 4 );
+    const uint32 x13 = _uint32_add(  x11, x12 );
+    const uint32 x14 = _uint32_and(  x13, 0x0f0f0f0f );
+    const uint32 x15 = _uint32_srl(  x14, 8 );
+    const uint32 x16 = _uint32_add(  x14, x15 );
+    const uint32 x17 = _uint32_srl(  x16, 16 );
+    const uint32 x18 = _uint32_add(  x16, x17 );
+    const uint32 x19 = _uint32_and(  x18, 0x0000003f );
+    return ( x19 );
+#endif
+}
+
+// Count leading zeros of a 16-bit value; returns 16 for x == 0.
+static inline uint16 _uint16_cntlz( uint16 x )
+{
+#ifdef __GNUC__
+    /* Reuses the 32-bit count: the zero-extended value always has 16 extra leading zeros. */
+    /* On PowerPC, this will map to insn: cntlzw */
+    uint16 nlz32 = (uint16)_uint32_cntlz( (uint32)x );
+    uint32 nlz   = _uint32_sub( nlz32, 16 );
+    return (nlz);
+#elif NV_OS_XBOX    // was _NV_OS_XBOX_, which nothing else in this file uses, so this branch was never compiled
+    uint16 nlz32 = (uint16)_CountLeadingZeros( (uint32)x );
+    return _uint32_sub( nlz32, 16);
+#else
+    const uint16 x0  = _uint16_srl(  x,  1 );
+    const uint16 x1  = _uint16_or(   x,  x0 );
+    const uint16 x2  = _uint16_srl(  x1, 2 );
+    const uint16 x3  = _uint16_or(   x1, x2 );
+    const uint16 x4  = _uint16_srl(  x3, 4 );
+    const uint16 x5  = _uint16_or(   x3, x4 );
+    const uint16 x6  = _uint16_srl(  x5, 8 );
+    const uint16 x7  = _uint16_or(   x5, x6 );
+    const uint16 x8  = _uint16_not(  x7 );
+    const uint16 x9  = _uint16_srlm( x8, 1, 0x5555 );
+    const uint16 xA  = _uint16_sub(  x8, x9 );
+    const uint16 xB  = _uint16_and(  xA, 0x3333 );
+    const uint16 xC  = _uint16_srlm( xA, 2, 0x3333 );
+    const uint16 xD  = _uint16_add(  xB, xC );
+    const uint16 xE  = _uint16_srl(  xD, 4 );
+    const uint16 xF  = _uint16_addm( xD, xE, 0x0f0f );
+    const uint16 x10 = _uint16_srl(  xF, 8 );
+    const uint16 x11 = _uint16_addm( xF, x10, 0x001f );
+    return ( x11 );
+#endif
+}
+
+uint16 // returns the bit pattern of the 16-bit half value
+nv::half_from_float( uint32 f ) // f: bit pattern of a 32-bit float
+{ // Straight-line conversion: all candidate results are computed and one is picked with _uint32_sels. NOTE(review): comments assume the _uint32_* helpers are plain wrappers for the named operation and IEEE 754 binary32/binary16 layouts - confirm.
+    const uint32 one                        = _uint32_li( 0x00000001 );
+    const uint32 f_s_mask                   = _uint32_li( 0x80000000 ); // float sign bit
+    const uint32 f_e_mask                   = _uint32_li( 0x7f800000 ); // float exponent field (bits 23..30)
+    const uint32 f_m_mask                   = _uint32_li( 0x007fffff ); // float mantissa field (bits 0..22)
+    const uint32 f_m_hidden_bit             = _uint32_li( 0x00800000 ); // bit 23, just above the mantissa field
+    const uint32 f_m_round_bit              = _uint32_li( 0x00001000 ); // bit 12, the highest bit lost by the 13-bit mantissa shift
+    const uint32 f_snan_mask                = _uint32_li( 0x7fc00000 ); // exponent field plus the top mantissa bit
+    const uint32 f_e_pos                    = _uint32_li( 0x00000017 ); // 23: shift that brings the float exponent down to bit 0
+    const uint32 h_e_pos                    = _uint32_li( 0x0000000a ); // 10: position of the half exponent field
+    const uint32 h_e_mask                   = _uint32_li( 0x00007c00 ); // half exponent field (bits 10..14); also the infinity pattern
+    const uint32 h_snan_mask                = _uint32_li( 0x00007e00 ); // half exponent field plus the top mantissa bit
+    const uint32 h_e_mask_value             = _uint32_li( 0x0000001f ); // 31: half exponent with all five bits set
+    const uint32 f_h_s_pos_offset           = _uint32_li( 0x00000010 ); // 16: distance from float sign (bit 31) to half sign (bit 15)
+    const uint32 f_h_bias_offset            = _uint32_li( 0x00000070 ); // 112 = 127 - 15, difference of the two exponent biases
+    const uint32 f_h_m_pos_offset           = _uint32_li( 0x0000000d ); // 13 = 23 - 10, difference of the two mantissa widths
+    const uint32 h_nan_min                  = _uint32_li( 0x00007c01 ); // half exponent all ones with only the lowest mantissa bit set
+    const uint32 f_h_e_biased_flag          = _uint32_li( 0x0000008f ); // 143 = 255 - 112
+    const uint32 f_s                        = _uint32_and( f,               f_s_mask         );
+    const uint32 f_e                        = _uint32_and( f,               f_e_mask         );
+    const uint16 h_s                        = _uint32_srl( f_s,             f_h_s_pos_offset ); // sign moved to bit 15
+    const uint32 f_m                        = _uint32_and( f,               f_m_mask         );
+    const uint16 f_e_amount                 = _uint32_srl( f_e,             f_e_pos          ); // float exponent field as a number (0..255)
+    const uint32 f_e_half_bias              = _uint32_sub( f_e_amount,      f_h_bias_offset  ); // exponent re-based by 112; wraps around for fields below 112
+    const uint32 f_snan                     = _uint32_and( f,               f_snan_mask      );
+    const uint32 f_m_round_mask             = _uint32_and( f_m,             f_m_round_bit    );
+    const uint32 f_m_round_offset           = _uint32_sll( f_m_round_mask,  one              ); // bit 13 when bit 12 of the mantissa is set, else 0
+    const uint32 f_m_rounded                = _uint32_add( f_m,             f_m_round_offset ); // rounds up at the lowest bit that survives the shift
+    const uint32 f_m_denorm_sa              = _uint32_sub( one,             f_e_half_bias    ); // extra right shift used for the denormal candidate
+    const uint32 f_m_with_hidden            = _uint32_or(  f_m_rounded,     f_m_hidden_bit   );
+    const uint32 f_m_denorm                 = _uint32_srl( f_m_with_hidden, f_m_denorm_sa    );
+    const uint32 h_m_denorm                 = _uint32_srl( f_m_denorm,      f_h_m_pos_offset );
+    const uint32 f_m_rounded_overflow       = _uint32_and( f_m_rounded,     f_m_hidden_bit   ); // non-zero when rounding carried out of the mantissa field
+    const uint32 m_nan                      = _uint32_srl( f_m,             f_h_m_pos_offset );
+    const uint32 h_em_nan                   = _uint32_or(  h_e_mask,        m_nan            );
+    const uint32 h_e_norm_overflow_offset   = _uint32_inc( f_e_half_bias );
+    const uint32 h_e_norm_overflow          = _uint32_sll( h_e_norm_overflow_offset, h_e_pos          );
+    const uint32 h_e_norm                   = _uint32_sll( f_e_half_bias,            h_e_pos          );
+    const uint32 h_m_norm                   = _uint32_srl( f_m_rounded,              f_h_m_pos_offset );
+    const uint32 h_em_norm                  = _uint32_or(  h_e_norm,                 h_m_norm         );
+    const uint32 is_h_ndenorm_msb           = _uint32_sub( f_h_bias_offset,   f_e_amount    ); // 112 - exponent field: negative (msb set) when the field is above 112
+    const uint32 is_f_e_flagged_msb         = _uint32_sub( f_h_e_biased_flag, f_e_half_bias );
+    const uint32 is_h_denorm_msb            = _uint32_not( is_h_ndenorm_msb ); // msb set when the exponent field is 112 or less
+    const uint32 is_f_m_eqz_msb             = _uint32_dec( f_m   ); // f_m - 1: msb set only when f_m is 0
+    const uint32 is_h_nan_eqz_msb           = _uint32_dec( m_nan ); // m_nan - 1: msb set only when m_nan is 0
+    const uint32 is_f_inf_msb               = _uint32_and( is_f_e_flagged_msb, is_f_m_eqz_msb   );
+    const uint32 is_f_nan_underflow_msb     = _uint32_and( is_f_e_flagged_msb, is_h_nan_eqz_msb );
+    const uint32 is_e_overflow_msb          = _uint32_sub( h_e_mask_value,     f_e_half_bias    ); // 31 - re-based exponent: msb set once it exceeds 31. NOTE(review): this is 0 for exactly 31 (float exponent field 143) - confirm such inputs with a non-zero mantissa are handled as intended
+    const uint32 is_h_inf_msb               = _uint32_or(  is_e_overflow_msb,  is_f_inf_msb     );
+    const uint32 is_f_nsnan_msb             = _uint32_sub( f_snan,             f_snan_mask      ); // 0 only when every bit of f_snan_mask is set in f
+    const uint32 is_m_norm_overflow_msb     = _uint32_neg( f_m_rounded_overflow );
+    const uint32 is_f_snan_msb              = _uint32_not( is_f_nsnan_msb );
+    const uint32 h_em_overflow_result       = _uint32_sels( is_m_norm_overflow_msb, h_e_norm_overflow, h_em_norm                 ); // selection chain: each step keeps the previous result unless its flag applies (NOTE(review): assumes _uint32_sels(t, a, b) yields a when the msb of t is set - confirm)
+    const uint32 h_em_nan_result            = _uint32_sels( is_f_e_flagged_msb,     h_em_nan,          h_em_overflow_result      );
+    const uint32 h_em_nan_underflow_result  = _uint32_sels( is_f_nan_underflow_msb, h_nan_min,         h_em_nan_result           );
+    const uint32 h_em_inf_result            = _uint32_sels( is_h_inf_msb,           h_e_mask,          h_em_nan_underflow_result );
+    const uint32 h_em_denorm_result         = _uint32_sels( is_h_denorm_msb,        h_m_denorm,        h_em_inf_result           );
+    const uint32 h_em_snan_result           = _uint32_sels( is_f_snan_msb,          h_snan_mask,       h_em_denorm_result        ); // last step of the chain, so it overrides all earlier choices
+    const uint32 h_result                   = _uint32_or( h_s, h_em_snan_result ); // add the sign bit to the chosen exponent/mantissa bits
+
+    return (uint16)(h_result);
+}
+
+uint32 // returns the bit pattern of the 32-bit float value
+nv::half_to_float( uint16 h ) // h: bit pattern of a 16-bit half
+{ // Straight-line conversion: all candidate results are computed and one is picked with _uint32_sels. NOTE(review): comments assume the _uint32_* helpers are plain wrappers for the named operation and IEEE 754 binary16/binary32 layouts - confirm.
+    const uint32 h_e_mask              = _uint32_li( 0x00007c00 ); // half exponent field (bits 10..14)
+    const uint32 h_m_mask              = _uint32_li( 0x000003ff ); // half mantissa field (bits 0..9)
+    const uint32 h_s_mask              = _uint32_li( 0x00008000 ); // half sign bit
+    const uint32 h_f_s_pos_offset      = _uint32_li( 0x00000010 ); // 16: moves the sign from bit 15 to bit 31
+    const uint32 h_f_e_pos_offset      = _uint32_li( 0x0000000d ); // 13: moves exponent and mantissa from half to float position
+    const uint32 h_f_bias_offset       = _uint32_li( 0x0001c000 ); // 112 << 10: bias difference (127 - 15) at the half exponent position
+    const uint32 f_e_mask              = _uint32_li( 0x7f800000 ); // float exponent field; also the infinity pattern
+    const uint32 f_m_mask              = _uint32_li( 0x007fffff ); // float mantissa field
+    const uint32 h_f_e_denorm_bias     = _uint32_li( 0x0000007e ); // 126
+    const uint32 h_f_m_denorm_sa_bias  = _uint32_li( 0x00000008 );
+    const uint32 f_e_pos               = _uint32_li( 0x00000017 ); // 23: position of the float exponent field
+    const uint32 h_e_mask_minus_one    = _uint32_li( 0x00007bff );
+    const uint32 h_e                   = _uint32_and( h, h_e_mask );
+    const uint32 h_m                   = _uint32_and( h, h_m_mask );
+    const uint32 h_s                   = _uint32_and( h, h_s_mask );
+    const uint32 h_e_f_bias            = _uint32_add( h_e, h_f_bias_offset ); // exponent re-based while still at the half position
+    const uint32 h_m_nlz               = _uint32_cntlz( h_m ); // leading zeros of the mantissa, counted in 32 bits
+    const uint32 f_s                   = _uint32_sll( h_s,        h_f_s_pos_offset );
+    const uint32 f_e                   = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
+    const uint32 f_m                   = _uint32_sll( h_m,        h_f_e_pos_offset );
+    const uint32 f_em                  = _uint32_or(  f_e,        f_m              ); // candidate for ordinary (normal) inputs
+    const uint32 h_f_m_sa              = _uint32_sub( h_m_nlz,             h_f_m_denorm_sa_bias ); // nlz - 8: shift that moves the highest set mantissa bit to bit 23
+    const uint32 f_e_denorm_unpacked   = _uint32_sub( h_f_e_denorm_bias,   h_f_m_sa             ); // 126 minus that shift: exponent for the denormal candidate
+    const uint32 h_f_m                 = _uint32_sll( h_m,                 h_f_m_sa             );
+    const uint32 f_m_denorm            = _uint32_and( h_f_m,               f_m_mask             ); // masking drops bit 23 again
+    const uint32 f_e_denorm            = _uint32_sll( f_e_denorm_unpacked, f_e_pos              );
+    const uint32 f_em_denorm           = _uint32_or(  f_e_denorm,          f_m_denorm           );
+    const uint32 f_em_nan              = _uint32_or(  f_e_mask,            f_m                  ); // exponent all ones, mantissa bits carried over
+    const uint32 is_e_eqz_msb          = _uint32_dec(  h_e ); // h_e - 1: msb set only when the exponent field is 0
+    const uint32 is_m_nez_msb          = _uint32_neg(  h_m ); // -h_m: msb set only when the mantissa is non-zero
+    const uint32 is_e_flagged_msb      = _uint32_sub(  h_e_mask_minus_one, h_e ); // 0x7bff - h_e: msb set only when the exponent field is all ones
+    const uint32 is_zero_msb           = _uint32_andc( is_e_eqz_msb,       is_m_nez_msb ); // NOTE(review): assumes _uint32_andc(a, b) is a & ~b - confirm
+    const uint32 is_inf_msb            = _uint32_andc( is_e_flagged_msb,   is_m_nez_msb );
+    const uint32 is_denorm_msb         = _uint32_and(  is_m_nez_msb,       is_e_eqz_msb );
+    const uint32 is_nan_msb            = _uint32_and(  is_e_flagged_msb,   is_m_nez_msb ); // exponent all ones and mantissa non-zero
+    const uint32 is_zero               = _uint32_ext(  is_zero_msb ); // NOTE(review): presumably spreads the msb over all 32 bits - confirm
+    const uint32 f_zero_result         = _uint32_andc( f_em, is_zero ); // clears the normal candidate when the input is zero
+    const uint32 f_denorm_result       = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result ); // selection chain: each step keeps the previous result unless its flag applies
+    const uint32 f_inf_result          = _uint32_sels( is_inf_msb,    f_e_mask,    f_denorm_result );
+    const uint32 f_nan_result          = _uint32_sels( is_nan_msb,    f_em_nan,    f_inf_result    );
+    const uint32 f_result              = _uint32_or( f_s, f_nan_result ); // add the sign bit to the chosen exponent/mantissa bits
+
+    return (f_result);
+}
+
+
+// Lookup tables read by fast_half_to_float and filled by half_init_tables(). @@ These tables could be smaller.
+static uint32 mantissa_table[2048]; // indexed with offset_table[h >> 10] + (h & 0x3ff)
+static uint32 exponent_table[64]; // indexed with h >> 10, i.e. the top six bits of the half
+static uint32 offset_table[64]; // indexed with h >> 10; holds 0 or 1024 to pick a half of mantissa_table
+
+void nv::half_init_tables()
+{
+    // Mantissa table, entry 0: a zero mantissa maps to zero.
+    mantissa_table[0] = 0;
+
+    // Entries 1..1023: shift the value left until bit 23 (0x00800000) is
+    // set, lowering the exponent term by 0x00800000 for every shift, then
+    // clear bit 23 and add the base exponent term 0x38800000.
+    for (int idx = 1; idx < 1024; ++idx) {
+        uint mant = idx << 13;
+        uint expo = 0;
+
+        for (; (mant & 0x00800000) == 0; mant <<= 1) {
+            expo -= 0x00800000;
+        }
+
+        mant &= ~0x00800000;
+        expo += 0x38800000;
+        mantissa_table[idx] = mant | expo;
+    }
+
+    // Entries 1024..2047: the 10-bit value moved up by 13 bits on top of
+    // the constant 0x38000000.
+    for (int idx = 0; idx < 1024; ++idx) {
+        mantissa_table[1024 + idx] = 0x38000000 + (idx << 13);
+    }
+
+
+    // Exponent table. fast_half_to_float indexes it with h >> 10, so
+    // entries 32..63 are the entries 0..31 with bit 31 (0x80000000) set.
+    const uint32 top_bit = 0x80000000;
+
+    exponent_table[0]  = 0;
+    exponent_table[32] = top_bit;
+
+    for (int e = 1; e < 31; ++e) {
+        exponent_table[e]      = (e << 23);
+        exponent_table[e + 32] = top_bit + (e << 23);
+    }
+
+    // The last entry of each half gets a fixed pattern.
+    exponent_table[31] = 0x47800000;
+    exponent_table[63] = 0xC7800000;
+
+
+    // Offset table: selects the lower (0) or upper (1024) half of
+    // mantissa_table. Only indices 0 and 32 use the lower half.
+    // (Same as ((e & 31) != 0) * 1024.)
+    for (int e = 0; e < 64; ++e) {
+        if (e == 0 || e == 32) {
+            offset_table[e] = 0;
+        }
+        else {
+            offset_table[e] = 1024;
+        }
+    }
+}
+
+// Table-driven half to float conversion; the tables must have been filled by half_init_tables(). Based on:
+// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
+uint32 nv::fast_half_to_float(uint16 h)
+{
+    const uint top6 = h >> 10, low10 = h & 0x3ff; // table selector (top six bits) and the ten mantissa bits
+    return exponent_table[top6] + mantissa_table[offset_table[top6] + low10];
+}
+
+
+#if 0
+// Disabled. Inaccurate conversion suggested at the ffmpeg mailing list:
+// http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html
+uint32 nv::fast_half_to_float(uint16 v) // NOTE(review): the values returned below (0, 1, 0xffff, ...) do not look like float bit patterns - confirm the intended output range before enabling
+{
+    if (v & 0x8000) return 0; // sign bit set: result is 0
+    uint exp = v >> 10; // exponent field (the sign bit is known to be clear here)
+    if (!exp) return (v>>9)&1; // zero exponent: result is the top mantissa bit (0 or 1)
+    if (exp >= 15) return 0xffff; // exponent field of 15 or more: fixed result 0xffff
+    v <<= 6; // v is 16 bits wide, so the exponent bits fall off and the mantissa lands in bits 6..15
+    return (v+(1<<16)) >> (15-exp); // put a 1 at bit 16 above the mantissa, then shift right by 15-exp
+}
+
+#endif
+
+#if 0
+
+// Another variant, taken from a gamedev thread:
+// http://www.devmaster.net/forums/showthread.php?t=10924
+
+// It does not appear to handle special values (zero or all-ones exponent fields) either: the code below only masks, re-bases and shifts the fields.
+
+// Mike Acton's code should be fairly easy to vectorize, and that would handle all cases too. The table method might still be faster, though.
+
+
+static __declspec(align(16)) unsigned half_sign[4]	  = {0x00008000, 0x00008000, 0x00008000, 0x00008000}; // half sign bit, one copy per 32-bit lane
+static __declspec(align(16)) unsigned half_exponent[4]	  = {0x00007C00, 0x00007C00, 0x00007C00, 0x00007C00}; // half exponent field
+static __declspec(align(16)) unsigned half_mantissa[4]	  = {0x000003FF, 0x000003FF, 0x000003FF, 0x000003FF}; // half mantissa field
+static __declspec(align(16)) unsigned half_bias_offset[4] = {0x0001C000, 0x0001C000, 0x0001C000, 0x0001C000}; // 112 << 10, added to the exponent field before it is shifted
+
+__asm
+{
+	movaps	xmm1, xmm0  // Input in xmm0
+	movaps	xmm2, xmm0  // xmm1 and xmm2 are working copies of the input
+
+	andps	xmm0, half_sign      // xmm0 = sign bits
+	andps	xmm1, half_exponent  // xmm1 = exponent fields
+	andps	xmm2, half_mantissa  // xmm2 = mantissa fields
+	paddd	xmm1, half_bias_offset  // re-base the exponents
+
+	pslld	xmm0, 16  // sign: bit 15 -> bit 31
+	pslld	xmm1, 13  // exponent: up by 13 bits
+	pslld	xmm2, 13  // mantissa: up by 13 bits
+
+	orps	xmm1, xmm2  // exponent | mantissa
+	orps	xmm0, xmm1  // Result in xmm0
+}
+
+
 #endif
\ No newline at end of file
diff --git a/src/nvmath/Half.h b/src/nvmath/Half.h
index 08f8f11..f732e93 100644
--- a/src/nvmath/Half.h
+++ b/src/nvmath/Half.h
@@ -1,30 +1,30 @@
-#pragma once
-#ifndef NV_MATH_HALF_H
-#define NV_MATH_HALF_H
-
-#include "nvmath.h"
-
-namespace nv {
-
-    uint32 half_to_float( uint16 h );
-    uint16 half_from_float( uint32 f );
-
-    void half_init_tables();
-
-    uint32 fast_half_to_float(uint16 h);
-
-    inline uint16 to_half(float c) {
-        union { float f; uint32 u; } f;
-        f.f = c;
-        return nv::half_from_float( f.u );
-    }
-
-    inline float to_float(uint16 c) {
-        union { float f; uint32 u; } f;
-        f.u = nv::fast_half_to_float( c );
-        return f.f;
-    }
-
-} // nv namespace
-
-#endif // NV_MATH_HALF_H
+#pragma once
+#ifndef NV_MATH_HALF_H
+#define NV_MATH_HALF_H
+
+#include "nvmath.h"
+
+namespace nv {
+    // Bit-level conversions: arguments and results are raw half / float bit patterns.
+    uint32 half_to_float( uint16 h );
+    uint16 half_from_float( uint32 f );
+    // Fills the lookup tables that fast_half_to_float reads; run it before the first lookup.
+    void half_init_tables();
+    uint32 fast_half_to_float(uint16 h);
+
+    // float -> half bits, handing the float's storage to half_from_float as a uint32.
+    inline uint16 to_half(float c) {
+        union { float value; uint32 bits; } pun;
+        pun.value = c;
+        return nv::half_from_float( pun.bits );
+    }
+
+    // half bits -> float via fast_half_to_float, so the tables must already be initialized.
+    inline float to_float(uint16 c) {
+        union { float value; uint32 bits; } pun;
+        pun.bits = nv::fast_half_to_float( c );
+        return pun.value;
+    }
+} // nv namespace
+
+#endif // NV_MATH_HALF_H
diff --git a/src/nvmath/Matrix.h b/src/nvmath/Matrix.h
index 5bd2cab..273e639 100644
--- a/src/nvmath/Matrix.h
+++ b/src/nvmath/Matrix.h
@@ -1,1199 +1,1199 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#pragma once
-#ifndef NV_MATH_MATRIX_H
-#define NV_MATH_MATRIX_H
-
-#include <nvmath/nvmath.h>
-#include <nvmath/Vector.h>
-
-namespace nv
-{
-    enum identity_t { identity };
-
-    class NVMATH_CLASS Matrix3
-    {
-    public:
-        Matrix3();
-        explicit Matrix3(float f);
-        explicit Matrix3(identity_t);
-        Matrix3(const Matrix3 & m);
-        Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2);
-
-        scalar get(uint row, uint col) const;
-        scalar operator()(uint row, uint col) const;
-        scalar & operator()(uint row, uint col);
-
-        Vector3 row(uint i) const;
-        Vector3 column(uint i) const;
-
-        void operator*=(float s);
-        void operator/=(float s);
-        void operator+=(const Matrix3 & m);
-        void operator-=(const Matrix3 & m);
-
-        float determinant() const;
-
-    private:
-        scalar m_data[9];
-    };
-
-    inline Matrix3::Matrix3() {}
-    
-    inline Matrix3::Matrix3(float f)
-    {
-        for(int i = 0; i < 9; i++) {
-            m_data[i] = f;
-        }
-    }
-
-    inline Matrix3::Matrix3(identity_t)
-    {
-        for(int i = 0; i < 3; i++) {
-            for(int j = 0; j < 3; j++) {
-                m_data[3*j+i] = (i == j) ? 1.0f : 0.0f;
-            }
-        }
-    }
-
-    inline Matrix3::Matrix3(const Matrix3 & m)
-    {
-        for(int i = 0; i < 9; i++) {
-            m_data[i] = m.m_data[i];
-        }
-    }
-    
-    inline Matrix3::Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2)
-    {
-        m_data[0] = v0.x; m_data[1] = v0.y; m_data[2] = v0.z;
-        m_data[3] = v1.x; m_data[4] = v1.y; m_data[5] = v1.z;
-        m_data[6] = v2.x; m_data[7] = v2.y; m_data[8] = v2.z;
-    }
-
-    inline scalar Matrix3::get(uint row, uint col) const
-    {
-        nvDebugCheck(row < 3 && col < 3);
-        return m_data[col * 3 + row];
-    }
-    inline scalar Matrix3::operator()(uint row, uint col) const
-    {
-        nvDebugCheck(row < 3 && col < 3);
-        return m_data[col * 3 + row];
-    }
-    inline scalar & Matrix3::operator()(uint row, uint col)
-    {
-        nvDebugCheck(row < 3 && col < 3);
-        return m_data[col * 3 + row];
-    }
-
-    inline Vector3 Matrix3::row(uint i) const
-    {
-        nvDebugCheck(i < 3);
-        return Vector3(get(i, 0), get(i, 1), get(i, 2));
-    }
-    inline Vector3 Matrix3::column(uint i) const
-    {
-        nvDebugCheck(i < 3);
-        return Vector3(get(0, i), get(1, i), get(2, i));
-    }
-
-    inline void Matrix3::operator*=(float s)
-    {
-        for(int i = 0; i < 9; i++) {
-            m_data[i] *= s;
-        }
-    }
-
-    inline void Matrix3::operator/=(float s)
-    {
-        float is = 1.0f /s;
-        for(int i = 0; i < 9; i++) {
-            m_data[i] *= is;
-        }
-    }
-
-    inline void Matrix3::operator+=(const Matrix3 & m)
-    {
-        for(int i = 0; i < 9; i++) {
-            m_data[i] += m.m_data[i];
-        }
-    }
-
-    inline void Matrix3::operator-=(const Matrix3 & m)
-    {
-        for(int i = 0; i < 9; i++) {
-            m_data[i] -= m.m_data[i];
-        }
-    }
-
-    inline Matrix3 operator+(const Matrix3 & a, const Matrix3 & b)
-    {
-        Matrix3 m = a;
-        m += b;
-        return m;
-    }
-
-    inline Matrix3 operator-(const Matrix3 & a, const Matrix3 & b)
-    {
-        Matrix3 m = a;
-        m -= b;
-        return m;
-    }
-
-    inline Matrix3 operator*(const Matrix3 & a, float s)
-    {
-        Matrix3 m = a;
-        m *= s;
-        return m;
-    }
-
-    inline Matrix3 operator*(float s, const Matrix3 & a)
-    {
-        Matrix3 m = a;
-        m *= s;
-        return m;
-    }
-
-    inline Matrix3 operator/(const Matrix3 & a, float s)
-    {
-        Matrix3 m = a;
-        m /= s;
-        return m;
-    }
-
-    inline Matrix3 mul(const Matrix3 & a, const Matrix3 & b)
-    {
-        Matrix3 m;
-
-        for(int i = 0; i < 3; i++) {
-            const scalar ai0 = a(i,0), ai1 = a(i,1), ai2 = a(i,2);
-            m(i, 0) = ai0 * b(0,0) + ai1 * b(1,0) + ai2 * b(2,0);
-            m(i, 1) = ai0 * b(0,1) + ai1 * b(1,1) + ai2 * b(2,1);
-            m(i, 2) = ai0 * b(0,2) + ai1 * b(1,2) + ai2 * b(2,2);
-        }
-
-        return m;
-    }
-
-    inline Matrix3 operator*(const Matrix3 & a, const Matrix3 & b)
-    {
-        return mul(a, b);
-    }
-
-    inline float Matrix3::determinant() const
-    {
-        return 
-            get(0,0) * get(1,1) * get(2,2) + 
-            get(0,1) * get(1,2) * get(2,0) + 
-            get(0,2) * get(1,0) * get(2,1) -
-            get(0,2) * get(1,1) * get(2,0) - 
-            get(0,1) * get(1,0) * get(2,2) -
-            get(0,0) * get(1,2) * get(2,1);
-    }
-
-
-
-    /// 4x4 transformation matrix.
-    /// -# Matrices are stored in memory in column major order.
-    /// -# Points are to be though of as column vectors.
-    /// -# Transformation of a point p by a matrix M is: p' = M * p
-    class NVMATH_CLASS Matrix
-    {
-    public:
-        typedef Matrix const & Arg;
-
-        Matrix();
-        explicit Matrix(float f);
-        explicit Matrix(identity_t);
-        Matrix(const Matrix & m);
-        Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);
-        //explicit Matrix(const scalar m[]);	// m is assumed to contain 16 elements
-
-        scalar data(uint idx) const;
-        scalar & data(uint idx);
-        scalar get(uint row, uint col) const;
-        scalar operator()(uint row, uint col) const;
-        scalar & operator()(uint row, uint col);
-        const scalar * ptr() const;
-
-        Vector4 row(uint i) const;
-        Vector4 column(uint i) const;
-
-        void scale(scalar s);
-        void scale(Vector3::Arg s);
-        void translate(Vector3::Arg t);
-        void rotate(scalar theta, scalar v0, scalar v1, scalar v2);
-        scalar determinant() const;
-
-        void apply(Matrix::Arg m);
-
-    private:
-        scalar m_data[16];
-    };
-
-
-    inline Matrix::Matrix()
-    {
-    }
-
-    inline Matrix::Matrix(float f)
-    {
-        for(int i = 0; i < 16; i++) {
-            m_data[i] = 0.0f;
-        }
-    }
-
-    inline Matrix::Matrix(identity_t)
-    {
-        for(int i = 0; i < 4; i++) {
-            for(int j = 0; j < 4; j++) {
-                m_data[4*j+i] = (i == j) ? 1.0f : 0.0f;
-            }
-        }
-    }
-
-    inline Matrix::Matrix(const Matrix & m)
-    {
-        for(int i = 0; i < 16; i++) {
-            m_data[i] = m.m_data[i];
-        }
-    }
-
-    inline Matrix::Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3)
-    {
-        m_data[ 0] = v0.x; m_data[ 1] = v0.y; m_data[ 2] = v0.z; m_data[ 3] = v0.w;
-        m_data[ 4] = v1.x; m_data[ 5] = v1.y; m_data[ 6] = v1.z; m_data[ 7] = v1.w;
-        m_data[ 8] = v2.x; m_data[ 9] = v2.y; m_data[10] = v2.z; m_data[11] = v2.w;
-        m_data[12] = v3.x; m_data[13] = v3.y; m_data[14] = v3.z; m_data[15] = v3.w;
-    }
-
-    /*inline Matrix::Matrix(const scalar m[])
-    {
-        for(int i = 0; i < 16; i++) {
-            m_data[i] = m[i];
-        }
-    }*/
-
-
-    // Accessors
-    inline scalar Matrix::data(uint idx) const
-    {
-        nvDebugCheck(idx < 16);
-        return m_data[idx];
-    }
-    inline scalar & Matrix::data(uint idx)
-    {
-        nvDebugCheck(idx < 16);
-        return m_data[idx];
-    }
-    inline scalar Matrix::get(uint row, uint col) const
-    {
-        nvDebugCheck(row < 4 && col < 4);
-        return m_data[col * 4 + row];
-    }
-    inline scalar Matrix::operator()(uint row, uint col) const
-    {
-        nvDebugCheck(row < 4 && col < 4);
-        return m_data[col * 4 + row];
-    }
-    inline scalar & Matrix::operator()(uint row, uint col)
-    {
-        nvDebugCheck(row < 4 && col < 4);
-        return m_data[col * 4 + row];
-    }
-
-    inline const scalar * Matrix::ptr() const
-    {
-        return m_data;
-    }
-
-    inline Vector4 Matrix::row(uint i) const
-    {
-        nvDebugCheck(i < 4);
-        return Vector4(get(i, 0), get(i, 1), get(i, 2), get(i, 3));
-    }
-
-    inline Vector4 Matrix::column(uint i) const
-    {
-        nvDebugCheck(i < 4);
-        return Vector4(get(0, i), get(1, i), get(2, i), get(3, i));
-    }
-
-    /// Apply scale.
-    inline void Matrix::scale(scalar s)
-    {
-        m_data[0] *= s; m_data[1] *= s; m_data[2] *= s; m_data[3] *= s;
-        m_data[4] *= s; m_data[5] *= s; m_data[6] *= s; m_data[7] *= s;
-        m_data[8] *= s; m_data[9] *= s; m_data[10] *= s; m_data[11] *= s;
-        m_data[12] *= s; m_data[13] *= s; m_data[14] *= s; m_data[15] *= s;
-    }
-
-    /// Apply scale.
-    inline void Matrix::scale(Vector3::Arg s)
-    {
-        m_data[0] *= s.x; m_data[1] *= s.x; m_data[2] *= s.x; m_data[3] *= s.x;
-        m_data[4] *= s.y; m_data[5] *= s.y; m_data[6] *= s.y; m_data[7] *= s.y;
-        m_data[8] *= s.z; m_data[9] *= s.z; m_data[10] *= s.z; m_data[11] *= s.z;
-    }
-
-    /// Apply translation.
-    inline void Matrix::translate(Vector3::Arg t)
-    {
-        m_data[12] = m_data[0] * t.x + m_data[4] * t.y + m_data[8]  * t.z + m_data[12];
-        m_data[13] = m_data[1] * t.x + m_data[5] * t.y + m_data[9]  * t.z + m_data[13];
-        m_data[14] = m_data[2] * t.x + m_data[6] * t.y + m_data[10] * t.z + m_data[14];
-        m_data[15] = m_data[3] * t.x + m_data[7] * t.y + m_data[11] * t.z + m_data[15];
-    }
-
-    Matrix rotation(scalar theta, scalar v0, scalar v1, scalar v2);
-
-    /// Apply rotation.
-    inline void Matrix::rotate(scalar theta, scalar v0, scalar v1, scalar v2)
-    {
-        Matrix R(rotation(theta, v0, v1, v2));
-        apply(R);
-    }
-
-    /// Apply transform.
-    inline void Matrix::apply(Matrix::Arg m)
-    {
-        nvDebugCheck(this != &m);
-
-        for(int i = 0; i < 4; i++) {
-            const scalar ai0 = get(i,0), ai1 = get(i,1), ai2 = get(i,2), ai3 = get(i,3);
-            m_data[0 + i] = ai0 * m(0,0) + ai1 * m(1,0) + ai2 * m(2,0) + ai3 * m(3,0);
-            m_data[4 + i] = ai0 * m(0,1) + ai1 * m(1,1) + ai2 * m(2,1) + ai3 * m(3,1);
-            m_data[8 + i] = ai0 * m(0,2) + ai1 * m(1,2) + ai2 * m(2,2) + ai3 * m(3,2);
-            m_data[12+ i] = ai0 * m(0,3) + ai1 * m(1,3) + ai2 * m(2,3) + ai3 * m(3,3);
-        }
-    }
-
-    /// Get scale matrix.
-    inline Matrix scale(Vector3::Arg s)
-    {
-        Matrix m(identity);
-        m(0,0) = s.x;
-        m(1,1) = s.y;
-        m(2,2) = s.z;
-        return m;
-    }
-
-    /// Get scale matrix.
-    inline Matrix scale(scalar s)
-    {
-        Matrix m(identity);
-        m(0,0) = m(1,1) = m(2,2) = s;
-        return m;
-    }
-
-    /// Get translation matrix.
-    inline Matrix translation(Vector3::Arg t)
-    {
-        Matrix m(identity);
-        m(0,3) = t.x;
-        m(1,3) = t.y;
-        m(2,3) = t.z;
-        return m;
-    }
-
-    /// Get rotation matrix.
-    inline Matrix rotation(scalar theta, scalar v0, scalar v1, scalar v2)
-    {
-        scalar cost = cosf(theta);
-        scalar sint = sinf(theta);
-
-        Matrix m(identity);
-
-        if( 1 == v0 && 0 == v1 && 0 == v2 ) {
-            m(1,1) = cost; m(2,1) = -sint;
-            m(1,2) = sint; m(2,2) = cost;
-        }
-        else if( 0 == v0  && 1 == v1 && 0 == v2 ) {
-            m(0,0) = cost; m(2,0) = sint;
-            m(1,2) = -sint; m(2,2) = cost;
-        }
-        else if( 0 == v0 && 0 == v1 && 1 == v2 ) {
-            m(0,0) = cost; m(1,0) = -sint;
-            m(0,1) = sint; m(1,1) = cost;
-        } 
-        else {
-            scalar a2, b2, c2;
-            a2 = v0 * v0;
-            b2 = v1 * v1;
-            c2 = v2 * v2;
-
-            scalar iscale = 1.0f / sqrtf(a2 + b2 + c2);
-            v0 *= iscale;
-            v1 *= iscale;
-            v2 *= iscale;
-
-            scalar abm, acm, bcm;
-            scalar mcos, asin, bsin, csin;
-            mcos = 1.0f - cost;
-            abm = v0 * v1 * mcos;
-            acm = v0 * v2 * mcos;
-            bcm = v1 * v2 * mcos;
-            asin = v0 * sint;
-            bsin = v1 * sint;
-            csin = v2 * sint;
-            m(0,0) = a2 * mcos + cost;
-            m(1,0) = abm - csin;
-            m(2,0) = acm + bsin;
-            m(3,0) = abm + csin;
-            m(1,1) = b2 * mcos + cost;
-            m(2,1) = bcm - asin;
-            m(3,1) = acm - bsin;
-            m(1,2) = bcm + asin;
-            m(2,2) = c2 * mcos + cost;
-        }
-        return m;
-    }
-
-    //Matrix rotation(scalar yaw, scalar pitch, scalar roll);
-    //Matrix skew(scalar angle, Vector3::Arg v1, Vector3::Arg v2);
-
-    /// Get frustum matrix.
-    inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear, scalar zFar)
-    {
-        Matrix m(0.0f);
-
-        scalar doubleznear = 2.0f * zNear;
-        scalar one_deltax = 1.0f / (xmax - xmin);
-        scalar one_deltay = 1.0f / (ymax - ymin);
-        scalar one_deltaz = 1.0f / (zFar - zNear);
-
-        m(0,0) = doubleznear * one_deltax;
-        m(1,1) = doubleznear * one_deltay;
-        m(0,2) = (xmax + xmin) * one_deltax;
-        m(1,2) = (ymax + ymin) * one_deltay;
-        m(2,2) = -(zFar + zNear) * one_deltaz;
-        m(3,2) = -1.0f;
-        m(2,3) = -(zFar * doubleznear) * one_deltaz;
-
-        return m;
-    }
-
-    /// Get infinite frustum matrix.
-    inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear)
-    {
-        Matrix m(0.0f);
-
-        scalar doubleznear = 2.0f * zNear;
-        scalar one_deltax = 1.0f / (xmax - xmin);
-        scalar one_deltay = 1.0f / (ymax - ymin);
-        scalar nudge = 1.0; // 0.999;
-
-        m(0,0) = doubleznear * one_deltax;
-        m(1,1) = doubleznear * one_deltay;
-        m(0,2) = (xmax + xmin) * one_deltax;
-        m(1,2) = (ymax + ymin) * one_deltay;
-        m(2,2) = -1.0f * nudge;
-        m(3,2) = -1.0f;
-        m(2,3) = -doubleznear * nudge;
-
-        return m;
-    }
-
-    /// Get perspective matrix.
-    inline Matrix perspective(scalar fovy, scalar aspect, scalar zNear, scalar zFar)
-    {
-        scalar xmax = zNear * tan(fovy / 2);
-        scalar xmin = -xmax;
-
-        scalar ymax = xmax / aspect;
-        scalar ymin = -ymax;
-
-        return frustum(xmin, xmax, ymin, ymax, zNear, zFar);	
-    }
-
-    /// Get infinite perspective matrix.
-    inline Matrix perspective(scalar fovy, scalar aspect, scalar zNear)
-    {
-        scalar x = zNear * tan(fovy / 2);
-        scalar y = x / aspect;
-        return frustum( -x, x, -y, y, zNear );	
-    }
-
-    /// Get matrix determinant.
-    inline scalar Matrix::determinant() const
-    {
-        return 
-            m_data[3] * m_data[6] * m_data[ 9] * m_data[12] - m_data[2] * m_data[7] * m_data[ 9] * m_data[12] - m_data[3] * m_data[5] * m_data[10] * m_data[12] + m_data[1] * m_data[7] * m_data[10] * m_data[12] +
-            m_data[2] * m_data[5] * m_data[11] * m_data[12] - m_data[1] * m_data[6] * m_data[11] * m_data[12] - m_data[3] * m_data[6] * m_data[ 8] * m_data[13] + m_data[2] * m_data[7] * m_data[ 8] * m_data[13] +
-            m_data[3] * m_data[4] * m_data[10] * m_data[13] - m_data[0] * m_data[7] * m_data[10] * m_data[13] - m_data[2] * m_data[4] * m_data[11] * m_data[13] + m_data[0] * m_data[6] * m_data[11] * m_data[13] +
-            m_data[3] * m_data[5] * m_data[ 8] * m_data[14] - m_data[1] * m_data[7] * m_data[ 8] * m_data[14] - m_data[3] * m_data[4] * m_data[ 9] * m_data[14] + m_data[0] * m_data[7] * m_data[ 9] * m_data[14] +
-            m_data[1] * m_data[4] * m_data[11] * m_data[14] - m_data[0] * m_data[5] * m_data[11] * m_data[14] - m_data[2] * m_data[5] * m_data[ 8] * m_data[15] + m_data[1] * m_data[6] * m_data[ 8] * m_data[15] +
-            m_data[2] * m_data[4] * m_data[ 9] * m_data[15] - m_data[0] * m_data[6] * m_data[ 9] * m_data[15] - m_data[1] * m_data[4] * m_data[10] * m_data[15] + m_data[0] * m_data[5] * m_data[10] * m_data[15];
-    }
-
-    inline Matrix transpose(Matrix::Arg m)
-    {
-        Matrix r;
-        for (int i = 0; i < 4; i++)
-        {
-            for (int j = 0; j < 4; j++)
-            {
-                r(i, j) = m(j, i);
-            }
-        }
-        return r;
-    }
-
-    inline Matrix inverse(Matrix::Arg m)
-    {
-        Matrix r;
-        r.data( 0) = m.data(6)*m.data(11)*m.data(13) - m.data(7)*m.data(10)*m.data(13) + m.data(7)*m.data(9)*m.data(14) - m.data(5)*m.data(11)*m.data(14) - m.data(6)*m.data(9)*m.data(15) + m.data(5)*m.data(10)*m.data(15);
-        r.data( 1) = m.data(3)*m.data(10)*m.data(13) - m.data(2)*m.data(11)*m.data(13) - m.data(3)*m.data(9)*m.data(14) + m.data(1)*m.data(11)*m.data(14) + m.data(2)*m.data(9)*m.data(15) - m.data(1)*m.data(10)*m.data(15);
-        r.data( 2) = m.data(2)*m.data( 7)*m.data(13) - m.data(3)*m.data( 6)*m.data(13) + m.data(3)*m.data(5)*m.data(14) - m.data(1)*m.data( 7)*m.data(14) - m.data(2)*m.data(5)*m.data(15) + m.data(1)*m.data( 6)*m.data(15);
-        r.data( 3) = m.data(3)*m.data( 6)*m.data( 9) - m.data(2)*m.data( 7)*m.data( 9) - m.data(3)*m.data(5)*m.data(10) + m.data(1)*m.data( 7)*m.data(10) + m.data(2)*m.data(5)*m.data(11) - m.data(1)*m.data( 6)*m.data(11);
-        r.data( 4) = m.data(7)*m.data(10)*m.data(12) - m.data(6)*m.data(11)*m.data(12) - m.data(7)*m.data(8)*m.data(14) + m.data(4)*m.data(11)*m.data(14) + m.data(6)*m.data(8)*m.data(15) - m.data(4)*m.data(10)*m.data(15);
-        r.data( 5) = m.data(2)*m.data(11)*m.data(12) - m.data(3)*m.data(10)*m.data(12) + m.data(3)*m.data(8)*m.data(14) - m.data(0)*m.data(11)*m.data(14) - m.data(2)*m.data(8)*m.data(15) + m.data(0)*m.data(10)*m.data(15);
-        r.data( 6) = m.data(3)*m.data( 6)*m.data(12) - m.data(2)*m.data( 7)*m.data(12) - m.data(3)*m.data(4)*m.data(14) + m.data(0)*m.data( 7)*m.data(14) + m.data(2)*m.data(4)*m.data(15) - m.data(0)*m.data( 6)*m.data(15);
-        r.data( 7) = m.data(2)*m.data( 7)*m.data( 8) - m.data(3)*m.data( 6)*m.data( 8) + m.data(3)*m.data(4)*m.data(10) - m.data(0)*m.data( 7)*m.data(10) - m.data(2)*m.data(4)*m.data(11) + m.data(0)*m.data( 6)*m.data(11);
-        r.data( 8) = m.data(5)*m.data(11)*m.data(12) - m.data(7)*m.data( 9)*m.data(12) + m.data(7)*m.data(8)*m.data(13) - m.data(4)*m.data(11)*m.data(13) - m.data(5)*m.data(8)*m.data(15) + m.data(4)*m.data( 9)*m.data(15);
-        r.data( 9) = m.data(3)*m.data( 9)*m.data(12) - m.data(1)*m.data(11)*m.data(12) - m.data(3)*m.data(8)*m.data(13) + m.data(0)*m.data(11)*m.data(13) + m.data(1)*m.data(8)*m.data(15) - m.data(0)*m.data( 9)*m.data(15);
-        r.data(10) = m.data(1)*m.data( 7)*m.data(12) - m.data(3)*m.data( 5)*m.data(12) + m.data(3)*m.data(4)*m.data(13) - m.data(0)*m.data( 7)*m.data(13) - m.data(1)*m.data(4)*m.data(15) + m.data(0)*m.data( 5)*m.data(15);
-        r.data(11) = m.data(3)*m.data( 5)*m.data( 8) - m.data(1)*m.data( 7)*m.data( 8) - m.data(3)*m.data(4)*m.data( 9) + m.data(0)*m.data( 7)*m.data( 9) + m.data(1)*m.data(4)*m.data(11) - m.data(0)*m.data( 5)*m.data(11);
-        r.data(12) = m.data(6)*m.data( 9)*m.data(12) - m.data(5)*m.data(10)*m.data(12) - m.data(6)*m.data(8)*m.data(13) + m.data(4)*m.data(10)*m.data(13) + m.data(5)*m.data(8)*m.data(14) - m.data(4)*m.data( 9)*m.data(14);
-        r.data(13) = m.data(1)*m.data(10)*m.data(12) - m.data(2)*m.data( 9)*m.data(12) + m.data(2)*m.data(8)*m.data(13) - m.data(0)*m.data(10)*m.data(13) - m.data(1)*m.data(8)*m.data(14) + m.data(0)*m.data( 9)*m.data(14);
-        r.data(14) = m.data(2)*m.data( 5)*m.data(12) - m.data(1)*m.data( 6)*m.data(12) - m.data(2)*m.data(4)*m.data(13) + m.data(0)*m.data( 6)*m.data(13) + m.data(1)*m.data(4)*m.data(14) - m.data(0)*m.data( 5)*m.data(14);
-        r.data(15) = m.data(1)*m.data( 6)*m.data( 8) - m.data(2)*m.data( 5)*m.data( 8) + m.data(2)*m.data(4)*m.data( 9) - m.data(0)*m.data( 6)*m.data( 9) - m.data(1)*m.data(4)*m.data(10) + m.data(0)*m.data( 5)*m.data(10);
-        r.scale(1.0f / m.determinant());
-        return r;
-    }
-
-    inline Matrix isometryInverse(Matrix::Arg m)
-    {
-        Matrix r(identity);
-
-        // transposed 3x3 upper left matrix
-        for (int i = 0; i < 3; i++)
-        {
-            for (int j = 0; j < 3; j++)
-            {
-                r(i, j) = m(j, i);
-            }
-        }
-
-        // translate by the negative offsets
-        r.translate(-Vector3(m.data(12), m.data(13), m.data(14)));
-
-        return r;
-    }
-
-    //Matrix affineInverse(Matrix::Arg m);
-
-    /// Transform the given 3d point with the given matrix.
-    inline Vector3 transformPoint(Matrix::Arg m, Vector3::Arg p)
-    {
-        return Vector3(
-            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + m(0,3),
-            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + m(1,3),
-            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + m(2,3));
-    }
-
-    /// Transform the given 3d vector with the given matrix.
-    inline Vector3 transformVector(Matrix::Arg m, Vector3::Arg p)
-    {
-        return Vector3(
-            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2),
-            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2),
-            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2));
-    }
-
-    /// Transform the given 4d vector with the given matrix.
-    inline Vector4 transform(Matrix::Arg m, Vector4::Arg p)
-    {
-        return Vector4(
-            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + p.w * m(0,3),
-            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + p.w * m(1,3),
-            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + p.w * m(2,3),
-            p.x * m(3,0) + p.y * m(3,1) + p.z * m(3,2) + p.w * m(3,3));
-    }
-
-    inline Matrix mul(Matrix::Arg a, Matrix::Arg b)
-    {
-        // @@ Is this the right order? mul(a, b) = b * a
-        Matrix m = a;
-        m.apply(b);
-        return m;
-    }
-
-} // nv namespace
-
-
-
-
-#if 0
-/** @name Special matrices. */
-//@{
-/** Generate a translation matrix. */
-void TranslationMatrix(const Vec3 & v) {
-    data[0] = 1; data[1] = 0; data[2] = 0; data[3] = 0;
-    data[4] = 0; data[5] = 1; data[6] = 0; data[7] = 0;
-    data[8] = 0; data[9] = 0; data[10] = 1; data[11] = 0;
-    data[12] = v.x; data[13] = v.y; data[14] = v.z; data[15] = 1;
-}
-
-/** Rotate theta degrees around v. */
-void RotationMatrix( scalar theta, scalar v0, scalar v1, scalar v2 ) {
-    scalar cost = cos(theta);
-    scalar sint = sin(theta);
-
-    if( 1 == v0 && 0 == v1 && 0 == v2 ) {
-        data[0] = 1.0f;	data[1] = 0.0f;	data[2] = 0.0f;	data[3] = 0.0f;
-        data[4] = 0.0f;	data[5] = cost;	data[6] = -sint;data[7] = 0.0f;
-        data[8] = 0.0f;	data[9] = sint;	data[10] = cost;data[11] = 0.0f;
-        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
-    }
-    else if( 0 == v0  && 1 == v1 && 0 == v2 ) {
-        data[0] = cost;	data[1] = 0.0f;	data[2] = sint;	data[3] = 0.0f;
-        data[4] = 0.0f;	data[5] = 1.0f;	data[6] = 0.0f;	data[7] = 0.0f;
-        data[8] = -sint;data[9] = 0.0f;data[10] = cost;	data[11] = 0.0f;
-        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
-    }
-    else if( 0 == v0 && 0 == v1 && 1 == v2 ) {
-        data[0] = cost;	data[1] = -sint;data[2] = 0.0f;	data[3] = 0.0f;
-        data[4] = sint; data[5] = cost;	data[6] = 0.0f;	data[7] = 0.0f;
-        data[8] = 0.0f;	data[9] = 0.0f;	data[10] = 1.0f;data[11] = 0.0f;
-        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
-    } 
-    else {
-        //we need scale a,b,c to unit length.
-        scalar a2, b2, c2;
-        a2 = v0 * v0;
-        b2 = v1 * v1;
-        c2 = v2 * v2;
-
-        scalar iscale = 1.0f / sqrtf(a2 + b2 + c2);
-        v0 *= iscale;
-        v1 *= iscale;
-        v2 *= iscale;
-
-        scalar abm, acm, bcm;
-        scalar mcos, asin, bsin, csin;
-        mcos = 1.0f - cost;
-        abm = v0 * v1 * mcos;
-        acm = v0 * v2 * mcos;
-        bcm = v1 * v2 * mcos;
-        asin = v0 * sint;
-        bsin = v1 * sint;
-        csin = v2 * sint;
-        data[0] = a2 * mcos + cost;
-        data[1] = abm - csin;
-        data[2] = acm + bsin;
-        data[3] = abm + csin;
-        data[4] = 0.0f;
-        data[5] = b2 * mcos + cost;
-        data[6] = bcm - asin;
-        data[7] = acm - bsin;
-        data[8] = 0.0f;
-        data[9] = bcm + asin;
-        data[10] = c2 * mcos + cost;
-        data[11] = 0.0f;
-        data[12] = 0.0f;
-        data[13] = 0.0f;
-        data[14] = 0.0f;
-        data[15] = 1.0f;
-    }
-}
-
-/*
-void SkewMatrix(scalar angle, const Vec3 & v1, const Vec3 & v2) {
-v1.Normalize();
-v2.Normalize();
-
-Vec3 v3;
-v3.Cross(v1, v2);
-v3.Normalize();
-
-// Get skew factor.
-scalar costheta = Vec3DotProduct(v1, v2);
-scalar sintheta = Real.Sqrt(1 - costheta * costheta);
-scalar skew = tan(Trig.DegreesToRadians(angle) + acos(sintheta)) * sintheta - costheta;
-
-// Build orthonormal matrix.
-v1 = FXVector3.Cross(v3, v2);
-v1.Normalize();
-
-Matrix R = Matrix::Identity;
-R[0, 0] = v3.X;�// Not sure this is in the correct order...
-R[1, 0] = v3.Y;
-R[2, 0] = v3.Z;
-R[0, 1] = v1.X;
-R[1, 1] = v1.Y;
-R[2, 1] = v1.Z;
-R[0, 2] = v2.X;
-R[1, 2] = v2.Y;
-R[2, 2] = v2.Z;
-
-// Build skew matrix.
-Matrix S = Matrix::Identity;
-S[2, 1] = -skew;
-
-// Return skew transform.
-return R * S * R.Transpose;	// Not sure this is in the correct order...
-}
-*/
-
-/**
-* Generate rotation matrix for the euler angles. This is the same as computing
-* 3 rotation matrices and multiplying them together in our custom order.
-*
-* @todo Have to recompute this code for our new convention.
-**/
-void RotationMatrix( scalar yaw, scalar pitch, scalar roll ) {
-    scalar sy = sin(yaw+ToRadian(90));
-    scalar cy = cos(yaw+ToRadian(90));
-    scalar sp = sin(pitch-ToRadian(90));
-    scalar cp = cos(pitch-ToRadian(90));
-    scalar sr = sin(roll);
-    scalar cr = cos(roll);
-
-    data[0] = cr*cy + sr*sp*sy;
-    data[1] = cp*sy;
-    data[2] = -sr*cy + cr*sp*sy;
-    data[3] = 0;
-
-    data[4] = -cr*sy + sr*sp*cy;
-    data[5] = cp*cy;
-    data[6] = sr*sy + cr*sp*cy;
-    data[7] = 0;
-
-    data[8] = sr*cp;
-    data[9] = -sp;
-    data[10] = cr*cp;
-    data[11] = 0;
-
-    data[12] = 0;
-    data[13] = 0;
-    data[14] = 0;
-    data[15] = 1;
-}
-
-/** Create a frustum matrix with the far plane at the infinity. */
-void Frustum( scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear, scalar zFar ) {
-    scalar one_deltax, one_deltay, one_deltaz, doubleznear;
-
-    doubleznear = 2.0f * zNear;
-    one_deltax = 1.0f / (xmax - xmin);
-    one_deltay = 1.0f / (ymax - ymin);
-    one_deltaz = 1.0f / (zFar - zNear);
-
-    data[0] = (scalar)(doubleznear * one_deltax);
-    data[1] = 0.0f;
-    data[2] = 0.0f;
-    data[3] = 0.0f;
-    data[4] = 0.0f;
-    data[5] = (scalar)(doubleznear * one_deltay);
-    data[6] = 0.f;
-    data[7] = 0.f;
-    data[8] = (scalar)((xmax + xmin) * one_deltax);
-    data[9] = (scalar)((ymax + ymin) * one_deltay);
-    data[10] = (scalar)(-(zFar + zNear) * one_deltaz);
-    data[11] = -1.f;
-    data[12] = 0.f;
-    data[13] = 0.f;
-    data[14] = (scalar)(-(zFar * doubleznear) * one_deltaz);
-    data[15] = 0.f;
-}
-
-/** Create a frustum matrix with the far plane at the infinity. */
-void FrustumInf( scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear ) {
-    scalar one_deltax, one_deltay, doubleznear, nudge;
-
-    doubleznear = 2.0f * zNear;
-    one_deltax = 1.0f / (xmax - xmin);
-    one_deltay = 1.0f / (ymax - ymin);
-    nudge = 1.0; // 0.999;
-
-    data[0] = doubleznear * one_deltax;
-    data[1] = 0.0f;
-    data[2] = 0.0f;
-    data[3] = 0.0f;
-
-    data[4] = 0.0f;
-    data[5] = doubleznear * one_deltay;
-    data[6] = 0.f;
-    data[7] = 0.f;
-
-    data[8] = (xmax + xmin) * one_deltax;
-    data[9] = (ymax + ymin) * one_deltay;
-    data[10] = -1.0f * nudge;
-    data[11] = -1.0f;
-
-    data[12] = 0.f;
-    data[13] = 0.f;
-    data[14] = -doubleznear * nudge;
-    data[15] = 0.f;
-}
-
-/** Create an inverse frustum matrix with the far plane at the infinity. */
-void FrustumInfInv( scalar left, scalar right, scalar bottom, scalar top, scalar zNear ) {
-    // this matrix is wrong (not tested scalarly) I think it should be transposed.
-    data[0] = (right - left) / (2 * zNear);
-    data[1] = 0;
-    data[2] = 0;
-    data[3] = (right + left) / (2 * zNear);
-    data[4] = 0;
-    data[5] = (top - bottom) / (2 * zNear);
-    data[6] = 0;
-    data[7] = (top + bottom) / (2 * zNear);
-    data[8] = 0;
-    data[9] = 0;
-    data[10] = 0;
-    data[11] = -1;
-    data[12] = 0;
-    data[13] = 0;
-    data[14] = -1 / (2 * zNear);
-    data[15] = 1 / (2 * zNear);
-}
-
-/** Create an homogeneous projection matrix. */
-void Perspective( scalar fov, scalar aspect, scalar zNear, scalar zFar ) {
-    scalar xmin, xmax, ymin, ymax;
-
-    xmax = zNear * tan( fov/2 );
-    xmin = -xmax;
-
-    ymax = xmax / aspect;
-    ymin = -ymax;
-
-    Frustum(xmin, xmax, ymin, ymax, zNear, zFar);
-}
-
-/** Create a projection matrix with the far plane at the infinity. */
-void PerspectiveInf( scalar fov, scalar aspect, scalar zNear ) {
-    scalar x = zNear * tan( fov/2 );
-    scalar y = x / aspect;
-    FrustumInf( -x, x, -y, y, zNear );
-}
-
-/** Create an inverse projection matrix with far plane at the infinity. */
-void PerspectiveInfInv( scalar fov, scalar aspect, scalar zNear ) {
-    scalar x = zNear * tan( fov/2 );
-    scalar y = x / aspect;
-    FrustumInfInv( -x, x, -y, y, zNear );
-}
-
-/** Build bone matrix from quatertion and offset. */
-void BoneMatrix(const Quat & q, const Vec3 & offset) {
-    scalar x2, y2, z2, xx, xy, xz, yy, yz, zz, wx, wy, wz;
-
-    // calculate coefficients
-    x2 = q.x + q.x;
-    y2 = q.y + q.y;
-    z2 = q.z + q.z;
-
-    xx = q.x * x2;   xy = q.x * y2;   xz = q.x * z2;
-    yy = q.y * y2;   yz = q.y * z2;   zz = q.z * z2;
-    wx = q.w * x2;   wy = q.w * y2;   wz = q.w * z2;
-
-    data[0] = 1.0f - (yy + zz); 	
-    data[1] = xy - wz;
-    data[2] = xz + wy;		
-    data[3] = 0.0f;
-
-    data[4] = xy + wz;		
-    data[5] = 1.0f - (xx + zz);
-    data[6] = yz - wx;		
-    data[7] = 0.0f;
-
-    data[8] = xz - wy;		
-    data[9] = yz + wx;
-    data[10] = 1.0f - (xx + yy);		
-    data[11] = 0.0f;
-
-    data[12] = offset.x;
-    data[13] = offset.y;
-    data[14] = offset.z;			
-    data[15] = 1.0f;
-}
-
-//@}
-
-
-/** @name Transformations: */
-//@{
-
-/** Apply a general scale. */
-void Scale( scalar x, scalar y, scalar z ) {
-    data[0] *= x;	data[4] *= y;	data[8]  *= z;
-    data[1] *= x;	data[5] *= y;	data[9]  *= z;
-    data[2] *= x;	data[6] *= y;	data[10] *= z;
-    data[3] *= x;	data[7] *= y;	data[11] *= z;
-}
-
-/** Apply a rotation of theta degrees around the axis v*/
-void Rotate( scalar theta, const Vec3 & v ) {
-    Matrix b;
-    b.RotationMatrix( theta, v[0], v[1], v[2] );
-    Multiply4x3( b );
-}
-
-/** Apply a rotation of theta degrees around the axis v*/
-void Rotate( scalar theta, scalar v0, scalar v1, scalar v2 ) {
-    Matrix b;
-    b.RotationMatrix( theta, v0, v1, v2 );
-    Multiply4x3( b );
-}
-
-/**
-* Translate the matrix by t. This is the same as multiplying by a
-* translation matrix with the given offset.
-* this = T * this
-*/
-void Translate( const Vec3 &t ) {
-    data[12] = data[0] * t.x + data[4] * t.y + data[8]  * t.z + data[12];
-    data[13] = data[1] * t.x + data[5] * t.y + data[9]  * t.z + data[13];
-    data[14] = data[2] * t.x + data[6] * t.y + data[10] * t.z + data[14];
-    data[15] = data[3] * t.x + data[7] * t.y + data[11] * t.z + data[15];
-}
-
-/** 
-* Translate the matrix by x, y, z. This is the same as multiplying by a 
-* translation matrix with the given offsets.
-*/
-void Translate( scalar x, scalar y, scalar z ) {
-    data[12] = data[0] * x + data[4] * y + data[8]  * z + data[12];
-    data[13] = data[1] * x + data[5] * y + data[9]  * z + data[13];
-    data[14] = data[2] * x + data[6] * y + data[10] * z + data[14];
-    data[15] = data[3] * x + data[7] * y + data[11] * z + data[15];
-}
-
-/** Compute the transposed matrix. */
-void Transpose() {
-    piSwap(data[1], data[4]);
-    piSwap(data[2], data[8]);
-    piSwap(data[6], data[9]);
-    piSwap(data[3], data[12]);
-    piSwap(data[7], data[13]);
-    piSwap(data[11], data[14]);
-}
-
-/** Compute the inverse of a rigid-body/isometry/orthonormal matrix. */
-void IsometryInverse() {
-    // transposed 3x3 upper left matrix
-    piSwap(data[1], data[4]);
-    piSwap(data[2], data[8]);
-    piSwap(data[6], data[9]);
-
-    // translate by the negative offsets
-    Vec3 v(-data[12], -data[13], -data[14]);
-    data[12] = data[13] = data[14] = 0;
-    Translate(v);
-}
-
-/** Compute the inverse of the affine portion of this matrix. */
-void AffineInverse() {
-    data[12] = data[13] = data[14] = 0;
-    Transpose();
-}
-//@}
-
-/** @name Matrix operations: */
-//@{
-
-/** Return the determinant of this matrix. */
-scalar Determinant() const {
-    return	data[0] * data[5] * data[10] * data[15] + 
-        data[1] * data[6] * data[11] * data[12] +
-        data[2] * data[7] * data[ 8] * data[13] +
-        data[3] * data[4] * data[ 9] * data[14] -
-        data[3] * data[6] * data[ 9] * data[12] -
-        data[2] * data[5] * data[ 8] * data[15] -
-        data[1] * data[4] * data[11] * data[14] -
-        data[0] * data[7] * data[10] * data[12];
-}
-
-
-/** Standard matrix product: this *= B. */
-void Multiply4x4( const Matrix & restrict B ) {
-    Multiply4x4(*this, B);
-}
-
-/** Standard matrix product: this = A * B. this != B*/
-void Multiply4x4( const Matrix & A, const Matrix & restrict B ) {
-    piDebugCheck(this != &B);
-
-    for(int i = 0; i < 4; i++) {
-        const scalar ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3);
-        GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
-        GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
-        GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
-        GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
-    }
-
-    /* Unrolled but does not allow this == A
-    data[0] = A.data[0] * B.data[0] + A.data[4] * B.data[1] + A.data[8] * B.data[2] + A.data[12] * B.data[3];
-    data[1] = A.data[1] * B.data[0] + A.data[5] * B.data[1] + A.data[9] * B.data[2] + A.data[13] * B.data[3];
-    data[2] = A.data[2] * B.data[0] + A.data[6] * B.data[1] + A.data[10] * B.data[2] + A.data[14] * B.data[3];
-    data[3] = A.data[3] * B.data[0] + A.data[7] * B.data[1] + A.data[11] * B.data[2] + A.data[15] * B.data[3];
-    data[4] = A.data[0] * B.data[4] + A.data[4] * B.data[5] + A.data[8] * B.data[6] + A.data[12] * B.data[7];
-    data[5] = A.data[1] * B.data[4] + A.data[5] * B.data[5] + A.data[9] * B.data[6] + A.data[13] * B.data[7];
-    data[6] = A.data[2] * B.data[4] + A.data[6] * B.data[5] + A.data[10] * B.data[6] + A.data[14] * B.data[7];
-    data[7] = A.data[3] * B.data[4] + A.data[7] * B.data[5] + A.data[11] * B.data[6] + A.data[15] * B.data[7];
-    data[8] = A.data[0] * B.data[8] + A.data[4] * B.data[9] + A.data[8] * B.data[10] + A.data[12] * B.data[11];
-    data[9] = A.data[1] * B.data[8] + A.data[5] * B.data[9] + A.data[9] * B.data[10] + A.data[13] * B.data[11];
-    data[10]= A.data[2] * B.data[8] + A.data[6] * B.data[9] + A.data[10] * B.data[10] + A.data[14] * B.data[11];
-    data[11]= A.data[3] * B.data[8] + A.data[7] * B.data[9] + A.data[11] * B.data[10] + A.data[15] * B.data[11];
-    data[12]= A.data[0] * B.data[12] + A.data[4] * B.data[13] + A.data[8] * B.data[14] + A.data[12] * B.data[15];
-    data[13]= A.data[1] * B.data[12] + A.data[5] * B.data[13] + A.data[9] * B.data[14] + A.data[13] * B.data[15];
-    data[14]= A.data[2] * B.data[12] + A.data[6] * B.data[13] + A.data[10] * B.data[14] + A.data[14] * B.data[15];
-    data[15]= A.data[3] * B.data[12] + A.data[7] * B.data[13] + A.data[11] * B.data[14] + A.data[15] * B.data[15];
-    */
-}
-
-/** Standard matrix product: this *= B. */
-void Multiply4x3( const Matrix & restrict B ) {
-    Multiply4x3(*this, B);
-}
-
-/** Standard product of matrices, where the last row is [0 0 0 1]. */
-void Multiply4x3( const Matrix & A, const Matrix & restrict B ) {
-    piDebugCheck(this != &B);
-
-    for(int i = 0; i < 3; i++) {
-        const scalar ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3);
-        GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
-        GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
-        GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
-        GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
-    }
-    data[3] = 0.0f; data[7] = 0.0f; data[11] = 0.0f; data[15] = 1.0f;
-
-    /* Unrolled but does not allow this == A
-    data[0] = a.data[0] * b.data[0] + a.data[4] * b.data[1] + a.data[8] * b.data[2] + a.data[12] * b.data[3];
-    data[1] = a.data[1] * b.data[0] + a.data[5] * b.data[1] + a.data[9] * b.data[2] + a.data[13] * b.data[3];
-    data[2] = a.data[2] * b.data[0] + a.data[6] * b.data[1] + a.data[10] * b.data[2] + a.data[14] * b.data[3];
-    data[3] = 0.0f;
-    data[4] = a.data[0] * b.data[4] + a.data[4] * b.data[5] + a.data[8] * b.data[6] + a.data[12] * b.data[7];
-    data[5] = a.data[1] * b.data[4] + a.data[5] * b.data[5] + a.data[9] * b.data[6] + a.data[13] * b.data[7];
-    data[6] = a.data[2] * b.data[4] + a.data[6] * b.data[5] + a.data[10] * b.data[6] + a.data[14] * b.data[7];
-    data[7] = 0.0f;
-    data[8] = a.data[0] * b.data[8] + a.data[4] * b.data[9] + a.data[8] * b.data[10] + a.data[12] * b.data[11];
-    data[9] = a.data[1] * b.data[8] + a.data[5] * b.data[9] + a.data[9] * b.data[10] + a.data[13] * b.data[11];
-    data[10]= a.data[2] * b.data[8] + a.data[6] * b.data[9] + a.data[10] * b.data[10] + a.data[14] * b.data[11];
-    data[11]= 0.0f;
-    data[12]= a.data[0] * b.data[12] + a.data[4] * b.data[13] + a.data[8] * b.data[14] + a.data[12] * b.data[15];
-    data[13]= a.data[1] * b.data[12] + a.data[5] * b.data[13] + a.data[9] * b.data[14] + a.data[13] * b.data[15];
-    data[14]= a.data[2] * b.data[12] + a.data[6] * b.data[13] + a.data[10] * b.data[14] + a.data[14] * b.data[15];
-    data[15]= 1.0f;
-    */
-}
-//@}
-
-
-/** @name Vector operations: */
-//@{
-
-/** Transform 3d vector (w=0). */
-void TransformVec3(const Vec3 & restrict orig, Vec3 * restrict dest) const {
-    piDebugCheck(&orig != dest);
-    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8];
-    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9];
-    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10];
-}
-/** Transform 3d vector by the transpose (w=0). */
-void TransformVec3T(const Vec3 & restrict orig, Vec3 * restrict dest) const {
-    piDebugCheck(&orig != dest);
-    dest->x = orig.x * data[0] + orig.y * data[1] + orig.z * data[2];
-    dest->y = orig.x * data[4] + orig.y * data[5] + orig.z * data[6];
-    dest->z = orig.x * data[8] + orig.y * data[9] + orig.z * data[10];
-}
-
-/** Transform a 3d homogeneous vector, where the fourth coordinate is assumed to be 1. */
-void TransformPoint(const Vec3 & restrict orig, Vec3 * restrict dest) const {
-    piDebugCheck(&orig != dest);
-    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
-    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
-    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
-}
-
-/** Transform a point, normalize it, and return w. */
-scalar TransformPointAndNormalize(const Vec3 & restrict orig, Vec3 * restrict dest) const {
-    piDebugCheck(&orig != dest);
-    scalar w;
-    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
-    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
-    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
-    w = 1 / (orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]);
-    *dest *= w;
-    return w;
-}
-
-/** Transform a point and return w. */
-scalar TransformPointReturnW(const Vec3 & restrict orig, Vec3 * restrict dest) const {
-    piDebugCheck(&orig != dest);
-    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
-    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
-    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
-    return orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15];
-}
-
-/** Transform a normalized 3d point by a 4d matrix and return the resulting 4d vector. */
-void TransformVec4(const Vec3 & orig, Vec4 * dest) const {
-    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
-    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
-    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
-    dest->w = orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15];
-}
-//@}
-
-/** @name Matrix analysis. */
-//@{
-
-/** Get the ZYZ euler angles from the matrix. Assumes the matrix is orthonormal. */
-void GetEulerAnglesZYZ(scalar * s, scalar * t, scalar * r) const {
-    if( GetElem(2,2) < 1.0f ) {
-        if( GetElem(2,2) > -1.0f ) {
-            // 	cs*ct*cr-ss*sr 		-ss*ct*cr-cs*sr		st*cr
-            //	cs*ct*sr+ss*cr		-ss*ct*sr+cs*cr		st*sr
-            //	-cs*st				ss*st				ct
-            *s = atan2(GetElem(1,2), -GetElem(0,2));
-            *t = acos(GetElem(2,2));
-            *r = atan2(GetElem(2,1), GetElem(2,0));		
-        }
-        else {
-            // 	-c(s-r)	 	s(s-r)		0
-            //	s(s-r)		c(s-r)		0
-            //	0			0			-1
-            *s = atan2(GetElem(0, 1), -GetElem(0, 0)); // = s-r
-            *t = PI;
-            *r = 0;
-        }
-    }
-    else {
-        // 	c(s+r)		-s(s+r)		0
-        //	s(s+r)		c(s+r)		0
-        //	0			0			1
-        *s = atan2(GetElem(0, 1), GetElem(0, 0)); // = s+r
-        *t = 0;
-        *r = 0;
-    }
-}
-
-//@}
-
-MATHLIB_API friend PiStream & operator<< ( PiStream & s, Matrix & m );
-
-/** Print to debug output. */
-void Print() const {
-    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[0], data[4], data[8], data[12] );
-    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[1], data[5], data[9], data[13] );
-    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[2], data[6], data[10], data[14] );
-    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[3], data[7], data[11], data[15] );
-}
-
-
-public:
-
-    scalar data[16];
-
-};
-#endif
-
-
-
-
-#endif // NV_MATH_MATRIX_H
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_MATRIX_H
+#define NV_MATH_MATRIX_H
+
+#include <nvmath/nvmath.h>
+#include <nvmath/Vector.h>
+
+namespace nv
+{
+    enum identity_t { identity };   ///< Tag type: passing `identity` selects the identity-matrix constructors below.
+
+    class NVMATH_CLASS Matrix3                          // 3x3 matrix of scalars; see get() for the storage layout.
+    {
+    public:
+        Matrix3();                                      // Assigns nothing; elements are left unset.
+        explicit Matrix3(float f);                      // Every element set to f.
+        explicit Matrix3(identity_t);                   // Identity matrix.
+        Matrix3(const Matrix3 & m);                     // Element-wise copy.
+        Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2);    // v0, v1, v2 become columns 0, 1, 2.
+
+        scalar get(uint row, uint col) const;           // Read element (row, col); indices checked with nvDebugCheck.
+        scalar operator()(uint row, uint col) const;    // Same lookup as get().
+        scalar & operator()(uint row, uint col);        // Writable reference to element (row, col).
+
+        Vector3 row(uint i) const;                      // Row i as a vector.
+        Vector3 column(uint i) const;                   // Column i as a vector.
+
+        void operator*=(float s);                       // Multiply every element by s.
+        void operator/=(float s);                       // Multiply every element by 1/s.
+        void operator+=(const Matrix3 & m);             // Element-wise add.
+        void operator-=(const Matrix3 & m);             // Element-wise subtract.
+
+        float determinant() const;
+
+    private:
+        scalar m_data[9];                               // Element (row, col) lives at m_data[col * 3 + row].
+    };
+
+    inline Matrix3::Matrix3() {}    // Default constructor: assigns nothing to m_data.
+    
+    inline Matrix3::Matrix3(float f)
+    {
+        // Fill all nine entries with the same value.
+        scalar * const last = m_data + 9;
+        for (scalar * it = m_data; it != last; ++it) *it = f;
+    }
+
+    /// Build the 3x3 identity matrix.
+    inline Matrix3::Matrix3(identity_t)
+    {
+        // The diagonal entries sit at flat indices 0, 4 and 8.
+        for (int k = 0; k < 9; ++k) {
+            m_data[k] = (k % 4 == 0) ? 1.0f : 0.0f;
+        }
+    }
+
+    inline Matrix3::Matrix3(const Matrix3 & m)
+    {
+        // Element-wise copy of the nine stored values.
+        const scalar * src = m.m_data;
+        for (scalar * dst = m_data; dst != m_data + 9; ++dst, ++src) *dst = *src;
+    }
+    
+    inline Matrix3::Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2)
+    {
+        m_data[0] = v0.x; m_data[3] = v1.x; m_data[6] = v2.x;   // row 0: x of each column vector
+        m_data[1] = v0.y; m_data[4] = v1.y; m_data[7] = v2.y;   // row 1: y of each column vector
+        m_data[2] = v0.z; m_data[5] = v1.z; m_data[8] = v2.z;   // row 2: z of each column vector
+    }
+
+    inline scalar Matrix3::get(uint row, uint col) const    // Read element (row, col) by value.
+    {
+        nvDebugCheck(row < 3 && col < 3);   // both indices must be 0, 1 or 2
+        return m_data[col * 3 + row];       // the three entries of a column are adjacent in m_data
+    }
+    inline scalar Matrix3::operator()(uint row, uint col) const     // Read element (row, col) by value.
+    {
+        nvDebugCheck(row < 3 && col < 3);   // both indices must be 0, 1 or 2
+        return m_data[col * 3 + row];       // same indexing as get()
+    }
+    inline scalar & Matrix3::operator()(uint row, uint col)     // Writable reference to element (row, col).
+    {
+        nvDebugCheck(row < 3 && col < 3);   // both indices must be 0, 1 or 2
+        return m_data[col * 3 + row];       // same indexing as get()
+    }
+
+    inline Vector3 Matrix3::row(uint i) const   // Row i returned as a new Vector3.
+    {
+        nvDebugCheck(i < 3);
+        return Vector3(get(i, 0), get(i, 1), get(i, 2));    // elements (i,0), (i,1), (i,2)
+    }
+    inline Vector3 Matrix3::column(uint i) const    // Column i returned as a new Vector3.
+    {
+        nvDebugCheck(i < 3);
+        return Vector3(get(0, i), get(1, i), get(2, i));    // elements (0,i), (1,i), (2,i)
+    }
+
+    inline void Matrix3::operator*=(float s)
+    {
+        // Multiply each of the nine entries by s in place.
+        scalar * const last = m_data + 9;
+        for (scalar * it = m_data; it != last; ++it) *it *= s;
+    }
+
+    inline void Matrix3::operator/=(float s)
+    {
+        // Multiply by the reciprocal, computed once; s is not checked against zero.
+        const float inv = 1.0f / s;
+        scalar * const last = m_data + 9;
+        for (scalar * it = m_data; it != last; ++it) *it *= inv;
+    }
+
+    inline void Matrix3::operator+=(const Matrix3 & m)
+    {
+        // Element-wise accumulation over the nine stored values.
+        const scalar * rhs = m.m_data;
+        for (scalar * lhs = m_data; lhs != m_data + 9; ++lhs, ++rhs) *lhs += *rhs;
+    }
+
+    inline void Matrix3::operator-=(const Matrix3 & m)
+    {
+        // Element-wise subtraction over the nine stored values.
+        const scalar * rhs = m.m_data;
+        for (scalar * lhs = m_data; lhs != m_data + 9; ++lhs, ++rhs) *lhs -= *rhs;
+    }
+
+    inline Matrix3 operator+(const Matrix3 & a, const Matrix3 & b)
+    {
+        Matrix3 sum(a);     // start from a copy of the left operand
+        sum += b;           // then add the right operand element-wise
+        return sum;
+    }
+
+    inline Matrix3 operator-(const Matrix3 & a, const Matrix3 & b)
+    {
+        Matrix3 difference(a);  // start from a copy of the left operand
+        difference -= b;        // then subtract the right operand element-wise
+        return difference;
+    }
+
+    inline Matrix3 operator*(const Matrix3 & a, float s)
+    {
+        Matrix3 scaled(a);  // work on a copy so the operand is left untouched
+        scaled *= s;
+        return scaled;
+    }
+
+    inline Matrix3 operator*(float s, const Matrix3 & a)
+    {
+        Matrix3 scaled(a);  // scalar-on-the-left form; same result as a * s
+        scaled *= s;
+        return scaled;
+    }
+
+    inline Matrix3 operator/(const Matrix3 & a, float s)
+    {
+        Matrix3 quotient(a);    // work on a copy so the operand is left untouched
+        quotient /= s;          // delegates to operator/=, which multiplies by 1/s
+        return quotient;
+    }
+
+    /// Matrix product a * b: element (r, c) is row r of a dotted with column c of b.
+    inline Matrix3 mul(const Matrix3 & a, const Matrix3 & b)
+    {
+        Matrix3 product;
+        for (int r = 0; r < 3; ++r) {
+            // Cache row r of a; it is reused for every output column.
+            const scalar a0 = a(r,0), a1 = a(r,1), a2 = a(r,2);
+            for (int c = 0; c < 3; ++c) {
+                product(r, c) = a0 * b(0,c) + a1 * b(1,c) + a2 * b(2,c);
+            }
+        }
+        return product;
+    }
+
+    inline Matrix3 operator*(const Matrix3 & a, const Matrix3 & b)     // Operator spelling of mul(a, b).
+    {
+        return mul(a, b);   // forwards unchanged, so operand order is preserved
+    }
+
+    inline float Matrix3::determinant() const   // Determinant of the 3x3 matrix.
+    {
+        return  // Rule of Sarrus: three positive diagonal products minus three negative ones.
+            get(0,0) * get(1,1) * get(2,2) + 
+            get(0,1) * get(1,2) * get(2,0) + 
+            get(0,2) * get(1,0) * get(2,1) -
+            get(0,2) * get(1,1) * get(2,0) - 
+            get(0,1) * get(1,0) * get(2,2) -
+            get(0,0) * get(1,2) * get(2,1);
+    }
+
+
+
+    /// 4x4 transformation matrix.
+    /// -# Matrices are stored in memory in column major order.
+    /// -# Points are to be thought of as column vectors.
+    /// -# Transformation of a point p by a matrix M is: p' = M * p
+    class NVMATH_CLASS Matrix
+    {
+    public:
+        typedef Matrix const & Arg;                     // Parameter-passing alias: matrices are taken by const reference.
+
+        Matrix();                                       // Assigns nothing; elements are left unset.
+        explicit Matrix(float f);                       // Uniform fill of all 16 elements.
+        explicit Matrix(identity_t);                    // Identity matrix.
+        Matrix(const Matrix & m);                       // Element-wise copy.
+        Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);    // v0..v3 become columns 0..3.
+        //explicit Matrix(const scalar m[]);	// m is assumed to contain 16 elements
+
+        scalar data(uint idx) const;                    // Read by flat index 0..15 into the storage array.
+        scalar & data(uint idx);                        // Writable reference by flat index.
+        scalar get(uint row, uint col) const;           // Read element (row, col).
+        scalar operator()(uint row, uint col) const;    // Same lookup as get().
+        scalar & operator()(uint row, uint col);        // Writable reference to element (row, col).
+        const scalar * ptr() const;                     // Read-only pointer to the 16 stored values.
+
+        Vector4 row(uint i) const;                      // Row i as a vector.
+        Vector4 column(uint i) const;                   // Column i as a vector.
+
+        void scale(scalar s);                           // Multiplies all 16 elements by s.
+        void scale(Vector3::Arg s);
+        void translate(Vector3::Arg t);
+        void rotate(scalar theta, scalar v0, scalar v1, scalar v2);
+        scalar determinant() const;
+
+        void apply(Matrix::Arg m);
+
+    private:
+        scalar m_data[16];                              // Element (row, col) lives at m_data[col * 4 + row].
+    };
+
+
+    inline Matrix::Matrix()     // Default constructor: assigns nothing to m_data.
+    {
+    }
+
+    /// Construct a matrix with all sixteen elements set to @a f.
+    inline Matrix::Matrix(float f)
+    {
+        // Store the argument itself, as Matrix3(float) does; a hard-coded 0.0f would ignore f.
+        for(int i = 0; i < 16; i++) m_data[i] = f;
+    }
+
+    /// Build the 4x4 identity matrix.
+    inline Matrix::Matrix(identity_t)
+    {
+        // The diagonal entries sit at flat indices 0, 5, 10 and 15.
+        for (int k = 0; k < 16; ++k) {
+            m_data[k] = (k % 5 == 0) ? 1.0f : 0.0f;
+        }
+    }
+
+    inline Matrix::Matrix(const Matrix & m)
+    {
+        // Element-wise copy of the sixteen stored values.
+        const scalar * src = m.m_data;
+        for (scalar * dst = m_data; dst != m_data + 16; ++dst, ++src) *dst = *src;
+    }
+
+    inline Matrix::Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3)
+    {
+        m_data[ 0] = v0.x; m_data[ 4] = v1.x; m_data[ 8] = v2.x; m_data[12] = v3.x;    // row 0: x of each column vector
+        m_data[ 1] = v0.y; m_data[ 5] = v1.y; m_data[ 9] = v2.y; m_data[13] = v3.y;    // row 1: y of each column vector
+        m_data[ 2] = v0.z; m_data[ 6] = v1.z; m_data[10] = v2.z; m_data[14] = v3.z;    // row 2: z of each column vector
+        m_data[ 3] = v0.w; m_data[ 7] = v1.w; m_data[11] = v2.w; m_data[15] = v3.w;    // row 3: w of each column vector
+    }
+
+    /*inline Matrix::Matrix(const scalar m[])
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] = m[i];
+        }
+    }*/
+
+
+    // Accessors
+    /// Read-only access to element @a idx of the flat 16-element storage.
+    inline scalar Matrix::data(uint idx) const
+    {
+        nvDebugCheck(idx < 16);
+        const scalar value = m_data[idx];
+        return value;
+    }
+    /// Writable reference to element @a idx of the flat 16-element storage.
+    inline scalar & Matrix::data(uint idx)
+    {
+        nvDebugCheck(idx < 16);
+        scalar & slot = m_data[idx];
+        return slot;
+    }
+    /// Value of the element at (@a row, @a col).
+    inline scalar Matrix::get(uint row, uint col) const
+    {
+        nvDebugCheck(row < 4 && col < 4);
+        // Each column occupies four consecutive slots, so the column index strides by 4.
+        const uint flat = col * 4 + row;
+        return m_data[flat];
+    }
+    /// Value of the element at (@a row, @a col); same lookup as get().
+    inline scalar Matrix::operator()(uint row, uint col) const
+    {
+        nvDebugCheck(row < 4 && col < 4);
+        const uint flat = col * 4 + row;
+        return m_data[flat];
+    }
+    /// Writable reference to the element at (@a row, @a col).
+    inline scalar & Matrix::operator()(uint row, uint col)
+    {
+        nvDebugCheck(row < 4 && col < 4);
+        const uint flat = col * 4 + row;
+        return m_data[flat];
+    }
+
+    /// Read-only pointer to the first of the 16 contiguous elements.
+    inline const scalar * Matrix::ptr() const
+    {
+        return &m_data[0];
+    }
+
+    /// Row @a i of the matrix as a Vector4 (one element taken from each column).
+    inline Vector4 Matrix::row(uint i) const
+    {
+        nvDebugCheck(i < 4);
+        const scalar c0 = get(i, 0);
+        const scalar c1 = get(i, 1);
+        const scalar c2 = get(i, 2);
+        const scalar c3 = get(i, 3);
+        return Vector4(c0, c1, c2, c3);
+    }
+
+    /// Column @a i of the matrix as a Vector4.
+    inline Vector4 Matrix::column(uint i) const
+    {
+        nvDebugCheck(i < 4);
+        const scalar r0 = get(0, i);
+        const scalar r1 = get(1, i);
+        const scalar r2 = get(2, i);
+        const scalar r3 = get(3, i);
+        return Vector4(r0, r1, r2, r3);
+    }
+
+    /// Apply uniform scale.
+    /// Multiplies every one of the 16 elements by @a s, including the last row and column.
+    inline void Matrix::scale(scalar s)
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] *= s;
+        }
+    }
+
+    /// Apply per-axis scale.
+    /// Column 0 is multiplied by s.x, column 1 by s.y and column 2 by s.z; column 3 is left untouched.
+    inline void Matrix::scale(Vector3::Arg s)
+    {
+        for(int r = 0; r < 4; r++) {
+            m_data[0 + r] *= s.x;
+            m_data[4 + r] *= s.y;
+            m_data[8 + r] *= s.z;
+        }
+    }
+
+    /// Apply translation.
+    /// Only column 3 changes: it becomes column0 * t.x + column1 * t.y + column2 * t.z + column3,
+    /// which is what multiplying on the right by a translation matrix would produce.
+    inline void Matrix::translate(Vector3::Arg t)
+    {
+        for(int r = 0; r < 4; r++) {
+            m_data[12 + r] = m_data[r] * t.x + m_data[4 + r] * t.y + m_data[8 + r] * t.z + m_data[12 + r];
+        }
+    }
+
+    Matrix rotation(scalar theta, scalar v0, scalar v1, scalar v2);
+
+    /// Apply rotation.
+    /// Builds the matrix returned by rotation(theta, v0, v1, v2) and right-multiplies this matrix by it.
+    inline void Matrix::rotate(scalar theta, scalar v0, scalar v1, scalar v2)
+    {
+        apply(rotation(theta, v0, v1, v2));
+    }
+
+    /// Apply transform.
+    /// Replaces this matrix with the product this * m, one row at a time.
+    /// @a m must be a different object: its elements are read while this matrix is being overwritten.
+    inline void Matrix::apply(Matrix::Arg m)
+    {
+        nvDebugCheck(this != &m);
+
+        for(int r = 0; r < 4; r++) {
+            // Snapshot row r before any of its entries are overwritten.
+            const scalar lhs[4] = { get(r,0), get(r,1), get(r,2), get(r,3) };
+            for(int c = 0; c < 4; c++) {
+                m_data[4 * c + r] = lhs[0] * m(0,c) + lhs[1] * m(1,c) + lhs[2] * m(2,c) + lhs[3] * m(3,c);
+            }
+        }
+    }
+
+    /// Get scale matrix.
+    /// Identity matrix whose first three diagonal entries are s.x, s.y and s.z.
+    inline Matrix scale(Vector3::Arg s)
+    {
+        Matrix result(identity);
+        // Flat indices 0, 5 and 10 are the (0,0), (1,1) and (2,2) diagonal entries.
+        result.data(0) = s.x;
+        result.data(5) = s.y;
+        result.data(10) = s.z;
+        return result;
+    }
+
+    /// Get uniform scale matrix.
+    /// Identity matrix whose first three diagonal entries are all @a s; the (3,3) entry stays 1.
+    inline Matrix scale(scalar s)
+    {
+        Matrix result(identity);
+        result(2,2) = s;
+        result(1,1) = s;
+        result(0,0) = s;
+        return result;
+    }
+
+    /// Get translation matrix.
+    /// Identity matrix with @a t stored in the first three rows of column 3.
+    inline Matrix translation(Vector3::Arg t)
+    {
+        Matrix result(identity);
+        // Column 3 starts at flat index 12.
+        result.data(12) = t.x;
+        result.data(13) = t.y;
+        result.data(14) = t.z;
+        return result;
+    }
+
+    /// Get rotation matrix.
+    /// Builds the rotation by @a theta (radians, passed to cosf/sinf) about the axis (@a v0, @a v1, @a v2).
+    /// The three exact coordinate axes take a short path; any other axis is normalized and goes through
+    /// the general axis-angle formula. A zero-length axis leads to a division by zero in that branch.
+    /// All branches share one sign convention, e.g. for the z axis m(1,0) = -sin and m(0,1) = +sin.
+    /// NOTE(review): that is the transpose of the textbook counter-clockwise matrix for p' = M * p;
+    /// the convention is kept as it was — confirm against callers before changing it.
+    inline Matrix rotation(scalar theta, scalar v0, scalar v1, scalar v2)
+    {
+        scalar cost = cosf(theta);
+        scalar sint = sinf(theta);
+
+        Matrix m(identity);
+
+        if( 1 == v0 && 0 == v1 && 0 == v2 ) {
+            m(1,1) = cost; m(2,1) = -sint;
+            m(1,2) = sint; m(2,2) = cost;
+        }
+        else if( 0 == v0  && 1 == v1 && 0 == v2 ) {
+            // The -sin term belongs in row 0 (the x/z plane), not row 1.
+            m(0,0) = cost; m(2,0) = sint;
+            m(0,2) = -sint; m(2,2) = cost;
+        }
+        else if( 0 == v0 && 0 == v1 && 1 == v2 ) {
+            m(0,0) = cost; m(1,0) = -sint;
+            m(0,1) = sint; m(1,1) = cost;
+        } 
+        else {
+            // Normalize the axis first, so that every term below uses unit-length components.
+            const scalar iscale = 1.0f / sqrtf(v0 * v0 + v1 * v1 + v2 * v2);
+            v0 *= iscale;
+            v1 *= iscale;
+            v2 *= iscale;
+
+            const scalar mcos = 1.0f - cost;
+            const scalar xy = v0 * v1 * mcos;
+            const scalar xz = v0 * v2 * mcos;
+            const scalar yz = v1 * v2 * mcos;
+            const scalar xs = v0 * sint;
+            const scalar ys = v1 * sint;
+            const scalar zs = v2 * sint;
+
+            // Fill the upper-left 3x3 block only; row 3 and column 3 keep their identity values.
+            m(0,0) = v0 * v0 * mcos + cost;
+            m(1,0) = xy - zs;
+            m(2,0) = xz + ys;
+            m(0,1) = xy + zs;
+            m(1,1) = v1 * v1 * mcos + cost;
+            m(2,1) = yz - xs;
+            m(0,2) = xz - ys;
+            m(1,2) = yz + xs;
+            m(2,2) = v2 * v2 * mcos + cost;
+        }
+        return m;
+    }
+
+    //Matrix rotation(scalar yaw, scalar pitch, scalar roll);
+    //Matrix skew(scalar angle, Vector3::Arg v1, Vector3::Arg v2);
+
+    /// Get frustum matrix.
+    /// Perspective projection for the extents [xmin, xmax] x [ymin, ymax] with clip distances
+    /// @a zNear and @a zFar; the entries follow the matrix documented for OpenGL's glFrustum.
+    /// No argument validation is done: equal min/max or near/far values divide by zero.
+    inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear, scalar zFar)
+    {
+        const scalar twoNear = 2.0f * zNear;
+        const scalar invWidth = 1.0f / (xmax - xmin);
+        const scalar invHeight = 1.0f / (ymax - ymin);
+        const scalar invDepth = 1.0f / (zFar - zNear);
+
+        // Start from an all-zero matrix; only the seven entries below are set.
+        Matrix proj(0.0f);
+
+        // Diagonal scale terms.
+        proj(0,0) = twoNear * invWidth;
+        proj(1,1) = twoNear * invHeight;
+        proj(2,2) = -(zFar + zNear) * invDepth;
+
+        // Column 2: off-centre shift and the -1 that feeds -z into w.
+        proj(0,2) = (xmax + xmin) * invWidth;
+        proj(1,2) = (ymax + ymin) * invHeight;
+        proj(3,2) = -1.0f;
+
+        // Column 3: depth offset.
+        proj(2,3) = -(zFar * twoNear) * invDepth;
+
+        return proj;
+    }
+
+    /// Get infinite frustum matrix.
+    /// Same x/y terms as the six-argument frustum(), but with no far distance: the depth row
+    /// uses the constants -1 and -2 * zNear, each multiplied by a nudge factor that is currently 1.0.
+    inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear)
+    {
+        const scalar twoNear = 2.0f * zNear;
+        const scalar invWidth = 1.0f / (xmax - xmin);
+        const scalar invHeight = 1.0f / (ymax - ymin);
+        const scalar nudge = 1.0; // 0.999;
+
+        // Start from an all-zero matrix; only the seven entries below are set.
+        Matrix proj(0.0f);
+
+        // Diagonal terms.
+        proj(0,0) = twoNear * invWidth;
+        proj(1,1) = twoNear * invHeight;
+        proj(2,2) = -1.0f * nudge;
+
+        // Column 2: off-centre shift and the -1 that feeds -z into w.
+        proj(0,2) = (xmax + xmin) * invWidth;
+        proj(1,2) = (ymax + ymin) * invHeight;
+        proj(3,2) = -1.0f;
+
+        // Column 3: depth offset.
+        proj(2,3) = -twoNear * nudge;
+
+        return proj;
+    }
+
+    /// Get perspective matrix.
+    /// Derives a symmetric view volume from a field of view and an aspect ratio, then forwards it to frustum().
+    /// NOTE(review): @a fovy sets the x extent here and the y extent is x / aspect, whereas
+    /// gluPerspective applies fovy to the y extent — confirm which convention callers expect.
+    inline Matrix perspective(scalar fovy, scalar aspect, scalar zNear, scalar zFar)
+    {
+        const scalar halfWidth = zNear * tan(fovy / 2);
+        const scalar halfHeight = halfWidth / aspect;
+        return frustum(-halfWidth, halfWidth, -halfHeight, halfHeight, zNear, zFar);
+    }
+
+    /// Get infinite perspective matrix.
+    /// Same extents as the four-argument perspective(), forwarded to the frustum() overload without a far distance.
+    /// NOTE(review): as in the finite overload, @a fovy sets the x extent and y is x / aspect — confirm the intended convention.
+    inline Matrix perspective(scalar fovy, scalar aspect, scalar zNear)
+    {
+        const scalar halfWidth = zNear * tan(fovy / 2);
+        const scalar halfHeight = halfWidth / aspect;
+        return frustum(-halfWidth, halfWidth, -halfHeight, halfHeight, zNear);
+    }
+
+    /// Get matrix determinant.
+    /// Evaluated as one fully expanded expression: 24 signed products of four elements each,
+    /// read directly from the flat m_data array. Nothing is cached between calls.
+    inline scalar Matrix::determinant() const
+    {
+        return 
+            m_data[3] * m_data[6] * m_data[ 9] * m_data[12] - m_data[2] * m_data[7] * m_data[ 9] * m_data[12] - m_data[3] * m_data[5] * m_data[10] * m_data[12] + m_data[1] * m_data[7] * m_data[10] * m_data[12] +
+            m_data[2] * m_data[5] * m_data[11] * m_data[12] - m_data[1] * m_data[6] * m_data[11] * m_data[12] - m_data[3] * m_data[6] * m_data[ 8] * m_data[13] + m_data[2] * m_data[7] * m_data[ 8] * m_data[13] +
+            m_data[3] * m_data[4] * m_data[10] * m_data[13] - m_data[0] * m_data[7] * m_data[10] * m_data[13] - m_data[2] * m_data[4] * m_data[11] * m_data[13] + m_data[0] * m_data[6] * m_data[11] * m_data[13] +
+            m_data[3] * m_data[5] * m_data[ 8] * m_data[14] - m_data[1] * m_data[7] * m_data[ 8] * m_data[14] - m_data[3] * m_data[4] * m_data[ 9] * m_data[14] + m_data[0] * m_data[7] * m_data[ 9] * m_data[14] +
+            m_data[1] * m_data[4] * m_data[11] * m_data[14] - m_data[0] * m_data[5] * m_data[11] * m_data[14] - m_data[2] * m_data[5] * m_data[ 8] * m_data[15] + m_data[1] * m_data[6] * m_data[ 8] * m_data[15] +
+            m_data[2] * m_data[4] * m_data[ 9] * m_data[15] - m_data[0] * m_data[6] * m_data[ 9] * m_data[15] - m_data[1] * m_data[4] * m_data[10] * m_data[15] + m_data[0] * m_data[5] * m_data[10] * m_data[15];
+    }
+
+    /// Get the transpose of @a m: element (i, j) of the result is element (j, i) of the input.
+    inline Matrix transpose(Matrix::Arg m)
+    {
+        // Default-constructed on purpose; the loops below assign all 16 elements.
+        Matrix flipped;
+        for (uint col = 0; col < 4; col++)
+        {
+            for (uint row = 0; row < 4; row++)
+            {
+                // Swapping the roles of row and column in the flat index performs the transpose.
+                flipped.data(4 * col + row) = m.data(4 * row + col);
+            }
+        }
+        return flipped;
+    }
+
+    /// Get the inverse of @a m.
+    /// Each r.data(k) is a sum of six signed triple products of elements of m (presumably the
+    /// adjugate entries — not re-derived here); the result is then scaled by 1 / m.determinant().
+    /// No singularity check is made: a zero determinant makes the scale factor a division by zero.
+    inline Matrix inverse(Matrix::Arg m)
+    {
+        // Default-constructed; all 16 elements are assigned below.
+        Matrix r;
+        r.data( 0) = m.data(6)*m.data(11)*m.data(13) - m.data(7)*m.data(10)*m.data(13) + m.data(7)*m.data(9)*m.data(14) - m.data(5)*m.data(11)*m.data(14) - m.data(6)*m.data(9)*m.data(15) + m.data(5)*m.data(10)*m.data(15);
+        r.data( 1) = m.data(3)*m.data(10)*m.data(13) - m.data(2)*m.data(11)*m.data(13) - m.data(3)*m.data(9)*m.data(14) + m.data(1)*m.data(11)*m.data(14) + m.data(2)*m.data(9)*m.data(15) - m.data(1)*m.data(10)*m.data(15);
+        r.data( 2) = m.data(2)*m.data( 7)*m.data(13) - m.data(3)*m.data( 6)*m.data(13) + m.data(3)*m.data(5)*m.data(14) - m.data(1)*m.data( 7)*m.data(14) - m.data(2)*m.data(5)*m.data(15) + m.data(1)*m.data( 6)*m.data(15);
+        r.data( 3) = m.data(3)*m.data( 6)*m.data( 9) - m.data(2)*m.data( 7)*m.data( 9) - m.data(3)*m.data(5)*m.data(10) + m.data(1)*m.data( 7)*m.data(10) + m.data(2)*m.data(5)*m.data(11) - m.data(1)*m.data( 6)*m.data(11);
+        r.data( 4) = m.data(7)*m.data(10)*m.data(12) - m.data(6)*m.data(11)*m.data(12) - m.data(7)*m.data(8)*m.data(14) + m.data(4)*m.data(11)*m.data(14) + m.data(6)*m.data(8)*m.data(15) - m.data(4)*m.data(10)*m.data(15);
+        r.data( 5) = m.data(2)*m.data(11)*m.data(12) - m.data(3)*m.data(10)*m.data(12) + m.data(3)*m.data(8)*m.data(14) - m.data(0)*m.data(11)*m.data(14) - m.data(2)*m.data(8)*m.data(15) + m.data(0)*m.data(10)*m.data(15);
+        r.data( 6) = m.data(3)*m.data( 6)*m.data(12) - m.data(2)*m.data( 7)*m.data(12) - m.data(3)*m.data(4)*m.data(14) + m.data(0)*m.data( 7)*m.data(14) + m.data(2)*m.data(4)*m.data(15) - m.data(0)*m.data( 6)*m.data(15);
+        r.data( 7) = m.data(2)*m.data( 7)*m.data( 8) - m.data(3)*m.data( 6)*m.data( 8) + m.data(3)*m.data(4)*m.data(10) - m.data(0)*m.data( 7)*m.data(10) - m.data(2)*m.data(4)*m.data(11) + m.data(0)*m.data( 6)*m.data(11);
+        r.data( 8) = m.data(5)*m.data(11)*m.data(12) - m.data(7)*m.data( 9)*m.data(12) + m.data(7)*m.data(8)*m.data(13) - m.data(4)*m.data(11)*m.data(13) - m.data(5)*m.data(8)*m.data(15) + m.data(4)*m.data( 9)*m.data(15);
+        r.data( 9) = m.data(3)*m.data( 9)*m.data(12) - m.data(1)*m.data(11)*m.data(12) - m.data(3)*m.data(8)*m.data(13) + m.data(0)*m.data(11)*m.data(13) + m.data(1)*m.data(8)*m.data(15) - m.data(0)*m.data( 9)*m.data(15);
+        r.data(10) = m.data(1)*m.data( 7)*m.data(12) - m.data(3)*m.data( 5)*m.data(12) + m.data(3)*m.data(4)*m.data(13) - m.data(0)*m.data( 7)*m.data(13) - m.data(1)*m.data(4)*m.data(15) + m.data(0)*m.data( 5)*m.data(15);
+        r.data(11) = m.data(3)*m.data( 5)*m.data( 8) - m.data(1)*m.data( 7)*m.data( 8) - m.data(3)*m.data(4)*m.data( 9) + m.data(0)*m.data( 7)*m.data( 9) + m.data(1)*m.data(4)*m.data(11) - m.data(0)*m.data( 5)*m.data(11);
+        r.data(12) = m.data(6)*m.data( 9)*m.data(12) - m.data(5)*m.data(10)*m.data(12) - m.data(6)*m.data(8)*m.data(13) + m.data(4)*m.data(10)*m.data(13) + m.data(5)*m.data(8)*m.data(14) - m.data(4)*m.data( 9)*m.data(14);
+        r.data(13) = m.data(1)*m.data(10)*m.data(12) - m.data(2)*m.data( 9)*m.data(12) + m.data(2)*m.data(8)*m.data(13) - m.data(0)*m.data(10)*m.data(13) - m.data(1)*m.data(8)*m.data(14) + m.data(0)*m.data( 9)*m.data(14);
+        r.data(14) = m.data(2)*m.data( 5)*m.data(12) - m.data(1)*m.data( 6)*m.data(12) - m.data(2)*m.data(4)*m.data(13) + m.data(0)*m.data( 6)*m.data(13) + m.data(1)*m.data(4)*m.data(14) - m.data(0)*m.data( 5)*m.data(14);
+        r.data(15) = m.data(1)*m.data( 6)*m.data( 8) - m.data(2)*m.data( 5)*m.data( 8) + m.data(2)*m.data(4)*m.data( 9) - m.data(0)*m.data( 6)*m.data( 9) - m.data(1)*m.data(4)*m.data(10) + m.data(0)*m.data( 5)*m.data(10);
+        // Matrix::scale(scalar) multiplies all 16 elements, so this divides the whole result by the determinant.
+        r.scale(1.0f / m.determinant());
+        return r;
+    }
+
+    /// Get the inverse of a rotation-plus-translation matrix without a general inversion.
+    /// The upper-left 3x3 block of @a m is transposed, then the negated translation of @a m
+    /// (elements 12..14) is applied through translate(). The result is a true inverse only if
+    /// that 3x3 block is orthonormal; this is not checked.
+    inline Matrix isometryInverse(Matrix::Arg m)
+    {
+        Matrix inv(identity);
+
+        // Transpose the upper-left 3x3 block.
+        for (int col = 0; col < 3; col++)
+        {
+            for (int row = 0; row < 3; row++)
+            {
+                inv(row, col) = m(col, row);
+            }
+        }
+
+        // Undo the translation, expressed in the already-transposed basis.
+        const Vector3 offset(m.data(12), m.data(13), m.data(14));
+        inv.translate(-offset);
+
+        return inv;
+    }
+
+    //Matrix affineInverse(Matrix::Arg m);
+
+    /// Transform the given 3d point with the given matrix.
+    /// The point is treated as (x, y, z, 1): column 3 of @a m is added to the result.
+    /// Row 3 of @a m is not used, so no perspective divide takes place.
+    inline Vector3 transformPoint(Matrix::Arg m, Vector3::Arg p)
+    {
+        const scalar x = p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + m(0,3);
+        const scalar y = p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + m(1,3);
+        const scalar z = p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + m(2,3);
+        return Vector3(x, y, z);
+    }
+
+    /// Transform the given 3d vector with the given matrix.
+    /// Only the upper-left 3x3 block of @a m is used, so the translation column has no effect.
+    inline Vector3 transformVector(Matrix::Arg m, Vector3::Arg p)
+    {
+        const scalar x = p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2);
+        const scalar y = p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2);
+        const scalar z = p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2);
+        return Vector3(x, y, z);
+    }
+
+    /// Transform the given 4d vector with the given matrix.
+    /// Full 4x4 product m * p with @a p as a column vector; all four rows and columns take part.
+    inline Vector4 transform(Matrix::Arg m, Vector4::Arg p)
+    {
+        const scalar x = p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + p.w * m(0,3);
+        const scalar y = p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + p.w * m(1,3);
+        const scalar z = p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + p.w * m(2,3);
+        const scalar w = p.x * m(3,0) + p.y * m(3,1) + p.z * m(3,2) + p.w * m(3,3);
+        return Vector4(x, y, z, w);
+    }
+
+    /// Multiply two matrices.
+    /// Copies @a a and calls apply(b) on the copy. apply() sets element (i, j) to the sum over k of
+    /// this(i, k) * m(k, j), so the value returned is the product a * b in (row, col) terms.
+    inline Matrix mul(Matrix::Arg a, Matrix::Arg b)
+    {
+        Matrix product(a);
+        product.apply(b);
+        return product;
+    }
+
+} // nv namespace
+
+
+
+
+#if 0
+/** @name Special matrices. */
+//@{
+/** Generate a translation matrix. */
+void TranslationMatrix(const Vec3 & v) {
+    data[0] = 1; data[1] = 0; data[2] = 0; data[3] = 0;
+    data[4] = 0; data[5] = 1; data[6] = 0; data[7] = 0;
+    data[8] = 0; data[9] = 0; data[10] = 1; data[11] = 0;
+    data[12] = v.x; data[13] = v.y; data[14] = v.z; data[15] = 1;
+}
+
+/** Rotate by theta around the axis (v0, v1, v2). theta is passed straight to cos()/sin(), i.e. it is in radians, not degrees. */
+void RotationMatrix( scalar theta, scalar v0, scalar v1, scalar v2 ) {
+    scalar cost = cos(theta);
+    scalar sint = sin(theta);
+
+    if( 1 == v0 && 0 == v1 && 0 == v2 ) {
+        data[0] = 1.0f;	data[1] = 0.0f;	data[2] = 0.0f;	data[3] = 0.0f;
+        data[4] = 0.0f;	data[5] = cost;	data[6] = -sint;data[7] = 0.0f;
+        data[8] = 0.0f;	data[9] = sint;	data[10] = cost;data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    }
+    else if( 0 == v0  && 1 == v1 && 0 == v2 ) {
+        data[0] = cost;	data[1] = 0.0f;	data[2] = sint;	data[3] = 0.0f;
+        data[4] = 0.0f;	data[5] = 1.0f;	data[6] = 0.0f;	data[7] = 0.0f;
+        data[8] = -sint;data[9] = 0.0f;data[10] = cost;	data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    }
+    else if( 0 == v0 && 0 == v1 && 1 == v2 ) {
+        data[0] = cost;	data[1] = -sint;data[2] = 0.0f;	data[3] = 0.0f;
+        data[4] = sint; data[5] = cost;	data[6] = 0.0f;	data[7] = 0.0f;
+        data[8] = 0.0f;	data[9] = 0.0f;	data[10] = 1.0f;data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    } 
+    else {
+        //we need scale a,b,c to unit length.
+        scalar a2, b2, c2;
+        a2 = v0 * v0;
+        b2 = v1 * v1;
+        c2 = v2 * v2;
+
+        scalar iscale = 1.0f / sqrtf(a2 + b2 + c2);
+        v0 *= iscale;
+        v1 *= iscale;
+        v2 *= iscale;
+
+        scalar abm, acm, bcm;
+        scalar mcos, asin, bsin, csin;
+        mcos = 1.0f - cost;
+        abm = v0 * v1 * mcos;
+        acm = v0 * v2 * mcos;
+        bcm = v1 * v2 * mcos;
+        asin = v0 * sint;
+        bsin = v1 * sint;
+        csin = v2 * sint;
+        data[0] = a2 * mcos + cost;
+        data[1] = abm - csin;
+        data[2] = acm + bsin;
+        data[3] = abm + csin;
+        data[4] = 0.0f;
+        data[5] = b2 * mcos + cost;
+        data[6] = bcm - asin;
+        data[7] = acm - bsin;
+        data[8] = 0.0f;
+        data[9] = bcm + asin;
+        data[10] = c2 * mcos + cost;
+        data[11] = 0.0f;
+        data[12] = 0.0f;
+        data[13] = 0.0f;
+        data[14] = 0.0f;
+        data[15] = 1.0f;
+    }
+}
+
+/*
+void SkewMatrix(scalar angle, const Vec3 & v1, const Vec3 & v2) {
+v1.Normalize();
+v2.Normalize();
+
+Vec3 v3;
+v3.Cross(v1, v2);
+v3.Normalize();
+
+// Get skew factor.
+scalar costheta = Vec3DotProduct(v1, v2);
+scalar sintheta = Real.Sqrt(1 - costheta * costheta);
+scalar skew = tan(Trig.DegreesToRadians(angle) + acos(sintheta)) * sintheta - costheta;
+
+// Build orthonormal matrix.
+v1 = FXVector3.Cross(v3, v2);
+v1.Normalize();
+
+Matrix R = Matrix::Identity;
+R[0, 0] = v3.X;	// Not sure this is in the correct order...
+R[1, 0] = v3.Y;
+R[2, 0] = v3.Z;
+R[0, 1] = v1.X;
+R[1, 1] = v1.Y;
+R[2, 1] = v1.Z;
+R[0, 2] = v2.X;
+R[1, 2] = v2.Y;
+R[2, 2] = v2.Z;
+
+// Build skew matrix.
+Matrix S = Matrix::Identity;
+S[2, 1] = -skew;
+
+// Return skew transform.
+return R * S * R.Transpose;	// Not sure this is in the correct order...
+}
+*/
+
+/**
+* Generate rotation matrix for the euler angles. This is the same as computing
+* 3 rotation matrices and multiplying them together in our custom order.
+*
+* @todo Have to recompute this code for our new convention.
+**/
+void RotationMatrix( scalar yaw, scalar pitch, scalar roll ) {
+    scalar sy = sin(yaw+ToRadian(90));
+    scalar cy = cos(yaw+ToRadian(90));
+    scalar sp = sin(pitch-ToRadian(90));
+    scalar cp = cos(pitch-ToRadian(90));
+    scalar sr = sin(roll);
+    scalar cr = cos(roll);
+
+    data[0] = cr*cy + sr*sp*sy;
+    data[1] = cp*sy;
+    data[2] = -sr*cy + cr*sp*sy;
+    data[3] = 0;
+
+    data[4] = -cr*sy + sr*sp*cy;
+    data[5] = cp*cy;
+    data[6] = sr*sy + cr*sp*cy;
+    data[7] = 0;
+
+    data[8] = sr*cp;
+    data[9] = -sp;
+    data[10] = cr*cp;
+    data[11] = 0;
+
+    data[12] = 0;
+    data[13] = 0;
+    data[14] = 0;
+    data[15] = 1;
+}
+
+/** Create a frustum matrix with the near and far planes at zNear and zFar. */
+void Frustum( scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear, scalar zFar ) {
+    scalar one_deltax, one_deltay, one_deltaz, doubleznear;
+
+    doubleznear = 2.0f * zNear;
+    one_deltax = 1.0f / (xmax - xmin);
+    one_deltay = 1.0f / (ymax - ymin);
+    one_deltaz = 1.0f / (zFar - zNear);
+
+    data[0] = (scalar)(doubleznear * one_deltax);
+    data[1] = 0.0f;
+    data[2] = 0.0f;
+    data[3] = 0.0f;
+    data[4] = 0.0f;
+    data[5] = (scalar)(doubleznear * one_deltay);
+    data[6] = 0.f;
+    data[7] = 0.f;
+    data[8] = (scalar)((xmax + xmin) * one_deltax);
+    data[9] = (scalar)((ymax + ymin) * one_deltay);
+    data[10] = (scalar)(-(zFar + zNear) * one_deltaz);
+    data[11] = -1.f;
+    data[12] = 0.f;
+    data[13] = 0.f;
+    data[14] = (scalar)(-(zFar * doubleznear) * one_deltaz);
+    data[15] = 0.f;
+}
+
+/** Create a frustum matrix with the far plane at the infinity. */
+void FrustumInf( scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear ) {
+    scalar one_deltax, one_deltay, doubleznear, nudge;
+
+    doubleznear = 2.0f * zNear;
+    one_deltax = 1.0f / (xmax - xmin);
+    one_deltay = 1.0f / (ymax - ymin);
+    nudge = 1.0; // 0.999;
+
+    data[0] = doubleznear * one_deltax;
+    data[1] = 0.0f;
+    data[2] = 0.0f;
+    data[3] = 0.0f;
+
+    data[4] = 0.0f;
+    data[5] = doubleznear * one_deltay;
+    data[6] = 0.f;
+    data[7] = 0.f;
+
+    data[8] = (xmax + xmin) * one_deltax;
+    data[9] = (ymax + ymin) * one_deltay;
+    data[10] = -1.0f * nudge;
+    data[11] = -1.0f;
+
+    data[12] = 0.f;
+    data[13] = 0.f;
+    data[14] = -doubleznear * nudge;
+    data[15] = 0.f;
+}
+
+/** Create an inverse frustum matrix with the far plane at the infinity. */
+void FrustumInfInv( scalar left, scalar right, scalar bottom, scalar top, scalar zNear ) {
+    // This matrix is probably wrong (it has not really been tested); I think it should be transposed.
+    data[0] = (right - left) / (2 * zNear);
+    data[1] = 0;
+    data[2] = 0;
+    data[3] = (right + left) / (2 * zNear);
+    data[4] = 0;
+    data[5] = (top - bottom) / (2 * zNear);
+    data[6] = 0;
+    data[7] = (top + bottom) / (2 * zNear);
+    data[8] = 0;
+    data[9] = 0;
+    data[10] = 0;
+    data[11] = -1;
+    data[12] = 0;
+    data[13] = 0;
+    data[14] = -1 / (2 * zNear);
+    data[15] = 1 / (2 * zNear);
+}
+
+/** Create an homogeneous projection matrix. */
+void Perspective( scalar fov, scalar aspect, scalar zNear, scalar zFar ) {
+    scalar xmin, xmax, ymin, ymax;
+
+    xmax = zNear * tan( fov/2 );
+    xmin = -xmax;
+
+    ymax = xmax / aspect;
+    ymin = -ymax;
+
+    Frustum(xmin, xmax, ymin, ymax, zNear, zFar);
+}
+
+/** Create a projection matrix with the far plane at the infinity. */
+void PerspectiveInf( scalar fov, scalar aspect, scalar zNear ) {
+    scalar x = zNear * tan( fov/2 );
+    scalar y = x / aspect;
+    FrustumInf( -x, x, -y, y, zNear );
+}
+
+/** Create an inverse projection matrix with far plane at the infinity. */
+void PerspectiveInfInv( scalar fov, scalar aspect, scalar zNear ) {
+    scalar x = zNear * tan( fov/2 );
+    scalar y = x / aspect;
+    FrustumInfInv( -x, x, -y, y, zNear );
+}
+
+/** Build bone matrix from quaternion and offset. */
+void BoneMatrix(const Quat & q, const Vec3 & offset) {
+    scalar x2, y2, z2, xx, xy, xz, yy, yz, zz, wx, wy, wz;
+
+    // calculate coefficients
+    x2 = q.x + q.x;
+    y2 = q.y + q.y;
+    z2 = q.z + q.z;
+
+    xx = q.x * x2;   xy = q.x * y2;   xz = q.x * z2;
+    yy = q.y * y2;   yz = q.y * z2;   zz = q.z * z2;
+    wx = q.w * x2;   wy = q.w * y2;   wz = q.w * z2;
+
+    data[0] = 1.0f - (yy + zz); 	
+    data[1] = xy - wz;
+    data[2] = xz + wy;		
+    data[3] = 0.0f;
+
+    data[4] = xy + wz;		
+    data[5] = 1.0f - (xx + zz);
+    data[6] = yz - wx;		
+    data[7] = 0.0f;
+
+    data[8] = xz - wy;		
+    data[9] = yz + wx;
+    data[10] = 1.0f - (xx + yy);		
+    data[11] = 0.0f;
+
+    data[12] = offset.x;
+    data[13] = offset.y;
+    data[14] = offset.z;			
+    data[15] = 1.0f;
+}
+
+//@}
+
+
+/** @name Transformations: */
+//@{
+
+/** Apply a general scale. */
+void Scale( scalar x, scalar y, scalar z ) {
+    data[0] *= x;	data[4] *= y;	data[8]  *= z;
+    data[1] *= x;	data[5] *= y;	data[9]  *= z;
+    data[2] *= x;	data[6] *= y;	data[10] *= z;
+    data[3] *= x;	data[7] *= y;	data[11] *= z;
+}
+
+/** Apply a rotation of theta (radians; forwarded unchanged to RotationMatrix) around the axis v. */
+void Rotate( scalar theta, const Vec3 & v ) {
+    Matrix b;
+    b.RotationMatrix( theta, v[0], v[1], v[2] );
+    Multiply4x3( b );
+}
+
+/** Apply a rotation of theta (radians; forwarded unchanged to RotationMatrix) around the axis (v0, v1, v2). */
+void Rotate( scalar theta, scalar v0, scalar v1, scalar v2 ) {
+    Matrix b;
+    b.RotationMatrix( theta, v0, v1, v2 );
+    Multiply4x3( b );
+}
+
+/**
+* Translate the matrix by t. This is the same as multiplying by a
+* translation matrix with the given offset.
+* this = T * this
+*/
+void Translate( const Vec3 &t ) {
+    data[12] = data[0] * t.x + data[4] * t.y + data[8]  * t.z + data[12];
+    data[13] = data[1] * t.x + data[5] * t.y + data[9]  * t.z + data[13];
+    data[14] = data[2] * t.x + data[6] * t.y + data[10] * t.z + data[14];
+    data[15] = data[3] * t.x + data[7] * t.y + data[11] * t.z + data[15];
+}
+
+/** 
+* Translate the matrix by x, y, z. This is the same as multiplying by a 
+* translation matrix with the given offsets.
+*/
+void Translate( scalar x, scalar y, scalar z ) {
+    data[12] = data[0] * x + data[4] * y + data[8]  * z + data[12];
+    data[13] = data[1] * x + data[5] * y + data[9]  * z + data[13];
+    data[14] = data[2] * x + data[6] * y + data[10] * z + data[14];
+    data[15] = data[3] * x + data[7] * y + data[11] * z + data[15];
+}
+
+/** Compute the transposed matrix. */
+void Transpose() {
+    piSwap(data[1], data[4]);
+    piSwap(data[2], data[8]);
+    piSwap(data[6], data[9]);
+    piSwap(data[3], data[12]);
+    piSwap(data[7], data[13]);
+    piSwap(data[11], data[14]);
+}
+
+/** Compute the inverse of a rigid-body/isometry/orthonormal matrix. */
+void IsometryInverse() {
+    // transposed 3x3 upper left matrix
+    piSwap(data[1], data[4]);
+    piSwap(data[2], data[8]);
+    piSwap(data[6], data[9]);
+
+    // translate by the negative offsets
+    Vec3 v(-data[12], -data[13], -data[14]);
+    data[12] = data[13] = data[14] = 0;
+    Translate(v);
+}
+
+/** Compute the inverse of the affine portion of this matrix. */
+void AffineInverse() {
+    data[12] = data[13] = data[14] = 0;
+    Transpose();
+}
+//@}
+
+/** @name Matrix operations: */
+//@{
+
+/** Return the determinant of this matrix. */
+scalar Determinant() const {
+    return	data[0] * data[5] * data[10] * data[15] + 
+        data[1] * data[6] * data[11] * data[12] +
+        data[2] * data[7] * data[ 8] * data[13] +
+        data[3] * data[4] * data[ 9] * data[14] -
+        data[3] * data[6] * data[ 9] * data[12] -
+        data[2] * data[5] * data[ 8] * data[15] -
+        data[1] * data[4] * data[11] * data[14] -
+        data[0] * data[7] * data[10] * data[12];
+}
+
+
+/** Standard matrix product: this *= B. */
+void Multiply4x4( const Matrix & restrict B ) {
+    Multiply4x4(*this, B);
+}
+
+/** Standard matrix product: this = A * B. this != B*/
+void Multiply4x4( const Matrix & A, const Matrix & restrict B ) {
+    piDebugCheck(this != &B);
+
+    for(int i = 0; i < 4; i++) {
+        const scalar ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3);
+        GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
+        GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
+        GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
+        GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
+    }
+
+    /* Unrolled but does not allow this == A
+    data[0] = A.data[0] * B.data[0] + A.data[4] * B.data[1] + A.data[8] * B.data[2] + A.data[12] * B.data[3];
+    data[1] = A.data[1] * B.data[0] + A.data[5] * B.data[1] + A.data[9] * B.data[2] + A.data[13] * B.data[3];
+    data[2] = A.data[2] * B.data[0] + A.data[6] * B.data[1] + A.data[10] * B.data[2] + A.data[14] * B.data[3];
+    data[3] = A.data[3] * B.data[0] + A.data[7] * B.data[1] + A.data[11] * B.data[2] + A.data[15] * B.data[3];
+    data[4] = A.data[0] * B.data[4] + A.data[4] * B.data[5] + A.data[8] * B.data[6] + A.data[12] * B.data[7];
+    data[5] = A.data[1] * B.data[4] + A.data[5] * B.data[5] + A.data[9] * B.data[6] + A.data[13] * B.data[7];
+    data[6] = A.data[2] * B.data[4] + A.data[6] * B.data[5] + A.data[10] * B.data[6] + A.data[14] * B.data[7];
+    data[7] = A.data[3] * B.data[4] + A.data[7] * B.data[5] + A.data[11] * B.data[6] + A.data[15] * B.data[7];
+    data[8] = A.data[0] * B.data[8] + A.data[4] * B.data[9] + A.data[8] * B.data[10] + A.data[12] * B.data[11];
+    data[9] = A.data[1] * B.data[8] + A.data[5] * B.data[9] + A.data[9] * B.data[10] + A.data[13] * B.data[11];
+    data[10]= A.data[2] * B.data[8] + A.data[6] * B.data[9] + A.data[10] * B.data[10] + A.data[14] * B.data[11];
+    data[11]= A.data[3] * B.data[8] + A.data[7] * B.data[9] + A.data[11] * B.data[10] + A.data[15] * B.data[11];
+    data[12]= A.data[0] * B.data[12] + A.data[4] * B.data[13] + A.data[8] * B.data[14] + A.data[12] * B.data[15];
+    data[13]= A.data[1] * B.data[12] + A.data[5] * B.data[13] + A.data[9] * B.data[14] + A.data[13] * B.data[15];
+    data[14]= A.data[2] * B.data[12] + A.data[6] * B.data[13] + A.data[10] * B.data[14] + A.data[14] * B.data[15];
+    data[15]= A.data[3] * B.data[12] + A.data[7] * B.data[13] + A.data[11] * B.data[14] + A.data[15] * B.data[15];
+    */
+}
+
+/** Standard matrix product: this *= B. */
+void Multiply4x3( const Matrix & restrict B ) {
+    Multiply4x3(*this, B);
+}
+
+/** Product of matrices whose last row is [0 0 0 1]: stores the top three rows of A * B in this matrix and sets its last row to [0 0 0 1]. B must not be this matrix; A may be. */
+void Multiply4x3( const Matrix & A, const Matrix & restrict B ) {
+    piDebugCheck(this != &B);
+
+    for(int i = 0; i < 3; i++) {
+        const scalar ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3);   // Cache row i of A before row i of the result is written, so this == &A is fine.
+        GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
+        GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
+        GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
+        GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
+    }
+    data[3] = 0.0f; data[7] = 0.0f; data[11] = 0.0f; data[15] = 1.0f;   // Elements printed as the last row by Print().
+
+    /* Unrolled version kept for reference; it does not allow this == &A (a/b stand for A/B):
+    data[0] = a.data[0] * b.data[0] + a.data[4] * b.data[1] + a.data[8] * b.data[2] + a.data[12] * b.data[3];
+    data[1] = a.data[1] * b.data[0] + a.data[5] * b.data[1] + a.data[9] * b.data[2] + a.data[13] * b.data[3];
+    data[2] = a.data[2] * b.data[0] + a.data[6] * b.data[1] + a.data[10] * b.data[2] + a.data[14] * b.data[3];
+    data[3] = 0.0f;
+    data[4] = a.data[0] * b.data[4] + a.data[4] * b.data[5] + a.data[8] * b.data[6] + a.data[12] * b.data[7];
+    data[5] = a.data[1] * b.data[4] + a.data[5] * b.data[5] + a.data[9] * b.data[6] + a.data[13] * b.data[7];
+    data[6] = a.data[2] * b.data[4] + a.data[6] * b.data[5] + a.data[10] * b.data[6] + a.data[14] * b.data[7];
+    data[7] = 0.0f;
+    data[8] = a.data[0] * b.data[8] + a.data[4] * b.data[9] + a.data[8] * b.data[10] + a.data[12] * b.data[11];
+    data[9] = a.data[1] * b.data[8] + a.data[5] * b.data[9] + a.data[9] * b.data[10] + a.data[13] * b.data[11];
+    data[10]= a.data[2] * b.data[8] + a.data[6] * b.data[9] + a.data[10] * b.data[10] + a.data[14] * b.data[11];
+    data[11]= 0.0f;
+    data[12]= a.data[0] * b.data[12] + a.data[4] * b.data[13] + a.data[8] * b.data[14] + a.data[12] * b.data[15];
+    data[13]= a.data[1] * b.data[12] + a.data[5] * b.data[13] + a.data[9] * b.data[14] + a.data[13] * b.data[15];
+    data[14]= a.data[2] * b.data[12] + a.data[6] * b.data[13] + a.data[10] * b.data[14] + a.data[14] * b.data[15];
+    data[15]= 1.0f;
+    */
+}
+//@}
+
+
+/** @name Vector operations: */
+//@{
+
+/** Transform a 3d vector with w=0: the data[12..14] terms that TransformPoint adds are left out. orig and dest must be different objects. */
+void TransformVec3(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10];
+}
+/** Transform a 3d vector (w=0) by the transpose of the 3x3 block used by TransformVec3. orig and dest must be different objects. */
+void TransformVec3T(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[1] + orig.z * data[2];
+    dest->y = orig.x * data[4] + orig.y * data[5] + orig.z * data[6];
+    dest->z = orig.x * data[8] + orig.y * data[9] + orig.z * data[10];
+}
+
+/** Transform a 3d homogeneous vector, where the fourth coordinate is assumed to be 1. The transformed w is not computed and no division is performed. orig and dest must be different objects. */
+void TransformPoint(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+}
+
+/** Transform a point (w assumed to be 1), scale the result by the reciprocal of the transformed w, and return that reciprocal (1/w, not w itself). orig and dest must be different objects. */
+scalar TransformPointAndNormalize(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    scalar w;
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    w = 1 / (orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]);   // Reciprocal of the transformed w; not guarded against a zero denominator.
+    *dest *= w;
+    return w;
+}
+
+/** Transform a point (w assumed to be 1) without dividing by w, and return the transformed w coordinate. orig and dest must be different objects. */
+scalar TransformPointReturnW(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    return orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15];
+}
+
+/** Transform a 3d point (fourth coordinate assumed to be 1) by the full 4x4 matrix and store all four resulting coordinates in dest. No division by w is performed. */
+void TransformVec4(const Vec3 & orig, Vec4 * dest) const {
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    dest->w = orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15];
+}
+//@}
+
+/** @name Matrix analysis. */
+//@{
+
+/** Get the ZYZ euler angles from the matrix, written to *s, *t and *r. Assumes the matrix is orthonormal. NOTE(review): the matrices sketched in the comments match the indices used only if GetElem is read as (column, row), while Multiply4x3 appears to use (row, column) - confirm. */
+void GetEulerAnglesZYZ(scalar * s, scalar * t, scalar * r) const {
+    if( GetElem(2,2) < 1.0f ) {
+        if( GetElem(2,2) > -1.0f ) {
+            //  cs*ct*cr-ss*sr     -ss*ct*cr-cs*sr     st*cr      (general case; cX = cos X, sX = sin X)
+            //  cs*ct*sr+ss*cr     -ss*ct*sr+cs*cr     st*sr
+            //  -cs*st             ss*st               ct
+            *s = atan2(GetElem(1,2), -GetElem(0,2));
+            *t = acos(GetElem(2,2));
+            *r = atan2(GetElem(2,1), GetElem(2,0));
+        }
+        else {
+            //  -c(s-r)    s(s-r)     0      (t = PI: only s-r is determined, so r is set to 0)
+            //  s(s-r)     c(s-r)     0
+            //  0          0          -1
+            *s = atan2(GetElem(0, 1), -GetElem(0, 0)); // = s-r
+            *t = PI;
+            *r = 0;
+        }
+    }
+    else {
+        //  c(s+r)     -s(s+r)    0      (t = 0: only s+r is determined, so r is set to 0)
+        //  s(s+r)     c(s+r)     0
+        //  0          0          1
+        *s = atan2(GetElem(0, 1), GetElem(0, 0)); // = s+r
+        *t = 0;
+        *r = 0;
+    }
+}
+
+//@}
+
+MATHLIB_API friend PiStream & operator<< ( PiStream & s, Matrix & m );
+
+/** Print the 16 elements to the debug output as four lines; line i shows data[i], data[i+4], data[i+8], data[i+12]. */
+void Print() const {
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[0], data[4], data[8], data[12] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[1], data[5], data[9], data[13] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[2], data[6], data[10], data[14] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[3], data[7], data[11], data[15] );
+}
+
+
+public:
+
+    scalar data[16];
+
+};
+#endif
+
+
+
+
+#endif // NV_MATH_MATRIX_H
diff --git a/src/nvmath/Vector.inl b/src/nvmath/Vector.inl
index a676ee4..9b0ec0a 100644
--- a/src/nvmath/Vector.inl
+++ b/src/nvmath/Vector.inl
@@ -381,14 +381,14 @@ namespace nv
         return Vector2(max(a.x, b.x), max(a.y, b.y));
     }
 
-    inline bool isValid(Vector2::Arg v)
+    inline bool isFinite(Vector2::Arg v)
     {
         return isFinite(v.x) && isFinite(v.y);
     }
 
     inline Vector2 validate(Vector2::Arg v, Vector2::Arg fallback = Vector2(0.0f))
     {
-        if (!isValid(v)) return fallback;
+        if (!isFinite(v)) return fallback;
         Vector2 vf = v;
         nv::floatCleanup(vf.component, 2);
         return vf;
@@ -567,14 +567,14 @@ namespace nv
         return Vector3(ceilf(v.x), ceilf(v.y), ceilf(v.z));
     }
 
-    inline bool isValid(Vector3::Arg v)
+    inline bool isFinite(Vector3::Arg v)
     {
         return isFinite(v.x) && isFinite(v.y) && isFinite(v.z);
     }
 
     inline Vector3 validate(Vector3::Arg v, Vector3::Arg fallback = Vector3(0.0f))
     {
-        if (!isValid(v)) return fallback;
+        if (!isFinite(v)) return fallback;
         Vector3 vf = v;
         nv::floatCleanup(vf.component, 3);
         return vf;
@@ -699,14 +699,14 @@ namespace nv
         return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
     }
 
-    inline bool isValid(Vector4::Arg v)
+    inline bool isFinite(Vector4::Arg v)
     {
         return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w);
     }
 
     inline Vector4 validate(Vector4::Arg v, Vector4::Arg fallback = Vector4(0.0f))
     {
-        if (!isValid(v)) return fallback;
+        if (!isFinite(v)) return fallback;
         Vector4 vf = v;
         nv::floatCleanup(vf.component, 4);
         return vf;
diff --git a/src/nvmath/nvmath.h b/src/nvmath/nvmath.h
index 717157c..b9a1bad 100644
--- a/src/nvmath/nvmath.h
+++ b/src/nvmath/nvmath.h
@@ -5,14 +5,13 @@
 #define NV_MATH_H
 
 #include "nvcore/nvcore.h"
-#include "nvcore/Debug.h"
-#include "nvcore/Utils.h" // clamp
+#include "nvcore/Debug.h"   // nvDebugCheck
+#include "nvcore/Utils.h"   // clamp
 
 #include <math.h>
-#include <limits.h> // INT_MAX
 
 #if NV_OS_WIN32 || NV_OS_XBOX
-#include <float.h>
+#include <float.h>  // finite, isnan
 #endif
 
 // Function linkage
@@ -105,9 +104,12 @@ namespace nv
     inline float toRadian(float degree) { return degree * (PI / 180.0f); }
     inline float toDegree(float radian) { return radian * (180.0f / PI); }
 
+    // Robust floating point comparison: epsilon acts as an absolute tolerance near zero and as a relative one for large magnitudes.
+    // http://realtimecollisiondetection.net/blog/?p=89
     inline bool equal(const float f0, const float f1, const float epsilon = NV_EPSILON)
     {
-        return fabs(f0-f1) <= epsilon;
+        //return fabs(f0-f1) <= epsilon;
+        return fabsf(f0-f1) <= epsilon * max(1.0f, fabsf(f0), fabsf(f1));   // fabsf keeps all three max() arguments float, so T deduces consistently.
     }
 
     inline bool isZero(const float f, const float epsilon = NV_EPSILON)
diff --git a/src/nvthread/ThreadPool.h b/src/nvthread/ThreadPool.h
index 147a607..84fc41e 100644
--- a/src/nvthread/ThreadPool.h
+++ b/src/nvthread/ThreadPool.h
@@ -1,8 +1,8 @@
-// This code is in the public domain -- castano@gmail.com
-
-#pragma once
-#ifndef NV_THREAD_THREADPOOL_H
-#define NV_THREAD_THREADPOOL_H
+// This code is in the public domain -- castano@gmail.com
+
+#pragma once
+#ifndef NV_THREAD_THREADPOOL_H
+#define NV_THREAD_THREADPOOL_H
 
 #include "nvthread.h"
 
diff --git a/src/nvtt/CubeSurface.cpp b/src/nvtt/CubeSurface.cpp
index 83f39d8..99c3c7e 100644
--- a/src/nvtt/CubeSurface.cpp
+++ b/src/nvtt/CubeSurface.cpp
@@ -183,6 +183,35 @@ Surface CubeSurface::unfold(CubeLayout layout) const
 }
 
 
+float CubeSurface::average(int channel) const
+{
+    // Solid-angle weighted mean of the given channel over the six faces; returns 0 for an empty cube.
+    const uint edgeLength = m->edgeLength;
+    if (edgeLength == 0) return 0.0f;   // No texels: 'total' would stay 0 and the result would be 0/0.
+
+    // Store the table along with the surface so that we only compute it once.
+    if (m->solidAngleTable == NULL) {
+        m->solidAngleTable = new SolidAngleTable(edgeLength);
+    }
+
+    // Accumulate in double: 6 * edgeLength^2 small terms lose too much precision when summed in float.
+    double total = 0.0;
+    double sum = 0.0;
+    for (int f = 0; f < 6; f++) {
+        const float * c = m->face[f].m->image->channel(channel);
+        for (uint y = 0; y < edgeLength; y++) {
+            for (uint x = 0; x < edgeLength; x++) {
+                const float solidAngle = m->solidAngleTable->lookup(x, y);
+                total += solidAngle;
+                sum += double(c[y * edgeLength + x]) * solidAngle;
+            }
+        }
+    }
+
+    return float(sum / total);
+}
+
+
 CubeSurface CubeSurface::irradianceFilter(int size) const
 {
     // @@ TODO
@@ -237,7 +266,7 @@ SolidAngleTable::SolidAngleTable(uint edgeLength) : size(edgeLength/2) {
 
     for (uint y = 0; y < size; y++) {
         for (uint x = 0; x < size; x++) {
-            data[y * size + x] = solidAngleTerm(128+x, 128+y, inverseEdgeLength);
+            data[y * size + x] = solidAngleTerm(size+x, size+y, inverseEdgeLength);
         }
     }
 }
@@ -631,7 +660,7 @@ CubeSurface CubeSurface::cosinePowerFilter(int size, float cosinePower) const
     CubeSurface filteredCube;
     filteredCube.m->allocate(size);
 
-    // Store these tables along with the surface. Compute them only once!
+    // Store these tables along with the surface so that we only compute them once.
     if (m->solidAngleTable == NULL) {
         m->solidAngleTable = new SolidAngleTable(edgeLength);
     }
diff --git a/src/nvtt/CubeSurface.h b/src/nvtt/CubeSurface.h
index 31cc46d..19a42e0 100644
--- a/src/nvtt/CubeSurface.h
+++ b/src/nvtt/CubeSurface.h
@@ -74,7 +74,7 @@ namespace nvtt
 
             edgeLength = p.edgeLength;
             for (uint i = 0; i < 6; i++) {
-                face[i] = p.face[6];
+                face[i] = p.face[i];
             }
             solidAngleTable = NULL; // @@ Transfer tables. Needs refcounting?
             vectorTable = NULL;
diff --git a/src/nvtt/OutputOptions.cpp b/src/nvtt/OutputOptions.cpp
index 59de6b8..f5e6e71 100644
--- a/src/nvtt/OutputOptions.cpp
+++ b/src/nvtt/OutputOptions.cpp
@@ -44,6 +44,7 @@ OutputOptions::~OutputOptions()
 void OutputOptions::reset()
 {
     m.fileName.reset();
+    m.fileHandle = NULL;
 
     m.outputHandler = NULL;
     m.errorHandler = NULL;
@@ -52,37 +53,67 @@ void OutputOptions::reset()
     m.container = Container_DDS;
     m.version = 0;
     m.srgb = false;
+    m.deleteOutputHandler = false;
 }
 
 
 /// Set output file name.
 void OutputOptions::setFileName(const char * fileName)
 {
-    if (!m.fileName.isNull())
+    if (m.deleteOutputHandler)
     {
-        // To close the file and avoid leak.
         delete m.outputHandler;
     }
 
     m.fileName = fileName;
+    m.fileHandle = NULL;
     m.outputHandler = NULL;
+    m.deleteOutputHandler = false;
 
     DefaultOutputHandler * oh = new DefaultOutputHandler(fileName);
-    if (!oh->stream.isError())
-    {
+    if (oh->stream.isError()) {
+        delete oh;
+    }
+    else {
+        m.deleteOutputHandler = true;
+        m.outputHandler = oh;
+    }
+}
+
+/// Set output file handle. 'fp' is taken as void * and cast to FILE * below. Replaces any file name or handler set before.
+void OutputOptions::setFileHandle(void * fp)
+{
+    if (m.deleteOutputHandler) {
+        delete m.outputHandler;   // Only deleted when the flag marks it as a handler created by this class.
+    }
+
+    m.fileName.reset();
+    m.fileHandle = (FILE *)fp;
+    m.outputHandler = NULL;
+    m.deleteOutputHandler = false;
+
+    DefaultOutputHandler * oh = new DefaultOutputHandler(m.fileHandle);
+    if (oh->stream.isError()) {
+        delete oh;   // Stream reported an error: outputHandler stays NULL.
+    }
+    else {
+        m.deleteOutputHandler = true;
         m.outputHandler = oh;
     }
 }
 
+
 /// Set output handler.
 void OutputOptions::setOutputHandler(OutputHandler * outputHandler)
 {
-    if (!m.fileName.isNull())
-    {
+    if (m.deleteOutputHandler) {
         delete m.outputHandler;
-        m.fileName.reset();
     }
+
+    m.fileName.reset();
+    m.fileHandle = NULL;
     m.outputHandler = outputHandler;
+    m.deleteOutputHandler = false;
 }
 
 /// Set error handler.
@@ -117,7 +148,7 @@ void OutputOptions::setSrgbFlag(bool b)
 
 bool OutputOptions::Private::hasValidOutputHandler() const
 {
-    if (!fileName.isNull())
+    if (!fileName.isNull() || fileHandle != NULL)
     {
         return outputHandler != NULL;
     }
diff --git a/src/nvtt/OutputOptions.h b/src/nvtt/OutputOptions.h
index 2a272a0..90376b9 100644
--- a/src/nvtt/OutputOptions.h
+++ b/src/nvtt/OutputOptions.h
@@ -25,16 +25,19 @@
 #ifndef NV_TT_OUTPUTOPTIONS_H
 #define NV_TT_OUTPUTOPTIONS_H
 
-#include <nvcore/StrLib.h> // Path
-#include <nvcore/StdStream.h>
 #include "nvtt.h"
 
+#include "nvcore/StrLib.h" // Path
+#include "nvcore/StdStream.h"
+
+
 namespace nvtt
 {
 
 	struct DefaultOutputHandler : public nvtt::OutputHandler
 	{
 		DefaultOutputHandler(const char * fileName) : stream(fileName) {}
+        DefaultOutputHandler(FILE * fp) : stream(fp, false) {}
 		
 		virtual ~DefaultOutputHandler() {}
 		
@@ -64,6 +67,7 @@ namespace nvtt
 	struct OutputOptions::Private
 	{
 		nv::Path fileName;
+        FILE * fileHandle;
 		
 		OutputHandler * outputHandler;
 		ErrorHandler * errorHandler;
@@ -72,6 +76,7 @@ namespace nvtt
 		Container container;
         int version;
         bool srgb;
+        bool deleteOutputHandler;
 		
 		bool hasValidOutputHandler() const;
 
diff --git a/src/nvtt/Surface.cpp b/src/nvtt/Surface.cpp
index e5fb086..59f8148 100644
--- a/src/nvtt/Surface.cpp
+++ b/src/nvtt/Surface.cpp
@@ -704,13 +704,14 @@ void Surface::resize(int w, int h, int d, ResizeFilter filter)
 
 void Surface::resize(int w, int h, int d, ResizeFilter filter, float filterWidth, const float * params)
 {
-    FloatImage * img = m->image;
-    if (img == NULL || (w == img->width() && h == img->height() && d == img->depth())) {
+    if (isNull() || (w == width() && h == height() && d == depth())) {
         return;
     }
 
     detach();
 
+    FloatImage * img = m->image;
+
     FloatImage::WrapMode wrapMode = (FloatImage::WrapMode)m->wrapMode;
 
     if (m->alphaMode == AlphaMode_Transparency)
@@ -781,7 +782,7 @@ void Surface::resize(int maxExtent, RoundMode roundMode, ResizeFilter filter)
 
 void Surface::resize(int maxExtent, RoundMode roundMode, ResizeFilter filter, float filterWidth, const float * params)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     int w = m->image->width();
     int h = m->image->height();
@@ -803,13 +804,14 @@ bool Surface::buildNextMipmap(MipmapFilter filter)
 
 bool Surface::buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params)
 {
-    FloatImage * img = m->image;
-    if (img == NULL || (img->width() == 1 && img->height() == 1 && img->depth() == 1)) {
+    if (isNull() || (width() == 1 && height() == 1 && depth() == 1)) {
         return false;
     }
 
     detach();
 
+    FloatImage * img = m->image;
+
     FloatImage::WrapMode wrapMode = (FloatImage::WrapMode)m->wrapMode;
 
     if (m->alphaMode == AlphaMode_Transparency)
@@ -868,13 +870,14 @@ void Surface::canvasSize(int w, int h, int d)
 {
     nvDebugCheck(w > 0 && h > 0 && d > 0);
 
-    FloatImage * img = m->image;
-    if (img == NULL || (w == img->width() && h == img->height() && d == img->depth())) {
+    if (isNull() || (w == width() && h == height() && d == depth())) {
         return;
     }
 
     detach();
 
+    FloatImage * img = m->image;
+
     FloatImage * new_img = new FloatImage;
     new_img->allocate(4, w, h, d);
     new_img->clear();
@@ -903,7 +906,7 @@ void Surface::canvasSize(int w, int h, int d)
 // Color transforms.
 void Surface::toLinear(float gamma)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
     if (equal(gamma, 1.0f)) return;
 
     detach();
@@ -913,7 +916,7 @@ void Surface::toLinear(float gamma)
 
 void Surface::toGamma(float gamma)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
     if (equal(gamma, 1.0f)) return;
 
     detach();
@@ -923,7 +926,8 @@ void Surface::toGamma(float gamma)
 
 
 static float toSrgb(float f) {
-    if (f <= 0.0)               f = 0.0f;
+    if (isNan(f))               f = 0.0f;
+    else if (f <= 0.0f)         f = 0.0f;
     else if (f <= 0.0031308f)   f = 12.92f * f;
     else if (f <= 1.0f)         f = (powf(f, 0.41666f) * 1.055f) - 0.055f;
     else                        f = 1.0f;
@@ -932,21 +936,43 @@ static float toSrgb(float f) {
 
 void Surface::toSrgb()
 {
-    FloatImage * img = m->image;
-    if (img == NULL) return;
+    if (isNull()) return;
 
     detach();
 
+    FloatImage * img = m->image;
+
     const uint count = img->pixelCount();
-    for (uint j = 0; j < count; j++)
-    {
-        float & r = img->pixel(0, j);
-        float & g = img->pixel(1, j);
-        float & b = img->pixel(2, j);
+    for (uint c = 0; c < 3; c++) {
+        float * channel = img->channel(c);
+        for (uint i = 0; i < count; i++) {
+            channel[i] = ::toSrgb(channel[i]);
+        }
+    }
+}
+
+static float fromSrgb(float f) {   // Inverse of toSrgb(): sRGB-encoded value -> linear, clamped to [0, 1].
+    if (isNan(f) || f < 0.0f) f = 0.0f;   // NaN fails every comparison and would otherwise fall through to 1; map it to 0 like toSrgb().
+    else if (f < 0.04045f)  f = f / 12.92f;
+    else if (f <= 1.0f)     f = powf((f + 0.055f) / 1.055f, 2.4f);
+    else                    f = 1.0f;
+    return f;
+}
+
+void Surface::toLinearFromSrgb()
+{
+    if (isNull()) return;
+
+    detach();
 
-        r = ::toSrgb(r);
-        g = ::toSrgb(g);
-        b = ::toSrgb(b);
+    FloatImage * img = m->image;
+
+    const uint count = img->pixelCount();
+    for (uint c = 0; c < 3; c++) {
+        float * channel = img->channel(c);
+        for (uint i = 0; i < count; i++) {
+            channel[i] = ::fromSrgb(channel[i]);
+        }
     }
 }
 
@@ -962,28 +988,25 @@ static float toXenonSrgb(float f) {
 
 void Surface::toXenonSrgb()
 {
-    FloatImage * img = m->image;
-    if (img == NULL) return;
+    if (isNull()) return;
 
     detach();
 
-    const uint count = img->pixelCount();
-    for (uint j = 0; j < count; j++)
-    {
-        float & r = img->pixel(0, j);
-        float & g = img->pixel(1, j);
-        float & b = img->pixel(2, j);
+    FloatImage * img = m->image;
 
-        r = ::toXenonSrgb(r);
-        g = ::toXenonSrgb(g);
-        b = ::toXenonSrgb(b);
+    const uint count = img->pixelCount();
+    for (uint c = 0; c < 3; c++) {
+        float * channel = img->channel(c);
+        for (uint i = 0; i < count; i++) {
+            channel[i] = ::toXenonSrgb(channel[i]);
+        }
     }
 }
 
 
 void Surface::transform(const float w0[4], const float w1[4], const float w2[4], const float w3[4], const float offset[4])
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1000,7 +1023,7 @@ void Surface::transform(const float w0[4], const float w1[4], const float w2[4],
 
 void Surface::swizzle(int r, int g, int b, int a)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
     if (r == 0 && g == 1 && b == 2 && a == 3) return;
 
     detach();
@@ -1011,7 +1034,7 @@ void Surface::swizzle(int r, int g, int b, int a)
 // color * scale + bias
 void Surface::scaleBias(int channel, float scale, float bias)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
     if (equal(scale, 1.0f) && equal(bias, 0.0f)) return;
 
     detach();
@@ -1021,7 +1044,7 @@ void Surface::scaleBias(int channel, float scale, float bias)
 
 void Surface::clamp(int channel, float low, float high)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1045,7 +1068,7 @@ void Surface::expandNormal()
 
 void Surface::blend(float red, float green, float blue, float alpha, float t)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1067,7 +1090,7 @@ void Surface::blend(float red, float green, float blue, float alpha, float t)
 
 void Surface::premultiplyAlpha()
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1089,7 +1112,7 @@ void Surface::premultiplyAlpha()
 
 void Surface::toGreyScale(float redScale, float greenScale, float blueScale, float alphaScale)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1116,7 +1139,7 @@ void Surface::toGreyScale(float redScale, float greenScale, float blueScale, flo
 // Draw colored border.
 void Surface::setBorder(float r, float g, float b, float a)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1158,7 +1181,7 @@ void Surface::setBorder(float r, float g, float b, float a)
 // Fill image with the given color.
 void Surface::fill(float red, float green, float blue, float alpha)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1181,7 +1204,7 @@ void Surface::fill(float red, float green, float blue, float alpha)
 
 void Surface::scaleAlphaToCoverage(float coverage, float alphaRef/*= 0.5f*/)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1220,7 +1243,7 @@ void Surface::scaleAlphaToCoverage(float coverage, float alphaRef/*= 0.5f*/)
 // Once you have M quantized, you would compute the corresponding RGB and quantize that.
 void Surface::toRGBM(float range/*= 1*/, float threshold/*= 0.25*/)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1288,7 +1311,7 @@ void Surface::toRGBM(float range/*= 1*/, float threshold/*= 0.25*/)
 
 void Surface::fromRGBM(float range/*= 1*/)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1557,7 +1580,7 @@ void Surface::fromRGBE(int mantissaBits, int exponentBits)
 // Y is in the [0, 1] range, while CoCg are in the [-1, 1] range.
 void Surface::toYCoCg()
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1594,7 +1617,7 @@ void Surface::toYCoCg()
 // and minimize bilinear interpolation artifacts.
 void Surface::blockScaleCoCg(int bits/*= 5*/, float threshold/*= 0.0*/)
 {
-    if (m->image == NULL || m->image->depth() != 1) return;
+    if (isNull() || depth() != 1) return;
 
     detach();
 
@@ -1652,7 +1675,7 @@ void Surface::blockScaleCoCg(int bits/*= 5*/, float threshold/*= 0.0*/)
 
 void Surface::fromYCoCg()
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1685,7 +1708,7 @@ void Surface::fromYCoCg()
 
 void Surface::toLUVW(float range/*= 1.0f*/)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1720,7 +1743,7 @@ void Surface::fromLUVW(float range/*= 1.0f*/)
 
 void Surface::abs(int channel)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1735,7 +1758,7 @@ void Surface::abs(int channel)
 
 void Surface::convolve(int channel, int kernelSize, float * kernelData)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1746,7 +1769,7 @@ void Surface::convolve(int channel, int kernelSize, float * kernelData)
 /*
 void Surface::blockLuminanceScale(float scale)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1821,7 +1844,7 @@ void Surface::blockLuminanceScale(float scale)
 /*
 void Surface::toJPEGLS()
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1844,7 +1867,7 @@ void Surface::toJPEGLS()
 
 void Surface::fromJPEGLS()
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1870,7 +1893,7 @@ void Surface::fromJPEGLS()
 // If dither is true, this uses Floyd-Steinberg dithering method.
 void Surface::binarize(int channel, float threshold, bool dither)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -1933,7 +1956,7 @@ void Surface::binarize(int channel, float threshold, bool dither)
 // When dither is true, this uses Floyd-Steinberg dithering.
 void Surface::quantize(int channel, int bits, bool exactEndPoints, bool dither)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -2004,7 +2027,7 @@ void Surface::quantize(int channel, int bits, bool exactEndPoints, bool dither)
 // Set normal map options.
 void Surface::toNormalMap(float sm, float medium, float big, float large)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -2023,7 +2046,7 @@ void Surface::toNormalMap(float sm, float medium, float big, float large)
 
 void Surface::normalizeNormalMap()
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
     if (!m->isNormalMap) return;
 
     detach();
@@ -2033,7 +2056,7 @@ void Surface::normalizeNormalMap()
 
 void Surface::transformNormals(NormalTransform xform)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -2106,7 +2129,7 @@ void Surface::transformNormals(NormalTransform xform)
 
 void Surface::reconstructNormals(NormalTransform xform)
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -2155,7 +2178,7 @@ void Surface::reconstructNormals(NormalTransform xform)
 
 void Surface::toCleanNormalMap()
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -2174,14 +2197,14 @@ void Surface::toCleanNormalMap()
 
 // [-1,1] -> [ 0,1]
 void Surface::packNormals() {
-    if (m->image == NULL) return;
+    if (isNull()) return;
     detach();
     m->image->packNormals(0);
 }
 
 // [ 0,1] -> [-1,1]
 void Surface::expandNormals() {
-    if (m->image == NULL) return;
+    if (isNull()) return;
     detach();
     m->image->expandNormals(0);
 }
@@ -2189,7 +2212,7 @@ void Surface::expandNormals() {
 
 void Surface::flipX()
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -2198,7 +2221,7 @@ void Surface::flipX()
 
 void Surface::flipY()
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -2207,7 +2230,7 @@ void Surface::flipY()
 
 void Surface::flipZ()
 {
-    if (m->image == NULL) return;
+    if (isNull()) return;
 
     detach();
 
@@ -2233,6 +2256,8 @@ bool Surface::copyChannel(const Surface & srcImage, int srcChannel, int dstChann
 
     detach();
 
+    dst = m->image;
+
     memcpy(dst->channel(dstChannel), src->channel(srcChannel), dst->pixelCount()*sizeof(float));
 
     return true;
@@ -2252,6 +2277,8 @@ bool Surface::addChannel(const Surface & srcImage, int srcChannel, int dstChanne
 
     detach();
 
+    dst = m->image;
+
     const uint w = src->width();
     const uint h = src->height();
 
diff --git a/src/nvtt/nvtt.h b/src/nvtt/nvtt.h
index c71a41e..4f2b068 100644
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@@ -350,6 +350,7 @@ namespace nvtt
         NVTT_API void reset();
 
         NVTT_API void setFileName(const char * fileName);
+        NVTT_API void setFileHandle(void * fp);
 
         NVTT_API void setOutputHandler(OutputHandler * outputHandler);
         NVTT_API void setErrorHandler(ErrorHandler * errorHandler);
@@ -464,6 +465,7 @@ namespace nvtt
         NVTT_API void toLinear(float gamma);
         NVTT_API void toGamma(float gamma);
         NVTT_API void toSrgb();
+        NVTT_API void toLinearFromSrgb();
         NVTT_API void toXenonSrgb();
         NVTT_API void transform(const float w0[4], const float w1[4], const float w2[4], const float w3[4], const float offset[4]);
         NVTT_API void swizzle(int r, int g, int b, int a);
@@ -564,6 +566,8 @@ namespace nvtt
 
         // @@ Add edge fixup methods.
 
+        NVTT_API float average(int channel) const;
+
         // Filtering.
         NVTT_API CubeSurface irradianceFilter(int size) const;
         NVTT_API CubeSurface cosinePowerFilter(int size, float cosinePower) const;