Import all sources from perforce.

2007-04-17 08:49:19 +00:00
commit 7543dd1efa
197 changed files with 49819 additions and 0 deletions
--- a/src/nvimage/CMakeLists.txt
+++ b/src/nvimage/CMakeLists.txt
@ -0,0 +1,54 @@
+# Build script for the nvimage library target.
+PROJECT(nvimage)
+
+# Also process the nvtt subdirectory.
+SUBDIRS(nvtt)
+
+# Headers and sources compiled into the nvimage target.
+SET(IMAGE_SRCS	
+	nvimage.h
+	FloatImage.h
+	FloatImage.cpp
+	Filter.h
+	Filter.cpp
+	Image.h
+	Image.cpp
+	ImageIO.h
+	ImageIO.cpp
+	ColorBlock.h
+	ColorBlock.cpp
+	HoleFilling.h
+	HoleFilling.cpp
+	DirectDrawSurface.h
+	DirectDrawSurface.cpp
+	Quantize.h
+	Quantize.cpp
+	NormalMap.h
+	NormalMap.cpp)
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# Optional image libraries: each one is appended to LIBS and its include
+# directory added only when its *_FOUND variable is set (set outside this file).
+IF(PNG_FOUND)
+	SET(LIBS ${LIBS} ${PNG_LIBRARIES})
+	INCLUDE_DIRECTORIES(${PNG_INCLUDE_DIR})
+ENDIF(PNG_FOUND)
+
+IF(JPEG_FOUND)
+	SET(LIBS ${LIBS} ${JPEG_LIBRARIES})
+	INCLUDE_DIRECTORIES(${JPEG_INCLUDE_DIR})
+ENDIF(JPEG_FOUND)
+
+IF(TIFF_FOUND)
+	SET(LIBS ${LIBS} ${TIFF_LIBRARIES})
+	INCLUDE_DIRECTORIES(${TIFF_INCLUDE_DIR})
+ENDIF(TIFF_FOUND)
+
+# targets
+# NVIMAGE_EXPORTS is defined for the sources compiled from this directory.
+ADD_DEFINITIONS(-DNVIMAGE_EXPORTS)
+
+# Force a shared library when NVIMAGE_SHARED is set; otherwise let CMake
+# pick its default library type.
+IF(NVIMAGE_SHARED)	
+	ADD_LIBRARY(nvimage SHARED ${IMAGE_SRCS})
+ELSE(NVIMAGE_SHARED)
+	ADD_LIBRARY(nvimage ${IMAGE_SRCS})
+ENDIF(NVIMAGE_SHARED)
+
+# Link the optional image libraries collected above plus the sibling libraries.
+TARGET_LINK_LIBRARIES(nvimage ${LIBS} nvcore nvmath posh)
+
+
--- a/src/nvimage/ColorBlock.cpp
+++ b/src/nvimage/ColorBlock.cpp
@ -0,0 +1,392 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvmath/Box.h>
+#include <nvimage/ColorBlock.h>
+#include <nvimage/Image.h>
+
+using namespace nv;
+
+namespace {
+
+	// Cheap luminance estimate: the plain sum of the red, green and blue channels.
+	inline static uint colorLuminance(Color32 c)
+	{
+		const uint channelSum = c.r + c.g + c.b;
+		return channelSum;
+	}
+
+	// Squared euclidean distance between two colors over r, g and b (alpha is ignored).
+	inline static uint colorDistance(Color32 c0, Color32 c1)
+	{
+		const int dr = c0.r - c1.r;
+		const int dg = c0.g - c1.g;
+		const int db = c0.b - c1.b;
+		return dr * dr + dg * dg + db * db;
+	}
+
+} // namespace
+
+
+/// Default constructor.
+/// The body is empty: no texel is assigned here, so callers are expected to
+/// fill the block (for example through init()) before reading it.
+ColorBlock::ColorBlock()
+{
+}
+
+/// Copy constructor: duplicates all 16 texels of the given block.
+ColorBlock::ColorBlock(const ColorBlock & block)
+{
+	uint idx = 0;
+	while (idx < 16) {
+		m_color[idx] = block.m_color[idx];
+		idx++;
+	}
+}
+
+
+/// Initialize this color block from an image.
+/// Simply forwards to init(img, x, y); see that method for the meaning of
+/// the arguments.
+ColorBlock::ColorBlock(const Image * img, uint x, uint y)
+{
+	init(img, x, y);
+}
+
+/// Fill the block with the 4x4 texels of @a img whose top-left corner is (x, y).
+/// When fewer than 4 texels remain along an axis, the available ones are
+/// repeated cyclically to fill the block.
+void ColorBlock::init(const Image * img, uint x, uint y)
+{
+	nvDebugCheck(img != NULL);
+
+	// Number of texels actually available along each axis, at most 4.
+	const uint bw = min(img->width() - x, 4U);
+	const uint bh = min(img->height() - y, 4U);
+
+	nvDebugCheck(bw != 0);
+	nvDebugCheck(bh != 0);
+
+	// wrap[(n - 1) * 4 + k] holds k % n for n in 1..4 and k in 0..3.
+	int wrap[] = {
+		0, 0, 0, 0,
+		0, 1, 0, 1,
+		0, 1, 2, 0,
+		0, 1, 2, 3,
+	};
+
+	// Blocks that are smaller than 4x4 are handled by repeating the pixels.
+	// @@ Original author's note: that is only correct when the block size is
+	// 1, 2 or 4, but not with 3.
+	const int * const rowWrap = wrap + (bh - 1) * 4;
+	const int * const colWrap = wrap + (bw - 1) * 4;
+
+	for (uint row = 0; row < 4; row++) {
+		const int srcY = rowWrap[row];
+		for (uint col = 0; col < 4; col++) {
+			const int srcX = colWrap[col];
+			color(col, row) = img->pixel(x + srcX, y + srcY);
+		}
+	}
+}
+
+
+/// Rewrite every texel as Color32(0, r, 0, g), where r and g are the texel's
+/// previous red and green channels.
+/// NOTE(review): presumably Color32's constructor takes (r, g, b, a) — confirm.
+void ColorBlock::swizzleDXT5n()
+{
+	for (Color32 * texel = m_color; texel != m_color + 16; ++texel)
+	{
+		const Color32 previous = *texel;
+		*texel = Color32(0, previous.r, 0, previous.g);
+	}
+}
+
+/// Replicate each texel's red channel into all four of its channels.
+void ColorBlock::splatX()
+{
+	for (Color32 * texel = m_color; texel != m_color + 16; ++texel)
+	{
+		const uint8 value = texel->r;
+		*texel = Color32(value, value, value, value);
+	}
+}
+
+/// Replicate each texel's green channel into all four of its channels.
+void ColorBlock::splatY()
+{
+	for (Color32 * texel = m_color; texel != m_color + 16; ++texel)
+	{
+		const uint8 value = texel->g;
+		*texel = Color32(value, value, value, value);
+	}
+}
+
+
+/// Count number of unique colors in this color block.
+/// A color is counted the first time it appears; a texel that repeats any
+/// earlier texel is not counted again.
+/// @return Number of distinct colors among the 16 texels (1..16).
+uint ColorBlock::countUniqueColors() const
+{
+	uint count = 0;
+
+	// @@ This does not have to be o(n^2)
+	for(int i = 0; i < 16; i++)
+	{
+		bool unique = true;
+		for(int j = 0; j < i; j++) {
+			// Texel i is a duplicate when it EQUALS an earlier texel. The test
+			// is spelled with operator!= because that is the comparison this
+			// file already relies on.
+			if( !(m_color[i] != m_color[j]) ) {
+				unique = false;
+				break;
+			}
+		}
+		
+		if( unique ) {
+			count++;
+		}
+	}
+	
+	return count;
+}
+
+/// Get average color of the block.
+/// Each channel (including alpha) is summed over the 16 texels and divided
+/// by 16 with integer division.
+Color32 ColorBlock::averageColor() const
+{
+	uint sumR = 0;
+	uint sumG = 0;
+	uint sumB = 0;
+	uint sumA = 0;
+
+	for (const Color32 * texel = m_color; texel != m_color + 16; ++texel) {
+		sumR += texel->r;
+		sumG += texel->g;
+		sumB += texel->b;
+		sumA += texel->a;
+	}
+
+	const uint8 avgR = uint8(sumR / 16);
+	const uint8 avgG = uint8(sumG / 16);
+	const uint8 avgB = uint8(sumB / 16);
+	const uint8 avgA = uint8(sumA / 16);
+	return Color32(avgR, avgG, avgB, avgA);
+}
+
+
+/// Get diameter color range.
+/// Finds the pair of texels with the largest RGB distance (colorDistance)
+/// and returns them through @a start and @a end.
+/// If every pair has distance 0 (all texels share the same RGB), both outputs
+/// are set to the first texel.
+/// @param start Receives the first color of the farthest pair; must not be NULL.
+/// @param end Receives the second color of the farthest pair; must not be NULL.
+void ColorBlock::diameterRange(Color32 * start, Color32 * end) const
+{
+	nvDebugCheck(start != NULL);
+	nvDebugCheck(end != NULL);
+	
+	// Seed with a valid texel: the loop below only assigns when a strictly
+	// positive distance is found, which never happens for a uniform block.
+	Color32 c0 = m_color[0];
+	Color32 c1 = m_color[0];
+	uint best_dist = 0;
+	
+	for(int i = 0; i < 16; i++) {
+		for (int j = i+1; j < 16; j++) {
+			uint dist = colorDistance(m_color[i], m_color[j]);
+			if( dist > best_dist ) {
+				best_dist = dist;
+				c0 = m_color[i];
+				c1 = m_color[j];
+			}
+		}
+	}
+	
+	*start = c0;
+	*end = c1;
+}
+
+/// Get luminance color range.
+/// Returns the texels with the lowest and highest approximate luminance
+/// (colorLuminance, i.e. r + g + b).
+/// @param start Receives the darkest texel; must not be NULL.
+/// @param end Receives the brightest texel; must not be NULL.
+void ColorBlock::luminanceRange(Color32 * start, Color32 * end) const
+{
+	nvDebugCheck(start != NULL);
+	nvDebugCheck(end != NULL);
+	
+	// Seed both extremes with texel 0 so they stay valid when texel 0 is
+	// itself the darkest and/or the brightest one (the loop only assigns on
+	// a strict improvement).
+	Color32 minColor = m_color[0];
+	Color32 maxColor = m_color[0];
+	uint minLuminance, maxLuminance;
+	
+	maxLuminance = minLuminance = colorLuminance(m_color[0]);
+	
+	for(uint i = 1; i < 16; i++)
+	{
+		uint luminance = colorLuminance(m_color[i]);
+		
+		if (luminance > maxLuminance) {
+			maxLuminance = luminance;
+			maxColor = m_color[i];
+		}
+		else if (luminance < minLuminance) {
+			minLuminance = luminance;
+			minColor = m_color[i];
+		}
+	}
+
+	*start = minColor;
+	*end = maxColor;
+}
+
+/// Get color range based on the bounding box.
+/// Computes the per-channel minimum and maximum of r, g and b over the 16
+/// texels, then moves both ends inwards by 1/16 of each channel's extent.
+/// Alpha keeps whatever the three-argument Color32 constructor gives it.
+void ColorBlock::boundsRange(Color32 * start, Color32 * end) const
+{
+	nvDebugCheck(start != NULL);
+	nvDebugCheck(end != NULL);
+
+	Color32 minColor(255, 255, 255);
+	Color32 maxColor(0, 0, 0);
+
+	for (const Color32 * texel = m_color; texel != m_color + 16; ++texel)
+	{
+		if (texel->r < minColor.r) { minColor.r = texel->r; }
+		if (texel->r > maxColor.r) { maxColor.r = texel->r; }
+
+		if (texel->g < minColor.g) { minColor.g = texel->g; }
+		if (texel->g > maxColor.g) { maxColor.g = texel->g; }
+
+		if (texel->b < minColor.b) { minColor.b = texel->b; }
+		if (texel->b > maxColor.b) { maxColor.b = texel->b; }
+	}
+
+	// Offset range by 1/16 of the extents. All insets are taken from the
+	// untouched bounds before either end is moved.
+	const int insetR = (maxColor.r - minColor.r) >> 4;
+	const int insetG = (maxColor.g - minColor.g) >> 4;
+	const int insetB = (maxColor.b - minColor.b) >> 4;
+
+	// Red.
+	minColor.r = (minColor.r + insetR <= 255) ? minColor.r + insetR : 255;
+	maxColor.r = (maxColor.r >= insetR) ? maxColor.r - insetR : 0;
+
+	// Green.
+	minColor.g = (minColor.g + insetG <= 255) ? minColor.g + insetG : 255;
+	maxColor.g = (maxColor.g >= insetG) ? maxColor.g - insetG : 0;
+
+	// Blue.
+	minColor.b = (minColor.b + insetB <= 255) ? minColor.b + insetB : 255;
+	maxColor.b = (maxColor.b >= insetB) ? maxColor.b - insetB : 0;
+
+	*start = minColor;
+	*end = maxColor;
+}
+
+/// Get color range based on the bounding box, including the alpha channel.
+/// Computes the per-channel minimum and maximum of r, g, b and a over the 16
+/// texels, then moves both ends inwards by 1/16 of each channel's extent.
+void ColorBlock::boundsRangeAlpha(Color32 * start, Color32 * end) const
+{
+	nvDebugCheck(start != NULL);
+	nvDebugCheck(end != NULL);
+
+	Color32 minColor(255, 255, 255, 255);
+	Color32 maxColor(0, 0, 0, 0);
+
+	for (const Color32 * texel = m_color; texel != m_color + 16; ++texel)
+	{
+		if (texel->r < minColor.r) { minColor.r = texel->r; }
+		if (texel->r > maxColor.r) { maxColor.r = texel->r; }
+
+		if (texel->g < minColor.g) { minColor.g = texel->g; }
+		if (texel->g > maxColor.g) { maxColor.g = texel->g; }
+
+		if (texel->b < minColor.b) { minColor.b = texel->b; }
+		if (texel->b > maxColor.b) { maxColor.b = texel->b; }
+
+		if (texel->a < minColor.a) { minColor.a = texel->a; }
+		if (texel->a > maxColor.a) { maxColor.a = texel->a; }
+	}
+
+	// Offset range by 1/16 of the extents. All insets are taken from the
+	// untouched bounds before either end is moved.
+	const int insetR = (maxColor.r - minColor.r) >> 4;
+	const int insetG = (maxColor.g - minColor.g) >> 4;
+	const int insetB = (maxColor.b - minColor.b) >> 4;
+	const int insetA = (maxColor.a - minColor.a) >> 4;
+
+	// Red.
+	minColor.r = (minColor.r + insetR <= 255) ? minColor.r + insetR : 255;
+	maxColor.r = (maxColor.r >= insetR) ? maxColor.r - insetR : 0;
+
+	// Green.
+	minColor.g = (minColor.g + insetG <= 255) ? minColor.g + insetG : 255;
+	maxColor.g = (maxColor.g >= insetG) ? maxColor.g - insetG : 0;
+
+	// Blue.
+	minColor.b = (minColor.b + insetB <= 255) ? minColor.b + insetB : 255;
+	maxColor.b = (maxColor.b >= insetB) ? maxColor.b - insetB : 0;
+
+	// Alpha.
+	minColor.a = (minColor.a + insetA <= 255) ? minColor.a + insetA : 255;
+	maxColor.a = (maxColor.a >= insetA) ? maxColor.a - insetA : 0;
+
+	*start = minColor;
+	*end = maxColor;
+}
+
+
+/// Get the color range along the direction of the block's best fit line.
+/// Delegates to bestFitLine() for the axis and to computeRange() for the
+/// extreme texels along it.
+void ColorBlock::bestFitRange(Color32 * start, Color32 * end) const
+{
+	nvDebugCheck(start != NULL);
+	nvDebugCheck(end != NULL);
+
+	const Line3 fitted = bestFitLine();
+	const Vector3 direction = fitted.direction();
+	computeRange(direction, start, end);
+}
+
+/// Sort colors by absolute value in their 16 bit representation.
+/// The texels end up in descending order of the `u` field of the Color16
+/// built from each texel.
+void ColorBlock::sortColorsByAbsoluteValue()
+{
+	// Plain selection sort: for each slot, find the largest remaining key and
+	// swap that texel into place.
+	uint slot = 0;
+	while (slot < 16) {
+		uint largest = slot;
+		Color16 largestKey(m_color[slot]);
+
+		uint probe = slot + 1;
+		while (probe < 16) {
+			Color16 probeKey(m_color[probe]);
+			if( probeKey.u > largestKey.u ) {
+				largestKey = probeKey;
+				largest = probe;
+			}
+			probe++;
+		}
+
+		swap( m_color[slot], m_color[largest] );
+		slot++;
+	}
+}
+
+
+/// Find extreme colors in the given axis.
+/// Projects the RGB of every texel onto @a axis (dot product) and returns
+/// the texel with the smallest projection in @a start and the one with the
+/// largest projection in @a end.
+void ColorBlock::computeRange(Vector3::Arg axis, Color32 * start, Color32 * end) const
+{
+	nvDebugCheck(start != NULL);
+	nvDebugCheck(end != NULL);
+
+	// Texel 0 seeds both extremes.
+	const float first = dot(Vector3(m_color[0].r, m_color[0].g, m_color[0].b), axis);
+	float lowest = first;
+	float highest = first;
+	int lowestIndex = 0;
+	int highestIndex = 0;
+
+	for(uint i = 1; i < 16; i++)
+	{
+		const float projection = dot(Vector3(m_color[i].r, m_color[i].g, m_color[i].b), axis);
+
+		if( projection < lowest ) {
+			lowest = projection;
+			lowestIndex = i;
+		}
+		else if( projection > highest ) {
+			highest = projection;
+			highestIndex = i;
+		}
+	}
+
+	*start = m_color[lowestIndex];
+	*end = m_color[highestIndex];
+}
+
+
+/// Sort colors in the given axis.
+/// Texels are reordered in ascending order of the dot product between their
+/// RGB and @a axis.
+void ColorBlock::sortColors(const Vector3 & axis)
+{
+	// Sort key for every texel: its projection onto the axis.
+	float key[16];
+	for (uint i = 0; i < 16; i++) {
+		key[i] = dot(Vector3(m_color[i].r, m_color[i].g, m_color[i].b), axis);
+	}
+
+	// Selection sort; keys and texels are swapped together so they stay paired.
+	for (uint slot = 0; slot < 16; slot++) {
+		uint smallest = slot;
+		for (uint probe = slot + 1; probe < 16; probe++) {
+			if( key[probe] < key[smallest] ) {
+				smallest = probe;
+			}
+		}
+		swap( key[slot], key[smallest] );
+		swap( m_color[slot], m_color[smallest] );
+	}
+}
+
+
+/// Get least squares line that best approximates the points of the color block.
+/// The RGB of each texel is treated as a 3D point and the set is handed to
+/// Fit::bestLine.
+Line3 ColorBlock::bestFitLine() const
+{
+	// NOTE(review): presumably Array(16) only reserves capacity, so the 16
+	// appends below are the only elements — confirm against nvcore's Array.
+	Array<Vector3> pointArray(16);
+
+	for (const Color32 * texel = m_color; texel != m_color + 16; ++texel) {
+		pointArray.append(Vector3(texel->r, texel->g, texel->b));
+	}
+
+	return Fit::bestLine(pointArray);
+}
+
+
+/// Get the volume of the color block.
+/// This is the volume of the box bounding the texels' RGB values, as
+/// reported by Box::volume().
+float ColorBlock::volume() const
+{
+	Box bounds;
+	bounds.clearBounds();
+
+	for (const Color32 * texel = m_color; texel != m_color + 16; ++texel) {
+		bounds.addPointToBounds(Vector3(texel->r, texel->g, texel->b));
+	}
+
+	return bounds.volume();
+}
+
+
--- a/src/nvimage/ColorBlock.h
+++ b/src/nvimage/ColorBlock.h
@ -0,0 +1,96 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_COLORBLOCK_H
+#define NV_IMAGE_COLORBLOCK_H
+
+#include <nvmath/Color.h>
+#include <nvmath/Fitting.h>	// Line3
+
+namespace nv
+{
+	class Image;
+
+	/// Uncompressed 4x4 color block.
+	/// Holds 16 Color32 texels; the (x, y) accessors address them as y * 4 + x.
+	struct ColorBlock
+	{
+		// Construction: empty, copy of another block, or filled from an image
+		// region starting at (x, y).
+		ColorBlock();
+		ColorBlock(const ColorBlock & block);
+		ColorBlock(const Image * img, uint x, uint y);
+		
+		// Fill the block from the image region starting at (x, y).
+		void init(const Image * img, uint x, uint y);
+		
+		// In-place channel rearrangements applied to every texel.
+		void swizzleDXT5n();
+		void splatX();
+		void splatY();
+		
+		// Block statistics.
+		uint countUniqueColors() const;
+		Color32 averageColor() const;
+		
+		// Color range estimators: each writes a pair of colors through the
+		// start/end pointers, which must not be NULL.
+		void diameterRange(Color32 * start, Color32 * end) const;
+		void luminanceRange(Color32 * start, Color32 * end) const;
+		void boundsRange(Color32 * start, Color32 * end) const;
+		void boundsRangeAlpha(Color32 * start, Color32 * end) const;
+		void bestFitRange(Color32 * start, Color32 * end) const;
+		
+		// Reorders the texels in place.
+		void sortColorsByAbsoluteValue();
+		
+		// Axis based helpers: extreme texels along an axis, and in-place sort
+		// of the texels along an axis.
+		void computeRange(const Vector3 & axis, Color32 * start, Color32 * end) const;
+		void sortColors(const Vector3 & axis);
+		
+		// Geometric queries on the texels' RGB values.
+		Line3 bestFitLine() const;
+		float volume() const;
+		// NOTE(review): no definition of diameterLine() is visible in
+		// ColorBlock.cpp — confirm it is implemented somewhere.
+		Line3 diameterLine() const;
+		
+		// Accessors
+		const Color32 * colors() const;
+
+		// Linear access, i in [0, 16).
+		Color32 color(uint i) const;
+		Color32 & color(uint i);
+		
+		// 2D access, x and y in [0, 4).
+		Color32 color(uint x, uint y) const;
+		Color32 & color(uint x, uint y);
+		
+	private:
+		
+		// Texel storage, row by row (index = y * 4 + x).
+		Color32 m_color[4*4];
+		
+	};
+	
+
+	/// Get pointer to block colors.
+	/// The pointer refers to the block's internal array of 16 texels.
+	inline const Color32 * ColorBlock::colors() const
+	{
+		return m_color;
+	}
+	
+	/// Get block color by linear index (returned by value).
+	inline Color32 ColorBlock::color(uint i) const
+	{
+		nvDebugCheck(i < 16);
+		return m_color[i];
+	}
+	
+	/// Get a writable reference to the block color at linear index i.
+	inline Color32 & ColorBlock::color(uint i)
+	{
+		nvDebugCheck(i < 16);
+		return m_color[i];
+	}
+	
+	/// Get block color at column x, row y (returned by value).
+	inline Color32 ColorBlock::color(uint x, uint y) const
+	{
+		nvDebugCheck(x < 4 && y < 4);
+		return m_color[y * 4 + x];
+	}
+	
+	/// Get a writable reference to the block color at column x, row y.
+	inline Color32 & ColorBlock::color(uint x, uint y)
+	{
+		nvDebugCheck(x < 4 && y < 4);
+		return m_color[y * 4 + x];
+	}
+	
+} // nv namespace
+
+#endif // NV_IMAGE_COLORBLOCK_H
--- a/src/nvimage/DirectDrawSurface.cpp
+++ b/src/nvimage/DirectDrawSurface.cpp
@ -0,0 +1,258 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Debug.h>
+
+#include <nvimage/DirectDrawSurface.h>
+
+#include <string.h> // memset
+
+
+using namespace nv;
+
+// Pack four characters into a 32 bit code, ch0 in the lowest byte and ch3 in
+// the highest. Only defined here when no MAKEFOURCC macro already exists.
+#if !defined(MAKEFOURCC)
+#	define MAKEFOURCC(ch0, ch1, ch2, ch3) \
+		(uint(uint8(ch0)) | (uint(uint8(ch1)) << 8) | \
+		(uint(uint8(ch2)) << 16) | (uint(uint8(ch3)) << 24 ))
+#endif
+
+// File-local constants used to fill in the DDS header.
+namespace
+{
+	// Four character codes: the file magic and the pixel format identifiers.
+	static const uint FOURCC_DDS = MAKEFOURCC('D', 'D', 'S', ' ');
+	static const uint FOURCC_DXT1 = MAKEFOURCC('D', 'X', 'T', '1');
+	static const uint FOURCC_DXT2 = MAKEFOURCC('D', 'X', 'T', '2');
+	static const uint FOURCC_DXT3 = MAKEFOURCC('D', 'X', 'T', '3');
+	static const uint FOURCC_DXT4 = MAKEFOURCC('D', 'X', 'T', '4');
+	static const uint FOURCC_DXT5 = MAKEFOURCC('D', 'X', 'T', '5');
+	static const uint FOURCC_RXGB = MAKEFOURCC('R', 'X', 'G', 'B');
+	static const uint FOURCC_ATI1 = MAKEFOURCC('A', 'T', 'I', '1');
+	static const uint FOURCC_ATI2 = MAKEFOURCC('A', 'T', 'I', '2');
+
+	// DDSHeader::flags bits: which header fields are in use.
+	static const uint DDSD_CAPS = 0x00000001U;
+	static const uint DDSD_PIXELFORMAT = 0x00001000U;
+	static const uint DDSD_WIDTH = 0x00000004U;
+	static const uint DDSD_HEIGHT = 0x00000002U;
+	static const uint DDSD_PITCH = 0x00000008U;
+	static const uint DDSD_MIPMAPCOUNT = 0x00020000U;
+	static const uint DDSD_LINEARSIZE = 0x00080000U;
+	static const uint DDSD_DEPTH = 0x00800000U;
+		
+	// DDSCaps::caps1 / caps2 bits.
+	static const uint DDSCAPS_COMPLEX = 0x00000008U;
+	static const uint DDSCAPS_TEXTURE = 0x00001000U;
+	static const uint DDSCAPS_MIPMAP = 0x00400000U;
+	static const uint DDSCAPS2_VOLUME = 0x00200000U;
+	static const uint DDSCAPS2_CUBEMAP = 0x00000200U;
+
+	// One caps2 bit per cube map face.
+	static const uint DDSCAPS2_CUBEMAP_POSITIVEX = 0x00000400U;
+	static const uint DDSCAPS2_CUBEMAP_NEGATIVEX = 0x00000800U;
+	static const uint DDSCAPS2_CUBEMAP_POSITIVEY = 0x00001000U;
+	static const uint DDSCAPS2_CUBEMAP_NEGATIVEY = 0x00002000U;
+	static const uint DDSCAPS2_CUBEMAP_POSITIVEZ = 0x00004000U;
+	static const uint DDSCAPS2_CUBEMAP_NEGATIVEZ = 0x00008000U;
+	// OR of the six face bits above. (The previous value 0x0000F000 left out
+	// the POSITIVEX and NEGATIVEX faces.)
+	static const uint DDSCAPS2_CUBEMAP_ALL_FACES = 0x0000FC00U;
+
+	// DDSPixelFormat::flags bits.
+	static const uint DDPF_RGB = 0x00000040U;
+	static const uint DDPF_FOURCC = 0x00000004U;
+	static const uint DDPF_ALPHAPIXELS = 0x00000001U;
+}
+
+/// Build a default header: DDS magic, structure sizes 124 and 32, the CAPS
+/// and PIXELFORMAT flags, the TEXTURE capability, and every other field zero
+/// except for the version stamp kept in the reserved words.
+DDSHeader::DDSHeader()
+{
+	fourcc = FOURCC_DDS;
+	size = 124;
+	flags = (DDSD_CAPS|DDSD_PIXELFORMAT);
+	height = width = 0;
+	pitch = depth = 0;
+	mipmapcount = 0;
+
+	// Store version information on the reserved header attributes.
+	memset(reserved, 0, sizeof(reserved));
+	reserved[9] = MAKEFOURCC('N', 'V', 'T', 'T');
+	reserved[10] = (0 << 16) | (1 << 8) | (0);	// major.minor.revision
+
+	// Pixel format: nothing selected yet.
+	pf.size = 32;
+	pf.flags = pf.fourcc = pf.bitcount = 0;
+	pf.rmask = pf.gmask = pf.bmask = pf.amask = 0;
+
+	caps.caps1 = DDSCAPS_TEXTURE;
+	caps.caps2 = caps.caps3 = caps.caps4 = 0;
+	notused = 0;
+}
+
+/// Store the surface width and mark the width field as valid in the flags.
+void DDSHeader::setWidth(uint w)
+{
+	width = w;
+	flags |= DDSD_WIDTH;
+}
+
+/// Store the surface height and mark the height field as valid in the flags.
+void DDSHeader::setHeight(uint h)
+{
+	height = h;
+	flags |= DDSD_HEIGHT;
+}
+
+/// Store the volume depth and mark the depth field as valid in the flags.
+/// @param d Depth to record in the header.
+void DDSHeader::setDepth(uint d)
+{
+	this->flags |= DDSD_DEPTH;
+	// Write the depth field; the previous code stored d into height, which
+	// clobbered the height and left depth at 0.
+	this->depth = d;
+}
+
+/// Record the number of mipmaps.
+/// A non-zero count sets the MIPMAPCOUNT flag and adds the COMPLEX and
+/// MIPMAP capabilities. A zero count clears the flag and resets caps1 to
+/// TEXTURE, plus COMPLEX when caps2 is non-zero.
+void DDSHeader::setMipmapCount(uint count)
+{
+	this->mipmapcount = count;
+
+	if (count != 0)
+	{
+		this->flags |= DDSD_MIPMAPCOUNT;
+		this->caps.caps1 |= DDSCAPS_COMPLEX | DDSCAPS_MIPMAP;
+		return;
+	}
+
+	this->flags &= ~DDSD_MIPMAPCOUNT;
+	this->caps.caps1 = (this->caps.caps2 == 0)
+		? DDSCAPS_TEXTURE
+		: (DDSCAPS_TEXTURE | DDSCAPS_COMPLEX);
+}
+
+/// Mark the header as describing a 2D texture.
+/// Intentionally a no-op: no header field is modified.
+void DDSHeader::setTexture2D()
+{
+	// nothing to do here.
+}
+
+/// Mark the header as describing a volume texture.
+/// Overwrites caps2 with the VOLUME capability (any previous caps2 bits are lost).
+void DDSHeader::setTexture3D()
+{
+	this->caps.caps2 = DDSCAPS2_VOLUME;
+}
+
+/// Mark the header as describing a cube map.
+/// Overwrites caps2 with CUBEMAP plus the ALL_FACES mask and adds the
+/// COMPLEX capability to caps1.
+void DDSHeader::setTextureCube()
+{
+	caps.caps2 = DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_ALL_FACES;
+	caps.caps1 |= DDSCAPS_COMPLEX;
+}
+
+/// Store a linear size in the shared pitch field.
+/// The LINEARSIZE flag is set and the PITCH flag cleared, so the two
+/// interpretations of the field are mutually exclusive.
+void DDSHeader::setLinearSize(uint size)
+{
+	this->pitch = size;
+	this->flags = (this->flags & ~DDSD_PITCH) | DDSD_LINEARSIZE;
+}
+
+/// Store a pitch in the shared pitch field.
+/// The PITCH flag is set and the LINEARSIZE flag cleared, so the two
+/// interpretations of the field are mutually exclusive.
+void DDSHeader::setPitch(uint pitch)
+{
+	this->pitch = pitch;
+	this->flags = (this->flags & ~DDSD_LINEARSIZE) | DDSD_PITCH;
+}
+
+/// Select a four character code pixel format.
+/// The pixel format flags become FOURCC only, the code is packed from the
+/// four characters, and the bit count and channel masks are zeroed.
+void DDSHeader::setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3)
+{
+	pf.fourcc = MAKEFOURCC(c0, c1, c2, c3);
+	pf.flags = DDPF_FOURCC;
+
+	// A fourcc format carries no explicit bit layout.
+	pf.bitcount = 0;
+	pf.rmask = pf.gmask = pf.bmask = pf.amask = 0;
+}
+
+/// Select an uncompressed pixel format described by channel bit masks.
+/// The masks must not overlap. The flags become RGB, plus ALPHAPIXELS when
+/// @a amask is non-zero. When @a bitcount is 0 it is derived from the masks
+/// as the position of the highest set bit plus one.
+void DDSHeader::setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask)
+{
+	// Make sure the masks are correct.
+	nvCheck((rmask & gmask) == 0);
+	nvCheck((rmask & bmask) == 0);
+	nvCheck((rmask & amask) == 0);
+	nvCheck((gmask & bmask) == 0);
+	nvCheck((gmask & amask) == 0);
+	nvCheck((bmask & amask) == 0);
+
+	if (bitcount == 0)
+	{
+		// Compute bit count from the masks.
+		for (uint remaining = rmask | gmask | bmask | amask; remaining != 0; remaining >>= 1) {
+			bitcount++;
+		}
+		// @@ Align to 8?
+	}
+
+	this->pf.flags = (amask != 0) ? (DDPF_RGB | DDPF_ALPHAPIXELS) : DDPF_RGB;
+	this->pf.fourcc = 0;
+	this->pf.bitcount = bitcount;
+
+	this->pf.rmask = rmask;
+	this->pf.gmask = gmask;
+	this->pf.bmask = bmask;
+	this->pf.amask = amask;
+}
+
+
+/// Pass every 32 bit field of the header through POSH_LittleU32.
+/// NOTE(review): presumably this converts between host and little-endian
+/// byte order — confirm against posh.
+void DDSHeader::swapBytes()
+{
+	// Scalar fields, in declaration order; the reserved array is handled below.
+	uint * const scalars[] = {
+		&this->fourcc, &this->size, &this->flags, &this->height,
+		&this->width, &this->pitch, &this->depth, &this->mipmapcount,
+		&this->pf.size, &this->pf.flags, &this->pf.fourcc, &this->pf.bitcount,
+		&this->pf.rmask, &this->pf.gmask, &this->pf.bmask, &this->pf.amask,
+		&this->caps.caps1, &this->caps.caps2, &this->caps.caps3, &this->caps.caps4,
+		&this->notused,
+	};
+	const int scalarCount = int(sizeof(scalars) / sizeof(scalars[0]));
+
+	for (int s = 0; s < scalarCount; s++) {
+		*scalars[s] = POSH_LittleU32(*scalars[s]);
+	}
+
+	for (int r = 0; r < 11; r++) {
+		this->reserved[r] = POSH_LittleU32(this->reserved[r]);
+	}
+}
+
--- a/src/nvimage/DirectDrawSurface.h
+++ b/src/nvimage/DirectDrawSurface.h
@ -0,0 +1,85 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_IMAGE_DIRECTDRAWSURFACE_H
+#define NV_IMAGE_DIRECTDRAWSURFACE_H
+
+#include <nvcore/nvcore.h>
+
+namespace nv
+{
+
+	/// Pixel format section of the DDS header (eight 32 bit fields).
+	struct DDSPixelFormat {
+		uint size;		// DDSHeader's constructor sets this to 32.
+		uint flags;
+		uint fourcc;
+		uint bitcount;
+		uint rmask;
+		uint gmask;
+		uint bmask;
+		uint amask;
+	};
+
+	/// Capability section of the DDS header (four 32 bit fields).
+	struct DDSCaps {
+		uint caps1;
+		uint caps2;
+		uint caps3;
+		uint caps4;
+	};
+
+	/// DDS file header.
+	/// Every data member is a 32 bit field; the helper methods keep the flag
+	/// and capability bits consistent with the values they store.
+	struct DDSHeader {
+		uint fourcc;	// File magic; the constructor stores 'DDS '.
+		uint size;		// The constructor sets this to 124.
+		uint flags;
+		uint height;
+		uint width;
+		uint pitch;		// Shared by setPitch() and setLinearSize().
+		uint depth;
+		uint mipmapcount;
+		uint reserved[11];	// The constructor stores a version stamp in [9] and [10].
+		DDSPixelFormat pf;
+		DDSCaps caps;
+		uint notused;
+
+		// Helper methods.
+		DDSHeader();
+		void setWidth(uint w);
+		void setHeight(uint h);
+		void setDepth(uint d);
+		void setMipmapCount(uint count);
+		// setLinearSize() and setPitch() both write the pitch field and
+		// toggle which of the two flags is set.
+		void setLinearSize(uint size);
+		void setPitch(uint pitch);
+		// setFourCC() and setPixelFormat() each overwrite the whole pixel format.
+		void setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3);
+		void setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask);
+		void setTexture2D();
+		void setTexture3D();
+		void setTextureCube();
+		
+		// Passes every field through POSH_LittleU32.
+		void swapBytes();
+	};
+
+
+} // nv namespace
+
+#endif // NV_IMAGE_DIRECTDRAWSURFACE_H
--- a/src/nvimage/Filter.cpp
+++ b/src/nvimage/Filter.cpp
@ -0,0 +1,572 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+/** @file Filter.cpp
+ * @brief Image filters.
+ *
+ * Jonathan Blow articles:
+ * http://number-none.com/product/Mipmapping, Part 1/index.html
+ * http://number-none.com/product/Mipmapping, Part 2/index.html
+ *
+ * References from Thatcher Ulrich:
+ * See _Graphics Gems III_ "General Filtered Image Rescaling", Dale A. Schumacher
+ *
+ * References from Paul Heckbert:
+ * A.V. Oppenheim, R.W. Schafer, Digital Signal Processing, Prentice-Hall, 1975
+ *
+ * R.W. Hamming, Digital Filters, Prentice-Hall, Englewood Cliffs, NJ, 1983
+ *
+ * W.K. Pratt, Digital Image Processing, John Wiley and Sons, 1978
+ *
+ * H.S. Hou, H.C. Andrews, "Cubic Splines for Image Interpolation and
+ *	Digital Filtering", IEEE Trans. Acoustics, Speech, and Signal Proc.,
+ *	vol. ASSP-26, no. 6, Dec. 1978, pp. 508-517
+ *
+ * Paul Heckbert's zoom library.
+ * http://www.xmission.com/~legalize/zoom.html
+ * 
+ * Reconstruction Filters in Computer Graphics
+ * http://www.mentallandscape.com/Papers_siggraph88.pdf 
+ *
+ */
+
+
+#include <nvcore/Containers.h>	// swap
+#include <nvmath/nvmath.h>	// fabs
+#include <nvmath/Vector.h>	// Vector4
+#include <nvimage/Filter.h>
+
+using namespace nv;
+
+namespace
+{
+
+// Box filter: 1 inside the closed interval [-0.5, 0.5], 0 elsewhere.
+// support = 0.5
+inline static float filter_box(float x)
+{
+    const bool inside = (x >= -0.5f) && (x <= 0.5);
+    return inside ? 1.0f : 0.0f;
+}
+
+// Triangle (tent) filter: 1 - |x| between -1 and 1, 0 elsewhere.
+// support = 1.0
+inline static float filter_triangle(float x)
+{
+    // Outside the support (or not comparable, e.g. NaN): no contribution.
+    if( x < -1.0f || !(x < 1.0f) ) return 0.0f;
+    return (x < 0.0f) ? (1.0f + x) : (1.0f - x);
+}
+
+// Piecewise quadratic filter, symmetric around 0:
+//   0.75 - x^2           for |x| < 0.5
+//   0.5 * (|x| - 1.5)^2  for 0.5 <= |x| < 1.5
+// support = 1.5
+inline static float filter_quadratic(float x)
+{
+    const float ax = (x < 0.0f) ? -x : x;
+
+    if( ax < 0.5f ) {
+        return 0.75f - ax * ax;
+    }
+    if( ax < 1.5f ) {
+        const float t = ax - 1.5f;
+        return 0.5f * t * t;
+    }
+    return 0.0f;
+}
+
+// @@ Filter from tulrich.
+// Cubic f(t) = 2|t|^3 - 3|t|^2 + 1 for -1 <= t <= 1, 0 elsewhere.
+// support 1.0
+inline static float filter_cubic(float x)
+{
+	const float t = (x < 0.0f) ? -x : x;
+	if( !(t < 1.0f) ) {
+		return 0.0f;
+	}
+	return (2.0f * t - 3.0f) * t * t + 1.0f;
+}
+
+
+// @@ Paul Heckbert calls this cubic instead of spline.
+// Piecewise cubic, symmetric around 0:
+//   (4 - 6 x^2 + 3 |x|^3) / 6  for |x| < 1
+//   (2 - |x|)^3 / 6            for 1 <= |x| < 2
+// support = 2.0
+inline static float filter_spline(float x)
+{
+    const float ax = (x < 0.0f) ? -x : x;
+
+    if( ax < 1.0f ) {
+        return (4.0f + ax * ax * (-6.0f + ax * 3.0f)) / 6.0f;
+    }
+    if( ax < 2.0f ) {
+        const float t = 2.0f - ax;
+        return t * t * t / 6.0f;
+    }
+    return 0.0f;
+}
+
+/// Sinc function: sin(x) / x, with 1 returned when |x| is below NV_EPSILON.
+/// The argument is used as given (no factor of pi is applied here).
+inline float sincf( const float x )
+{
+	if( !(fabs(x) < NV_EPSILON) ) {
+		return sin(x) / x;
+	}
+
+	// Close to zero the quotient degenerates to 0/0; return the limit.
+	//return 1.0f + x*x*(-1.0f/6.0f + x*x*1.0f/120.0f);
+	return 1.0 ;
+}
+
+// Lanczos filter with 3 lobes: sinc(pi x) * sinc(pi x / 3) for |x| < 3,
+// 0 elsewhere.
+// sincf() is the unnormalized sin(t) / t, so the factor of pi has to be
+// applied here; without it the window does not reach zero at |x| = 3 and the
+// zero crossings do not fall on the integers.
+// support = 3.0
+inline static float filter_lanczos3(float x)
+{
+	if( x < 0.0f ) x = -x;
+	if( x < 3.0f ) return(sincf(PI * x) * sincf(PI * x / 3.0f));
+	return 0.0f;
+}
+
+
+
+// Mitchell & Netravali's two-param cubic
+// see "Reconstruction Filters in Computer Graphics", SIGGRAPH 88
+// Evaluates one cubic polynomial for |x| < 1 and another for 1 <= |x| < 2;
+// both sets of coefficients depend only on b and c.
+// support = 2.0
+inline static float filter_mitchell(float x, float b, float c)
+{
+	const float ax = (x < 0.0f) ? -x : x;
+
+	// @@ Coefficients could be precomputed.
+	// @@ if b and c are fixed, these are constants.
+	if( ax < 1.0f ) {
+		// Inner piece: p0 + p2 x^2 + p3 x^3 (there is no linear term).
+		const float p0 = (6.0f -  2.0f * b) / 6.0f;
+		const float p2 = (-18.0f + 12.0f * b + 6.0f * c) / 6.0f;
+		const float p3 = (12.0f - 9.0f * b - 6.0f * c) / 6.0f;
+		return p0 + ax * ax * (p2 + ax * p3);
+	}
+
+	if( ax < 2.0f ) {
+		// Outer piece: q0 + q1 x + q2 x^2 + q3 x^3.
+		const float q0 = (8.0f * b + 24.0f * c) / 6.0f;
+		const float q1 = (-12.0f * b - 48.0f * c) / 6.0f;
+		const float q2 = (6.0f * b + 30.0f * c) / 6.0f;
+		const float q3 = (-b - 6.0f * c) / 6.0f;
+		return q0 + ax * (q1 + ax * (q2 + ax * q3));
+	}
+
+	return 0.0f;
+}
+
+// Mitchell filter with the fixed parameters b = c = 1/3.
+inline static float filter_mitchell(float x)
+{
+	const float b = 1.0f/3.0f;
+	const float c = 1.0f/3.0f;
+	return filter_mitchell(x, b, c);
+}
+
+// Bessel function of the first kind from Jon Blow's article.
+// http://mathworld.wolfram.com/BesselFunctionoftheFirstKind.html
+// http://en.wikipedia.org/wiki/Bessel_function
+// Sums the series 1 + sum_k ((x/2)^k / k!)^2 and stops as soon as a term is
+// no larger than EPSILON_RATIO times the running sum.
+static float bessel0(float x)
+{
+	const float EPSILON_RATIO = 1E-6;
+	const float halfX = 0.5 * x;
+
+	float series = 1.0;
+	float factor = 1.0;	// (x/2)^k / k!, built up incrementally.
+	float term = 1.0;
+
+	for (int k = 1; term > series * EPSILON_RATIO; ++k) {
+		factor = factor * (halfX / k);
+		term = factor * factor;
+		series = series + term;
+	}
+
+	return series;
+}
+
+// Alternative bessel function from Paul Heckbert.
+// Accumulates the terms of the series in (x^2 / 4), stopping once a term
+// drops to EPSILON_RATIO or below (an absolute threshold here).
+static float _bessel0(float x)
+{
+	const float EPSILON_RATIO = 1E-6;
+	const float quarterSq = x * x / 4.0f;
+
+	float total = 1.0f;
+	float term = quarterSq;
+	int i = 2;
+	while (term > EPSILON_RATIO) {
+		total += term;
+		term *= quarterSq / float(i * i);
+		i++;
+	}
+	return total;
+}
+
+// Kaiser window: bessel0(alpha * sqrt(1 - x^2)) / bessel0(alpha).
+// support = 1.0
+inline static float filter_kaiser(float x, float alpha)
+{
+	const float radial = sqrtf(1 - x * x);
+	return bessel0(alpha * radial) / bessel0(alpha);
+}
+
+// Kaiser window with the fixed parameter alpha = 4.
+inline static float filter_kaiser(float x)
+{
+	const float alpha = 4.0f;
+	return filter_kaiser(x, alpha);
+}
+
+
+// Array of filters: evaluation function and support (half-width of the
+// interval on which the function is sampled by Kernel1::initFilter).
+// Indexed by Filter::Enum.
+static Filter s_filter_array[] = {
+	{filter_box, 		0.5f},	// Box
+	{filter_triangle, 	1.0f},	// Triangle
+	{filter_quadratic, 	1.5f},	// Quadratic
+	{filter_cubic, 		1.0f},	// Cubic
+	{filter_spline,		2.0f},	// Spline
+	{filter_lanczos3,	3.0f},	// Lanczos
+	// filter_mitchell is non-zero up to |x| < 2; a support of 1 would only
+	// sample its inner piece and drop the outer lobes.
+	{filter_mitchell,	2.0f},	// Mitchell
+	{filter_kaiser,		1.0f},	// Kaiser
+};
+
+} // namespace
+
+
+
+/// Ctor. Allocates storage for @a width taps; the taps are left uninitialized.
+Kernel1::Kernel1(uint width) : w(width)
+{
+	data = new float[width];
+}
+
+/// Copy ctor. Allocates a new buffer of the same width and copies every tap.
+Kernel1::Kernel1(const Kernel1 & k) : w(k.w)
+{
+	data = new float[w];
+
+	uint tap = 0;
+	while (tap < w) {
+		data[tap] = k.data[tap];
+		tap++;
+	}
+}
+
+/// Dtor. Releases the tap buffer.
+Kernel1::~Kernel1()
+{
+	// The buffer comes from new float[...], so it must be released with the
+	// array form of delete; plain delete on it is undefined behavior.
+	delete [] data;
+}
+
+/// Normalize the filter so that its taps add up to 1.
+/// Every tap is multiplied by the reciprocal of the (signed) sum of all taps.
+void Kernel1::normalize()
+{
+	float total = 0.0f;
+	for (const float * tap = data; tap != data + w; ++tap) {
+		total += *tap;
+	}
+
+	const float scale = 1.0f / total;
+	for (float * tap = data; tap != data + w; ++tap) {
+		*tap *= scale;
+	}
+}
+
+
+/// Init 1D kernel from one of the built-in filters.
+/// The filter selected by @a f is sampled at the center of each of the w
+/// taps, with the tap positions scaled into (-support, +support), and the
+/// result is normalized. The width must be even.
+void Kernel1::initFilter(Filter::Enum f)
+{
+	nvCheck((w & 1) == 0);
+	nvCheck(f < Filter::Num);
+
+	const Filter & entry = s_filter_array[f];
+	const float half_width = float(w / 2);
+
+	for(uint i = 0; i < w; i++) {
+		// Center of tap i, measured from the middle of the kernel.
+		const float x = (i + (-half_width)) + 0.5f;
+		data[i] = entry.function(x * entry.support / half_width);
+	}
+
+	normalize();
+}
+
+
+/// Init 1D sinc filter.
+/// Tap i holds sincf(PI * x * stretch), where x is the center of the tap
+/// measured from the middle of the kernel; the kernel is then normalized.
+/// The width must be even.
+void Kernel1::initSinc(float stretch /*= 1*/)
+{
+	nvCheck((w & 1) == 0);
+
+	const float half_width = float(w / 2);
+
+	for(uint i = 0; i < w; i++) {
+		const float x = (i + (-half_width)) + 0.5f;
+		data[i] = sincf(PI * x * stretch);
+	}
+
+	normalize();
+}
+
+
+/// Init 1D windowed Kaiser filter.
+/// Each tap is sincf(PI * x * stretch) multiplied by the Kaiser window
+/// evaluated at x / half_width, where x is the center of the tap measured
+/// from the middle of the kernel; the kernel is then normalized.
+/// The width must be even.
+void Kernel1::initKaiser(float alpha, float stretch /*= 1*/)
+{
+	nvCheck((w & 1) == 0);
+
+	const float half_width = float(w / 2);
+
+	for(uint i = 0; i < w; i++) {
+		const float x = (i + (-half_width)) + 0.5f;
+
+		const float sinc_value = sincf(PI * x * stretch);
+		const float window_value = filter_kaiser(x / half_width, alpha);
+
+		// @@ sinc windowed by kaiser
+		data[i] = sinc_value * window_value;
+	}
+
+	normalize();
+}
+
+
+/// Init 1D Mitchell filter.
+/// Tap i holds filter_mitchell(x / half_width, b, c), where x is the center
+/// of the tap measured from the middle of the kernel; the kernel is then
+/// normalized. The width must be even.
+/// NOTE(review): the argument stays within (-1, 1) while filter_mitchell is
+/// non-zero up to |x| < 2 — confirm whether a scale by the support is missing.
+void Kernel1::initMitchell(float b, float c)
+{
+	nvCheck((w & 1) == 0);
+
+	const float half_width = float(w / 2);
+
+	for(uint i = 0; i < w; i++) {
+		const float x = (i + (-half_width)) + 0.5f;
+		data[i] = filter_mitchell(x / half_width, b, c);
+	}
+
+	normalize();
+}
+
+
+/// Print the kernel for debugging purposes: one "index: value" line per tap,
+/// sent through nvDebug.
+void Kernel1::debugPrint()
+{
+	uint i = 0;
+	while (i < w) {
+		nvDebug("%d: %f\n", i, data[i]);
+		i++;
+	}
+}
+
+
+
+/// Ctor. Allocates storage for a width x width kernel; the elements are left
+/// uninitialized.
+Kernel2::Kernel2(uint width) : w(width)
+{
+	data = new float[width * width];
+}
+
+/// Copy ctor. Allocates a new w x w buffer and copies every element.
+Kernel2::Kernel2(const Kernel2 & k) : w(k.w)
+{
+	data = new float[w*w];
+
+	uint e = 0;
+	while (e < w*w) {
+		data[e] = k.data[e];
+		e++;
+	}
+}
+
+
+/// Dtor. Releases the element buffer.
+Kernel2::~Kernel2()
+{
+	// The buffer comes from new float[...], so it must be released with the
+	// array form of delete; plain delete on it is undefined behavior.
+	delete [] data;
+}
+
+/// Normalize the filter.
+/// Every element is divided by the sum of the absolute values of all
+/// elements (so kernels with negative entries are scaled, not zero-summed).
+void Kernel2::normalize()
+{
+	const uint count = w*w;
+
+	float total = 0.0f;
+	for (const float * e = data; e != data + count; ++e) {
+		total += fabs(*e);
+	}
+
+	const float scale = 1.0f / total;
+	for (float * e = data; e != data + count; ++e) {
+		*e *= scale;
+	}
+}
+
+/// Transpose the kernel in place by swapping each element with its mirror
+/// across the main diagonal.
+void Kernel2::transpose()
+{
+	// Walk the part below the diagonal; every off-diagonal pair is swapped
+	// exactly once.
+	for (uint row = 1; row < w; row++) {
+		for (uint col = 0; col < row; col++) {
+			swap(data[col*w + row], data[row*w + col]);
+		}
+	}
+}
+
+/// Init laplacian filter, usually used for sharpening.
+/// Only valid for a 3x3 kernel. Uses the 4-neighbor form; the 8-neighbor
+/// form (-1 everywhere, +8 in the center) and the (+1 -2 +1 / -2 +4 -2 /
+/// +1 -2 +1) form were the alternatives considered by the original author.
+void Kernel2::initLaplacian()
+{
+	nvDebugCheck(w == 3);
+
+	static const float laplacian[3*3] = {
+		+0, -1, +0,
+		-1, +4, -1,
+		+0, -1, +0,
+	};
+
+	for (int i = 0; i < 3*3; i++) {
+		data[i] = laplacian[i];
+	}
+}
+
+
+/// Init simple edge detection filter: a horizontal difference on the middle row.
+/// Requires a 3x3 kernel.
+void Kernel2::initEdgeDetection()
+{
+	nvCheck(w == 3);
+
+	static const float weights[3 * 3] = {
+		 0, 0, 0,
+		-1, 0, 1,
+		 0, 0, 0
+	};
+
+	for (uint i = 0; i < 3 * 3; i++) {
+		data[i] = weights[i];
+	}
+}
+
+/// Init sobel filter.
+/// Widths 3, 5, 7 and 9 are supported; for any other width the kernel is left untouched.
+void Kernel2::initSobel()
+{
+	static const float sobel3[3 * 3] = {
+		-1, 0, 1,
+		-2, 0, 2,
+		-1, 0, 1
+	};
+
+	static const float sobel5[5 * 5] = {
+		-1, -2, 0, 2, 1,
+		-2, -3, 0, 3, 2,
+		-3, -4, 0, 4, 3,
+		-2, -3, 0, 3, 2,
+		-1, -2, 0, 2, 1
+	};
+
+	static const float sobel7[7 * 7] = {
+		-1, -2, -3, 0, 3, 2, 1,
+		-2, -3, -4, 0, 4, 3, 2,
+		-3, -4, -5, 0, 5, 4, 3,
+		-4, -5, -6, 0, 6, 5, 4,
+		-3, -4, -5, 0, 5, 4, 3,
+		-2, -3, -4, 0, 4, 3, 2,
+		-1, -2, -3, 0, 3, 2, 1
+	};
+
+	static const float sobel9[9 * 9] = {
+		-1, -2, -3, -4, 0, 4, 3, 2, 1,
+		-2, -3, -4, -5, 0, 5, 4, 3, 2,
+		-3, -4, -5, -6, 0, 6, 5, 4, 3,
+		-4, -5, -6, -7, 0, 7, 6, 5, 4,
+		-5, -6, -7, -8, 0, 8, 7, 6, 5,
+		-4, -5, -6, -7, 0, 7, 6, 5, 4,
+		-3, -4, -5, -6, 0, 6, 5, 4, 3,
+		-2, -3, -4, -5, 0, 5, 4, 3, 2,
+		-1, -2, -3, -4, 0, 4, 3, 2, 1
+	};
+
+	// Pick the table that matches the kernel width.
+	const float * table = 0;
+	switch (w)
+	{
+		case 3: table = sobel3; break;
+		case 5: table = sobel5; break;
+		case 7: table = sobel7; break;
+		case 9: table = sobel9; break;
+		default: return;	// Unsupported width, nothing to do.
+	}
+
+	const uint count = w * w;
+	for (uint i = 0; i < count; i++) {
+		data[i] = table[i];
+	}
+}
+
+/// Init prewitt filter (horizontal derivative).
+/// Widths 3 and 5 are supported; for any other width the kernel is left untouched.
+void Kernel2::initPrewitt()
+{
+	if (w == 3)
+	{
+		// The right column has to be positive. With -1 on both sides the kernel
+		// is symmetric and does not measure a horizontal difference any more.
+		data[0] = -1; data[1] = 0; data[2] = 1;
+		data[3] = -1; data[4] = 0; data[5] = 1;
+		data[6] = -1; data[7] = 0; data[8] = 1;
+	}
+	else if (w == 5)
+	{
+		// @@ Is this correct?
+		float elements[] = {
+            -2, -1, 0, 1, 2,
+            -2, -1, 0, 1, 2,
+            -2, -1, 0, 1, 2,
+            -2, -1, 0, 1, 2,
+            -2, -1, 0, 1, 2
+		};
+
+		for (int i = 0; i < 5*5; i++) {
+			data[i] = elements[i];
+		}
+	}
+}
+
+/// Init blended sobel filter.
+///
+/// The 9x9 kernel is filled with a weighted sum of the 9x9, 7x7, 5x5 and 3x3
+/// sobel kernels. The weights are taken from @a scale: w() for the 9x9 kernel,
+/// z() for the 7x7, y() for the 5x5 and x() for the 3x3 one.
+/// Requires a 9x9 kernel.
+void Kernel2::initBlendedSobel(const Vector4 & scale)
+{
+	nvCheck(w == 9);
+
+	// The 9x9 kernel initializes all the weights.
+	{
+		float elements[] = {
+            -1, -2, -3, -4, 0, 4, 3, 2, 1,
+            -2, -3, -4, -5, 0, 5, 4, 3, 2,
+            -3, -4, -5, -6, 0, 6, 5, 4, 3,
+            -4, -5, -6, -7, 0, 7, 6, 5, 4,
+            -5, -6, -7, -8, 0, 8, 7, 6, 5,
+            -4, -5, -6, -7, 0, 7, 6, 5, 4,
+            -3, -4, -5, -6, 0, 6, 5, 4, 3,
+            -2, -3, -4, -5, 0, 5, 4, 3, 2,
+            -1, -2, -3, -4, 0, 4, 3, 2, 1
+		};
+		
+		for (int i = 0; i < 9*9; i++) {
+			data[i] = elements[i] * scale.w();
+		}
+	}
+
+	// The smaller kernels are accumulated on top. Each one is centered inside
+	// the 9x9 kernel, so the same offset is applied to the row and to the column.
+	{
+		float elements[] = {
+            -1, -2, -3, 0, 3, 2, 1,
+            -2, -3, -4, 0, 4, 3, 2,
+            -3, -4, -5, 0, 5, 4, 3,
+            -4, -5, -6, 0, 6, 5, 4,
+            -3, -4, -5, 0, 5, 4, 3,
+            -2, -3, -4, 0, 4, 3, 2,
+            -1, -2, -3, 0, 3, 2, 1,
+		};
+
+		for (int i = 0; i < 7; i++) {
+			for (int e = 0; e < 7; e++) {
+				data[(i + 1) * 9 + e + 1] += elements[i * 7 + e] * scale.z();
+			}
+		}
+	}
+	{
+		float elements[] = {
+            -1, -2, 0, 2, 1,
+            -2, -3, 0, 3, 2,
+            -3, -4, 0, 4, 3,
+            -2, -3, 0, 3, 2,
+            -1, -2, 0, 2, 1
+		};
+
+		for (int i = 0; i < 5; i++) {
+			for (int e = 0; e < 5; e++) {
+				data[(i + 2) * 9 + e + 2] += elements[i * 5 + e] * scale.y();
+			}
+		}
+	}
+	{
+		float elements[] = {
+            -1, 0, 1,
+            -2, 0, 2,
+            -1, 0, 1,
+		};
+
+		for (int i = 0; i < 3; i++) {
+			for (int e = 0; e < 3; e++) {
+				data[(i + 3) * 9 + e + 3] += elements[i * 3 + e] * scale.x();
+			}
+		}
+	}
+}
+
+
+/*PI_DECLARE_TEST(BesselTest) {
+
+	for(int i = 0; i < 8; i++) {
+		nvDebug("bessel0(%i) %f =? %f\n", i, bessel0(i), _bessel0(i));
+		PI_TEST(equalf(bessel0(i), _bessel0(i)));
+	}
+
+	return PiTestUnit::Succeed;
+}*/
+
--- a/src/nvimage/Filter.h
+++ b/src/nvimage/Filter.h
@ -0,0 +1,103 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_FILTER_H
+#define NV_IMAGE_FILTER_H
+
+#include <nvimage/nvimage.h>
+
+namespace nv
+{
+	class Vector4;
+
+	/// A filter function.
+	/// Pairs a function pointer with a 'support' value and names the standard filters.
+	struct Filter 
+	{
+		// Standard filters.
+		enum Enum 
+		{
+			Box,
+			Triangle,
+			Quadratic,	// Bell
+			Cubic,
+			Spline,
+			Lanczos,
+			Mitchell,
+			Kaiser,
+			Num			// Last entry, so its value equals the number of filters above.
+		};
+
+		// Function that takes a float and returns a float; presumably evaluates the filter at x.
+		float (*function)(float x);
+		// NOTE(review): presumably the extent around zero where the filter is non zero -- confirm.
+		float support;
+	};
+
+
+	/// A 1D kernel. Used to precompute filter weights.
+	/// Holds a fixed number of float weights; the width cannot change after construction.
+	class Kernel1
+	{
+	public:
+		/// Create a kernel of the given width.
+		NVIMAGE_API Kernel1(uint width);
+		/// Copy constructor.
+		NVIMAGE_API Kernel1(const Kernel1 & k);
+		NVIMAGE_API ~Kernel1();
+		
+		NVIMAGE_API void normalize();
+		
+		/// Get the weight at the given position. The index is not range checked.
+		float valueAt(uint x) const {
+			return data[x];
+		}
+		
+		/// Get the number of weights of the kernel.
+		uint width() const {
+			return w;
+		}
+		
+		/// @name Initialization with the weights of a filter.
+		//@{
+		NVIMAGE_API void initFilter(Filter::Enum filter);
+		NVIMAGE_API void initSinc(float stretch = 1);
+		NVIMAGE_API void initKaiser(float alpha = 4.0f, float stretch = 1.0f);
+		NVIMAGE_API void initMitchell(float b = 1.0f/3.0f, float c = 1.0f/3.0f);
+		//@}
+		
+		/// Print the weights for debugging purposes.
+		NVIMAGE_API void debugPrint();
+		
+	private:
+		const uint w;	// Kernel width. Being const, it also rules out assignment between kernels.
+		float * data;	// The weights.
+	};
+
+
+	/// A 2D kernel.
+	/// Square kernel of width x width float weights, stored row by row.
+	class Kernel2 
+	{
+	public:
+		/// Create a kernel of width x width weights.
+		NVIMAGE_API Kernel2(uint width);
+		/// Copy constructor.
+		NVIMAGE_API Kernel2(const Kernel2 & k);
+		NVIMAGE_API ~Kernel2();
+		
+		NVIMAGE_API void normalize();
+		NVIMAGE_API void transpose();
+		
+		/// Get the weight at column x and row y. The coordinates are not range checked.
+		float valueAt(uint x, uint y) const {
+			return data[y * w + x];
+		}
+		
+		/// Get the width of the kernel, which is also its height.
+		uint width() const {
+			return w;
+		}
+		
+		/// @name Initialization with predefined kernels.
+		//@{
+		NVIMAGE_API void initLaplacian();
+		NVIMAGE_API void initEdgeDetection();
+		NVIMAGE_API void initSobel();
+		NVIMAGE_API void initPrewitt();
+
+		NVIMAGE_API void initBlendedSobel(const Vector4 & scale);
+		//@}
+
+	private:
+		const uint w;	// Kernel width. Being const, it also rules out assignment between kernels.
+		float * data;	// The w * w weights in row major order.
+	};
+
+
+	// @@ Implement non linear filters:
+	// Kuwahara filter
+	// Median filter
+
+} // nv namespace
+
+#endif // NV_IMAGE_FILTER_H
--- a/src/nvimage/FloatImage.cpp
+++ b/src/nvimage/FloatImage.cpp
@ -0,0 +1,839 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/Containers.h>
+#include <nvcore/Ptr.h>
+
+#include <nvmath/Color.h>
+
+#include "FloatImage.h"
+#include "Filter.h"
+#include "Image.h"
+
+#include <math.h>
+
+using namespace nv;
+
+namespace 
+{
+	// Convert to integer by dropping the fractional part. Despite the name this
+	// truncates toward zero, it does not round to the nearest integer.
+	static int round(float f)
+	{
+		const int truncated = static_cast<int>(f);
+		return truncated;
+	}
+
+	// Distance from f down to floor(f), so the result is not negative even for
+	// negative inputs.
+	static float frac(float f)
+	{
+		return f - floor(f);
+	}
+}
+
+
+/// Ctor. Creates an empty image that does not own any memory.
+FloatImage::FloatImage() :
+	m_width(0),
+	m_height(0),
+	m_componentNum(0),
+	m_count(0),
+	m_mem(NULL)
+{
+}
+
+/// Ctor. Starts out empty and then fills the image from the given one using initFrom.
+FloatImage::FloatImage(const Image * img) :
+	m_width(0),
+	m_height(0),
+	m_componentNum(0),
+	m_count(0),
+	m_mem(NULL)
+{
+	initFrom(img);
+}
+
+/// Dtor. Releases the pixel memory, if any was allocated.
+FloatImage::~FloatImage()
+{
+	// free() checks that there is memory to release, so it must not be called
+	// for images that were never allocated (default constructed ones, for example).
+	if (m_mem != NULL) {
+		free();
+	}
+}
+
+
+/// Init the floating point image from a regular image.
+/// Allocates four channels (red, green, blue and alpha, in that order) and stores
+/// every 8 bit component divided by 255.
+void FloatImage::initFrom(const Image * img)
+{
+	nvCheck(img != NULL);
+
+	allocate(4, img->width(), img->height());
+
+	float * const r = channel(0);
+	float * const g = channel(1);
+	float * const b = channel(2);
+	float * const a = channel(3);
+
+	const uint pixelCount = m_width * m_height;
+
+	uint i = 0;
+	while (i < pixelCount) {
+		const Color32 src = img->pixel(i);
+
+		r[i] = float(src.r) / 255.0f;
+		g[i] = float(src.g) / 255.0f;
+		b[i] = float(src.b) / 255.0f;
+		a[i] = float(src.a) / 255.0f;
+
+		i++;
+	}
+}
+
+/// Convert the floating point image to a regular image.
+///
+/// Takes 'num' consecutive components starting at 'base_component', scales them
+/// by 255 and clamps them to the [0, 255] range. The components that are not
+/// provided are taken from 0xff000000 (zero color, full alpha).
+/// The caller takes ownership of the returned image.
+Image * FloatImage::createImage(uint base_component/*= 0*/, uint num/*= 4*/) const
+{
+	nvCheck(num <= 4);
+	nvCheck(base_component + num <= m_componentNum);
+
+	AutoPtr<Image> result(new Image());
+	result->allocate(m_width, m_height);
+
+	const uint pixelCount = m_width * m_height;
+
+	for (uint p = 0; p < pixelCount; p++)
+	{
+		// Defaults for the components that are not read from this image.
+		uint8 rgba[4] = { 0, 0, 0, 0xff };
+
+		for (uint c = 0; c < num; c++) {
+			const float value = m_mem[pixelCount * (base_component + c) + p];
+			rgba[c] = nv::clamp(int(255.0f * value), 0, 255);
+		}
+
+		result->pixel(p) = Color32(rgba[0], rgba[1], rgba[2], rgba[3]);
+	}
+
+	return result.release();
+}
+
+
+/// Convert the floating point image to a regular image. Correct gamma of rgb, but not alpha.
+///
+/// The image must have exactly four components. The color components are raised
+/// to 1/gamma before being scaled by 255 and clamped to [0, 255].
+/// The caller takes ownership of the returned image.
+Image * FloatImage::createImageGammaCorrect(float gamma/*= 2.2f*/) const
+{
+	nvCheck(m_componentNum == 4);
+
+	AutoPtr<Image> result(new Image());
+	result->allocate(m_width, m_height);
+
+	const float * red = this->channel(0);
+	const float * green = this->channel(1);
+	const float * blue = this->channel(2);
+	const float * alpha = this->channel(3);
+
+	const float exponent = 1.0f/gamma;
+	const uint pixelCount = m_width * m_height;
+
+	for (uint p = 0; p != pixelCount; ++p)
+	{
+		const uint8 r = nv::clamp(int(255.0f * pow(red[p], exponent)), 0, 255);
+		const uint8 g = nv::clamp(int(255.0f * pow(green[p], exponent)), 0, 255);
+		const uint8 b = nv::clamp(int(255.0f * pow(blue[p], exponent)), 0, 255);
+
+		// Alpha is stored as is, without gamma correction.
+		const uint8 a = nv::clamp(int(255.0f * alpha[p]), 0, 255);
+
+		result->pixel(p) = Color32(r, g, b, a);
+	}
+
+	return result.release();
+}
+
+/// Allocate a 2d float image with 'c' components and an extent of 'w' x 'h'.
+/// The image must not have memory allocated already; the new contents are not initialized here.
+void FloatImage::allocate(uint c, uint w, uint h)
+{
+	// Allocating on top of an existing buffer is not allowed.
+	nvCheck(m_mem == NULL);
+
+	m_componentNum = c;
+	m_width = w;
+	m_height = h;
+	m_count = w * h * c;
+
+	void * block = nv::mem::malloc(m_count * sizeof(float));
+	m_mem = reinterpret_cast<float *>(block);
+}
+
+/// Free the image memory. The memory pointer is reset, but the extents and the
+/// counts keep their values. It's an error to call this without allocated memory.
+void FloatImage::free()
+{
+	nvCheck(m_mem != NULL);
+
+	void * block = reinterpret_cast<void *>(m_mem);
+	nv::mem::free(block);
+
+	m_mem = NULL;
+}
+
+/// Set all the elements of all the components to the given value.
+void FloatImage::clear(float f/*=0.0f*/)
+{
+	float * it = m_mem;
+	float * const end = m_mem + m_count;
+
+	for (; it != end; ++it) {
+		*it = f;
+	}
+}
+
+/// Treat three consecutive components, starting at base_component, as a vector
+/// per pixel and replace it with the result of normalizeSafe. A zero vector is
+/// handed to normalizeSafe as its second argument (presumably the fallback for
+/// vectors that cannot be normalized).
+void FloatImage::normalize(uint base_component)
+{
+	nvCheck(base_component + 3 <= m_componentNum);
+
+	float * const xs = this->channel(base_component + 0);
+	float * const ys = this->channel(base_component + 1);
+	float * const zs = this->channel(base_component + 2);
+
+	const uint pixelCount = m_width * m_height;
+
+	uint p = 0;
+	while (p < pixelCount) {
+		const Vector3 unit = normalizeSafe(Vector3(xs[p], ys[p], zs[p]), Vector3(zero));
+
+		xs[p] = unit.x();
+		ys[p] = unit.y();
+		zs[p] = unit.z();
+
+		p++;
+	}
+}
+
+/// Pack three components starting at base_component: every value v becomes
+/// 0.5 * (v + 1), which takes the [-1, 1] range to [0, 1].
+void FloatImage::packNormals(uint base_component)
+{
+	const float scale = 0.5f;
+	const float bias = 1.0f;
+
+	scaleBias(base_component, 3, scale, bias);
+}
+
+/// Expand three components starting at base_component: every value v becomes
+/// 2 * (v - 0.5), which takes the [0, 1] range back to [-1, 1].
+void FloatImage::expandNormals(uint base_component)
+{
+	// scaleBias computes scale * (value + bias), so the bias has to be negative
+	// for this to be the inverse of packNormals.
+	scaleBias(base_component, 3, 2.0f, -0.5f);
+}
+
+/// Replace every value v of 'num' components, starting at base_component,
+/// with scale * (v + bias). Note that the bias is added before scaling.
+void FloatImage::scaleBias(uint base_component, uint num, float scale, float bias)
+{
+	const uint pixelCount = m_width * m_height;
+
+	for (uint component = base_component; component != base_component + num; component++)
+	{
+		float * it = this->channel(component);
+		float * const end = it + pixelCount;
+
+		for (; it != end; ++it) {
+			*it = scale * (*it + bias);
+		}
+	}
+}
+
+/// Clamp all the elements of all the components to the [low, high] range.
+void FloatImage::clamp(float low, float high)
+{
+	float * it = m_mem;
+	float * const end = m_mem + m_count;
+
+	for (; it != end; ++it) {
+		*it = nv::clamp(*it, low, high);
+	}
+}
+
+/// From gamma to linear space: raise the given components to 'gamma'.
+void FloatImage::toLinear(uint base_component, uint num, float gamma /*= 2.2f*/)
+{
+	const float power = gamma;
+	exponentiate(base_component, num, power);
+}
+
+/// From linear to gamma space: raise the given components to 1/gamma.
+void FloatImage::toGamma(uint base_component, uint num, float gamma /*= 2.2f*/)
+{
+	const float power = 1.0f/gamma;
+	exponentiate(base_component, num, power);
+}
+
+/// Raise every element of 'num' components, starting at base_component, to the given power.
+void FloatImage::exponentiate(uint base_component, uint num, float power)
+{
+	const uint pixelCount = m_width * m_height;
+
+	for (uint component = base_component; component != base_component + num; component++)
+	{
+		float * it = this->channel(component);
+		float * const end = it + pixelCount;
+
+		for (; it != end; ++it) {
+			*it = pow(*it, power);
+		}
+	}
+}
+
+#if 0
+float FloatImage::nearest(float x, float y, int c, WrapMode wm) const
+{
+	if( wm == WrapMode_Clamp ) return nearest_clamp(x, y, c);
+	/*if( wm == WrapMode_Repeat )*/ return nearest_repeat(x, y, c);
+	//if( wm == WrapMode_Mirror ) return nearest_mirror(x, y, c);
+}
+
+float FloatImage::nearest_clamp(int x, int y, const int c) const
+{
+	const int w = m_width;
+	const int h = m_height;
+	int ix = ::clamp(x, 0, w-1);
+	int iy = ::clamp(y, 0, h-1);
+	return pixel(ix, iy, c);
+}
+
+float FloatImage::nearest_repeat(int x, int y, const int c) const
+{
+	const int w = m_width;
+	const int h = m_height;
+	int ix = x % w;
+	int iy = y % h;
+	return pixel(ix, iy, c);
+}
+#endif
+
+/// Sample component 'c' at (x, y) without filtering, using the given wrap mode.
+float FloatImage::nearest(float x, float y, int c, WrapMode wm) const
+{
+	// @@ WrapMode_Mirror is not handled yet: everything but clamp is sampled with repeat.
+	return (wm == WrapMode_Clamp) ? nearest_clamp(x, y, c) : nearest_repeat(x, y, c);
+}
+
+/// Sample component 'c' at (x, y) with bilinear filtering, using the given wrap mode.
+float FloatImage::linear(float x, float y, int c, WrapMode wm) const
+{
+	// @@ WrapMode_Mirror is not handled yet: everything but clamp is sampled with repeat.
+	return (wm == WrapMode_Clamp) ? linear_clamp(x, y, c) : linear_repeat(x, y, c);
+}
+
+/// Unfiltered lookup. The coordinates are scaled by the image extents, truncated
+/// to integers and clamped to the valid range.
+float FloatImage::nearest_clamp(float x, float y, const int c) const
+{
+	const int lastColumn = int(m_width) - 1;
+	const int lastRow = int(m_height) - 1;
+
+	const int column = round(x * int(m_width));
+	const int row = round(y * int(m_height));
+
+	return pixel(::clamp(column, 0, lastColumn), ::clamp(row, 0, lastRow), c);
+}
+
+/// Unfiltered lookup with repeat addressing. Only the fractional part of the
+/// coordinates is used, so the image tiles along both axes.
+float FloatImage::nearest_repeat(float x, float y, const int c) const
+{
+	const int w = m_width;
+	const int h = m_height;
+
+	// frac() can return exactly 1.0f (for tiny negative coordinates the
+	// subtraction rounds up), which would address one texel past the end of the
+	// row or column. Keep the indices inside of the image.
+	const int ix = ::clamp(round(frac(x) * w), 0, w-1);
+	const int iy = ::clamp(round(frac(y) * h), 0, h-1);
+
+	return pixel(ix, iy, c);
+}
+
+/// Unfiltered lookup with mirror addressing. Not implemented yet: the arguments
+/// are ignored and the result is always zero.
+float FloatImage::nearest_mirror(float x, float y, const int c) const
+{
+	// @@ TBD
+	const float notImplemented = 0.0f;
+	return notImplemented;
+}
+
+/// Bilinear lookup with clamp addressing. The coordinates are scaled by the
+/// image extents; the four texels are clamped to the image individually.
+float FloatImage::linear_clamp(float x, float y, const int c) const
+{
+	const int w = m_width;
+	const int h = m_height;
+
+	// Scale the coordinates by the image extents.
+	const float tx = x * w;
+	const float ty = y * h;
+
+	// Integer texel (truncated) and the one that follows it on each axis.
+	const int left = round(tx);
+	const int top = round(ty);
+
+	const int ix0 = ::clamp(left, 0, w-1);
+	const int iy0 = ::clamp(top, 0, h-1);
+	const int ix1 = ::clamp(left + 1, 0, w-1);
+	const int iy1 = ::clamp(top + 1, 0, h-1);
+
+	// Interpolation weights.
+	const float fx = frac(tx);
+	const float fy = frac(ty);
+
+	// Interpolate along x on both rows and then along y.
+	const float upper = lerp(pixel(ix0, iy0, c), pixel(ix1, iy0, c), fx);
+	const float lower = lerp(pixel(ix0, iy1, c), pixel(ix1, iy1, c), fx);
+
+	return lerp(upper, lower, fy);
+}
+
+/// Bilinear lookup with repeat addressing. Only the fractional part of the
+/// coordinates is used, so the image tiles along both axes and the texels next
+/// to the last column/row wrap around to the first ones.
+float FloatImage::linear_repeat(float x, float y, int c) const
+{
+	const int w = m_width;
+	const int h = m_height;
+	
+	// Interpolation weights.
+	const float fracX = frac(x * w);
+	const float fracY = frac(y * h);
+	
+	// frac() can return exactly 1.0f (for tiny negative values the subtraction
+	// rounds up), which would address one texel past the end of the row or
+	// column. Keep all the indices inside of the image.
+	const int ix0 = ::clamp(round(frac(x) * w), 0, w-1);
+	const int iy0 = ::clamp(round(frac(y) * h), 0, h-1);
+	const int ix1 = ::clamp(round(frac(x + 1.0f/w) * w), 0, w-1);
+	const int iy1 = ::clamp(round(frac(y + 1.0f/h) * h), 0, h-1);
+	
+	const float f1 = pixel(ix0, iy0, c);
+	const float f2 = pixel(ix1, iy0, c);
+	const float f3 = pixel(ix0, iy1, c);
+	const float f4 = pixel(ix1, iy1, c);
+	
+	// Interpolate along x on both rows and then along y.
+	const float i1 = lerp(f1, f2, fracX);
+	const float i2 = lerp(f3, f4, fracX);
+
+	return lerp(i1, i2, fracY);
+}
+
+/// Bilinear lookup with mirror addressing. Not implemented yet: the arguments
+/// are ignored and the result is always zero.
+float FloatImage::linear_mirror(float x, float y, int c) const
+{
+	// @@ TBD
+	const float notImplemented = 0.0f;
+	return notImplemented;
+}
+
+
+/// Fast downsampling using box filter. 
+///
+/// The extents of the image are divided by two and rounded down.
+///
+/// When the size of the image is odd, this uses a polyphase box filter as explained in:
+/// http://developer.nvidia.com/object/np2_mipmapping.html
+///
+/// The image must be larger than 1x1 (only checked in debug builds). The caller
+/// takes ownership of the returned image.
+///
+FloatImage * FloatImage::fastDownSample() const
+{
+	nvDebugCheck(m_width != 1 || m_height != 1);
+	
+	AutoPtr<FloatImage> dst_image( new FloatImage() );
+
+	const uint w = max(1, m_width / 2);
+	const uint h = max(1, m_height / 2);
+	dst_image->allocate(m_componentNum, w, h);
+
+	// 1D box filter.
+	if (m_width == 1 || m_height == 1)
+	{
+		// Number of destination texels. One of the extents is 1, so this is the
+		// length of the destination row or column. The loops below have to run
+		// over the destination, not over the source, or they would read and
+		// write past the end of both images.
+		const uint n = w * h;
+		
+		if ((m_width * m_height) & 1)
+		{
+			// Odd source length (2n + 1 texels): three tap polyphase filter.
+			// The weights of every tap add up to 2n + 1.
+			const float scale = 1.0f / (2 * n + 1);
+			
+			for(uint c = 0; c < m_componentNum; c++)
+			{
+				const float * src = this->channel(c);
+				float * dst = dst_image->channel(c);
+				
+				for(uint x = 0; x < n; x++)
+				{
+					const float w0 = float(n - x);
+					const float w1 = float(n - 0);
+					const float w2 = float(1 + x);
+					
+					*dst++ = scale * (w0 * src[0] + w1 * src[1] + w2 * src[2]);
+					src += 2;
+				}
+			}
+		}
+		else
+		{
+			// Even source length (2n texels): average every pair of texels.
+			for(uint c = 0; c < m_componentNum; c++)
+			{
+				const float * src = this->channel(c);
+				float * dst = dst_image->channel(c);
+				
+				for(uint x = 0; x < n; x++)
+				{
+					*dst = 0.5f * (src[0] + src[1]);
+					dst++;
+					src += 2;
+				}
+			}
+		}
+	}
+	
+	// Regular box filter.
+	else if ((m_width & 1) == 0 && (m_height & 1) == 0)
+	{
+		for(uint c = 0; c < m_componentNum; c++)
+		{
+			const float * src = this->channel(c);
+			float * dst = dst_image->channel(c);
+			
+			for(uint y = 0; y < h; y++)
+			{
+				for(uint x = 0; x < w; x++)
+				{
+					*dst = 0.25f * (src[0] + src[1] + src[m_width] + src[m_width + 1]);
+					dst++;
+					src += 2;
+				}
+				
+				// The inner loop advanced one full row, skip the second row of the block.
+				src += m_width;
+			}
+		}
+	}
+	
+	// Polyphase filters.
+	else if (m_width & 1 && m_height & 1)
+	{
+		nvDebugCheck(m_width == 2 * w + 1);
+		nvDebugCheck(m_height == 2 * h + 1);
+		
+		const float scale = 1.0f / (m_width * m_height);
+		
+		for(uint c = 0; c < m_componentNum; c++)
+		{
+			const float * src = this->channel(c);
+			float * dst = dst_image->channel(c);
+			
+			for(uint y = 0; y < h; y++)
+			{
+				const float v0 = (h - y);
+				const float v1 = (h - 0);
+				const float v2 = (1 + y);
+				
+				for (uint x = 0; x < w; x++)
+				{
+					const float w0 = (w - x);
+					const float w1 = (w - 0);
+					const float w2 = (1 + x);
+					
+					// 3x3 footprint: every row contributes its own three texels.
+					float f = 0.0f;
+					f += v0 * (w0 * src[0 * m_width + 2 * x] + w1 * src[0 * m_width + 2 * x + 1] + w2 * src[0 * m_width + 2 * x + 2]);
+					f += v1 * (w0 * src[1 * m_width + 2 * x] + w1 * src[1 * m_width + 2 * x + 1] + w2 * src[1 * m_width + 2 * x + 2]);
+					f += v2 * (w0 * src[2 * m_width + 2 * x] + w1 * src[2 * m_width + 2 * x + 1] + w2 * src[2 * m_width + 2 * x + 2]);
+					
+					*dst = f * scale;
+					dst++;
+				}
+				
+				src += 2 * m_width;
+			}
+		}
+	}
+	else if (m_width & 1)
+	{
+		// Odd width, even height: polyphase along x, box along y.
+		nvDebugCheck(m_width == 2 * w + 1);
+		const float scale = 1.0f / (2 * m_width);
+		
+		for(uint c = 0; c < m_componentNum; c++)
+		{
+			const float * src = this->channel(c);
+			float * dst = dst_image->channel(c);
+			
+			for(uint y = 0; y < h; y++)
+			{
+				for (uint x = 0; x < w; x++)
+				{
+					const float w0 = (w - x);
+					const float w1 = (w - 0);
+					const float w2 = (1 + x);
+					
+					float f = 0.0f;
+					f += w0 * (src[2 * x + 0] + src[m_width + 2 * x + 0]);
+					f += w1 * (src[2 * x + 1] + src[m_width + 2 * x + 1]);
+					f += w2 * (src[2 * x + 2] + src[m_width + 2 * x + 2]);
+					
+					*dst = f * scale;
+					dst++;
+				}
+				
+				src += 2 * m_width;
+			}
+		}
+	}
+	else if (m_height & 1)
+	{
+		// Even width, odd height: box along x, polyphase along y.
+		nvDebugCheck(m_height == 2 * h + 1);
+		
+		const float scale = 1.0f / (2 * m_height);
+		
+		for(uint c = 0; c < m_componentNum; c++)
+		{
+			const float * src = this->channel(c);
+			float * dst = dst_image->channel(c);
+			
+			for(uint y = 0; y < h; y++)
+			{
+				const float v0 = (h - y);
+				const float v1 = (h - 0);
+				const float v2 = (1 + y);
+				
+				for (uint x = 0; x < w; x++)
+				{
+					float f = 0.0f;
+					f += v0 * (src[0 * m_width + 2 * x] + src[0 * m_width + 2 * x + 1]);
+					f += v1 * (src[1 * m_width + 2 * x] + src[1 * m_width + 2 * x + 1]);
+					f += v2 * (src[2 * m_width + 2 * x] + src[2 * m_width + 2 * x + 1]);
+					
+					*dst = f * scale;
+					dst++;
+				}
+				
+				src += 2 * m_width;
+			}
+		}
+	}
+	
+	return dst_image.release();
+}
+
+
+/// Downsample applying a 1D kernel separately in each dimension.
+/// The target extents are half of the current ones, rounded down, but never less than one.
+FloatImage * FloatImage::downSample(const Kernel1 & kernel, WrapMode wm) const
+{
+	const uint halfWidth = max(1, m_width / 2);
+	const uint halfHeight = max(1, m_height / 2);
+
+	return downSample(kernel, halfWidth, halfHeight, wm);
+}
+
+
+/// Downsample applying a 1D kernel separately in each dimension.
+///
+/// The image is first filtered horizontally into a temporary image of w x m_height
+/// texels, and that image is then filtered vertically into the w x h result.
+/// The kernel width must be even. The caller takes ownership of the returned image.
+FloatImage * FloatImage::downSample(const Kernel1 & kernel, uint w, uint h, WrapMode wm) const
+{
+	nvCheck(!(kernel.width() & 1));	// Make sure that kernel width is even.
+
+	AutoPtr<FloatImage> tmp_image( new FloatImage() );
+	tmp_image->allocate(m_componentNum, w, m_height);
+	
+	AutoPtr<FloatImage> dst_image( new FloatImage() );	
+	dst_image->allocate(m_componentNum, w, h);
+	
+	const float xscale = float(m_width) / float(w);
+	const float yscale = float(m_height) / float(h);
+	
+	for(uint c = 0; c < m_componentNum; c++) {
+		float * tmp_channel = tmp_image->channel(c);
+		
+		// Horizontal pass: source image -> temporary image.
+		for(uint y = 0; y < m_height; y++) {
+			for(uint x = 0; x < w; x++) {
+				
+				float sum = this->applyKernelHorizontal(&kernel, uint(x*xscale), y, c, wm);
+				
+				const uint tmp_index = tmp_image->index(x, y);
+				tmp_channel[tmp_index] = sum;
+			}
+		}
+		
+		float * dst_channel = dst_image->channel(c);
+		
+		// Vertical pass: temporary image -> destination image. This has to read
+		// the temporary image, otherwise the horizontal pass is thrown away. The
+		// temporary image already has the final width, so x is used as is.
+		for(uint y = 0; y < h; y++) {
+			for(uint x = 0; x < w; x++) {
+				
+				float sum = tmp_image->applyKernelVertical(&kernel, x, uint(y*yscale), c, wm);
+				
+				const uint dst_index = dst_image->index(x, y);
+				dst_channel[dst_index] = sum;
+			}
+		}
+	}
+	
+	return dst_image.release();
+}
+
+
+/// Apply 2D kernel at the given coordinates and return result.
+///
+/// The texels outside of the image are resolved by index() according to the wrap mode.
+/// NOTE(review): the kernel window starts at (x, y) - (width/2 - 1), so a kernel
+/// with an odd width is not centered on (x, y) -- confirm that this is intended.
+float FloatImage::applyKernel(const Kernel2 * k, int x, int y, int c, WrapMode wm) const
+{
+	nvDebugCheck(k != NULL);
+
+	const uint size = k->width();
+	const int offset = int(size / 2) - 1;
+
+	const float * texels = this->channel(c);
+
+	float result = 0.0f;
+
+	for (uint row = 0; row < size; row++)
+	{
+		const int sy = int(y + row) - offset;
+
+		for (uint col = 0; col < size; col++)
+		{
+			const int sx = int(x + col) - offset;
+			const int texel = this->index(sx, sy, wm);
+
+			result += k->valueAt(col, row) * texels[texel];
+		}
+	}
+
+	return result;
+}
+
+
+/// Apply 1D vertical kernel at the given coordinates and return result.
+/// The kernel covers the rows from y - (width/2 - 1) on; rows outside of the
+/// image are resolved by index() according to the wrap mode.
+float FloatImage::applyKernelVertical(const Kernel1 * k, int x, int y, int c, WrapMode wm) const
+{
+	nvDebugCheck(k != NULL);
+
+	const uint size = k->width();
+	const int offset = int(size / 2) - 1;
+
+	const float * texels = this->channel(c);
+
+	float result = 0.0f;
+
+	for (uint tap = 0; tap < size; tap++)
+	{
+		const int sy = int(y + tap) - offset;
+		const int texel = this->index(x, sy, wm);
+
+		result += k->valueAt(tap) * texels[texel];
+	}
+
+	return result;
+}
+
+/// Apply 1D horizontal kernel at the given coordinates and return result.
+/// The kernel covers the columns from x - (width/2 - 1) on; columns outside of
+/// the image are resolved by index() according to the wrap mode.
+float FloatImage::applyKernelHorizontal(const Kernel1 * k, int x, int y, int c, WrapMode wm) const
+{
+	nvDebugCheck(k != NULL);
+
+	const uint size = k->width();
+	const int offset = int(size / 2) - 1;
+
+	const float * texels = this->channel(c);
+
+	float result = 0.0f;
+
+	for (uint tap = 0; tap < size; tap++)
+	{
+		const int sx = int(x + tap) - offset;
+		const int texel = this->index(sx, y, wm);
+
+		result += k->valueAt(tap) * texels[texel];
+	}
+
+	return result;
+}
+
+
+
+#if 0
+
+Vec3d bilinear(double u, double v) const
+{
+	u = mod(u*(W-1),W);
+	v = mod(v*(H-1),H);
+
+	Vec3d v1,v2,v3,v4;
+
+	int x_small	= (int)floor(u);
+	int x_big	  = x_small + 1;
+	int y_small = (int)floor(v);
+	int y_big   = y_small + 1;
+
+	if (x_small < 0)
+		x_small = W-1;
+	else if (x_big >= W)
+		x_big = 0;
+	if (y_small < 0)
+		y_small = H-1;
+	else if (y_big >= H)
+		y_big = 0;
+	
+	double fractional_X = u - x_small;
+	double fractional_Y = v - y_small;
+
+	if (nchan == 3)
+	{
+		v1 = Vec3d(pixel(x_small, y_small)[0], pixel(x_small, y_small)[1], pixel(x_small, y_small)[2]);
+		v2 = Vec3d(pixel(x_big, y_small)[0], pixel(x_big, y_small)[1], pixel(x_big, y_small)[2]);
+		v3 = Vec3d(pixel(x_small, y_big)[0], pixel(x_small, y_big)[1], pixel(x_small, y_big)[2]);
+		v4 = Vec3d(pixel(x_big, y_big)[0], pixel(x_big, y_big)[1], pixel(x_big, y_big)[2]);
+	}
+			
+	Vec3d i1 = lerp(v1, v2, fractional_X);
+	Vec3d i2 = lerp(v3, v4, fractional_X);
+
+	return lerp(i1, i2, fractional_Y);
+}
+
+Vec3d bicubic(double u, double v) const
+{
+	u = mod(u*(W-1),W);
+	v = mod(v*(H-1),H);
+
+	int x_small1	= (int)floor(u),
+		x_small2	= x_small1 - 1,
+		x_big1		= x_small1 + 1,				
+		x_big2		= x_small1 + 2;
+
+	int y_small1	= (int)floor(v),
+		y_small2	= y_small1 - 1,
+		y_big1		= y_small1 + 1,
+		y_big2		= y_small1 + 2;
+
+	x_small1	= (int)mod(x_small1,W);
+	x_small2	= (int)mod(x_small2,W);
+	x_big1		= (int)mod(x_big1,W);
+	x_big2		= (int)mod(x_big2,W);
+	
+	y_small1	= (int)mod(y_small1,H);
+	y_small2	= (int)mod(y_small2,H);
+	y_big1		= (int)mod(y_big1,H);
+	y_big2		= (int)mod(y_big2,H);
+	
+	double fractional_X = u - x_small1;
+	double fractional_Y = v - y_small1;
+
+	if (nchan == 3)
+	{
+		// the interpolations across the rows
+		Vec3d row1 = cubic(Vec3d(pixel(x_small2, y_small2)[0], pixel(x_small2, y_small2)[1], pixel(x_small2, y_small2)[2]),
+							Vec3d(pixel(x_small1, y_small2)[0], pixel(x_small1, y_small2)[1], pixel(x_small1, y_small2)[2]),
+							Vec3d(pixel(x_big1, y_small2)[0], pixel(x_big1, y_small2)[1], pixel(x_big1, y_small2)[2]),
+							Vec3d(pixel(x_big2, y_small2)[0], pixel(x_big2, y_small2)[1], pixel(x_big2, y_small2)[2]),
+							fractional_X);
+
+		Vec3d row2 = cubic(Vec3d(pixel(x_small2, y_small1)[0], pixel(x_small2, y_small1)[1], pixel(x_small2, y_small1)[2]),
+							Vec3d(pixel(x_small1, y_small1)[0], pixel(x_small1, y_small1)[1], pixel(x_small1, y_small1)[2]),
+							Vec3d(pixel(x_big1, y_small1)[0], pixel(x_big1, y_small1)[1], pixel(x_big1, y_small1)[2]),
+							Vec3d(pixel(x_big2, y_small1)[0], pixel(x_big2, y_small1)[1], pixel(x_big2, y_small1)[2]),
+							fractional_X);
+
+		Vec3d row3 = cubic(Vec3d(pixel(x_small2, y_big1)[0], pixel(x_small2, y_big1)[1], pixel(x_small2, y_big1)[2]),
+							Vec3d(pixel(x_small1, y_big1)[0], pixel(x_small1, y_big1)[1], pixel(x_small1, y_big1)[2]),
+							Vec3d(pixel(x_big1, y_big1)[0], pixel(x_big1, y_big1)[1], pixel(x_big1, y_big1)[2]),
+							Vec3d(pixel(x_big2, y_big1)[0], pixel(x_big2, y_big1)[1], pixel(x_big2, y_big1)[2]),
+							fractional_X);
+
+		Vec3d row4 = cubic(Vec3d(pixel(x_small2, y_big2)[0], pixel(x_small2, y_big2)[1], pixel(x_small2, y_big2)[2]),
+							Vec3d(pixel(x_small1, y_big2)[0], pixel(x_small1, y_big2)[1], pixel(x_small1, y_big2)[2]),
+							Vec3d(pixel(x_big1, y_big2)[0], pixel(x_big1, y_big2)[1], pixel(x_big1, y_big2)[2]),
+							Vec3d(pixel(x_big2, y_big2)[0], pixel(x_big2, y_big2)[1], pixel(x_big2, y_big2)[2]),
+							fractional_X);
+
+		// now interpolate across the interpolated rows (the columns)
+
+		return cubic(row1,row2,row3,row4,fractional_Y);
+	}
+	else
+		return Vec3d(0.0);
+}
+
+Vec3d bicubic2(double u, double v) const
+{
+	u = mod(u*(W-1),W);
+	v = mod(v*(H-1),H);
+
+	int x_small1	= floorf(u),
+		x_small2	= x_small1 - 1,
+		x_big1		= int(x_small1 + 1),
+		x_big2		= int(x_small1 + 2);
+
+	int y_small1	= floorf(v),
+		y_small2	= y_small1 - 1,
+		y_big1		= y_small1 + 1,
+		y_big2		= y_small1 + 2;
+
+	x_small1	= (int)mod(x_small1,W);
+	x_small2	= (int)mod(x_small2,W);
+	x_big1		= (int)mod(x_big1,W);
+	x_big2		= (int)mod(x_big2,W);
+	
+	y_small1	= (int)mod(y_small1,H);
+	y_small2	= (int)mod(y_small2,H);
+	y_big1		= (int)mod(y_big1,H);
+	y_big2		= (int)mod(y_big2,H);
+	
+	double fractional_X = u - x_small1;
+	double fractional_Y = v - y_small1;
+
+	if (nchan == 3)
+	{
+		// the interpolations across the rows
+		Vec3d row1 = cubic2(Vec3d(pixel(x_small2, y_small2)[0], pixel(x_small2, y_small2)[1], pixel(x_small2, y_small2)[2]),
+							Vec3d(pixel(x_small1, y_small2)[0], pixel(x_small1, y_small2)[1], pixel(x_small1, y_small2)[2]),
+							Vec3d(pixel(x_big1, y_small2)[0], pixel(x_big1, y_small2)[1], pixel(x_big1, y_small2)[2]),
+							Vec3d(pixel(x_big2, y_small2)[0], pixel(x_big2, y_small2)[1], pixel(x_big2, y_small2)[2]),
+							fractional_X);
+
+		Vec3d row2 = cubic2(Vec3d(pixel(x_small2, y_small1)[0], pixel(x_small2, y_small1)[1], pixel(x_small2, y_small1)[2]),
+							Vec3d(pixel(x_small1, y_small1)[0], pixel(x_small1, y_small1)[1], pixel(x_small1, y_small1)[2]),
+							Vec3d(pixel(x_big1, y_small1)[0], pixel(x_big1, y_small1)[1], pixel(x_big1, y_small1)[2]),
+							Vec3d(pixel(x_big2, y_small1)[0], pixel(x_big2, y_small1)[1], pixel(x_big2, y_small1)[2]),
+							fractional_X);
+
+		Vec3d row3 = cubic2(Vec3d(pixel(x_small2, y_big1)[0], pixel(x_small2, y_big1)[1], pixel(x_small2, y_big1)[2]),
+							Vec3d(pixel(x_small1, y_big1)[0], pixel(x_small1, y_big1)[1], pixel(x_small1, y_big1)[2]),
+							Vec3d(pixel(x_big1, y_big1)[0], pixel(x_big1, y_big1)[1], pixel(x_big1, y_big1)[2]),
+							Vec3d(pixel(x_big2, y_big1)[0], pixel(x_big2, y_big1)[1], pixel(x_big2, y_big1)[2]),
+							fractional_X);
+
+		Vec3d row4 = cubic2(Vec3d(pixel(x_small2, y_big2)[0], pixel(x_small2, y_big2)[1], pixel(x_small2, y_big2)[2]),
+							Vec3d(pixel(x_small1, y_big2)[0], pixel(x_small1, y_big2)[1], pixel(x_small1, y_big2)[2]),
+							Vec3d(pixel(x_big1, y_big2)[0], pixel(x_big1, y_big2)[1], pixel(x_big1, y_big2)[2]),
+							Vec3d(pixel(x_big2, y_big2)[0], pixel(x_big2, y_big2)[1], pixel(x_big2, y_big2)[2]),
+							fractional_X);
+
+		// now interpolate across the interpolated rows (the columns)
+
+		return cubic2(row1,row2,row3,row4,fractional_Y);
+	}
+	else
+		return Vec3d(0.0);
+}
+
+#endif
--- a/src/nvimage/FloatImage.h
+++ b/src/nvimage/FloatImage.h
@ -0,0 +1,241 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_FLOATIMAGE_H
+#define NV_IMAGE_FLOATIMAGE_H
+
+#include <nvcore/Debug.h>
+#include <nvcore/Containers.h> // clamp
+#include <nvimage/nvimage.h>
+
+namespace nv
+{
+class Image;
+class Kernel1;
+class Kernel2;
+
+
+/// Multicomponent floating point image class.
+///
+/// All samples live in one float buffer (m_mem) laid out as consecutive
+/// width * height planes, one plane per component (see channel() and pixel()).
+class FloatImage
+{
+public:
+
+	/// Selects how index(int, int, WrapMode) resolves out-of-range coordinates:
+	/// clamp -> indexClamp(), repeat -> indexRepeat(), mirror -> indexMirror().
+	enum WrapMode {
+		WrapMode_Clamp,
+		WrapMode_Repeat,
+		WrapMode_Mirror
+	};
+	
+	NVIMAGE_API FloatImage();
+	NVIMAGE_API FloatImage(const Image * img);
+	NVIMAGE_API virtual ~FloatImage();
+
+	/** @name Conversion. */
+	//@{
+	NVIMAGE_API void initFrom(const Image * img);
+	NVIMAGE_API Image * createImage(uint base_component = 0, uint num = 4) const;
+	NVIMAGE_API Image * createImageGammaCorrect(float gamma = 2.2f) const;
+	//@}
+
+	/** @name Allocation. */
+	//@{
+	// Arguments are: component count, width, height.
+	NVIMAGE_API void allocate(uint c, uint w, uint h);
+	NVIMAGE_API void free(); // Does not clear members.
+	//@}
+
+	/** @name Manipulation. */
+	//@{
+	NVIMAGE_API void clear(float f=0.0f);
+
+	//NVIMAGE_API void ComputeMipmaps();
+	//NVIMAGE_API void ComputeNormalMap(const float height_scale = 1.0f);
+	
+	//NVIMAGE_API void Clamp(uint base_component, uint num);
+	//NVIMAGE_API void NormalizeColor(uint base_component);
+	NVIMAGE_API void normalize(uint base_component);
+	
+	NVIMAGE_API void packNormals(uint base_component);
+	NVIMAGE_API void expandNormals(uint base_component);
+	NVIMAGE_API void scaleBias(uint base_component, uint num, float scale, float add);
+	NVIMAGE_API void clamp(float low, float high);
+	
+	NVIMAGE_API void toLinear(uint base_component, uint num, float gamma = 2.2f);
+	NVIMAGE_API void toGamma(uint base_component, uint num, float gamma = 2.2f);
+	NVIMAGE_API void exponentiate(uint base_component, uint num, float power);
+	
+	
+	// The down-sampling functions return a FloatImage pointer.
+	// NOTE(review): presumably a new image owned by the caller -- confirm in FloatImage.cpp.
+	NVIMAGE_API FloatImage * fastDownSample() const;
+	NVIMAGE_API FloatImage * downSample(const Kernel1 & filter, WrapMode wm) const;
+	NVIMAGE_API FloatImage * downSample(const Kernel1 & filter, uint w, uint h, WrapMode wm) const;
+	//@}
+
+	NVIMAGE_API float applyKernel(const Kernel2 * k, int x, int y, int c, WrapMode wm) const;
+	NVIMAGE_API float applyKernelVertical(const Kernel1 * k, int x, int y, int c, WrapMode wm) const;
+	NVIMAGE_API float applyKernelHorizontal(const Kernel1 * k, int x, int y, int c, WrapMode wm) const;
+	
+	// Simple accessors for the members declared at the end of the class.
+	uint width() const { return m_width; }
+	uint height() const { return m_height; }
+	uint componentNum() const { return m_componentNum; }
+	uint count() const { return m_count; }
+
+
+	/** @name Pixel access. */
+	//@{
+	// Pointer to the first sample of component c.
+	const float * channel(uint c) const;
+	float * channel(uint c);
+	
+	// Pointer to the first sample of row y of component c.
+	const float * scanline(uint y, uint c) const;
+	float * scanline(uint y, uint c);
+	
+	// Access by (x, y, component); arguments are only validated by debug checks.
+	void setPixel(float f, uint x, uint y, uint c);
+	float pixel(uint x, uint y, uint c) const;
+	
+	// Access by raw index into the sample buffer (debug-checked against m_count).
+	void setPixel(float f, uint idx);
+	float pixel(uint idx) const;
+	
+	float nearest(int x, int y, int c, WrapMode wm) const;
+	
+	float nearest(float x, float y, int c, WrapMode wm) const;
+	float linear(float x, float y, int c, WrapMode wm) const;
+	
+	float nearest_clamp(float x, float y, int c) const;
+	float nearest_repeat(float x, float y, int c) const;
+	float nearest_mirror(float x, float y, int c) const;
+	
+	float linear_clamp(float x, float y, int c) const;
+	float linear_repeat(float x, float y, int c) const;
+	float linear_mirror(float x, float y, int c) const;
+	//@}
+	
+public:
+	
+	// Index helpers. They return the offset of (x, y) inside one plane
+	// (y * m_width + x); the component is not part of the result.
+	uint index(uint x, uint y) const;
+	uint indexClamp(int x, int y) const;
+	uint indexRepeat(int x, int y) const;
+	uint indexMirror(int x, int y) const;
+	uint index(int x, int y, WrapMode wm) const;
+
+public:
+
+	// Note that the data members are public.
+	uint16 m_width;			///< Width of the texture.
+	uint16 m_height;		///< Height of the texture.
+	uint32 m_componentNum;	///< Number of components.
+	uint32 m_count;			///< Image pixel count.
+	float * m_mem;			///< Sample buffer; component c starts at m_mem + c * m_width * m_height.
+
+};
+
+
+/// Return a read-only pointer to the first sample of component @a c.
+inline const float * FloatImage::channel(uint c) const
+{
+	nvDebugCheck(m_mem != NULL);
+	nvDebugCheck(c < m_componentNum);
+
+	// Components are stored as consecutive width * height planes.
+	const uint planeOffset = c * m_width * m_height;
+	return m_mem + planeOffset;
+}
+
+/// Return a writable pointer to the first sample of component @a c.
+inline float * FloatImage::channel(uint c)
+{
+	nvDebugCheck(m_mem != NULL);
+	nvDebugCheck(c < m_componentNum);
+
+	// Components are stored as consecutive width * height planes.
+	const uint planeOffset = c * m_width * m_height;
+	return m_mem + planeOffset;
+}
+
+/// Return a read-only pointer to the start of row @a y of component @a c.
+inline const float * FloatImage::scanline(uint y, uint c) const
+{
+	nvDebugCheck(y < m_height);
+	const float * const plane = channel(c);
+	return plane + y * m_width;
+}
+
+/// Return a writable pointer to the start of row @a y of component @a c.
+inline float * FloatImage::scanline(uint y, uint c)
+{
+	nvDebugCheck(y < m_height);
+	float * const plane = channel(c);
+	return plane + y * m_width;
+}
+
+/// Store @a f as component @a c of the pixel at (@a x, @a y).
+inline void FloatImage::setPixel(float f, uint x, uint y, uint c)
+{
+	nvDebugCheck(m_mem != NULL);
+	nvDebugCheck(x < m_width);
+	nvDebugCheck(y < m_height);
+	nvDebugCheck(c < m_componentNum);
+
+	// Row (c * m_height + y) of the stacked component planes, column x.
+	const uint offset = (c * m_height + y) * m_width + x;
+	m_mem[offset] = f;
+}
+
+/// Read component @a c of the pixel at (@a x, @a y).
+inline float FloatImage::pixel(uint x, uint y, uint c) const
+{
+	nvDebugCheck(m_mem != NULL);
+	nvDebugCheck(x < m_width);
+	nvDebugCheck(y < m_height);
+	nvDebugCheck(c < m_componentNum);
+
+	// Row (c * m_height + y) of the stacked component planes, column x.
+	const uint offset = (c * m_height + y) * m_width + x;
+	return m_mem[offset];
+}
+
+/// Store @a f at raw position @a idx of the sample buffer.
+inline void FloatImage::setPixel(float f, uint idx)
+{
+	nvDebugCheck(idx < m_count);
+	float * const slot = m_mem + idx;
+	*slot = f;
+}
+
+/// Read the sample at raw position @a idx of the sample buffer.
+inline float FloatImage::pixel(uint idx) const
+{
+	nvDebugCheck(idx < m_count);
+	const float * const slot = m_mem + idx;
+	return *slot;
+}
+
+/// Offset of (@a x, @a y) inside one component plane. Coordinates must be in range.
+inline uint FloatImage::index(uint x, uint y) const
+{
+	nvDebugCheck(x < m_width);
+	nvDebugCheck(y < m_height);
+	const uint rowStart = y * m_width;
+	return rowStart + x;
+}
+
+/// Offset of (@a x, @a y) inside one plane, with each coordinate clamped to the image.
+inline uint FloatImage::indexClamp(int x, int y) const
+{
+	const int cx = nv::clamp(x, int(0), int(m_width-1));
+	const int cy = nv::clamp(y, int(0), int(m_height-1));
+	return cy * m_width + cx;
+}
+
+/// Remainder of @a a by @a b that stays in [0, b) for negative @a a as well
+/// (e.g. -1 maps to b - 1), unlike the built-in % operator.
+inline int repeat_remainder(int a, int b)
+{
+	return (a >= 0) ? (a % b) : ((a + 1) % b + b - 1);
+}
+
+/// Offset of (@a x, @a y) inside one plane, with each coordinate wrapped around the image.
+inline uint FloatImage::indexRepeat(int x, int y) const
+{
+	const int wrappedY = repeat_remainder(y, m_height);
+	const int wrappedX = repeat_remainder(x, m_width);
+	return wrappedY * m_width + wrappedX;
+}
+
+// @@ This could be way more efficient.
+/// Offset of (@a x, @a y) inside one plane, reflecting out-of-range coordinates
+/// back into the image. Negative values are negated (-1 -> 1); values past the
+/// far edge are reflected including the edge texel (width -> width - 1).
+inline uint FloatImage::indexMirror(int x, int y) const
+{
+	const int w = m_width;
+	const int h = m_height;
+
+	// Keep reflecting until the coordinate lands inside [0, w).
+	while (x < 0 || x >= w) {
+		if (x < 0) x = -x;
+		if (x >= w) x = w + w - x - 1;
+	}
+
+	// Same for the vertical coordinate.
+	while (y < 0 || y >= h) {
+		if (y < 0) y = -y;
+		if (y >= h) y = h + h - y - 1;
+	}
+
+	return index(x, y);
+}
+
+/// Offset of (@a x, @a y) inside one plane, resolving out-of-range coordinates
+/// according to @a wm. Any value other than clamp or repeat is treated as mirror.
+inline uint FloatImage::index(int x, int y, WrapMode wm) const
+{
+	switch (wm) {
+		case WrapMode_Clamp:
+			return indexClamp(x, y);
+		case WrapMode_Repeat:
+			return indexRepeat(x, y);
+		default: // WrapMode_Mirror
+			return indexMirror(x, y);
+	}
+}
+
+} // nv namespace
+
+
+
+#endif // NV_IMAGE_FLOATIMAGE_H
--- a/src/nvimage/HoleFilling.cpp
+++ b/src/nvimage/HoleFilling.cpp
@ -0,0 +1,751 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/Containers.h>
+
+#include <nvmath/nvmath.h>
+
+#include <nvimage/HoleFilling.h>
+#include <nvimage/FloatImage.h>
+
+using namespace nv;
+
+
+// This is a variation of Sapiro's inpainting method.
+//
+// Grows the covered region by one pixel: every hole (bit clear in bmap) that has
+// at least one covered pixel among its 8 neighbours receives a value derived
+// from those neighbours, for every component of img. On return bmap holds the
+// enlarged coverage. Neighbour coordinates are clamped to the image.
+void nv::fillExtrapolateOnce(FloatImage * img, BitMap * bmap)
+{
+	nvCheck(img != NULL);
+	nvCheck(bmap != NULL);
+
+	const int w = img->width();
+	const int h = img->height();
+	const int count = img->componentNum();
+
+	nvCheck(bmap->width() == uint(w));
+	nvCheck(bmap->height() == uint(h));
+
+	// Coverage after this step. It lives on the stack so that it is released on
+	// return; the previous heap allocation was never deleted.
+	BitMap newbmap(w, h);
+
+	for(int c = 0; c < count; c++) {
+		
+		float * channel = img->channel(c);
+		
+		for(int y = 0; y < h; y++) {
+			for(int x = 0; x < w; x++) {
+				
+				if (bmap->bitAt(x, y)) {
+					// Not a hole.
+					newbmap.setBitAt(x, y);
+					continue;
+				}
+				
+				// Coverage is read from the original mask, so pixels filled during
+				// this step never act as sources for other pixels.
+				const bool west = bmap->bitAt(img->indexClamp(x-1, y));
+				const bool east = bmap->bitAt(img->indexClamp(x+1, y));
+				const bool north = bmap->bitAt(img->indexClamp(x, y-1));
+				const bool south = bmap->bitAt(img->indexClamp(x, y+1));
+				const bool northwest = bmap->bitAt(img->indexClamp(x-1, y-1));
+				const bool northeast = bmap->bitAt(img->indexClamp(x+1, y-1));
+				const bool southwest = bmap->bitAt(img->indexClamp(x-1, y+1));
+				const bool southeast = bmap->bitAt(img->indexClamp(x+1, y+1));
+				
+				const int num = west + east + north + south + northwest + northeast + southwest + southeast;
+				
+				if (num == 0) {
+					// No covered neighbour yet; leave the hole for a later step.
+					continue;
+				}
+
+				float average = 0.0f;
+				if (num == 3 && west && northwest && southwest) {
+					// Only one full side is covered: copy the pixel straight across.
+					average = channel[img->indexClamp(x-1, y)];
+				}
+				else if (num == 3 && east && northeast && southeast) {
+					average = channel[img->indexClamp(x+1, y)];
+				}
+				else if (num == 3 && north && northwest && northeast) {
+					average = channel[img->indexClamp(x, y-1)];
+				}
+				else if (num == 3 && south && southwest && southeast) {
+					average = channel[img->indexClamp(x, y+1)];
+				}
+				else {
+					// Unweighted mean of all covered neighbours.
+					float total = 0.0f;
+					if (west) { average += channel[img->indexClamp(x-1, y)]; total += 1; }
+					if (east) { average += channel[img->indexClamp(x+1, y)]; total += 1; }
+					if (north) { average += channel[img->indexClamp(x, y-1)]; total += 1; }
+					if (south) { average += channel[img->indexClamp(x, y+1)]; total += 1; }
+				
+					if (northwest) { average += channel[img->indexClamp(x-1, y-1)]; total += 1; }
+					if (northeast) { average += channel[img->indexClamp(x+1, y-1)]; total += 1; }
+					if (southwest) { average += channel[img->indexClamp(x-1, y+1)]; total += 1; }
+					if (southeast) { average += channel[img->indexClamp(x+1, y+1)]; total += 1; }
+					
+					average /= total;
+				}
+
+				channel[img->indexClamp(x, y)] = average;
+				newbmap.setBitAt(x, y);
+			}
+		}
+	}
+
+	// Update the bit mask.
+	swap(newbmap, *bmap);
+
+}
+
+// Apply fillExtrapolateOnce() to img and bmap n times in a row (n must be positive).
+void nv::fillExtrapolateNTimes(FloatImage * img, BitMap * bmap, int n)
+{
+	nvCheck(img != NULL);
+	nvCheck(bmap != NULL);
+	nvCheck(n > 0);
+
+	int remaining = n;
+	while (remaining > 0)
+	{
+		fillExtrapolateOnce(img, bmap);
+		--remaining;
+	}
+}
+
+
+namespace {
+
+	// Entry of the distance map: coordinates of the closest source pixel found
+	// so far and the squared distance to it.
+	struct Neighbor {
+		uint16 x;
+		uint16 y;
+		uint32 d;
+	};
+
+	// Squared euclidean distance between (ax, ay) and (bx, by).
+	static uint dist( uint16 ax, uint16 ay, uint16 bx, uint16 by ) {
+		const int deltaX = bx - ax;
+		const int deltaY = by - ay;
+		return uint(deltaX*deltaX + deltaY*deltaY);
+	}
+	
+	// Core of the EDT algorithm: adopt the source pixel recorded in n for the
+	// entry e located at (x, y) when it is strictly closer than the current one.
+	static void checkNeighbour( int x, int y, Neighbor * e, const Neighbor & n ) {
+		nvDebugCheck(e != NULL);
+		
+		const uint candidate = dist( x, y, n.x, n.y );
+		if( candidate >= e->d ) {
+			return;
+		}
+
+		e->x = n.x;
+		e->y = n.y;
+		e->d = candidate;
+	}
+
+} // namespace
+
+// Voronoi filling using EDT-4
+//
+// Every pixel whose bit is clear in bmap receives, for all components, the value
+// of a nearby pixel whose bit is set. The source pixel is found by propagating
+// (source coordinates, squared distance) entries across the image in a forward
+// and a backward sweep. img and bmap must have the same dimensions.
+void nv::fillVoronoi(FloatImage * img, const BitMap & bmap)
+{
+	nvCheck(img != NULL);
+
+	const int w = img->width();
+	const int h = img->height();
+	const int count = img->componentNum();
+
+	nvCheck(bmap.width() == uint(w));
+	nvCheck(bmap.height() == uint(h));
+
+	// One distance map entry per pixel.
+	Array<Neighbor> edm;
+	edm.resize(w * h);
+	
+	int x, y;
+	int x0, x1, y0, y1;
+
+	// Init edm.
+	for( y = 0; y < h; y++ ) {
+		for( x = 0; x < w; x++ ) {
+			if( bmap.bitAt(x, y) ) {
+				// Covered pixels are their own source, at distance zero.
+				edm[y * w + x].x = x;
+				edm[y * w + x].y = y;
+				edm[y * w + x].d = 0;
+			}
+			else {
+				// Holes start with the out-of-image sentinel (w, h) and a distance
+				// of w*w + h*h.
+				edm[y * w + x].x = w;
+				edm[y * w + x].y = h;
+				edm[y * w + x].d = w*w + h*h;
+			}
+		}
+	}
+	
+	// Forward sweep, top to bottom. Neighbour coordinates are clamped to the image.
+	for( y = 0; y < h; y++ ) {
+		// Left to right: look at the three entries in the row above and the one to the left.
+		for( x = 0; x < w; x++ ) {
+			x0 = clamp(x-1, 0, w-1);	// @@ Wrap?
+			x1 = clamp(x+1, 0, w-1);
+			y0 = clamp(y-1, 0, h-1);
+			
+			Neighbor & e = edm[y * w + x];
+			checkNeighbour(x, y, &e, edm[y0 * w + x0]);
+			checkNeighbour(x, y, &e, edm[y0 * w + x]);
+			checkNeighbour(x, y, &e, edm[y0 * w + x1]);
+			checkNeighbour(x, y, &e, edm[y * w + x0]);
+		}
+		
+		// Right to left over the same row: look at the entry to the right.
+		for( x = w-1; x >= 0; x-- ) {
+			x1 = clamp(x+1, 0, w-1);
+			
+			Neighbor & e = edm[y * w + x];
+			checkNeighbour(x, y, &e, edm[y * w + x1]);
+		}
+	}
+	
+	// Backward sweep, bottom to top (the mirror image of the forward sweep).
+	for( y = h-1; y >= 0; y-- ) {
+		// Right to left: look at the entry to the right and the three in the row below.
+		for( x = w-1; x >= 0; x-- ) {
+			x0 = clamp(x-1, 0, w-1);
+			x1 = clamp(x+1, 0, w-1);
+			y1 = clamp(y+1, 0, h-1);
+			
+			Neighbor & e = edm[y * w + x];
+			checkNeighbour(x, y, &e, edm[y * w + x1]);
+			checkNeighbour(x, y, &e, edm[y1 * w + x0]);
+			checkNeighbour(x, y, &e, edm[y1 * w + x]);
+			checkNeighbour(x, y, &e, edm[y1 * w + x1]);
+		}
+		
+		// Left to right over the same row: look at the entry to the left.
+		for( x = 0; x < w; x++ ) {
+			x0 = clamp(x-1, 0, w-1);
+			
+			Neighbor & e = edm[y * w + x];
+			checkNeighbour(x, y, &e, edm[y * w + x0]);
+		}
+	}
+	
+	// Fill empty holes.
+	for( y = 0; y < h; y++ ) {
+		for( x = 0; x < w; x++ ) {
+			const int sx = edm[y * w + x].x;
+			const int sy = edm[y * w + x].y;
+			// NOTE(review): if bmap has no bit set, every entry still holds the
+			// (w, h) sentinel here and this check fails.
+			nvDebugCheck(sx < w && sy < h);
+			
+			// Pixels that are their own source were covered to begin with.
+			if( sx != x || sy != y ) {
+				for(int c = 0; c < count; c++ ) {
+					img->setPixel(img->pixel(sx, sy, c), x, y, c);
+				}
+			}
+		}
+	}
+
+}
+
+
+// Hole filling by blurring. Not implemented yet: apart from checking img, this
+// function does nothing and bmap is unused.
+void nv::fillBlur(FloatImage * img, const BitMap & bmap)
+{
+	nvCheck(img != NULL);
+	
+	// @@ Apply a 3x3 kernel.
+}
+
+
+// Build the next coarser level of an image/mask pair for fillPullPush().
+//
+// Each destination texel covers a 2x2 block of src. Its mask bit is set when at
+// least one of the four source texels is covered, and its value is the mean of
+// the covered ones. Texels whose block is entirely holes are not written.
+//
+// Returns false, leaving *_dst and *_dstMask untouched, when src has no holes
+// or when either dimension is 2 or less. Otherwise returns true and stores the
+// newly allocated level, which the caller must delete.
+static bool downsample(const FloatImage * src, const BitMap * srcMask, const FloatImage ** _dst, const BitMap ** _dstMask)
+{
+	const uint w = src->width();
+	const uint h = src->height();
+	const uint count = src->componentNum();
+
+	// count holes in srcMask, return false if fully filled.
+	uint holes = 0;
+	for(uint y = 0; y < h; y++) {
+		for(uint x = 0; x < w; x++) {
+			holes += srcMask->bitAt(x, y) == 0;
+		}
+	}
+	// Stop when no holes or when the texture is very small. The size test is
+	// "<= 2" rather than "== 2" so that a dimension of 1 (or 0) never produces
+	// a level with zero width or height.
+	if (holes == 0 || (w <= 2 || h <= 2)) {
+		return false;
+	}
+
+	// Apply box filter to image and mask and return true.
+	// Odd dimensions are rounded down, so the last row/column is dropped.
+	const uint nw = w / 2;
+	const uint nh = h / 2;
+
+	FloatImage * dst = new FloatImage();
+	dst->allocate(count, nw, nh);
+	BitMap * dstMask = new BitMap(nw, nh);
+
+	for(uint c = 0; c < count; c++) {
+		for(uint y = 0; y < nh; y++) {
+			for(uint x = 0; x < nw; x++) {
+
+				// Corners of the 2x2 source block.
+				const uint x0 = 2 * x + 0;
+				const uint x1 = 2 * x + 1;
+				const uint y0 = 2 * y + 0;
+				const uint y1 = 2 * y + 1;
+
+				const float f0 = src->pixel(x0, y0, c);
+				const float f1 = src->pixel(x1, y0, c);
+				const float f2 = src->pixel(x0, y1, c);
+				const float f3 = src->pixel(x1, y1, c);
+
+				const bool b0 = srcMask->bitAt(x0, y0);
+				const bool b1 = srcMask->bitAt(x1, y0);
+				const bool b2 = srcMask->bitAt(x0, y1);
+				const bool b3 = srcMask->bitAt(x1, y1);
+
+				if (b0 || b1 || b2 || b3) {
+					// Set bit mask.
+					dstMask->setBitAt(x, y);
+
+					// Set pixel to the mean of the covered source texels only.
+					float value = 0.0f;
+					int total = 0;
+					if (b0) { value += f0; total++; }
+					if (b1) { value += f1; total++; }
+					if (b2) { value += f2; total++; }
+					if (b3) { value += f3; total++; }
+					dst->setPixel(value / total, x, y, c);
+				}
+			}
+		}
+	}
+
+	*_dst = dst;
+	*_dstMask = dstMask;
+
+	return true;
+}
+
+// This is the filter used in the Lumigraph paper. The Unreal engine uses something similar.
+//
+// Builds a chain of progressively halved images and masks with downsample(),
+// then, for every pixel of img, walks up the chain until a covered texel is
+// found and copies its value into img. Pixels covered at level 0 keep their
+// value. Neither img's ownership nor bmap is changed.
+void nv::fillPullPush(FloatImage * img, const BitMap & bmap)
+{
+	nvCheck(img != NULL);
+
+	const uint count = img->componentNum();
+	const uint w = img->width();
+	const uint h = img->height();
+	const uint num = log2(max(w,h));
+
+	// Build mipmap chain. Level 0 is the caller's image and mask.
+	Array<const FloatImage *> mipmaps(num);
+	Array<const BitMap *> mipmapMasks(num);
+
+	mipmaps.append(img);
+	mipmapMasks.append(&bmap);
+
+	const FloatImage * current;
+	const BitMap * currentMask;
+
+	// Compute mipmap chain.
+	while(downsample(mipmaps.back(), mipmapMasks.back(), &current, &currentMask))
+	{
+		mipmaps.append(current);
+		mipmapMasks.append(currentMask);
+	}
+
+	const uint levelCount = mipmaps.count();
+
+	// Sample mipmaps until non-hole is found.
+	for(uint y = 0; y < h; y++) {
+		for(uint x = 0; x < w; x++) {
+
+			uint sx = x;
+			uint sy = y;
+
+			for(uint l = 0; l < levelCount; l++) {
+
+				const BitMap * mask = mipmapMasks[l];
+
+				// downsample() rounds odd dimensions down, so the last row or
+				// column may have no texel in the coarser levels.
+				if (sx >= mask->width() || sy >= mask->height())
+				{
+					break;
+				}
+
+				if (mask->bitAt(sx, sy))
+				{
+					// Sample mipmaps[l](sx, sy) and copy to img(x, y)
+					for(uint c = 0; c < count; c++) {
+						img->setPixel(mipmaps[l]->pixel(sx, sy, c), x, y, c);
+					}
+					break;
+				}
+
+				sx /= 2;
+				sy /= 2;
+			}
+		}
+	}
+
+	// Only the levels created by downsample() are owned here. Level 0 is the
+	// caller's image and mask, so clear those slots before deleting the chain
+	// (deleting a null pointer is a no-op).
+	mipmaps[0] = NULL;
+	mipmapMasks[0] = NULL;
+
+	deleteAll(mipmaps);
+	deleteAll(mipmapMasks);
+}
+
+
+/*
+void nv::fillSeamFix(FloatImage * img, const BitMap & bmap)
+{
+}
+*/
+#if 0 // Code below is under the BPL license.
+
+
+/**
+
+DoPixelSeamFix
+10-20-02
+
+Looks in the 5x5 local neighborhood (LocalPixels) of the desired pixel to fill.
+It tries to build a quadratic model of the neighborhood surface to use in
+extrapolating.  You need 5 pixels to establish a 2d quadratic curve.
+
+This is really just a nice generic way to extrapolate pixels.  It also happens
+to work great for seam-fixing.
+
+Note that I'm working on normals, but I treat them just as 3 scalars and normalize
+at the end.  To be more correct, I would work on the surface of a sphere, but that
+just seems like way too much work.
+
+**/
+
+// 5x5 neighbourhood around a pixel to fill, plus the accumulators the Try*()
+// methods add their estimates to. This code is disabled (#if 0).
+// NOTE(review): data/result are declared float but the methods use gVec4
+// (e.g. DoLocalPixelFill assigns gVec4::zero to result) -- presumably the
+// declarations were meant to be gVec4; confirm before re-enabling.
+struct LocalPixels
+{
+	// 5x5 neighborhood
+	// the center is at result
+	// index [y][x]
+	bool fill[5][5];
+	float data[5][5];
+	mutable float result;
+	mutable float weight;
+
+
+	// Second difference (p[0] - 2*p[1] + p[2]) of three consecutive filled pixels
+	// in the given row; tries columns 1..3, then 0..2, then 2..4.
+	// Returns false when no such triple is filled.
+	bool Quad3SubH(gVec4 * pQ,int row) const
+	{
+		const bool * pFill = fill[row];
+		const float * pDat = data[row];
+	
+		if ( pFill[1] && pFill[2] && pFill[3] )
+		{
+			// good row
+			*pQ = pDat[1] - 2.f * pDat[2] + pDat[3];
+			return true;
+		}
+		else if ( pFill[0] && pFill[1] && pFill[2] )
+		{
+			// good row
+			*pQ = pDat[0] - 2.f * pDat[1] + pDat[2];
+			return true;
+		}
+		else if ( pFill[2] && pFill[3] && pFill[4] )
+		{
+			// good row
+			*pQ = pDat[2] - 2.f * pDat[3] + pDat[4];
+			return true;
+		}
+		return false;
+	}
+
+	// Same as Quad3SubH, but along the given column: rows 1..3, then 0..2, then 2..4.
+	bool Quad3SubV(gVec4 * pQ,int col) const
+	{
+		if ( fill[1][col] && fill[2][col] && fill[3][col] )
+		{
+			// good column
+			*pQ = data[1][col] - 2.f * data[2][col] + data[3][col];
+			return true;
+		}
+		else if ( fill[0][col] && fill[1][col] && fill[2][col] )
+		{
+			// good column
+			*pQ = data[0][col] - 2.f * data[1][col] + data[2][col];
+			return true;
+		}
+		else if ( fill[2][col] && fill[3][col] && fill[4][col] )
+		{
+			// good column
+			*pQ = data[2][col] - 2.f * data[3][col] + data[4][col];
+			return true;
+		}
+		return false;
+	}
+	
+	// Horizontal second difference taken from rows 1 and 3: the average of both
+	// when both are available, otherwise whichever one is.
+	bool Quad3H(gVec4 * pQ) const
+	{
+		if ( ! Quad3SubH(pQ,1) )
+		{
+			return Quad3SubH(pQ,3);	
+		}
+		gVec4 q(0,0,0,0); // initializer not needed, just make it shut up
+		if ( Quad3SubH(&q,3) )
+		{
+			// got q and pQ
+			*pQ = (*pQ+q)*0.5f;
+		}
+		return true;
+	}
+	
+	// Vertical second difference taken from columns 1 and 3, combined like Quad3H.
+	bool Quad3V(gVec4 * pQ) const
+	{
+		if ( ! Quad3SubV(pQ,1) )
+		{
+			return Quad3SubV(pQ,3);	
+		}
+		gVec4 q(0,0,0,0); // initializer not needed, just make it shut up
+		if ( Quad3SubV(&q,3) )
+		{
+			// got q and pQ
+			*pQ = (*pQ+q)*0.5f;
+		}
+		return true;
+	}
+	// Quad returns ([0]+[2] - 2.f*[1])
+	//	a common want is [1] - ([0]+[2])*0.5f ;
+	// so use -0.5f*Quad
+
+	// Quadratic estimates of the center. Each successful estimate is added to
+	// result and bumps weight by one. Returns true if any estimate was made.
+	bool TryQuads() const
+	{
+		bool res = false;
+	
+		// look for a pair that straddles the middle:
+		if ( fill[2][1] && fill[2][3] )
+		{
+			// got horizontal straddle
+			gVec4 q;
+			if ( Quad3H(&q) )
+			{
+				result += (data[2][1] + data[2][3] - q) * 0.5f;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		if ( fill[1][2] && fill[3][2] )
+		{
+			// got vertical straddle
+			gVec4 q;
+			if ( Quad3V(&q) )
+			{
+				result += (data[1][2] + data[3][2] - q) * 0.5f;
+				weight += 1.f;
+				res = true;
+			}
+		}
+	
+		// look for pairs that lead into the middle :
+		if ( fill[2][0] && fill[2][1] )
+		{
+			// got left-side pair
+			gVec4 q;
+			if ( Quad3H(&q) )
+			{
+				result += data[2][1]*2.f - data[2][0] + q;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		if ( fill[2][3] && fill[2][4] )
+		{
+			// got right-side pair
+			gVec4 q;
+			if ( Quad3H(&q) )
+			{
+				result += data[2][3]*2.f - data[2][4] + q;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		if ( fill[0][2] && fill[1][2] )
+		{
+			// got pair above the center (rows 0 and 1)
+			gVec4 q;
+			if ( Quad3V(&q) )
+			{
+				result += data[1][2]*2.f - data[0][2] + q;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		if ( fill[3][2] && fill[4][2] )
+		{
+			// got pair below the center (rows 3 and 4)
+			gVec4 q;
+			if ( Quad3V(&q) )
+			{
+				result += data[3][2]*2.f - data[4][2] + q;
+				weight += 1.f;
+				res = true;
+			}
+		}
+		return res;
+	}
+	
+	// Planar estimates: for each corner of the 3x3 block around the center, if the
+	// two adjacent edge pixels and the corner are filled, add edge + edge - corner.
+	bool TryPlanar() const
+	{
+		// four cases :
+		// each row lists three [y][x] pairs: the two pixels added and the one subtracted
+		const int indices[] =
+		{
+			2,1, 1,2, 1,1,
+			2,1, 3,2, 3,1,
+			2,3, 1,2, 1,3,
+			2,3, 3,2, 3,3
+		};
+		bool res = false;
+		for(int i=0;i<4;i++)
+		{
+			const int * I = indices + i*6;
+			if ( ! fill[ I[0] ][ I[1] ] )
+				continue;
+			if ( ! fill[ I[2] ][ I[3] ] )
+				continue;
+			if ( ! fill[ I[4] ][ I[5] ] )
+				continue;
+	
+			result += data[ I[0] ][ I[1] ] + data[ I[2] ][ I[3] ] - data[ I[4] ][ I[5] ];
+			weight += 1.f;
+			res = true;
+		}
+		return res;
+	}
+	
+	// Two-pixel estimates: the midpoint of a pair straddling the center, or a
+	// linear extrapolation (2*near - far) from a pair on one side.
+	bool TryTwos() const
+	{
+		bool res = false;
+	
+		if ( fill[2][1] && fill[2][3] )
+		{
+			result += (data[2][1] + data[2][3]) * 0.5f;
+			weight += 1.f;
+			res = true;
+		}
+		if ( fill[1][2] && fill[3][2] )
+		{
+			result += (data[1][2] + data[3][2]) * 0.5f;
+			weight += 1.f;
+			res = true;
+		}
+		
+		// four side-rotates :
+		// each row lists two [y][x] pairs: the near pixel and the far pixel
+		const int indices[] =
+		{
+			2,1, 2,0,
+			2,3, 2,4,
+			1,2, 0,2,
+			3,2, 4,2,
+		};
+		for(int i=0;i<4;i++)
+		{
+			const int * I = indices + i*4;
+			if ( ! fill[ I[0] ][ I[1] ] )
+				continue;
+			if ( ! fill[ I[2] ][ I[3] ] )
+				continue;
+	
+			result += data[ I[0] ][ I[1] ]*2.f - data[ I[2] ][ I[3] ];
+			weight += 1.f;
+			res = true;
+		}
+	
+		return res;
+	}
+
+
+	// Reset the accumulators and try the estimators from most to least
+	// elaborate, stopping at the first one that produces something.
+	// result holds the sum of the estimates and weight their number.
+	bool DoLocalPixelFill() const
+	{
+		result = gVec4::zero;
+		weight = 0.f;
+	
+		if ( TryQuads() )
+			return true;
+			
+		if ( TryPlanar() )
+			return true;
+	
+		return TryTwos();
+	}
+
+}; // LocalPixels -----------------------------------------------
+
+// Seam fixing over the whole normal map (disabled code, see #if 0 above).
+// Runs NUM_SEAMFIX_PASSES passes. In each pass every pixel whose state is eNull
+// or eEdge gets a value extrapolated from its 5x5 neighbourhood by LocalPixels,
+// and afterwards such pixels that are no longer null are marked eSeamFixed.
+// Progress dots are printed to the log along the way.
+void gNormalMap::DoPixelSeamFix()
+{
+	gLog::Printf("gNormalMap::DoPixelSeamFix..");
+
+	// Print a dot every heightPerTick processed rows.
+	const int desiredTicks = 30;
+	const int heightPerTick = NUM_SEAMFIX_PASSES * m_height / desiredTicks;
+	int tick = 0;
+
+	for(int pass=0;pass<NUM_SEAMFIX_PASSES;pass++)
+	{
+		for(int yb=0;yb<m_height;yb++)
+		{
+			gVec4 * pRow = m_normals + m_width * yb;
+			const EState * pStateRow = m_states + m_width * yb;
+			for(int xb=0;xb<m_width;xb++)
+			{
+				if ( pStateRow[xb] != eNull && pStateRow[xb] != eEdge )
+				{
+					ASSERT( ! IsNull(pRow[xb]) );
+					continue; // it's got a pixel
+				}
+				// can be non-null, if it wasn't actually inside any tri,
+				//	but got the anti-aliased edge effect of a tri
+				// replace edge pixels with seam-fix here
+				//ASSERT( IsNull(pRow[xb]) );
+
+				// make the local neighborhood:
+				int numFill = 0;
+				LocalPixels lp;
+				for(int ny=0;ny<5;ny++)
+				{
+					int y = (yb + ny - 2);
+					if ( y < 0 || y >= m_height )
+					{
+						// out of range
+						for(int i=0;i<5;i++)
+						{
+							lp.fill[ny][i] = false;
+						}
+						continue;
+					}
+					// these shadow the outer pRow/pStateRow with the neighbour's row
+					gVec4 * pRow = m_normals + m_width * y;
+					const EState * pStateRow = m_states + m_width * y;
+					for(int nx=0;nx<5;nx++)
+					{
+						int x = (xb + nx - 2);
+						if ( x < 0 || x >= m_width )
+						{
+							lp.fill[ny][nx] = false;
+						}
+						else if ( pStateRow[x] == eNull || pStateRow[x] == eEdge )
+						{
+							lp.fill[ny][nx] = false;
+						}
+						else
+						{
+							lp.fill[ny][nx] = true;
+							lp.data[ny][nx] = pRow[x];
+							numFill++;
+						}
+					}
+				}
+
+				// skip pixels with fewer than 2 filled neighbours
+				// NOTE(review): the original comment asked for at least 3, but the
+				// code only requires 2 -- confirm which one is intended.
+				if ( numFill < 2 )
+					continue;
+
+				ASSERT(lp.fill[2][2] == false);
+				if ( lp.DoLocalPixelFill() )
+				{
+					// only store the result if its xyz part can be normalized
+					if ( lp.result.MutableVec3().NormalizeSafe() )
+					{
+						pRow[xb] = lp.result;
+						pRow[xb][3] /= lp.weight;
+					}
+				}
+			}
+
+			if ( ++tick == heightPerTick )
+			{
+				tick = 0;
+				gLog::Printf(".");
+			}
+		}
+
+		// now run back over and stamp anything that's not null as being ok
+
+		for(int y=0;y<m_height;y++)
+		{
+			const gVec4 * pRow = m_normals + m_width * y;
+			EState * pStateRow = m_states + m_width * y;
+			for(int x=0;x<m_width;x++)
+			{
+				if ( ( pStateRow[x] == eNull || pStateRow[x] == eEdge ) && ! IsNull(pRow[x]) )
+				{
+					pStateRow[x] = eSeamFixed;
+				}
+			}
+		}
+	}
+	
+	gLog::Printf("done\n");
+}
+
+#endif // 0
--- a/src/nvimage/HoleFilling.h
+++ b/src/nvimage/HoleFilling.h
@ -0,0 +1,96 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_HOLEFILLING_H
+#define NV_IMAGE_HOLEFILLING_H
+
+#include <nvcore/BitArray.h>
+#include <nvimage/nvimage.h>
+
+namespace nv 
+{
+	class FloatImage;
+
+	/// Bit mask.
+	///
+	/// Fixed-size two dimensional array of bits stored row by row in a BitArray
+	/// (bit (x, y) lives at index y * width + x). The dimensions are set at
+	/// construction and never change.
+	class BitMap
+	{
+	public:
+		/// Create a bitmap of @a w by @a h bits.
+		BitMap(uint w, uint h) : 
+			m_width(w), m_height(h), m_bitArray(w*h) 
+		{
+		}
+		
+		// The dimensions are returned by value; a const qualifier on such a
+		// return type has no effect and only triggers compiler warnings.
+		uint width() const { return m_width; }
+		uint height() const { return m_height; }
+		
+		/// Read the bit at (@a x, @a y).
+		bool bitAt(uint x, uint y) const
+		{
+			nvDebugCheck(x < m_width && y < m_height);
+			return m_bitArray.bitAt(y * m_width + x);
+		}
+		/// Read the bit at linear index @a idx.
+		bool bitAt(uint idx) const
+		{
+			return m_bitArray.bitAt(idx);
+		}
+	
+		/// Set the bit at (@a x, @a y).
+		void setBitAt(uint x, uint y)
+		{
+			nvDebugCheck(x < m_width && y < m_height);
+			m_bitArray.setBitAt(y * m_width + x);
+		}
+		/// Set the bit at linear index @a idx.
+		void setBitAt(uint idx)
+		{
+			m_bitArray.setBitAt(idx);
+		}
+	
+		/// Clear the bit at (@a x, @a y).
+		void clearBitAt(uint x, uint y)
+		{
+			nvDebugCheck(x < m_width && y < m_height);
+			m_bitArray.clearBitAt(y * m_width + x);
+		}
+		/// Clear the bit at linear index @a idx.
+		void clearBitAt(uint idx)
+		{
+			m_bitArray.clearBitAt(idx);
+		}
+	
+		/// Clear every bit.
+		void clearAll()
+		{
+			m_bitArray.clearAll();
+		}
+	
+		/// Set every bit.
+		void setAll()
+		{
+			m_bitArray.setAll();
+		}
+	
+		/// Invert every bit.
+		void toggleAll()
+		{
+			m_bitArray.toggleAll();
+		}
+		
+		/// Exchange the contents of two bitmaps. Both must have the same
+		/// dimensions, because the (const) dimensions themselves are not swapped.
+		friend void swap(BitMap & a, BitMap & b)
+		{
+			nvCheck(a.m_width == b.m_width);
+			nvCheck(a.m_height == b.m_height);
+			//swap(const_cast<uint &>(a.m_width), const_cast<uint &>(b.m_width));
+			//swap(const_cast<uint &>(a.m_height), const_cast<uint &>(b.m_height));
+			swap(a.m_bitArray, b.m_bitArray);
+		}
+		
+	private:
+		
+		const uint m_width;
+		const uint m_height;
+		BitArray m_bitArray;
+		
+	};
+
+	// Hole filling functions. A set bit in the bitmap marks a valid pixel, a
+	// clear bit marks a hole. These three take the mask by const reference.
+	NVIMAGE_API void fillVoronoi(FloatImage * img, const BitMap & bmap);
+	NVIMAGE_API void fillBlur(FloatImage * img, const BitMap & bmap);
+	NVIMAGE_API void fillPullPush(FloatImage * img, const BitMap & bmap);
+	
+	// These two take the mask by pointer and update it.
+	NVIMAGE_API void fillExtrapolateOnce(FloatImage * img, BitMap * bmap);
+	NVIMAGE_API void fillExtrapolateNTimes(FloatImage * img, BitMap * bmap, int n);
+	
+} // nv namespace
+
+#endif // NV_IMAGE_HOLEFILLING_H
--- a/src/nvimage/Image.cpp
+++ b/src/nvimage/Image.cpp
@ -0,0 +1,125 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/Debug.h>
+#include <nvcore/Ptr.h>
+
+#include <nvmath/Color.h>
+
+#include <nvimage/Image.h>
+#include <nvimage/ImageIO.h>
+
+
+using namespace nv;
+
+/// Create an empty image: zero size, RGB format and no pixel buffer.
+Image::Image() : m_width(0), m_height(0), m_format(Format_RGB), m_data(NULL)
+{
+}
+
+/// Release the pixel buffer (see free()).
+Image::~Image()
+{
+	free();
+}
+
+/// Discard the current pixel buffer and allocate a new one of @a w by @a h pixels.
+void Image::allocate(uint w, uint h)
+{
+	free();
+
+	m_width = w;
+	m_height = h;
+
+	const uint pixelCount = w*h;
+	m_data = new Color32[pixelCount];
+}
+
+/// Replace this image with the one loaded from file @a name by ImageIO::load().
+/// The current pixel buffer is released first, even when loading fails.
+/// Returns false if no image could be loaded.
+bool Image::load(const char * name)
+{
+	free();
+	
+	AutoPtr<Image> loaded(ImageIO::load(name));
+	if (loaded == NULL) {
+		return false;
+	}
+	
+	// Take over the loaded image's state; the temporary keeps this image's
+	// previous fields, whose buffer was already released above.
+	swap(m_data, loaded->m_data);
+	swap(m_format, loaded->m_format);
+	swap(m_width, loaded->m_width);
+	swap(m_height, loaded->m_height);
+	
+	return true;
+}
+
+/// Release the current pixel buffer and use the external buffer @a data as
+/// the pixels of a @a w by @a h image. Call unwrap() to detach it again.
+void Image::wrap(void * data, uint w, uint h)
+{
+	free();
+
+	m_width = w;
+	m_height = h;
+	m_data = (Color32 *)data;
+}
+
+/// Forget the current pixel buffer without releasing it and reset the size to zero.
+void Image::unwrap()
+{
+	m_width = 0;
+	m_height = 0;
+	m_data = NULL;
+}
+
+
+/// Release the pixel buffer. The width, height and format are left untouched.
+void Image::free()
+{
+	// The buffer is created with new[] in allocate(), so it has to be released
+	// with the array form of delete.
+	delete [] m_data;
+	m_data = NULL;
+}
+
+
+/// Width of the image in pixels.
+uint Image::width() const
+{
+	return m_width;
+}
+
+/// Height of the image in pixels.
+uint Image::height() const
+{
+	return m_height;
+}
+
+/// Read-only pointer to the first pixel of row @a h.
+const Color32 * Image::scanline(uint h) const
+{
+	nvDebugCheck(h < m_height);
+	const uint rowStart = h * m_width;
+	return m_data + rowStart;
+}
+
+/// Writable pointer to the first pixel of row @a h.
+Color32 * Image::scanline(uint h)
+{
+	nvDebugCheck(h < m_height);
+	const uint rowStart = h * m_width;
+	return m_data + rowStart;
+}
+
+/// Read-only pointer to the pixel buffer (NULL when there is none).
+const Color32 * Image::pixels() const
+{
+	return m_data;
+}
+
+/// Writable pointer to the pixel buffer (NULL when there is none).
+Color32 * Image::pixels()
+{
+	return m_data;
+}
+
+/// Read-only reference to the pixel at linear index @a idx (row-major order).
+const Color32 & Image::pixel(uint idx) const
+{
+	nvDebugCheck(idx < m_width * m_height);
+	const Color32 * const texel = m_data + idx;
+	return *texel;
+}
+
+/// Writable reference to the pixel at linear index @a idx (row-major order).
+Color32 & Image::pixel(uint idx)
+{
+	nvDebugCheck(idx < m_width * m_height);
+	Color32 * const texel = m_data + idx;
+	return *texel;
+}
+
+
+/// Format tag currently associated with the image.
+Image::Format Image::format() const
+{
+	return m_format;
+}
+
+/// Change the format tag. Only the tag is stored; the pixel data is not touched.
+void Image::setFormat(Image::Format f)
+{
+	m_format = f;
+}
+
--- a/src/nvimage/Image.h
+++ b/src/nvimage/Image.h
@ -0,0 +1,77 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_IMAGE_H
+#define NV_IMAGE_IMAGE_H
+
+#include <nvcore/Debug.h>
+#include <nvimage/nvimage.h>
+
+namespace nv
+{
+	class Color32;
+	
+	/// 32 bit RGBA image.
+	///
+	/// Holds a width * height array of Color32 pixels in row-major order
+	/// (pixel (x, y) is at index y * width + x) together with a format tag.
+	class Image
+	{
+	public:
+		
+		/// Format tag stored with the image (see format() and setFormat()).
+		enum Format 
+		{
+			Format_RGB,
+			Format_ARGB,
+		};
+		
+		NVIMAGE_API Image();
+		NVIMAGE_API ~Image();
+		
+		// Obtain a pixel buffer: allocate a new one or load it from a file.
+		NVIMAGE_API void allocate(uint w, uint h);
+		NVIMAGE_API bool load(const char * name);
+		
+		// Attach to / detach from a pixel buffer supplied by the caller.
+		NVIMAGE_API void wrap(void * data, uint w, uint h);
+		NVIMAGE_API void unwrap();
+		
+		NVIMAGE_API uint width() const;
+		NVIMAGE_API uint height() const;
+		
+		// Pointer to the first pixel of row h.
+		NVIMAGE_API const Color32 * scanline(uint h) const;
+		NVIMAGE_API Color32 * scanline(uint h);
+		
+		// Pointer to the whole pixel buffer.
+		NVIMAGE_API const Color32 * pixels() const;
+		NVIMAGE_API Color32 * pixels();
+		
+		// Pixel access by linear index.
+		NVIMAGE_API const Color32 & pixel(uint idx) const;
+		NVIMAGE_API Color32 & pixel(uint idx);
+		
+		// Pixel access by coordinates (inline, defined after the class).
+		const Color32 & pixel(uint x, uint y) const;
+		Color32 & pixel(uint x, uint y);
+		
+		NVIMAGE_API Format format() const;
+		NVIMAGE_API void setFormat(Format f);
+		
+	private:
+		// Releases the pixel buffer.
+		void free();
+		
+	private:
+		uint m_width;		// Width in pixels.
+		uint m_height;		// Height in pixels.
+		Format m_format;	// Format tag.
+		Color32 * m_data;	// Pixel buffer, NULL when the image has none.
+	};
+
+
+	/// Read-only reference to the pixel at (@a x, @a y).
+	inline const Color32 & Image::pixel(uint x, uint y) const
+	{
+		nvDebugCheck(x < width() && y < height());
+		const uint idx = y * width() + x;
+		return pixel(idx);
+	}
+	
+	/// Writable reference to the pixel at (@a x, @a y).
+	inline Color32 & Image::pixel(uint x, uint y)
+	{
+		nvDebugCheck(x < width() && y < height());
+		const uint idx = y * width() + x;
+		return pixel(idx);
+	}
+
+} // nv namespace
+
+
+#endif // NV_IMAGE_IMAGE_H
--- a/src/nvimage/ImageIO.cpp
+++ b/src/nvimage/ImageIO.cpp
@ -0,0 +1,863 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/Ptr.h>
+#include <nvcore/Containers.h>
+#include <nvcore/StrLib.h>
+#include <nvcore/StdStream.h>
+
+#include <nvmath/Color.h>
+
+#include "ImageIO.h"
+#include "Image.h"
+#include "FloatImage.h"
+#include "TgaFile.h"
+
+// Extern
+#if defined(HAVE_JPEG)
+extern "C" {
+#	include <jpeglib.h>
+}
+#endif
+
+#if defined(HAVE_PNG)
+#	include <png.h>
+#endif
+
+#if defined(HAVE_TIFF)
+#	define _TIFF_DATA_TYPEDEFS_
+#	include <tiffio.h>
+#endif
+
+using namespace nv;
+
+namespace {
+
+	// Array of image load plugins.
+//	static HashMap<String, ImageInput_Plugin> s_plugin_load_map;
+
+	// Array of image save plugins.
+//	static HashMap<String, ImageOutput_Plugin> s_plugin_save_map;
+	
+	// 15 bit color made of three 5 bit fields packed into a 16 bit word,
+	// declared in the order blue, green, red.
+	// NOTE(review): bit-field placement is implementation defined, so mapping
+	// this struct over raw file bytes relies on the compiler allocating 'b' in
+	// the least significant bits on a little endian target.
+	struct Color555 {
+		uint16 b : 5;
+		uint16 g : 5;
+		uint16 r : 5;
+	};
+	
+} // namespace
+
+
+/// Load the image stored in the file @a name.
+///
+/// Opens the file for reading and forwards to the stream overload, which
+/// selects the decoder from the file extension.
+/// @return The decoded image, or NULL when the file cannot be opened or no
+/// decoder produced an image.
+Image * nv::ImageIO::load(const char * name)
+{
+	StdInputStream stream(name);
+	
+	if (stream.isError()) {
+		// This function returns a pointer, so failure is reported with NULL.
+		// ('false' is not a valid null pointer constant since C++11.)
+		return NULL;
+	}
+	
+	return load(name, stream);
+}
+
+/// Decode an image from @a s, choosing the decoder from the extension of @a name.
+///
+/// TGA is always available; JPEG and PNG only when the library was built with
+/// HAVE_JPEG / HAVE_PNG. The comparison ignores case.
+/// @return The decoded image, or NULL when the extension is not recognized.
+Image * nv::ImageIO::load(const char * name, Stream & s)
+{
+	const char * const ext = Path::extension(name);
+	
+	const bool isTga = (strCaseCmp(ext, ".tga") == 0);
+	if (isTga) {
+		return loadTGA(s);
+	}
+#if defined(HAVE_JPEG)
+	const bool isJpeg = (strCaseCmp(ext, ".jpg") == 0) || (strCaseCmp(ext, ".jpeg") == 0);
+	if (isJpeg) {
+		return loadJPG(s);
+	}
+#endif
+#if defined(HAVE_PNG)
+	const bool isPng = (strCaseCmp(ext, ".png") == 0);
+	if (isPng) {
+		return loadPNG(s);
+	}
+#endif
+	// @@ use image plugins?
+
+	// No decoder is registered for this extension.
+	return NULL;
+}
+
+
+/// Load TGA image.
+///
+/// Decodes uncompressed and RLE compressed TGA data that is 8 bit paletted
+/// (with a 24 bit palette), 8 bit greyscale, or 16, 24 or 32 bit true color.
+/// Greyscale and 32 bit images are tagged Format_ARGB.
+/// @param s  Stream positioned at the start of the TGA header.
+/// @return A newly allocated image owned by the caller, or NULL when the file
+/// uses an image type, pixel size or dimensions this loader cannot handle.
+Image * nv::ImageIO::loadTGA(Stream & s)
+{
+	nvCheck(!s.isError());
+	
+	TgaHeader tga;
+	s << tga;
+	// Skip the optional image id that follows the fixed size header.
+	s.seek(TgaHeader::Size + tga.id_length);
+
+	// Get header info.
+	bool rle = false;
+	bool pal = false;
+	bool rgb = false;
+	bool grey = false;
+
+	switch( tga.image_type ) {
+		case TGA_TYPE_RLE_INDEXED:
+			rle = true;
+			// no break is intended!
+		case TGA_TYPE_INDEXED:
+			if( tga.colormap_type!=1 || tga.colormap_size!=24 || tga.colormap_length>256 ) {
+				nvDebug( "*** ImageIO::loadTGA: Error, only 24bit paletted images are supported.\n" );
+				return NULL;
+			}
+			pal = true;
+			break;
+
+		case TGA_TYPE_RLE_RGB:
+			rle = true;
+			// no break is intended!
+		case TGA_TYPE_RGB:
+			rgb = true;
+			break;
+
+		case TGA_TYPE_RLE_GREY:
+			rle = true;
+			// no break is intended!
+		case TGA_TYPE_GREY:
+			grey = true;
+			break;
+
+		default:
+			nvDebug( "*** ImageIO::loadTGA: Error, unsupported image type.\n" );
+			return NULL;
+	}
+
+	// The conversion loops below read exactly one byte per pixel for paletted
+	// and greyscale data and 2, 3 or 4 bytes for true color. Reject anything
+	// else up front: it would read outside the decode buffer or leave the
+	// image uninitialized.
+	bool supported;
+	if( rgb ) {
+		supported = (tga.pixel_size == 16 || tga.pixel_size == 24 || tga.pixel_size == 32);
+	}
+	else {
+		supported = (tga.pixel_size == 8);
+	}
+	if( !supported ) {
+		nvDebug( "*** ImageIO::loadTGA: Error, unsupported pixel size.\n" );
+		return NULL;
+	}
+
+	const uint pixel_size = (tga.pixel_size/8);
+	nvDebugCheck(pixel_size >= 1 && pixel_size <= 4);
+	
+	// Width and height are 16 bit, so their product fits in 32 unsigned bits,
+	// but the byte count may not. Keep it within the range of a signed int.
+	const uint pixel_count = uint(tga.width) * uint(tga.height);
+	if( pixel_count > 0x7FFFFFFFu / pixel_size ) {
+		nvDebug( "*** ImageIO::loadTGA: Error, image is too large.\n" );
+		return NULL;
+	}
+	const uint size = pixel_count * pixel_size;
+
+	
+	// Read palette. Entries past colormap_length stay black.
+	uint8 palette[768] = { 0 };
+	if( pal ) {
+		// 256 entries are accepted by the header check above and fit the buffer.
+		nvDebugCheck(tga.colormap_length <= 256);
+		s.serialize(palette, 3 * tga.colormap_length);
+	}
+
+	// Decode image.
+	uint8 * mem = new uint8[size];
+	if( rle ) {
+		// Decompress image in src.
+		uint8 * dst = mem;
+		uint remaining = size;	// Bytes of 'mem' that are still to be filled.
+
+		while (remaining > 0) {
+			// Get packet header
+			uint8 c = 0;
+			s << c;
+
+			// Packet length in bytes. A malformed packet may describe more
+			// pixels than the image has left; clamp it so that the writes stay
+			// inside 'mem'. 'remaining' is always a multiple of pixel_size.
+			uint count = ((c & 0x7f) + 1) * pixel_size;
+			if (count > remaining) {
+				count = remaining;
+			}
+			remaining -= count;
+
+			if (c & 0x80) {
+				// RLE pixels.
+				uint8 pixel[4];	// uint8 pixel[pixel_size];
+				s.serialize( pixel, pixel_size );
+				for (uint i = 0; i < count; i += pixel_size) {
+					memcpy(dst + i, pixel, pixel_size);
+				}
+			}
+			else {
+				// Raw pixels.
+				s.serialize(dst, count);
+			}
+			dst += count;
+		}
+	}
+	else {
+		s.serialize(mem, size);
+	}
+
+	// Allocate image.
+	AutoPtr<Image> img(new Image());
+	img->allocate(tga.width, tga.height);
+
+	// Files stored bottom-up are written starting from the last row of the
+	// image and stepping backwards.
+	int lstep;
+	Color32 * dst;
+	if( tga.flags & TGA_ORIGIN_UPPER ) {
+		lstep = tga.width;
+		dst = img->pixels();
+	}
+	else {
+		lstep = - tga.width;
+		dst = img->pixels() + (tga.height-1) * tga.width;
+	}
+
+	// Write image.
+	uint8 * src = mem;
+	if( pal ) {
+		for( int y = 0; y < tga.height; y++ ) {
+			for( int x = 0; x < tga.width; x++ ) {
+				uint8 idx = *src++;
+				dst[x].setBGRA(palette[3*idx+0], palette[3*idx+1], palette[3*idx+2], 0xFF);
+			}
+			dst += lstep;
+		}
+	}
+	else if( grey ) {
+		img->setFormat(Image::Format_ARGB);
+		
+		// The grey value is replicated into all four channels.
+		for( int y = 0; y < tga.height; y++ ) {
+			for( int x = 0; x < tga.width; x++ ) {
+				dst[x].setBGRA(*src, *src, *src, *src);
+				src++;
+			}
+			dst += lstep;
+		}
+	}
+	else {
+		
+		if( tga.pixel_size == 16 ) {
+			for( int y = 0; y < tga.height; y++ ) {
+				for( int x = 0; x < tga.width; x++ ) {
+					Color555 c = *reinterpret_cast<Color555 *>(src);
+					// Expand 5 bits to 8 by replicating the top bits.
+					uint8 b = (c.b << 3) | (c.b >> 2);					
+					uint8 g = (c.g << 3) | (c.g >> 2);
+					uint8 r = (c.r << 3) | (c.r >> 2);
+					dst[x].setBGRA(b, g, r, 0xFF);
+					src += 2;
+				}
+				dst += lstep;
+			}
+		}
+		else if( tga.pixel_size == 24 ) {
+			for( int y = 0; y < tga.height; y++ ) {
+				for( int x = 0; x < tga.width; x++ ) {
+					dst[x].setBGRA(src[0], src[1], src[2], 0xFF);
+					src += 3;
+				}
+				dst += lstep;
+			}
+		}
+		else if( tga.pixel_size == 32 ) {
+			img->setFormat(Image::Format_ARGB);
+			
+			for( int y = 0; y < tga.height; y++ ) {
+				for( int x = 0; x < tga.width; x++ ) {
+					dst[x].setBGRA(src[0], src[1], src[2], src[3]);
+					src += 4;
+				}
+				dst += lstep;
+			}
+		}
+	}
+
+	// free uncompressed data.
+	delete [] mem;
+
+	return img.release();
+}
+
+/// Save TGA image.
+///
+/// Writes @a img to @a s as an uncompressed true color TGA with the origin
+/// in the upper left corner: 32 bits per pixel for Format_ARGB images and
+/// 24 bits per pixel otherwise.
+/// @return true on success, false when the image does not fit the 16 bit
+/// dimensions of the TGA header.
+bool nv::ImageIO::saveTGA(Stream & s, const Image * img)
+{
+	nvCheck(!s.isError());
+	nvCheck(img != NULL);
+	nvCheck(img->pixels() != NULL);
+	
+	// The header stores width and height in 16 bits. Larger values would be
+	// truncated, and the buffer sized from the truncated header would then be
+	// overrun by the copy loops below.
+	if (img->width() > 0xFFFF || img->height() > 0xFFFF) {
+		nvDebug( "*** ImageIO::saveTGA: Error, image is too large for the TGA format.\n" );
+		return false;
+	}
+	
+	const bool hasAlpha = (img->format() == Image::Format_ARGB);
+	
+	TgaFile tga;
+	tga.head.id_length = 0;
+	tga.head.colormap_type = 0;
+	tga.head.image_type = TGA_TYPE_RGB;
+
+	tga.head.colormap_index = 0;
+	tga.head.colormap_length = 0;
+	tga.head.colormap_size = 0;
+
+	tga.head.x_origin = 0;
+	tga.head.y_origin = 0;
+	tga.head.width = img->width();
+	tga.head.height = img->height();
+	if(hasAlpha) {
+		tga.head.pixel_size = 32;
+		// The low four bits of the descriptor hold the number of alpha
+		// (attribute) bits per pixel.
+		tga.head.flags = TGA_ORIGIN_UPPER | 8;
+	}
+	else {
+		tga.head.pixel_size = 24;
+		tga.head.flags = TGA_ORIGIN_UPPER;
+	}
+
+	// @@ Serialize directly.
+	tga.allocate();
+
+	// Pixels are written in file order as blue, green, red (and alpha).
+	const uint n = img->width() * img->height();
+	if(hasAlpha) {
+		for(uint i = 0; i < n; i++) {
+			Color32 color = img->pixel(i);
+			tga.mem[4 * i + 0] = color.b;
+			tga.mem[4 * i + 1] = color.g;
+			tga.mem[4 * i + 2] = color.r;
+			tga.mem[4 * i + 3] = color.a;
+		}
+	}
+	else {
+		for(uint i = 0; i < n; i++) {
+			Color32 color = img->pixel(i);
+			tga.mem[3 * i + 0] = color.b;
+			tga.mem[3 * i + 1] = color.g;
+			tga.mem[3 * i + 2] = color.r;
+		}
+	}
+
+	s << tga;
+	
+	tga.free();
+	
+	return true;
+}
+
+
+#if defined(HAVE_PNG)
+
+/// libpng read callback: pulls @a length bytes from the Stream that was
+/// registered with png_set_read_fn into @a data. A stream error is reported
+/// to libpng through png_error, which does not return.
+static void user_read_data(png_structp png_ptr, png_bytep data, png_size_t length)
+{
+	nvDebugCheck(png_ptr != NULL);
+	
+	// Use the accessor instead of reading png_ptr->io_ptr: the members of
+	// png_struct are private to libpng and not accessible in newer releases.
+	Stream * s = (Stream *)png_get_io_ptr(png_ptr);
+	s->serialize(data, (int)length);
+	
+	if (s->isError()) {
+		png_error(png_ptr, "Read Error");
+	}
+}
+
+
+/// Load a PNG image from @a s.
+///
+/// The data is converted to 8 bits per channel with four channels: palettes
+/// and low bit depths are expanded, 16 bit channels are reduced, grey is
+/// replicated to RGB and a missing alpha channel is filled with 0xFF.
+/// Images that carry alpha are tagged Format_ARGB.
+/// @return A newly allocated image owned by the caller, or NULL on error.
+Image * nv::ImageIO::loadPNG(Stream & s)
+{
+	nvCheck(!s.isError());
+	
+	// Set up a read buffer and check the library version
+	png_structp png_ptr;
+	png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+	if (png_ptr == NULL) {
+	//	nvDebug( "*** LoadPNG: Error allocating read buffer in file '%s'.\n", name );
+		return NULL;
+	}
+
+	// Allocate/initialize a memory block for the image information
+	png_infop info_ptr = png_create_info_struct(png_ptr);
+	if (info_ptr == NULL) {
+		png_destroy_read_struct(&png_ptr, NULL, NULL);
+	//	nvDebug( "*** LoadPNG: Error allocating image information for '%s'.\n", name );
+		return NULL;
+	}
+
+	// Everything allocated after setjmp has to be released by the error path
+	// below. longjmp does not run destructors, so plain pointers are used
+	// instead of AutoPtr, and they are volatile so that the values assigned
+	// after setjmp are the ones seen when libpng jumps back.
+	Image * volatile img = NULL;
+	png_bytep * volatile row_data = NULL;
+
+	// Set up the error handling
+	if (setjmp(png_jmpbuf(png_ptr))) {
+		png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+		delete [] row_data;
+		delete img;
+	//	nvDebug( "*** LoadPNG: Error reading png file '%s'.\n", name );
+		return NULL;
+	}
+
+	// Set up the I/O functions.
+	png_set_read_fn(png_ptr, (void*)&s, user_read_data);
+
+
+	// Retrieve the image header information
+	png_uint_32 width, height;
+	int bit_depth, color_type, interlace_type;
+	png_read_info(png_ptr, info_ptr);
+	png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type, &interlace_type, NULL, NULL);
+
+
+	if (color_type == PNG_COLOR_TYPE_PALETTE && bit_depth <= 8) {
+		// Convert indexed images to RGB.
+		png_set_expand(png_ptr);
+	}
+	else if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) {
+		// Convert grayscale to RGB.
+		png_set_expand(png_ptr);
+	}
+	else if (png_get_valid(png_ptr, info_ptr, PNG_INFO_tRNS)) {
+		// Expand images with transparency to full alpha channels
+		// so the data will be available as RGBA quartets.
+		png_set_expand(png_ptr);
+	}
+	else if (bit_depth < 8) {
+		// If we have < 8 scale it up to 8.
+		//png_set_expand(png_ptr);
+		png_set_packing(png_ptr);
+	}
+
+	// Reduce bit depth.
+	if (bit_depth == 16) {
+		png_set_strip_16(png_ptr);
+	}
+
+	// Represent gray as RGB
+	if (color_type == PNG_COLOR_TYPE_GRAY || color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
+		png_set_gray_to_rgb(png_ptr);
+	}
+
+	// Convert to RGBA filling alpha with 0xFF.
+	if (!(color_type & PNG_COLOR_MASK_ALPHA)) {
+		png_set_filler(png_ptr, 0xFF, PNG_FILLER_AFTER);
+	}
+
+	// @todo Choose gamma according to the platform?
+	double screen_gamma = 2.2;
+	int intent;
+	if (png_get_sRGB(png_ptr, info_ptr, &intent)) {
+		png_set_gamma(png_ptr, screen_gamma, 0.45455);
+	}
+	else {
+		double image_gamma;
+		if (png_get_gAMA(png_ptr, info_ptr, &image_gamma)) {
+			png_set_gamma(png_ptr, screen_gamma, image_gamma);
+		}
+		else {
+			png_set_gamma(png_ptr, screen_gamma, 0.45455);
+		}
+	}
+
+	// Perform the selected transforms.
+	png_read_update_info(png_ptr, info_ptr);
+
+	png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type, &interlace_type, NULL, NULL);
+
+	img = new Image();
+	img->allocate(width, height);
+
+	// Set internal format flags.
+	if(color_type & PNG_COLOR_MASK_COLOR) {
+		//img->flags |= PI_IF_HAS_COLOR;
+	}
+	if(color_type & PNG_COLOR_MASK_ALPHA) {
+		//img->flags |= PI_IF_HAS_ALPHA;
+		img->setFormat(Image::Format_ARGB);
+	}
+
+	// Read the image: one row pointer per scanline, each pointing straight
+	// into the image storage (4 bytes per pixel).
+	uint8 * pixels = (uint8 *)img->pixels();
+	row_data = new png_bytep[height];
+	for (uint i = 0; i < height; i++) {
+		row_data[i] = &(pixels[width * 4 * i]);
+	}
+
+	png_read_image(png_ptr, row_data);
+	delete [] row_data;
+	row_data = NULL;	// png_read_end may still jump to the error path.
+
+	// Finish things up
+	png_read_end(png_ptr, info_ptr);
+	png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+
+	// RGBA to BGRA.
+	uint num = width * height;
+	for(uint i = 0; i < num; i++)
+	{
+		Color32 c = img->pixel(i);
+		img->pixel(i) = Color32(c.b, c.g, c.r, c.a);
+	}
+	
+	// Compute alpha channel if needed.
+	/*if( img->flags & PI_IU_BUMPMAP || img->flags & PI_IU_ALPHAMAP ) {
+		if( img->flags & PI_IF_HAS_COLOR && !(img->flags & PI_IF_HAS_ALPHA)) {
+			img->ComputeAlphaFromColor();
+		}
+	}*/
+
+	return img;
+}
+
+
+/// Placeholder for loading a PNG into a FloatImage.
+/// Not implemented: the stream is ignored and NULL is always returned.
+FloatImage * nv::ImageIO::loadFloatPNG(Stream & s)
+{
+	return NULL;
+}
+
+
+#endif // defined(HAVE_PNG)
+
+#if defined(HAVE_JPEG)
+
+/// libjpeg source manager callback. Nothing to set up: loadJPG fills in the
+/// buffer pointer and size of the source manager itself.
+static void init_source (j_decompress_ptr /*cinfo*/){
+}
+
+/// libjpeg source manager callback, invoked when the decoder needs more input.
+/// No further data can be supplied here, so a warning is printed and the
+/// decoder is handed a synthetic end-of-image marker instead.
+static boolean fill_input_buffer (j_decompress_ptr cinfo){
+	// Marker bytes served in place of the missing data.
+	static JOCTET s_endOfImage[] = { 0xFF, JPEG_EOI };
+
+	// Generate warning
+	nvDebug("jpeglib: Premature end of file\n");
+
+	// Insert a fake EOI marker
+	cinfo->src->next_input_byte = s_endOfImage;
+	cinfo->src->bytes_in_buffer = 2;
+
+	return TRUE;
+}
+
+/// libjpeg source manager callback: skip @a num_bytes bytes of input.
+/// Skipping up to or past the end of the buffer behaves like running out of
+/// data (see fill_input_buffer).
+static void skip_input_data (j_decompress_ptr cinfo, long num_bytes) {
+	// libjpeg requires a zero or negative count to be a no-op. Without this
+	// guard a negative count would grow bytes_in_buffer and move the read
+	// pointer backwards.
+	if(num_bytes <= 0) {
+		return;
+	}
+
+	struct jpeg_source_mgr * src = cinfo->src;
+
+	if(num_bytes >= (long)src->bytes_in_buffer) {
+		fill_input_buffer(cinfo);
+		return;
+	}
+
+	src->bytes_in_buffer -= num_bytes;
+	src->next_input_byte += num_bytes;
+}
+
+/// libjpeg source manager callback. Intentionally empty.
+static void term_source (j_decompress_ptr /*cinfo*/){
+	// no work necessary here
+}
+
+
+/// Load a JPEG image from @a s.
+///
+/// The whole stream is read into memory and decoded from there. Three
+/// component output is stored as Format_RGB; single component output is
+/// replicated into all four channels and tagged Format_ARGB.
+/// @return A newly allocated image owned by the caller, or NULL when the
+/// decoder produces a component count other than 1 or 3.
+/// NOTE(review): errors go to the handler installed by jpeg_std_error; confirm
+/// that its behavior on fatal errors is acceptable for callers.
+Image * nv::ImageIO::loadJPG(Stream & s)
+{
+	nvCheck(!s.isError());
+	
+	// Read the entire file.
+	Array<uint8> byte_array;
+	byte_array.resize(s.size());
+	s.serialize(byte_array.unsecureBuffer(), s.size());
+	
+	jpeg_decompress_struct cinfo;
+	jpeg_error_mgr jerr;
+
+	cinfo.err = jpeg_std_error(&jerr);
+	jpeg_create_decompress(&cinfo);
+
+	// Install a source manager that serves the in-memory copy of the file.
+	cinfo.src = (struct jpeg_source_mgr *) (*cinfo.mem->alloc_small)
+			((j_common_ptr) &cinfo, JPOOL_PERMANENT, sizeof(struct jpeg_source_mgr));
+	cinfo.src->init_source = init_source;
+	cinfo.src->fill_input_buffer = fill_input_buffer;
+	cinfo.src->skip_input_data = skip_input_data;
+	cinfo.src->resync_to_restart = jpeg_resync_to_restart;	// use default method
+	cinfo.src->term_source = term_source;
+	cinfo.src->bytes_in_buffer = byte_array.size();
+	cinfo.src->next_input_byte = byte_array.buffer();
+
+	jpeg_read_header(&cinfo, TRUE);
+	jpeg_start_decompress(&cinfo);
+
+	/*
+	cinfo.do_fancy_upsampling = FALSE;	// fast decompression
+	cinfo.dct_method = JDCT_FLOAT;			// Choose floating point DCT method.
+	*/
+
+	// output_components is the number of samples per pixel in the rows that
+	// jpeg_read_scanlines returns; num_components describes the stored file
+	// and need not match it.
+	const uint components = cinfo.output_components;
+	if( components != 1 && components != 3 ) {
+		nvDebug("*** ImageIO::loadJPG: Error, unsupported number of color components.\n");
+		jpeg_destroy_decompress (&cinfo);
+		return NULL;
+	}
+
+	uint8 * tmp_buffer = new uint8 [cinfo.output_width * cinfo.output_height * components];
+	uint8 * scanline = tmp_buffer;
+
+	while( cinfo.output_scanline < cinfo.output_height ){
+		int num_scanlines = jpeg_read_scanlines (&cinfo, &scanline, 1);
+		scanline += num_scanlines * cinfo.output_width * components;
+	}
+
+	jpeg_finish_decompress(&cinfo);
+
+	AutoPtr<Image> img(new Image());
+	img->allocate(cinfo.output_width, cinfo.output_height);
+
+	Color32 * dst = img->pixels();
+	const int size = img->height() * img->width();
+	const uint8 * src = tmp_buffer;
+
+	if( components == 3 ) {
+		img->setFormat(Image::Format_RGB);
+		for( int i = 0; i < size; i++ ) {
+			*dst++ = Color32(src[0], src[1], src[2]);
+			src += 3;
+		}
+	}
+	else {
+		// Single component: replicate it into color and alpha.
+		img->setFormat(Image::Format_ARGB);
+		for( int i = 0; i < size; i++ ) {
+			*dst++ = Color32(*src, *src, *src, *src);
+			src++;
+		}
+	}
+
+	delete [] tmp_buffer;
+	jpeg_destroy_decompress (&cinfo);
+
+	return img.release();
+}
+
+#endif // defined(HAVE_JPEG)
+
+#if defined(HAVE_TIFF)
+
+/// Placeholder for loading a TIFF from a stream.
+/// Not implemented: after checking the stream state it always returns NULL.
+FloatImage * nv::ImageIO::loadFloatTIFF(Stream & s)
+{
+	nvCheck(!s.isError());
+	return NULL;
+}
+
+/// Load a single channel TIFF file into a FloatImage.
+///
+/// Only images with one sample per pixel and 8, 16 or 32 bits per sample are
+/// accepted. Samples are treated as unsigned integers and scaled to [0, 1].
+/// @param fileName  Path of the file to read.
+/// @return A newly allocated image owned by the caller, or NULL on failure.
+FloatImage * nv::ImageIO::loadFloatTIFF(const char * fileName)
+{
+	TIFF * tif = TIFFOpen(fileName, "r");
+	if (!tif)
+	{
+		nvDebug("Can't open '%s' for reading\n", fileName);
+		return NULL;
+	}
+	
+	// TIFFGetField leaves its output untouched when a tag is absent, so the
+	// values are initialized and the results checked. Bits per sample and
+	// samples per pixel have defaults; the 'Defaulted' query returns them.
+	::uint16 spp = 1, bpp = 1;
+	::uint32 width = 0, height = 0;
+	const bool hasHeight = TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height) != 0;
+	const bool hasWidth = TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width) != 0;
+	TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &bpp);
+	TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &spp);
+	
+	if (!hasWidth || !hasHeight) {
+		nvDebug("Can't load '%s', image dimensions are missing\n", fileName);
+		TIFFClose(tif);
+		return NULL;
+	}
+	
+	if (spp != 1 || (bpp != 8 && bpp != 16 && bpp != 32)) {
+		nvDebug("Can't load '%s', only 1 sample per pixel with 8, 16 or 32 bits per sample supported\n", fileName);
+		TIFFClose(tif);
+		return NULL;
+	}
+	
+	FloatImage * fimage = new FloatImage();
+	fimage->allocate(spp, width, height);
+	
+	int linesize = TIFFScanlineSize(tif);
+	tdata_t buf = (::uint8 *)nv::mem::malloc(linesize);
+	
+	for (uint y = 0; y < height; y++) {
+		// A failed read would leave stale data in 'buf'; give up instead of
+		// returning an image with repeated or undefined rows.
+		if (TIFFReadScanline(tif, buf, y, 0) < 0) {
+			nvDebug("Can't load '%s', error reading scanline %u\n", fileName, y);
+			nv::mem::free(buf);
+			TIFFClose(tif);
+			delete fimage;
+			return NULL;
+		}
+		
+		float * dst = fimage->scanline(y, 0);
+		
+		if (bpp == 8) {
+			for(uint x = 0; x < width; x++) {
+				dst[x] = float(((::uint8 *)buf)[x]) / float(0xFF);
+			}
+		}
+		else if (bpp == 16) {
+			for(uint x = 0; x < width; x++) {
+				dst[x] = float(((::uint16 *)buf)[x]) / float(0xFFFF);
+			}
+		}
+		else /*if (bpp == 32)*/ {
+			// Mantissa has only 24 bits, so drop 8 bits.
+			for(uint x = 0; x < width; x++) {
+				dst[x] = float(((::uint32 *)buf)[x] >> 8) / float(0xFFFFFF);
+			}
+		}
+	}
+
+	nv::mem::free(buf);
+	
+	TIFFClose(tif);
+	
+	return fimage;
+}
+
+#endif
+
+#if 0
+
+/** Save PNG*/
+static bool SavePNG(const PiImage * img, const char * name) {
+	nvCheck( img != NULL );
+	nvCheck( img->mem != NULL );
+
+	if( piStrCmp(piExtension(name), ".png" ) != 0 ) {
+		return false;
+	}
+	
+	if( img->flags & PI_IT_CUBEMAP ) {
+		nvDebug("*** Cannot save cubemaps as PNG.");
+		return false;
+	}
+	if( img->flags & PI_IT_DDS ) {
+		nvDebug("*** Cannot save DDS surface as PNG.");
+		return false;
+	}
+
+	nvDebug( "--- Saving '%s'.\n", name );
+	
+	PiAutoPtr<PiStream> ar( PiFileSystem::CreateFileWriter( name ) );
+	if( ar == NULL ) {
+		nvDebug( "*** SavePNG: Error, cannot save file '%s'.\n", name );
+		return false;
+	}
+
+/*
+public class PNGEnc {
+
+    public static function encode(img:BitmapData):ByteArray {
+        // Create output byte array
+        var png:ByteArray = new ByteArray();
+        // Write PNG signature
+        png.writeUnsignedInt(0x89504e47);
+        png.writeUnsignedInt(0x0D0A1A0A);
+        // Build IHDR chunk
+        var IHDR:ByteArray = new ByteArray();
+        IHDR.writeInt(img.width);
+        IHDR.writeInt(img.height);
+        IHDR.writeUnsignedInt(0x08060000); // 32bit RGBA
+        IHDR.writeByte(0);
+        writeChunk(png,0x49484452,IHDR);
+        // Build IDAT chunk
+        var IDAT:ByteArray= new ByteArray();
+        for(var i:int=0;i < img.height;i++) {
+            // no filter
+            IDAT.writeByte(0);
+            var p:uint;
+            if ( !img.transparent ) {
+                for(var j:int=0;j < img.width;j++) {
+                    p = img.getPixel(j,i);
+                    IDAT.writeUnsignedInt(
+                        uint(((p&0xFFFFFF) << 8)|0xFF));
+                }
+            } else {
+                for(var j:int=0;j < img.width;j++) {
+                    p = img.getPixel32(j,i);
+                    IDAT.writeUnsignedInt(
+                        uint(((p&0xFFFFFF) << 8)|
+                        (shr(p,24))));
+                }
+            }
+        }
+        IDAT.compress();
+        writeChunk(png,0x49444154,IDAT);
+        // Build IEND chunk
+        writeChunk(png,0x49454E44,null);
+        // return PNG
+        return png;
+    }
+
+    private static var crcTable:Array;
+    private static var crcTableComputed:Boolean = false;
+
+    private static function writeChunk(png:ByteArray, 
+            type:uint, data:ByteArray) {
+        if (!crcTableComputed) {
+            crcTableComputed = true;
+            crcTable = [];
+            for (var n:uint = 0; n < 256; n++) {
+                var c:uint = n;
+                for (var k:uint = 0; k < 8; k++) {
+                    if (c & 1) {
+                        c = uint(uint(0xedb88320) ^ 
+                            uint(c >>> 1));
+                    } else {
+                        c = uint(c >>> 1);
+                    }
+                }
+                crcTable[n] = c;
+            }
+        }
+        var len:uint = 0;
+        if (data != null) {
+            len = data.length;
+        }
+        png.writeUnsignedInt(len);
+        var p:uint = png.position;
+        png.writeUnsignedInt(type);
+        if ( data != null ) {
+            png.writeBytes(data);
+        }
+        var e:uint = png.position;
+        png.position = p;
+        var c:uint = 0xffffffff;
+        for (var i:int = 0; i < (e-p); i++) {
+            c = uint(crcTable[
+                (c ^ png.readUnsignedByte()) & 
+                uint(0xff)] ^ uint(c >>> 8));
+        }
+        c = uint(c^uint(0xffffffff));
+        png.position = e;
+        png.writeUnsignedInt(c);
+    }
+}
+*/
+}
+
+#endif // 0
+
+#if 0
+
+
+namespace ImageIO {
+
+	/** Init ImageIO plugins. */
+	void InitPlugins() {
+	//	AddInputPlugin( "", LoadANY );
+		AddInputPlugin( "tga", LoadTGA );
+#if HAVE_PNG
+		AddInputPlugin( "png", LoadPNG );
+#endif
+#if HAVE_JPEG
+		AddInputPlugin( "jpg", LoadJPG );
+#endif
+		AddInputPlugin( "dds", LoadDDS );
+		
+		AddOutputPlugin( "tga", SaveTGA );
+	}
+	
+	/** Reset ImageIO plugins. */
+	void ResetPlugins() {
+		s_plugin_load_map.Clear();
+		s_plugin_save_map.Clear();
+	}
+	
+	/** Add an input plugin. */
+	void AddInputPlugin( const char * ext, ImageInput_Plugin plugin ) {
+		s_plugin_load_map.Add(ext, plugin);
+	}
+	
+	/** Add an output plugin. */
+	void AddOutputPlugin( const char * ext, ImageOutput_Plugin plugin ) {
+		s_plugin_save_map.Add(ext, plugin);
+	}
+
+	
+	bool Load(PiImage * img, const char * name, PiStream & stream) {
+			
+		// Get name extension.
+		const char * extension = piExtension(name);
+		
+		// Skip the dot.
+		if( *extension == '.' ) {
+			extension++;
+		}
+		
+		// Lookup plugin in the map.
+		ImageInput_Plugin plugin = NULL;
+		if( s_plugin_load_map.Get(extension, &plugin) ) {
+			return plugin(img, stream);
+		}
+		
+		/*foreach(i, s_plugin_load_map) {
+			nvDebug("%s %s %d\n", s_plugin_load_map[i].key.GetStr(), extension, 0 == strcmp(extension, s_plugin_load_map[i].key));
+		}
+		
+		nvDebug("No plugin found for '%s' %d.\n", extension, s_plugin_load_map.Size());*/
+		
+		return false;
+	}
+
+	bool Save(const PiImage * img, const char * name, PiStream & stream) {
+				
+		// Get name extension.
+		const char * extension = piExtension(name);
+		
+		// Skip the dot.
+		if( *extension == '.' ) {
+			extension++;
+		}
+		
+		// Lookup plugin in the map.
+		ImageOutput_Plugin plugin = NULL;
+		if( s_plugin_save_map.Get(extension, &plugin) ) {
+			return plugin(img, stream);
+		}
+		
+		return false;
+	}
+	
+} // ImageIO
+
+#endif // 0
+
--- a/src/nvimage/ImageIO.h
+++ b/src/nvimage/ImageIO.h
@ -0,0 +1,43 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_IMAGEIO_H
+#define NV_IMAGE_IMAGEIO_H
+
+#include <nvimage/nvimage.h>
+
+namespace nv
+{
+	class Image;
+	class FloatImage;
+	class Stream;
+
+	/// Image loading and saving functions.
+	/// Loaders return a newly allocated image that the caller owns, or a null
+	/// pointer when the data could not be loaded.
+	namespace ImageIO
+	{
+		/// Open the file @a name and decode it according to its extension.
+		NVIMAGE_API Image * load(const char * name);
+		/// Decode @a s with the decoder selected by the extension of @a name
+		/// (.tga always; .jpg/.jpeg and .png when built with the matching
+		/// HAVE_* define). Returns NULL for other extensions.
+		NVIMAGE_API Image * load(const char * name, Stream & s);
+		
+		/// Decode a TGA image from @a s.
+		NVIMAGE_API Image * loadTGA(Stream & s);
+		/// Write @a img to @a s as an uncompressed 24 or 32 bit TGA.
+		NVIMAGE_API bool saveTGA(Stream & s, const Image * img);
+
+#if defined(HAVE_PNG)
+		/// Decode a PNG image from @a s.
+		NVIMAGE_API Image * loadPNG(Stream & s);
+		/// Not implemented yet; always returns NULL.
+		NVIMAGE_API FloatImage * loadFloatPNG(Stream & s);
+#endif
+
+#if defined(HAVE_JPEG)
+		/// Decode a JPEG image from @a s.
+		NVIMAGE_API Image * loadJPG(Stream & s);
+#endif
+		
+#if defined(HAVE_TIFF)
+		// Hack!
+		/// Load a single channel TIFF directly from the file @a fileName.
+		NVIMAGE_API FloatImage * loadFloatTIFF(const char * fileName);
+
+		/// Not implemented yet; always returns NULL.
+		NVIMAGE_API FloatImage * loadFloatTIFF(Stream & s);
+#endif
+
+	} // ImageIO namespace
+	
+} // nv namespace
+
+
+#endif // NV_IMAGE_IMAGEIO_H
--- a/src/nvimage/NormalMap.cpp
+++ b/src/nvimage/NormalMap.cpp
@ -0,0 +1,138 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Ptr.h>
+
+#include <nvmath/Color.h>
+
+#include <nvimage/NormalMap.h>
+#include <nvimage/Filter.h>
+#include <nvimage/FloatImage.h>
+#include <nvimage/Image.h>
+
+using namespace nv;
+
+// Build a normal map from 'img' with the two derivative kernels 'kdu' and 'kdv'.
+// The result has four channels: the height (dot product of the pixel color and
+// 'heightWeights') in channel 3 and the normal, remapped with 0.5 * n + 0.5,
+// in channels 0 to 2. The caller owns the returned image.
+static FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, const Kernel2 * kdu, const Kernel2 * kdv)
+{
+	nvCheck(kdu != NULL);
+	nvCheck(kdv != NULL);
+	nvCheck(img != NULL);
+	
+	const uint width = img->width();
+	const uint height = img->height();
+	const uint pixelCount = width * height;
+	
+	AutoPtr<FloatImage> result(new FloatImage());
+	result->allocate(4, width, height);
+	
+	// First pass: store the height of every pixel in the alpha channel.
+	float * heights = result->channel(3);
+	for(uint i = 0; i < pixelCount; i++)
+	{
+		Vector4 rgba = toVector4(img->pixel(i));
+		heights[i] = dot(rgba, heightWeights);
+	}
+	
+	float heightScale = 1.0f / 16.0f;	// @@ Use a user defined factor.
+	
+	// Second pass: filter the height field and pack the resulting normal.
+	for(uint y = 0; y < height; y++)
+	{
+		for(uint x = 0; x < width; x++)
+		{
+			const float slopeU = result->applyKernel(kdu, x, y, 3, wm);
+			const float slopeV = result->applyKernel(kdv, x, y, 3, wm);
+			
+			Vector3 normal = normalize(Vector3(slopeU, slopeV, heightScale));
+			
+			result->setPixel(0.5f * normal.x() + 0.5f, x, y, 0);
+			result->setPixel(0.5f * normal.y() + 0.5f, x, y, 1);
+			result->setPixel(0.5f * normal.z() + 0.5f, x, y, 2);
+		}
+	}
+	
+	return result.release();
+}
+
+
+/// Create normal map using the given filter.
+///
+/// Builds a normalized Sobel kernel of the size selected by @a filter, uses
+/// it for the u derivative and its transpose for the v derivative.
+/// @return A newly allocated four channel image owned by the caller, or NULL
+/// when @a filter is not one of the NormalMapFilter values.
+FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter /*= Sobel3x3*/)
+{
+	nvCheck(img != NULL);
+	
+	// Init the kernels.
+	Kernel2 * kdu = NULL;
+	Kernel2 * kdv = NULL;
+
+	switch(filter)
+	{
+		case NormalMapFilter_Sobel3x3:
+			kdu = new Kernel2(3);
+			break;
+		case NormalMapFilter_Sobel5x5:
+			kdu = new Kernel2(5);
+			break;
+		case NormalMapFilter_Sobel7x7:
+			kdu = new Kernel2(7);
+			break;
+		case NormalMapFilter_Sobel9x9:
+			kdu = new Kernel2(9);
+			break;
+		default:
+			// Unknown filter: no kernel was created, so there is nothing to
+			// filter with. Fail instead of dereferencing a null kernel.
+			nvDebugCheck(false);
+			return NULL;
+	}
+
+	kdu->initSobel();
+	kdu->normalize();
+
+	kdv = new Kernel2(*kdu);
+	kdv->transpose();
+
+	FloatImage * normalMap = ::createNormalMap(img, wm, heightWeights, kdu, kdv);
+
+	// The kernels are only needed while the map is computed.
+	delete kdv;
+	delete kdu;
+
+	return normalMap;
+}
+
+
+/// Create normal map combining multiple sobel filters.
+///
+/// Builds a normalized 9x9 kernel blended from Sobel filters according to
+/// @a filterWeights, uses it for the u derivative and its transpose for the
+/// v derivative.
+/// @return A newly allocated four channel image owned by the caller.
+FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights)
+{
+	nvCheck(img != NULL);
+
+	Kernel2 * kdu = NULL;
+	Kernel2 * kdv = NULL;
+
+	kdu = new Kernel2(9);
+	kdu->initBlendedSobel(filterWeights);
+	kdu->normalize();
+	
+	kdv = new Kernel2(*kdu);
+	kdv->transpose();
+	
+	FloatImage * normalMap = ::createNormalMap(img, wm, heightWeights, kdu, kdv);
+
+	// The kernels are only needed while the map is computed.
+	delete kdv;
+	delete kdu;
+
+	return normalMap;
+}
+
+/// Normalize the given image in place.
+/// Forwards to FloatImage::normalize with argument 0.
+/// NOTE(review): presumably 0 is the first channel of the vector to
+/// normalize -- confirm against FloatImage.
+void nv::normalize(FloatImage * img)
+{
+	nvCheck(img != NULL);
+	img->normalize(0);
+}
+
--- a/src/nvimage/NormalMap.h
+++ b/src/nvimage/NormalMap.h
@ -0,0 +1,55 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_IMAGE_NORMALMAP_H
+#define NV_IMAGE_NORMALMAP_H
+
+#include <nvmath/Vector.h>
+#include <nvimage/nvimage.h>
+#include <nvimage/FloatImage.h>
+
+
+namespace nv
+{
+	class Image;
+
+	/// Size of the Sobel kernel used to estimate the height derivatives.
+	enum NormalMapFilter
+	{
+		NormalMapFilter_Sobel3x3,	// fine detail
+		NormalMapFilter_Sobel5x5,	// medium detail
+		NormalMapFilter_Sobel7x7,	// large detail
+		NormalMapFilter_Sobel9x9,	// very large
+	};
+
+	/// Create a normal map from @a img with a single Sobel filter.
+	/// The height of a pixel is the dot product of its color and @a heightWeights.
+	/// Returns a newly allocated image: normal in channels 0-2, height in channel 3.
+	FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter = NormalMapFilter_Sobel3x3);
+
+	/// Create a normal map from @a img with a 9x9 kernel blended from several
+	/// Sobel filters according to @a filterWeights.
+	FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights);
+
+	/// Normalize the given image in place.
+	void normalize(FloatImage * img);
+
+	// @@ Add generation of DU/DV maps.
+
+
+} // nv namespace
+
+#endif // NV_IMAGE_NORMALMAP_H
--- a/src/nvimage/Quantize.cpp
+++ b/src/nvimage/Quantize.cpp
@ -0,0 +1,234 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+/*
+http://www.visgraf.impa.br/Courses/ip00/proj/Dithering1/floyd_steinberg_dithering.html
+http://www.gamedev.net/reference/articles/article341.asp
+
+@@ Look at LPS: http://www.cs.rit.edu/~pga/pics2000/i.html
+ 
+This is a really nice guide to dithering algorithms:
+http://www.efg2.com/Lab/Library/ImageProcessing/DHALF.TXT
+
+@@ This code needs to be reviewed, I'm not sure it's correct.
+*/
+
+#include <nvmath/Color.h>
+
+#include <nvimage/Image.h>
+#include <nvimage/Quantize.h>
+
+using namespace nv;
+
+
+// Threshold the alpha channel in place: alpha above 'alpha_threshold'
+// becomes 255, everything else becomes 0. Color is left untouched.
+void nv::Quantize::BinaryAlpha( Image * image, int alpha_threshold /*= 127*/ )
+{
+	nvCheck(image != NULL);
+	
+	const uint width = image->width();
+	const uint height = image->height();
+	
+	for(uint row = 0; row < height; row++) {
+		for(uint col = 0; col < width; col++) {
+			
+			// Work directly on the stored pixel.
+			Color32 & texel = image->pixel(col, row);
+			texel.a = (texel.a > alpha_threshold) ? 255 : 0;
+		}
+	}
+}
+
+
+// Quantize every pixel in place by converting it to a 16 bit color and
+// expanding it back to 32 bits.
+void nv::Quantize::RGB16( Image * image )
+{
+	nvCheck(image != NULL);
+	
+	const uint width = image->width();
+	const uint height = image->height();
+	
+	for(uint row = 0; row < height; row++) {
+		for(uint col = 0; col < width; col++) {
+			
+			const Color32 original = image->pixel(col, row);
+			
+			// Round trip through 16 bits using regular bit expansion.
+			image->pixel(col, row) = toColor32( toColor16(original) );
+		}
+	}
+}
+
+// Quantize the alpha channel to 4 bits in place. The upper four bits are
+// kept and copied into the lower four (regular bit expansion).
+void nv::Quantize::Alpha4( Image * image )
+{
+	nvCheck(image != NULL);
+	
+	const uint width = image->width();
+	const uint height = image->height();
+	
+	for(uint row = 0; row < height; row++) {
+		for(uint col = 0; col < width; col++) {
+			
+			Color32 & texel = image->pixel(col, row);
+			
+			const int highNibble = texel.a & 0xF0;
+			texel.a = highNibble | (highNibble >> 4);
+		}
+	}
+}
+
+
+// Error diffusion. Floyd Steinberg.
+// Quantizes the color channels to 16 bit precision in place and distributes
+// the quantization error of each pixel to its right neighbor and to the three
+// neighbors in the row below. Alpha is whatever the 16 bit round trip yields.
+void nv::Quantize::FloydSteinberg_RGB16( Image * image )
+{
+	nvCheck(image != NULL);
+	
+	const uint w = image->width();
+	const uint h = image->height();
+	
+	// Error accumulators for the current row (row0) and the next row (row1).
+	// One extra element on each side lets the x-1 and x+1 writes at the image
+	// borders proceed without bounds checks; pixel x uses index 1+x.
+	// @@ Use fixed point?
+	Vector3 * row0 = new Vector3[w+2];
+	Vector3 * row1 = new Vector3[w+2];
+	memset(row0, 0, sizeof(Vector3)*(w+2));
+	memset(row1, 0, sizeof(Vector3)*(w+2));
+	
+	for(uint y = 0; y < h; y++) {
+		for(uint x = 0; x < w; x++) {
+			
+			Color32 pixel32 = image->pixel(x, y);
+			
+			// The accumulated error is truncated to an integer before it is added.
+			// Add error.	// @@ We shouldn't clamp here!
+			pixel32.r = clamp(int(pixel32.r) + int(row0[1+x].x()), 0, 255);
+			pixel32.g = clamp(int(pixel32.g) + int(row0[1+x].y()), 0, 255);
+			pixel32.b = clamp(int(pixel32.b) + int(row0[1+x].z()), 0, 255);
+			
+			// Convert to 16 bit. @@ Use regular clamp?
+			Color32 pixel16 = toColor32( toColor16(pixel32) );
+			
+			// Store color.
+			image->pixel(x, y) = pixel16;
+			
+			// Compute new error.
+			Vector3 diff(float(pixel32.r - pixel16.r), float(pixel32.g - pixel16.g), float(pixel32.b - pixel16.b));
+			
+			// Propagate new error: 7/16 right, 3/16 below left, 5/16 below, 1/16 below right.
+			row0[1+x+1] += 7.0f / 16.0f * diff;
+			row1[1+x-1] += 3.0f / 16.0f * diff;
+			row1[1+x+0] += 5.0f / 16.0f * diff;
+			row1[1+x+1] += 1.0f / 16.0f * diff;
+		}
+		
+		// The next row becomes the current one; start a fresh accumulator.
+		swap(row0, row1);
+		memset(row1, 0, sizeof(Vector3)*(w+2));
+	}
+	
+	delete [] row0;
+	delete [] row1;
+}
+
+
+// Error diffusion. Floyd Steinberg.
+void nv::Quantize::FloydSteinberg_BinaryAlpha( Image * image, int alpha_threshold /*= 127*/ ) 
+{
+	nvCheck(image != NULL);
+	
+	const uint w = image->width();
+	const uint h = image->height();
+	
+	// @@ Use fixed point?
+	float * row0 = new float[(w+2)];
+	float * row1 = new float[(w+2)];
+	memset(row0, 0, sizeof(float)*(w+2));
+	memset(row1, 0, sizeof(float)*(w+2));
+	
+	for(uint y = 0; y < h; y++) {
+		for(uint x = 0; x < w; x++) {
+			
+			Color32 pixel = image->pixel(x, y);
+			
+			// Add error.
+			int alpha = int(pixel.a) + int(row0[1+x]);
+			
+			// Convert color.
+			if( alpha > alpha_threshold ) pixel.a = 255;
+			else pixel.a = 0;
+			
+			// Store color.
+			image->pixel(x, y) = pixel;
+			
+			// Compute new error.
+			float diff = float(alpha - pixel.a);
+			
+			// Propagate new error.
+			row0[1+x+1] += 7.0f / 16.0f * diff;
+			row1[1+x-1] += 3.0f / 16.0f * diff;
+			row1[1+x+0] += 5.0f / 16.0f * diff;
+			row1[1+x+1] += 1.0f / 16.0f * diff;
+		}
+		
+		swap(row0, row1);
+		memset(row1, 0, sizeof(float)*(w+2));
+	}
+	
+	delete [] row0;
+	delete [] row1;
+}
+
+
+// Error diffusion. Floyd Steinberg.
+// Quantizes the alpha channel to 4 bits in place. The quantization error of
+// each pixel is distributed to its right neighbor and to the three neighbors
+// in the row below, and is taken into account when those pixels are quantized.
+void nv::Quantize::FloydSteinberg_Alpha4( Image * image )
+{
+	nvCheck(image != NULL);
+	
+	const uint w = image->width();
+	const uint h = image->height();
+	
+	// Error accumulators for the current row (row0) and the next row (row1),
+	// padded by one element on each side; pixel x uses index 1+x.
+	// @@ Use fixed point?
+	float * row0 = new float[(w+2)];
+	float * row1 = new float[(w+2)];
+	memset(row0, 0, sizeof(float)*(w+2));
+	memset(row1, 0, sizeof(float)*(w+2));
+	
+	for(uint y = 0; y < h; y++) {
+		for(uint x = 0; x < w; x++) {
+			
+			Color32 pixel = image->pixel(x, y);
+			
+			// Add error.
+			int alpha = int(pixel.a) + int(row0[1+x]);
+			
+			// Quantize the error-adjusted value, not the original alpha:
+			// otherwise the diffused error never influences the output.
+			// The value is clamped first so that it fits in 8 bits.
+			const int adjusted = clamp(alpha, 0, 255);
+			
+			// Convert to 4 bit using regular bit expansion.
+			pixel.a = (adjusted & 0xF0) | ((adjusted & 0xF0) >> 4);
+			
+			// Store color.
+			image->pixel(x, y) = pixel;
+			
+			// Compute new error.
+			float diff = float(alpha - pixel.a);
+			
+			// Propagate new error: 7/16 right, 3/16 below left, 5/16 below, 1/16 below right.
+			row0[1+x+1] += 7.0f / 16.0f * diff;
+			row1[1+x-1] += 3.0f / 16.0f * diff;
+			row1[1+x+0] += 5.0f / 16.0f * diff;
+			row1[1+x+1] += 1.0f / 16.0f * diff;
+		}
+		
+		// The next row becomes the current one; start a fresh accumulator.
+		swap(row0, row1);
+		memset(row1, 0, sizeof(float)*(w+2));
+	}
+	
+	delete [] row0;
+	delete [] row1;
+}
+
--- a/src/nvimage/Quantize.h
+++ b/src/nvimage/Quantize.h
@ -0,0 +1,25 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_QUANTIZE_H
+#define NV_IMAGE_QUANTIZE_H
+
+namespace nv
+{
+	class Image;
+
+	namespace Quantize	// Image quantization functions, implemented in Quantize.cpp.
+	{
+		void RGB16(Image * img);
+		void BinaryAlpha(Image * img, int alpha_threshold = 127);
+		void Alpha4(Image * img);
+		
+		void FloydSteinberg_RGB16(Image * img);
+		void FloydSteinberg_BinaryAlpha(Image * img, int alpha_threshold = 127);	// Alpha becomes 255 above the threshold and 0 otherwise, with error diffusion; modifies img in place.
+		void FloydSteinberg_Alpha4(Image * img);	// Alpha is reduced to 4 bits (stored bit expanded to 8 bits); modifies img in place.
+
+		// @@ Add palette quantization algorithms!
+	}
+}
+
+
+#endif // NV_IMAGE_QUANTIZE_H
--- a/src/nvimage/TgaFile.h
+++ b/src/nvimage/TgaFile.h
@ -0,0 +1,103 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_TGAFILE_H
+#define NV_IMAGE_TGAFILE_H
+
+#include <nvcore/Stream.h>
+
+namespace nv
+{
+	
+// TGA image types. Presumably the values stored in TgaHeader::image_type -- confirm against the loader.
+enum TGAType {
+    TGA_TYPE_INDEXED		= 1,	// Color mapped.
+    TGA_TYPE_RGB			= 2,	// RGB color.
+    TGA_TYPE_GREY			= 3,	// Grey scale.
+    TGA_TYPE_RLE_INDEXED	= 9,	// Run length encoded variant of TGA_TYPE_INDEXED.
+    TGA_TYPE_RLE_RGB		= 10,	// Run length encoded variant of TGA_TYPE_RGB.
+    TGA_TYPE_RLE_GREY		= 11	// Run length encoded variant of TGA_TYPE_GREY.
+};
+
+#define TGA_INTERLEAVE_MASK	0xc0	// Bits 6-7; presumably of TgaHeader::flags -- confirm.
+#define TGA_INTERLEAVE_NONE	0x00
+#define TGA_INTERLEAVE_2WAY	0x40
+#define TGA_INTERLEAVE_4WAY	0x80
+
+#define TGA_ORIGIN_MASK		0x30	// Bits 4-5; presumably of TgaHeader::flags -- confirm.
+#define TGA_ORIGIN_LEFT		0x00	// Bit 4 clear.
+#define TGA_ORIGIN_RIGHT	0x10	// Bit 4 set.
+#define TGA_ORIGIN_LOWER	0x00	// Bit 5 clear.
+#define TGA_ORIGIN_UPPER	0x20	// Bit 5 set.
+
+
+/// Tga Header. The fields are serialized one by one, in declaration order, by operator<< (Stream &, TgaHeader &).
+struct TgaHeader {
+	uint8	id_length;
+	uint8	colormap_type;
+	uint8	image_type;	// Presumably one of the TGAType values -- confirm.
+	uint16	colormap_index;
+	uint16	colormap_length;
+	uint8	colormap_size;
+	uint16	x_origin;
+	uint16	y_origin;
+	uint16	width;	// Used by TgaFile::size().
+	uint16	height;	// Used by TgaFile::size().
+	uint8	pixel_size;	// In bits; TgaFile::size() divides it by 8.
+	uint8	flags;
+
+	enum { Size = 18 };		// Sum of the field sizes above (sizeof(TgaHeader) may differ due to padding). //const static int SIZE = 18;
+};
+
+
+/// Tga File.
+/// Holds a TGA header together with the raw image bytes that follow it.
+/// The buffer is created by allocate() and released by free() or by the
+/// destructor.
+/// NOTE(review): no copy constructor or assignment operator is declared, so
+/// copying an instance would leave two objects deleting the same buffer.
+struct TgaFile {
+
+	TgaFile() {
+		mem = NULL;
+	}
+	~TgaFile() {
+		free();
+	}
+
+	/// Number of bytes of image data described by the header.
+	/// The factors are converted to uint before multiplying: uint16 operands
+	/// promote to signed int and width * height can exceed INT_MAX, which
+	/// would be undefined behavior.
+	uint size() const {
+		return uint(head.width) * uint(head.height) * uint(head.pixel_size / 8);
+	}
+	/// Allocate size() bytes for the image data. Must not be allocated yet.
+	void allocate() {
+		nvCheck( mem == NULL );
+		mem = new uint8[size()];
+	}
+	/// Release the image data. Does nothing when no buffer is allocated.
+	void free() {
+		delete [] mem;
+		mem = NULL;
+	}
+
+	TgaHeader head;
+	uint8 * mem;
+};
+
+
+/// Serialize a TGA header field by field, in declaration order.
+inline Stream & operator<< (Stream & s, TgaHeader & head)
+{
+	// Identification and type bytes.
+	s << head.id_length;
+	s << head.colormap_type;
+	s << head.image_type;
+
+	// Color map description.
+	s << head.colormap_index;
+	s << head.colormap_length;
+	s << head.colormap_size;
+
+	// Image placement and extents.
+	s << head.x_origin;
+	s << head.y_origin;
+	s << head.width;
+	s << head.height;
+
+	// Pixel format.
+	s << head.pixel_size;
+	s << head.flags;
+
+	return s;
+}
+
+/// Serialize a TGA file: the header followed by size() bytes of image data.
+inline Stream & operator<< (Stream & s, TgaFile & tga)
+{
+	// The header goes first, it determines the size of the image data.
+	s << tga.head;
+
+	// When reading, the image buffer has to be created before it is filled.
+	const bool loading = s.isLoading();
+	if( loading ) {
+		tga.allocate();
+	}
+
+	const uint byteCount = tga.size();
+	s.serialize( tga.mem, byteCount );
+
+	return s;
+}
+
+} // nv namespace
+
+#endif // NV_IMAGE_TGAFILE_H
--- a/src/nvimage/nvimage.h
+++ b/src/nvimage/nvimage.h
@ -0,0 +1,22 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_IMAGE_H
+#define NV_IMAGE_H
+
+#include <nvcore/nvcore.h>
+
+// Function linkage: NVIMAGE_API and NVIMAGE_CLASS use the DLL export macros while building the shared library, the import macro for its users, and are empty otherwise.
+#if NVIMAGE_SHARED
+#ifdef NVIMAGE_EXPORTS
+#define NVIMAGE_API DLL_EXPORT
+#define NVIMAGE_CLASS DLL_EXPORT_CLASS
+#else // !NVIMAGE_EXPORTS
+#define NVIMAGE_API DLL_IMPORT
+#define NVIMAGE_CLASS DLL_IMPORT
+#endif // NVIMAGE_EXPORTS
+#else // !NVIMAGE_SHARED
+#define NVIMAGE_API
+#define NVIMAGE_CLASS
+#endif // NVIMAGE_SHARED
+
+#endif // NV_IMAGE_H
--- a/src/nvimage/nvtt/BlockDXT.cpp
+++ b/src/nvimage/nvtt/BlockDXT.cpp
@ -0,0 +1,553 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvimage/ColorBlock.h>
+#include "BlockDXT.h"
+
+using namespace nv;
+
+
+/*----------------------------------------------------------------------------
+	BlockDXT1
+----------------------------------------------------------------------------*/
+
+/// Compute the four palette entries of this block.
+/// Returns the number of colors of the block: 4 when col0 is greater than
+/// col1 (two endpoints plus two interpolated colors), 3 otherwise (two
+/// endpoints, their average, and a fully transparent black entry).
+uint BlockDXT1::evaluatePalette(Color32 color_array[4]) const
+{
+	// The mode specific evaluators do the bit expansion of the endpoints
+	// and fill all four entries; only the mode selection happens here.
+	if( col0.u > col1.u ) {
+		evaluatePalette4(color_array);
+		return 4;
+	}
+	
+	evaluatePalette3(color_array);
+	return 3;
+}
+
+// Evaluate palette assuming 3 color block.
+// Entries 0 and 1 are the bit expanded endpoints, entry 2 is their average
+// and entry 3 is transparent black.
+void BlockDXT1::evaluatePalette3(Color32 color_array[4]) const
+{
+	Color32 & first = color_array[0];
+	Color32 & second = color_array[1];
+	Color32 & middle = color_array[2];
+	Color32 & transparent = color_array[3];
+	
+	// Bit expansion happens before interpolation: the high bits of each
+	// component are replicated into the low bits of the 8 bit result.
+	first.r = (col0.r << 3) | (col0.r >> 2);
+	first.g = (col0.g << 2) | (col0.g >> 4);
+	first.b = (col0.b << 3) | (col0.b >> 2);
+	first.a = 0xFF;
+	
+	second.r = (col1.r << 3) | (col1.r >> 2);
+	second.g = (col1.g << 2) | (col1.g >> 4);
+	second.b = (col1.b << 3) | (col1.b >> 2);
+	second.a = 0xFF;
+	
+	// Three-color block: the third color is the average of the endpoints.
+	middle.r = (first.r + second.r) / 2;
+	middle.g = (first.g + second.g) / 2;
+	middle.b = (first.b + second.b) / 2;
+	middle.a = 0xFF;
+	
+	// All components are 0 to match the DXT specs.
+	transparent.r = 0x00;
+	transparent.g = 0x00;
+	transparent.b = 0x00;
+	transparent.a = 0x00;
+}
+
+// Evaluate palette assuming 4 color block.
+// Entries 0 and 1 are the bit expanded endpoints, entries 2 and 3 lie one
+// third and two thirds of the way from the first to the second endpoint.
+void BlockDXT1::evaluatePalette4(Color32 color_array[4]) const
+{
+	Color32 & first = color_array[0];
+	Color32 & second = color_array[1];
+	Color32 & nearFirst = color_array[2];
+	Color32 & nearSecond = color_array[3];
+	
+	// Bit expansion happens before interpolation: the high bits of each
+	// component are replicated into the low bits of the 8 bit result.
+	first.r = (col0.r << 3) | (col0.r >> 2);
+	first.g = (col0.g << 2) | (col0.g >> 4);
+	first.b = (col0.b << 3) | (col0.b >> 2);
+	first.a = 0xFF;
+	
+	second.r = (col1.r << 3) | (col1.r >> 2);
+	second.g = (col1.g << 2) | (col1.g >> 4);
+	second.b = (col1.b << 3) | (col1.b >> 2);
+	second.a = 0xFF;
+	
+	// Four-color block: derive the other two colors.
+	nearFirst.r = (2 * first.r + second.r) / 3;
+	nearFirst.g = (2 * first.g + second.g) / 3;
+	nearFirst.b = (2 * first.b + second.b) / 3;
+	nearFirst.a = 0xFF;
+	
+	nearSecond.r = (2 * second.r + first.r) / 3;
+	nearSecond.g = (2 * second.g + first.g) / 3;
+	nearSecond.b = (2 * second.b + first.b) / 3;
+	nearSecond.a = 0xFF;
+}
+
+
+/* Jason Dorie's code.
+// ----------------------------------------------------------------------------
+// Build palette for a 3 color + transparent black block
+// ----------------------------------------------------------------------------
+void DXTCGen::BuildCodes3(cbVector *pVects, cbVector &v1, cbVector &v2)
+{
+	//pVects[0] = v1;
+	//pVects[2] = v2;
+	//pVects[1][0] = v1[0];
+	//pVects[1][1] = (BYTE)( ((long)v1[1] + (long)v2[1]) / 2 );
+	//pVects[1][2] = (BYTE)( ((long)v1[2] + (long)v2[2]) / 2 );
+	//pVects[1][3] = (BYTE)( ((long)v1[3] + (long)v2[3]) / 2 );
+
+	__asm {
+		mov			ecx, dword ptr pVects
+		mov			eax, dword ptr v1
+		mov			ebx, dword ptr v2
+
+		movd		mm0, [eax]
+		movd		mm1, [ebx]
+		pxor		mm2, mm2
+		nop
+
+		movd		[ecx], mm0
+		movd		[ecx+8], mm1
+
+		punpcklbw	mm0, mm2
+		punpcklbw	mm1, mm2
+
+		paddw		mm0, mm1
+		psrlw		mm0, 1
+
+		packuswb	mm0, mm0
+		movd		[ecx+4], mm0
+	}
+	// *(long *)&pVects[1] = r1;
+}
+
+__int64 ScaleOneThird = 0x5500550055005500;
+
+// ----------------------------------------------------------------------------
+// Build palette for a 4 color block
+// ----------------------------------------------------------------------------
+void DXTCGen::BuildCodes4(cbVector *pVects, cbVector &v1, cbVector &v2)
+{
+// 	pVects[0] = v1;
+// 	pVects[3] = v2;
+// 
+// 	pVects[1][0] = v1[0];
+// 	pVects[1][1] = (BYTE)( ((long)v1[1] * 2 + (long)v2[1]) / 3 );
+// 	pVects[1][2] = (BYTE)( ((long)v1[2] * 2 + (long)v2[2]) / 3 );
+// 	pVects[1][3] = (BYTE)( ((long)v1[3] * 2 + (long)v2[3]) / 3 );
+// 
+// 	pVects[2][0] = v1[0];
+// 	pVects[2][1] = (BYTE)( ((long)v2[1] * 2 + (long)v1[1]) / 3 );
+// 	pVects[2][2] = (BYTE)( ((long)v2[2] * 2 + (long)v1[2]) / 3 );
+// 	pVects[2][3] = (BYTE)( ((long)v2[3] * 2 + (long)v1[3]) / 3 );
+
+	__asm {
+		mov			ecx, dword ptr pVects
+		mov			eax, dword ptr v1
+		mov			ebx, dword ptr v2
+
+		movd		mm0, [eax]
+		movd		mm1, [ebx]
+
+		pxor		mm2, mm2
+		movd		[ecx], mm0
+		movd		[ecx+12], mm1
+
+		punpcklbw	mm0, mm2
+		punpcklbw	mm1, mm2
+		movq		mm3, mm0		// mm3 = v0
+
+		paddw		mm0, mm1		// mm0 = v0 + v1
+		paddw		mm3, mm3		// mm3 = v0*2
+
+		paddw		mm0, mm1		// mm0 = v0 + v1*2
+		paddw		mm1, mm3		// mm1 = v0*2 + v1
+
+		pmulhw		mm0, ScaleOneThird
+		pmulhw		mm1, ScaleOneThird
+		packuswb	mm1, mm0
+
+		movq		[ecx+4], mm1
+	}
+}
+*/
+
+/// Decode this block into the 4x4 colors of the given color block.
+void BlockDXT1::decodeBlock(ColorBlock * block) const
+{
+	nvDebugCheck(block != NULL);
+	
+	// Evaluate the palette once for the whole block.
+	Color32 palette[4];
+	evaluatePalette(palette);
+	
+	// Every row byte holds four 2 bit palette indices, lowest bits first.
+	for( uint y = 0; y < 4; y++ ) {
+		uint rowBits = row[y];
+		for( uint x = 0; x < 4; x++ ) {
+			block->color(x, y) = palette[rowBits & 3];
+			rowBits >>= 2;
+		}
+	}
+}
+
+/// Pack sixteen palette indices into the block.
+/// idx must point to 16 values; only the two low bits of each one are used.
+/// Value i is stored in bits 2*i and 2*i+1 of indices.
+void BlockDXT1::setIndices(int * idx)
+{
+	nvDebugCheck(idx != NULL);
+	
+	indices = 0;
+	for(uint i = 0; i < 16; i++) {
+		// Shift as unsigned: for i == 15 a value of 2 or 3 shifted by 30
+		// does not fit in a signed int.
+		indices |= uint(idx[i] & 3) << (2 * i);
+	}
+}
+
+
+/// Flip DXT1 block vertically.
+/// Reverses the order of the four index rows; the endpoints are unchanged.
+/// Not declared inline: the header declares this member without a
+/// definition, so an inline definition in this file would not be visible to
+/// other translation units that call it.
+void BlockDXT1::flip4()
+{
+	swap(row[0], row[3]);
+	swap(row[1], row[2]);
+}
+
+/// Flip half DXT1 block vertically.
+/// Exchanges the first two index rows; the other rows are unchanged.
+/// Not declared inline: the header declares this member without a
+/// definition, so an inline definition in this file would not be visible to
+/// other translation units that call it.
+void BlockDXT1::flip2()
+{
+	swap(row[0], row[1]);
+}
+
+
+/*----------------------------------------------------------------------------
+	BlockDXT3
+----------------------------------------------------------------------------*/
+
+/// Decode this block: colors come from the DXT1 part, alpha from the
+/// explicit 4 bit values.
+void BlockDXT3::decodeBlock(ColorBlock * block) const
+{
+	nvDebugCheck(block != NULL);
+	
+	// Decode color.
+	color.decodeBlock(block);
+	
+	// Gather the sixteen explicit 4 bit alpha values in texel order.
+	const uint nibbles[16] = {
+		alpha.alpha0, alpha.alpha1, alpha.alpha2, alpha.alpha3,
+		alpha.alpha4, alpha.alpha5, alpha.alpha6, alpha.alpha7,
+		alpha.alpha8, alpha.alpha9, alpha.alphaA, alpha.alphaB,
+		alpha.alphaC, alpha.alphaD, alpha.alphaE, alpha.alphaF
+	};
+	
+	// Decode alpha: expand to 8 bits by repeating the value in both halves.
+	for( uint i = 0; i < 16; i++ ) {
+		block->color(i).a = (nibbles[i] << 4) | nibbles[i];
+	}
+}
+
+/// Flip DXT3 alpha block vertically.
+/// Reverses the order of the four alpha rows.
+void AlphaBlockDXT3::flip4()
+{
+	// Exchange the outer pair of rows, then the inner pair.
+	for( uint i = 0; i < 2; i++ ) {
+		swap(row[i], row[3 - i]);
+	}
+}
+
+/// Flip half DXT3 alpha block vertically.
+/// Exchanges the first two alpha rows; the other rows are unchanged.
+void AlphaBlockDXT3::flip2()
+{
+	const uint16 first = row[0];
+	row[0] = row[1];
+	row[1] = first;
+}
+
+/// Flip DXT3 block vertically.
+/// The color and alpha parts are independent and are flipped separately.
+void BlockDXT3::flip4()
+{
+	color.flip4();
+	alpha.flip4();
+}
+
+/// Flip half DXT3 block vertically.
+/// The color and alpha parts are independent and are flipped separately.
+void BlockDXT3::flip2()
+{
+	color.flip2();
+	alpha.flip2();
+}
+
+
+/*----------------------------------------------------------------------------
+	BlockDXT5
+----------------------------------------------------------------------------*/
+
+/// Compute the eight alpha values of this block.
+/// The 8-alpha palette is used when alpha0 is greater than alpha1, the
+/// 6-alpha palette otherwise.
+void AlphaBlockDXT5::evaluatePalette(uint8 alpha[8]) const
+{
+	if (alpha0 <= alpha1) {
+		evaluatePalette6(alpha);
+		return;
+	}
+	
+	evaluatePalette8(alpha);
+}
+
+/// Compute the palette of an 8-alpha block: the two endpoints followed by
+/// six values interpolated between them.
+void AlphaBlockDXT5::evaluatePalette8(uint8 alpha[8]) const
+{
+	// Bit code 000 = alpha0, 001 = alpha1.
+	alpha[0] = alpha0;
+	alpha[1] = alpha1;
+	
+	// Bit codes 010 to 111 move from alpha0 towards alpha1 in sevenths:
+	// code 010 is (6 * alpha0 + 1 * alpha1) / 7, code 111 is
+	// (1 * alpha0 + 6 * alpha1) / 7.
+	for (uint i = 1; i < 7; i++) {
+		alpha[i + 1] = ((7 - i) * alpha0 + i * alpha1) / 7;
+	}
+}
+
+/// Compute the palette of a 6-alpha block: the two endpoints, four values
+/// interpolated between them, and the constants 0x00 and 0xFF.
+void AlphaBlockDXT5::evaluatePalette6(uint8 alpha[8]) const
+{
+	// Bit code 000 = alpha0, 001 = alpha1.
+	alpha[0] = alpha0;
+	alpha[1] = alpha1;
+	
+	// Bit codes 010 to 101 move from alpha0 towards alpha1 in fifths:
+	// code 010 is (4 * alpha0 + 1 * alpha1) / 5, code 101 is
+	// (1 * alpha0 + 4 * alpha1) / 5.
+	for (uint i = 1; i < 5; i++) {
+		alpha[i + 1] = ((5 - i) * alpha0 + i * alpha1) / 5;
+	}
+	
+	// Bit codes 110 and 111 are fixed.
+	alpha[6] = 0x00;
+	alpha[7] = 0xFF;
+}
+
+/// Unpack the sixteen 3 bit palette indices of this block into index_array.
+void AlphaBlockDXT5::indices(uint8 index_array[16]) const
+{
+	// The indices are read through the bit fields. Reading them with
+	// shifts from misaligned words is an alternative, but misaligned reads
+	// might be very expensive on some hardware.
+	const uint8 unpacked[16] = {
+		uint8(bits0), uint8(bits1), uint8(bits2), uint8(bits3),
+		uint8(bits4), uint8(bits5), uint8(bits6), uint8(bits7),
+		uint8(bits8), uint8(bits9), uint8(bitsA), uint8(bitsB),
+		uint8(bitsC), uint8(bitsD), uint8(bitsE), uint8(bitsF)
+	};
+	
+	for (uint i = 0; i < 16; i++) {
+		index_array[i] = unpacked[i];
+	}
+}
+
+/// Return the 3 bit palette index of the given texel (0 to 15).
+uint AlphaBlockDXT5::index(uint index) const
+{
+	nvDebugCheck(index < 16);
+
+	// The indices follow the two 8 bit endpoints in the packed word and
+	// take 3 bits each, so texel i starts at bit 16 + 3 * i. This replaces
+	// a chain of comparisons returning bits0 to bitsF.
+	const int shift = 16 + 3 * index;
+	return (u >> shift) & 0x7;
+}
+
+/// Set the 3 bit palette index of the given texel (0 to 15) to value (0 to 7).
+void AlphaBlockDXT5::setIndex(uint index, uint value)
+{
+	nvDebugCheck(index < 16);
+	nvDebugCheck(value < 8);
+
+	// The indices follow the two 8 bit endpoints in the packed word and
+	// take 3 bits each, so texel i starts at bit 16 + 3 * i. This replaces
+	// a chain of comparisons assigning bits0 to bitsF.
+	const int shift = 16 + 3 * index;
+	
+	// Clear the three bits of the texel, then store the new value there.
+	const uint64 cleared = u & ~(uint64(0x7) << shift);
+	u = cleared | (uint64(value) << shift);
+}
+
+/// Flip DXT5 alpha block vertically.
+/// Reverses the order of the four 12 bit index rows stored above the two
+/// endpoint bytes; the endpoints are unchanged.
+void AlphaBlockDXT5::flip4()
+{
+	// @@ The masks might have to be byte swapped.
+	const uint64 endpoints = u & POSH_U64(0x000000000000FFFF);
+	
+	// Each row of four texels takes 4 * 3 = 12 bits.
+	const uint64 row0 = (u >> 16) & 0xFFF;
+	const uint64 row1 = (u >> 28) & 0xFFF;
+	const uint64 row2 = (u >> 40) & 0xFFF;
+	const uint64 row3 = (u >> 52) & 0xFFF;
+	
+	u = endpoints | (row3 << 16) | (row2 << 28) | (row1 << 40) | (row0 << 52);
+}
+
+/// Flip half DXT5 alpha block vertically.
+/// Exchanges the first two 12 bit index rows (bits 16-27 and 28-39 of the
+/// packed word). The endpoints and the last two rows are unchanged.
+void AlphaBlockDXT5::flip2()
+{
+	// The rows start after the two 8 bit endpoints, the same layout that
+	// flip4(), index() and setIndex() use. Swapping the two 12 bit halves
+	// of the first 24 bits of the block instead would scramble the
+	// endpoints, because they occupy bits 0-15.
+	// @@ The masks might have to be byte swapped.
+	const uint64 row0 = (u >> 16) & 0xFFF;
+	const uint64 row1 = (u >> 28) & 0xFFF;
+	
+	const uint64 others = u & ~POSH_U64(0x000000FFFFFF0000);
+	
+	u = others | (row1 << 16) | (row0 << 28);
+}
+
+/// Decode this block: colors come from the DXT1 part, alpha from the
+/// interpolated alpha part.
+void BlockDXT5::decodeBlock(ColorBlock * block) const
+{
+	nvDebugCheck(block != NULL);
+	
+	// Decode color. This writes all four components of every texel; the
+	// alpha written here is replaced below.
+	color.decodeBlock(block);
+	
+	// Decode alpha: one palette index per texel and the palette itself.
+	uint8 selectors[16];
+	alpha.indices(selectors);
+	
+	uint8 levels[8];
+	alpha.evaluatePalette(levels);
+	
+	for(uint texel = 0; texel < 16; texel++) {
+		const uint8 selector = selectors[texel];
+		block->color(texel).a = levels[selector];
+	}
+}
+
+/// Flip DXT5 block vertically.
+/// The color and alpha parts are independent and are flipped separately.
+void BlockDXT5::flip4()
+{
+	color.flip4();
+	alpha.flip4();
+}
+
+/// Flip half DXT5 block vertically.
+/// The color and alpha parts are independent and are flipped separately.
+void BlockDXT5::flip2()
+{
+	color.flip2();
+	alpha.flip2();
+}
+
+
+/// Decode 3DC block.
+/// The x part is decoded into the red channel and the y part into the green
+/// channel of every texel. Blue is set to 0 and alpha to 0xFF.
+/// NOTE(review): the channel assignment follows the "red, green channels
+/// (X,Y)" note of the BC5 compressor; confirm that callers expect this and
+/// do not need a reconstructed third component.
+void Block3DC::decodeBlock(ColorBlock * block) const
+{
+	nvDebugCheck(block != NULL);
+	
+	uint8 levels[8];
+	uint8 selectors[16];
+	
+	// Decode X into red.
+	x.evaluatePalette(levels);
+	x.indices(selectors);
+	
+	for(uint i = 0; i < 16; i++) {
+		block->color(i).r = levels[selectors[i]];
+	}
+	
+	// Decode Y into green and give the unused components fixed values so
+	// that the whole texel is defined.
+	y.evaluatePalette(levels);
+	y.indices(selectors);
+	
+	for(uint i = 0; i < 16; i++) {
+		Color32 & c = block->color(i);
+		c.g = levels[selectors[i]];
+		c.b = 0x00;
+		c.a = 0xFF;
+	}
+}
+
+/// Flip 3DC block vertically.
+/// The two parts are independent and are flipped separately.
+void Block3DC::flip4()
+{
+	x.flip4();
+	y.flip4();
+}
+
+/// Flip half 3DC block vertically.
+/// The two parts are independent and are flipped separately.
+void Block3DC::flip2()
+{
+	x.flip2();
+	y.flip2();
+}
+
+
+
+
+
+	
+
+
+
+
+
--- a/src/nvimage/nvtt/BlockDXT.h
+++ b/src/nvimage/nvtt/BlockDXT.h
@ -0,0 +1,176 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_BLOCKDXT_H
+#define NV_TT_BLOCKDXT_H
+
+#include <nvmath/Color.h>
+#include "nvtt.h"
+
+namespace nv
+{
+	struct ColorBlock;
+
+	/// DXT1 block: two 16 bit endpoint colors followed by sixteen 2 bit palette indices.
+	struct BlockDXT1
+	{
+		Color16 col0;	// First endpoint.
+		Color16 col1;	// Second endpoint.
+		union {
+			uint8 row[4];	// One byte per row of four texels, 2 bits per texel.
+			uint indices;	// The same sixteen indices as a single word.
+		};
+	
+		bool isFourColorMode() const;
+	
+		uint evaluatePalette(Color32 color_array[4]) const;	// Returns the number of colors of the block (4 or 3).
+		uint evaluatePaletteFast(Color32 color_array[4]) const;	// NOTE(review): not defined in BlockDXT.cpp -- confirm that a definition exists.
+		void evaluatePalette3(Color32 color_array[4]) const;	// Palette of a three color block.
+		void evaluatePalette4(Color32 color_array[4]) const;	// Palette of a four color block.
+		
+		void decodeBlock(ColorBlock * block) const;	// Writes the 4x4 decoded colors to block.
+		
+		void setIndices(int * idx);	// idx points to 16 values; the two low bits of each are stored.
+
+		void flip4();	// Reverses the order of the four rows.
+		void flip2();	// Exchanges the first two rows.
+	};
+	
+	/// Return true if the block uses four color mode, false otherwise.
+	/// Four color mode is selected only when col0 is strictly greater than
+	/// col1; equal endpoints select three color mode, which is the same
+	/// test that evaluatePalette() uses to pick the palette.
+	inline bool BlockDXT1::isFourColorMode() const
+	{
+		return col0.u > col1.u;
+	}
+	
+	
+	
+	
+	/// DXT3 alpha block with explicit alpha: one 4 bit value per texel, also accessible as four 16 bit rows.
+	struct AlphaBlockDXT3
+	{
+		union {
+			struct {	// NOTE(review): relies on the compiler packing bit fields lowest bits first to match row[] -- confirm for new platforms.
+				uint alpha0 : 4;
+				uint alpha1 : 4;
+				uint alpha2 : 4;
+				uint alpha3 : 4;
+				uint alpha4 : 4;
+				uint alpha5 : 4;
+				uint alpha6 : 4;
+				uint alpha7 : 4;
+				uint alpha8 : 4;
+				uint alpha9 : 4;
+				uint alphaA : 4;
+				uint alphaB : 4;
+				uint alphaC : 4;
+				uint alphaD : 4;
+				uint alphaE : 4;
+				uint alphaF : 4;
+			};
+			uint16 row[4];	// The rows swapped by flip4() and flip2().
+		};
+		
+		void flip4();	// Reverses the order of the four rows.
+		void flip2();	// Exchanges the first two rows.
+	};
+	
+	
+	/// DXT3 block: an explicit alpha block followed by a DXT1 color block.
+	struct BlockDXT3
+	{
+		AlphaBlockDXT3 alpha;
+		BlockDXT1 color;
+		
+		void decodeBlock(ColorBlock * block) const;	// Decodes the colors, then replaces the alpha of every texel.
+		
+		void flip4();	// Flips both parts.
+		void flip2();	// Flips the first two rows of both parts.
+	};
+	
+	
+	/// DXT5 alpha block: two 8 bit endpoints followed by sixteen 3 bit palette indices, 64 bits in total.
+	struct AlphaBlockDXT5
+	{
+		union {
+			struct {	// The trailing numbers are running bit counts: index bits so far - total bits so far.
+				uint64 alpha0 : 8;	// 8
+				uint64 alpha1 : 8;	// 16
+				uint64 bits0 : 3;	// 3 - 19
+				uint64 bits1 : 3; 	// 6 - 22
+				uint64 bits2 : 3; 	// 9 - 25
+				uint64 bits3 : 3;	// 12 - 28
+				uint64 bits4 : 3;	// 15 - 31
+				uint64 bits5 : 3;	// 18 - 34
+				uint64 bits6 : 3;	// 21 - 37
+				uint64 bits7 : 3;	// 24 - 40
+				uint64 bits8 : 3;	// 27 - 43
+				uint64 bits9 : 3; 	// 30 - 46
+				uint64 bitsA : 3; 	// 33 - 49
+				uint64 bitsB : 3;	// 36 - 52
+				uint64 bitsC : 3;	// 39 - 55
+				uint64 bitsD : 3;	// 42 - 58
+				uint64 bitsE : 3;	// 45 - 61
+				uint64 bitsF : 3;	// 48 - 64
+			};
+			uint64 u;	// The whole block as one word; index() and setIndex() address index i at bit 16 + 3 * i.
+		};
+		
+		void evaluatePalette(uint8 alpha[8]) const;	// Uses the 8-alpha palette when alpha0 > alpha1, the 6-alpha one otherwise.
+		void evaluatePalette8(uint8 alpha[8]) const;	// Endpoints plus six interpolated values.
+		void evaluatePalette6(uint8 alpha[8]) const;	// Endpoints, four interpolated values, 0x00 and 0xFF.
+		void indices(uint8 index_array[16]) const;	// Unpacks all sixteen indices.
+
+		uint index(uint index) const;	// Index of one texel (0 to 15).
+		void setIndex(uint index, uint value);	// value must be less than 8.
+		
+		void flip4();	// Reverses the order of the four index rows.
+		void flip2();	// Meant to exchange the first two index rows.
+	};
+	
+	/// DXT5 block: an interpolated alpha block followed by a DXT1 color block.
+	struct BlockDXT5
+	{
+		AlphaBlockDXT5 alpha;
+		BlockDXT1 color;
+		
+		void decodeBlock(ColorBlock * block) const;	// Decodes the colors, then replaces the alpha of every texel.
+		
+		void flip4();	// Flips both parts.
+		void flip2();	// Flips the first two rows of both parts.
+	};
+	
+	/// 3DC block: two DXT5 style alpha blocks, y first and then x.
+	struct Block3DC
+	{
+		AlphaBlockDXT5 y;
+		AlphaBlockDXT5 x;
+		
+		void decodeBlock(ColorBlock * block) const;
+		
+		void flip4();	// Flips both parts.
+		void flip2();	// Flips the first two rows of both parts.
+	};
+
+} // nv namespace
+
+#endif // NV_TT_BLOCKDXT_H
--- a/src/nvimage/nvtt/CMakeLists.txt
+++ b/src/nvimage/nvtt/CMakeLists.txt
@ -0,0 +1,57 @@
+PROJECT(nvtt)
+
+ADD_SUBDIRECTORY(squish)	# Provides the squish library that nvtt links against.
+
+SET(NVTT_SRCS	# Sources and headers of the nvtt library.
+	nvtt.h 
+	CompressDXT.h
+	CompressDXT.cpp
+	CompressRGB.h
+	CompressRGB.cpp
+	FastCompressDXT.h
+	FastCompressDXT.cpp
+	BlockDXT.h
+	BlockDXT.cpp
+	dxtlib.cpp
+	dxtlib_compat.h
+	CompressionOptions.h
+	CompressionOptions.cpp
+	InputOptions.h
+	InputOptions.cpp
+	OutputOptions.cpp
+	cuda/CudaUtils.h
+	cuda/CudaUtils.cpp
+	cuda/CudaCompressDXT.h
+	cuda/CudaCompressDXT.cpp)
+
+IF(CUDA_FOUND)	# Optional CUDA support.
+	ADD_DEFINITIONS(-DHAVE_CUDA)	# Lets the sources detect that CUDA is available.
+	WRAP_CUDA(CUDA_SRCS cuda/CompressKernel.cu)	# Presumably fills CUDA_SRCS with the sources generated from the kernel -- confirm in the CUDA find module.
+	SET(NVTT_SRCS ${NVTT_SRCS} ${CUDA_SRCS})
+	SET(LIBS ${LIBS} ${CUDA_LIBRARY})
+	INCLUDE_DIRECTORIES(${CUDA_INCLUDE_PATH})
+ENDIF(CUDA_FOUND)
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+ADD_DEFINITIONS(-DNVTT_EXPORTS)
+
+# Build nvtt from the sources listed in NVTT_SRCS: as a shared library when
+# NVTT_SHARED is set, with the default library type otherwise.
+IF(NVTT_SHARED)	
+	ADD_LIBRARY(nvtt SHARED ${NVTT_SRCS})
+ELSE(NVTT_SHARED)
+	ADD_LIBRARY(nvtt ${NVTT_SRCS})
+ENDIF(NVTT_SHARED)
+
+TARGET_LINK_LIBRARIES(nvtt ${LIBS} nvcore nvmath nvimage squish)
+
+
+# test executables: nvcompress is built from compress.cpp and installed to bin.
+ADD_EXECUTABLE(nvcompress compress.cpp)
+TARGET_LINK_LIBRARIES(nvcompress nvcore nvmath nvimage nvtt)
+
+INSTALL(TARGETS nvcompress DESTINATION bin)
+
+#ADD_EXECUTABLE(nvtextool nvdxt.cpp configdialog.cpp)
+
+
+
--- a/src/nvimage/nvtt/CompressDXT.cpp
+++ b/src/nvimage/nvtt/CompressDXT.cpp
@ -0,0 +1,535 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Memory.h>
+
+#include <nvimage/Image.h>
+#include <nvimage/ColorBlock.h>
+
+#include "nvtt.h"
+#include "CompressDXT.h"
+#include "FastCompressDXT.h"
+#include "BlockDXT.h"
+#include "CompressionOptions.h"
+
+// squish
+#include "squish/colourset.h"
+//#include "squish/clusterfit.h"
+#include "squish/fastclusterfit.h"
+#include "squish/weightedclusterfit.h"
+
+// s3_quant
+#if defined(HAVE_S3QUANT)
+#include "s3tc/s3_quant.h"
+#endif
+
+// ati tc
+#if defined(HAVE_ATITC)
+#include "atitc/ATI_Compress.h"
+#endif
+
+//#include <time.h>
+
+using namespace nv;
+using namespace nvtt;
+
+
+/// Compress the image to DXT1 blocks with compressBlock_BoundsRange followed
+/// by optimizeEndPoints. Blocks are passed to the output handler, if any, in
+/// row major order.
+void nv::fastCompressDXT1(const Image * image, const OutputOptions & outputOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+	
+	ColorBlock pixels;
+	BlockDXT1 dxtBlock;
+
+	// Walk the image in steps of 4x4 texels.
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+			pixels.init(image, bx, by);
+			
+			compressBlock_BoundsRange(pixels, &dxtBlock);
+
+			// @@ Use iterative optimization.
+			optimizeEndPoints(pixels, &dxtBlock);
+			
+			// Without an output handler the block is discarded.
+			if (outputOptions.outputHandler == NULL) continue;
+			
+			outputOptions.outputHandler->writeData(&dxtBlock, sizeof(dxtBlock));
+		}
+	}
+}
+
+
+/// Compress the image to DXT3 blocks with compressBlock_BoundsRange. Blocks
+/// are passed to the output handler, if any, in row major order.
+void nv::fastCompressDXT3(const Image * image, const nvtt::OutputOptions & outputOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+	
+	ColorBlock pixels;
+	BlockDXT3 dxtBlock;
+
+	// Walk the image in steps of 4x4 texels.
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+			pixels.init(image, bx, by);
+			compressBlock_BoundsRange(pixels, &dxtBlock);
+			
+			// Without an output handler the block is discarded.
+			if (outputOptions.outputHandler == NULL) continue;
+			
+			outputOptions.outputHandler->writeData(&dxtBlock, sizeof(dxtBlock));
+		}
+	}
+}
+
+
+/// Compress the image to DXT5 blocks with compressBlock_BoundsRange. Blocks
+/// are passed to the output handler, if any, in row major order.
+void nv::fastCompressDXT5(const Image * image, const nvtt::OutputOptions & outputOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+	
+	ColorBlock pixels;
+	BlockDXT5 dxtBlock;
+
+	// Walk the image in steps of 4x4 texels.
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+			pixels.init(image, bx, by);
+			compressBlock_BoundsRange(pixels, &dxtBlock);
+			
+			// Without an output handler the block is discarded.
+			if (outputOptions.outputHandler == NULL) continue;
+			
+			outputOptions.outputHandler->writeData(&dxtBlock, sizeof(dxtBlock));
+		}
+	}
+}
+
+
+/// Compress the image to DXT5n blocks: every block is swizzled with
+/// swizzleDXT5n and then compressed with compressBlock_BoundsRange. Blocks
+/// are passed to the output handler, if any, in row major order.
+void nv::fastCompressDXT5n(const Image * image, const nvtt::OutputOptions & outputOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+	
+	ColorBlock pixels;
+	BlockDXT5 dxtBlock;
+
+	// Walk the image in steps of 4x4 texels.
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+			pixels.init(image, bx, by);
+			
+			// copy X coordinate to green channel and Y coordinate to alpha channel.
+			pixels.swizzleDXT5n();
+			
+			compressBlock_BoundsRange(pixels, &dxtBlock);
+			
+			// Without an output handler the block is discarded.
+			if (outputOptions.outputHandler == NULL) continue;
+			
+			outputOptions.outputHandler->writeData(&dxtBlock, sizeof(dxtBlock));
+		}
+	}
+}
+
+
+void nv::fastCompressBC4(const Image * image, const nvtt::OutputOptions & outputOptions)
+{
+	// @@ TODO: not implemented yet; both arguments are ignored and nothing is written.
+	// Planned: compress red channel (X)
+}
+
+
+void nv::fastCompressBC5(const Image * image, const nvtt::OutputOptions & outputOptions)
+{
+	// @@ TODO: not implemented yet; both arguments are ignored and nothing is written.
+	// Planned: compress red, green channels (X,Y)
+}
+
+
+/// Run the precomputation of squish::FastClusterFit on the first call; later
+/// calls return immediately.
+/// NOTE(review): the guard is a plain static flag without synchronization --
+/// confirm that the first call cannot happen concurrently.
+void nv::doPrecomputation()
+{
+	static bool done = false;
+	
+	if (done) return;
+	
+	done = true;
+	squish::FastClusterFit::doPrecomputation();
+}
+
+
+/// Compress the image to DXT1 blocks with squish's FastClusterFit, using the
+/// color weights of the compression options as metric. Blocks are passed to
+/// the output handler, if any, in row major order.
+void nv::compressDXT1(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+	
+	ColorBlock pixels;
+	BlockDXT1 dxtBlock;
+
+	// FastClusterFit needs its precomputation to run before the first fit.
+	doPrecomputation();
+
+	// Walk the image in steps of 4x4 texels.
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+			
+			pixels.init(image, bx, by);
+			
+			// Compress color. squish::WeightedClusterFit and
+			// squish::ClusterFit are alternatives to the fitter used here.
+			squish::ColourSet colours((uint8 *)pixels.colors(), 0);
+			squish::FastClusterFit fit(&colours, squish::kDxt1);
+			fit.setMetric(
+				compressionOptions.colorWeight.x(),
+				compressionOptions.colorWeight.y(),
+				compressionOptions.colorWeight.z());
+			fit.Compress(&dxtBlock);
+			
+			// @@ Use iterative cluster fit algorithm to improve error in highest quality mode.
+			
+			// Without an output handler the block is discarded.
+			if (outputOptions.outputHandler == NULL) continue;
+			
+			outputOptions.outputHandler->writeData(&dxtBlock, sizeof(dxtBlock));
+		}
+	}
+}
+
+
+/// Compress the image to DXT3 blocks: explicit alpha with compressBlock and
+/// color with squish's WeightedClusterFit, weighting colors by alpha. Blocks
+/// are passed to the output handler, if any, in row major order.
+void nv::compressDXT3(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+	
+	ColorBlock pixels;
+	BlockDXT3 dxtBlock;
+	
+	// Walk the image in steps of 4x4 texels.
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+			
+			pixels.init(image, bx, by);
+			
+			// Compress explicit alpha.
+			compressBlock(pixels, &dxtBlock.alpha);
+			
+			// Compress color.
+			squish::ColourSet colours((uint8 *)pixels.colors(), squish::kWeightColourByAlpha);
+			squish::WeightedClusterFit fit(&colours, 0);
+			fit.setMetric(
+				compressionOptions.colorWeight.x(),
+				compressionOptions.colorWeight.y(),
+				compressionOptions.colorWeight.z());
+			fit.Compress(&dxtBlock.color);
+			
+			// Without an output handler the block is discarded.
+			if (outputOptions.outputHandler == NULL) continue;
+			
+			outputOptions.outputHandler->writeData(&dxtBlock, sizeof(dxtBlock));
+		}
+	}
+}
+
+/// Compress the image to DXT5 blocks: alpha with the brute force compressor
+/// in highest quality mode and the iterative one otherwise, color with
+/// squish's WeightedClusterFit, weighting colors by alpha. Blocks are passed
+/// to the output handler, if any, in row major order.
+void nv::compressDXT5(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+	
+	ColorBlock pixels;
+	BlockDXT5 dxtBlock;
+	
+	// Walk the image in steps of 4x4 texels.
+	for (uint by = 0; by < height; by += 4) {
+		for (uint bx = 0; bx < width; bx += 4) {
+			
+			pixels.init(image, bx, by);
+
+			// Compress alpha. The error reported by the compressors is
+			// not used here.
+			if (compressionOptions.quality != Quality_Highest)
+			{
+				compressBlock_Iterative(pixels, &dxtBlock.alpha);
+			}
+			else
+			{
+				compressBlock_BruteForce(pixels, &dxtBlock.alpha);
+			}
+
+			// Compress color.
+			squish::ColourSet colours((uint8 *)pixels.colors(), squish::kWeightColourByAlpha);
+			squish::WeightedClusterFit fit(&colours, 0);
+			fit.setMetric(
+				compressionOptions.colorWeight.x(),
+				compressionOptions.colorWeight.y(),
+				compressionOptions.colorWeight.z());
+			fit.Compress(&dxtBlock.color);
+			
+			// Without an output handler the block is discarded.
+			if (outputOptions.outputHandler == NULL) continue;
+			
+			outputOptions.outputHandler->writeData(&dxtBlock, sizeof(dxtBlock));
+		}
+	}
+}
+
+
+/// Compress a normal map to DXT5n and hand every encoded 4x4 block to the output handler.
+/// Each block is swizzled so that X ends up in the green channel and Y in the alpha
+/// channel; Y is then encoded as the alpha block and X as the color block using a
+/// green-only metric.
+void nv::compressDXT5n(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+
+	ColorBlock rgba;
+	BlockDXT5 block;
+
+	doPrecomputation();
+
+	for (uint by = 0; by < height; by += 4)
+	{
+		for (uint bx = 0; bx < width; bx += 4)
+		{
+			rgba.init(image, bx, by);
+
+			// Move X to the green channel and Y to the alpha channel.
+			rgba.swizzleDXT5n();
+
+			// Y: the iterative pass always runs, the brute force pass follows it
+			// when the highest quality is requested. The reported error is unused.
+			compressBlock_Iterative(rgba, &block.alpha);
+			if (compressionOptions.quality == Quality_Highest)
+			{
+				compressBlock_BruteForce(rgba, &block.alpha);
+			}
+
+			// X: only the green channel contributes to the metric.
+			squish::ColourSet colours((uint8 *)rgba.colors(), 0);
+			squish::FastClusterFit fit(&colours, 0);
+			fit.setMetric(0, 1, 0);
+			fit.Compress(&block.color);
+
+			// Without a handler the encoded block is simply dropped.
+			if (outputOptions.outputHandler == NULL) continue;
+
+			outputOptions.outputHandler->writeData(&block, sizeof(block));
+		}
+	}
+}
+
+
+/// Compress @a image to BC4 (a single DXT5-style alpha block per 4x4 tile) and hand
+/// every block to the output handler. Each block is first encoded iteratively; the
+/// brute force search is tried afterwards when the quality is Quality_Highest, or when
+/// it is Quality_Production and the iterative error exceeds the configured threshold.
+/// The accumulated error divided by the pixel count is printed at the end.
+void nv::compressBC4(const Image * image, const nvtt::OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+
+	// threshold should be from [0 - 1] but may also be higher...
+	const uint threshold = uint(compressionOptions.errorThreshold * 256);
+
+	ColorBlock rgba;
+	AlphaBlockDXT5 block;
+
+	uint errorSum = 0;
+
+	for (uint by = 0; by < height; by += 4)
+	{
+		for (uint bx = 0; bx < width; bx += 4)
+		{
+			rgba.init(image, bx, by);
+
+			//error = compressBlock_BoundsRange(rgba, &block);
+			uint blockErr = compressBlock_Iterative(rgba, &block);
+
+			const bool alwaysRefine = (compressionOptions.quality == Quality_Highest);
+			const bool refineOnError = (compressionOptions.quality == Quality_Production) && (blockErr > threshold);
+
+			if (alwaysRefine || refineOnError)
+			{
+				// Try brute force algorithm.
+				blockErr = compressBlock_BruteForce(rgba, &block);
+			}
+
+			errorSum += blockErr;
+
+			// Without a handler the encoded block is simply dropped.
+			if (outputOptions.outputHandler == NULL) continue;
+
+			outputOptions.outputHandler->writeData(&block, sizeof(block));
+		}
+	}
+
+	// @@ All the compressors should work like this.
+	// Effect of adjusting threshold: 
+	// (threshold: error - time)
+	// 0: 4.29 - 1.83
+	// 32: 4.32 - 1.77
+	// 48: 4.37 - 1.72
+	// 64: 4.43 - 1.45
+	// 74: 4.45 - 1.35
+	// 92: 4.54 - 1.15
+	// 128: 4.67 - 0.79
+	// 256: 4.92 - 0.20
+	// inf: 4.98 - 0.09
+
+	printf("Alpha error: %f\n", float(errorSum) / (width*height));
+}
+
+
+/// Compress @a image to BC5 (two alpha-style blocks per 4x4 tile, one for X and one
+/// for Y) and hand every block to the output handler. Both channels use the brute
+/// force search when the quality is Quality_Highest and the iterative search otherwise.
+void nv::compressBC5(const Image * image, const nvtt::OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	const uint width = image->width();
+	const uint height = image->height();
+
+	ColorBlock xcolor;
+	ColorBlock ycolor;
+
+	Block3DC block;
+
+	for (uint by = 0; by < height; by += 4)
+	{
+		for (uint bx = 0; bx < width; bx += 4)
+		{
+			// One copy of the tile per channel, each with its channel splatted.
+			xcolor.init(image, bx, by);
+			xcolor.splatX();
+
+			ycolor.init(image, bx, by);
+			ycolor.splatY();
+
+			// @@ Compute normal error, instead of separate xy errors.
+			// The per-channel errors returned by the compressors are currently unused.
+			if (compressionOptions.quality != Quality_Highest)
+			{
+				compressBlock_Iterative(xcolor, &block.x);
+				compressBlock_Iterative(ycolor, &block.y);
+			}
+			else
+			{
+				compressBlock_BruteForce(xcolor, &block.x);
+				compressBlock_BruteForce(ycolor, &block.y);
+			}
+
+			// Without a handler the encoded block is simply dropped.
+			if (outputOptions.outputHandler == NULL) continue;
+
+			outputOptions.outputHandler->writeData(&block, sizeof(block));
+		}
+	}
+}
+
+
+#if defined(HAVE_S3QUANT)
+
+/// Compress @a image to DXT1 with the external S3 quantizer (CodeRGBBlock).
+/// Every 4x4 tile is coded twice, once with 4 levels and once with 3 levels; the
+/// variant with the smaller block error is handed to the output handler. The average
+/// error per block is printed at the end.
+void nv::s3CompressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions)
+{
+	const uint w = image->width();
+	const uint h = image->height();
+	
+	float error = 0.0f;
+
+	BlockDXT1 dxtBlock3;
+	BlockDXT1 dxtBlock4;
+	ColorBlock block;
+
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			block.init(image, x, y);
+
+			// Init rgb block: 16 colors converted to floats in [0, 1], uniform channel weights.
+			RGBBlock rgbBlock;
+			rgbBlock.n = 16;
+			for (uint i = 0; i < 16; i++) {
+				rgbBlock.colorChannel[i][0] = clamp(float(block.color(i).r) / 255.0f, 0.0f, 1.0f);
+				rgbBlock.colorChannel[i][1] = clamp(float(block.color(i).g) / 255.0f, 0.0f, 1.0f);
+				rgbBlock.colorChannel[i][2] = clamp(float(block.color(i).b) / 255.0f, 0.0f, 1.0f);
+			}
+			rgbBlock.weight[0] = 1.0f;
+			rgbBlock.weight[1] = 1.0f;
+			rgbBlock.weight[2] = 1.0f;
+
+			// First pass: 4 levels.
+			rgbBlock.inLevel = 4;
+			CodeRGBBlock(&rgbBlock);
+
+			// Copy results to DXT block.
+			dxtBlock4.col0.r = rgbBlock.endPoint[0][0];
+			dxtBlock4.col0.g = rgbBlock.endPoint[0][1];
+			dxtBlock4.col0.b = rgbBlock.endPoint[0][2];
+
+			dxtBlock4.col1.r = rgbBlock.endPoint[1][0];
+			dxtBlock4.col1.g = rgbBlock.endPoint[1][1];
+			dxtBlock4.col1.b = rgbBlock.endPoint[1][2];
+
+			dxtBlock4.setIndices(rgbBlock.index);
+
+			// Keep col0 >= col1 for the 4 level block; swapping the endpoints
+			// requires remapping the indices as well.
+			if (dxtBlock4.col0.u < dxtBlock4.col1.u) {
+				swap(dxtBlock4.col0.u, dxtBlock4.col1.u);
+				dxtBlock4.indices ^= 0x55555555;
+			}
+
+			uint error4 = blockError(block, dxtBlock4);
+
+			// Second pass: 3 levels.
+			rgbBlock.inLevel = 3;
+
+			CodeRGBBlock(&rgbBlock);
+
+			// Copy results to DXT block.
+			dxtBlock3.col0.r = rgbBlock.endPoint[0][0];
+			dxtBlock3.col0.g = rgbBlock.endPoint[0][1];
+			dxtBlock3.col0.b = rgbBlock.endPoint[0][2];
+
+			dxtBlock3.col1.r = rgbBlock.endPoint[1][0];
+			dxtBlock3.col1.g = rgbBlock.endPoint[1][1];
+			dxtBlock3.col1.b = rgbBlock.endPoint[1][2];
+
+			dxtBlock3.setIndices(rgbBlock.index);
+
+			// Keep col0 <= col1 for the 3 level block; again the indices are
+			// remapped to match the swapped endpoints.
+			if (dxtBlock3.col0.u > dxtBlock3.col1.u) {
+				swap(dxtBlock3.col0.u, dxtBlock3.col1.u);
+				dxtBlock3.indices ^= (~dxtBlock3.indices  >> 1) & 0x55555555;
+			}
+
+			uint error3 = blockError(block, dxtBlock3);
+
+			// Emit whichever variant reproduces the tile better.
+			if (error3 < error4) {
+				error += error3;
+
+				if (outputOptions.outputHandler != NULL) {
+					outputOptions.outputHandler->writeData(&dxtBlock3, sizeof(dxtBlock3));
+				}
+			}
+			else {
+				error += error4;
+
+				if (outputOptions.outputHandler != NULL) {
+					outputOptions.outputHandler->writeData(&dxtBlock4, sizeof(dxtBlock4));
+				}
+			}
+		}
+	}
+
+	// Number of 4x4 blocks visited above. Both factors need their own parentheses:
+	// "(w+3)/4 * (h+3)/4" groups as "(((w+3)/4) * (h+3)) / 4", which is not the block count.
+	const uint blockCount = ((w + 3) / 4) * ((h + 3) / 4);
+
+	printf("error = %f\n", error / blockCount);
+}
+
+#endif // defined(HAVE_S3QUANT)
+
+
+#if defined(HAVE_ATITC)
+
+/// Compress @a image to DXT1 with the external ATI compressor.
+/// The whole image is converted in one ATI_TC_ConvertTexture call and the resulting
+/// buffer is handed to the output handler in a single write.
+void nv::atiCompressDXT1(const Image * image, const OutputOptions & outputOptions)
+{
+	// Init source texture: it points directly at the image pixels, 4 bytes per pixel.
+	ATI_TC_Texture srcTexture;
+	srcTexture.dwSize = sizeof(srcTexture);
+	srcTexture.dwWidth = image->width();
+	srcTexture.dwHeight = image->height();
+	srcTexture.dwPitch = image->width() * 4;
+	srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
+	srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
+	srcTexture.pData = (ATI_TC_BYTE*) image->pixels();
+
+	// Init dest texture: the buffer is sized by the ATI library and owned by this function.
+	ATI_TC_Texture destTexture;
+	destTexture.dwSize = sizeof(destTexture);
+	destTexture.dwWidth = image->width();
+	destTexture.dwHeight = image->height();
+	destTexture.dwPitch = 0;
+	destTexture.format = ATI_TC_FORMAT_DXT1;
+	destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
+	destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);
+
+	// Compress
+	ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL);
+
+	if (outputOptions.outputHandler != NULL) {
+		outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
+	}
+
+	// The compressed data has been handed over; release the temporary buffer.
+	mem::free(destTexture.pData);
+}
+
+#endif // defined(HAVE_ATITC)
--- a/src/nvimage/nvtt/CompressDXT.h
+++ b/src/nvimage/nvtt/CompressDXT.h
@ -0,0 +1,65 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_COMPRESSDXT_H
+#define NV_TT_COMPRESSDXT_H
+
+#include <nvimage/nvimage.h>
+#include "nvtt.h"
+
+namespace nv
+{
+	class Image;
+	class FloatImage;
+
+	// Called by compressDXT5n() before it starts encoding blocks.
+	// NOTE(review): presumably prepares data shared by the compressors -- confirm in the implementation.
+	void doPrecomputation();
+	
+	// Fast compressors.
+	// These take only the source image and the output options (no compression options).
+	void fastCompressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions);
+	void fastCompressDXT3(const Image * image, const nvtt::OutputOptions & outputOptions);
+	void fastCompressDXT5(const Image * image, const nvtt::OutputOptions & outputOptions);
+	void fastCompressDXT5n(const Image * image, const nvtt::OutputOptions & outputOptions);
+	void fastCompressBC4(const Image * image, const nvtt::OutputOptions & outputOptions);
+	void fastCompressBC5(const Image * image, const nvtt::OutputOptions & outputOptions);
+
+	// Normal compressors.
+	// These additionally receive the private compression options (quality, color weights, ...).
+	void compressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	void compressDXT3(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	void compressDXT5(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	void compressDXT5n(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	void compressBC4(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	void compressBC5(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	
+	// External compressors.
+	// Only declared when the corresponding third party library was detected at build time.
+#if defined(HAVE_S3QUANT)
+	void s3CompressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions);
+#endif
+	
+#if defined(HAVE_ATITC)
+	void atiCompressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions);
+#endif
+
+} // nv namespace
+
+
+#endif // NV_TT_COMPRESSDXT_H
--- a/src/nvimage/nvtt/CompressRGB.cpp
+++ b/src/nvimage/nvtt/CompressRGB.cpp
@ -0,0 +1,153 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+#include <nvcore/Debug.h>
+#include <nvimage/Image.h>
+
+#include "CompressRGB.h"
+#include "CompressionOptions.h"
+
+
+using namespace nv;
+using namespace nvtt;
+
+namespace 
+{
+
+	/// Size in bytes of one row of @a w pixels that take @a bitsize bits each
+	/// (rounded up to whole bytes per pixel), padded to a multiple of four bytes.
+	inline uint computePitch(uint w, uint bitsize)
+	{
+		const uint bytesPerPixel = (bitsize + 7) / 8;
+		const uint rowBytes = w * bytesPerPixel;
+
+		// Round up to the next 32 bit boundary.
+		return (rowBytes + 3) & ~3u;
+	}
+
+	/// Placeholder for the RGBA 8888 row converter; it does not write anything yet.
+	/// The source is taken as const, like the other row converters, so that it can be
+	/// fed directly from a const scanline.
+	static void convert_to_rgba8888(const void * src, void * dst, uint w)
+	{
+		// @@ TODO
+	}
+
+	/// Copy one row of @a w pixels, four bytes each, from @a src to @a dst unchanged.
+	static void convert_to_bgra8888(const void * src, void * dst, uint w)
+	{
+		const uint rowBytes = 4 * w;
+		memcpy(dst, src, rowBytes);
+	}
+
+	// Placeholder for the RGB 888 row converter: the body is empty, so nothing is
+	// written to dst. The only call site is currently commented out in compressRGB().
+	static void convert_to_rgb888(const void * src, void * dst, uint w)
+	{
+		// @@ TODO
+	}
+
+	/// Reduce a value of @a inbits bits to @a outbits bits by dropping its low bits.
+	/// @a inbits must be greater than @a outbits (checked in debug builds).
+	/// @return @a c shifted right by the number of dropped bits.
+	static uint truncate(uint c, uint inbits, uint outbits)
+	{
+		nvDebugCheck(inbits > outbits);	
+
+		// The shifted value has to be returned; falling off the end of a
+		// value-returning function is undefined behavior.
+		return c >> (inbits - outbits);
+	}
+
+	/// Expand a value of @a inbits bits to @a outbits bits by repeating its bit
+	/// pattern, so that 0 maps to 0 and the all-ones input maps to the all-ones output.
+	/// NOTE(review): the original body was an unimplemented stub without a return
+	/// statement; bit replication is the conventional choice for widening color
+	/// channels -- confirm this is the intended behavior.
+	/// When @a inbits is not smaller than @a outbits the value is shifted down instead.
+	static uint bitexpand(uint c, uint inbits, uint outbits)
+	{
+		if (inbits == 0)
+		{
+			// Nothing to replicate.
+			return 0;
+		}
+		if (inbits >= outbits)
+		{
+			// Not an expansion; keep the most significant bits.
+			return c >> (inbits - outbits);
+		}
+
+		// From here on inbits < outbits, so every shift below is smaller than the word size.
+		c &= (1u << inbits) - 1;
+
+		uint result = 0;
+		uint filled = 0;
+		while (filled < outbits)
+		{
+			const uint remaining = outbits - filled;
+			if (remaining >= inbits)
+			{
+				// Append a full copy of the pattern.
+				result = (result << inbits) | c;
+				filled += inbits;
+			}
+			else
+			{
+				// Append only the top bits of the pattern to fill what is left.
+				result = (result << remaining) | (c >> (inbits - remaining));
+				filled = outbits;
+			}
+		}
+
+		return result;
+	}
+	
+	/// Decompose a contiguous channel bit mask.
+	/// @param mask   channel mask, e.g. 0x00FF0000.
+	/// @param shift  receives the number of zero bits below the lowest set bit.
+	/// @param size   receives the length of the run of set bits starting at that bit.
+	/// An empty mask yields shift = 0 and size = 0.
+	static void maskShiftAndSize(uint mask, uint & shift, uint & size)
+	{
+		shift = 0;
+		size = 0;	// callers pass uninitialized variables, so always start from zero.
+
+		if (mask == 0)
+		{
+			// No bit will ever be found; searching for one would never terminate.
+			return;
+		}
+
+		while ((mask & 1) == 0) {
+			shift++;
+			mask >>= 1;
+		}
+		
+		while ((mask & 1) == 1) {
+			size++;
+			mask >>= 1;
+		}
+	}
+	
+} // namespace
+
+
+/// Pixel format converter.
+/// Writes @a image to the output handler one scanline at a time; every row written is
+/// @c pitch bytes long, where the pitch is derived from the configured bit count.
+/// Only the plain 4-bytes-per-pixel copy is implemented so far: the channel masks are
+/// decoded but not applied yet.
+void nv::compressRGB(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	nvCheck(image != NULL);
+
+	const uint w = image->width();
+	const uint h = image->height();
+
+	// Decode the channel masks. The results are only needed by the conversion
+	// paths that are still sketched in the comments further down.
+	uint rshift, rsize;
+	maskShiftAndSize(compressionOptions.rmask, rshift, rsize);
+	
+	uint gshift, gsize;
+	maskShiftAndSize(compressionOptions.gmask, gshift, gsize);
+	
+	uint bshift, bsize;
+	maskShiftAndSize(compressionOptions.bmask, bshift, bsize);
+	
+	uint ashift, asize;
+	maskShiftAndSize(compressionOptions.amask, ashift, asize);
+
+
+	// Determine pitch.
+	const uint pitch = computePitch(w, compressionOptions.bitcount);
+
+	// convert_to_bgra8888 always stores 4 bytes per pixel, so the row buffer must be
+	// able to hold that even when the requested bit count gives a smaller pitch
+	// (sizing it from the pitch alone overflows the buffer for 24 bit or less).
+	const uint rowBytes = 4 * w;
+	const uint bufferSize = (pitch > rowBytes) ? pitch : rowBytes;
+
+	void * dst = malloc(bufferSize);
+	if (dst == NULL)
+	{
+		// Out of memory (or an empty row): there is nothing that can be written.
+		return;
+	}
+
+	// Clear once so that padding bytes past the copied pixels have a defined value.
+	memset(dst, 0, bufferSize);
+
+	for (uint y = 0; y < h; y++)
+	{
+		const Color32 * src = image->scanline(y);
+
+		convert_to_bgra8888(src, dst, w);
+
+		// Sketch of the generic mask based conversion (not enabled yet):
+		//	uint c = 0;
+		//	c |= (src[i].r >> (8 - rsize)) << rshift;
+		//	c |= (src[i].g >> (8 - gsize)) << gshift;
+		//	c |= (src[i].b >> (8 - bsize)) << bshift;
+
+		/*
+		if (rmask == 0xFF000000 && gmask == 0xFF0000 && bmask == 0xFF00 && amask == 0xFF)
+		{
+			convert_to_rgba8888(src, dst, w);
+		}
+		else if (rmask == 0xFF0000 && gmask == 0xFF00 && bmask == 0xFF && amask == 0)
+		{
+			convert_to_rgb888(src, dst, w);
+		}
+		else
+		{
+			// @@ Not supported.
+		}
+		*/
+
+		if (outputOptions.outputHandler != NULL)
+		{
+			outputOptions.outputHandler->writeData(dst, pitch);
+		}
+	}
+
+	free(dst);
+}
+
+
--- a/src/nvimage/nvtt/CompressRGB.h
+++ b/src/nvimage/nvtt/CompressRGB.h
@ -0,0 +1,39 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_COMPRESSRGB_H
+#define NV_TT_COMPRESSRGB_H
+
+#include "nvtt.h"
+
+namespace nv
+{
+	class Image;
+
+	// Pixel format converter.
+	// Writes the image row by row to the handler found in outputOptions; the row size is
+	// derived from the bit count stored in compressionOptions.
+	void compressRGB(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	
+} // nv namespace
+
+
+#endif // NV_TT_COMPRESSRGB_H
--- a/src/nvimage/nvtt/CompressionOptions.cpp
+++ b/src/nvimage/nvtt/CompressionOptions.cpp
@ -0,0 +1,113 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include "nvtt.h"
+#include "CompressionOptions.h"
+
+using namespace nv;
+using namespace nvtt;
+
+
+/// Constructor. Allocates the private data and fills it with the default values.
+CompressionOptions::CompressionOptions() : m(*new Private())
+{
+	this->reset();
+}
+
+
+/// Destructor. Releases the private data allocated by the constructor.
+CompressionOptions::~CompressionOptions()
+{
+	Private * const priv = &m;
+	delete priv;
+}
+
+
+/// Set default compression options: DXT1, normal quality, uniform color weights,
+/// hardware compression enabled and a 32 bit pixel format with 8 bits per channel
+/// (blue in the lowest byte, alpha in the highest).
+void CompressionOptions::reset()
+{
+	m.format = Format_DXT1;
+	m.quality = Quality_Normal;
+	// Same value as the default argument of setQuality(); without it the threshold
+	// would be read uninitialized by compressors that consult it.
+	m.errorThreshold = 0.5f;
+	m.colorWeight.set(1.0f, 1.0f, 1.0f);
+	m.useCuda = true;
+	m.bitcount = 32;
+	m.bmask = 0x000000FF;
+	m.gmask = 0x0000FF00;
+	m.rmask = 0x00FF0000;
+	m.amask = 0xFF000000;
+}
+
+
+/// Select the format the compressor should produce.
+void CompressionOptions::setFormat(Format format)
+{
+	Private & options = m;
+	options.format = format;
+}
+
+
+/// Set compression quality settings.
+void CompressionOptions::setQuality(Quality quality, float errorThreshold /*= 0.5f*/)
+{
+	m.quality = quality;
+	m.errorThreshold = errorThreshold;
+}
+
+
+/// Set the weights of each color channel.
+/// The weights are normalized so that they add up to one. Note that they are stored
+/// in blue, green, red order (x = blue, y = green, z = red).
+/// The choice for these values is subjective. In many cases uniform color weights
+/// (1.0, 1.0, 1.0) work very well. A popular choice is to use the NTSC luma encoding
+/// weights (0.2126, 0.7152, 0.0722), but I think that blue contributes to our
+/// perception more than 7%. A better choice in my opinion is (3, 4, 2). Ideally
+/// the compressor should use a non linear colour metric as described here:
+/// http://www.compuphase.com/cmetric.htm
+void CompressionOptions::setColorWeights(float red, float green, float blue)
+{
+	const float sum = red + green + blue;
+
+	const float blueShare = blue / sum;
+	const float greenShare = green / sum;
+
+	// Red takes whatever is left so that the three shares add up to one.
+	const float redShare = 1.0f - blueShare - greenShare;
+
+	m.colorWeight.set(blueShare, greenShare, redShare);
+}
+
+
+/// Turn hardware compression on or off; the flag is stored as the CUDA switch.
+void CompressionOptions::enableHardwareCompression(bool enable)
+{
+	Private & options = m;
+	options.useCuda = enable;
+}
+
+
+/// Describe the RGB/RGBA output format: bits per pixel plus one bit mask per channel.
+void CompressionOptions::setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask)
+{
+	Private & options = m;
+
+	// Channel masks.
+	options.amask = amask;
+	options.bmask = bmask;
+	options.gmask = gmask;
+	options.rmask = rmask;
+
+	// Total number of bits per pixel.
+	options.bitcount = bitcount;
+}
+
+/// Remember the name of the external compressor that should be used.
+void CompressionOptions::setExternalCompressor(const char * name)
+{
+	Private & options = m;
+	options.externalCompressor = name;
+}
--- a/src/nvimage/nvtt/CompressionOptions.h
+++ b/src/nvimage/nvtt/CompressionOptions.h
@ -0,0 +1,57 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_COMPRESSIONOPTIONS_H
+#define NV_TT_COMPRESSIONOPTIONS_H
+
+#include <nvcore/StrLib.h>
+#include <nvmath/Vector.h>
+#include "nvtt.h"
+
+namespace nvtt
+{
+
+	// Private data of CompressionOptions; filled in by CompressionOptions::reset() and the setters.
+	struct CompressionOptions::Private
+	{
+		Format format;				// output format, see setFormat().
+		
+		Quality quality;			// quality level, see setQuality().
+		float errorThreshold;		// set together with the quality by setQuality().
+		
+		nv::Vector3 colorWeight;	// channel weights, see setColorWeights().
+		
+		// Pixel format description, see setPixelFormat().
+		uint bitcount;				// bits per pixel.
+		uint rmask;					// bit mask of the red channel.
+		uint gmask;					// bit mask of the green channel.
+		uint bmask;					// bit mask of the blue channel.
+		uint amask;					// bit mask of the alpha channel.
+		
+		bool useCuda;				// set by enableHardwareCompression().
+
+		nv::String externalCompressor;	// name given to setExternalCompressor().
+	};
+
+} // nvtt namespace
+
+
+#endif // NV_TT_COMPRESSIONOPTIONS_H
--- a/src/nvimage/nvtt/FastCompressDXT.cpp
+++ b/src/nvimage/nvtt/FastCompressDXT.cpp
--- a/src/nvimage/nvtt/FastCompressDXT.h
+++ b/src/nvimage/nvtt/FastCompressDXT.h
@ -0,0 +1,81 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_FASTCOMPRESSDXT_H
+#define NV_TT_FASTCOMPRESSDXT_H
+
+#include <nvimage/nvimage.h>
+
+namespace nv
+{
+	struct ColorBlock;
+	struct BlockDXT1;
+	struct BlockDXT3;
+	struct BlockDXT5;
+	struct AlphaBlockDXT3;
+	struct AlphaBlockDXT5;
+
+	// Color compression:
+	// Each function below encodes the colors of one block into the given DXT1 block.
+
+	// Compressor that uses the extremes of the diameter axis.
+	// NOTE(review): this comment used to be a copy of the luminance axis one; wording inferred from the name -- confirm.
+	void compressBlock_DiameterAxis(const ColorBlock & rgba, BlockDXT1 * block);
+
+	// Compressor that uses the extremes of the luminance axis.
+	void compressBlock_LuminanceAxis(const ColorBlock & rgba, BlockDXT1 * block);
+
+	// Compressor that uses bounding box.
+	void compressBlock_BoundsRange(const ColorBlock & rgba, BlockDXT1 * block);
+
+	// Compressor that uses the best fit axis.
+	void compressBlock_BestFitAxis(const ColorBlock & rgba, BlockDXT1 * block);
+
+
+	// Simple, but slow compressor that tests all color pairs.
+	void compressBlock_TestAllPairs(const ColorBlock & rgba, BlockDXT1 * block);
+	
+	// Brute force 6d search along the best fit axis.
+	void compressBlock_AnalyzeBestFitAxis(const ColorBlock & rgba, BlockDXT1 * block);
+
+	// Spatial greedy search.
+	void refineSolution_1dSearch(const ColorBlock & rgba, BlockDXT1 * block);
+	void refineSolution_3dSearch(const ColorBlock & rgba, BlockDXT1 * block);
+	void refineSolution_6dSearch(const ColorBlock & rgba, BlockDXT1 * block);
+	
+	// Minimize error of the endpoints.
+	void optimizeEndPoints(const ColorBlock & rgba, BlockDXT1 * block);
+	
+	// Error of an encoded block with respect to the source colors.
+	uint blockError(const ColorBlock & rgba, const BlockDXT1 & block);
+	uint blockError(const ColorBlock & rgba, const AlphaBlockDXT5 & block);
+
+	// Alpha compression:
+	void compressBlock(const ColorBlock & rgba, AlphaBlockDXT3 * block);
+	void compressBlock_BoundsRange(const ColorBlock & rgba, BlockDXT3 * block);
+	void compressBlock_BoundsRange(const ColorBlock & rgba, BlockDXT5 * block);
+
+	// The DXT5 alpha compressors return a value that the callers treat as the block error.
+	uint compressBlock_BoundsRange(const ColorBlock & rgba, AlphaBlockDXT5 * block);
+	uint compressBlock_BruteForce(const ColorBlock & rgba, AlphaBlockDXT5 * block);
+	uint compressBlock_Iterative(const ColorBlock & rgba, AlphaBlockDXT5 * block);
+
+} // nv namespace
+
+#endif // NV_TT_FASTCOMPRESSDXT_H
--- a/src/nvimage/nvtt/InputOptions.cpp
+++ b/src/nvimage/nvtt/InputOptions.cpp
@ -0,0 +1,250 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h> // memcpy
+
+#include <nvcore/Memory.h>
+
+#include "nvtt.h"
+#include "InputOptions.h"
+
+using namespace nv;
+using namespace nvtt;
+
+namespace
+{
+
+	/// Number of images in a complete mipmap chain for a @a w x @a h x @a d texture,
+	/// including the top level. The chain ends once every dimension has reached 1,
+	/// so non-square textures such as 256x1 still get all of their levels.
+	static int countMipmaps(int w, int h, int d)
+	{
+		int mipmap = 0;
+		
+		// Stopping as soon as a single dimension reaches 1 would cut the chain short.
+		while (w > 1 || h > 1 || d > 1) {
+			// Halve each dimension, but never go below 1.
+			w = (w > 1) ? w / 2 : 1;
+			h = (h > 1) ? h / 2 : 1;
+			d = (d > 1) ? d / 2 : 1;
+			mipmap++;
+		}
+		
+		return mipmap + 1;
+	}
+
+} // namespace
+
+
+/// Constructor. Allocates the private data and fills it with the default values.
+InputOptions::InputOptions() : m(*new Private())
+{ 
+	this->reset();
+}
+
+// Destructor: drops the image array first, then the private data itself.
+InputOptions::~InputOptions()
+{
+	Private * const priv = &m;
+
+	resetTextureLayout();
+
+	delete priv;
+}
+
+
+// Restore the default input options. The texture layout (images and counts) is not touched.
+void InputOptions::reset()
+{
+	Private & options = m;
+
+	// Input description.
+	options.inputFormat = InputFormat_BGRA_8UB;
+	options.textureType = TextureType_2D;
+	options.wrapMode = WrapMode_Repeat;
+	options.alphaTransparency = true;
+
+	// Quantization is off by default.
+	options.enableColorDithering = false;
+	options.enableAlphaDithering = false;
+	options.binaryAlpha = false;
+	options.alphaThreshold = 127;
+
+	// Gamma.
+	options.inputGamma = 2.2f;
+	options.outputGamma = 2.2f;
+
+	// Mipmap generation.
+	options.generateMipmaps = false;
+	options.maxLevel = -1;
+	options.mipmapFilter = MipmapFilter_Box;
+
+	// Normal map processing.
+	options.normalizeMipmaps = false;
+	options.convertToNormalMap = false;
+	options.heightFactors.set(0.0f, 0.0f, 0.0f, 1.0f);
+	options.bumpFrequencyScale = Vector4(1.0f, 0.5f, 0.25f, 0.125f) / (1.0f + 0.5f + 0.25f + 0.125f);
+}
+
+
+// Setup the input image.
+// Discards any previous layout and allocates one image slot per face and mipmap
+// level (6 faces for cube maps, 1 otherwise). Each slot records its expected
+// dimensions, level and face; the pixel data is supplied later through setMipmapData().
+// Dimensions of 0 are treated as 1; negative dimensions are rejected by nvCheck.
+void InputOptions::setTextureLayout(TextureType type, int w, int h, int d /*= 1*/)
+{
+	// Validate arguments.
+	nvCheck(w >= 0);
+	nvCheck(h >= 0);
+	nvCheck(d >= 0);
+
+	// Correct arguments.
+	if (w == 0) w = 1;
+	if (h == 0) h = 1;
+	if (d == 0) d = 1;
+
+	// Delete previous images.
+	resetTextureLayout();
+	
+	m.textureType = type;
+	
+	// Allocate images.
+	m.mipmapCount = countMipmaps(w, h, d);
+	m.faceCount = (type == TextureType_Cube) ? 6 : 1;
+	m.imageCount = m.mipmapCount * m.faceCount;
+	
+	m.images = new Private::Image[m.imageCount];
+	
+	for(int f = 0; f < m.faceCount; f++)
+	{
+		// Every face starts again from the full size level. Halving the arguments
+		// themselves would leave all faces after the first with 1x1x1 levels.
+		int mipWidth = w;
+		int mipHeight = h;
+		int mipDepth = d;
+
+		for (int mipLevel = 0; mipLevel < m.mipmapCount; mipLevel++)
+		{
+			Private::Image & img = m.images[f * m.mipmapCount + mipLevel];
+			img.width = mipWidth;
+			img.height = mipHeight;
+			img.depth = mipDepth;
+			img.mipLevel = mipLevel;
+			img.face = f;
+			
+			img.data = NULL;
+			
+			mipWidth = max(1, mipWidth / 2);
+			mipHeight = max(1, mipHeight / 2);
+			mipDepth = max(1, mipDepth / 2);
+		}
+	}
+}
+
+
+// Discard the current texture layout, if any: the image array is deleted and the
+// face, mipmap and image counts go back to zero.
+void InputOptions::resetTextureLayout()
+{
+	if (m.images == NULL)
+	{
+		// No layout has been set up; nothing to release.
+		return;
+	}
+
+	// Delete image array.
+	delete [] m.images;
+	m.images = NULL;
+
+	m.imageCount = 0;
+	m.mipmapCount = 0;
+	m.faceCount = 0;
+}
+
+
+// Copies the data to our internal structures.
+// The given dimensions, face and level must match a slot created by
+// setTextureLayout(); @a data must hold width * height pixels of 4 bytes each.
+// Returns false, without copying anything, when no layout has been set, when
+// @a data is NULL, or when the index or the dimensions do not match the layout.
+bool InputOptions::setMipmapData(const void * data, int width, int height, int depth /*= 1*/, int face /*= 0*/, int mipLevel /*= 0*/)
+{
+	nvCheck(depth == 1);
+	
+	// Nothing to fill in before setTextureLayout(), and nothing to copy from.
+	if (m.images == NULL || data == NULL)
+	{
+		return false;
+	}
+	
+	// Reject indices outside the layout before they are used to address the array.
+	if (face < 0 || face >= m.faceCount || mipLevel < 0 || mipLevel >= m.mipmapCount)
+	{
+		return false;
+	}
+	
+	const int idx = face * m.mipmapCount + mipLevel;
+	Private::Image & img = m.images[idx];
+	
+	if (img.width != width || img.height != height || img.depth != depth || img.mipLevel != mipLevel || img.face != face)
+	{
+		// Invalid dimension or index.
+		return false;
+	}
+	
+	img.data = new nv::Image();
+	img.data->allocate(width, height);
+	memcpy(img.data->pixels(), data, width * height * 4); 
+	
+	return true;
+}
+
+
+/// Describe the input: its pixel format and whether alpha is used for transparency.
+void InputOptions::setFormat(InputFormat format, bool alphaTransparency)
+{
+	Private & options = m;
+	options.alphaTransparency = alphaTransparency;
+	options.inputFormat = format;
+}
+
+
+/// Store the gamma of the input data and the gamma desired for the output.
+void InputOptions::setGamma(float inputGamma, float outputGamma)
+{
+	Private & options = m;
+	options.outputGamma = outputGamma;
+	options.inputGamma = inputGamma;
+}
+
+
+/// Set texture wrapping mode.
+void InputOptions::setWrapMode(WrapMode mode)
+{
+	Private & options = m;
+	options.wrapMode = mode;
+}
+
+
+/// Configure mipmap generation: whether to generate mipmaps at all, the filter to
+/// use and the maximum level.
+void InputOptions::setMipmapping(bool generateMipmaps, MipmapFilter filter/*= MipmapFilter_Kaiser*/, int maxLevel/*= -1*/)
+{
+	Private & options = m;
+	options.maxLevel = maxLevel;
+	options.mipmapFilter = filter;
+	options.generateMipmaps = generateMipmaps;
+}
+
+
+/// Set quantization options.
+/// @warning Do not enable dithering unless you know what you are doing. Quantization
+/// introduces errors. It's better to let the compressor quantize the result to
+/// minimize the error, instead of quantizing the data before handing it to
+/// the compressor.
+void InputOptions::setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold/*= 127*/)
+{
+	Private & options = m;
+
+	// Dithering switches.
+	options.enableAlphaDithering = alphaDithering;
+	options.enableColorDithering = colorDithering;
+
+	// Binary alpha and its reference value.
+	options.alphaThreshold = alphaThreshold;
+	options.binaryAlpha = binaryAlpha;
+}
+
+
+/// Turn the conversion of the input to a normal map on or off.
+void InputOptions::setConvertToNormalMap(bool convert)
+{
+	Private & options = m;
+	options.convertToNormalMap = convert;
+}
+
+/// Set the per-channel factors used for height evaluation.
+/// The factors are stored exactly as given; they are not normalized.
+void InputOptions::setHeightEvaluation(float redScale, float greenScale, float blueScale, float alphaScale)
+{
+	const Vector4 factors(redScale, greenScale, blueScale, alphaScale);
+	m.heightFactors = factors;
+}
+
+/// Set the normal map conversion filter.
+/// The four weights are divided by their sum before they are stored.
+void InputOptions::setNormalFilter(float small, float medium, float big, float large)
+{
+	const float sum = small + medium + big + large;
+	const Vector4 weights(small, medium, big, large);
+
+	m.bumpFrequencyScale = weights / sum;
+}
+
+/// Turn the normalization of mipmaps on or off.
+void InputOptions::setNormalizeMipmaps(bool normalize)
+{
+	Private & options = m;
+	options.normalizeMipmaps = normalize;
+}
--- a/src/nvimage/nvtt/InputOptions.h
+++ b/src/nvimage/nvtt/InputOptions.h
@ -0,0 +1,91 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_INPUTOPTIONS_H
+#define NV_TT_INPUTOPTIONS_H
+
+#include <nvmath/Vector.h>
+#include <nvimage/Image.h>
+#include "nvtt.h"
+
+namespace nvtt
+{
+
+	// Private state behind InputOptions. The setters in InputOptions.cpp write
+	// these fields directly.
+	struct InputOptions::Private
+	{
+		// Only `images` is initialized here; every other field keeps an
+		// indeterminate value until some other code assigns it.
+		Private() : images(NULL) {}
+
+		WrapMode wrapMode;
+		TextureType textureType;
+		InputFormat inputFormat;
+		
+		int faceCount;
+		int mipmapCount;
+		int imageCount;
+		
+		// Defined after this struct; see InputOptions::Private::Image.
+		struct Image;
+		Image * images;
+
+		// Quantization.
+		bool enableColorDithering;
+		bool enableAlphaDithering;
+		bool binaryAlpha;
+		int alphaThreshold;			// reference value used for binary alpha quantization.
+
+		bool alphaTransparency;	// set to true if alpha is used for transparency.
+		
+		// Gamma conversion.
+		float inputGamma;
+		float outputGamma;
+		
+		// Mipmap generation options.
+		bool generateMipmaps;
+		int maxLevel;
+		MipmapFilter mipmapFilter;
+		
+		// Normal map options.
+		bool normalizeMipmaps;
+		bool convertToNormalMap;
+		nv::Vector4 heightFactors;
+		nv::Vector4 bumpFrequencyScale;
+	};
+
+	// Internal image structure.
+	// Describes one input surface (mip level and face, plus its extents) and owns
+	// the pixel data attached to it: `data` is deleted by the destructor.
+	// NOTE(review): the type has no copy control, so copying an Image would make
+	// two objects delete the same `data` pointer -- avoid copies.
+	struct InputOptions::Private::Image
+	{
+		// Start from a well-defined empty state. `data` must be NULL here because
+		// the destructor deletes it unconditionally.
+		Image() : mipLevel(0), face(0), width(0), height(0), depth(0), data(NULL) {}
+		~Image() { delete data; }
+		
+		int mipLevel;
+		int face;
+		
+		int width;
+		int height;
+		int depth;
+		
+		nv::Image * data;	// owned; may be NULL.
+	};
+
+} // nvtt namespace
+
+#endif // NV_TT_INPUTOPTIONS_H
--- a/src/nvimage/nvtt/OutputOptions.cpp
+++ b/src/nvimage/nvtt/OutputOptions.cpp
@ -0,0 +1,32 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include "nvtt.h"
+
+using namespace nvtt;
+
+/// Set default output options.
+/// Currently a placeholder: the body assigns nothing.
+void OutputOptions::reset()
+{
+	// endianness = native...
+}
--- a/src/nvimage/nvtt/cmdline.h
+++ b/src/nvimage/nvtt/cmdline.h
@ -0,0 +1,44 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+
+#ifndef CMDLINE_H
+#define CMDLINE_H
+
+#include <nvcore/Debug.h>
+
+#include <stdarg.h>
+
+// Message handler that forwards debug messages to stderr.
+// It installs itself with nv::debug on construction and resets the handler on
+// destruction, so its lifetime brackets the period during which it is active.
+struct MyMessageHandler : public nv::MessageHandler {
+	MyMessageHandler() {
+		nv::debug::setMessageHandler( this );
+	}
+	~MyMessageHandler() {
+		nv::debug::resetMessageHandler();
+	}
+
+	// Print a printf-style message to stderr.
+	// The argument list is consumed through a private copy so that the caller's
+	// va_list is left untouched.
+	virtual void log( const char * str, va_list arg ) {
+		va_list val;
+		va_copy(val, arg);
+		vfprintf(stderr, str, val);
+		va_end(val);
+	}
+};
+
+
+// Assert handler for the command line tools.
+// It installs itself with nv::debug on construction and resets the handler on
+// destruction.
+struct MyAssertHandler : public nv::AssertHandler {
+	MyAssertHandler() {
+		nv::debug::setAssertHandler( this );
+	}
+	~MyAssertHandler() {
+		nv::debug::resetAssertHandler();
+	}
+	
+	// Handler method, note that func might be NULL!
+	// Prints the failed expression with its file and line to stderr, dumps the
+	// debug info and terminates the process with exit code 1. `func` is not
+	// printed, and no value is returned because exit() ends the process first.
+	virtual int assert( const char *exp, const char *file, int line, const char *func ) {
+		fprintf(stderr, "Assertion failed: %s\nIn %s:%d\n", exp, file, line);
+		nv::debug::dumpInfo();
+		exit(1);
+	}
+};
+
+
+#endif // CMDLINE_H
--- a/src/nvimage/nvtt/compress.cpp
+++ b/src/nvimage/nvtt/compress.cpp
@ -0,0 +1,354 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/StrLib.h>
+#include <nvcore/StdStream.h>
+
+#include <nvimage/Image.h>
+#include <nvimage/nvtt/nvtt.h>
+
+#include "cmdline.h"
+
+#include <time.h> // clock
+
+// Output handler that writes the compressed data to a file and prints a
+// percentage progress indicator on stdout.
+// The handler owns `stream` and deletes it on destruction.
+struct MyOutputHandler : public nvtt::OutputHandler
+{
+	MyOutputHandler() : total(0), progress(0), percentage(0), stream(NULL) {}
+	MyOutputHandler(const char * name) : total(0), progress(0), percentage(0), stream(new nv::StdOutputStream(name)) {}
+	virtual ~MyOutputHandler() { delete stream; }
+	
+	// Open `name` for writing and restart the progress counters.
+	// Any stream that was already open is released first.
+	// Returns false (after printing a message) when the stream reports an error.
+	bool open(const char * name)
+	{
+		delete stream;
+		stream = new nv::StdOutputStream(name);
+		percentage = progress = 0;
+		if (stream->isError()) {
+			printf("Error opening '%s' for writting\n", name);
+			return false;
+		}
+		return true;
+	}
+	
+	// Set the expected total number of bytes, used only for the progress display.
+	virtual void setTotal(int t)
+	{
+		total = t;
+	}
+
+	virtual void mipmap(int size, int width, int height, int depth, int face, int miplevel)
+	{
+		// ignore.
+	}
+	
+	// Output data.
+	// Writes `size` bytes to the stream and refreshes the percentage display when
+	// the integer percentage changes.
+	virtual void writeData(const void * data, int size)
+	{
+		nvDebugCheck(stream != NULL);
+		stream->serialize(const_cast<void *>(data), size);
+
+		progress += size;
+
+		// Without a positive total there is nothing meaningful to display, and
+		// dividing by it would be an error.
+		if (total <= 0)
+		{
+			return;
+		}
+
+		// Widen before multiplying: 100 * progress does not fit in an int once
+		// more than about 21 MB have been written.
+		const int p = static_cast<int>((100 * static_cast<long long>(progress)) / total);
+		if (p != percentage)
+		{
+			percentage = p;
+			printf("\r%d%%", percentage);
+			fflush(stdout);
+		}
+	}
+	
+	int total;			// expected output size in bytes; 0 until setTotal is called.
+	int progress;		// bytes written so far.
+	int percentage;		// last percentage printed.
+	nv::StdOutputStream * stream;	// owned; NULL until a file is opened.
+};
+
+// Error handler for the command line tool: any reported error triggers a
+// debugger break through nvDebugBreak(). The error code itself is not inspected.
+struct MyErrorHandler : public nvtt::ErrorHandler
+{
+	virtual void error(nvtt::Error e)
+	{
+		nvDebugBreak();
+	}
+};
+
+
+
+
+// Configure the input options for converting a color image into a normal map:
+// conversion is enabled, height is the unweighted average of red, green and
+// blue (alpha is ignored), gamma is left at 1.0 and mipmaps are normalized.
+void setColorToNormalMap(nvtt::InputOptions & inputOptions)
+{
+	const float third = 1.0f/3.0f;
+
+	inputOptions.setGamma(1.0f, 1.0f);
+	inputOptions.setNormalizeMipmaps(true);
+	inputOptions.setHeightEvaluation(third, third, third, 0.0f);
+	// Alternative filters that were tried and left disabled:
+	//inputOptions.setNormalFilter(1.0f, 0, 0, 0);
+	//inputOptions.setNormalFilter(0.0f, 0, 0, 1);
+	inputOptions.setConvertToNormalMap(true);
+}
+
+// Configure the input options for an image that already is a normal map:
+// no conversion, gamma left at 1.0, and mipmaps normalized.
+void setNormalMap(nvtt::InputOptions & inputOptions)
+{
+	inputOptions.setGamma(1.0f, 1.0f);
+	inputOptions.setNormalizeMipmaps(true);
+	inputOptions.setConvertToNormalMap(false);
+}
+
+// Configure the input options for a plain color image: no normal map
+// conversion, input and output gamma of 2.2, and no mipmap normalization.
+void setColorMap(nvtt::InputOptions & inputOptions)
+{
+	const float gamma = 2.2f;
+
+	inputOptions.setGamma(gamma, gamma);
+	inputOptions.setNormalizeMipmaps(false);
+	inputOptions.setConvertToNormalMap(false);
+}
+
+
+
+// Entry point of the nvcompress tool.
+// Parses the command line, loads the input image, configures the input and
+// compression options and runs the compressor, writing the result to the
+// output file. Returns 0 on success and 1 on usage or I/O errors.
+int main(int argc, char *argv[])
+{
+	MyAssertHandler assertHandler;
+	MyMessageHandler messageHandler;
+
+	// Command line switches that select the output format.
+	struct FormatFlag
+	{
+		const char * name;
+		nvtt::Format format;
+	};
+	static const FormatFlag formatFlags[] =
+	{
+		{ "-rgb",  nvtt::Format_RGB  },
+		{ "-bc1",  nvtt::Format_BC1  },
+		{ "-bc2",  nvtt::Format_BC2  },
+		{ "-bc3",  nvtt::Format_BC3  },
+		{ "-bc3n", nvtt::Format_BC3n },
+		{ "-bc4",  nvtt::Format_BC4  },
+		{ "-bc5",  nvtt::Format_BC5  },
+	};
+	const int formatFlagCount = int(sizeof(formatFlags) / sizeof(formatFlags[0]));
+
+	// Defaults: color map, clamp wrapping, mipmaps on, normal quality, BC1.
+	bool normal = false;
+	bool color2normal = false;
+	bool wrapRepeat = false;
+	bool noMipmaps = false;
+	bool fast = false;
+	bool nocuda = false;
+	nvtt::Format format = nvtt::Format_BC1;
+
+	const char * externalCompressor = NULL;
+
+	nv::Path input;
+	nv::Path output;
+
+	// Parse arguments.
+	for (int i = 1; i < argc; i++)
+	{
+		const char * arg = argv[i];
+
+		// The first argument that is not a switch is the input file; an optional
+		// output file may follow it. Parsing stops there.
+		if (arg[0] != '-')
+		{
+			input = argv[i];
+
+			const bool hasOutput = (i+1 < argc) && (argv[i+1][0] != '-');
+			if (hasOutput)
+			{
+				output = argv[i+1];
+			}
+			else
+			{
+				// Derive the output name from the input: same path, ".dds" extension.
+				output.copy(input.str());
+				output.stripExtension();
+				output.append(".dds");
+			}
+
+			break;
+		}
+
+		// Input options. "-color" and "-clamp" select the defaults and therefore
+		// need no handling; like unknown switches they fall through unchanged.
+		if (strcmp("-normal", arg) == 0)
+		{
+			normal = true;
+		}
+		else if (strcmp("-tonormal", arg) == 0)
+		{
+			color2normal = true;
+		}
+		else if (strcmp("-repeat", arg) == 0)
+		{
+			wrapRepeat = true;
+		}
+		else if (strcmp("-nomips", arg) == 0)
+		{
+			noMipmaps = true;
+		}
+		// Compression options.
+		else if (strcmp("-fast", arg) == 0)
+		{
+			fast = true;
+		}
+		else if (strcmp("-nocuda", arg) == 0)
+		{
+			nocuda = true;
+		}
+		// Undocumented option. Mainly used for testing.
+		else if (strcmp("-ext", arg) == 0)
+		{
+			if (i+1 < argc && argv[i+1][0] != '-')
+			{
+				externalCompressor = argv[i+1];
+				printf("using %s\n", argv[i+1]);
+				i++;
+			}
+		}
+		else
+		{
+			// Format switches.
+			for (int f = 0; f < formatFlagCount; f++)
+			{
+				if (strcmp(formatFlags[f].name, arg) == 0)
+				{
+					format = formatFlags[f].format;
+					break;
+				}
+			}
+		}
+	}
+
+	// Without an input file there is nothing to do: print the usage text.
+	if (input.empty())
+	{
+		printf("NVIDIA Texture Tools - Copyright NVIDIA Corporation 2007\n\n");
+
+		printf("usage: nvcompress [options] infile [outfile]\n\n");
+
+		printf("Input options:\n");
+		printf("  -color   \tThe input image is a color map (default).\n");
+		printf("  -normal  \tThe input image is a normal map.\n");
+		printf("  -tonormal\tConvert input to normal map.\n");
+		printf("  -clamp   \tClamp wrapping mode (default).\n");
+		printf("  -repeat  \tRepeat wrapping mode.\n");
+		printf("  -nomips  \tDisable mipmap generation.\n\n");
+
+		printf("Compression options:\n");
+		printf("  -fast    \tFast compression.\n");
+		printf("  -nocuda  \tDo not use cuda compressor.\n");
+		printf("  -rgb     \tRGBA format\n");
+		printf("  -bc1     \tBC1 format (DXT1)\n");
+		printf("  -bc2     \tBC2 format (DXT3)\n");
+		printf("  -bc3     \tBC3 format (DXT5)\n");
+		printf("  -bc3n    \tBC3 normal map format (DXT5n/RXGB)\n");
+		printf("  -bc4     \tBC4 format (ATI1)\n");
+		printf("  -bc5     \tBC5 format (3Dc/ATI2)\n\n");
+
+		return 1;
+	}
+
+	// Load the source image.
+	nv::Image image;
+	if (!image.load(input))
+	{
+		printf("The file '%s' is not a supported image type.\n", input.str());
+		return 1;
+	}
+
+	// Open the destination file.
+	MyErrorHandler errorHandler;
+	MyOutputHandler outputHandler(output);
+	if (outputHandler.stream->isError())
+	{
+		printf("Error opening '%s' for writting\n", output.str());
+		return 1;
+	}
+
+	// Set input options.
+	nvtt::InputOptions inputOptions;
+	inputOptions.setTextureLayout(nvtt::TextureType_2D, image.width(), image.height());
+	inputOptions.setMipmapData(image.pixels(), image.width(), image.height());
+
+	// Fast mode trades the Kaiser mipmap filter for a box filter.
+	inputOptions.setMipmapping(true, fast ? nvtt::MipmapFilter_Box : nvtt::MipmapFilter_Kaiser);
+	inputOptions.setWrapMode(wrapRepeat ? nvtt::WrapMode_Repeat : nvtt::WrapMode_Clamp);
+
+	// "-normal" takes precedence over "-tonormal"; a color map is the default.
+	if (normal)
+	{
+		setNormalMap(inputOptions);
+	}
+	else if (color2normal)
+	{
+		setColorToNormalMap(inputOptions);
+	}
+	else
+	{
+		setColorMap(inputOptions);
+	}
+
+	if (noMipmaps)
+	{
+		inputOptions.setMipmapping(false);
+	}
+
+	// Set compression options.
+	nvtt::CompressionOptions compressionOptions;
+	compressionOptions.setFormat(format);
+	// Other quality settings that were tried:
+	//compressionOptions.setQuality(nvtt::Quality_Production, 0.5f);
+	//compressionOptions.setQuality(nvtt::Quality_Highest);
+	compressionOptions.setQuality(fast ? nvtt::Quality_Fastest : nvtt::Quality_Normal);
+	compressionOptions.enableHardwareCompression(!nocuda);
+	compressionOptions.setColorWeights(1, 1, 1);
+
+	if (externalCompressor != NULL)
+	{
+		compressionOptions.setExternalCompressor(externalCompressor);
+	}
+
+	// The estimated size drives the progress display of the output handler.
+	outputHandler.setTotal(nvtt::estimateSize(inputOptions, compressionOptions));
+
+	nvtt::OutputOptions outputOptions(&outputHandler, &errorHandler);
+
+	// Compress and report the elapsed processor time.
+	clock_t start = clock();
+
+	nvtt::compress(inputOptions, outputOptions, compressionOptions);
+
+	clock_t end = clock();
+	printf("\rtime taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
+
+	return 0;
+}
+
--- a/src/nvimage/nvtt/cuda/CompressKernel.cu
+++ b/src/nvimage/nvtt/cuda/CompressKernel.cu
@ -0,0 +1,481 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include "CudaMath.h"
+
+#define THREAD_NUM 64		// Number of threads per block.
+
+// __debugsync() is a block-wide barrier in device emulation builds and expands
+// to nothing otherwise.
+#if __DEVICE_EMULATION__
+#define __debugsync() __syncthreads()
+#else
+#define __debugsync()
+#endif
+
+// Short aliases for the unsigned integer types used throughout this file.
+typedef unsigned short ushort;
+typedef unsigned int uint;
+
+// Exchange the contents of two objects through a temporary copy.
+template <class T> 
+__device__ inline void swap(T & a, T & b)
+{
+	T saved = a;
+	a = b;
+	b = saved;
+}
+
+// Per-channel weights applied to the squared error of each candidate
+// (see the dot products in evalPermutation3/4). Defaults to equal weights and
+// is overwritten by compressKernel before each launch.
+__constant__ float3 kColorMetric = { 1.0f, 1.0f, 1.0f };
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Round color to RGB565 and expand
+//
+// Each channel is clamped to [0,1] and rounded to its 5, 6 or 5 bit integer
+// level. The packed 16 bit color is stored in *w, and the quantized color is
+// returned rescaled towards the [0,1] range.
+////////////////////////////////////////////////////////////////////////////////
+inline __device__ float3 roundAndExpand(float3 v, ushort * w)
+{
+    // Integer quantization levels, still held as floats.
+    const float r = rintf(__saturatef(v.x) * 31.0f);
+    const float g = rintf(__saturatef(v.y) * 63.0f);
+    const float b = rintf(__saturatef(v.z) * 31.0f);
+
+    // Pack as 5:6:5 with x in the top bits.
+    *w = ((ushort)r << 11) | ((ushort)g << 5) | (ushort)b;
+
+    // approximate integer bit expansion.
+    return make_float3(r * 0.03227752766457f,
+                       g * 0.01583151765563f,
+                       b * 0.03227752766457f);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Evaluate permutations
+////////////////////////////////////////////////////////////////////////////////
+
+// Four-level variant. `permutation` holds sixteen 2-bit codes, one per color;
+// each code selects the interpolation weight beta towards the second endpoint:
+// 0 -> 0, 1 -> 1, 2 -> 1/3, 3 -> 2/3. The endpoints that fit the colors best in
+// the least-squares sense are computed, rounded to 5:6:5 (written to *start and
+// *end), and the weighted error of the rounded endpoints is returned. The
+// returned value omits the term that depends only on the colors, which is the
+// same for every permutation.
+static __device__ float evalPermutation4(const float3 * colors, uint permutation, ushort * start, ushort * end)
+{
+    // Least-squares accumulators.
+    float sumAlpha2 = 0.0f;
+    float sumBeta2 = 0.0f;
+    float sumAlphaBeta = 0.0f;
+    float3 sumAlphaX = make_float3(0.0f, 0.0f, 0.0f);
+    float3 sumBetaX = make_float3(0.0f, 0.0f, 0.0f);
+
+    for (int texel = 0; texel < 16; texel++)
+    {
+        // Only the two lowest bits of `code` are inspected.
+        const uint code = permutation >> (2*texel);
+
+        float beta = (code & 1);
+        if (code & 2) beta = (1 + beta) / 3.0f;
+        const float alpha = 1.0f - beta;
+
+        sumAlpha2 += alpha * alpha;
+        sumBeta2 += beta * beta;
+        sumAlphaBeta += alpha * beta;
+        sumAlphaX += alpha * colors[texel];
+        sumBetaX += beta * colors[texel];
+    }
+
+    // The sums and the inverse determinant could be precomputed for each
+    // permutation, but it's faster to recompute them.
+    const float invDet = 1.0f / (sumAlpha2 * sumBeta2 - sumAlphaBeta * sumAlphaBeta);
+
+    float3 a = (sumAlphaX * sumBeta2 - sumBetaX * sumAlphaBeta) * invDet;
+    float3 b = (sumBetaX * sumAlpha2 - sumAlphaX * sumAlphaBeta) * invDet;
+
+    // Snap both endpoints to the closest 5-6-5 color and expand them again.
+    a = roundAndExpand(a, start);
+    b = roundAndExpand(b, end);
+
+    // Per-channel error of the rounded endpoints.
+    const float3 e = a * a * sumAlpha2 + b * b * sumBeta2 + 2.0f * (a * b * sumAlphaBeta - a * sumAlphaX - b * sumBetaX);
+
+    return dot(e, kColorMetric);
+}
+
+
+// Three-level variant of the permutation evaluation. `permutation` holds
+// sixteen 2-bit codes, one per color; each code selects the interpolation
+// weight beta towards the second endpoint: 0 -> 0, 1 -> 1, 2 or 3 -> 1/2.
+// The endpoints that fit the colors best in the least-squares sense are
+// computed, rounded to 5:6:5 (written to *start and *end), and the weighted
+// error of the rounded endpoints is returned. The returned value omits the term
+// that depends only on the colors, which is the same for every permutation.
+static __device__ float evalPermutation3(const float3 * colors, uint permutation, ushort * start, ushort * end)
+{
+    // Least-squares accumulators.
+    float sumAlpha2 = 0.0f;
+    float sumBeta2 = 0.0f;
+    float sumAlphaBeta = 0.0f;
+    float3 sumAlphaX = make_float3(0.0f, 0.0f, 0.0f);
+    float3 sumBetaX = make_float3(0.0f, 0.0f, 0.0f);
+
+    for (int texel = 0; texel < 16; texel++)
+    {
+        // Only the two lowest bits of `code` are inspected.
+        const uint code = permutation >> (2*texel);
+
+        float beta = (code & 1);
+        if (code & 2) beta = 0.5f;
+        const float alpha = 1.0f - beta;
+
+        sumAlpha2 += alpha * alpha;
+        sumBeta2 += beta * beta;
+        sumAlphaBeta += alpha * beta;
+        sumAlphaX += alpha * colors[texel];
+        sumBetaX += beta * colors[texel];
+    }
+
+    const float invDet = 1.0f / (sumAlpha2 * sumBeta2 - sumAlphaBeta * sumAlphaBeta);
+
+    float3 a = (sumAlphaX * sumBeta2 - sumBetaX * sumAlphaBeta) * invDet;
+    float3 b = (sumBetaX * sumAlpha2 - sumAlphaX * sumAlphaBeta) * invDet;
+
+    // Snap both endpoints to the closest 5-6-5 color and expand them again.
+    a = roundAndExpand(a, start);
+    b = roundAndExpand(b, end);
+
+    // Per-channel error of the rounded endpoints.
+    const float3 e = a * a * sumAlpha2 + b * b * sumBeta2 + 2.0f * (a * b * sumAlphaBeta - a * sumAlphaX - b * sumBetaX);
+
+    return dot(e, kColorMetric);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Sort colors
+//
+// Orders the 16 entries of `colors` by their key in `values` and records in
+// `xrefs` which original slot each sorted entry came from.
+// In emulation builds a single thread runs an insertion sort, which also sorts
+// `values` itself. In device builds each calling thread takes part in a bitonic
+// sorting network with threadIdx.x as its slot; `values` is left untouched and
+// keys are looked up through `xrefs`.
+// NOTE(review): the device path has no barriers between steps, so it appears to
+// rely on the 16 participating threads executing in lockstep -- confirm.
+////////////////////////////////////////////////////////////////////////////////
+__device__ void sortColors(float * values, float3 * colors, int * xrefs)
+{
+#if __DEVICE_EMULATION__
+
+    if (threadIdx.x == 0) 
+    {
+        // Start from the identity mapping.
+        for( int i = 0; i < 16; ++i )
+        {
+			xrefs[i] = i;
+		}
+        
+        // Use a sequential sort on emulation.
+        for( int i = 0; i < 16; ++i )
+        {
+            for( int j = i; j > 0 && values[j] < values[j - 1]; --j )
+            {
+                swap( values[j], values[j - 1] );
+                swap( xrefs[j], xrefs[j - 1] );
+            //    swap( colors[j], colors[j - 1] );
+            }
+        }
+        
+        // Reorder the colors in one pass from a snapshot, using the final mapping.
+        float3 tmp[16];
+        for( int i = 0; i < 16; ++i ) 
+        {
+			tmp[i] = colors[i];
+		}
+        
+        for( int i = 0; i < 16; ++i )
+        {
+            int xid = xrefs[i];
+            colors[i] = tmp[xid];
+        }
+    }
+
+#else
+    int tid = threadIdx.x;
+
+	xrefs[tid] = tid;
+
+    // Parallel bitonic sort.
+    for (int k = 2; k <= 16; k *= 2)
+    {
+        // bitonic merge:
+        for (int j = k / 2; j>0; j /= 2)
+        {
+            int ixj = tid ^ j;
+            
+            // Only the lower-numbered thread of each pair does the compare-exchange.
+            if (ixj > tid) {
+                // @@ Optimize these branches.
+                if ((tid & k) == 0) {
+                    // Ascending half: the smaller key ends up in the lower slot.
+                    if (values[xrefs[tid]] > values[xrefs[ixj]]) {
+                    //    swap(values[tid], values[ixj]);
+                        swap(colors[tid], colors[ixj]);
+                        swap(xrefs[tid], xrefs[ixj]);
+                    }
+                }
+                else {
+                    // Descending half: the larger key ends up in the lower slot.
+                    if (values[xrefs[tid]] < values[xrefs[ixj]]) {
+                    //    swap(values[tid], values[ixj]);
+                        swap(colors[tid], colors[ixj]);
+                        swap(xrefs[tid], xrefs[ixj]);
+                    }
+                }
+            }
+        }
+    }
+#endif
+
+    // It would be faster to avoid color swaps during the sort, but there
+    // are compiler bugs preventing that.
+#if 0
+	float3 tmp = colors[xrefs[tid]];
+    colors[tid] = tmp;
+#endif
+}
+
+// Rank sort: each thread counts how many of the 16 values are strictly smaller
+// than its own and moves its color to that position.
+// This sort is faster, but does not sort correctly elements with the same value
+// (equal values get the same rank and land in the same slot).
+__device__ void sortColors2(float * values, float3 * colors, int * cmp)
+{
+	const int tid = threadIdx.x;
+
+	// Rank of this thread's value among all 16 values.
+	int rank = 0;
+	for (int i = 0; i < 16; i++)
+	{
+		rank += (values[i] < values[tid]);
+	}
+	cmp[tid] = rank;
+
+	const float3 color = colors[tid];
+	colors[rank] = color;
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Find index with minimum error
+//
+// Parallel reduction over the THREAD_NUM entries of `errors`. On return
+// errors[0] holds the smallest error and indices[0] the matching entry of
+// `indices`. Must be called by every thread of the block, because it contains
+// block-wide barriers.
+////////////////////////////////////////////////////////////////////////////////
+__device__ void minimizeError(float * errors, int * indices)
+{
+	const int idx = threadIdx.x;
+
+#if __DEVICE_EMULATION__
+
+	// Plain tree reduction with a barrier before every step.
+	for(int d = THREAD_NUM/2; d > 0; d >>= 1)
+	{
+		__syncthreads();
+
+		if (idx < d)
+		{
+			float err0 = errors[idx];
+			float err1 = errors[idx + d];
+			
+			if (err1 < err0) {
+				errors[idx] = err1;
+				indices[idx] = indices[idx + d];
+			}
+		}
+	}
+
+#else
+
+	// Tree reduction down to 64 live entries. With THREAD_NUM == 64 this loop
+	// does not execute at all.
+	for(int d = THREAD_NUM/2; d > 32; d >>= 1)
+	{
+		__syncthreads();
+
+		if (idx < d)
+		{
+			float err0 = errors[idx];
+			float err1 = errors[idx + d];
+			
+			if (err1 < err0) {
+				errors[idx] = err1;
+				indices[idx] = indices[idx + d];
+			}
+		}
+	}
+
+	// The first step below reads entries written by threads 32..63, so make sure
+	// those writes (and any from the loop above) are complete and visible.
+	__syncthreads();
+
+	// unroll last 6 steps
+	// Only threads 0..31 take part: idx + 32 must stay inside the THREAD_NUM
+	// entries, and thread 32 must not modify an entry that thread 0 is reading.
+	// NOTE(review): the steps below have no barriers between them and appear to
+	// rely on these 32 threads executing in lockstep -- confirm for the target
+	// hardware.
+	if (idx < 32)
+	{
+		if (errors[idx + 32] < errors[idx]) {
+			errors[idx] = errors[idx + 32];
+			indices[idx] = indices[idx + 32];
+		}
+		if (errors[idx + 16] < errors[idx]) {
+			errors[idx] = errors[idx + 16];
+			indices[idx] = indices[idx + 16];
+		}
+		if (errors[idx + 8] < errors[idx]) {
+			errors[idx] = errors[idx + 8];
+			indices[idx] = indices[idx + 8];
+		}
+		if (errors[idx + 4] < errors[idx]) {
+			errors[idx] = errors[idx + 4];
+			indices[idx] = indices[idx + 4];
+		}
+		if (errors[idx + 2] < errors[idx]) {
+			errors[idx] = errors[idx + 2];
+			indices[idx] = indices[idx + 2];
+		}
+		if (errors[idx + 1] < errors[idx]) {
+			errors[idx] = errors[idx + 1];
+			indices[idx] = indices[idx + 1];
+		}
+	}
+#endif
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Compress color block
+//
+// One thread block compresses one block of 16 colors. Block `bid` reads
+// image[bid * 16 .. bid * 16 + 15] and writes two words to result[2 * bid]:
+// the packed endpoints and the 2-bit-per-color indices.
+// `permutations` is the candidate index table; each thread evaluates its share
+// of candidates and a reduction picks the thread with the smallest error.
+////////////////////////////////////////////////////////////////////////////////
+__global__ void compress(const uint * permutations, const uint * image, uint * result)
+{
+	const int bid = blockIdx.x;
+	const int idx = threadIdx.x;
+	
+	__shared__ float3 colors[16];
+	__shared__ float dps[16];
+	__shared__ int xrefs[16];
+	
+	// The first 16 threads load and sort the colors of this block.
+	// The emulation-only fragments below close and reopen this `if` so that a
+	// barrier can be placed between the steps.
+	if (idx < 16)
+	{
+		// Read color.
+		uint c = image[(bid) * 16 + idx];
+	
+		// No need to synchronize, 16 < warp size.
+#if __DEVICE_EMULATION__
+		} __debugsync(); if (idx < 16) {
+#endif
+		
+		// Copy color to shared mem.
+		// Bits 0-7 go to z, bits 8-15 to y and bits 16-23 to x, each scaled to [0,1].
+		colors[idx].z = ((c >> 0) & 0xFF) * (1.0f / 255.0f);
+		colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
+		colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
+		
+#if __DEVICE_EMULATION__
+		} __debugsync(); if (idx < 16) {
+#endif
+
+		// Sort colors along the best fit line.
+		float3 axis = bestFitLine(colors);
+		
+		// Sort key: projection of the color onto the axis.
+		dps[idx] = dot(colors[idx], axis);
+		
+#if __DEVICE_EMULATION__
+		} __debugsync(); if (idx < 16) {
+#endif
+		
+		sortColors(dps, colors, xrefs);
+	}
+	
+	// Best candidate found by this thread so far.
+	ushort bestStart, bestEnd;
+	uint bestPermutation;
+	float bestError = FLT_MAX;
+	
+	__syncthreads();
+	
+	// Four-level candidates: thread idx evaluates entries idx + THREAD_NUM * i.
+	// In the last pass only the first 32 threads take part, so entries 0..991
+	// of the table are covered.
+	for(int i = 0; i < 16; i++)
+	{
+		if (i == 15 && idx >= 32) break;
+		
+		ushort start, end;
+		uint permutation = permutations[idx + THREAD_NUM * i];
+		float error = evalPermutation4(colors, permutation, &start, &end);
+		
+		if (error < bestError)
+		{
+			bestError = error;
+			bestPermutation = permutation;
+			bestStart = start;
+			bestEnd = end;
+		}
+	}
+
+	// Order the four-level endpoints so that start >= end, exchanging codes 0/1
+	// and 2/3 to match (presumably the ordering the output format uses to mark
+	// four-level blocks -- confirm).
+	if (bestStart < bestEnd)
+	{
+		swap(bestEnd, bestStart);
+		bestPermutation ^= 0x55555555;	// Flip indices.
+	}
+
+	// Three-level candidates: entries 0..159 of the table, same thread layout.
+	for(int i = 0; i < 3; i++)
+	{
+		if (i == 2 && idx >= 32) break;
+		
+		ushort start, end;
+		uint permutation = permutations[idx + THREAD_NUM * i];
+		float error = evalPermutation3(colors, permutation, &start, &end);
+		
+		if (error < bestError)
+		{
+			bestError = error;
+			bestPermutation = permutation;
+			bestStart = start;
+			bestEnd = end;
+			
+			// Three-level results use the opposite ordering, start <= end. Only
+			// codes 0 and 1 are exchanged; code 2 is left alone.
+			if (bestStart > bestEnd)
+			{
+				swap(bestEnd, bestStart);
+				bestPermutation ^= (~bestPermutation >> 1) & 0x55555555;	// Flip indices.
+			}
+		}
+	}
+	
+	// Identical endpoints: every color maps to code 0.
+	if (bestStart == bestEnd)
+	{
+		bestPermutation = 0;
+	}
+	
+	__syncthreads();
+	
+	// Use a parallel reduction to find minimum error.
+	__shared__ float errors[THREAD_NUM];
+	__shared__ int indices[THREAD_NUM];
+	
+	errors[idx] = bestError;
+	indices[idx] = idx;
+	
+	minimizeError(errors, indices);
+	
+	__syncthreads();
+	
+	// Only write the result of the winner thread.
+	if (idx == indices[0])
+	{
+		// Reorder permutation.
+		// The codes refer to the sorted colors; xrefs maps each sorted slot back
+		// to its original position in the block.
+		uint perm = 0;
+		for(int i = 0; i < 16; i++)
+		{
+			int ref = xrefs[i];
+			perm |= ((bestPermutation >> (2 * i)) & 3) << (2 * ref);
+		}
+		
+		// Write endpoints. (bestStart, bestEnd)
+		// bestStart occupies the low 16 bits and bestEnd the high 16 bits.
+		result[2 * bid + 0] = (bestEnd << 16) | bestStart;
+		
+		// Write palette indices (permutation).
+		result[2 * bid + 1] = perm;
+	}
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Launch kernel
+//
+// Host-side entry point: uploads the three channel weights to kColorMetric and
+// launches `compress` with one thread block of THREAD_NUM threads per color
+// block. All pointers are passed straight through to the kernel.
+////////////////////////////////////////////////////////////////////////////////
+extern "C" void compressKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps, float weights[3])
+{
+	// Set constants.
+	const size_t weightBytes = sizeof(float) * 3;
+	cudaMemcpyToSymbol(kColorMetric, weights, weightBytes, 0);
+
+	// The kernel takes the bitmap table first, then the input and the output.
+	compress<<<blockNum, THREAD_NUM>>>(d_bitmaps, d_data, d_result);
+}
+
--- a/src/nvimage/nvtt/cuda/CudaCompressDXT.cpp
+++ b/src/nvimage/nvtt/cuda/CudaCompressDXT.cpp
@ -0,0 +1,264 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Debug.h>
+#include <nvcore/Containers.h>
+#include <nvmath/Color.h>
+#include <nvimage/Image.h>
+#include <nvimage/nvtt/CompressionOptions.h>
+
+#include "CudaCompressDXT.h"
+#include "CudaUtils.h"
+
+#if defined HAVE_CUDA
+#include <cuda_runtime.h>
+#endif
+
+using namespace nv;
+using namespace nvtt;
+
+#if defined HAVE_CUDA
+
+extern "C" void compressKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps, float weights[3]);
+
+
+// Device copy of the 1024-entry candidate index table. NULL until
+// doPrecomputation has uploaded it successfully.
+static uint * d_bitmaps = NULL;
+
+// Build the table of candidate index bitmaps and upload it to the device.
+// Each bitmap packs sixteen 2-bit codes, one per color of a block.
+// Layout: entries 0..150 are the three-cluster bitmaps, padded to 160; entries
+// 160..974 are the four-cluster bitmaps, padded to 1024.
+// The table is built and uploaded only once. If the upload fails, the error is
+// logged and d_bitmaps is reset to NULL so that a later call tries again.
+static void doPrecomputation()
+{
+	if (d_bitmaps != NULL) {
+		return;
+	}
+
+	uint bitmaps[1024];
+
+	int indices[16];
+	int num = 0;
+
+	// Compute bitmaps with 3 clusters:
+	// `indices` is updated incrementally: each loop level only rewrites the tail
+	// that changes, so the statement order below matters.
+
+	// first cluster [0,i) is at the start
+	for( int m = 0; m < 16; ++m )
+	{
+		indices[m] = 0;
+	}
+	const int imax = 15;
+	for( int i = imax; i >= 0; --i )
+	{
+		// second cluster [i,j) is half along
+		for( int m = i; m < 16; ++m )
+		{
+			indices[m] = 2;
+		}
+		const int jmax = ( i == 0 ) ? 15 : 16;
+		for( int j = jmax; j >= i; --j )
+		{
+			// last cluster [j,k) is at the end
+			if( j < 16 )
+			{
+				indices[j] = 1;
+			}
+
+			uint bitmap = 0;
+			
+			// Shift as unsigned: the code of the last color reaches bit 31.
+			for(int p = 0; p < 16; p++) {
+				bitmap |= uint(indices[p]) << (p * 2);
+			}
+				
+			bitmaps[num] = bitmap;
+				
+			num++;
+		}
+	}
+	nvDebugCheck(num == 151);
+
+	// Align to 160.
+	for(int i = 0; i < 9; i++)
+	{
+		bitmaps[num] = 0x000AA555;
+		num++;
+	}
+	nvDebugCheck(num == 160);
+
+	// Append bitmaps with 4 clusters:
+
+	// first cluster [0,i) is at the start
+	for( int m = 0; m < 16; ++m )
+	{
+		indices[m] = 0;
+	}
+	for( int i = imax; i >= 0; --i )
+	{
+		// second cluster [i,j) is one third along
+		for( int m = i; m < 16; ++m )
+		{
+			indices[m] = 2;
+		}
+		const int jmax = ( i == 0 ) ? 15 : 16;
+		for( int j = jmax; j >= i; --j )
+		{
+			// third cluster [j,k) is two thirds along
+			for( int m = j; m < 16; ++m )
+			{
+				indices[m] = 3;
+			}
+
+			int kmax = ( j == 0 ) ? 15 : 16;
+			for( int k = kmax; k >= j; --k )
+			{
+				// last cluster [k,n) is at the end
+				if( k < 16 )
+				{
+					indices[k] = 1;
+				}
+				
+				uint bitmap = 0;
+
+				// Bitmaps without any code 3 are already in the three-cluster
+				// section, so only the ones that use it are appended.
+				bool hasThree = false;
+				for(int p = 0; p < 16; p++) {
+					bitmap |= uint(indices[p]) << (p * 2);
+
+					if (indices[p] == 3) hasThree = true;
+				}
+				
+				if (hasThree) {
+					bitmaps[num] = bitmap;
+					num++;
+				}
+			}
+		}
+	}
+	nvDebugCheck(num == 975);
+	
+	// Align to 1024.
+	for(int i = 0; i < 49; i++)
+	{
+		bitmaps[num] = 0x00AAFF55;
+		num++;
+	}
+
+	nvDebugCheck(num == 1024);
+
+	// Upload bitmaps.
+	cudaError_t err = cudaMalloc((void**) &d_bitmaps, 1024 * sizeof(uint));
+	if (err == cudaSuccess)
+	{
+		err = cudaMemcpy(d_bitmaps, bitmaps, 1024 * sizeof(uint), cudaMemcpyHostToDevice);
+	}
+
+	// On failure do not keep a pointer to a missing or unfilled table; leaving
+	// d_bitmaps at NULL makes the next call retry the upload.
+	if (err != cudaSuccess)
+	{
+		nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
+
+		if (d_bitmaps != NULL)
+		{
+			cudaFree(d_bitmaps);
+			d_bitmaps = NULL;
+		}
+	}
+}
+
+#endif
+
+
+/// Compress image using CUDA.
+/// The image is rearranged into blocks of 4x4 pixels, compressed on the device
+/// in batches of at most 32768 blocks, and each batch of results (8 bytes per
+/// block) is passed to the output handler.
+/// If a host or device allocation, a copy or the kernel launch fails, the error
+/// handler is notified once and compression stops; results of a failed batch
+/// are not written.
+/// When built without HAVE_CUDA the function only reports Error_CudaError.
+void nv::cudaCompressDXT1(const Image * image, const OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions)
+{
+	nvDebugCheck(cuda::isHardwarePresent());
+#if defined HAVE_CUDA
+
+	doPrecomputation();
+
+	// Image size in blocks.
+	const uint w = (image->width() + 3) / 4;
+	const uint h = (image->height() + 3) / 4;
+
+	uint imageSize = w * h * 16 * sizeof(Color32);
+	uint * blockLinearImage = (uint *) malloc(imageSize);
+	if (blockLinearImage == NULL && imageSize != 0)
+	{
+		nvDebug("Out of memory allocating %u bytes for the block linear image.\n", imageSize);
+
+		// NOTE(review): Error_CudaError is the only error code visible from this
+		// file; replace it if a more specific code exists.
+		if (outputOptions.errorHandler != NULL)
+		{
+			outputOptions.errorHandler->error(nvtt::Error_CudaError);
+		}
+		return;
+	}
+
+	// Convert linear image to block linear.
+	for(uint by = 0; by < h; by++) {
+		for(uint bx = 0; bx < w; bx++) {
+			// Size of the part of this block that lies inside the image.
+			const uint bw = min(image->width() - bx * 4, 4U);
+			const uint bh = min(image->height() - by * 4, 4U);
+
+			// Partial blocks at the image border are filled by repeating the
+			// available pixels.
+			for (uint i = 0; i < 16; i++) {
+				const int x = (i & 3) % bw;
+				const int y = (i / 4) % bh;
+				blockLinearImage[(by * w + bx) * 16 + i] = image->pixel(bx * 4 + x, by * 4 + y).u;
+			}
+		}
+	}
+
+	const uint blockNum = w * h;
+	const uint compressedSize = blockNum * 8;
+	const uint blockMax = 32768; // 65535
+
+	// Allocate image in device memory (64 bytes of input per block).
+	uint * d_data = NULL;
+	cudaError_t err = cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
+
+	// Allocate result (8 bytes of output per block).
+	uint * d_result = NULL;
+	if (err == cudaSuccess)
+	{
+		err = cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
+	}
+
+	// The color weights do not change between batches.
+	float weights[3];
+	weights[0] = compressionOptions.colorWeight.x();
+	weights[1] = compressionOptions.colorWeight.y();
+	weights[2] = compressionOptions.colorWeight.z();
+
+	// TODO: Add support for multiple GPUs.
+	uint bn = 0;
+	while(err == cudaSuccess && bn != blockNum)
+	{
+		uint count = min(blockNum - bn, blockMax);
+
+		err = cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+		if (err != cudaSuccess) break;
+
+		// Launch kernel.
+		compressKernel(count, d_data, d_result, d_bitmaps, weights);
+
+		// Check for errors.
+		err = cudaGetLastError();
+		if (err != cudaSuccess) break;
+
+		// Copy result to host, overwrite swizzled image.
+		// The start of the host buffer is reused as scratch space: it holds input
+		// that has already been uploaded, and count * 8 bytes never reach the
+		// input of the following batches.
+		err = cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
+		if (err != cudaSuccess) break;
+
+		// Output result.
+		if (outputOptions.outputHandler != NULL)
+		{
+			outputOptions.outputHandler->writeData(blockLinearImage, count * 8);
+		}
+
+		bn += count;
+	}
+
+	if (err != cudaSuccess)
+	{
+		nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
+
+		if (outputOptions.errorHandler != NULL)
+		{
+			outputOptions.errorHandler->error(nvtt::Error_CudaError);
+		}
+	}
+
+	free(blockLinearImage);
+	cudaFree(d_data);
+	cudaFree(d_result);
+
+#else
+	if (outputOptions.errorHandler != NULL)
+	{
+		outputOptions.errorHandler->error(Error_CudaError);
+	}
+#endif
+}
+
--- a/src/nvimage/nvtt/cuda/CudaCompressDXT.h
+++ b/src/nvimage/nvtt/cuda/CudaCompressDXT.h
@ -0,0 +1,39 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_CUDACOMPRESSDXT_H
+#define NV_TT_CUDACOMPRESSDXT_H
+
+#include <nvimage/nvimage.h>
+#include <nvimage/nvtt/nvtt.h>
+
+namespace nv
+{
+	class Image; // Forward declaration; this header only passes Image by pointer.
+	/* DXT1 compression entry point for the CUDA code path; the definition lives in another file.
+	 * NOTE(review): presumably writes 8-byte DXT1 blocks through outputOptions.outputHandler and
+	 * reports nvtt::Error_CudaError through outputOptions.errorHandler -- confirm against the definition. */
+	void cudaCompressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	
+} // nv namespace
+
+
+#endif // NV_TT_CUDACOMPRESSDXT_H
--- a/src/nvimage/nvtt/cuda/CudaMath.h
+++ b/src/nvimage/nvtt/cuda/CudaMath.h
@ -0,0 +1,214 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+// Math functions and operators to be used with vector types.
+
+#ifndef CUDAMATH_H
+#define CUDAMATH_H
+
+#include <float.h>
+#include <math.h>
+
+
+// Component-wise product of two vectors.
+inline __device__ __host__ float3 operator *(float3 a, float3 b)
+{
+    const float px = a.x * b.x;
+    const float py = a.y * b.y;
+    const float pz = a.z * b.z;
+    return make_float3(px, py, pz);
+}
+
+// Scale a vector by a scalar (scalar on the left-hand side).
+inline __device__ __host__ float3 operator *(float f, float3 v)
+{
+    float3 scaled = v;
+    scaled.x *= f;
+    scaled.y *= f;
+    scaled.z *= f;
+    return scaled;
+}
+
+// Scale a vector by a scalar (scalar on the right-hand side).
+inline __device__ __host__ float3 operator *(float3 v, float f)
+{
+    const float sx = v.x * f;
+    const float sy = v.y * f;
+    const float sz = v.z * f;
+    return make_float3(sx, sy, sz);
+}
+
+// Component-wise sum of two vectors.
+inline __device__ __host__ float3 operator +(float3 a, float3 b)
+{
+    float3 sum = a;
+    sum.x += b.x;
+    sum.y += b.y;
+    sum.z += b.z;
+    return sum;
+}
+
+// Add a to b in place, component by component.
+inline __device__ __host__ void operator +=(float3 & b, float3 a)
+{
+    b = make_float3(b.x + a.x, b.y + a.y, b.z + a.z);
+}
+
+// Component-wise difference a - b.
+inline __device__ __host__ float3 operator -(float3 a, float3 b)
+{
+    float3 diff = a;
+    diff.x -= b.x;
+    diff.y -= b.y;
+    diff.z -= b.z;
+    return diff;
+}
+
+// Subtract a from b in place, component by component.
+inline __device__ __host__ void operator -=(float3 & b, float3 a)
+{
+    b = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
+}
+
+// Divide a vector by a scalar. The reciprocal is computed once and then
+// multiplied into each component.
+inline __device__ __host__ float3 operator /(float3 v, float f)
+{
+    const float reciprocal = 1.0f / f;
+    return make_float3(v.x * reciprocal, v.y * reciprocal, v.z * reciprocal);
+}
+
+// Divide b by a scalar in place, using a single reciprocal.
+inline __device__ __host__ void operator /=(float3 & b, float f)
+{
+    const float reciprocal = 1.0f / f;
+    b = make_float3(b.x * reciprocal, b.y * reciprocal, b.z * reciprocal);
+}
+
+
+// Dot product of two 3-component vectors.
+inline __device__ __host__ float dot(float3 a, float3 b)
+{
+    float sum = a.x * b.x;
+    sum = sum + a.y * b.y;
+    sum = sum + a.z * b.z;
+    return sum;
+}
+
+// Dot product of two 4-component vectors.
+inline __device__ __host__ float dot(float4 a, float4 b)
+{
+    float sum = a.x * b.x;
+    sum = sum + a.y * b.y;
+    sum = sum + a.z * b.z;
+    sum = sum + a.w * b.w;
+    return sum;
+}
+
+// Clamp f to the range [a, b]: first cap at b, then raise to at least a.
+inline __device__ __host__ float clamp(float f, float a, float b)
+{
+    const float capped = min(f, b);
+    return max(a, capped);
+}
+
+// Clamp every component of v to the same scalar range [a, b].
+inline __device__ __host__ float3 clamp(float3 v, float a, float b)
+{
+    const float cx = clamp(v.x, a, b);
+    const float cy = clamp(v.y, a, b);
+    const float cz = clamp(v.z, a, b);
+    return make_float3(cx, cy, cz);
+}
+
+// Clamp each component of v to the matching component range [a, b].
+inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
+{
+    const float cx = clamp(v.x, a.x, b.x);
+    const float cy = clamp(v.y, a.y, b.y);
+    const float cz = clamp(v.z, a.z, b.z);
+    return make_float3(cx, cy, cz);
+}
+
+
+// Return v scaled to unit length.
+// The scale factor is the reciprocal of the Euclidean length, sqrt(dot(v, v));
+// dividing by dot(v, v) alone would scale by the squared length and only
+// produce a unit vector when v already has length 1.
+// A zero-length input is not special-cased and yields non-finite components.
+inline __device__ __host__ float3 normalize(float3 v)
+{
+    const float invLength = 1.0f / sqrtf(dot(v, v));
+    return make_float3(v.x * invLength, v.y * invLength, v.z * invLength);
+}
+
+
+
+
+// Use power method to find the first eigenvector.
+// http://www.miislita.com/information-retrieval-tutorial/matrix-tutorial-3-eigenvalues-eigenvectors.html
+//
+// 'matrix' holds the upper triangle of a symmetric 3x3 matrix in the order
+// xx, xy, xz, yy, yz, zz. Starting from (1, 1, 1) the vector is multiplied by
+// the matrix and rescaled by its largest component on every iteration.
+// The returned vector is scaled by that largest component, not to unit length.
+inline __device__ __host__ float3 firstEigenVector( float matrix[6] )
+{
+    // 8 iterations seems to be more than enough.
+
+    float3 v = make_float3(1.0f, 1.0f, 1.0f);
+    for(int i = 0; i < 8; i++) {
+        const float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
+        const float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
+        const float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
+        const float m = max(max(x, y), z);
+
+        // An all-zero matrix gives m == 0. Guard the division on every target
+        // (not only in device emulation) so the result is the zero vector
+        // instead of 0 * inf = NaN.
+        const float iv = (m == 0.0f) ? 0.0f : 1.0f / m;
+        v = make_float3(x*iv, y*iv, z*iv);
+    }
+
+    return v;
+}
+/* Estimate the direction of the line that best fits the 16 colors in 'colors'.
+ * Accumulates the upper triangle of the 3x3 covariance matrix of the colors about their
+ * mean (order: xx, xy, xz, yy, yz, zz; the sums are not divided by 16) and returns
+ * firstEigenVector() of that matrix.
+ * NOTE(review): the non-emulation path indexes 16-entry shared arrays with threadIdx.x, so it
+ * presumably expects exactly 16 threads per block, one per color -- confirm against the launch.
+ * NOTE(review): there is no __syncthreads() between the shared-memory steps; this presumably
+ * relies on all 16 threads executing in lockstep -- confirm for the target hardware.
+ */
+inline __device__ float3 bestFitLine(const float3 * colors)
+{
+#if __DEVICE_EMULATION__
+    /* Serial path: the caller loops over all 16 colors itself. */
+    // Compute covariance matrix of the given colors.
+    float3 center = make_float3(0.0f, 0.0f, 0.0f);
+    for (int i = 0; i < 16; i++)
+    {
+        center += colors[i];
+    }
+    center /= 16.0f; // Mean color.
+
+    float covariance[6] = {0, 0, 0, 0, 0, 0};
+    for (int i = 0; i < 16; i++)
+    {
+        float3 a = colors[i] - center; // Color relative to the mean.
+        covariance[0] += a.x * a.x;
+        covariance[1] += a.x * a.y;
+        covariance[2] += a.x * a.z;
+        covariance[3] += a.y * a.y;
+        covariance[4] += a.y * a.z;
+        covariance[5] += a.z * a.z;
+    }
+
+#else
+    /* Parallel path: each thread handles the color at index threadIdx.x. */
+    const int idx = threadIdx.x;
+
+    __shared__ float3 colorSum[16];
+    colorSum[idx] = colors[idx];
+
+    // Unrolled parallel reduction.
+    if (idx < 8) { // Only colorSum[0] is consumed afterwards (see the mean computation below).
+        colorSum[idx] += colorSum[idx + 8];
+        colorSum[idx] += colorSum[idx + 4];
+        colorSum[idx] += colorSum[idx + 2];
+        colorSum[idx] += colorSum[idx + 1];
+    }
+
+    // @@ Eliminate two-way bank conflicts here.
+    // @@ It seems that doing that and unrolling the reduction doesn't help...
+    __shared__ float covariance[16*6];
+    colorSum[idx] = colors[idx] - colorSum[0] / 16.0f; // colorSum is reused for this thread's color minus the mean. NOTE(review): thread 0 overwrites colorSum[0] while the other threads read it -- confirm ordering.
+    
+    covariance[6 * idx + 0] = colorSum[idx].x * colorSum[idx].x;    // 0, 6, 12, 2, 8, 14, 4, 10, 0
+    covariance[6 * idx + 1] = colorSum[idx].x * colorSum[idx].y;
+    covariance[6 * idx + 2] = colorSum[idx].x * colorSum[idx].z;
+    covariance[6 * idx + 3] = colorSum[idx].y * colorSum[idx].y;
+    covariance[6 * idx + 4] = colorSum[idx].y * colorSum[idx].z;
+    covariance[6 * idx + 5] = colorSum[idx].z * colorSum[idx].z;
+    /* Pairwise reduction with strides 8, 4, 2, 1: the six sums accumulate into covariance[0..5]. */
+    for(int d = 8; d > 0; d >>= 1)
+    {
+        if (idx < d)
+        {
+            covariance[6 * idx + 0] += covariance[6 * (idx+d) + 0];
+            covariance[6 * idx + 1] += covariance[6 * (idx+d) + 1];
+            covariance[6 * idx + 2] += covariance[6 * (idx+d) + 2];
+            covariance[6 * idx + 3] += covariance[6 * (idx+d) + 3];
+            covariance[6 * idx + 4] += covariance[6 * (idx+d) + 4];
+            covariance[6 * idx + 5] += covariance[6 * (idx+d) + 5];
+        }
+    }
+
+#endif
+    
+    // Compute first eigen vector.
+    return firstEigenVector(covariance); // Reads the first six entries of 'covariance'.
+}
+
+
+#endif // CUDAMATH_H
--- a/src/nvimage/nvtt/cuda/CudaUtils.cpp
+++ b/src/nvimage/nvtt/cuda/CudaUtils.cpp
@ -0,0 +1,109 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Debug.h>
+#include "CudaUtils.h"
+
+#if defined HAVE_CUDA
+#include <cuda_runtime.h>
+#endif
+
+using namespace nv;
+using namespace cuda;
+
+#if NV_OS_WIN32
+
+// WIN32_LEAN_AND_MEAN is the macro windows.h actually honours to leave out
+// rarely used APIs.
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+// Returns true when the reported Windows major version is 6 or higher.
+// If the version cannot be queried the function returns false.
+static bool isWindowsVista()
+{
+	OSVERSIONINFO osvi;
+	ZeroMemory(&osvi, sizeof(OSVERSIONINFO));
+	osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+
+	if (!::GetVersionEx(&osvi))
+	{
+		// Do not interpret a structure the call failed to fill in.
+		return false;
+	}
+
+	return osvi.dwMajorVersion >= 6;
+}
+
+
+typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);
+
+// Returns false only when IsWow64Process reports that the current process
+// runs under WOW64; returns true when the export is missing, when the call
+// fails, or when the process is not a WOW64 process.
+static bool isWow32()
+{
+	// Resolve IsWow64Process at run time. The ANSI entry point is named
+	// explicitly because the module name is a narrow string literal, which
+	// the GetModuleHandle macro would reject in UNICODE builds.
+	LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandleA("kernel32"), "IsWow64Process");
+
+	if (fnIsWow64Process == NULL)
+	{
+		// Export not available: treat as a plain 32 bit process.
+		return true;
+	}
+
+	BOOL bIsWow64 = FALSE;
+	if (!fnIsWow64Process(GetCurrentProcess(), &bIsWow64))
+	{
+		// Assume 32 bits.
+		return true;
+	}
+
+	return !bIsWow64;
+}
+
+#endif
+
+
+/// Determine if CUDA is available.
+/// Returns false when built without HAVE_CUDA, when running on Windows with
+/// major version 6 or higher, or when no CUDA device is reported.
+bool nv::cuda::isHardwarePresent()
+{
+#if defined HAVE_CUDA
+#if NV_OS_WIN32
+	// isWindowsVista() is only compiled for Windows, so the check must be
+	// guarded the same way or other platforms fail to build.
+	if (isWindowsVista())
+	{
+		return false;
+	}
+	// Stricter variant kept for reference:
+	//if (!isWow32()) return false;
+#endif
+	return deviceCount() > 0;
+#else
+	return false;
+#endif
+}
+
+/// Number of devices reported by the CUDA runtime.
+/// Yields 0 when built without HAVE_CUDA or when the runtime query fails.
+int nv::cuda::deviceCount()
+{
+	int count = 0;
+#if defined HAVE_CUDA
+	int reported = 0;
+	if (cudaGetDeviceCount(&reported) == cudaSuccess)
+	{
+		count = reported;
+	}
+#endif
+	return count;
+}
+
+/// Make device i the current CUDA device.
+/// Returns true when cudaSetDevice succeeds; always false without HAVE_CUDA.
+bool nv::cuda::setDevice(int i)
+{
+	// The index is checked against the number of reported devices.
+	nvCheck(i < deviceCount());
+#if defined HAVE_CUDA
+	const bool activated = (cudaSetDevice(i) == cudaSuccess);
+	return activated;
+#else
+	return false;
+#endif
+}
--- a/src/nvimage/nvtt/cuda/CudaUtils.h
+++ b/src/nvimage/nvtt/cuda/CudaUtils.h
@ -0,0 +1,40 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_CUDAUTILS_H
+#define NV_TT_CUDAUTILS_H
+
+namespace nv
+{
+	
+	/// Helpers to query and select CUDA devices.
+	namespace cuda
+	{
+		/// True when CUDA compression can be used on this machine.
+		bool isHardwarePresent();
+
+		/// Number of CUDA devices reported by the runtime (0 if none).
+		int deviceCount();
+
+		/// Make device i the active device; returns true on success.
+		bool setDevice(int i);
+	}
+	
+} // nv namespace
+
+
+#endif // NV_TT_CUDAUTILS_H
--- a/src/nvimage/nvtt/dxtlib.cpp
+++ b/src/nvimage/nvtt/dxtlib.cpp
@ -0,0 +1,486 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Memory.h>
+#include <nvcore/Ptr.h>
+
+#include <nvimage/DirectDrawSurface.h>
+#include <nvimage/ColorBlock.h>
+#include <nvimage/Image.h>
+#include <nvimage/FloatImage.h>
+#include <nvimage/Filter.h>
+#include <nvimage/Quantize.h>
+#include <nvimage/NormalMap.h>
+
+#include "CompressDXT.h"
+#include "FastCompressDXT.h"
+#include "CompressRGB.h"
+#include "BlockDXT.h"
+#include "InputOptions.h"
+#include "CompressionOptions.h"
+#include "cuda/CudaUtils.h"
+#include "cuda/CudaCompressDXT.h"
+
+
+using namespace nv;
+using namespace nvtt;
+
+namespace
+{
+	
+	// Size in bytes of one compressed 4x4 block for the given format.
+	// Formats that are not block compressed yield 0.
+	static int blockSize(Format format)
+	{
+		switch (format)
+		{
+			case Format_DXT1: /* Format_DXT1a would go here too. */
+			case Format_BC4:
+				return 8;
+
+			case Format_DXT3:
+			case Format_DXT5:
+			case Format_DXT5n:
+			case Format_BC5:
+				return 16;
+
+			default:
+				return 0;
+		}
+	}
+	
+	// Size in bytes of a w x h image stored in the given format.
+	static int computeImageSize(int w, int h, Format format)
+	{
+		if (format != Format_RGBA) {
+			// Block formats cover the image with 4x4 blocks, rounding up.
+			const int blocksWide = (w + 3) / 4;
+			const int blocksHigh = (h + 3) / 4;
+			return blocksWide * blocksHigh * blockSize(format);
+		}
+
+		// Uncompressed: one Color32 per pixel.
+		return w * h * sizeof(Color32);
+	}
+	
+} // namespace
+
+
+
+
+
+//
+// compress
+//
+
+// Write the 128 byte DDS header through outputOptions.outputHandler.
+// Nothing is written when there is no output handler or when
+// outputOptions.outputHeader is false. Width and height are taken from the
+// first input image.
+static void outputHeader(const InputOptions::Private & inputOptions, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	// Output DDS header.
+	if (outputOptions.outputHandler != NULL && outputOptions.outputHeader)
+	{
+		DDSHeader header;
+		
+		// Only 1 face and 2d textures supported.
+		nvCheck(inputOptions.faceCount == 1);
+		
+		InputOptions::Private::Image * img = inputOptions.images;
+		nvCheck(img != NULL);
+		
+		header.setWidth(img->width);
+		header.setHeight(img->height);
+		
+		// The advertised mipmap count must match what compress() emits: with
+		// a non-negative maxLevel it outputs levels 0..maxLevel, that is
+		// maxLevel + 1 images, never more than mipmapCount.
+		int mipmapCount = inputOptions.mipmapCount;
+		if (!inputOptions.generateMipmaps) mipmapCount = 0;
+		else if (inputOptions.maxLevel >= 0 && inputOptions.maxLevel + 1 < mipmapCount) mipmapCount = inputOptions.maxLevel + 1;
+		header.setMipmapCount(mipmapCount);
+
+		if (inputOptions.textureType == TextureType_2D) {
+			header.setTexture2D();
+		}
+		else if (inputOptions.textureType == TextureType_Cube) {
+			header.setTextureCube();
+		}		
+		/*else if (inputOptions.textureType == TextureType_3D) {
+			header.setTexture3D();
+			header.setDepth(img->depth);
+		}*/
+		
+		if (compressionOptions.format == Format_RGBA)
+		{
+			// Uncompressed output: pitch plus channel masks.
+			header.setPitch(4 * img->width);
+			header.setPixelFormat(compressionOptions.bitcount, compressionOptions.rmask, compressionOptions.gmask, compressionOptions.bmask, compressionOptions.amask);
+		}
+		else
+		{
+			// Block compressed output: size of the top level plus a FourCC.
+			header.setLinearSize(computeImageSize(img->width, img->height, compressionOptions.format));
+			
+			if (compressionOptions.format == Format_DXT1 /*|| compressionOptions.format == Format_DXT1a*/) {
+				header.setFourCC('D', 'X', 'T', '1');
+			}
+			else if (compressionOptions.format == Format_DXT3) {
+				header.setFourCC('D', 'X', 'T', '3');
+			}
+			else if (compressionOptions.format == Format_DXT5) {
+				header.setFourCC('D', 'X', 'T', '5');
+			}
+			else if (compressionOptions.format == Format_DXT5n) {
+				header.setFourCC('R', 'X', 'G', 'B');
+			}
+			else if (compressionOptions.format == Format_BC4) {
+				header.setFourCC('A', 'T', 'I', '1');
+			}
+			else if (compressionOptions.format == Format_BC5) {
+				header.setFourCC('A', 'T', 'I', '2');
+			}
+		}
+		
+		// Swap bytes if necessary.
+		header.swapBytes();
+		
+		nvStaticCheck(sizeof(DDSHeader) == 128);
+		outputOptions.outputHandler->writeData(&header, 128);
+		
+		// Revert swap.
+		header.swapBytes();
+	}
+}
+
+/* Compress one image with the compressor selected by compressionOptions.format and send
+ * the result to outputOptions. Formats not listed below are silently ignored.
+ * Always returns true; the compressors called here do not return a status.
+ */
+static bool compressMipmap(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+	nvDebugCheck(image != NULL);
+	/* Format_RGB and Format_RGBA are the same enumerator value. */
+	if (compressionOptions.format == Format_RGBA || compressionOptions.format == Format_RGB)
+	{
+		compressRGB(image, outputOptions, compressionOptions);
+	}
+	else if (compressionOptions.format == Format_DXT1)
+	{
+#if defined(HAVE_S3QUANT)
+		if (compressionOptions.externalCompressor == "s3")
+		{
+			s3CompressDXT1(image, outputOptions);
+		}
+		else
+#endif
+		/* The dangling 'else' of each optional section chains into the next 'if' below. */
+#if defined(HAVE_ATITC)
+		if (compressionOptions.externalCompressor == "ati")
+		{
+			printf("ATI\n"); // NOTE(review): looks like leftover debug output -- confirm.
+			atiCompressDXT1(image, outputOptions);
+		}
+		else
+#endif
+		if (compressionOptions.useCuda && nv::cuda::isHardwarePresent())
+		{
+			cudaCompressDXT1(image, outputOptions, compressionOptions);
+		}
+		else
+		{
+			if (compressionOptions.quality == Quality_Fastest)
+			{
+				fastCompressDXT1(image, outputOptions);
+			}
+			else
+			{
+				compressDXT1(image, outputOptions, compressionOptions);
+			}
+		}
+	}
+	else if (compressionOptions.format == Format_DXT3)
+	{
+		if (compressionOptions.quality == Quality_Fastest)
+		{
+			fastCompressDXT3(image, outputOptions);
+		}
+		else
+		{
+			compressDXT3(image, outputOptions, compressionOptions);
+		}
+	}
+	else if (compressionOptions.format == Format_DXT5)
+	{
+		if (compressionOptions.quality == Quality_Fastest)
+		{
+			fastCompressDXT5(image, outputOptions);
+		}
+		else
+		{
+			compressDXT5(image, outputOptions, compressionOptions);
+		}
+	}
+	else if (compressionOptions.format == Format_DXT5n)
+	{
+		if (compressionOptions.quality == Quality_Fastest)
+		{
+			fastCompressDXT5n(image, outputOptions);
+		}
+		else
+		{
+			compressDXT5n(image, outputOptions, compressionOptions);
+		}
+	}
+	else if (compressionOptions.format == Format_BC4) // No fast variant is used for BC4.
+	{
+		compressBC4(image, outputOptions, compressionOptions);
+	}
+	else if (compressionOptions.format == Format_BC5) // No fast variant is used for BC5.
+	{
+		compressBC5(image, outputOptions, compressionOptions);
+	}
+
+	return true;
+}
+
+
+// Build a float copy of the input image and move it to linear space.
+// The caller owns the returned image.
+static FloatImage * toFloatImage(const Image * image, const InputOptions::Private & inputOptions)
+{
+	nvDebugCheck(image != NULL);
+
+	FloatImage * linearImage = new FloatImage(image);
+
+	// A gamma of exactly 1 means the input is already linear.
+	const bool needsLinearization = (inputOptions.inputGamma != 1.0f);
+	if (needsLinearization) {
+		// Arguments 0 and 3 presumably select the three color channels -- TODO confirm.
+		linearImage->toLinear(0, 3, inputOptions.inputGamma);
+	}
+
+	return linearImage;
+}
+
+
+// Produce the fixed point output image from a linear float image, applying
+// the configured output gamma.
+static Image * toFixedImage(const FloatImage * floatImage, const InputOptions::Private & inputOptions)
+{
+	nvDebugCheck(floatImage != NULL);
+
+	Image * fixedImage = floatImage->createImageGammaCorrect(inputOptions.outputGamma);
+	return fixedImage;
+}
+
+
+// Downsample the given float image with the configured mipmap filter and
+// return the new image. Any filter other than box or triangle uses Kaiser.
+static FloatImage * createMipmap(const FloatImage * floatImage, const InputOptions::Private & inputOptions)
+{
+	const FloatImage::WrapMode wrap = (FloatImage::WrapMode)inputOptions.wrapMode;
+	FloatImage * mip = NULL;
+
+	switch (inputOptions.mipmapFilter)
+	{
+		case MipmapFilter_Box:
+			// Use fast downsample.
+			mip = floatImage->fastDownSample();
+			break;
+
+		case MipmapFilter_Triangle:
+		{
+			Kernel1 triangle(4);
+			triangle.initFilter(Filter::Triangle);
+			mip = floatImage->downSample(triangle, wrap);
+			break;
+		}
+
+		default: /* MipmapFilter_Kaiser */
+		{
+			Kernel1 kaiser(10);
+			kaiser.initKaiser(8.0, 0.75f);
+			mip = floatImage->downSample(kaiser, wrap);
+			break;
+		}
+	}
+
+	// Normalize mipmap.
+	if (inputOptions.normalizeMipmaps)
+	{
+		normalize(mip);
+	}
+
+	return mip;
+}
+
+
+// Reduce the image in place to the precision of the output format,
+// optionally with dithering.
+static void quantize(Image * img, const InputOptions::Private & inputOptions, Format format)
+{
+	// Color dithering applies to DXT1, DXT3 and DXT5 only.
+	const bool isDitheredColorFormat = (format >= Format_DXT1 && format <= Format_DXT5);
+	if (inputOptions.enableColorDithering && isDitheredColorFormat)
+	{
+		Quantize::FloydSteinberg_RGB16(img);
+	}
+
+	if (inputOptions.binaryAlpha)
+	{
+		// One bit alpha, with or without error diffusion.
+		if (inputOptions.enableAlphaDithering)
+		{
+			Quantize::FloydSteinberg_BinaryAlpha(img, inputOptions.alphaThreshold);
+		}
+		else
+		{
+			Quantize::BinaryAlpha(img, inputOptions.alphaThreshold);
+		}
+	}
+	else if (inputOptions.enableAlphaDithering && format == Format_DXT3)
+	{
+		Quantize::Alpha4(img);
+	}
+	// A Format_DXT1a case (binary alpha) is not enabled.
+}
+
+
+/* Compress the input texture with the given compression options.
+ * Writes the DDS header first, then for every face walks the mipmap chain: announces each level
+ * through outputHandler->mipmap(), picks or generates the image, quantizes it and compresses it.
+ * Returns false (after reporting Error_InvalidInput, if an error handler is set) when the first
+ * mipmap of a face has no data; otherwise returns true. The header has already been written by then.
+ */
+bool nvtt::compress(const InputOptions & inputOptions, const OutputOptions & outputOptions, const CompressionOptions & compressionOptions)
+{
+	// Make sure enums match.
+	nvStaticCheck(FloatImage::WrapMode_Clamp == (FloatImage::WrapMode)WrapMode_Clamp);
+	nvStaticCheck(FloatImage::WrapMode_Mirror == (FloatImage::WrapMode)WrapMode_Mirror);
+	nvStaticCheck(FloatImage::WrapMode_Repeat == (FloatImage::WrapMode)WrapMode_Repeat);
+
+	// Output DDS header.
+	outputHeader(inputOptions.m, outputOptions, compressionOptions.m);
+
+	Format format = compressionOptions.m.format;
+
+	for (int f = 0; f < inputOptions.m.faceCount; f++)
+	{
+		Image * lastImage = NULL; // Most recent caller-provided image used directly; not owned here.
+		AutoPtr<FloatImage> floatImage(NULL); // Float image the next generated level is derived from.
+		
+		for (int m = 0; m < inputOptions.m.mipmapCount; m++)
+		{
+			int idx = f * inputOptions.m.mipmapCount + m; // Images are stored face-major.
+			InputOptions::Private::Image & mipmap = inputOptions.m.images[idx];
+			
+			if (outputOptions.outputHandler)
+			{
+				int size = computeImageSize(mipmap.width, mipmap.height, format);
+				outputOptions.outputHandler->mipmap(size, mipmap.width, mipmap.height, mipmap.depth, mipmap.face, mipmap.mipLevel);
+			}
+			
+			Image * img; // Image to compress.
+			
+			if (mipmap.data != NULL) // Mipmap provided.
+			{
+				// Convert to normal map.
+				if (inputOptions.m.convertToNormalMap)
+				{
+					floatImage = createNormalMap(mipmap.data, (FloatImage::WrapMode)inputOptions.m.wrapMode, inputOptions.m.heightFactors, inputOptions.m.bumpFrequencyScale);
+				}
+				else
+				{
+					lastImage = img = mipmap.data; // Use the caller's image as is.
+					
+					// Delete float image.
+					floatImage = NULL;
+				}
+			}
+			else // Create mipmap from last.
+			{
+				if (m == 0) {
+					// First mipmap missing.
+					if (outputOptions.errorHandler != NULL) outputOptions.errorHandler->error(Error_InvalidInput);
+					return false;
+				}
+				
+				if (floatImage == NULL)
+				{
+					nvDebugCheck(lastImage != NULL);
+					floatImage = toFloatImage(lastImage, inputOptions.m);
+				}
+				
+				// Create mipmap.
+				floatImage = createMipmap(floatImage.ptr(), inputOptions.m);
+			}
+			
+			if (floatImage != NULL)
+			{
+				// Convert to fixed.
+				img = toFixedImage(floatImage.ptr(), inputOptions.m);
+			}
+			
+			quantize(img, inputOptions.m, format); // NOTE(review): modifies img in place, which may be the caller-provided mipmap.data -- confirm this is intended.
+			
+			compressMipmap(img, outputOptions, compressionOptions.m); // Return value is ignored.
+			
+			if (img != mipmap.data) // Only temporaries created in this iteration are deleted.
+			{
+				delete img;
+			}
+			
+			if (!inputOptions.m.generateMipmaps || (inputOptions.m.maxLevel >= 0 && m >= inputOptions.m.maxLevel)) {
+				// continue with next face.
+				break;
+			}
+		}
+	}
+
+	return true;
+}
+
+
+
+
+/// Sum of the image sizes, in the chosen output format, of every mipmap
+/// level that would be emitted for the given input. No header is counted.
+int nvtt::estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions)
+{
+	const Format format = compressionOptions.m.format;
+	const int levelsPerFace = inputOptions.m.mipmapCount;
+
+	int total = 0;
+
+	for (int face = 0; face < inputOptions.m.faceCount; face++)
+	{
+		for (int level = 0; level < levelsPerFace; level++)
+		{
+			const InputOptions::Private::Image & img = inputOptions.m.images[face * levelsPerFace + level];
+			total += computeImageSize(img.width, img.height, format);
+
+			// Stop after the first level when mipmaps are disabled, or once
+			// the configured last level has been counted.
+			const bool reachedMaxLevel = (inputOptions.m.maxLevel >= 0 && level >= inputOptions.m.maxLevel);
+			if (!inputOptions.m.generateMipmaps || reachedMaxLevel) {
+				break;
+			}
+		}
+	}
+
+	return total;
+}
+
+
+/// Return a string for the given error.
+const char * nvtt::errorString(Error e)
+{
+	switch(e)
+	{
+		case Error_InvalidInput:
+			return "Invalid input";
+		case Error_UserInterruption:
+			return "User interruption";
+		case Error_UnsupportedFeature:
+			return "Unsupported feature";
+		case Error_CudaError:
+			return "CUDA error";
+		case Error_Unknown:
+			return "Unknown error";
+	}
+
+	return NULL;
+}
+
--- a/src/nvimage/nvtt/nvtt.h
+++ b/src/nvimage/nvtt/nvtt.h
@ -0,0 +1,242 @@
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef NV_TT_H
+#define NV_TT_H
+
+#include <nvcore/nvcore.h>
+
+// Function linkage
+#if NVTT_SHARED
+#ifdef NVTT_EXPORTS
+#define NVTT_API DLL_EXPORT
+#define NVTT_CLASS DLL_EXPORT_CLASS
+#else
+#define NVTT_API DLL_IMPORT
+#define NVTT_CLASS DLL_IMPORT
+#endif
+#else
+#define NVTT_API
+#define NVTT_CLASS
+#endif
+
+// Public interface.
+namespace nvtt
+{
+	/// Supported compression formats.
+	enum Format
+	{
+		// No compression.
+		Format_RGB,
+		Format_RGBA = Format_RGB,	// Alias: same enumerator value as Format_RGB.
+
+		// DX9 formats.
+		Format_DXT1,	// 8 bytes per 4x4 block.
+	//	Format_DXT1a,	// DXT1 with binary alpha.
+		Format_DXT3,	// 16 bytes per 4x4 block.
+		Format_DXT5,	// 16 bytes per 4x4 block.
+		Format_DXT5n,	// Compressed HILO: R=0, G=x, B=0, A=y
+		
+		// DX10 formats.
+		Format_BC1 = Format_DXT1,	// Alias of Format_DXT1.
+		Format_BC2 = Format_DXT3,	// Alias of Format_DXT3.
+		Format_BC3 = Format_DXT5,	// Alias of Format_DXT5.
+		Format_BC3n = Format_DXT5n,	// Alias of Format_DXT5n.
+		Format_BC4,		// ATI1
+		Format_BC5,		// 3DC, ATI2
+
+		// OpenGL formats.
+		Format_LATC = Format_BC5,	// Alias of Format_BC5.
+	};
+	
+	/// Quality modes.
+	enum Quality
+	{
+		Quality_Fastest,
+		Quality_Normal,
+		Quality_Production,
+		Quality_Highest,
+	};
+
+	/// Compression options. This class describes the desired compression format and other compression settings.
+	class CompressionOptions
+	{
+	public:
+		NVTT_API CompressionOptions();
+		NVTT_API ~CompressionOptions();
+		
+		NVTT_API void reset();
+		
+		NVTT_API void setFormat(Format format);
+		NVTT_API void setQuality(Quality quality, float errorThreshold = 0.5f);
+		NVTT_API void setColorWeights(float red, float green, float blue);
+		NVTT_API void enableHardwareCompression(bool enable);
+		
+		NVTT_API void setExternalCompressor(const char * name);
+
+		// Set color mask to describe the RGB/RGBA format.
+		NVTT_API void setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask);
+
+	//private:
+		struct Private;
+		Private & m;
+	};
+
+
+	/// Wrap modes. // This matches FloatImage::WrapMode.
+	enum WrapMode
+	{
+		WrapMode_Clamp,
+		WrapMode_Repeat,
+		WrapMode_Mirror,
+	};
+	
+	/// Texture types.
+	enum TextureType
+	{
+		TextureType_2D,
+		TextureType_Cube,
+	//	TextureType_3D,
+	};
+	
+	/// Input formats.
+	enum InputFormat
+	{
+		InputFormat_BGRA_8UB,
+	//	InputFormat_RGBE_8UB,
+	//	InputFormat_BGRA_32F,
+	};
+	
+	/// Mipmap downsampling filters.
+	enum MipmapFilter
+	{
+		MipmapFilter_Box,		///< Box filter is quite good and very fast.
+		MipmapFilter_Triangle,	///< Triangle filter blurs the results too much, but that might be what you want.
+		MipmapFilter_Kaiser,	///< Kaiser-windowed Sinc filter is the best downsampling filter.
+	};
+	
+
+
+	/// Input options. Specify format and layout of the input texture.
+	struct InputOptions
+	{
+		NVTT_API InputOptions();
+		NVTT_API ~InputOptions();
+		
+		// Set default options.
+		NVTT_API void reset();
+		
+		// Setup input layout.
+		NVTT_API void setTextureLayout(TextureType type, int w, int h, int d = 1);
+		NVTT_API void resetTextureLayout();
+
+		// Set mipmap data. Copies the data.
+		NVTT_API bool setMipmapData(const void * data, int w, int h, int d = 1, int face = 0, int mipmap = 0);		
+
+		// Describe the format of the input.
+		NVTT_API void setFormat(InputFormat fmt, bool alphaTransparency);
+
+		// Set gamma settings.
+		NVTT_API void setGamma(float inputGamma, float outputGamma);
+
+		// Set texture wrapping mode.
+		NVTT_API void setWrapMode(WrapMode mode);
+
+		// Set mipmapping options.
+		NVTT_API void setMipmapping(bool generateMipmaps, MipmapFilter filter = MipmapFilter_Kaiser, int maxLevel = -1);
+		
+		// Set quantization options.
+		NVTT_API void setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold = 127);
+
+		// Set normal map options.
+		NVTT_API void setConvertToNormalMap(bool convert);
+		NVTT_API void setHeightEvaluation(float redScale, float greenScale, float blueScale, float alphaScale);
+		NVTT_API void setNormalFilter(float small, float medium, float big, float large);
+		NVTT_API void setNormalizeMipmaps(bool b);
+
+	//private:
+		struct Private;
+		Private & m;
+	};
+	
+	
+	/// Output handler.
+	struct OutputHandler
+	{
+		virtual ~OutputHandler() {}
+		
+		/// Indicate the start of a new compressed image that's part of the final texture.
+		virtual void mipmap(int size, int width, int height, int depth, int face, int miplevel) = 0;
+		
+		/// Output data. Compressed data is output as soon as it's generated to minimize memory allocations.
+		virtual void writeData(const void * data, int size) = 0;
+	};
+
+	/// Error codes.
+	enum Error
+	{
+		Error_InvalidInput,
+		Error_UserInterruption,
+		Error_UnsupportedFeature,
+		Error_CudaError,
+		Error_Unknown,
+	};
+	
+	/// Error handler.
+	struct ErrorHandler
+	{
+		virtual ~ErrorHandler() {}
+		
+		// Signal error.
+		virtual void error(Error e) = 0;
+	};
+
+
+	/// Output Options. This class holds pointers to the interfaces that are used to report the output of 
+	/// the compressor to the user.
+	struct OutputOptions
+	{
+		/// Create options with no handlers attached. Every member is
+		/// initialized before reset() runs, so none is left indeterminate.
+		OutputOptions() : outputHandler(NULL), errorHandler(NULL), outputHeader(true) { reset(); }
+
+		/// Create options that report through the given handlers.
+		/// NOTE(review): reset() is called after the handlers are stored; confirm
+		/// that it does not clear them.
+		OutputOptions(OutputHandler * oh, ErrorHandler * eh) : outputHandler(oh), errorHandler(eh), outputHeader(true) { reset(); }
+		
+		// Set default options.
+		NVTT_API void reset();
+		
+		OutputHandler * outputHandler;	///< Receives the compressed data; may be NULL.
+		ErrorHandler * errorHandler;	///< Receives error notifications; may be NULL.
+		bool outputHeader;				///< When true, a DDS header is written before the image data.
+	};
+	
+	
+	// Main entrypoint of the compression library.
+	NVTT_API bool compress(const InputOptions & inputOptions, const OutputOptions & outputOptions, const CompressionOptions & compressionOptions);
+	
+	// Estimate the size of compressing the input with the given options.
+	NVTT_API int estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions);
+	
+	// Return string for the given error.
+	NVTT_API const char * errorString(Error e);
+
+} // nvtt namespace
+
+#endif // NV_TT_H
--- a/src/nvimage/nvtt/squish/CMakeLists.txt
+++ b/src/nvimage/nvtt/squish/CMakeLists.txt
@ -0,0 +1,52 @@
+PROJECT(squish)
+ENABLE_TESTING()
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+SET(SQUISH_SRCS
+	alpha.cpp
+	alpha.h
+	clusterfit.cpp
+	clusterfit.h
+	fastclusterfit.cpp
+	fastclusterfit.h
+	weightedclusterfit.cpp
+	weightedclusterfit.h
+	colourblock.cpp
+	colourblock.h
+	colourfit.cpp
+	colourfit.h
+	colourset.cpp
+	colourset.h
+	config.h
+	maths.cpp
+	maths.h
+	rangefit.cpp
+	rangefit.h
+	singlecolourfit.cpp
+	singlecolourfit.h
+	singlecolourlookup.inl
+	squish.cpp
+	squish.h
+	simd.h
+	simd_sse.h
+	simd_ve.h)
+
+ADD_LIBRARY(squish STATIC ${SQUISH_SRCS})
+
+# libpng (optional): the squishpng tool is only built when libpng is found.
+FIND_PACKAGE(PNG)
+
+IF(PNG_FOUND)
+	INCLUDE_DIRECTORIES(${PNG_INCLUDE_DIR})
+	ADD_EXECUTABLE(squishpng extra/squishpng.cpp)
+	# Link against PNG_LIBRARIES (libpng plus its dependencies such as zlib)
+	# rather than the bare PNG_LIBRARY path.
+	TARGET_LINK_LIBRARIES(squishpng squish ${PNG_LIBRARIES})
+ENDIF(PNG_FOUND)
+
+#ADD_EXECUTABLE(squishgen extra/squishgen.cpp)
+
+ADD_EXECUTABLE(squishtest extra/squishtest.cpp)
+TARGET_LINK_LIBRARIES(squishtest squish)
+
+ADD_TEST(SQUISHTEST squishtest)
+
--- a/src/nvimage/nvtt/squish/ChangeLog
+++ b/src/nvimage/nvtt/squish/ChangeLog
@ -0,0 +1,38 @@
+
+1.7
+* Fixed floating-point equality issue in clusterfit sort (x86 affected only)
+* Implemented proper SSE(2) floor function for 50% speedup on SSE builds 
+* The range fit implementation now uses the correct colour metric
+
+1.6
+* Fixed bug in CompressImage where masked pixels were not skipped over
+* DXT3 and DXT5 alpha compression now properly use the mask to ignore pixels
+* Fixed major DXT1 bug that can generate unexpected transparent pixels
+
+1.5
+* Added CompressMasked function to handle incomplete DXT blocks more cleanly
+* Added kWeightColourByAlpha flag for better quality images when alpha blending
+
+1.4
+* Fixed stack overflow in rangefit
+
+1.3
+* Worked around SSE floor implementation bug, proper fix needed!
+* This release has visual studio and makefile builds that work
+
+1.2
+* Added provably optimal single colour compressor
+* Added extra/squishgen.cpp that generates single colour lookup tables
+
+1.1
+* Fixed a DXT1 colour output bug
+* Changed argument order for Decompress function to match Compress
+* Added GetStorageRequirements function
+* Added CompressImage function
+* Added DecompressImage function
+* Moved squishtool.cpp to extra/squishpng.cpp
+* Added extra/squishtest.cpp
+
+1.0
+* Initial release
+
--- a/src/nvimage/nvtt/squish/Doxyfile
+++ b/src/nvimage/nvtt/squish/Doxyfile
@ -0,0 +1,223 @@
+# Doxyfile 1.4.6
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+PROJECT_NAME           = squish
+PROJECT_NUMBER         = 1.7
+OUTPUT_DIRECTORY       = docs 
+CREATE_SUBDIRS         = NO
+OUTPUT_LANGUAGE        = English
+USE_WINDOWS_ENCODING   = NO
+BRIEF_MEMBER_DESC      = YES
+REPEAT_BRIEF           = YES
+ABBREVIATE_BRIEF       = 
+ALWAYS_DETAILED_SEC    = NO
+INLINE_INHERITED_MEMB  = NO
+FULL_PATH_NAMES        = YES
+STRIP_FROM_PATH        = 
+STRIP_FROM_INC_PATH    = 
+SHORT_NAMES            = NO
+JAVADOC_AUTOBRIEF      = NO
+MULTILINE_CPP_IS_BRIEF = NO
+DETAILS_AT_TOP         = NO
+INHERIT_DOCS           = YES
+SEPARATE_MEMBER_PAGES  = NO
+TAB_SIZE               = 4
+ALIASES                = 
+OPTIMIZE_OUTPUT_FOR_C  = NO
+OPTIMIZE_OUTPUT_JAVA   = NO
+BUILTIN_STL_SUPPORT    = NO
+DISTRIBUTE_GROUP_DOC   = NO
+SUBGROUPING            = YES
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+EXTRACT_ALL            = YES
+EXTRACT_PRIVATE        = NO
+EXTRACT_STATIC         = NO
+EXTRACT_LOCAL_CLASSES  = YES
+EXTRACT_LOCAL_METHODS  = NO
+HIDE_UNDOC_MEMBERS     = NO
+HIDE_UNDOC_CLASSES     = NO
+HIDE_FRIEND_COMPOUNDS  = NO
+HIDE_IN_BODY_DOCS      = NO
+INTERNAL_DOCS          = NO
+CASE_SENSE_NAMES       = NO
+HIDE_SCOPE_NAMES       = NO
+SHOW_INCLUDE_FILES     = YES
+INLINE_INFO            = YES
+SORT_MEMBER_DOCS       = YES
+SORT_BRIEF_DOCS        = NO
+SORT_BY_SCOPE_NAME     = NO
+GENERATE_TODOLIST      = YES
+GENERATE_TESTLIST      = YES
+GENERATE_BUGLIST       = YES
+GENERATE_DEPRECATEDLIST= YES
+ENABLED_SECTIONS       = 
+MAX_INITIALIZER_LINES  = 30
+SHOW_USED_FILES        = YES
+SHOW_DIRECTORIES       = NO
+FILE_VERSION_FILTER    = 
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+QUIET                  = YES
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = YES
+WARN_IF_DOC_ERROR      = YES
+WARN_NO_PARAMDOC       = NO
+WARN_FORMAT            = "$file:$line: $text"
+WARN_LOGFILE           = 
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+INPUT                  = squish.h
+FILE_PATTERNS          = 
+RECURSIVE              = NO
+EXCLUDE                = 
+EXCLUDE_SYMLINKS       = NO
+EXCLUDE_PATTERNS       = 
+EXAMPLE_PATH           = 
+EXAMPLE_PATTERNS       = 
+EXAMPLE_RECURSIVE      = NO
+IMAGE_PATH             = 
+INPUT_FILTER           = 
+FILTER_PATTERNS        = 
+FILTER_SOURCE_FILES    = NO
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+SOURCE_BROWSER         = NO
+INLINE_SOURCES         = NO
+STRIP_CODE_COMMENTS    = YES
+REFERENCED_BY_RELATION = YES
+REFERENCES_RELATION    = YES
+USE_HTAGS              = NO
+VERBATIM_HEADERS       = YES
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+ALPHABETICAL_INDEX     = NO
+COLS_IN_ALPHA_INDEX    = 5
+IGNORE_PREFIX          = 
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+GENERATE_HTML          = YES
+HTML_OUTPUT            = html
+HTML_FILE_EXTENSION    = .html
+HTML_HEADER            = 
+HTML_FOOTER            = 
+HTML_STYLESHEET        = 
+HTML_ALIGN_MEMBERS     = YES
+GENERATE_HTMLHELP      = NO
+CHM_FILE               = 
+HHC_LOCATION           = 
+GENERATE_CHI           = NO
+BINARY_TOC             = NO
+TOC_EXPAND             = NO
+DISABLE_INDEX          = NO
+ENUM_VALUES_PER_LINE   = 4
+GENERATE_TREEVIEW      = NO
+TREEVIEW_WIDTH         = 250
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+GENERATE_LATEX         = NO
+LATEX_OUTPUT           = latex
+LATEX_CMD_NAME         = latex
+MAKEINDEX_CMD_NAME     = makeindex
+COMPACT_LATEX          = NO
+PAPER_TYPE             = a4wide
+EXTRA_PACKAGES         = 
+LATEX_HEADER           = 
+PDF_HYPERLINKS         = NO
+USE_PDFLATEX           = NO
+LATEX_BATCHMODE        = NO
+LATEX_HIDE_INDICES     = NO
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+GENERATE_RTF           = NO
+RTF_OUTPUT             = rtf
+COMPACT_RTF            = NO
+RTF_HYPERLINKS         = NO
+RTF_STYLESHEET_FILE    = 
+RTF_EXTENSIONS_FILE    = 
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+GENERATE_MAN           = NO
+MAN_OUTPUT             = man
+MAN_EXTENSION          = .3
+MAN_LINKS              = NO
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+GENERATE_XML           = NO
+XML_OUTPUT             = xml
+XML_SCHEMA             = 
+XML_DTD                = 
+XML_PROGRAMLISTING     = YES
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+GENERATE_AUTOGEN_DEF   = NO
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+GENERATE_PERLMOD       = NO
+PERLMOD_LATEX          = NO
+PERLMOD_PRETTY         = YES
+PERLMOD_MAKEVAR_PREFIX = 
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor   
+#---------------------------------------------------------------------------
+ENABLE_PREPROCESSING   = YES
+MACRO_EXPANSION        = NO
+EXPAND_ONLY_PREDEF     = NO
+SEARCH_INCLUDES        = YES
+INCLUDE_PATH           = 
+INCLUDE_FILE_PATTERNS  = 
+PREDEFINED             = 
+EXPAND_AS_DEFINED      = 
+SKIP_FUNCTION_MACROS   = YES
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references   
+#---------------------------------------------------------------------------
+TAGFILES               = 
+GENERATE_TAGFILE       = 
+ALLEXTERNALS           = NO
+EXTERNAL_GROUPS        = YES
+PERL_PATH              = /usr/bin/perl
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool   
+#---------------------------------------------------------------------------
+CLASS_DIAGRAMS         = YES
+HIDE_UNDOC_RELATIONS   = YES
+HAVE_DOT               = YES
+CLASS_GRAPH            = YES
+COLLABORATION_GRAPH    = YES
+GROUP_GRAPHS           = YES
+UML_LOOK               = NO
+TEMPLATE_RELATIONS     = NO
+INCLUDE_GRAPH          = YES
+INCLUDED_BY_GRAPH      = YES
+CALL_GRAPH             = NO
+GRAPHICAL_HIERARCHY    = YES
+DIRECTORY_GRAPH        = YES
+DOT_IMAGE_FORMAT       = png
+DOT_PATH               = /Applications/Graphviz.app/Contents/MacOS
+DOTFILE_DIRS           = 
+MAX_DOT_GRAPH_WIDTH    = 1024
+MAX_DOT_GRAPH_HEIGHT   = 1024
+MAX_DOT_GRAPH_DEPTH    = 0
+DOT_TRANSPARENT        = NO
+DOT_MULTI_TARGETS      = NO
+GENERATE_LEGEND        = YES
+DOT_CLEANUP            = YES
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine   
+#---------------------------------------------------------------------------
+SEARCHENGINE           = NO
--- a/src/nvimage/nvtt/squish/Makefile
+++ b/src/nvimage/nvtt/squish/Makefile
@ -0,0 +1,31 @@
+
+# Build settings (compiler flags, INSTALL_DIR, ...) come from the config file.
+include config
+
+SRC = alpha.cpp clusterfit.cpp colourblock.cpp colourfit.cpp colourset.cpp maths.cpp rangefit.cpp singlecolourfit.cpp squish.cpp
+
+OBJ = $(SRC:%.cpp=%.o)
+
+LIB = libsquish.a
+
+# These targets do not name files; declare them phony so that a stray file
+# called e.g. "clean" or "install" cannot mask them.
+.PHONY : all install uninstall clean
+
+all : $(LIB)
+
+install : $(LIB)
+	install squish.h $(INSTALL_DIR)/include
+	install $(LIB) $(INSTALL_DIR)/lib
+
+uninstall:
+	$(RM) $(INSTALL_DIR)/include/squish.h
+	$(RM) $(INSTALL_DIR)/lib/$(LIB)
+
+# Add only the objects newer than the archive ($?), then rebuild its index.
+$(LIB) : $(OBJ)
+	$(AR) cr $@ $?
+	ranlib $@
+
+%.o : %.cpp
+	$(CXX) $(CPPFLAGS) -I. $(CXXFLAGS) -o$@ -c $<
+
+clean :
+	$(RM) $(OBJ) $(LIB)
+
+
+
--- a/src/nvimage/nvtt/squish/README
+++ b/src/nvimage/nvtt/squish/README
@ -0,0 +1,35 @@
+LICENSE
+-------
+
+The squish library is distributed under the terms and conditions of the MIT
+license. This license is specified at the top of each source file and must be
+preserved in its entirety.
+
+BUILDING AND INSTALLING THE LIBRARY
+-----------------------------------
+
+If you are using Visual Studio 2003 or above under Windows then load the Visual
+Studio 2003 project in the vs7 folder. By default, the library is built using
+SSE optimisations. To change this, either change or remove SQUISH_USE_SSE=1
+from the preprocessor symbols.
+
+If you are using a Mac then load the Xcode 2.2 project in the distribution. By
+default, the library is built using Altivec optimisations. To change this
+either change or remove SQUISH_USE_ALTIVEC=1 from the preprocessor symbols. I
+guess I'll have to think about changing this for the new Intel Macs that are
+rolling out...
+
+If you are using unix then first edit the config file in the base directory of
+the distribution, enabling Altivec or SSE with the USE_ALTIVEC or USE_SSE
+variables, and editing the optimisation flags passed to the C++ compiler if
+necessary. Then make can be used to build the library, and make install (from
+the superuser account) can be used to install (into /usr/local by default).
+
+REPORTING BUGS OR FEATURE REQUESTS
+----------------------------------
+
+Feedback can be sent to Simon Brown (the developer) at si@sjbrown.co.uk
+
+New releases are announced on the squish library homepage at
+http://sjbrown.co.uk/?code=squish
+
--- a/src/nvimage/nvtt/squish/alpha.cpp
+++ b/src/nvimage/nvtt/squish/alpha.cpp
@ -0,0 +1,326 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "alpha.h"
+#include <algorithm>
+#include <climits>
+
+namespace squish {
+
+// Round a float to the nearest integer and clamp the result to [0, limit].
+// Rounding is done by adding 0.5 and truncating towards zero.
+static int FloatToInt( float a, int limit )
+{
+	int const rounded = ( int )( a + 0.5f );
+
+	// below the range
+	if( rounded < 0 )
+		return 0;
+
+	// above the range
+	if( rounded > limit )
+		return limit;
+
+	return rounded;
+}
+
+// Quantise the 16 alpha bytes of rgba (4 bytes per texel, alpha at offset 3)
+// to 4 bits each and pack them, two per byte, into the 8 bytes at block.
+// The earlier texel of each pair goes in the low nibble.
+void CompressAlphaDxt3( u8 const* rgba, void* block )
+{
+	u8* out = reinterpret_cast< u8* >( block );
+
+	for( int pair = 0; pair < 8; ++pair )
+	{
+		// two consecutive texels occupy 8 bytes
+		u8 const* texels = rgba + 8*pair;
+
+		// scale 0..255 down to 0..15 and round
+		int const lo = FloatToInt( ( float )texels[3] * ( 15.0f/255.0f ), 15 );
+		int const hi = FloatToInt( ( float )texels[7] * ( 15.0f/255.0f ), 15 );
+
+		out[pair] = ( u8 )( lo | ( hi << 4 ) );
+	}
+}
+
+// Expand the 8 packed bytes at block into 16 alpha bytes written to rgba
+// (4 bytes per texel, alpha at offset 3). Each 4-bit value is widened to 8
+// bits by replicating the nibble; the low nibble belongs to the earlier texel.
+void DecompressAlphaDxt3( u8* rgba, void const* block )
+{
+	u8 const* in = reinterpret_cast< u8 const* >( block );
+
+	for( int pair = 0; pair < 8; ++pair )
+	{
+		// fetch the byte holding two 4-bit alphas
+		u8 const packed = in[pair];
+		u8 const lo = packed & 0x0f;
+		u8 const hi = packed & 0xf0;
+
+		// replicate each nibble into both halves of the output byte
+		u8* texels = rgba + 8*pair;
+		texels[3] = lo | ( lo << 4 );
+		texels[7] = hi | ( hi >> 4 );
+	}
+}
+
+// Ensure that max - min is at least steps: first raise max (capped at 255),
+// then, if the range is still too narrow, lower min (floored at 0).
+static void FixRange( int& min, int& max, int steps )
+{
+	if( max - min < steps )
+	{
+		int const raised = min + steps;
+		max = ( raised < 255 ) ? raised : 255;
+	}
+
+	if( max - min < steps )
+	{
+		int const lowered = max - steps;
+		min = ( lowered > 0 ) ? lowered : 0;
+	}
+}
+
+// Fit each of the 16 alpha values to the nearest entry of an 8-entry codebook.
+//
+// rgba    - 16 texels, 4 bytes each; only the alpha byte (offset 3) is read
+// codes   - the 8 candidate alpha values
+// indices - receives, for each texel, the index of the closest code
+//           (the lowest index wins ties)
+//
+// Returns the sum over all texels of the squared difference between the
+// alpha value and its chosen code. INT_MAX requires <climits>.
+static int FitCodes( u8 const* rgba, u8 const* codes, u8* indices )
+{
+	int err = 0;
+	for( int i = 0; i < 16; ++i )
+	{
+		// find the least error and corresponding index
+		int const value = rgba[4*i + 3];
+		int least = INT_MAX;
+		int index = 0;
+		for( int j = 0; j < 8; ++j )
+		{
+			// squared error from this code (both operands promote to int)
+			int const delta = value - codes[j];
+			int const dist = delta*delta;
+
+			// strict comparison keeps the first of equally good codes
+			if( dist < least )
+			{
+				least = dist;
+				index = j;
+			}
+		}
+
+		// save this index and accumulate the error
+		indices[i] = ( u8 )index;
+		err += least;
+	}
+
+	// return the total error
+	return err;
+}
+
+// Write an 8-byte alpha block: the two endpoint bytes followed by the 16
+// 3-bit indices, packed as two groups of 8 indices into 3 bytes each
+// (least significant byte first).
+static void WriteAlphaBlock( int alpha0, int alpha1, u8 const* indices, void* block )
+{
+	u8* out = reinterpret_cast< u8* >( block );
+
+	// endpoints
+	out[0] = ( u8 )alpha0;
+	out[1] = ( u8 )alpha1;
+
+	for( int group = 0; group < 2; ++group )
+	{
+		u8 const* src = indices + 8*group;
+		u8* dest = out + 2 + 3*group;
+
+		// gather 8 3-bit values into 24 bits
+		int packed = 0;
+		for( int k = 0; k < 8; ++k )
+			packed |= ( ( int )src[k] << 3*k );
+
+		// emit the 24 bits as 3 bytes
+		dest[0] = ( u8 )( packed & 0xff );
+		dest[1] = ( u8 )( ( packed >> 8 ) & 0xff );
+		dest[2] = ( u8 )( ( packed >> 16 ) & 0xff );
+	}
+}
+
+// Write a block in 5-alpha mode, which stores the endpoints with
+// alpha0 <= alpha1. If the endpoints arrive the other way round they are
+// exchanged and the indices are remapped to match: 0 <-> 1, the interpolated
+// codes 2..5 are mirrored, and 6 and 7 are kept.
+static void WriteAlphaBlock5( int alpha0, int alpha1, u8 const* indices, void* block )
+{
+	// already in the required order
+	if( !( alpha0 > alpha1 ) )
+	{
+		WriteAlphaBlock( alpha0, alpha1, indices, block );
+		return;
+	}
+
+	// remap the indices for the exchanged endpoints
+	u8 swapped[16];
+	for( int i = 0; i < 16; ++i )
+	{
+		u8 const index = indices[i];
+		if( index <= 1 )
+			swapped[i] = ( u8 )( 1 - index );
+		else if( index <= 5 )
+			swapped[i] = 7 - index;
+		else
+			swapped[i] = index;
+	}
+
+	WriteAlphaBlock( alpha1, alpha0, swapped, block );
+}
+
+// Write a block in 7-alpha mode, which stores the endpoints with
+// alpha0 > alpha1. If alpha0 < alpha1 the endpoints are exchanged and the
+// indices are remapped to match: 0 <-> 1 and the interpolated codes 2..7
+// are mirrored.
+static void WriteAlphaBlock7( int alpha0, int alpha1, u8 const* indices, void* block )
+{
+	// no exchange needed
+	if( !( alpha0 < alpha1 ) )
+	{
+		WriteAlphaBlock( alpha0, alpha1, indices, block );
+		return;
+	}
+
+	// remap the indices for the exchanged endpoints
+	u8 swapped[16];
+	for( int i = 0; i < 16; ++i )
+	{
+		u8 const index = indices[i];
+		if( index <= 1 )
+			swapped[i] = ( u8 )( 1 - index );
+		else
+			swapped[i] = 9 - index;
+	}
+
+	WriteAlphaBlock( alpha1, alpha0, swapped, block );
+}
+
+// Compress the 16 alpha bytes of rgba (4 bytes per texel, alpha at offset 3)
+// into the 8-byte block. Both the 5-alpha codebook (4 interpolated codes plus
+// explicit 0 and 255) and the 7-alpha codebook (6 interpolated codes) are
+// tried; the one with the lower total squared error is written, with the
+// 5-alpha form winning ties.
+void CompressAlphaDxt5( u8 const* rgba, void* block )
+{
+	// the 7-alpha range covers every value; the 5-alpha range ignores 0 and
+	// 255 since that codebook represents them explicitly
+	int min5 = 255, max5 = 0;
+	int min7 = 255, max7 = 0;
+	for( int texel = 0; texel < 16; ++texel )
+	{
+		int const alpha = rgba[4*texel + 3];
+
+		min7 = ( alpha < min7 ) ? alpha : min7;
+		max7 = ( alpha > max7 ) ? alpha : max7;
+
+		if( alpha != 0 && alpha < min5 )
+			min5 = alpha;
+		if( alpha != 255 && alpha > max5 )
+			max5 = alpha;
+	}
+
+	// collapse an empty range (nothing qualified) to a single point
+	if( min5 > max5 )
+		min5 = max5;
+	if( min7 > max7 )
+		min7 = max7;
+
+	// widen each range to at least its number of steps
+	FixRange( min5, max5, 5 );
+	FixRange( min7, max7, 7 );
+
+	// 5-alpha codebook: endpoints, 4 interpolants, then 0 and 255
+	u8 codes5[8];
+	codes5[0] = ( u8 )min5;
+	codes5[1] = ( u8 )max5;
+	for( int step = 1; step <= 4; ++step )
+		codes5[step + 1] = ( u8 )( ( ( 5 - step )*min5 + step*max5 )/5 );
+	codes5[6] = 0;
+	codes5[7] = 255;
+
+	// 7-alpha codebook: endpoints and 6 interpolants
+	u8 codes7[8];
+	codes7[0] = ( u8 )min7;
+	codes7[1] = ( u8 )max7;
+	for( int step = 1; step <= 6; ++step )
+		codes7[step + 1] = ( u8 )( ( ( 7 - step )*min7 + step*max7 )/7 );
+
+	// fit the data to both codebooks
+	u8 indices5[16];
+	u8 indices7[16];
+	int const err5 = FitCodes( rgba, codes5, indices5 );
+	int const err7 = FitCodes( rgba, codes7, indices7 );
+
+	// write whichever fits better
+	if( err7 < err5 )
+		WriteAlphaBlock7( min7, max7, indices7, block );
+	else
+		WriteAlphaBlock5( min5, max5, indices5, block );
+}
+
+// Decode the 8-byte alpha block into the alpha bytes of 16 texels in rgba
+// (4 bytes per texel, alpha at offset 3). The order of the two endpoint bytes
+// selects the codebook: alpha0 <= alpha1 gives 4 interpolated codes plus
+// explicit 0 and 255, otherwise 6 interpolated codes.
+void DecompressAlphaDxt5( u8* rgba, void const* block )
+{
+	u8 const* in = reinterpret_cast< u8 const* >( block );
+
+	// read the endpoints and pick the mode
+	int const alpha0 = in[0];
+	int const alpha1 = in[1];
+	bool const fiveAlpha = ( alpha0 <= alpha1 );
+	int const steps = fiveAlpha ? 5 : 7;
+
+	// rebuild the codebook
+	u8 codes[8];
+	codes[0] = ( u8 )alpha0;
+	codes[1] = ( u8 )alpha1;
+	for( int k = 1; k < steps; ++k )
+		codes[k + 1] = ( u8 )( ( ( steps - k )*alpha0 + k*alpha1 )/steps );
+	if( fiveAlpha )
+	{
+		codes[6] = 0;
+		codes[7] = 255;
+	}
+
+	// unpack two groups of 3 bytes into 8 3-bit indices each
+	u8 indices[16];
+	for( int group = 0; group < 2; ++group )
+	{
+		u8 const* src = in + 2 + 3*group;
+		int const packed = src[0] | ( src[1] << 8 ) | ( src[2] << 16 );
+		for( int k = 0; k < 8; ++k )
+			indices[8*group + k] = ( u8 )( ( packed >> 3*k ) & 0x7 );
+	}
+
+	// look up the alpha for every texel
+	for( int texel = 0; texel < 16; ++texel )
+		rgba[4*texel + 3] = codes[indices[texel]];
+}
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/alpha.h
+++ b/src/nvimage/nvtt/squish/alpha.h
@ -0,0 +1,41 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_ALPHA_H
+#define SQUISH_ALPHA_H
+
+#include <squish.h>
+
+namespace squish {
+
+// Compress the alpha bytes of 16 RGBA texels (4 bytes per texel, alpha last)
+// into an 8-byte alpha block: Dxt3 stores 4 bits per texel, Dxt5 stores two
+// endpoint bytes followed by 3-bit indices.
+void CompressAlphaDxt3( u8 const* rgba, void* block );
+void CompressAlphaDxt5( u8 const* rgba, void* block );
+
+// Decode an 8-byte alpha block back into the alpha bytes of 16 RGBA texels.
+// Only the alpha byte of each texel in rgba is written.
+void DecompressAlphaDxt3( u8* rgba, void const* block );
+void DecompressAlphaDxt5( u8* rgba, void const* block );
+
+} // namespace squish
+
+#endif // ndef SQUISH_ALPHA_H
--- a/src/nvimage/nvtt/squish/clusterfit.cpp
+++ b/src/nvimage/nvtt/squish/clusterfit.cpp
@ -0,0 +1,499 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "clusterfit.h"
+#include "colourset.h"
+#include "colourblock.h"
+#include <cfloat>
+
+namespace squish {
+
+// Construct a cluster fitter for the given colour set.
+//
+// Projects every point onto the principal axis of the weighted covariance
+// matrix, orders the points along that axis (recording the permutation in
+// m_order), and caches the reordered points (m_unweighted), their weights
+// (m_weights), the weighted points (m_weighted) and the sum of the squared
+// weighted points (m_xxsum).
+//
+// m_metric is NOT initialised here (the block that did so is commented out);
+// it is assigned by setMetric().
+ClusterFit::ClusterFit( ColourSet const* colours, int flags ) 
+  : ColourFit( colours, flags )
+{
+	// initialise the best error
+#if SQUISH_USE_SIMD
+	m_besterror = VEC4_CONST( FLT_MAX );
+#else
+	m_besterror = FLT_MAX;
+#endif
+
+/*	// initialise the metric
+	bool perceptual = ( ( m_flags & kColourMetricPerceptual ) != 0 );
+#if SQUISH_USE_SIMD
+	if( perceptual )
+		m_metric = Vec4( 0.2126f, 0.7152f, 0.0722f, 0.0f );
+	else
+		m_metric = VEC4_CONST( 1.0f );	
+#else
+	if( perceptual )
+		m_metric = Vec3( 0.2126f, 0.7152f, 0.0722f );
+	else
+		m_metric = Vec3( 1.0f );
+#endif
+*/
+	// cache some values
+	int const count = m_colours->GetCount();
+	Vec3 const* values = m_colours->GetPoints();
+	
+	// get the covariance matrix
+	Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights() );
+	
+	// compute the principle component
+	Vec3 principle = ComputePrincipleComponent( covariance );
+
+	// build the list of values
+	// NOTE(review): dps holds 16 entries, so this assumes count <= 16 -- confirm against ColourSet
+	float dps[16];
+	for( int i = 0; i < count; ++i )
+	{
+		dps[i] = Dot( values[i], principle );
+		m_order[i] = i;
+	}
+	
+	// stable sort
+	// (insertion sort on the projections; the strict < leaves equal entries in their original order)
+	for( int i = 0; i < count; ++i )
+	{
+		for( int j = i; j > 0 && dps[j] < dps[j - 1]; --j )
+		{
+			std::swap( dps[j], dps[j - 1] );
+			std::swap( m_order[j], m_order[j - 1] );
+		}
+	}
+
+	// weight all the points
+#if SQUISH_USE_SIMD
+	Vec4 const* unweighted = m_colours->GetPointsSimd();
+	Vec4 const* weights = m_colours->GetWeightsSimd();
+	m_xxsum = VEC4_CONST( 0.0f );
+#else
+	Vec3 const* unweighted = m_colours->GetPoints();
+	float const* weights = m_colours->GetWeights();
+	m_xxsum = Vec3( 0.0f );
+#endif
+	for( int i = 0; i < count; ++i )
+	{
+		// p is the original position of the i-th point in sorted order
+		int p = m_order[i];
+		m_unweighted[i] = unweighted[p];
+		m_weights[i] = weights[p];
+		m_weighted[i] = weights[p]*unweighted[p];
+		m_xxsum += m_weighted[i]*m_weighted[i];
+	}
+}
+
+
+// Set the per-channel colour metric stored in m_metric.
+// In the SIMD build the fourth component is set to zero.
+void ClusterFit::setMetric(float r, float g, float b)
+{
+#if SQUISH_USE_SIMD
+	m_metric = Vec4(r, g, b, 0);
+#else
+	m_metric = Vec3(r, g, b);
+#endif
+}
+
+// Return the best error recorded so far in m_besterror.
+// In the SIMD build the error is held in a Vec4 and its X component is returned.
+float ClusterFit::bestError() const
+{
+#if SQUISH_USE_SIMD
+	return m_besterror.GetVec3().X();
+#else
+	return m_besterror;
+#endif
+}
+
+
+// Search every split of the ordered points into three consecutive clusters
+// (at the start endpoint, halfway between, at the end endpoint), solve a least
+// squares problem for the endpoints of each split, and keep the best one.
+// The block is written with WriteColourBlock3 only if the best split beats
+// the error already recorded in m_besterror, which is then updated.
+//
+// The loops run downwards so that each iteration only has to update the
+// m_alpha/m_beta/indices entries that change relative to the previous one;
+// the statement order is therefore significant.
+void ClusterFit::Compress3( void* block )
+{
+	// declare variables
+	int const count = m_colours->GetCount();
+#if SQUISH_USE_SIMD
+	Vec4 beststart = VEC4_CONST( 0.0f );
+	Vec4 bestend = VEC4_CONST( 0.0f );
+	Vec4 besterror = VEC4_CONST( FLT_MAX );
+	Vec4 const half = VEC4_CONST( 0.5f );
+	Vec4 const zero = VEC4_CONST( 0.0f );
+#else
+	Vec3 beststart( 0.0f );
+	Vec3 bestend( 0.0f );
+	float besterror = FLT_MAX;
+	float const half = 0.5f;
+	float const zero = 0.0f;
+#endif	
+
+	// check all possible clusters for this total order
+	u8 indices[16];
+	u8 bestindices[16];
+	
+	// first cluster [0,i) is at the start
+	for( int m = 0; m < count; ++m )
+	{
+		indices[m] = 0;
+		m_alpha[m] = m_weights[m];
+		m_beta[m] = zero;
+	}
+	for( int i = count; i >= 0; --i )
+	{
+		// second cluster [i,j) is half along
+		for( int m = i; m < count; ++m )
+		{
+			indices[m] = 2;
+			m_alpha[m] = m_beta[m] = half*m_weights[m];
+		}		
+		for( int j = count; j > i; --j )
+		{
+			// last cluster [j,k) is at the end
+			if( j < count )
+			{
+				indices[j] = 1;
+				m_alpha[j] = zero;
+				m_beta[j] = m_weights[j];
+			}		
+			
+			// solve a least squares problem to place the endpoints
+#if SQUISH_USE_SIMD
+			Vec4 start, end;
+			Vec4 error = SolveLeastSquares( start, end );
+#else
+			Vec3 start, end;
+			float error = SolveLeastSquares( start, end );
+#endif
+
+			// keep the solution if it wins
+#if SQUISH_USE_SIMD
+			if( CompareAnyLessThan( error, besterror ) )
+#else
+			if( error < besterror )
+#endif
+			{
+				beststart = start;
+				bestend = end;
+				// NOTE(review): only indices[0..count) are ever written above, so for
+				// count < 16 this also copies unset entries; they are not read below.
+				for( int m = 0; m < 16; ++m )	// TODO: make this faster?
+					bestindices[m] = indices[m];
+				besterror = error;
+			}
+		}
+	}
+	
+	// save the block if necessary
+#if SQUISH_USE_SIMD
+	if( CompareAnyLessThan( besterror, m_besterror ) )
+#else
+	if( besterror < m_besterror )
+#endif
+	{
+		// remap the indices
+		// (undo the sort: position i in sorted order came from m_order[i])
+		u8 unordered[16];
+		for( int i = 0; i < count; ++i )
+			unordered[m_order[i]] = bestindices[i];
+		m_colours->RemapIndices( unordered, bestindices );
+		
+		// save the block
+#if SQUISH_USE_SIMD
+		WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+#else
+		WriteColourBlock3( beststart, bestend, bestindices, block );
+#endif
+
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+//static int run = 0;
+//static bool debug = false;
+
+// Search every split of the ordered points into four consecutive clusters
+// (at the start endpoint, one third along, two thirds along, at the end
+// endpoint), solve a least squares problem for the endpoints of each split,
+// and keep the best one. The search starts from the error already recorded in
+// m_besterror, so a split is only kept if it improves on it; in that case the
+// block is written with WriteColourBlock4 and m_besterror is updated.
+//
+// The loops run downwards so that each iteration only has to update the
+// m_alpha/m_beta/indices entries that change relative to the previous one;
+// the statement order is therefore significant.
+void ClusterFit::Compress4( void* block )
+{
+	//debug = (run == 1);
+	//run++;
+
+	// declare variables
+	int const count = m_colours->GetCount();
+#if SQUISH_USE_SIMD
+	Vec4 beststart = VEC4_CONST( 0.0f );
+	Vec4 bestend = VEC4_CONST( 0.0f );
+	Vec4 besterror = m_besterror;
+	Vec4 const twothirds = VEC4_CONST( 2.0f/3.0f );
+	Vec4 const onethird = VEC4_CONST( 1.0f/3.0f );
+	Vec4 const zero = VEC4_CONST( 0.0f );
+#else
+	Vec3 beststart( 0.0f );
+	Vec3 bestend( 0.0f );
+	float besterror = m_besterror;
+	float const twothirds = 2.0f/3.0f;
+	float const onethird = 1.0f/3.0f;
+	float const zero = 0.0f;
+#endif
+
+	// check all possible clusters for this total order
+	u8 indices[16];
+	u8 bestindices[16];
+	
+	// first cluster [0,i) is at the start
+	for( int m = 0; m < count; ++m )
+	{
+		indices[m] = 0;
+		m_alpha[m] = m_weights[m];
+		m_beta[m] = zero;
+	}
+	for( int i = count; i >= 0; --i )
+	{
+		// second cluster [i,j) is one third along
+		for( int m = i; m < count; ++m )
+		{
+			indices[m] = 2;
+			m_alpha[m] = twothirds*m_weights[m];
+			m_beta[m] = onethird*m_weights[m];
+		}		
+		for( int j = count; j >= i; --j )
+		{
+			// third cluster [j,k) is two thirds along
+			for( int m = j; m < count; ++m )
+			{
+				indices[m] = 3;
+				m_alpha[m] = onethird*m_weights[m];
+				m_beta[m] = twothirds*m_weights[m];
+			}		
+			for( int k = count; k >= j; --k )
+			{
+				// skip the split where j == k == 0 (and hence i == 0)
+				if (j + k == 0) continue;
+				
+				// last cluster [k,n) is at the end
+				if( k < count )
+				{
+					indices[k] = 1;
+					m_alpha[k] = zero;
+					m_beta[k] = m_weights[k];
+				}
+
+				/*unsigned int permutation = 0;
+				for(int p = 0; p < 16; p++) {
+					permutation |= indices[p] << (p * 2);
+				}
+				if (debug) printf("%X:\t", permutation);
+
+				if (debug && permutation == 0x55FFFFAA) __debugbreak();
+				*/
+
+				// solve a least squares problem to place the endpoints
+#if SQUISH_USE_SIMD
+				Vec4 start, end;
+				Vec4 error = SolveLeastSquares( start, end );
+#else
+				Vec3 start, end;
+				float error = SolveLeastSquares( start, end );
+#endif
+
+				// keep the solution if it wins
+#if SQUISH_USE_SIMD
+				if( CompareAnyLessThan( error, besterror ) )
+#else
+				if( error < besterror )
+#endif
+				{
+					beststart = start;
+					bestend = end;
+					// NOTE(review): only indices[0..count) are ever written above, so for
+					// count < 16 this also copies unset entries; they are not read below.
+					for( int m = 0; m < 16; ++m )	// TODO: make this faster?
+						bestindices[m] = indices[m];	
+					besterror = error;
+				}
+			}
+		}
+	}
+
+	// save the block if necessary
+#if SQUISH_USE_SIMD
+	if( CompareAnyLessThan( besterror, m_besterror ) )
+#else
+	if( besterror < m_besterror )
+#endif
+	{
+		// remap the indices
+		// (undo the sort: position i in sorted order came from m_order[i])
+		u8 unordered[16];
+		for( int i = 0; i < count; ++i )
+			unordered[m_order[i]] = bestindices[i];
+		m_colours->RemapIndices( unordered, bestindices );
+		
+		// save the block
+#if SQUISH_USE_SIMD
+		WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+#else
+		WriteColourBlock4( beststart, bestend, bestindices, block );
+#endif
+
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+#if SQUISH_USE_SIMD
+// Solves the weighted least squares problem for the current cluster
+// assignment (SIMD build).
+//
+// Reads m_alpha, m_beta and m_weighted for the first m_colours->GetCount()
+// points, solves for the two endpoints, clamps them to [0, 1], snaps them to
+// the 5:6:5 grid and evaluates the resulting error.
+//
+// start, end: receive the fitted endpoints a and b.
+// Returns the sum of the SplatX, SplatY and SplatZ values of the per-channel
+// error after it has been multiplied by m_metric.
+Vec4 ClusterFit::SolveLeastSquares( Vec4& start, Vec4& end ) const
+{
+	// accumulate all the quantities we need
+	int const count = m_colours->GetCount();
+	Vec4 alpha2_sum = VEC4_CONST( 0.0f );
+	Vec4 beta2_sum = VEC4_CONST( 0.0f );
+	Vec4 alphabeta_sum = VEC4_CONST( 0.0f );
+	Vec4 alphax_sum = VEC4_CONST( 0.0f );
+	Vec4 betax_sum = VEC4_CONST( 0.0f );
+	for( int i = 0; i < count; ++i )
+	{
+		Vec4 alpha = m_alpha[i];
+		Vec4 beta = m_beta[i];
+		Vec4 x = m_weighted[i];
+	
+		alpha2_sum = MultiplyAdd( alpha, alpha, alpha2_sum );
+		beta2_sum = MultiplyAdd( beta, beta, beta2_sum );
+		alphabeta_sum = MultiplyAdd( alpha, beta, alphabeta_sum );
+		alphax_sum = MultiplyAdd( alpha, x, alphax_sum );
+		betax_sum = MultiplyAdd( beta, x, betax_sum );	
+	}
+
+	// select the results
+	Vec4 const zero = VEC4_CONST( 0.0f );
+	Vec4 beta2_sum_zero = CompareEqual( beta2_sum, zero );
+	Vec4 alpha2_sum_zero = CompareEqual( alpha2_sum, zero );
+	
+	// solutions for the degenerate cases where only one endpoint is used
+	Vec4 a1 = alphax_sum*Reciprocal( alpha2_sum );
+	Vec4 b1 = betax_sum*Reciprocal( beta2_sum );
+	
+	// general solution of the 2x2 system; this appears to mirror the
+	// determinant form used by the non-SIMD build of this function
+	Vec4 factor = Reciprocal( NegativeMultiplySubtract( 
+		alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum 
+	) );
+	Vec4 a2 = NegativeMultiplySubtract( 
+		betax_sum, alphabeta_sum, alphax_sum*beta2_sum
+	)*factor;
+	Vec4 b2 = NegativeMultiplySubtract(
+		alphax_sum, alphabeta_sum, betax_sum*alpha2_sum
+	)*factor;
+	
+	// pick per lane between the general and degenerate solutions
+	// (presumably Select( x, y, mask ) yields y where mask is set -- confirm in simd.h)
+	Vec4 a = Select( Select( a2, a1, beta2_sum_zero ), zero, alpha2_sum_zero );
+	Vec4 b = Select( Select( b2, b1, alpha2_sum_zero ), zero, beta2_sum_zero );
+
+	// clamp the output to [0, 1]
+	Vec4 const one = VEC4_CONST( 1.0f );
+	Vec4 const half = VEC4_CONST( 0.5f );
+	a = Min( one, Max( zero, a ) );
+	b = Min( one, Max( zero, b ) );
+
+	// clamp to the grid
+	Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+//	Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
+	Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f ); // IC: use approximate grid fitting.
+	a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp;
+	b = Truncate( MultiplyAdd( grid, b, half ) )*gridrcp;
+
+	// compute the error
+	// e4 = a*a*alpha2_sum + b*b*beta2_sum + m_xxsum
+	//      + 2*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum )
+	Vec4 const two = VEC4_CONST( 2.0 );
+	Vec4 e1 = MultiplyAdd( b*b, beta2_sum, m_xxsum );
+	Vec4 e2 = MultiplyAdd( a, alphax_sum, b*betax_sum );
+	Vec4 e3 = MultiplyAdd( a*a, alpha2_sum, e1 );
+	Vec4 e4 = MultiplyAdd( a*b*alphabeta_sum - e2, two, e3 );
+
+	// apply the metric to the error term
+	Vec4 e5 = e4*m_metric;
+	Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();
+	
+	// save the start and end
+	start = a;
+	end = b;
+	return error;
+}
+#else
+// Solves the weighted least squares problem for the current cluster
+// assignment (non-SIMD build).
+//
+// Reads m_alpha, m_beta and m_weighted for the first m_colours->GetCount()
+// points, solves for the two endpoints, clamps them to [0, 1], snaps them to
+// the 5:6:5 grid and evaluates the resulting error.
+//
+// start, end: receive the fitted endpoints a and b.
+// Returns the dot product of the per-channel error with m_metric. The
+// m_xxsum term is commented out of the error below, so the value is offset
+// by a quantity that does not depend on the endpoints.
+float ClusterFit::SolveLeastSquares( Vec3& start, Vec3& end ) const
+{
+	// accumulate all the quantities we need
+	int const count = m_colours->GetCount();
+	float alpha2_sum = 0.0f;
+	float beta2_sum = 0.0f;
+	float alphabeta_sum = 0.0f;
+	Vec3 alphax_sum( 0.0f );
+	Vec3 betax_sum( 0.0f );	
+	for( int i = 0; i < count; ++i )
+	{
+		float alpha = m_alpha[i];
+		float beta = m_beta[i];
+		Vec3 const& x = m_weighted[i];
+		
+		alpha2_sum += alpha*alpha;
+		beta2_sum += beta*beta;
+		alphabeta_sum += alpha*beta;
+		alphax_sum += alpha*x;
+		betax_sum += beta*x;
+	}
+
+	//if (debug) printf("%f %f %f", alpha2_sum, beta2_sum, alphabeta_sum);
+
+	// zero where non-determinate
+	Vec3 a, b;
+	if( beta2_sum == 0.0f )
+	{
+		// no point contributes to the end colour: solve for the start only
+		a = alphax_sum/alpha2_sum;
+		b = Vec3( 0.0f );
+	}
+	else if( alpha2_sum == 0.0f )
+	{
+		// no point contributes to the start colour: solve for the end only
+		a = Vec3( 0.0f );
+		b = betax_sum/beta2_sum;
+	}
+	else
+	{
+		// general case: invert the 2x2 normal equations
+		// NOTE(review): the denominator is zero when alpha and beta are
+		// proportional for every point; confirm callers tolerate a
+		// non-finite result in that case
+		float factor = 1.0f/( alpha2_sum*beta2_sum - alphabeta_sum*alphabeta_sum );
+		
+		a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor;
+		b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
+	}
+	
+	// clamp the output to [0, 1]
+	Vec3 const one( 1.0f );
+	Vec3 const zero( 0.0f );
+	a = Min( one, Max( zero, a ) );
+	b = Min( one, Max( zero, b ) );
+
+	// clamp to the grid
+	Vec3 const grid( 31.0f, 63.0f, 31.0f );
+	//Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
+	Vec3 const gridrcp(0.03227752766457f, 0.01583151765563f, 0.03227752766457f); // IC: use approximate grid fitting.
+	Vec3 const half( 0.5f );
+	a = Floor( grid*a + half )*gridrcp;
+	b = Floor( grid*b + half )*gridrcp;
+
+	// compute the error
+	Vec3 e1 = a*a*alpha2_sum + b*b*beta2_sum /*+ m_xxsum*/
+		+ 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+
+	// apply the metric to the error term
+	float error = Dot( e1, m_metric );
+	
+	//if (debug) printf(" - %f\n", error);
+
+	// save the start and end
+	start = a;
+	end = b;
+	return error;
+}
+#endif
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/clusterfit.h
+++ b/src/nvimage/nvtt/squish/clusterfit.h
@ -0,0 +1,79 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_CLUSTERFIT_H
+#define SQUISH_CLUSTERFIT_H
+
+#include "squish.h"
+#include "maths.h"
+#include "simd.h"
+#include "colourfit.h"
+
+namespace squish {
+
+// Colour fitter derived from ColourFit that provides the Compress3 and
+// Compress4 implementations and a least squares endpoint solver. The member
+// types switch between Vec4 and Vec3/float depending on SQUISH_USE_SIMD.
+class ClusterFit : public ColourFit
+{
+public:
+	ClusterFit( ColourSet const* colours, int flags );
+	
+	// presumably sets the per-channel weights stored in m_metric -- confirm in clusterfit.cpp
+	void setMetric(float r, float g, float b);
+	// presumably reports m_besterror -- confirm in clusterfit.cpp
+	float bestError() const;
+
+private:
+	// implementations of the ColourFit pure virtual hooks
+	virtual void Compress3( void* block );
+	virtual void Compress4( void* block );
+
+	void Reorder( Vec3::Arg principle );
+
+	Vec3 m_principle;
+#if SQUISH_USE_SIMD
+	// fits endpoints for the current m_alpha/m_beta assignment and returns the error
+	Vec4 SolveLeastSquares( Vec4& start, Vec4& end ) const;
+
+	Vec4 m_weighted[16];	// per-point values read as x by SolveLeastSquares
+	Vec4 m_unweighted[16];
+	Vec4 m_weights[16];	// per-point weights used to build m_alpha and m_beta
+	Vec4 m_metric;		// multiplied into the error term
+	Vec4 m_alpha[16];	// per-point coefficient of the start endpoint
+	Vec4 m_beta[16];	// per-point coefficient of the end endpoint
+	Vec4 m_xxsum;		// added to the error term by SolveLeastSquares
+	Vec4 m_besterror;	// lowest error of any block saved so far
+#else
+	// fits endpoints for the current m_alpha/m_beta assignment and returns the error
+	float SolveLeastSquares( Vec3& start, Vec3& end ) const;
+
+	Vec3 m_weighted[16];	// per-point values read as x by SolveLeastSquares
+	Vec3 m_unweighted[16];
+	float m_weights[16];	// per-point weights used to build m_alpha and m_beta
+	Vec3 m_metric;		// dotted with the error term
+	float m_alpha[16];	// per-point coefficient of the start endpoint
+	float m_beta[16];	// per-point coefficient of the end endpoint
+	Vec3 m_xxsum;
+	float m_besterror;	// lowest error of any block saved so far
+#endif
+	// m_order[i] is the colour set index of the i-th point in fitting order
+	int m_order[16];
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_CLUSTERFIT_H
--- a/src/nvimage/nvtt/squish/colourblock.cpp
+++ b/src/nvimage/nvtt/squish/colourblock.cpp
@ -0,0 +1,278 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "colourblock.h"
+
+namespace squish {
+
+// Rounds a to the nearest integer and clamps the result to [0, limit].
+//
+// a:     value to convert; NaN and non-positive values give 0.
+// limit: largest value that may be returned; expected to be non-negative.
+// Returns the rounded value, or limit when a is at or above limit.
+static int FloatToInt( float a, int limit )
+{
+	// resolve NaN and out-of-range inputs in the float domain, because
+	// converting a float that does not fit in an int is undefined behaviour
+	if( !( a > 0.0f ) )
+		return 0;
+	if( a >= ( float )limit )
+		return limit;
+
+	// use ANSI round-to-zero behaviour to get round-to-nearest
+	int i = ( int )( a + 0.5f );
+
+	// clamp to the limit
+	if( i < 0 )
+		i = 0;
+	else if( i > limit )
+		i = limit; 
+
+	// done
+	return i;
+}
+
+// Quantises a colour to a packed 5:6:5 value: X scaled by 31 lands in bits
+// 11-15, Y scaled by 63 in bits 5-10 and Z scaled by 31 in bits 0-4.
+static int FloatTo565( Vec3::Arg colour )
+{
+	// quantise each component to its own bit depth
+	int const xbits = FloatToInt( 31.0f*colour.X(), 31 );
+	int const ybits = FloatToInt( 63.0f*colour.Y(), 63 );
+	int const zbits = FloatToInt( 31.0f*colour.Z(), 31 );
+
+	// merge the three fields into one value
+	int packed = xbits << 11;
+	packed |= ybits << 5;
+	packed |= zbits;
+	return packed;
+}
+
+// Writes an 8 byte colour block: the two 16-bit endpoints a and b, low byte
+// first, followed by the sixteen 2-bit indices packed four to a byte.
+static void WriteColourBlock( int a, int b, u8* indices, void* block )
+{
+	u8* out = ( u8* )block;
+
+	// endpoints, little-endian
+	out[0] = ( u8 )( a & 0xff );
+	out[1] = ( u8 )( a >> 8 );
+	out[2] = ( u8 )( b & 0xff );
+	out[3] = ( u8 )( b >> 8 );
+
+	// indices: the first index of each group of four takes the lowest two bits
+	for( int row = 0; row < 4; ++row )
+	{
+		int packed = 0;
+		for( int col = 0; col < 4; ++col )
+			packed |= indices[4*row + col] << ( 2*col );
+		out[4 + row] = ( u8 )packed;
+	}
+}
+
+// Writes a colour block whose first packed endpoint is less than or equal to
+// the second. When the endpoints have to be exchanged to achieve that,
+// indices 0 and 1 are exchanged too; all other indices pass through.
+void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block )
+{
+	// quantise the endpoints
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
+
+	// put the endpoints in ascending order
+	bool const exchanged = ( a > b );
+	if( exchanged )
+		std::swap( a, b );
+
+	// build the index table that matches the stored endpoint order
+	u8 remapped[16];
+	for( int i = 0; i < 16; ++i )
+	{
+		u8 const index = indices[i];
+		bool const isEndpoint = ( index == 0 || index == 1 );
+		remapped[i] = ( exchanged && isEndpoint ) ? ( u8 )( index ^ 0x1 ) : index;
+	}
+
+	// write the block
+	WriteColourBlock( a, b, remapped, block );
+}
+
+// Writes a colour block whose first packed endpoint is greater than or equal
+// to the second. Exchanging the endpoints flips the low bit of every index;
+// equal endpoints force every index to 0.
+void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block )
+{
+	// quantise the endpoints
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
+
+	u8 remapped[16];
+	if( a == b )
+	{
+		// both endpoints are the same colour, so every pixel uses index 0
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = 0;
+	}
+	else if( a > b )
+	{
+		// already in the required order
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = indices[i];
+	}
+	else
+	{
+		// exchange the endpoints and mirror the indices to match
+		std::swap( a, b );
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = ( indices[i] ^ 0x1 ) & 0x3;
+	}
+
+	// write the block
+	WriteColourBlock( a, b, remapped, block );
+}
+
+/*
+static void WriteColourBlock( int a, int b, uint indices, void* block )
+{
+	// get the block as bytes
+	u8* bytes = ( u8* )block;
+
+	// write the endpoints
+	bytes[0] = ( u8 )( a & 0xff );
+	bytes[1] = ( u8 )( a >> 8 );
+	bytes[2] = ( u8 )( b & 0xff );
+	bytes[3] = ( u8 )( b >> 8 );
+	
+	// write the indices @@ Not sure that's correct...
+	bytes[4] = ( u8 )((indices >> 24) & 0xff);
+	bytes[5] = ( u8 )((indices >> 16) & 0xff);
+	bytes[6] = ( u8 )((indices >> 8) & 0xff);
+	bytes[7] = ( u8 )((indices >> 0) & 0xff);
+}
+
+void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, uint indices, void* block )
+{
+	// get the packed values
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
+
+	// remap the indices
+	if( a > b )
+	{
+		// swap a and b
+		std::swap( a, b );
+		indices ^= (~indices >> 1) & 0x55555555;
+	}
+	else if ( a == b )
+	{
+		indices = 0;
+	}
+	
+	// write the block
+	WriteColourBlock( a, b, indices, block );
+}
+
+void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, uint indices, void* block )
+{
+	// get the packed values
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
+
+	// remap the indices
+	if( a < b )
+	{
+		// swap a and b
+		std::swap( a, b );
+		indices ^= 0x55555555;
+	}
+	else if( a == b )
+	{
+		indices = 0;
+	}
+	
+	// write the block
+	WriteColourBlock( a, b, indices, block );
+}
+*/
+
+// Reads a 16-bit 5:6:5 value from packed[0..1] (low byte first), expands its
+// three fields to 8 bits each into colour[0..2], sets colour[3] to 255 and
+// returns the 16-bit value.
+static int Unpack565( u8 const* packed, u8* colour )
+{
+	// assemble the packed value
+	int const lo = packed[0];
+	int const hi = packed[1];
+	int const value = lo | ( hi << 8 );
+
+	// split out the 5, 6 and 5 bit fields
+	int const field0 = ( value >> 11 ) & 0x1f;
+	int const field1 = ( value >> 5 ) & 0x3f;
+	int const field2 = value & 0x1f;
+
+	// widen to 8 bits by repeating the top bits of each field in the low bits
+	colour[0] = ( u8 )( ( field0 << 3 ) | ( field0 >> 2 ) );
+	colour[1] = ( u8 )( ( field1 << 2 ) | ( field1 >> 4 ) );
+	colour[2] = ( u8 )( ( field2 << 3 ) | ( field2 >> 2 ) );
+	colour[3] = 255;
+
+	return value;
+}
+
+// Decodes an 8 byte colour block into sixteen 4-byte pixels at rgba.
+//
+// The block holds two 5:6:5 endpoints followed by sixteen 2-bit indices into
+// a four entry palette. When isDxt1 is set and the first endpoint is not
+// greater than the second, the palette is the two endpoints, their average
+// and an all-zero entry; otherwise the last two entries are the one third
+// and two thirds blends of the endpoints.
+void DecompressColour( u8* rgba, void const* block, bool isDxt1 )
+{
+	u8 const* bytes = reinterpret_cast< u8 const* >( block );
+
+	// palette entries 0 and 1 are the unpacked endpoints
+	u8 codes[16];
+	int const a = Unpack565( bytes, codes );
+	int const b = Unpack565( bytes + 2, codes + 4 );
+
+	// palette entries 2 and 3 are derived from the endpoints
+	bool const threeColour = isDxt1 && a <= b;
+	for( int channel = 0; channel < 3; ++channel )
+	{
+		int const c = codes[channel];
+		int const d = codes[4 + channel];
+
+		int const third = threeColour ? ( c + d )/2 : ( 2*c + d )/3;
+		int const fourth = threeColour ? 0 : ( c + 2*d )/3;
+		codes[8 + channel] = ( u8 )third;
+		codes[12 + channel] = ( u8 )fourth;
+	}
+	codes[8 + 3] = 255;
+	codes[12 + 3] = ( u8 )( threeColour ? 0 : 255 );
+
+	// pull the sixteen 2-bit indices out of the last four bytes
+	u8 indices[16];
+	for( int row = 0; row < 4; ++row )
+	{
+		u8 const packed = bytes[4 + row];
+		for( int col = 0; col < 4; ++col )
+			indices[4*row + col] = ( packed >> ( 2*col ) ) & 0x3;
+	}
+
+	// copy the selected palette entry to each pixel
+	for( int pixel = 0; pixel < 16; ++pixel )
+	{
+		u8 const* entry = codes + 4*indices[pixel];
+		u8* out = rgba + 4*pixel;
+		for( int j = 0; j < 4; ++j )
+			out[j] = entry[j];
+	}
+}
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/colourblock.h
+++ b/src/nvimage/nvtt/squish/colourblock.h
@ -0,0 +1,43 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_COLOURBLOCK_H
+#define SQUISH_COLOURBLOCK_H
+
+#include "squish.h"
+#include "maths.h"
+
+namespace squish {
+
+// Quantises start and end to 5:6:5 and writes an 8 byte colour block with the
+// smaller packed endpoint first, exchanging indices 0 and 1 if the endpoints
+// had to be swapped. indices must hold 16 entries.
+void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block );
+// Quantises start and end to 5:6:5 and writes an 8 byte colour block with the
+// larger packed endpoint first, flipping the low bit of each index if the
+// endpoints had to be swapped and using index 0 throughout when they are
+// equal. indices must hold 16 entries.
+void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block );
+//void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, uint indices, void* block );
+//void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, uint indices, void* block );
+
+// Decodes the 8 byte colour block into 16 pixels of 4 bytes each at rgba.
+// isDxt1 enables the mode in which the fourth palette entry is all zero.
+void DecompressColour( u8* rgba, void const* block, bool isDxt1 );
+
+} // namespace squish
+
+#endif // ndef SQUISH_COLOURBLOCK_H
--- a/src/nvimage/nvtt/squish/colourfit.cpp
+++ b/src/nvimage/nvtt/squish/colourfit.cpp
@ -0,0 +1,54 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "colourfit.h"
+#include "colourset.h"
+
+namespace squish {
+
+// Stores the colour set to fit and the compression flags; the colour set is
+// referenced, not copied.
+ColourFit::ColourFit( ColourSet const* colours, int flags )
+	: m_colours( colours )
+	, m_flags( flags )
+{
+}
+
+// Compresses the colour set into block, choosing which fits to run from the
+// kDxt1 flag and from whether the colour set has transparent pixels.
+//
+// block: destination for the 8 byte colour block.
+void ColourFit::Compress( void* block )
+{
+	bool isDxt1 = ( ( m_flags & kDxt1 ) != 0 );
+	if( isDxt1 )
+	{
+		// a four colour block has no transparent index, so only try that
+		// fit when the colour set is fully opaque
+		if( !m_colours->IsTransparent() )
+		{
+			Compress4( block );
+		}
+
+		// the three colour fit is the only dxt1 mode that can store
+		// transparent pixels, so it must always be tried
+		Compress3( block );
+	}
+	else
+	{
+		Compress4( block );
+	}
+}
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/colourfit.h
+++ b/src/nvimage/nvtt/squish/colourfit.h
@ -0,0 +1,53 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_COLOURFIT_H
+#define SQUISH_COLOURFIT_H
+
+#include "squish.h"
+#include "maths.h"
+
+namespace squish {
+
+class ColourSet;
+
+// Abstract base class for the colour fitters. Compress() dispatches to the
+// Compress3 and Compress4 hooks that each derived fitter implements.
+class ColourFit
+{
+public:
+	// colours: the colour set to fit; only the pointer is stored.
+	// flags:   compression flags, tested against kDxt1 by Compress().
+	ColourFit( ColourSet const* colours, int flags );
+
+	// virtual so that destroying a fitter through a ColourFit pointer runs
+	// the derived class destructor
+	virtual ~ColourFit() {}
+
+	// Compresses the colour set into the given colour block.
+	void Compress( void* block );
+
+protected:
+	// fit hooks implemented by derived classes
+	virtual void Compress3( void* block ) = 0;
+	virtual void Compress4( void* block ) = 0;
+
+	ColourSet const* m_colours;
+	int m_flags;
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_COLOURFIT_H
--- a/src/nvimage/nvtt/squish/colourset.cpp
+++ b/src/nvimage/nvtt/squish/colourset.cpp
@ -0,0 +1,134 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "colourset.h"
+
+namespace squish {
+
+// Builds the colour set from 16 pixels of 4 bytes each.
+//
+// rgba:  64 bytes of pixel data; byte 3 of each pixel is treated as alpha.
+// flags: kDxt1 makes pixels with zero alpha transparent (they are left out of
+//        the point list and marked -1 in m_remap); kWeightColourByAlpha
+//        weights each point by (alpha + 1)/256 instead of 1.
+ColourSet::ColourSet( u8 const* rgba, int flags )
+  : m_count( 0 ), 
+	m_transparent( false )
+{
+	// check the compression mode for dxt1
+	bool isDxt1 = ( ( flags & kDxt1 ) != 0 );
+	bool weightByAlpha = ( ( flags & kWeightColourByAlpha ) != 0 );
+
+	// create the minimal set
+	for( int i = 0; i < 16; ++i )
+	{
+		// check for transparent pixels when using dxt1
+		if( isDxt1 && rgba[4*i + 3] == 0 )
+		{
+			m_remap[i] = -1;
+			m_transparent = true;
+			continue;
+		}
+		
+#if 1
+		// active path: every non-transparent pixel becomes its own point
+		// normalise coordinates to [0,1]
+		// (X is read from byte 2 of the pixel and Z from byte 0)
+		float x = ( float )rgba[4*i + 2] / 255.0f;
+		float y = ( float )rgba[4*i + 1] / 255.0f;
+		float z = ( float )rgba[4*i + 0] / 255.0f;
+		
+		// ensure there is always non-zero weight even for zero alpha
+		float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f;
+
+		// add the point
+		m_points[m_count] = Vec3( x, y, z );
+		m_weights[m_count] = ( weightByAlpha ? w : 1.0f );
+		m_remap[i] = m_count;
+		
+		// advance
+		++m_count;
+#else
+		// disabled path: merges pixels with identical colour into one point
+		// loop over previous points for a match
+		for( int j = 0;; ++j )
+		{
+			// allocate a new point
+			if( j == i )
+			{
+				// normalise coordinates to [0,1]
+				float x = ( float )rgba[4*i + 2] / 255.0f;
+				float y = ( float )rgba[4*i + 1] / 255.0f;
+				float z = ( float )rgba[4*i + 0] / 255.0f;
+				
+				// ensure there is always non-zero weight even for zero alpha
+				float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f;
+
+				// add the point
+				m_points[m_count] = Vec3( x, y, z );
+				m_weights[m_count] = ( weightByAlpha ? w : 1.0f );
+				m_remap[i] = m_count;
+				
+				// advance
+				++m_count;
+				break;
+			}
+		
+			// check for a match
+			bool match = ( rgba[4*i] == rgba[4*j] )
+				&& ( rgba[4*i + 1] == rgba[4*j + 1] )
+				&& ( rgba[4*i + 2] == rgba[4*j + 2] )
+				&& ( rgba[4*j + 3] != 0 || !isDxt1 );
+			if( match )
+			{
+				// get the index of the match
+				int index = m_remap[j];
+				
+				// ensure there is always non-zero weight even for zero alpha
+				float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f;
+
+				// map to this point and increase the weight
+				m_weights[index] += ( weightByAlpha ? w : 1.0f );
+				m_remap[i] = index;
+				break;
+			}
+		}
+#endif
+	}
+	
+#if SQUISH_USE_SIMD
+	// generate vector values
+	// (points get 1 in the fourth component, weights are built with VEC4_CONST)
+	for( int i = 0; i < m_count; ++i )
+	{
+		m_points_simd[i] = Vec4(m_points[i].X(), m_points[i].Y(), m_points[i].Z(), 1);
+		m_weights_simd[i] = VEC4_CONST(m_weights[i]);
+	}
+#endif
+}
+
+// Expands per-point indices to per-pixel indices: pixel i receives the index
+// of the point it maps to, or 3 when the pixel was marked transparent (-1).
+void ColourSet::RemapIndices( u8 const* source, u8* target ) const
+{
+	for( int pixel = 0; pixel < 16; ++pixel )
+	{
+		int const point = m_remap[pixel];
+		target[pixel] = ( point == -1 ) ? 3 : source[point];
+	}
+}
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/colourset.h
+++ b/src/nvimage/nvtt/squish/colourset.h
@ -0,0 +1,69 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_COLOURSET_H
+#define SQUISH_COLOURSET_H
+
+#include "squish.h"
+#include "maths.h"
+#include "simd.h"
+
+namespace squish {
+
+/*! @brief Represents a set of block colours
+
+	Built from 16 pixels of 4 bytes each. Holds up to 16 points with one
+	weight per point, plus a per-pixel table mapping each pixel to its point.
+*/
+class ColourSet
+{
+public:
+	ColourSet( u8 const* rgba, int flags );
+
+	// number of valid entries in the point and weight arrays
+	int GetCount() const { return m_count; }
+	Vec3 const* GetPoints() const { return m_points; }
+	float const* GetWeights() const { return m_weights; }
+	// true when at least one pixel was marked transparent by the constructor
+	bool IsTransparent() const { return m_transparent; }
+
+	// converts 16 per-point indices in source to 16 per-pixel indices in target
+	void RemapIndices( u8 const* source, u8* target ) const;
+
+private:
+	int m_count;
+	Vec3 m_points[16];
+	float m_weights[16];
+	int m_remap[16];		// point index for each pixel, or -1 when transparent
+	bool m_transparent;
+
+#if SQUISH_USE_SIMD
+public:
+	// Vec4 copies of the points and weights, filled in by the constructor
+	Vec4 const* GetPointsSimd() const { return m_points_simd; }
+	Vec4 const* GetWeightsSimd() const { return m_weights_simd; }
+	
+private:
+	Vec4 m_points_simd[16];
+	Vec4 m_weights_simd[16];
+#endif
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_COLOURSET_H
--- a/src/nvimage/nvtt/squish/config
+++ b/src/nvimage/nvtt/squish/config
@ -0,0 +1,22 @@
+# config file used for the Makefile only
+# variables assigned with ?= keep any value that was already set elsewhere
+
+# define to 1 to use altivec instructions
+USE_ALTIVEC ?= 0
+
+# define to 1 to use sse instructions
+USE_SSE ?= 0
+
+# default flags
+CXXFLAGS ?= -O2
+# altivec: define SQUISH_USE_ALTIVEC=1 and pass -maltivec to the compiler
+ifeq ($(USE_ALTIVEC),1)
+CPPFLAGS += -DSQUISH_USE_ALTIVEC=1
+CXXFLAGS += -maltivec
+endif
+# sse: define SQUISH_USE_SSE=1 and pass -msse to the compiler
+ifeq ($(USE_SSE),1)
+CPPFLAGS += -DSQUISH_USE_SSE=1
+CXXFLAGS += -msse
+endif
+
+# where should we install to
+INSTALL_DIR ?= /usr/local
+
--- a/src/nvimage/nvtt/squish/config.h
+++ b/src/nvimage/nvtt/squish/config.h
@ -0,0 +1,55 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_CONFIG_H
+#define SQUISH_CONFIG_H
+
+// Set to 1 when building squish to use altivec instructions.
+// Defaults to 1 when the compiler defines __VEC__ and to 0 otherwise. The
+// value is always a plain 0 or 1: a macro that expands to defined(...) has
+// undefined behaviour when it is evaluated inside #if.
+#ifndef SQUISH_USE_ALTIVEC
+#	if defined(__VEC__)
+#		define SQUISH_USE_ALTIVEC 1
+#	else
+#		define SQUISH_USE_ALTIVEC 0
+#	endif
+#endif
+
+// Set to 1 when building squish to use sse instructions.
+// Defaults to 2 when __SSE2__ is defined, 1 when only __SSE__ is defined and
+// 0 otherwise.
+#ifndef SQUISH_USE_SSE
+#	if defined(__SSE2__)
+#		define SQUISH_USE_SSE 2
+#	elif defined(__SSE__)
+#		define SQUISH_USE_SSE 1
+#	else
+#		define SQUISH_USE_SSE 0
+#	endif
+#endif
+
+// Internally set SQUISH_USE_SIMD when either altivec or sse is available.
+#if SQUISH_USE_ALTIVEC && SQUISH_USE_SSE
+#	error "Cannot enable both altivec and sse!"
+#endif
+#if SQUISH_USE_ALTIVEC || SQUISH_USE_SSE
+#	define SQUISH_USE_SIMD 1
+#else
+#	define SQUISH_USE_SIMD 0
+#endif
+
+#endif // ndef SQUISH_CONFIG_H
--- a/src/nvimage/nvtt/squish/extra/squishgen.cpp
+++ b/src/nvimage/nvtt/squish/extra/squishgen.cpp
@ -0,0 +1,158 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include <iostream>
+
+// One candidate endpoint pair for a single codebook index of a target value.
+struct SourceBlock
+{
+	int start;	// first quantised endpoint (value1 in GenerateData)
+	int end;	// second quantised endpoint (value2 in GenerateData)
+	int error;	// 0 when the pair hits the target exactly, otherwise the
+				// distance to the nearest target that is hit exactly
+};
+
+// Per-target table entry: one SourceBlock for each of the (up to) four
+// codebook indices.
+struct TargetValue
+{
+	SourceBlock sources[4];
+};
+
+// Builds and prints a single-colour lookup table.
+//
+// For every 8-bit target value and every codebook index, the table records a
+// pair of 'bits'-bit endpoints whose codebook entry at that index equals the
+// target, or failing that the pair belonging to the nearest target that can
+// be hit exactly, together with the distance to it.
+//
+//	name	identifier used for the emitted array
+//	bits	bits per quantised endpoint (called with 5 and 6 in this file;
+//			the shift by 2*bits - 8 assumes bits >= 4)
+//	colours	number of codebook entries in use (3 or 4)
+//
+// The result is written to std::cout as C++ source.
+static void GenerateData( std::string const& name, int bits, int colours )
+{
+	TargetValue values[256];
+	
+	// initialise the data
+	// (only the error is set; start/end are filled in once a source is found)
+	for( int target = 0; target < 256; ++target )
+		for( int index = 0; index < colours; ++index )
+			values[target].sources[index].error = 255;	
+
+	// loop over all possible source points
+	int count = ( 1 << bits );
+	for( int value1 = 0; value1 < count; ++value1 )
+	{
+		for( int value2 = 0; value2 < count; ++value2 )
+		{
+			// compute the 8-bit endpoints
+			// (shift up and replicate the top bits into the low bits)
+			int a = ( value1 << ( 8 - bits ) ) | ( value1 >> ( 2*bits - 8 ) );
+			int b = ( value2 << ( 8 - bits ) ) | ( value2 >> ( 2*bits - 8 ) );
+			
+			// fill in the codebook with the these and intermediates
+			int codes[4];
+			codes[0] = a;
+			codes[1] = b;
+			if( colours == 3 )
+			{
+				// 3-colour mode: one midpoint, fourth entry is not used below
+				codes[2] = ( a + b )/2;
+				codes[3] = 0;
+			}
+			else
+			{
+				// 4-colour mode: two intermediates at thirds
+				codes[2] = ( 2*a + b )/3;
+				codes[3] = ( a + 2*b )/3;
+			}
+			
+			// mark each target point with the endpoints and index needed for it
+			// (the first pair found for a target/index is kept)
+			for( int index = 0; index < colours; ++index )
+			{
+				int target = codes[index];
+				
+				SourceBlock& block = values[target].sources[index];
+				if( block.error != 0 )
+				{
+					block.start = value1;
+					block.end = value2;
+					block.error = 0;
+				}
+			}
+		}
+	}
+	
+	// iteratively fill in the missing values
+	// (each pass lets a target adopt a neighbour's pair at error + 1; the
+	// passes repeat until nothing changes)
+	for( ;; )
+	{
+		bool stable = true;
+		for( int index = 0; index < colours; ++index )
+		{
+			for( int target = 0; target < 256; ++target )
+			{
+				// try the neighbour above
+				if( target != 255 )
+				{
+					SourceBlock& current = values[target].sources[index];
+					SourceBlock& next = values[target + 1].sources[index];
+					if( current.error > next.error + 1 )
+					{
+						current.start = next.start;
+						current.end = next.end;
+						current.error = next.error + 1;
+						stable = false;
+					}
+				}
+				// try the neighbour below
+				if( target != 0 )
+				{
+					SourceBlock& current = values[target].sources[index];
+					SourceBlock& previous = values[target - 1].sources[index];
+					if( current.error > previous.error + 1 )
+					{
+						current.start = previous.start;
+						current.end = previous.end;
+						current.error = previous.error + 1;
+						stable = false;
+					}
+				}
+			}
+		}
+		if( stable )
+			break;
+	}
+	
+	// print the table as C++ source: 256 rows of four { start, end, error }
+	// triples, with unused indices written as zeros
+	std::cout << "\nstatic SingleColourLookup const " << name << "[] = \n{\n"; 
+	for( int i = 0;; )
+	{
+		std::cout << "\t{ { ";
+		for( int j = 0;; )
+		{
+			SourceBlock const& block = values[i].sources[j];
+			if( j < colours )
+				std::cout << "{ " << block.start << ", " << block.end << ", " << block.error << " }";
+			else
+				std::cout << "{ 0, 0, 0 }";
+			// separators go between entries only, hence the mid-loop exit
+			if( ++j == 4 )
+				break;
+			std::cout << ", ";
+		}
+		std::cout << " } }";
+		if( ++i == 256 )
+			break;
+		std::cout << ",\n";
+	}
+	std::cout << "\n};\n";
+}
+
+// Writes the four single-colour lookup tables to stdout: 5-bit and 6-bit
+// endpoints, each for the 3-colour and the 4-colour codebook.
+int main()
+{
+	struct Table
+	{
+		char const* name;
+		int bits;
+		int colours;
+	};
+	static Table const tables[] =
+	{
+		{ "lookup_5_3", 5, 3 },
+		{ "lookup_6_3", 6, 3 },
+		{ "lookup_5_4", 5, 4 },
+		{ "lookup_6_4", 6, 4 }
+	};
+
+	for( int i = 0; i < 4; ++i )
+		GenerateData( tables[i].name, tables[i].bits, tables[i].colours );
+	return 0;
+}
--- a/src/nvimage/nvtt/squish/extra/squishpng.cpp
+++ b/src/nvimage/nvtt/squish/extra/squishpng.cpp
@ -0,0 +1,603 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+/*! @file
+
+	@brief	Example program that converts between the PNG and DXT formats.
+	
+	This program requires libpng for PNG input and output, and is designed
+	to show how to prepare data for the squish library when it is not simply
+	a contiguous block of memory.
+*/
+
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <ctime>
+#include <cmath>
+#include <squish.h>
+#include <png.h>
+
+#ifdef _MSC_VER
+#pragma warning( disable: 4511 4512 )
+#endif // def _MSC_VER
+
+using namespace squish;
+
+//! Minimal exception type carrying a human-readable message.
+class Error : public std::exception
+{
+	std::string m_excuse;	//!< text handed back by what()
+
+public:
+	//! Stores a copy of the message.
+	Error( std::string const& excuse )
+	  : m_excuse( excuse )
+	{
+	}
+
+	~Error() throw()
+	{
+	}
+
+	//! Returns the stored message; valid for the lifetime of this object.
+	virtual char const* what() const throw()
+	{
+		return m_excuse.c_str();
+	}
+};
+
+//! Mix-in base: deriving from it disables copying of the derived class.
+class NonCopyable
+{
+	// declared private and never defined, so any copy fails to build
+	NonCopyable( NonCopyable const& );
+	NonCopyable& operator=( NonCopyable const& );
+
+public:
+	NonCopyable()
+	{
+	}
+};
+
+//! Owns a byte buffer allocated with new[]; released in the destructor.
+class Mem : NonCopyable
+{
+	u8* m_p;	//!< start of the owned buffer
+
+public:
+	//! Allocates size bytes (contents are not initialised).
+	explicit Mem( int size )
+	  : m_p( new u8[size] )
+	{
+	}
+
+	~Mem()
+	{
+		delete[] m_p;
+	}
+
+	//! Returns the start of the buffer.
+	u8* Get() const
+	{
+		return m_p;
+	}
+};
+
+//! Takes ownership of a FILE* (which may be null) and closes it on destruction.
+class File : NonCopyable
+{
+	FILE* m_fp;	//!< owned stream, or null if the open failed
+
+public:
+	explicit File( FILE* fp )
+	  : m_fp( fp )
+	{
+	}
+
+	~File()
+	{
+		if( m_fp != 0 )
+			fclose( m_fp );
+	}
+
+	//! True when a stream is held.
+	bool IsValid() const
+	{
+		return m_fp ? true : false;
+	}
+
+	//! Returns the held stream without giving up ownership.
+	FILE* Get() const
+	{
+		return m_fp;
+	}
+};
+
+//! Owns a libpng read struct together with its info and end-info structs.
+class PngReadStruct : NonCopyable
+{
+	png_structp m_png;
+	png_infop m_info, m_end;
+
+public:
+	//! Creates the three libpng objects; throws Error if any cannot be created.
+	PngReadStruct()
+	  : m_png( png_create_read_struct( PNG_LIBPNG_VER_STRING, 0, 0, 0 ) ),
+		m_info( 0 ),
+		m_end( 0 )
+	{
+		if( m_png == 0 )
+			throw Error( "failed to create png read struct" );
+
+		m_info = png_create_info_struct( m_png );
+		m_end = png_create_info_struct( m_png );
+		if( m_info != 0 && m_end != 0 )
+			return;
+
+		// the destructor will not run after a throw, so release whatever
+		// was created here, passing only the info structs that exist
+		png_destroy_read_struct( &m_png, m_info ? &m_info : 0, m_end ? &m_end : 0 );
+		throw Error( "failed to create png info structs" );
+	}
+
+	~PngReadStruct()
+	{
+		png_destroy_read_struct( &m_png, &m_info, &m_end );
+	}
+
+	png_structp GetPng() const
+	{
+		return m_png;
+	}
+
+	png_infop GetInfo() const
+	{
+		return m_info;
+	}
+};
+
+//! PNG write object.
+/*!
+	Owns a libpng write struct and its info struct, destroying both when it
+	goes out of scope. The constructor throws Error if either cannot be
+	created.
+*/
+class PngWriteStruct : NonCopyable
+{
+public:
+	PngWriteStruct()
+	  : m_png( 0 ), 
+		m_info( 0 )
+	{
+		m_png = png_create_write_struct( PNG_LIBPNG_VER_STRING, 0, 0, 0 );
+		if( !m_png )
+			throw Error( "failed to create png write struct" );
+			
+		m_info = png_create_info_struct( m_png );
+		if( !m_info )
+		{
+			// the destructor will not run after a throw, so release the
+			// write struct here; there is no info struct to hand over
+			png_destroy_write_struct( &m_png, 0 );
+			throw Error( "failed to create png info structs" );
+		}
+	}
+	
+	~PngWriteStruct()
+	{
+		png_destroy_write_struct( &m_png, &m_info );
+	}
+	
+	//! Returns the libpng write struct.
+	png_structp GetPng() const { return m_png; }
+	//! Returns the libpng info struct.
+	png_infop GetInfo() const { return m_info; }
+
+private:
+	png_structp m_png;
+	png_infop m_info;
+};
+
+//! PNG rows object.
+/*!
+	Owns a table of height row pointers, each pointing at a separately
+	allocated buffer of width*stride bytes, in the layout expected by
+	png_set_rows. Throws Error if an allocation fails; nothing is leaked in
+	that case.
+*/
+class PngRows : NonCopyable
+{
+public:
+	PngRows( int width, int height, int stride )
+	  : m_rows( 0 ),
+		m_width( width ),
+		m_height( height )
+	{
+		// nothing to allocate for an empty image
+		if( m_height <= 0 )
+			return;
+
+		// zero the table so that Release can free every slot safely
+		m_rows = ( png_bytep* )calloc( m_height, sizeof( png_bytep ) );
+		if( !m_rows )
+			throw Error( "out of memory allocating png row table" );
+
+		for( int i = 0; i < m_height; ++i )
+		{
+			m_rows[i] = ( png_bytep )malloc( m_width*stride );
+
+			// a null result is only a failure if bytes were actually requested
+			if( !m_rows[i] && m_width*stride > 0 )
+			{
+				// the destructor does not run when a constructor throws
+				Release();
+				throw Error( "out of memory allocating png row" );
+			}
+		}
+	}
+	
+	~PngRows() 
+	{
+		Release();
+	}
+	
+	//! Returns the row pointer table (null when the height is not positive).
+	png_bytep* Get() const { return m_rows; }
+	
+private:
+	//! Frees every row and the table itself; safe to call more than once.
+	void Release()
+	{
+		if( !m_rows )
+			return;
+		for( int i = 0; i < m_height; ++i )
+			free( m_rows[i] );
+		free( m_rows );
+		m_rows = 0;
+	}
+
+	png_bytep* m_rows;
+	int m_width, m_height;
+};
+
+//! An 8-bit PNG image loaded fully into memory.
+/*!
+	The pixel rows are obtained from libpng via png_get_rows on m_png.
+*/
+class PngImage
+{
+public:
+	//! Loads the given file; throws Error on failure.
+	explicit PngImage( std::string const& fileName );
+
+	int GetWidth() const { return m_width; }
+	int GetHeight() const { return m_height; }
+	//! Bytes per pixel: 3 for colour or 1 for grey, plus 1 if alpha is present.
+	int GetStride() const { return m_stride; }
+	bool IsColour() const { return m_colour; }
+	bool IsAlpha() const { return m_alpha; }
+	
+	//! Returns the start of the given row (the index is not range checked).
+	u8 const* GetRow( int row ) const { return ( u8* )m_rows[row]; }
+
+private:
+	PngReadStruct m_png;
+
+	int m_width;
+	int m_height;
+	int m_stride;
+	bool m_colour;
+	bool m_alpha;
+	
+	png_bytep* m_rows;	// row table returned by png_get_rows
+};
+
+//! Loads an 8-bit PNG file into memory.
+/*!
+	@param fileName	Path of the PNG file to read.
+
+	Throws Error if the file cannot be opened, is too short or does not carry
+	a PNG signature, is not 8 bits per channel after expansion, or if libpng
+	returns no row data.
+*/
+PngImage::PngImage( std::string const& fileName )
+{
+	// open the source file
+	File file( fopen( fileName.c_str(), "rb" ) );
+	if( !file.IsValid() )
+	{
+		std::ostringstream oss;
+		oss << "failed to open \"" << fileName << "\" for reading";
+		throw Error( oss.str() );
+	}
+	
+	// check the signature bytes (a short read leaves the header incomplete,
+	// so it is rejected before the bytes are inspected)
+	png_byte header[8];
+	if( fread( header, 1, 8, file.Get() ) != 8 || png_sig_cmp( header, 0, 8 ) )
+	{
+		std::ostringstream oss;
+		oss << "\"" << fileName << "\" does not look like a png file";
+		throw Error( oss.str() );
+	}
+	
+	// read the image into memory, telling libpng the signature is consumed
+	png_init_io( m_png.GetPng(), file.Get() );
+	png_set_sig_bytes( m_png.GetPng(), 8 );
+	png_read_png( m_png.GetPng(), m_png.GetInfo(), PNG_TRANSFORM_EXPAND, 0 );
+
+	// get the image info
+	png_uint_32 width;
+	png_uint_32 height;
+	int bitDepth;
+	int colourType;
+	png_get_IHDR( m_png.GetPng(), m_png.GetInfo(), &width, &height, &bitDepth, &colourType, 0, 0, 0 );
+	
+	// check the image is 8 bit
+	if( bitDepth != 8 )
+	{
+		std::ostringstream oss;
+		oss << "cannot process " << bitDepth << "-bit image (bit depth must be 8)";
+		throw Error( oss.str() );
+	}
+	
+	// save the info
+	m_width = width;
+	m_height = height;
+	m_colour = ( ( colourType & PNG_COLOR_MASK_COLOR ) != 0 );
+	m_alpha = ( ( colourType & PNG_COLOR_MASK_ALPHA ) != 0 );
+	m_stride = ( m_colour ? 3 : 1 ) + ( m_alpha ? 1 : 0 );
+
+	// get the image rows
+	m_rows = png_get_rows( m_png.GetPng(), m_png.GetInfo() );
+	if( !m_rows )
+		throw Error( "failed to get image rows" );
+}
+
+//! Compresses a PNG file into a raw DXT file.
+/*!
+	@param sourceFileName	PNG image to read; both dimensions must be
+							multiples of 4.
+	@param targetFileName	File to write: width and height as native ints,
+							followed by the compressed blocks in row order.
+	@param flags			Flags passed through to squish; kDxt1 selects
+							8-byte blocks, anything else 16-byte blocks.
+
+	Prints the time spent compressing to stdout. Throws Error if the source
+	cannot be loaded, has unsuitable dimensions, or the target cannot be
+	written.
+*/
+static void Compress( std::string const& sourceFileName, std::string const& targetFileName, int flags )
+{
+	// load the source image
+	PngImage sourceImage( sourceFileName );
+
+	// get the image info
+	int width = sourceImage.GetWidth();
+	int height = sourceImage.GetHeight();
+	int stride = sourceImage.GetStride();
+	bool colour = sourceImage.IsColour();
+	bool alpha = sourceImage.IsAlpha();
+
+	// check the image dimensions
+	if( ( width % 4 ) != 0 || ( height % 4 ) != 0 )
+	{
+		std::ostringstream oss;
+		oss << "cannot compress " << width << "x" << height
+			<< " image (dimensions must be multiples of 4)";
+		throw Error( oss.str() );
+	}
+	
+	// create the target data
+	int bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+	int targetDataSize = bytesPerBlock*width*height/16;
+	Mem targetData( targetDataSize );
+	
+	// loop over blocks and compress them
+	clock_t start = std::clock();
+	u8* targetBlock = targetData.Get();
+	for( int y = 0; y < height; y += 4 )
+	{
+		// process a row of blocks
+		for( int x = 0; x < width; x += 4 )
+		{
+			// gather the 4x4 block into a contiguous rgba array
+			u8 sourceRgba[16*4];
+			for( int py = 0, i = 0; py < 4; ++py )
+			{
+				u8 const* row = sourceImage.GetRow( y + py ) + x*stride;
+				for( int px = 0; px < 4; ++px, ++i )
+				{
+					// get the pixel colour 
+					if( colour )
+					{
+						for( int j = 0; j < 3; ++j )
+							sourceRgba[4*i + j] = *row++;
+					}
+					else
+					{
+						// replicate the grey value into all three channels
+						for( int j = 0; j < 3; ++j )
+							sourceRgba[4*i + j] = *row;
+						++row;
+					}
+					
+					// copy the alpha if present, otherwise use opaque
+					if( alpha )
+						sourceRgba[4*i + 3] = *row++;
+					else
+						sourceRgba[4*i + 3] = 255;
+				}
+			}
+			
+			// compress this block
+			Compress( sourceRgba, targetBlock, flags );
+			
+			// advance
+			targetBlock += bytesPerBlock;			
+		}
+	}
+	clock_t end = std::clock();
+	double duration = ( double )( end - start ) / CLOCKS_PER_SEC;
+	std::cout << "time taken: " << duration << " seconds" << std::endl;
+	
+	// open the target file
+	File targetFile( fopen( targetFileName.c_str(), "wb" ) );
+	if( !targetFile.IsValid() )
+	{
+		std::ostringstream oss;
+		oss << "failed to open \"" << targetFileName << "\" for writing";
+		throw Error( oss.str() );
+	}
+	
+	// write the header followed by the data, stopping at the first failure
+	bool written = 
+		fwrite( &width, sizeof( int ), 1, targetFile.Get() ) == 1
+		&& fwrite( &height, sizeof( int ), 1, targetFile.Get() ) == 1
+		&& fwrite( targetData.Get(), 1, targetDataSize, targetFile.Get() ) == ( size_t )targetDataSize;
+	if( !written )
+	{
+		std::ostringstream oss;
+		oss << "failed to write \"" << targetFileName << "\"";
+		throw Error( oss.str() );
+	}
+}
+
+//! Decompresses a raw DXT file (as written by Compress) into an RGBA PNG.
+/*!
+	@param sourceFileName	File holding width and height as native ints
+							followed by the compressed blocks.
+	@param targetFileName	PNG file to write (8-bit RGBA).
+	@param flags			Flags passed through to squish; kDxt1 selects
+							8-byte blocks, anything else 16-byte blocks.
+
+	Throws Error if the source cannot be opened, is truncated, or stores
+	dimensions that are not positive multiples of 4, or if the target cannot
+	be opened.
+*/
+static void Decompress( std::string const& sourceFileName, std::string const& targetFileName, int flags )
+{
+	// open the source file
+	File sourceFile( fopen( sourceFileName.c_str(), "rb" ) );
+	if( !sourceFile.IsValid() )
+	{
+		std::ostringstream oss;
+		oss << "failed to open \"" << sourceFileName << "\" for reading";
+		throw Error( oss.str() );
+	}
+	
+	// get the width and height
+	int width = 0, height = 0;
+	if( fread( &width, sizeof( int ), 1, sourceFile.Get() ) != 1
+		|| fread( &height, sizeof( int ), 1, sourceFile.Get() ) != 1 )
+	{
+		std::ostringstream oss;
+		oss << "failed to read image dimensions from \"" << sourceFileName << "\"";
+		throw Error( oss.str() );
+	}
+	
+	// the block loops below write whole 4x4 blocks into the target rows, so
+	// anything other than positive multiples of 4 would run off the buffers
+	if( width <= 0 || height <= 0 || ( width % 4 ) != 0 || ( height % 4 ) != 0 )
+	{
+		std::ostringstream oss;
+		oss << "cannot decompress " << width << "x" << height
+			<< " image (dimensions must be positive multiples of 4)";
+		throw Error( oss.str() );
+	}
+	
+	// work out the data size
+	int bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+	int sourceDataSize = bytesPerBlock*width*height/16;
+	Mem sourceData( sourceDataSize );
+	
+	// read the source data
+	if( fread( sourceData.Get(), 1, sourceDataSize, sourceFile.Get() ) != ( size_t )sourceDataSize )
+	{
+		std::ostringstream oss;
+		oss << "\"" << sourceFileName << "\" is truncated";
+		throw Error( oss.str() );
+	}
+		
+	// create the target rows
+	PngRows targetRows( width, height, 4 );
+	
+	// loop over blocks and decompress them
+	u8 const* sourceBlock = sourceData.Get();
+	for( int y = 0; y < height; y += 4 )
+	{
+		// process a row of blocks
+		for( int x = 0; x < width; x += 4 )
+		{
+			// decompress back
+			u8 targetRgba[16*4];
+			Decompress( targetRgba, sourceBlock, flags );
+			
+			// write the data into the target rows
+			for( int py = 0, i = 0; py < 4; ++py )
+			{
+				u8* row = ( u8* )targetRows.Get()[y + py] + x*4;
+				for( int px = 0; px < 4; ++px, ++i )
+				{	
+					for( int j = 0; j < 4; ++j )
+						*row++ = targetRgba[4*i + j];
+				}
+			}
+			
+			// advance
+			sourceBlock += bytesPerBlock;
+		}
+	}
+	
+	// create the target PNG
+	PngWriteStruct targetPng;
+
+	// set up the image
+	png_set_IHDR(
+		targetPng.GetPng(), targetPng.GetInfo(), width, height,
+		8, PNG_COLOR_TYPE_RGBA, PNG_INTERLACE_NONE,
+		PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT 
+	);
+	   
+	// open the target file
+	File targetFile( fopen( targetFileName.c_str(), "wb" ) );
+	if( !targetFile.IsValid() )
+	{
+		std::ostringstream oss;
+		oss << "failed to open \"" << targetFileName << "\" for writing";
+		throw Error( oss.str() );
+	}
+	
+	// write the image
+	png_set_rows( targetPng.GetPng(), targetPng.GetInfo(), targetRows.Get() );
+	png_init_io( targetPng.GetPng(), targetFile.Get() );
+	png_write_png( targetPng.GetPng(), targetPng.GetInfo(), PNG_TRANSFORM_IDENTITY, 0 );
+}
+
+//! Prints the RMS error between two PNG images of identical dimensions.
+/*!
+	Only the channels present in both images are compared: the first
+	min(source stride, target stride) bytes of every pixel. The sum of squared
+	differences is divided by the pixel count before the square root.
+	Throws Error if either image fails to load or the dimensions differ.
+*/
+static void Diff( std::string const& sourceFileName, std::string const& targetFileName )
+{
+	PngImage sourceImage( sourceFileName );
+	PngImage targetImage( targetFileName );
+
+	// both images must cover the same pixel grid
+	int const width = sourceImage.GetWidth();
+	int const height = sourceImage.GetHeight();
+	if( targetImage.GetWidth() != width || targetImage.GetHeight() != height )
+		throw Error( "source and target dimensions do not match" );
+
+	int const sourceStride = sourceImage.GetStride();
+	int const targetStride = targetImage.GetStride();
+	int const channels = std::min( sourceStride, targetStride );
+
+	// accumulate the squared differences, walking both rows in step
+	double sumSquares = 0.0;
+	for( int row = 0; row < height; ++row )
+	{
+		u8 const* sourcePixel = sourceImage.GetRow( row );
+		u8 const* targetPixel = targetImage.GetRow( row );
+		for( int column = 0; column < width; ++column )
+		{
+			for( int channel = 0; channel < channels; ++channel )
+			{
+				int const delta = ( int )sourcePixel[channel] - ( int )targetPixel[channel];
+				sumSquares += ( double )( delta*delta );
+			}
+			sourcePixel += sourceStride;
+			targetPixel += targetStride;
+		}
+	}
+
+	double const rms = std::sqrt( sumSquares / ( width*height ) );
+	std::cout << "rms error: " << rms << std::endl;
+}
+
+//! Operation selected on the command line.
+enum Mode
+{
+	kCompress, 		// -c: png to raw dxt
+	kDecompress,	// -d: raw dxt to png
+	kDiff			// -e: rms error between two pngs
+};
+
+//! Program entry point: parses the command line and runs the chosen mode.
+/*!
+	Returns 0 on success (or after printing help) and -1 on an unknown
+	option, a missing file name, or any exception thrown while working.
+*/
+int main( int argc, char* argv[] )
+{
+	try
+	{
+		// defaults, overridden by the command-line
+		std::string sourceFileName;
+		std::string targetFileName;
+		Mode mode = kCompress;
+		int method = kDxt1;
+		int metric = kColourMetricPerceptual;
+		int fit = kColourClusterFit;
+		int extra = 0;
+		bool help = false;
+		bool arguments = true;
+
+		for( int argIndex = 1; argIndex < argc; ++argIndex )
+		{
+			char const* const word = argv[argIndex];
+
+			// anything that is not an option word is treated as a file name
+			if( !arguments || word[0] != '-' )
+			{
+				if( sourceFileName.empty() )
+					sourceFileName.assign( word );
+				else if( targetFileName.empty() )
+					targetFileName.assign( word );
+				else
+					std::cerr << "unexpected argument \"" << word << "\"" << std::endl;
+				continue;
+			}
+
+			// every character after the leading dash is a separate flag
+			for( char const* flag = word + 1; *flag != '\0'; ++flag )
+			{
+				switch( *flag )
+				{
+				case 'h':
+					help = true;
+					break;
+				case 'c':
+					mode = kCompress;
+					break;
+				case 'd':
+					mode = kDecompress;
+					break;
+				case 'e':
+					mode = kDiff;
+					break;
+				case '1':
+					method = kDxt1;
+					break;
+				case '3':
+					method = kDxt3;
+					break;
+				case '5':
+					method = kDxt5;
+					break;
+				case 'u':
+					metric = kColourMetricUniform;
+					break;
+				case 'r':
+					fit = kColourRangeFit;
+					break;
+				case 'w':
+					extra = kWeightColourByAlpha;
+					break;
+				case '-':
+					// later words are file names even if they start with a dash
+					arguments = false;
+					break;
+				default:
+					std::cerr << "unknown option '" << *flag << "'" << std::endl;
+					return -1;
+				}
+			}
+		}
+
+		// help takes priority over everything else
+		if( help )
+		{
+			std::cout
+				<< "SYNTAX" << std::endl
+				<< "\tsquishpng [-cde135] <source> <target>" << std::endl
+				<< "OPTIONS" << std::endl
+				<< "\t-c\tCompress source png to target raw dxt (default)" << std::endl
+				<< "\t-135\tSpecifies whether to use DXT1 (default), DXT3 or DXT5 compression" << std::endl
+				<< "\t-u\tUse a uniform colour metric during colour compression" << std::endl
+				<< "\t-r\tUse the fast but inferior range-based colour compressor" << std::endl
+				<< "\t-w\tWeight colour values by alpha in the cluster colour compressor" << std::endl
+				<< "\t-d\tDecompress source raw dxt to target png" << std::endl
+				<< "\t-e\tDiff source and target png" << std::endl
+				;
+			return 0;
+		}
+
+		// both file names are required
+		if( sourceFileName.empty() )
+		{
+			std::cerr << "no source file given" << std::endl;
+			return -1;
+		}
+		if( targetFileName.empty() )
+		{
+			std::cerr << "no target file given" << std::endl;
+			return -1;
+		}
+
+		// do the work
+		if( mode == kCompress )
+		{
+			Compress( sourceFileName, targetFileName, method | metric | fit | extra );
+		}
+		else if( mode == kDecompress )
+		{
+			Decompress( sourceFileName, targetFileName, method );
+		}
+		else if( mode == kDiff )
+		{
+			Diff( sourceFileName, targetFileName );
+		}
+		else
+		{
+			std::cerr << "unknown mode" << std::endl;
+			throw std::exception();
+		}
+	}
+	catch( std::exception& excuse )
+	{
+		// complain
+		std::cerr << "squishpng error: " << excuse.what() << std::endl;
+		return -1;
+	}
+
+	// done
+	return 0;
+}
--- a/src/nvimage/nvtt/squish/extra/squishtest.cpp
+++ b/src/nvimage/nvtt/squish/extra/squishtest.cpp
@ -0,0 +1,205 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+/*! @file
+
+	@brief	This program tests the error for 1 and 2-colour DXT compression.
+	
+	This tests the effectiveness of the DXT compression algorithm for all
+	possible 1 and 2-colour blocks of pixels.
+*/
+
+#include <squish.h>
+#include <iostream>
+#include <cmath>
+#include <cfloat>
+
+using namespace squish;
+
+//! Mean squared colour error between two 16-pixel RGBA blocks.
+/*!
+	Sums the squared differences of the first three channels of each pixel
+	(the fourth byte of every pixel is ignored) and divides by 16.
+*/
+double GetColourError( u8 const* a, u8 const* b )
+{
+	double total = 0.0;
+	for( int pixel = 0; pixel < 16; ++pixel )
+	{
+		u8 const* const pa = a + 4*pixel;
+		u8 const* const pb = b + 4*pixel;
+		for( int channel = 0; channel < 3; ++channel )
+		{
+			int const delta = ( int )pa[channel] - ( int )pb[channel];
+			total += ( double )( delta*delta );
+		}
+	}
+	return total / 16.0;
+}
+
+//! Measures the round-trip error for solid blocks that use a single channel.
+/*!
+	For each of the three colour channels, every value from 0 to 255 is
+	written to that channel of all 16 pixels (other channels 0, alpha 255),
+	the block is compressed and decompressed with the given flags, and the
+	minimum, maximum and overall RMS error are printed to stdout.
+*/
+void TestOneColour( int flags )
+{
+	u8 input[4*16];
+	u8 output[4*16];
+	u8 block[16];
+	
+	double avg = 0.0, min = DBL_MAX, max = -DBL_MAX;
+	int counter = 0;
+	
+	// start from opaque black
+	for( int i = 0; i < 16*4; ++i )
+		input[i] = ( ( i % 4 ) == 3 ) ? 255 : 0;
+
+	// test all single-channel colours (256 values, so 255 is included)
+	for( int channel = 0; channel < 3; ++channel )
+	{
+		for( int value = 0; value < 256; ++value )
+		{
+			// set the channel value
+			for( int i = 0; i < 16; ++i )
+				input[4*i + channel] = ( u8 )value;
+			
+			// compress and decompress
+			Compress( input, block, flags );
+			Decompress( output, block, flags );
+			
+			// test the results
+			double rm = GetColourError( input, output );
+			double rms = std::sqrt( rm );
+			
+			// accumulate stats (avg collects mean squares until the end)
+			min = std::min( min, rms );
+			max = std::max( max, rms );
+			avg += rm;
+			++counter;
+		}
+		
+		// reset the channel value
+		for( int i = 0; i < 16; ++i )
+			input[4*i + channel] = 0;
+	}
+	
+	// finish stats
+	avg = std::sqrt( avg/counter );
+	
+	// show stats
+	std::cout << "one colour error (min, max, avg): " 
+		<< min << ", " << max << ", " << avg << std::endl;
+}
+
+//! Measures the round-trip error for 1000 solid blocks of random colour.
+/*!
+	Each block is filled with one opaque colour whose three channels come from
+	rand(), compressed and decompressed with the given flags, and the minimum,
+	maximum and overall RMS error are printed to stdout.
+*/
+void TestOneColourRandom( int flags )
+{
+	u8 input[4*16];
+	u8 output[4*16];
+	u8 block[16];
+
+	double sumSquares = 0.0;
+	double lowest = DBL_MAX;
+	double highest = -DBL_MAX;
+	int samples = 0;
+
+	for( int test = 0; test < 1000; ++test )
+	{
+		// draw the colour first (one rand() call per channel, in channel order)
+		u8 colour[3];
+		for( int channel = 0; channel < 3; ++channel )
+			colour[channel] = ( u8 )( rand() & 0xff );
+
+		// fill every pixel with that colour at full alpha
+		for( int pixel = 0; pixel < 16; ++pixel )
+		{
+			u8* const rgba = input + 4*pixel;
+			rgba[0] = colour[0];
+			rgba[1] = colour[1];
+			rgba[2] = colour[2];
+			rgba[3] = 255;
+		}
+
+		// round-trip the block
+		Compress( input, block, flags );
+		Decompress( output, block, flags );
+
+		// measure
+		double const meanSquare = GetColourError( input, output );
+		double const rms = std::sqrt( meanSquare );
+
+		// accumulate stats
+		lowest = std::min( lowest, rms );
+		highest = std::max( highest, rms );
+		sumSquares += meanSquare;
+		++samples;
+	}
+
+	// overall rms across all samples
+	double const average = std::sqrt( sumSquares/samples );
+
+	std::cout << "random one colour error (min, max, avg): "
+		<< lowest << ", " << highest << ", " << average << std::endl;
+}
+
+//! Measures the round-trip error for blocks holding two single-channel values.
+/*!
+	For each colour channel and every pair value1 < value2 taken from 0 to
+	255, the first 8 pixels get value1 and the last 8 get value2 in that
+	channel (other channels 0, alpha 255). The block is compressed and
+	decompressed with the given flags, and the minimum, maximum and overall
+	RMS error are printed to stdout.
+*/
+void TestTwoColour( int flags )
+{
+	u8 input[4*16];
+	u8 output[4*16];
+	u8 block[16];
+	
+	double avg = 0.0, min = DBL_MAX, max = -DBL_MAX;
+	int counter = 0;
+	
+	// start from opaque black
+	for( int i = 0; i < 16*4; ++i )
+		input[i] = ( ( i % 4 ) == 3 ) ? 255 : 0;
+
+	// test all pairs of distinct single-channel values (255 is included)
+	for( int channel = 0; channel < 3; ++channel )
+	{
+		for( int value1 = 0; value1 < 256; ++value1 )
+		{
+			for( int value2 = value1 + 1; value2 < 256; ++value2 )
+			{
+				// set the channel value
+				for( int i = 0; i < 16; ++i )
+					input[4*i + channel] = ( u8 )( ( i < 8 ) ? value1 : value2 );
+				
+				// compress and decompress
+				Compress( input, block, flags );
+				Decompress( output, block, flags );
+				
+				// test the results
+				double rm = GetColourError( input, output );
+				double rms = std::sqrt( rm );
+				
+				// accumulate stats (avg collects mean squares until the end)
+				min = std::min( min, rms );
+				max = std::max( max, rms );
+				avg += rm;
+				++counter;
+			}
+		}
+				
+		// reset the channel value
+		for( int i = 0; i < 16; ++i )
+			input[4*i + channel] = 0;
+	}
+	
+	// finish stats
+	avg = std::sqrt( avg/counter );
+	
+	// show stats
+	std::cout << "two colour error (min, max, avg): " 
+		<< min << ", " << max << ", " << avg << std::endl;
+}
+
+// Runs the three tests in turn: random single colours with the range fit
+// flag added, then the one- and two-colour tests with only kDxt1 set.
+int main()
+{
+	int const rangeFitFlags = kDxt1 | kColourRangeFit;
+	int const plainFlags = kDxt1;
+
+	TestOneColourRandom( rangeFitFlags );
+	TestOneColour( plainFlags );
+	TestTwoColour( plainFlags );
+	return 0;
+}
--- a/src/nvimage/nvtt/squish/fastclusterfit.cpp
+++ b/src/nvimage/nvtt/squish/fastclusterfit.cpp
@ -0,0 +1,673 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+	Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+
+#include "fastclusterfit.h"
+#include "colourset.h"
+#include "colourblock.h"
+#include <cfloat>
+
+namespace squish {
+
+// Prepares a fast cluster fit for the given colour set.
+//
+// The points are ordered by their projection onto an estimate of the
+// principal axis (taken from the weighted covariance of the set). The points
+// in that order, their sum and their sum of squares are cached for
+// Compress3/Compress4. The metric is not initialised here; it is only set by
+// setMetric().
+FastClusterFit::FastClusterFit( ColourSet const* colours, int flags ) :
+	ColourFit( colours, flags )
+{
+	// initialise the best error
+#if SQUISH_USE_SIMD
+	m_besterror = VEC4_CONST( FLT_MAX );
+#else
+	m_besterror = FLT_MAX;
+#endif
+
+	// cache some values
+	int const count = m_colours->GetCount();
+	Vec3 const* values = m_colours->GetPoints();
+	
+	// get the covariance matrix
+	Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights() );
+	
+	// estimate the principal component
+	Vec3 principle = ComputePrincipleComponent( covariance );
+
+	// project every point onto the principal axis
+	float dps[16];
+	for( int i = 0; i < count; ++i )
+	{
+		dps[i] = Dot( values[i], principle );
+		m_order[i] = i;
+	}
+	
+	// stable insertion sort of the projections, carrying the order along
+	for( int i = 0; i < count; ++i )
+	{
+		for( int j = i; j > 0 && dps[j] < dps[j - 1]; --j )
+		{
+			std::swap( dps[j], dps[j - 1] );
+			std::swap( m_order[j], m_order[j - 1] );
+		}
+	}
+	
+	// cache the points in sorted order and accumulate their sums
+#if SQUISH_USE_SIMD
+	Vec4 const* unweighted = m_colours->GetPointsSimd();
+	Vec4 const zero = VEC4_CONST( 0.0f );
+#else
+	Vec3 const* unweighted = m_colours->GetPoints();
+	Vec3 const zero( 0.0f );
+#endif
+	m_xxsum = zero;
+	m_xsum = zero;
+
+	for( int i = 0; i < count; ++i )
+	{
+		int p = m_order[i];
+		m_unweighted[i] = unweighted[p];
+		m_xxsum += m_unweighted[i]*m_unweighted[i];
+		m_xsum += m_unweighted[i];
+	}
+	
+	// Compress3/Compress4 always walk all 16 slots of m_order and
+	// m_unweighted. When the set holds fewer points, pad the tail with the
+	// identity order and zero points so that those loops never read
+	// uninitialised data or remap through a garbage index.
+	for( int i = count; i < 16; ++i )
+	{
+		m_order[i] = i;
+		m_unweighted[i] = zero;
+	}
+}
+
+
+// Constants of the endpoint solve that depend only on the cluster sizes of a
+// partition; filled by FastClusterFit::doPrecomputation().
+struct Precomp {
+	float alpha2_sum;		// sum of squared weights towards the first endpoint
+	float beta2_sum;		// sum of squared weights towards the second endpoint
+	float alphabeta_sum;	// sum of the products of both weights
+	float factor;			// 1 / (alpha2_sum*beta2_sum - alphabeta_sum^2)
+};
+
+// One entry per way of splitting 16 ordered points into three consecutive
+// clusters (17*18/2 = 153) or four consecutive clusters (17*18*19/6 = 969).
+static Precomp s_threeElement[17*18/2];
+static Precomp s_fourElement[17*18*19/6];
+
+// Fills the static tables s_threeElement and s_fourElement.
+//
+// The entries are written in the same nested (c0, c1[, c2]) order in which
+// Compress3/Compress4 later read them with a running index. Partitions whose
+// determinant is zero (for example c0 == 16) get a factor of 1/0; the
+// special-casing that once handled them is disabled in this version.
+void FastClusterFit::doPrecomputation()
+{
+	// Three element clusters: c0 points with weight 1, c1 points with
+	// weight 1/2 and the remaining c2 points with weight 0 towards the
+	// first endpoint.
+	int slot = 0;
+	for( int c0 = 0; c0 <= 16; ++c0 )
+	{
+		for( int c1 = 0; c0 + c1 <= 16; ++c1 )
+		{
+			int const c2 = 16 - c0 - c1;
+			
+			Precomp & entry = s_threeElement[slot];
+			++slot;
+			
+			entry.alpha2_sum = c0 + c1 * 0.25f;
+			entry.beta2_sum = c2 + c1 * 0.25f;
+			entry.alphabeta_sum = c1 * 0.25f;
+			entry.factor = 1.0f / (entry.alpha2_sum * entry.beta2_sum - entry.alphabeta_sum * entry.alphabeta_sum);
+		}
+	}
+	
+	// Four element clusters: weights 1, 2/3, 1/3 and 0 towards the first
+	// endpoint for the c0, c1, c2 and c3 points respectively.
+	slot = 0;
+	for( int c0 = 0; c0 <= 16; ++c0 )
+	{
+		for( int c1 = 0; c0 + c1 <= 16; ++c1 )
+		{
+			for( int c2 = 0; c0 + c1 + c2 <= 16; ++c2 )
+			{
+				int const c3 = 16 - c0 - c1 - c2;
+				
+				Precomp & entry = s_fourElement[slot];
+				++slot;
+				
+				entry.alpha2_sum = c0 + c1 * (4.0f/9.0f) + c2 * (1.0f/9.0f);
+				entry.beta2_sum = c3 + c2 * (4.0f/9.0f) + c1 * (1.0f/9.0f);
+				entry.alphabeta_sum = (c1 + c2) * (2.0f/9.0f);
+				entry.factor = 1.0f / (entry.alpha2_sum * entry.beta2_sum - entry.alphabeta_sum * entry.alphabeta_sum);
+			}
+		}
+	}
+}
+
+// Stores the per-channel weights that Compress3/Compress4 and bestError()
+// apply to the error. In the SIMD build the fourth lane is set to zero.
+void FastClusterFit::setMetric(float r, float g, float b)
+{
+#if SQUISH_USE_SIMD
+	Vec4 const metric(r, g, b, 0);
+#else
+	Vec3 const metric(r, g, b);
+#endif
+	m_metric = metric;
+}
+
+float FastClusterFit::bestError() const
+{
+#if SQUISH_USE_SIMD
+	Vec4 x = m_xxsum * m_metric;
+	Vec4 error = m_besterror + x.SplatX() + x.SplatY() + x.SplatZ();
+	return error.GetVec3().X();
+#else
+	return m_besterror + Dot(m_xxsum, m_metric);
+#endif
+
+}
+
+#if SQUISH_USE_SIMD
+
+// SIMD variant. Tries every split of the 16 sorted points into three
+// consecutive clusters of sizes c0, c1 and 16-c0-c1, solves for the two
+// endpoints with the matching s_threeElement entry, and writes the block
+// through WriteColourBlock3 when the lowest error found beats m_besterror.
+//
+// The running index i must visit the table in the order in which
+// doPrecomputation() filled it, so the loop nest mirrors that function.
+void FastClusterFit::Compress3( void* block )
+{
+	Vec4 const one = VEC4_CONST(1.0f);
+	Vec4 const zero = VEC4_CONST(0.0f);
+	Vec4 const half = VEC4_CONST(0.5f);
+	Vec4 const two = VEC4_CONST(2.0);
+	
+	// quantisation grid and the constants used to scale back from it
+	Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+	Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f );
+	 
+	// declare variables
+	Vec4 beststart = VEC4_CONST( 0.0f );
+	Vec4 bestend = VEC4_CONST( 0.0f );
+	Vec4 besterror = VEC4_CONST( FLT_MAX );
+
+	Vec4 x0 = zero;		// sum of the first c0 points
+	Vec4 x1;			// sum of the next c1 points
+	int b0 = 0, b1 = 0;
+	int i = 0;
+
+	// check all possible clusters for this total order
+	for( int c0 = 0; c0 <= 16; c0++)
+	{	
+		x1 = zero;
+		
+		for( int c1 = 0; c1 <= 16-c0; c1++)
+		{	
+			Vec4 const alpha2_sum = VEC4_CONST(s_threeElement[i].alpha2_sum);
+			Vec4 const beta2_sum = VEC4_CONST(s_threeElement[i].beta2_sum);
+			Vec4 const alphabeta_sum = VEC4_CONST(s_threeElement[i].alphabeta_sum);
+			Vec4 const factor = VEC4_CONST(s_threeElement[i].factor);
+			i++;
+			
+			Vec4 const alphax_sum = MultiplyAdd(half, x1, x0);
+			Vec4 const betax_sum = m_xsum - alphax_sum;
+			
+			Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+			Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+			
+			// clamp the output to [0, 1]
+			a = Min( one, Max( zero, a ) );
+			b = Min( one, Max( zero, b ) );
+			
+			// clamp to the grid
+			a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp;
+			b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp;
+			
+			// compute the error
+			Vec4 e1 = MultiplyAdd( a, alphax_sum, b*betax_sum );
+			Vec4 e2 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+			Vec4 e3 = MultiplyAdd( a*b*alphabeta_sum - e1, two, e2 );
+			
+			// apply the metric to the error term
+			Vec4 e4 = e3 * m_metric;
+			Vec4 error = e4.SplatX() + e4.SplatY() + e4.SplatZ();
+			
+			// keep the solution if it wins
+			if( CompareAnyLessThan( error, besterror ) )
+			{
+				besterror = error;
+				beststart = a;
+				bestend = b;
+				b0 = c0;
+				b1 = c1;
+			}
+			
+			// grow the middle cluster; on the last iteration c0+c1 is 16,
+			// which would read one element past m_unweighted
+			if( c0 + c1 < 16 )
+				x1 += m_unweighted[c0+c1];
+		}
+		
+		// grow the first cluster, again without reading m_unweighted[16]
+		if( c0 < 16 )
+			x0 += m_unweighted[c0];
+	}
+
+	// save the block if necessary
+	if( CompareAnyLessThan( besterror, m_besterror ) )
+	{
+		// compute indices from cluster sizes: the first b0 sorted points
+		// get index 0, the next b1 get index 2, the rest get index 1
+		u8 bestindices[16];
+		for( int k = 0; k < 16; ++k )
+		{
+			if( k < b0 )
+				bestindices[k] = 0;
+			else if( k < b0+b1 )
+				bestindices[k] = 2;
+			else
+				bestindices[k] = 1;
+		}
+		
+		// remap the indices from sorted order back to the original order
+		u8 ordered[16];
+		for( int k = 0; k < 16; ++k )
+			ordered[m_order[k]] = bestindices[k];
+		
+		// save the block
+		WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), ordered, block );
+		
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+// SIMD variant. Tries every split of the 16 sorted points into four
+// consecutive clusters of sizes c0, c1, c2 and 16-c0-c1-c2, solves for the
+// two endpoints with the matching s_fourElement entry, and writes the block
+// through WriteColourBlock4 when the lowest error found beats m_besterror.
+//
+// The running index i must visit the table in the order in which
+// doPrecomputation() filled it, so the loop nest mirrors that function.
+void FastClusterFit::Compress4( void* block )
+{
+	Vec4 const one = VEC4_CONST(1.0f);
+	Vec4 const zero = VEC4_CONST(0.0f);
+	Vec4 const half = VEC4_CONST(0.5f);
+	Vec4 const two = VEC4_CONST(2.0);
+	Vec4 const onethird = VEC4_CONST( 1.0f/3.0f );
+	Vec4 const twothirds = VEC4_CONST( 2.0f/3.0f );
+	
+	// quantisation grid and the constants used to scale back from it
+	Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+	Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f );
+
+	// declare variables
+	Vec4 beststart = VEC4_CONST( 0.0f );
+	Vec4 bestend = VEC4_CONST( 0.0f );
+	Vec4 besterror = VEC4_CONST( FLT_MAX );
+
+	Vec4 x0 = zero;		// sum of the first c0 points
+	int b0 = 0, b1 = 0, b2 = 0;
+	int i = 0;
+
+	// check all possible clusters for this total order
+	for( int c0 = 0; c0 <= 16; c0++)
+	{	
+		Vec4 x1 = zero;		// sum of the next c1 points
+		
+		for( int c1 = 0; c1 <= 16-c0; c1++)
+		{	
+			Vec4 x2 = zero;	// sum of the next c2 points
+			
+			for( int c2 = 0; c2 <= 16-c0-c1; c2++)
+			{
+				Vec4 const alpha2_sum = VEC4_CONST(s_fourElement[i].alpha2_sum);
+				Vec4 const beta2_sum = VEC4_CONST(s_fourElement[i].beta2_sum);
+				Vec4 const alphabeta_sum = VEC4_CONST(s_fourElement[i].alphabeta_sum);
+				Vec4 const factor = VEC4_CONST(s_fourElement[i].factor);
+				i++;
+				
+				Vec4 const alphax_sum = x0 + MultiplyAdd(x1, twothirds, x2 * onethird);
+				Vec4 const betax_sum = m_xsum - alphax_sum;
+				
+				Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+				Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+				
+				// clamp the output to [0, 1]
+				a = Min( one, Max( zero, a ) );
+				b = Min( one, Max( zero, b ) );
+				
+				// clamp to the grid
+				a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp;
+				b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp;
+				
+				// compute the error
+				Vec4 e1 = MultiplyAdd( a, alphax_sum, b*betax_sum );
+				Vec4 e2 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+				Vec4 e3 = MultiplyAdd( a*b*alphabeta_sum - e1, two, e2 );
+				
+				// apply the metric to the error term
+				Vec4 e4 = e3 * m_metric;
+				Vec4 error = e4.SplatX() + e4.SplatY() + e4.SplatZ();
+				
+				// keep the solution if it wins
+				if( CompareAnyLessThan( error, besterror ) )
+				{
+					besterror = error;
+					beststart = a;
+					bestend = b;
+					b0 = c0;
+					b1 = c1;
+					b2 = c2;
+				}
+				
+				// grow the third cluster; on the last iteration the index
+				// is 16, which would read one element past m_unweighted
+				if( c0 + c1 + c2 < 16 )
+					x2 += m_unweighted[c0+c1+c2];
+			}
+			
+			// grow the second cluster, without reading m_unweighted[16]
+			if( c0 + c1 < 16 )
+				x1 += m_unweighted[c0+c1];
+		}
+		
+		// grow the first cluster, without reading m_unweighted[16]
+		if( c0 < 16 )
+			x0 += m_unweighted[c0];
+	}
+
+	// save the block if necessary
+	if( CompareAnyLessThan( besterror, m_besterror ) )
+	{
+		// compute indices from cluster sizes: the first b0 sorted points
+		// get index 0, the next b1 index 2, the next b2 index 3, the rest 1
+		u8 bestindices[16];
+		for( int k = 0; k < 16; ++k )
+		{
+			if( k < b0 )
+				bestindices[k] = 0;
+			else if( k < b0+b1 )
+				bestindices[k] = 2;
+			else if( k < b0+b1+b2 )
+				bestindices[k] = 3;
+			else
+				bestindices[k] = 1;
+		}
+		
+		// remap the indices from sorted order back to the original order
+		u8 ordered[16];
+		for( int k = 0; k < 16; ++k )
+			ordered[m_order[k]] = bestindices[k];
+		
+		// save the block
+		WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), ordered, block );
+		
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+#else
+
+// Scalar variant. Tries every split of the 16 sorted points into three
+// consecutive clusters of sizes c0, c1 and 16-c0-c1, solves for the two
+// endpoints with the matching s_threeElement entry, and writes the block
+// through WriteColourBlock3 when the lowest error found beats m_besterror.
+//
+// The running index i must visit the table in the order in which
+// doPrecomputation() filled it, so the loop nest mirrors that function.
+void FastClusterFit::Compress3( void* block )
+{
+	// constants used inside the search loop
+	Vec3 const one( 1.0f );
+	Vec3 const zero( 0.0f );
+	Vec3 const half( 0.5f );
+	Vec3 const grid( 31.0f, 63.0f, 31.0f );
+	Vec3 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f );
+	
+	// declare variables
+	Vec3 beststart( 0.0f );
+	Vec3 bestend( 0.0f );
+	float besterror = FLT_MAX;
+
+	Vec3 x0(0.0f);	// sum of the first c0 points
+	Vec3 x1;		// sum of the next c1 points
+	int b0 = 0, b1 = 0;
+	int i = 0;
+
+	// check all possible clusters for this total order
+	for( int c0 = 0; c0 <= 16; c0++)
+	{	
+		x1 = Vec3(0);
+		
+		for( int c1 = 0; c1 <= 16-c0; c1++)
+		{	
+			float const alpha2_sum = s_threeElement[i].alpha2_sum;
+			float const beta2_sum = s_threeElement[i].beta2_sum;
+			float const alphabeta_sum = s_threeElement[i].alphabeta_sum;
+			float const factor = s_threeElement[i].factor;
+			i++;
+			
+			Vec3 const alphax_sum = x0 + x1 * 0.5f;
+			Vec3 const betax_sum = m_xsum - alphax_sum;
+			
+			Vec3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor;
+			Vec3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor;
+			
+			// clamp the output to [0, 1]
+			a = Min( one, Max( zero, a ) );
+			b = Min( one, Max( zero, b ) );
+			
+			// clamp to the grid
+			a = Floor( grid*a + half )*gridrcp;
+			b = Floor( grid*b + half )*gridrcp;
+			
+			// compute the error
+			Vec3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+			
+			// apply the metric to the error term
+			float error = Dot( e1, m_metric );
+			
+			// keep the solution if it wins
+			if( error < besterror )
+			{
+				besterror = error;
+				beststart = a;
+				bestend = b;
+				b0 = c0;
+				b1 = c1;
+			}
+			
+			// grow the middle cluster; on the last iteration c0+c1 is 16,
+			// which would read one element past m_unweighted
+			if( c0 + c1 < 16 )
+				x1 += m_unweighted[c0+c1];
+		}
+		
+		// grow the first cluster, again without reading m_unweighted[16]
+		if( c0 < 16 )
+			x0 += m_unweighted[c0];
+	}
+
+	// save the block if necessary
+	if( besterror < m_besterror )
+	{
+		// compute indices from cluster sizes: the first b0 sorted points
+		// get index 0, the next b1 get index 2, the rest get index 1
+		u8 bestindices[16];
+		for( int k = 0; k < 16; ++k )
+		{
+			if( k < b0 )
+				bestindices[k] = 0;
+			else if( k < b0+b1 )
+				bestindices[k] = 2;
+			else
+				bestindices[k] = 1;
+		}
+		
+		// remap the indices from sorted order back to the original order
+		u8 ordered[16];
+		for( int k = 0; k < 16; ++k )
+			ordered[m_order[k]] = bestindices[k];
+		
+		// save the block
+		WriteColourBlock3( beststart, bestend, ordered, block );
+		
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+// Scalar variant. Tries every split of the 16 sorted points into four
+// consecutive clusters of sizes c0, c1, c2 and 16-c0-c1-c2, solves for the
+// two endpoints with the matching s_fourElement entry, and writes the block
+// through WriteColourBlock4 when the lowest error found beats m_besterror.
+//
+// The running index i must visit the table in the order in which
+// doPrecomputation() filled it, so the loop nest mirrors that function.
+void FastClusterFit::Compress4( void* block )
+{
+	// constants used inside the search loop
+	Vec3 const one( 1.0f );
+	Vec3 const zero( 0.0f );
+	Vec3 const half( 0.5f );
+	Vec3 const grid( 31.0f, 63.0f, 31.0f );
+	Vec3 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f );
+	
+	// declare variables
+	Vec3 beststart( 0.0f );
+	Vec3 bestend( 0.0f );
+	float besterror = FLT_MAX;
+
+	Vec3 x0(0.0f);	// sum of the first c0 points
+	Vec3 x1;		// sum of the next c1 points
+	Vec3 x2;		// sum of the next c2 points
+	int b0 = 0, b1 = 0, b2 = 0;
+	int i = 0;
+
+	// check all possible clusters for this total order
+	for( int c0 = 0; c0 <= 16; c0++)
+	{	
+		x1 = Vec3(0.0f);
+		
+		for( int c1 = 0; c1 <= 16-c0; c1++)
+		{	
+			x2 = Vec3(0.0f);
+			
+			for( int c2 = 0; c2 <= 16-c0-c1; c2++)
+			{
+				float const alpha2_sum = s_fourElement[i].alpha2_sum;
+				float const beta2_sum = s_fourElement[i].beta2_sum;
+				float const alphabeta_sum = s_fourElement[i].alphabeta_sum;
+				float const factor = s_fourElement[i].factor;
+				i++;
+				
+				Vec3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
+				Vec3 const betax_sum = m_xsum - alphax_sum;
+				
+				Vec3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor;
+				Vec3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
+				
+				// clamp the output to [0, 1]
+				a = Min( one, Max( zero, a ) );
+				b = Min( one, Max( zero, b ) );
+				
+				// clamp to the grid
+				a = Floor( grid*a + half )*gridrcp;
+				b = Floor( grid*b + half )*gridrcp;
+				
+				// compute the error
+				Vec3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+				
+				// apply the metric to the error term
+				float error = Dot( e1, m_metric );
+				
+				// keep the solution if it wins
+				if( error < besterror )
+				{
+					besterror = error;
+					beststart = a;
+					bestend = b;
+					b0 = c0;
+					b1 = c1;
+					b2 = c2;
+				}
+				
+				// grow the third cluster; on the last iteration the index
+				// is 16, which would read one element past m_unweighted
+				if( c0 + c1 + c2 < 16 )
+					x2 += m_unweighted[c0+c1+c2];
+			}
+			
+			// grow the second cluster, without reading m_unweighted[16]
+			if( c0 + c1 < 16 )
+				x1 += m_unweighted[c0+c1];
+		}
+		
+		// grow the first cluster, without reading m_unweighted[16]
+		if( c0 < 16 )
+			x0 += m_unweighted[c0];
+	}
+
+	// save the block if necessary
+	if( besterror < m_besterror )
+	{
+		// compute indices from cluster sizes: the first b0 sorted points
+		// get index 0, the next b1 index 2, the next b2 index 3, the rest 1
+		u8 bestindices[16];
+		for( int k = 0; k < 16; ++k )
+		{
+			if( k < b0 )
+				bestindices[k] = 0;
+			else if( k < b0+b1 )
+				bestindices[k] = 2;
+			else if( k < b0+b1+b2 )
+				bestindices[k] = 3;
+			else
+				bestindices[k] = 1;
+		}
+		
+		// remap the indices from sorted order back to the original order
+		u8 ordered[16];
+		for( int k = 0; k < 16; ++k )
+			ordered[m_order[k]] = bestindices[k];
+		
+		// save the block
+		WriteColourBlock4( beststart, bestend, ordered, block );
+		
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+#endif
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/fastclusterfit.h
+++ b/src/nvimage/nvtt/squish/fastclusterfit.h
@ -0,0 +1,74 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+	Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_FASTCLUSTERFIT_H
+#define SQUISH_FASTCLUSTERFIT_H
+
+#include "squish.h"
+#include "maths.h"
+#include "simd.h"
+#include "colourfit.h"
+
+namespace squish {
+
+// Colour fitter that sorts the points of a colour set along an estimated
+// principal axis and searches the cluster partitions of that order using the
+// static tables filled by doPrecomputation().
+class FastClusterFit : public ColourFit
+{
+public:
+	// Sorts the points of the given colour set and caches their sums.
+	// The metric is not set by the constructor; see setMetric().
+	FastClusterFit( ColourSet const* colours, int flags );
+	
+	// Sets the per-channel weights applied to the error.
+	void setMetric(float r, float g, float b);
+	// Returns the best error found so far plus the metric-weighted sum of
+	// the squared points.
+	float bestError() const;
+
+	// Fills the static lookup tables read by Compress3 and Compress4.
+	// NOTE(review): presumably has to run once before any compression - confirm with callers.
+	static void doPrecomputation();
+
+	// The two compression entry points are declared public in this class.
+	virtual void Compress3( void* block );
+	virtual void Compress4( void* block );
+	
+private:
+
+	// NOTE(review): not assigned in the visible part of fastclusterfit.cpp.
+	Vec3 m_principle;
+
+	// m_unweighted holds the points in sorted order, m_metric the channel
+	// weights, m_xxsum / m_xsum the sums of the squared and plain points,
+	// and m_besterror the lowest error accepted so far.
+#if SQUISH_USE_SIMD
+	Vec4 m_unweighted[16];
+	Vec4 m_metric;
+	Vec4 m_xxsum;
+	Vec4 m_xsum;
+	Vec4 m_besterror;
+#else
+	Vec3 m_unweighted[16];
+	Vec3 m_metric;
+	Vec3 m_xxsum;
+	Vec3 m_xsum;
+	float m_besterror;
+#endif
+
+	// m_order[i] is the original index of the i-th point in sorted order.
+	int m_order[16];
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_FASTCLUSTERFIT_H
--- a/src/nvimage/nvtt/squish/maths.cpp
+++ b/src/nvimage/nvtt/squish/maths.cpp
@ -0,0 +1,252 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+/*! @file
+
+	The symmetric eigensystem solver algorithm is from 
+	http://www.geometrictools.com/Documentation/EigenSymmetric3x3.pdf
+*/
+
+#include "maths.h"
+#include <cfloat>
+
+namespace squish {
+
+// Computes the weighted covariance of n points about their weighted
+// centroid.
+//
+// The result holds the six distinct entries in the order xx, xy, xz, yy, yz,
+// zz. The sums are not divided by the total weight. If the total weight is
+// zero (including n == 0) the centroid is left at the origin, so the result
+// stays finite instead of becoming NaN through a division by zero.
+Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights )
+{
+	// compute the centroid
+	float total = 0.0f;
+	Vec3 centroid( 0.0f );
+	for( int i = 0; i < n; ++i )
+	{
+		total += weights[i];
+		centroid += weights[i]*points[i];
+	}
+	if( total != 0.0f )
+		centroid /= total;
+
+	// accumulate the covariance matrix
+	Sym3x3 covariance( 0.0f );
+	for( int i = 0; i < n; ++i )
+	{
+		Vec3 a = points[i] - centroid;
+		Vec3 b = weights[i]*a;
+		
+		covariance[0] += a.X()*b.X();
+		covariance[1] += a.X()*b.Y();
+		covariance[2] += a.X()*b.Z();
+		covariance[3] += a.Y()*b.Y();
+		covariance[4] += a.Y()*b.Z();
+		covariance[5] += a.Z()*b.Z();
+	}
+	
+	// return it
+	return covariance;
+}
+
+/*
+static Vec3 GetMultiplicity1Evector( Sym3x3 const& matrix, float evalue )
+{
+	// compute M
+	Sym3x3 m;
+	m[0] = matrix[0] - evalue;
+	m[1] = matrix[1];
+	m[2] = matrix[2];
+	m[3] = matrix[3] - evalue;
+	m[4] = matrix[4];
+	m[5] = matrix[5] - evalue;
+
+	// compute U
+	Sym3x3 u;
+	u[0] = m[3]*m[5] - m[4]*m[4];
+	u[1] = m[2]*m[4] - m[1]*m[5];
+	u[2] = m[1]*m[4] - m[2]*m[3];
+	u[3] = m[0]*m[5] - m[2]*m[2];
+	u[4] = m[1]*m[2] - m[4]*m[0];
+	u[5] = m[0]*m[3] - m[1]*m[1];
+
+	// find the largest component
+	float mc = std::fabs( u[0] );
+	int mi = 0;
+	for( int i = 1; i < 6; ++i )
+	{
+		float c = std::fabs( u[i] );
+		if( c > mc )
+		{
+			mc = c;
+			mi = i;
+		}
+	}
+
+	// pick the column with this component
+	switch( mi )
+	{
+	case 0:
+		return Vec3( u[0], u[1], u[2] );
+
+	case 1:
+	case 3:
+		return Vec3( u[1], u[3], u[4] );
+
+	default:
+		return Vec3( u[2], u[4], u[5] );
+	}
+}
+
+static Vec3 GetMultiplicity2Evector( Sym3x3 const& matrix, float evalue )
+{
+	// compute M
+	Sym3x3 m;
+	m[0] = matrix[0] - evalue;
+	m[1] = matrix[1];
+	m[2] = matrix[2];
+	m[3] = matrix[3] - evalue;
+	m[4] = matrix[4];
+	m[5] = matrix[5] - evalue;
+
+	// find the largest component
+	float mc = std::fabs( m[0] );
+	int mi = 0;
+	for( int i = 1; i < 6; ++i )
+	{
+		float c = std::fabs( m[i] );
+		if( c > mc )
+		{
+			mc = c;
+			mi = i;
+		}
+	}
+
+	// pick the first eigenvector based on this index
+	switch( mi )
+	{
+	case 0:
+	case 1:
+		return Vec3( -m[1], m[0], 0.0f );
+
+	case 2:
+		return Vec3( m[2], 0.0f, -m[0] );
+
+	case 3:
+	case 4:
+		return Vec3( 0.0f, -m[4], m[3] );
+
+	default:
+		return Vec3( 0.0f, -m[5], m[4] );
+	}
+}
+
+Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
+{
+	// compute the cubic coefficients
+	float c0 = matrix[0]*matrix[3]*matrix[5] 
+		+ 2.0f*matrix[1]*matrix[2]*matrix[4] 
+		- matrix[0]*matrix[4]*matrix[4] 
+		- matrix[3]*matrix[2]*matrix[2] 
+		- matrix[5]*matrix[1]*matrix[1];
+	float c1 = matrix[0]*matrix[3] + matrix[0]*matrix[5] + matrix[3]*matrix[5]
+		- matrix[1]*matrix[1] - matrix[2]*matrix[2] - matrix[4]*matrix[4];
+	float c2 = matrix[0] + matrix[3] + matrix[5];
+
+	// compute the quadratic coefficients
+	float a = c1 - ( 1.0f/3.0f )*c2*c2;
+	float b = ( -2.0f/27.0f )*c2*c2*c2 + ( 1.0f/3.0f )*c1*c2 - c0;
+
+	// compute the root count check
+	float Q = 0.25f*b*b + ( 1.0f/27.0f )*a*a*a;
+
+	// test the multiplicity
+	if( FLT_EPSILON < Q )
+	{
+		// only one root, which implies we have a multiple of the identity
+        return Vec3( 1.0f );
+	}
+	else if( Q < -FLT_EPSILON )
+	{
+		// three distinct roots
+		float theta = std::atan2( std::sqrt( -Q ), -0.5f*b );
+		float rho = std::sqrt( 0.25f*b*b - Q );
+
+		float rt = std::pow( rho, 1.0f/3.0f );
+		float ct = std::cos( theta/3.0f );
+		float st = std::sin( theta/3.0f );
+
+		float l1 = ( 1.0f/3.0f )*c2 + 2.0f*rt*ct;
+		float l2 = ( 1.0f/3.0f )*c2 - rt*( ct + ( float )sqrt( 3.0f )*st );
+		float l3 = ( 1.0f/3.0f )*c2 - rt*( ct - ( float )sqrt( 3.0f )*st );
+
+		// pick the larger
+		if( std::fabs( l2 ) > std::fabs( l1 ) )
+			l1 = l2;
+		if( std::fabs( l3 ) > std::fabs( l1 ) )
+			l1 = l3;
+
+		// get the eigenvector
+		return GetMultiplicity1Evector( matrix, l1 );
+	}
+	else // if( -FLT_EPSILON <= Q && Q <= FLT_EPSILON )
+	{
+		// two roots
+		float rt;
+		if( b < 0.0f )
+			rt = -std::pow( -0.5f*b, 1.0f/3.0f );
+		else
+			rt = std::pow( 0.5f*b, 1.0f/3.0f );
+		
+		float l1 = ( 1.0f/3.0f )*c2 + rt;		// repeated
+		float l2 = ( 1.0f/3.0f )*c2 - 2.0f*rt;
+		
+		// get the eigenvector
+		if( std::fabs( l1 ) > std::fabs( l2 ) )
+			return GetMultiplicity2Evector( matrix, l1 );
+		else
+			return GetMultiplicity1Evector( matrix, l2 );
+	}
+}
+*/
+
+// Estimates the principal axis of a symmetric 3x3 matrix with a fixed number
+// of power-iteration steps, starting from (1, 1, 1).
+//
+// After each multiplication the vector is rescaled by its largest component
+// magnitude, so the iterate stays bounded whatever the signs of its
+// components. The result is not unit length. A zero vector is returned when
+// a step maps the iterate to zero.
+Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
+{
+	const int NUM = 8;
+
+	Vec3 v(1, 1, 1);
+	for(int i = 0; i < NUM; i++) {
+		// v <- matrix * v, with the matrix stored as xx, xy, xz, yy, yz, zz
+		float x = v.X() * matrix[0] + v.Y() * matrix[1] + v.Z() * matrix[2];
+		float y = v.X() * matrix[1] + v.Y() * matrix[3] + v.Z() * matrix[4];
+		float z = v.X() * matrix[2] + v.Y() * matrix[4] + v.Z() * matrix[5];
+		
+		// scale by the largest magnitude rather than the largest signed
+		// value, which could be zero or tiny while another component is
+		// large and negative
+		float norm = std::max(std::max(std::fabs(x), std::fabs(y)), std::fabs(z));
+		if (norm == 0.0f) {
+			return Vec3(0.0f);
+		}
+		float iv = 1.0f / norm;
+		
+		v = Vec3(x*iv, y*iv, z*iv);
+	}
+
+	return v;
+}
+
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/maths.h
+++ b/src/nvimage/nvtt/squish/maths.h
@ -0,0 +1,233 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_MATHS_H
+#define SQUISH_MATHS_H
+
+#include <cmath>
+#include <algorithm>
+#include "config.h"
+
+namespace squish {
+
+// Small three-component float vector with component-wise arithmetic.
+class Vec3
+{
+public:
+	typedef Vec3 const& Arg;
+
+	// Leaves the components uninitialised.
+	Vec3()
+	{
+	}
+
+	// Sets all three components to the same value.
+	explicit Vec3( float a ) : m_x( a ), m_y( a ), m_z( a )
+	{
+	}
+
+	Vec3( float a, float b, float c ) : m_x( a ), m_y( b ), m_z( c )
+	{
+	}
+	
+	float X() const { return m_x; }
+	float Y() const { return m_y; }
+	float Z() const { return m_z; }
+	
+	Vec3 operator-() const { return Vec3( -m_x, -m_y, -m_z ); }
+	
+	// The compound assignments are expressed through the binary operators
+	// defined below.
+	Vec3& operator+=( Arg v ) { return *this = *this + v; }
+	Vec3& operator-=( Arg v ) { return *this = *this - v; }
+	Vec3& operator*=( Arg v ) { return *this = *this * v; }
+	Vec3& operator*=( float s ) { return *this = *this * s; }
+	Vec3& operator/=( Arg v ) { return *this = *this / v; }
+	Vec3& operator/=( float s ) { return *this = *this / s; }
+	
+	friend Vec3 operator+( Arg left, Arg right )
+	{
+		return Vec3( left.m_x + right.m_x, left.m_y + right.m_y, left.m_z + right.m_z );
+	}
+	
+	friend Vec3 operator-( Arg left, Arg right )
+	{
+		return Vec3( left.m_x - right.m_x, left.m_y - right.m_y, left.m_z - right.m_z );
+	}
+	
+	// component-wise product
+	friend Vec3 operator*( Arg left, Arg right )
+	{
+		return Vec3( left.m_x*right.m_x, left.m_y*right.m_y, left.m_z*right.m_z );
+	}
+	
+	friend Vec3 operator*( Arg left, float right )
+	{
+		return Vec3( left.m_x*right, left.m_y*right, left.m_z*right );
+	}
+	
+	friend Vec3 operator*( float left, Arg right )
+	{
+		return Vec3( right.m_x*left, right.m_y*left, right.m_z*left );
+	}
+	
+	// component-wise quotient
+	friend Vec3 operator/( Arg left, Arg right )
+	{
+		return Vec3( left.m_x/right.m_x, left.m_y/right.m_y, left.m_z/right.m_z );
+	}
+	
+	// division by a scalar multiplies by its reciprocal
+	friend Vec3 operator/( Arg left, float right )
+	{
+		float const t = 1.0f/right;
+		return Vec3( left.m_x*t, left.m_y*t, left.m_z*t );
+	}
+	
+	friend float Dot( Arg left, Arg right )
+	{
+		return left.m_x*right.m_x + left.m_y*right.m_y + left.m_z*right.m_z;
+	}
+	
+	// component-wise minimum
+	friend Vec3 Min( Arg left, Arg right )
+	{
+		float const x = std::min( left.m_x, right.m_x );
+		float const y = std::min( left.m_y, right.m_y );
+		float const z = std::min( left.m_z, right.m_z );
+		return Vec3( x, y, z );
+	}
+
+	// component-wise maximum
+	friend Vec3 Max( Arg left, Arg right )
+	{
+		float const x = std::max( left.m_x, right.m_x );
+		float const y = std::max( left.m_y, right.m_y );
+		float const z = std::max( left.m_z, right.m_z );
+		return Vec3( x, y, z );
+	}
+
+	// component-wise floor
+	friend Vec3 Floor( Arg v )
+	{
+		float const x = std::floor( v.m_x );
+		float const y = std::floor( v.m_y );
+		float const z = std::floor( v.m_z );
+		return Vec3( x, y, z );
+	}
+
+private:
+	float m_x;
+	float m_y;
+	float m_z;
+};
+
+// Returns the squared length of v, i.e. the dot product of v with itself.
+inline float LengthSquared( Vec3::Arg v )
+{
+	float const lengthSquared = Dot( v, v );
+	return lengthSquared;
+}
+
+// Storage for the six distinct entries of a symmetric 3x3 matrix, accessed
+// by index 0..5. The index is not range-checked.
+class Sym3x3
+{
+public:
+	// Leaves the entries uninitialised.
+	Sym3x3()
+	{
+	}
+
+	// Sets all six entries to the same value.
+	Sym3x3( float a )
+	{
+		float* const end = m_x + 6;
+		for( float* entry = m_x; entry != end; ++entry )
+			*entry = a;
+	}
+
+	// read access
+	float operator[]( int index ) const { return m_x[index]; }
+
+	// write access
+	float& operator[]( int index ) { return m_x[index]; }
+
+private:
+	float m_x[6];
+};
+
+// Returns the weighted covariance of n points about their weighted centroid,
+// stored as xx, xy, xz, yy, yz, zz; the sums are not divided by the total weight.
+Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights );
+// Estimates the principal axis of a symmetric matrix by repeated
+// multiplication starting from (1, 1, 1); the result is not unit length.
+Vec3 ComputePrincipleComponent( Sym3x3 const& matrix );
+
+} // namespace squish
+
+#endif // ndef SQUISH_MATHS_H
--- a/src/nvimage/nvtt/squish/rangefit.cpp
+++ b/src/nvimage/nvtt/squish/rangefit.cpp
@ -0,0 +1,202 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "rangefit.h"
+#include "colourset.h"
+#include "colourblock.h"
+#include <cfloat>
+
+namespace squish {
+
+// Prepares a range fit for the given colour set: selects the error metric,
+// then picks as endpoints the two points with the smallest and largest
+// projection onto the principal axis of the weighted covariance, clamped to
+// [0, 1] and rounded onto a 31/63/31-step grid.
+RangeFit::RangeFit( ColourSet const* colours, int flags ) 
+  : ColourFit( colours, flags )
+{
+	// the perceptual flag selects per-channel weights; otherwise every channel weighs 1
+	m_metric = ( ( m_flags & kColourMetricPerceptual ) != 0 )
+		? Vec3( 0.2126f, 0.7152f, 0.0722f )
+		: Vec3( 1.0f );
+
+	// nothing has been scored yet
+	m_besterror = FLT_MAX;
+
+	int const numPoints = m_colours->GetCount();
+	Vec3 const* points = m_colours->GetPoints();
+	float const* pointWeights = m_colours->GetWeights();
+
+	// principal axis of the weighted covariance of the points
+	Sym3x3 const covariance = ComputeWeightedCovariance( numPoints, points, pointWeights );
+	Vec3 const axis = ComputePrincipleComponent( covariance );
+
+	// endpoints stay at Vec3( 0.0f ) when the set is empty
+	Vec3 lo( 0.0f );
+	Vec3 hi( 0.0f );
+	if( numPoints > 0 )
+	{
+		// seed both extremes with the first point, then scan the rest
+		lo = points[0];
+		hi = points[0];
+		float loProj = Dot( points[0], axis );
+		float hiProj = loProj;
+		for( int i = 1; i < numPoints; ++i )
+		{
+			float const proj = Dot( points[i], axis );
+			if( proj < loProj )
+			{
+				loProj = proj;
+				lo = points[i];
+			}
+			else if( proj > hiProj )
+			{
+				hiProj = proj;
+				hi = points[i];
+			}
+		}
+	}
+
+	// keep the endpoints inside [0, 1] on every channel
+	Vec3 const zero( 0.0f );
+	Vec3 const one( 1.0f );
+	lo = Min( one, Max( zero, lo ) );
+	hi = Min( one, Max( zero, hi ) );
+
+	// round each channel to the nearest grid step (31/63/31 steps) and store
+	Vec3 const half( 0.5f );
+	Vec3 const grid( 31.0f, 63.0f, 31.0f );
+	Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
+	m_start = Floor( grid*lo + half )*gridrcp;
+	m_end = Floor( grid*hi + half )*gridrcp;
+}
+
+// Scores the three-entry codebook (both endpoints plus their midpoint) against
+// every point and writes the block only if the summed error beats the best
+// error recorded so far.
+void RangeFit::Compress3( void* block )
+{
+	int const numPoints = m_colours->GetCount();
+	Vec3 const* points = m_colours->GetPoints();
+
+	// palette: start, end, and the halfway blend
+	Vec3 const palette[3] = { m_start, m_end, 0.5f*m_start + 0.5f*m_end };
+
+	// assign each point to its nearest palette entry under the metric
+	u8 nearest[16];
+	float total = 0.0f;
+	for( int i = 0; i < numPoints; ++i )
+	{
+		int bestCode = 0;
+		float bestDist = FLT_MAX;
+		for( int code = 0; code < 3; ++code )
+		{
+			float const d = LengthSquared( m_metric*( points[i] - palette[code] ) );
+			if( d < bestDist )
+			{
+				bestDist = d;
+				bestCode = code;
+			}
+		}
+
+		nearest[i] = ( u8 )bestCode;
+		total += bestDist;
+	}
+
+	// leave the output alone unless this scheme is strictly better
+	if( !( total < m_besterror ) )
+		return;
+
+	// translate the per-point indices through the colour set and emit the block
+	u8 remapped[16];
+	m_colours->RemapIndices( nearest, remapped );
+	WriteColourBlock3( m_start, m_end, remapped, block );
+
+	m_besterror = total;
+}
+
+// Scores the four-entry codebook (both endpoints plus the 1/3 and 2/3 blends)
+// against every point and writes the block only if the summed error beats the
+// best error recorded so far.
+void RangeFit::Compress4( void* block )
+{
+	int const numPoints = m_colours->GetCount();
+	Vec3 const* points = m_colours->GetPoints();
+
+	// palette: start, end, then the two interior blends
+	Vec3 const palette[4] = {
+		m_start,
+		m_end,
+		( 2.0f/3.0f )*m_start + ( 1.0f/3.0f )*m_end,
+		( 1.0f/3.0f )*m_start + ( 2.0f/3.0f )*m_end
+	};
+
+	// assign each point to its nearest palette entry under the metric
+	u8 nearest[16];
+	float total = 0.0f;
+	for( int i = 0; i < numPoints; ++i )
+	{
+		int bestCode = 0;
+		float bestDist = FLT_MAX;
+		for( int code = 0; code < 4; ++code )
+		{
+			float const d = LengthSquared( m_metric*( points[i] - palette[code] ) );
+			if( d < bestDist )
+			{
+				bestDist = d;
+				bestCode = code;
+			}
+		}
+
+		nearest[i] = ( u8 )bestCode;
+		total += bestDist;
+	}
+
+	// leave the output alone unless this scheme is strictly better
+	if( !( total < m_besterror ) )
+		return;
+
+	// translate the per-point indices through the colour set and emit the block
+	u8 remapped[16];
+	m_colours->RemapIndices( nearest, remapped );
+	WriteColourBlock4( m_start, m_end, remapped, block );
+
+	m_besterror = total;
+}
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/rangefit.h
+++ b/src/nvimage/nvtt/squish/rangefit.h
@ -0,0 +1,54 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_RANGEFIT_H
+#define SQUISH_RANGEFIT_H
+
+#include <squish.h>
+#include "colourfit.h"
+#include "maths.h"
+
+namespace squish {
+
+class ColourSet;
+
+//! Colour fit that takes its two endpoints from the extremes of the colour set
+//! and scores the 3- and 4-entry codebooks built from them.
+class RangeFit : public ColourFit
+{
+public:
+	//! Computes the metric and the two endpoints for the given colour set.
+	RangeFit( ColourSet const* colours, int flags );
+	
+private:
+	//! Scores the 3-entry codebook and writes the block if it improves on m_besterror.
+	virtual void Compress3( void* block );
+	//! Scores the 4-entry codebook and writes the block if it improves on m_besterror.
+	virtual void Compress4( void* block );
+	
+	Vec3 m_metric;		//!< per-channel weights applied to colour differences
+	Vec3 m_start;		//!< first endpoint colour
+	Vec3 m_end;		//!< second endpoint colour
+	float m_besterror;	//!< lowest summed error of any block written so far
+};
+
+} // squish
+
+#endif // ndef SQUISH_RANGEFIT_H
--- a/src/nvimage/nvtt/squish/simd.h
+++ b/src/nvimage/nvtt/squish/simd.h
@ -0,0 +1,39 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_SIMD_H
+#define SQUISH_SIMD_H
+
+#include "maths.h"
+
+#if SQUISH_USE_ALTIVEC
+#include "simd_ve.h"
+#endif
+
+#if SQUISH_USE_SSE
+#include "simd_sse.h"
+#endif
+
+#endif // ndef SQUISH_SIMD_H
--- a/src/nvimage/nvtt/squish/simd_sse.h
+++ b/src/nvimage/nvtt/squish/simd_sse.h
@ -0,0 +1,192 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_SIMD_SSE_H
+#define SQUISH_SIMD_SSE_H
+
+#include <xmmintrin.h>
+#if ( SQUISH_USE_SSE > 1 )
+#include <emmintrin.h>
+#endif
+#include <cassert>
+
+#define SQUISH_SSE_SPLAT( a )										\
+	( ( a ) | ( ( a ) << 2 ) | ( ( a ) << 4 ) | ( ( a ) << 6 ) )
+
+namespace squish {
+
+#define VEC4_CONST( X ) Vec4( _mm_set1_ps( X ) )
+
+//! Four-float vector wrapping a single SSE __m128 register value.
+class Vec4
+{
+public:
+	//! Arguments are taken by const reference in this implementation.
+	typedef Vec4 const& Arg;
+
+	//! Leaves the register contents uninitialised.
+	Vec4() {}
+		
+	//! Wraps an existing __m128 value.
+	explicit Vec4( __m128 v ) : m_v( v ) {}
+	
+	Vec4( Vec4 const& arg ) : m_v( arg.m_v ) {}
+	
+	Vec4& operator=( Vec4 const& arg )
+	{
+		m_v = arg.m_v;
+		return *this;
+	}
+	
+	//! Builds the vector from four scalars; _mm_setr_ps places x in the lowest lane.
+	Vec4( float x, float y, float z, float w )
+	{
+		m_v = _mm_setr_ps( x, y, z, w );
+	}
+	
+	//! Returns the first three lanes as a Vec3 (the fourth lane is dropped).
+	Vec3 GetVec3() const
+	{
+		// _mm_store_ps requires a 16-byte aligned destination
+#ifdef __GNUC__
+		__attribute__ ((__aligned__ (16))) float c[4];
+#else
+		__declspec(align(16)) float c[4];
+#endif
+		_mm_store_ps( c, m_v );
+		return Vec3( c[0], c[1], c[2] );
+	}
+	
+	// Each splat broadcasts one lane into all four lanes of the result.
+	Vec4 SplatX() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 0 ) ) ); }
+	Vec4 SplatY() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 1 ) ) ); }
+	Vec4 SplatZ() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 2 ) ) ); }
+	Vec4 SplatW() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 3 ) ) ); }
+
+	//! Lane-wise in-place addition.
+	Vec4& operator+=( Arg v )
+	{
+		m_v = _mm_add_ps( m_v, v.m_v );
+		return *this;
+	}
+	
+	//! Lane-wise in-place subtraction.
+	Vec4& operator-=( Arg v )
+	{
+		m_v = _mm_sub_ps( m_v, v.m_v );
+		return *this;
+	}
+	
+	//! Lane-wise in-place multiplication.
+	Vec4& operator*=( Arg v )
+	{
+		m_v = _mm_mul_ps( m_v, v.m_v );
+		return *this;
+	}
+	
+	//! Lane-wise sum.
+	friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right  )
+	{
+		return Vec4( _mm_add_ps( left.m_v, right.m_v ) );
+	}
+	
+	//! Lane-wise difference.
+	friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right  )
+	{
+		return Vec4( _mm_sub_ps( left.m_v, right.m_v ) );
+	}
+	
+	//! Lane-wise product.
+	friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right  )
+	{
+		return Vec4( _mm_mul_ps( left.m_v, right.m_v ) );
+	}
+	
+	//! Returns a*b + c
+	friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+	{
+		return Vec4( _mm_add_ps( _mm_mul_ps( a.m_v, b.m_v ), c.m_v ) );
+	}
+	
+	//! Returns -( a*b - c ), i.e. c - a*b
+	friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+	{
+		return Vec4( _mm_sub_ps( c.m_v, _mm_mul_ps( a.m_v, b.m_v ) ) );
+	}
+	
+	//! Lane-wise reciprocal: hardware estimate refined by one Newton-Raphson step.
+	friend Vec4 Reciprocal( Vec4::Arg v )
+	{
+		// get the reciprocal estimate
+		__m128 estimate = _mm_rcp_ps( v.m_v );
+
+		// one round of Newton-Raphson refinement: e + e*( 1 - e*v )
+		__m128 diff = _mm_sub_ps( _mm_set1_ps( 1.0f ), _mm_mul_ps( estimate, v.m_v ) );
+		return Vec4( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) );
+	}
+	
+	//! Lane-wise minimum.
+	friend Vec4 Min( Vec4::Arg left, Vec4::Arg right )
+	{
+		return Vec4( _mm_min_ps( left.m_v, right.m_v ) );
+	}
+	
+	//! Lane-wise maximum.
+	friend Vec4 Max( Vec4::Arg left, Vec4::Arg right )
+	{
+		return Vec4( _mm_max_ps( left.m_v, right.m_v ) );
+	}
+	
+	//! Rounds each lane towards zero by converting to a 32-bit integer and back.
+	friend Vec4 Truncate( Vec4::Arg v )
+	{
+#if ( SQUISH_USE_SSE == 1 )
+		// SSE1 only: go through MMX registers, two lanes at a time
+		// convert to ints
+		__m128 input = v.m_v;
+		__m64 lo = _mm_cvttps_pi32( input );
+		__m64 hi = _mm_cvttps_pi32( _mm_movehl_ps( input, input ) );
+
+		// convert to floats
+		// (the upper pair is converted first and moved into the high lanes,
+		// then the lower pair is converted into the low lanes)
+		__m128 part = _mm_movelh_ps( input, _mm_cvtpi32_ps( input, hi ) );
+		__m128 truncated = _mm_cvtpi32_ps( part, lo );
+		
+		// clear out the MMX multimedia state to allow FP calls later
+		_mm_empty(); 
+		return Vec4( truncated );
+#else
+		// use SSE2 instructions
+		return Vec4( _mm_cvtepi32_ps( _mm_cvttps_epi32( v.m_v ) ) );
+#endif
+	}
+	
+	//! Lane-wise equality test; each result lane is the comparison mask from _mm_cmpeq_ps.
+	friend Vec4 CompareEqual( Vec4::Arg left, Vec4::Arg right )
+	{
+		return Vec4( _mm_cmpeq_ps( left.m_v, right.m_v ) );
+	}
+	
+	//! Bitwise select: takes bits of on where bits is set and bits of off where it is clear.
+	friend Vec4 Select( Vec4::Arg off, Vec4::Arg on, Vec4::Arg bits )
+	{
+        __m128 a = _mm_andnot_ps( bits.m_v, off.m_v );
+        __m128 b = _mm_and_ps( bits.m_v, on.m_v );
+
+        return Vec4( _mm_or_ps( a, b ) );
+	}
+	
+	//! True if left < right holds in at least one lane.
+	friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right ) 
+	{
+		__m128 bits = _mm_cmplt_ps( left.m_v, right.m_v );
+		int value = _mm_movemask_ps( bits );
+		return value != 0;
+	}
+	
+private:
+	__m128 m_v;
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_SIMD_SSE_H
--- a/src/nvimage/nvtt/squish/simd_ve.h
+++ b/src/nvimage/nvtt/squish/simd_ve.h
@ -0,0 +1,166 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_SIMD_VE_H
+#define SQUISH_SIMD_VE_H
+
+#include <altivec.h>
+#undef bool
+
+namespace squish {
+
+#define VEC4_CONST( X ) Vec4( ( vector float )( X ) )
+
+//! Four-float vector wrapping a single AltiVec `vector float` value.
+class Vec4
+{
+public:
+	//! Arguments are passed by value in this implementation.
+	typedef Vec4 Arg;
+
+	//! Leaves the vector contents uninitialised.
+	Vec4() {}
+		
+	//! Wraps an existing vector float value.
+	explicit Vec4( vector float v ) : m_v( v ) {}
+	
+	Vec4( Vec4 const& arg ) : m_v( arg.m_v ) {}
+	
+	Vec4& operator=( Vec4 const& arg )
+	{
+		m_v = arg.m_v;
+		return *this;
+	}
+	
+	//! Builds the vector from four scalars by writing them through a union.
+	Vec4( float x, float y, float z, float w )
+	{
+		union { vector float v; float c[4]; } u;
+		u.c[0] = x;
+		u.c[1] = y;
+		u.c[2] = z;
+		u.c[3] = w;
+		m_v = u.v;
+	}
+	
+	//! Returns the first three elements as a Vec3 (the fourth is dropped).
+	Vec3 GetVec3() const
+	{
+		union { vector float v; float c[4]; } u;
+		u.v = m_v;
+		return Vec3( u.c[0], u.c[1], u.c[2] );
+	}
+	
+	// Each splat broadcasts one element into all four elements of the result.
+	Vec4 SplatX() const { return Vec4( vec_splat( m_v, 0 ) ); }
+	Vec4 SplatY() const { return Vec4( vec_splat( m_v, 1 ) ); }
+	Vec4 SplatZ() const { return Vec4( vec_splat( m_v, 2 ) ); }
+	Vec4 SplatW() const { return Vec4( vec_splat( m_v, 3 ) ); }
+
+	//! Element-wise in-place addition.
+	Vec4& operator+=( Arg v )
+	{
+		m_v = vec_add( m_v, v.m_v );
+		return *this;
+	}
+	
+	//! Element-wise in-place subtraction.
+	Vec4& operator-=( Arg v )
+	{
+		m_v = vec_sub( m_v, v.m_v );
+		return *this;
+	}
+	
+	//! Element-wise in-place multiplication.
+	Vec4& operator*=( Arg v )
+	{
+		// multiply via multiply-add; the -0.0f addend leaves the product unchanged
+		m_v = vec_madd( m_v, v.m_v, ( vector float )( -0.0f ) );
+		return *this;
+	}
+	
+	//! Element-wise sum.
+	friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right  )
+	{
+		return Vec4( vec_add( left.m_v, right.m_v ) );
+	}
+	
+	//! Element-wise difference.
+	friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right  )
+	{
+		return Vec4( vec_sub( left.m_v, right.m_v ) );
+	}
+	
+	//! Element-wise product (multiply-add with a -0.0f addend).
+	friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right  )
+	{
+		return Vec4( vec_madd( left.m_v, right.m_v, ( vector float )( -0.0f ) ) );
+	}
+	
+	//! Returns a*b + c
+	friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+	{
+		return Vec4( vec_madd( a.m_v, b.m_v, c.m_v ) );
+	}
+	
+	//! Returns -( a*b - c ), i.e. c - a*b
+	friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+	{
+		return Vec4( vec_nmsub( a.m_v, b.m_v, c.m_v ) );
+	}
+	
+	//! Element-wise reciprocal: hardware estimate refined by one Newton-Raphson step.
+	friend Vec4 Reciprocal( Vec4::Arg v )
+	{
+		// get the reciprocal estimate
+		vector float estimate = vec_re( v.m_v );
+		
+		// one round of Newton-Raphson refinement: e + e*( 1 - e*v )
+		vector float diff = vec_nmsub( estimate, v.m_v, ( vector float )( 1.0f ) );
+		return Vec4( vec_madd( diff, estimate, estimate ) );
+	}
+	
+	//! Element-wise minimum.
+	friend Vec4 Min( Vec4::Arg left, Vec4::Arg right )
+	{
+		return Vec4( vec_min( left.m_v, right.m_v ) );
+	}
+	
+	//! Element-wise maximum.
+	friend Vec4 Max( Vec4::Arg left, Vec4::Arg right )
+	{
+		return Vec4( vec_max( left.m_v, right.m_v ) );
+	}
+	
+	//! Rounds each element towards zero using vec_trunc.
+	friend Vec4 Truncate( Vec4::Arg v )
+	{
+		return Vec4( vec_trunc( v.m_v ) );
+	}
+	
+	//! Element-wise equality test; the vec_cmpeq mask is returned reinterpreted as floats.
+	friend Vec4 CompareEqual( Vec4::Arg left, Vec4::Arg right )
+	{
+		return Vec4( ( vector float )vec_cmpeq( left.m_v, right.m_v ) );
+	}
+	
+	//! Bitwise select: takes bits of on where bits is set and bits of off where it is clear.
+	friend Vec4 Select( Vec4::Arg off, Vec4::Arg on, Vec4::Arg bits )
+	{
+		return Vec4( vec_sel( off.m_v, on.m_v, ( vector unsigned int )bits.m_v ) );
+	}
+	
+	//! True if left < right holds for at least one element.
+	friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right ) 
+	{
+		return vec_any_lt( left.m_v, right.m_v ) != 0;
+	}
+	
+private:
+	vector float m_v;
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_SIMD_VE_H
--- a/src/nvimage/nvtt/squish/singlechannelfit.cpp
+++ b/src/nvimage/nvtt/squish/singlechannelfit.cpp
@ -0,0 +1,144 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+	Copyright (c) 2006 Ignacio Castano                      castanyo@yahoo.es
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "singlechannelfit.h"
+#include "colourset.h"
+#include "colourblock.h"
+#include <cfloat>
+
+namespace squish {
+
+// Quantises the Y() component of every point to an 8-bit grey level and
+// derives the window of 6-bit endpoint values that Compress4 searches.
+SingleChannelFit::SingleChannelFit( ColourSet const* colours, int const flags ) 
+  : ColourFit( colours, flags )
+{
+	// cache some values
+	int const count = m_colours->GetCount();
+	Vec3 const* values = m_colours->GetPoints();
+	
+	// Find bounds of the search space (in 6-bit units).
+	m_g_min = 63;
+	m_g_max = 0;
+	
+	// The index is a plain int: count is signed, and this avoids depending on
+	// a non-standard 'uint' typedef being in scope.
+	for(int i = 0; i < count; i++) {
+		
+		int grey = int(values[i].Y() * 255.0f);	// @@ rounding? (currently truncates)
+		grey = std::min(grey, 255);	// clamp to [0, 255]
+		grey = std::max(grey, 0);
+		m_greys[i] = u8(grey);
+		
+		// track the extremes at 6-bit precision
+		m_g_min = std::min(m_g_min, grey >> 2);
+		m_g_max = std::max(m_g_max, grey >> 2);
+	}
+	
+	// Widen the window by its own width on each side, clamped to [0, 63].
+	int const g_pad = m_g_max - m_g_min + 1;
+
+	m_g_min = std::max(0, m_g_min - g_pad);
+	m_g_max = std::min(63, m_g_max + g_pad);
+}
+
+// The three-colour mode is not implemented for this fit: the output block is
+// left untouched.
+void SingleChannelFit::Compress3( void* /*block*/ )
+{
+}
+
+// Exhaustively searches every endpoint pair g0 > g1 inside [m_g_min, m_g_max]
+// for the 4-entry grey palette with the lowest weighted absolute error, then
+// writes the block using the winning endpoints.
+void SingleChannelFit::Compress4( void* block )
+{
+	int const count = m_colours->GetCount();
+	float const* weights = m_colours->GetWeights();
+	
+	// Start from a defined pair so the code below never reads indeterminate
+	// values when the search accepts nothing (empty search window, or an
+	// error that never compares less than FLT_MAX).
+	int best_g0 = 0;
+	int best_g1 = 0;
+	float best_error = FLT_MAX;
+	
+	// Brute force approach, try all the possible endpoints with g0 > g1.
+	for(int g0 = m_g_min+1; g0 <= m_g_max; g0++) {
+		for(int g1 = m_g_min; g1 < g0; g1++) {
+			
+			// Compute palette: expand the 6-bit endpoints to 8 bits by
+			// replicating the top bits, then interpolate at 1/3 and 2/3.
+			const int c0 = (g0 << 2) | (g0 >> 4);
+			const int c1 = (g1 << 2) | (g1 >> 4);
+			const int c2 = (2 * c0 + c1) / 3;
+			const int c3 = (2 * c1 + c0) / 3;
+			
+			// Evaluate palette error.
+			float error = 0;
+			for(int i = 0; i < count; i++) {
+				const int grey = m_greys[i];
+				
+				int min_dist = abs(c0 - grey);	// Use absolute distance, not squared.
+				min_dist = std::min(min_dist, abs(c1 - grey));
+				min_dist = std::min(min_dist, abs(c2 - grey));
+				min_dist = std::min(min_dist, abs(c3 - grey));
+				
+				error += min_dist * weights[i];
+			}
+			
+			if(error < best_error) {
+				best_error = error;
+				best_g0 = g0;
+				best_g1 = g1;
+			}
+		}
+	}
+	
+	// Compute best palette.
+	const int best_c0 = (best_g0 << 2) | (best_g0 >> 4);
+	const int best_c1 = (best_g1 << 2) | (best_g1 >> 4);
+	const int best_c2 = (2 * best_c0 + best_c1) / 3;
+	const int best_c3 = (2 * best_c1 + best_c0) / 3;
+	
+	// Compute best indices; ties keep the lowest palette index.
+	u8 closest[16];
+	for(int i = 0; i < count; i++) {
+		const int grey = m_greys[i];
+		
+		int dist = abs(best_c0 - grey);
+		int min_dist = dist;
+		int min_i = 0;
+		
+		dist = abs(best_c1 - grey);
+		if( dist < min_dist ) { min_dist = dist; min_i = 1; }
+		
+		dist = abs(best_c2 - grey);
+		if( dist < min_dist ) { min_dist = dist; min_i = 2; }
+		
+		dist = abs(best_c3 - grey);
+		if( dist < min_dist ) { min_dist = dist; min_i = 3; }
+		
+		closest[i] = ( u8 )min_i;
+	}
+	
+	// remap the indices
+	u8 indices[16];
+	m_colours->RemapIndices( closest, indices );
+	
+	// Output block: the 6-bit endpoints are shifted left by 5 bits before
+	// being handed to WriteColourBlock.
+	WriteColourBlock(best_g0 << 5, best_g1 << 5, indices, block);
+}
+
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/singlechannelfit.h
+++ b/src/nvimage/nvtt/squish/singlechannelfit.h
@ -0,0 +1,53 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+	Copyright (c) 2006 Ignacio Castano                      castanyo@yahoo.es
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_SINGLECHANNELFIT_H
+#define SQUISH_SINGLECHANNELFIT_H
+
+#include <squish.h>
+#include "maths.h"
+#include "colourfit.h"
+
+namespace squish {
+
+//! Colour fit that only considers one channel (the Y() component of each point)
+//! and brute-forces the best pair of 6-bit endpoints for it.
+class SingleChannelFit : public ColourFit
+{
+public:
+	//! Quantises the points to grey levels and computes the endpoint search window.
+	SingleChannelFit( ColourSet const* colours, int flags );
+	
+private:
+	//! Intentionally does nothing.
+	virtual void Compress3( void* block );
+	//! Searches all endpoint pairs in the window and writes the best block.
+	virtual void Compress4( void* block );
+
+private:
+	u8 m_greys[16];	//!< 8-bit grey level of each point in the colour set
+	int m_g_min;	//!< lowest 6-bit endpoint value to try
+	int m_g_max;	//!< highest 6-bit endpoint value to try
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_SINGLECHANNELFIT_H
--- a/src/nvimage/nvtt/squish/singlecolourfit.cpp
+++ b/src/nvimage/nvtt/squish/singlecolourfit.cpp
@ -0,0 +1,172 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "singlecolourfit.h"
+#include "colourset.h"
+#include "colourblock.h"
+
+namespace squish {
+
+//! One precomputed endpoint pair for a single channel value. The member order
+//! must match the table data pulled in from singlecolourlookup.inl.
+struct SourceBlock
+{
+	u8 start;	//!< quantised start endpoint (divided by 31 or 63 when used)
+	u8 end;		//!< quantised end endpoint (divided by 31 or 63 when used)
+	u8 error;	//!< error of this choice; squared and summed across channels
+};
+
+//! Lookup entry for one 8-bit channel value: one SourceBlock per codebook index.
+struct SingleColourLookup
+{
+	SourceBlock sources[4];	//!< indexed by codebook index (0..2 or 0..3)
+};
+
+#include "singlecolourlookup.inl"
+
+// Converts a float to an int by adding 0.5 and truncating, then clamps the
+// result: anything below zero becomes 0, anything above limit becomes limit.
+static int FloatToInt( float a, int limit )
+{
+	// the cast rounds towards zero, so the 0.5 bias gives round-to-nearest
+	// for non-negative inputs
+	int const rounded = ( int )( a + 0.5f );
+
+	if( rounded < 0 )
+		return 0;
+	return ( rounded > limit ) ? limit : rounded;
+}
+
+// Captures the first point of the colour set as an 8-bit triple and resets the
+// best error to INT_MAX so that any smaller error is accepted later.
+SingleColourFit::SingleColourFit( ColourSet const* colours, int flags )
+  : ColourFit( colours, flags )
+{
+	// only the first point is read
+	Vec3 const& colour = m_colours->GetPoints()[0];
+	float const channels[3] = { colour.X(), colour.Y(), colour.Z() };
+
+	// scale each channel by 255, round to nearest and clamp to [0, 255]
+	for( int c = 0; c < 3; ++c )
+		m_colour[c] = ( u8 )FloatToInt( 255.0f*channels[c], 255 );
+
+	m_besterror = INT_MAX;
+}
+
+// Finds the best endpoints for the 3-entry codebook from the lookup tables and
+// writes the block if its error beats the best error recorded so far.
+void SingleColourFit::Compress3( void* block )
+{
+	// one lookup table per colour channel (channels 0 and 2 share a table)
+	SingleColourLookup const* const tables[] = 
+	{
+		lookup_5_3, 
+		lookup_6_3, 
+		lookup_5_3
+	};
+
+	// fills in m_start, m_end, m_index and m_error
+	ComputeEndPoints( 3, tables );
+
+	// leave the output alone unless this result is strictly better
+	if( m_error >= m_besterror )
+		return;
+
+	// translate the single index through the colour set and emit the block
+	u8 remapped[16];
+	m_colours->RemapIndices( &m_index, remapped );
+	WriteColourBlock3( m_start, m_end, remapped, block );
+
+	m_besterror = m_error;
+}
+
+// Finds the best endpoints for the 4-entry codebook from the lookup tables and
+// writes the block if its error beats the best error recorded so far.
+void SingleColourFit::Compress4( void* block )
+{
+	// one lookup table per colour channel (channels 0 and 2 share a table)
+	SingleColourLookup const* const tables[] = 
+	{
+		lookup_5_4, 
+		lookup_6_4, 
+		lookup_5_4
+	};
+
+	// fills in m_start, m_end, m_index and m_error
+	ComputeEndPoints( 4, tables );
+
+	// leave the output alone unless this result is strictly better
+	if( m_error >= m_besterror )
+		return;
+
+	// translate the single index through the colour set and emit the block
+	u8 remapped[16];
+	m_colours->RemapIndices( &m_index, remapped );
+	WriteColourBlock4( m_start, m_end, remapped, block );
+
+	m_besterror = m_error;
+}
+
+// Tries every codebook index below count and keeps the one whose per-channel
+// table entries give the lowest sum of squared errors. On return m_start,
+// m_end, m_index and m_error describe the winner.
+void SingleColourFit::ComputeEndPoints( int count, SingleColourLookup const* const* lookups )
+{
+	m_error = INT_MAX;
+	for( int index = 0; index < count; ++index )
+	{
+		// table entry for each channel's 8-bit value at this codebook index
+		SourceBlock const& c0 = lookups[0][m_colour[0]].sources[index];
+		SourceBlock const& c1 = lookups[1][m_colour[1]].sources[index];
+		SourceBlock const& c2 = lookups[2][m_colour[2]].sources[index];
+
+		// sum of squared per-channel errors
+		int const e0 = c0.error;
+		int const e1 = c1.error;
+		int const e2 = c2.error;
+		int const error = e0*e0 + e1*e1 + e2*e2;
+
+		// only a strictly lower error replaces the current best
+		if( error >= m_error )
+			continue;
+
+		// endpoints are stored quantised: 31 steps for channels 0 and 2, 63 for channel 1
+		m_start = Vec3(
+			( float )c0.start/31.0f, 
+			( float )c1.start/63.0f, 
+			( float )c2.start/31.0f
+		);
+		m_end = Vec3(
+			( float )c0.end/31.0f, 
+			( float )c1.end/63.0f, 
+			( float )c2.end/31.0f
+		);
+		m_index = ( u8 )index;
+		m_error = error;
+	}
+}
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/singlecolourfit.h
+++ b/src/nvimage/nvtt/squish/singlecolourfit.h
@ -0,0 +1,58 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_SINGLECOLOURFIT_H
+#define SQUISH_SINGLECOLOURFIT_H
+
+#include <squish.h>
+#include "colourfit.h"
+
+namespace squish {
+
+class ColourSet;
+struct SingleColourLookup;
+
+//! Colour fit for a set treated as a single colour: endpoints and index are
+//! taken from precomputed per-channel lookup tables.
+class SingleColourFit : public ColourFit
+{
+public:
+	//! Captures the first point of the set as an 8-bit colour.
+	SingleColourFit( ColourSet const* colours, int flags );
+	
+private:
+	//! Scores the 3-entry codebook and writes the block if it improves on m_besterror.
+	virtual void Compress3( void* block );
+	//! Scores the 4-entry codebook and writes the block if it improves on m_besterror.
+	virtual void Compress4( void* block );
+	
+	//! Picks the best codebook index below count and fills in m_start, m_end, m_index and m_error.
+	void ComputeEndPoints( int count, SingleColourLookup const* const* lookups );
+	
+	u8 m_colour[3];		//!< the colour being fitted, one 8-bit value per channel
+	Vec3 m_start;		//!< start endpoint chosen by ComputeEndPoints
+	Vec3 m_end;		//!< end endpoint chosen by ComputeEndPoints
+	u8 m_index;		//!< codebook index chosen by ComputeEndPoints
+	int m_error;		//!< error of the most recent ComputeEndPoints result
+	int m_besterror;	//!< lowest error of any block written so far
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_SINGLECOLOURFIT_H
--- a/src/nvimage/nvtt/squish/singlecolourlookup.inl
+++ b/src/nvimage/nvtt/squish/singlecolourlookup.inl
--- a/src/nvimage/nvtt/squish/squish.cpp
+++ b/src/nvimage/nvtt/squish/squish.cpp
@ -0,0 +1,225 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include <squish.h>
+#include "colourset.h"
+#include "maths.h"
+#include "rangefit.h"
+#include "clusterfit.h"
+#include "colourblock.h"
+#include "alpha.h"
+#include "singlecolourfit.h"
+
+namespace squish {
+
+// Normalises a user-supplied flag word: exactly one method, one fit and one
+// metric flag come back set (falling back to kDxt1, kColourClusterFit and
+// kColourMetricPerceptual), plus kWeightColourByAlpha if it was requested.
+// All other bits are dropped.
+static int FixFlags( int flags )
+{
+	// isolate the bits belonging to each category
+	int const methodBits = flags & ( kDxt1 | kDxt3 | kDxt5 );
+	int const fitBits = flags & ( kColourClusterFit | kColourRangeFit );
+	int const metricBits = flags & ( kColourMetricPerceptual | kColourMetricUniform );
+
+	// keep a non-default choice only when it is the sole bit set in its category
+	int const method = ( methodBits == kDxt3 || methodBits == kDxt5 ) ? methodBits : kDxt1;
+	int const fit = ( fitBits == kColourRangeFit ) ? kColourRangeFit : kColourClusterFit;
+	int const metric = ( metricBits == kColourMetricUniform ) ? kColourMetricUniform : kColourMetricPerceptual;
+
+	// recombine, carrying over the optional alpha weighting flag
+	return method | fit | metric | ( flags & kWeightColourByAlpha );
+}
+
+
+// Compresses one 4x4 block of rgba pixels into 'block'. For DXT3/DXT5 the
+// alpha data is written at the start of the block and the colour data 8 bytes
+// in; for DXT1 only colour data is written.
+void Compress( u8 const* rgba, void* block, int flags )
+{
+	// sanitise the flags before inspecting them
+	flags = FixFlags( flags );
+
+	// work out where the colour and alpha parts of the block live
+	bool const separateAlpha = ( flags & ( kDxt3 | kDxt5 ) ) != 0;
+	u8* const bytes = reinterpret_cast< u8* >( block );
+	void* const alphaBlock = bytes;
+	void* const colourBlock = separateAlpha ? bytes + 8 : bytes;
+
+	// reduce the pixels to the minimal point set
+	ColourSet colours( rgba, flags );
+
+	// pick the colour compressor
+	bool const singleColour = ( colours.GetCount() == 1 );
+	bool const wantRangeFit = ( flags & kColourRangeFit ) != 0;
+	if( singleColour )
+	{
+		// one colour only: the single colour fit is always used
+		SingleColourFit fit( &colours, flags );
+		fit.Compress( colourBlock );
+	}
+	else if( !wantRangeFit )
+	{
+		// the default is a cluster fit
+		ClusterFit fit( &colours, flags );
+		fit.Compress( colourBlock );
+	}
+	else
+	{
+		// range fit was requested
+		RangeFit fit( &colours, flags );
+		fit.Compress( colourBlock );
+	}
+
+	// the alpha channel is compressed on its own for DXT3 and DXT5
+	if( ( flags & kDxt3 ) != 0 )
+	{
+		CompressAlphaDxt3( rgba, alphaBlock );
+	}
+	else if( ( flags & kDxt5 ) != 0 )
+	{
+		CompressAlphaDxt5( rgba, alphaBlock );
+	}
+}
+
+
+// Decompresses one DXT block into 16 rgba pixels. For DXT3/DXT5 the alpha data
+// is read from the start of the block and the colour data from 8 bytes in.
+void Decompress( u8* rgba, void const* block, int flags )
+{
+	// sanitise the flags before inspecting them
+	flags = FixFlags( flags );
+
+	// work out where the colour and alpha parts of the block live
+	bool const separateAlpha = ( flags & ( kDxt3 | kDxt5 ) ) != 0;
+	u8 const* const bytes = reinterpret_cast< u8 const* >( block );
+	void const* const alphaBlock = bytes;
+	void const* const colourBlock = separateAlpha ? bytes + 8 : bytes;
+
+	// colour first
+	bool const isDxt1 = ( flags & kDxt1 ) != 0;
+	DecompressColour( rgba, colourBlock, isDxt1 );
+
+	// then the separately stored alpha, if the format has one
+	if( ( flags & kDxt3 ) != 0 )
+	{
+		DecompressAlphaDxt3( rgba, alphaBlock );
+	}
+	else if( ( flags & kDxt5 ) != 0 )
+	{
+		DecompressAlphaDxt5( rgba, alphaBlock );
+	}
+}
+
+// Returns the number of bytes needed to hold a width x height image once
+// compressed: one block per 4x4 tile (partial tiles rounded up), 8 bytes per
+// block for DXT1 and 16 bytes otherwise.
+int GetStorageRequirements( int width, int height, int flags )
+{
+	// sanitise the flags before inspecting them
+	flags = FixFlags( flags );
+
+	// count the blocks, rounding partial blocks up
+	int const blocksAcross = ( width + 3 )/4;
+	int const blocksDown = ( height + 3 )/4;
+	int const blockTotal = blocksAcross*blocksDown;
+
+	// size of one block for the selected method
+	int const bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+
+	return blockTotal*bytesPerBlock;
+}
+
+// Compresses a width x height rgba image into consecutive DXT blocks.
+//
+// rgba    source pixels, width*height entries of 4 bytes each, row-major
+// width   image width in pixels
+// height  image height in pixels
+// blocks  output storage; one block is written per 4x4 tile, row by row
+// flags   compression flags (sanitised through FixFlags)
+//
+// Tiles that stick out past the right or bottom edge of the image are filled
+// by repeating the pixels that do lie inside the image, so no memory outside
+// the source image is ever read.
+void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags )
+{
+	// fix any bad flags
+	flags = FixFlags( flags );
+
+	// initialise the block output
+	u8* targetBlock = reinterpret_cast< u8* >( blocks );
+	int bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+
+	// loop over blocks
+	for( int y = 0; y < height; y += 4 )
+	{
+		// rows of this block row that are inside the image (1..4)
+		int const remainingRows = height - y;
+		int const bh = ( remainingRows < 4 ) ? remainingRows : 4;
+
+		for( int x = 0; x < width; x += 4 )
+		{
+			// columns of this block that are inside the image (1..4)
+			int const remainingCols = width - x;
+			int const bw = ( remainingCols < 4 ) ? remainingCols : 4;
+
+			// build the 4x4 block of pixels
+			u8 sourceRgba[16*4];
+			u8* targetPixel = sourceRgba;
+			for( int py = 0; py < 4; ++py )
+			{
+				for( int px = 0; px < 4; ++px )
+				{
+					// get the source pixel in the image, wrapping within
+					// the valid part of the block for partial blocks
+					int sx = x + (px % bw);
+					int sy = y + (py % bh);
+
+					// copy the rgba value
+					u8 const* sourcePixel = rgba + 4*( width*sy + sx );
+					for( int i = 0; i < 4; ++i )
+						*targetPixel++ = *sourcePixel++;
+				}
+			}
+
+			// compress it into the output
+			Compress( sourceRgba, targetBlock, flags );
+
+			// advance
+			targetBlock += bytesPerBlock;
+		}
+	}
+}
+
+// Decompresses consecutive DXT blocks into a width x height rgba image.
+// One block is consumed per 4x4 tile, row by row; decoded pixels that fall
+// outside the image bounds are discarded.
+void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags )
+{
+	// sanitise the flags before inspecting them
+	flags = FixFlags( flags );
+
+	// set up the block input
+	int const blockStride = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+	u8 const* blockCursor = reinterpret_cast< u8 const* >( blocks );
+
+	// walk the image one tile at a time
+	for( int tileY = 0; tileY < height; tileY += 4 )
+	{
+		for( int tileX = 0; tileX < width; tileX += 4 )
+		{
+			// decode the whole tile into a scratch buffer
+			u8 decoded[4*16];
+			Decompress( decoded, blockCursor, flags );
+
+			// scatter the 16 decoded pixels to their image locations
+			for( int texel = 0; texel < 16; ++texel )
+			{
+				int const dstX = tileX + ( texel & 3 );
+				int const dstY = tileY + ( texel >> 2 );
+
+				// pixels past the right/bottom edge are dropped
+				if( dstX >= width || dstY >= height )
+					continue;
+
+				u8 const* from = decoded + 4*texel;
+				u8* to = rgba + 4*( width*dstY + dstX );
+				for( int channel = 0; channel < 4; ++channel )
+					to[channel] = from[channel];
+			}
+
+			// move on to the next block
+			blockCursor += blockStride;
+		}
+	}
+}
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/squish.h
+++ b/src/nvimage/nvtt/squish/squish.h
@ -0,0 +1,244 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_H
+#define SQUISH_H
+
+//! All squish API functions live in this namespace.
+namespace squish {
+
+// -----------------------------------------------------------------------------
+
+//! Typedef a quantity that is a single unsigned byte.
+typedef unsigned char u8;
+
+// -----------------------------------------------------------------------------
+
+//! Flag values accepted by the 'flags' parameter of the squish API functions.
+//! Every value is a distinct bit, so flags from different categories are
+//! combined with bitwise OR. Within a category (method: kDxt1/kDxt3/kDxt5,
+//! fit: kColourClusterFit/kColourRangeFit, metric: kColourMetricPerceptual/
+//! kColourMetricUniform) the flags are alternatives.
+enum
+{
+	//! Use DXT1 compression.
+	kDxt1 = ( 1 << 0 ), 
+	
+	//! Use DXT3 compression.
+	kDxt3 = ( 1 << 1 ), 
+	
+	//! Use DXT5 compression.
+	kDxt5 = ( 1 << 2 ), 
+	
+	//! Use a slow but high quality colour compressor (the default).
+	kColourClusterFit = ( 1 << 3 ),	
+	
+	//! Use a fast but low quality colour compressor.
+	kColourRangeFit	= ( 1 << 4 ),
+	
+	//! Use a perceptual metric for colour error (the default).
+	kColourMetricPerceptual = ( 1 << 5 ),
+
+	//! Use a uniform metric for colour error.
+	kColourMetricUniform = ( 1 << 6 ),
+	
+	//! Weight the colour by alpha during cluster fit (disabled by default).
+	kWeightColourByAlpha = ( 1 << 7 )
+};
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Compresses a 4x4 block of pixels.
+
+	@param rgba		The rgba values of the 16 source pixels.
+	@param block	Storage for the compressed DXT block.
+	@param flags	Compression flags.
+	
+	The source pixels should be presented as a contiguous array of 16 rgba
+	values, with each component as 1 byte each. In memory this should be:
+	
+		{ r1, g1, b1, a1, .... , r16, g16, b16, a16 }
+	
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. When using DXT1 
+	compression, 8 bytes of storage are required for the compressed DXT block. 
+	DXT3 and DXT5 compression require 16 bytes of storage per block.
+	
+	The flags parameter can also specify a preferred colour compressor and 
+	colour error metric to use when fitting the RGB components of the data. 
+	Possible colour compressors are: kColourClusterFit (the default) or 
+	kColourRangeFit. Possible colour error metrics are: kColourMetricPerceptual
+	(the default) or kColourMetricUniform. If no flags are specified in any 
+	particular category then the default will be used. Unknown flags are 
+	ignored.
+	
+	When using kColourClusterFit, an additional flag can be specified to
+	weight the colour of each pixel by its alpha value. For images that are
+	rendered using alpha blending, this can significantly increase the 
+	perceived quality.
+*/
+void Compress( u8 const* rgba, void* block, int flags );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Compresses a 4x4 block of pixels.
+
+	@param rgba		The rgba values of the 16 source pixels.
+	@param mask		The valid pixel mask.
+	@param block	Storage for the compressed DXT block.
+	@param flags	Compression flags.
+	
+	The source pixels should be presented as a contiguous array of 16 rgba
+	values, with each component as 1 byte each. In memory this should be:
+	
+		{ r1, g1, b1, a1, .... , r16, g16, b16, a16 }
+		
+	The mask parameter enables only certain pixels within the block. The lowest
+	bit enables the first pixel and so on up to the 16th bit. Bits beyond the
+	16th bit are ignored. Pixels that are not enabled are allowed to take
+	arbitrary colours in the output block. An example of how this can be used
+	is in the CompressImage function to disable pixels outside the bounds of
+	the image when the width or height is not divisible by 4.
+	
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. When using DXT1 
+	compression, 8 bytes of storage are required for the compressed DXT block. 
+	DXT3 and DXT5 compression require 16 bytes of storage per block.
+	
+	The flags parameter can also specify a preferred colour compressor and 
+	colour error metric to use when fitting the RGB components of the data. 
+	Possible colour compressors are: kColourClusterFit (the default) or 
+	kColourRangeFit. Possible colour error metrics are: kColourMetricPerceptual
+	(the default) or kColourMetricUniform. If no flags are specified in any 
+	particular category then the default will be used. Unknown flags are 
+	ignored.
+	
+	When using kColourClusterFit, an additional flag can be specified to
+	weight the colour of each pixel by its alpha value. For images that are
+	rendered using alpha blending, this can significantly increase the 
+	perceived quality.
+*/
+void CompressMasked( u8 const* rgba, int mask, void* block, int flags );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Decompresses a 4x4 block of pixels.
+
+	@param rgba		Storage for the 16 decompressed pixels.
+	@param block	The compressed DXT block.
+	@param flags	Compression flags.
+
+	The decompressed pixels will be written as a contiguous array of 16 rgba
+	values, with each component as 1 byte each. In memory this is:
+	
+		{ r1, g1, b1, a1, .... , r16, g16, b16, a16 }
+	
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. All other flags 
+	are ignored.
+*/
+void Decompress( u8* rgba, void const* block, int flags );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Computes the amount of compressed storage required.
+
+	@param width	The width of the image.
+	@param height	The height of the image.
+	@param flags	Compression flags.
+	
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. All other flags 
+	are ignored.
+	
+	Most DXT images will be a multiple of 4 in each dimension, but this 
+	function supports arbitrary size images by allowing the outer blocks to
+	be only partially used.
+*/
+int GetStorageRequirements( int width, int height, int flags );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Compresses an image in memory.
+
+	@param rgba		The pixels of the source.
+	@param width	The width of the source image.
+	@param height	The height of the source image.
+	@param blocks	Storage for the compressed output.
+	@param flags	Compression flags.
+	
+	The source pixels should be presented as a contiguous array of width*height
+	rgba values, with each component as 1 byte each. In memory this should be:
+	
+		{ r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height
+		
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. When using DXT1 
+	compression, 8 bytes of storage are required for each compressed DXT block. 
+	DXT3 and DXT5 compression require 16 bytes of storage per block.
+	
+	The flags parameter can also specify a preferred colour compressor and 
+	colour error metric to use when fitting the RGB components of the data. 
+	Possible colour compressors are: kColourClusterFit (the default) or 
+	kColourRangeFit. Possible colour error metrics are: kColourMetricPerceptual
+	(the default) or kColourMetricUniform. If no flags are specified in any 
+	particular category then the default will be used. Unknown flags are 
+	ignored.
+	
+	When using kColourClusterFit, an additional flag can be specified to
+	weight the colour of each pixel by its alpha value. For images that are
+	rendered using alpha blending, this can significantly increase the 
+	perceived quality.
+	
+	Internally this function calls squish::Compress for each block. To see how
+	much memory is required in the compressed image, use
+	squish::GetStorageRequirements.
+*/
+void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Decompresses an image in memory.
+
+	@param rgba		Storage for the decompressed pixels.
+	@param width	The width of the source image.
+	@param height	The height of the source image.
+	@param blocks	The compressed DXT blocks.
+	@param flags	Compression flags.
+	
+	The decompressed pixels will be written as a contiguous array of width*height
+	rgba values, with each component as 1 byte each. In memory this is:
+	
+		{ r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height
+		
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. All other flags 
+	are ignored.
+
+	Internally this function calls squish::Decompress for each block.
+*/
+void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags );
+
+// -----------------------------------------------------------------------------
+
+} // namespace squish
+
+#endif // ndef SQUISH_H
+
--- a/src/nvimage/nvtt/squish/squish.xcodeproj/project.pbxproj
+++ b/src/nvimage/nvtt/squish/squish.xcodeproj/project.pbxproj
@ -0,0 +1,531 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 42;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		133FA0DC096A7B8E0050752E /* alpha.h in Headers */ = {isa = PBXBuildFile; fileRef = 133FA0DA096A7B8E0050752E /* alpha.h */; };
+		133FA0DD096A7B8E0050752E /* alpha.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 133FA0DB096A7B8E0050752E /* alpha.cpp */; };
+		1342B4160999DF1900152915 /* libsquish.a in Frameworks */ = {isa = PBXBuildFile; fileRef = D2AAC046055464E500DB518D /* libsquish.a */; };
+		1342B41A0999DF7000152915 /* squishpng.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1342B4190999DF7000152915 /* squishpng.cpp */; };
+		1342B43F0999E0CC00152915 /* squishtest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1342B43E0999E0CC00152915 /* squishtest.cpp */; };
+		1342B4420999E0EC00152915 /* libsquish.a in Frameworks */ = {isa = PBXBuildFile; fileRef = D2AAC046055464E500DB518D /* libsquish.a */; };
+		1350D71A092AA858005EE038 /* clusterfit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1350D70B092AA857005EE038 /* clusterfit.cpp */; };
+		1350D71B092AA858005EE038 /* clusterfit.h in Headers */ = {isa = PBXBuildFile; fileRef = 1350D70C092AA858005EE038 /* clusterfit.h */; };
+		1350D71E092AA858005EE038 /* colourblock.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1350D70F092AA858005EE038 /* colourblock.cpp */; };
+		1350D71F092AA858005EE038 /* colourblock.h in Headers */ = {isa = PBXBuildFile; fileRef = 1350D710092AA858005EE038 /* colourblock.h */; };
+		1350D720092AA858005EE038 /* config.h in Headers */ = {isa = PBXBuildFile; fileRef = 1350D711092AA858005EE038 /* config.h */; };
+		1350D721092AA858005EE038 /* maths.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1350D712092AA858005EE038 /* maths.cpp */; };
+		1350D722092AA858005EE038 /* maths.h in Headers */ = {isa = PBXBuildFile; fileRef = 1350D713092AA858005EE038 /* maths.h */; };
+		1350D725092AA858005EE038 /* rangefit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1350D716092AA858005EE038 /* rangefit.cpp */; };
+		1350D726092AA858005EE038 /* rangefit.h in Headers */ = {isa = PBXBuildFile; fileRef = 1350D717092AA858005EE038 /* rangefit.h */; };
+		1350D727092AA858005EE038 /* squish.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1350D718092AA858005EE038 /* squish.cpp */; };
+		1350D728092AA858005EE038 /* squish.h in Headers */ = {isa = PBXBuildFile; fileRef = 1350D719092AA858005EE038 /* squish.h */; settings = {ATTRIBUTES = (Public, ); }; };
+		139C21CF09ADAB0800A2500D /* squishgen.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 139C21CE09ADAB0800A2500D /* squishgen.cpp */; };
+		139C234F09B0602700A2500D /* singlecolourfit.h in Headers */ = {isa = PBXBuildFile; fileRef = 139C234D09B0602700A2500D /* singlecolourfit.h */; };
+		139C235009B0602700A2500D /* singlecolourfit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 139C234E09B0602700A2500D /* singlecolourfit.cpp */; };
+		13A7CCA40952BE63001C963A /* colourfit.h in Headers */ = {isa = PBXBuildFile; fileRef = 13A7CCA20952BE63001C963A /* colourfit.h */; };
+		13A7CCA50952BE63001C963A /* colourfit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 13A7CCA30952BE63001C963A /* colourfit.cpp */; };
+		13C4C7AD0941C18000AC5B89 /* colourset.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 13C4C7AB0941C18000AC5B89 /* colourset.cpp */; };
+		13C4C7AE0941C18000AC5B89 /* colourset.h in Headers */ = {isa = PBXBuildFile; fileRef = 13C4C7AC0941C18000AC5B89 /* colourset.h */; };
+		13CD64C2092BCF8A00488C97 /* simd.h in Headers */ = {isa = PBXBuildFile; fileRef = 13CD64C0092BCF8A00488C97 /* simd.h */; };
+		13D0DC910931F93A00909807 /* simd_ve.h in Headers */ = {isa = PBXBuildFile; fileRef = 13D0DC900931F93A00909807 /* simd_ve.h */; };
+		13D0DC970931F9D600909807 /* simd_sse.h in Headers */ = {isa = PBXBuildFile; fileRef = 13D0DC960931F9D600909807 /* simd_sse.h */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXContainerItemProxy section */
+		1342B52B099BF72F00152915 /* PBXContainerItemProxy */ = {
+			isa = PBXContainerItemProxy;
+			containerPortal = 08FB7793FE84155DC02AAC07 /* Project object */;
+			proxyType = 1;
+			remoteGlobalIDString = D2AAC045055464E500DB518D;
+			remoteInfo = squish;
+		};
+		1342B58E099BF93D00152915 /* PBXContainerItemProxy */ = {
+			isa = PBXContainerItemProxy;
+			containerPortal = 08FB7793FE84155DC02AAC07 /* Project object */;
+			proxyType = 1;
+			remoteGlobalIDString = D2AAC045055464E500DB518D;
+			remoteInfo = squish;
+		};
+/* End PBXContainerItemProxy section */
+
+/* Begin PBXFileReference section */
+		133FA0DA096A7B8E0050752E /* alpha.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alpha.h; sourceTree = "<group>"; };
+		133FA0DB096A7B8E0050752E /* alpha.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alpha.cpp; sourceTree = "<group>"; };
+		1342B4110999DE7F00152915 /* squishpng */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = squishpng; sourceTree = BUILT_PRODUCTS_DIR; };
+		1342B4190999DF7000152915 /* squishpng.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; name = squishpng.cpp; path = extra/squishpng.cpp; sourceTree = "<group>"; };
+		1342B4370999E07C00152915 /* squishtest */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = squishtest; sourceTree = BUILT_PRODUCTS_DIR; };
+		1342B43E0999E0CC00152915 /* squishtest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = squishtest.cpp; path = extra/squishtest.cpp; sourceTree = "<group>"; };
+		1350D70B092AA857005EE038 /* clusterfit.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = clusterfit.cpp; sourceTree = "<group>"; };
+		1350D70C092AA858005EE038 /* clusterfit.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = clusterfit.h; sourceTree = "<group>"; };
+		1350D70F092AA858005EE038 /* colourblock.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = colourblock.cpp; sourceTree = "<group>"; };
+		1350D710092AA858005EE038 /* colourblock.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = colourblock.h; sourceTree = "<group>"; };
+		1350D711092AA858005EE038 /* config.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = config.h; sourceTree = "<group>"; };
+		1350D712092AA858005EE038 /* maths.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = maths.cpp; sourceTree = "<group>"; };
+		1350D713092AA858005EE038 /* maths.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = maths.h; sourceTree = "<group>"; };
+		1350D716092AA858005EE038 /* rangefit.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = rangefit.cpp; sourceTree = "<group>"; };
+		1350D717092AA858005EE038 /* rangefit.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = rangefit.h; sourceTree = "<group>"; };
+		1350D718092AA858005EE038 /* squish.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = squish.cpp; sourceTree = "<group>"; };
+		1350D719092AA858005EE038 /* squish.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = squish.h; sourceTree = "<group>"; };
+		13906CE3096938880000A6A7 /* texture_compression_s3tc.txt */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = text; path = texture_compression_s3tc.txt; sourceTree = "<group>"; };
+		139C21C409ADAA7000A2500D /* squishgen */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = squishgen; sourceTree = BUILT_PRODUCTS_DIR; };
+		139C21CE09ADAB0800A2500D /* squishgen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = squishgen.cpp; path = extra/squishgen.cpp; sourceTree = "<group>"; };
+		139C234D09B0602700A2500D /* singlecolourfit.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = singlecolourfit.h; sourceTree = "<group>"; };
+		139C234E09B0602700A2500D /* singlecolourfit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = singlecolourfit.cpp; sourceTree = "<group>"; };
+		139C236D09B060A900A2500D /* singlecolourlookup.inl */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = text; path = singlecolourlookup.inl; sourceTree = "<group>"; };
+		13A7CCA20952BE63001C963A /* colourfit.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colourfit.h; sourceTree = "<group>"; };
+		13A7CCA30952BE63001C963A /* colourfit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colourfit.cpp; sourceTree = "<group>"; };
+		13C4C7AB0941C18000AC5B89 /* colourset.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = colourset.cpp; sourceTree = "<group>"; };
+		13C4C7AC0941C18000AC5B89 /* colourset.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = colourset.h; sourceTree = "<group>"; };
+		13CD64C0092BCF8A00488C97 /* simd.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = simd.h; sourceTree = "<group>"; };
+		13D0DC900931F93A00909807 /* simd_ve.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = simd_ve.h; sourceTree = "<group>"; };
+		13D0DC960931F9D600909807 /* simd_sse.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = simd_sse.h; sourceTree = "<group>"; };
+		D2AAC046055464E500DB518D /* libsquish.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libsquish.a; sourceTree = BUILT_PRODUCTS_DIR; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		1342B40F0999DE7F00152915 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				1342B4160999DF1900152915 /* libsquish.a in Frameworks */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+		1342B4350999E07C00152915 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				1342B4420999E0EC00152915 /* libsquish.a in Frameworks */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+		139C21C209ADAA7000A2500D /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+		D289987405E68DCB004EDB86 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		08FB7794FE84155DC02AAC07 /* squish */ = {
+			isa = PBXGroup;
+			children = (
+				08FB7795FE84155DC02AAC07 /* Source */,
+				C6A0FF2B0290797F04C91782 /* Documentation */,
+				1AB674ADFE9D54B511CA2CBB /* Products */,
+			);
+			name = squish;
+			sourceTree = "<group>";
+		};
+		08FB7795FE84155DC02AAC07 /* Source */ = {
+			isa = PBXGroup;
+			children = (
+				133FA0DB096A7B8E0050752E /* alpha.cpp */,
+				133FA0DA096A7B8E0050752E /* alpha.h */,
+				1350D70B092AA857005EE038 /* clusterfit.cpp */,
+				1350D70C092AA858005EE038 /* clusterfit.h */,
+				13A7CCA30952BE63001C963A /* colourfit.cpp */,
+				13A7CCA20952BE63001C963A /* colourfit.h */,
+				13C4C7AB0941C18000AC5B89 /* colourset.cpp */,
+				13C4C7AC0941C18000AC5B89 /* colourset.h */,
+				1350D70F092AA858005EE038 /* colourblock.cpp */,
+				1350D710092AA858005EE038 /* colourblock.h */,
+				13906CE3096938880000A6A7 /* texture_compression_s3tc.txt */,
+				1350D711092AA858005EE038 /* config.h */,
+				1350D712092AA858005EE038 /* maths.cpp */,
+				1350D713092AA858005EE038 /* maths.h */,
+				1350D716092AA858005EE038 /* rangefit.cpp */,
+				1350D717092AA858005EE038 /* rangefit.h */,
+				13CD64C0092BCF8A00488C97 /* simd.h */,
+				13D0DC960931F9D600909807 /* simd_sse.h */,
+				13D0DC900931F93A00909807 /* simd_ve.h */,
+				139C234E09B0602700A2500D /* singlecolourfit.cpp */,
+				139C234D09B0602700A2500D /* singlecolourfit.h */,
+				139C236D09B060A900A2500D /* singlecolourlookup.inl */,
+				1350D718092AA858005EE038 /* squish.cpp */,
+				1350D719092AA858005EE038 /* squish.h */,
+				139C21CE09ADAB0800A2500D /* squishgen.cpp */,
+				1342B4190999DF7000152915 /* squishpng.cpp */,
+				1342B43E0999E0CC00152915 /* squishtest.cpp */,
+			);
+			name = Source;
+			sourceTree = "<group>";
+		};
+		1AB674ADFE9D54B511CA2CBB /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				D2AAC046055464E500DB518D /* libsquish.a */,
+				1342B4110999DE7F00152915 /* squishpng */,
+				1342B4370999E07C00152915 /* squishtest */,
+				139C21C409ADAA7000A2500D /* squishgen */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		C6A0FF2B0290797F04C91782 /* Documentation */ = {
+			isa = PBXGroup;
+			children = (
+			);
+			name = Documentation;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXHeadersBuildPhase section */
+		D2AAC043055464E500DB518D /* Headers */ = {
+			isa = PBXHeadersBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				1350D71B092AA858005EE038 /* clusterfit.h in Headers */,
+				1350D71F092AA858005EE038 /* colourblock.h in Headers */,
+				1350D720092AA858005EE038 /* config.h in Headers */,
+				1350D722092AA858005EE038 /* maths.h in Headers */,
+				1350D726092AA858005EE038 /* rangefit.h in Headers */,
+				1350D728092AA858005EE038 /* squish.h in Headers */,
+				13CD64C2092BCF8A00488C97 /* simd.h in Headers */,
+				13D0DC910931F93A00909807 /* simd_ve.h in Headers */,
+				13D0DC970931F9D600909807 /* simd_sse.h in Headers */,
+				13C4C7AE0941C18000AC5B89 /* colourset.h in Headers */,
+				13A7CCA40952BE63001C963A /* colourfit.h in Headers */,
+				133FA0DC096A7B8E0050752E /* alpha.h in Headers */,
+				139C234F09B0602700A2500D /* singlecolourfit.h in Headers */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXHeadersBuildPhase section */
+
+/* Begin PBXNativeTarget section */
+		1342B4100999DE7F00152915 /* squishpng */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 1342B4130999DE9F00152915 /* Build configuration list for PBXNativeTarget "squishpng" */;
+			buildPhases = (
+				1342B40E0999DE7F00152915 /* Sources */,
+				1342B40F0999DE7F00152915 /* Frameworks */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+				1342B58F099BF93D00152915 /* PBXTargetDependency */,
+			);
+			name = squishpng;
+			productName = squishpng;
+			productReference = 1342B4110999DE7F00152915 /* squishpng */;
+			productType = "com.apple.product-type.tool";
+		};
+		1342B4360999E07C00152915 /* squishtest */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 1342B43B0999E0C000152915 /* Build configuration list for PBXNativeTarget "squishtest" */;
+			buildPhases = (
+				1342B4340999E07C00152915 /* Sources */,
+				1342B4350999E07C00152915 /* Frameworks */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+				1342B52C099BF72F00152915 /* PBXTargetDependency */,
+			);
+			name = squishtest;
+			productName = squishtest;
+			productReference = 1342B4370999E07C00152915 /* squishtest */;
+			productType = "com.apple.product-type.tool";
+		};
+		139C21C309ADAA7000A2500D /* squishgen */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 139C21CB09ADAB0300A2500D /* Build configuration list for PBXNativeTarget "squishgen" */;
+			buildPhases = (
+				139C21C109ADAA7000A2500D /* Sources */,
+				139C21C209ADAA7000A2500D /* Frameworks */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = squishgen;
+			productName = squishgen;
+			productReference = 139C21C409ADAA7000A2500D /* squishgen */;
+			productType = "com.apple.product-type.tool";
+		};
+		D2AAC045055464E500DB518D /* squish */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 1DEB91EB08733DB70010E9CD /* Build configuration list for PBXNativeTarget "squish" */;
+			buildPhases = (
+				D2AAC043055464E500DB518D /* Headers */,
+				D2AAC044055464E500DB518D /* Sources */,
+				D289987405E68DCB004EDB86 /* Frameworks */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = squish;
+			productName = squish;
+			productReference = D2AAC046055464E500DB518D /* libsquish.a */;
+			productType = "com.apple.product-type.library.static";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		08FB7793FE84155DC02AAC07 /* Project object */ = {
+			isa = PBXProject;
+			buildConfigurationList = 1DEB91EF08733DB70010E9CD /* Build configuration list for PBXProject "squish" */;
+			hasScannedForEncodings = 1;
+			mainGroup = 08FB7794FE84155DC02AAC07 /* squish */;
+			projectDirPath = "";
+			targets = (
+				D2AAC045055464E500DB518D /* squish */,
+				1342B4100999DE7F00152915 /* squishpng */,
+				1342B4360999E07C00152915 /* squishtest */,
+				139C21C309ADAA7000A2500D /* squishgen */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXSourcesBuildPhase section */
+		1342B40E0999DE7F00152915 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				1342B41A0999DF7000152915 /* squishpng.cpp in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+		1342B4340999E07C00152915 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				1342B43F0999E0CC00152915 /* squishtest.cpp in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+		139C21C109ADAA7000A2500D /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				139C21CF09ADAB0800A2500D /* squishgen.cpp in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+		D2AAC044055464E500DB518D /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				1350D71A092AA858005EE038 /* clusterfit.cpp in Sources */,
+				1350D71E092AA858005EE038 /* colourblock.cpp in Sources */,
+				1350D721092AA858005EE038 /* maths.cpp in Sources */,
+				1350D725092AA858005EE038 /* rangefit.cpp in Sources */,
+				1350D727092AA858005EE038 /* squish.cpp in Sources */,
+				13C4C7AD0941C18000AC5B89 /* colourset.cpp in Sources */,
+				13A7CCA50952BE63001C963A /* colourfit.cpp in Sources */,
+				133FA0DD096A7B8E0050752E /* alpha.cpp in Sources */,
+				139C235009B0602700A2500D /* singlecolourfit.cpp in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin PBXTargetDependency section */
+		1342B52C099BF72F00152915 /* PBXTargetDependency */ = {
+			isa = PBXTargetDependency;
+			target = D2AAC045055464E500DB518D /* squish */;
+			targetProxy = 1342B52B099BF72F00152915 /* PBXContainerItemProxy */;
+		};
+		1342B58F099BF93D00152915 /* PBXTargetDependency */ = {
+			isa = PBXTargetDependency;
+			target = D2AAC045055464E500DB518D /* squish */;
+			targetProxy = 1342B58E099BF93D00152915 /* PBXContainerItemProxy */;
+		};
+/* End PBXTargetDependency section */
+
+/* Begin XCBuildConfiguration section */
+		1342B4140999DE9F00152915 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				HEADER_SEARCH_PATHS = (
+					..,
+					/sw/include,
+				);
+				INSTALL_PATH = "$(HOME)/bin";
+				LIBRARY_SEARCH_PATHS = /sw/lib;
+				OTHER_LDFLAGS = "-lpng";
+				PRODUCT_NAME = squishpng;
+			};
+			name = Debug;
+		};
+		1342B4150999DE9F00152915 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				HEADER_SEARCH_PATHS = (
+					..,
+					/sw/include,
+				);
+				INSTALL_PATH = "$(HOME)/bin";
+				LIBRARY_SEARCH_PATHS = /sw/lib;
+				OTHER_LDFLAGS = "-lpng";
+				PRODUCT_NAME = squishpng;
+			};
+			name = Release;
+		};
+		1342B43C0999E0C000152915 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				HEADER_SEARCH_PATHS = ..;
+				INSTALL_PATH = "$(HOME)/bin";
+				PRODUCT_NAME = squishtest;
+			};
+			name = Debug;
+		};
+		1342B43D0999E0C000152915 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				HEADER_SEARCH_PATHS = ..;
+				INSTALL_PATH = "$(HOME)/bin";
+				PRODUCT_NAME = squishtest;
+			};
+			name = Release;
+		};
+		139C21CC09ADAB0300A2500D /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				HEADER_SEARCH_PATHS = ..;
+				INSTALL_PATH = "$(HOME)/bin";
+				PRODUCT_NAME = squishgen;
+			};
+			name = Debug;
+		};
+		139C21CD09ADAB0300A2500D /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				HEADER_SEARCH_PATHS = ..;
+				INSTALL_PATH = "$(HOME)/bin";
+				PRODUCT_NAME = squishgen;
+			};
+			name = Release;
+		};
+		1DEB91EC08733DB70010E9CD /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				COPY_PHASE_STRIP = NO;
+				GCC_PREPROCESSOR_DEFINITIONS = "SQUISH_USE_ALTIVEC=1";
+				INSTALL_PATH = /usr/local/lib;
+				OTHER_CFLAGS = "-maltivec";
+				PRODUCT_NAME = squish;
+				STRIP_INSTALLED_PRODUCT = NO;
+			};
+			name = Debug;
+		};
+		1DEB91ED08733DB70010E9CD /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				GCC_PREPROCESSOR_DEFINITIONS = "SQUISH_USE_ALTIVEC=1";
+				INSTALL_PATH = /usr/local/lib;
+				OTHER_CFLAGS = "-maltivec";
+				PRODUCT_NAME = squish;
+				STRIP_INSTALLED_PRODUCT = YES;
+			};
+			name = Release;
+		};
+		1DEB91F008733DB70010E9CD /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				GCC_DYNAMIC_NO_PIC = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_TREAT_WARNINGS_AS_ERRORS = YES;
+				GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES;
+				GCC_WARN_PEDANTIC = YES;
+				GCC_WARN_SHADOW = YES;
+				GCC_WARN_SIGN_COMPARE = YES;
+				GCC_WARN_UNUSED_PARAMETER = YES;
+				GCC_WARN_UNUSED_VALUE = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				PREBINDING = NO;
+				SDKROOT = /Developer/SDKs/MacOSX10.4u.sdk;
+			};
+			name = Debug;
+		};
+		1DEB91F108733DB70010E9CD /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				GCC_DYNAMIC_NO_PIC = YES;
+				GCC_OPTIMIZATION_LEVEL = 3;
+				GCC_TREAT_WARNINGS_AS_ERRORS = YES;
+				GCC_UNROLL_LOOPS = YES;
+				GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES;
+				GCC_WARN_PEDANTIC = YES;
+				GCC_WARN_SHADOW = YES;
+				GCC_WARN_SIGN_COMPARE = YES;
+				GCC_WARN_UNUSED_PARAMETER = YES;
+				GCC_WARN_UNUSED_VALUE = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				PREBINDING = NO;
+				SDKROOT = /Developer/SDKs/MacOSX10.4u.sdk;
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		1342B4130999DE9F00152915 /* Build configuration list for PBXNativeTarget "squishpng" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				1342B4140999DE9F00152915 /* Debug */,
+				1342B4150999DE9F00152915 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		1342B43B0999E0C000152915 /* Build configuration list for PBXNativeTarget "squishtest" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				1342B43C0999E0C000152915 /* Debug */,
+				1342B43D0999E0C000152915 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		139C21CB09ADAB0300A2500D /* Build configuration list for PBXNativeTarget "squishgen" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				139C21CC09ADAB0300A2500D /* Debug */,
+				139C21CD09ADAB0300A2500D /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		1DEB91EB08733DB70010E9CD /* Build configuration list for PBXNativeTarget "squish" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				1DEB91EC08733DB70010E9CD /* Debug */,
+				1DEB91ED08733DB70010E9CD /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		1DEB91EF08733DB70010E9CD /* Build configuration list for PBXProject "squish" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				1DEB91F008733DB70010E9CD /* Debug */,
+				1DEB91F108733DB70010E9CD /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 08FB7793FE84155DC02AAC07 /* Project object */;
+}
--- a/src/nvimage/nvtt/squish/texture_compression_s3tc.txt
+++ b/src/nvimage/nvtt/squish/texture_compression_s3tc.txt
@ -0,0 +1,508 @@
+Name
+
+    EXT_texture_compression_s3tc
+
+Name Strings
+
+    GL_EXT_texture_compression_s3tc
+
+Contact
+
+    Pat Brown, NVIDIA Corporation (pbrown 'at' nvidia.com)
+
+Status
+
+    FINAL
+
+Version
+
+    1.1, 16 November 2001 (containing only clarifications relative to
+                           version 1.0, dated 7 July 2000)
+
+Number
+
+    198
+
+Dependencies
+
+    OpenGL 1.1 is required.
+
+    GL_ARB_texture_compression is required.
+
+    This extension is written against the OpenGL 1.2.1 Specification.
+
+Overview
+
+    This extension provides additional texture compression functionality
+    specific to S3's S3TC format (called DXTC in Microsoft's DirectX API),
+    subject to all the requirements and limitations described by the extension
+    GL_ARB_texture_compression.
+
+    This extension supports DXT1, DXT3, and DXT5 texture compression formats.
+    For the DXT1 image format, this specification supports an RGB-only mode
+    and a special RGBA mode with single-bit "transparent" alpha.
+
+IP Status
+
+    Contact S3 Incorporated (http://www.s3.com) regarding any intellectual
+    property issues associated with implementing this extension.
+
+    WARNING:  Vendors able to support S3TC texture compression in Direct3D
+    drivers do not necessarily have the right to use the same functionality in
+    OpenGL.
+
+Issues
+
+    (1) Should DXT2 and DXT4 (premultiplied alpha) formats be supported?
+
+        RESOLVED:  No -- insufficient interest.  Supporting DXT2 and DXT4
+        would require some rework to the TexEnv definition (maybe add a new
+        base internal format RGBA_PREMULTIPLIED_ALPHA) for these formats.
+        Note that the EXT_texture_env_combine extension (which extends normal
+        TexEnv modes) can be used to support textures with premultiplied alpha.
+
+    (2) Should generic "RGB_S3TC_EXT" and "RGBA_S3TC_EXT" enums be supported
+        or should we use only the DXT<n> enums?  
+
+        RESOLVED:  No.  A generic RGBA_S3TC_EXT is problematic because DXT3
+        and DXT5 are both nominally RGBA (and DXT1 with the 1-bit alpha is
+        also) yet one format must be chosen up front.
+
+    (3) Should TexSubImage support all block-aligned edits or just the minimal
+        functionality required by the ARB_texture_compression extension?
+
+        RESOLVED:  Allow all valid block-aligned edits.
+
+    (4) A pre-compressed image with a DXT1 format can be used as either an
+        RGB_S3TC_DXT1 or an RGBA_S3TC_DXT1 image.  If the image has
+        transparent texels, how are they treated in each format?
+
+        RESOLVED:  The renderer has to make sure that an RGB_S3TC_DXT1 format
+        is decoded as RGB (where alpha is effectively one for all texels),
+        while RGBA_S3TC_DXT1 is decoded as RGBA (where alpha is zero for all
+        texels with "transparent" encodings).  Otherwise, the formats are
+        identical.
+
+    (5) Is the encoding of the RGB components for DXT1 formats correct in this
+        spec?  MSDN documentation does not specify an RGB color for the
+        "transparent" encoding.  Is it really black?
+
+        RESOLVED:  Yes.  The specification for the DXT1 format initially
+        required black, but later changed that requirement to a
+        recommendation.  All vendors involved in the definition of this
+        specification support black.  In addition, specifying black has a
+        useful behavior.
+
+        When blending multiple texels (GL_LINEAR filtering), mixing opaque and
+        transparent samples is problematic.  Defining a black color on
+        transparent texels achieves a sensible result that works like a
+        texture with premultiplied alpha.  For example, if three opaque white
+        and one transparent sample is being averaged, the result would be a
+        75% intensity gray (with an alpha of 75%).  This is the same result on
+        the color channels as would be obtained using a white color, 75%
+        alpha, and a SRC_ALPHA blend factor.
+
+    (6) Is the encoding of the RGB components for DXT3 and DXT5 formats
+        correct in this spec?  MSDN documentation suggests that the RGB blocks
+        for DXT3 and DXT5 are decoded as described by the DXT1 format.
+
+        RESOLVED:  Yes -- this appears to be a bug in the MSDN documentation.
+        The specification for the DXT2-DXT5 formats requires decoding using the
+        opaque block encoding, regardless of the relative values of "color0"
+        and "color1".
+
+New Procedures and Functions
+
+    None.
+
+New Tokens
+
+    Accepted by the <internalformat> parameter of TexImage2D, CopyTexImage2D,
+    and CompressedTexImage2DARB and the <format> parameter of
+    CompressedTexSubImage2DARB:
+
+        COMPRESSED_RGB_S3TC_DXT1_EXT                   0x83F0
+        COMPRESSED_RGBA_S3TC_DXT1_EXT                  0x83F1
+        COMPRESSED_RGBA_S3TC_DXT3_EXT                  0x83F2
+        COMPRESSED_RGBA_S3TC_DXT5_EXT                  0x83F3
+
+Additions to Chapter 2 of the OpenGL 1.2.1 Specification (OpenGL Operation)
+
+    None.
+
+Additions to Chapter 3 of the OpenGL 1.2.1 Specification (Rasterization)
+
+    Add to Table 3.16.1:  Specific Compressed Internal Formats
+
+        Compressed Internal Format         Base Internal Format
+        ==========================         ====================
+        COMPRESSED_RGB_S3TC_DXT1_EXT       RGB
+        COMPRESSED_RGBA_S3TC_DXT1_EXT      RGBA
+        COMPRESSED_RGBA_S3TC_DXT3_EXT      RGBA
+        COMPRESSED_RGBA_S3TC_DXT5_EXT      RGBA
+
+    
+    Modify Section 3.8.2, Alternate Image Specification
+
+    (add to end of TexSubImage discussion, p.123 -- after edit from the
+    ARB_texture_compression spec)
+
+    If the internal format of the texture image being modified is
+    COMPRESSED_RGB_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT1_EXT,
+    COMPRESSED_RGBA_S3TC_DXT3_EXT, or COMPRESSED_RGBA_S3TC_DXT5_EXT, the
+    texture is stored using one of the several S3TC compressed texture image
+    formats.  Such images are easily edited along 4x4 texel boundaries, so the
+    limitations on TexSubImage2D or CopyTexSubImage2D parameters are relaxed.
+    TexSubImage2D and CopyTexSubImage2D will result in an INVALID_OPERATION
+    error only if one of the following conditions occurs:
+
+        * <width> is not a multiple of four or equal to TEXTURE_WIDTH, 
+          unless <xoffset> and <yoffset> are both zero.
+        * <height> is not a multiple of four or equal to TEXTURE_HEIGHT,
+          unless <xoffset> and <yoffset> are both zero.
+        * <xoffset> or <yoffset> is not a multiple of four.
+
+    The contents of any 4x4 block of texels of an S3TC compressed texture
+    image that does not intersect the area being modified are preserved during
+    valid TexSubImage2D and CopyTexSubImage2D calls.
+
+
+    Add to Section 3.8.2, Alternate Image Specification (adding to the end of
+    the CompressedTexImage section introduced by the ARB_texture_compression
+    spec)
+
+    If <internalformat> is COMPRESSED_RGB_S3TC_DXT1_EXT,
+    COMPRESSED_RGBA_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT3_EXT, or
+    COMPRESSED_RGBA_S3TC_DXT5_EXT, the compressed texture is stored using one
+    of several S3TC compressed texture image formats.  The S3TC texture
+    compression algorithm supports only 2D images without borders.
+    CompressedTexImage1DARB and CompressedTexImage3DARB produce an
+    INVALID_ENUM error if <internalformat> is an S3TC format.
+    CompressedTexImage2DARB will produce an INVALID_OPERATION error if
+    <border> is non-zero.
+
+
+    Add to Section 3.8.2, Alternate Image Specification (adding to the end of
+    the CompressedTexSubImage section introduced by the
+    ARB_texture_compression spec)
+
+    If the internal format of the texture image being modified is
+    COMPRESSED_RGB_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT1_EXT,
+    COMPRESSED_RGBA_S3TC_DXT3_EXT, or COMPRESSED_RGBA_S3TC_DXT5_EXT, the
+    texture is stored using one of the several S3TC compressed texture image
+    formats.  Since the S3TC texture compression algorithm supports only 2D
+    images, CompressedTexSubImage1DARB and CompressedTexSubImage3DARB produce
+    an INVALID_ENUM error if <format> is an S3TC format.  Since S3TC images
+    are easily edited along 4x4 texel boundaries, the limitations on
+    CompressedTexSubImage2D are relaxed.  CompressedTexSubImage2D will result
+    in an INVALID_OPERATION error only if one of the following conditions
+    occurs:
+
+        * <width> is not a multiple of four or equal to TEXTURE_WIDTH.
+        * <height> is not a multiple of four or equal to TEXTURE_HEIGHT.
+        * <xoffset> or <yoffset> is not a multiple of four.
+
+    The contents of any 4x4 block of texels of an S3TC compressed texture
+    image that does not intersect the area being modified are preserved during
+    valid TexSubImage2D and CopyTexSubImage2D calls.
+
+Additions to Chapter 4 of the OpenGL 1.2.1 Specification (Per-Fragment
+Operations and the Frame Buffer)
+
+    None.
+
+Additions to Chapter 5 of the OpenGL 1.2.1 Specification (Special Functions)
+
+    None.
+
+Additions to Chapter 6 of the OpenGL 1.2.1 Specification (State and
+State Requests)
+
+    None.
+
+Additions to Appendix A of the OpenGL 1.2.1 Specification (Invariance)
+
+    None.
+
+Additions to the AGL/GLX/WGL Specifications
+
+    None.
+
+GLX Protocol
+
+    None.
+
+Errors
+
+    INVALID_ENUM is generated by CompressedTexImage1DARB or
+    CompressedTexImage3DARB if <internalformat> is
+    COMPRESSED_RGB_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT1_EXT,
+    COMPRESSED_RGBA_S3TC_DXT3_EXT, or COMPRESSED_RGBA_S3TC_DXT5_EXT.
+
+    INVALID_OPERATION is generated by CompressedTexImage2DARB if
+    <internalformat> is COMPRESSED_RGB_S3TC_DXT1_EXT,
+    COMPRESSED_RGBA_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT3_EXT, or
+    COMPRESSED_RGBA_S3TC_DXT5_EXT and <border> is not equal to zero.
+
+    INVALID_ENUM is generated by CompressedTexSubImage1DARB or
+    CompressedTexSubImage3DARB if <format> is COMPRESSED_RGB_S3TC_DXT1_EXT,
+    COMPRESSED_RGBA_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT3_EXT, or
+    COMPRESSED_RGBA_S3TC_DXT5_EXT.
+
+    INVALID_OPERATION is generated by TexSubImage2D, CopyTexSubImage2D, or
+    CompressedTexSubImage2D if TEXTURE_INTERNAL_FORMAT is
+    COMPRESSED_RGB_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT1_EXT,
+    COMPRESSED_RGBA_S3TC_DXT3_EXT, or COMPRESSED_RGBA_S3TC_DXT5_EXT and any of
+    the following apply: <width> is not a multiple of four or equal to
+    TEXTURE_WIDTH; <height> is not a multiple of four or equal to
+    TEXTURE_HEIGHT; <xoffset> or <yoffset> is not a multiple of four.
+
+
+    The following restrictions from the ARB_texture_compression specification
+    do not apply to S3TC texture formats, since subimage modification is
+    straightforward as long as the subimage is properly aligned.
+
+    DELETE: INVALID_OPERATION is generated by TexSubImage1D, TexSubImage2D,
+    DELETE: TexSubImage3D, CopyTexSubImage1D, CopyTexSubImage2D, or
+    DELETE: CopyTexSubImage3D if the internal format of the texture image is
+    DELETE: compressed and <xoffset>, <yoffset>, or <zoffset> does not equal
+    DELETE: -b, where b is value of TEXTURE_BORDER.
+
+    DELETE: INVALID_VALUE is generated by CompressedTexSubImage1DARB,
+    DELETE: CompressedTexSubImage2DARB, or CompressedTexSubImage3DARB if the
+    DELETE: entire texture image is not being edited:  if <xoffset>,
+    DELETE: <yoffset>, or <zoffset> is greater than -b, <xoffset> + <width> is
+    DELETE: less than w+b, <yoffset> + <height> is less than h+b, or <zoffset>
+    DELETE: + <depth> is less than d+b, where b is the value of
+    DELETE: TEXTURE_BORDER, w is the value of TEXTURE_WIDTH, h is the value of
+    DELETE: TEXTURE_HEIGHT, and d is the value of TEXTURE_DEPTH.
+
+    See also errors in the GL_ARB_texture_compression specification.
+
+New State
+
+    In the "Textures" state table, increment the TEXTURE_INTERNAL_FORMAT
+    subscript for Z by 4 in the "Type" row.
+
+New Implementation Dependent State
+
+    None
+
+Appendix
+
+    S3TC Compressed Texture Image Formats
+
+    Compressed texture images stored using the S3TC compressed image formats
+    are represented as a collection of 4x4 texel blocks, where each block
+    contains 64 or 128 bits of texel data.  The image is encoded as a normal
+    2D raster image in which each 4x4 block is treated as a single pixel.  If
+    an S3TC image has a width or height less than four, the data corresponding
+    to texels outside the image are irrelevant and undefined.
+
+    When an S3TC image with a width of <w>, height of <h>, and block size of
+    <blocksize> (8 or 16 bytes) is decoded, the corresponding image size (in
+    bytes) is:
+    
+        ceil(<w>/4) * ceil(<h>/4) * blocksize.
+
+    When decoding an S3TC image, the block containing the texel at offset
+    (<x>, <y>) begins at an offset (in bytes) relative to the base of the
+    image of:
+
+        blocksize * (ceil(<w>/4) * floor(<y>/4) + floor(<x>/4)).
+
+    The data corresponding to a specific texel (<x>, <y>) are extracted from a
+    4x4 texel block using a relative (x,y) value of
+    
+        (<x> modulo 4, <y> modulo 4).
+
+    There are four distinct S3TC image formats:
+
+    COMPRESSED_RGB_S3TC_DXT1_EXT:  Each 4x4 block of texels consists of 64
+    bits of RGB image data.  
+
+    Each RGB image data block is encoded as a sequence of 8 bytes, called (in
+    order of increasing address):
+
+            c0_lo, c0_hi, c1_lo, c1_hi, bits_0, bits_1, bits_2, bits_3
+
+        The 8 bytes of the block are decoded into three quantities:
+
+            color0 = c0_lo + c0_hi * 256
+            color1 = c1_lo + c1_hi * 256
+            bits   = bits_0 + 256 * (bits_1 + 256 * (bits_2 + 256 * bits_3))
+        
+        color0 and color1 are 16-bit unsigned integers that are unpacked to
+        RGB colors RGB0 and RGB1 as though they were 16-bit packed pixels with
+        a <format> of RGB and a type of UNSIGNED_SHORT_5_6_5.
+
+        bits is a 32-bit unsigned integer, from which a two-bit control code
+        is extracted for a texel at location (x,y) in the block using:
+
+            code(x,y) = bits[2*(4*y+x)+1..2*(4*y+x)+0]
+        
+        where bit 31 is the most significant and bit 0 is the least
+        significant bit.
+
+        The RGB color for a texel at location (x,y) in the block is given by:
+
+            RGB0,              if color0 > color1 and code(x,y) == 0
+            RGB1,              if color0 > color1 and code(x,y) == 1
+            (2*RGB0+RGB1)/3,   if color0 > color1 and code(x,y) == 2
+            (RGB0+2*RGB1)/3,   if color0 > color1 and code(x,y) == 3
+
+            RGB0,              if color0 <= color1 and code(x,y) == 0
+            RGB1,              if color0 <= color1 and code(x,y) == 1
+            (RGB0+RGB1)/2,     if color0 <= color1 and code(x,y) == 2
+            BLACK,             if color0 <= color1 and code(x,y) == 3
+
+        Arithmetic operations are done per component, and BLACK refers to an
+        RGB color where red, green, and blue are all zero.
+
+    Since this image has an RGB format, there is no alpha component and the
+    image is considered fully opaque.
+
+
+    COMPRESSED_RGBA_S3TC_DXT1_EXT:  Each 4x4 block of texels consists of 64
+    bits of RGB image data and minimal alpha information.  The RGB components
+    of a texel are extracted in the same way as COMPRESSED_RGB_S3TC_DXT1_EXT.
+ 
+        The alpha component for a texel at location (x,y) in the block is
+        given by:
+
+            0.0,               if color0 <= color1 and code(x,y) == 3
+            1.0,               otherwise
+
+        IMPORTANT:  When encoding an RGBA image into a format using 1-bit
+        alpha, any texels with an alpha component less than 0.5 end up with an
+        alpha of 0.0 and any texels with an alpha component greater than or
+        equal to 0.5 end up with an alpha of 1.0.  When encoding an RGBA image
+        into the COMPRESSED_RGBA_S3TC_DXT1_EXT format, the resulting red,
+        green, and blue components of any texels with a final alpha of 0.0
+        will automatically be zero (black).  If this behavior is not desired
+        by an application, it should not use COMPRESSED_RGBA_S3TC_DXT1_EXT.
+        This format will never be used when a generic compressed internal
+        format (Table 3.16.2) is specified, although the nearly identical
+        format COMPRESSED_RGB_S3TC_DXT1_EXT (above) may be.
+
+
+    COMPRESSED_RGBA_S3TC_DXT3_EXT:  Each 4x4 block of texels consists of 64
+    bits of uncompressed alpha image data followed by 64 bits of RGB image
+    data.  
+
+    Each RGB image data block is encoded according to the
+    COMPRESSED_RGB_S3TC_DXT1_EXT format, with the exception that the two code
+    bits always use the non-transparent encodings.  In other words, they are
+    treated as though color0 > color1, regardless of the actual values of
+    color0 and color1.
+
+    Each alpha image data block is encoded as a sequence of 8 bytes, called
+    (in order of increasing address):
+
+            a0, a1, a2, a3, a4, a5, a6, a7
+
+        The 8 bytes of the block are decoded into one 64-bit integer:
+
+            alpha = a0 + 256 * (a1 + 256 * (a2 + 256 * (a3 + 256 * (a4 +
+                         256 * (a5 + 256 * (a6 + 256 * a7))))))
+
+        alpha is a 64-bit unsigned integer, from which a four-bit alpha value
+        is extracted for a texel at location (x,y) in the block using:
+
+            alpha(x,y) = alpha[4*(4*y+x)+3..4*(4*y+x)+0]
+
+        where bit 63 is the most significant and bit 0 is the least
+        significant bit.
+
+        The alpha component for a texel at location (x,y) in the block is
+        given by alpha(x,y) / 15.
+
+ 
+    COMPRESSED_RGBA_S3TC_DXT5_EXT:  Each 4x4 block of texels consists of 64
+    bits of compressed alpha image data followed by 64 bits of RGB image data.
+
+    Each RGB image data block is encoded according to the
+    COMPRESSED_RGB_S3TC_DXT1_EXT format, with the exception that the two code
+    bits always use the non-transparent encodings.  In other words, they are
+    treated as though color0 > color1, regardless of the actual values of
+    color0 and color1.
+
+    Each alpha image data block is encoded as a sequence of 8 bytes, called
+    (in order of increasing address):
+
+        alpha0, alpha1, bits_0, bits_1, bits_2, bits_3, bits_4, bits_5
+
+        The alpha0 and alpha1 are 8-bit unsigned bytes converted to alpha
+        components by multiplying by 1/255.
+
+        The 6 "bits" bytes of the block are decoded into one 48-bit integer:
+
+          bits = bits_0 + 256 * (bits_1 + 256 * (bits_2 + 256 * (bits_3 + 
+                          256 * (bits_4 + 256 * bits_5))))
+
+        bits is a 48-bit unsigned integer, from which a three-bit control code
+        is extracted for a texel at location (x,y) in the block using:
+
+            code(x,y) = bits[3*(4*y+x)+2..3*(4*y+x)+0]
+
+        where bit 47 is the most significant and bit 0 is the least
+        significant bit.
+
+        The alpha component for a texel at location (x,y) in the block is
+        given by:
+
+              alpha0,                   code(x,y) == 0
+              alpha1,                   code(x,y) == 1
+
+              (6*alpha0 + 1*alpha1)/7,  alpha0 > alpha1 and code(x,y) == 2
+              (5*alpha0 + 2*alpha1)/7,  alpha0 > alpha1 and code(x,y) == 3
+              (4*alpha0 + 3*alpha1)/7,  alpha0 > alpha1 and code(x,y) == 4
+              (3*alpha0 + 4*alpha1)/7,  alpha0 > alpha1 and code(x,y) == 5
+              (2*alpha0 + 5*alpha1)/7,  alpha0 > alpha1 and code(x,y) == 6
+              (1*alpha0 + 6*alpha1)/7,  alpha0 > alpha1 and code(x,y) == 7
+
+              (4*alpha0 + 1*alpha1)/5,  alpha0 <= alpha1 and code(x,y) == 2
+              (3*alpha0 + 2*alpha1)/5,  alpha0 <= alpha1 and code(x,y) == 3
+              (2*alpha0 + 3*alpha1)/5,  alpha0 <= alpha1 and code(x,y) == 4
+              (1*alpha0 + 4*alpha1)/5,  alpha0 <= alpha1 and code(x,y) == 5
+              0.0,                      alpha0 <= alpha1 and code(x,y) == 6
+              1.0,                      alpha0 <= alpha1 and code(x,y) == 7
+
+
+Revision History
+
+    1.1,  11/16/01 pbrown:    Updated contact info, clarified where texels
+                              fall within a single block.
+
+    1.0,  07/07/00 prbrown1:  Published final version agreed to by working
+                              group members.
+
+    0.9,  06/24/00 prbrown1:  Documented that block-aligned TexSubImage calls
+                              do not modify existing texels outside the
+                              modified blocks.  Added caveat to allow for a
+                              (0,0)-anchored TexSubImage operation of
+                              arbitrary size.
+
+    0.7,  04/11/00 prbrown1:  Added issues on DXT1, DXT3, and DXT5 encodings
+                              where the MSDN documentation doesn't match what
+                              is really done.  Added enum values from the
+                              extension registry.
+
+    0.4,  03/28/00 prbrown1:  Updated to reflect final version of the
+                              ARB_texture_compression extension.  Allowed
+                              block-aligned TexSubImage calls.
+
+    0.3,  03/07/00 prbrown1:  Resolved issues pertaining to the format of RGB
+                              blocks in the DXT3 and DXT5 formats (they don't
+                              ever use the "transparent" encoding).  Fixed
+                              decoding of DXT1 blocks.  Pointed out issue of
+                              "transparent" texels in DXT1 encodings having
+                              different behaviors for RGB and RGBA internal
+                              formats.
+
+    0.2,  02/23/00 prbrown1:  Minor revisions; added several issues.
+
+    0.11, 02/17/00 prbrown1:  Slight modification to error semantics
+                              (INVALID_ENUM instead of INVALID_OPERATION).
+
+    0.1,  02/15/00 prbrown1:  Initial revision.
--- a/src/nvimage/nvtt/squish/vs7/squish.sln
+++ b/src/nvimage/nvtt/squish/vs7/squish.sln
@ -0,0 +1,39 @@
+Microsoft Visual Studio Solution File, Format Version 8.00
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "squish", "squish\squish.vcproj", "{6A8518C3-D81A-4428-BD7F-C37933088AC1}"
+	ProjectSection(ProjectDependencies) = postProject
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "squishpng", "squishpng\squishpng.vcproj", "{3BC7CF47-F1C8-4BDA-BE30-92F17B21D2C7}"
+	ProjectSection(ProjectDependencies) = postProject
+		{6A8518C3-D81A-4428-BD7F-C37933088AC1} = {6A8518C3-D81A-4428-BD7F-C37933088AC1}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "squishtest", "squishtest\squishtest.vcproj", "{77A3F26C-A1D6-4535-9E37-7D3DF34E4B4B}"
+	ProjectSection(ProjectDependencies) = postProject
+		{6A8518C3-D81A-4428-BD7F-C37933088AC1} = {6A8518C3-D81A-4428-BD7F-C37933088AC1}
+	EndProjectSection
+EndProject
+Global
+	GlobalSection(SolutionConfiguration) = preSolution
+		Debug = Debug
+		Release = Release
+	EndGlobalSection
+	GlobalSection(ProjectConfiguration) = postSolution
+		{6A8518C3-D81A-4428-BD7F-C37933088AC1}.Debug.ActiveCfg = Debug|Win32
+		{6A8518C3-D81A-4428-BD7F-C37933088AC1}.Debug.Build.0 = Debug|Win32
+		{6A8518C3-D81A-4428-BD7F-C37933088AC1}.Release.ActiveCfg = Release|Win32
+		{6A8518C3-D81A-4428-BD7F-C37933088AC1}.Release.Build.0 = Release|Win32
+		{3BC7CF47-F1C8-4BDA-BE30-92F17B21D2C7}.Debug.ActiveCfg = Debug|Win32
+		{3BC7CF47-F1C8-4BDA-BE30-92F17B21D2C7}.Debug.Build.0 = Debug|Win32
+		{3BC7CF47-F1C8-4BDA-BE30-92F17B21D2C7}.Release.ActiveCfg = Release|Win32
+		{3BC7CF47-F1C8-4BDA-BE30-92F17B21D2C7}.Release.Build.0 = Release|Win32
+		{77A3F26C-A1D6-4535-9E37-7D3DF34E4B4B}.Debug.ActiveCfg = Debug|Win32
+		{77A3F26C-A1D6-4535-9E37-7D3DF34E4B4B}.Debug.Build.0 = Debug|Win32
+		{77A3F26C-A1D6-4535-9E37-7D3DF34E4B4B}.Release.ActiveCfg = Release|Win32
+		{77A3F26C-A1D6-4535-9E37-7D3DF34E4B4B}.Release.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+	EndGlobalSection
+	GlobalSection(ExtensibilityAddIns) = postSolution
+	EndGlobalSection
+EndGlobal
--- a/src/nvimage/nvtt/squish/vs7/squish/squish.vcproj
+++ b/src/nvimage/nvtt/squish/vs7/squish/squish.vcproj
@ -0,0 +1,198 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="7.10"
+	Name="squish"
+	ProjectGUID="{6A8518C3-D81A-4428-BD7F-C37933088AC1}"
+	Keyword="Win32Proj">
+	<Platforms>
+		<Platform
+			Name="Win32"/>
+	</Platforms>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="Debug"
+			IntermediateDirectory="Debug"
+			ConfigurationType="4"
+			CharacterSet="2">
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\.."
+				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;SQUISH_USE_SSE=1"
+				MinimalRebuild="TRUE"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				EnableEnhancedInstructionSet="1"
+				ForceConformanceInForLoopScope="TRUE"
+				UsePrecompiledHeader="0"
+				WarningLevel="4"
+				WarnAsError="TRUE"
+				Detect64BitPortabilityProblems="TRUE"
+				DebugInformationFormat="3"/>
+			<Tool
+				Name="VCCustomBuildTool"/>
+			<Tool
+				Name="VCLibrarianTool"
+				OutputFile="$(OutDir)/squish.lib"/>
+			<Tool
+				Name="VCMIDLTool"/>
+			<Tool
+				Name="VCPostBuildEventTool"/>
+			<Tool
+				Name="VCPreBuildEventTool"/>
+			<Tool
+				Name="VCPreLinkEventTool"/>
+			<Tool
+				Name="VCResourceCompilerTool"/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"/>
+			<Tool
+				Name="VCManagedWrapperGeneratorTool"/>
+			<Tool
+				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="Release"
+			IntermediateDirectory="Release"
+			ConfigurationType="4"
+			CharacterSet="2"
+			WholeProgramOptimization="TRUE">
+			<Tool
+				Name="VCCLCompilerTool"
+				GlobalOptimizations="TRUE"
+				InlineFunctionExpansion="2"
+				FavorSizeOrSpeed="1"
+				OmitFramePointers="TRUE"
+				AdditionalIncludeDirectories="..\.."
+				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;SQUISH_USE_SSE=1"
+				RuntimeLibrary="2"
+				ForceConformanceInForLoopScope="TRUE"
+				UsePrecompiledHeader="0"
+				WarningLevel="4"
+				WarnAsError="TRUE"
+				Detect64BitPortabilityProblems="TRUE"
+				DebugInformationFormat="3"/>
+			<Tool
+				Name="VCCustomBuildTool"/>
+			<Tool
+				Name="VCLibrarianTool"
+				OutputFile="$(OutDir)/squish.lib"/>
+			<Tool
+				Name="VCMIDLTool"/>
+			<Tool
+				Name="VCPostBuildEventTool"/>
+			<Tool
+				Name="VCPreBuildEventTool"/>
+			<Tool
+				Name="VCPreLinkEventTool"/>
+			<Tool
+				Name="VCResourceCompilerTool"/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"/>
+			<Tool
+				Name="VCManagedWrapperGeneratorTool"/>
+			<Tool
+				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}">
+			<File
+				RelativePath="..\..\alpha.cpp">
+			</File>
+			<File
+				RelativePath="..\..\clusterfit.cpp">
+			</File>
+			<File
+				RelativePath="..\..\colourblock.cpp">
+			</File>
+			<File
+				RelativePath="..\..\colourfit.cpp">
+			</File>
+			<File
+				RelativePath="..\..\colourset.cpp">
+			</File>
+			<File
+				RelativePath="..\..\maths.cpp">
+			</File>
+			<File
+				RelativePath="..\..\rangefit.cpp">
+			</File>
+			<File
+				RelativePath="..\..\singlecolourfit.cpp">
+			</File>
+			<File
+				RelativePath="..\..\squish.cpp">
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}">
+			<File
+				RelativePath="..\..\alpha.h">
+			</File>
+			<File
+				RelativePath="..\..\clusterfit.h">
+			</File>
+			<File
+				RelativePath="..\..\colourblock.h">
+			</File>
+			<File
+				RelativePath="..\..\colourfit.h">
+			</File>
+			<File
+				RelativePath="..\..\colourset.h">
+			</File>
+			<File
+				RelativePath="..\..\config.h">
+			</File>
+			<File
+				RelativePath="..\..\maths.h">
+			</File>
+			<File
+				RelativePath="..\..\rangefit.h">
+			</File>
+			<File
+				RelativePath="..\..\simd.h">
+			</File>
+			<File
+				RelativePath="..\..\simd_sse.h">
+			</File>
+			<File
+				RelativePath="..\..\simd_ve.h">
+			</File>
+			<File
+				RelativePath="..\..\singlecolourfit.h">
+			</File>
+			<File
+				RelativePath="..\..\singlecolourlookup.inl">
+			</File>
+			<File
+				RelativePath="..\..\squish.h">
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}">
+			<File
+				RelativePath="..\..\texture_compression_s3tc.txt">
+			</File>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- a/src/nvimage/nvtt/squish/vs7/squishpng/squishpng.vcproj
+++ b/src/nvimage/nvtt/squish/vs7/squishpng/squishpng.vcproj
@ -0,0 +1,140 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="7.10"
+	Name="squishpng"
+	ProjectGUID="{3BC7CF47-F1C8-4BDA-BE30-92F17B21D2C7}"
+	Keyword="Win32Proj">
+	<Platforms>
+		<Platform
+			Name="Win32"/>
+	</Platforms>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="Debug"
+			IntermediateDirectory="Debug"
+			ConfigurationType="1"
+			CharacterSet="2">
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\.."
+				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+				MinimalRebuild="TRUE"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				ForceConformanceInForLoopScope="TRUE"
+				UsePrecompiledHeader="0"
+				WarningLevel="4"
+				WarnAsError="TRUE"
+				Detect64BitPortabilityProblems="TRUE"
+				DebugInformationFormat="3"/>
+			<Tool
+				Name="VCCustomBuildTool"/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="libpng13d.lib"
+				OutputFile="$(OutDir)/squishpng.exe"
+				LinkIncremental="2"
+				GenerateDebugInformation="TRUE"
+				ProgramDatabaseFile="$(OutDir)/squishpng.pdb"
+				SubSystem="1"
+				TargetMachine="1"/>
+			<Tool
+				Name="VCMIDLTool"/>
+			<Tool
+				Name="VCPostBuildEventTool"/>
+			<Tool
+				Name="VCPreBuildEventTool"/>
+			<Tool
+				Name="VCPreLinkEventTool"/>
+			<Tool
+				Name="VCResourceCompilerTool"/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"/>
+			<Tool
+				Name="VCWebDeploymentTool"/>
+			<Tool
+				Name="VCManagedWrapperGeneratorTool"/>
+			<Tool
+				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="Release"
+			IntermediateDirectory="Release"
+			ConfigurationType="1"
+			CharacterSet="2">
+			<Tool
+				Name="VCCLCompilerTool"
+				AdditionalIncludeDirectories="..\.."
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+				RuntimeLibrary="2"
+				ForceConformanceInForLoopScope="TRUE"
+				UsePrecompiledHeader="0"
+				WarningLevel="4"
+				WarnAsError="TRUE"
+				Detect64BitPortabilityProblems="TRUE"
+				DebugInformationFormat="3"/>
+			<Tool
+				Name="VCCustomBuildTool"/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="libpng13.lib"
+				OutputFile="$(OutDir)/squishpng.exe"
+				LinkIncremental="1"
+				GenerateDebugInformation="TRUE"
+				SubSystem="1"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"/>
+			<Tool
+				Name="VCMIDLTool"/>
+			<Tool
+				Name="VCPostBuildEventTool"/>
+			<Tool
+				Name="VCPreBuildEventTool"/>
+			<Tool
+				Name="VCPreLinkEventTool"/>
+			<Tool
+				Name="VCResourceCompilerTool"/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"/>
+			<Tool
+				Name="VCWebDeploymentTool"/>
+			<Tool
+				Name="VCManagedWrapperGeneratorTool"/>
+			<Tool
+				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}">
+			<File
+				RelativePath="..\..\extra\squishpng.cpp">
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}">
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}">
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- a/src/nvimage/nvtt/squish/vs7/squishtest/squishtest.vcproj
+++ b/src/nvimage/nvtt/squish/vs7/squishtest/squishtest.vcproj
@ -0,0 +1,138 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="7.10"
+	Name="squishtest"
+	ProjectGUID="{77A3F26C-A1D6-4535-9E37-7D3DF34E4B4B}"
+	Keyword="Win32Proj">
+	<Platforms>
+		<Platform
+			Name="Win32"/>
+	</Platforms>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="Debug"
+			IntermediateDirectory="Debug"
+			ConfigurationType="1"
+			CharacterSet="2">
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\.."
+				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+				MinimalRebuild="TRUE"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				ForceConformanceInForLoopScope="TRUE"
+				UsePrecompiledHeader="0"
+				WarningLevel="4"
+				WarnAsError="TRUE"
+				Detect64BitPortabilityProblems="TRUE"
+				DebugInformationFormat="3"/>
+			<Tool
+				Name="VCCustomBuildTool"/>
+			<Tool
+				Name="VCLinkerTool"
+				OutputFile="$(OutDir)/squishtest.exe"
+				LinkIncremental="2"
+				GenerateDebugInformation="TRUE"
+				ProgramDatabaseFile="$(OutDir)/squishtest.pdb"
+				SubSystem="1"
+				TargetMachine="1"/>
+			<Tool
+				Name="VCMIDLTool"/>
+			<Tool
+				Name="VCPostBuildEventTool"/>
+			<Tool
+				Name="VCPreBuildEventTool"/>
+			<Tool
+				Name="VCPreLinkEventTool"/>
+			<Tool
+				Name="VCResourceCompilerTool"/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"/>
+			<Tool
+				Name="VCWebDeploymentTool"/>
+			<Tool
+				Name="VCManagedWrapperGeneratorTool"/>
+			<Tool
+				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="Release"
+			IntermediateDirectory="Release"
+			ConfigurationType="1"
+			CharacterSet="2">
+			<Tool
+				Name="VCCLCompilerTool"
+				AdditionalIncludeDirectories="..\.."
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+				RuntimeLibrary="2"
+				ForceConformanceInForLoopScope="TRUE"
+				UsePrecompiledHeader="0"
+				WarningLevel="4"
+				WarnAsError="TRUE"
+				Detect64BitPortabilityProblems="TRUE"
+				DebugInformationFormat="3"/>
+			<Tool
+				Name="VCCustomBuildTool"/>
+			<Tool
+				Name="VCLinkerTool"
+				OutputFile="$(OutDir)/squishtest.exe"
+				LinkIncremental="1"
+				GenerateDebugInformation="TRUE"
+				SubSystem="1"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"/>
+			<Tool
+				Name="VCMIDLTool"/>
+			<Tool
+				Name="VCPostBuildEventTool"/>
+			<Tool
+				Name="VCPreBuildEventTool"/>
+			<Tool
+				Name="VCPreLinkEventTool"/>
+			<Tool
+				Name="VCResourceCompilerTool"/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"/>
+			<Tool
+				Name="VCWebDeploymentTool"/>
+			<Tool
+				Name="VCManagedWrapperGeneratorTool"/>
+			<Tool
+				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}">
+			<File
+				RelativePath="..\..\extra\squishtest.cpp">
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}">
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}">
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- a/src/nvimage/nvtt/squish/weightedclusterfit.cpp
+++ b/src/nvimage/nvtt/squish/weightedclusterfit.cpp
@ -0,0 +1,576 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+	Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+
+#include "weightedclusterfit.h"
+#include "colourset.h"
+#include "colourblock.h"
+#include <cfloat>
+
+
+namespace squish {
+
+// Prepares the fitter for one colour set.
+//
+// The points are projected onto the principal axis of their weighted
+// covariance, ordered along that axis, and cached pre-multiplied by their
+// weights together with the running totals used by Compress3/Compress4.
+//
+// colours: colour set to fit; forwarded to ColourFit and read via m_colours.
+// flags:   forwarded to ColourFit unchanged.
+//
+// m_metric is NOT set here: setMetric() must be called before Compress3(),
+// Compress4() or bestError() are used.
+//
+// NOTE(review): dps[] and the member arrays hold 16 entries, so this assumes
+// GetCount() <= 16 -- confirm against ColourSet.
+WeightedClusterFit::WeightedClusterFit( ColourSet const* colours, int flags ) :
+	ColourFit( colours, flags )
+{
+	// initialise the best error
+#if SQUISH_USE_SIMD
+	m_besterror = VEC4_CONST( FLT_MAX );
+#else
+	m_besterror = FLT_MAX;
+#endif
+
+	// cache some values
+	int const count = m_colours->GetCount();
+	Vec3 const* values = m_colours->GetPoints();
+	
+	// get the covariance matrix
+	Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights() );
+	
+	// compute the principal component
+	Vec3 principle = ComputePrincipleComponent( covariance );
+
+	// build the list of values: the projection of every point on the axis
+	float dps[16];
+	for( int i = 0; i < count; ++i )
+	{
+		dps[i] = Dot( values[i], principle );
+		m_order[i] = i;
+	}
+	
+	// Compress3/Compress4 always walk 16 slots and use m_order[] as a write
+	// index into a 16 entry table, so give the unused tail a valid identity
+	// order instead of leaving it uninitialised.
+	for( int i = count; i < 16; ++i )
+		m_order[i] = i;
+	
+	// stable sort (insertion sort on the projections, m_order follows along)
+	for( int i = 0; i < count; ++i )
+	{
+		for( int j = i; j > 0 && dps[j] < dps[j - 1]; --j )
+		{
+			std::swap( dps[j], dps[j - 1] );
+			std::swap( m_order[j], m_order[j - 1] );
+		}
+	}
+	
+	// weight all the points
+#if SQUISH_USE_SIMD
+	Vec4 const* unweighted = m_colours->GetPointsSimd();
+	Vec4 const* weights = m_colours->GetWeightsSimd();
+	m_xxsum = VEC4_CONST( 0.0f );
+	m_xsum = VEC4_CONST( 0.0f );
+#else
+	Vec3 const* unweighted = m_colours->GetPoints();
+	float const* weights = m_colours->GetWeights();
+	m_xxsum = Vec3( 0.0f );
+	m_xsum = Vec3( 0.0f );
+	m_wsum = 0.0f;	
+#endif
+	
+	// store the points in sorted order and accumulate the totals
+	for( int i = 0; i < count; ++i )
+	{
+		int p = m_order[i];
+		m_weighted[i] = weights[p] * unweighted[p];
+		m_xxsum += m_weighted[i] * m_weighted[i];
+		m_xsum += m_weighted[i];
+#if !SQUISH_USE_SIMD		
+		m_weights[i] = weights[p];
+		m_wsum += m_weights[i];
+#endif
+	}
+	
+	// slots past the end of the set carry no weight, so the fixed 16 wide
+	// cluster search treats them as absent rather than reading garbage
+	for( int i = count; i < 16; ++i )
+	{
+#if SQUISH_USE_SIMD
+		m_weighted[i] = VEC4_CONST( 0.0f );
+#else
+		m_weighted[i] = Vec3( 0.0f );
+		m_weights[i] = 0.0f;
+#endif
+	}
+}
+
+
+void WeightedClusterFit::setMetric(float r, float g, float b)
+{
+#if SQUISH_USE_SIMD
+	m_metric = Vec4(r, g, b, 0);
+#else
+	m_metric = Vec3(r, g, b);
+#endif
+}
+
+float WeightedClusterFit::bestError() const
+{
+#if SQUISH_USE_SIMD
+	Vec4 x = m_xxsum * m_metric;
+	Vec4 error = m_besterror + x.SplatX() + x.SplatY() + x.SplatZ();
+	return error.GetVec3().X();
+#else
+	return m_besterror + Dot(m_xxsum, m_metric);
+#endif
+
+}
+
+#if SQUISH_USE_SIMD
+
+// SIMD version of the three colour fit.
+//
+// Tries every split of the 16 ordered points into three consecutive clusters
+// (c0 points on the start endpoint, c1 on the midpoint, the rest on the end
+// endpoint), solves the least-squares endpoints for each split, snaps them to
+// the 5:6:5 grid and keeps the split with the lowest metric-weighted error.
+// The block is written only if that error beats m_besterror.
+//
+// block: destination handed to WriteColourBlock3.
+//
+// The error compared here leaves out the constant m_xxsum term; bestError()
+// adds it back. m_metric must have been set with setMetric() beforehand.
+//
+// NOTE(review): the w lane of the running sums is used as the scalar weight
+// sum (see the alpha2/beta2 splats), which presumably relies on the w lane of
+// m_weighted holding the point weight -- confirm against ColourSet.
+void WeightedClusterFit::Compress3( void* block )
+{
+	Vec4 const one = VEC4_CONST(1.0f);
+	Vec4 const zero = VEC4_CONST(0.0f);
+	// xyz: interpolation factor of the midpoint cluster, w: its square
+	Vec4 const half(0.5f, 0.5f, 0.5f, 0.25f);
+	Vec4 const two = VEC4_CONST(2.0);
+	
+	// 5:6:5 quantisation grid and the scale back to [0, 1]
+	Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+	Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f );
+	 
+	// declare variables
+	Vec4 beststart = VEC4_CONST( 0.0f );
+	Vec4 bestend = VEC4_CONST( 0.0f );
+	Vec4 besterror = VEC4_CONST( FLT_MAX );
+
+	// running sum of the first c0 points
+	Vec4 x0 = zero;
+	
+	// cluster sizes of the best split
+	int b0 = 0, b1 = 0;
+
+	// check all possible clusters for this total order
+	for( int c0 = 0; c0 <= 16; c0++)
+	{	
+		// running sum of the c1 points that follow the first cluster
+		Vec4 x1 = zero;
+		
+		for( int c1 = 0; c1 <= 16-c0; c1++)
+		{
+			Vec4 const x2 = m_xsum - x1 - x0;
+			
+			//Vec3 const alphax_sum = x0 + x1 * 0.5f;
+			//float const alpha2_sum = w0 + w1 * 0.25f;
+			Vec4 const alphax_sum = MultiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum
+			Vec4 const alpha2_sum = alphax_sum.SplatW();
+			
+			//Vec3 const betax_sum = x2 + x1 * 0.5f;
+			//float const beta2_sum = w2 + w1 * 0.25f;
+			Vec4 const betax_sum = MultiplyAdd(x1, half, x2); // betax_sum, beta2_sum
+			Vec4 const beta2_sum = betax_sum.SplatW();
+			
+			//float const alphabeta_sum = w1 * 0.25f;
+			Vec4 const alphabeta_sum = (x1 * half).SplatW(); // alphabeta_sum
+			
+			// float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+			Vec4 const factor = Reciprocal( NegativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
+			
+			Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+			Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+			
+			// clamp the output to [0, 1]
+			a = Min( one, Max( zero, a ) );
+			b = Min( one, Max( zero, b ) );
+			
+			// clamp to the grid
+			a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp;
+			b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp;
+			
+			// compute the error
+			Vec4 e1 = MultiplyAdd( a, alphax_sum, b*betax_sum );
+			Vec4 e2 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+			Vec4 e3 = MultiplyAdd( a*b*alphabeta_sum - e1, two, e2 );
+			
+			// apply the metric to the error term
+			Vec4 e4 = e3 * m_metric;
+			Vec4 error = e4.SplatX() + e4.SplatY() + e4.SplatZ();
+			
+			// keep the solution if it wins
+			if( CompareAnyLessThan( error, besterror ) )
+			{
+				besterror = error;
+				beststart = a;
+				bestend = b;
+				b0 = c0;
+				b1 = c1;
+			}
+			
+			// move the next point into the midpoint cluster; on the last
+			// pass there is no next point and m_weighted[16] is out of bounds
+			if( c0+c1 < 16 )
+				x1 += m_weighted[c0+c1];
+		}
+		
+		// move the next point into the start cluster (none left when c0 == 16)
+		if( c0 < 16 )
+			x0 += m_weighted[c0];
+	}
+
+	// save the block if necessary
+	if( CompareAnyLessThan( besterror, m_besterror ) )
+	{
+		// compute indices from cluster sizes.
+		// 0 = start endpoint, 2 = midpoint, 1 = end endpoint
+		u8 bestindices[16];
+		{
+			int i = 0;
+			for(; i < b0; i++) {
+				bestindices[i] = 0;
+			}
+			for(; i < b0+b1; i++) {
+				bestindices[i] = 2;
+			}
+			for(; i < 16; i++) {
+				bestindices[i] = 1;
+			}
+		}
+		
+		// remap the indices from sorted order back to the original order
+		u8 ordered[16];
+		for( int i = 0; i < 16; ++i )
+			ordered[m_order[i]] = bestindices[i];
+		
+		// save the block
+		WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), ordered, block );
+		
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+// SIMD version of the four colour fit.
+//
+// Tries every split of the 16 ordered points into four consecutive clusters
+// (start endpoint, 2/3 point, 1/3 point, end endpoint), solves the
+// least-squares endpoints for each split, snaps them to the 5:6:5 grid and
+// keeps the split with the lowest metric-weighted error. The block is written
+// only if that error beats m_besterror.
+//
+// block: destination handed to WriteColourBlock4.
+//
+// The error compared here leaves out the constant m_xxsum term; bestError()
+// adds it back. m_metric must have been set with setMetric() beforehand.
+//
+// NOTE(review): the w lane of the running sums is used as the scalar weight
+// sum (see the alpha2/beta2 splats), which presumably relies on the w lane of
+// m_weighted holding the point weight -- confirm against ColourSet.
+void WeightedClusterFit::Compress4( void* block )
+{
+	Vec4 const one = VEC4_CONST(1.0f);
+	Vec4 const zero = VEC4_CONST(0.0f);
+	Vec4 const half = VEC4_CONST(0.5f);
+	Vec4 const two = VEC4_CONST(2.0);
+	// xyz: interpolation factor of an inner cluster, w: its square
+	Vec4 const onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f );
+	Vec4 const twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f );
+	
+	// 5:6:5 quantisation grid and the scale back to [0, 1]
+	Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
+	Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f );
+	
+	// declare variables
+	Vec4 beststart = VEC4_CONST( 0.0f );
+	Vec4 bestend = VEC4_CONST( 0.0f );
+	Vec4 besterror = VEC4_CONST( FLT_MAX );
+
+	// running sum of the first c0 points
+	Vec4 x0 = zero;
+	
+	// cluster sizes of the best split
+	int b0 = 0, b1 = 0, b2 = 0;
+
+	// check all possible clusters for this total order
+	for( int c0 = 0; c0 <= 16; c0++)
+	{	
+		// running sum of the c1 points of the second cluster
+		Vec4 x1 = zero;
+		
+		for( int c1 = 0; c1 <= 16-c0; c1++)
+		{	
+			// running sum of the c2 points of the third cluster
+			Vec4 x2 = zero;
+			
+			for( int c2 = 0; c2 <= 16-c0-c1; c2++)
+			{
+				Vec4 const x3 = m_xsum - x2 - x1 - x0;
+				
+				//Vec3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
+				//float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
+				Vec4 const alphax_sum = x0 + MultiplyAdd(x1, twothirds, x2 * onethird); // alphax_sum, alpha2_sum
+				Vec4 const alpha2_sum = alphax_sum.SplatW();
+				
+				//Vec3 const betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f);
+				//float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
+				Vec4 const betax_sum = x3 + MultiplyAdd(x2, twothirds, x1 * onethird); // betax_sum, beta2_sum
+				Vec4 const beta2_sum = betax_sum.SplatW();
+				
+				//float const alphabeta_sum = w1 * (2.0f/9.0f) + w2 * (2.0f/9.0f);
+				Vec4 const alphabeta_sum = two * (x1 * onethird + x2 * onethird).SplatW(); // alphabeta_sum
+				
+				// float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+				Vec4 const factor = Reciprocal( NegativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
+				
+				Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+				Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+				
+				// clamp the output to [0, 1]
+				a = Min( one, Max( zero, a ) );
+				b = Min( one, Max( zero, b ) );
+				
+				// clamp to the grid
+				a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp;
+				b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp;
+				
+				// compute the error
+				Vec4 e1 = MultiplyAdd( a, alphax_sum, b*betax_sum );
+				Vec4 e2 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+				Vec4 e3 = MultiplyAdd( a*b*alphabeta_sum - e1, two, e2 );
+				
+				// apply the metric to the error term
+				Vec4 e4 = e3 * m_metric;
+				Vec4 error = e4.SplatX() + e4.SplatY() + e4.SplatZ();
+				
+				// keep the solution if it wins
+				if( CompareAnyLessThan( error, besterror ) )
+				{
+					besterror = error;
+					beststart = a;
+					bestend = b;
+					b0 = c0;
+					b1 = c1;
+					b2 = c2;
+				}
+				
+				// move the next point into the third cluster; on the last
+				// pass there is no next point and m_weighted[16] is out of bounds
+				if( c0+c1+c2 < 16 )
+					x2 += m_weighted[c0+c1+c2];
+			}
+			
+			// move the next point into the second cluster, if there is one
+			if( c0+c1 < 16 )
+				x1 += m_weighted[c0+c1];
+		}
+		
+		// move the next point into the start cluster (none left when c0 == 16)
+		if( c0 < 16 )
+			x0 += m_weighted[c0];
+	}
+
+	// save the block if necessary
+	if( CompareAnyLessThan( besterror, m_besterror ) )
+	{
+		// compute indices from cluster sizes.
+		// 0 = start endpoint, 2 = 2/3 point, 3 = 1/3 point, 1 = end endpoint
+		u8 bestindices[16];
+		{
+			int i = 0;
+			for(; i < b0; i++) {
+				bestindices[i] = 0;
+			}
+			for(; i < b0+b1; i++) {
+				bestindices[i] = 2;
+			}
+			for(; i < b0+b1+b2; i++) {
+				bestindices[i] = 3;
+			}
+			for(; i < 16; i++) {
+				bestindices[i] = 1;
+			}
+		}
+		
+		// remap the indices from sorted order back to the original order
+		u8 ordered[16];
+		for( int i = 0; i < 16; ++i )
+			ordered[m_order[i]] = bestindices[i];
+		
+		// save the block
+		WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), ordered, block );
+		
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+#else
+
+// Scalar version of the three colour fit.
+//
+// Tries every split of the 16 ordered points into three consecutive clusters
+// (c0 points on the start endpoint, c1 on the midpoint, the rest on the end
+// endpoint), solves the least-squares endpoints for each split, snaps them to
+// the 5:6:5 grid and keeps the split with the lowest metric-weighted error.
+// The block is written only if that error beats m_besterror.
+//
+// block: destination handed to WriteColourBlock3.
+//
+// The error compared here leaves out the constant m_xxsum term; bestError()
+// adds it back. m_metric must have been set with setMetric() beforehand.
+void WeightedClusterFit::Compress3( void* block )
+{
+	// loop invariant constants
+	Vec3 const one( 1.0f );
+	Vec3 const zero( 0.0f );
+	Vec3 const half( 0.5f );
+	
+	// 5:6:5 quantisation grid and the scale back to [0, 1]
+	Vec3 const grid( 31.0f, 63.0f, 31.0f );
+	Vec3 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f );
+
+	// declare variables
+	Vec3 beststart( 0.0f );
+	Vec3 bestend( 0.0f );
+	float besterror = FLT_MAX;
+
+	// running sums (weighted points and weights) of the first c0 points
+	Vec3 x0(0.0f);
+	float w0 = 0.0f;
+	
+	// cluster sizes of the best split
+	int b0 = 0, b1 = 0;
+
+	// check all possible clusters for this total order
+	for( int c0 = 0; c0 <= 16; c0++)
+	{	
+		// running sums of the c1 points that follow the first cluster
+		Vec3 x1(0.0f);
+		float w1 = 0.0f;
+		
+		for( int c1 = 0; c1 <= 16-c0; c1++)
+		{	
+			float w2 = m_wsum - w0 - w1;
+			
+			// These factors could be entirely precomputed.
+			float const alpha2_sum = w0 + w1 * 0.25f;
+			float const beta2_sum = w2 + w1 * 0.25f;
+			float const alphabeta_sum = w1 * 0.25f;
+			float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+			
+			Vec3 const alphax_sum = x0 + x1 * 0.5f;
+			Vec3 const betax_sum = m_xsum - alphax_sum;
+			
+			Vec3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor;
+			Vec3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor;
+			
+			// clamp the output to [0, 1]
+			a = Min( one, Max( zero, a ) );
+			b = Min( one, Max( zero, b ) );
+			
+			// clamp to the grid
+			a = Floor( grid*a + half )*gridrcp;
+			b = Floor( grid*b + half )*gridrcp;
+			
+			// compute the error
+			Vec3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+			
+			// apply the metric to the error term
+			float error = Dot( e1, m_metric );
+			
+			// keep the solution if it wins
+			if( error < besterror )
+			{
+				besterror = error;
+				beststart = a;
+				bestend = b;
+				b0 = c0;
+				b1 = c1;
+			}
+			
+			// move the next point into the midpoint cluster; on the last
+			// pass there is no next point and index 16 is out of bounds
+			if( c0+c1 < 16 )
+			{
+				x1 += m_weighted[c0+c1];
+				w1 += m_weights[c0+c1];
+			}
+		}
+		
+		// move the next point into the start cluster (none left when c0 == 16)
+		if( c0 < 16 )
+		{
+			x0 += m_weighted[c0];
+			w0 += m_weights[c0];
+		}
+	}
+
+	// save the block if necessary
+	if( besterror < m_besterror )
+	{
+		// compute indices from cluster sizes.
+		// 0 = start endpoint, 2 = midpoint, 1 = end endpoint
+		u8 bestindices[16];
+		{
+			int i = 0;
+			for(; i < b0; i++) {
+				bestindices[i] = 0;
+			}
+			for(; i < b0+b1; i++) {
+				bestindices[i] = 2;
+			}
+			for(; i < 16; i++) {
+				bestindices[i] = 1;
+			}
+		}
+		
+		// remap the indices from sorted order back to the original order
+		u8 ordered[16];
+		for( int i = 0; i < 16; ++i )
+			ordered[m_order[i]] = bestindices[i];
+		
+		// save the block
+		WriteColourBlock3( beststart, bestend, ordered, block );
+		
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+// Scalar version of the four colour fit.
+//
+// Tries every split of the 16 ordered points into four consecutive clusters
+// (start endpoint, 2/3 point, 1/3 point, end endpoint), solves the
+// least-squares endpoints for each split, snaps them to the 5:6:5 grid and
+// keeps the split with the lowest metric-weighted error. The block is written
+// only if that error beats m_besterror.
+//
+// block: destination handed to WriteColourBlock4.
+//
+// The error compared here leaves out the constant m_xxsum term; bestError()
+// adds it back. m_metric must have been set with setMetric() beforehand.
+void WeightedClusterFit::Compress4( void* block )
+{
+	// loop invariant constants
+	Vec3 const one( 1.0f );
+	Vec3 const zero( 0.0f );
+	Vec3 const half( 0.5f );
+	
+	// 5:6:5 quantisation grid and the scale back to [0, 1]
+	Vec3 const grid( 31.0f, 63.0f, 31.0f );
+	Vec3 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f );
+
+	// declare variables
+	Vec3 beststart( 0.0f );
+	Vec3 bestend( 0.0f );
+	float besterror = FLT_MAX;
+
+	// running sums (weighted points and weights) of the first c0 points
+	Vec3 x0(0.0f);
+	float w0 = 0.0f;
+	
+	// cluster sizes of the best split
+	int b0 = 0, b1 = 0, b2 = 0;
+
+	// check all possible clusters for this total order
+	for( int c0 = 0; c0 <= 16; c0++)
+	{	
+		// running sums of the c1 points of the second cluster
+		Vec3 x1(0.0f);
+		float w1 = 0.0f;
+		
+		for( int c1 = 0; c1 <= 16-c0; c1++)
+		{	
+			// running sums of the c2 points of the third cluster
+			Vec3 x2(0.0f);
+			float w2 = 0.0f;
+			
+			for( int c2 = 0; c2 <= 16-c0-c1; c2++)
+			{
+				float w3 = m_wsum - w0 - w1 - w2;
+				
+				float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
+				float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
+				float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
+				float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+				
+				Vec3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
+				Vec3 const betax_sum = m_xsum - alphax_sum;
+				
+				Vec3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor;
+				Vec3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
+				
+				// clamp the output to [0, 1]
+				a = Min( one, Max( zero, a ) );
+				b = Min( one, Max( zero, b ) );
+				
+				// clamp to the grid
+				a = Floor( grid*a + half )*gridrcp;
+				b = Floor( grid*b + half )*gridrcp;
+				
+				// compute the error
+				Vec3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+				
+				// apply the metric to the error term
+				float error = Dot( e1, m_metric );
+				
+				// keep the solution if it wins
+				if( error < besterror )
+				{
+					besterror = error;
+					beststart = a;
+					bestend = b;
+					b0 = c0;
+					b1 = c1;
+					b2 = c2;
+				}
+				
+				// move the next point into the third cluster; on the last
+				// pass there is no next point and index 16 is out of bounds
+				if( c0+c1+c2 < 16 )
+				{
+					x2 += m_weighted[c0+c1+c2];
+					w2 += m_weights[c0+c1+c2];
+				}
+			}
+			
+			// move the next point into the second cluster, if there is one
+			if( c0+c1 < 16 )
+			{
+				x1 += m_weighted[c0+c1];
+				w1 += m_weights[c0+c1];
+			}
+		}
+		
+		// move the next point into the start cluster (none left when c0 == 16)
+		if( c0 < 16 )
+		{
+			x0 += m_weighted[c0];
+			w0 += m_weights[c0];
+		}
+	}
+
+	// save the block if necessary
+	if( besterror < m_besterror )
+	{
+		// compute indices from cluster sizes.
+		// 0 = start endpoint, 2 = 2/3 point, 3 = 1/3 point, 1 = end endpoint
+		u8 bestindices[16];
+		{
+			int i = 0;
+			for(; i < b0; i++) {
+				bestindices[i] = 0;
+			}
+			for(; i < b0+b1; i++) {
+				bestindices[i] = 2;
+			}
+			for(; i < b0+b1+b2; i++) {
+				bestindices[i] = 3;
+			}
+			for(; i < 16; i++) {
+				bestindices[i] = 1;
+			}
+		}
+		
+		// remap the indices from sorted order back to the original order
+		u8 ordered[16];
+		for( int i = 0; i < 16; ++i )
+			ordered[m_order[i]] = bestindices[i];
+		
+		// save the block
+		WriteColourBlock4( beststart, bestend, ordered, block );
+
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+#endif
+
+} // namespace squish
--- a/src/nvimage/nvtt/squish/weightedclusterfit.h
+++ b/src/nvimage/nvtt/squish/weightedclusterfit.h
@ -0,0 +1,76 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+	Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_WEIGHTEDCLUSTERFIT_H
+#define SQUISH_WEIGHTEDCLUSTERFIT_H
+
+#include "squish.h"
+#include "maths.h"
+#include "simd.h"
+#include "colourfit.h"
+
+namespace squish {
+
+// Colour fitter that orders the points of a colour set along their principal
+// axis and searches the splits of that order into consecutive clusters.
+//
+// Usage, as far as weightedclusterfit.cpp shows: construct, call setMetric(),
+// then Compress3()/Compress4(); bestError() reports the error of the best
+// block written so far.
+class WeightedClusterFit : public ColourFit
+{
+public:
+	// Sorts and pre-weights the points of the given colour set.
+	WeightedClusterFit( ColourSet const* colours, int flags );
+	
+	// Sets the per-channel error factors. The constructor does not
+	// initialise the metric, so call this before compressing.
+	void setMetric(float r, float g, float b);
+	
+	// Best error so far, including the constant term the search leaves out.
+	float bestError() const;
+
+	// NOTE(review): no definition appears in weightedclusterfit.cpp --
+	// confirm where (or whether) this is implemented before calling it.
+	static void doPrecomputation();
+
+	// Make them public
+	virtual void Compress3( void* block );
+	virtual void Compress4( void* block );
+	
+private:
+
+	// NOTE(review): not assigned anywhere in weightedclusterfit.cpp; the
+	// constructor keeps the principal axis in a local instead.
+	Vec3 m_principle;
+
+#if SQUISH_USE_SIMD
+	Vec4 m_weighted[16];	// points multiplied by their weights, in sorted order
+	Vec4 m_metric;		// per-channel error factors (fourth lane zero)
+	Vec4 m_xxsum;		// sum of the squared weighted points
+	Vec4 m_xsum;		// sum of the weighted points
+	Vec4 m_besterror;	// lowest error found so far, starts at FLT_MAX
+#else
+	Vec3 m_weighted[16];	// points multiplied by their weights, in sorted order
+	float m_weights[16];	// point weights, in sorted order
+	Vec3 m_metric;		// per-channel error factors
+	Vec3 m_xxsum;		// sum of the squared weighted points
+	Vec3 m_xsum;		// sum of the weighted points
+	float m_wsum;		// sum of the weights
+	float m_besterror;	// lowest error found so far, starts at FLT_MAX
+#endif
+
+	// m_order[i] is the original index of the point at sorted position i
+	int m_order[16];
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_WEIGHTEDCLUSTERFIT_H