diff --git a/src/nvtt/CompressDXT.cpp b/src/nvtt/CompressDXT.cpp
index dea258b..a8684d6 100644
--- a/src/nvtt/CompressDXT.cpp
+++ b/src/nvtt/CompressDXT.cpp
@@ -1,784 +1,675 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#include "CompressDXT.h"
-#include "QuickCompressDXT.h"
-#include "OptimalCompressDXT.h"
-#include "CompressionOptions.h"
-#include "OutputOptions.h"
-
-// squish
-#include "squish/colourset.h"
-#include "squish/fastclusterfit.h"
-#include "squish/weightedclusterfit.h"
-
-#include <nvtt/nvtt.h>
-
-#include <nvcore/Memory.h>
-
-#include <nvimage/Image.h>
-#include <nvimage/ColorBlock.h>
-#include <nvimage/BlockDXT.h>
-
-
-// s3_quant
-#if defined(HAVE_S3QUANT)
-#include "s3tc/s3_quant.h"
-#endif
-
-// ati tc
-#if defined(HAVE_ATITC)
-typedef int BOOL;
-typedef _W64 unsigned long ULONG_PTR;
-typedef ULONG_PTR DWORD_PTR;
-#include "atitc/ATI_Compress.h"
-#endif
-
-// squish
-#if defined(HAVE_SQUISH)
-//#include "squish/squish.h"
-#include "squish-1.10/squish.h"
-#endif
-
-// d3dx
-#if defined(HAVE_D3DX)
-#include <d3dx9.h>
-#endif
-
-// stb
-#if defined(HAVE_STB)
-#define STB_DEFINE
-#include "stb/stb_dxt.h"
-#endif
-
-
-using namespace nv;
-using namespace nvtt;
-
-
-nv::FastCompressor::FastCompressor() : m_image(NULL), m_alphaMode(AlphaMode_None)
-{
-}
-
-nv::FastCompressor::~FastCompressor()
-{
-}
-
-void nv::FastCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode)
-{
-	m_image = image;
-	m_alphaMode = alphaMode;
-}
-
-void nv::FastCompressor::compressDXT1(const OutputOptions::Private & outputOptions)
-{
-	const uint w = m_image->width();
-	const uint h = m_image->height();
-	
-	ColorBlock rgba;
-	BlockDXT1 block;
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			rgba.init(m_image, x, y);
-			
-			QuickCompress::compressDXT1(rgba, &block);
-			
-			if (outputOptions.outputHandler != NULL) {
-				outputOptions.outputHandler->writeData(&block, sizeof(block));
-			}
-		}
-	}
-}
-
-
-void nv::FastCompressor::compressDXT1a(const OutputOptions::Private & outputOptions)
-{
-	const uint w = m_image->width();
-	const uint h = m_image->height();
-	
-	ColorBlock rgba;
-	BlockDXT1 block;
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			rgba.init(m_image, x, y);
-			
-			QuickCompress::compressDXT1a(rgba, &block);
-			
-			if (outputOptions.outputHandler != NULL) {
-				outputOptions.outputHandler->writeData(&block, sizeof(block));
-			}
-		}
-	}
-}
-
-
-void nv::FastCompressor::compressDXT3(const nvtt::OutputOptions::Private & outputOptions)
-{
-	const uint w = m_image->width();
-	const uint h = m_image->height();
-	
-	ColorBlock rgba;
-	BlockDXT3 block;
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			rgba.init(m_image, x, y);
-
-			QuickCompress::compressDXT3(rgba, &block);
-			
-			if (outputOptions.outputHandler != NULL) {
-				outputOptions.outputHandler->writeData(&block, sizeof(block));
-			}
-		}
-	}
-}
-
-
-void nv::FastCompressor::compressDXT5(const nvtt::OutputOptions::Private & outputOptions)
-{
-	const uint w = m_image->width();
-	const uint h = m_image->height();
-	
-	ColorBlock rgba;
-	BlockDXT5 block;
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			rgba.init(m_image, x, y);
-			
-			QuickCompress::compressDXT5(rgba, &block, 0);
-			
-			if (outputOptions.outputHandler != NULL) {
-				outputOptions.outputHandler->writeData(&block, sizeof(block));
-			}
-		}
-	}
-}
-
-
-void nv::FastCompressor::compressDXT5n(const nvtt::OutputOptions::Private & outputOptions)
-{
-	const uint w = m_image->width();
-	const uint h = m_image->height();
-	
-	ColorBlock rgba;
-	BlockDXT5 block;
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			rgba.init(m_image, x, y);
-			
-			rgba.swizzleDXT5n();
-
-			QuickCompress::compressDXT5(rgba, &block, 0);
-			
-			if (outputOptions.outputHandler != NULL) {
-				outputOptions.outputHandler->writeData(&block, sizeof(block));
-			}
-		}
-	}
-}
-
-
-nv::SlowCompressor::SlowCompressor() : m_image(NULL), m_alphaMode(AlphaMode_None)
-{
-}
-
-nv::SlowCompressor::~SlowCompressor()
-{
-}
-
-void nv::SlowCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode)
-{
-	m_image = image;
-	m_alphaMode = alphaMode;
-}
-
-void nv::SlowCompressor::compressDXT1(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
-{
-	const uint w = m_image->width();
-	const uint h = m_image->height();
-	
-	ColorBlock rgba;
-	BlockDXT1 block;
-
-	nvsquish::WeightedClusterFit fit;
-	//nvsquish::ClusterFit fit;
-	//nvsquish::FastClusterFit fit;
-	fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			
-			rgba.init(m_image, x, y);
-			
-			if (rgba.isSingleColor())
-			{
-				OptimalCompress::compressDXT1(rgba.color(0), &block);
-			}
-			else
-			{
-				nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0, true);
-				fit.SetColourSet(&colours, nvsquish::kDxt1);
-				fit.Compress(&block);
-			}
-			
-			if (outputOptions.outputHandler != NULL) {
-				outputOptions.outputHandler->writeData(&block, sizeof(block));
-			}
-		}
-	}
-}
-
-
-void nv::SlowCompressor::compressDXT1a(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
-{
-	const uint w = m_image->width();
-	const uint h = m_image->height();
-	
-	ColorBlock rgba;
-	BlockDXT1 block;
-
-	nvsquish::WeightedClusterFit fit;
-	fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			
-			rgba.init(m_image, x, y);
-			
-			bool anyAlpha = false;
-			bool allAlpha = true;
-			
-			for (uint i = 0; i < 16; i++)
-			{
-				if (rgba.color(i).a < 128) anyAlpha = true;
-				else allAlpha = false;
-			}
-			
-			if ((!anyAlpha && rgba.isSingleColor() || allAlpha))
-			{
-				OptimalCompress::compressDXT1a(rgba.color(0), &block);
-			}
-			else
-			{
-				nvsquish::ColourSet colours((uint8 *)rgba.colors(), nvsquish::kDxt1|nvsquish::kWeightColourByAlpha);
-				fit.SetColourSet(&colours, nvsquish::kDxt1);
-				fit.Compress(&block);
-			}
-			
-			if (outputOptions.outputHandler != NULL) {
-				outputOptions.outputHandler->writeData(&block, sizeof(block));
-			}
-		}
-	}
-}
-
-
-void nv::SlowCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
-{
-	const uint w = m_image->width();
-	const uint h = m_image->height();
-	
-	ColorBlock rgba;
-	BlockDXT3 block;
-	
-	nvsquish::WeightedClusterFit fit;
-	fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
-
-	int flags = 0;
-	if (m_alphaMode == AlphaMode_Transparency)
-	{
-		flags = nvsquish::kWeightColourByAlpha;
-	}
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			
-			rgba.init(m_image, x, y);
-			
-			// Compress explicit alpha.
-			OptimalCompress::compressDXT3A(rgba, &block.alpha);
-
-			// Compress color.
-			if (rgba.isSingleColor())
-			{
-				OptimalCompress::compressDXT1(rgba.color(0), &block.color);
-			}
-			else
-			{
-				nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-				fit.SetColourSet(&colours, 0);
-				fit.Compress(&block.color);
-			}
-			
-			if (outputOptions.outputHandler != NULL) {
-				outputOptions.outputHandler->writeData(&block, sizeof(block));
-			}
-		}
-	}
-}
-
-void nv::SlowCompressor::compressDXT5(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
-{
-	const uint w = m_image->width();
-	const uint h = m_image->height();
-	
-	ColorBlock rgba;
-	BlockDXT5 block;
-
-	nvsquish::WeightedClusterFit fit;
-	fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
-
-	int flags = 0;
-	if (m_alphaMode == AlphaMode_Transparency)
-	{
-		flags = nvsquish::kWeightColourByAlpha;
-	}
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			
-			rgba.init(m_image, x, y);
-
-			// Compress alpha.
-			if (compressionOptions.quality == Quality_Highest)
-			{
-				OptimalCompress::compressDXT5A(rgba, &block.alpha);
-			}
-			else
-			{
-				QuickCompress::compressDXT5A(rgba, &block.alpha);
-			}
-		
-			// Compress color.
-			if (rgba.isSingleColor())
-			{
-				OptimalCompress::compressDXT1(rgba.color(0), &block.color);
-			}
-			else
-			{
-				nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
-				fit.SetColourSet(&colours, 0);
-				fit.Compress(&block.color);
-			}
-			
-			if (outputOptions.outputHandler != NULL) {
-				outputOptions.outputHandler->writeData(&block, sizeof(block));
-			}
-		}
-	}
-}
-
-
-void nv::SlowCompressor::compressDXT5n(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
-{
-	const uint w = m_image->width();
-	const uint h = m_image->height();
-	
-	ColorBlock rgba;
-	BlockDXT5 block;
-	
-	nvsquish::WeightedClusterFit fit;
-	fit.SetMetric(0, 1, 0);
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			
-			rgba.init(m_image, x, y);
-			
-			rgba.swizzleDXT5n();			
-			
-			// Compress X.
-			if (compressionOptions.quality == Quality_Highest)
-			{
-				OptimalCompress::compressDXT5A(rgba, &block.alpha);
-			}
-			else
-			{
-				QuickCompress::compressDXT5A(rgba, &block.alpha);
-			}
-			
-			// Compress Y.
-			//OptimalCompress::compressDXT1G(rgba, &block.color);
-
-			/*if (rgba.isSingleColor())
-			{
-				OptimalCompress::compressDXT1G(rgba.color(0), &block.color);
-			}
-			else*/
-			{
-				nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0);
-				fit.SetColourSet(&colours, 0);
-				fit.Compress(&block.color);
-			}
-			
-			if (outputOptions.outputHandler != NULL) {
-				outputOptions.outputHandler->writeData(&block, sizeof(block));
-			}
-		}
-	}
-}
-
-
-void nv::SlowCompressor::compressBC4(const CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-	const uint w = m_image->width();
-	const uint h = m_image->height();
-	
-	ColorBlock rgba;
-	AlphaBlockDXT5 block;
-	
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			
-			rgba.init(m_image, x, y);
-
-			if (compressionOptions.quality == Quality_Highest)
-			{
-				OptimalCompress::compressDXT5A(rgba, &block);
-			}
-			else
-			{
-				QuickCompress::compressDXT5A(rgba, &block);
-			}
-
-			if (outputOptions.outputHandler != NULL) {
-				outputOptions.outputHandler->writeData(&block, sizeof(block));
-			}
-		}
-	}
-}
-
-
-void nv::SlowCompressor::compressBC5(const CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
-{
-	const uint w = m_image->width();
-	const uint h = m_image->height();
-
-	ColorBlock xcolor;
-	ColorBlock ycolor;
-
-	BlockATI2 block;
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			
-			xcolor.init(m_image, x, y);
-			xcolor.splatX();
-			
-			ycolor.init(m_image, x, y);
-			ycolor.splatY();
-
-			if (compressionOptions.quality == Quality_Highest)
-			{
-				OptimalCompress::compressDXT5A(xcolor, &block.x);
-				OptimalCompress::compressDXT5A(ycolor, &block.y);
-			}
-			else
-			{
-				QuickCompress::compressDXT5A(xcolor, &block.x);
-				QuickCompress::compressDXT5A(ycolor, &block.y);
-			}
-
-			if (outputOptions.outputHandler != NULL) {
-				outputOptions.outputHandler->writeData(&block, sizeof(block));
-			}
-		}
-	}
-}
-
-
-#if defined(HAVE_S3QUANT)
-
-void nv::s3CompressDXT1(const Image * image, const OutputOptions::Private & outputOptions)
-{
-	const uint w = image->width();
-	const uint h = image->height();
-	
-	float error = 0.0f;
-
-	BlockDXT1 dxtBlock3;
-	BlockDXT1 dxtBlock4;
-	ColorBlock block;
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			block.init(image, x, y);
-
-			// Init rgb block.
-			RGBBlock rgbBlock;
-			rgbBlock.n = 16;
-			for (uint i = 0; i < 16; i++) {
-				rgbBlock.colorChannel[i][0] = clamp(float(block.color(i).r) / 255.0f, 0.0f, 1.0f);
-				rgbBlock.colorChannel[i][1] = clamp(float(block.color(i).g) / 255.0f, 0.0f, 1.0f);
-				rgbBlock.colorChannel[i][2] = clamp(float(block.color(i).b) / 255.0f, 0.0f, 1.0f);
-			}
-			rgbBlock.weight[0] = 1.0f;
-			rgbBlock.weight[1] = 1.0f;
-			rgbBlock.weight[2] = 1.0f;
-
-			rgbBlock.inLevel = 4;
-			CodeRGBBlock(&rgbBlock);
-
-			// Copy results to DXT block.
-			dxtBlock4.col0.r = rgbBlock.endPoint[0][0];
-			dxtBlock4.col0.g = rgbBlock.endPoint[0][1];
-			dxtBlock4.col0.b = rgbBlock.endPoint[0][2];
-
-			dxtBlock4.col1.r = rgbBlock.endPoint[1][0];
-			dxtBlock4.col1.g = rgbBlock.endPoint[1][1];
-			dxtBlock4.col1.b = rgbBlock.endPoint[1][2];
-
-			dxtBlock4.setIndices(rgbBlock.index);
-
-			if (dxtBlock4.col0.u < dxtBlock4.col1.u) {
-				swap(dxtBlock4.col0.u, dxtBlock4.col1.u);
-				dxtBlock4.indices ^= 0x55555555;
-			}
-
-			uint error4 = blockError(block, dxtBlock4);
-
-			rgbBlock.inLevel = 3;
-
-			CodeRGBBlock(&rgbBlock);
-
-			// Copy results to DXT block.
-			dxtBlock3.col0.r = rgbBlock.endPoint[0][0];
-			dxtBlock3.col0.g = rgbBlock.endPoint[0][1];
-			dxtBlock3.col0.b = rgbBlock.endPoint[0][2];
-
-			dxtBlock3.col1.r = rgbBlock.endPoint[1][0];
-			dxtBlock3.col1.g = rgbBlock.endPoint[1][1];
-			dxtBlock3.col1.b = rgbBlock.endPoint[1][2];
-
-			dxtBlock3.setIndices(rgbBlock.index);
-
-			if (dxtBlock3.col0.u > dxtBlock3.col1.u) {
-				swap(dxtBlock3.col0.u, dxtBlock3.col1.u);
-				dxtBlock3.indices ^= (~dxtBlock3.indices  >> 1) & 0x55555555;
-			}
-
-			uint error3 = blockError(block, dxtBlock3);
-
-			if (error3 < error4) {
-				error += error3;
-
-				if (outputOptions.outputHandler != NULL) {
-					outputOptions.outputHandler->writeData(&dxtBlock3, sizeof(dxtBlock3));
-				}
-			}
-			else {
-				error += error4;
-
-				if (outputOptions.outputHandler != NULL) {
-					outputOptions.outputHandler->writeData(&dxtBlock4, sizeof(dxtBlock4));
-				}
-			}
-		}
-	}
-}
-
-#endif // defined(HAVE_S3QUANT)
-
-
-#if defined(HAVE_ATITC)
-
-void nv::atiCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions)
-{
-	// Init source texture
-	ATI_TC_Texture srcTexture;
-	srcTexture.dwSize = sizeof(srcTexture);
-	srcTexture.dwWidth = image->width();
-	srcTexture.dwHeight = image->height();
-	srcTexture.dwPitch = image->width() * 4;
-	srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
-	srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
-	srcTexture.pData = (ATI_TC_BYTE*) image->pixels();
-
-	// Init dest texture
-	ATI_TC_Texture destTexture;
-	destTexture.dwSize = sizeof(destTexture);
-	destTexture.dwWidth = image->width();
-	destTexture.dwHeight = image->height();
-	destTexture.dwPitch = 0;
-	destTexture.format = ATI_TC_FORMAT_DXT1;
-	destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
-	destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);
-
-	// Compress
-	ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL);
-
-	if (outputOptions.outputHandler != NULL) {
-		outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
-	}
-
-	mem::free(destTexture.pData);
-}
-
-void nv::atiCompressDXT5(const Image * image, const OutputOptions::Private & outputOptions)
-{
-	// Init source texture
-	ATI_TC_Texture srcTexture;
-	srcTexture.dwSize = sizeof(srcTexture);
-	srcTexture.dwWidth = image->width();
-	srcTexture.dwHeight = image->height();
-	srcTexture.dwPitch = image->width() * 4;
-	srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
-	srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
-	srcTexture.pData = (ATI_TC_BYTE*) image->pixels();
-
-	// Init dest texture
-	ATI_TC_Texture destTexture;
-	destTexture.dwSize = sizeof(destTexture);
-	destTexture.dwWidth = image->width();
-	destTexture.dwHeight = image->height();
-	destTexture.dwPitch = 0;
-	destTexture.format = ATI_TC_FORMAT_DXT5;
-	destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
-	destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);
-
-	// Compress
-	ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL);
-
-	if (outputOptions.outputHandler != NULL) {
-		outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
-	}
-
-	mem::free(destTexture.pData);
-}
-
-#endif // defined(HAVE_ATITC)
-
-#if defined(HAVE_SQUISH)
-
-void nv::squishCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions)
-{
-	Image img(*image);
-	int count = img.width() * img.height();
-	for (int i = 0; i < count; i++)
-	{
-		Color32 c = img.pixel(i);
-		img.pixel(i) = Color32(c.b, c.g, c.r, c.a);
-	}
-
-	int size = squish::GetStorageRequirements(img.width(), img.height(), squish::kDxt1);
-	void * blocks = mem::malloc(size);
-
-	squish::CompressImage((const squish::u8 *)img.pixels(), img.width(), img.height(), blocks, squish::kDxt1 | squish::kColourClusterFit);
-
-	if (outputOptions.outputHandler != NULL) {
-		outputOptions.outputHandler->writeData(blocks, size);
-	}
-
-	mem::free(blocks);
-}
-
-#endif // defined(HAVE_SQUISH)
-
-
-#if defined(HAVE_D3DX)
-
-void nv::d3dxCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions)
-{
-	IDirect3D9 * d3d = Direct3DCreate9(D3D_SDK_VERSION);
-
-	D3DPRESENT_PARAMETERS presentParams;
-	ZeroMemory(&presentParams, sizeof(presentParams));
-	presentParams.Windowed = TRUE;
-	presentParams.SwapEffect = D3DSWAPEFFECT_COPY;
-	presentParams.BackBufferWidth = 8;
-	presentParams.BackBufferHeight = 8;
-	presentParams.BackBufferFormat = D3DFMT_UNKNOWN;
-
-	HRESULT err;
-
-	IDirect3DDevice9 * device = NULL;
-	err = d3d->CreateDevice(D3DADAPTER_DEFAULT, D3DDEVTYPE_REF, GetDesktopWindow(), D3DCREATE_SOFTWARE_VERTEXPROCESSING, &presentParams, &device);
-
-	IDirect3DTexture9 * texture = NULL;
-	err = D3DXCreateTexture(device, image->width(), image->height(), 1, 0, D3DFMT_DXT1, D3DPOOL_SYSTEMMEM, &texture);
-	
-	IDirect3DSurface9 * surface = NULL;
-	err = texture->GetSurfaceLevel(0, &surface);
-
-	RECT rect;
-	rect.left = 0; 
-	rect.top = 0; 
-	rect.bottom = image->height();
-	rect.right = image->width();
-
-	err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, image->pixels(), D3DFMT_A8R8G8B8, image->width() * sizeof(Color32), NULL, &rect, D3DX_DEFAULT, 0);
-
-	if (err != D3DERR_INVALIDCALL && err != D3DXERR_INVALIDDATA)
-	{
-		D3DLOCKED_RECT rect;
-		ZeroMemory(&rect, sizeof(rect));
-
-		err = surface->LockRect(&rect, NULL, D3DLOCK_READONLY);
-
-		if (outputOptions.outputHandler != NULL) {
-			int size = rect.Pitch * ((image->height() + 3) / 4);
-			outputOptions.outputHandler->writeData(rect.pBits, size);
-		}
-
-		err = surface->UnlockRect();
-	}
-
-	surface->Release();
-	device->Release();
-	d3d->Release();
-}
-
-#endif // defined(HAVE_D3DX)
-
-
-#if defined(HAVE_STB)
-
-void nv::stbCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions)
-{
-	const uint w = image->width();
-	const uint h = image->height();
-	
-	float error = 0.0f;
-
-	BlockDXT1 dxtBlock;
-	ColorBlock block;
-
-	for (uint y = 0; y < h; y += 4) {
-		for (uint x = 0; x < w; x += 4) {
-			block.init(image, x, y);
-			block.swizzleSTB();
-
-			stb_compress_dxt_block((unsigned char *)&dxtBlock, (unsigned char *)block.colors(), 0, 0);
-
-			if (outputOptions.outputHandler != NULL) {
-				outputOptions.outputHandler->writeData(&dxtBlock, sizeof(dxtBlock));
-			}
-		}
-	}
-}
-
-#endif // defined(HAVE_STB)
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include "CompressDXT.h"
+#include "QuickCompressDXT.h"
+#include "OptimalCompressDXT.h"
+#include "CompressionOptions.h"
+#include "OutputOptions.h"
+
+// squish
+#include "squish/colourset.h"
+#include "squish/fastclusterfit.h"
+#include "squish/weightedclusterfit.h"
+
+#include <nvtt/nvtt.h>
+
+#include <nvcore/Memory.h>
+
+#include <nvimage/Image.h>
+#include <nvimage/ColorBlock.h>
+#include <nvimage/BlockDXT.h>
+
+
+// s3_quant
+#if defined(HAVE_S3QUANT)
+#include "s3tc/s3_quant.h"
+#endif
+
+// ati tc
+#if defined(HAVE_ATITC)
+typedef int BOOL;
+typedef _W64 unsigned long ULONG_PTR;
+typedef ULONG_PTR DWORD_PTR;
+#include "atitc/ATI_Compress.h"
+#endif
+
+// squish
+#if defined(HAVE_SQUISH)
+//#include "squish/squish.h"
+#include "squish-1.10/squish.h"
+#endif
+
+// d3dx
+#if defined(HAVE_D3DX)
+#include <d3dx9.h>
+#endif
+
+// stb
+#if defined(HAVE_STB)
+#define STB_DEFINE
+#include "stb/stb_dxt.h"
+#endif
+
+#pragma message(NV_FILE_LINE "FIXME: Define HAVE_OPENMP from cmake.")
+#define HAVE_OPENMP
+#include <omp.h>
+
+using namespace nv;
+using namespace nvtt;
+
+
+void FixedBlockCompressor::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+	const uint bs = blockSize();
+	const uint bw = (w + 3) / 4;
+	const uint bh = (h + 3) / 4;
+	const uint size = bs * bw * bh;
+
+#if defined(HAVE_OPENMP)
+	bool singleThreaded = false;
+#else
+	bool singleThreaded = true;
+#endif
+
+	// Use a single thread to compress small textures.
+	if (bw * bh < 16) singleThreaded = true;
+
+	if (singleThreaded)
+	{
+		nvDebugCheck(bs <= 16);
+		uint8 mem[16];
+
+		for (int y = 0; y < int(h); y += 4) {
+			for (uint x = 0; x < w; x += 4) {
+
+				ColorBlock rgba;
+				if (inputFormat == nvtt::InputFormat_BGRA_8UB) {
+					rgba.init(w, h, (uint *)data, x, y);
+				}
+				else {
+					nvDebugCheck(inputFormat == nvtt::InputFormat_RGBA_32F);
+					rgba.init(w, h, (float *)data, x, y);
+				}
+
+				compressBlock(rgba, alphaMode, compressionOptions, mem);
+
+				if (outputOptions.outputHandler != NULL) {
+					outputOptions.outputHandler->writeData(mem, bs);
+				}
+			}
+		}
+	}
+#if defined(HAVE_OPENMP)
+	else
+	{
+		uint8 * mem = new uint8[size];
+
+	#pragma omp parallel
+		{
+	#pragma omp for
+			for (int i = 0; i < int(bw*bh); i++)
+			{
+				const uint x = i % bw;
+				const uint y = i / bw;
+
+				ColorBlock rgba;
+				if (inputFormat == nvtt::InputFormat_BGRA_8UB) {
+					rgba.init(w, h, (uint *)data, 4*x, 4*y);
+				}
+				else {
+					nvDebugCheck(inputFormat == nvtt::InputFormat_RGBA_32F);
+					rgba.init(w, h, (float *)data, 4*x, 4*y);
+				}
+
+				uint8 * ptr = mem + (y * bw + x) * bs;
+				compressBlock(rgba, alphaMode, compressionOptions, ptr);
+			} // omp for
+		} // omp parallel
+
+		if (outputOptions.outputHandler != NULL) {
+			outputOptions.outputHandler->writeData(mem, size);
+		}
+
+		delete [] mem;
+	}
+#endif
+}
+
+
+void FastCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockDXT1 * block = new(output) BlockDXT1;
+	QuickCompress::compressDXT1(rgba, block);
+}
+
+void FastCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockDXT1 * block = new(output) BlockDXT1;
+	QuickCompress::compressDXT1a(rgba, block);
+}
+
+void FastCompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockDXT3 * block = new(output) BlockDXT3;
+	QuickCompress::compressDXT3(rgba, block);
+}
+
+void FastCompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockDXT5 * block = new(output) BlockDXT5;
+	QuickCompress::compressDXT5(rgba, block);
+}
+
+void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	rgba.swizzle(4, 1, 5, 0); // 0xFF, G, 0, R
+
+	BlockDXT5 * block = new(output) BlockDXT5;
+	QuickCompress::compressDXT5(rgba, block);
+}
+
+void FastCompressorBC4::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockATI1 * block = new(output) BlockATI1;
+	
+	rgba.swizzle(0, 1, 2, 0); // Copy red to alpha
+	QuickCompress::compressDXT5A(rgba, &block->alpha);
+}
+
+void FastCompressorBC5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockATI2 * block = new(output) BlockATI2;
+	
+	rgba.swizzle(0, 1, 2, 0); // Copy red to alpha
+	QuickCompress::compressDXT5A(rgba, &block->x);
+	
+	rgba.swizzle(0, 1, 2, 1); // Copy green to alpha
+	QuickCompress::compressDXT5A(rgba, &block->y);
+}
+
+
+void NormalCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	nvsquish::WeightedClusterFit fit;
+	fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
+
+	if (rgba.isSingleColor())
+	{
+		BlockDXT1 * block = new(output) BlockDXT1;
+		OptimalCompress::compressDXT1(rgba.color(0), block);
+	}
+	else
+	{
+		nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0);
+		fit.SetColourSet(&colours, nvsquish::kDxt1);
+		fit.Compress(output);
+	}
+}
+
+
+void NormalCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	bool anyAlpha = false;
+	bool allAlpha = true;
+		
+	for (uint i = 0; i < 16; i++)
+	{
+		if (rgba.color(i).a < 128) anyAlpha = true;
+		else allAlpha = false;
+	}
+
+	const bool isSingleColor = rgba.isSingleColor();
+		
+	if ((!anyAlpha && isSingleColor || allAlpha))
+	{
+		BlockDXT1 * block = new(output) BlockDXT1;
+		OptimalCompress::compressDXT1a(rgba.color(0), block);
+	}
+	else
+	{
+		nvsquish::WeightedClusterFit fit;
+		fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
+
+		int flags = nvsquish::kDxt1;
+		if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
+
+		nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
+		fit.SetColourSet(&colours, nvsquish::kDxt1);
+
+		fit.Compress(output);
+	}
+}
+
+
+void NormalCompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockDXT3 * block = new(output) BlockDXT3;
+
+	// Compress explicit alpha.
+	OptimalCompress::compressDXT3A(rgba, &block->alpha);
+
+	// Compress color.
+	if (rgba.isSingleColor())
+	{
+		OptimalCompress::compressDXT1(rgba.color(0), &block->color);
+	}
+	else
+	{
+		nvsquish::WeightedClusterFit fit;
+		fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
+
+		int flags = 0;
+		if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
+
+		nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
+		fit.SetColourSet(&colours, 0);
+		fit.Compress(&block->color);
+	}
+}
+
+
+void NormalCompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockDXT5 * block = new(output) BlockDXT5;
+
+	// Compress alpha.
+	if (compressionOptions.quality == Quality_Highest)
+	{
+		OptimalCompress::compressDXT5A(rgba, &block->alpha);
+	}
+	else
+	{
+		QuickCompress::compressDXT5A(rgba, &block->alpha);
+	}
+
+	// Compress color.
+	if (rgba.isSingleColor())
+	{
+		OptimalCompress::compressDXT1(rgba.color(0), &block->color);
+	}
+	else
+	{
+		nvsquish::WeightedClusterFit fit;
+		fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z());
+
+		int flags = 0;
+		if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
+
+		nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
+		fit.SetColourSet(&colours, 0);
+		fit.Compress(&block->color);
+	}
+}
+
+
+void NormalCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	rgba.swizzle(4, 1, 5, 0); // 0xFF, G, 0, R
+
+	BlockDXT5 * block = new(output) BlockDXT5;
+
+	// Compress X.
+	if (compressionOptions.quality == Quality_Highest)
+	{
+		OptimalCompress::compressDXT5A(rgba, &block->alpha);
+	}
+	else
+	{
+		QuickCompress::compressDXT5A(rgba, &block->alpha);
+	}
+
+	// Compress Y.
+	if (compressionOptions.quality == Quality_Highest)
+	{
+		OptimalCompress::compressDXT1G(rgba, &block->color);
+	}
+	else
+	{
+		if (rgba.isSingleColor())
+		{
+			OptimalCompress::compressDXT1G(rgba.color(0), &block->color);
+		}
+		else
+		{
+			nvsquish::WeightedClusterFit fit;
+			fit.SetMetric(0, 1, 0);
+
+			int flags = 0;
+			if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha;
+
+			nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags);
+			fit.SetColourSet(&colours, 0);
+			fit.Compress(&block->color);
+		}
+	}
+}
+
+
+void ProductionCompressorBC4::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockATI1 * block = new(output) BlockATI1;
+
+	rgba.swizzle(0, 1, 2, 0); // Copy red to alpha
+	OptimalCompress::compressDXT5A(rgba, &block->alpha);
+}
+
+void ProductionCompressorBC5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	BlockATI2 * block = new(output) BlockATI2;
+	
+	rgba.swizzle(0, 1, 2, 0); // Copy red to alpha
+	OptimalCompress::compressDXT5A(rgba, &block->x);
+	
+	rgba.swizzle(0, 1, 2, 1); // Copy green to alpha
+	OptimalCompress::compressDXT5A(rgba, &block->y);
+}
+
+
+
+#if defined(HAVE_S3QUANT)
+
+void S3CompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+	float error = 0.0f;
+
+	BlockDXT1 dxtBlock3;
+	BlockDXT1 dxtBlock4;
+	ColorBlock block;
+
+	for (uint y = 0; y < h; y += 4) {
+		for (uint x = 0; x < w; x += 4) {
+			block.init(inputFormat, w, h, data, x, y);
+
+			// Init rgb block.
+			RGBBlock rgbBlock;
+			rgbBlock.n = 16;
+			for (uint i = 0; i < 16; i++) {
+				rgbBlock.colorChannel[i][0] = clamp(float(block.color(i).r) / 255.0f, 0.0f, 1.0f);
+				rgbBlock.colorChannel[i][1] = clamp(float(block.color(i).g) / 255.0f, 0.0f, 1.0f);
+				rgbBlock.colorChannel[i][2] = clamp(float(block.color(i).b) / 255.0f, 0.0f, 1.0f);
+			}
+			rgbBlock.weight[0] = 1.0f;
+			rgbBlock.weight[1] = 1.0f;
+			rgbBlock.weight[2] = 1.0f;
+
+			rgbBlock.inLevel = 4;
+			CodeRGBBlock(&rgbBlock);
+
+			// Copy results to DXT block.
+			dxtBlock4.col0.r = rgbBlock.endPoint[0][0];
+			dxtBlock4.col0.g = rgbBlock.endPoint[0][1];
+			dxtBlock4.col0.b = rgbBlock.endPoint[0][2];
+
+			dxtBlock4.col1.r = rgbBlock.endPoint[1][0];
+			dxtBlock4.col1.g = rgbBlock.endPoint[1][1];
+			dxtBlock4.col1.b = rgbBlock.endPoint[1][2];
+
+			dxtBlock4.setIndices(rgbBlock.index);
+
+			if (dxtBlock4.col0.u < dxtBlock4.col1.u) {
+				swap(dxtBlock4.col0.u, dxtBlock4.col1.u);
+				dxtBlock4.indices ^= 0x55555555;
+			}
+
+			uint error4 = blockError(block, dxtBlock4);
+
+			rgbBlock.inLevel = 3;
+
+			CodeRGBBlock(&rgbBlock);
+
+			// Copy results to DXT block.
+			dxtBlock3.col0.r = rgbBlock.endPoint[0][0];
+			dxtBlock3.col0.g = rgbBlock.endPoint[0][1];
+			dxtBlock3.col0.b = rgbBlock.endPoint[0][2];
+
+			dxtBlock3.col1.r = rgbBlock.endPoint[1][0];
+			dxtBlock3.col1.g = rgbBlock.endPoint[1][1];
+			dxtBlock3.col1.b = rgbBlock.endPoint[1][2];
+
+			dxtBlock3.setIndices(rgbBlock.index);
+
+			if (dxtBlock3.col0.u > dxtBlock3.col1.u) {
+				swap(dxtBlock3.col0.u, dxtBlock3.col1.u);
+				dxtBlock3.indices ^= (~dxtBlock3.indices  >> 1) & 0x55555555;
+			}
+
+			uint error3 = blockError(block, dxtBlock3);
+
+			if (error3 < error4) {
+				error += error3;
+
+				if (outputOptions.outputHandler != NULL) {
+					outputOptions.outputHandler->writeData(&dxtBlock3, sizeof(dxtBlock3));
+				}
+			}
+			else {
+				error += error4;
+
+				if (outputOptions.outputHandler != NULL) {
+					outputOptions.outputHandler->writeData(&dxtBlock4, sizeof(dxtBlock4));
+				}
+			}
+		}
+	}
+}
+
+#endif // defined(HAVE_S3QUANT)
+
+
+#if defined(HAVE_ATITC)
+
+void AtiCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+	// Init source texture
+	ATI_TC_Texture srcTexture;
+	srcTexture.dwSize = sizeof(srcTexture);
+	srcTexture.dwWidth = w;
+	srcTexture.dwHeight = h;
+	if (inputFormat == nvtt::InputFormat_BGRA_8UB)
+	{
+		srcTexture.dwPitch = w * 4;
+		srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
+	}
+	else
+	{
+		srcTexture.dwPitch = w * 16;
+		srcTexture.format = ATI_TC_FORMAT_ARGB_32F;
+	}
+	srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
+	srcTexture.pData = (ATI_TC_BYTE*) data;
+
+	// Init dest texture
+	ATI_TC_Texture destTexture;
+	destTexture.dwSize = sizeof(destTexture);
+	destTexture.dwWidth = w;
+	destTexture.dwHeight = h;
+	destTexture.dwPitch = 0;
+	destTexture.format = ATI_TC_FORMAT_DXT1;
+	destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
+	destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);
+
+	ATI_TC_CompressOptions options;
+	options.dwSize = sizeof(options);
+	options.bUseChannelWeighting = false;
+	options.bUseAdaptiveWeighting = false;
+	options.bDXT1UseAlpha = false;
+	options.nCompressionSpeed = ATI_TC_Speed_Normal;
+	options.bDisableMultiThreading = false;
+	//options.bDisableMultiThreading = true;
+
+	// Compress
+	ATI_TC_ConvertTexture(&srcTexture, &destTexture, &options, NULL, NULL, NULL);
+
+	if (outputOptions.outputHandler != NULL) {
+		outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
+	}
+
+	mem::free(destTexture.pData);
+}
+
+void AtiCompressorDXT5::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+	// Init source texture
+	ATI_TC_Texture srcTexture;
+	srcTexture.dwSize = sizeof(srcTexture);
+	srcTexture.dwWidth = w;
+	srcTexture.dwHeight = h;
+	if (inputFormat == nvtt::InputFormat_BGRA_8UB)
+	{
+		srcTexture.dwPitch = w * 4;
+		srcTexture.format = ATI_TC_FORMAT_ARGB_8888;
+	}
+	else
+	{
+		srcTexture.dwPitch = w * 16;
+		srcTexture.format = ATI_TC_FORMAT_ARGB_32F;
+	}
+	srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture);
+	srcTexture.pData = (ATI_TC_BYTE*) data;
+
+	// Init dest texture
+	ATI_TC_Texture destTexture;
+	destTexture.dwSize = sizeof(destTexture);
+	destTexture.dwWidth = w;
+	destTexture.dwHeight = h;
+	destTexture.dwPitch = 0;
+	destTexture.format = ATI_TC_FORMAT_DXT5;
+	destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture);
+	destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize);
+
+	// Compress
+	ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL);
+
+	if (outputOptions.outputHandler != NULL) {
+		outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize);
+	}
+
+	mem::free(destTexture.pData);
+}
+
+#endif // defined(HAVE_ATITC)
+
+#if defined(HAVE_SQUISH)
+
+void SquishCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+#pragma message(NV_FILE_LINE "TODO: Convert input to fixed point ABGR format instead of ARGB")
+	/*
+	Image img(*image);
+	int count = img.width() * img.height();
+	for (int i = 0; i < count; i++)
+	{
+		Color32 c = img.pixel(i);
+		img.pixel(i) = Color32(c.b, c.g, c.r, c.a);
+	}
+
+	int size = squish::GetStorageRequirements(img.width(), img.height(), squish::kDxt1);
+	void * blocks = mem::malloc(size);
+
+	squish::CompressImage((const squish::u8 *)img.pixels(), img.width(), img.height(), blocks, squish::kDxt1 | squish::kColourClusterFit);
+
+	if (outputOptions.outputHandler != NULL) {
+		outputOptions.outputHandler->writeData(blocks, size);
+	}
+
+	mem::free(blocks);
+	*/
+}
+
+#endif // defined(HAVE_SQUISH)
+
+
+#if defined(HAVE_D3DX)
+
+void D3DXCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+	IDirect3D9 * d3d = Direct3DCreate9(D3D_SDK_VERSION);
+
+	D3DPRESENT_PARAMETERS presentParams;
+	ZeroMemory(&presentParams, sizeof(presentParams));
+	presentParams.Windowed = TRUE;
+	presentParams.SwapEffect = D3DSWAPEFFECT_COPY;
+	presentParams.BackBufferWidth = 8;
+	presentParams.BackBufferHeight = 8;
+	presentParams.BackBufferFormat = D3DFMT_UNKNOWN;
+
+	HRESULT err;
+
+	IDirect3DDevice9 * device = NULL;
+	err = d3d->CreateDevice(D3DADAPTER_DEFAULT, D3DDEVTYPE_REF, GetDesktopWindow(), D3DCREATE_SOFTWARE_VERTEXPROCESSING, &presentParams, &device);
+
+	IDirect3DTexture9 * texture = NULL;
+	err = D3DXCreateTexture(device, w, h, 1, 0, D3DFMT_DXT1, D3DPOOL_SYSTEMMEM, &texture);
+	
+	IDirect3DSurface9 * surface = NULL;
+	err = texture->GetSurfaceLevel(0, &surface);
+
+	RECT rect;
+	rect.left = 0; 
+	rect.top = 0; 
+	rect.bottom = h;
+	rect.right = w;
+
+	if (inputFormat == nvtt::InputFormat_BGRA_8UB)
+	{
+		err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A8R8G8B8, w * 4, NULL, &rect, D3DX_DEFAULT, 0);
+	}
+	else
+	{
+		err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A32B32G32R32F, w * 16, NULL, &rect, D3DX_DEFAULT, 0);
+	}
+
+	if (err != D3DERR_INVALIDCALL && err != D3DXERR_INVALIDDATA)
+	{
+		D3DLOCKED_RECT rect;
+		ZeroMemory(&rect, sizeof(rect));
+
+		err = surface->LockRect(&rect, NULL, D3DLOCK_READONLY);
+
+		if (outputOptions.outputHandler != NULL) {
+			int size = rect.Pitch * ((h + 3) / 4);
+			outputOptions.outputHandler->writeData(rect.pBits, size);
+		}
+
+		err = surface->UnlockRect();
+	}
+
+	surface->Release();
+	device->Release();
+	d3d->Release();
+}
+
+#endif // defined(HAVE_D3DX)
+
+
+#if defined(HAVE_STB)
+
+void StbCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	rgba.swizzle(2, 1, 0, 3); // Swap R and B
+	stb_compress_dxt_block((unsigned char *)output, (unsigned char *)rgba.colors(), 0, 0);
+}
+
+
+#endif // defined(HAVE_STB)
diff --git a/src/nvtt/CompressDXT.h b/src/nvtt/CompressDXT.h
index b914c14..6c946e3 100644
--- a/src/nvtt/CompressDXT.h
+++ b/src/nvtt/CompressDXT.h
@@ -30,68 +30,153 @@
 namespace nv
 {
 	class Image;
-	class FloatImage;
+	struct ColorBlock;
 
-	class FastCompressor
+	struct CompressorInterface
 	{
-	public:
-		FastCompressor();
-		~FastCompressor();
+		virtual ~CompressorInterface() {}
+		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) = 0;
+	};
+
+	struct FixedBlockCompressor : public CompressorInterface
+	{
+		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+		virtual uint blockSize() const = 0;
+	};
+
+
+	// Fast CPU compressors.
+	struct FastCompressorDXT1 : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 8; }
+	};
+
+	struct FastCompressorDXT1a : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 8; }
+	};
+
+	struct FastCompressorDXT3 : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 16; }
+	};
 
-		void setImage(const Image * image, nvtt::AlphaMode alphaMode);
+	struct FastCompressorDXT5 : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 16; }
+	};
 
-		void compressDXT1(const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT1a(const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT3(const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT5(const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT5n(const nvtt::OutputOptions::Private & outputOptions);
+	struct FastCompressorDXT5n : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 16; }
+	};
 
-	private:
-		const Image * m_image;
-		nvtt::AlphaMode m_alphaMode;
+	struct FastCompressorBC4 : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 8; }
 	};
 
-	class SlowCompressor
+	struct FastCompressorBC5 : public FixedBlockCompressor
 	{
-	public:
-		SlowCompressor();
-		~SlowCompressor();
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 16; }
+	};
 
-		void setImage(const Image * image, nvtt::AlphaMode alphaMode);
 
-		void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT1a(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressBC4(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressBC5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+	// Normal CPU compressors.
+	struct NormalCompressorDXT1 : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 8; }
+	};
 
-	private:
-		const Image * m_image;
-		nvtt::AlphaMode m_alphaMode;
+	struct NormalCompressorDXT1a : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 8; }
 	};
 
+	struct NormalCompressorDXT3 : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 16; }
+	};
+
+	struct NormalCompressorDXT5 : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 16; }
+	};
+
+	struct NormalCompressorDXT5n : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 16; }
+	};
+
+
+	// Production CPU compressors.
+	struct ProductionCompressorBC4 : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 8; }
+	};
+
+	struct ProductionCompressorBC5 : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 16; }
+	};
+
+
 	// External compressors.
 #if defined(HAVE_S3QUANT)
-	void s3CompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
+	struct S3CompressorDXT1 : public CompressorInterface
+	{
+		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+	};
 #endif
 	
 #if defined(HAVE_ATITC)
-	void atiCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
-	void atiCompressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
+	struct AtiCompressorDXT1 : public CompressorInterface
+	{
+		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+	};
+
+	struct AtiCompressorDXT5 : public CompressorInterface
+	{
+		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+	};
 #endif
 
 #if defined(HAVE_SQUISH)
-	void squishCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
+	struct SquishCompressorDXT1 : public CompressorInterface
+	{
+		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+	};
 #endif
 
 #if defined(HAVE_D3DX)
-	void d3dxCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
+	struct D3DXCompressorDXT1 : public CompressorInterface
+	{
+		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+	};
 #endif
 
-#if defined(HAVE_D3DX)
-	void stbCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
+#if defined(HAVE_STB)
+	struct StbCompressorDXT1 : public FixedBlockCompressor
+	{
+		virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 8; }
+	};
 #endif
 
 } // nv namespace
diff --git a/src/nvtt/Context.cpp b/src/nvtt/Context.cpp
index 68dfa98..9624de9 100644
--- a/src/nvtt/Context.cpp
+++ b/src/nvtt/Context.cpp
@@ -222,6 +222,7 @@ Compressor::Compressor() : m(*new Compressor::Private())
 
 	if (m.cudaEnabled)
 	{
+#pragma message(NV_FILE_LINE "FIXME: This code is duplicated below.")
 		// Select fastest CUDA device.
 		int device = cuda::getFastestDevice();
 		if (!cuda::setDevice(device))
@@ -231,7 +232,7 @@ Compressor::Compressor() : m(*new Compressor::Private())
 		}
 		else
 		{
-			m.cuda = new CudaCompressor();
+			m.cuda = new CudaContext();
 
 			if (!m.cuda->isValid())
 			{
@@ -268,7 +269,7 @@ void Compressor::enableCudaAcceleration(bool enable)
 		}
 		else
 		{
-			m.cuda = new CudaCompressor();
+			m.cuda = new CudaContext();
 
 			if (!m.cuda->isValid())
 			{
@@ -292,17 +293,18 @@ bool Compressor::process(const InputOptions & inputOptions, const CompressionOpt
 	return m.compress(inputOptions.m, compressionOptions.m, outputOptions.m);
 }
 
-
 /// Estimate the size of compressing the input with the given options.
 int Compressor::estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions) const
 {
 	return m.estimateSize(inputOptions.m, compressionOptions.m);
 }
 
+
 // RAW api.
 bool Compressor::compress2D(InputFormat format, int w, int h, void * data, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const
 {
-	// @@ Make sure type of input format matches compression format.
+#pragma message(NV_FILE_LINE "TODO: Implement raw compress api")
+	return false;
 }
 
 int Compressor::estimateSize(int w, int h, int d, const CompressionOptions & compressionOptions) const
@@ -324,16 +326,21 @@ TexImage Compressor::createTexImage() const
 	return *new TexImage();
 }
 
+
 bool Compressor::outputHeader(const TexImage & tex, int mipmapCount, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const
 {
-	m.outputHeader(tex, mipmapCount, compressionOptions.m, outputOptions.m);
+	return m.outputHeader(tex, mipmapCount, compressionOptions.m, outputOptions.m);
 }
 
 bool Compressor::compress(const TexImage & tex, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const
 {
+#pragma message(NV_FILE_LINE "TODO: Implement TexImage compress api")
+
 	// @@ Convert to fixed point and call compress2D for each face.
+	return false;
 }
 
+/// Estimate the size of compressing the given texture.
 int Compressor::estimateSize(const TexImage & tex, const CompressionOptions & compressionOptions) const
 {
 	const uint w = tex.width();
@@ -345,6 +352,8 @@ int Compressor::estimateSize(const TexImage & tex, const CompressionOptions & co
 }
 
 
+
+
 bool Compressor::Private::compress(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const
 {
 	// Make sure enums match.
@@ -358,9 +367,7 @@ bool Compressor::Private::compress(const InputOptions::Private & inputOptions, c
 		if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_FileOpen);
 		return false;
 	}
-
-#pragma message(NV_FILE_LINE "TODO: If DefaultOutputHandler, then seek begining of the file.")
-
+	
 	inputOptions.computeTargetExtents();
 	
 	// Output DDS header.
@@ -625,7 +632,10 @@ bool Compressor::Private::outputHeader(const TexImage & tex, int mipmapCount, co
 {
 	if (tex.width() <= 0 || tex.height() <= 0 || tex.depth() <= 0 || mipmapCount <= 0)
 	{
-#pragma message(NV_FILE_LINE "TODO: Set invalid argument error.")
+		if (outputOptions.errorHandler != NULL)
+		{
+			outputOptions.errorHandler->error(Error_InvalidInput);
+		}
 		return false;
 	}
 
@@ -1252,216 +1262,222 @@ void Compressor::Private::quantizeMipmap(Mipmap & mipmap, const CompressionOptio
 }
 
 
-// Compress the given mipmap.
-bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const
+CompressorInterface * Compressor::Private::chooseCpuCompressor(const CompressionOptions::Private & compressionOptions) const
 {
-	if (compressionOptions.format == Format_RGBA)
+	if (compressionOptions.format == Format_DXT1)
 	{
-		// Pixel format conversion.
-		if (compressionOptions.pixelType == PixelType_Float)
-		{
-			compressRGB(mipmap.asFloatImage(), outputOptions, compressionOptions);
-		}
-		else
-		{
-			compressRGB(mipmap.asFixedImage(), outputOptions, compressionOptions);
-		}
-	}
-	else
-	{
-		const Image * image = mipmap.asFixedImage();
-		nvDebugCheck(image != NULL);
-
-		// @@ Use FastCompressor::isSupported(compressionOptions.format) to chose compressor.
-
-		FastCompressor fast;
-		fast.setImage(image, inputOptions.alphaMode);
-
-		SlowCompressor slow;
-		slow.setImage(image, inputOptions.alphaMode);
-
-		const bool useCuda = cudaEnabled && image->width() * image->height() >= 512;
-
-		if (compressionOptions.format == Format_DXT1)
-		{
 #if defined(HAVE_S3QUANT)
-			if (compressionOptions.externalCompressor == "s3")
-			{
-				s3CompressDXT1(image, outputOptions);
-			}
-			else
+		if (compressionOptions.externalCompressor == "s3") return new S3CompressorDXT1;
+		else
 #endif
 
 #if defined(HAVE_ATITC)
-			if (compressionOptions.externalCompressor == "ati")
-			{
-				atiCompressDXT1(image, outputOptions);
-			}
-			else
+		if (compressionOptions.externalCompressor == "ati") return new AtiCompressorDXT1;
+		else
 #endif
 
 #if defined(HAVE_SQUISH)
-			if (compressionOptions.externalCompressor == "squish")
-			{
-				squishCompressDXT1(image, outputOptions);
-			}
-			else
+		if (compressionOptions.externalCompressor == "squish") return new SquishCompressorDXT1;
+		else
 #endif
 
 #if defined(HAVE_D3DX)
-			if (compressionOptions.externalCompressor == "d3dx")
-			{
-				d3dxCompressDXT1(image, outputOptions);
-			}
-			else
+		if (compressionOptions.externalCompressor == "d3dx") return new D3DXCompressorDXT1;
+		else
 #endif
 
 #if defined(HAVE_D3DX)
-			if (compressionOptions.externalCompressor == "stb")
-			{
-				stbCompressDXT1(image, outputOptions);
-			}
-			else
+		if (compressionOptions.externalCompressor == "stb") return new StbCompressorDXT1;
+		else
 #endif
 
-			if (compressionOptions.quality == Quality_Fastest)
-			{
-				fast.compressDXT1(outputOptions);
-			}
-			else
-			{
-				if (useCuda)
-				{
-					nvDebugCheck(cudaSupported);
-					cuda->setImage(image, inputOptions.alphaMode);
-					//cuda->compressDXT1(compressionOptions, outputOptions);
-					cuda->compressDXT1(compressionOptions, outputOptions);
-				}
-				else
-				{
-					slow.compressDXT1(compressionOptions, outputOptions);
-				}
-			}
-		}
-		else if (compressionOptions.format == Format_DXT1a)
+		if (compressionOptions.quality == Quality_Fastest)
 		{
-			if (compressionOptions.quality == Quality_Fastest)
-			{
-				fast.compressDXT1a(outputOptions);
-			}
-			else
-			{
-				if (useCuda)
-				{
-					nvDebugCheck(cudaSupported);
-					/*cuda*/slow.compressDXT1a(compressionOptions, outputOptions);
-				}
-				else
-				{
-					slow.compressDXT1a(compressionOptions, outputOptions);
-				}
-			}
+			return new FastCompressorDXT1;
 		}
-		else if (compressionOptions.format == Format_DXT1n)
+
+		return new NormalCompressorDXT1;
+	}
+	else if (compressionOptions.format == Format_DXT1a)
+	{
+		if (compressionOptions.quality == Quality_Fastest)
 		{
-			if (useCuda)
-			{
-				nvDebugCheck(cudaSupported);
-				cuda->setImage(image, inputOptions.alphaMode);	
-				cuda->compressDXT1n(compressionOptions, outputOptions);
-			}
-			else
-			{
-				if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature);
-			}
+			return new FastCompressorDXT1a;
 		}
-		else if (compressionOptions.format == Format_DXT3)
+
+		return new NormalCompressorDXT1a;
+	}
+	else if (compressionOptions.format == Format_DXT1n)
+	{
+		// Not supported.
+	}
+	else if (compressionOptions.format == Format_DXT3)
+	{
+		if (compressionOptions.quality == Quality_Fastest)
 		{
-			if (compressionOptions.quality == Quality_Fastest)
-			{
-				fast.compressDXT3(outputOptions);
-			}
-			else
-			{
-				if (useCuda)
-				{
-					nvDebugCheck(cudaSupported);
-					cuda->setImage(image, inputOptions.alphaMode);
-					cuda->compressDXT3(compressionOptions, outputOptions);
-				}
-				else
-				{
-					slow.compressDXT3(compressionOptions, outputOptions);
-				}
-			}
+			return new FastCompressorDXT3;
 		}
-		else if (compressionOptions.format == Format_DXT5)
-		{
+
+		return new NormalCompressorDXT3;
+	}
+	else if (compressionOptions.format == Format_DXT5)
+	{
 #if defined(HAVE_ATITC)
-			if (compressionOptions.externalCompressor == "ati")
-			{
-				atiCompressDXT5(image, outputOptions);
-			}
-			else
+		if (compressionOptions.externalCompressor == "ati") return new AtiCompressorDXT5;
+		else
 #endif
-			if (compressionOptions.quality == Quality_Fastest)
-			{
-				fast.compressDXT5(outputOptions);
-			}
-			else
-			{
-				if (useCuda)
-				{
-					nvDebugCheck(cudaSupported);
-					cuda->setImage(image, inputOptions.alphaMode);
-					cuda->compressDXT5(compressionOptions, outputOptions);
-				}
-				else
-				{
-					slow.compressDXT5(compressionOptions, outputOptions);
-				}
-			}
+
+		if (compressionOptions.quality == Quality_Fastest)
+		{
+			return new FastCompressorDXT5;
 		}
-		else if (compressionOptions.format == Format_DXT5n)
+
+		return new NormalCompressorDXT5;
+	}
+	else if (compressionOptions.format == Format_DXT5n)
+	{
+		if (compressionOptions.quality == Quality_Fastest)
 		{
-			if (compressionOptions.quality == Quality_Fastest)
-			{
-				fast.compressDXT5n(outputOptions);
-			}
-			else
-			{
-				/*if (useCuda)
-				{
-					nvDebugCheck(cudaSupported);
-					cuda->setImage(image, inputOptions.alphaMode);
-					cuda->compressDXT5n(compressionOptions, outputOptions);
-				}
-				else*/
-				{
-					slow.compressDXT5n(compressionOptions, outputOptions);
-				}
-			}
+			return new FastCompressorDXT5n;
 		}
-		else if (compressionOptions.format == Format_BC4)
+
+		return new NormalCompressorDXT5n;
+	}
+	else if (compressionOptions.format == Format_BC4)
+	{
+		if (compressionOptions.quality == Quality_Fastest || compressionOptions.quality == Quality_Normal)
 		{
-			slow.compressBC4(compressionOptions, outputOptions);
+			return new FastCompressorBC4;
 		}
-		else if (compressionOptions.format == Format_BC5)
+
+		return new ProductionCompressorBC4;
+	}
+	else if (compressionOptions.format == Format_BC5)
+	{
+		if (compressionOptions.quality == Quality_Fastest || compressionOptions.quality == Quality_Normal)
 		{
-			slow.compressBC5(compressionOptions, outputOptions);
+			return new FastCompressorBC5;
 		}
-		else if (compressionOptions.format == Format_CTX1)
+
+		return new ProductionCompressorBC5;
+	}
+	else if (compressionOptions.format == Format_CTX1)
+	{
+		// Not supported.
+	}
+	else if (compressionOptions.format == Format_BC6)
+	{
+		// Not supported.
+	}
+	else if (compressionOptions.format == Format_BC7)
+	{
+		// Not supported.
+	}
+
+	return NULL;
+}
+
+
+CompressorInterface * Compressor::Private::chooseGpuCompressor(const CompressionOptions::Private & compressionOptions) const
+{
+	nvDebugCheck(cudaSupported);
+
+	if (compressionOptions.quality == Quality_Fastest)
+	{
+		// Do not use CUDA compressors in fastest quality mode.
+		return NULL;
+	}
+
+	if (compressionOptions.format == Format_DXT1)
+	{
+		return new CudaCompressorDXT1(*cuda);
+	}
+	else if (compressionOptions.format == Format_DXT1a)
+	{
+#pragma message(NV_FILE_LINE "TODO: Implement CUDA DXT1a compressor.")
+	}
+	else if (compressionOptions.format == Format_DXT1n)
+	{
+		// Not supported.
+	}
+	else if (compressionOptions.format == Format_DXT3)
+	{
+		return new CudaCompressorDXT3(*cuda);
+	}
+	else if (compressionOptions.format == Format_DXT5)
+	{
+		return new CudaCompressorDXT5(*cuda);
+	}
+	else if (compressionOptions.format == Format_DXT5n)
+	{
+		// @@ Return CUDA compressor.
+	}
+	else if (compressionOptions.format == Format_BC4)
+	{
+		// Not supported.
+	}
+	else if (compressionOptions.format == Format_BC5)
+	{
+		// Not supported.
+	}
+	else if (compressionOptions.format == Format_CTX1)
+	{
+		// @@ Return CUDA compressor.
+	}
+	else if (compressionOptions.format == Format_BC6)
+	{
+		// Not supported.
+	}
+	else if (compressionOptions.format == Format_BC7)
+	{
+		// Not supported.
+	}
+
+	return NULL;
+}
+
+
+
+// Compress the given mipmap.
+bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const
+{
+	if (compressionOptions.format == Format_RGBA)
+	{
+		// Pixel format conversion.
+		if (compressionOptions.pixelType == PixelType_Float)
 		{
-			if (useCuda)
-			{
-				nvDebugCheck(cudaSupported);
-				cuda->setImage(image, inputOptions.alphaMode);
-				cuda->compressCTX1(compressionOptions, outputOptions);
-			}
-			else
-			{
-				if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature);
-			}
+			compressRGB(mipmap.asFloatImage(), outputOptions, compressionOptions);
+		}
+		else
+		{
+			compressRGB(mipmap.asFixedImage(), outputOptions, compressionOptions);
+		}
+	}
+	else
+	{
+		const Image * image = mipmap.asFixedImage();
+		nvDebugCheck(image != NULL);
+
+		// Decide what compressor to use.
+		CompressorInterface * compressor = NULL;
+		if (cudaEnabled && image->width() * image->height() >= 512)
+		{
+			compressor = chooseGpuCompressor(compressionOptions);
+		}
+		if (compressor == NULL)
+		{
+			compressor = chooseCpuCompressor(compressionOptions);
+		}
+
+		if (compressor == NULL)
+		{
+			if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature);
+		}
+		else
+		{
+			compressor->compress(InputFormat_BGRA_8UB, inputOptions.alphaMode, image->width(), image->height(), (void *)image->pixels(), compressionOptions, outputOptions);
+
+			delete compressor;
 		}
 	}
 
diff --git a/src/nvtt/Context.h b/src/nvtt/Context.h
index 727bd1c..1c81db7 100644
--- a/src/nvtt/Context.h
+++ b/src/nvtt/Context.h
@@ -27,6 +27,7 @@
 #include <nvcore/Ptr.h>
 
 #include <nvtt/cuda/CudaCompressDXT.h>
+#include <nvtt/CompressDXT.h>
 
 #include "nvtt.h"
 
@@ -44,6 +45,9 @@ namespace nvtt
 		Private() {}
 
 		bool compress(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
+
+		bool compress(const void * data, int width, int height, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const;
+
 		int estimateSize(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions) const;
 
 		bool outputHeader(const TexImage & tex, int mipmapCount, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions);
@@ -51,6 +55,10 @@ namespace nvtt
 	private:
 
 		bool outputHeader(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
+
+		nv::CompressorInterface * chooseCpuCompressor(const CompressionOptions::Private & compressionOptions) const;
+		nv::CompressorInterface * chooseGpuCompressor(const CompressionOptions::Private & compressionOptions) const;
+
 		bool compressMipmaps(uint f, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
 
 		bool initMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f, uint m) const;
@@ -71,7 +79,7 @@ namespace nvtt
 		bool cudaSupported;
 		bool cudaEnabled;
 
-		nv::AutoPtr<nv::CudaCompressor> cuda;
+		nv::AutoPtr<nv::CudaContext> cuda;
 
 	};
 
diff --git a/src/nvtt/cuda/CompressKernel.cu b/src/nvtt/cuda/CompressKernel.cu
index e829a7c..cbdf492 100644
--- a/src/nvtt/cuda/CompressKernel.cu
+++ b/src/nvtt/cuda/CompressKernel.cu
@@ -296,6 +296,51 @@ __device__ float3 blockError3(const float3 * colors, uint permutation, float3 a,
 // Sort colors
 ////////////////////////////////////////////////////////////////////////////////
 
+// @@ Experimental code to avoid duplicate colors for faster compression.
+// We could first sort along the best fit line and only compare colors that have the same projection.
+// The hardest part is to maintain the indices to map packed/sorted colors to the input colors.
+// We also need to update several functions that assume the number of colors is fixed to 16.
+// And compute different bit maps for the different color counts.
+// This is a fairly high amount of work.
+__device__ int packColors(float3 * values, float * weights, int * ranks)
+{
+    const int tid = threadIdx.x;
+
+	__shared__ int count;
+	count = 0;
+
+	bool alive = true;
+
+	// Append this 
+	for (int i = 0; i < 16; i++)
+	{
+		// One thread leads on each iteration.
+		if (tid == i) {
+
+			// If thread alive, then append element.
+			if (alive) {
+				values[count] = values[i];
+				weights[count] = weights[i];
+				count++;
+			}
+
+			// Otherwise update weight.
+			else {
+				weights[ranks[i]] += weights[i];
+			}
+		}
+
+		// Kill all threads that have the same element and record rank.
+		if (values[i] == values[tid]) {
+			alive = false;
+			ranks[tid] = count - 1;
+		}
+	}
+
+	return count;
+}
+
+
 __device__ void sortColors(const float * values, int * ranks)
 {
 #if __DEVICE_EMULATION__
@@ -343,12 +388,60 @@ __device__ void sortColors(const float * values, int * ranks)
 #endif
 }
 
+__device__ void sortColors(const float * values, int * ranks, int count)
+{
+#if __DEVICE_EMULATION__
+    if (threadIdx.x == 0)
+    {
+        for (int tid = 0; tid < count; tid++)
+        {
+            int rank = 0;
+            for (int i = 0; i < count; i++)
+            {
+                rank += (values[i] < values[tid]);
+            }
+            
+            ranks[tid] = rank;
+        }
+
+        // Resolve elements with the same index.
+        for (int i = 0; i < count-1; i++)
+        {
+            for (int tid = 0; tid < count; tid++)
+            {
+                if (tid > i && ranks[tid] == ranks[i]) ++ranks[tid];
+            }
+        }
+    }
+#else
+    const int tid = threadIdx.x;
+
+    int rank = 0;
+
+    #pragma unroll
+    for (int i = 0; i < count; i++)
+    {
+        rank += (values[i] < values[tid]);
+    }
+    
+    ranks[tid] = rank;
+
+    // Resolve elements with the same index.
+    #pragma unroll
+    for (int i = 0; i < count-1; i++)
+    {
+        if ((tid > i) & (ranks[tid] == ranks[i])) ++ranks[tid];
+    }
+#endif
+}
+
+
 
 ////////////////////////////////////////////////////////////////////////////////
 // Load color block to shared mem
 ////////////////////////////////////////////////////////////////////////////////
 
-__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
+/*__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
 {
 	const int bid = blockIdx.x;
 	const int idx = threadIdx.x;
@@ -389,9 +482,9 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 		__debugsync();
 	}
 #endif
-}
+}*/
 
-__device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
+__device__ void loadColorBlockTex(uint firstBlock, uint width, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
 {
 	const int bid = blockIdx.x;
 	const int idx = threadIdx.x;
@@ -400,8 +493,8 @@ __device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sum
 
 	if (idx < 16)
 	{
-		float x = 4 * ((bn + bid) % w) + idx % 4; // @@ Avoid mod and div by using 2D grid?
-		float y = 4 * ((bn + bid) / w) + idx / 4;
+		float x = 4 * ((firstBlock + bid) % width) + idx % 4; // @@ Avoid mod and div by using 2D grid?
+		float y = 4 * ((firstBlock + bid) / width) + idx / 4;
 
 		// Read color and copy to shared mem.
 		float4 c = tex2D(tex, x, y);
@@ -437,10 +530,107 @@ __device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sum
 		__debugsync();
 	}
 #endif
+}
 
+/*
+__device__ void loadColorBlockTex(uint firstBlock, uint w, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
+{
+	const int bid = blockIdx.x;
+	const int idx = threadIdx.x;
+
+	__shared__ float dps[16];
+
+	if (idx < 16)
+	{
+		float x = 4 * ((firstBlock + bid) % w) + idx % 4; // @@ Avoid mod and div by using 2D grid?
+		float y = 4 * ((firstBlock + bid) / w) + idx / 4;
+
+		// Read color and copy to shared mem.
+		float4 c = tex2D(tex, x, y);
+
+		colors[idx].x = c.z;
+		colors[idx].y = c.y;
+		colors[idx].z = c.x;
+		weights[idx] = 1;
+
+		int count = packColors(colors, weights);
+		if (idx < count)
+		{
+			// Sort colors along the best fit line.
+			colorSums(colors, sums);
+			float3 axis = bestFitLine(colors, sums[0], kColorMetric);
+			
+			*sameColor = (axis == make_float3(0, 0, 0));
+			
+			dps[idx] = dot(colors[idx], axis);
+			
+			sortColors(dps, xrefs);
+			
+			float3 tmp = colors[idx];
+			colors[xrefs[idx]] = tmp;
+		}
+	}
 }
+*/
+
+__device__ void loadColorBlockTex(uint firstBlock, uint width, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
+{
+	const int bid = blockIdx.x;
+	const int idx = threadIdx.x;
 
+	__shared__ float3 rawColors[16];
+	__shared__ float dps[16];
 
+	if (idx < 16)
+	{
+		float x = 4 * ((firstBlock + bid) % width) + idx % 4; // @@ Avoid mod and div by using 2D grid?
+		float y = 4 * ((firstBlock + bid) / width) + idx / 4;
+
+		// Read color and copy to shared mem.
+		float4 c = tex2D(tex, x, y);
+
+		rawColors[idx].x = c.z;
+		rawColors[idx].y = c.y;
+		rawColors[idx].z = c.x;
+		weights[idx] = c.w;
+
+		colors[idx] = rawColors[idx] * weights[idx];
+
+		// No need to synchronize, 16 < warp size.
+		__debugsync();
+		
+		// Sort colors along the best fit line.
+		colorSums(colors, sums);
+		float3 axis = bestFitLine(colors, sums[0], kColorMetric);
+		
+		*sameColor = (axis == make_float3(0, 0, 0));
+		
+		// Single color compressor needs unweighted colors.
+		if (*sameColor) colors[idx] = rawColors[idx];
+
+		dps[idx] = dot(colors[idx], axis);
+		
+		__debugsync();
+		
+		sortColors(dps, xrefs);
+		
+		float3 tmp = colors[idx];
+		float w = weights[idx];
+		__debugsync();
+		colors[xrefs[idx]] = tmp;
+		weights[xrefs[idx]] = w;
+	}
+#if __DEVICE_EMULATION__
+	else
+	{
+		__debugsync();
+		__debugsync();
+		__debugsync();
+	}
+#endif
+}
+
+/*
 __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
 {
 	const int bid = blockIdx.x;
@@ -494,6 +684,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 	}
 #endif
 }
+*/
 
 __device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16], int * sameColor)
 {
@@ -1457,48 +1648,15 @@ __device__ void saveSingleColorBlockCTX1(float2 color, uint2 * result)
 ////////////////////////////////////////////////////////////////////////////////
 // Compress color block
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void compressDXT1(const uint * permutations, const uint * image, uint2 * result)
-{
-	__shared__ float3 colors[16];
-	__shared__ float3 sums[16];
-	__shared__ int xrefs[16];
-	__shared__ int sameColor;
-	
-	loadColorBlock(image, colors, sums, xrefs, &sameColor);
 
-	__syncthreads();
-
-	if (sameColor)
-	{
-		if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
-		return;
-	}
-
-	ushort bestStart, bestEnd;
-	uint bestPermutation;
-
-	__shared__ float errors[NUM_THREADS];
-
-	evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
-	
-	// Use a parallel reduction to find minimum error.
-	const int minIdx = findMinError(errors);
-	
-	// Only write the result of the winner thread.
-	if (threadIdx.x == minIdx)
-	{
-		saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result);
-	}
-}
-
-__global__ void compressDXT1_Tex(uint bn, uint w, const uint * permutations, uint2 * result)
+__global__ void compressDXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result)
 {
 	__shared__ float3 colors[16];
 	__shared__ float3 sums[16];
 	__shared__ int xrefs[16];
 	__shared__ int sameColor;
 	
-	loadColorBlockTex(bn, w, colors, sums, xrefs, &sameColor);
+	loadColorBlockTex(firstBlock, w, colors, sums, xrefs, &sameColor);
 
 	__syncthreads();
 
@@ -1534,14 +1692,14 @@ __global__ void compressDXT1_Tex(uint bn, uint w, const uint * permutations, uin
 }
 
 
-__global__ void compressLevel4DXT1(const uint * permutations, const uint * image, uint2 * result)
+__global__ void compressLevel4DXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result)
 {
 	__shared__ float3 colors[16];
 	__shared__ float3 sums[16];
 	__shared__ int xrefs[16];
 	__shared__ int sameColor;
 	
-	loadColorBlock(image, colors, sums, xrefs, &sameColor);
+	loadColorBlockTex(firstBlock, w, colors, sums, xrefs, &sameColor);
 
 	__syncthreads();
 
@@ -1568,7 +1726,7 @@ __global__ void compressLevel4DXT1(const uint * permutations, const uint * image
 	}
 }
 
-__global__ void compressWeightedDXT1(const uint * permutations, const uint * image, uint2 * result)
+__global__ void compressWeightedDXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result)
 {
 	__shared__ float3 colors[16];
 	__shared__ float3 sums[16];
@@ -1576,7 +1734,7 @@ __global__ void compressWeightedDXT1(const uint * permutations, const uint * ima
 	__shared__ int xrefs[16];
 	__shared__ int sameColor;
 	
-	loadColorBlock(image, colors, sums, weights, xrefs, &sameColor);
+	loadColorBlockTex(firstBlock, w, colors, sums, weights, xrefs, &sameColor);
 	
 	__syncthreads();
 
@@ -1987,40 +2145,70 @@ extern "C" void setupCompressKernel(const float weights[3])
 	cudaMemcpyToSymbol(kColorMetricSqr, weightsSqr, sizeof(float) * 3, 0);
 }
 
+extern "C" void bindTextureToArray(cudaArray * d_data)
+{
+	// Setup texture
+	tex.normalized = false;
+	tex.filterMode = cudaFilterModePoint;
+	tex.addressMode[0] = cudaAddressModeClamp;
+	tex.addressMode[1] = cudaAddressModeClamp;
+    
+	cudaBindTextureToArray(tex, d_data);
+}
+
+
 
 ////////////////////////////////////////////////////////////////////////////////
 // Launch kernel
 ////////////////////////////////////////////////////////////////////////////////
 
-extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
+// DXT1 compressors:
+extern "C" void compressKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
 {
-	compressDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
+	compressDXT1<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
 }
 
-extern "C" void compressKernelDXT1_Tex(uint bn, uint blockNum, uint w, cudaArray * d_data, uint * d_result, uint * d_bitmaps)
+extern "C" void compressKernelDXT1_Level4(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
 {
-	// Setup texture
-	tex.normalized = false;
-	tex.filterMode = cudaFilterModePoint;
-	tex.addressMode[0] = cudaAddressModeClamp;
-	tex.addressMode[1] = cudaAddressModeClamp;
-    
-	cudaBindTextureToArray(tex, d_data);
+	compressLevel4DXT1<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+}
 
-	compressDXT1_Tex<<<blockNum, NUM_THREADS>>>(bn, w, d_bitmaps, (uint2 *)d_result);
+extern "C" void compressWeightedKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+{
+	compressWeightedDXT1<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
 }
 
+// @@ DXT1a compressors.
 
-extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
+
+// @@ DXT3 compressors:
+extern "C" void compressKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
 {
-	compressLevel4DXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
+	//compressDXT3<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
 }
 
-extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
+extern "C" void compressWeightedKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+{
+	//compressWeightedDXT3<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+}
+
+
+// @@ DXT5 compressors.
+extern "C" void compressKernelDXT5(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
 {
-	compressWeightedDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
+	//compressDXT5<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
 }
 
+extern "C" void compressWeightedKernelDXT5(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+{
+	//compressWeightedDXT5<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+}
+
+
+
+
+
+/*
 extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
 {
 	compressNormalDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
@@ -2030,16 +2218,10 @@ extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result
 {
 	compressCTX1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
 }
-
+*/
+/*
 extern "C" void compressKernelDXT5n(uint blockNum, cudaArray * d_data, uint * d_result)
 {
-	// Setup texture
-	tex.normalized = false;
-	tex.filterMode = cudaFilterModePoint;
-	tex.addressMode[0] = cudaAddressModeClamp;
-	tex.addressMode[1] = cudaAddressModeClamp;
-    
-	cudaBindTextureToArray(tex, d_data);
-
 //	compressDXT5n<<<blockNum/128, 128>>>(blockNum, (uint2 *)d_result);
 }
+*/
\ No newline at end of file
diff --git a/src/nvtt/cuda/CudaCompressDXT.cpp b/src/nvtt/cuda/CudaCompressDXT.cpp
index 9bfa7a0..8fe0f48 100644
--- a/src/nvtt/cuda/CudaCompressDXT.cpp
+++ b/src/nvtt/cuda/CudaCompressDXT.cpp
@@ -52,16 +52,20 @@ using namespace nvtt;
 
 
 extern "C" void setupCompressKernel(const float weights[3]);
-extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
-extern "C" void compressKernelDXT1_Tex(uint bn, uint blockNum, uint w, cudaArray * d_data, uint * d_result, uint * d_bitmaps);
+extern "C" void bindTextureToArray(cudaArray * d_data);
+
+extern "C" void compressKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps);
 extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
 extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
-extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
-extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
+extern "C" void compressKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps);
+//extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
+//extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
 
 
-#include "Bitmaps.h"	// @@ Rename to BitmapTable.h
+#pragma message(NV_FILE_LINE "TODO: Rename Bitmaps.h to BitmapTable.h")
+#include "Bitmaps.h"
 
+/*
 // Convert linear image to block linear.
 static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
 {
@@ -81,45 +85,49 @@ static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
 		}
 	}
 }
+*/
 
 #endif
 
 
-CudaCompressor::CudaCompressor() : m_bitmapTable(NULL), m_bitmapTableCTX(NULL), m_data(NULL), m_result(NULL)
-{
+CudaContext::CudaContext() : 
+	bitmapTable(NULL), 
+	bitmapTableCTX(NULL), 
+	data(NULL), 
+	result(NULL)
+{
 #if defined HAVE_CUDA
     // Allocate and upload bitmaps.
-    cudaMalloc((void**) &m_bitmapTable, 992 * sizeof(uint));
-	if (m_bitmapTable != NULL)
+    cudaMalloc((void**) &bitmapTable, 992 * sizeof(uint));
+	if (bitmapTable != NULL)
 	{
-		cudaMemcpy(m_bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice);
+		cudaMemcpy(bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice);
 	}
 
-    cudaMalloc((void**) &m_bitmapTableCTX, 704 * sizeof(uint));
-
-	if (m_bitmapTableCTX != NULL)
+    cudaMalloc((void**) &bitmapTableCTX, 704 * sizeof(uint));
+	if (bitmapTableCTX != NULL)
 	{
-		cudaMemcpy(m_bitmapTableCTX, s_bitmapTableCTX, 704 * sizeof(uint), cudaMemcpyHostToDevice);
+		cudaMemcpy(bitmapTableCTX, s_bitmapTableCTX, 704 * sizeof(uint), cudaMemcpyHostToDevice);
 	}
 
 	// Allocate scratch buffers.
-    cudaMalloc((void**) &m_data, MAX_BLOCKS * 64U);
-    cudaMalloc((void**) &m_result, MAX_BLOCKS * 8U);
+    cudaMalloc((void**) &data, MAX_BLOCKS * 64U);
+    cudaMalloc((void**) &result, MAX_BLOCKS * 8U);
 #endif
-}
-
-CudaCompressor::~CudaCompressor()
-{
+}
+
+CudaContext::~CudaContext()
+{
 #if defined HAVE_CUDA
 	// Free device mem allocations.
-	cudaFree(m_data);
-	cudaFree(m_result);
-	cudaFree(m_bitmapTable);
-	cudaFree(m_bitmapTableCTX);
+	cudaFree(bitmapTableCTX);
+	cudaFree(bitmapTable);
+	cudaFree(data);
+	cudaFree(result);
 #endif
-}
+}
 
-bool CudaCompressor::isValid() const
+bool CudaContext::isValid() const
 {
 #if defined HAVE_CUDA
 	cudaError_t err = cudaGetLastError();
@@ -129,91 +137,88 @@ bool CudaCompressor::isValid() const
 		return false;
 	}
 #endif
-	return m_data != NULL && m_result != NULL && m_bitmapTable != NULL;
+	return bitmapTable != NULL && bitmapTableCTX != NULL && data != NULL && result != NULL;
 }
 
-// @@ This code is very repetitive and needs to be cleaned up.
 
-#if 0
 
-struct CudaCompressionKernel
+CudaCompressor::CudaCompressor(CudaContext & ctx) : m_ctx(ctx)
 {
-	virtual void setup(const CompressionOptions::Private & compressionOptions)
-	{
-		setupCompressKernel(compressionOptions.colorWeight.ptr());
-	}
-	
-	virtual void setBitmapTable();
-	
-	virtual void runDeviceCode(int count);
-	
-	virtual void runHostCode(int count);
-	
-};
 
-void CudaCompressor::compressKernel(CudaCompressionKernel * kernel)
+}
+
+void CudaCompressor::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
 {
 	nvDebugCheck(cuda::isHardwarePresent());
+
 #if defined HAVE_CUDA
 
+	// Allocate image as a cuda array.
+	cudaArray * d_image;
+	if (inputFormat == nvtt::InputFormat_BGRA_8UB)
+	{
+        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
+	    cudaMallocArray(&d_image, &channelDesc, w, h);
+
+		const int imageSize = w * h * sizeof(uint);
+		cudaMemcpyToArray(d_image, 0, 0, data, imageSize, cudaMemcpyHostToDevice);
+	}
+	else
+	{
+#pragma message(NV_FILE_LINE "FIXME: Floating point textures not really supported by CUDA compressors.")
+		cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
+	    cudaMallocArray(&d_image, &channelDesc, w, h);
+
+		const int imageSize = w * h * sizeof(uint);
+		cudaMemcpyToArray(d_image, 0, 0, data, imageSize, cudaMemcpyHostToDevice);
+	}
+
 	// Image size in blocks.
-	const uint w = (image->width() + 3) / 4;
-	const uint h = (image->height() + 3) / 4;
+	const uint bw = (w + 3) / 4;
+	const uint bh = (h + 3) / 4;
+	const uint bs = blockSize();
+	const uint blockNum = bw * bh;
+	const uint compressedSize = blockNum * bs;
 
-	uint imageSize = w * h * 16 * sizeof(Color32);
-    uint * blockLinearImage = (uint *) malloc(imageSize);
-	convertToBlockLinear(image, blockLinearImage);	// @@ Do this in parallel with the GPU, or in the GPU!
+	void * h_result = malloc(min(blockNum, MAX_BLOCKS) * bs);
 
-	const uint blockNum = w * h;
-	const uint compressedSize = blockNum * 8;
+	setup(d_image, compressionOptions);
 
-	clock_t start = clock();
+	// Timer timer;
+	// timer.start();
 
-	kernel->setup(compressionOptions);
-	kernel->setBitmapTable(m_bitmapTable);
-	
-	// TODO: Add support for multiple GPUs.
 	uint bn = 0;
 	while(bn != blockNum)
 	{
 		uint count = min(blockNum - bn, MAX_BLOCKS);
 
-	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
-
-		kernel->runDeviceCode(count, m_data, m_result);
+		compressBlocks(bn, count, w, h, alphaMode, compressionOptions, h_result);
 
-		kernel->runHostCode(count);
-		
 		// Check for errors.
 		cudaError_t err = cudaGetLastError();
 		if (err != cudaSuccess)
 		{
-			nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
-
+			//nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
 			if (outputOptions.errorHandler != NULL)
 			{
 				outputOptions.errorHandler->error(Error_CudaError);
 			}
 		}
 
-		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
-
 		// Output result.
-		kernel->outputResult(outputOptions.outputHandler);
-		
 		if (outputOptions.outputHandler != NULL)
 		{
-			outputOptions.outputHandler->writeData(blockLinearImage, count * 8);
+			outputOptions.outputHandler->writeData(h_result, count * bs);
 		}
 
 		bn += count;
 	}
 
-	clock_t end = clock();
-	//printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
+	//timer.stop();
+	//printf("\rCUDA time taken: %.3f seconds\n", timer.elapsed() / CLOCKS_PER_SEC);
 
-	free(blockLinearImage);
+	free(h_result);
+	cudaFreeArray(d_image);
 
 #else
 	if (outputOptions.errorHandler != NULL)
@@ -221,92 +226,88 @@ void CudaCompressor::compressKernel(CudaCompressionKernel * kernel)
 		outputOptions.errorHandler->error(Error_CudaError);
 	}
 #endif
-}
 
-#endif // 0
+}
 
 
-void CudaCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode)
+void CudaCompressorDXT1::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions)
 {
-	m_image = image;
-	m_alphaMode = alphaMode;
+	setupCompressKernel(compressionOptions.colorWeight.ptr());
+	bindTextureToArray(image);
 }
 
-
-
-/// Compress image using CUDA.
-void CudaCompressor::compressDXT1(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
+void CudaCompressorDXT1::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
-	nvDebugCheck(cuda::isHardwarePresent());
-#if defined HAVE_CUDA
+	// Launch kernel.
+	compressKernelDXT1(first, count, w, m_ctx.result, m_ctx.bitmapTable);
 
-    // Allocate image as a cuda array.
-    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
+	// Copy result to host.
+	cudaMemcpy(output, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);
+}
 
-    cudaArray * d_image;
-	const int imageSize = m_image->width() * m_image->height() * sizeof(uint);
-    cudaMallocArray(&d_image, &channelDesc, m_image->width(), m_image->height()); 
-    cudaMemcpyToArray(d_image, 0, 0, m_image->pixels(), imageSize, cudaMemcpyHostToDevice);
 
+void CudaCompressorDXT3::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions)
+{
+	setupCompressKernel(compressionOptions.colorWeight.ptr());
+	bindTextureToArray(image);
+}
 
-	// Image size in blocks.
-	const uint w = (m_image->width() + 3) / 4;
-	const uint h = (m_image->height() + 3) / 4;
-	const uint blockNum = w * h;
-	const uint compressedSize = blockNum * 8;
+void CudaCompressorDXT3::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	// Launch kernel.
+	compressKernelDXT3(first, count, w, m_ctx.result, m_ctx.bitmapTable);
 
-	void * h_result = malloc(min(blockNum, MAX_BLOCKS) * 8);
+	// Copy result to host.
+	cudaMemcpy(output, m_ctx.result, count * 16, cudaMemcpyDeviceToHost);
+}
 
-	//clock_t start = clock();
 
+void CudaCompressorDXT5::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions)
+{
 	setupCompressKernel(compressionOptions.colorWeight.ptr());
+	bindTextureToArray(image);
+}
 
-	uint bn = 0;
-	while(bn != blockNum)
+void CudaCompressorDXT5::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	/*// Launch kernel.
+	compressKernelDXT5(first, count, w, m_ctx.result, m_ctx.bitmapTable);
+
+	// Copy result to host.
+	cudaMemcpy(output, m_ctx.result, count * 16, cudaMemcpyDeviceToHost);*/
+
+	// Launch kernel.
+	if (alphaMode == AlphaMode_Transparency)
 	{
-		uint count = min(blockNum - bn, MAX_BLOCKS);
+	//	compressWeightedKernelDXT1(first, count, w, m_ctx.result, m_ctx.bitmapTable);
+	}
+	else
+	{
+	//	compressKernelDXT1_Level4(first, count, w, m_ctx.result, m_ctx.bitmapTable);
+	}
 
-		// Launch kernel.
-		compressKernelDXT1_Tex(bn, count, w, d_image, m_result, m_bitmapTable);
+	// Compress alpha in parallel with the GPU.
+	for (uint i = 0; i < count; i++)
+	{
+		//ColorBlock rgba(blockLinearImage + (first + i) * 16);
+		//OptimalCompress::compressDXT3A(rgba, alphaBlocks + i);
+	}
 
-		// Check for errors.
-		cudaError_t err = cudaGetLastError();
-		if (err != cudaSuccess)
-		{
-			nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
+	// Copy result to host.
+	cudaMemcpy(output, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);
 
-			if (outputOptions.errorHandler != NULL)
-			{
-				outputOptions.errorHandler->error(Error_CudaError);
-			}
-		}
+	// @@ Interleave color and alpha blocks.
 
-		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(h_result, m_result, count * 8, cudaMemcpyDeviceToHost);
+}
 
-		// Output result.
-		if (outputOptions.outputHandler != NULL)
-		{
-			outputOptions.outputHandler->writeData(h_result, count * 8);
-		}
 
-		bn += count;
-	}
 
-	//clock_t end = clock();
-	//printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
 
-	free(h_result);
 
-#else
-	if (outputOptions.errorHandler != NULL)
-	{
-		outputOptions.errorHandler->error(Error_CudaError);
-	}
-#endif
-}
 
+// @@ This code is very repetitive and needs to be cleaned up.
 
+#if 0
 
 /// Compress image using CUDA.
 void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
@@ -337,16 +338,16 @@ void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressio
 	{
 		uint count = min(blockNum - bn, MAX_BLOCKS);
 
-	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
 
 		// Launch kernel.
 		if (m_alphaMode == AlphaMode_Transparency)
 		{
-			compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
+			compressWeightedKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);
 		}
 		else
 		{
-			compressKernelDXT1_Level4(count, m_data, m_result, m_bitmapTable);
+			compressKernelDXT1_Level4(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);
 		}
 
 		// Compress alpha in parallel with the GPU.
@@ -369,7 +370,7 @@ void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressio
 		}
 
 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);
 
 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@@ -428,16 +429,16 @@ void CudaCompressor::compressDXT5(const CompressionOptions::Private & compressio
 	{
 		uint count = min(blockNum - bn, MAX_BLOCKS);
 
-	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
 
 		// Launch kernel.
 		if (m_alphaMode == AlphaMode_Transparency)
 		{
-			compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
+			compressWeightedKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);
 		}
 		else
 		{
-			compressKernelDXT1_Level4(count, m_data, m_result, m_bitmapTable);
+			compressKernelDXT1_Level4(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);
 		}
 
 		// Compress alpha in parallel with the GPU.
@@ -460,7 +461,7 @@ void CudaCompressor::compressDXT5(const CompressionOptions::Private & compressio
 		}
 
 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);
 
 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@@ -516,10 +517,10 @@ void CudaCompressor::compressDXT1n(const nvtt::CompressionOptions::Private & com
 	{
 		uint count = min(blockNum - bn, MAX_BLOCKS);
 
-	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
 
 		// Launch kernel.
-		compressNormalKernelDXT1(count, m_data, m_result, m_bitmapTable);
+		compressNormalKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);
 
 		// Check for errors.
 		cudaError_t err = cudaGetLastError();
@@ -534,7 +535,7 @@ void CudaCompressor::compressDXT1n(const nvtt::CompressionOptions::Private & com
 		}
 
 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);
 
 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@@ -585,10 +586,10 @@ void CudaCompressor::compressCTX1(const nvtt::CompressionOptions::Private & comp
 	{
 		uint count = min(blockNum - bn, MAX_BLOCKS);
 
-	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
 
 		// Launch kernel.
-		compressKernelCTX1(count, m_data, m_result, m_bitmapTableCTX);
+		compressKernelCTX1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTableCTX);
 
 		// Check for errors.
 		cudaError_t err = cudaGetLastError();
@@ -603,7 +604,7 @@ void CudaCompressor::compressCTX1(const nvtt::CompressionOptions::Private & comp
 		}
 
 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);
 
 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@@ -643,4 +644,4 @@ void CudaCompressor::compressDXT5n(const nvtt::CompressionOptions::Private & com
 #endif
 }
 
-
+#endif // 0
diff --git a/src/nvtt/cuda/CudaCompressDXT.h b/src/nvtt/cuda/CudaCompressDXT.h
index 18a7e79..b60bdf3 100644
--- a/src/nvtt/cuda/CudaCompressDXT.h
+++ b/src/nvtt/cuda/CudaCompressDXT.h
@@ -27,38 +27,86 @@
 #include <nvimage/nvimage.h>
 #include <nvtt/nvtt.h>
 
+#include "nvtt/CompressDXT.h"
+
+struct cudaArray;
+
 namespace nv
 {
 	class Image;
 
-	class CudaCompressor
+	class CudaContext
 	{
 	public:
-		CudaCompressor();
-		~CudaCompressor();
+		CudaContext();
+		~CudaContext();
 
 		bool isValid() const;
 
-		void setImage(const Image * image, nvtt::AlphaMode alphaMode);
+	public:
+		// Device pointers.
+		uint * bitmapTable;
+		uint * bitmapTableCTX;
+		uint * data;
+		uint * result;
+	};
 
-		void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
 
-	private:
+	struct CudaCompressor : public CompressorInterface
+	{
+		CudaCompressor(CudaContext & ctx);
+
+		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+
+		virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions) = 0;
+		virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+		virtual uint blockSize() const = 0;
+
+	protected:
+		CudaContext & m_ctx;
+	};
 
-		uint * m_bitmapTable;
-		uint * m_bitmapTableCTX;
-		uint * m_data;
-		uint * m_result;
-		
-		const Image * m_image;
-		nvtt::AlphaMode m_alphaMode;
+	struct CudaCompressorDXT1 : public CudaCompressor
+	{
+		CudaCompressorDXT1(CudaContext & ctx) : CudaCompressor(ctx) {}
+
+		virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
+		virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 8; };
 	};
 
+	/*struct CudaCompressorDXT1n : public CudaCompressor
+	{
+		virtual void setup(const CompressionOptions::Private & compressionOptions);
+		virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+		virtual uint blockSize() const { return 8; };
+	};*/
+
+	struct CudaCompressorDXT3 : public CudaCompressor
+	{
+		CudaCompressorDXT3(CudaContext & ctx) : CudaCompressor(ctx) {}
+
+		virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
+		virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 16; };
+	};
+
+	struct CudaCompressorDXT5 : public CudaCompressor
+	{
+		CudaCompressorDXT5(CudaContext & ctx) : CudaCompressor(ctx) {}
+
+		virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
+		virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 16; };
+	};
+
+	/*struct CudaCompressorCXT1 : public CudaCompressor
+	{
+		virtual void setup(const CompressionOptions::Private & compressionOptions);
+		virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+		virtual uint blockSize() const { return 8; };
+	};*/
+
 } // nv namespace
 
 
diff --git a/src/nvtt/nvtt.h b/src/nvtt/nvtt.h
index 520ecfd..8a4d8da 100644
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@@ -93,6 +93,9 @@ namespace nvtt
 		Format_DXT1n,
 		Format_CTX1,
 		Format_YCoCg_DXT5,
+
+		Format_BC6,
+		Format_BC7,
 	};
 
 	/// Pixel types.