Large refactoring of compressor codes:

- Define compressor interface. - Implement compressor interface for different compressors. - Add parallel compressor using OpenMP. Experimental. - Add generic GPU compressor, so far only DXT1 enabled.
2009-10-21 07:48:27 +00:00
parent 18a3abf794
commit 8820c43175
8 changed files with 1559 additions and 1325 deletions
--- a/src/nvtt/cuda/CompressKernel.cu
+++ b/src/nvtt/cuda/CompressKernel.cu
@ -296,6 +296,51 @@ __device__ float3 blockError3(const float3 * colors, uint permutation, float3 a,
 // Sort colors
 ////////////////////////////////////////////////////////////////////////////////

+// @@ Experimental code to avoid duplicate colors for faster compression.
+// We could first sort along the best fit line and only compare colors that have the same projection.
+// The hardest part is to maintain the indices to map packed/sorted colors to the input colors.
+// We also need to update several functions that assume the number of colors is fixed to 16.
+// And compute different bit maps for the different color counts.
+// This is a fairly high amount of work.
+__device__ int packColors(float3 * values, float * weights, int * ranks)
+{
+    const int tid = threadIdx.x;
+
+	__shared__ int count;
+	count = 0;
+
+	bool alive = true;
+
+	// Append this 
+	for (int i = 0; i < 16; i++)
+	{
+		// One thread leads on each iteration.
+		if (tid == i) {
+
+			// If thread alive, then append element.
+			if (alive) {
+				values[count] = values[i];
+				weights[count] = weights[i];
+				count++;
+			}
+
+			// Otherwise update weight.
+			else {
+				weights[ranks[i]] += weights[i];
+			}
+		}
+
+		// Kill all threads that have the same element and record rank.
+		if (values[i] == values[tid]) {
+			alive = false;
+			ranks[tid] = count - 1;
+		}
+	}
+
+	return count;
+}
+
+
 __device__ void sortColors(const float * values, int * ranks)
 {
 #if __DEVICE_EMULATION__
@ -343,12 +388,60 @@ __device__ void sortColors(const float * values, int * ranks)
 #endif
 }

+__device__ void sortColors(const float * values, int * ranks, int count)
+{
+#if __DEVICE_EMULATION__
+    if (threadIdx.x == 0)
+    {
+        for (int tid = 0; tid < count; tid++)
+        {
+            int rank = 0;
+            for (int i = 0; i < count; i++)
+            {
+                rank += (values[i] < values[tid]);
+            }
+            
+            ranks[tid] = rank;
+        }
+
+        // Resolve elements with the same index.
+        for (int i = 0; i < count-1; i++)
+        {
+            for (int tid = 0; tid < count; tid++)
+            {
+                if (tid > i && ranks[tid] == ranks[i]) ++ranks[tid];
+            }
+        }
+    }
+#else
+    const int tid = threadIdx.x;
+
+    int rank = 0;
+
+    #pragma unroll
+    for (int i = 0; i < count; i++)
+    {
+        rank += (values[i] < values[tid]);
+    }
+    
+    ranks[tid] = rank;
+
+    // Resolve elements with the same index.
+    #pragma unroll
+    for (int i = 0; i < count-1; i++)
+    {
+        if ((tid > i) & (ranks[tid] == ranks[i])) ++ranks[tid];
+    }
+#endif
+}
+
+

 ////////////////////////////////////////////////////////////////////////////////
 // Load color block to shared mem
 ////////////////////////////////////////////////////////////////////////////////

-__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
+/*__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
 {
 	const int bid = blockIdx.x;
 	const int idx = threadIdx.x;
@ -389,9 +482,9 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 		__debugsync();
 	}
 #endif
-}
+}*/

-__device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
+__device__ void loadColorBlockTex(uint firstBlock, uint width, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
 {
 	const int bid = blockIdx.x;
 	const int idx = threadIdx.x;
@ -400,8 +493,8 @@ __device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sum

 	if (idx < 16)
 	{
-		float x = 4 * ((bn + bid) % w) + idx % 4; // @@ Avoid mod and div by using 2D grid?
-		float y = 4 * ((bn + bid) / w) + idx / 4;
+		float x = 4 * ((firstBlock + bid) % width) + idx % 4; // @@ Avoid mod and div by using 2D grid?
+		float y = 4 * ((firstBlock + bid) / width) + idx / 4;

 		// Read color and copy to shared mem.
 		float4 c = tex2D(tex, x, y);
@ -437,10 +530,107 @@ __device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sum
 		__debugsync();
 	}
 #endif
-
 }

+/*
+__device__ void loadColorBlockTex(uint firstBlock, uint w, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
+{
+	const int bid = blockIdx.x;
+	const int idx = threadIdx.x;

+	__shared__ float dps[16];
+
+	if (idx < 16)
+	{
+		float x = 4 * ((firstBlock + bid) % w) + idx % 4; // @@ Avoid mod and div by using 2D grid?
+		float y = 4 * ((firstBlock + bid) / w) + idx / 4;
+
+		// Read color and copy to shared mem.
+		float4 c = tex2D(tex, x, y);
+
+		colors[idx].x = c.z;
+		colors[idx].y = c.y;
+		colors[idx].z = c.x;
+		weights[idx] = 1;
+
+		int count = packColors(colors, weights);
+		if (idx < count)
+		{
+			// Sort colors along the best fit line.
+			colorSums(colors, sums);
+			float3 axis = bestFitLine(colors, sums[0], kColorMetric);
+			
+			*sameColor = (axis == make_float3(0, 0, 0));
+			
+			dps[idx] = dot(colors[idx], axis);
+			
+			sortColors(dps, xrefs);
+			
+			float3 tmp = colors[idx];
+			colors[xrefs[idx]] = tmp;
+		}
+	}
+}
+*/
+
+__device__ void loadColorBlockTex(uint firstBlock, uint width, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
+{
+	const int bid = blockIdx.x;
+	const int idx = threadIdx.x;
+
+	__shared__ float3 rawColors[16];
+	__shared__ float dps[16];
+
+	if (idx < 16)
+	{
+		float x = 4 * ((firstBlock + bid) % width) + idx % 4; // @@ Avoid mod and div by using 2D grid?
+		float y = 4 * ((firstBlock + bid) / width) + idx / 4;
+
+		// Read color and copy to shared mem.
+		float4 c = tex2D(tex, x, y);
+
+		rawColors[idx].x = c.z;
+		rawColors[idx].y = c.y;
+		rawColors[idx].z = c.x;
+		weights[idx] = c.w;
+
+		colors[idx] = rawColors[idx] * weights[idx];
+
+		// No need to synchronize, 16 < warp size.
+		__debugsync();
+		
+		// Sort colors along the best fit line.
+		colorSums(colors, sums);
+		float3 axis = bestFitLine(colors, sums[0], kColorMetric);
+		
+		*sameColor = (axis == make_float3(0, 0, 0));
+		
+		// Single color compressor needs unweighted colors.
+		if (*sameColor) colors[idx] = rawColors[idx];
+
+		dps[idx] = dot(colors[idx], axis);
+		
+		__debugsync();
+		
+		sortColors(dps, xrefs);
+		
+		float3 tmp = colors[idx];
+		float w = weights[idx];
+		__debugsync();
+		colors[xrefs[idx]] = tmp;
+		weights[xrefs[idx]] = w;
+	}
+#if __DEVICE_EMULATION__
+	else
+	{
+		__debugsync();
+		__debugsync();
+		__debugsync();
+	}
+#endif
+}
+
+/*
 __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
 {
 	const int bid = blockIdx.x;
@ -494,6 +684,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 	}
 #endif
 }
+*/

 __device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16], int * sameColor)
 {
@ -1457,48 +1648,15 @@ __device__ void saveSingleColorBlockCTX1(float2 color, uint2 * result)
 ////////////////////////////////////////////////////////////////////////////////
 // Compress color block
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void compressDXT1(const uint * permutations, const uint * image, uint2 * result)
+
+__global__ void compressDXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result)
 {
 	__shared__ float3 colors[16];
 	__shared__ float3 sums[16];
 	__shared__ int xrefs[16];
 	__shared__ int sameColor;
 	
-	loadColorBlock(image, colors, sums, xrefs, &sameColor);
-
-	__syncthreads();
-
-	if (sameColor)
-	{
-		if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
-		return;
-	}
-
-	ushort bestStart, bestEnd;
-	uint bestPermutation;
-
-	__shared__ float errors[NUM_THREADS];
-
-	evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
-	
-	// Use a parallel reduction to find minimum error.
-	const int minIdx = findMinError(errors);
-	
-	// Only write the result of the winner thread.
-	if (threadIdx.x == minIdx)
-	{
-		saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result);
-	}
-}
-
-__global__ void compressDXT1_Tex(uint bn, uint w, const uint * permutations, uint2 * result)
-{
-	__shared__ float3 colors[16];
-	__shared__ float3 sums[16];
-	__shared__ int xrefs[16];
-	__shared__ int sameColor;
-	
-	loadColorBlockTex(bn, w, colors, sums, xrefs, &sameColor);
+	loadColorBlockTex(firstBlock, w, colors, sums, xrefs, &sameColor);

 	__syncthreads();

@ -1534,14 +1692,14 @@ __global__ void compressDXT1_Tex(uint bn, uint w, const uint * permutations, uin
 }


-__global__ void compressLevel4DXT1(const uint * permutations, const uint * image, uint2 * result)
+__global__ void compressLevel4DXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result)
 {
 	__shared__ float3 colors[16];
 	__shared__ float3 sums[16];
 	__shared__ int xrefs[16];
 	__shared__ int sameColor;
 	
-	loadColorBlock(image, colors, sums, xrefs, &sameColor);
+	loadColorBlockTex(firstBlock, w, colors, sums, xrefs, &sameColor);

 	__syncthreads();

@ -1568,7 +1726,7 @@ __global__ void compressLevel4DXT1(const uint * permutations, const uint * image
 	}
 }

-__global__ void compressWeightedDXT1(const uint * permutations, const uint * image, uint2 * result)
+__global__ void compressWeightedDXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result)
 {
 	__shared__ float3 colors[16];
 	__shared__ float3 sums[16];
@ -1576,7 +1734,7 @@ __global__ void compressWeightedDXT1(const uint * permutations, const uint * ima
 	__shared__ int xrefs[16];
 	__shared__ int sameColor;
 	
-	loadColorBlock(image, colors, sums, weights, xrefs, &sameColor);
+	loadColorBlockTex(firstBlock, w, colors, sums, weights, xrefs, &sameColor);
 	
 	__syncthreads();

@ -1987,17 +2145,7 @@ extern "C" void setupCompressKernel(const float weights[3])
 	cudaMemcpyToSymbol(kColorMetricSqr, weightsSqr, sizeof(float) * 3, 0);
 }

-
-////////////////////////////////////////////////////////////////////////////////
-// Launch kernel
-////////////////////////////////////////////////////////////////////////////////
-
-extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
-{
-	compressDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
-}
-
-extern "C" void compressKernelDXT1_Tex(uint bn, uint blockNum, uint w, cudaArray * d_data, uint * d_result, uint * d_bitmaps)
+extern "C" void bindTextureToArray(cudaArray * d_data)
 {
 	// Setup texture
 	tex.normalized = false;
@ -2006,21 +2154,61 @@ extern "C" void compressKernelDXT1_Tex(uint bn, uint blockNum, uint w, cudaArray
 	tex.addressMode[1] = cudaAddressModeClamp;
    
 	cudaBindTextureToArray(tex, d_data);
-
-	compressDXT1_Tex<<<blockNum, NUM_THREADS>>>(bn, w, d_bitmaps, (uint2 *)d_result);
 }


-extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
+
+////////////////////////////////////////////////////////////////////////////////
+// Launch kernel
+////////////////////////////////////////////////////////////////////////////////
+
+// DXT1 compressors:
+extern "C" void compressKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
 {
-	compressLevel4DXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
+	compressDXT1<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
 }

-extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
+extern "C" void compressKernelDXT1_Level4(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
 {
-	compressWeightedDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
+	compressLevel4DXT1<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
 }

+extern "C" void compressWeightedKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+{
+	compressWeightedDXT1<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+}
+
+// @@ DXT1a compressors.
+
+
+// @@ DXT3 compressors:
+extern "C" void compressKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+{
+	//compressDXT3<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+}
+
+extern "C" void compressWeightedKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+{
+	//compressWeightedDXT3<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+}
+
+
+// @@ DXT5 compressors.
+extern "C" void compressKernelDXT5(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+{
+	//compressDXT5<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+}
+
+extern "C" void compressWeightedKernelDXT5(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+{
+	//compressWeightedDXT5<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+}
+
+
+
+
+
+/*
 extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
 {
 	compressNormalDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
@ -2030,16 +2218,10 @@ extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result
 {
 	compressCTX1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
 }
-
+*/
+/*
 extern "C" void compressKernelDXT5n(uint blockNum, cudaArray * d_data, uint * d_result)
 {
-	// Setup texture
-	tex.normalized = false;
-	tex.filterMode = cudaFilterModePoint;
-	tex.addressMode[0] = cudaAddressModeClamp;
-	tex.addressMode[1] = cudaAddressModeClamp;
-    
-	cudaBindTextureToArray(tex, d_data);
-
 //	compressDXT5n<<<blockNum/128, 128>>>(blockNum, (uint2 *)d_result);
 }
+*/
--- a/src/nvtt/cuda/CudaCompressDXT.cpp
+++ b/src/nvtt/cuda/CudaCompressDXT.cpp
@ -52,16 +52,20 @@ using namespace nvtt;


 extern "C" void setupCompressKernel(const float weights[3]);
-extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
-extern "C" void compressKernelDXT1_Tex(uint bn, uint blockNum, uint w, cudaArray * d_data, uint * d_result, uint * d_bitmaps);
+extern "C" void bindTextureToArray(cudaArray * d_data);
+
+extern "C" void compressKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps);
 extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
 extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
-extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
-extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
+extern "C" void compressKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps);
+//extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
+//extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);


-#include "Bitmaps.h"	// @@ Rename to BitmapTable.h
+#pragma message(NV_FILE_LINE "TODO: Rename Bitmaps.h to BitmapTable.h")
+#include "Bitmaps.h"

+/*
 // Convert linear image to block linear.
 static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
 {
@ -81,45 +85,49 @@ static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
 		}
 	}
 }
+*/

 #endif


-CudaCompressor::CudaCompressor() : m_bitmapTable(NULL), m_bitmapTableCTX(NULL), m_data(NULL), m_result(NULL)
-{
+CudaContext::CudaContext() : 
+	bitmapTable(NULL), 
+	bitmapTableCTX(NULL), 
+	data(NULL), 
+	result(NULL)
+{
 #if defined HAVE_CUDA
    // Allocate and upload bitmaps.
-    cudaMalloc((void**) &m_bitmapTable, 992 * sizeof(uint));
-	if (m_bitmapTable != NULL)
+    cudaMalloc((void**) &bitmapTable, 992 * sizeof(uint));
+	if (bitmapTable != NULL)
 	{
-		cudaMemcpy(m_bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice);
+		cudaMemcpy(bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice);
 	}

-    cudaMalloc((void**) &m_bitmapTableCTX, 704 * sizeof(uint));
-
-	if (m_bitmapTableCTX != NULL)
+    cudaMalloc((void**) &bitmapTableCTX, 704 * sizeof(uint));
+	if (bitmapTableCTX != NULL)
 	{
-		cudaMemcpy(m_bitmapTableCTX, s_bitmapTableCTX, 704 * sizeof(uint), cudaMemcpyHostToDevice);
+		cudaMemcpy(bitmapTableCTX, s_bitmapTableCTX, 704 * sizeof(uint), cudaMemcpyHostToDevice);
 	}

 	// Allocate scratch buffers.
-    cudaMalloc((void**) &m_data, MAX_BLOCKS * 64U);
-    cudaMalloc((void**) &m_result, MAX_BLOCKS * 8U);
+    cudaMalloc((void**) &data, MAX_BLOCKS * 64U);
+    cudaMalloc((void**) &result, MAX_BLOCKS * 8U);
 #endif
-}
-
-CudaCompressor::~CudaCompressor()
-{
+}
+
+CudaContext::~CudaContext()
+{
 #if defined HAVE_CUDA
 	// Free device mem allocations.
-	cudaFree(m_data);
-	cudaFree(m_result);
-	cudaFree(m_bitmapTable);
-	cudaFree(m_bitmapTableCTX);
+	cudaFree(bitmapTableCTX);
+	cudaFree(bitmapTable);
+	cudaFree(data);
+	cudaFree(result);
 #endif
-}
+}

-bool CudaCompressor::isValid() const
+bool CudaContext::isValid() const
 {
 #if defined HAVE_CUDA
 	cudaError_t err = cudaGetLastError();
@ -129,185 +137,178 @@ bool CudaCompressor::isValid() const
 		return false;
 	}
 #endif
-	return m_data != NULL && m_result != NULL && m_bitmapTable != NULL;
+	return bitmapTable != NULL && bitmapTableCTX != NULL && data != NULL && result != NULL;
 }

+
+
+CudaCompressor::CudaCompressor(CudaContext & ctx) : m_ctx(ctx)
+{
+
+}
+
+void CudaCompressor::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
+{
+	nvDebugCheck(cuda::isHardwarePresent());
+
+#if defined HAVE_CUDA
+
+	// Allocate image as a cuda array.
+	cudaArray * d_image;
+	if (inputFormat == nvtt::InputFormat_BGRA_8UB)
+	{
+        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
+	    cudaMallocArray(&d_image, &channelDesc, w, h);
+
+		const int imageSize = w * h * sizeof(uint);
+		cudaMemcpyToArray(d_image, 0, 0, data, imageSize, cudaMemcpyHostToDevice);
+	}
+	else
+	{
+#pragma message(NV_FILE_LINE "FIXME: Floating point textures not really supported by CUDA compressors.")
+		cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
+	    cudaMallocArray(&d_image, &channelDesc, w, h);
+
+		const int imageSize = w * h * sizeof(uint);
+		cudaMemcpyToArray(d_image, 0, 0, data, imageSize, cudaMemcpyHostToDevice);
+	}
+
+	// Image size in blocks.
+	const uint bw = (w + 3) / 4;
+	const uint bh = (h + 3) / 4;
+	const uint bs = blockSize();
+	const uint blockNum = bw * bh;
+	const uint compressedSize = blockNum * bs;
+
+	void * h_result = malloc(min(blockNum, MAX_BLOCKS) * bs);
+
+	setup(d_image, compressionOptions);
+
+	// Timer timer;
+	// timer.start();
+
+	uint bn = 0;
+	while(bn != blockNum)
+	{
+		uint count = min(blockNum - bn, MAX_BLOCKS);
+
+		compressBlocks(bn, count, w, h, alphaMode, compressionOptions, h_result);
+
+		// Check for errors.
+		cudaError_t err = cudaGetLastError();
+		if (err != cudaSuccess)
+		{
+			//nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
+			if (outputOptions.errorHandler != NULL)
+			{
+				outputOptions.errorHandler->error(Error_CudaError);
+			}
+		}
+
+		// Output result.
+		if (outputOptions.outputHandler != NULL)
+		{
+			outputOptions.outputHandler->writeData(h_result, count * bs);
+		}
+
+		bn += count;
+	}
+
+	//timer.stop();
+	//printf("\rCUDA time taken: %.3f seconds\n", timer.elapsed() / CLOCKS_PER_SEC);
+
+	free(h_result);
+	cudaFreeArray(d_image);
+
+#else
+	if (outputOptions.errorHandler != NULL)
+	{
+		outputOptions.errorHandler->error(Error_CudaError);
+	}
+#endif
+
+}
+
+
+void CudaCompressorDXT1::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions)
+{
+	setupCompressKernel(compressionOptions.colorWeight.ptr());
+	bindTextureToArray(image);
+}
+
+void CudaCompressorDXT1::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	// Launch kernel.
+	compressKernelDXT1(first, count, w, m_ctx.result, m_ctx.bitmapTable);
+
+	// Copy result to host.
+	cudaMemcpy(output, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);
+}
+
+
+void CudaCompressorDXT3::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions)
+{
+	setupCompressKernel(compressionOptions.colorWeight.ptr());
+	bindTextureToArray(image);
+}
+
+void CudaCompressorDXT3::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	// Launch kernel.
+	compressKernelDXT3(first, count, w, m_ctx.result, m_ctx.bitmapTable);
+
+	// Copy result to host.
+	cudaMemcpy(output, m_ctx.result, count * 16, cudaMemcpyDeviceToHost);
+}
+
+
+void CudaCompressorDXT5::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions)
+{
+	setupCompressKernel(compressionOptions.colorWeight.ptr());
+	bindTextureToArray(image);
+}
+
+void CudaCompressorDXT5::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
+{
+	/*// Launch kernel.
+	compressKernelDXT5(first, count, w, m_ctx.result, m_ctx.bitmapTable);
+
+	// Copy result to host.
+	cudaMemcpy(output, m_ctx.result, count * 16, cudaMemcpyDeviceToHost);*/
+
+	// Launch kernel.
+	if (alphaMode == AlphaMode_Transparency)
+	{
+	//	compressWeightedKernelDXT1(first, count, w, m_ctx.result, m_ctx.bitmapTable);
+	}
+	else
+	{
+	//	compressKernelDXT1_Level4(first, count, w, m_ctx.result, m_ctx.bitmapTable);
+	}
+
+	// Compress alpha in parallel with the GPU.
+	for (uint i = 0; i < count; i++)
+	{
+		//ColorBlock rgba(blockLinearImage + (first + i) * 16);
+		//OptimalCompress::compressDXT3A(rgba, alphaBlocks + i);
+	}
+
+	// Copy result to host.
+	cudaMemcpy(output, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);
+
+	// @@ Interleave color and alpha blocks.
+
+}
+
+
+
+
+
+
 // @@ This code is very repetitive and needs to be cleaned up.

 #if 0

-struct CudaCompressionKernel
-{
-	virtual void setup(const CompressionOptions::Private & compressionOptions)
-	{
-		setupCompressKernel(compressionOptions.colorWeight.ptr());
-	}
-	
-	virtual void setBitmapTable();
-	
-	virtual void runDeviceCode(int count);
-	
-	virtual void runHostCode(int count);
-	
-};
-
-void CudaCompressor::compressKernel(CudaCompressionKernel * kernel)
-{
-	nvDebugCheck(cuda::isHardwarePresent());
-#if defined HAVE_CUDA
-
-	// Image size in blocks.
-	const uint w = (image->width() + 3) / 4;
-	const uint h = (image->height() + 3) / 4;
-
-	uint imageSize = w * h * 16 * sizeof(Color32);
-    uint * blockLinearImage = (uint *) malloc(imageSize);
-	convertToBlockLinear(image, blockLinearImage);	// @@ Do this in parallel with the GPU, or in the GPU!
-
-	const uint blockNum = w * h;
-	const uint compressedSize = blockNum * 8;
-
-	clock_t start = clock();
-
-	kernel->setup(compressionOptions);
-	kernel->setBitmapTable(m_bitmapTable);
-	
-	// TODO: Add support for multiple GPUs.
-	uint bn = 0;
-	while(bn != blockNum)
-	{
-		uint count = min(blockNum - bn, MAX_BLOCKS);
-
-	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
-
-		kernel->runDeviceCode(count, m_data, m_result);
-
-		kernel->runHostCode(count);
-		
-		// Check for errors.
-		cudaError_t err = cudaGetLastError();
-		if (err != cudaSuccess)
-		{
-			nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
-
-			if (outputOptions.errorHandler != NULL)
-			{
-				outputOptions.errorHandler->error(Error_CudaError);
-			}
-		}
-
-		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
-
-		// Output result.
-		kernel->outputResult(outputOptions.outputHandler);
-		
-		if (outputOptions.outputHandler != NULL)
-		{
-			outputOptions.outputHandler->writeData(blockLinearImage, count * 8);
-		}
-
-		bn += count;
-	}
-
-	clock_t end = clock();
-	//printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
-
-	free(blockLinearImage);
-
-#else
-	if (outputOptions.errorHandler != NULL)
-	{
-		outputOptions.errorHandler->error(Error_CudaError);
-	}
-#endif
-}
-
-#endif // 0
-
-
-void CudaCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode)
-{
-	m_image = image;
-	m_alphaMode = alphaMode;
-}
-
-
-
-/// Compress image using CUDA.
-void CudaCompressor::compressDXT1(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
-{
-	nvDebugCheck(cuda::isHardwarePresent());
-#if defined HAVE_CUDA
-
-    // Allocate image as a cuda array.
-    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
-
-    cudaArray * d_image;
-	const int imageSize = m_image->width() * m_image->height() * sizeof(uint);
-    cudaMallocArray(&d_image, &channelDesc, m_image->width(), m_image->height()); 
-    cudaMemcpyToArray(d_image, 0, 0, m_image->pixels(), imageSize, cudaMemcpyHostToDevice);
-
-
-	// Image size in blocks.
-	const uint w = (m_image->width() + 3) / 4;
-	const uint h = (m_image->height() + 3) / 4;
-	const uint blockNum = w * h;
-	const uint compressedSize = blockNum * 8;
-
-	void * h_result = malloc(min(blockNum, MAX_BLOCKS) * 8);
-
-	//clock_t start = clock();
-
-	setupCompressKernel(compressionOptions.colorWeight.ptr());
-
-	uint bn = 0;
-	while(bn != blockNum)
-	{
-		uint count = min(blockNum - bn, MAX_BLOCKS);
-
-		// Launch kernel.
-		compressKernelDXT1_Tex(bn, count, w, d_image, m_result, m_bitmapTable);
-
-		// Check for errors.
-		cudaError_t err = cudaGetLastError();
-		if (err != cudaSuccess)
-		{
-			nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
-
-			if (outputOptions.errorHandler != NULL)
-			{
-				outputOptions.errorHandler->error(Error_CudaError);
-			}
-		}
-
-		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(h_result, m_result, count * 8, cudaMemcpyDeviceToHost);
-
-		// Output result.
-		if (outputOptions.outputHandler != NULL)
-		{
-			outputOptions.outputHandler->writeData(h_result, count * 8);
-		}
-
-		bn += count;
-	}
-
-	//clock_t end = clock();
-	//printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
-
-	free(h_result);
-
-#else
-	if (outputOptions.errorHandler != NULL)
-	{
-		outputOptions.errorHandler->error(Error_CudaError);
-	}
-#endif
-}
-
-
-
 /// Compress image using CUDA.
 void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
 {
@ -337,16 +338,16 @@ void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressio
 	{
 		uint count = min(blockNum - bn, MAX_BLOCKS);

-	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

 		// Launch kernel.
 		if (m_alphaMode == AlphaMode_Transparency)
 		{
-			compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
+			compressWeightedKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);
 		}
 		else
 		{
-			compressKernelDXT1_Level4(count, m_data, m_result, m_bitmapTable);
+			compressKernelDXT1_Level4(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);
 		}

 		// Compress alpha in parallel with the GPU.
@ -369,7 +370,7 @@ void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressio
 		}

 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);

 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@ -428,16 +429,16 @@ void CudaCompressor::compressDXT5(const CompressionOptions::Private & compressio
 	{
 		uint count = min(blockNum - bn, MAX_BLOCKS);

-	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

 		// Launch kernel.
 		if (m_alphaMode == AlphaMode_Transparency)
 		{
-			compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
+			compressWeightedKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);
 		}
 		else
 		{
-			compressKernelDXT1_Level4(count, m_data, m_result, m_bitmapTable);
+			compressKernelDXT1_Level4(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);
 		}

 		// Compress alpha in parallel with the GPU.
@ -460,7 +461,7 @@ void CudaCompressor::compressDXT5(const CompressionOptions::Private & compressio
 		}

 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);

 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@ -516,10 +517,10 @@ void CudaCompressor::compressDXT1n(const nvtt::CompressionOptions::Private & com
 	{
 		uint count = min(blockNum - bn, MAX_BLOCKS);

-	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

 		// Launch kernel.
-		compressNormalKernelDXT1(count, m_data, m_result, m_bitmapTable);
+		compressNormalKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);

 		// Check for errors.
 		cudaError_t err = cudaGetLastError();
@ -534,7 +535,7 @@ void CudaCompressor::compressDXT1n(const nvtt::CompressionOptions::Private & com
 		}

 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);

 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@ -585,10 +586,10 @@ void CudaCompressor::compressCTX1(const nvtt::CompressionOptions::Private & comp
 	{
 		uint count = min(blockNum - bn, MAX_BLOCKS);

-	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

 		// Launch kernel.
-		compressKernelCTX1(count, m_data, m_result, m_bitmapTableCTX);
+		compressKernelCTX1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTableCTX);

 		// Check for errors.
 		cudaError_t err = cudaGetLastError();
@ -603,7 +604,7 @@ void CudaCompressor::compressCTX1(const nvtt::CompressionOptions::Private & comp
 		}

 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);

 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@ -643,4 +644,4 @@ void CudaCompressor::compressDXT5n(const nvtt::CompressionOptions::Private & com
 #endif
 }

-
+#endif // 0
--- a/src/nvtt/cuda/CudaCompressDXT.h
+++ b/src/nvtt/cuda/CudaCompressDXT.h
@ -27,38 +27,86 @@
 #include <nvimage/nvimage.h>
 #include <nvtt/nvtt.h>

+#include "nvtt/CompressDXT.h"
+
+struct cudaArray;
+
 namespace nv
 {
 	class Image;

-	class CudaCompressor
+	class CudaContext
 	{
 	public:
-		CudaCompressor();
-		~CudaCompressor();
+		CudaContext();
+		~CudaContext();

 		bool isValid() const;

-		void setImage(const Image * image, nvtt::AlphaMode alphaMode);
-
-		void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-		void compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-
-	private:
-
-		uint * m_bitmapTable;
-		uint * m_bitmapTableCTX;
-		uint * m_data;
-		uint * m_result;
-		
-		const Image * m_image;
-		nvtt::AlphaMode m_alphaMode;
+	public:
+		// Device pointers.
+		uint * bitmapTable;
+		uint * bitmapTableCTX;
+		uint * data;
+		uint * result;
 	};

+
+	struct CudaCompressor : public CompressorInterface
+	{
+		CudaCompressor(CudaContext & ctx);
+
+		virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+
+		virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions) = 0;
+		virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+		virtual uint blockSize() const = 0;
+
+	protected:
+		CudaContext & m_ctx;
+	};
+
+	struct CudaCompressorDXT1 : public CudaCompressor
+	{
+		CudaCompressorDXT1(CudaContext & ctx) : CudaCompressor(ctx) {}
+
+		virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
+		virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 8; };
+	};
+
+	/*struct CudaCompressorDXT1n : public CudaCompressor
+	{
+		virtual void setup(const CompressionOptions::Private & compressionOptions);
+		virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+		virtual uint blockSize() const { return 8; };
+	};*/
+
+	struct CudaCompressorDXT3 : public CudaCompressor
+	{
+		CudaCompressorDXT3(CudaContext & ctx) : CudaCompressor(ctx) {}
+
+		virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
+		virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 16; };
+	};
+
+	struct CudaCompressorDXT5 : public CudaCompressor
+	{
+		CudaCompressorDXT5(CudaContext & ctx) : CudaCompressor(ctx) {}
+
+		virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
+		virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+		virtual uint blockSize() const { return 16; };
+	};
+
+	/*struct CudaCompressorCXT1 : public CudaCompressor
+	{
+		virtual void setup(const CompressionOptions::Private & compressionOptions);
+		virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+		virtual uint blockSize() const { return 8; };
+	};*/
+
 } // nv namespace