diff --git a/src/nvimage/nvtt/cuda/CompressKernel.cu b/src/nvimage/nvtt/cuda/CompressKernel.cu
index ad29a94..784a7bf 100644
--- a/src/nvimage/nvtt/cuda/CompressKernel.cu
+++ b/src/nvimage/nvtt/cuda/CompressKernel.cu
@@ -141,6 +141,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
     const int bid = blockIdx.x;
     const int idx = threadIdx.x;
 
+    __shared__ float3 rawColors[16];
     __shared__ float dps[16];
 
     if (idx < 16)
@@ -148,10 +149,13 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
         // Read color and copy to shared mem.
         uint c = image[(bid) * 16 + idx];
 
-        colors[idx].z = ((c >> 0) & 0xFF) * (1.0f / 255.0f);
-        colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
-        colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
-        weights[idx] = ((c >> 24) & 0xFF) * (1.0f / 255.0f);
+        rawColors[idx].z = ((c >> 0) & 0xFF) * (1.0f / 255.0f);
+        rawColors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
+        rawColors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
+        weights[idx] = (((c >> 24) & 0xFF) + 1) * (1.0f / 256.0f);
+
+        colors[idx] = rawColors[idx] * weights[idx];
+
         // No need to synchronize, 16 < warp size.
 
 #if __DEVICE_EMULATION__
@@ -162,7 +166,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
         colorSums(colors, sums);
         float3 axis = bestFitLine(colors, sums[0]);
 
-        dps[idx] = dot(colors[idx], axis);
+        dps[idx] = dot(rawColors[idx], axis);
 
 #if __DEVICE_EMULATION__
     } __debugsync(); if (idx < 16) {
@@ -283,7 +287,7 @@ __constant__ float alphaTable3[4] = { 4.0f, 0.0f, 2.0f, 2.0f };
 __constant__ const uint prods4[4] = { 0x090000,0x000900,0x040102,0x010402 };
 __constant__ const uint prods3[4] = { 0x040000,0x000400,0x040101,0x010401 };
 
-__device__ float evalPermutation4(const float3 * colors, uint permutation, ushort * start, ushort * end, float3 color_sum)
+__device__ float evalPermutation4(const float3 * colors, float3 color_sum, uint permutation, ushort * start, ushort * end)
 {
     // Compute endpoints using least squares.
     float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
@@ -318,7 +322,7 @@ __device__ float evalPermutation4(const float3 * colors, uint permutation, ushor
     return (1.0f / 9.0f) * dot(e, kColorMetric);
 }
 
-__device__ float evalPermutation3(const float3 * colors, uint permutation, ushort * start, ushort * end, float3 color_sum)
+__device__ float evalPermutation3(const float3 * colors, float3 color_sum, uint permutation, ushort * start, ushort * end)
 {
     // Compute endpoints using least squares.
     float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
@@ -353,14 +357,13 @@ __device__ float evalPermutation3(const float3 * colors, uint permutation, ushor
     return (1.0f / 4.0f) * dot(e, kColorMetric);
 }
 
-__device__ float evalPermutation4(const float3 * colors, const float * weights, uint permutation, ushort * start, ushort * end)
+__device__ float evalPermutation4(const float3 * colors, const float * weights, float3 color_sum, uint permutation, ushort * start, ushort * end)
 {
     // Compute endpoints using least squares.
     float alpha2_sum = 0.0f;
     float beta2_sum = 0.0f;
     float alphabeta_sum = 0.0f;
     float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
-    float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f);
 
     // Compute alpha & beta for this permutation.
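Note on the hunks above: loadColorBlock() now keeps the unweighted colors in rawColors[], remaps the weight from x/255 to (x+1)/256 so it can never be zero, and stores the weight-premultiplied color in colors[] (the best-fit axis is still projected against the raw colors). As a result the weighted evalPermutation4() below no longer needs its own betax_sum accumulator: since alpha + beta == 1 for every texel, the weighted beta sum is simply the precomputed color_sum minus alphax_sum. A standalone scalar sketch of that identity follows; the names and sample data are illustrative only, and the beta decoding copies the one shown in evalPermutation3 in this diff.

// Scalar illustration (not part of the patch): with the weights already folded
// into colors[i], and alpha + beta == 1 for every texel, the weighted beta sum
// equals color_sum - alphax_sum and never needs its own accumulator.
#include <cstdio>

int main()
{
    float colors[16];                           // stands in for the premultiplied colors
    for (int i = 0; i < 16; i++) colors[i] = 0.25f * (i + 1);

    const unsigned permutation = 0x1B2E4D6Cu;   // arbitrary 2-bit-per-texel assignment

    float color_sum = 0.0f, alphax_sum = 0.0f, betax_ref = 0.0f;
    for (int i = 0; i < 16; i++)
    {
        const unsigned bits = permutation >> (2 * i);
        float beta = (float)(bits & 1);         // same decoding as evalPermutation3 above
        if (bits & 2) beta = 0.5f;
        const float alpha = 1.0f - beta;

        color_sum  += colors[i];
        alphax_sum += alpha * colors[i];
        betax_ref  += beta * colors[i];         // what the old code accumulated explicitly
    }

    const float betax_sum = color_sum - alphax_sum;     // the shortcut the patch uses
    printf("explicit %.6f  derived %.6f\n", betax_ref, betax_sum);
    return 0;
}

This removes one of the two per-texel float3 accumulations from a loop that is re-run for every candidate permutation.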
     for (int i = 0; i < 16; i++)
@@ -374,10 +377,11 @@ __device__ float evalPermutation4(const float3 * colors, const float * weights,
         alpha2_sum += alpha * alpha * weights[i];
         beta2_sum += beta * beta * weights[i];
         alphabeta_sum += alpha * beta * weights[i];
-        alphax_sum += alpha * colors[i] * weights[i];
-        betax_sum += beta * colors[i] * weights[i];
+        alphax_sum += alpha * colors[i];
     }
 
+    float3 betax_sum = color_sum - alphax_sum;
+
     const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
 
     float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
@@ -393,45 +397,47 @@ __device__ float evalPermutation4(const float3 * colors, const float * weights,
     return dot(e, kColorMetric);
 }
 
-__device__ float evalPermutation3(const float3 * colors, const float * weights, uint permutation, ushort * start, ushort * end)
-{
-    // Compute endpoints using least squares.
-    float alpha2_sum = 0.0f;
-    float beta2_sum = 0.0f;
-    float alphabeta_sum = 0.0f;
-    float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
-    float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f);
-
-    // Compute alpha & beta for this permutation.
-    for (int i = 0; i < 16; i++)
-    {
-        const uint bits = permutation >> (2*i);
-
-        float beta = (bits & 1);
-        if (bits & 2) beta = 0.5f;
-        float alpha = 1.0f - beta;
-
-        alpha2_sum += alpha * alpha * weights[i];
-        beta2_sum += beta * beta * weights[i];
-        alphabeta_sum += alpha * beta * weights[i];
-        alphax_sum += alpha * colors[i] * weights[i];
-        betax_sum += beta * colors[i] * weights[i];
-    }
-
-    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-
-    float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
-    float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
-
-    // Round a, b to the closest 5-6-5 color and expand...
-    a = roundAndExpand(a, start);
-    b = roundAndExpand(b, end);
-
-    // compute the error
-    float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
-
-    return dot(e, kColorMetric);
-}
+/*
+__device__ float evalPermutation3(const float3 * colors, const float * weights, uint permutation, ushort * start, ushort * end)
+{
+    // Compute endpoints using least squares.
+    float alpha2_sum = 0.0f;
+    float beta2_sum = 0.0f;
+    float alphabeta_sum = 0.0f;
+    float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
+
+    // Compute alpha & beta for this permutation.
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
+
+        float beta = (bits & 1);
+        if (bits & 2) beta = 0.5f;
+        float alpha = 1.0f - beta;
+
+        alpha2_sum += alpha * alpha * weights[i];
+        beta2_sum += beta * beta * weights[i];
+        alphabeta_sum += alpha * beta * weights[i];
+        alphax_sum += alpha * colors[i];
+    }
+
+    float3 betax_sum = color_sum - alphax_sum;
+
+    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+
+    float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
+    float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
+
+    // Round a, b to the closest 5-6-5 color and expand...
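Note: roundAndExpand() is called by all of the evaluators but is defined outside this diff. Conceptually it snaps a least-squares endpoint to the nearest representable 5-6-5 color, writes the packed 16-bit endpoint, and returns the expanded (quantized) color so the error term measures what the decoder will actually reconstruct. A self-contained host-side approximation, with the caveat that the real device function may round and expand differently:

// Hypothetical host-side stand-in for roundAndExpand(); illustrative only.
#include <cmath>
#include <cstdio>

struct Color3 { float x, y, z; };

static Color3 roundAndExpand565(Color3 c, unsigned short * packed)
{
    const int r = (int)std::lround(std::fmin(std::fmax(c.x, 0.0f), 1.0f) * 31.0f);
    const int g = (int)std::lround(std::fmin(std::fmax(c.y, 0.0f), 1.0f) * 63.0f);
    const int b = (int)std::lround(std::fmin(std::fmax(c.z, 0.0f), 1.0f) * 31.0f);

    *packed = (unsigned short)((r << 11) | (g << 5) | b);   // packed 5-6-5 endpoint

    const Color3 expanded = { r * (1.0f / 31.0f), g * (1.0f / 63.0f), b * (1.0f / 31.0f) };
    return expanded;                                        // quantized color used in the error term
}

int main()
{
    unsigned short endpoint = 0;
    const Color3 c = { 0.40f, 0.55f, 0.70f };
    const Color3 e = roundAndExpand565(c, &endpoint);
    printf("packed 0x%04x, expanded %.3f %.3f %.3f\n", endpoint, e.x, e.y, e.z);
    return 0;
}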
+    a = roundAndExpand(a, start);
+    b = roundAndExpand(b, end);
+
+    // compute the error
+    float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+
+    return dot(e, kColorMetric);
+}
+*/
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -454,7 +460,7 @@ __device__ void evalAllPermutations(const float3 * colors, float3 colorSum, cons
         uint permutation = permutations[pidx];
         if (pidx < 160) s_permutations[pidx] = permutation;
 
-        float error = evalPermutation4(colors, permutation, &start, &end, colorSum);
+        float error = evalPermutation4(colors, colorSum, permutation, &start, &end);
 
         if (error < bestError)
         {
@@ -478,7 +484,7 @@ __device__ void evalAllPermutations(const float3 * colors, float3 colorSum, cons
         ushort start, end;
         uint permutation = s_permutations[pidx];
 
-        float error = evalPermutation3(colors, permutation, &start, &end, colorSum);
+        float error = evalPermutation3(colors, colorSum, permutation, &start, &end);
 
         if (error < bestError)
         {
@@ -498,6 +504,7 @@ __device__ void evalAllPermutations(const float3 * colors, float3 colorSum, cons
     errors[idx] = bestError;
 }
 
+/*
 __device__ void evalAllPermutations(const float3 * colors, const float * weights, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
 {
     const int idx = threadIdx.x;
@@ -558,9 +565,9 @@ __device__ void evalAllPermutations(const float3 * colors, const float * weights
     errors[idx] = bestError;
 }
+*/
-
 
-__device__ void evalLevel4Permutations(const float3 * colors, const float * weights, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
+__device__ void evalLevel4Permutations(const float3 * colors, const float * weights, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
 {
     const int idx = threadIdx.x;
@@ -574,7 +581,7 @@ __device__ void evalLevel4Permutations(const float3 * colors, const float * weig
         ushort start, end;
         uint permutation = permutations[pidx];
 
-        float error = evalPermutation4(colors, weights, permutation, &start, &end);
+        float error = evalPermutation4(colors, weights, colorSum, permutation, &start, &end);
 
         if (error < bestError)
         {
@@ -755,7 +762,7 @@ __global__ void compressWeighted(const uint * permutations, const uint * image,
     __shared__ float errors[NUM_THREADS];
 
-    evalLevel4Permutations(colors, weights, permutations, bestStart, bestEnd, bestPermutation, errors);
+    evalLevel4Permutations(colors, weights, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
 
     // Use a parallel reduction to find minimum error.
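Note: findMinError() is also defined outside this diff. The comment above refers to a shared-memory argmin reduction over the per-thread errors[] array; a hypothetical version, assuming NUM_THREADS is a power of two, might look like the sketch below. The actual kernel code may differ.

// Hypothetical sketch of a shared-memory argmin reduction; not the real findMinError().
#define NUM_THREADS 64      // assumption: one thread per candidate slot, power of two

__device__ int findMinErrorSketch(float * errors)
{
    const int idx = threadIdx.x;

    __shared__ int indices[NUM_THREADS];
    indices[idx] = idx;
    __syncthreads();

    for (int d = NUM_THREADS / 2; d > 0; d >>= 1)
    {
        if (idx < d)
        {
            // Keep the index of the smaller error.
            if (errors[indices[idx + d]] < errors[indices[idx]])
            {
                indices[idx] = indices[idx + d];
            }
        }
        __syncthreads();
    }

    // Every thread returns the index of the smallest error.
    return indices[0];
}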
     int minIdx = findMinError(errors);
@@ -768,6 +775,7 @@ __global__ void compressWeighted(const uint * permutations, const uint * image,
 }
 
 
+/*
 __device__ float computeError(const float weights[16], uchar a0, uchar a1)
 {
     float palette[6];
@@ -803,7 +811,7 @@ inline __device__ uchar roundAndExpand(float a)
 {
     return rintf(__saturatef(a) * 255.0f);
 }
-
+*/
 
 /*
 __device__ void optimizeAlpha8(const float alphas[16], uchar & a0, uchar & a1)
 {
@@ -838,7 +846,7 @@ __device__ void optimizeAlpha8(const float alphas[16], uchar & a0, uchar & a1)
     a1 = roundAndExpand(b);
 }
 */
-
+/*
 __device__ void compressAlpha(const float alphas[16], uint4 * result)
 {
     const int tid = threadIdx.x;
@@ -932,7 +940,7 @@ __global__ void compressDXT5(const uint * permutations, const uint * image, uint
     __shared__ float errors[NUM_THREADS];
 
-    evalLevel4Permutations(colors, weights, permutations, bestStart, bestEnd, bestPermutation, errors);
+    evalLevel4Permutations(colors, weights, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
 
     // Use a parallel reduction to find minimum error.
     int minIdx = findMinError(errors);
@@ -943,7 +951,7 @@ __global__ void compressDXT5(const uint * permutations, const uint * image, uint
             saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, (uint2 *)result);
         }
     }
-
+*/
 
 ////////////////////////////////////////////////////////////////////////////////
 // Setup kernel
diff --git a/src/nvimage/nvtt/cuda/CudaCompressDXT.cpp b/src/nvimage/nvtt/cuda/CudaCompressDXT.cpp
index d0df912..d652b1a 100644
--- a/src/nvimage/nvtt/cuda/CudaCompressDXT.cpp
+++ b/src/nvimage/nvtt/cuda/CudaCompressDXT.cpp
@@ -29,10 +29,12 @@
 #include
 #include
 #include
+#include
 
 #include "CudaCompressDXT.h"
 #include "CudaUtils.h"
 
+
 #if defined HAVE_CUDA
 #include
 #endif
@@ -186,6 +188,28 @@ static void doPrecomputation()
 #endif
 
 
+// Convert linear image to block linear.
+static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
+{
+    const uint w = (image->width() + 3) / 4;
+    const uint h = (image->height() + 3) / 4;
+
+    for(uint by = 0; by < h; by++) {
+        for(uint bx = 0; bx < w; bx++) {
+            const uint bw = min(image->width() - bx * 4, 4U);
+            const uint bh = min(image->height() - by * 4, 4U);
+
+            for (uint i = 0; i < 16; i++) {
+                const int x = (i % 4) % bw;
+                const int y = (i / 4) % bh;
+                blockLinearImage[(by * w + bx) * 16 + i] = image->pixel(bx * 4 + x, by * 4 + y).u;
+            }
+        }
+    }
+}
+
+// @@ This code is very repetitive and needs to be cleaned up.
+
 /// Compress image using CUDA.
 void nv::cudaCompressDXT1(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
@@ -201,24 +225,11 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions & outputOptio
     uint imageSize = w * h * 16 * sizeof(Color32);
     uint * blockLinearImage = (uint *) malloc(imageSize);
-
-    // Convert linear image to block linear.
-    for(uint by = 0; by < h; by++) {
-        for(uint bx = 0; bx < w; bx++) {
-            const uint bw = min(image->width() - bx * 4, 4U);
-            const uint bh = min(image->height() - by * 4, 4U);
-
-            for (uint i = 0; i < 16; i++) {
-                const int x = (i % 4) % bw;
-                const int y = (i / 4) % bh;
-                blockLinearImage[(by * w + bx) * 16 + i] = image->pixel(bx * 4 + x, by * 4 + y).u;
-            }
-        }
-    }
+    convertToBlockLinear(image, blockLinearImage);
 
     const uint blockNum = w * h;
     const uint compressedSize = blockNum * 8;
-    const uint blockMax = 32768; // 65535
+    const uint blockMax = 32768; // 49152, 65535
 
     // Allocate image in device memory.
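Note: convertToBlockLinear(), factored out above, handles images whose dimensions are not multiples of 4 by wrapping the local coordinates with bw and bh, so edge texels repeat and every uploaded 4x4 block is fully populated. A plain C++ sketch (arrays stand in for nv::Image, which is not reproduced here) that prints the layout of the partial block of a 6x4 image:

// Standalone illustration of the block-linear layout for a 6x4 image; not part of the patch.
#include <algorithm>
#include <cstdio>

int main()
{
    const unsigned width = 6, height = 4;
    unsigned pixels[height][width];                 // stand-in for image->pixel(x, y).u
    for (unsigned y = 0; y < height; y++)
        for (unsigned x = 0; x < width; x++)
            pixels[y][x] = y * 100 + x;             // readable fake pixel values

    const unsigned w = (width + 3) / 4;
    const unsigned h = (height + 3) / 4;
    unsigned blockLinear[w * h * 16];

    for (unsigned by = 0; by < h; by++) {
        for (unsigned bx = 0; bx < w; bx++) {
            const unsigned bw = std::min(width - bx * 4, 4u);
            const unsigned bh = std::min(height - by * 4, 4u);

            for (unsigned i = 0; i < 16; i++) {
                const unsigned x = (i % 4) % bw;    // wraps, so edge columns repeat
                const unsigned y = (i / 4) % bh;    // wraps, so edge rows repeat
                blockLinear[(by * w + bx) * 16 + i] = pixels[by * 4 + y][bx * 4 + x];
            }
        }
    }

    // Print the 16 samples of the partial block (bx = 1, bw = 2).
    for (unsigned i = 0; i < 16; i++)
        printf("%u%c", blockLinear[16 + i], (i % 4 == 3) ? '\n' : ' ');
    return 0;
}

For the second block column bw is 2, so source columns 4 and 5 repeat as 4, 5, 4, 5 on every row of the block.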
     uint * d_data = NULL;
@@ -283,6 +294,201 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions & outputOptio
 }
 
 
+/// Compress image using CUDA.
+void nv::cudaCompressDXT3(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+    nvDebugCheck(cuda::isHardwarePresent());
+#if defined HAVE_CUDA
+
+    doPrecomputation();
+
+    // Image size in blocks.
+    const uint w = (image->width() + 3) / 4;
+    const uint h = (image->height() + 3) / 4;
+
+    uint imageSize = w * h * 16 * sizeof(Color32);
+    uint * blockLinearImage = (uint *) malloc(imageSize);
+    convertToBlockLinear(image, blockLinearImage);
+
+    const uint blockNum = w * h;
+    const uint compressedSize = blockNum * 8;
+    const uint blockMax = 32768; // 49152, 65535
+
+    // Allocate image in device memory.
+    uint * d_data = NULL;
+    cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
+
+    // Allocate result.
+    uint * d_result = NULL;
+    cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
+
+    AlphaBlockDXT3 * alphaBlocks = NULL;
+    alphaBlocks = (AlphaBlockDXT3 *)malloc(min(compressedSize, blockMax * 8U));
+
+    setupCompressKernel(compressionOptions.colorWeight.ptr());
+
+    clock_t start = clock();
+
+    uint bn = 0;
+    while(bn != blockNum)
+    {
+        uint count = min(blockNum - bn, blockMax);
+
+        cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+
+        // Launch kernel.
+        compressWeightedKernel(count, d_data, d_result, d_bitmaps);
+
+        // Compress alpha in parallel with the GPU.
+        for (uint i = 0; i < count; i++)
+        {
+            ColorBlock rgba(blockLinearImage + (bn + i) * 16);
+            compressBlock(rgba, alphaBlocks + i);
+        }
+
+        // Check for errors.
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
+
+            if (outputOptions.errorHandler != NULL)
+            {
+                outputOptions.errorHandler->error(Error_CudaError);
+            }
+        }
+
+        // Copy result to host, overwrite swizzled image.
+        cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
+
+        // Output result.
+        if (outputOptions.outputHandler != NULL)
+        {
+            for (uint i = 0; i < count; i++)
+            {
+                outputOptions.outputHandler->writeData(alphaBlocks + i, 8);
+                outputOptions.outputHandler->writeData(blockLinearImage + i * 2, 8);
+            }
+        }
+
+        bn += count;
+    }
+
+    clock_t end = clock();
+    printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
+
+    free(alphaBlocks);
+    free(blockLinearImage);
+    cudaFree(d_data);
+    cudaFree(d_result);
+
+#else
+    if (outputOptions.errorHandler != NULL)
+    {
+        outputOptions.errorHandler->error(Error_CudaError);
+    }
+#endif
+}
+
+
+/// Compress image using CUDA.
+void nv::cudaCompressDXT5(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+    nvDebugCheck(cuda::isHardwarePresent());
+#if defined HAVE_CUDA
+
+    doPrecomputation();
+
+    // Image size in blocks.
+    const uint w = (image->width() + 3) / 4;
+    const uint h = (image->height() + 3) / 4;
+
+    uint imageSize = w * h * 16 * sizeof(Color32);
+    uint * blockLinearImage = (uint *) malloc(imageSize);
+    convertToBlockLinear(image, blockLinearImage);
+
+    const uint blockNum = w * h;
+    const uint compressedSize = blockNum * 8;
+    const uint blockMax = 32768; // 49152, 65535
+
+    // Allocate image in device memory.
+    uint * d_data = NULL;
+    cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
+
+    // Allocate result.
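Note: the buffer sizing here mirrors cudaCompressDXT1: each 4x4 block uploads 16 RGBA8 texels (64 bytes) and the kernel writes back one 8-byte color block, so the per-chunk device allocations are capped at blockMax * 64 and blockMax * 8 bytes. A small illustration of that arithmetic (not part of the patch):

// Size arithmetic behind min(imageSize, blockMax * 64U) and min(compressedSize, blockMax * 8U).
#include <cstdio>

int main()
{
    const unsigned blockMax = 32768;
    const unsigned inputPerBlock  = 16 * 4;     // 16 RGBA8 texels, 4 bytes each
    const unsigned outputPerBlock = 8;          // one 8-byte color block per 4x4 block

    printf("max input chunk:  %u KiB\n", blockMax * inputPerBlock  / 1024);   // 2048 KiB
    printf("max output chunk: %u KiB\n", blockMax * outputPerBlock / 1024);   // 256 KiB
    return 0;
}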
+    uint * d_result = NULL;
+    cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
+
+    AlphaBlockDXT5 * alphaBlocks = NULL;
+    alphaBlocks = (AlphaBlockDXT5 *)malloc(min(compressedSize, blockMax * 8U));
+
+    setupCompressKernel(compressionOptions.colorWeight.ptr());
+
+    clock_t start = clock();
+
+    uint bn = 0;
+    while(bn != blockNum)
+    {
+        uint count = min(blockNum - bn, blockMax);
+
+        cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+
+        // Launch kernel.
+        compressWeightedKernel(count, d_data, d_result, d_bitmaps);
+
+        // Compress alpha in parallel with the GPU.
+        for (uint i = 0; i < count; i++)
+        {
+            ColorBlock rgba(blockLinearImage + (bn + i) * 16);
+            compressBlock_Iterative(rgba, alphaBlocks + i);
+        }
+
+        // Check for errors.
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
+
+            if (outputOptions.errorHandler != NULL)
+            {
+                outputOptions.errorHandler->error(Error_CudaError);
+            }
+        }
+
+        // Copy result to host, overwrite swizzled image.
+        cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
+
+        // Output result.
+        if (outputOptions.outputHandler != NULL)
+        {
+            for (uint i = 0; i < count; i++)
+            {
+                outputOptions.outputHandler->writeData(alphaBlocks + i, 8);
+                outputOptions.outputHandler->writeData(blockLinearImage + i * 2, 8);
+            }
+        }
+
+        bn += count;
+    }
+
+    clock_t end = clock();
+    printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
+
+    free(alphaBlocks);
+    free(blockLinearImage);
+    cudaFree(d_data);
+    cudaFree(d_result);
+
+#else
+    if (outputOptions.errorHandler != NULL)
+    {
+        outputOptions.errorHandler->error(Error_CudaError);
+    }
+#endif
+}
+
+
+
 #if defined HAVE_CUDA
 
 class Task
@@ -429,7 +635,7 @@ void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions & outputOpt
     const uint h = image->height();
 
     const uint blockNum = ((w + 3) / 4) * ((h + 3) / 4);
-    const uint blockMax = 32768; // 65535
+    const uint blockMax = 32768; // 49152, 65535
 
     doPrecomputation();
diff --git a/src/nvimage/nvtt/cuda/CudaCompressDXT.h b/src/nvimage/nvtt/cuda/CudaCompressDXT.h
index 17d8c01..b88cc5c 100644
--- a/src/nvimage/nvtt/cuda/CudaCompressDXT.h
+++ b/src/nvimage/nvtt/cuda/CudaCompressDXT.h
@@ -32,6 +32,9 @@ namespace nv
     class Image;
 
     void cudaCompressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+    void cudaCompressDXT3(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+    void cudaCompressDXT5(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+
     void cudaCompressDXT1_2(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
 
 } // nv namespace
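Note: in the two new compressors every emitted DXT3/DXT5 block is 16 bytes, the 8-byte alpha half compressed on the CPU (compressBlock / compressBlock_Iterative) followed by the 8-byte color half produced by compressWeightedKernel; that is what the pair of writeData(..., 8) calls in the output loop assembles. An illustrative layout, using example structs rather than the real nvimage types:

// Example layouts mirroring the 8 + 8 byte split; the real AlphaBlockDXT3,
// AlphaBlockDXT5 and BlockDXT1 types live in nvimage and may be defined differently.
#include <cstdint>

struct ExampleAlphaBlockDXT3 {      // sixteen explicit 4-bit alpha values
    uint64_t alphas;
};

struct ExampleAlphaBlockDXT5 {      // two 8-bit endpoints + sixteen 3-bit indices
    uint8_t alpha0;
    uint8_t alpha1;
    uint8_t indices[6];
};

struct ExampleColorBlockDXT1 {      // two 5-6-5 endpoints + sixteen 2-bit indices
    uint16_t color0;
    uint16_t color1;
    uint32_t indices;
};

static_assert(sizeof(ExampleAlphaBlockDXT3) == 8, "alpha half is 8 bytes");
static_assert(sizeof(ExampleAlphaBlockDXT5) == 8, "alpha half is 8 bytes");
static_assert(sizeof(ExampleColorBlockDXT1) == 8, "color half is 8 bytes");
// A full DXT3/DXT5 block is the alpha half immediately followed by the color half.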