DXT1a single color compressor. Fixes issue 131.

Init single color tables at startup.
2010-07-22 09:23:17 +00:00 · 2010-07-22 09:23:17 +00:00 · ac7c017c35
commit ac7c017c35
parent da548fd03a
12 changed files with 1384 additions and 1853 deletions
--- a/project/vc9/nvtt.sln
+++ b/project/vc9/nvtt.sln
@ -6,6 +6,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvtt", "nvtt\nvtt.vcproj",
 		{CE017322-01FC-4851-9C8B-64E9A8E26C38} = {CE017322-01FC-4851-9C8B-64E9A8E26C38}
 		{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
 		{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
+		{C33787E3-5564-4834-9FE3-A9020455A669} = {C33787E3-5564-4834-9FE3-A9020455A669}
 		{50C465FE-B308-42BC-894D-89484482AF06} = {50C465FE-B308-42BC-894D-89484482AF06}
 	EndProjectSection
 EndProject
@ -80,6 +81,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "imperativeapi", "imperative
 		{1AEB7681-57D8-48EE-813D-5C41CC38B647} = {1AEB7681-57D8-48EE-813D-5C41CC38B647}
 	EndProjectSection
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bc6h", "bc6h\bc6h.vcproj", "{C33787E3-5564-4834-9FE3-A9020455A669}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug (no cuda)|Mixed Platforms = Debug (no cuda)|Mixed Platforms
@ -426,6 +429,30 @@ Global
 		{7DCF280E-702B-49F3-84A7-AE7E146384D6}.Release|Win32.Build.0 = Release|Win32
 		{7DCF280E-702B-49F3-84A7-AE7E146384D6}.Release|x64.ActiveCfg = Release|x64
 		{7DCF280E-702B-49F3-84A7-AE7E146384D6}.Release|x64.Build.0 = Release|x64
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Debug (no cuda)|Mixed Platforms.ActiveCfg = Debug|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Debug (no cuda)|Mixed Platforms.Build.0 = Debug|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Debug (no cuda)|Win32.ActiveCfg = Debug|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Debug (no cuda)|Win32.Build.0 = Debug|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Debug (no cuda)|x64.ActiveCfg = Debug|x64
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Debug (no cuda)|x64.Build.0 = Debug|x64
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Debug|Win32.ActiveCfg = Debug|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Debug|Win32.Build.0 = Debug|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Debug|x64.ActiveCfg = Debug|x64
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Debug|x64.Build.0 = Debug|x64
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Release (no cuda)|Mixed Platforms.ActiveCfg = Release|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Release (no cuda)|Mixed Platforms.Build.0 = Release|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Release (no cuda)|Win32.ActiveCfg = Release|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Release (no cuda)|Win32.Build.0 = Release|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Release (no cuda)|x64.ActiveCfg = Release|x64
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Release (no cuda)|x64.Build.0 = Release|x64
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Release|Mixed Platforms.Build.0 = Release|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Release|Win32.ActiveCfg = Release|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Release|Win32.Build.0 = Release|Win32
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Release|x64.ActiveCfg = Release|x64
+		{C33787E3-5564-4834-9FE3-A9020455A669}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/project/vc9/nvtt/nvtt.vcproj
+++ b/project/vc9/nvtt/nvtt.vcproj
@ -770,6 +770,10 @@
 				RelativePath="..\..\..\src\nvtt\CompressorDX11.cpp"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\src\nvtt\CompressorDX9.cpp"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\src\nvtt\CompressorDXT.cpp"
 				>
@ -886,6 +890,10 @@
 				RelativePath="..\..\..\src\nvtt\QuickCompressDXT.cpp"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\src\nvtt\SingleColorLookup.cpp"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\src\nvtt\TexImage.cpp"
 				>
@ -916,6 +924,10 @@
 				RelativePath="..\..\..\src\nvtt\CompressorDX11.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\src\nvtt\CompressorDX9.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\src\nvtt\CompressorDXT.h"
 				>
--- a/project/vc9/squish/squish.vcproj
+++ b/project/vc9/squish/squish.vcproj
@ -281,86 +281,58 @@
 	<References>
 	</References>
 	<Files>
-		<Filter
-			Name="Source Files"
-			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
-			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\colourblock.cpp"
 			>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\clusterfit.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\colourblock.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\colourfit.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\colourset.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\fastclusterfit.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\maths.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\weightedclusterfit.cpp"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="Header Files"
-			Filter="h;hpp;hxx;hm;inl;inc;xsd"
-			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\colourblock.h"
 			>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\clusterfit.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\colourblock.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\colourfit.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\colourset.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\fastclusterfit.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\maths.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\simd.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\simd_sse.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\simd_ve.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\src\nvtt\squish\weightedclusterfit.h"
-				>
-			</File>
-		</Filter>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\colourfit.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\colourfit.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\colourset.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\colourset.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\maths.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\maths.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\simd.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\simd_sse.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\simd_ve.h"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\weightedclusterfit.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\..\..\src\nvtt\squish\weightedclusterfit.h"
+			>
+		</File>
 	</Files>
 	<Globals>
 	</Globals>
--- a/src/nvtt/CompressorDX9.cpp
+++ b/src/nvtt/CompressorDX9.cpp
@ -29,7 +29,6 @@

 // squish
 #include "squish/colourset.h"
-#include "squish/fastclusterfit.h"
 #include "squish/weightedclusterfit.h"

 #include "nvtt.h"
@ -130,21 +129,18 @@ void NormalCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alph

 void NormalCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
-	bool anyAlpha = false;
-	bool allAlpha = true;
-		
+    uint alphaMask = 0;
 	for (uint i = 0; i < 16; i++)
 	{
-		if (rgba.color(i).a < 128) anyAlpha = true;
-		else allAlpha = false;
+		if (rgba.color(i).a < 128) alphaMask |= (3 << (i * 2)); // Set both index bits for each texel with alpha < 128.
 	}

 	const bool isSingleColor = rgba.isSingleColor();
 		
-	if ((!anyAlpha && isSingleColor || allAlpha))
+	if (isSingleColor)
 	{
 		BlockDXT1 * block = new(output) BlockDXT1;
-		OptimalCompress::compressDXT1a(rgba.color(0), block);
+        OptimalCompress::compressDXT1a(rgba.color(0), alphaMask, block);
 	}
 	else
 	{
--- a/src/nvtt/OptimalCompressDXT.cpp
+++ b/src/nvtt/OptimalCompressDXT.cpp
@ -266,18 +266,27 @@ void OptimalCompress::compressDXT1(Color32 c, BlockDXT1 * dxtBlock)
 	}
 }

-void OptimalCompress::compressDXT1a(Color32 rgba, BlockDXT1 * dxtBlock)
+void OptimalCompress::compressDXT1a(Color32 c, uint alphaMask, BlockDXT1 * dxtBlock)
 {
-	if (rgba.a < 128)
-	{
-		dxtBlock->col0.u = 0;
-		dxtBlock->col1.u = 0;
-		dxtBlock->indices = 0xFFFFFFFF;
-	}
-	else
-	{
-		compressDXT1(rgba, dxtBlock);
-	}
+    if (alphaMask == 0) {
+        compressDXT1(c, dxtBlock);
+    }
+    else {
+        dxtBlock->col0.r = OMatchAlpha5[c.r][0];
+        dxtBlock->col0.g = OMatchAlpha6[c.g][0];
+        dxtBlock->col0.b = OMatchAlpha5[c.b][0];
+        dxtBlock->col1.r = OMatchAlpha5[c.r][1];
+        dxtBlock->col1.g = OMatchAlpha6[c.g][1];
+        dxtBlock->col1.b = OMatchAlpha5[c.b][1];
+        dxtBlock->indices = 0xaaaaaaaa; // 0b1010..1010
+
+        if (dxtBlock->col0.u > dxtBlock->col1.u)
+        {
+	        swap(dxtBlock->col0.u, dxtBlock->col1.u);
+        }
+
+        dxtBlock->indices |= alphaMask;
+    }
 }

 void OptimalCompress::compressDXT1G(uint8 g, BlockDXT1 * dxtBlock)
--- a/src/nvtt/OptimalCompressDXT.h
+++ b/src/nvtt/OptimalCompressDXT.h
@ -40,7 +40,7 @@ namespace nv
 	namespace OptimalCompress
 	{
 		void compressDXT1(Color32 rgba, BlockDXT1 * dxtBlock);
-		void compressDXT1a(Color32 rgba, BlockDXT1 * dxtBlock);
+		void compressDXT1a(Color32 rgba, uint alphaMask, BlockDXT1 * dxtBlock);
 		void compressDXT1G(uint8 g, BlockDXT1 * dxtBlock);
 		
 		void compressDXT1G(const ColorBlock & rgba, BlockDXT1 * block);
--- a/src/nvtt/SingleColorLookup.cpp
+++ b/src/nvtt/SingleColorLookup.cpp
@ -0,0 +1,90 @@
+
+#include "SingleColorLookup.h"
+
+#include "nvcore/Debug.h"
+
+#include <stdlib.h> // abs
+
+// Globals
+uint8 OMatch5[256][2];
+uint8 OMatch6[256][2];
+uint8 OMatchAlpha5[256][2];
+uint8 OMatchAlpha6[256][2];
+
+
+
+static int Mul8Bit(int a, int b)
+{
+	int t = a * b + 128;
+	return (t + (t >> 8)) >> 8;
+}
+
+static inline int Lerp13(int a, int b)
+{
+#ifdef DXT_USE_ROUNDING_BIAS
+    // with rounding bias
+    return a + Mul8Bit(b-a, 0x55);
+#else
+    // without rounding bias
+    // Replace "/ 3" with "* 0xaaab) >> 17" if the compiler does not optimize the division or you really need every ounce of speed.
+    return (a * 2 + b) / 3;
+#endif
+}
+
+static void PrepareOptTable(uint8 * table, const uint8 * expand, int size, bool alpha_mode)
+{
+	for (int i = 0; i < 256; i++)
+	{
+		int bestErr = 256 * 100;
+
+		for (int min = 0; min < size; min++)
+		{
+			for (int max = 0; max < size; max++)
+			{
+				int mine = expand[min];
+				int maxe = expand[max];
+
+				int err;
+                if (alpha_mode) err = abs((maxe + mine)/2 - i);
+                else err = abs(Lerp13(maxe, mine) - i);
+                err *= 100;
+
+                // DX10 spec says that interpolation must be within 3% of "correct" result,
+                // add this as error term. (normally we'd expect a random distribution of
+                // +-1.5% error, but nowhere in the spec does it say that the error has to be
+                // unbiased - better safe than sorry).
+				err += abs(max - min) * 3;
+
+				if (err < bestErr)
+				{
+					table[i*2+0] = max;
+					table[i*2+1] = min;
+					bestErr = err;
+				}
+			}
+		}
+	}
+}
+
+
+NV_AT_STARTUP(initSingleColorLookup());
+
+void initSingleColorLookup()
+{
+	uint8 expand5[32];
+	uint8 expand6[64];
+
+    for (int i = 0; i < 32; i++) {
+		expand5[i] = (i<<3) | (i>>2);
+    }
+
+    for (int i = 0; i < 64; i++) {
+		expand6[i] = (i<<2) | (i>>4);
+    }
+
+	PrepareOptTable(&OMatch5[0][0], expand5, 32, false);
+	PrepareOptTable(&OMatch6[0][0], expand6, 64, false);
+    PrepareOptTable(&OMatchAlpha5[0][0], expand5, 32, true);
+	PrepareOptTable(&OMatchAlpha6[0][0], expand6, 64, true);
+};
+
--- a/src/nvtt/SingleColorLookup.h
+++ b/src/nvtt/SingleColorLookup.h
@ -1,588 +1,9 @@

-/*
-typedef unsigned char uint8;
+#include "nvcore/nvcore.h" // uint8

-static int Mul8Bit(int a, int b)
-{
-	int t = a * b + 128;
-	return (t + (t >> 8)) >> 8;
-}
-
-static inline int Lerp13(int fm, int to)
-{
-	return (fm * 2 + to) / 3;
-}
-
-static void PrepareOptTable(uint8 * Table, const uint8 * expand, int size)
-{
-	for (int i = 0; i < 256; i++)
-	{
-		float bestErr = 256;
-
-		for (int min = 0; min < size; min++)
-		{
-			for (int max = 0; max < size; max++)
-			{
-				int mine = expand[min];
-				int maxe = expand[max];
-				float err = abs(maxe + Mul8Bit(mine-maxe, 0x55) - i);
-				err += 0.03f * abs(max - min);
-
-				if (err < bestErr)
-				{
-					Table[i*2+0] = max;
-					Table[i*2+1] = min;
-					bestErr = err;
-				}
-			}
-		}
-	}
-}
-
-
-void initTables()
-{
-	uint8 Expand5[32];
-	uint8 Expand6[64];
-
-	for(sInt i=0;i<32;i++)
-		Expand5[i] = (i<<3)|(i>>2);
-
-	for(sInt i=0;i<64;i++)
-		Expand6[i] = (i<<2)|(i>>4);
-
-	PrepareOptTable(OMatch5, Expand5, 32)
-	PrepareOptTable(OMatch6, Expand6, 64)
-};
-*/
-
-#if __CUDACC__
-__constant__ unsigned short
-#else
-const static uint8 
-#endif
-OMatch5[256][2] =
-{
-	{0x00, 0x00},
-	{0x00, 0x00},
-	{0x00, 0x01},
-	{0x00, 0x01},
-	{0x01, 0x00},
-	{0x01, 0x00},
-	{0x01, 0x00},
-	{0x01, 0x01},
-	{0x01, 0x01},
-	{0x01, 0x01},
-	{0x01, 0x02},
-	{0x00, 0x04},
-	{0x02, 0x01},
-	{0x02, 0x01},
-	{0x02, 0x01},
-	{0x02, 0x02},
-	{0x02, 0x02},
-	{0x02, 0x02},
-	{0x02, 0x03},
-	{0x01, 0x05},
-	{0x03, 0x02},
-	{0x03, 0x02},
-	{0x04, 0x00},
-	{0x03, 0x03},
-	{0x03, 0x03},
-	{0x03, 0x03},
-	{0x03, 0x04},
-	{0x03, 0x04},
-	{0x03, 0x04},
-	{0x03, 0x05},
-	{0x04, 0x03},
-	{0x04, 0x03},
-	{0x05, 0x02},
-	{0x04, 0x04},
-	{0x04, 0x04},
-	{0x04, 0x05},
-	{0x04, 0x05},
-	{0x05, 0x04},
-	{0x05, 0x04},
-	{0x05, 0x04},
-	{0x06, 0x03},
-	{0x05, 0x05},
-	{0x05, 0x05},
-	{0x05, 0x06},
-	{0x04, 0x08},
-	{0x06, 0x05},
-	{0x06, 0x05},
-	{0x06, 0x05},
-	{0x06, 0x06},
-	{0x06, 0x06},
-	{0x06, 0x06},
-	{0x06, 0x07},
-	{0x05, 0x09},
-	{0x07, 0x06},
-	{0x07, 0x06},
-	{0x08, 0x04},
-	{0x07, 0x07},
-	{0x07, 0x07},
-	{0x07, 0x07},
-	{0x07, 0x08},
-	{0x07, 0x08},
-	{0x07, 0x08},
-	{0x07, 0x09},
-	{0x08, 0x07},
-	{0x08, 0x07},
-	{0x09, 0x06},
-	{0x08, 0x08},
-	{0x08, 0x08},
-	{0x08, 0x09},
-	{0x08, 0x09},
-	{0x09, 0x08},
-	{0x09, 0x08},
-	{0x09, 0x08},
-	{0x0A, 0x07},
-	{0x09, 0x09},
-	{0x09, 0x09},
-	{0x09, 0x0A},
-	{0x08, 0x0C},
-	{0x0A, 0x09},
-	{0x0A, 0x09},
-	{0x0A, 0x09},
-	{0x0A, 0x0A},
-	{0x0A, 0x0A},
-	{0x0A, 0x0A},
-	{0x0A, 0x0B},
-	{0x09, 0x0D},
-	{0x0B, 0x0A},
-	{0x0B, 0x0A},
-	{0x0C, 0x08},
-	{0x0B, 0x0B},
-	{0x0B, 0x0B},
-	{0x0B, 0x0B},
-	{0x0B, 0x0C},
-	{0x0B, 0x0C},
-	{0x0B, 0x0C},
-	{0x0B, 0x0D},
-	{0x0C, 0x0B},
-	{0x0C, 0x0B},
-	{0x0D, 0x0A},
-	{0x0C, 0x0C},
-	{0x0C, 0x0C},
-	{0x0C, 0x0D},
-	{0x0C, 0x0D},
-	{0x0D, 0x0C},
-	{0x0D, 0x0C},
-	{0x0D, 0x0C},
-	{0x0E, 0x0B},
-	{0x0D, 0x0D},
-	{0x0D, 0x0D},
-	{0x0D, 0x0E},
-	{0x0C, 0x10},
-	{0x0E, 0x0D},
-	{0x0E, 0x0D},
-	{0x0E, 0x0D},
-	{0x0E, 0x0E},
-	{0x0E, 0x0E},
-	{0x0E, 0x0E},
-	{0x0E, 0x0F},
-	{0x0D, 0x11},
-	{0x0F, 0x0E},
-	{0x0F, 0x0E},
-	{0x10, 0x0C},
-	{0x0F, 0x0F},
-	{0x0F, 0x0F},
-	{0x0F, 0x0F},
-	{0x0F, 0x10},
-	{0x0F, 0x10},
-	{0x0F, 0x10},
-	{0x0F, 0x11},
-	{0x10, 0x0F},
-	{0x10, 0x0F},
-	{0x11, 0x0E},
-	{0x10, 0x10},
-	{0x10, 0x10},
-	{0x10, 0x11},
-	{0x10, 0x11},
-	{0x11, 0x10},
-	{0x11, 0x10},
-	{0x11, 0x10},
-	{0x12, 0x0F},
-	{0x11, 0x11},
-	{0x11, 0x11},
-	{0x11, 0x12},
-	{0x10, 0x14},
-	{0x12, 0x11},
-	{0x12, 0x11},
-	{0x12, 0x11},
-	{0x12, 0x12},
-	{0x12, 0x12},
-	{0x12, 0x12},
-	{0x12, 0x13},
-	{0x11, 0x15},
-	{0x13, 0x12},
-	{0x13, 0x12},
-	{0x14, 0x10},
-	{0x13, 0x13},
-	{0x13, 0x13},
-	{0x13, 0x13},
-	{0x13, 0x14},
-	{0x13, 0x14},
-	{0x13, 0x14},
-	{0x13, 0x15},
-	{0x14, 0x13},
-	{0x14, 0x13},
-	{0x15, 0x12},
-	{0x14, 0x14},
-	{0x14, 0x14},
-	{0x14, 0x15},
-	{0x14, 0x15},
-	{0x15, 0x14},
-	{0x15, 0x14},
-	{0x15, 0x14},
-	{0x16, 0x13},
-	{0x15, 0x15},
-	{0x15, 0x15},
-	{0x15, 0x16},
-	{0x14, 0x18},
-	{0x16, 0x15},
-	{0x16, 0x15},
-	{0x16, 0x15},
-	{0x16, 0x16},
-	{0x16, 0x16},
-	{0x16, 0x16},
-	{0x16, 0x17},
-	{0x15, 0x19},
-	{0x17, 0x16},
-	{0x17, 0x16},
-	{0x18, 0x14},
-	{0x17, 0x17},
-	{0x17, 0x17},
-	{0x17, 0x17},
-	{0x17, 0x18},
-	{0x17, 0x18},
-	{0x17, 0x18},
-	{0x17, 0x19},
-	{0x18, 0x17},
-	{0x18, 0x17},
-	{0x19, 0x16},
-	{0x18, 0x18},
-	{0x18, 0x18},
-	{0x18, 0x19},
-	{0x18, 0x19},
-	{0x19, 0x18},
-	{0x19, 0x18},
-	{0x19, 0x18},
-	{0x1A, 0x17},
-	{0x19, 0x19},
-	{0x19, 0x19},
-	{0x19, 0x1A},
-	{0x18, 0x1C},
-	{0x1A, 0x19},
-	{0x1A, 0x19},
-	{0x1A, 0x19},
-	{0x1A, 0x1A},
-	{0x1A, 0x1A},
-	{0x1A, 0x1A},
-	{0x1A, 0x1B},
-	{0x19, 0x1D},
-	{0x1B, 0x1A},
-	{0x1B, 0x1A},
-	{0x1C, 0x18},
-	{0x1B, 0x1B},
-	{0x1B, 0x1B},
-	{0x1B, 0x1B},
-	{0x1B, 0x1C},
-	{0x1B, 0x1C},
-	{0x1B, 0x1C},
-	{0x1B, 0x1D},
-	{0x1C, 0x1B},
-	{0x1C, 0x1B},
-	{0x1D, 0x1A},
-	{0x1C, 0x1C},
-	{0x1C, 0x1C},
-	{0x1C, 0x1D},
-	{0x1C, 0x1D},
-	{0x1D, 0x1C},
-	{0x1D, 0x1C},
-	{0x1D, 0x1C},
-	{0x1E, 0x1B},
-	{0x1D, 0x1D},
-	{0x1D, 0x1D},
-	{0x1D, 0x1E},
-	{0x1D, 0x1E},
-	{0x1E, 0x1D},
-	{0x1E, 0x1D},
-	{0x1E, 0x1D},
-	{0x1E, 0x1E},
-	{0x1E, 0x1E},
-	{0x1E, 0x1E},
-	{0x1E, 0x1F},
-	{0x1E, 0x1F},
-	{0x1F, 0x1E},
-	{0x1F, 0x1E},
-	{0x1F, 0x1E},
-	{0x1F, 0x1F},
-	{0x1F, 0x1F},
-};
-
-#if __CUDACC__
-__constant__ unsigned short
-#else
-const static uint8
-#endif
-OMatch6[256][2] =
-{
-	{0x00, 0x00},
-	{0x00, 0x01},
-	{0x01, 0x00},
-	{0x01, 0x01},
-	{0x01, 0x01},
-	{0x01, 0x02},
-	{0x02, 0x01},
-	{0x02, 0x02},
-	{0x02, 0x02},
-	{0x02, 0x03},
-	{0x03, 0x02},
-	{0x03, 0x03},
-	{0x03, 0x03},
-	{0x03, 0x04},
-	{0x04, 0x03},
-	{0x04, 0x04},
-	{0x04, 0x04},
-	{0x04, 0x05},
-	{0x05, 0x04},
-	{0x05, 0x05},
-	{0x05, 0x05},
-	{0x05, 0x06},
-	{0x06, 0x05},
-	{0x00, 0x11},
-	{0x06, 0x06},
-	{0x06, 0x07},
-	{0x07, 0x06},
-	{0x02, 0x10},
-	{0x07, 0x07},
-	{0x07, 0x08},
-	{0x08, 0x07},
-	{0x03, 0x11},
-	{0x08, 0x08},
-	{0x08, 0x09},
-	{0x09, 0x08},
-	{0x05, 0x10},
-	{0x09, 0x09},
-	{0x09, 0x0A},
-	{0x0A, 0x09},
-	{0x06, 0x11},
-	{0x0A, 0x0A},
-	{0x0A, 0x0B},
-	{0x0B, 0x0A},
-	{0x08, 0x10},
-	{0x0B, 0x0B},
-	{0x0B, 0x0C},
-	{0x0C, 0x0B},
-	{0x09, 0x11},
-	{0x0C, 0x0C},
-	{0x0C, 0x0D},
-	{0x0D, 0x0C},
-	{0x0B, 0x10},
-	{0x0D, 0x0D},
-	{0x0D, 0x0E},
-	{0x0E, 0x0D},
-	{0x0C, 0x11},
-	{0x0E, 0x0E},
-	{0x0E, 0x0F},
-	{0x0F, 0x0E},
-	{0x0E, 0x10},
-	{0x0F, 0x0F},
-	{0x0F, 0x10},
-	{0x10, 0x0E},
-	{0x10, 0x0F},
-	{0x11, 0x0E},
-	{0x10, 0x10},
-	{0x10, 0x11},
-	{0x11, 0x10},
-	{0x12, 0x0F},
-	{0x11, 0x11},
-	{0x11, 0x12},
-	{0x12, 0x11},
-	{0x14, 0x0E},
-	{0x12, 0x12},
-	{0x12, 0x13},
-	{0x13, 0x12},
-	{0x15, 0x0F},
-	{0x13, 0x13},
-	{0x13, 0x14},
-	{0x14, 0x13},
-	{0x17, 0x0E},
-	{0x14, 0x14},
-	{0x14, 0x15},
-	{0x15, 0x14},
-	{0x18, 0x0F},
-	{0x15, 0x15},
-	{0x15, 0x16},
-	{0x16, 0x15},
-	{0x1A, 0x0E},
-	{0x16, 0x16},
-	{0x16, 0x17},
-	{0x17, 0x16},
-	{0x1B, 0x0F},
-	{0x17, 0x17},
-	{0x17, 0x18},
-	{0x18, 0x17},
-	{0x13, 0x21},
-	{0x18, 0x18},
-	{0x18, 0x19},
-	{0x19, 0x18},
-	{0x15, 0x20},
-	{0x19, 0x19},
-	{0x19, 0x1A},
-	{0x1A, 0x19},
-	{0x16, 0x21},
-	{0x1A, 0x1A},
-	{0x1A, 0x1B},
-	{0x1B, 0x1A},
-	{0x18, 0x20},
-	{0x1B, 0x1B},
-	{0x1B, 0x1C},
-	{0x1C, 0x1B},
-	{0x19, 0x21},
-	{0x1C, 0x1C},
-	{0x1C, 0x1D},
-	{0x1D, 0x1C},
-	{0x1B, 0x20},
-	{0x1D, 0x1D},
-	{0x1D, 0x1E},
-	{0x1E, 0x1D},
-	{0x1C, 0x21},
-	{0x1E, 0x1E},
-	{0x1E, 0x1F},
-	{0x1F, 0x1E},
-	{0x1E, 0x20},
-	{0x1F, 0x1F},
-	{0x1F, 0x20},
-	{0x20, 0x1E},
-	{0x20, 0x1F},
-	{0x21, 0x1E},
-	{0x20, 0x20},
-	{0x20, 0x21},
-	{0x21, 0x20},
-	{0x22, 0x1F},
-	{0x21, 0x21},
-	{0x21, 0x22},
-	{0x22, 0x21},
-	{0x24, 0x1E},
-	{0x22, 0x22},
-	{0x22, 0x23},
-	{0x23, 0x22},
-	{0x25, 0x1F},
-	{0x23, 0x23},
-	{0x23, 0x24},
-	{0x24, 0x23},
-	{0x27, 0x1E},
-	{0x24, 0x24},
-	{0x24, 0x25},
-	{0x25, 0x24},
-	{0x28, 0x1F},
-	{0x25, 0x25},
-	{0x25, 0x26},
-	{0x26, 0x25},
-	{0x2A, 0x1E},
-	{0x26, 0x26},
-	{0x26, 0x27},
-	{0x27, 0x26},
-	{0x2B, 0x1F},
-	{0x27, 0x27},
-	{0x27, 0x28},
-	{0x28, 0x27},
-	{0x23, 0x31},
-	{0x28, 0x28},
-	{0x28, 0x29},
-	{0x29, 0x28},
-	{0x25, 0x30},
-	{0x29, 0x29},
-	{0x29, 0x2A},
-	{0x2A, 0x29},
-	{0x26, 0x31},
-	{0x2A, 0x2A},
-	{0x2A, 0x2B},
-	{0x2B, 0x2A},
-	{0x28, 0x30},
-	{0x2B, 0x2B},
-	{0x2B, 0x2C},
-	{0x2C, 0x2B},
-	{0x29, 0x31},
-	{0x2C, 0x2C},
-	{0x2C, 0x2D},
-	{0x2D, 0x2C},
-	{0x2B, 0x30},
-	{0x2D, 0x2D},
-	{0x2D, 0x2E},
-	{0x2E, 0x2D},
-	{0x2C, 0x31},
-	{0x2E, 0x2E},
-	{0x2E, 0x2F},
-	{0x2F, 0x2E},
-	{0x2E, 0x30},
-	{0x2F, 0x2F},
-	{0x2F, 0x30},
-	{0x30, 0x2E},
-	{0x30, 0x2F},
-	{0x31, 0x2E},
-	{0x30, 0x30},
-	{0x30, 0x31},
-	{0x31, 0x30},
-	{0x32, 0x2F},
-	{0x31, 0x31},
-	{0x31, 0x32},
-	{0x32, 0x31},
-	{0x34, 0x2E},
-	{0x32, 0x32},
-	{0x32, 0x33},
-	{0x33, 0x32},
-	{0x35, 0x2F},
-	{0x33, 0x33},
-	{0x33, 0x34},
-	{0x34, 0x33},
-	{0x37, 0x2E},
-	{0x34, 0x34},
-	{0x34, 0x35},
-	{0x35, 0x34},
-	{0x38, 0x2F},
-	{0x35, 0x35},
-	{0x35, 0x36},
-	{0x36, 0x35},
-	{0x3A, 0x2E},
-	{0x36, 0x36},
-	{0x36, 0x37},
-	{0x37, 0x36},
-	{0x3B, 0x2F},
-	{0x37, 0x37},
-	{0x37, 0x38},
-	{0x38, 0x37},
-	{0x3D, 0x2E},
-	{0x38, 0x38},
-	{0x38, 0x39},
-	{0x39, 0x38},
-	{0x3E, 0x2F},
-	{0x39, 0x39},
-	{0x39, 0x3A},
-	{0x3A, 0x39},
-	{0x3A, 0x3A},
-	{0x3A, 0x3A},
-	{0x3A, 0x3B},
-	{0x3B, 0x3A},
-	{0x3B, 0x3B},
-	{0x3B, 0x3B},
-	{0x3B, 0x3C},
-	{0x3C, 0x3B},
-	{0x3C, 0x3C},
-	{0x3C, 0x3C},
-	{0x3C, 0x3D},
-	{0x3D, 0x3C},
-	{0x3D, 0x3D},
-	{0x3D, 0x3D},
-	{0x3D, 0x3E},
-	{0x3E, 0x3D},
-	{0x3E, 0x3E},
-	{0x3E, 0x3E},
-	{0x3E, 0x3F},
-	{0x3F, 0x3E},
-	{0x3F, 0x3F},
-	{0x3F, 0x3F},
-};
+extern uint8 OMatch5[256][2];
+extern uint8 OMatch6[256][2];
+extern uint8 OMatchAlpha5[256][2];
+extern uint8 OMatchAlpha6[256][2];

+void initSingleColorLookup();
--- a/src/nvtt/cuda/BitmapTable.h
+++ b/src/nvtt/cuda/BitmapTable.h
--- a/src/nvtt/cuda/CompressKernel.cu
+++ b/src/nvtt/cuda/CompressKernel.cu
@ -26,7 +26,6 @@

 #include "CudaMath.h"

-#include "../SingleColorLookup.h"

 #define NUM_THREADS 64		// Number of threads per block.

@ -48,6 +47,9 @@ __device__ inline void swap(T & a, T & b)
 	b = tmp;
 }

+__constant__ uchar OMatch5[256][2];
+__constant__ uchar OMatch6[256][2];
+
 __constant__ float3 kColorMetric = { 1.0f, 1.0f, 1.0f };
 __constant__ float3 kColorMetricSqr = { 1.0f, 1.0f, 1.0f };

--- a/src/nvtt/cuda/CudaCompressorDXT.cpp
+++ b/src/nvtt/cuda/CudaCompressorDXT.cpp
@ -24,31 +24,24 @@
 #include "CudaCompressorDXT.h"
 #include "CudaUtils.h"

-#include <nvcore/Debug.h>
-#include <nvmath/Color.h>
-#include <nvimage/Image.h>
-#include <nvimage/ColorBlock.h>
-#include <nvimage/BlockDXT.h>
-#include <nvtt/CompressionOptions.h>
-#include <nvtt/OutputOptions.h>
-#include <nvtt/QuickCompressDXT.h>
-#include <nvtt/OptimalCompressDXT.h>
+#include "nvcore/Debug.h"
+#include "nvmath/Color.h"
+#include "nvimage/Image.h"
+#include "nvimage/ColorBlock.h"
+#include "nvimage/BlockDXT.h"
+#include "nvtt/CompressionOptions.h"
+#include "nvtt/OutputOptions.h"
+#include "nvtt/QuickCompressDXT.h"
+#include "nvtt/OptimalCompressDXT.h"


-#if defined HAVE_CUDA
-#include <cuda_runtime_api.h>
-#endif
-
 #include <time.h>
 #include <stdio.h>

-using namespace nv;
-using namespace nvtt;
-
 #if defined HAVE_CUDA
+#include <cuda_runtime_api.h>

-#define MAX_BLOCKS 8192U // 32768, 65535
-
+#define MAX_BLOCKS 8192U // 32768, 65535 // @@ Limit number of blocks on slow devices to prevent hitting the watchdog timer.

 extern "C" void setupCompressKernel(const float weights[3]);
 extern "C" void bindTextureToArray(cudaArray * d_data);
@ -62,31 +55,13 @@ extern "C" void compressKernelDXT3(uint firstBlock, uint blockNum, uint w, uint


 #include "BitmapTable.h"
-
-/*
-// Convert linear image to block linear.
-static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
-{
-	const uint w = (image->width() + 3) / 4;
-	const uint h = (image->height() + 3) / 4;
-
-	for(uint by = 0; by < h; by++) {
-		for(uint bx = 0; bx < w; bx++) {
-			const uint bw = min(image->width() - bx * 4, 4U);
-			const uint bh = min(image->height() - by * 4, 4U);
-
-			for (uint i = 0; i < 16; i++) {
-				const int x = (i % 4) % bw;
-				const int y = (i / 4) % bh;
-				blockLinearImage[(by * w + bx) * 16 + i] = image->pixel(bx * 4 + x, by * 4 + y).u;
-			}
-		}
-	}
-}
-*/
+#include "nvtt/SingleColorLookup.h"

 #endif

+using namespace nv;
+using namespace nvtt;
+

 CudaContext::CudaContext() : 
 	bitmapTable(NULL), 
@ -111,6 +86,11 @@ CudaContext::CudaContext() :
 	// Allocate scratch buffers.
    cudaMalloc((void**) &data, MAX_BLOCKS * 64U);
    cudaMalloc((void**) &result, MAX_BLOCKS * 8U);
+
+    // Init single color lookup constant tables.
+    cudaMemcpyToSymbol("OMatch5", OMatch5, sizeof(OMatch5), 0, cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol("OMatch6", OMatch6, sizeof(OMatch6), 0, cudaMemcpyHostToDevice);
+
 #endif
 }

@ -300,6 +280,30 @@ void CudaCompressorDXT5::compressBlocks(uint first, uint count, uint w, uint h,

 #if 0

+
+/*
+// Convert linear image to block linear.
+static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
+{
+	const uint w = (image->width() + 3) / 4;
+	const uint h = (image->height() + 3) / 4;
+
+	for(uint by = 0; by < h; by++) {
+		for(uint bx = 0; bx < w; bx++) {
+			const uint bw = min(image->width() - bx * 4, 4U);
+			const uint bh = min(image->height() - by * 4, 4U);
+
+			for (uint i = 0; i < 16; i++) {
+				const int x = (i % 4) % bw;
+				const int y = (i / 4) % bh;
+				blockLinearImage[(by * w + bx) * 16 + i] = image->pixel(bx * 4 + x, by * 4 + y).u;
+			}
+		}
+	}
+}
+*/
+
+
 /// Compress image using CUDA.
 void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
 {
--- a/src/nvtt/squish/clusterfit.cpp
+++ b/src/nvtt/squish/clusterfit.cpp
@ -383,8 +383,7 @@ Vec4 ClusterFit::SolveLeastSquares( Vec4& start, Vec4& end ) const

 	// clamp to the grid
 	Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
-//	Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
-	Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f ); // IC: use approximate grid fitting.
+	Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
 	Vec4 const onethird = VEC4_CONST( 1.0f/3.0f );
 	Vec4 const twothirds = VEC4_CONST( 2.0f/3.0f );
 	a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp;
@ -459,8 +458,7 @@ float ClusterFit::SolveLeastSquares( Vec3& start, Vec3& end ) const

 	// clamp to the grid
 	Vec3 const grid( 31.0f, 63.0f, 31.0f );
-	//Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
-	Vec3 const gridrcp(0.03227752766457f, 0.01583151765563f, 0.03227752766457f); // IC: use approximate grid fitting.
+	Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
 	Vec3 const half( 0.5f );
 	a = Floor( grid*a + half )*gridrcp;
 	b = Floor( grid*b + half )*gridrcp;