From b41752aa84a291b710f1521c8d551dde84f09f8b Mon Sep 17 00:00:00 2001
From: castano <castano@95f4ed2b-212e-0410-8b90-d31948207fce>
Date: Sat, 16 Feb 2008 09:53:07 +0000
Subject: [PATCH] Apply singlecolorpatch.diff from Amir Ebrahimi. Add single
 color compressor to CUDA implementation.

---
 src/nvimage/ColorBlock.cpp      | 13 ++++++++++++
 src/nvimage/ColorBlock.h        |  1 +
 src/nvtt/CMakeLists.txt         |  3 +++
 src/nvtt/CompressDXT.cpp        | 23 +++++++++++++++-----
 src/nvtt/SingleColorLookup.h    | 14 +++++++++++--
 src/nvtt/cuda/CompressKernel.cu | 37 +++++++++++++++++++++++++++------
 src/nvtt/cuda/CudaMath.h        | 34 ++++++++++++++++++++++++++----
 src/nvtt/tests/stress.cpp       |  4 ++--
 8 files changed, 110 insertions(+), 19 deletions(-)

diff --git a/src/nvimage/ColorBlock.cpp b/src/nvimage/ColorBlock.cpp
index cdc7d3c..16faf15 100644
--- a/src/nvimage/ColorBlock.cpp
+++ b/src/nvimage/ColorBlock.cpp
@@ -110,6 +110,19 @@ void ColorBlock::splatY()
 	}
 }
 
+/// Reports whether every entry of the block matches the first one.
+bool ColorBlock::isSingleColor() const
+{
+	// Walk entries 1..15 and stop at the first mismatch with entry 0.
+	int k = 1;
+	while (k < 16)
+	{
+		if (m_color[0] != m_color[k]) break;
+		k++;
+	}
+
+	return k == 16;
+}
 
 /// Count number of unique colors in this color block.
 uint ColorBlock::countUniqueColors() const
diff --git a/src/nvimage/ColorBlock.h b/src/nvimage/ColorBlock.h
index 9087029..cdda4d6 100644
--- a/src/nvimage/ColorBlock.h
+++ b/src/nvimage/ColorBlock.h
@@ -24,6 +24,7 @@ namespace nv
 		void splatX();
 		void splatY();
 		
+		bool isSingleColor() const;
 		uint countUniqueColors() const;
 		Color32 averageColor() const;
 		bool hasAlpha() const;
diff --git a/src/nvtt/CMakeLists.txt b/src/nvtt/CMakeLists.txt
index 7465bf6..36d9cdd 100644
--- a/src/nvtt/CMakeLists.txt
+++ b/src/nvtt/CMakeLists.txt
@@ -79,6 +79,9 @@ TARGET_LINK_LIBRARIES(nvassemble nvcore nvmath nvimage)
 ADD_EXECUTABLE(filtertest tests/filtertest.cpp tools/cmdline.h)
 TARGET_LINK_LIBRARIES(filtertest nvcore nvmath nvimage)
 
+ADD_EXECUTABLE(stress tests/stress.cpp tools/cmdline.h)
+TARGET_LINK_LIBRARIES(stress nvcore nvmath nvimage nvtt)
+
 ADD_EXECUTABLE(nvzoom tools/resize.cpp tools/cmdline.h)
 TARGET_LINK_LIBRARIES(nvzoom nvcore nvmath nvimage)
 
diff --git a/src/nvtt/CompressDXT.cpp b/src/nvtt/CompressDXT.cpp
index 5287f13..06af8c2 100644
--- a/src/nvtt/CompressDXT.cpp
+++ b/src/nvtt/CompressDXT.cpp
@@ -69,7 +69,14 @@ void nv::fastCompressDXT1(const Image * image, const OutputOptions::Private & ou
 		for (uint x = 0; x < w; x += 4) {
 			rgba.init(image, x, y);
 			
-			QuickCompress::compressDXT1(rgba, &block);
+			if (rgba.isSingleColor())
+			{
+				QuickCompress::compressDXT1(rgba.color(0), &block);
+			}
+			else
+			{
+				QuickCompress::compressDXT1(rgba, &block);
+			}
 			
 			if (outputOptions.outputHandler != NULL) {
 				outputOptions.outputHandler->writeData(&block, sizeof(block));
@@ -214,10 +221,16 @@ void nv::compressDXT1(const Image * image, const OutputOptions::Private & output
 			
 			rgba.init(image, x, y);
 			
-			// Compress color.
-			squish::ColourSet colours((uint8 *)rgba.colors(), 0);
-			fit.SetColourSet(&colours, squish::kDxt1);
-			fit.Compress(&block);
+			if (rgba.isSingleColor())
+			{
+				QuickCompress::compressDXT1(rgba.color(0), &block);
+			}
+			else
+			{
+				squish::ColourSet colours((uint8 *)rgba.colors(), 0);
+				fit.SetColourSet(&colours, squish::kDxt1);
+				fit.Compress(&block);
+			}
 			
 			if (outputOptions.outputHandler != NULL) {
 				outputOptions.outputHandler->writeData(&block, sizeof(block));
diff --git a/src/nvtt/SingleColorLookup.h b/src/nvtt/SingleColorLookup.h
index 1eab018..63d0462 100644
--- a/src/nvtt/SingleColorLookup.h
+++ b/src/nvtt/SingleColorLookup.h
@@ -48,7 +48,12 @@ void initTables()
 };
 */
 
-const static uint8 OMatch5[256][2] =
+#if __CUDACC__
+__constant__ unsigned short
+#else
+const static uint8 
+#endif
+OMatch5[256][2] =
 {
 	{0x00, 0x00},
 	{0x00, 0x00},
@@ -308,7 +313,12 @@ const static uint8 OMatch5[256][2] =
 	{0x1F, 0x1F},
 };
 
-const static uint8 OMatch6[256][2] =
+#if __CUDACC__
+__constant__ unsigned short
+#else
+const static uint8
+#endif
+OMatch6[256][2] =
 {
 	{0x00, 0x00},
 	{0x00, 0x01},
diff --git a/src/nvtt/cuda/CompressKernel.cu b/src/nvtt/cuda/CompressKernel.cu
index 22394be..42c3144 100644
--- a/src/nvtt/cuda/CompressKernel.cu
+++ b/src/nvtt/cuda/CompressKernel.cu
@@ -28,6 +28,8 @@
 
 #include "CudaMath.h"
 
+#include "../SingleColorLookup.h"
+
 #define NUM_THREADS 64		// Number of threads per block.
 
 #if __DEVICE_EMULATION__
@@ -117,7 +119,7 @@ __device__ void sortColors(const float * values, int * cmp)
 ////////////////////////////////////////////////////////////////////////////////
 // Load color block to shared mem
 ////////////////////////////////////////////////////////////////////////////////
-__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16])
+__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
 {
 	const int bid = blockIdx.x;
 	const int idx = threadIdx.x;
@@ -128,7 +130,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 	{
 		// Read color and copy to shared mem.
 		uint c = image[(bid) * 16 + idx];
-	
+		
 		colors[idx].z = ((c >> 0) & 0xFF) * (1.0f / 255.0f);
 		colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
 		colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
@@ -137,11 +139,13 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 #if __DEVICE_EMULATION__
 		} __debugsync(); if (idx < 16) {
 #endif
-
+		
 		// Sort colors along the best fit line.
 		colorSums(colors, sums);
 		float3 axis = bestFitLine(colors, sums[0], kColorMetric);
 		
+		*sameColor = (axis == make_float3(0, 0, 0));
+		
 		dps[idx] = dot(colors[idx], axis);
 		
 #if __DEVICE_EMULATION__
@@ -997,6 +1001,20 @@ __device__ void saveBlockCTX1(ushort start, ushort end, uint permutation, int xr
 	saveBlockDXT1(start, end, permutation, xrefs, result);
 }
 
+// Writes a DXT1 block encoding the single color `color` to result[blockIdx.x], using the OMatch5/OMatch6 tables.
+__device__ void saveSingleColorBlockDXT1(float3 color, uint2 * result)
+{
+	// Round to nearest; truncating the float back to 8 bits can land one step below the source value.
+	const int r = int(color.x * 255.0f + 0.5f);
+	const int g = int(color.y * 255.0f + 0.5f);
+	const int b = int(color.z * 255.0f + 0.5f);
+	const uint color0 = (OMatch5[r][0] << 11) | (OMatch6[g][0] << 5) | OMatch5[b][0];
+	const uint color1 = (OMatch5[r][1] << 11) | (OMatch6[g][1] << 5) | OMatch5[b][1];
+	// DXT1 decodes in 3-color mode when the first endpoint is smaller; in that case swap endpoints and use index 3.
+	const bool swapped = color0 < color1;
+	result[blockIdx.x].x = swapped ? ((color0 << 16) | color1) : ((color1 << 16) | color0);
+	result[blockIdx.x].y = swapped ? 0xffffffffu : 0xaaaaaaaau;
+}
 
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -1007,9 +1025,16 @@ __global__ void compressDXT1(const uint * permutations, const uint * image, uint
 	__shared__ float3 colors[16];
 	__shared__ float3 sums[16];
 	__shared__ int xrefs[16];
+	__shared__ int sameColor;
 	
-	loadColorBlock(image, colors, sums, xrefs);
+	loadColorBlock(image, colors, sums, xrefs, &sameColor);
 	
 	__syncthreads();
+	// sameColor lives in shared memory and is stored inside loadColorBlock; only test it after the barrier above.
+	if (sameColor)
+	{
+		if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
+		return;
+	}
 
 	ushort bestStart, bestEnd;
@@ -1269,7 +1294,7 @@ __global__ void compressDXT5(const uint * permutations, const uint * image, uint
 	__shared__ int xrefs[16];
 	
 	loadColorBlock(image, colors, sums, weights, xrefs);
-		
+	
 	__syncthreads();
 
 	compressAlpha(weights, result);	
diff --git a/src/nvtt/cuda/CudaMath.h b/src/nvtt/cuda/CudaMath.h
index cd6dd3c..13e27df 100644
--- a/src/nvtt/cuda/CudaMath.h
+++ b/src/nvtt/cuda/CudaMath.h
@@ -82,6 +82,12 @@ inline __device__ __host__ void operator /=(float3 & b, float f)
     b.z *= inv;
 }
 
+// Exact component-wise equality of two float3 values.
+inline __device__ __host__ bool operator ==(float3 a, float3 b)
+{
+	return !(a.x != b.x || a.y != b.y || a.z != b.z);
+}
+
 // float2 operators
 inline __device__ __host__ float2 operator *(float2 a, float2 b)
 {
@@ -187,15 +193,37 @@ inline __device__ __host__ float3 firstEigenVector( float matrix[6] )
 		float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
 		float m = max(max(x, y), z);        
 		float iv = 1.0f / m;
-		#if __DEVICE_EMULATION__
 		if (m == 0.0f) iv = 0.0f;
-		#endif
 		v = make_float3(x*iv, y*iv, z*iv);
 	}
 
 	return v;
 }
 
+// Returns true when all 16 entries of colors compare equal to colors[0] (float3 operator ==).
+inline __device__ bool singleColor(const float3 * colors)
+{
+#if __DEVICE_EMULATION__
+	// Seed with true: an AND-reduction that starts from false can never report a match.
+	bool sameColor = true;
+	for (int i = 0; i < 16; i++)
+	{
+		sameColor &= (colors[i] == colors[0]);
+	}
+	return sameColor;
+#else
+	__shared__ int sameColor[16];
+	const int idx = threadIdx.x;
+	// AND-reduce across XOR partners. NOTE(review): indexes by threadIdx.x with no barrier between steps, so
+	// presumably only threads 0..15 may call this and they must execute in lockstep - confirm at call sites.
+	sameColor[idx] = (colors[idx] == colors[0]);
+	sameColor[idx] &= sameColor[idx^8];
+	sameColor[idx] &= sameColor[idx^4];
+	sameColor[idx] &= sameColor[idx^2];
+	sameColor[idx] &= sameColor[idx^1];
+	return sameColor[0];
+#endif
+}
+
 inline __device__ void colorSums(const float3 * colors, float3 * sums)
 {
 #if __DEVICE_EMULATION__
@@ -284,9 +312,7 @@ inline __device__ __host__ float2 firstEigenVector2D( float matrix[3] )
 		float y = v.x * matrix[1] + v.y * matrix[2];
 		float m = max(x, y);        
 		float iv = 1.0f / m;
-		#if __DEVICE_EMULATION__
 		if (m == 0.0f) iv = 0.0f;
-		#endif
 		v = make_float2(x*iv, y*iv);
 	}
 
diff --git a/src/nvtt/tests/stress.cpp b/src/nvtt/tests/stress.cpp
index 28e33eb..526bcfa 100644
--- a/src/nvtt/tests/stress.cpp
+++ b/src/nvtt/tests/stress.cpp
@@ -47,7 +47,7 @@ struct MyOutputHandler : public nvtt::OutputHandler
 	
 	virtual void beginImage(int size, int width, int height, int depth, int face, int miplevel)
 	{
-		assert(size == OUTPUT_SIZE);
+		assert(size == sizeof(int) * OUTPUT_SIZE);
 		assert(width == WIDTH);
 		assert(height == HEIGHT);
 		assert(depth == 1);
@@ -177,7 +177,7 @@ void precomp()
 
 int main(int argc, char *argv[])
 {
-	precomp();
+	//precomp();
 
 	nvtt::InputOptions inputOptions;
 	inputOptions.setTextureLayout(nvtt::TextureType_2D, 1024, 1024);