Add single color checks to CUDA compressors.

Use optimized bitmap table for CTX compressor.
2008-05-06 19:52:27 +00:00
parent 246f2a409a
commit d855d0461b
7 changed files with 863 additions and 20 deletions
--- a/src/nvtt/cuda/CompressKernel.cu
+++ b/src/nvtt/cuda/CompressKernel.cu
@ -159,7 +159,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 	}
 }

-__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16])
+__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
 {
 	const int bid = blockIdx.x;
 	const int idx = threadIdx.x;
@ -189,6 +189,8 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 		colorSums(colors, sums);
 		float3 axis = bestFitLine(colors, sums[0], kColorMetric);

+		*sameColor = (axis == make_float3(0, 0, 0));
+
 		dps[idx] = dot(rawColors[idx], axis);
 		
 #if __DEVICE_EMULATION__
@ -205,7 +207,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 	}
 }

-__device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16])
+__device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16], int * sameColor)
 {
 	const int bid = blockIdx.x;
 	const int idx = threadIdx.x;
@ -229,6 +231,8 @@ __device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sum
 		colorSums(colors, sums);
 		float2 axis = bestFitLine(colors, sums[0]);
 		
+		*sameColor = (axis == make_float2(0, 0));
+
 		dps[idx] = dot(colors[idx], axis);
 		
 #if __DEVICE_EMULATION__
@ -861,7 +865,7 @@ __device__ void evalAllPermutationsCTX(const float2 * colors, float2 colorSum, c
 	for(int i = 0; i < 16; i++)
 	{
 		int pidx = idx + NUM_THREADS * i;
-		if (pidx >= 992) break;
+		if (pidx >= 704) break;
 		
 		ushort start, end;
 		uint permutation = permutations[pidx];
@ -1024,6 +1028,41 @@ __device__ void saveSingleColorBlockDXT1(float3 color, uint2 * result)
 	}
 }

+__device__ void saveSingleColorBlockDXT1(float2 color, uint2 * result)
+{
+	const int bid = blockIdx.x;
+
+	int r = color.x * 255;
+	int g = color.y * 255;
+
+	ushort color0 = (OMatch5[r][0] << 11) | (OMatch6[g][0] << 5);
+	ushort color1 = (OMatch5[r][1] << 11) | (OMatch6[g][1] << 5);
+
+	if (color0 < color1)
+	{
+		result[bid].x = (color0 << 16) | color1;
+		result[bid].y = 0xffffffff;
+	}
+	else
+	{
+		result[bid].x = (color1 << 16) | color0;
+		result[bid].y = 0xaaaaaaaa;
+	}
+}
+
+__device__ void saveSingleColorBlockCTX1(float2 color, uint2 * result)
+{
+	const int bid = blockIdx.x;
+
+	int r = color.x * 255;
+	int g = color.y * 255;
+
+	ushort color0 = (r << 8) | (g);
+	
+	result[bid].x = (color0 << 16) | color0;
+	result[bid].y = 0x00000000;
+}
+

 ////////////////////////////////////////////////////////////////////////////////
 // Compress color block
@ -1069,11 +1108,18 @@ __global__ void compressWeightedDXT1(const uint * permutations, const uint * ima
 	__shared__ float3 sums[16];
 	__shared__ float weights[16];
 	__shared__ int xrefs[16];
+	__shared__ int sameColor;
 	
-	loadColorBlock(image, colors, sums, weights, xrefs);
+	loadColorBlock(image, colors, sums, weights, xrefs, &sameColor);
 	
 	__syncthreads();

+	if (sameColor)
+	{
+		if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
+		return;
+	}
+
 	ushort bestStart, bestEnd;
 	uint bestPermutation;

@ -1097,11 +1143,18 @@ __global__ void compressNormalDXT1(const uint * permutations, const uint * image
 	__shared__ float2 colors[16];
 	__shared__ float2 sums[16];
 	__shared__ int xrefs[16];
-	
-	loadColorBlock(image, colors, sums, xrefs);
+	__shared__ int sameColor;
+
+	loadColorBlock(image, colors, sums, xrefs, &sameColor);
 	
 	__syncthreads();

+	if (sameColor)
+	{
+		if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
+		return;
+	}
+
 	ushort bestStart, bestEnd;
 	uint bestPermutation;

@ -1124,11 +1177,18 @@ __global__ void compressCTX1(const uint * permutations, const uint * image, uint
 	__shared__ float2 colors[16];
 	__shared__ float2 sums[16];
 	__shared__ int xrefs[16];
+	__shared__ int sameColor;
 	
-	loadColorBlock(image, colors, sums, xrefs);
+	loadColorBlock(image, colors, sums, xrefs, &sameColor);
 	
 	__syncthreads();

+	if (sameColor)
+	{
+		if (threadIdx.x == 0) saveSingleColorBlockCTX1(colors[0], result);
+		return;
+	}
+
 	ushort bestStart, bestEnd;
 	uint bestPermutation;