From 3d3409e66604d63a510b5c4f91f61d9d696977c5 Mon Sep 17 00:00:00 2001
From: castano <castano@95f4ed2b-212e-0410-8b90-d31948207fce>
Date: Sat, 19 May 2007 22:07:51 +0000
Subject: [PATCH] Cleanup CUDA compressor.

---
 src/nvimage/nvtt/cuda/CompressKernel.cu | 211 ++++++++++++------------
 1 file changed, 101 insertions(+), 110 deletions(-)

diff --git a/src/nvimage/nvtt/cuda/CompressKernel.cu b/src/nvimage/nvtt/cuda/CompressKernel.cu
index 891e08c..4121d9c 100644
--- a/src/nvimage/nvtt/cuda/CompressKernel.cu
+++ b/src/nvimage/nvtt/cuda/CompressKernel.cu
@@ -235,7 +235,7 @@ static __device__ float evalPermutation3(const float3 * colors, const float * we
 ////////////////////////////////////////////////////////////////////////////////
 // Sort colors
 ////////////////////////////////////////////////////////////////////////////////
-__device__ void sortColors(float * values, float3 * colors, int * cmp)
+__device__ void sortColors(float * values, int * cmp)
 {
 	int tid = threadIdx.x;
 
@@ -272,53 +272,6 @@ __device__ void sortColors(float * values, float3 * colors, int * cmp)
 	if (tid > 12 && cmp[tid] == cmp[12]) ++cmp[tid];
 	if (tid > 13 && cmp[tid] == cmp[13]) ++cmp[tid];
 	if (tid > 14 && cmp[tid] == cmp[14]) ++cmp[tid];
-
-	float3 tmp = colors[tid];
-	colors[cmp[tid]] = tmp;
-}
-
-__device__ void sortColors(float * values, float3 * colors, float * weights, int * cmp)
-{
-	int tid = threadIdx.x;
-
-	cmp[tid] = (values[0] < values[tid]);
-	cmp[tid] += (values[1] < values[tid]);
-	cmp[tid] += (values[2] < values[tid]);
-	cmp[tid] += (values[3] < values[tid]);
-	cmp[tid] += (values[4] < values[tid]);
-	cmp[tid] += (values[5] < values[tid]);
-	cmp[tid] += (values[6] < values[tid]);
-	cmp[tid] += (values[7] < values[tid]);
-	cmp[tid] += (values[8] < values[tid]);
-	cmp[tid] += (values[9] < values[tid]);
-	cmp[tid] += (values[10] < values[tid]);
-	cmp[tid] += (values[11] < values[tid]);
-	cmp[tid] += (values[12] < values[tid]);
-	cmp[tid] += (values[13] < values[tid]);
-	cmp[tid] += (values[14] < values[tid]);
-	cmp[tid] += (values[15] < values[tid]);
-	
-	// Resolve elements with the same index.
-	if (tid > 0 && cmp[tid] == cmp[0]) ++cmp[tid];
-	if (tid > 1 && cmp[tid] == cmp[1]) ++cmp[tid];
-	if (tid > 2 && cmp[tid] == cmp[2]) ++cmp[tid];
-	if (tid > 3 && cmp[tid] == cmp[3]) ++cmp[tid];
-	if (tid > 4 && cmp[tid] == cmp[4]) ++cmp[tid];
-	if (tid > 5 && cmp[tid] == cmp[5]) ++cmp[tid];
-	if (tid > 6 && cmp[tid] == cmp[6]) ++cmp[tid];
-	if (tid > 7 && cmp[tid] == cmp[7]) ++cmp[tid];
-	if (tid > 8 && cmp[tid] == cmp[8]) ++cmp[tid];
-	if (tid > 9 && cmp[tid] == cmp[9]) ++cmp[tid];
-	if (tid > 10 && cmp[tid] == cmp[10]) ++cmp[tid];
-	if (tid > 11 && cmp[tid] == cmp[11]) ++cmp[tid];
-	if (tid > 12 && cmp[tid] == cmp[12]) ++cmp[tid];
-	if (tid > 13 && cmp[tid] == cmp[13]) ++cmp[tid];
-	if (tid > 14 && cmp[tid] == cmp[14]) ++cmp[tid];
-
-	float3 tmp = colors[tid];
-	colors[cmp[tid]] = tmp;
-
-	weights[cmp[tid]] = weights[tid];
 }
 
 
@@ -330,11 +283,10 @@ __device__ void minimizeError(float * errors, int * indices)
 	const int idx = threadIdx.x;
 
 #if __DEVICE_EMULATION__
-
 	for(int d = NUM_THREADS/2; d > 0; d >>= 1)
 	{
 		__syncthreads();
-
+		
 		if (idx < d)
 		{
 			float err0 = errors[idx];
@@ -348,16 +300,15 @@ __device__ void minimizeError(float * errors, int * indices)
 	}
 
 #else
-
 	for(int d = NUM_THREADS/2; d > 32; d >>= 1)
 	{
 		__syncthreads();
-
+		
 		if (idx < d)
 		{
 			float err0 = errors[idx];
 			float err1 = errors[idx + d];
-
+			
 			if (err1 < err0) {
 				errors[idx] = err1;
 				indices[idx] = indices[idx + d];
@@ -365,7 +316,7 @@ __device__ void minimizeError(float * errors, int * indices)
 		}
 	}
 
-	// unroll last 6 steps 
+	// unroll last 6 iterations
 	if (idx <= 32)
 	{
 		if (errors[idx + 32] < errors[idx]) {
@@ -398,17 +349,15 @@ __device__ void minimizeError(float * errors, int * indices)
 
 
 ////////////////////////////////////////////////////////////////////////////////
-// Compress color block
+// Load color block to shared mem
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void compress(const uint * permutations, const uint * image, uint2 * result)
+__device__ void loadColorBlock(const uint * image, float3 colors[16], int xrefs[16])
 {
 	const int bid = blockIdx.x;
 	const int idx = threadIdx.x;
-	
-	__shared__ float3 colors[16];
+
 	__shared__ float dps[16];
-	__shared__ int xrefs[16];
-	
+
 	if (idx < 16)
 	{
 		// Read color and copy to shared mem.
@@ -432,9 +381,94 @@ __global__ void compress(const uint * permutations, const uint * image, uint2 *
 		} __debugsync(); if (idx < 16) {
 #endif
 		
-		sortColors(dps, colors, xrefs);
+		sortColors(dps, xrefs);
+		
+		float3 tmp = colors[idx];
+		colors[xrefs[idx]] = tmp;
+	}
+}
+
+__device__ void loadColorBlock(const uint * image, float3 colors[16], float weights[16], int xrefs[16])
+{
+	const int bid = blockIdx.x;
+	const int idx = threadIdx.x;
+
+	__shared__ float dps[16];
+
+	if (idx < 16)
+	{
+		// Read color and copy to shared mem.
+		uint c = image[(bid) * 16 + idx];
+		
+		colors[idx].z = ((c >> 0) & 0xFF) * (1.0f / 255.0f);
+		colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
+		colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
+		weights[idx] = ((c >> 24) & 0xFF) * (1.0f / 255.0f);
+		
+		// No need to synchronize, 16 < warp size.
+#if __DEVICE_EMULATION__
+		} __debugsync(); if (idx < 16) {
+#endif
+
+		// Sort colors along the best fit line.
+		float3 axis = bestFitLine(colors);
+		
+		dps[idx] = dot(colors[idx], axis);
+		
+#if __DEVICE_EMULATION__
+		} __debugsync(); if (idx < 16) {
+#endif
+		
+		sortColors(dps, xrefs);
+		
+		float3 tmp = colors[idx];
+		colors[xrefs[idx]] = tmp;
+		
+		float w = weights[idx];
+		weights[xrefs[idx]] = tmp;
+	}
+}
+
+
+__device__ void saveBlockDXT1(ushort start, ushort end, uint permutation, int xrefs[16])
+{
+	const int bid = blockIdx.x;
+	const int idx = threadIdx.x;
+
+	if (start == end)
+	{
+		permutation = 0;
 	}
 	
+	// Reorder permutation.
+	uint indices = 0;
+	for(int i = 0; i < 16; i++)
+	{
+		int ref = xrefs[i];
+		indices |= ((permutation >> (2 * ref)) & 3) << (2 * i);
+	}
+	
+	// Write endpoints.
+	result[bid].x = (end << 16) | start;
+	
+	// Write palette indices.
+	result[bid].y = indices;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Compress color block
+////////////////////////////////////////////////////////////////////////////////
+__global__ void compress(const uint * permutations, const uint * image, uint2 * result)
+{
+	const int bid = blockIdx.x;
+	const int idx = threadIdx.x;
+	
+	__shared__ float3 colors[16];
+	__shared__ int xrefs[16];
+	
+	loadColorBlock(image, colors, xrefs);
+	
 	ushort bestStart, bestEnd;
 	uint bestPermutation;
 	float bestError = FLT_MAX;
@@ -495,7 +529,7 @@ __global__ void compress(const uint * permutations, const uint * image, uint2 *
 		{
 			pidx = idx + NUM_THREADS * 2;
 		}
-
+		
 		ushort start, end;
 		uint permutation = permutations[pidx];
 		float error = evalPermutation4(colors, permutation, &start, &end);
@@ -552,7 +586,7 @@ __global__ void compress(const uint * permutations, const uint * image, uint2 *
                 bestPermutation ^= (~bestPermutation >> 1) & 0x55555555;    // Flip indices.
             }
         }
-
+		
         error = evalPermutation4(colors, permutation, &start, &end);
         
         if (error < bestError)
@@ -561,7 +595,7 @@ __global__ void compress(const uint * permutations, const uint * image, uint2 *
             bestPermutation = permutation;
             bestStart = start;
             bestEnd = end;
-
+			
 			if (bestStart < bestEnd)
 			{
 				swap(bestEnd, bestStart);
@@ -587,24 +621,7 @@ __global__ void compress(const uint * permutations, const uint * image, uint2 *
 	// Only write the result of the winner thread.
 	if (idx == indices[0])
 	{
-		if (bestStart == bestEnd)
-		{
-			bestPermutation = 0;
-		}
-	
-		// Reorder permutation.
-		uint perm = 0;
-		for(int i = 0; i < 16; i++)
-		{
-			int ref = xrefs[i];
-			perm |= ((bestPermutation >> (2 * ref)) & 3) << (2 * i);
-		}
-		
-		// Write endpoints. (bestStart, bestEnd)
-		result[bid].x = (bestEnd << 16) | bestStart;
-		
-		// Write palette indices (permutation).
-		result[bid].y = perm;
+		saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs);
 	}
 }
 
@@ -616,36 +633,10 @@ __global__ void compressWeighted(const uint * permutations, const uint * image,
 	
 	__shared__ float3 colors[16];
 	__shared__ float weights[16];
-	__shared__ float dps[16];
 	__shared__ int xrefs[16];
 	
-	if (idx < 16)
-	{
-		// Read color and copy to shared mem.
-		uint c = image[(bid) * 16 + idx];
-		
-		colors[idx].z = ((c >> 0) & 0xFF) * (1.0f / 255.0f);
-		colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
-		colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
-		weights[idx] = ((c >> 24) & 0xFF) * (1.0f / 255.0f);
-		
-		// No need to synchronize, 16 < warp size.
-#if __DEVICE_EMULATION__
-		} __debugsync(); if (idx < 16) {
-#endif
-		
-		// Sort colors along the best fit line.
-		float3 axis = bestFitLine(colors);
-		
-		dps[idx] = dot(colors[idx], axis);
-		
-#if __DEVICE_EMULATION__
-		} __debugsync(); if (idx < 16) {
-#endif
-		
-		sortColors(dps, colors, weights, xrefs);
-	}
-	
+	loadColorBlock(image, colors, weights, xrefs);
+
 	ushort bestStart, bestEnd;
 	uint bestPermutation;
 	float bestError = FLT_MAX;