From f1f944f06cf410b5f42881b1076d711cd60d3c8a Mon Sep 17 00:00:00 2001
From: castano <castano@95f4ed2b-212e-0410-8b90-d31948207fce>
Date: Thu, 24 May 2007 09:57:16 +0000
Subject: [PATCH] Make some progress in separable convolution kernel in CUDA.

---
 src/nvimage/nvtt/cuda/ConvolveKernel.cu | 124 +++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 2 deletions(-)

diff --git a/src/nvimage/nvtt/cuda/ConvolveKernel.cu b/src/nvimage/nvtt/cuda/ConvolveKernel.cu
index f5bfed0..005c2de 100644
--- a/src/nvimage/nvtt/cuda/ConvolveKernel.cu
+++ b/src/nvimage/nvtt/cuda/ConvolveKernel.cu
@@ -28,9 +28,16 @@
 
 #include "CudaMath.h"
 
-#define THREAD_COUNT 		256
+#define TW 16
+#define TH 16
+
+#define THREAD_COUNT 		(TW * TH)
+
 #define MAX_KERNEL_WIDTH	32
 
+#define KW 4
+
+
 
 #if __DEVICE_EMULATION__
 #define __debugsync() __syncthreads()
@@ -39,18 +46,131 @@
 #endif
 
 
+__constant__ float inputGamma, outputInverseGamma;
 __constant__ float kernel[MAX_KERNEL_WIDTH];
 
+// Use texture to access input?
+// That's the most simple approach.
+
+texture<> image;
+
+////////////////////////////////////////////////////////////////////////////////
+// Combined convolution filter
+////////////////////////////////////////////////////////////////////////////////
+
+__global__ void convolve(float4 * output)
+{
+	// @@ Use morton order to assing threads.
+	int x = threadIdx.x;
+	int y = threadIdx.y;
+	
+	float4 color = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+	
+	// texture coordinate.
+	int2 t;
+	t.x = 2 * (blockIdx.x * TW + x) - HW;
+	t.y = blockIdx.y * TH + y;
+	
+	// @@ We might want to loop and process strips, to reuse the results of the horizontal convolutions.
+	
+	// Horizontal convolution. @@ Unroll loops.
+	for (int e = HW; e > 0; e--)
+	{
+		t.x++;
+		float w = kernel[e-1];
+		color += w * tex2D(image, tc);
+	}
+	
+	for (int e = 0; e < HW; e++)
+	{
+		t.x++;
+		float w = kernel[e];
+		color += w * tex2D(image, tc);
+	}
+	
+	// Write color to shared memory.
+	__shared__ float tile[4 * THREAD_COUNT];
+
+	int tileIdx = y * TW + x;
+	tile[tileIdx + 0 * THREAD_COUNT] = color.x;
+	tile[tileIdx + 1 * THREAD_COUNT] = color.y;
+	tile[tileIdx + 2 * THREAD_COUNT] = color.z;
+	tile[tileIdx + 3 * THREAD_COUNT] = color.w;
+
+	__syncthreads();
+	
+	// tile coordinate.
+	t.x = x;
+	t.y = y - HW;
+	
+	// Vertical convolution. @@ Unroll loops.
+	for (int i = HW; i > 0; i--)
+	{
+		float w = kernel[i-1];
+		
+		t.y++;
+		int idx = t.y * TW + t.x;
+		
+		color.x += w * tile[idx + 0 * THREAD_COUNT];
+		color.y += w * tile[idx + 1 * THREAD_COUNT];
+		color.z += w * tile[idx + 2 * THREAD_COUNT];
+		color.w += w * tile[idx + 3 * THREAD_COUNT];
+	}
+	
+	for (int i = 0; i < HW; i++)
+	{
+		float w = kernel[i];
+		
+		t.y++;
+		int idx = t.y * TW + t.x;
+		
+		color.x += w * tile[idx + 0 * THREAD_COUNT];
+		color.y += w * tile[idx + 1 * THREAD_COUNT];
+		color.z += w * tile[idx + 2 * THREAD_COUNT];
+		color.w += w * tile[idx + 3 * THREAD_COUNT];
+	}	
+	
+	it (x < w && y < h)
+	{
+		// @@ Prevent unaligned writes.
+		
+		output[y * w + h] = color;
+	}
+}
+
 
 ////////////////////////////////////////////////////////////////////////////////
 // Monophase X convolution filter
 ////////////////////////////////////////////////////////////////////////////////
 
+__device__ void convolveY()
+{
+
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Mipmap convolution filter
+////////////////////////////////////////////////////////////////////////////////
+
+
 
 ////////////////////////////////////////////////////////////////////////////////
-// Monophase Y convolution filter
+// Gamma correction
 ////////////////////////////////////////////////////////////////////////////////
 
+/*
+__device__ float toLinear(float f, float gamma = 2.2f)
+{
+	return __pow(f, gamma);
+}
+
+__device__ float toGamma(float f, float gamma = 2.2f)
+{
+	return pow(f, 1.0f / gamma);
+}
+*/
+