You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
nvidia-texture-tools/src/nvtt/cuda/ConvolveKernel.cu

266 lines
5.8 KiB
Plaintext

// Copyright (c) 2009-2011 Ignacio Castano <castano@gmail.com>
// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano <icastano@nvidia.com>
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "CudaMath.h"
#define TW 16
#define TH 16
#define THREAD_COUNT (TW * TH)
#define MAX_KERNEL_WIDTH 32
#define KW 4
#if __DEVICE_EMULATION__
#define __debugsync() __syncthreads()
#else
#define __debugsync()
#endif
#define TN 256
#define WARP_COUNT (TN / 32)
#define HWARP_COUNT (TN / 16)
// Window size
#define WS 20
struct WrapClamp
{
int operator()(int i, int h)
{
i = min(max(i, 0), h-1);
}
};
struct WrapRepeat
{
int operator()(int i, int h)
{
i = abs(i) % h; // :( Non power of two!
}
};
struct WrapMirror
{
int operator()(int i, int h)
{
i = abs(i);
while (i >= h) i = 2 * w - i - 2;
}
};
// Vertical convolution filter that processes vertical strips.
__global__ void convolveStrip(float * d_channel, float * d_kernel, int width, int height)
{
__shared__ float s_kernel[32 * WS];
// Preload kernel in shared memory.
for (int i = 0; i < 32 * WS / TN; i++)
{
int idx = i * TN + tid;
if (idx < 32 * WS) s_kernel[idx] = d_kernel[idx];
}
__shared__ float s_strip[32 * WS]; // TN/32
int wid = tid / 32 - WS/2;
Mirror wrap;
int row = wrap(wid);
// Preload image block.
for (int i = 0; i < 32 * WS / TN; i++)
{
}
// @@ Apply kernel to TN/32 rows.
// @@ Load
}
__constant__ float inputGamma, outputInverseGamma;
__constant__ float kernel[MAX_KERNEL_WIDTH];
// Use texture to access input?
// That's the most simple approach.
texture<> image;
////////////////////////////////////////////////////////////////////////////////
// Combined convolution filter
////////////////////////////////////////////////////////////////////////////////
__global__ void convolve(float4 * output)
{
// @@ Use morton order to assing threads.
int x = threadIdx.x;
int y = threadIdx.y;
float4 color = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
// texture coordinate.
int2 t;
t.x = 2 * (blockIdx.x * TW + x) - HW;
t.y = blockIdx.y * TH + y;
// @@ We might want to loop and process strips, to reuse the results of the horizontal convolutions.
// Horizontal convolution. @@ Unroll loops.
for (int e = HW; e > 0; e--)
{
t.x++;
float w = kernel[e-1];
color += w * tex2D(image, tc);
}
for (int e = 0; e < HW; e++)
{
t.x++;
float w = kernel[e];
color += w * tex2D(image, tc);
}
// Write color to shared memory.
__shared__ float tile[4 * THREAD_COUNT];
int tileIdx = y * TW + x;
tile[tileIdx + 0 * THREAD_COUNT] = color.x;
tile[tileIdx + 1 * THREAD_COUNT] = color.y;
tile[tileIdx + 2 * THREAD_COUNT] = color.z;
tile[tileIdx + 3 * THREAD_COUNT] = color.w;
__syncthreads();
// tile coordinate.
t.x = x;
t.y = y - HW;
// Vertical convolution. @@ Unroll loops.
for (int i = HW; i > 0; i--)
{
float w = kernel[i-1];
t.y++;
int idx = t.y * TW + t.x;
color.x += w * tile[idx + 0 * THREAD_COUNT];
color.y += w * tile[idx + 1 * THREAD_COUNT];
color.z += w * tile[idx + 2 * THREAD_COUNT];
color.w += w * tile[idx + 3 * THREAD_COUNT];
}
for (int i = 0; i < HW; i++)
{
float w = kernel[i];
t.y++;
int idx = t.y * TW + t.x;
color.x += w * tile[idx + 0 * THREAD_COUNT];
color.y += w * tile[idx + 1 * THREAD_COUNT];
color.z += w * tile[idx + 2 * THREAD_COUNT];
color.w += w * tile[idx + 3 * THREAD_COUNT];
}
it (x < w && y < h)
{
// @@ Prevent unaligned writes.
output[y * w + h] = color;
}
}
////////////////////////////////////////////////////////////////////////////////
// Monophase X convolution filter
////////////////////////////////////////////////////////////////////////////////
__device__ void convolveY()
{
}
////////////////////////////////////////////////////////////////////////////////
// Mipmap convolution filter
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Gamma correction
////////////////////////////////////////////////////////////////////////////////
/*
__device__ float toLinear(float f, float gamma = 2.2f)
{
return __pow(f, gamma);
}
__device__ float toGamma(float f, float gamma = 2.2f)
{
return pow(f, 1.0f / gamma);
}
*/
////////////////////////////////////////////////////////////////////////////////
// Setup kernel
////////////////////////////////////////////////////////////////////////////////
extern "C" void setupConvolveKernel(const float * k, int w)
{
w = min(w, MAX_KERNEL_WIDTH);
cudaMemcpyToSymbol(kernel, k, sizeof(float) * w, 0);
}
////////////////////////////////////////////////////////////////////////////////
// Launch kernel
////////////////////////////////////////////////////////////////////////////////