Add DXT1 compressor that uses texture to avoid CPU swizzling.
Fix errors under emulation. Experiment with DXT5 compressor.
This commit is contained in:
parent
1957120c26
commit
2f6e885ced
@ -21,10 +21,8 @@
|
|||||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
// OTHER DEALINGS IN THE SOFTWARE.
|
// OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
#include <float.h> // FLT_MAX
|
||||||
|
|
||||||
#include "CudaMath.h"
|
#include "CudaMath.h"
|
||||||
|
|
||||||
@ -53,65 +51,57 @@ __device__ inline void swap(T & a, T & b)
|
|||||||
__constant__ float3 kColorMetric = { 1.0f, 1.0f, 1.0f };
|
__constant__ float3 kColorMetric = { 1.0f, 1.0f, 1.0f };
|
||||||
__constant__ float3 kColorMetricSqr = { 1.0f, 1.0f, 1.0f };
|
__constant__ float3 kColorMetricSqr = { 1.0f, 1.0f, 1.0f };
|
||||||
|
|
||||||
|
// Some kernels read the input through texture.
|
||||||
|
texture<uchar4, 2, cudaReadModeNormalizedFloat> tex;
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Sort colors
|
// Sort colors
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
__device__ void sortColors(const float * values, int * cmp)
|
__device__ void sortColors(const float * values, int * ranks)
|
||||||
{
|
{
|
||||||
int tid = threadIdx.x;
|
#if __DEVICE_EMULATION__
|
||||||
|
if (threadIdx.x == 0)
|
||||||
|
{
|
||||||
|
for (int tid = 0; tid < 16; tid++)
|
||||||
|
{
|
||||||
|
int rank = 0;
|
||||||
|
for (int i = 0; i < 16; i++)
|
||||||
|
{
|
||||||
|
rank += (values[i] < values[tid]);
|
||||||
|
}
|
||||||
|
|
||||||
|
ranks[tid] = rank;
|
||||||
|
}
|
||||||
|
|
||||||
#if 1
|
// Resolve elements with the same index.
|
||||||
cmp[tid] = (values[0] < values[tid]);
|
for (int i = 0; i < 15; i++)
|
||||||
cmp[tid] += (values[1] < values[tid]);
|
{
|
||||||
cmp[tid] += (values[2] < values[tid]);
|
for (int tid = 0; tid < 16; tid++)
|
||||||
cmp[tid] += (values[3] < values[tid]);
|
{
|
||||||
cmp[tid] += (values[4] < values[tid]);
|
if (tid > i && ranks[tid] == ranks[i]) ++ranks[tid];
|
||||||
cmp[tid] += (values[5] < values[tid]);
|
}
|
||||||
cmp[tid] += (values[6] < values[tid]);
|
}
|
||||||
cmp[tid] += (values[7] < values[tid]);
|
}
|
||||||
cmp[tid] += (values[8] < values[tid]);
|
|
||||||
cmp[tid] += (values[9] < values[tid]);
|
|
||||||
cmp[tid] += (values[10] < values[tid]);
|
|
||||||
cmp[tid] += (values[11] < values[tid]);
|
|
||||||
cmp[tid] += (values[12] < values[tid]);
|
|
||||||
cmp[tid] += (values[13] < values[tid]);
|
|
||||||
cmp[tid] += (values[14] < values[tid]);
|
|
||||||
cmp[tid] += (values[15] < values[tid]);
|
|
||||||
|
|
||||||
// Resolve elements with the same index.
|
|
||||||
if (tid > 0 && cmp[tid] == cmp[0]) ++cmp[tid];
|
|
||||||
if (tid > 1 && cmp[tid] == cmp[1]) ++cmp[tid];
|
|
||||||
if (tid > 2 && cmp[tid] == cmp[2]) ++cmp[tid];
|
|
||||||
if (tid > 3 && cmp[tid] == cmp[3]) ++cmp[tid];
|
|
||||||
if (tid > 4 && cmp[tid] == cmp[4]) ++cmp[tid];
|
|
||||||
if (tid > 5 && cmp[tid] == cmp[5]) ++cmp[tid];
|
|
||||||
if (tid > 6 && cmp[tid] == cmp[6]) ++cmp[tid];
|
|
||||||
if (tid > 7 && cmp[tid] == cmp[7]) ++cmp[tid];
|
|
||||||
if (tid > 8 && cmp[tid] == cmp[8]) ++cmp[tid];
|
|
||||||
if (tid > 9 && cmp[tid] == cmp[9]) ++cmp[tid];
|
|
||||||
if (tid > 10 && cmp[tid] == cmp[10]) ++cmp[tid];
|
|
||||||
if (tid > 11 && cmp[tid] == cmp[11]) ++cmp[tid];
|
|
||||||
if (tid > 12 && cmp[tid] == cmp[12]) ++cmp[tid];
|
|
||||||
if (tid > 13 && cmp[tid] == cmp[13]) ++cmp[tid];
|
|
||||||
if (tid > 14 && cmp[tid] == cmp[14]) ++cmp[tid];
|
|
||||||
#else
|
#else
|
||||||
|
const int tid = threadIdx.x;
|
||||||
|
|
||||||
cmp[tid] = 0;
|
int rank = 0;
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < 16; i++)
|
for (int i = 0; i < 16; i++)
|
||||||
{
|
{
|
||||||
cmp[tid] += (values[i] < values[tid]);
|
rank += (values[i] < values[tid]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ranks[tid] = rank;
|
||||||
|
|
||||||
// Resolve elements with the same index.
|
// Resolve elements with the same index.
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < 15; i++)
|
for (int i = 0; i < 15; i++)
|
||||||
{
|
{
|
||||||
if (tid > 0 && cmp[tid] == cmp[i]) ++cmp[tid];
|
if (tid > i && ranks[tid] == ranks[i]) ++ranks[tid];
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -136,9 +126,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
|
|||||||
colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
|
colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
|
||||||
|
|
||||||
// No need to synchronize, 16 < warp size.
|
// No need to synchronize, 16 < warp size.
|
||||||
#if __DEVICE_EMULATION__
|
__debugsync();
|
||||||
} __debugsync(); if (idx < 16) {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Sort colors along the best fit line.
|
// Sort colors along the best fit line.
|
||||||
colorSums(colors, sums);
|
colorSums(colors, sums);
|
||||||
@ -148,17 +136,74 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
|
|||||||
|
|
||||||
dps[idx] = dot(colors[idx], axis);
|
dps[idx] = dot(colors[idx], axis);
|
||||||
|
|
||||||
#if __DEVICE_EMULATION__
|
__debugsync();
|
||||||
} __debugsync(); if (idx < 16) {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
sortColors(dps, xrefs);
|
sortColors(dps, xrefs);
|
||||||
|
|
||||||
float3 tmp = colors[idx];
|
float3 tmp = colors[idx];
|
||||||
|
__debugsync();
|
||||||
colors[xrefs[idx]] = tmp;
|
colors[xrefs[idx]] = tmp;
|
||||||
}
|
}
|
||||||
|
#if __DEVICE_EMULATION__
|
||||||
|
else
|
||||||
|
{
|
||||||
|
__debugsync();
|
||||||
|
__debugsync();
|
||||||
|
__debugsync();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
|
||||||
|
{
|
||||||
|
const int bid = blockIdx.x;
|
||||||
|
const int idx = threadIdx.x;
|
||||||
|
|
||||||
|
__shared__ float dps[16];
|
||||||
|
|
||||||
|
if (idx < 16)
|
||||||
|
{
|
||||||
|
float x = 4 * ((bn + bid) % w) + idx % 4;
|
||||||
|
float y = 4 * ((bn + bid) / w) + idx / 4;
|
||||||
|
|
||||||
|
// Read color and copy to shared mem.
|
||||||
|
float4 c = tex2D(tex, x, y);
|
||||||
|
|
||||||
|
colors[idx].x = c.z;
|
||||||
|
colors[idx].y = c.y;
|
||||||
|
colors[idx].z = c.x;
|
||||||
|
|
||||||
|
// No need to synchronize, 16 < warp size.
|
||||||
|
__debugsync();
|
||||||
|
|
||||||
|
// Sort colors along the best fit line.
|
||||||
|
colorSums(colors, sums);
|
||||||
|
float3 axis = bestFitLine(colors, sums[0], kColorMetric);
|
||||||
|
|
||||||
|
*sameColor = (axis == make_float3(0, 0, 0));
|
||||||
|
|
||||||
|
dps[idx] = dot(colors[idx], axis);
|
||||||
|
|
||||||
|
__debugsync();
|
||||||
|
|
||||||
|
sortColors(dps, xrefs);
|
||||||
|
|
||||||
|
float3 tmp = colors[idx];
|
||||||
|
__debugsync();
|
||||||
|
colors[xrefs[idx]] = tmp;
|
||||||
|
}
|
||||||
|
#if __DEVICE_EMULATION__
|
||||||
|
else
|
||||||
|
{
|
||||||
|
__debugsync();
|
||||||
|
__debugsync();
|
||||||
|
__debugsync();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
|
__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
|
||||||
{
|
{
|
||||||
const int bid = blockIdx.x;
|
const int bid = blockIdx.x;
|
||||||
@ -179,11 +224,8 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
|
|||||||
|
|
||||||
colors[idx] = rawColors[idx] * weights[idx];
|
colors[idx] = rawColors[idx] * weights[idx];
|
||||||
|
|
||||||
|
|
||||||
// No need to synchronize, 16 < warp size.
|
// No need to synchronize, 16 < warp size.
|
||||||
#if __DEVICE_EMULATION__
|
__debugsync();
|
||||||
} __debugsync(); if (idx < 16) {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Sort colors along the best fit line.
|
// Sort colors along the best fit line.
|
||||||
colorSums(colors, sums);
|
colorSums(colors, sums);
|
||||||
@ -193,18 +235,24 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
|
|||||||
|
|
||||||
dps[idx] = dot(rawColors[idx], axis);
|
dps[idx] = dot(rawColors[idx], axis);
|
||||||
|
|
||||||
#if __DEVICE_EMULATION__
|
__debugsync();
|
||||||
} __debugsync(); if (idx < 16) {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
sortColors(dps, xrefs);
|
sortColors(dps, xrefs);
|
||||||
|
|
||||||
float3 tmp = colors[idx];
|
float3 tmp = colors[idx];
|
||||||
colors[xrefs[idx]] = tmp;
|
|
||||||
|
|
||||||
float w = weights[idx];
|
float w = weights[idx];
|
||||||
|
__debugsync();
|
||||||
|
colors[xrefs[idx]] = tmp;
|
||||||
weights[xrefs[idx]] = w;
|
weights[xrefs[idx]] = w;
|
||||||
}
|
}
|
||||||
|
#if __DEVICE_EMULATION__
|
||||||
|
else
|
||||||
|
{
|
||||||
|
__debugsync();
|
||||||
|
__debugsync();
|
||||||
|
__debugsync();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16], int * sameColor)
|
__device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16], int * sameColor)
|
||||||
@ -223,9 +271,7 @@ __device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sum
|
|||||||
colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
|
colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
|
||||||
|
|
||||||
// No need to synchronize, 16 < warp size.
|
// No need to synchronize, 16 < warp size.
|
||||||
#if __DEVICE_EMULATION__
|
__debugsync();
|
||||||
} __debugsync(); if (idx < 16) {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Sort colors along the best fit line.
|
// Sort colors along the best fit line.
|
||||||
colorSums(colors, sums);
|
colorSums(colors, sums);
|
||||||
@ -235,15 +281,22 @@ __device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sum
|
|||||||
|
|
||||||
dps[idx] = dot(colors[idx], axis);
|
dps[idx] = dot(colors[idx], axis);
|
||||||
|
|
||||||
#if __DEVICE_EMULATION__
|
__debugsync();
|
||||||
} __debugsync(); if (idx < 16) {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
sortColors(dps, xrefs);
|
sortColors(dps, xrefs);
|
||||||
|
|
||||||
float2 tmp = colors[idx];
|
float2 tmp = colors[idx];
|
||||||
|
__debugsync();
|
||||||
colors[xrefs[idx]] = tmp;
|
colors[xrefs[idx]] = tmp;
|
||||||
}
|
}
|
||||||
|
#if __DEVICE_EMULATION__
|
||||||
|
else
|
||||||
|
{
|
||||||
|
__debugsync();
|
||||||
|
__debugsync();
|
||||||
|
__debugsync();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -951,7 +1004,6 @@ __device__ int findMinError(float * errors)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
for(int d = NUM_THREADS/2; d > 32; d >>= 1)
|
for(int d = NUM_THREADS/2; d > 32; d >>= 1)
|
||||||
{
|
{
|
||||||
@ -1135,6 +1187,41 @@ __global__ void compressDXT1(const uint * permutations, const uint * image, uint
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__ void compressDXT1_Tex(uint bn, uint w, const uint * permutations, uint2 * result)
|
||||||
|
{
|
||||||
|
__shared__ float3 colors[16];
|
||||||
|
__shared__ float3 sums[16];
|
||||||
|
__shared__ int xrefs[16];
|
||||||
|
__shared__ int sameColor;
|
||||||
|
|
||||||
|
loadColorBlockTex(bn, w, colors, sums, xrefs, &sameColor);
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
if (sameColor)
|
||||||
|
{
|
||||||
|
if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
ushort bestStart, bestEnd;
|
||||||
|
uint bestPermutation;
|
||||||
|
|
||||||
|
__shared__ float errors[NUM_THREADS];
|
||||||
|
|
||||||
|
evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
|
||||||
|
|
||||||
|
// Use a parallel reduction to find minimum error.
|
||||||
|
const int minIdx = findMinError(errors);
|
||||||
|
|
||||||
|
// Only write the result of the winner thread.
|
||||||
|
if (threadIdx.x == minIdx)
|
||||||
|
{
|
||||||
|
saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
__global__ void compressLevel4DXT1(const uint * permutations, const uint * image, uint2 * result)
|
__global__ void compressLevel4DXT1(const uint * permutations, const uint * image, uint2 * result)
|
||||||
{
|
{
|
||||||
__shared__ float3 colors[16];
|
__shared__ float3 colors[16];
|
||||||
@ -1452,6 +1539,125 @@ __global__ void compressDXT5(const uint * permutations, const uint * image, uint
|
|||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/*__device__ void evaluatePalette(uint alpha0, uint alpha1, uint alphas[8])
|
||||||
|
{
|
||||||
|
alpha[0] = alpha0;
|
||||||
|
alpha[1] = alpha1;
|
||||||
|
alpha[2] = (6 * alpha[0] + 1 * alpha[1]) / 7; // bit code 010
|
||||||
|
alpha[3] = (5 * alpha[0] + 2 * alpha[1]) / 7; // bit code 011
|
||||||
|
alpha[4] = (4 * alpha[0] + 3 * alpha[1]) / 7; // bit code 100
|
||||||
|
alpha[5] = (3 * alpha[0] + 4 * alpha[1]) / 7; // bit code 101
|
||||||
|
alpha[6] = (2 * alpha[0] + 5 * alpha[1]) / 7; // bit code 110
|
||||||
|
alpha[7] = (1 * alpha[0] + 6 * alpha[1]) / 7; // bit code 111
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ uint computeAlphaError(const uint block[16], uint alpha0, uint alpha1, int bestError = INT_MAX)
|
||||||
|
{
|
||||||
|
uint8 alphas[8];
|
||||||
|
evaluatePalette(alpha0, alpha1, alphas);
|
||||||
|
|
||||||
|
int totalError = 0;
|
||||||
|
|
||||||
|
for (uint i = 0; i < 16; i++)
|
||||||
|
{
|
||||||
|
uint8 alpha = block[i];
|
||||||
|
|
||||||
|
// @@ It should be possible to do this much faster.
|
||||||
|
|
||||||
|
int minDist = INT_MAX;
|
||||||
|
for (uint p = 0; p < 8; p++)
|
||||||
|
{
|
||||||
|
int dist = alphaDistance(alpha, alphas[p]);
|
||||||
|
minDist = min(dist, minDist);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
totalError += minDist;
|
||||||
|
|
||||||
|
if (totalError > bestError)
|
||||||
|
{
|
||||||
|
// early out
|
||||||
|
return totalError;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return totalError;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void compressDXT5A(uint alpha[16])
|
||||||
|
{
|
||||||
|
// Get min/max alpha.
|
||||||
|
for (uint i = 0; i < 16; i++)
|
||||||
|
{
|
||||||
|
mina = min(mina, alpha[i]);
|
||||||
|
maxa = max(maxa, alpha[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
dxtBlock->alpha0 = maxa;
|
||||||
|
dxtBlock->alpha1 = mina;
|
||||||
|
|
||||||
|
if (maxa - mina > 8)
|
||||||
|
{
|
||||||
|
int besterror = computeAlphaError(rgba, dxtBlock);
|
||||||
|
int besta0 = maxa;
|
||||||
|
int besta1 = mina;
|
||||||
|
|
||||||
|
// Expand search space a bit.
|
||||||
|
const int alphaExpand = 8;
|
||||||
|
mina = (mina <= alphaExpand) ? 0 : mina - alphaExpand;
|
||||||
|
maxa = (maxa <= 255-alphaExpand) ? 255 : maxa + alphaExpand;
|
||||||
|
|
||||||
|
for (int a0 = mina+9; a0 < maxa; a0++)
|
||||||
|
{
|
||||||
|
for (int a1 = mina; a1 < a0-8; a1++)
|
||||||
|
{
|
||||||
|
nvDebugCheck(a0 - a1 > 8);
|
||||||
|
|
||||||
|
dxtBlock->alpha0 = a0;
|
||||||
|
dxtBlock->alpha1 = a1;
|
||||||
|
int error = computeAlphaError(rgba, dxtBlock, besterror);
|
||||||
|
|
||||||
|
if (error < besterror)
|
||||||
|
{
|
||||||
|
besterror = error;
|
||||||
|
besta0 = a0;
|
||||||
|
besta1 = a1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dxtBlock->alpha0 = besta0;
|
||||||
|
dxtBlock->alpha1 = besta1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void compressDXT5n(uint blockNum, uint2 * d_result)
|
||||||
|
{
|
||||||
|
uint idx = blockIdx.x * 128 + threadIdx.x;
|
||||||
|
|
||||||
|
if (idx >= blockNum)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// @@ Ideally we would load the data to shared mem to achieve coalesced global mem access.
|
||||||
|
// @@ Blocks would require too much shared memory (8k) and limit occupancy.
|
||||||
|
|
||||||
|
// @@ Ideally we should use SIMD processing, multiple threads (4-8) processing the same block.
|
||||||
|
// That simplifies coalescing, and reduces divergence.
|
||||||
|
|
||||||
|
// @@ Experiment with texture. That's probably the most simple approach.
|
||||||
|
|
||||||
|
uint x[16];
|
||||||
|
uint y[16];
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Setup kernel
|
// Setup kernel
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -1479,6 +1685,20 @@ extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result
|
|||||||
compressDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
|
compressDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern "C" void compressKernelDXT1_Tex(uint bn, uint blockNum, uint w, cudaArray * d_data, uint * d_result, uint * d_bitmaps)
|
||||||
|
{
|
||||||
|
// Setup texture
|
||||||
|
tex.normalized = false;
|
||||||
|
tex.filterMode = cudaFilterModePoint;
|
||||||
|
tex.addressMode[0] = cudaAddressModeClamp;
|
||||||
|
tex.addressMode[1] = cudaAddressModeClamp;
|
||||||
|
|
||||||
|
cudaBindTextureToArray(tex, d_data);
|
||||||
|
|
||||||
|
compressDXT1_Tex<<<blockNum, NUM_THREADS>>>(bn, w, d_bitmaps, (uint2 *)d_result);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
|
extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
|
||||||
{
|
{
|
||||||
compressLevel4DXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
|
compressLevel4DXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
|
||||||
@ -1498,3 +1718,16 @@ extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result
|
|||||||
{
|
{
|
||||||
compressCTX1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
|
compressCTX1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern "C" void compressKernelDXT5n(uint blockNum, cudaArray * d_data, uint * d_result)
|
||||||
|
{
|
||||||
|
// Setup texture
|
||||||
|
tex.normalized = false;
|
||||||
|
tex.filterMode = cudaFilterModePoint;
|
||||||
|
tex.addressMode[0] = cudaAddressModeClamp;
|
||||||
|
tex.addressMode[1] = cudaAddressModeClamp;
|
||||||
|
|
||||||
|
cudaBindTextureToArray(tex, d_data);
|
||||||
|
|
||||||
|
// compressDXT5n<<<blockNum/128, 128>>>(blockNum, (uint2 *)d_result);
|
||||||
|
}
|
||||||
|
@ -53,6 +53,7 @@ using namespace nvtt;
|
|||||||
|
|
||||||
extern "C" void setupCompressKernel(const float weights[3]);
|
extern "C" void setupCompressKernel(const float weights[3]);
|
||||||
extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
|
extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
|
||||||
|
extern "C" void compressKernelDXT1_Tex(uint bn, uint blockNum, uint w, cudaArray * d_data, uint * d_result, uint * d_bitmaps);
|
||||||
extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
|
extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
|
||||||
extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
|
extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
|
||||||
extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
|
extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
|
||||||
@ -286,7 +287,7 @@ void CudaCompressor::compressDXT1(const CompressionOptions::Private & compressio
|
|||||||
}
|
}
|
||||||
|
|
||||||
clock_t end = clock();
|
clock_t end = clock();
|
||||||
//printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
|
printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
|
||||||
|
|
||||||
free(blockLinearImage);
|
free(blockLinearImage);
|
||||||
|
|
||||||
@ -298,6 +299,77 @@ void CudaCompressor::compressDXT1(const CompressionOptions::Private & compressio
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void CudaCompressor::compressDXT1_Tex(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
|
||||||
|
{
|
||||||
|
nvDebugCheck(cuda::isHardwarePresent());
|
||||||
|
#if defined HAVE_CUDA
|
||||||
|
|
||||||
|
// Allocate image as a cuda array.
|
||||||
|
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
|
||||||
|
|
||||||
|
cudaArray * d_image;
|
||||||
|
const int imageSize = m_image->width() * m_image->height() * sizeof(uint);
|
||||||
|
cudaMallocArray(&d_image, &channelDesc, m_image->width(), m_image->height());
|
||||||
|
cudaMemcpyToArray(d_image, 0, 0, m_image->pixels(), imageSize, cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
|
// Image size in blocks.
|
||||||
|
const uint w = (m_image->width() + 3) / 4;
|
||||||
|
const uint h = (m_image->height() + 3) / 4;
|
||||||
|
const uint blockNum = w * h;
|
||||||
|
const uint compressedSize = blockNum * 8;
|
||||||
|
|
||||||
|
void * h_result = malloc(MAX_BLOCKS * 8);
|
||||||
|
|
||||||
|
clock_t start = clock();
|
||||||
|
|
||||||
|
setupCompressKernel(compressionOptions.colorWeight.ptr());
|
||||||
|
|
||||||
|
uint bn = 0;
|
||||||
|
while(bn != blockNum)
|
||||||
|
{
|
||||||
|
uint count = min(blockNum - bn, MAX_BLOCKS);
|
||||||
|
|
||||||
|
// Launch kernel.
|
||||||
|
compressKernelDXT1_Tex(bn, count, w, d_image, m_result, m_bitmapTable);
|
||||||
|
|
||||||
|
// Check for errors.
|
||||||
|
cudaError_t err = cudaGetLastError();
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
|
||||||
|
|
||||||
|
if (outputOptions.errorHandler != NULL)
|
||||||
|
{
|
||||||
|
outputOptions.errorHandler->error(Error_CudaError);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copy result to host, overwrite swizzled image.
|
||||||
|
cudaMemcpy(h_result, m_result, count * 8, cudaMemcpyDeviceToHost);
|
||||||
|
|
||||||
|
// Output result.
|
||||||
|
if (outputOptions.outputHandler != NULL)
|
||||||
|
{
|
||||||
|
outputOptions.outputHandler->writeData(h_result, count * 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
bn += count;
|
||||||
|
}
|
||||||
|
|
||||||
|
clock_t end = clock();
|
||||||
|
printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
|
||||||
|
|
||||||
|
free(h_result);
|
||||||
|
|
||||||
|
#else
|
||||||
|
if (outputOptions.errorHandler != NULL)
|
||||||
|
{
|
||||||
|
outputOptions.errorHandler->error(Error_CudaError);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// Compress image using CUDA.
|
/// Compress image using CUDA.
|
||||||
void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
|
void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
|
||||||
@ -618,3 +690,20 @@ void CudaCompressor::compressCTX1(const nvtt::CompressionOptions::Private & comp
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void CudaCompressor::compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
|
||||||
|
{
|
||||||
|
nvDebugCheck(cuda::isHardwarePresent());
|
||||||
|
#if defined HAVE_CUDA
|
||||||
|
|
||||||
|
// @@ TODO
|
||||||
|
|
||||||
|
#else
|
||||||
|
if (outputOptions.errorHandler != NULL)
|
||||||
|
{
|
||||||
|
outputOptions.errorHandler->error(Error_CudaError);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -42,10 +42,12 @@ namespace nv
|
|||||||
void setImage(const Image * image, nvtt::AlphaMode alphaMode);
|
void setImage(const Image * image, nvtt::AlphaMode alphaMode);
|
||||||
|
|
||||||
void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
|
void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
|
||||||
|
void compressDXT1_Tex(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
|
||||||
void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
|
void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
|
||||||
void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
|
void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
|
||||||
void compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
|
void compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
|
||||||
void compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
|
void compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
|
||||||
|
void compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
|
@ -26,7 +26,6 @@
|
|||||||
#ifndef CUDAMATH_H
|
#ifndef CUDAMATH_H
|
||||||
#define CUDAMATH_H
|
#define CUDAMATH_H
|
||||||
|
|
||||||
#include <float.h>
|
|
||||||
|
|
||||||
|
|
||||||
inline __device__ __host__ float3 operator *(float3 a, float3 b)
|
inline __device__ __host__ float3 operator *(float3 a, float3 b)
|
||||||
@ -211,7 +210,7 @@ inline __device__ bool singleColor(const float3 * colors)
|
|||||||
bool sameColor = false;
|
bool sameColor = false;
|
||||||
for (int i = 0; i < 16; i++)
|
for (int i = 0; i < 16; i++)
|
||||||
{
|
{
|
||||||
sameColor &= (colors[idx] == colors[0]);
|
sameColor &= (colors[i] == colors[0]);
|
||||||
}
|
}
|
||||||
return sameColor;
|
return sameColor;
|
||||||
#else
|
#else
|
||||||
@ -232,16 +231,16 @@ inline __device__ bool singleColor(const float3 * colors)
|
|||||||
inline __device__ void colorSums(const float3 * colors, float3 * sums)
|
inline __device__ void colorSums(const float3 * colors, float3 * sums)
|
||||||
{
|
{
|
||||||
#if __DEVICE_EMULATION__
|
#if __DEVICE_EMULATION__
|
||||||
float3 color_sum = make_float3(0.0f, 0.0f, 0.0f);
|
float3 color_sum = make_float3(0.0f, 0.0f, 0.0f);
|
||||||
for (int i = 0; i < 16; i++)
|
for (int i = 0; i < 16; i++)
|
||||||
{
|
{
|
||||||
color_sum += colors[i];
|
color_sum += colors[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < 16; i++)
|
for (int i = 0; i < 16; i++)
|
||||||
{
|
{
|
||||||
sums[i] = color_sum;
|
sums[i] = color_sum;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
|
||||||
const int idx = threadIdx.x;
|
const int idx = threadIdx.x;
|
||||||
@ -327,7 +326,7 @@ inline __device__ __host__ float2 firstEigenVector2D( float matrix[3] )
|
|||||||
inline __device__ void colorSums(const float2 * colors, float2 * sums)
|
inline __device__ void colorSums(const float2 * colors, float2 * sums)
|
||||||
{
|
{
|
||||||
#if __DEVICE_EMULATION__
|
#if __DEVICE_EMULATION__
|
||||||
float2 color_sum = make_float2(0.0f, 0.0f, 0.0f);
|
float2 color_sum = make_float2(0.0f, 0.0f);
|
||||||
for (int i = 0; i < 16; i++)
|
for (int i = 0; i < 16; i++)
|
||||||
{
|
{
|
||||||
color_sum += colors[i];
|
color_sum += colors[i];
|
||||||
@ -360,7 +359,7 @@ inline __device__ float2 bestFitLine(const float2 * colors, float2 color_sum)
|
|||||||
float2 a = (colors[i] - color_sum * (1.0f / 16.0f));
|
float2 a = (colors[i] - color_sum * (1.0f / 16.0f));
|
||||||
covariance[0] += a.x * a.x;
|
covariance[0] += a.x * a.x;
|
||||||
covariance[1] += a.x * a.y;
|
covariance[1] += a.x * a.y;
|
||||||
covariance[3] += a.y * a.y;
|
covariance[2] += a.y * a.y;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user