Add DXT1n and CTX1 CUDA compressors.
This commit is contained in:
parent
c7fcc3ef4b
commit
5dbfb20b60
@ -56,7 +56,7 @@ namespace
|
||||
|
||||
static int blockSize(Format format)
|
||||
{
|
||||
if (format == Format_DXT1 || format == Format_DXT1a) {
|
||||
if (format == Format_DXT1 || format == Format_DXT1a || format == Format_DXT1n) {
|
||||
return 8;
|
||||
}
|
||||
else if (format == Format_DXT3) {
|
||||
@ -71,6 +71,9 @@ namespace
|
||||
else if (format == Format_BC5) {
|
||||
return 16;
|
||||
}
|
||||
else if (format == Format_CTX1) {
|
||||
return 8;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -333,7 +336,7 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
|
||||
{
|
||||
header.setLinearSize(computeImageSize(inputOptions.targetWidth, inputOptions.targetHeight, inputOptions.targetDepth, compressionOptions.bitcount, compressionOptions.format));
|
||||
|
||||
if (compressionOptions.format == Format_DXT1 || compressionOptions.format == Format_DXT1a) {
|
||||
if (compressionOptions.format == Format_DXT1 || compressionOptions.format == Format_DXT1a || compressionOptions.format == Format_DXT1n) {
|
||||
header.setFourCC('D', 'X', 'T', '1');
|
||||
if (inputOptions.isNormalMap) header.setNormalFlag(true);
|
||||
}
|
||||
@ -354,6 +357,10 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
|
||||
header.setFourCC('A', 'T', 'I', '2');
|
||||
if (inputOptions.isNormalMap) header.setNormalFlag(true);
|
||||
}
|
||||
else if (compressionOptions.format == Format_CTX1) {
|
||||
header.setFourCC('C', 'T', 'X', '1');
|
||||
if (inputOptions.isNormalMap) header.setNormalFlag(true);
|
||||
}
|
||||
}
|
||||
|
||||
// Swap bytes if necessary.
|
||||
@ -705,6 +712,18 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (compressionOptions.format == Format_DXT1n)
|
||||
{
|
||||
if (cudaEnabled)
|
||||
{
|
||||
nvDebugCheck(cudaSupported);
|
||||
cuda->compressDXT1n(image, outputOptions, compressionOptions);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature);
|
||||
}
|
||||
}
|
||||
else if (compressionOptions.format == Format_DXT3)
|
||||
{
|
||||
if (compressionOptions.quality == Quality_Fastest)
|
||||
@ -762,6 +781,18 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
|
||||
{
|
||||
compressBC5(image, outputOptions, compressionOptions);
|
||||
}
|
||||
else if (compressionOptions.format == Format_CTX1)
|
||||
{
|
||||
if (cudaEnabled)
|
||||
{
|
||||
nvDebugCheck(cudaSupported);
|
||||
cuda->compressCTX1(image, outputOptions, compressionOptions);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -122,7 +122,7 @@ static void doPrecomputation()
|
||||
*/
|
||||
|
||||
|
||||
const static uint bitmaps[992] =
|
||||
const static uint s_bitmapTable[992] =
|
||||
{
|
||||
0x80000000,
|
||||
0x40000000,
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -40,6 +40,8 @@ namespace nv
|
||||
void compressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
|
||||
void compressDXT3(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
|
||||
void compressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
|
||||
void compressDXT1n(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
|
||||
void compressCTX1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
|
||||
|
||||
private:
|
||||
|
||||
|
@ -1,221 +1,363 @@
|
||||
// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person
|
||||
// obtaining a copy of this software and associated documentation
|
||||
// files (the "Software"), to deal in the Software without
|
||||
// restriction, including without limitation the rights to use,
|
||||
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following
|
||||
// conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be
|
||||
// included in all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
// OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
// Math functions and operators to be used with vector types.
|
||||
|
||||
#ifndef CUDAMATH_H
|
||||
#define CUDAMATH_H
|
||||
|
||||
#include <float.h>
|
||||
|
||||
|
||||
// Component-wise arithmetic operators for the CUDA float3 vector type.

// Component-wise product of two vectors.
inline __device__ __host__ float3 operator *(float3 a, float3 b)
{
    float3 r;
    r.x = a.x * b.x;
    r.y = a.y * b.y;
    r.z = a.z * b.z;
    return r;
}

// Scale a vector by a scalar (scalar on the left).
inline __device__ __host__ float3 operator *(float f, float3 v)
{
    float3 r;
    r.x = v.x * f;
    r.y = v.y * f;
    r.z = v.z * f;
    return r;
}

// Scale a vector by a scalar (scalar on the right).
inline __device__ __host__ float3 operator *(float3 v, float f)
{
    float3 r;
    r.x = v.x * f;
    r.y = v.y * f;
    r.z = v.z * f;
    return r;
}

// Component-wise sum.
inline __device__ __host__ float3 operator +(float3 a, float3 b)
{
    float3 r;
    r.x = a.x + b.x;
    r.y = a.y + b.y;
    r.z = a.z + b.z;
    return r;
}

// In-place component-wise sum: b accumulates a.
inline __device__ __host__ void operator +=(float3 & b, float3 a)
{
    b.x += a.x; b.y += a.y; b.z += a.z;
}

// Component-wise difference.
inline __device__ __host__ float3 operator -(float3 a, float3 b)
{
    float3 r;
    r.x = a.x - b.x;
    r.y = a.y - b.y;
    r.z = a.z - b.z;
    return r;
}

// In-place component-wise difference: a is subtracted from b.
inline __device__ __host__ void operator -=(float3 & b, float3 a)
{
    b.x -= a.x; b.y -= a.y; b.z -= a.z;
}

// Divide a vector by a scalar. The reciprocal is computed once and then
// multiplied in, so no check for f == 0 is performed.
inline __device__ __host__ float3 operator /(float3 v, float f)
{
    const float reciprocal = 1.0f / f;
    return v * reciprocal;
}

// In-place division by a scalar, implemented as multiplication by 1/f.
inline __device__ __host__ void operator /=(float3 & b, float f)
{
    const float reciprocal = 1.0f / f;
    b.x *= reciprocal;
    b.y *= reciprocal;
    b.z *= reciprocal;
}
|
||||
|
||||
|
||||
// Dot product of two 3-component vectors.
inline __device__ __host__ float dot(float3 a, float3 b)
{
    const float result = a.x * b.x + a.y * b.y + a.z * b.z;
    return result;
}

// Dot product of two 4-component vectors.
inline __device__ __host__ float dot(float4 a, float4 b)
{
    const float result = a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
    return result;
}

// Restrict f to [a, b]: the upper bound is applied first, then the lower one.
inline __device__ __host__ float clamp(float f, float a, float b)
{
    const float capped = min(f, b);
    return max(a, capped);
}

// Clamp every component of v to the same scalar range [a, b].
inline __device__ __host__ float3 clamp(float3 v, float a, float b)
{
    float3 r;
    r.x = clamp(v.x, a, b);
    r.y = clamp(v.y, a, b);
    r.z = clamp(v.z, a, b);
    return r;
}

// Clamp every component of v to the matching components of a (low) and b (high).
inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
{
    float3 r;
    r.x = clamp(v.x, a.x, b.x);
    r.y = clamp(v.y, a.y, b.y);
    r.z = clamp(v.z, a.z, b.z);
    return r;
}


// Scale v by the reciprocal of its Euclidean length.
// No guard is applied for a zero-length input.
inline __device__ __host__ float3 normalize(float3 v)
{
    const float invLength = 1.0f / sqrtf(dot(v, v));
    float3 unit;
    unit.x = v.x * invLength;
    unit.y = v.y * invLength;
    unit.z = v.z * invLength;
    return unit;
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Use power method to find the first eigenvector.
|
||||
// http://www.miislita.com/information-retrieval-tutorial/matrix-tutorial-3-eigenvalues-eigenvectors.html
|
||||
inline __device__ __host__ float3 firstEigenVector( float matrix[6] )
|
||||
{
|
||||
// 8 iterations seems to be more than enough.
|
||||
|
||||
float3 v = make_float3(1.0f, 1.0f, 1.0f);
|
||||
for(int i = 0; i < 8; i++) {
|
||||
float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
|
||||
float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
|
||||
float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
|
||||
float m = max(max(x, y), z);
|
||||
float iv = 1.0f / m;
|
||||
#if __DEVICE_EMULATION__
|
||||
if (m == 0.0f) iv = 0.0f;
|
||||
#endif
|
||||
v = make_float3(x*iv, y*iv, z*iv);
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
inline __device__ void colorSums(const float3 * colors, float3 * sums)
|
||||
{
|
||||
#if __DEVICE_EMULATION__
|
||||
float3 color_sum = make_float3(0.0f, 0.0f, 0.0f);
|
||||
for (int i = 0; i < 16; i++)
|
||||
{
|
||||
color_sum += colors[i];
|
||||
}
|
||||
|
||||
for (int i = 0; i < 16; i++)
|
||||
{
|
||||
sums[i] = color_sum;
|
||||
}
|
||||
#else
|
||||
|
||||
const int idx = threadIdx.x;
|
||||
|
||||
// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person
|
||||
// obtaining a copy of this software and associated documentation
|
||||
// files (the "Software"), to deal in the Software without
|
||||
// restriction, including without limitation the rights to use,
|
||||
// copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following
|
||||
// conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be
|
||||
// included in all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
// OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
// Math functions and operators to be used with vector types.
|
||||
|
||||
#ifndef CUDAMATH_H
|
||||
#define CUDAMATH_H
|
||||
|
||||
#include <float.h>
|
||||
|
||||
|
||||
// Component-wise arithmetic operators for the CUDA float3 vector type.

// Component-wise product of two vectors.
inline __device__ __host__ float3 operator *(float3 a, float3 b)
{
    float3 r;
    r.x = a.x * b.x;
    r.y = a.y * b.y;
    r.z = a.z * b.z;
    return r;
}

// Scale a vector by a scalar (scalar on the left).
inline __device__ __host__ float3 operator *(float f, float3 v)
{
    float3 r;
    r.x = v.x * f;
    r.y = v.y * f;
    r.z = v.z * f;
    return r;
}

// Scale a vector by a scalar (scalar on the right).
inline __device__ __host__ float3 operator *(float3 v, float f)
{
    float3 r;
    r.x = v.x * f;
    r.y = v.y * f;
    r.z = v.z * f;
    return r;
}

// Component-wise sum.
inline __device__ __host__ float3 operator +(float3 a, float3 b)
{
    float3 r;
    r.x = a.x + b.x;
    r.y = a.y + b.y;
    r.z = a.z + b.z;
    return r;
}

// In-place component-wise sum: b accumulates a.
inline __device__ __host__ void operator +=(float3 & b, float3 a)
{
    b.x += a.x; b.y += a.y; b.z += a.z;
}

// Component-wise difference.
inline __device__ __host__ float3 operator -(float3 a, float3 b)
{
    float3 r;
    r.x = a.x - b.x;
    r.y = a.y - b.y;
    r.z = a.z - b.z;
    return r;
}

// In-place component-wise difference: a is subtracted from b.
inline __device__ __host__ void operator -=(float3 & b, float3 a)
{
    b.x -= a.x; b.y -= a.y; b.z -= a.z;
}

// Divide a vector by a scalar. The reciprocal is computed once and then
// multiplied in, so no check for f == 0 is performed.
inline __device__ __host__ float3 operator /(float3 v, float f)
{
    const float reciprocal = 1.0f / f;
    return v * reciprocal;
}

// In-place division by a scalar, implemented as multiplication by 1/f.
inline __device__ __host__ void operator /=(float3 & b, float f)
{
    const float reciprocal = 1.0f / f;
    b.x *= reciprocal;
    b.y *= reciprocal;
    b.z *= reciprocal;
}
|
||||
|
||||
// float2 operators
|
||||
inline __device__ __host__ float2 operator *(float2 a, float2 b)
|
||||
{
|
||||
return make_float2(a.x*b.x, a.y*b.y);
|
||||
}
|
||||
|
||||
inline __device__ __host__ float2 operator *(float f, float2 v)
|
||||
{
|
||||
return make_float2(v.x*f, v.y*f);
|
||||
}
|
||||
|
||||
inline __device__ __host__ float2 operator *(float2 v, float f)
|
||||
{
|
||||
return make_float2(v.x*f, v.y*f);
|
||||
}
|
||||
|
||||
inline __device__ __host__ float2 operator +(float2 a, float2 b)
|
||||
{
|
||||
return make_float2(a.x+b.x, a.y+b.y);
|
||||
}
|
||||
|
||||
inline __device__ __host__ void operator +=(float2 & b, float2 a)
|
||||
{
|
||||
b.x += a.x;
|
||||
b.y += a.y;
|
||||
}
|
||||
|
||||
inline __device__ __host__ float2 operator -(float2 a, float2 b)
|
||||
{
|
||||
return make_float2(a.x-b.x, a.y-b.y);
|
||||
}
|
||||
|
||||
inline __device__ __host__ void operator -=(float2 & b, float2 a)
|
||||
{
|
||||
b.x -= a.x;
|
||||
b.y -= a.y;
|
||||
}
|
||||
|
||||
inline __device__ __host__ float2 operator /(float2 v, float f)
|
||||
{
|
||||
float inv = 1.0f / f;
|
||||
return v * inv;
|
||||
}
|
||||
|
||||
inline __device__ __host__ void operator /=(float2 & b, float f)
|
||||
{
|
||||
float inv = 1.0f / f;
|
||||
b.x *= inv;
|
||||
b.y *= inv;
|
||||
}
|
||||
|
||||
|
||||
// Dot product of two 2-component vectors.
inline __device__ __host__ float dot(float2 a, float2 b)
{
    const float result = a.x * b.x + a.y * b.y;
    return result;
}

// Dot product of two 3-component vectors.
inline __device__ __host__ float dot(float3 a, float3 b)
{
    const float result = a.x * b.x + a.y * b.y + a.z * b.z;
    return result;
}

// Dot product of two 4-component vectors.
inline __device__ __host__ float dot(float4 a, float4 b)
{
    const float result = a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
    return result;
}
|
||||
|
||||
// Restrict f to [a, b]: the upper bound is applied first, then the lower one.
inline __device__ __host__ float clamp(float f, float a, float b)
{
    const float capped = min(f, b);
    return max(a, capped);
}

// Clamp every component of v to the same scalar range [a, b].
inline __device__ __host__ float3 clamp(float3 v, float a, float b)
{
    float3 r;
    r.x = clamp(v.x, a, b);
    r.y = clamp(v.y, a, b);
    r.z = clamp(v.z, a, b);
    return r;
}

// Clamp every component of v to the matching components of a (low) and b (high).
inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
{
    float3 r;
    r.x = clamp(v.x, a.x, b.x);
    r.y = clamp(v.y, a.y, b.y);
    r.z = clamp(v.z, a.z, b.z);
    return r;
}
|
||||
|
||||
|
||||
// Scale v by the reciprocal of its Euclidean length.
// No guard is applied for a zero-length input.
inline __device__ __host__ float3 normalize(float3 v)
{
    const float invLength = 1.0f / sqrtf(dot(v, v));
    float3 unit;
    unit.x = v.x * invLength;
    unit.y = v.y * invLength;
    unit.z = v.z * invLength;
    return unit;
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Use power method to find the first eigenvector.
|
||||
// http://www.miislita.com/information-retrieval-tutorial/matrix-tutorial-3-eigenvalues-eigenvectors.html
|
||||
inline __device__ __host__ float3 firstEigenVector( float matrix[6] )
|
||||
{
|
||||
// 8 iterations seems to be more than enough.
|
||||
|
||||
float3 v = make_float3(1.0f, 1.0f, 1.0f);
|
||||
for(int i = 0; i < 8; i++) {
|
||||
float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
|
||||
float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
|
||||
float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
|
||||
float m = max(max(x, y), z);
|
||||
float iv = 1.0f / m;
|
||||
#if __DEVICE_EMULATION__
|
||||
if (m == 0.0f) iv = 0.0f;
|
||||
#endif
|
||||
v = make_float3(x*iv, y*iv, z*iv);
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
// Sum the 16 entries of `colors` and leave the total in every one of the
// 16 entries of `sums`.
//
// Emulation build: a plain serial accumulation followed by a broadcast.
// Device build: the thread with index threadIdx.x seeds its own slot and
// then folds in the slots at idx^8, idx^4, idx^2 and idx^1. No barrier is
// issued between the steps.
// NOTE(review): presumably this relies on the 16 participating threads
// executing in lockstep - confirm against the launch configuration.
inline __device__ void colorSums(const float3 * colors, float3 * sums)
{
#if __DEVICE_EMULATION__
    float3 total = make_float3(0.0f, 0.0f, 0.0f);
    for (int k = 0; k < 16; k++)
    {
        total += colors[k];
    }

    for (int k = 0; k < 16; k++)
    {
        sums[k] = total;
    }
#else

    const int tid = threadIdx.x;

    // Statement order is significant here; keep it as is.
    sums[tid] = colors[tid];
    sums[tid] += sums[tid^8];
    sums[tid] += sums[tid^4];
    sums[tid] += sums[tid^2];
    sums[tid] += sums[tid^1];

#endif
}
|
||||
|
||||
// Direction of the best-fit line through 16 colors.
//
// colors      - the 16 input colors.
// color_sum   - sum of those colors; multiplied by 1/16 to obtain the mean.
// colorMetric - per-channel weights applied to each centered color.
//
// Builds the six unique covariance terms (xx, xy, xz, yy, yz, zz) of the
// weighted, centered colors and returns firstEigenVector() of that matrix.
inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum, float3 colorMetric)
{
    // Compute covariance matrix of the given colors.
#if __DEVICE_EMULATION__
    float covariance[6] = {0, 0, 0, 0, 0, 0};
    for (int k = 0; k < 16; k++)
    {
        const float3 centered = (colors[k] - color_sum * (1.0f / 16.0f)) * colorMetric;
        covariance[0] += centered.x * centered.x;
        covariance[1] += centered.x * centered.y;
        covariance[2] += centered.x * centered.z;
        covariance[3] += centered.y * centered.y;
        covariance[4] += centered.y * centered.z;
        covariance[5] += centered.z * centered.z;
    }
#else

    const int tid = threadIdx.x;

    float3 delta = (colors[tid] - color_sum * (1.0f / 16.0f)) * colorMetric;

    // @@ Eliminate two-way bank conflicts here.
    // @@ It seems that doing that and unrolling the reduction doesn't help...
    __shared__ float covariance[16*6];

    // Each thread writes its own six products at offset 6 * tid.
    covariance[6 * tid + 0] = delta.x * delta.x;    // 0, 6, 12, 2, 8, 14, 4, 10, 0
    covariance[6 * tid + 1] = delta.x * delta.y;
    covariance[6 * tid + 2] = delta.x * delta.z;
    covariance[6 * tid + 3] = delta.y * delta.y;
    covariance[6 * tid + 4] = delta.y * delta.z;
    covariance[6 * tid + 5] = delta.z * delta.z;

    // Halving reduction: after the last step the totals sit in covariance[0..5].
    // No barrier is issued between steps.
    // NOTE(review): presumably relies on the 16 threads running in lockstep - confirm.
    for (int stride = 8; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            covariance[6 * tid + 0] += covariance[6 * (tid+stride) + 0];
            covariance[6 * tid + 1] += covariance[6 * (tid+stride) + 1];
            covariance[6 * tid + 2] += covariance[6 * (tid+stride) + 2];
            covariance[6 * tid + 3] += covariance[6 * (tid+stride) + 3];
            covariance[6 * tid + 4] += covariance[6 * (tid+stride) + 4];
            covariance[6 * tid + 5] += covariance[6 * (tid+stride) + 5];
        }
    }

#endif

    // Compute first eigen vector.
    return firstEigenVector(covariance);
}
|
||||
|
||||
|
||||
#endif // CUDAMATH_H
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
// Direction of the best-fit line through 16 colors.
//
// colors      - the 16 input colors.
// color_sum   - sum of those colors; multiplied by 1/16 to obtain the mean.
// colorMetric - per-channel weights applied to each centered color.
//
// Builds the six unique covariance terms (xx, xy, xz, yy, yz, zz) of the
// weighted, centered colors and returns firstEigenVector() of that matrix.
inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum, float3 colorMetric)
{
    // Compute covariance matrix of the given colors.
#if __DEVICE_EMULATION__
    float covariance[6] = {0, 0, 0, 0, 0, 0};
    for (int k = 0; k < 16; k++)
    {
        const float3 centered = (colors[k] - color_sum * (1.0f / 16.0f)) * colorMetric;
        covariance[0] += centered.x * centered.x;
        covariance[1] += centered.x * centered.y;
        covariance[2] += centered.x * centered.z;
        covariance[3] += centered.y * centered.y;
        covariance[4] += centered.y * centered.z;
        covariance[5] += centered.z * centered.z;
    }
#else

    const int tid = threadIdx.x;

    float3 delta = (colors[tid] - color_sum * (1.0f / 16.0f)) * colorMetric;

    // @@ Eliminate two-way bank conflicts here.
    // @@ It seems that doing that and unrolling the reduction doesn't help...
    __shared__ float covariance[16*6];

    // Each thread writes its own six products at offset 6 * tid.
    covariance[6 * tid + 0] = delta.x * delta.x;    // 0, 6, 12, 2, 8, 14, 4, 10, 0
    covariance[6 * tid + 1] = delta.x * delta.y;
    covariance[6 * tid + 2] = delta.x * delta.z;
    covariance[6 * tid + 3] = delta.y * delta.y;
    covariance[6 * tid + 4] = delta.y * delta.z;
    covariance[6 * tid + 5] = delta.z * delta.z;

    // Halving reduction: after the last step the totals sit in covariance[0..5].
    // No barrier is issued between steps.
    // NOTE(review): presumably relies on the 16 threads running in lockstep - confirm.
    for (int stride = 8; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            covariance[6 * tid + 0] += covariance[6 * (tid+stride) + 0];
            covariance[6 * tid + 1] += covariance[6 * (tid+stride) + 1];
            covariance[6 * tid + 2] += covariance[6 * (tid+stride) + 2];
            covariance[6 * tid + 3] += covariance[6 * (tid+stride) + 3];
            covariance[6 * tid + 4] += covariance[6 * (tid+stride) + 4];
            covariance[6 * tid + 5] += covariance[6 * (tid+stride) + 5];
        }
    }

#endif

    // Compute first eigen vector.
    return firstEigenVector(covariance);
}
|
||||
|
||||
// @@ For 2D this may not be the most efficient method. It's a quadratic equation, right?
|
||||
inline __device__ __host__ float2 firstEigenVector2D( float matrix[3] )
|
||||
{
|
||||
// @@ 8 iterations is probably more than enough.
|
||||
|
||||
float2 v = make_float2(1.0f, 1.0f);
|
||||
for(int i = 0; i < 8; i++) {
|
||||
float x = v.x * matrix[0] + v.y * matrix[1];
|
||||
float y = v.x * matrix[1] + v.y * matrix[2];
|
||||
float m = max(x, y);
|
||||
float iv = 1.0f / m;
|
||||
#if __DEVICE_EMULATION__
|
||||
if (m == 0.0f) iv = 0.0f;
|
||||
#endif
|
||||
v = make_float2(x*iv, y*iv);
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
// Sum the 16 entries of `colors` and leave the total in every one of the
// 16 entries of `sums` (two-channel variant).
//
// Emulation build: a plain serial accumulation followed by a broadcast.
// Device build: the thread with index threadIdx.x seeds its own slot and
// then folds in the slots at idx^8, idx^4, idx^2 and idx^1. No barrier is
// issued between the steps.
// NOTE(review): presumably this relies on the 16 participating threads
// executing in lockstep - confirm against the launch configuration.
inline __device__ void colorSums(const float2 * colors, float2 * sums)
{
#if __DEVICE_EMULATION__
    // make_float2 takes exactly two components; the previous three-argument
    // call could not compile in emulation builds.
    float2 color_sum = make_float2(0.0f, 0.0f);
    for (int i = 0; i < 16; i++)
    {
        color_sum += colors[i];
    }

    for (int i = 0; i < 16; i++)
    {
        sums[i] = color_sum;
    }
#else

    const int idx = threadIdx.x;

    // Statement order is significant here; keep it as is.
    sums[idx] = colors[idx];
    sums[idx] += sums[idx^8];
    sums[idx] += sums[idx^4];
    sums[idx] += sums[idx^2];
    sums[idx] += sums[idx^1];

#endif
}
|
||||
|
||||
// Direction of the best-fit line through 16 two-channel colors.
//
// colors    - the 16 input colors.
// color_sum - sum of those colors; multiplied by 1/16 to obtain the mean.
//
// Builds the three unique covariance terms (xx, xy, yy) of the centered
// colors and returns firstEigenVector2D() of that matrix.
inline __device__ float2 bestFitLine(const float2 * colors, float2 color_sum)
{
    // Compute covariance matrix of the given colors.
#if __DEVICE_EMULATION__
    float covariance[3] = {0, 0, 0};
    for (int i = 0; i < 16; i++)
    {
        float2 a = (colors[i] - color_sum * (1.0f / 16.0f));
        covariance[0] += a.x * a.x;
        covariance[1] += a.x * a.y;
        // The yy term lives at index 2 (the layout firstEigenVector2D reads).
        // Index 3 was past the end of this 3-element array.
        covariance[2] += a.y * a.y;
    }
#else

    const int idx = threadIdx.x;

    float2 diff = (colors[idx] - color_sum * (1.0f / 16.0f));

    __shared__ float covariance[16*3];

    // Each thread writes its own three products at offset 3 * idx.
    covariance[3 * idx + 0] = diff.x * diff.x;
    covariance[3 * idx + 1] = diff.x * diff.y;
    covariance[3 * idx + 2] = diff.y * diff.y;

    // Halving reduction: after the last step the totals sit in covariance[0..2].
    // No barrier is issued between steps.
    // NOTE(review): presumably relies on the 16 threads running in lockstep - confirm.
    for(int d = 8; d > 0; d >>= 1)
    {
        if (idx < d)
        {
            covariance[3 * idx + 0] += covariance[3 * (idx+d) + 0];
            covariance[3 * idx + 1] += covariance[3 * (idx+d) + 1];
            covariance[3 * idx + 2] += covariance[3 * (idx+d) + 2];
        }
    }

#endif

    // Compute first eigen vector.
    return firstEigenVector2D(covariance);
}
|
||||
|
||||
|
||||
#endif // CUDAMATH_H
|
||||
|
@ -75,6 +75,9 @@ namespace nvtt
|
||||
Format_BC3n = Format_DXT5n,
|
||||
Format_BC4, // ATI1
|
||||
Format_BC5, // 3DC, ATI2
|
||||
|
||||
Format_DXT1n,
|
||||
Format_CTX1,
|
||||
};
|
||||
|
||||
/// Quality modes.
|
||||
|
@ -83,9 +83,102 @@ struct MyOutputHandler : public nvtt::OutputHandler
|
||||
|
||||
};
|
||||
|
||||
// Offline table generator: prints, as a C array initializer on stdout, one
// 32-bit index bitmap per ordered partition of 16 texels into four clusters.
//
// For boundaries 1 <= a <= b <= c <= 15 the texels are labelled
//   [0, a)  -> index 0
//   [a, b)  -> index 2
//   [b, c)  -> index 3
//   [c, 16) -> index 1
// and the 2-bit indices are packed with texel i at bits (2*i, 2*i + 1).
// The trailing comment on each printed line gives the four cluster sizes
// (a, b-a, c-b, 16-c). An all-zero entry is emitted first.
//
// The earlier loop bounds (a+b and a+b+c) treated b and c as sizes, which
// contradicted the printed cluster sizes and wrote past the end of the
// 16-entry index array; a, b and c are now used as cumulative boundaries.
void precomp()
{
    unsigned int bitmaps[1024];

    int num = 0;

    printf("{\n");
    printf("\t%8X,\n", 0);

    bitmaps[0] = 0;

    num = 1;
    for (int a = 1; a <= 15; a++)
    {
        for (int b = a; b <= 15; b++)
        {
            for (int c = b; c <= 15; c++)
            {
                // Unsigned so that the shifts below never overflow a signed int.
                unsigned int indices[16];

                int i = 0;
                for(; i < a; i++) {
                    indices[i] = 0;
                }
                for(; i < b; i++) {
                    indices[i] = 2;
                }
                for(; i < c; i++) {
                    indices[i] = 3;
                }
                for(; i < 16; i++) {
                    indices[i] = 1;
                }

                unsigned int bm = 0;
                for(i = 0; i < 16; i++) {
                    bm |= indices[i] << (i * 2);
                }

                // Zero-padded so every line is a valid 8-digit hex literal.
                printf("\t0x%08X, // %d %d %d %d\n", bm, a-0, b-a, c-b, 16-c);

                bitmaps[num] = bm;
                num++;
            }
        }
    }

    printf("}\n");

    printf("// num = %d\n", num);

    // The table is only printed; the local copy is kept for the disabled
    // enumeration below.
    (void)bitmaps;

    /*
    for( int i = imax; i >= 0; --i )
    {
        // second cluster [i,j) is one third along
        for( int m = i; m < 16; ++m )
        {
            indices[m] = 2;
        }
        const int jmax = ( i == 0 ) ? 15 : 16;
        for( int j = jmax; j >= i; --j )
        {
            // third cluster [j,k) is two thirds along
            for( int m = j; m < 16; ++m )
            {
                indices[m] = 3;
            }

            int kmax = ( j == 0 ) ? 15 : 16;
            for( int k = kmax; k >= j; --k )
            {
                // last cluster [k,n) is at the end
                if( k < 16 )
                {
                    indices[k] = 1;
                }

                uint bitmap = 0;

                bool hasThree = false;
                for(int p = 0; p < 16; p++) {
                    bitmap |= indices[p] << (p * 2);
                }

                bitmaps[num] = bitmap;
                num++;
            }
        }
    }
    */
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
precomp();
|
||||
|
||||
nvtt::InputOptions inputOptions;
|
||||
inputOptions.setTextureLayout(nvtt::TextureType_2D, 1024, 1024);
|
||||
|
||||
@ -98,6 +191,9 @@ int main(int argc, char *argv[])
|
||||
inputOptions.setMipmapGeneration(false);
|
||||
|
||||
nvtt::CompressionOptions compressionOptions;
|
||||
// compressionOptions.setFormat(nvtt::Format_DXT1);
|
||||
// compressionOptions.setFormat(nvtt::Format_DXT1n);
|
||||
compressionOptions.setFormat(nvtt::Format_CTX1);
|
||||
|
||||
nvtt::OutputOptions outputOptions;
|
||||
outputOptions.setOutputHeader(false);
|
||||
|
@ -130,10 +130,13 @@ struct NormalError
|
||||
|
||||
void done()
|
||||
{
|
||||
ade /= samples;
|
||||
mse /= samples * 3;
|
||||
rmse = sqrt(mse);
|
||||
psnr = (rmse == 0) ? 999.0f : 20.0f * log10(255.0f / rmse);
|
||||
if (samples)
|
||||
{
|
||||
ade /= samples;
|
||||
mse /= samples * 3;
|
||||
rmse = sqrt(mse);
|
||||
psnr = (rmse == 0) ? 999.0f : 20.0f * log10(255.0f / rmse);
|
||||
}
|
||||
}
|
||||
|
||||
void print()
|
||||
|
Loading…
Reference in New Issue
Block a user