10 Commits
2.0.0 ... 2.0.1

Author SHA1 Message Date
fa4a7b9af7 Add proper credits. 2008-02-28 22:52:00 +00:00
f111d23637 Tag 2.0.1 2008-02-28 22:37:49 +00:00
bce983f39e Add post build command to copy header files. 2008-02-28 22:07:08 +00:00
ff93ad41cb Fix end of lines. 2008-02-28 21:45:46 +00:00
56c7771100 Fix end of lines. 2008-02-28 21:45:26 +00:00
ccced843e3 Use smaller allocations to prevent errors.
Check for allocation errors.
2008-02-28 21:45:04 +00:00
dafe2b8841 Hide copy ctor and operator to prevent compiler warnings.
Wrap pimpl using NVTT_DECLARE_PIMPL macro.
2008-02-28 21:14:40 +00:00
e3e7fcb226 Check cuda errors to find out whether the cuda context initialization succeeded. 2008-02-28 17:52:32 +00:00
970395fba8 Fix osx build. 2008-02-28 17:02:29 +00:00
8a24a93e2f Disable CUDA when memory allocations fail. 2008-02-28 16:06:27 +00:00
21 changed files with 1459 additions and 1289 deletions

View File

@ -1,3 +1,9 @@
NVIDIA Texture Tools version 2.0.1
* Fix memory leaks.
* Pre-allocate device memory for CUDA compressor.
* Add single color compressor. Thanks to Amir Ebrahimi.
* Better CUDA error checking.
NVIDIA Texture Tools version 2.0.0
* Fixed PSNR formula in nvimgdiff.
* Added support for arbitrary RGB formats.

View File

@ -2,7 +2,7 @@
--------------------------------------------------------------------------------
NVIDIA Texture Tools
README.txt
Version 2.0.0
Version 2.0
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

View File

@ -1 +1 @@
2.0.0
2.0.1

View File

@ -327,6 +327,10 @@
RelativePath="..\..\..\src\nvcore\nvcore.h"
>
</File>
<File
RelativePath="..\..\..\src\nvcore\Ptr.h"
>
</File>
<File
RelativePath="..\..\..\src\nvcore\StrLib.h"
>

View File

@ -53,8 +53,8 @@ END
//
VS_VERSION_INFO VERSIONINFO
FILEVERSION 2,0,0,0
PRODUCTVERSION 2,0,0,0
FILEVERSION 2,0,1,0
PRODUCTVERSION 2,0,1,0
FILEFLAGSMASK 0x17L
#ifdef _DEBUG
FILEFLAGS 0x1L
@ -71,12 +71,12 @@ BEGIN
BEGIN
VALUE "CompanyName", "NVIDIA Corporation"
VALUE "FileDescription", "NVIDIA Texture Tools Dynamic Link Library"
VALUE "FileVersion", "2, 0, 0, 0"
VALUE "FileVersion", "2, 0, 1, 0"
VALUE "InternalName", "nvtt"
VALUE "LegalCopyright", "Copyright (C) 2007"
VALUE "OriginalFilename", "nvtt.dll"
VALUE "ProductName", "NVIDIA Texture Tools Dynamic Link Library"
VALUE "ProductVersion", "2, 0, 0, 0"
VALUE "ProductVersion", "2, 0, 1, 0"
END
END
BLOCK "VarFileInfo"

View File

@ -96,6 +96,8 @@
/>
<Tool
Name="VCPostBuildEventTool"
Description="Copying header files..."
CommandLine="xcopy /y /f /i $(SolutionDir)\..\..\src\nvtt\nvtt*.h $(SolutionDir)\$(ConfigurationName).$(PlatformName)\include\"
/>
</Configuration>
<Configuration
@ -258,6 +260,8 @@
/>
<Tool
Name="VCPostBuildEventTool"
Description="Copying header files..."
CommandLine="xcopy /y /f /i $(SolutionDir)\..\..\src\nvtt\nvtt*.h $(SolutionDir)\$(ConfigurationName).$(PlatformName)\include\"
/>
</Configuration>
<Configuration
@ -420,6 +424,8 @@
/>
<Tool
Name="VCPostBuildEventTool"
Description="Copying header files..."
CommandLine="xcopy /y /f /i $(SolutionDir)\..\..\src\nvtt\nvtt*.h $(SolutionDir)\$(ConfigurationName).$(PlatformName)\include\"
/>
</Configuration>
<Configuration
@ -578,6 +584,8 @@
/>
<Tool
Name="VCPostBuildEventTool"
Description="Copying header files..."
CommandLine="xcopy /y /f /i $(SolutionDir)\..\..\src\nvtt\nvtt*.h $(SolutionDir)\$(ConfigurationName).$(PlatformName)\include\"
/>
</Configuration>
<Configuration

View File

@ -110,6 +110,19 @@ void ColorBlock::splatY()
}
}
/// Returns true if the block has a single color.
bool ColorBlock::isSingleColor() const
{
for(int i = 1; i < 16; i++)
{
if (m_color[0] != m_color[i])
{
return false;
}
}
return true;
}
/// Count number of unique colors in this color block.
uint ColorBlock::countUniqueColors() const

View File

@ -24,6 +24,7 @@ namespace nv
void splatX();
void splatY();
bool isSingleColor() const;
uint countUniqueColors() const;
Color32 averageColor() const;
bool hasAlpha() const;

View File

@ -69,7 +69,14 @@ void nv::fastCompressDXT1(const Image * image, const OutputOptions::Private & ou
for (uint x = 0; x < w; x += 4) {
rgba.init(image, x, y);
QuickCompress::compressDXT1(rgba, &block);
if (rgba.isSingleColor())
{
QuickCompress::compressDXT1(rgba.color(0), &block);
}
else
{
QuickCompress::compressDXT1(rgba, &block);
}
if (outputOptions.outputHandler != NULL) {
outputOptions.outputHandler->writeData(&block, sizeof(block));
@ -214,10 +221,16 @@ void nv::compressDXT1(const Image * image, const OutputOptions::Private & output
rgba.init(image, x, y);
// Compress color.
squish::ColourSet colours((uint8 *)rgba.colors(), 0);
fit.SetColourSet(&colours, squish::kDxt1);
fit.Compress(&block);
if (rgba.isSingleColor())
{
QuickCompress::compressDXT1(rgba.color(0), &block);
}
else
{
squish::ColourSet colours((uint8 *)rgba.colors(), 0);
fit.SetColourSet(&colours, squish::kDxt1);
fit.Compress(&block);
}
if (outputOptions.outputHandler != NULL) {
outputOptions.outputHandler->writeData(&block, sizeof(block));

View File

@ -205,16 +205,25 @@ namespace nvtt
Compressor::Compressor() : m(*new Compressor::Private())
{
// CUDA initialization.
m.cudaSupported = cuda::isHardwarePresent();
m.cudaEnabled = m.cudaSupported;
// @@ Do CUDA initialization here.
if (m.cudaEnabled)
{
m.cuda = new CudaCompressor();
if (!m.cuda->isValid())
{
m.cudaEnabled = false;
m.cuda = NULL;
}
}
}
Compressor::~Compressor()
{
// @@ Free CUDA resources here.
delete &m;
}
@ -225,6 +234,17 @@ void Compressor::enableCudaAcceleration(bool enable)
{
m.cudaEnabled = enable;
}
if (m.cudaEnabled && m.cuda == NULL)
{
m.cuda = new CudaCompressor();
if (!m.cuda->isValid())
{
m.cudaEnabled = false;
m.cuda = NULL;
}
}
}
/// Check if CUDA acceleration is enabled.
@ -670,7 +690,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
if (cudaEnabled)
{
nvDebugCheck(cudaSupported);
cudaCompressDXT1(image, outputOptions, compressionOptions);
cuda->compressDXT1(image, outputOptions, compressionOptions);
}
else
{
@ -708,7 +728,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
if (cudaEnabled)
{
nvDebugCheck(cudaSupported);
cudaCompressDXT3(image, outputOptions, compressionOptions);
cuda->compressDXT3(image, outputOptions, compressionOptions);
}
else
{
@ -727,7 +747,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
if (cudaEnabled)
{
nvDebugCheck(cudaSupported);
cudaCompressDXT5(image, outputOptions, compressionOptions);
cuda->compressDXT5(image, outputOptions, compressionOptions);
}
else
{

View File

@ -24,6 +24,10 @@
#ifndef NV_TT_COMPRESSOR_H
#define NV_TT_COMPRESSOR_H
#include <nvcore/Ptr.h>
#include <nvtt/cuda/CudaCompressDXT.h>
#include "nvtt.h"
namespace nv
@ -63,6 +67,9 @@ namespace nvtt
bool cudaSupported;
bool cudaEnabled;
nv::AutoPtr<nv::CudaCompressor> cuda;
};
} // nvtt namespace

View File

@ -353,12 +353,18 @@ static void optimizeAlpha8(const ColorBlock & rgba, AlphaBlockDXT5 * block)
void QuickCompress::compressDXT1(Color32 c, BlockDXT1 * dxtBlock)
{
dxtBlock->col0.r = OMatch5[c.r][0];
dxtBlock->col0.g = OMatch5[c.g][0];
dxtBlock->col0.g = OMatch6[c.g][0];
dxtBlock->col0.b = OMatch5[c.b][0];
dxtBlock->col1.r = OMatch5[c.r][1];
dxtBlock->col1.g = OMatch5[c.g][1];
dxtBlock->col1.g = OMatch6[c.g][1];
dxtBlock->col1.b = OMatch5[c.b][1];
dxtBlock->indices = 0xaaaaaaaa;
if (dxtBlock->col0.u < dxtBlock->col1.u)
{
swap(dxtBlock->col0.u, dxtBlock->col1.u);
dxtBlock->indices ^= 0x55555555;
}
}
void QuickCompress::compressDXT1(const ColorBlock & rgba, BlockDXT1 * dxtBlock)

View File

@ -48,7 +48,12 @@ void initTables()
};
*/
const static uint8 OMatch5[256][2] =
#if __CUDACC__
__constant__ unsigned short
#else
const static uint8
#endif
OMatch5[256][2] =
{
{0x00, 0x00},
{0x00, 0x00},
@ -308,7 +313,12 @@ const static uint8 OMatch5[256][2] =
{0x1F, 0x1F},
};
const static uint8 OMatch6[256][2] =
#if __CUDACC__
__constant__ unsigned short
#else
const static uint8
#endif
OMatch6[256][2] =
{
{0x00, 0x00},
{0x00, 0x01},

View File

@ -122,7 +122,7 @@ static void doPrecomputation()
*/
const static uint bitmaps[992] =
const static uint s_bitmapTable[992] =
{
0x80000000,
0x40000000,

File diff suppressed because it is too large Load Diff

View File

@ -48,29 +48,14 @@ using namespace nvtt;
#if defined HAVE_CUDA
#define MAX_BLOCKS 8192U // 32768, 65535
extern "C" void setupCompressKernel(const float weights[3]);
extern "C" void compressKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
extern "C" void compressWeightedKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
#include "Bitmaps.h"
// @@ Store this pointer in CompressionOptions. Allocate in ctor, free in dtor.
static uint * d_bitmaps = NULL;
static void doPrecomputation()
{
if (d_bitmaps != NULL) {
return;
}
// Upload bitmaps.
cudaMalloc((void**) &d_bitmaps, 992 * sizeof(uint));
cudaMemcpy(d_bitmaps, bitmaps, 992 * sizeof(uint), cudaMemcpyHostToDevice);
// @@ Check for errors.
// @@ Free allocated memory.
}
#include "Bitmaps.h" // @@ Rename to BitmapTable.h
// Convert linear image to block linear.
static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
@ -92,53 +77,80 @@ static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
}
}
#endif // defined HAVE_CUDA
#endif
CudaCompressor::CudaCompressor() : m_bitmapTable(NULL), m_data(NULL), m_result(NULL)
{
#if defined HAVE_CUDA
// Allocate and upload bitmaps.
cudaMalloc((void**) &m_bitmapTable, 992 * sizeof(uint));
if (m_bitmapTable != NULL)
{
cudaMemcpy(m_bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice);
}
// Allocate scratch buffers.
cudaMalloc((void**) &m_data, MAX_BLOCKS * 64U);
cudaMalloc((void**) &m_result, MAX_BLOCKS * 8U);
#endif
}
CudaCompressor::~CudaCompressor()
{
#if defined HAVE_CUDA
// Free device mem allocations.
cudaFree(m_data);
cudaFree(m_result);
cudaFree(m_bitmapTable);
#endif
}
bool CudaCompressor::isValid() const
{
#if defined HAVE_CUDA
if (cudaGetLastError() != cudaSuccess)
{
return false;
}
#endif
return m_data != NULL && m_result != NULL && m_bitmapTable != NULL;
}
// @@ This code is very repetitive and needs to be cleaned up.
/// Compress image using CUDA.
void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
void CudaCompressor::compressDXT1(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
{
nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA
doPrecomputation();
// Image size in blocks.
const uint w = (image->width() + 3) / 4;
const uint h = (image->height() + 3) / 4;
uint imageSize = w * h * 16 * sizeof(Color32);
uint * blockLinearImage = (uint *) malloc(imageSize);
convertToBlockLinear(image, blockLinearImage); // @@ Do this on the GPU!
convertToBlockLinear(image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU!
const uint blockNum = w * h;
const uint compressedSize = blockNum * 8;
const uint blockMax = 32768; // 49152, 65535
clock_t start = clock();
// Allocate image in device memory.
uint * d_data = NULL;
cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
// Allocate result.
uint * d_result = NULL;
cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
setupCompressKernel(compressionOptions.colorWeight.ptr());
// TODO: Add support for multiple GPUs.
uint bn = 0;
while(bn != blockNum)
{
uint count = min(blockNum - bn, blockMax);
uint count = min(blockNum - bn, MAX_BLOCKS);
cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
// Launch kernel.
compressKernel(count, d_data, d_result, d_bitmaps);
compressKernelDXT1(count, m_data, m_result, m_bitmapTable);
// Check for errors.
cudaError_t err = cudaGetLastError();
@ -153,7 +165,7 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou
}
// Copy result to host, overwrite swizzled image.
cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
// Output result.
if (outputOptions.outputHandler != NULL)
@ -168,8 +180,6 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou
printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
free(blockLinearImage);
cudaFree(d_data);
cudaFree(d_result);
#else
if (outputOptions.errorHandler != NULL)
@ -181,13 +191,11 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou
/// Compress image using CUDA.
void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
void CudaCompressor::compressDXT3(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
{
nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA
doPrecomputation();
// Image size in blocks.
const uint w = (image->width() + 3) / 4;
const uint h = (image->height() + 3) / 4;
@ -198,18 +206,9 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou
const uint blockNum = w * h;
const uint compressedSize = blockNum * 8;
const uint blockMax = 32768; // 49152, 65535
// Allocate image in device memory.
uint * d_data = NULL;
cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
// Allocate result.
uint * d_result = NULL;
cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
AlphaBlockDXT3 * alphaBlocks = NULL;
alphaBlocks = (AlphaBlockDXT3 *)malloc(min(compressedSize, blockMax * 8U));
alphaBlocks = (AlphaBlockDXT3 *)malloc(min(compressedSize, MAX_BLOCKS * 8U));
setupCompressKernel(compressionOptions.colorWeight.ptr());
@ -218,12 +217,12 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou
uint bn = 0;
while(bn != blockNum)
{
uint count = min(blockNum - bn, blockMax);
uint count = min(blockNum - bn, MAX_BLOCKS);
cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
// Launch kernel.
compressWeightedKernel(count, d_data, d_result, d_bitmaps);
compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
// Compress alpha in parallel with the GPU.
for (uint i = 0; i < count; i++)
@ -245,7 +244,7 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou
}
// Copy result to host, overwrite swizzled image.
cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
// Output result.
if (outputOptions.outputHandler != NULL)
@ -265,8 +264,6 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou
free(alphaBlocks);
free(blockLinearImage);
cudaFree(d_data);
cudaFree(d_result);
#else
if (outputOptions.errorHandler != NULL)
@ -278,13 +275,11 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou
/// Compress image using CUDA.
void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
void CudaCompressor::compressDXT5(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
{
nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA
doPrecomputation();
// Image size in blocks.
const uint w = (image->width() + 3) / 4;
const uint h = (image->height() + 3) / 4;
@ -295,18 +290,9 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou
const uint blockNum = w * h;
const uint compressedSize = blockNum * 8;
const uint blockMax = 32768; // 49152, 65535
// Allocate image in device memory.
uint * d_data = NULL;
cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
// Allocate result.
uint * d_result = NULL;
cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
AlphaBlockDXT5 * alphaBlocks = NULL;
alphaBlocks = (AlphaBlockDXT5 *)malloc(min(compressedSize, blockMax * 8U));
alphaBlocks = (AlphaBlockDXT5 *)malloc(min(compressedSize, MAX_BLOCKS * 8U));
setupCompressKernel(compressionOptions.colorWeight.ptr());
@ -315,12 +301,12 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou
uint bn = 0;
while(bn != blockNum)
{
uint count = min(blockNum - bn, blockMax);
uint count = min(blockNum - bn, MAX_BLOCKS);
cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
// Launch kernel.
compressWeightedKernel(count, d_data, d_result, d_bitmaps);
compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
// Compress alpha in parallel with the GPU.
for (uint i = 0; i < count; i++)
@ -342,7 +328,7 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou
}
// Copy result to host, overwrite swizzled image.
cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
// Output result.
if (outputOptions.outputHandler != NULL)
@ -362,8 +348,6 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou
free(alphaBlocks);
free(blockLinearImage);
cudaFree(d_data);
cudaFree(d_result);
#else
if (outputOptions.errorHandler != NULL)
@ -375,7 +359,7 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou
#if defined HAVE_CUDA
#if 0
class Task
{
@ -469,7 +453,7 @@ public:
cudaMemcpy(d_blockLinearImage, blockLinearImage, blockCount * 64, cudaMemcpyHostToDevice);
// Launch kernel.
compressKernel(blockCount, d_blockLinearImage, d_compressedImage, d_bitmaps);
compressKernelDXT1(blockCount, d_blockLinearImage, d_compressedImage, d_bitmaps);
// Check for errors.
cudaError_t err = cudaGetLastError();
@ -511,8 +495,6 @@ private:
};
#endif // defined HAVE_CUDA
void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
{
@ -522,10 +504,8 @@ void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private &
const uint blockNum = ((w + 3) / 4) * ((h + 3) / 4);
const uint blockMax = 32768; // 49152, 65535
doPrecomputation();
setupCompressKernel(compressionOptions.colorWeight.ptr());
setupCompressKernelDXT1(compressionOptions.colorWeight.ptr());
ColorBlock rgba;
Task task(min(blockNum, blockMax));
@ -559,4 +539,4 @@ void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private &
#endif
}
#endif // 0

View File

@ -31,11 +31,24 @@ namespace nv
{
class Image;
void cudaCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
void cudaCompressDXT3(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
void cudaCompressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
class CudaCompressor
{
public:
CudaCompressor();
~CudaCompressor();
void cudaCompressDXT1_2(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
bool isValid() const;
void compressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
void compressDXT3(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
void compressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
private:
uint * m_bitmapTable;
uint * m_data;
uint * m_result;
};
} // nv namespace

View File

@ -1,221 +1,247 @@
// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
// Math functions and operators to be used with vector types.
#ifndef CUDAMATH_H
#define CUDAMATH_H
#include <float.h>
inline __device__ __host__ float3 operator *(float3 a, float3 b)
{
return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
}
inline __device__ __host__ float3 operator *(float f, float3 v)
{
return make_float3(v.x*f, v.y*f, v.z*f);
}
inline __device__ __host__ float3 operator *(float3 v, float f)
{
return make_float3(v.x*f, v.y*f, v.z*f);
}
inline __device__ __host__ float3 operator +(float3 a, float3 b)
{
return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
}
inline __device__ __host__ void operator +=(float3 & b, float3 a)
{
b.x += a.x;
b.y += a.y;
b.z += a.z;
}
inline __device__ __host__ float3 operator -(float3 a, float3 b)
{
return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
}
inline __device__ __host__ void operator -=(float3 & b, float3 a)
{
b.x -= a.x;
b.y -= a.y;
b.z -= a.z;
}
inline __device__ __host__ float3 operator /(float3 v, float f)
{
float inv = 1.0f / f;
return v * inv;
}
inline __device__ __host__ void operator /=(float3 & b, float f)
{
float inv = 1.0f / f;
b.x *= inv;
b.y *= inv;
b.z *= inv;
}
inline __device__ __host__ float dot(float3 a, float3 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
inline __device__ __host__ float dot(float4 a, float4 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}
inline __device__ __host__ float clamp(float f, float a, float b)
{
return max(a, min(f, b));
}
inline __device__ __host__ float3 clamp(float3 v, float a, float b)
{
return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
{
return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
inline __device__ __host__ float3 normalize(float3 v)
{
float len = 1.0f / sqrtf(dot(v, v));
return make_float3(v.x * len, v.y * len, v.z * len);
}
// Use power method to find the first eigenvector.
// http://www.miislita.com/information-retrieval-tutorial/matrix-tutorial-3-eigenvalues-eigenvectors.html
inline __device__ __host__ float3 firstEigenVector( float matrix[6] )
{
// 8 iterations seems to be more than enough.
float3 v = make_float3(1.0f, 1.0f, 1.0f);
for(int i = 0; i < 8; i++) {
float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
float m = max(max(x, y), z);
float iv = 1.0f / m;
#if __DEVICE_EMULATION__
if (m == 0.0f) iv = 0.0f;
#endif
v = make_float3(x*iv, y*iv, z*iv);
}
return v;
}
inline __device__ void colorSums(const float3 * colors, float3 * sums)
{
#if __DEVICE_EMULATION__
float3 color_sum = make_float3(0.0f, 0.0f, 0.0f);
for (int i = 0; i < 16; i++)
{
color_sum += colors[i];
}
for (int i = 0; i < 16; i++)
{
sums[i] = color_sum;
}
#else
const int idx = threadIdx.x;
// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
// Math functions and operators to be used with vector types.
#ifndef CUDAMATH_H
#define CUDAMATH_H
#include <float.h>
inline __device__ __host__ float3 operator *(float3 a, float3 b)
{
return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
}
inline __device__ __host__ float3 operator *(float f, float3 v)
{
return make_float3(v.x*f, v.y*f, v.z*f);
}
inline __device__ __host__ float3 operator *(float3 v, float f)
{
return make_float3(v.x*f, v.y*f, v.z*f);
}
inline __device__ __host__ float3 operator +(float3 a, float3 b)
{
return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
}
inline __device__ __host__ void operator +=(float3 & b, float3 a)
{
b.x += a.x;
b.y += a.y;
b.z += a.z;
}
inline __device__ __host__ float3 operator -(float3 a, float3 b)
{
return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
}
inline __device__ __host__ void operator -=(float3 & b, float3 a)
{
b.x -= a.x;
b.y -= a.y;
b.z -= a.z;
}
inline __device__ __host__ float3 operator /(float3 v, float f)
{
float inv = 1.0f / f;
return v * inv;
}
inline __device__ __host__ void operator /=(float3 & b, float f)
{
float inv = 1.0f / f;
b.x *= inv;
b.y *= inv;
b.z *= inv;
}
inline __device__ __host__ bool operator ==(float3 a, float3 b)
{
return a.x == b.x && a.y == b.y && a.z == b.z;
}
inline __device__ __host__ float dot(float3 a, float3 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
inline __device__ __host__ float dot(float4 a, float4 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}
inline __device__ __host__ float clamp(float f, float a, float b)
{
return max(a, min(f, b));
}
inline __device__ __host__ float3 clamp(float3 v, float a, float b)
{
return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
{
return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
inline __device__ __host__ float3 normalize(float3 v)
{
float len = 1.0f / sqrtf(dot(v, v));
return make_float3(v.x * len, v.y * len, v.z * len);
}
// Use power method to find the first eigenvector.
// http://www.miislita.com/information-retrieval-tutorial/matrix-tutorial-3-eigenvalues-eigenvectors.html
inline __device__ __host__ float3 firstEigenVector( float matrix[6] )
{
// 8 iterations seems to be more than enough.
float3 v = make_float3(1.0f, 1.0f, 1.0f);
for(int i = 0; i < 8; i++) {
float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
float m = max(max(x, y), z);
float iv = 1.0f / m;
if (m == 0.0f) iv = 0.0f;
v = make_float3(x*iv, y*iv, z*iv);
}
return v;
}
inline __device__ bool singleColor(const float3 * colors)
{
#if __DEVICE_EMULATION__
bool sameColor = false;
for (int i = 0; i < 16; i++)
{
sameColor &= (colors[idx] == colors[0]);
}
return sameColor;
#else
__shared__ int sameColor[16];
const int idx = threadIdx.x;
sameColor[idx] = (colors[idx] == colors[0]);
sameColor[idx] &= sameColor[idx^8];
sameColor[idx] &= sameColor[idx^4];
sameColor[idx] &= sameColor[idx^2];
sameColor[idx] &= sameColor[idx^1];
return sameColor[0];
#endif
}
inline __device__ void colorSums(const float3 * colors, float3 * sums)
{
#if __DEVICE_EMULATION__
float3 color_sum = make_float3(0.0f, 0.0f, 0.0f);
for (int i = 0; i < 16; i++)
{
color_sum += colors[i];
}
for (int i = 0; i < 16; i++)
{
sums[i] = color_sum;
}
#else
const int idx = threadIdx.x;
sums[idx] = colors[idx];
sums[idx] += sums[idx^8];
sums[idx] += sums[idx^4];
sums[idx] += sums[idx^2];
sums[idx] += sums[idx^1];
#endif
}
inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum, float3 colorMetric)
{
// Compute covariance matrix of the given colors.
#if __DEVICE_EMULATION__
float covariance[6] = {0, 0, 0, 0, 0, 0};
for (int i = 0; i < 16; i++)
{
float3 a = (colors[i] - color_sum * (1.0f / 16.0f)) * colorMetric;
covariance[0] += a.x * a.x;
covariance[1] += a.x * a.y;
covariance[2] += a.x * a.z;
covariance[3] += a.y * a.y;
covariance[4] += a.y * a.z;
covariance[5] += a.z * a.z;
}
#else
const int idx = threadIdx.x;
float3 diff = (colors[idx] - color_sum * (1.0f / 16.0f)) * colorMetric;
// @@ Eliminate two-way bank conflicts here.
// @@ It seems that doing that and unrolling the reduction doesn't help...
__shared__ float covariance[16*6];
covariance[6 * idx + 0] = diff.x * diff.x; // 0, 6, 12, 2, 8, 14, 4, 10, 0
covariance[6 * idx + 1] = diff.x * diff.y;
covariance[6 * idx + 2] = diff.x * diff.z;
covariance[6 * idx + 3] = diff.y * diff.y;
covariance[6 * idx + 4] = diff.y * diff.z;
covariance[6 * idx + 5] = diff.z * diff.z;
for(int d = 8; d > 0; d >>= 1)
{
if (idx < d)
{
covariance[6 * idx + 0] += covariance[6 * (idx+d) + 0];
covariance[6 * idx + 1] += covariance[6 * (idx+d) + 1];
covariance[6 * idx + 2] += covariance[6 * (idx+d) + 2];
covariance[6 * idx + 3] += covariance[6 * (idx+d) + 3];
covariance[6 * idx + 4] += covariance[6 * (idx+d) + 4];
covariance[6 * idx + 5] += covariance[6 * (idx+d) + 5];
}
}
#endif
// Compute first eigen vector.
return firstEigenVector(covariance);
}
#endif // CUDAMATH_H
#endif
}
inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum, float3 colorMetric)
{
// Compute covariance matrix of the given colors.
#if __DEVICE_EMULATION__
float covariance[6] = {0, 0, 0, 0, 0, 0};
for (int i = 0; i < 16; i++)
{
float3 a = (colors[i] - color_sum * (1.0f / 16.0f)) * colorMetric;
covariance[0] += a.x * a.x;
covariance[1] += a.x * a.y;
covariance[2] += a.x * a.z;
covariance[3] += a.y * a.y;
covariance[4] += a.y * a.z;
covariance[5] += a.z * a.z;
}
#else
const int idx = threadIdx.x;
float3 diff = (colors[idx] - color_sum * (1.0f / 16.0f)) * colorMetric;
// @@ Eliminate two-way bank conflicts here.
// @@ It seems that doing that and unrolling the reduction doesn't help...
__shared__ float covariance[16*6];
covariance[6 * idx + 0] = diff.x * diff.x; // 0, 6, 12, 2, 8, 14, 4, 10, 0
covariance[6 * idx + 1] = diff.x * diff.y;
covariance[6 * idx + 2] = diff.x * diff.z;
covariance[6 * idx + 3] = diff.y * diff.y;
covariance[6 * idx + 4] = diff.y * diff.z;
covariance[6 * idx + 5] = diff.z * diff.z;
for(int d = 8; d > 0; d >>= 1)
{
if (idx < d)
{
covariance[6 * idx + 0] += covariance[6 * (idx+d) + 0];
covariance[6 * idx + 1] += covariance[6 * (idx+d) + 1];
covariance[6 * idx + 2] += covariance[6 * (idx+d) + 2];
covariance[6 * idx + 3] += covariance[6 * (idx+d) + 3];
covariance[6 * idx + 4] += covariance[6 * (idx+d) + 4];
covariance[6 * idx + 5] += covariance[6 * (idx+d) + 5];
}
}
#endif
// Compute first eigen vector.
return firstEigenVector(covariance);
}
#endif // CUDAMATH_H

View File

@ -49,6 +49,14 @@
#define NVTT_VERSION 200
#define NVTT_DECLARE_PIMPL(Class) \
private: \
Class(const Class &); \
void operator=(const Class &); \
public: \
struct Private; \
Private & m
// Public interface.
namespace nvtt
@ -89,6 +97,8 @@ namespace nvtt
/// Compression options. This class describes the desired compression format and other compression settings.
struct CompressionOptions
{
NVTT_DECLARE_PIMPL(CompressionOptions);
NVTT_API CompressionOptions();
NVTT_API ~CompressionOptions();
@ -104,10 +114,6 @@ namespace nvtt
NVTT_API void setPixelFormat(unsigned int bitcount, unsigned int rmask, unsigned int gmask, unsigned int bmask, unsigned int amask);
NVTT_API void setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold = 127);
//private:
struct Private;
Private & m;
};
@ -170,6 +176,8 @@ namespace nvtt
/// Input options. Specify format and layout of the input texture.
struct InputOptions
{
NVTT_DECLARE_PIMPL(InputOptions);
NVTT_API InputOptions();
NVTT_API ~InputOptions();
@ -214,10 +222,6 @@ namespace nvtt
// Set resizing options.
NVTT_API void setMaxExtents(int d);
NVTT_API void setRoundMode(RoundMode mode);
//private:
struct Private;
Private & m;
};
@ -258,6 +262,8 @@ namespace nvtt
/// the compressor to the user.
struct OutputOptions
{
NVTT_DECLARE_PIMPL(OutputOptions);
NVTT_API OutputOptions();
NVTT_API ~OutputOptions();
@ -269,16 +275,14 @@ namespace nvtt
NVTT_API void setOutputHandler(OutputHandler * outputHandler);
NVTT_API void setErrorHandler(ErrorHandler * errorHandler);
NVTT_API void setOutputHeader(bool outputHeader);
//private:
struct Private;
Private & m;
};
/// Texture compressor.
struct Compressor
{
NVTT_DECLARE_PIMPL(Compressor);
NVTT_API Compressor();
NVTT_API ~Compressor();
@ -290,10 +294,6 @@ namespace nvtt
// Estimate the size of compressing the input with the given options.
NVTT_API int estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions) const;
//private:
struct Private;
Private & m;
};

View File

@ -50,6 +50,16 @@ public:
return *this;
}
Vec4( const float * v )
{
union { vector float v; float c[4]; } u;
u.c[0] = v[0];
u.c[1] = v[1];
u.c[2] = v[2];
u.c[3] = v[3];
m_v = u.v;
}
Vec4( float x, float y, float z, float w )
{
union { vector float v; float c[4]; } u;

View File

@ -130,10 +130,13 @@ struct NormalError
void done()
{
ade /= samples;
mse /= samples * 3;
rmse = sqrt(mse);
psnr = (rmse == 0) ? 999.0f : 20.0f * log10(255.0f / rmse);
if (samples)
{
ade /= samples;
mse /= samples * 3;
rmse = sqrt(mse);
psnr = (rmse == 0) ? 999.0f : 20.0f * log10(255.0f / rmse);
}
}
void print()