Large refactoring of the compressor code:

- Define a compressor interface.
- Implement the compressor interface for the different compressors.
- Add a parallel compressor using OpenMP. Experimental; a rough sketch of the idea is shown below.
- Add a generic GPU compressor; so far only DXT1 is enabled.
pull/216/head
castano 15 years ago
parent 18a3abf794
commit 8820c43175

File diff suppressed because it is too large.
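Note: the OpenMP parallel compressor mentioned in the commit message lives in the suppressed file diff and is not visible on this page. Purely as a hedged illustration of the idea, and assuming helper names (compressBlocksParallel, ColorBlock::init, the linear block layout) that are not part of this commit, each row of 4x4 blocks can be compressed on its own thread because the blocks are independent:

// Hypothetical sketch only: parallelizing a fixed-block compressor with OpenMP.
// Every 4x4 block is independent, so block rows can be distributed across threads.
// ColorBlock::init and the linear output layout are assumptions for illustration.
void compressBlocksParallel(FixedBlockCompressor & compressor, nvtt::AlphaMode alphaMode,
                            uint w, uint h, const uint * pixels,
                            const nvtt::CompressionOptions::Private & compressionOptions,
                            uint8 * output)
{
    const uint bw = (w + 3) / 4;             // blocks per row
    const uint bh = (h + 3) / 4;             // block rows
    const uint bs = compressor.blockSize();  // 8 or 16 bytes per compressed block

    #pragma omp parallel for
    for (int by = 0; by < int(bh); by++)
    {
        ColorBlock rgba;
        for (uint bx = 0; bx < bw; bx++)
        {
            rgba.init(w, h, pixels, bx * 4, by * 4);    // gather one 4x4 tile (assumed helper)
            void * ptr = output + (by * bw + bx) * bs;  // blocks stored linearly
            compressor.compressBlock(rgba, alphaMode, compressionOptions, ptr);
        }
    }
}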

@@ -30,68 +30,153 @@
 namespace nv
 {
     class Image;
-    class FloatImage;
-
-    class FastCompressor
-    {
-    public:
-        FastCompressor();
-        ~FastCompressor();
-
-        void setImage(const Image * image, nvtt::AlphaMode alphaMode);
-
-        void compressDXT1(const nvtt::OutputOptions::Private & outputOptions);
-        void compressDXT1a(const nvtt::OutputOptions::Private & outputOptions);
-        void compressDXT3(const nvtt::OutputOptions::Private & outputOptions);
-        void compressDXT5(const nvtt::OutputOptions::Private & outputOptions);
-        void compressDXT5n(const nvtt::OutputOptions::Private & outputOptions);
-
-    private:
-        const Image * m_image;
-        nvtt::AlphaMode m_alphaMode;
-    };
-
-    class SlowCompressor
-    {
-    public:
-        SlowCompressor();
-        ~SlowCompressor();
-
-        void setImage(const Image * image, nvtt::AlphaMode alphaMode);
-
-        void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-        void compressDXT1a(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-        void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-        void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-        void compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-        void compressBC4(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-        void compressBC5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-
-    private:
-        const Image * m_image;
-        nvtt::AlphaMode m_alphaMode;
-    };
+    struct ColorBlock;
+
+    struct CompressorInterface
+    {
+        virtual ~CompressorInterface() {}
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) = 0;
+    };
+
+    struct FixedBlockCompressor : public CompressorInterface
+    {
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+        virtual uint blockSize() const = 0;
+    };
+
+    // Fast CPU compressors.
+    struct FastCompressorDXT1 : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 8; }
+    };
+
+    struct FastCompressorDXT1a : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 8; }
+    };
+
+    struct FastCompressorDXT3 : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 16; }
+    };
+
+    struct FastCompressorDXT5 : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 16; }
+    };
+
+    struct FastCompressorDXT5n : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 16; }
+    };
+
+    struct FastCompressorBC4 : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 8; }
+    };
+
+    struct FastCompressorBC5 : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 16; }
+    };
+
+    // Normal CPU compressors.
+    struct NormalCompressorDXT1 : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 8; }
+    };
+
+    struct NormalCompressorDXT1a : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 8; }
+    };
+
+    struct NormalCompressorDXT3 : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 16; }
+    };
+
+    struct NormalCompressorDXT5 : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 16; }
+    };
+
+    struct NormalCompressorDXT5n : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 16; }
+    };
+
+    // Production CPU compressors.
+    struct ProductionCompressorBC4 : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 8; }
+    };
+
+    struct ProductionCompressorBC5 : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 16; }
+    };
 
     // External compressors.
 #if defined(HAVE_S3QUANT)
-    void s3CompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
+    struct S3CompressorDXT1 : public CompressorInterface
+    {
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
 #endif
 
 #if defined(HAVE_ATITC)
-    void atiCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
-    void atiCompressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
+    struct AtiCompressorDXT1 : public CompressorInterface
+    {
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
+
+    struct AtiCompressorDXT5 : public CompressorInterface
+    {
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
 #endif
 
 #if defined(HAVE_SQUISH)
-    void squishCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
+    struct SquishCompressorDXT1 : public CompressorInterface
+    {
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
 #endif
 
 #if defined(HAVE_D3DX)
-    void d3dxCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
+    struct D3DXCompressorDXT1 : public CompressorInterface
+    {
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+    };
 #endif
 
-#if defined(HAVE_D3DX)
-    void stbCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions);
+#if defined(HAVE_STB)
+    struct StbCompressorDXT1 : public FixedBlockCompressor
+    {
+        virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 8; }
+    };
 #endif
 
 } // nv namespace
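For context, the FixedBlockCompressor::compress driver itself is part of the suppressed file diff. Below is a minimal serial sketch of what such a driver typically does; the ColorBlock::init call and edge handling are assumptions, not the actual nvtt code, and the OpenMP sketch near the top of the page is the parallel variant of the same loop.

// Hypothetical sketch only: serial driver for a fixed-size block compressor.
// It walks the image in 4x4 tiles, lets the subclass encode each tile, and
// streams the result to the output handler. Helper names are assumptions.
void FixedBlockCompressor::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
{
    nvDebugCheck(inputFormat == nvtt::InputFormat_BGRA_8UB);   // sketch handles 8-bit input only

    const uint bs = blockSize();        // 8 or 16 bytes per compressed block
    const uint bw = (w + 3) / 4;        // blocks per row
    const uint bh = (h + 3) / 4;        // block rows

    uint8 block[16];                    // scratch space for one compressed block
    ColorBlock rgba;

    for (uint by = 0; by < bh; by++)
    {
        for (uint bx = 0; bx < bw; bx++)
        {
            rgba.init(w, h, (const uint *)data, bx * 4, by * 4);   // gather a 4x4 tile (assumed helper)
            compressBlock(rgba, alphaMode, compressionOptions, block);

            if (outputOptions.outputHandler != NULL)
            {
                outputOptions.outputHandler->writeData(block, bs);
            }
        }
    }
}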

@@ -222,6 +222,7 @@ Compressor::Compressor() : m(*new Compressor::Private())
 	if (m.cudaEnabled)
 	{
+#pragma message(NV_FILE_LINE "FIXME: This code is duplicated below.")
 		// Select fastest CUDA device.
 		int device = cuda::getFastestDevice();
 		if (!cuda::setDevice(device))

@@ -231,7 +232,7 @@ Compressor::Compressor() : m(*new Compressor::Private())
 	}
 	else
 	{
-		m.cuda = new CudaCompressor();
+		m.cuda = new CudaContext();
 
 		if (!m.cuda->isValid())
 		{

@@ -268,7 +269,7 @@ void Compressor::enableCudaAcceleration(bool enable)
 	}
 	else
 	{
-		m.cuda = new CudaCompressor();
+		m.cuda = new CudaContext();
 
 		if (!m.cuda->isValid())
 		{
@@ -292,17 +293,18 @@ bool Compressor::process(const InputOptions & inputOptions, const CompressionOpt
 	return m.compress(inputOptions.m, compressionOptions.m, outputOptions.m);
 }
 
 /// Estimate the size of compressing the input with the given options.
 int Compressor::estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions) const
 {
 	return m.estimateSize(inputOptions.m, compressionOptions.m);
 }
 
 // RAW api.
 bool Compressor::compress2D(InputFormat format, int w, int h, void * data, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const
 {
-	// @@ Make sure type of input format matches compression format.
+#pragma message(NV_FILE_LINE "TODO: Implement raw compress api")
+	return false;
 }
 
 int Compressor::estimateSize(int w, int h, int d, const CompressionOptions & compressionOptions) const
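The raw compress2D entry point above is still a stub in this commit. Only as an illustration of how it could eventually route through the new compressor interface (this is an assumption, not code from this change), the Private::compress overload declared later in Compressor.h might look roughly like this:

// Hypothetical sketch only: one possible way to dispatch the raw API through the
// new CompressorInterface once the TODO above is implemented. Not part of this commit.
bool Compressor::Private::compress(const void * data, int width, int height,
                                   const CompressionOptions & compressionOptions,
                                   const OutputOptions & outputOptions) const
{
    nv::CompressorInterface * compressor = NULL;

    // Prefer the CUDA path for large images, mirroring the mipmap path below.
    if (cudaEnabled && width * height >= 512)
    {
        compressor = chooseGpuCompressor(compressionOptions.m);
    }
    if (compressor == NULL)
    {
        compressor = chooseCpuCompressor(compressionOptions.m);
    }
    if (compressor == NULL)
    {
        return false;   // unsupported format
    }

    compressor->compress(InputFormat_BGRA_8UB, AlphaMode_None, width, height,
                         const_cast<void *>(data), compressionOptions.m, outputOptions.m);
    delete compressor;
    return true;
}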
@@ -324,16 +326,21 @@ TexImage Compressor::createTexImage() const
 	return *new TexImage();
 }
 
 bool Compressor::outputHeader(const TexImage & tex, int mipmapCount, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const
 {
-	m.outputHeader(tex, mipmapCount, compressionOptions.m, outputOptions.m);
+	return m.outputHeader(tex, mipmapCount, compressionOptions.m, outputOptions.m);
 }
 
 bool Compressor::compress(const TexImage & tex, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const
 {
+#pragma message(NV_FILE_LINE "TODO: Implement TexImage compress api")
 	// @@ Convert to fixed point and call compress2D for each face.
+	return false;
 }
 
+/// Estimate the size of compressing the given texture.
 int Compressor::estimateSize(const TexImage & tex, const CompressionOptions & compressionOptions) const
 {
 	const uint w = tex.width();

@@ -345,6 +352,8 @@ int Compressor::estimateSize(const TexImage & tex, const CompressionOptions & co
 }
 
 bool Compressor::Private::compress(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const
 {
 	// Make sure enums match.

@@ -358,9 +367,7 @@ bool Compressor::Private::compress(const InputOptions::Private & inputOptions, c
 		if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_FileOpen);
 		return false;
 	}
 
-#pragma message(NV_FILE_LINE "TODO: If DefaultOutputHandler, then seek begining of the file.")
-
 	inputOptions.computeTargetExtents();
 
 	// Output DDS header.

@@ -625,7 +632,10 @@ bool Compressor::Private::outputHeader(const TexImage & tex, int mipmapCount, co
 {
 	if (tex.width() <= 0 || tex.height() <= 0 || tex.depth() <= 0 || mipmapCount <= 0)
 	{
-#pragma message(NV_FILE_LINE "TODO: Set invalid argument error.")
+		if (outputOptions.errorHandler != NULL)
+		{
+			outputOptions.errorHandler->error(Error_InvalidInput);
+		}
 		return false;
 	}
@@ -1252,216 +1262,222 @@ void Compressor::Private::quantizeMipmap(Mipmap & mipmap, const CompressionOptio
 }
 
-// Compress the given mipmap.
-bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const
-{
-    if (compressionOptions.format == Format_RGBA)
-    {
-        // Pixel format conversion.
-        if (compressionOptions.pixelType == PixelType_Float)
-        {
-            compressRGB(mipmap.asFloatImage(), outputOptions, compressionOptions);
-        }
-        else
-        {
-            compressRGB(mipmap.asFixedImage(), outputOptions, compressionOptions);
-        }
-    }
-    else
-    {
-        const Image * image = mipmap.asFixedImage();
-        nvDebugCheck(image != NULL);
-
-        // @@ Use FastCompressor::isSupported(compressionOptions.format) to chose compressor.
-        FastCompressor fast;
-        fast.setImage(image, inputOptions.alphaMode);
-
-        SlowCompressor slow;
-        slow.setImage(image, inputOptions.alphaMode);
-
-        const bool useCuda = cudaEnabled && image->width() * image->height() >= 512;
-
-        if (compressionOptions.format == Format_DXT1)
-        {
-#if defined(HAVE_S3QUANT)
-            if (compressionOptions.externalCompressor == "s3")
-            {
-                s3CompressDXT1(image, outputOptions);
-            }
-            else
-#endif
-#if defined(HAVE_ATITC)
-            if (compressionOptions.externalCompressor == "ati")
-            {
-                atiCompressDXT1(image, outputOptions);
-            }
-            else
-#endif
-#if defined(HAVE_SQUISH)
-            if (compressionOptions.externalCompressor == "squish")
-            {
-                squishCompressDXT1(image, outputOptions);
-            }
-            else
-#endif
-#if defined(HAVE_D3DX)
-            if (compressionOptions.externalCompressor == "d3dx")
-            {
-                d3dxCompressDXT1(image, outputOptions);
-            }
-            else
-#endif
-#if defined(HAVE_D3DX)
-            if (compressionOptions.externalCompressor == "stb")
-            {
-                stbCompressDXT1(image, outputOptions);
-            }
-            else
-#endif
-            if (compressionOptions.quality == Quality_Fastest)
-            {
-                fast.compressDXT1(outputOptions);
-            }
-            else
-            {
-                if (useCuda)
-                {
-                    nvDebugCheck(cudaSupported);
-                    cuda->setImage(image, inputOptions.alphaMode);
-                    //cuda->compressDXT1(compressionOptions, outputOptions);
-                    cuda->compressDXT1(compressionOptions, outputOptions);
-                }
-                else
-                {
-                    slow.compressDXT1(compressionOptions, outputOptions);
-                }
-            }
-        }
-        else if (compressionOptions.format == Format_DXT1a)
-        {
-            if (compressionOptions.quality == Quality_Fastest)
-            {
-                fast.compressDXT1a(outputOptions);
-            }
-            else
-            {
-                if (useCuda)
-                {
-                    nvDebugCheck(cudaSupported);
-                    /*cuda*/slow.compressDXT1a(compressionOptions, outputOptions);
-                }
-                else
-                {
-                    slow.compressDXT1a(compressionOptions, outputOptions);
-                }
-            }
-        }
-        else if (compressionOptions.format == Format_DXT1n)
-        {
-            if (useCuda)
-            {
-                nvDebugCheck(cudaSupported);
-                cuda->setImage(image, inputOptions.alphaMode);
-                cuda->compressDXT1n(compressionOptions, outputOptions);
-            }
-            else
-            {
-                if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature);
-            }
-        }
-        else if (compressionOptions.format == Format_DXT3)
-        {
-            if (compressionOptions.quality == Quality_Fastest)
-            {
-                fast.compressDXT3(outputOptions);
-            }
-            else
-            {
-                if (useCuda)
-                {
-                    nvDebugCheck(cudaSupported);
-                    cuda->setImage(image, inputOptions.alphaMode);
-                    cuda->compressDXT3(compressionOptions, outputOptions);
-                }
-                else
-                {
-                    slow.compressDXT3(compressionOptions, outputOptions);
-                }
-            }
-        }
-        else if (compressionOptions.format == Format_DXT5)
-        {
-#if defined(HAVE_ATITC)
-            if (compressionOptions.externalCompressor == "ati")
-            {
-                atiCompressDXT5(image, outputOptions);
-            }
-            else
-#endif
-            if (compressionOptions.quality == Quality_Fastest)
-            {
-                fast.compressDXT5(outputOptions);
-            }
-            else
-            {
-                if (useCuda)
-                {
-                    nvDebugCheck(cudaSupported);
-                    cuda->setImage(image, inputOptions.alphaMode);
-                    cuda->compressDXT5(compressionOptions, outputOptions);
-                }
-                else
-                {
-                    slow.compressDXT5(compressionOptions, outputOptions);
-                }
-            }
-        }
-        else if (compressionOptions.format == Format_DXT5n)
-        {
-            if (compressionOptions.quality == Quality_Fastest)
-            {
-                fast.compressDXT5n(outputOptions);
-            }
-            else
-            {
-                /*if (useCuda)
-                {
-                    nvDebugCheck(cudaSupported);
-                    cuda->setImage(image, inputOptions.alphaMode);
-                    cuda->compressDXT5n(compressionOptions, outputOptions);
-                }
-                else*/
-                {
-                    slow.compressDXT5n(compressionOptions, outputOptions);
-                }
-            }
-        }
-        else if (compressionOptions.format == Format_BC4)
-        {
-            slow.compressBC4(compressionOptions, outputOptions);
-        }
-        else if (compressionOptions.format == Format_BC5)
-        {
-            slow.compressBC5(compressionOptions, outputOptions);
-        }
-        else if (compressionOptions.format == Format_CTX1)
-        {
-            if (useCuda)
-            {
-                nvDebugCheck(cudaSupported);
-                cuda->setImage(image, inputOptions.alphaMode);
-                cuda->compressCTX1(compressionOptions, outputOptions);
-            }
-            else
-            {
-                if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature);
-            }
-        }
+CompressorInterface * Compressor::Private::chooseCpuCompressor(const CompressionOptions::Private & compressionOptions) const
+{
+    if (compressionOptions.format == Format_DXT1)
+    {
+#if defined(HAVE_S3QUANT)
+        if (compressionOptions.externalCompressor == "s3") return new S3CompressorDXT1;
+        else
+#endif
+#if defined(HAVE_ATITC)
+        if (compressionOptions.externalCompressor == "ati") return new AtiCompressorDXT1;
+        else
+#endif
+#if defined(HAVE_SQUISH)
+        if (compressionOptions.externalCompressor == "squish") return new SquishCompressorDXT1;
+        else
+#endif
+#if defined(HAVE_D3DX)
+        if (compressionOptions.externalCompressor == "d3dx") return new D3DXCompressorDXT1;
+        else
+#endif
+#if defined(HAVE_D3DX)
+        if (compressionOptions.externalCompressor == "stb") return new StbCompressorDXT1;
+        else
+#endif
+
+        if (compressionOptions.quality == Quality_Fastest)
+        {
+            return new FastCompressorDXT1;
+        }
+
+        return new NormalCompressorDXT1;
+    }
+    else if (compressionOptions.format == Format_DXT1a)
+    {
+        if (compressionOptions.quality == Quality_Fastest)
+        {
+            return new FastCompressorDXT1a;
+        }
+
+        return new NormalCompressorDXT1a;
+    }
+    else if (compressionOptions.format == Format_DXT1n)
+    {
+        // Not supported.
+    }
+    else if (compressionOptions.format == Format_DXT3)
+    {
+        if (compressionOptions.quality == Quality_Fastest)
+        {
+            return new FastCompressorDXT3;
+        }
+
+        return new NormalCompressorDXT3;
+    }
+    else if (compressionOptions.format == Format_DXT5)
+    {
+#if defined(HAVE_ATITC)
+        if (compressionOptions.externalCompressor == "ati") return new AtiCompressorDXT5;
+        else
+#endif
+
+        if (compressionOptions.quality == Quality_Fastest)
+        {
+            return new FastCompressorDXT5;
+        }
+
+        return new NormalCompressorDXT5;
+    }
+    else if (compressionOptions.format == Format_DXT5n)
+    {
+        if (compressionOptions.quality == Quality_Fastest)
+        {
+            return new FastCompressorDXT5n;
+        }
+
+        return new NormalCompressorDXT5n;
+    }
+    else if (compressionOptions.format == Format_BC4)
+    {
+        if (compressionOptions.quality == Quality_Fastest || compressionOptions.quality == Quality_Normal)
+        {
+            return new FastCompressorBC4;
+        }
+
+        return new ProductionCompressorBC4;
+    }
+    else if (compressionOptions.format == Format_BC5)
+    {
+        if (compressionOptions.quality == Quality_Fastest || compressionOptions.quality == Quality_Normal)
+        {
+            return new FastCompressorBC5;
+        }
+
+        return new ProductionCompressorBC5;
+    }
+    else if (compressionOptions.format == Format_CTX1)
+    {
+        // Not supported.
+    }
+    else if (compressionOptions.format == Format_BC6)
+    {
+        // Not supported.
+    }
+    else if (compressionOptions.format == Format_BC7)
+    {
+        // Not supported.
+    }
+
+    return NULL;
+}
+
+CompressorInterface * Compressor::Private::chooseGpuCompressor(const CompressionOptions::Private & compressionOptions) const
+{
+    nvDebugCheck(cudaSupported);
+
+    if (compressionOptions.quality == Quality_Fastest)
+    {
+        // Do not use CUDA compressors in fastest quality mode.
+        return NULL;
+    }
+
+    if (compressionOptions.format == Format_DXT1)
+    {
+        return new CudaCompressorDXT1(*cuda);
+    }
+    else if (compressionOptions.format == Format_DXT1a)
+    {
+#pragma message(NV_FILE_LINE "TODO: Implement CUDA DXT1a compressor.")
+    }
+    else if (compressionOptions.format == Format_DXT1n)
+    {
+        // Not supported.
+    }
+    else if (compressionOptions.format == Format_DXT3)
+    {
+        return new CudaCompressorDXT3(*cuda);
+    }
+    else if (compressionOptions.format == Format_DXT5)
+    {
+        return new CudaCompressorDXT5(*cuda);
+    }
+    else if (compressionOptions.format == Format_DXT5n)
+    {
+        // @@ Return CUDA compressor.
+    }
+    else if (compressionOptions.format == Format_BC4)
+    {
+        // Not supported.
+    }
+    else if (compressionOptions.format == Format_BC5)
+    {
+        // Not supported.
+    }
+    else if (compressionOptions.format == Format_CTX1)
+    {
+        // @@ Return CUDA compressor.
+    }
+    else if (compressionOptions.format == Format_BC6)
+    {
+        // Not supported.
+    }
+    else if (compressionOptions.format == Format_BC7)
+    {
+        // Not supported.
+    }
+
+    return NULL;
+}
+
+// Compress the given mipmap.
+bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const
+{
+    if (compressionOptions.format == Format_RGBA)
+    {
+        // Pixel format conversion.
+        if (compressionOptions.pixelType == PixelType_Float)
+        {
+            compressRGB(mipmap.asFloatImage(), outputOptions, compressionOptions);
+        }
+        else
+        {
+            compressRGB(mipmap.asFixedImage(), outputOptions, compressionOptions);
+        }
+    }
+    else
+    {
+        const Image * image = mipmap.asFixedImage();
+        nvDebugCheck(image != NULL);
+
+        // Decide what compressor to use.
+        CompressorInterface * compressor = NULL;
+        if (cudaEnabled && image->width() * image->height() >= 512)
+        {
+            compressor = chooseGpuCompressor(compressionOptions);
+        }
+        if (compressor == NULL)
+        {
+            compressor = chooseCpuCompressor(compressionOptions);
+        }
+
+        if (compressor == NULL)
+        {
+            if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature);
+        }
+        else
+        {
+            compressor->compress(InputFormat_BGRA_8UB, inputOptions.alphaMode, image->width(), image->height(), (void *)image->pixels(), compressionOptions, outputOptions);
+
+            delete compressor;
+        }
     }
 }

@@ -27,6 +27,7 @@
 #include <nvcore/Ptr.h>
 
 #include <nvtt/cuda/CudaCompressDXT.h>
+#include <nvtt/CompressDXT.h>
 
 #include "nvtt.h"

@@ -44,6 +45,9 @@ namespace nvtt
 		Private() {}
 
 		bool compress(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
+		bool compress(const void * data, int width, int height, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const;
+
 		int estimateSize(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions) const;
 
 		bool outputHeader(const TexImage & tex, int mipmapCount, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions);

@@ -51,6 +55,10 @@ namespace nvtt
 	private:
 		bool outputHeader(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
 
+		nv::CompressorInterface * chooseCpuCompressor(const CompressionOptions::Private & compressionOptions) const;
+		nv::CompressorInterface * chooseGpuCompressor(const CompressionOptions::Private & compressionOptions) const;
+
 		bool compressMipmaps(uint f, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
 
 		bool initMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f, uint m) const;

@@ -71,7 +79,7 @@ namespace nvtt
 		bool cudaSupported;
 		bool cudaEnabled;
 
-		nv::AutoPtr<nv::CudaCompressor> cuda;
+		nv::AutoPtr<nv::CudaContext> cuda;
 	};

@ -296,6 +296,51 @@ __device__ float3 blockError3(const float3 * colors, uint permutation, float3 a,
// Sort colors // Sort colors
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// @@ Experimental code to avoid duplicate colors for faster compression.
// We could first sort along the best fit line and only compare colors that have the same projection.
// The hardest part is to maintain the indices to map packed/sorted colors to the input colors.
// We also need to update several functions that assume the number of colors is fixed to 16.
// And compute different bit maps for the different color counts.
// This is a fairly high amount of work.
__device__ int packColors(float3 * values, float * weights, int * ranks)
{
const int tid = threadIdx.x;
__shared__ int count;
count = 0;
bool alive = true;
// Append this
for (int i = 0; i < 16; i++)
{
// One thread leads on each iteration.
if (tid == i) {
// If thread alive, then append element.
if (alive) {
values[count] = values[i];
weights[count] = weights[i];
count++;
}
// Otherwise update weight.
else {
weights[ranks[i]] += weights[i];
}
}
// Kill all threads that have the same element and record rank.
if (values[i] == values[tid]) {
alive = false;
ranks[tid] = count - 1;
}
}
return count;
}
__device__ void sortColors(const float * values, int * ranks) __device__ void sortColors(const float * values, int * ranks)
{ {
#if __DEVICE_EMULATION__ #if __DEVICE_EMULATION__
@ -343,12 +388,60 @@ __device__ void sortColors(const float * values, int * ranks)
#endif #endif
} }
__device__ void sortColors(const float * values, int * ranks, int count)
{
#if __DEVICE_EMULATION__
if (threadIdx.x == 0)
{
for (int tid = 0; tid < count; tid++)
{
int rank = 0;
for (int i = 0; i < count; i++)
{
rank += (values[i] < values[tid]);
}
ranks[tid] = rank;
}
// Resolve elements with the same index.
for (int i = 0; i < count-1; i++)
{
for (int tid = 0; tid < count; tid++)
{
if (tid > i && ranks[tid] == ranks[i]) ++ranks[tid];
}
}
}
#else
const int tid = threadIdx.x;
int rank = 0;
#pragma unroll
for (int i = 0; i < count; i++)
{
rank += (values[i] < values[tid]);
}
ranks[tid] = rank;
// Resolve elements with the same index.
#pragma unroll
for (int i = 0; i < count-1; i++)
{
if ((tid > i) & (ranks[tid] == ranks[i])) ++ranks[tid];
}
#endif
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Load color block to shared mem // Load color block to shared mem
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor) /*__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
{ {
const int bid = blockIdx.x; const int bid = blockIdx.x;
const int idx = threadIdx.x; const int idx = threadIdx.x;
@ -389,9 +482,9 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
__debugsync(); __debugsync();
} }
#endif #endif
} }*/
__device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor) __device__ void loadColorBlockTex(uint firstBlock, uint width, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
{ {
const int bid = blockIdx.x; const int bid = blockIdx.x;
const int idx = threadIdx.x; const int idx = threadIdx.x;
@ -400,8 +493,8 @@ __device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sum
if (idx < 16) if (idx < 16)
{ {
float x = 4 * ((bn + bid) % w) + idx % 4; // @@ Avoid mod and div by using 2D grid? float x = 4 * ((firstBlock + bid) % width) + idx % 4; // @@ Avoid mod and div by using 2D grid?
float y = 4 * ((bn + bid) / w) + idx / 4; float y = 4 * ((firstBlock + bid) / width) + idx / 4;
// Read color and copy to shared mem. // Read color and copy to shared mem.
float4 c = tex2D(tex, x, y); float4 c = tex2D(tex, x, y);
@ -437,10 +530,107 @@ __device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sum
__debugsync(); __debugsync();
} }
#endif #endif
}
/*
__device__ void loadColorBlockTex(uint firstBlock, uint w, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
{
const int bid = blockIdx.x;
const int idx = threadIdx.x;
__shared__ float dps[16];
if (idx < 16)
{
float x = 4 * ((firstBlock + bid) % w) + idx % 4; // @@ Avoid mod and div by using 2D grid?
float y = 4 * ((firstBlock + bid) / w) + idx / 4;
// Read color and copy to shared mem.
float4 c = tex2D(tex, x, y);
colors[idx].x = c.z;
colors[idx].y = c.y;
colors[idx].z = c.x;
weights[idx] = 1;
int count = packColors(colors, weights);
if (idx < count)
{
// Sort colors along the best fit line.
colorSums(colors, sums);
float3 axis = bestFitLine(colors, sums[0], kColorMetric);
*sameColor = (axis == make_float3(0, 0, 0));
dps[idx] = dot(colors[idx], axis);
sortColors(dps, xrefs);
float3 tmp = colors[idx];
colors[xrefs[idx]] = tmp;
}
}
} }
*/
__device__ void loadColorBlockTex(uint firstBlock, uint width, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
{
const int bid = blockIdx.x;
const int idx = threadIdx.x;
__shared__ float3 rawColors[16];
__shared__ float dps[16];
if (idx < 16)
{
float x = 4 * ((firstBlock + bid) % width) + idx % 4; // @@ Avoid mod and div by using 2D grid?
float y = 4 * ((firstBlock + bid) / width) + idx / 4;
// Read color and copy to shared mem.
float4 c = tex2D(tex, x, y);
rawColors[idx].x = c.z;
rawColors[idx].y = c.y;
rawColors[idx].z = c.x;
weights[idx] = c.w;
colors[idx] = rawColors[idx] * weights[idx];
// No need to synchronize, 16 < warp size.
__debugsync();
// Sort colors along the best fit line.
colorSums(colors, sums);
float3 axis = bestFitLine(colors, sums[0], kColorMetric);
*sameColor = (axis == make_float3(0, 0, 0));
// Single color compressor needs unweighted colors.
if (*sameColor) colors[idx] = rawColors[idx];
dps[idx] = dot(colors[idx], axis);
__debugsync();
sortColors(dps, xrefs);
float3 tmp = colors[idx];
float w = weights[idx];
__debugsync();
colors[xrefs[idx]] = tmp;
weights[xrefs[idx]] = w;
}
#if __DEVICE_EMULATION__
else
{
__debugsync();
__debugsync();
__debugsync();
}
#endif
}
/*
__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor) __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor)
{ {
const int bid = blockIdx.x; const int bid = blockIdx.x;
@ -494,6 +684,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
} }
#endif #endif
} }
*/
__device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16], int * sameColor) __device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16], int * sameColor)
{ {
@ -1457,48 +1648,15 @@ __device__ void saveSingleColorBlockCTX1(float2 color, uint2 * result)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Compress color block // Compress color block
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void compressDXT1(const uint * permutations, const uint * image, uint2 * result)
{
__shared__ float3 colors[16];
__shared__ float3 sums[16];
__shared__ int xrefs[16];
__shared__ int sameColor;
loadColorBlock(image, colors, sums, xrefs, &sameColor);
__syncthreads(); __global__ void compressDXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result)
if (sameColor)
{
if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
return;
}
ushort bestStart, bestEnd;
uint bestPermutation;
__shared__ float errors[NUM_THREADS];
evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
// Use a parallel reduction to find minimum error.
const int minIdx = findMinError(errors);
// Only write the result of the winner thread.
if (threadIdx.x == minIdx)
{
saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result);
}
}
__global__ void compressDXT1_Tex(uint bn, uint w, const uint * permutations, uint2 * result)
{ {
__shared__ float3 colors[16]; __shared__ float3 colors[16];
__shared__ float3 sums[16]; __shared__ float3 sums[16];
__shared__ int xrefs[16]; __shared__ int xrefs[16];
__shared__ int sameColor; __shared__ int sameColor;
loadColorBlockTex(bn, w, colors, sums, xrefs, &sameColor); loadColorBlockTex(firstBlock, w, colors, sums, xrefs, &sameColor);
__syncthreads(); __syncthreads();
@ -1534,14 +1692,14 @@ __global__ void compressDXT1_Tex(uint bn, uint w, const uint * permutations, uin
} }
__global__ void compressLevel4DXT1(const uint * permutations, const uint * image, uint2 * result) __global__ void compressLevel4DXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result)
{ {
__shared__ float3 colors[16]; __shared__ float3 colors[16];
__shared__ float3 sums[16]; __shared__ float3 sums[16];
__shared__ int xrefs[16]; __shared__ int xrefs[16];
__shared__ int sameColor; __shared__ int sameColor;
loadColorBlock(image, colors, sums, xrefs, &sameColor); loadColorBlockTex(firstBlock, w, colors, sums, xrefs, &sameColor);
__syncthreads(); __syncthreads();
@ -1568,7 +1726,7 @@ __global__ void compressLevel4DXT1(const uint * permutations, const uint * image
} }
} }
__global__ void compressWeightedDXT1(const uint * permutations, const uint * image, uint2 * result) __global__ void compressWeightedDXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result)
{ {
__shared__ float3 colors[16]; __shared__ float3 colors[16];
__shared__ float3 sums[16]; __shared__ float3 sums[16];
@ -1576,7 +1734,7 @@ __global__ void compressWeightedDXT1(const uint * permutations, const uint * ima
__shared__ int xrefs[16]; __shared__ int xrefs[16];
__shared__ int sameColor; __shared__ int sameColor;
loadColorBlock(image, colors, sums, weights, xrefs, &sameColor); loadColorBlockTex(firstBlock, w, colors, sums, weights, xrefs, &sameColor);
__syncthreads(); __syncthreads();
@@ -1987,40 +2145,70 @@ extern "C" void setupCompressKernel(const float weights[3])
 	cudaMemcpyToSymbol(kColorMetricSqr, weightsSqr, sizeof(float) * 3, 0);
 }
 
+extern "C" void bindTextureToArray(cudaArray * d_data)
+{
+	// Setup texture
+	tex.normalized = false;
+	tex.filterMode = cudaFilterModePoint;
+	tex.addressMode[0] = cudaAddressModeClamp;
+	tex.addressMode[1] = cudaAddressModeClamp;
+
+	cudaBindTextureToArray(tex, d_data);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Launch kernel
 ////////////////////////////////////////////////////////////////////////////////
 
-extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
+// DXT1 compressors:
+extern "C" void compressKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
 {
-	compressDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
+	compressDXT1<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
 }
 
-extern "C" void compressKernelDXT1_Tex(uint bn, uint blockNum, uint w, cudaArray * d_data, uint * d_result, uint * d_bitmaps)
+extern "C" void compressKernelDXT1_Level4(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
 {
-	// Setup texture
-	tex.normalized = false;
-	tex.filterMode = cudaFilterModePoint;
-	tex.addressMode[0] = cudaAddressModeClamp;
-	tex.addressMode[1] = cudaAddressModeClamp;
-
-	cudaBindTextureToArray(tex, d_data);
-
-	compressDXT1_Tex<<<blockNum, NUM_THREADS>>>(bn, w, d_bitmaps, (uint2 *)d_result);
+	compressLevel4DXT1<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
 }
 
-extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
+extern "C" void compressWeightedKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
 {
-	compressLevel4DXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
+	compressWeightedDXT1<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
 }
 
-extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
+// @@ DXT1a compressors.
+
+// @@ DXT3 compressors:
+extern "C" void compressKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
 {
-	compressWeightedDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
+	//compressDXT3<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
 }
 
+extern "C" void compressWeightedKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+{
+	//compressWeightedDXT3<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+}
+
+// @@ DXT5 compressors.
+extern "C" void compressKernelDXT5(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+{
+	//compressDXT5<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+}
+
+extern "C" void compressWeightedKernelDXT5(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps)
+{
+	//compressWeightedDXT5<<<blockNum, NUM_THREADS>>>(firstBlock, w, d_bitmaps, (uint2 *)d_result);
+}
+
+/*
 extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
 {
 	compressNormalDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);

@@ -2030,16 +2218,10 @@ extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result
 {
 	compressCTX1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
 }
+*/
 
+/*
 extern "C" void compressKernelDXT5n(uint blockNum, cudaArray * d_data, uint * d_result)
 {
-	// Setup texture
-	tex.normalized = false;
-	tex.filterMode = cudaFilterModePoint;
-	tex.addressMode[0] = cudaAddressModeClamp;
-	tex.addressMode[1] = cudaAddressModeClamp;
-
-	cudaBindTextureToArray(tex, d_data);
-
 	// compressDXT5n<<<blockNum/128, 128>>>(blockNum, (uint2 *)d_result);
 }
+*/

@ -52,16 +52,20 @@ using namespace nvtt;
extern "C" void setupCompressKernel(const float weights[3]); extern "C" void setupCompressKernel(const float weights[3]);
extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); extern "C" void bindTextureToArray(cudaArray * d_data);
extern "C" void compressKernelDXT1_Tex(uint bn, uint blockNum, uint w, cudaArray * d_data, uint * d_result, uint * d_bitmaps);
extern "C" void compressKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps);
extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); extern "C" void compressKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps);
extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); //extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
//extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
#include "Bitmaps.h" // @@ Rename to BitmapTable.h #pragma message(NV_FILE_LINE "TODO: Rename Bitmaps.h to BitmapTable.h")
#include "Bitmaps.h"
/*
// Convert linear image to block linear. // Convert linear image to block linear.
static void convertToBlockLinear(const Image * image, uint * blockLinearImage) static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
{ {
@ -81,45 +85,49 @@ static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
} }
} }
} }
*/
#endif #endif
CudaCompressor::CudaCompressor() : m_bitmapTable(NULL), m_bitmapTableCTX(NULL), m_data(NULL), m_result(NULL) CudaContext::CudaContext() :
{ bitmapTable(NULL),
bitmapTableCTX(NULL),
data(NULL),
result(NULL)
{
#if defined HAVE_CUDA #if defined HAVE_CUDA
// Allocate and upload bitmaps. // Allocate and upload bitmaps.
cudaMalloc((void**) &m_bitmapTable, 992 * sizeof(uint)); cudaMalloc((void**) &bitmapTable, 992 * sizeof(uint));
if (m_bitmapTable != NULL) if (bitmapTable != NULL)
{ {
cudaMemcpy(m_bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice); cudaMemcpy(bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice);
} }
cudaMalloc((void**) &m_bitmapTableCTX, 704 * sizeof(uint)); cudaMalloc((void**) &bitmapTableCTX, 704 * sizeof(uint));
if (bitmapTableCTX != NULL)
if (m_bitmapTableCTX != NULL)
{ {
cudaMemcpy(m_bitmapTableCTX, s_bitmapTableCTX, 704 * sizeof(uint), cudaMemcpyHostToDevice); cudaMemcpy(bitmapTableCTX, s_bitmapTableCTX, 704 * sizeof(uint), cudaMemcpyHostToDevice);
} }
// Allocate scratch buffers. // Allocate scratch buffers.
cudaMalloc((void**) &m_data, MAX_BLOCKS * 64U); cudaMalloc((void**) &data, MAX_BLOCKS * 64U);
cudaMalloc((void**) &m_result, MAX_BLOCKS * 8U); cudaMalloc((void**) &result, MAX_BLOCKS * 8U);
#endif #endif
} }
CudaCompressor::~CudaCompressor() CudaContext::~CudaContext()
{ {
#if defined HAVE_CUDA #if defined HAVE_CUDA
// Free device mem allocations. // Free device mem allocations.
cudaFree(m_data); cudaFree(bitmapTableCTX);
cudaFree(m_result); cudaFree(bitmapTable);
cudaFree(m_bitmapTable); cudaFree(data);
cudaFree(m_bitmapTableCTX); cudaFree(result);
#endif #endif
} }
bool CudaCompressor::isValid() const bool CudaContext::isValid() const
{ {
#if defined HAVE_CUDA #if defined HAVE_CUDA
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
@ -129,91 +137,88 @@ bool CudaCompressor::isValid() const
return false; return false;
} }
#endif #endif
return m_data != NULL && m_result != NULL && m_bitmapTable != NULL; return bitmapTable != NULL && bitmapTableCTX != NULL && data != NULL && result != NULL;
} }
// @@ This code is very repetitive and needs to be cleaned up.
#if 0
struct CudaCompressionKernel CudaCompressor::CudaCompressor(CudaContext & ctx) : m_ctx(ctx)
{ {
virtual void setup(const CompressionOptions::Private & compressionOptions)
{
setupCompressKernel(compressionOptions.colorWeight.ptr());
}
virtual void setBitmapTable();
virtual void runDeviceCode(int count);
virtual void runHostCode(int count);
};
void CudaCompressor::compressKernel(CudaCompressionKernel * kernel) }
void CudaCompressor::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
{ {
nvDebugCheck(cuda::isHardwarePresent()); nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA #if defined HAVE_CUDA
// Allocate image as a cuda array.
cudaArray * d_image;
if (inputFormat == nvtt::InputFormat_BGRA_8UB)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
cudaMallocArray(&d_image, &channelDesc, w, h);
const int imageSize = w * h * sizeof(uint);
cudaMemcpyToArray(d_image, 0, 0, data, imageSize, cudaMemcpyHostToDevice);
}
else
{
#pragma message(NV_FILE_LINE "FIXME: Floating point textures not really supported by CUDA compressors.")
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
cudaMallocArray(&d_image, &channelDesc, w, h);
const int imageSize = w * h * sizeof(uint);
cudaMemcpyToArray(d_image, 0, 0, data, imageSize, cudaMemcpyHostToDevice);
}
// Image size in blocks. // Image size in blocks.
const uint w = (image->width() + 3) / 4; const uint bw = (w + 3) / 4;
const uint h = (image->height() + 3) / 4; const uint bh = (h + 3) / 4;
const uint bs = blockSize();
const uint blockNum = bw * bh;
const uint compressedSize = blockNum * bs;
uint imageSize = w * h * 16 * sizeof(Color32); void * h_result = malloc(min(blockNum, MAX_BLOCKS) * bs);
uint * blockLinearImage = (uint *) malloc(imageSize);
convertToBlockLinear(image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU!
const uint blockNum = w * h; setup(d_image, compressionOptions);
const uint compressedSize = blockNum * 8;
clock_t start = clock(); // Timer timer;
// timer.start();
kernel->setup(compressionOptions);
kernel->setBitmapTable(m_bitmapTable);
// TODO: Add support for multiple GPUs.
uint bn = 0; uint bn = 0;
while(bn != blockNum) while(bn != blockNum)
{ {
uint count = min(blockNum - bn, MAX_BLOCKS); uint count = min(blockNum - bn, MAX_BLOCKS);
cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); compressBlocks(bn, count, w, h, alphaMode, compressionOptions, h_result);
kernel->runDeviceCode(count, m_data, m_result);
kernel->runHostCode(count);
// Check for errors. // Check for errors.
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) if (err != cudaSuccess)
{ {
nvDebug("CUDA Error: %s\n", cudaGetErrorString(err)); //nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
if (outputOptions.errorHandler != NULL) if (outputOptions.errorHandler != NULL)
{ {
outputOptions.errorHandler->error(Error_CudaError); outputOptions.errorHandler->error(Error_CudaError);
} }
} }
// Copy result to host, overwrite swizzled image.
cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
// Output result. // Output result.
kernel->outputResult(outputOptions.outputHandler);
if (outputOptions.outputHandler != NULL) if (outputOptions.outputHandler != NULL)
{ {
outputOptions.outputHandler->writeData(blockLinearImage, count * 8); outputOptions.outputHandler->writeData(h_result, count * bs);
} }
bn += count; bn += count;
} }
clock_t end = clock(); //timer.stop();
//printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC); //printf("\rCUDA time taken: %.3f seconds\n", timer.elapsed() / CLOCKS_PER_SEC);
free(blockLinearImage); free(h_result);
cudaFreeArray(d_image);
#else #else
if (outputOptions.errorHandler != NULL) if (outputOptions.errorHandler != NULL)
@ -221,92 +226,88 @@ void CudaCompressor::compressKernel(CudaCompressionKernel * kernel)
outputOptions.errorHandler->error(Error_CudaError); outputOptions.errorHandler->error(Error_CudaError);
} }
#endif #endif
}
#endif // 0 }
void CudaCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode) void CudaCompressorDXT1::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions)
{ {
m_image = image; setupCompressKernel(compressionOptions.colorWeight.ptr());
m_alphaMode = alphaMode; bindTextureToArray(image);
} }
void CudaCompressorDXT1::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
/// Compress image using CUDA.
void CudaCompressor::compressDXT1(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
{ {
nvDebugCheck(cuda::isHardwarePresent()); // Launch kernel.
#if defined HAVE_CUDA compressKernelDXT1(first, count, w, m_ctx.result, m_ctx.bitmapTable);
// Allocate image as a cuda array. // Copy result to host.
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned); cudaMemcpy(output, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);
}
cudaArray * d_image;
const int imageSize = m_image->width() * m_image->height() * sizeof(uint);
cudaMallocArray(&d_image, &channelDesc, m_image->width(), m_image->height());
cudaMemcpyToArray(d_image, 0, 0, m_image->pixels(), imageSize, cudaMemcpyHostToDevice);
void CudaCompressorDXT3::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions)
{
setupCompressKernel(compressionOptions.colorWeight.ptr());
bindTextureToArray(image);
}
// Image size in blocks. void CudaCompressorDXT3::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
const uint w = (m_image->width() + 3) / 4; {
const uint h = (m_image->height() + 3) / 4; // Launch kernel.
const uint blockNum = w * h; compressKernelDXT3(first, count, w, m_ctx.result, m_ctx.bitmapTable);
const uint compressedSize = blockNum * 8;
void * h_result = malloc(min(blockNum, MAX_BLOCKS) * 8); // Copy result to host.
cudaMemcpy(output, m_ctx.result, count * 16, cudaMemcpyDeviceToHost);
}
//clock_t start = clock();
void CudaCompressorDXT5::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions)
{
setupCompressKernel(compressionOptions.colorWeight.ptr()); setupCompressKernel(compressionOptions.colorWeight.ptr());
bindTextureToArray(image);
}
uint bn = 0; void CudaCompressorDXT5::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
while(bn != blockNum) {
/*// Launch kernel.
compressKernelDXT5(first, count, w, m_ctx.result, m_ctx.bitmapTable);
// Copy result to host.
cudaMemcpy(output, m_ctx.result, count * 16, cudaMemcpyDeviceToHost);*/
// Launch kernel.
if (alphaMode == AlphaMode_Transparency)
{ {
uint count = min(blockNum - bn, MAX_BLOCKS); // compressWeightedKernelDXT1(first, count, w, m_ctx.result, m_ctx.bitmapTable);
}
else
{
// compressKernelDXT1_Level4(first, count, w, m_ctx.result, m_ctx.bitmapTable);
}
// Launch kernel. // Compress alpha in parallel with the GPU.
compressKernelDXT1_Tex(bn, count, w, d_image, m_result, m_bitmapTable); for (uint i = 0; i < count; i++)
{
//ColorBlock rgba(blockLinearImage + (first + i) * 16);
//OptimalCompress::compressDXT3A(rgba, alphaBlocks + i);
}
// Check for errors. // Copy result to host.
cudaError_t err = cudaGetLastError(); cudaMemcpy(output, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
if (outputOptions.errorHandler != NULL) // @@ Interleave color and alpha blocks.
{
outputOptions.errorHandler->error(Error_CudaError);
}
}
// Copy result to host, overwrite swizzled image. }
cudaMemcpy(h_result, m_result, count * 8, cudaMemcpyDeviceToHost);
// Output result.
if (outputOptions.outputHandler != NULL)
{
outputOptions.outputHandler->writeData(h_result, count * 8);
}
bn += count;
}
//clock_t end = clock();
//printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
free(h_result);
#else
if (outputOptions.errorHandler != NULL)
{
outputOptions.errorHandler->error(Error_CudaError);
}
#endif
}
// @@ This code is very repetitive and needs to be cleaned up.
#if 0
 /// Compress image using CUDA.
 void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
@@ -337,16 +338,16 @@ void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
     {
         uint count = min(blockNum - bn, MAX_BLOCKS);
-        cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+        cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

         // Launch kernel.
         if (m_alphaMode == AlphaMode_Transparency)
         {
-            compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
+            compressWeightedKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);
         }
         else
         {
-            compressKernelDXT1_Level4(count, m_data, m_result, m_bitmapTable);
+            compressKernelDXT1_Level4(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);
         }

         // Compress alpha in parallel with the GPU.
@@ -369,7 +370,7 @@ void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
         }

         // Copy result to host, overwrite swizzled image.
-        cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
+        cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);

         // Output result.
         if (outputOptions.outputHandler != NULL)
@@ -428,16 +429,16 @@ void CudaCompressor::compressDXT5(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
     {
         uint count = min(blockNum - bn, MAX_BLOCKS);
-        cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+        cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

         // Launch kernel.
         if (m_alphaMode == AlphaMode_Transparency)
         {
-            compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
+            compressWeightedKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);
         }
         else
         {
-            compressKernelDXT1_Level4(count, m_data, m_result, m_bitmapTable);
+            compressKernelDXT1_Level4(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);
         }

         // Compress alpha in parallel with the GPU.
@@ -460,7 +461,7 @@ void CudaCompressor::compressDXT5(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions)
         }

         // Copy result to host, overwrite swizzled image.
-        cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
+        cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);

         // Output result.
         if (outputOptions.outputHandler != NULL)
@@ -516,10 +517,10 @@ void CudaCompressor::compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
     {
         uint count = min(blockNum - bn, MAX_BLOCKS);
-        cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+        cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

         // Launch kernel.
-        compressNormalKernelDXT1(count, m_data, m_result, m_bitmapTable);
+        compressNormalKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable);

         // Check for errors.
         cudaError_t err = cudaGetLastError();
@@ -534,7 +535,7 @@ void CudaCompressor::compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
         }

         // Copy result to host, overwrite swizzled image.
-        cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
+        cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);

         // Output result.
         if (outputOptions.outputHandler != NULL)
@@ -585,10 +586,10 @@ void CudaCompressor::compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
     {
         uint count = min(blockNum - bn, MAX_BLOCKS);
-        cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+        cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

         // Launch kernel.
-        compressKernelCTX1(count, m_data, m_result, m_bitmapTableCTX);
+        compressKernelCTX1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTableCTX);

         // Check for errors.
         cudaError_t err = cudaGetLastError();
@@ -603,7 +604,7 @@ void CudaCompressor::compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
         }

         // Copy result to host, overwrite swizzled image.
-        cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
+        cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost);

         // Output result.
         if (outputOptions.outputHandler != NULL)
@@ -643,4 +644,4 @@ void CudaCompressor::compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions)
 #endif
 }

+#endif // 0
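The block above is the repetitive per-format code that this change disables; the new CompressorInterface path is meant to replace it with a single driver that batches blocks and defers the kernel launch to each subclass. A rough sketch of what that driver could look like, using only names visible in this diff (the batch size, block-count arithmetic and outputHandler callback mirror the old loop); this is an illustration under those assumptions, not the literal implementation from this commit:

// Hypothetical sketch only -- the real implementation lives in the source file
// that was suppressed from this diff. Headers and the image upload are omitted.
void CudaCompressor::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode,
                              uint w, uint h, void * data,
                              const nvtt::CompressionOptions::Private & compressionOptions,
                              const nvtt::OutputOptions::Private & outputOptions)
{
    // setup(imageArray, compressionOptions);   // derived class binds textures/constants;
                                                // imageArray would be a cudaArray built from 'data'.

    const uint bw = (w + 3) / 4;                // blocks per row
    const uint bh = (h + 3) / 4;                // blocks per column
    const uint blockNum = bw * bh;
    const uint blockMax = 32768;                // assumed batch size (MAX_BLOCKS in the old code)

    void * output = malloc(blockMax * blockSize());

    for (uint first = 0; first < blockNum; first += blockMax)
    {
        const uint count = min(blockNum - first, blockMax);

        // Format-specific kernel launch and device-to-host copy, provided by the subclass.
        compressBlocks(first, count, w, h, alphaMode, compressionOptions, output);

        // Hand the finished batch to the caller, as the old per-format functions did.
        if (outputOptions.outputHandler != NULL)
        {
            outputOptions.outputHandler->writeData(output, count * blockSize());
        }
    }

    free(output);
}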
@@ -27,38 +27,86 @@
 #include <nvimage/nvimage.h>
 #include <nvtt/nvtt.h>
+#include "nvtt/CompressDXT.h"
+
+struct cudaArray;

 namespace nv
 {
     class Image;

-    class CudaCompressor
+    class CudaContext
     {
     public:
-        CudaCompressor();
-        ~CudaCompressor();
+        CudaContext();
+        ~CudaContext();

         bool isValid() const;

-        void setImage(const Image * image, nvtt::AlphaMode alphaMode);
-
-        void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-        void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-        void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-        void compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-        void compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-        void compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
-
-    private:
-        uint * m_bitmapTable;
-        uint * m_bitmapTableCTX;
-        uint * m_data;
-        uint * m_result;
-
-        const Image * m_image;
-        nvtt::AlphaMode m_alphaMode;
+    public:
+        // Device pointers.
+        uint * bitmapTable;
+        uint * bitmapTableCTX;
+        uint * data;
+        uint * result;
     };

+    struct CudaCompressor : public CompressorInterface
+    {
+        CudaCompressor(CudaContext & ctx);
+
+        virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions);
+
+        virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions) = 0;
+        virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+        virtual uint blockSize() const = 0;
+
+    protected:
+        CudaContext & m_ctx;
+    };
+
+    struct CudaCompressorDXT1 : public CudaCompressor
+    {
+        CudaCompressorDXT1(CudaContext & ctx) : CudaCompressor(ctx) {}
+
+        virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
+        virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 8; };
+    };
+
+    /*struct CudaCompressorDXT1n : public CudaCompressor
+    {
+        virtual void setup(const CompressionOptions::Private & compressionOptions);
+        virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+        virtual uint blockSize() const { return 8; };
+    };*/
+
+    struct CudaCompressorDXT3 : public CudaCompressor
+    {
+        CudaCompressorDXT3(CudaContext & ctx) : CudaCompressor(ctx) {}
+
+        virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
+        virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 16; };
+    };
+
+    struct CudaCompressorDXT5 : public CudaCompressor
+    {
+        CudaCompressorDXT5(CudaContext & ctx) : CudaCompressor(ctx) {}
+
+        virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions);
+        virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output);
+        virtual uint blockSize() const { return 16; };
+    };
+
+    /*struct CudaCompressorCXT1 : public CudaCompressor
+    {
+        virtual void setup(const CompressionOptions::Private & compressionOptions);
+        virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0;
+        virtual uint blockSize() const { return 8; };
+    };*/
+
 } // nv namespace
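The refactored header separates the shared CUDA state (CudaContext owns the device allocations) from the per-format compressors, which only implement setup(), compressBlocks() and blockSize(). A hypothetical caller-side sketch follows; the variables (w, h, imageData) and the two ::Private option structs that the library threads through internally are assumed to be in scope, and none of this appears verbatim in the commit:

// Hypothetical usage of the new CUDA path, based only on the declarations above.
nv::CudaContext ctx;                    // allocates bitmap tables and scratch buffers on the device
if (ctx.isValid())                      // false when no usable CUDA device is present
{
    nv::CudaCompressorDXT1 dxt1(ctx);   // several compressors can share one context
    dxt1.compress(nvtt::InputFormat_BGRA_8UB, nvtt::AlphaMode_None,
                  w, h, imageData, compressionOptions, outputOptions);
}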
@@ -93,6 +93,9 @@ namespace nvtt
         Format_DXT1n,
         Format_CTX1,
         Format_YCoCg_DXT5,
+        Format_BC6,
+        Format_BC7,
     };

     /// Pixel types.
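On the public API side, Format_BC6 and Format_BC7 are only enum placeholders at this point; once compressors are wired up for them they would presumably be requested through the same nvtt entry points as the existing formats. A minimal sketch of that flow, with a placeholder file name and image size:

// Hypothetical example of selecting a format through the public nvtt API.
#include <nvtt/nvtt.h>

void compressToFile(const void * bgraPixels)        // 32-bit BGRA, nvtt's default input format
{
    nvtt::InputOptions input;
    input.setTextureLayout(nvtt::TextureType_2D, 512, 512);
    input.setMipmapData(bgraPixels, 512, 512);

    nvtt::CompressionOptions compression;
    compression.setFormat(nvtt::Format_BC7);        // new enum value; no BC7 compressor exists yet at this point

    nvtt::OutputOptions output;
    output.setFileName("output.dds");

    nvtt::Compressor compressor;
    compressor.process(input, compression, output);
}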