diff --git a/src/nvtt/CompressDXT.cpp b/src/nvtt/CompressDXT.cpp index dea258b..a8684d6 100644 --- a/src/nvtt/CompressDXT.cpp +++ b/src/nvtt/CompressDXT.cpp @@ -1,784 +1,675 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include "CompressDXT.h" -#include "QuickCompressDXT.h" -#include "OptimalCompressDXT.h" -#include "CompressionOptions.h" -#include "OutputOptions.h" - -// squish -#include "squish/colourset.h" -#include "squish/fastclusterfit.h" -#include "squish/weightedclusterfit.h" - -#include - -#include - -#include -#include -#include - - -// s3_quant -#if defined(HAVE_S3QUANT) -#include "s3tc/s3_quant.h" -#endif - -// ati tc -#if defined(HAVE_ATITC) -typedef int BOOL; -typedef _W64 unsigned long ULONG_PTR; -typedef ULONG_PTR DWORD_PTR; -#include "atitc/ATI_Compress.h" -#endif - -// squish -#if defined(HAVE_SQUISH) -//#include "squish/squish.h" -#include "squish-1.10/squish.h" -#endif - -// d3dx -#if defined(HAVE_D3DX) -#include -#endif - -// stb -#if defined(HAVE_STB) -#define STB_DEFINE -#include "stb/stb_dxt.h" -#endif - - -using namespace nv; -using namespace nvtt; - - -nv::FastCompressor::FastCompressor() : m_image(NULL), m_alphaMode(AlphaMode_None) -{ -} - -nv::FastCompressor::~FastCompressor() -{ -} - -void nv::FastCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode) -{ - m_image = image; - m_alphaMode = alphaMode; -} - -void nv::FastCompressor::compressDXT1(const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT1 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - rgba.init(m_image, x, y); - - QuickCompress::compressDXT1(rgba, &block); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::FastCompressor::compressDXT1a(const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT1 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - rgba.init(m_image, x, y); - - QuickCompress::compressDXT1a(rgba, &block); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::FastCompressor::compressDXT3(const nvtt::OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT3 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - rgba.init(m_image, x, y); - - QuickCompress::compressDXT3(rgba, &block); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::FastCompressor::compressDXT5(const nvtt::OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT5 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - rgba.init(m_image, x, y); - - QuickCompress::compressDXT5(rgba, &block, 0); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::FastCompressor::compressDXT5n(const nvtt::OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT5 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - rgba.init(m_image, x, y); - - rgba.swizzleDXT5n(); - - QuickCompress::compressDXT5(rgba, &block, 0); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -nv::SlowCompressor::SlowCompressor() : m_image(NULL), m_alphaMode(AlphaMode_None) -{ -} - -nv::SlowCompressor::~SlowCompressor() -{ -} - -void nv::SlowCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode) -{ - m_image = image; - m_alphaMode = alphaMode; -} - -void nv::SlowCompressor::compressDXT1(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT1 block; - - nvsquish::WeightedClusterFit fit; - //nvsquish::ClusterFit fit; - //nvsquish::FastClusterFit fit; - fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z()); - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - rgba.init(m_image, x, y); - - if (rgba.isSingleColor()) - { - OptimalCompress::compressDXT1(rgba.color(0), &block); - } - else - { - nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0, true); - fit.SetColourSet(&colours, nvsquish::kDxt1); - fit.Compress(&block); - } - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::SlowCompressor::compressDXT1a(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT1 block; - - nvsquish::WeightedClusterFit fit; - fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z()); - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - rgba.init(m_image, x, y); - - bool anyAlpha = false; - bool allAlpha = true; - - for (uint i = 0; i < 16; i++) - { - if (rgba.color(i).a < 128) anyAlpha = true; - else allAlpha = false; - } - - if ((!anyAlpha && rgba.isSingleColor() || allAlpha)) - { - OptimalCompress::compressDXT1a(rgba.color(0), &block); - } - else - { - nvsquish::ColourSet colours((uint8 *)rgba.colors(), nvsquish::kDxt1|nvsquish::kWeightColourByAlpha); - fit.SetColourSet(&colours, nvsquish::kDxt1); - fit.Compress(&block); - } - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::SlowCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT3 block; - - nvsquish::WeightedClusterFit fit; - fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z()); - - int flags = 0; - if (m_alphaMode == AlphaMode_Transparency) - { - flags = nvsquish::kWeightColourByAlpha; - } - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - rgba.init(m_image, x, y); - - // Compress explicit alpha. - OptimalCompress::compressDXT3A(rgba, &block.alpha); - - // Compress color. - if (rgba.isSingleColor()) - { - OptimalCompress::compressDXT1(rgba.color(0), &block.color); - } - else - { - nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); - fit.SetColourSet(&colours, 0); - fit.Compress(&block.color); - } - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - -void nv::SlowCompressor::compressDXT5(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT5 block; - - nvsquish::WeightedClusterFit fit; - fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z()); - - int flags = 0; - if (m_alphaMode == AlphaMode_Transparency) - { - flags = nvsquish::kWeightColourByAlpha; - } - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - rgba.init(m_image, x, y); - - // Compress alpha. - if (compressionOptions.quality == Quality_Highest) - { - OptimalCompress::compressDXT5A(rgba, &block.alpha); - } - else - { - QuickCompress::compressDXT5A(rgba, &block.alpha); - } - - // Compress color. - if (rgba.isSingleColor()) - { - OptimalCompress::compressDXT1(rgba.color(0), &block.color); - } - else - { - nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); - fit.SetColourSet(&colours, 0); - fit.Compress(&block.color); - } - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::SlowCompressor::compressDXT5n(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT5 block; - - nvsquish::WeightedClusterFit fit; - fit.SetMetric(0, 1, 0); - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - rgba.init(m_image, x, y); - - rgba.swizzleDXT5n(); - - // Compress X. - if (compressionOptions.quality == Quality_Highest) - { - OptimalCompress::compressDXT5A(rgba, &block.alpha); - } - else - { - QuickCompress::compressDXT5A(rgba, &block.alpha); - } - - // Compress Y. - //OptimalCompress::compressDXT1G(rgba, &block.color); - - /*if (rgba.isSingleColor()) - { - OptimalCompress::compressDXT1G(rgba.color(0), &block.color); - } - else*/ - { - nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0); - fit.SetColourSet(&colours, 0); - fit.Compress(&block.color); - } - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::SlowCompressor::compressBC4(const CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - AlphaBlockDXT5 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - rgba.init(m_image, x, y); - - if (compressionOptions.quality == Quality_Highest) - { - OptimalCompress::compressDXT5A(rgba, &block); - } - else - { - QuickCompress::compressDXT5A(rgba, &block); - } - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::SlowCompressor::compressBC5(const CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock xcolor; - ColorBlock ycolor; - - BlockATI2 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - xcolor.init(m_image, x, y); - xcolor.splatX(); - - ycolor.init(m_image, x, y); - ycolor.splatY(); - - if (compressionOptions.quality == Quality_Highest) - { - OptimalCompress::compressDXT5A(xcolor, &block.x); - OptimalCompress::compressDXT5A(ycolor, &block.y); - } - else - { - QuickCompress::compressDXT5A(xcolor, &block.x); - QuickCompress::compressDXT5A(ycolor, &block.y); - } - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -#if defined(HAVE_S3QUANT) - -void nv::s3CompressDXT1(const Image * image, const OutputOptions::Private & outputOptions) -{ - const uint w = image->width(); - const uint h = image->height(); - - float error = 0.0f; - - BlockDXT1 dxtBlock3; - BlockDXT1 dxtBlock4; - ColorBlock block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - block.init(image, x, y); - - // Init rgb block. - RGBBlock rgbBlock; - rgbBlock.n = 16; - for (uint i = 0; i < 16; i++) { - rgbBlock.colorChannel[i][0] = clamp(float(block.color(i).r) / 255.0f, 0.0f, 1.0f); - rgbBlock.colorChannel[i][1] = clamp(float(block.color(i).g) / 255.0f, 0.0f, 1.0f); - rgbBlock.colorChannel[i][2] = clamp(float(block.color(i).b) / 255.0f, 0.0f, 1.0f); - } - rgbBlock.weight[0] = 1.0f; - rgbBlock.weight[1] = 1.0f; - rgbBlock.weight[2] = 1.0f; - - rgbBlock.inLevel = 4; - CodeRGBBlock(&rgbBlock); - - // Copy results to DXT block. - dxtBlock4.col0.r = rgbBlock.endPoint[0][0]; - dxtBlock4.col0.g = rgbBlock.endPoint[0][1]; - dxtBlock4.col0.b = rgbBlock.endPoint[0][2]; - - dxtBlock4.col1.r = rgbBlock.endPoint[1][0]; - dxtBlock4.col1.g = rgbBlock.endPoint[1][1]; - dxtBlock4.col1.b = rgbBlock.endPoint[1][2]; - - dxtBlock4.setIndices(rgbBlock.index); - - if (dxtBlock4.col0.u < dxtBlock4.col1.u) { - swap(dxtBlock4.col0.u, dxtBlock4.col1.u); - dxtBlock4.indices ^= 0x55555555; - } - - uint error4 = blockError(block, dxtBlock4); - - rgbBlock.inLevel = 3; - - CodeRGBBlock(&rgbBlock); - - // Copy results to DXT block. - dxtBlock3.col0.r = rgbBlock.endPoint[0][0]; - dxtBlock3.col0.g = rgbBlock.endPoint[0][1]; - dxtBlock3.col0.b = rgbBlock.endPoint[0][2]; - - dxtBlock3.col1.r = rgbBlock.endPoint[1][0]; - dxtBlock3.col1.g = rgbBlock.endPoint[1][1]; - dxtBlock3.col1.b = rgbBlock.endPoint[1][2]; - - dxtBlock3.setIndices(rgbBlock.index); - - if (dxtBlock3.col0.u > dxtBlock3.col1.u) { - swap(dxtBlock3.col0.u, dxtBlock3.col1.u); - dxtBlock3.indices ^= (~dxtBlock3.indices >> 1) & 0x55555555; - } - - uint error3 = blockError(block, dxtBlock3); - - if (error3 < error4) { - error += error3; - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&dxtBlock3, sizeof(dxtBlock3)); - } - } - else { - error += error4; - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&dxtBlock4, sizeof(dxtBlock4)); - } - } - } - } -} - -#endif // defined(HAVE_S3QUANT) - - -#if defined(HAVE_ATITC) - -void nv::atiCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions) -{ - // Init source texture - ATI_TC_Texture srcTexture; - srcTexture.dwSize = sizeof(srcTexture); - srcTexture.dwWidth = image->width(); - srcTexture.dwHeight = image->height(); - srcTexture.dwPitch = image->width() * 4; - srcTexture.format = ATI_TC_FORMAT_ARGB_8888; - srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture); - srcTexture.pData = (ATI_TC_BYTE*) image->pixels(); - - // Init dest texture - ATI_TC_Texture destTexture; - destTexture.dwSize = sizeof(destTexture); - destTexture.dwWidth = image->width(); - destTexture.dwHeight = image->height(); - destTexture.dwPitch = 0; - destTexture.format = ATI_TC_FORMAT_DXT1; - destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture); - destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize); - - // Compress - ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize); - } - - mem::free(destTexture.pData); -} - -void nv::atiCompressDXT5(const Image * image, const OutputOptions::Private & outputOptions) -{ - // Init source texture - ATI_TC_Texture srcTexture; - srcTexture.dwSize = sizeof(srcTexture); - srcTexture.dwWidth = image->width(); - srcTexture.dwHeight = image->height(); - srcTexture.dwPitch = image->width() * 4; - srcTexture.format = ATI_TC_FORMAT_ARGB_8888; - srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture); - srcTexture.pData = (ATI_TC_BYTE*) image->pixels(); - - // Init dest texture - ATI_TC_Texture destTexture; - destTexture.dwSize = sizeof(destTexture); - destTexture.dwWidth = image->width(); - destTexture.dwHeight = image->height(); - destTexture.dwPitch = 0; - destTexture.format = ATI_TC_FORMAT_DXT5; - destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture); - destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize); - - // Compress - ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize); - } - - mem::free(destTexture.pData); -} - -#endif // defined(HAVE_ATITC) - -#if defined(HAVE_SQUISH) - -void nv::squishCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions) -{ - Image img(*image); - int count = img.width() * img.height(); - for (int i = 0; i < count; i++) - { - Color32 c = img.pixel(i); - img.pixel(i) = Color32(c.b, c.g, c.r, c.a); - } - - int size = squish::GetStorageRequirements(img.width(), img.height(), squish::kDxt1); - void * blocks = mem::malloc(size); - - squish::CompressImage((const squish::u8 *)img.pixels(), img.width(), img.height(), blocks, squish::kDxt1 | squish::kColourClusterFit); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(blocks, size); - } - - mem::free(blocks); -} - -#endif // defined(HAVE_SQUISH) - - -#if defined(HAVE_D3DX) - -void nv::d3dxCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions) -{ - IDirect3D9 * d3d = Direct3DCreate9(D3D_SDK_VERSION); - - D3DPRESENT_PARAMETERS presentParams; - ZeroMemory(&presentParams, sizeof(presentParams)); - presentParams.Windowed = TRUE; - presentParams.SwapEffect = D3DSWAPEFFECT_COPY; - presentParams.BackBufferWidth = 8; - presentParams.BackBufferHeight = 8; - presentParams.BackBufferFormat = D3DFMT_UNKNOWN; - - HRESULT err; - - IDirect3DDevice9 * device = NULL; - err = d3d->CreateDevice(D3DADAPTER_DEFAULT, D3DDEVTYPE_REF, GetDesktopWindow(), D3DCREATE_SOFTWARE_VERTEXPROCESSING, &presentParams, &device); - - IDirect3DTexture9 * texture = NULL; - err = D3DXCreateTexture(device, image->width(), image->height(), 1, 0, D3DFMT_DXT1, D3DPOOL_SYSTEMMEM, &texture); - - IDirect3DSurface9 * surface = NULL; - err = texture->GetSurfaceLevel(0, &surface); - - RECT rect; - rect.left = 0; - rect.top = 0; - rect.bottom = image->height(); - rect.right = image->width(); - - err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, image->pixels(), D3DFMT_A8R8G8B8, image->width() * sizeof(Color32), NULL, &rect, D3DX_DEFAULT, 0); - - if (err != D3DERR_INVALIDCALL && err != D3DXERR_INVALIDDATA) - { - D3DLOCKED_RECT rect; - ZeroMemory(&rect, sizeof(rect)); - - err = surface->LockRect(&rect, NULL, D3DLOCK_READONLY); - - if (outputOptions.outputHandler != NULL) { - int size = rect.Pitch * ((image->height() + 3) / 4); - outputOptions.outputHandler->writeData(rect.pBits, size); - } - - err = surface->UnlockRect(); - } - - surface->Release(); - device->Release(); - d3d->Release(); -} - -#endif // defined(HAVE_D3DX) - - -#if defined(HAVE_STB) - -void nv::stbCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions) -{ - const uint w = image->width(); - const uint h = image->height(); - - float error = 0.0f; - - BlockDXT1 dxtBlock; - ColorBlock block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - block.init(image, x, y); - block.swizzleSTB(); - - stb_compress_dxt_block((unsigned char *)&dxtBlock, (unsigned char *)block.colors(), 0, 0); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&dxtBlock, sizeof(dxtBlock)); - } - } - } -} - -#endif // defined(HAVE_STB) +// Copyright NVIDIA Corporation 2007 -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "CompressDXT.h" +#include "QuickCompressDXT.h" +#include "OptimalCompressDXT.h" +#include "CompressionOptions.h" +#include "OutputOptions.h" + +// squish +#include "squish/colourset.h" +#include "squish/fastclusterfit.h" +#include "squish/weightedclusterfit.h" + +#include + +#include + +#include +#include +#include + + +// s3_quant +#if defined(HAVE_S3QUANT) +#include "s3tc/s3_quant.h" +#endif + +// ati tc +#if defined(HAVE_ATITC) +typedef int BOOL; +typedef _W64 unsigned long ULONG_PTR; +typedef ULONG_PTR DWORD_PTR; +#include "atitc/ATI_Compress.h" +#endif + +// squish +#if defined(HAVE_SQUISH) +//#include "squish/squish.h" +#include "squish-1.10/squish.h" +#endif + +// d3dx +#if defined(HAVE_D3DX) +#include +#endif + +// stb +#if defined(HAVE_STB) +#define STB_DEFINE +#include "stb/stb_dxt.h" +#endif + +#pragma message(NV_FILE_LINE "FIXME: Define HAVE_OPENMP from cmake.") +#define HAVE_OPENMP +#include + +using namespace nv; +using namespace nvtt; + + +void FixedBlockCompressor::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + const uint bs = blockSize(); + const uint bw = (w + 3) / 4; + const uint bh = (h + 3) / 4; + const uint size = bs * bw * bh; + +#if defined(HAVE_OPENMP) + bool singleThreaded = false; +#else + bool singleThreaded = true; +#endif + + // Use a single thread to compress small textures. + if (bw * bh < 16) singleThreaded = true; + + if (singleThreaded) + { + nvDebugCheck(bs <= 16); + uint8 mem[16]; + + for (int y = 0; y < int(h); y += 4) { + for (uint x = 0; x < w; x += 4) { + + ColorBlock rgba; + if (inputFormat == nvtt::InputFormat_BGRA_8UB) { + rgba.init(w, h, (uint *)data, x, y); + } + else { + nvDebugCheck(inputFormat == nvtt::InputFormat_RGBA_32F); + rgba.init(w, h, (float *)data, x, y); + } + + compressBlock(rgba, alphaMode, compressionOptions, mem); + + if (outputOptions.outputHandler != NULL) { + outputOptions.outputHandler->writeData(mem, bs); + } + } + } + } +#if defined(HAVE_OPENMP) + else + { + uint8 * mem = new uint8[size]; + + #pragma omp parallel + { + #pragma omp for + for (int i = 0; i < int(bw*bh); i++) + { + const uint x = i % bw; + const uint y = i / bw; + + ColorBlock rgba; + if (inputFormat == nvtt::InputFormat_BGRA_8UB) { + rgba.init(w, h, (uint *)data, 4*x, 4*y); + } + else { + nvDebugCheck(inputFormat == nvtt::InputFormat_RGBA_32F); + rgba.init(w, h, (float *)data, 4*x, 4*y); + } + + uint8 * ptr = mem + (y * bw + x) * bs; + compressBlock(rgba, alphaMode, compressionOptions, ptr); + } // omp for + } // omp parallel + + if (outputOptions.outputHandler != NULL) { + outputOptions.outputHandler->writeData(mem, size); + } + + delete [] mem; + } +#endif +} + + +void FastCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT1 * block = new(output) BlockDXT1; + QuickCompress::compressDXT1(rgba, block); +} + +void FastCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT1 * block = new(output) BlockDXT1; + QuickCompress::compressDXT1a(rgba, block); +} + +void FastCompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT3 * block = new(output) BlockDXT3; + QuickCompress::compressDXT3(rgba, block); +} + +void FastCompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT5 * block = new(output) BlockDXT5; + QuickCompress::compressDXT5(rgba, block); +} + +void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + rgba.swizzle(4, 1, 5, 0); // 0xFF, G, 0, R + + BlockDXT5 * block = new(output) BlockDXT5; + QuickCompress::compressDXT5(rgba, block); +} + +void FastCompressorBC4::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockATI1 * block = new(output) BlockATI1; + + rgba.swizzle(0, 1, 2, 0); // Copy red to alpha + QuickCompress::compressDXT5A(rgba, &block->alpha); +} + +void FastCompressorBC5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockATI2 * block = new(output) BlockATI2; + + rgba.swizzle(0, 1, 2, 0); // Copy red to alpha + QuickCompress::compressDXT5A(rgba, &block->x); + + rgba.swizzle(0, 1, 2, 1); // Copy green to alpha + QuickCompress::compressDXT5A(rgba, &block->y); +} + + +void NormalCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + nvsquish::WeightedClusterFit fit; + fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z()); + + if (rgba.isSingleColor()) + { + BlockDXT1 * block = new(output) BlockDXT1; + OptimalCompress::compressDXT1(rgba.color(0), block); + } + else + { + nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0); + fit.SetColourSet(&colours, nvsquish::kDxt1); + fit.Compress(output); + } +} + + +void NormalCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + bool anyAlpha = false; + bool allAlpha = true; + + for (uint i = 0; i < 16; i++) + { + if (rgba.color(i).a < 128) anyAlpha = true; + else allAlpha = false; + } + + const bool isSingleColor = rgba.isSingleColor(); + + if ((!anyAlpha && isSingleColor || allAlpha)) + { + BlockDXT1 * block = new(output) BlockDXT1; + OptimalCompress::compressDXT1a(rgba.color(0), block); + } + else + { + nvsquish::WeightedClusterFit fit; + fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z()); + + int flags = nvsquish::kDxt1; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + + nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); + fit.SetColourSet(&colours, nvsquish::kDxt1); + + fit.Compress(output); + } +} + + +void NormalCompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT3 * block = new(output) BlockDXT3; + + // Compress explicit alpha. + OptimalCompress::compressDXT3A(rgba, &block->alpha); + + // Compress color. + if (rgba.isSingleColor()) + { + OptimalCompress::compressDXT1(rgba.color(0), &block->color); + } + else + { + nvsquish::WeightedClusterFit fit; + fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z()); + + int flags = 0; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + + nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); + fit.SetColourSet(&colours, 0); + fit.Compress(&block->color); + } +} + + +void NormalCompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT5 * block = new(output) BlockDXT5; + + // Compress alpha. + if (compressionOptions.quality == Quality_Highest) + { + OptimalCompress::compressDXT5A(rgba, &block->alpha); + } + else + { + QuickCompress::compressDXT5A(rgba, &block->alpha); + } + + // Compress color. + if (rgba.isSingleColor()) + { + OptimalCompress::compressDXT1(rgba.color(0), &block->color); + } + else + { + nvsquish::WeightedClusterFit fit; + fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z()); + + int flags = 0; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + + nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); + fit.SetColourSet(&colours, 0); + fit.Compress(&block->color); + } +} + + +void NormalCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + rgba.swizzle(4, 1, 5, 0); // 0xFF, G, 0, R + + BlockDXT5 * block = new(output) BlockDXT5; + + // Compress X. + if (compressionOptions.quality == Quality_Highest) + { + OptimalCompress::compressDXT5A(rgba, &block->alpha); + } + else + { + QuickCompress::compressDXT5A(rgba, &block->alpha); + } + + // Compress Y. + if (compressionOptions.quality == Quality_Highest) + { + OptimalCompress::compressDXT1G(rgba, &block->color); + } + else + { + if (rgba.isSingleColor()) + { + OptimalCompress::compressDXT1G(rgba.color(0), &block->color); + } + else + { + nvsquish::WeightedClusterFit fit; + fit.SetMetric(0, 1, 0); + + int flags = 0; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + + nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); + fit.SetColourSet(&colours, 0); + fit.Compress(&block->color); + } + } +} + + +void ProductionCompressorBC4::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockATI1 * block = new(output) BlockATI1; + + rgba.swizzle(0, 1, 2, 0); // Copy red to alpha + OptimalCompress::compressDXT5A(rgba, &block->alpha); +} + +void ProductionCompressorBC5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockATI2 * block = new(output) BlockATI2; + + rgba.swizzle(0, 1, 2, 0); // Copy red to alpha + OptimalCompress::compressDXT5A(rgba, &block->x); + + rgba.swizzle(0, 1, 2, 1); // Copy green to alpha + OptimalCompress::compressDXT5A(rgba, &block->y); +} + + + +#if defined(HAVE_S3QUANT) + +void S3CompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + float error = 0.0f; + + BlockDXT1 dxtBlock3; + BlockDXT1 dxtBlock4; + ColorBlock block; + + for (uint y = 0; y < h; y += 4) { + for (uint x = 0; x < w; x += 4) { + block.init(inputFormat, w, h, data, x, y); + + // Init rgb block. + RGBBlock rgbBlock; + rgbBlock.n = 16; + for (uint i = 0; i < 16; i++) { + rgbBlock.colorChannel[i][0] = clamp(float(block.color(i).r) / 255.0f, 0.0f, 1.0f); + rgbBlock.colorChannel[i][1] = clamp(float(block.color(i).g) / 255.0f, 0.0f, 1.0f); + rgbBlock.colorChannel[i][2] = clamp(float(block.color(i).b) / 255.0f, 0.0f, 1.0f); + } + rgbBlock.weight[0] = 1.0f; + rgbBlock.weight[1] = 1.0f; + rgbBlock.weight[2] = 1.0f; + + rgbBlock.inLevel = 4; + CodeRGBBlock(&rgbBlock); + + // Copy results to DXT block. + dxtBlock4.col0.r = rgbBlock.endPoint[0][0]; + dxtBlock4.col0.g = rgbBlock.endPoint[0][1]; + dxtBlock4.col0.b = rgbBlock.endPoint[0][2]; + + dxtBlock4.col1.r = rgbBlock.endPoint[1][0]; + dxtBlock4.col1.g = rgbBlock.endPoint[1][1]; + dxtBlock4.col1.b = rgbBlock.endPoint[1][2]; + + dxtBlock4.setIndices(rgbBlock.index); + + if (dxtBlock4.col0.u < dxtBlock4.col1.u) { + swap(dxtBlock4.col0.u, dxtBlock4.col1.u); + dxtBlock4.indices ^= 0x55555555; + } + + uint error4 = blockError(block, dxtBlock4); + + rgbBlock.inLevel = 3; + + CodeRGBBlock(&rgbBlock); + + // Copy results to DXT block. + dxtBlock3.col0.r = rgbBlock.endPoint[0][0]; + dxtBlock3.col0.g = rgbBlock.endPoint[0][1]; + dxtBlock3.col0.b = rgbBlock.endPoint[0][2]; + + dxtBlock3.col1.r = rgbBlock.endPoint[1][0]; + dxtBlock3.col1.g = rgbBlock.endPoint[1][1]; + dxtBlock3.col1.b = rgbBlock.endPoint[1][2]; + + dxtBlock3.setIndices(rgbBlock.index); + + if (dxtBlock3.col0.u > dxtBlock3.col1.u) { + swap(dxtBlock3.col0.u, dxtBlock3.col1.u); + dxtBlock3.indices ^= (~dxtBlock3.indices >> 1) & 0x55555555; + } + + uint error3 = blockError(block, dxtBlock3); + + if (error3 < error4) { + error += error3; + + if (outputOptions.outputHandler != NULL) { + outputOptions.outputHandler->writeData(&dxtBlock3, sizeof(dxtBlock3)); + } + } + else { + error += error4; + + if (outputOptions.outputHandler != NULL) { + outputOptions.outputHandler->writeData(&dxtBlock4, sizeof(dxtBlock4)); + } + } + } + } +} + +#endif // defined(HAVE_S3QUANT) + + +#if defined(HAVE_ATITC) + +void AtiCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + // Init source texture + ATI_TC_Texture srcTexture; + srcTexture.dwSize = sizeof(srcTexture); + srcTexture.dwWidth = w; + srcTexture.dwHeight = h; + if (inputFormat == nvtt::InputFormat_BGRA_8UB) + { + srcTexture.dwPitch = w * 4; + srcTexture.format = ATI_TC_FORMAT_ARGB_8888; + } + else + { + srcTexture.dwPitch = w * 16; + srcTexture.format = ATI_TC_FORMAT_ARGB_32F; + } + srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture); + srcTexture.pData = (ATI_TC_BYTE*) data; + + // Init dest texture + ATI_TC_Texture destTexture; + destTexture.dwSize = sizeof(destTexture); + destTexture.dwWidth = w; + destTexture.dwHeight = h; + destTexture.dwPitch = 0; + destTexture.format = ATI_TC_FORMAT_DXT1; + destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture); + destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize); + + ATI_TC_CompressOptions options; + options.dwSize = sizeof(options); + options.bUseChannelWeighting = false; + options.bUseAdaptiveWeighting = false; + options.bDXT1UseAlpha = false; + options.nCompressionSpeed = ATI_TC_Speed_Normal; + options.bDisableMultiThreading = false; + //options.bDisableMultiThreading = true; + + // Compress + ATI_TC_ConvertTexture(&srcTexture, &destTexture, &options, NULL, NULL, NULL); + + if (outputOptions.outputHandler != NULL) { + outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize); + } + + mem::free(destTexture.pData); +} + +void AtiCompressorDXT5::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + // Init source texture + ATI_TC_Texture srcTexture; + srcTexture.dwSize = sizeof(srcTexture); + srcTexture.dwWidth = w; + srcTexture.dwHeight = h; + if (inputFormat == nvtt::InputFormat_BGRA_8UB) + { + srcTexture.dwPitch = w * 4; + srcTexture.format = ATI_TC_FORMAT_ARGB_8888; + } + else + { + srcTexture.dwPitch = w * 16; + srcTexture.format = ATI_TC_FORMAT_ARGB_32F; + } + srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture); + srcTexture.pData = (ATI_TC_BYTE*) data; + + // Init dest texture + ATI_TC_Texture destTexture; + destTexture.dwSize = sizeof(destTexture); + destTexture.dwWidth = w; + destTexture.dwHeight = h; + destTexture.dwPitch = 0; + destTexture.format = ATI_TC_FORMAT_DXT5; + destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture); + destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize); + + // Compress + ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL); + + if (outputOptions.outputHandler != NULL) { + outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize); + } + + mem::free(destTexture.pData); +} + +#endif // defined(HAVE_ATITC) + +#if defined(HAVE_SQUISH) + +void SquishCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ +#pragma message(NV_FILE_LINE "TODO: Convert input to fixed point ABGR format instead of ARGB") + /* + Image img(*image); + int count = img.width() * img.height(); + for (int i = 0; i < count; i++) + { + Color32 c = img.pixel(i); + img.pixel(i) = Color32(c.b, c.g, c.r, c.a); + } + + int size = squish::GetStorageRequirements(img.width(), img.height(), squish::kDxt1); + void * blocks = mem::malloc(size); + + squish::CompressImage((const squish::u8 *)img.pixels(), img.width(), img.height(), blocks, squish::kDxt1 | squish::kColourClusterFit); + + if (outputOptions.outputHandler != NULL) { + outputOptions.outputHandler->writeData(blocks, size); + } + + mem::free(blocks); + */ +} + +#endif // defined(HAVE_SQUISH) + + +#if defined(HAVE_D3DX) + +void D3DXCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + IDirect3D9 * d3d = Direct3DCreate9(D3D_SDK_VERSION); + + D3DPRESENT_PARAMETERS presentParams; + ZeroMemory(&presentParams, sizeof(presentParams)); + presentParams.Windowed = TRUE; + presentParams.SwapEffect = D3DSWAPEFFECT_COPY; + presentParams.BackBufferWidth = 8; + presentParams.BackBufferHeight = 8; + presentParams.BackBufferFormat = D3DFMT_UNKNOWN; + + HRESULT err; + + IDirect3DDevice9 * device = NULL; + err = d3d->CreateDevice(D3DADAPTER_DEFAULT, D3DDEVTYPE_REF, GetDesktopWindow(), D3DCREATE_SOFTWARE_VERTEXPROCESSING, &presentParams, &device); + + IDirect3DTexture9 * texture = NULL; + err = D3DXCreateTexture(device, w, h, 1, 0, D3DFMT_DXT1, D3DPOOL_SYSTEMMEM, &texture); + + IDirect3DSurface9 * surface = NULL; + err = texture->GetSurfaceLevel(0, &surface); + + RECT rect; + rect.left = 0; + rect.top = 0; + rect.bottom = h; + rect.right = w; + + if (inputFormat == nvtt::InputFormat_BGRA_8UB) + { + err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A8R8G8B8, w * 4, NULL, &rect, D3DX_DEFAULT, 0); + } + else + { + err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A32B32G32R32F, w * 16, NULL, &rect, D3DX_DEFAULT, 0); + } + + if (err != D3DERR_INVALIDCALL && err != D3DXERR_INVALIDDATA) + { + D3DLOCKED_RECT rect; + ZeroMemory(&rect, sizeof(rect)); + + err = surface->LockRect(&rect, NULL, D3DLOCK_READONLY); + + if (outputOptions.outputHandler != NULL) { + int size = rect.Pitch * ((h + 3) / 4); + outputOptions.outputHandler->writeData(rect.pBits, size); + } + + err = surface->UnlockRect(); + } + + surface->Release(); + device->Release(); + d3d->Release(); +} + +#endif // defined(HAVE_D3DX) + + +#if defined(HAVE_STB) + +void StbCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + rgba.swizzle(2, 1, 0, 3); // Swap R and B + stb_compress_dxt_block((unsigned char *)output, (unsigned char *)rgba.colors(), 0, 0); +} + + +#endif // defined(HAVE_STB) diff --git a/src/nvtt/CompressDXT.h b/src/nvtt/CompressDXT.h index b914c14..6c946e3 100644 --- a/src/nvtt/CompressDXT.h +++ b/src/nvtt/CompressDXT.h @@ -30,68 +30,153 @@ namespace nv { class Image; - class FloatImage; + struct ColorBlock; - class FastCompressor + struct CompressorInterface { - public: - FastCompressor(); - ~FastCompressor(); + virtual ~CompressorInterface() {} + virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) = 0; + }; + + struct FixedBlockCompressor : public CompressorInterface + { + virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0; + virtual uint blockSize() const = 0; + }; + + + // Fast CPU compressors. + struct FastCompressorDXT1 : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; + + struct FastCompressorDXT1a : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; + + struct FastCompressorDXT3 : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; - void setImage(const Image * image, nvtt::AlphaMode alphaMode); + struct FastCompressorDXT5 : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; - void compressDXT1(const nvtt::OutputOptions::Private & outputOptions); - void compressDXT1a(const nvtt::OutputOptions::Private & outputOptions); - void compressDXT3(const nvtt::OutputOptions::Private & outputOptions); - void compressDXT5(const nvtt::OutputOptions::Private & outputOptions); - void compressDXT5n(const nvtt::OutputOptions::Private & outputOptions); + struct FastCompressorDXT5n : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; - private: - const Image * m_image; - nvtt::AlphaMode m_alphaMode; + struct FastCompressorBC4 : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } }; - class SlowCompressor + struct FastCompressorBC5 : public FixedBlockCompressor { - public: - SlowCompressor(); - ~SlowCompressor(); + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; - void setImage(const Image * image, nvtt::AlphaMode alphaMode); - void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT1a(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressBC4(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressBC5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + // Normal CPU compressors. + struct NormalCompressorDXT1 : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; - private: - const Image * m_image; - nvtt::AlphaMode m_alphaMode; + struct NormalCompressorDXT1a : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } }; + struct NormalCompressorDXT3 : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + struct NormalCompressorDXT5 : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + struct NormalCompressorDXT5n : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + + // Production CPU compressors. + struct ProductionCompressorBC4 : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; + + struct ProductionCompressorBC5 : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + // External compressors. #if defined(HAVE_S3QUANT) - void s3CompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions); + struct S3CompressorDXT1 : public CompressorInterface + { + virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + }; #endif #if defined(HAVE_ATITC) - void atiCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions); - void atiCompressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions); + struct AtiCompressorDXT1 : public CompressorInterface + { + virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + }; + + struct AtiCompressorDXT5 : public CompressorInterface + { + virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + }; #endif #if defined(HAVE_SQUISH) - void squishCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions); + struct SquishCompressorDXT1 : public CompressorInterface + { + virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + }; #endif #if defined(HAVE_D3DX) - void d3dxCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions); + struct D3DXCompressorDXT1 : public CompressorInterface + { + virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + }; #endif -#if defined(HAVE_D3DX) - void stbCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions); +#if defined(HAVE_STB) + struct StbCompressorDXT1 : public FixedBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; #endif } // nv namespace diff --git a/src/nvtt/Context.cpp b/src/nvtt/Context.cpp index 68dfa98..9624de9 100644 --- a/src/nvtt/Context.cpp +++ b/src/nvtt/Context.cpp @@ -222,6 +222,7 @@ Compressor::Compressor() : m(*new Compressor::Private()) if (m.cudaEnabled) { +#pragma message(NV_FILE_LINE "FIXME: This code is duplicated below.") // Select fastest CUDA device. int device = cuda::getFastestDevice(); if (!cuda::setDevice(device)) @@ -231,7 +232,7 @@ Compressor::Compressor() : m(*new Compressor::Private()) } else { - m.cuda = new CudaCompressor(); + m.cuda = new CudaContext(); if (!m.cuda->isValid()) { @@ -268,7 +269,7 @@ void Compressor::enableCudaAcceleration(bool enable) } else { - m.cuda = new CudaCompressor(); + m.cuda = new CudaContext(); if (!m.cuda->isValid()) { @@ -292,17 +293,18 @@ bool Compressor::process(const InputOptions & inputOptions, const CompressionOpt return m.compress(inputOptions.m, compressionOptions.m, outputOptions.m); } - /// Estimate the size of compressing the input with the given options. int Compressor::estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions) const { return m.estimateSize(inputOptions.m, compressionOptions.m); } + // RAW api. bool Compressor::compress2D(InputFormat format, int w, int h, void * data, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const { - // @@ Make sure type of input format matches compression format. +#pragma message(NV_FILE_LINE "TODO: Implement raw compress api") + return false; } int Compressor::estimateSize(int w, int h, int d, const CompressionOptions & compressionOptions) const @@ -324,16 +326,21 @@ TexImage Compressor::createTexImage() const return *new TexImage(); } + bool Compressor::outputHeader(const TexImage & tex, int mipmapCount, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const { - m.outputHeader(tex, mipmapCount, compressionOptions.m, outputOptions.m); + return m.outputHeader(tex, mipmapCount, compressionOptions.m, outputOptions.m); } bool Compressor::compress(const TexImage & tex, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const { +#pragma message(NV_FILE_LINE "TODO: Implement TexImage compress api") + // @@ Convert to fixed point and call compress2D for each face. + return false; } +/// Estimate the size of compressing the given texture. int Compressor::estimateSize(const TexImage & tex, const CompressionOptions & compressionOptions) const { const uint w = tex.width(); @@ -345,6 +352,8 @@ int Compressor::estimateSize(const TexImage & tex, const CompressionOptions & co } + + bool Compressor::Private::compress(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const { // Make sure enums match. @@ -358,9 +367,7 @@ bool Compressor::Private::compress(const InputOptions::Private & inputOptions, c if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_FileOpen); return false; } - -#pragma message(NV_FILE_LINE "TODO: If DefaultOutputHandler, then seek begining of the file.") - + inputOptions.computeTargetExtents(); // Output DDS header. @@ -625,7 +632,10 @@ bool Compressor::Private::outputHeader(const TexImage & tex, int mipmapCount, co { if (tex.width() <= 0 || tex.height() <= 0 || tex.depth() <= 0 || mipmapCount <= 0) { -#pragma message(NV_FILE_LINE "TODO: Set invalid argument error.") + if (outputOptions.errorHandler != NULL) + { + outputOptions.errorHandler->error(Error_InvalidInput); + } return false; } @@ -1252,216 +1262,222 @@ void Compressor::Private::quantizeMipmap(Mipmap & mipmap, const CompressionOptio } -// Compress the given mipmap. -bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const +CompressorInterface * Compressor::Private::chooseCpuCompressor(const CompressionOptions::Private & compressionOptions) const { - if (compressionOptions.format == Format_RGBA) + if (compressionOptions.format == Format_DXT1) { - // Pixel format conversion. - if (compressionOptions.pixelType == PixelType_Float) - { - compressRGB(mipmap.asFloatImage(), outputOptions, compressionOptions); - } - else - { - compressRGB(mipmap.asFixedImage(), outputOptions, compressionOptions); - } - } - else - { - const Image * image = mipmap.asFixedImage(); - nvDebugCheck(image != NULL); - - // @@ Use FastCompressor::isSupported(compressionOptions.format) to chose compressor. - - FastCompressor fast; - fast.setImage(image, inputOptions.alphaMode); - - SlowCompressor slow; - slow.setImage(image, inputOptions.alphaMode); - - const bool useCuda = cudaEnabled && image->width() * image->height() >= 512; - - if (compressionOptions.format == Format_DXT1) - { #if defined(HAVE_S3QUANT) - if (compressionOptions.externalCompressor == "s3") - { - s3CompressDXT1(image, outputOptions); - } - else + if (compressionOptions.externalCompressor == "s3") return new S3CompressorDXT1; + else #endif #if defined(HAVE_ATITC) - if (compressionOptions.externalCompressor == "ati") - { - atiCompressDXT1(image, outputOptions); - } - else + if (compressionOptions.externalCompressor == "ati") return new AtiCompressorDXT1; + else #endif #if defined(HAVE_SQUISH) - if (compressionOptions.externalCompressor == "squish") - { - squishCompressDXT1(image, outputOptions); - } - else + if (compressionOptions.externalCompressor == "squish") return new SquishCompressorDXT1; + else #endif #if defined(HAVE_D3DX) - if (compressionOptions.externalCompressor == "d3dx") - { - d3dxCompressDXT1(image, outputOptions); - } - else + if (compressionOptions.externalCompressor == "d3dx") return new D3DXCompressorDXT1; + else #endif #if defined(HAVE_D3DX) - if (compressionOptions.externalCompressor == "stb") - { - stbCompressDXT1(image, outputOptions); - } - else + if (compressionOptions.externalCompressor == "stb") return new StbCompressorDXT1; + else #endif - if (compressionOptions.quality == Quality_Fastest) - { - fast.compressDXT1(outputOptions); - } - else - { - if (useCuda) - { - nvDebugCheck(cudaSupported); - cuda->setImage(image, inputOptions.alphaMode); - //cuda->compressDXT1(compressionOptions, outputOptions); - cuda->compressDXT1(compressionOptions, outputOptions); - } - else - { - slow.compressDXT1(compressionOptions, outputOptions); - } - } - } - else if (compressionOptions.format == Format_DXT1a) + if (compressionOptions.quality == Quality_Fastest) { - if (compressionOptions.quality == Quality_Fastest) - { - fast.compressDXT1a(outputOptions); - } - else - { - if (useCuda) - { - nvDebugCheck(cudaSupported); - /*cuda*/slow.compressDXT1a(compressionOptions, outputOptions); - } - else - { - slow.compressDXT1a(compressionOptions, outputOptions); - } - } + return new FastCompressorDXT1; } - else if (compressionOptions.format == Format_DXT1n) + + return new NormalCompressorDXT1; + } + else if (compressionOptions.format == Format_DXT1a) + { + if (compressionOptions.quality == Quality_Fastest) { - if (useCuda) - { - nvDebugCheck(cudaSupported); - cuda->setImage(image, inputOptions.alphaMode); - cuda->compressDXT1n(compressionOptions, outputOptions); - } - else - { - if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature); - } + return new FastCompressorDXT1a; } - else if (compressionOptions.format == Format_DXT3) + + return new NormalCompressorDXT1a; + } + else if (compressionOptions.format == Format_DXT1n) + { + // Not supported. + } + else if (compressionOptions.format == Format_DXT3) + { + if (compressionOptions.quality == Quality_Fastest) { - if (compressionOptions.quality == Quality_Fastest) - { - fast.compressDXT3(outputOptions); - } - else - { - if (useCuda) - { - nvDebugCheck(cudaSupported); - cuda->setImage(image, inputOptions.alphaMode); - cuda->compressDXT3(compressionOptions, outputOptions); - } - else - { - slow.compressDXT3(compressionOptions, outputOptions); - } - } + return new FastCompressorDXT3; } - else if (compressionOptions.format == Format_DXT5) - { + + return new NormalCompressorDXT3; + } + else if (compressionOptions.format == Format_DXT5) + { #if defined(HAVE_ATITC) - if (compressionOptions.externalCompressor == "ati") - { - atiCompressDXT5(image, outputOptions); - } - else + if (compressionOptions.externalCompressor == "ati") return new AtiCompressorDXT5; + else #endif - if (compressionOptions.quality == Quality_Fastest) - { - fast.compressDXT5(outputOptions); - } - else - { - if (useCuda) - { - nvDebugCheck(cudaSupported); - cuda->setImage(image, inputOptions.alphaMode); - cuda->compressDXT5(compressionOptions, outputOptions); - } - else - { - slow.compressDXT5(compressionOptions, outputOptions); - } - } + + if (compressionOptions.quality == Quality_Fastest) + { + return new FastCompressorDXT5; } - else if (compressionOptions.format == Format_DXT5n) + + return new NormalCompressorDXT5; + } + else if (compressionOptions.format == Format_DXT5n) + { + if (compressionOptions.quality == Quality_Fastest) { - if (compressionOptions.quality == Quality_Fastest) - { - fast.compressDXT5n(outputOptions); - } - else - { - /*if (useCuda) - { - nvDebugCheck(cudaSupported); - cuda->setImage(image, inputOptions.alphaMode); - cuda->compressDXT5n(compressionOptions, outputOptions); - } - else*/ - { - slow.compressDXT5n(compressionOptions, outputOptions); - } - } + return new FastCompressorDXT5n; } - else if (compressionOptions.format == Format_BC4) + + return new NormalCompressorDXT5n; + } + else if (compressionOptions.format == Format_BC4) + { + if (compressionOptions.quality == Quality_Fastest || compressionOptions.quality == Quality_Normal) { - slow.compressBC4(compressionOptions, outputOptions); + return new FastCompressorBC4; } - else if (compressionOptions.format == Format_BC5) + + return new ProductionCompressorBC4; + } + else if (compressionOptions.format == Format_BC5) + { + if (compressionOptions.quality == Quality_Fastest || compressionOptions.quality == Quality_Normal) { - slow.compressBC5(compressionOptions, outputOptions); + return new FastCompressorBC5; } - else if (compressionOptions.format == Format_CTX1) + + return new ProductionCompressorBC5; + } + else if (compressionOptions.format == Format_CTX1) + { + // Not supported. + } + else if (compressionOptions.format == Format_BC6) + { + // Not supported. + } + else if (compressionOptions.format == Format_BC7) + { + // Not supported. + } + + return NULL; +} + + +CompressorInterface * Compressor::Private::chooseGpuCompressor(const CompressionOptions::Private & compressionOptions) const +{ + nvDebugCheck(cudaSupported); + + if (compressionOptions.quality == Quality_Fastest) + { + // Do not use CUDA compressors in fastest quality mode. + return NULL; + } + + if (compressionOptions.format == Format_DXT1) + { + return new CudaCompressorDXT1(*cuda); + } + else if (compressionOptions.format == Format_DXT1a) + { +#pragma message(NV_FILE_LINE "TODO: Implement CUDA DXT1a compressor.") + } + else if (compressionOptions.format == Format_DXT1n) + { + // Not supported. + } + else if (compressionOptions.format == Format_DXT3) + { + return new CudaCompressorDXT3(*cuda); + } + else if (compressionOptions.format == Format_DXT5) + { + return new CudaCompressorDXT5(*cuda); + } + else if (compressionOptions.format == Format_DXT5n) + { + // @@ Return CUDA compressor. + } + else if (compressionOptions.format == Format_BC4) + { + // Not supported. + } + else if (compressionOptions.format == Format_BC5) + { + // Not supported. + } + else if (compressionOptions.format == Format_CTX1) + { + // @@ Return CUDA compressor. + } + else if (compressionOptions.format == Format_BC6) + { + // Not supported. + } + else if (compressionOptions.format == Format_BC7) + { + // Not supported. + } + + return NULL; +} + + + +// Compress the given mipmap. +bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const +{ + if (compressionOptions.format == Format_RGBA) + { + // Pixel format conversion. + if (compressionOptions.pixelType == PixelType_Float) { - if (useCuda) - { - nvDebugCheck(cudaSupported); - cuda->setImage(image, inputOptions.alphaMode); - cuda->compressCTX1(compressionOptions, outputOptions); - } - else - { - if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature); - } + compressRGB(mipmap.asFloatImage(), outputOptions, compressionOptions); + } + else + { + compressRGB(mipmap.asFixedImage(), outputOptions, compressionOptions); + } + } + else + { + const Image * image = mipmap.asFixedImage(); + nvDebugCheck(image != NULL); + + // Decide what compressor to use. + CompressorInterface * compressor = NULL; + if (cudaEnabled && image->width() * image->height() >= 512) + { + compressor = chooseGpuCompressor(compressionOptions); + } + if (compressor == NULL) + { + compressor = chooseCpuCompressor(compressionOptions); + } + + if (compressor == NULL) + { + if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature); + } + else + { + compressor->compress(InputFormat_BGRA_8UB, inputOptions.alphaMode, image->width(), image->height(), (void *)image->pixels(), compressionOptions, outputOptions); + + delete compressor; } } diff --git a/src/nvtt/Context.h b/src/nvtt/Context.h index 727bd1c..1c81db7 100644 --- a/src/nvtt/Context.h +++ b/src/nvtt/Context.h @@ -27,6 +27,7 @@ #include #include +#include #include "nvtt.h" @@ -44,6 +45,9 @@ namespace nvtt Private() {} bool compress(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const; + + bool compress(const void * data, int width, int height, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + int estimateSize(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions) const; bool outputHeader(const TexImage & tex, int mipmapCount, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions); @@ -51,6 +55,10 @@ namespace nvtt private: bool outputHeader(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const; + + nv::CompressorInterface * chooseCpuCompressor(const CompressionOptions::Private & compressionOptions) const; + nv::CompressorInterface * chooseGpuCompressor(const CompressionOptions::Private & compressionOptions) const; + bool compressMipmaps(uint f, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const; bool initMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f, uint m) const; @@ -71,7 +79,7 @@ namespace nvtt bool cudaSupported; bool cudaEnabled; - nv::AutoPtr cuda; + nv::AutoPtr cuda; }; diff --git a/src/nvtt/cuda/CompressKernel.cu b/src/nvtt/cuda/CompressKernel.cu index e829a7c..cbdf492 100644 --- a/src/nvtt/cuda/CompressKernel.cu +++ b/src/nvtt/cuda/CompressKernel.cu @@ -296,6 +296,51 @@ __device__ float3 blockError3(const float3 * colors, uint permutation, float3 a, // Sort colors //////////////////////////////////////////////////////////////////////////////// +// @@ Experimental code to avoid duplicate colors for faster compression. +// We could first sort along the best fit line and only compare colors that have the same projection. +// The hardest part is to maintain the indices to map packed/sorted colors to the input colors. +// We also need to update several functions that assume the number of colors is fixed to 16. +// And compute different bit maps for the different color counts. +// This is a fairly high amount of work. +__device__ int packColors(float3 * values, float * weights, int * ranks) +{ + const int tid = threadIdx.x; + + __shared__ int count; + count = 0; + + bool alive = true; + + // Append this + for (int i = 0; i < 16; i++) + { + // One thread leads on each iteration. + if (tid == i) { + + // If thread alive, then append element. + if (alive) { + values[count] = values[i]; + weights[count] = weights[i]; + count++; + } + + // Otherwise update weight. + else { + weights[ranks[i]] += weights[i]; + } + } + + // Kill all threads that have the same element and record rank. + if (values[i] == values[tid]) { + alive = false; + ranks[tid] = count - 1; + } + } + + return count; +} + + __device__ void sortColors(const float * values, int * ranks) { #if __DEVICE_EMULATION__ @@ -343,12 +388,60 @@ __device__ void sortColors(const float * values, int * ranks) #endif } +__device__ void sortColors(const float * values, int * ranks, int count) +{ +#if __DEVICE_EMULATION__ + if (threadIdx.x == 0) + { + for (int tid = 0; tid < count; tid++) + { + int rank = 0; + for (int i = 0; i < count; i++) + { + rank += (values[i] < values[tid]); + } + + ranks[tid] = rank; + } + + // Resolve elements with the same index. + for (int i = 0; i < count-1; i++) + { + for (int tid = 0; tid < count; tid++) + { + if (tid > i && ranks[tid] == ranks[i]) ++ranks[tid]; + } + } + } +#else + const int tid = threadIdx.x; + + int rank = 0; + + #pragma unroll + for (int i = 0; i < count; i++) + { + rank += (values[i] < values[tid]); + } + + ranks[tid] = rank; + + // Resolve elements with the same index. + #pragma unroll + for (int i = 0; i < count-1; i++) + { + if ((tid > i) & (ranks[tid] == ranks[i])) ++ranks[tid]; + } +#endif +} + + //////////////////////////////////////////////////////////////////////////////// // Load color block to shared mem //////////////////////////////////////////////////////////////////////////////// -__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor) +/*__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor) { const int bid = blockIdx.x; const int idx = threadIdx.x; @@ -389,9 +482,9 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum __debugsync(); } #endif -} +}*/ -__device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor) +__device__ void loadColorBlockTex(uint firstBlock, uint width, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor) { const int bid = blockIdx.x; const int idx = threadIdx.x; @@ -400,8 +493,8 @@ __device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sum if (idx < 16) { - float x = 4 * ((bn + bid) % w) + idx % 4; // @@ Avoid mod and div by using 2D grid? - float y = 4 * ((bn + bid) / w) + idx / 4; + float x = 4 * ((firstBlock + bid) % width) + idx % 4; // @@ Avoid mod and div by using 2D grid? + float y = 4 * ((firstBlock + bid) / width) + idx / 4; // Read color and copy to shared mem. float4 c = tex2D(tex, x, y); @@ -437,10 +530,107 @@ __device__ void loadColorBlockTex(uint bn, uint w, float3 colors[16], float3 sum __debugsync(); } #endif +} +/* +__device__ void loadColorBlockTex(uint firstBlock, uint w, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor) +{ + const int bid = blockIdx.x; + const int idx = threadIdx.x; + + __shared__ float dps[16]; + + if (idx < 16) + { + float x = 4 * ((firstBlock + bid) % w) + idx % 4; // @@ Avoid mod and div by using 2D grid? + float y = 4 * ((firstBlock + bid) / w) + idx / 4; + + // Read color and copy to shared mem. + float4 c = tex2D(tex, x, y); + + colors[idx].x = c.z; + colors[idx].y = c.y; + colors[idx].z = c.x; + weights[idx] = 1; + + int count = packColors(colors, weights); + if (idx < count) + { + // Sort colors along the best fit line. + colorSums(colors, sums); + float3 axis = bestFitLine(colors, sums[0], kColorMetric); + + *sameColor = (axis == make_float3(0, 0, 0)); + + dps[idx] = dot(colors[idx], axis); + + sortColors(dps, xrefs); + + float3 tmp = colors[idx]; + colors[xrefs[idx]] = tmp; + } + } } +*/ + +__device__ void loadColorBlockTex(uint firstBlock, uint width, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor) +{ + const int bid = blockIdx.x; + const int idx = threadIdx.x; + __shared__ float3 rawColors[16]; + __shared__ float dps[16]; + if (idx < 16) + { + float x = 4 * ((firstBlock + bid) % width) + idx % 4; // @@ Avoid mod and div by using 2D grid? + float y = 4 * ((firstBlock + bid) / width) + idx / 4; + + // Read color and copy to shared mem. + float4 c = tex2D(tex, x, y); + + rawColors[idx].x = c.z; + rawColors[idx].y = c.y; + rawColors[idx].z = c.x; + weights[idx] = c.w; + + colors[idx] = rawColors[idx] * weights[idx]; + + // No need to synchronize, 16 < warp size. + __debugsync(); + + // Sort colors along the best fit line. + colorSums(colors, sums); + float3 axis = bestFitLine(colors, sums[0], kColorMetric); + + *sameColor = (axis == make_float3(0, 0, 0)); + + // Single color compressor needs unweighted colors. + if (*sameColor) colors[idx] = rawColors[idx]; + + dps[idx] = dot(colors[idx], axis); + + __debugsync(); + + sortColors(dps, xrefs); + + float3 tmp = colors[idx]; + float w = weights[idx]; + __debugsync(); + colors[xrefs[idx]] = tmp; + weights[xrefs[idx]] = w; + } +#if __DEVICE_EMULATION__ + else + { + __debugsync(); + __debugsync(); + __debugsync(); + } +#endif +} + +/* __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor) { const int bid = blockIdx.x; @@ -494,6 +684,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum } #endif } +*/ __device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16], int * sameColor) { @@ -1457,48 +1648,15 @@ __device__ void saveSingleColorBlockCTX1(float2 color, uint2 * result) //////////////////////////////////////////////////////////////////////////////// // Compress color block //////////////////////////////////////////////////////////////////////////////// -__global__ void compressDXT1(const uint * permutations, const uint * image, uint2 * result) -{ - __shared__ float3 colors[16]; - __shared__ float3 sums[16]; - __shared__ int xrefs[16]; - __shared__ int sameColor; - - loadColorBlock(image, colors, sums, xrefs, &sameColor); - __syncthreads(); - - if (sameColor) - { - if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result); - return; - } - - ushort bestStart, bestEnd; - uint bestPermutation; - - __shared__ float errors[NUM_THREADS]; - - evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors); - - // Use a parallel reduction to find minimum error. - const int minIdx = findMinError(errors); - - // Only write the result of the winner thread. - if (threadIdx.x == minIdx) - { - saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result); - } -} - -__global__ void compressDXT1_Tex(uint bn, uint w, const uint * permutations, uint2 * result) +__global__ void compressDXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result) { __shared__ float3 colors[16]; __shared__ float3 sums[16]; __shared__ int xrefs[16]; __shared__ int sameColor; - loadColorBlockTex(bn, w, colors, sums, xrefs, &sameColor); + loadColorBlockTex(firstBlock, w, colors, sums, xrefs, &sameColor); __syncthreads(); @@ -1534,14 +1692,14 @@ __global__ void compressDXT1_Tex(uint bn, uint w, const uint * permutations, uin } -__global__ void compressLevel4DXT1(const uint * permutations, const uint * image, uint2 * result) +__global__ void compressLevel4DXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result) { __shared__ float3 colors[16]; __shared__ float3 sums[16]; __shared__ int xrefs[16]; __shared__ int sameColor; - loadColorBlock(image, colors, sums, xrefs, &sameColor); + loadColorBlockTex(firstBlock, w, colors, sums, xrefs, &sameColor); __syncthreads(); @@ -1568,7 +1726,7 @@ __global__ void compressLevel4DXT1(const uint * permutations, const uint * image } } -__global__ void compressWeightedDXT1(const uint * permutations, const uint * image, uint2 * result) +__global__ void compressWeightedDXT1(uint firstBlock, uint w, const uint * permutations, uint2 * result) { __shared__ float3 colors[16]; __shared__ float3 sums[16]; @@ -1576,7 +1734,7 @@ __global__ void compressWeightedDXT1(const uint * permutations, const uint * ima __shared__ int xrefs[16]; __shared__ int sameColor; - loadColorBlock(image, colors, sums, weights, xrefs, &sameColor); + loadColorBlockTex(firstBlock, w, colors, sums, weights, xrefs, &sameColor); __syncthreads(); @@ -1987,40 +2145,70 @@ extern "C" void setupCompressKernel(const float weights[3]) cudaMemcpyToSymbol(kColorMetricSqr, weightsSqr, sizeof(float) * 3, 0); } +extern "C" void bindTextureToArray(cudaArray * d_data) +{ + // Setup texture + tex.normalized = false; + tex.filterMode = cudaFilterModePoint; + tex.addressMode[0] = cudaAddressModeClamp; + tex.addressMode[1] = cudaAddressModeClamp; + + cudaBindTextureToArray(tex, d_data); +} + + //////////////////////////////////////////////////////////////////////////////// // Launch kernel //////////////////////////////////////////////////////////////////////////////// -extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps) +// DXT1 compressors: +extern "C" void compressKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps) { - compressDXT1<<>>(d_bitmaps, d_data, (uint2 *)d_result); + compressDXT1<<>>(firstBlock, w, d_bitmaps, (uint2 *)d_result); } -extern "C" void compressKernelDXT1_Tex(uint bn, uint blockNum, uint w, cudaArray * d_data, uint * d_result, uint * d_bitmaps) +extern "C" void compressKernelDXT1_Level4(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps) { - // Setup texture - tex.normalized = false; - tex.filterMode = cudaFilterModePoint; - tex.addressMode[0] = cudaAddressModeClamp; - tex.addressMode[1] = cudaAddressModeClamp; - - cudaBindTextureToArray(tex, d_data); + compressLevel4DXT1<<>>(firstBlock, w, d_bitmaps, (uint2 *)d_result); +} - compressDXT1_Tex<<>>(bn, w, d_bitmaps, (uint2 *)d_result); +extern "C" void compressWeightedKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps) +{ + compressWeightedDXT1<<>>(firstBlock, w, d_bitmaps, (uint2 *)d_result); } +// @@ DXT1a compressors. -extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps) + +// @@ DXT3 compressors: +extern "C" void compressKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps) { - compressLevel4DXT1<<>>(d_bitmaps, d_data, (uint2 *)d_result); + //compressDXT3<<>>(firstBlock, w, d_bitmaps, (uint2 *)d_result); } -extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps) +extern "C" void compressWeightedKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps) +{ + //compressWeightedDXT3<<>>(firstBlock, w, d_bitmaps, (uint2 *)d_result); +} + + +// @@ DXT5 compressors. +extern "C" void compressKernelDXT5(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps) { - compressWeightedDXT1<<>>(d_bitmaps, d_data, (uint2 *)d_result); + //compressDXT5<<>>(firstBlock, w, d_bitmaps, (uint2 *)d_result); } +extern "C" void compressWeightedKernelDXT5(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps) +{ + //compressWeightedDXT5<<>>(firstBlock, w, d_bitmaps, (uint2 *)d_result); +} + + + + + +/* extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps) { compressNormalDXT1<<>>(d_bitmaps, d_data, (uint2 *)d_result); @@ -2030,16 +2218,10 @@ extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result { compressCTX1<<>>(d_bitmaps, d_data, (uint2 *)d_result); } - +*/ +/* extern "C" void compressKernelDXT5n(uint blockNum, cudaArray * d_data, uint * d_result) { - // Setup texture - tex.normalized = false; - tex.filterMode = cudaFilterModePoint; - tex.addressMode[0] = cudaAddressModeClamp; - tex.addressMode[1] = cudaAddressModeClamp; - - cudaBindTextureToArray(tex, d_data); - // compressDXT5n<<>>(blockNum, (uint2 *)d_result); } +*/ \ No newline at end of file diff --git a/src/nvtt/cuda/CudaCompressDXT.cpp b/src/nvtt/cuda/CudaCompressDXT.cpp index 9bfa7a0..8fe0f48 100644 --- a/src/nvtt/cuda/CudaCompressDXT.cpp +++ b/src/nvtt/cuda/CudaCompressDXT.cpp @@ -52,16 +52,20 @@ using namespace nvtt; extern "C" void setupCompressKernel(const float weights[3]); -extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); -extern "C" void compressKernelDXT1_Tex(uint bn, uint blockNum, uint w, cudaArray * d_data, uint * d_result, uint * d_bitmaps); +extern "C" void bindTextureToArray(cudaArray * d_data); + +extern "C" void compressKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps); extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); -extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); -extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); +extern "C" void compressKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps); +//extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); +//extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); -#include "Bitmaps.h" // @@ Rename to BitmapTable.h +#pragma message(NV_FILE_LINE "TODO: Rename Bitmaps.h to BitmapTable.h") +#include "Bitmaps.h" +/* // Convert linear image to block linear. static void convertToBlockLinear(const Image * image, uint * blockLinearImage) { @@ -81,45 +85,49 @@ static void convertToBlockLinear(const Image * image, uint * blockLinearImage) } } } +*/ #endif -CudaCompressor::CudaCompressor() : m_bitmapTable(NULL), m_bitmapTableCTX(NULL), m_data(NULL), m_result(NULL) -{ +CudaContext::CudaContext() : + bitmapTable(NULL), + bitmapTableCTX(NULL), + data(NULL), + result(NULL) +{ #if defined HAVE_CUDA // Allocate and upload bitmaps. - cudaMalloc((void**) &m_bitmapTable, 992 * sizeof(uint)); - if (m_bitmapTable != NULL) + cudaMalloc((void**) &bitmapTable, 992 * sizeof(uint)); + if (bitmapTable != NULL) { - cudaMemcpy(m_bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice); + cudaMemcpy(bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice); } - cudaMalloc((void**) &m_bitmapTableCTX, 704 * sizeof(uint)); - - if (m_bitmapTableCTX != NULL) + cudaMalloc((void**) &bitmapTableCTX, 704 * sizeof(uint)); + if (bitmapTableCTX != NULL) { - cudaMemcpy(m_bitmapTableCTX, s_bitmapTableCTX, 704 * sizeof(uint), cudaMemcpyHostToDevice); + cudaMemcpy(bitmapTableCTX, s_bitmapTableCTX, 704 * sizeof(uint), cudaMemcpyHostToDevice); } // Allocate scratch buffers. - cudaMalloc((void**) &m_data, MAX_BLOCKS * 64U); - cudaMalloc((void**) &m_result, MAX_BLOCKS * 8U); + cudaMalloc((void**) &data, MAX_BLOCKS * 64U); + cudaMalloc((void**) &result, MAX_BLOCKS * 8U); #endif -} - -CudaCompressor::~CudaCompressor() -{ +} + +CudaContext::~CudaContext() +{ #if defined HAVE_CUDA // Free device mem allocations. - cudaFree(m_data); - cudaFree(m_result); - cudaFree(m_bitmapTable); - cudaFree(m_bitmapTableCTX); + cudaFree(bitmapTableCTX); + cudaFree(bitmapTable); + cudaFree(data); + cudaFree(result); #endif -} +} -bool CudaCompressor::isValid() const +bool CudaContext::isValid() const { #if defined HAVE_CUDA cudaError_t err = cudaGetLastError(); @@ -129,91 +137,88 @@ bool CudaCompressor::isValid() const return false; } #endif - return m_data != NULL && m_result != NULL && m_bitmapTable != NULL; + return bitmapTable != NULL && bitmapTableCTX != NULL && data != NULL && result != NULL; } -// @@ This code is very repetitive and needs to be cleaned up. -#if 0 -struct CudaCompressionKernel +CudaCompressor::CudaCompressor(CudaContext & ctx) : m_ctx(ctx) { - virtual void setup(const CompressionOptions::Private & compressionOptions) - { - setupCompressKernel(compressionOptions.colorWeight.ptr()); - } - - virtual void setBitmapTable(); - - virtual void runDeviceCode(int count); - - virtual void runHostCode(int count); - -}; -void CudaCompressor::compressKernel(CudaCompressionKernel * kernel) +} + +void CudaCompressor::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) { nvDebugCheck(cuda::isHardwarePresent()); + #if defined HAVE_CUDA + // Allocate image as a cuda array. + cudaArray * d_image; + if (inputFormat == nvtt::InputFormat_BGRA_8UB) + { + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned); + cudaMallocArray(&d_image, &channelDesc, w, h); + + const int imageSize = w * h * sizeof(uint); + cudaMemcpyToArray(d_image, 0, 0, data, imageSize, cudaMemcpyHostToDevice); + } + else + { +#pragma message(NV_FILE_LINE "FIXME: Floating point textures not really supported by CUDA compressors.") + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat); + cudaMallocArray(&d_image, &channelDesc, w, h); + + const int imageSize = w * h * sizeof(uint); + cudaMemcpyToArray(d_image, 0, 0, data, imageSize, cudaMemcpyHostToDevice); + } + // Image size in blocks. - const uint w = (image->width() + 3) / 4; - const uint h = (image->height() + 3) / 4; + const uint bw = (w + 3) / 4; + const uint bh = (h + 3) / 4; + const uint bs = blockSize(); + const uint blockNum = bw * bh; + const uint compressedSize = blockNum * bs; - uint imageSize = w * h * 16 * sizeof(Color32); - uint * blockLinearImage = (uint *) malloc(imageSize); - convertToBlockLinear(image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU! + void * h_result = malloc(min(blockNum, MAX_BLOCKS) * bs); - const uint blockNum = w * h; - const uint compressedSize = blockNum * 8; + setup(d_image, compressionOptions); - clock_t start = clock(); + // Timer timer; + // timer.start(); - kernel->setup(compressionOptions); - kernel->setBitmapTable(m_bitmapTable); - - // TODO: Add support for multiple GPUs. uint bn = 0; while(bn != blockNum) { uint count = min(blockNum - bn, MAX_BLOCKS); - cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); - - kernel->runDeviceCode(count, m_data, m_result); + compressBlocks(bn, count, w, h, alphaMode, compressionOptions, h_result); - kernel->runHostCode(count); - // Check for errors. cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { - nvDebug("CUDA Error: %s\n", cudaGetErrorString(err)); - + //nvDebug("CUDA Error: %s\n", cudaGetErrorString(err)); if (outputOptions.errorHandler != NULL) { outputOptions.errorHandler->error(Error_CudaError); } } - // Copy result to host, overwrite swizzled image. - cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost); - // Output result. - kernel->outputResult(outputOptions.outputHandler); - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(blockLinearImage, count * 8); + outputOptions.outputHandler->writeData(h_result, count * bs); } bn += count; } - clock_t end = clock(); - //printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC); + //timer.stop(); + //printf("\rCUDA time taken: %.3f seconds\n", timer.elapsed() / CLOCKS_PER_SEC); - free(blockLinearImage); + free(h_result); + cudaFreeArray(d_image); #else if (outputOptions.errorHandler != NULL) @@ -221,92 +226,88 @@ void CudaCompressor::compressKernel(CudaCompressionKernel * kernel) outputOptions.errorHandler->error(Error_CudaError); } #endif -} -#endif // 0 +} -void CudaCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode) +void CudaCompressorDXT1::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions) { - m_image = image; - m_alphaMode = alphaMode; + setupCompressKernel(compressionOptions.colorWeight.ptr()); + bindTextureToArray(image); } - - -/// Compress image using CUDA. -void CudaCompressor::compressDXT1(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) +void CudaCompressorDXT1::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) { - nvDebugCheck(cuda::isHardwarePresent()); -#if defined HAVE_CUDA + // Launch kernel. + compressKernelDXT1(first, count, w, m_ctx.result, m_ctx.bitmapTable); - // Allocate image as a cuda array. - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned); + // Copy result to host. + cudaMemcpy(output, m_ctx.result, count * 8, cudaMemcpyDeviceToHost); +} - cudaArray * d_image; - const int imageSize = m_image->width() * m_image->height() * sizeof(uint); - cudaMallocArray(&d_image, &channelDesc, m_image->width(), m_image->height()); - cudaMemcpyToArray(d_image, 0, 0, m_image->pixels(), imageSize, cudaMemcpyHostToDevice); +void CudaCompressorDXT3::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions) +{ + setupCompressKernel(compressionOptions.colorWeight.ptr()); + bindTextureToArray(image); +} - // Image size in blocks. - const uint w = (m_image->width() + 3) / 4; - const uint h = (m_image->height() + 3) / 4; - const uint blockNum = w * h; - const uint compressedSize = blockNum * 8; +void CudaCompressorDXT3::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + // Launch kernel. + compressKernelDXT3(first, count, w, m_ctx.result, m_ctx.bitmapTable); - void * h_result = malloc(min(blockNum, MAX_BLOCKS) * 8); + // Copy result to host. + cudaMemcpy(output, m_ctx.result, count * 16, cudaMemcpyDeviceToHost); +} - //clock_t start = clock(); +void CudaCompressorDXT5::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions) +{ setupCompressKernel(compressionOptions.colorWeight.ptr()); + bindTextureToArray(image); +} - uint bn = 0; - while(bn != blockNum) +void CudaCompressorDXT5::compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + /*// Launch kernel. + compressKernelDXT5(first, count, w, m_ctx.result, m_ctx.bitmapTable); + + // Copy result to host. + cudaMemcpy(output, m_ctx.result, count * 16, cudaMemcpyDeviceToHost);*/ + + // Launch kernel. + if (alphaMode == AlphaMode_Transparency) { - uint count = min(blockNum - bn, MAX_BLOCKS); + // compressWeightedKernelDXT1(first, count, w, m_ctx.result, m_ctx.bitmapTable); + } + else + { + // compressKernelDXT1_Level4(first, count, w, m_ctx.result, m_ctx.bitmapTable); + } - // Launch kernel. - compressKernelDXT1_Tex(bn, count, w, d_image, m_result, m_bitmapTable); + // Compress alpha in parallel with the GPU. + for (uint i = 0; i < count; i++) + { + //ColorBlock rgba(blockLinearImage + (first + i) * 16); + //OptimalCompress::compressDXT3A(rgba, alphaBlocks + i); + } - // Check for errors. - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - nvDebug("CUDA Error: %s\n", cudaGetErrorString(err)); + // Copy result to host. + cudaMemcpy(output, m_ctx.result, count * 8, cudaMemcpyDeviceToHost); - if (outputOptions.errorHandler != NULL) - { - outputOptions.errorHandler->error(Error_CudaError); - } - } + // @@ Interleave color and alpha blocks. - // Copy result to host, overwrite swizzled image. - cudaMemcpy(h_result, m_result, count * 8, cudaMemcpyDeviceToHost); +} - // Output result. - if (outputOptions.outputHandler != NULL) - { - outputOptions.outputHandler->writeData(h_result, count * 8); - } - bn += count; - } - //clock_t end = clock(); - //printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC); - free(h_result); -#else - if (outputOptions.errorHandler != NULL) - { - outputOptions.errorHandler->error(Error_CudaError); - } -#endif -} +// @@ This code is very repetitive and needs to be cleaned up. +#if 0 /// Compress image using CUDA. void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) @@ -337,16 +338,16 @@ void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressio { uint count = min(blockNum - bn, MAX_BLOCKS); - cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); + cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); // Launch kernel. if (m_alphaMode == AlphaMode_Transparency) { - compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable); + compressWeightedKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable); } else { - compressKernelDXT1_Level4(count, m_data, m_result, m_bitmapTable); + compressKernelDXT1_Level4(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable); } // Compress alpha in parallel with the GPU. @@ -369,7 +370,7 @@ void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressio } // Copy result to host, overwrite swizzled image. - cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost); + cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost); // Output result. if (outputOptions.outputHandler != NULL) @@ -428,16 +429,16 @@ void CudaCompressor::compressDXT5(const CompressionOptions::Private & compressio { uint count = min(blockNum - bn, MAX_BLOCKS); - cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); + cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); // Launch kernel. if (m_alphaMode == AlphaMode_Transparency) { - compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable); + compressWeightedKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable); } else { - compressKernelDXT1_Level4(count, m_data, m_result, m_bitmapTable); + compressKernelDXT1_Level4(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable); } // Compress alpha in parallel with the GPU. @@ -460,7 +461,7 @@ void CudaCompressor::compressDXT5(const CompressionOptions::Private & compressio } // Copy result to host, overwrite swizzled image. - cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost); + cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost); // Output result. if (outputOptions.outputHandler != NULL) @@ -516,10 +517,10 @@ void CudaCompressor::compressDXT1n(const nvtt::CompressionOptions::Private & com { uint count = min(blockNum - bn, MAX_BLOCKS); - cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); + cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); // Launch kernel. - compressNormalKernelDXT1(count, m_data, m_result, m_bitmapTable); + compressNormalKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable); // Check for errors. cudaError_t err = cudaGetLastError(); @@ -534,7 +535,7 @@ void CudaCompressor::compressDXT1n(const nvtt::CompressionOptions::Private & com } // Copy result to host, overwrite swizzled image. - cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost); + cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost); // Output result. if (outputOptions.outputHandler != NULL) @@ -585,10 +586,10 @@ void CudaCompressor::compressCTX1(const nvtt::CompressionOptions::Private & comp { uint count = min(blockNum - bn, MAX_BLOCKS); - cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); + cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); // Launch kernel. - compressKernelCTX1(count, m_data, m_result, m_bitmapTableCTX); + compressKernelCTX1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTableCTX); // Check for errors. cudaError_t err = cudaGetLastError(); @@ -603,7 +604,7 @@ void CudaCompressor::compressCTX1(const nvtt::CompressionOptions::Private & comp } // Copy result to host, overwrite swizzled image. - cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost); + cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost); // Output result. if (outputOptions.outputHandler != NULL) @@ -643,4 +644,4 @@ void CudaCompressor::compressDXT5n(const nvtt::CompressionOptions::Private & com #endif } - +#endif // 0 diff --git a/src/nvtt/cuda/CudaCompressDXT.h b/src/nvtt/cuda/CudaCompressDXT.h index 18a7e79..b60bdf3 100644 --- a/src/nvtt/cuda/CudaCompressDXT.h +++ b/src/nvtt/cuda/CudaCompressDXT.h @@ -27,38 +27,86 @@ #include #include +#include "nvtt/CompressDXT.h" + +struct cudaArray; + namespace nv { class Image; - class CudaCompressor + class CudaContext { public: - CudaCompressor(); - ~CudaCompressor(); + CudaContext(); + ~CudaContext(); bool isValid() const; - void setImage(const Image * image, nvtt::AlphaMode alphaMode); + public: + // Device pointers. + uint * bitmapTable; + uint * bitmapTableCTX; + uint * data; + uint * result; + }; - void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - private: + struct CudaCompressor : public CompressorInterface + { + CudaCompressor(CudaContext & ctx); + + virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + + virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions) = 0; + virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0; + virtual uint blockSize() const = 0; + + protected: + CudaContext & m_ctx; + }; - uint * m_bitmapTable; - uint * m_bitmapTableCTX; - uint * m_data; - uint * m_result; - - const Image * m_image; - nvtt::AlphaMode m_alphaMode; + struct CudaCompressorDXT1 : public CudaCompressor + { + CudaCompressorDXT1(CudaContext & ctx) : CudaCompressor(ctx) {} + + virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions); + virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; }; }; + /*struct CudaCompressorDXT1n : public CudaCompressor + { + virtual void setup(const CompressionOptions::Private & compressionOptions); + virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0; + virtual uint blockSize() const { return 8; }; + };*/ + + struct CudaCompressorDXT3 : public CudaCompressor + { + CudaCompressorDXT3(CudaContext & ctx) : CudaCompressor(ctx) {} + + virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions); + virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; }; + }; + + struct CudaCompressorDXT5 : public CudaCompressor + { + CudaCompressorDXT5(CudaContext & ctx) : CudaCompressor(ctx) {} + + virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions); + virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; }; + }; + + /*struct CudaCompressorCXT1 : public CudaCompressor + { + virtual void setup(const CompressionOptions::Private & compressionOptions); + virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0; + virtual uint blockSize() const { return 8; }; + };*/ + } // nv namespace diff --git a/src/nvtt/nvtt.h b/src/nvtt/nvtt.h index 520ecfd..8a4d8da 100644 --- a/src/nvtt/nvtt.h +++ b/src/nvtt/nvtt.h @@ -93,6 +93,9 @@ namespace nvtt Format_DXT1n, Format_CTX1, Format_YCoCg_DXT5, + + Format_BC6, + Format_BC7, }; /// Pixel types.