diff --git a/src/nvtt/Compressor.cpp b/src/nvtt/Compressor.cpp index ada910a..fe31cc7 100644 --- a/src/nvtt/Compressor.cpp +++ b/src/nvtt/Compressor.cpp @@ -205,16 +205,19 @@ namespace nvtt Compressor::Compressor() : m(*new Compressor::Private()) { + // CUDA initialization. m.cudaSupported = cuda::isHardwarePresent(); m.cudaEnabled = m.cudaSupported; - // @@ Do CUDA initialization here. - + if (m.cudaEnabled) + { + m.cuda = new CudaCompressor(); + } } Compressor::~Compressor() { - // @@ Free CUDA resources here. + delete &m; } @@ -225,6 +228,11 @@ void Compressor::enableCudaAcceleration(bool enable) { m.cudaEnabled = enable; } + + if (m.cudaEnabled && m.cuda == NULL) + { + m.cuda = new CudaCompressor(); + } } /// Check if CUDA acceleration is enabled. @@ -670,7 +678,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio if (cudaEnabled) { nvDebugCheck(cudaSupported); - cudaCompressDXT1(image, outputOptions, compressionOptions); + cuda->compressDXT1(image, outputOptions, compressionOptions); } else { @@ -708,7 +716,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio if (cudaEnabled) { nvDebugCheck(cudaSupported); - cudaCompressDXT3(image, outputOptions, compressionOptions); + cuda->compressDXT3(image, outputOptions, compressionOptions); } else { @@ -727,7 +735,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio if (cudaEnabled) { nvDebugCheck(cudaSupported); - cudaCompressDXT5(image, outputOptions, compressionOptions); + cuda->compressDXT5(image, outputOptions, compressionOptions); } else { diff --git a/src/nvtt/Compressor.h b/src/nvtt/Compressor.h index 474a25f..326b498 100644 --- a/src/nvtt/Compressor.h +++ b/src/nvtt/Compressor.h @@ -24,6 +24,10 @@ #ifndef NV_TT_COMPRESSOR_H #define NV_TT_COMPRESSOR_H +#include + +#include + #include "nvtt.h" namespace nv @@ -63,6 +67,9 @@ namespace nvtt bool cudaSupported; bool cudaEnabled; + + nv::AutoPtr cuda; + }; } // nvtt namespace diff --git a/src/nvtt/cuda/CudaCompressDXT.cpp b/src/nvtt/cuda/CudaCompressDXT.cpp index fd6ff22..429b0e6 100644 --- a/src/nvtt/cuda/CudaCompressDXT.cpp +++ b/src/nvtt/cuda/CudaCompressDXT.cpp @@ -48,29 +48,13 @@ using namespace nvtt; #if defined HAVE_CUDA +#define MAX_BLOCKS 32768 // 49152, 65535 + extern "C" void setupCompressKernel(const float weights[3]); extern "C" void compressKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); extern "C" void compressWeightedKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); -#include "Bitmaps.h" - -// @@ Store this pointer in CompressionOptions. Allocate in ctor, free in dtor. -static uint * d_bitmaps = NULL; - -static void doPrecomputation() -{ - if (d_bitmaps != NULL) { - return; - } - - // Upload bitmaps. - cudaMalloc((void**) &d_bitmaps, 992 * sizeof(uint)); - cudaMemcpy(d_bitmaps, bitmaps, 992 * sizeof(uint), cudaMemcpyHostToDevice); - - // @@ Check for errors. - - // @@ Free allocated memory. -} +#include "Bitmaps.h" // @@ Rename to BitmapTable.h // Convert linear image to block linear. static void convertToBlockLinear(const Image * image, uint * blockLinearImage) @@ -92,19 +76,43 @@ static void convertToBlockLinear(const Image * image, uint * blockLinearImage) } } -#endif // defined HAVE_CUDA +#endif + + +CudaCompressor::CudaCompressor() +{ +#if defined HAVE_CUDA + // Allocate and upload bitmaps. + cudaMalloc((void**) &m_bitmapTable, 992 * sizeof(uint)); + cudaMemcpy(m_bitmapTable, bitmaps, 992 * sizeof(uint), cudaMemcpyHostToDevice); + + // Allocate scratch buffers. + cudaMalloc((void**) &m_data, MAX_BLOCKS * 64U); + cudaMalloc((void**) &m_result, MAX_BLOCKS * 8U); +#endif +} + +CudaCompressor::~CudaCompressor() +{ +#if defined HAVE_CUDA + // Free device mem allocations. + cudaFree(m_data); + cudaFree(m_result); + cudaFree(m_bitmapTable); +#endif +} + + // @@ This code is very repetitive and needs to be cleaned up. /// Compress image using CUDA. -void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions) +void CudaCompressor::compressDXT1(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions) { nvDebugCheck(cuda::isHardwarePresent()); #if defined HAVE_CUDA - doPrecomputation(); - // Image size in blocks. const uint w = (image->width() + 3) / 4; const uint h = (image->height() + 3) / 4; @@ -119,14 +127,6 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou clock_t start = clock(); - // Allocate image in device memory. - uint * d_data = NULL; - cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U)); - - // Allocate result. - uint * d_result = NULL; - cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U)); - setupCompressKernel(compressionOptions.colorWeight.ptr()); // TODO: Add support for multiple GPUs. @@ -135,10 +135,10 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou { uint count = min(blockNum - bn, blockMax); - cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); + cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); // Launch kernel. - compressKernel(count, d_data, d_result, d_bitmaps); + compressKernel(count, m_data, m_result, m_bitmapTable); // Check for errors. cudaError_t err = cudaGetLastError(); @@ -153,7 +153,7 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou } // Copy result to host, overwrite swizzled image. - cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost); + cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost); // Output result. if (outputOptions.outputHandler != NULL) @@ -168,8 +168,6 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC); free(blockLinearImage); - cudaFree(d_data); - cudaFree(d_result); #else if (outputOptions.errorHandler != NULL) @@ -181,13 +179,11 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou /// Compress image using CUDA. -void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions) +void CudaCompressor::compressDXT3(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions) { nvDebugCheck(cuda::isHardwarePresent()); #if defined HAVE_CUDA - doPrecomputation(); - // Image size in blocks. const uint w = (image->width() + 3) / 4; const uint h = (image->height() + 3) / 4; @@ -200,14 +196,6 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou const uint compressedSize = blockNum * 8; const uint blockMax = 32768; // 49152, 65535 - // Allocate image in device memory. - uint * d_data = NULL; - cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U)); - - // Allocate result. - uint * d_result = NULL; - cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U)); - AlphaBlockDXT3 * alphaBlocks = NULL; alphaBlocks = (AlphaBlockDXT3 *)malloc(min(compressedSize, blockMax * 8U)); @@ -220,10 +208,10 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou { uint count = min(blockNum - bn, blockMax); - cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); + cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); // Launch kernel. - compressWeightedKernel(count, d_data, d_result, d_bitmaps); + compressWeightedKernel(count, m_data, m_result, m_bitmapTable); // Compress alpha in parallel with the GPU. for (uint i = 0; i < count; i++) @@ -245,7 +233,7 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou } // Copy result to host, overwrite swizzled image. - cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost); + cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost); // Output result. if (outputOptions.outputHandler != NULL) @@ -265,8 +253,6 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou free(alphaBlocks); free(blockLinearImage); - cudaFree(d_data); - cudaFree(d_result); #else if (outputOptions.errorHandler != NULL) @@ -278,13 +264,11 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou /// Compress image using CUDA. -void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions) +void CudaCompressor::compressDXT5(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions) { nvDebugCheck(cuda::isHardwarePresent()); #if defined HAVE_CUDA - doPrecomputation(); - // Image size in blocks. const uint w = (image->width() + 3) / 4; const uint h = (image->height() + 3) / 4; @@ -297,14 +281,6 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou const uint compressedSize = blockNum * 8; const uint blockMax = 32768; // 49152, 65535 - // Allocate image in device memory. - uint * d_data = NULL; - cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U)); - - // Allocate result. - uint * d_result = NULL; - cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U)); - AlphaBlockDXT5 * alphaBlocks = NULL; alphaBlocks = (AlphaBlockDXT5 *)malloc(min(compressedSize, blockMax * 8U)); @@ -317,10 +293,10 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou { uint count = min(blockNum - bn, blockMax); - cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); + cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); // Launch kernel. - compressWeightedKernel(count, d_data, d_result, d_bitmaps); + compressWeightedKernel(count, m_data, m_result, m_bitmapTable); // Compress alpha in parallel with the GPU. for (uint i = 0; i < count; i++) @@ -342,7 +318,7 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou } // Copy result to host, overwrite swizzled image. - cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost); + cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost); // Output result. if (outputOptions.outputHandler != NULL) @@ -362,8 +338,6 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou free(alphaBlocks); free(blockLinearImage); - cudaFree(d_data); - cudaFree(d_result); #else if (outputOptions.errorHandler != NULL) @@ -375,7 +349,7 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou -#if defined HAVE_CUDA +#if 0 class Task { @@ -511,8 +485,6 @@ private: }; -#endif // defined HAVE_CUDA - void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions) { @@ -522,9 +494,7 @@ void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private & const uint blockNum = ((w + 3) / 4) * ((h + 3) / 4); const uint blockMax = 32768; // 49152, 65535 - - doPrecomputation(); - + setupCompressKernel(compressionOptions.colorWeight.ptr()); ColorBlock rgba; @@ -559,4 +529,4 @@ void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private & #endif } - +#endif // 0 diff --git a/src/nvtt/cuda/CudaCompressDXT.h b/src/nvtt/cuda/CudaCompressDXT.h index d515871..c98c4e4 100644 --- a/src/nvtt/cuda/CudaCompressDXT.h +++ b/src/nvtt/cuda/CudaCompressDXT.h @@ -31,11 +31,22 @@ namespace nv { class Image; - void cudaCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions); - void cudaCompressDXT3(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions); - void cudaCompressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions); + class CudaCompressor + { + public: + CudaCompressor(); + ~CudaCompressor(); - void cudaCompressDXT1_2(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions); + void compressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions); + void compressDXT3(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions); + void compressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions); + + private: + + uint * m_bitmapTable; + uint * m_data; + uint * m_result; + }; } // nv namespace diff --git a/src/nvtt/tests/stress.cpp b/src/nvtt/tests/stress.cpp new file mode 100644 index 0000000..4a919ba --- /dev/null +++ b/src/nvtt/tests/stress.cpp @@ -0,0 +1,62 @@ +// Copyright NVIDIA Corporation 2007 -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include + +#include // printf +#include // rand? +#include // clock + + + +int main(int argc, char *argv[]) +{ + nvtt::InputOptions inputOptions; + inputOptions.setTextureLayout(nvtt::TextureType_2D, 1024, 1024); + + int * data = (int *)malloc(1024 * 1024 * 4); + for (int i = 0; i < 1024 * 1024; i++) + { + data[i] = rand(); + } + + inputOptions.setMipmapData(data, 1024, 1024); + inputOptions.setMipmapGeneration(false); + + nvtt::CompressionOptions compressionOptions; + nvtt::OutputOptions outputOptions; + nvtt::Compressor compressor; + + for (int i = 0; i < 1000; i++) + { + clock_t start = clock(); + + compressor.process(inputOptions, compressionOptions, outputOptions); + + clock_t end = clock(); + printf("time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC); + } + + return 0; +} +