diff --git a/src/nvimage/nvtt/cuda/CompressKernel.cu b/src/nvimage/nvtt/cuda/CompressKernel.cu
index ad29a94..784a7bf 100644
--- a/src/nvimage/nvtt/cuda/CompressKernel.cu
+++ b/src/nvimage/nvtt/cuda/CompressKernel.cu
@@ -141,6 +141,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
     const int bid = blockIdx.x;
     const int idx = threadIdx.x;
 
+    __shared__ float3 rawColors[16];
     __shared__ float dps[16];
 
     if (idx < 16)
@@ -148,10 +149,13 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
         // Read color and copy to shared mem.
         uint c = image[(bid) * 16 + idx];
 
-        colors[idx].z = ((c >> 0) & 0xFF) * (1.0f / 255.0f);
-        colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
-        colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
-        weights[idx] = ((c >> 24) & 0xFF) * (1.0f / 255.0f);
+        rawColors[idx].z = ((c >> 0) & 0xFF) * (1.0f / 255.0f);
+        rawColors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
+        rawColors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
+        weights[idx] = (((c >> 24) & 0xFF) + 1) * (1.0f / 256.0f);
+
+        colors[idx] = rawColors[idx] * weights[idx];
+
         // No need to synchronize, 16 < warp size.
 
 #if __DEVICE_EMULATION__
@@ -162,7 +166,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
         colorSums(colors, sums);
         float3 axis = bestFitLine(colors, sums[0]);
 
-        dps[idx] = dot(colors[idx], axis);
+        dps[idx] = dot(rawColors[idx], axis);
 
 #if __DEVICE_EMULATION__
     } __debugsync(); if (idx < 16) {
@@ -283,7 +287,7 @@ __constant__ float alphaTable3[4] = { 4.0f, 0.0f, 2.0f, 2.0f };
 __constant__ const uint prods4[4] = { 0x090000,0x000900,0x040102,0x010402 };
 __constant__ const uint prods3[4] = { 0x040000,0x000400,0x040101,0x010401 };
 
-__device__ float evalPermutation4(const float3 * colors, uint permutation, ushort * start, ushort * end, float3 color_sum)
+__device__ float evalPermutation4(const float3 * colors, float3 color_sum, uint permutation, ushort * start, ushort * end)
 {
     // Compute endpoints using least squares.
     float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
@@ -318,7 +322,7 @@ __device__ float evalPermutation4(const float3 * colors, uint permutation, ushor
     return (1.0f / 9.0f) * dot(e, kColorMetric);
 }
 
-__device__ float evalPermutation3(const float3 * colors, uint permutation, ushort * start, ushort * end, float3 color_sum)
+__device__ float evalPermutation3(const float3 * colors, float3 color_sum, uint permutation, ushort * start, ushort * end)
 {
     // Compute endpoints using least squares.
     float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
@@ -353,14 +357,13 @@ __device__ float evalPermutation3(const float3 * colors, uint permutation, ushor
     return (1.0f / 4.0f) * dot(e, kColorMetric);
 }
 
-__device__ float evalPermutation4(const float3 * colors, const float * weights, uint permutation, ushort * start, ushort * end)
+__device__ float evalPermutation4(const float3 * colors, const float * weights, float3 color_sum, uint permutation, ushort * start, ushort * end)
 {
     // Compute endpoints using least squares.
     float alpha2_sum = 0.0f;
     float beta2_sum = 0.0f;
     float alphabeta_sum = 0.0f;
     float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
-    float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f);
 
     // Compute alpha & beta for this permutation.
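Note on the hunks above: loadColorBlock() now keeps the unweighted colors in rawColors[], remaps the weight from x/255 to (x+1)/256 so it can never be zero, and stores the weight-premultiplied color in colors[] (the best-fit axis is still projected against the raw colors). As a result the weighted evalPermutation4() below no longer needs its own betax_sum accumulator: since alpha + beta == 1 for every texel, the weighted beta sum is simply the precomputed color_sum minus alphax_sum. A standalone scalar sketch of that identity follows; the names and sample data are illustrative only, and the beta decoding copies the one shown in evalPermutation3 in this diff.

// Scalar illustration (not part of the patch): with the weights already folded
// into colors[i], and alpha + beta == 1 for every texel, the weighted beta sum
// equals color_sum - alphax_sum and never needs its own accumulator.
#include <cstdio>

int main()
{
    float colors[16];                           // stands in for the premultiplied colors
    for (int i = 0; i < 16; i++) colors[i] = 0.25f * (i + 1);

    const unsigned permutation = 0x1B2E4D6Cu;   // arbitrary 2-bit-per-texel assignment

    float color_sum = 0.0f, alphax_sum = 0.0f, betax_ref = 0.0f;
    for (int i = 0; i < 16; i++)
    {
        const unsigned bits = permutation >> (2 * i);
        float beta = (float)(bits & 1);         // same decoding as evalPermutation3 above
        if (bits & 2) beta = 0.5f;
        const float alpha = 1.0f - beta;

        color_sum  += colors[i];
        alphax_sum += alpha * colors[i];
        betax_ref  += beta * colors[i];         // what the old code accumulated explicitly
    }

    const float betax_sum = color_sum - alphax_sum;     // the shortcut the patch uses
    printf("explicit %.6f  derived %.6f\n", betax_ref, betax_sum);
    return 0;
}

This removes one of the two per-texel float3 accumulations from a loop that is re-run for every candidate permutation.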
     for (int i = 0; i < 16; i++)
@@ -374,10 +377,11 @@ __device__ float evalPermutation4(const float3 * colors, const float * weights,
         alpha2_sum += alpha * alpha * weights[i];
         beta2_sum += beta * beta * weights[i];
         alphabeta_sum += alpha * beta * weights[i];
-        alphax_sum += alpha * colors[i] * weights[i];
-        betax_sum += beta * colors[i] * weights[i];
+        alphax_sum += alpha * colors[i];
     }
 
+    float3 betax_sum = color_sum - alphax_sum;
+
     const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
 
     float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
@@ -393,45 +397,47 @@ __device__ float evalPermutation4(const float3 * colors, const float * weights,
     return dot(e, kColorMetric);
 }
 
-__device__ float evalPermutation3(const float3 * colors, const float * weights, uint permutation, ushort * start, ushort * end)
-{
-    // Compute endpoints using least squares.
-    float alpha2_sum = 0.0f;
-    float beta2_sum = 0.0f;
-    float alphabeta_sum = 0.0f;
-    float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
-    float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f);
-
-    // Compute alpha & beta for this permutation.
-    for (int i = 0; i < 16; i++)
-    {
-        const uint bits = permutation >> (2*i);
-
-        float beta = (bits & 1);
-        if (bits & 2) beta = 0.5f;
-        float alpha = 1.0f - beta;
-
-        alpha2_sum += alpha * alpha * weights[i];
-        beta2_sum += beta * beta * weights[i];
-        alphabeta_sum += alpha * beta * weights[i];
-        alphax_sum += alpha * colors[i] * weights[i];
-        betax_sum += beta * colors[i] * weights[i];
-    }
-
-    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-
-    float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
-    float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
-
-    // Round a, b to the closest 5-6-5 color and expand...
-    a = roundAndExpand(a, start);
-    b = roundAndExpand(b, end);
-
-    // compute the error
-    float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
-
-    return dot(e, kColorMetric);
-}
+/*
+__device__ float evalPermutation3(const float3 * colors, const float * weights, uint permutation, ushort * start, ushort * end)
+{
+    // Compute endpoints using least squares.
+    float alpha2_sum = 0.0f;
+    float beta2_sum = 0.0f;
+    float alphabeta_sum = 0.0f;
+    float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f);
+
+    // Compute alpha & beta for this permutation.
+    for (int i = 0; i < 16; i++)
+    {
+        const uint bits = permutation >> (2*i);
+
+        float beta = (bits & 1);
+        if (bits & 2) beta = 0.5f;
+        float alpha = 1.0f - beta;
+
+        alpha2_sum += alpha * alpha * weights[i];
+        beta2_sum += beta * beta * weights[i];
+        alphabeta_sum += alpha * beta * weights[i];
+        alphax_sum += alpha * colors[i];
+    }
+
+    float3 betax_sum = color_sum - alphax_sum;
+
+    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+
+    float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
+    float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
+
+    // Round a, b to the closest 5-6-5 color and expand...
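Note: roundAndExpand() is called by all of the evaluators but is defined outside this diff. Conceptually it snaps a least-squares endpoint to the nearest representable 5-6-5 color, writes the packed 16-bit endpoint, and returns the expanded (quantized) color so the error term measures what the decoder will actually reconstruct. A self-contained host-side approximation, with the caveat that the real device function may round and expand differently:

// Hypothetical host-side stand-in for roundAndExpand(); illustrative only.
#include <cmath>
#include <cstdio>

struct Color3 { float x, y, z; };

static Color3 roundAndExpand565(Color3 c, unsigned short * packed)
{
    const int r = (int)std::lround(std::fmin(std::fmax(c.x, 0.0f), 1.0f) * 31.0f);
    const int g = (int)std::lround(std::fmin(std::fmax(c.y, 0.0f), 1.0f) * 63.0f);
    const int b = (int)std::lround(std::fmin(std::fmax(c.z, 0.0f), 1.0f) * 31.0f);

    *packed = (unsigned short)((r << 11) | (g << 5) | b);   // packed 5-6-5 endpoint

    const Color3 expanded = { r * (1.0f / 31.0f), g * (1.0f / 63.0f), b * (1.0f / 31.0f) };
    return expanded;                                        // quantized color used in the error term
}

int main()
{
    unsigned short endpoint = 0;
    const Color3 c = { 0.40f, 0.55f, 0.70f };
    const Color3 e = roundAndExpand565(c, &endpoint);
    printf("packed 0x%04x, expanded %.3f %.3f %.3f\n", endpoint, e.x, e.y, e.z);
    return 0;
}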
+    a = roundAndExpand(a, start);
+    b = roundAndExpand(b, end);
+
+    // compute the error
+    float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
+
+    return dot(e, kColorMetric);
+}
+*/
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -454,7 +460,7 @@ __device__ void evalAllPermutations(const float3 * colors, float3 colorSum, cons
         uint permutation = permutations[pidx];
         if (pidx < 160) s_permutations[pidx] = permutation;
 
-        float error = evalPermutation4(colors, permutation, &start, &end, colorSum);
+        float error = evalPermutation4(colors, colorSum, permutation, &start, &end);
 
         if (error < bestError)
         {
@@ -478,7 +484,7 @@ __device__ void evalAllPermutations(const float3 * colors, float3 colorSum, cons
         ushort start, end;
         uint permutation = s_permutations[pidx];
 
-        float error = evalPermutation3(colors, permutation, &start, &end, colorSum);
+        float error = evalPermutation3(colors, colorSum, permutation, &start, &end);
 
         if (error < bestError)
         {
@@ -498,6 +504,7 @@ __device__ void evalAllPermutations(const float3 * colors, float3 colorSum, cons
     errors[idx] = bestError;
 }
 
+/*
 __device__ void evalAllPermutations(const float3 * colors, const float * weights, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
 {
     const int idx = threadIdx.x;
@@ -558,9 +565,9 @@ __device__ void evalAllPermutations(const float3 * colors, const float * weights
     errors[idx] = bestError;
 }
+*/
-
 
-__device__ void evalLevel4Permutations(const float3 * colors, const float * weights, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
+__device__ void evalLevel4Permutations(const float3 * colors, const float * weights, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
 {
     const int idx = threadIdx.x;
@@ -574,7 +581,7 @@ __device__ void evalLevel4Permutations(const float3 * colors, const float * weig
         ushort start, end;
         uint permutation = permutations[pidx];
 
-        float error = evalPermutation4(colors, weights, permutation, &start, &end);
+        float error = evalPermutation4(colors, weights, colorSum, permutation, &start, &end);
 
         if (error < bestError)
         {
@@ -755,7 +762,7 @@ __global__ void compressWeighted(const uint * permutations, const uint * image,
     __shared__ float errors[NUM_THREADS];
 
-    evalLevel4Permutations(colors, weights, permutations, bestStart, bestEnd, bestPermutation, errors);
+    evalLevel4Permutations(colors, weights, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
 
     // Use a parallel reduction to find minimum error.
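Note: findMinError() is also defined outside this diff. The comment above refers to a shared-memory argmin reduction over the per-thread errors[] array; a hypothetical version, assuming NUM_THREADS is a power of two, might look like the sketch below. The actual kernel code may differ.

// Hypothetical sketch of a shared-memory argmin reduction; not the real findMinError().
#define NUM_THREADS 64      // assumption: one thread per candidate slot, power of two

__device__ int findMinErrorSketch(float * errors)
{
    const int idx = threadIdx.x;

    __shared__ int indices[NUM_THREADS];
    indices[idx] = idx;
    __syncthreads();

    for (int d = NUM_THREADS / 2; d > 0; d >>= 1)
    {
        if (idx < d)
        {
            // Keep the index of the smaller error.
            if (errors[indices[idx + d]] < errors[indices[idx]])
            {
                indices[idx] = indices[idx + d];
            }
        }
        __syncthreads();
    }

    // Every thread returns the index of the smallest error.
    return indices[0];
}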
     int minIdx = findMinError(errors);
@@ -768,6 +775,7 @@ __global__ void compressWeighted(const uint * permutations, const uint * image,
 }
 
 
+/*
 __device__ float computeError(const float weights[16], uchar a0, uchar a1)
 {
     float palette[6];
@@ -803,7 +811,7 @@ inline __device__ uchar roundAndExpand(float a)
 {
     return rintf(__saturatef(a) * 255.0f);
 }
-
+*/
 
 /*
 __device__ void optimizeAlpha8(const float alphas[16], uchar & a0, uchar & a1)
 {
@@ -838,7 +846,7 @@ __device__ void optimizeAlpha8(const float alphas[16], uchar & a0, uchar & a1)
     a1 = roundAndExpand(b);
 }
 */
-
+/*
 __device__ void compressAlpha(const float alphas[16], uint4 * result)
 {
     const int tid = threadIdx.x;
@@ -932,7 +940,7 @@ __global__ void compressDXT5(const uint * permutations, const uint * image, uint
     __shared__ float errors[NUM_THREADS];
 
-    evalLevel4Permutations(colors, weights, permutations, bestStart, bestEnd, bestPermutation, errors);
+    evalLevel4Permutations(colors, weights, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
 
     // Use a parallel reduction to find minimum error.
     int minIdx = findMinError(errors);
@@ -943,7 +951,7 @@ __global__ void compressDXT5(const uint * permutations, const uint * image, uint
             saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, (uint2 *)result);
         }
     }
-
+*/
 
 ////////////////////////////////////////////////////////////////////////////////
 // Setup kernel
diff --git a/src/nvimage/nvtt/cuda/CudaCompressDXT.cpp b/src/nvimage/nvtt/cuda/CudaCompressDXT.cpp
index d0df912..d652b1a 100644
--- a/src/nvimage/nvtt/cuda/CudaCompressDXT.cpp
+++ b/src/nvimage/nvtt/cuda/CudaCompressDXT.cpp
@@ -29,10 +29,12 @@
 #include
 #include
 #include
+#include
 
 #include "CudaCompressDXT.h"
 #include "CudaUtils.h"
 
+
 #if defined HAVE_CUDA
 #include
 #endif
@@ -186,6 +188,28 @@ static void doPrecomputation()
 #endif
 
 
+// Convert linear image to block linear.
+static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
+{
+    const uint w = (image->width() + 3) / 4;
+    const uint h = (image->height() + 3) / 4;
+
+    for(uint by = 0; by < h; by++) {
+        for(uint bx = 0; bx < w; bx++) {
+            const uint bw = min(image->width() - bx * 4, 4U);
+            const uint bh = min(image->height() - by * 4, 4U);
+
+            for (uint i = 0; i < 16; i++) {
+                const int x = (i % 4) % bw;
+                const int y = (i / 4) % bh;
+                blockLinearImage[(by * w + bx) * 16 + i] = image->pixel(bx * 4 + x, by * 4 + y).u;
+            }
+        }
+    }
+}
+
+// @@ This code is very repetitive and needs to be cleaned up.
+
 /// Compress image using CUDA.
 void nv::cudaCompressDXT1(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
@@ -201,24 +225,11 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions & outputOptio
     uint imageSize = w * h * 16 * sizeof(Color32);
     uint * blockLinearImage = (uint *) malloc(imageSize);
-
-    // Convert linear image to block linear.
-    for(uint by = 0; by < h; by++) {
-        for(uint bx = 0; bx < w; bx++) {
-            const uint bw = min(image->width() - bx * 4, 4U);
-            const uint bh = min(image->height() - by * 4, 4U);
-
-            for (uint i = 0; i < 16; i++) {
-                const int x = (i % 4) % bw;
-                const int y = (i / 4) % bh;
-                blockLinearImage[(by * w + bx) * 16 + i] = image->pixel(bx * 4 + x, by * 4 + y).u;
-            }
-        }
-    }
+    convertToBlockLinear(image, blockLinearImage);
 
     const uint blockNum = w * h;
     const uint compressedSize = blockNum * 8;
-    const uint blockMax = 32768; // 65535
+    const uint blockMax = 32768; // 49152, 65535
 
     // Allocate image in device memory.
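Note: convertToBlockLinear(), factored out above, handles images whose dimensions are not multiples of 4 by wrapping the local coordinates with bw and bh, so edge texels repeat and every uploaded 4x4 block is fully populated. A plain C++ sketch (arrays stand in for nv::Image, which is not reproduced here) that prints the layout of the partial block of a 6x4 image:

// Standalone illustration of the block-linear layout for a 6x4 image; not part of the patch.
#include <algorithm>
#include <cstdio>

int main()
{
    const unsigned width = 6, height = 4;
    unsigned pixels[height][width];                 // stand-in for image->pixel(x, y).u
    for (unsigned y = 0; y < height; y++)
        for (unsigned x = 0; x < width; x++)
            pixels[y][x] = y * 100 + x;             // readable fake pixel values

    const unsigned w = (width + 3) / 4;
    const unsigned h = (height + 3) / 4;
    unsigned blockLinear[w * h * 16];

    for (unsigned by = 0; by < h; by++) {
        for (unsigned bx = 0; bx < w; bx++) {
            const unsigned bw = std::min(width - bx * 4, 4u);
            const unsigned bh = std::min(height - by * 4, 4u);

            for (unsigned i = 0; i < 16; i++) {
                const unsigned x = (i % 4) % bw;    // wraps, so edge columns repeat
                const unsigned y = (i / 4) % bh;    // wraps, so edge rows repeat
                blockLinear[(by * w + bx) * 16 + i] = pixels[by * 4 + y][bx * 4 + x];
            }
        }
    }

    // Print the 16 samples of the partial block (bx = 1, bw = 2).
    for (unsigned i = 0; i < 16; i++)
        printf("%u%c", blockLinear[16 + i], (i % 4 == 3) ? '\n' : ' ');
    return 0;
}

For the second block column bw is 2, so source columns 4 and 5 repeat as 4, 5, 4, 5 on every row of the block.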
     uint * d_data = NULL;
@@ -283,6 +294,201 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions & outputOptio
 }
 
 
+/// Compress image using CUDA.
+void nv::cudaCompressDXT3(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+    nvDebugCheck(cuda::isHardwarePresent());
+#if defined HAVE_CUDA
+
+    doPrecomputation();
+
+    // Image size in blocks.
+    const uint w = (image->width() + 3) / 4;
+    const uint h = (image->height() + 3) / 4;
+
+    uint imageSize = w * h * 16 * sizeof(Color32);
+    uint * blockLinearImage = (uint *) malloc(imageSize);
+    convertToBlockLinear(image, blockLinearImage);
+
+    const uint blockNum = w * h;
+    const uint compressedSize = blockNum * 8;
+    const uint blockMax = 32768; // 49152, 65535
+
+    // Allocate image in device memory.
+    uint * d_data = NULL;
+    cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
+
+    // Allocate result.
+    uint * d_result = NULL;
+    cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
+
+    AlphaBlockDXT3 * alphaBlocks = NULL;
+    alphaBlocks = (AlphaBlockDXT3 *)malloc(min(compressedSize, blockMax * 8U));
+
+    setupCompressKernel(compressionOptions.colorWeight.ptr());
+
+    clock_t start = clock();
+
+    uint bn = 0;
+    while(bn != blockNum)
+    {
+        uint count = min(blockNum - bn, blockMax);
+
+        cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+
+        // Launch kernel.
+        compressWeightedKernel(count, d_data, d_result, d_bitmaps);
+
+        // Compress alpha in parallel with the GPU.
+        for (uint i = 0; i < count; i++)
+        {
+            ColorBlock rgba(blockLinearImage + (bn + i) * 16);
+            compressBlock(rgba, alphaBlocks + i);
+        }
+
+        // Check for errors.
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
+
+            if (outputOptions.errorHandler != NULL)
+            {
+                outputOptions.errorHandler->error(Error_CudaError);
+            }
+        }
+
+        // Copy result to host, overwrite swizzled image.
+        cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
+
+        // Output result.
+        if (outputOptions.outputHandler != NULL)
+        {
+            for (uint i = 0; i < count; i++)
+            {
+                outputOptions.outputHandler->writeData(alphaBlocks + i, 8);
+                outputOptions.outputHandler->writeData(blockLinearImage + i * 2, 8);
+            }
+        }
+
+        bn += count;
+    }
+
+    clock_t end = clock();
+    printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
+
+    free(alphaBlocks);
+    free(blockLinearImage);
+    cudaFree(d_data);
+    cudaFree(d_result);
+
+#else
+    if (outputOptions.errorHandler != NULL)
+    {
+        outputOptions.errorHandler->error(Error_CudaError);
+    }
+#endif
+}
+
+
+/// Compress image using CUDA.
+void nv::cudaCompressDXT5(const Image * image, const OutputOptions & outputOptions, const CompressionOptions::Private & compressionOptions)
+{
+    nvDebugCheck(cuda::isHardwarePresent());
+#if defined HAVE_CUDA
+
+    doPrecomputation();
+
+    // Image size in blocks.
+    const uint w = (image->width() + 3) / 4;
+    const uint h = (image->height() + 3) / 4;
+
+    uint imageSize = w * h * 16 * sizeof(Color32);
+    uint * blockLinearImage = (uint *) malloc(imageSize);
+    convertToBlockLinear(image, blockLinearImage);
+
+    const uint blockNum = w * h;
+    const uint compressedSize = blockNum * 8;
+    const uint blockMax = 32768; // 49152, 65535
+
+    // Allocate image in device memory.
+    uint * d_data = NULL;
+    cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
+
+    // Allocate result.
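Note: the buffer sizing here mirrors cudaCompressDXT1: each 4x4 block uploads 16 RGBA8 texels (64 bytes) and the kernel writes back one 8-byte color block, so the per-chunk device allocations are capped at blockMax * 64 and blockMax * 8 bytes. A small illustration of that arithmetic (not part of the patch):

// Size arithmetic behind min(imageSize, blockMax * 64U) and min(compressedSize, blockMax * 8U).
#include <cstdio>

int main()
{
    const unsigned blockMax = 32768;
    const unsigned inputPerBlock  = 16 * 4;     // 16 RGBA8 texels, 4 bytes each
    const unsigned outputPerBlock = 8;          // one 8-byte color block per 4x4 block

    printf("max input chunk:  %u KiB\n", blockMax * inputPerBlock  / 1024);   // 2048 KiB
    printf("max output chunk: %u KiB\n", blockMax * outputPerBlock / 1024);   // 256 KiB
    return 0;
}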
+    uint * d_result = NULL;
+    cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
+
+    AlphaBlockDXT5 * alphaBlocks = NULL;
+    alphaBlocks = (AlphaBlockDXT5 *)malloc(min(compressedSize, blockMax * 8U));
+
+    setupCompressKernel(compressionOptions.colorWeight.ptr());
+
+    clock_t start = clock();
+
+    uint bn = 0;
+    while(bn != blockNum)
+    {
+        uint count = min(blockNum - bn, blockMax);
+
+        cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+
+        // Launch kernel.
+        compressWeightedKernel(count, d_data, d_result, d_bitmaps);
+
+        // Compress alpha in parallel with the GPU.
+        for (uint i = 0; i < count; i++)
+        {
+            ColorBlock rgba(blockLinearImage + (bn + i) * 16);
+            compressBlock_Iterative(rgba, alphaBlocks + i);
+        }
+
+        // Check for errors.
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
+
+            if (outputOptions.errorHandler != NULL)
+            {
+                outputOptions.errorHandler->error(Error_CudaError);
+            }
+        }
+
+        // Copy result to host, overwrite swizzled image.
+        cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
+
+        // Output result.
+        if (outputOptions.outputHandler != NULL)
+        {
+            for (uint i = 0; i < count; i++)
+            {
+                outputOptions.outputHandler->writeData(alphaBlocks + i, 8);
+                outputOptions.outputHandler->writeData(blockLinearImage + i * 2, 8);
+            }
+        }
+
+        bn += count;
+    }
+
+    clock_t end = clock();
+    printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
+
+    free(alphaBlocks);
+    free(blockLinearImage);
+    cudaFree(d_data);
+    cudaFree(d_result);
+
+#else
+    if (outputOptions.errorHandler != NULL)
+    {
+        outputOptions.errorHandler->error(Error_CudaError);
+    }
+#endif
+}
+
+
+
 #if defined HAVE_CUDA
 
 class Task
@@ -429,7 +635,7 @@ void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions & outputOpt
     const uint h = image->height();
 
     const uint blockNum = ((w + 3) / 4) * ((h + 3) / 4);
-    const uint blockMax = 32768; // 65535
+    const uint blockMax = 32768; // 49152, 65535
 
     doPrecomputation();
diff --git a/src/nvimage/nvtt/cuda/CudaCompressDXT.h b/src/nvimage/nvtt/cuda/CudaCompressDXT.h
index 17d8c01..b88cc5c 100644
--- a/src/nvimage/nvtt/cuda/CudaCompressDXT.h
+++ b/src/nvimage/nvtt/cuda/CudaCompressDXT.h
@@ -32,6 +32,9 @@ namespace nv
     class Image;
 
     void cudaCompressDXT1(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+    void cudaCompressDXT3(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+    void cudaCompressDXT5(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+
     void cudaCompressDXT1_2(const Image * image, const nvtt::OutputOptions & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
 
 } // nv namespace
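Note: in the two new compressors every emitted DXT3/DXT5 block is 16 bytes, the 8-byte alpha half compressed on the CPU (compressBlock / compressBlock_Iterative) followed by the 8-byte color half produced by compressWeightedKernel; that is what the pair of writeData(..., 8) calls in the output loop assembles. An illustrative layout, using example structs rather than the real nvimage types:

// Example layouts mirroring the 8 + 8 byte split; the real AlphaBlockDXT3,
// AlphaBlockDXT5 and BlockDXT1 types live in nvimage and may be defined differently.
#include <cstdint>

struct ExampleAlphaBlockDXT3 {      // sixteen explicit 4-bit alpha values
    uint64_t alphas;
};

struct ExampleAlphaBlockDXT5 {      // two 8-bit endpoints + sixteen 3-bit indices
    uint8_t alpha0;
    uint8_t alpha1;
    uint8_t indices[6];
};

struct ExampleColorBlockDXT1 {      // two 5-6-5 endpoints + sixteen 2-bit indices
    uint16_t color0;
    uint16_t color1;
    uint32_t indices;
};

static_assert(sizeof(ExampleAlphaBlockDXT3) == 8, "alpha half is 8 bytes");
static_assert(sizeof(ExampleAlphaBlockDXT5) == 8, "alpha half is 8 bytes");
static_assert(sizeof(ExampleColorBlockDXT1) == 8, "color half is 8 bytes");
// A full DXT3/DXT5 block is the alpha half immediately followed by the color half.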