diff --git a/src/nvimage/nvtt/cuda/CompressKernel.cu b/src/nvimage/nvtt/cuda/CompressKernel.cu index c75751a..ad29a94 100644 --- a/src/nvimage/nvtt/cuda/CompressKernel.cu +++ b/src/nvimage/nvtt/cuda/CompressKernel.cu @@ -55,7 +55,7 @@ __constant__ float3 kColorMetric = { 1.0f, 1.0f, 1.0f }; //////////////////////////////////////////////////////////////////////////////// // Sort colors //////////////////////////////////////////////////////////////////////////////// -__device__ void sortColors(float * values, int * cmp) +__device__ void sortColors(const float * values, int * cmp) { int tid = threadIdx.x; @@ -98,7 +98,7 @@ __device__ void sortColors(float * values, int * cmp) //////////////////////////////////////////////////////////////////////////////// // Load color block to shared mem //////////////////////////////////////////////////////////////////////////////// -__device__ void loadColorBlock(const uint * image, float3 colors[16], int xrefs[16]) +__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16]) { const int bid = blockIdx.x; const int idx = threadIdx.x; @@ -120,7 +120,8 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], int xrefs[ #endif // Sort colors along the best fit line. - float3 axis = bestFitLine(colors); + colorSums(colors, sums); + float3 axis = bestFitLine(colors, sums[0]); dps[idx] = dot(colors[idx], axis); @@ -135,7 +136,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], int xrefs[ } } -__device__ void loadColorBlock(const uint * image, float3 colors[16], float weights[16], int xrefs[16]) +__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16]) { const int bid = blockIdx.x; const int idx = threadIdx.x; @@ -158,8 +159,9 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float weig #endif // Sort colors along the best fit line. - float3 axis = bestFitLine(colors); - + colorSums(colors, sums); + float3 axis = bestFitLine(colors, sums[0]); + dps[idx] = dot(colors[idx], axis); #if __DEVICE_EMULATION__ @@ -276,6 +278,81 @@ __device__ float evalPermutation3(const float3 * colors, uint permutation, ushor return dot(e, kColorMetric); } +__constant__ float alphaTable4[4] = { 9.0f, 0.0f, 6.0f, 3.0f }; +__constant__ float alphaTable3[4] = { 4.0f, 0.0f, 2.0f, 2.0f }; +__constant__ const uint prods4[4] = { 0x090000,0x000900,0x040102,0x010402 }; +__constant__ const uint prods3[4] = { 0x040000,0x000400,0x040101,0x010401 }; + +__device__ float evalPermutation4(const float3 * colors, uint permutation, ushort * start, ushort * end, float3 color_sum) +{ + // Compute endpoints using least squares. + float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); + uint akku = 0; + + // Compute alpha & beta for this permutation. + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + alphax_sum += alphaTable4[bits & 3] * colors[i]; + akku += prods4[bits & 3]; + } + + float alpha2_sum = float(akku >> 16); + float beta2_sum = float((akku >> 8) & 0xff); + float alphabeta_sum = float(akku & 0xff); + float3 betax_sum = 9.0f * color_sum - alphax_sum; + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + // Round a, b to the closest 5-6-5 color and expand... + a = roundAndExpand(a, start); + b = roundAndExpand(b, end); + + // compute the error + float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); + + return (1.0f / 9.0f) * dot(e, kColorMetric); +} + +__device__ float evalPermutation3(const float3 * colors, uint permutation, ushort * start, ushort * end, float3 color_sum) +{ + // Compute endpoints using least squares. + float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); + uint akku = 0; + + // Compute alpha & beta for this permutation. + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + alphax_sum += alphaTable3[bits & 3] * colors[i]; + akku += prods3[bits & 3]; + } + + float alpha2_sum = float(akku >> 16); + float beta2_sum = float((akku >> 8) & 0xff); + float alphabeta_sum = float(akku & 0xff); + float3 betax_sum = 4.0f * color_sum - alphax_sum; + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + // Round a, b to the closest 5-6-5 color and expand... + a = roundAndExpand(a, start); + b = roundAndExpand(b, end); + + // compute the error + float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); + + return (1.0f / 4.0f) * dot(e, kColorMetric); +} + __device__ float evalPermutation4(const float3 * colors, const float * weights, uint permutation, ushort * start, ushort * end) { // Compute endpoints using least squares. @@ -360,7 +437,7 @@ __device__ float evalPermutation3(const float3 * colors, const float * weights, //////////////////////////////////////////////////////////////////////////////// // Evaluate all permutations //////////////////////////////////////////////////////////////////////////////// -__device__ void evalAllPermutations(const float3 * colors, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors) +__device__ void evalAllPermutations(const float3 * colors, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors) { const int idx = threadIdx.x; @@ -377,7 +454,7 @@ __device__ void evalAllPermutations(const float3 * colors, const uint * permutat uint permutation = permutations[pidx]; if (pidx < 160) s_permutations[pidx] = permutation; - float error = evalPermutation4(colors, permutation, &start, &end); + float error = evalPermutation4(colors, permutation, &start, &end, colorSum); if (error < bestError) { @@ -401,7 +478,7 @@ __device__ void evalAllPermutations(const float3 * colors, const uint * permutat ushort start, end; uint permutation = s_permutations[pidx]; - float error = evalPermutation3(colors, permutation, &start, &end); + float error = evalPermutation3(colors, permutation, &start, &end, colorSum); if (error < bestError) { @@ -563,8 +640,10 @@ __device__ int findMinError(float * errors) } } + __syncthreads(); + // unroll last 6 iterations - if (idx <= 32) + if (idx < 32) { if (errors[idx + 32] < errors[idx]) { errors[idx] = errors[idx + 32]; @@ -612,7 +691,7 @@ __device__ void saveBlockDXT1(ushort start, ushort end, uint permutation, int xr } // Reorder permutation. - uint indices = permutation; + uint indices = 0; for(int i = 0; i < 16; i++) { int ref = xrefs[i]; @@ -635,9 +714,10 @@ __device__ void saveBlockDXT1(ushort start, ushort end, uint permutation, int xr __global__ void compress(const uint * permutations, const uint * image, uint2 * result) { __shared__ float3 colors[16]; + __shared__ float3 sums[16]; __shared__ int xrefs[16]; - loadColorBlock(image, colors, xrefs); + loadColorBlock(image, colors, sums, xrefs); __syncthreads(); @@ -646,7 +726,7 @@ __global__ void compress(const uint * permutations, const uint * image, uint2 * __shared__ float errors[NUM_THREADS]; - evalAllPermutations(colors, permutations, bestStart, bestEnd, bestPermutation, errors); + evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors); // Use a parallel reduction to find minimum error. const int minIdx = findMinError(errors); @@ -662,10 +742,11 @@ __global__ void compress(const uint * permutations, const uint * image, uint2 * __global__ void compressWeighted(const uint * permutations, const uint * image, uint2 * result) { __shared__ float3 colors[16]; + __shared__ float3 sums[16]; __shared__ float weights[16]; __shared__ int xrefs[16]; - loadColorBlock(image, colors, weights, xrefs); + loadColorBlock(image, colors, sums, weights, xrefs); __syncthreads(); @@ -836,10 +917,11 @@ __device__ void compressAlpha(const float alphas[16], uint4 * result) __global__ void compressDXT5(const uint * permutations, const uint * image, uint4 * result) { __shared__ float3 colors[16]; + __shared__ float3 sums[16]; __shared__ float weights[16]; __shared__ int xrefs[16]; - loadColorBlock(image, colors, weights, xrefs); + loadColorBlock(image, colors, sums, weights, xrefs); __syncthreads(); diff --git a/src/nvimage/nvtt/cuda/CudaMath.h b/src/nvimage/nvtt/cuda/CudaMath.h index 4ca9e73..363b7b5 100644 --- a/src/nvimage/nvtt/cuda/CudaMath.h +++ b/src/nvimage/nvtt/cuda/CudaMath.h @@ -122,92 +122,99 @@ inline __device__ __host__ float3 normalize(float3 v) // http://www.miislita.com/information-retrieval-tutorial/matrix-tutorial-3-eigenvalues-eigenvectors.html inline __device__ __host__ float3 firstEigenVector( float matrix[6] ) { - // 8 iterations seems to be more than enough. + // 8 iterations seems to be more than enough. - float3 v = make_float3(1.0f, 1.0f, 1.0f); - for(int i = 0; i < 8; i++) { - float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2]; - float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4]; - float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5]; - float m = max(max(x, y), z); - float iv = 1.0f / m; - #if __DEVICE_EMULATION__ - if (m == 0.0f) iv = 0.0f; - #endif - v = make_float3(x*iv, y*iv, z*iv); - } + float3 v = make_float3(1.0f, 1.0f, 1.0f); + for(int i = 0; i < 8; i++) { + float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2]; + float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4]; + float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5]; + float m = max(max(x, y), z); + float iv = 1.0f / m; + #if __DEVICE_EMULATION__ + if (m == 0.0f) iv = 0.0f; + #endif + v = make_float3(x*iv, y*iv, z*iv); + } - return v; + return v; } -inline __device__ float3 bestFitLine(const float3 * colors) +inline __device__ void colorSums(const float3 * colors, float3 * sums) { #if __DEVICE_EMULATION__ + float3 color_sum = make_float3(0.0f, 0.0f, 0.0f); + for (int i = 0; i < 16; i++) + { + color_sum += colors[i]; + } + + for (int i = 0; i < 16; i++) + { + sums[i] = color_sum; + } +#else + + const int idx = threadIdx.x; - // Compute covariance matrix of the given colors. - float3 center = make_float3(0.0f, 0.0f, 0.0f); - for (int i = 0; i < 16; i++) - { - center += colors[i]; - } - center /= 16.0f; - - float covariance[6] = {0, 0, 0, 0, 0, 0}; - for (int i = 0; i < 16; i++) - { - float3 a = colors[i] - center; - covariance[0] += a.x * a.x; - covariance[1] += a.x * a.y; - covariance[2] += a.x * a.z; - covariance[3] += a.y * a.y; - covariance[4] += a.y * a.z; - covariance[5] += a.z * a.z; - } + sums[idx] = colors[idx]; + sums[idx] += sums[idx^8]; + sums[idx] += sums[idx^4]; + sums[idx] += sums[idx^2]; + sums[idx] += sums[idx^1]; + +#endif +} +inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum) +{ + // Compute covariance matrix of the given colors. +#if __DEVICE_EMULATION__ + float covariance[6] = {0, 0, 0, 0, 0, 0}; + for (int i = 0; i < 16; i++) + { + float3 a = colors[i] - color_sum * (1.0f / 16.0f); + covariance[0] += a.x * a.x; + covariance[1] += a.x * a.y; + covariance[2] += a.x * a.z; + covariance[3] += a.y * a.y; + covariance[4] += a.y * a.z; + covariance[5] += a.z * a.z; + } #else - const int idx = threadIdx.x; - - __shared__ float3 colorSum[16]; - colorSum[idx] = colors[idx]; - - // Unrolled parallel reduction. - if (idx < 8) { - colorSum[idx] += colorSum[idx + 8]; - colorSum[idx] += colorSum[idx + 4]; - colorSum[idx] += colorSum[idx + 2]; - colorSum[idx] += colorSum[idx + 1]; - } - - // @@ Eliminate two-way bank conflicts here. - // @@ It seems that doing that and unrolling the reduction doesn't help... - __shared__ float covariance[16*6]; - colorSum[idx] = colors[idx] - colorSum[0] / 16.0f; - - covariance[6 * idx + 0] = colorSum[idx].x * colorSum[idx].x; // 0, 6, 12, 2, 8, 14, 4, 10, 0 - covariance[6 * idx + 1] = colorSum[idx].x * colorSum[idx].y; - covariance[6 * idx + 2] = colorSum[idx].x * colorSum[idx].z; - covariance[6 * idx + 3] = colorSum[idx].y * colorSum[idx].y; - covariance[6 * idx + 4] = colorSum[idx].y * colorSum[idx].z; - covariance[6 * idx + 5] = colorSum[idx].z * colorSum[idx].z; - - for(int d = 8; d > 0; d >>= 1) - { - if (idx < d) - { - covariance[6 * idx + 0] += covariance[6 * (idx+d) + 0]; - covariance[6 * idx + 1] += covariance[6 * (idx+d) + 1]; - covariance[6 * idx + 2] += covariance[6 * (idx+d) + 2]; - covariance[6 * idx + 3] += covariance[6 * (idx+d) + 3]; - covariance[6 * idx + 4] += covariance[6 * (idx+d) + 4]; - covariance[6 * idx + 5] += covariance[6 * (idx+d) + 5]; - } - } + const int idx = threadIdx.x; + + float3 diff = colors[idx] - color_sum * (1.0f / 16.0f); + + // @@ Eliminate two-way bank conflicts here. + // @@ It seems that doing that and unrolling the reduction doesn't help... + __shared__ float covariance[16*6]; + + covariance[6 * idx + 0] = diff.x * diff.x; // 0, 6, 12, 2, 8, 14, 4, 10, 0 + covariance[6 * idx + 1] = diff.x * diff.y; + covariance[6 * idx + 2] = diff.x * diff.z; + covariance[6 * idx + 3] = diff.y * diff.y; + covariance[6 * idx + 4] = diff.y * diff.z; + covariance[6 * idx + 5] = diff.z * diff.z; + + for(int d = 8; d > 0; d >>= 1) + { + if (idx < d) + { + covariance[6 * idx + 0] += covariance[6 * (idx+d) + 0]; + covariance[6 * idx + 1] += covariance[6 * (idx+d) + 1]; + covariance[6 * idx + 2] += covariance[6 * (idx+d) + 2]; + covariance[6 * idx + 3] += covariance[6 * (idx+d) + 3]; + covariance[6 * idx + 4] += covariance[6 * (idx+d) + 4]; + covariance[6 * idx + 5] += covariance[6 * (idx+d) + 5]; + } + } #endif - - // Compute first eigen vector. - return firstEigenVector(covariance); + + // Compute first eigen vector. + return firstEigenVector(covariance); }