Add support for alpha modes in the CUDA compressors.
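
The DXT3 and DXT5 paths previously always ran the alpha-weighted color kernel. With this change the host code checks the compressor's alpha mode: blocks compressed with AlphaMode_Transparency keep the weighted kernel, while everything else goes through a new uniform-weight four-color search (compressKernelDXT1_Level4 -> compressLevel4DXT1 -> evalLevel4Permutations). Below is a minimal sketch of the dispatch this commit introduces; the wrapper function and the simplified AlphaMode enum are illustrative, while the two extern "C" entry points are the ones declared in the diff:

    typedef unsigned int uint;

    extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
    extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);

    enum AlphaMode { AlphaMode_None, AlphaMode_Transparency }; // simplified stand-in

    // Illustrative wrapper mirroring the dispatch added to compressDXT3/compressDXT5.
    void launchColorKernel(AlphaMode alphaMode, uint count,
                           uint * d_data, uint * d_result, uint * d_bitmaps)
    {
        if (alphaMode == AlphaMode_Transparency)
        {
            // Alpha matters: use the weighted error metric (per the kernel's name).
            compressWeightedKernelDXT1(count, d_data, d_result, d_bitmaps);
        }
        else
        {
            // Alpha ignored: uniform-weight four-color permutation search.
            compressKernelDXT1_Level4(count, d_data, d_result, d_bitmaps);
        }
    }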
@@ -822,6 +822,40 @@ __device__ void evalAllPermutations(const float2 * colors, float2 colorSum, cons
     errors[idx] = bestError;
 }
 
+__device__ void evalLevel4Permutations(const float3 * colors, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
+{
+    const int idx = threadIdx.x;
+
+    float bestError = FLT_MAX;
+
+    for(int i = 0; i < 16; i++)
+    {
+        int pidx = idx + NUM_THREADS * i;
+        if (pidx >= 992) break;
+
+        ushort start, end;
+        uint permutation = permutations[pidx];
+
+        float error = evalPermutation4(colors, colorSum, permutation, &start, &end);
+
+        if (error < bestError)
+        {
+            bestError = error;
+            bestPermutation = permutation;
+            bestStart = start;
+            bestEnd = end;
+        }
+    }
+
+    if (bestStart < bestEnd)
+    {
+        swap(bestEnd, bestStart);
+        bestPermutation ^= 0x55555555; // Flip indices.
+    }
+
+    errors[idx] = bestError;
+}
+
 __device__ void evalLevel4Permutations(const float3 * colors, const float * weights, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
 {
     const int idx = threadIdx.x;
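
The new evalLevel4Permutations overload mirrors the weighted variant but drops the per-texel weights. Two details worth noting: each thread strides through the 992 candidate permutations in steps of NUM_THREADS (assuming NUM_THREADS is 64, which this hunk does not show, the 16 iterations span 16 * 64 = 1024 slots and the pidx >= 992 break trims the tail), and bestPermutation ^= 0x55555555 XORs every 2-bit index with binary 01, swapping indices 0<->1 and 2<->3 — exactly the remapping needed when the endpoints are exchanged so the block still decodes in DXT1's four-color mode. A standalone sketch of that partition, under the assumed NUM_THREADS value:

    typedef unsigned int uint;

    #define NUM_THREADS 64 // assumed here; defined elsewhere in the real file

    __device__ void visitCandidates(const uint * permutations)
    {
        const int idx = threadIdx.x;

        for (int i = 0; i < 16; i++)
        {
            int pidx = idx + NUM_THREADS * i; // thread idx visits idx, idx+64, idx+128, ...
            if (pidx >= 992) break;           // only 992 candidate permutations exist

            uint permutation = permutations[pidx];
            // ... evaluate 'permutation' and keep the per-thread best, as above ...
            (void)permutation;
        }
    }

    // XOR with 0x55555555 flips each 2-bit index: 00<->01 and 10<->11,
    // matching the palette remapping required when the two endpoints swap.
    __device__ uint flipIndices(uint permutation)
    {
        return permutation ^ 0x55555555;
    }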
@@ -1101,6 +1135,39 @@ __global__ void compressDXT1(const uint * permutations, const uint * image, uint
     }
 }
 
+__global__ void compressLevel4DXT1(const uint * permutations, const uint * image, uint2 * result)
+{
+    __shared__ float3 colors[16];
+    __shared__ float3 sums[16];
+    __shared__ int xrefs[16];
+    __shared__ int sameColor;
+
+    loadColorBlock(image, colors, sums, xrefs, &sameColor);
+
+    __syncthreads();
+
+    if (sameColor)
+    {
+        if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
+        return;
+    }
+
+    ushort bestStart, bestEnd;
+    uint bestPermutation;
+
+    __shared__ float errors[NUM_THREADS];
+
+    evalLevel4Permutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
+
+    // Use a parallel reduction to find minimum error.
+    const int minIdx = findMinError(errors);
+
+    // Only write the result of the winner thread.
+    if (threadIdx.x == minIdx)
+    {
+        saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result);
+    }
+}
+
 __global__ void compressWeightedDXT1(const uint * permutations, const uint * image, uint2 * result)
 {
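
compressLevel4DXT1 follows the same shape as the existing compressDXT1 kernel: load and canonicalize the block into shared memory, take an early out for single-color blocks (only thread 0 writes), run the per-thread permutation search, then reduce errors[] to pick the winning thread. findMinError itself is outside this diff; a plausible shared-memory argmin reduction, assuming NUM_THREADS is a power of two, could look like this hypothetical sketch:

    #define NUM_THREADS 64 // assumed; must be a power of two for this sketch

    // Hypothetical sketch in the spirit of findMinError(); the real
    // implementation lives elsewhere in the file and is not part of this diff.
    __device__ int findMinErrorSketch(float * errors /* shared, NUM_THREADS wide */)
    {
        __shared__ int indices[NUM_THREADS];
        const int idx = threadIdx.x;
        indices[idx] = idx;
        __syncthreads();

        // Pairwise tournament: halve the active range each step,
        // keeping the index of the smaller error.
        for (int d = NUM_THREADS / 2; d > 0; d >>= 1)
        {
            if (idx < d && errors[indices[idx + d]] < errors[indices[idx]])
            {
                indices[idx] = indices[idx + d];
            }
            __syncthreads();
        }

        // indices[0] now holds the thread id with the smallest error;
        // every thread returns the same winner.
        return indices[0];
    }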
@@ -1412,6 +1479,11 @@ extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result
     compressDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
 }
 
+extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
+{
+    compressLevel4DXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
+}
+
 extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
 {
     compressWeightedDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
@@ -53,6 +53,7 @@ using namespace nvtt;
 
 extern "C" void setupCompressKernel(const float weights[3]);
 extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
+extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
 extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
 extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
 extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
@@ -330,7 +331,14 @@ void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressio
         cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
 
         // Launch kernel.
-        compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
+        if (m_alphaMode == AlphaMode_Transparency)
+        {
+            compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
+        }
+        else
+        {
+            compressKernelDXT1_Level4(count, m_data, m_result, m_bitmapTable);
+        }
 
         // Compress alpha in parallel with the GPU.
         for (uint i = 0; i < count; i++)
@@ -414,7 +422,14 @@ void CudaCompressor::compressDXT5(const CompressionOptions::Private & compressio
         cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
 
         // Launch kernel.
-        compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
+        if (m_alphaMode == AlphaMode_Transparency)
+        {
+            compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
+        }
+        else
+        {
+            compressKernelDXT1_Level4(count, m_data, m_result, m_bitmapTable);
+        }
 
         // Compress alpha in parallel with the GPU.
         for (uint i = 0; i < count; i++)
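
Both compressDXT3 and compressDXT5 receive the identical dispatch, and the alpha channel is still compressed on the CPU while the color kernel runs, as the retained "Compress alpha in parallel with the GPU" comment indicates. A simplified sketch of that overlap; the CPU alpha helper and buffer parameters are hypothetical, and the synchronization relies on default-stream ordering:

    #include <cuda_runtime.h>

    typedef unsigned int uint;

    extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);

    void compressAlphaBlockOnCPU(uint blockIndex); // hypothetical CPU-side alpha encoder

    void compressBatch(uint count, uint * d_data, uint * d_result, uint * d_bitmaps,
                       void * h_result, size_t resultBytes)
    {
        // Kernel launches are asynchronous: this returns as soon as the
        // launch is queued, leaving the CPU free to do other work.
        compressKernelDXT1_Level4(count, d_data, d_result, d_bitmaps);

        // Compress alpha blocks on the CPU while the GPU handles color.
        for (uint i = 0; i < count; i++)
        {
            compressAlphaBlockOnCPU(i);
        }

        // cudaMemcpy on the default stream waits for the kernel to finish
        // before copying the color results back.
        cudaMemcpy(h_result, d_result, resultBytes, cudaMemcpyDeviceToHost);
    }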