// File: nvidia-texture-tools/extern/CMP_Core/shaders/BC1_Encode_kernel.cpp

//=====================================================================
// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//=====================================================================
#include "BC1_Encode_kernel.h"
//============================================== BC1 INTERFACES =======================================================
// Fast, low-quality BC1 encoder for one 4x4 block.
//
// Parameters:
//   srcBlockTemp    - the 16 source pixels; only the .x, .y and .z members are read here.
//   compressedBlock - output. Word 0 receives the two 16-bit endpoint colours
//                     (c0 in the low half, c1 in the high half); word 1 receives
//                     sixteen 2-bit selectors, pixel i occupying bits 2*i and 2*i+1.
//
// Outline: the pixels are transformed with CS_RED/CS_GREEN/CS_BLUE, averaged, and an
// axis through the average is estimated from the mean absolute deviations. Every pixel
// is projected onto that axis; the extreme projections become the two endpoints and the
// projected position of each pixel selects one of four selector values.
//
// CS_*, DCS_*, ROUND_AND_CLAMP and POS are macros defined outside this section of the
// file; remarks about them below are hedged accordingly.
void CompressBlockBC1_Fast(
CMP_Vec4uc srcBlockTemp[16],
CMP_GLOBAL CGU_UINT32 compressedBlock[2])
{
int i, k;
CMP_Vec3f rgb;
CMP_Vec3f average_rgb; // The centrepoint of the axis
CMP_Vec3f v_rgb; // The axis
CMP_Vec3f uniques[16]; // The list of unique colours
int unique_pixels; // The number of unique pixels
CGU_FLOAT unique_recip; // Reciprocal of the above for fast multiplication
int index_map[16]; // The map of source pixels to unique indices
CGU_FLOAT pos_on_axis[16]; // The distance each unique falls along the compression axis
CGU_FLOAT dist_from_axis[16]; // The distance each unique falls from the compression axis
CGU_FLOAT left = 0, right = 0, centre = 0; // The extremities and centre (average of left/right) of uniques along the compression axis
CGU_FLOAT axis_mapping_error = 0; // The total computed error in mapping pixels to the axis
int swap; // Indicator if the RGB values need swapping to generate an opaque result
// -------------------------------------------------------------------------------------
// (3) Find the array of unique pixel values and sum them to find their average position
// -------------------------------------------------------------------------------------
{
// Find the array of unique pixel values and sum them to find their average position
int current_pixel, firstdiff;
current_pixel = unique_pixels = 0;
average_rgb = 0.0f;
firstdiff = -1;
for (i = 0; i<16; i++)
{
// Search the earlier pixels for one that matches pixel i in the top 5 bits of x,
// the top 6 bits of y and the top 5 bits of z. If none matches, the loop runs to
// completion and k == i afterwards.
for (k = 0; k<i; k++)
if ((((srcBlockTemp[k].x ^ srcBlockTemp[i].x) & 0xf8) == 0) && (((srcBlockTemp[k].y ^ srcBlockTemp[i].y) & 0xfc) == 0) && (((srcBlockTemp[k].z ^ srcBlockTemp[i].z) & 0xf8) == 0))
break;
// NOTE: current_pixel is incremented on every iteration regardless of the search
// result, so index_map[i] == i for every pixel.
index_map[i] = current_pixel++;
//pixel_count[i] = 1;
CMP_Vec3f trgb;
rgb.x = (CGU_FLOAT)((srcBlockTemp[i].x) & 0xff);
rgb.y = (CGU_FLOAT)((srcBlockTemp[i].y) & 0xff);
rgb.z = (CGU_FLOAT)((srcBlockTemp[i].z) & 0xff);
// CS_RED/CS_GREEN/CS_BLUE are defined elsewhere; presumably a colour-space
// transform whose inverse is DCS_* used in step (8) - TODO confirm.
trgb.x = CS_RED(rgb.x, rgb.y, rgb.z);
trgb.y = CS_GREEN(rgb.x, rgb.y, rgb.z);
trgb.z = CS_BLUE(rgb.x, rgb.y, rgb.z);
// Every pixel is stored, duplicates included.
uniques[i] = trgb;
if (k == i)
{
unique_pixels++;
// firstdiff is assigned here but not read again in this function.
if ((i != 0) && (firstdiff < 0)) firstdiff = i;
}
average_rgb = average_rgb + trgb;
}
// NOTE: this overrides the count gathered in the loop above, so all 16 entries of
// uniques[] take part in every later step and the average is over all 16 pixels.
unique_pixels = 16;
// Compute average of the uniques
unique_recip = 1.0f / (CGU_FLOAT)unique_pixels;
average_rgb = average_rgb * unique_recip;
}
// -------------------------------------------------------------------------------------
// (4) For each component, reflect points about the average so all lie on the same side
// of the average, and compute the new average - this gives a second point that defines the axis
// To compute the sign of the axis sum the positive differences of G for each of R and B (the
// G axis is always positive in this implementation)
// -------------------------------------------------------------------------------------
// An interesting situation occurs if the G axis contains no information, in which case the RB
// axis is also compared. I am not entirely sure if this is the correct implementation - should
// the priority axis be determined by magnitude?
{
CGU_FLOAT rg_pos, bg_pos, rb_pos;
v_rgb = 0.0f;
rg_pos = bg_pos = rb_pos = 0;
for (i = 0; i < unique_pixels; i++)
{
rgb = uniques[i] - average_rgb;
// Accumulate the absolute deviation per component (component-wise on the CPU path,
// whole-vector fabs on the GPU path).
#ifndef ASPM_GPU
v_rgb.x += (CGU_FLOAT)fabs(rgb.x);
v_rgb.y += (CGU_FLOAT)fabs(rgb.y);
v_rgb.z += (CGU_FLOAT)fabs(rgb.z);
#else
v_rgb = v_rgb + fabs(rgb);
#endif
// Sum the y (and z) deviations of the points whose x deviation is positive, and the
// y deviations of the points whose z deviation is positive; the signs of these sums
// decide the signs of the x and z axis components below.
if (rgb.x > 0) { rg_pos += rgb.y; rb_pos += rgb.z; }
if (rgb.z > 0) bg_pos += rgb.y;
}
v_rgb = v_rgb*unique_recip;
if (rg_pos < 0) v_rgb.x = -v_rgb.x;
if (bg_pos < 0) v_rgb.z = -v_rgb.z;
// Both sums are exactly zero: fall back to the x/z relationship to sign the z component.
if ((rg_pos == bg_pos) && (rg_pos == 0))
if (rb_pos < 0) v_rgb.z = -v_rgb.z;
}
// -------------------------------------------------------------------------------------
// (5) Axis projection and remapping
// -------------------------------------------------------------------------------------
{
CGU_FLOAT v2_recip;
// Normalise the axis for simplicity of future calculation
v2_recip = (v_rgb.x*v_rgb.x + v_rgb.y*v_rgb.y + v_rgb.z*v_rgb.z);
// A zero-length axis (all deviations zero) is left unscaled to avoid dividing by zero.
if (v2_recip > 0)
v2_recip = 1.0f / (CGU_FLOAT)sqrt(v2_recip);
else
v2_recip = 1.0f;
v_rgb = v_rgb*v2_recip;
}
// -------------------------------------------------------------------------------------
// (6) Map the axis
// -------------------------------------------------------------------------------------
// the line joining (and extended on either side of) average and axis
// defines the axis onto which the points will be projected
// Project all the points onto the axis, calculate the distance along
// the axis from the centre of the axis (average)
// From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is
// P + ((R-P).v) / (v.v))v
// The distance along v is therefore (R-P).v / (v.v)
// (v.v) is 1 if v is a unit vector.
//
// Calculate the extremities at the same time - these need to be reasonably accurately
// represented in all cases
//
// In this first calculation, also find the error of mapping the points to the axis - this
// is our major indicator of whether or not the block has compressed well - if the points
// map well onto the axis then most of the noise introduced is high-frequency noise
{
// Sentinel starting values so that the first projected point replaces both extremities.
left = 10000.0f;
right = -10000.0f;
axis_mapping_error = 0;
for (i = 0; i < unique_pixels; i++)
{
// Compute the distance along the axis of the point of closest approach
CMP_Vec3f temp = (uniques[i] - average_rgb);
pos_on_axis[i] = (temp.x * v_rgb.x) + (temp.y * v_rgb.y) + (temp.z * v_rgb.z);
// Compute the actual point and thence the mapping error
rgb = uniques[i] - (average_rgb + (v_rgb * pos_on_axis[i]));
// Squared distance between the point and its projection on the axis.
dist_from_axis[i] = rgb.x*rgb.x + rgb.y*rgb.y + rgb.z*rgb.z;
axis_mapping_error += dist_from_axis[i];
// Work out the extremities
if (pos_on_axis[i] < left)
left = pos_on_axis[i];
if (pos_on_axis[i] > right)
right = pos_on_axis[i];
}
}
// -------------------------------------------------------------------------------------
// (7) Now we have a good axis and the basic information about how the points are mapped
// to it
// Our initial guess is to represent the endpoints accurately, by moving the average
// to the centre and recalculating the point positions along the line
// -------------------------------------------------------------------------------------
{
centre = (left + right) / 2;
average_rgb = average_rgb + (v_rgb*centre);
for (i = 0; i<unique_pixels; i++)
pos_on_axis[i] -= centre;
// After this shift left == -right (up to rounding), i.e. the range is symmetric about 0.
right -= centre;
left -= centre;
// Accumulate our final resultant error
// NOTE: axis_mapping_error is not read again in this function.
axis_mapping_error *= unique_recip * (1 / 255.0f);
}
// -------------------------------------------------------------------------------------
// (8) Calculate the high and low output colour values
// Involved in this is a rounding procedure which is undoubtedly slightly twitchy. A
// straight rounded average is not correct, as the decompressor 'unrounds' by replicating
// the top bits to the bottom.
// In order to take account of this process, we don't just apply a straight rounding correction,
// but base our rounding on the input value (a straight rounding is actually pretty good in terms of
// error measure, but creates a visual colour and/or brightness shift relative to the original image)
// The method used here is to apply a centre-biased rounding dependent on the input value, which was
// (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
// the image.
// rgb = (average_rgb + (left|right)*v_rgb);
// -------------------------------------------------------------------------------------
{
CGU_UINT32 c0, c1, t;
int rd, gd, bd;
// Endpoint at the "left" extremity of the axis.
rgb = (average_rgb + (v_rgb * left));
rd = ( CGU_INT32)DCS_RED(rgb.x, rgb.y, rgb.z);
gd = ( CGU_INT32)DCS_GREEN(rgb.x, rgb.y, rgb.z);
bd = ( CGU_INT32)DCS_BLUE(rgb.x, rgb.y, rgb.z);
// ROUND_AND_CLAMP is defined elsewhere; presumably rounds to the given bit depth and
// clamps to 0..255 - TODO confirm.
ROUND_AND_CLAMP(rd, 5);
ROUND_AND_CLAMP(gd, 6);
ROUND_AND_CLAMP(bd, 5);
// Pack the top 5 bits of rd into bits 15..11, the top 6 bits of gd into bits 10..5
// and the top 5 bits of bd into bits 4..0.
c0 = ((rd & 0xf8) << 8) + ((gd & 0xfc) << 3) + ((bd & 0xf8) >> 3);
// Endpoint at the "right" extremity of the axis, packed the same way.
rgb = average_rgb + (v_rgb * right);
rd = ( CGU_INT32)DCS_RED(rgb.x, rgb.y, rgb.z);
gd = ( CGU_INT32)DCS_GREEN(rgb.x, rgb.y, rgb.z);
bd = ( CGU_INT32)DCS_BLUE(rgb.x, rgb.y, rgb.z);
ROUND_AND_CLAMP(rd, 5);
ROUND_AND_CLAMP(gd, 6);
ROUND_AND_CLAMP(bd, 5);
c1 = (((rd & 0xf8) << 8) + ((gd & 0xfc) << 3) + ((bd & 0xf8) >> 3));
// Force to be a 4-colour opaque block - in which case, c0 is greater than c1
// blocktype == 4
{
if (c0 < c1)
{
// Exchange the endpoints so that c0 > c1, and remember that the selectors written
// in step (9) must be flipped to match (see bit^swap there).
t = c0;
c0 = c1;
c1 = t;
swap = 1;
}
else if (c0 == c1)
{
// This block will always be encoded in 3-colour mode
// Need to ensure that only one of the two points gets used,
// avoiding accidentally setting some transparent pixels into the block
for (i = 0; i<unique_pixels; i++)
pos_on_axis[i] = left;
swap = 0;
}
else
swap = 0;
}
compressedBlock[0] = c0 | (c1 << 16);
}
// -------------------------------------------------------------------------------------
// (9) Final clustering, creating the 2-bit values that define the output
// -------------------------------------------------------------------------------------
{
CGU_UINT32 bit;
CGU_FLOAT division;
// NOTE: cluster_x / cluster_y / cluster_count are filled and averaged below, but their
// values are not written to compressedBlock anywhere in this function.
CGU_FLOAT cluster_x[4];
CGU_FLOAT cluster_y[4];
int cluster_count[4];
// (blocktype == 4)
{
compressedBlock[1] = 0;
// Positions whose magnitude reaches two thirds of the right extremity select an
// endpoint; positions closer to the centre select an interpolated colour.
division = right*2.0f / 3.0f;
centre = (left + right) / 2; // Actually, this code only works if centre is 0 or approximately so
for (i = 0; i<4; i++)
{
cluster_x[i] = cluster_y[i] = 0.0f;
cluster_count[i] = 0;
}
for (i = 0; i<16; i++)
{
// rgb.z is reused here as a scratch scalar holding the pixel's position on the axis.
rgb.z = pos_on_axis[index_map[i]];
// Endpoints (indicated by block > average) are 0 and 1, while
// interpolants are 2 and 3
if (fabs(rgb.z) >= division)
bit = 0;
else
bit = 2;
// Positive is in the latter half of the block
if (rgb.z >= centre)
bit += 1;
// Set the output, taking swapping into account
// (swap == 1 toggles the low bit, exchanging selectors 0<->1 and 2<->3)
compressedBlock[1] |= ((bit^swap) << (2 * i));
// Average the X and Y locations for each cluster
// (i & 3 is the pixel's column and i >> 2 its row within the 4x4 block)
cluster_x[bit] += (CGU_FLOAT)(i & 3);
cluster_y[bit] += (CGU_FLOAT)(i >> 2);
cluster_count[bit]++;
}
for (i = 0; i<4; i++)
{
CGU_FLOAT cr;
if (cluster_count[i])
{
cr = 1.0f / cluster_count[i];
cluster_x[i] *= cr;
cluster_y[i] *= cr;
}
else
{
// Empty cluster: mark its centroid with -1.
cluster_x[i] = cluster_y[i] = -1;
}
}
// patterns in axis position detection
// (same algorithm as used in the SSE version)
// Only attempted when the two packed endpoint colours differ.
// NOTE(review): POS is a macro defined elsewhere. Neither k1 nor i1 is read after
// the loops below and nothing here writes to compressedBlock, so unless POS has side
// effects this search does not influence the encoded output - confirm.
if ((compressedBlock[0] & 0xffff) != (compressedBlock[0] >> 16))
{
CGU_UINT32 i1, k1;
CGU_UINT32 x = 0, y = 0;
int xstep = 0, ystep = 0;
// Find a corner to search from
for (k1 = 0; k1<4; k1++)
{
// Select one of the four block corners and the step directions leading away from it.
switch (k1)
{
case 0:
x = 0; y = 0; xstep = 1; ystep = 1;
break;
case 1:
x = 0; y = 3; xstep = 1; ystep = -1;
break;
case 2:
x = 3; y = 0; xstep = -1; ystep = 1;
break;
case 3:
x = 3; y = 3; xstep = -1; ystep = -1;
break;
}
// Walk the four rows and four columns from this corner; stop at the first line in
// which a POS value is smaller than its successor along the walking direction.
for (i1 = 0; i1<4; i1++)
{
if ((POS(x, y + ystep*i1) < POS(x + xstep, y + ystep*i1)) ||
(POS(x + xstep, y + ystep*i1) < POS(x + 2 * xstep, y + ystep*i1)) ||
(POS(x + 2 * xstep, y + ystep*i1) < POS(x + 3 * xstep, y + ystep*i1))
)
break;
if ((POS(x + xstep*i1, y) < POS(x + xstep*i1, y + ystep)) ||
(POS(x + xstep*i1, y + ystep) < POS(x + xstep*i1, y + 2 * ystep)) ||
(POS(x + xstep*i1, y + 2 * ystep) < POS(x + xstep*i1, y + 3 * ystep))
)
break;
}
// All four rows and columns passed the test for this corner: stop searching.
if (i1 == 4)
break;
}
}
}
}
// done
}
// Writes the two 32-bit words in data out as eight bytes, least-significant
// byte of each word first: data[0] fills u_dstptr[0..3] and data[1] fills
// u_dstptr[4..7].
INLINE void store_uint8(CMP_GLOBAL CGU_UINT8 u_dstptr[8], CGU_UINT32 data[2])
{
    for (CGU_INT byteIdx = 0; byteIdx < 8; byteIdx++)
    {
        const int word  = byteIdx >> 2;        // 0 for bytes 0-3, 1 for bytes 4-7
        const int shift = (byteIdx & 3) << 3;  // 0, 8, 16, 24 inside the word
        u_dstptr[byteIdx] = (data[word] >> shift) & 0xFF;
    }
}
void CompressBlockBC1_Internal(
const CMP_Vec4uc srcBlockTemp[16],
CMP_GLOBAL CGU_UINT32 compressedBlock[2],
CMP_GLOBAL const CMP_BC15Options *BC15options)
{
CGU_UINT8 blkindex = 0;
CGU_UINT8 srcindex = 0;
CGU_UINT8 rgbBlock[64];
for ( CGU_INT32 j = 0; j < 4; j++) {
for ( CGU_INT32 i = 0; i < 4; i++) {
rgbBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].z; // B
rgbBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].y; // G
rgbBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].x; // R
rgbBlock[blkindex++] = (CGU_UINT8)srcBlockTemp[srcindex].w; // A
srcindex++;
}
}
CMP_BC15Options internalOptions = *BC15options;
CalculateColourWeightings(rgbBlock, &internalOptions);
CompressRGBBlock(rgbBlock,
compressedBlock,
&internalOptions,
TRUE,
FALSE,
internalOptions.m_nAlphaThreshold);
}
//============================================== USER INTERFACES ========================================================
#ifndef ASPM_GPU
// Allocates a CMP_BC15Options object, fills it with the defaults from
// SetDefaultBC15Options and returns it through *options.
//
// Parameters:
//   options - receives the new object; must not be NULL.
// Returns:
//   CGU_CORE_OK              on success,
//   CGU_CORE_ERR_INVALIDPTR  if options is NULL (nothing is allocated),
//   CGU_CORE_ERR_NEWMEM      if the allocation yields a null pointer (*options is set to NULL).
// The object must be released with DestroyOptionsBC1.
int CMP_CDECL CreateOptionsBC1(void **options)
{
    // Validate the out-parameter before writing through it, as the other
    // option functions in this file do for their pointer argument.
    if (!options) return CGU_CORE_ERR_INVALIDPTR;

    CMP_BC15Options *BC15optionsDefault = new CMP_BC15Options;
    // NOTE: a standard plain `new` reports failure by throwing rather than by
    // returning null, so this branch is only reached with a non-throwing allocator.
    if (!BC15optionsDefault) {
        (*options) = NULL;
        return CGU_CORE_ERR_NEWMEM;
    }

    SetDefaultBC15Options(BC15optionsDefault);
    (*options) = BC15optionsDefault;
    return CGU_CORE_OK;
}
// Deletes an options object, treating the pointer as a CMP_BC15Options.
// Returns CGU_CORE_ERR_INVALIDPTR when options is NULL, CGU_CORE_OK otherwise.
int CMP_CDECL DestroyOptionsBC1(void *options)
{
    if (options == NULL)
        return CGU_CORE_ERR_INVALIDPTR;

    delete static_cast<CMP_BC15Options *>(options);
    return CGU_CORE_OK;
}
// Stores the quality setting in a BC1 options object.
//
// Parameters:
//   options  - options object, treated as a CMP_BC15Options; must not be NULL.
//   fquality - requested quality; values below 0 are stored as 0 and values
//              above 1 are stored as 1.
// Returns:
//   CGU_CORE_OK on success, CGU_CORE_ERR_INVALIDPTR if options is NULL.
int CMP_CDECL SetQualityBC1(void *options,
                            CGU_FLOAT fquality)
{
    // A NULL options pointer is an invalid-pointer error, not an allocation
    // failure; report it with the same code as the other setters in this file.
    if (!options) return CGU_CORE_ERR_INVALIDPTR;

    CMP_BC15Options *BC15optionsDefault = reinterpret_cast <CMP_BC15Options *>(options);

    // Clamp to the [0, 1] range.
    if (fquality < 0.0f) fquality = 0.0f;
    else if (fquality > 1.0f) fquality = 1.0f;

    BC15optionsDefault->m_fquality = fquality;
    return CGU_CORE_OK;
}
// Stores alphaThreshold in the m_nAlphaThreshold field of the options object.
// Returns CGU_CORE_ERR_INVALIDPTR when options is NULL, CGU_CORE_OK otherwise.
int CMP_CDECL SetAlphaThresholdBC1(void *options,
                                   CGU_UINT8 alphaThreshold)
{
    if (options == NULL)
        return CGU_CORE_ERR_INVALIDPTR;

    CMP_BC15Options *bcOptions = static_cast<CMP_BC15Options *>(options);
    bcOptions->m_nAlphaThreshold = alphaThreshold;
    return CGU_CORE_OK;
}
// Stores mapRGBA in the m_mapDecodeRGBA field of the options object.
// Returns CGU_CORE_ERR_INVALIDPTR when options is NULL, CGU_CORE_OK otherwise.
int CMP_CDECL SetDecodeChannelMapping(void *options,
                                      CGU_BOOL mapRGBA)
{
    if (options == NULL)
        return CGU_CORE_ERR_INVALIDPTR;

    CMP_BC15Options *bcOptions = static_cast<CMP_BC15Options *>(options);
    bcOptions->m_mapDecodeRGBA = mapRGBA;
    return CGU_CORE_OK;
}
// Enables channel weighting on the options object and stores the three
// weights in m_fChannelWeights[0..2] (red, green, blue).
//
// Returns:
//   CGU_CORE_ERR_INVALIDPTR   if options is NULL,
//   CGU_CORE_ERR_RANGERED / CGU_CORE_ERR_RANGEGREEN / CGU_CORE_ERR_RANGEBLUE
//                             if the corresponding weight is below 0 or above 1
//                             (checked in that order; nothing is stored),
//   CGU_CORE_OK               otherwise.
int CMP_CDECL SetChannelWeightsBC1(void *options,
                                   CGU_FLOAT WeightRed,
                                   CGU_FLOAT WeightGreen,
                                   CGU_FLOAT WeightBlue) {
    if (options == NULL)
        return CGU_CORE_ERR_INVALIDPTR;

    // Validate every weight before touching the options object.
    if (WeightRed < 0.0f || WeightRed > 1.0f)
        return CGU_CORE_ERR_RANGERED;
    if (WeightGreen < 0.0f || WeightGreen > 1.0f)
        return CGU_CORE_ERR_RANGEGREEN;
    if (WeightBlue < 0.0f || WeightBlue > 1.0f)
        return CGU_CORE_ERR_RANGEBLUE;

    CMP_BC15Options *bcOptions = (CMP_BC15Options *)options;
    bcOptions->m_fChannelWeights[0] = WeightRed;
    bcOptions->m_fChannelWeights[1] = WeightGreen;
    bcOptions->m_fChannelWeights[2] = WeightBlue;
    bcOptions->m_bUseChannelWeighting = true;
    return CGU_CORE_OK;
}
// Compresses one 4x4 block of 4-byte pixels into an 8-byte BC1 block.
//
// Parameters:
//   srcBlock         - first byte of the block's top-left pixel; each row holds
//                      four consecutive 4-byte pixels whose bytes are loaded
//                      into the x, y, z, w members in that order.
//   srcStrideInBytes - distance in bytes between the starts of consecutive rows.
//   cmpBlock         - receives the 8 encoded bytes.
//   options          - optional CMP_BC15Options; when NULL, defaults from
//                      SetDefaultBC15Options are used.
// Returns:
//   CGU_CORE_OK on success, CGU_CORE_ERR_INVALIDPTR if srcBlock or cmpBlock is NULL.
int CMP_CDECL CompressBlockBC1(const unsigned char *srcBlock,
                               unsigned int srcStrideInBytes,
                               CMP_GLOBAL unsigned char cmpBlock[8],
                               const void *options = NULL) {
    if (!srcBlock || !cmpBlock) return CGU_CORE_ERR_INVALIDPTR;

    //----------------------------------
    // Fill the inBlock with source data
    //----------------------------------
    CMP_Vec4uc inBlock[16];
    CGU_INT dstptr = 0;
    for (CGU_UINT8 row = 0; row < 4; row++)
    {
        // The offset is computed in unsigned arithmetic so that a large stride
        // is never squeezed through a signed int.
        const unsigned char *srcRow = srcBlock + (row * srcStrideInBytes);
        for (CGU_UINT8 col = 0; col < 4; col++)
        {
            const unsigned char *pixel = srcRow + (4 * col);
            inBlock[dstptr].x = (CGU_UINT8)pixel[0];
            inBlock[dstptr].y = (CGU_UINT8)pixel[1];
            inBlock[dstptr].z = (CGU_UINT8)pixel[2];
            inBlock[dstptr].w = (CGU_UINT8)pixel[3];
            dstptr++;
        }
    }

    // The caller's options are only read, so keep them const.
    const CMP_BC15Options *BC15options = (const CMP_BC15Options *)options;
    CMP_BC15Options BC15optionsDefault;
    if (BC15options == NULL)
    {
        SetDefaultBC15Options(&BC15optionsDefault);
        BC15options = &BC15optionsDefault;
    }

    // Encode into a properly aligned local buffer instead of writing 32-bit
    // words straight through the caller's byte pointer, which need not be
    // aligned for CGU_UINT32. The bytes are then copied out in native order,
    // giving the same layout a direct 32-bit store would have produced.
    CGU_UINT32 encoded[2] = { 0, 0 };
    CompressBlockBC1_Internal(inBlock, encoded, BC15options);

    const unsigned char *encodedBytes = (const unsigned char *)encoded;
    for (CGU_INT b = 0; b < 8; b++)
        cmpBlock[b] = encodedBytes[b];

    return CGU_CORE_OK;
}
// Decompresses one 8-byte BC1 block into a 64-byte pixel buffer through
// DecompressDXTRGB_Internal.
//
// Parameters:
//   cmpBlock - the 8 encoded bytes.
//   srcBlock - receives the decoded block (64 bytes).
//   options  - optional CMP_BC15Options; when NULL, defaults from
//              SetDefaultBC15Options are used.
// Returns:
//   CGU_CORE_OK on success, CGU_CORE_ERR_INVALIDPTR if cmpBlock or srcBlock is NULL.
int CMP_CDECL DecompressBlockBC1(const unsigned char cmpBlock[8],
                                 CMP_GLOBAL unsigned char srcBlock[64],
                                 const void *options = NULL) {
    if (!cmpBlock || !srcBlock) return CGU_CORE_ERR_INVALIDPTR;

    CMP_BC15Options *BC15options = (CMP_BC15Options *)options;
    CMP_BC15Options BC15optionsDefault;
    if (BC15options == NULL)
    {
        BC15options = &BC15optionsDefault;
        SetDefaultBC15Options(BC15options);
    }

    // Copy the encoded bytes, in native order, into an aligned local buffer
    // rather than reading 32-bit words through the caller's const byte
    // pointer, which need not be aligned for CGU_UINT32 and would otherwise
    // have its const qualifier cast away.
    CGU_UINT32 encoded[2];
    unsigned char *encodedBytes = (unsigned char *)encoded;
    for (CGU_INT b = 0; b < 8; b++)
        encodedBytes[b] = cmpBlock[b];

    DecompressDXTRGB_Internal(srcBlock, encoded, BC15options);
    return CGU_CORE_OK;
}
#endif
//============================================== OpenCL USER INTERFACE ========================================================
#ifdef ASPM_GPU
// Kernel entry point: each work item encodes the 4x4 block whose block
// coordinates are (get_global_id(0), get_global_id(1)).
//
// ImageSource is indexed in whole pixels, row by row with a row length of
// SourceInfo->m_src_width. ImageDestination is indexed in bytes, with
// BC1CompBlockSize bytes per block and (m_src_width / BlockX) blocks per row.
// Work items whose block coordinates fall outside the image return without
// writing anything.
CMP_STATIC CMP_KERNEL void CMP_GPUEncoder(
    CMP_GLOBAL const CMP_Vec4uc* ImageSource,
    CMP_GLOBAL CGU_UINT8* ImageDestination,
    CMP_GLOBAL Source_Info* SourceInfo,
    CMP_GLOBAL CMP_BC15Options* BC15options
)
{
    CGU_UINT32 xID = 0;
    CGU_UINT32 yID = 0;
#ifdef ASPM_GPU
    xID = get_global_id(0);
    yID = get_global_id(1);
#endif

    // Reject work items beyond the last full block in either direction.
    if ((xID >= (SourceInfo->m_src_width / BlockX)) ||
        (yID >= (SourceInfo->m_src_height / BlockX)))
        return;

    int srcWidth = SourceInfo->m_src_width;

    // Byte offset of this block in the destination buffer.
    CGU_UINT32 destI = (xID*BC1CompBlockSize) + (yID*(srcWidth / BlockX)*BC1CompBlockSize);

    // Pixel index of the block's top-left pixel; advanced by one image row per pass.
    int rowStart = 4 * (yID * srcWidth + xID);

    CMP_Vec4uc srcData[16];
    for (CGU_INT32 row = 0; row < 4; row++) {
        for (CGU_INT32 col = 0; col < 4; col++) {
            srcData[row * 4 + col] = ImageSource[rowStart + col];
        }
        rowStart += srcWidth;
    }

    CMP_GLOBAL CGU_UINT32 *blockOut = (CMP_GLOBAL CGU_UINT32 *)&ImageDestination[destI];

    // fast low quality mode that matches v3.1 code
    if (SourceInfo->m_fquality <= 0.04f)
        CompressBlockBC1_Fast(srcData, blockOut);
    else
        CompressBlockBC1_Internal(srcData, blockOut, BC15options);
}
#endif