You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
5490 lines
193 KiB
C++
5490 lines
193 KiB
C++
//=====================================================================
|
|
// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files(the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions :
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
// THE SOFTWARE.
|
|
//
|
|
//=====================================================================
|
|
// Ref: GPUOpen-Tools/Compressonator
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// Copyright (c) 2016, Intel Corporation
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
|
|
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
|
|
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
|
|
// permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of
|
|
// the Software.
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
|
|
// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
//--------------------------------------
|
|
// Common BC7 Header
|
|
//--------------------------------------
|
|
#include "BC7_Encode_Kernel.h"
|
|
|
|
#ifndef ASPM
|
|
//---------------------------------------------
|
|
// Predefinitions for GPU and CPU compiled code
|
|
//---------------------------------------------
|
|
#define ENABLE_CODE
|
|
|
|
#ifndef ASPM_GPU
|
|
// using code for CPU or hybrid (CPU/GPU)
|
|
//#include "BC7.h"
|
|
#endif
|
|
|
|
|
|
INLINE CGU_INT a_compare( const void *arg1, const void *arg2 )
|
|
{
|
|
if (((CMP_di* )arg1)->image-((CMP_di* )arg2)->image > 0 ) return 1;
|
|
if (((CMP_di* )arg1)->image-((CMP_di* )arg2)->image < 0 ) return -1;
|
|
return 0;
|
|
};
|
|
|
|
#endif
|
|
|
|
#ifndef ASPM_GPU
|
|
CMP_GLOBAL BC7_EncodeRamps BC7EncodeRamps
|
|
#ifndef ASPM
|
|
= {0}
|
|
#endif
|
|
;
|
|
|
|
//---------------------------------------------
|
|
// CPU: Computes max of two float values
|
|
//---------------------------------------------
|
|
float bc7_maxf(float l1, float r1)
|
|
{
|
|
return (l1 > r1 ? l1 : r1);
|
|
}
|
|
|
|
//---------------------------------------------
|
|
// CPU: Computes max of two float values
|
|
//---------------------------------------------
|
|
float bc7_minf(float l1, float r1)
|
|
{
|
|
return (l1 < r1 ? l1 : r1);
|
|
}
|
|
|
|
#endif
|
|
|
|
INLINE CGV_EPOCODE shift_right_epocode(CGV_EPOCODE v, CGU_INT bits)
|
|
{
|
|
return v>>bits; // (perf warning expected)
|
|
}
|
|
|
|
INLINE CGV_EPOCODE expand_epocode(CGV_EPOCODE v, CGU_INT bits)
|
|
{
|
|
CGV_EPOCODE vv = v<<(8-bits);
|
|
return vv + shift_right_epocode(vv, bits);
|
|
}
|
|
|
|
// valid bit range is 0..8
|
|
CGU_INT expandbits(CGU_INT bits, CGU_INT v)
|
|
{
|
|
return ( v << (8-bits) | v >> (2* bits - 8));
|
|
}
|
|
|
|
CMP_EXPORT CGU_INT bc7_isa() {
|
|
#if defined(ISPC_TARGET_SSE2)
|
|
ASPM_PRINT(("SSE2"));
|
|
return 0;
|
|
#elif defined(ISPC_TARGET_SSE4)
|
|
ASPM_PRINT(("SSE4"));
|
|
return 1;
|
|
#elif defined(ISPC_TARGET_AVX)
|
|
ASPM_PRINT(("AVX"));
|
|
return 2;
|
|
#elif defined(ISPC_TARGET_AVX2)
|
|
ASPM_PRINT(("AVX2"));
|
|
return 3;
|
|
#else
|
|
ASPM_PRINT(("CPU"));
|
|
return -1;
|
|
#endif
|
|
}
|
|
|
|
CMP_EXPORT void init_BC7ramps()
|
|
{
|
|
#ifdef ASPM_GPU
|
|
#else
|
|
CMP_STATIC CGU_BOOL g_rampsInitialized = FALSE;
|
|
if (g_rampsInitialized == TRUE) return;
|
|
g_rampsInitialized = TRUE;
|
|
BC7EncodeRamps.ramp_init = TRUE;
|
|
|
|
//bc7_isa(); ASPM_PRINT((" INIT Ramps\n"));
|
|
|
|
CGU_INT bits;
|
|
CGU_INT p1;
|
|
CGU_INT p2;
|
|
CGU_INT clogBC7;
|
|
CGU_INT index;
|
|
CGU_INT j;
|
|
CGU_INT o1;
|
|
CGU_INT o2;
|
|
CGU_INT maxi = 0;
|
|
|
|
|
|
for (bits = BIT_BASE; bits<BIT_RANGE; bits++)
|
|
{
|
|
for (p1 = 0; p1<(1 << bits); p1++)
|
|
{
|
|
BC7EncodeRamps.ep_d[BTT(bits)][p1] = expandbits(bits, p1);
|
|
} //p1
|
|
}//bits<BIT_RANGE
|
|
|
|
for (clogBC7 = LOG_CL_BASE; clogBC7<LOG_CL_RANGE; clogBC7++)
|
|
{
|
|
for (bits = BIT_BASE; bits<BIT_RANGE; bits++)
|
|
{
|
|
|
|
#ifdef USE_BC7_RAMP
|
|
for (p1 = 0; p1<(1 << bits); p1++)
|
|
{
|
|
for (p2 = 0; p2<(1 << bits); p2++)
|
|
{
|
|
for (index = 0; index<(1 << clogBC7); index++)
|
|
{
|
|
if (index > maxi) maxi = index;
|
|
BC7EncodeRamps.ramp[(CLT(clogBC7)*4*256*256*16)+(BTT(bits)*256*256*16)+(p1*256*16)+(p2*16)+index] =
|
|
//floor((CGV_IMAGE)BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] * (CGV_IMAGE)((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1]))+ 0.5F);
|
|
floor(BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] *((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1]))+ 0.5F);
|
|
}//index<(1 << clogBC7)
|
|
}//p2<(1 << bits)
|
|
}//p1<(1 << bits)
|
|
#endif
|
|
|
|
#ifdef USE_BC7_SP_ERR_IDX
|
|
for (j = 0; j<256; j++)
|
|
{
|
|
for (o1 = 0; o1<2; o1++)
|
|
{
|
|
for (o2 = 0; o2<2; o2++)
|
|
{
|
|
for (index = 0; index<16; index++) {
|
|
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] = 0;
|
|
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1] = 255;
|
|
BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(j*2*2*16)+(o1*2*16)+(o2*16)+index] = 255;
|
|
} // i<16
|
|
}//o2<2;
|
|
}//o1<2
|
|
} //j<256
|
|
|
|
for (p1 = 0; p1<(1 << bits); p1++)
|
|
{
|
|
for (p2 = 0; p2<(1 << bits); p2++)
|
|
{
|
|
for (index = 0; index<(1 << clogBC7); index++)
|
|
{
|
|
#ifdef USE_BC7_RAMP
|
|
CGV_EPOCODE floatf = (CGV_EPOCODE)BC7EncodeRamps.ramp[(CLT(clogBC7)*4*256*256*16)+(BTT(bits)*256*256*16)+(p1*256*16)+(p2*16)+index];
|
|
#else
|
|
CGV_EPOCODE floatf = floor((CGV_IMAGE)BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] * (CGV_IMAGE)((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1]))+ 0.5F);
|
|
#endif
|
|
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(floatf*2*2*16*2)+((p1 & 0x1)*2*16*2)+((p2 & 0x1)*16*2)+(index*2)+0] = p1;
|
|
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(floatf*2*2*16*2)+((p1 & 0x1)*2*16*2)+((p2 & 0x1)*16*2)+(index*2)+1] = p2;
|
|
BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(floatf*2*2*16)+((p1 & 0x1)*2*16)+(p2 & 0x1*16)+index] = 0;
|
|
} //i<(1 << clogBC7)
|
|
} //p2
|
|
}//p1<(1 << bits)
|
|
|
|
for (j = 0; j<256; j++)
|
|
{
|
|
for (o1 = 0; o1<2; o1++)
|
|
{
|
|
for (o2 = 0; o2<2; o2++)
|
|
{
|
|
for (index = 0; index<(1 << clogBC7); index++)
|
|
{
|
|
if ( // check for unitialized sp_idx
|
|
(BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] == 0) &&
|
|
(BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1] == 255)
|
|
)
|
|
|
|
{
|
|
CGU_INT k;
|
|
CGU_INT tf;
|
|
CGU_INT tc;
|
|
|
|
for (k = 1; k<256; k++)
|
|
{
|
|
tf = j - k;
|
|
tc = j + k;
|
|
if ((tf >= 0 && BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(tf*2*2*16)+(o1*2*16)+(o2*16)+index] == 0))
|
|
{
|
|
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] = BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(tf*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0];
|
|
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1] = BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(tf*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1];
|
|
break;
|
|
}
|
|
else if ((tc < 256 && BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(tc*2*2*16)+(o1*2*16)+(o2*16)+index] == 0))
|
|
{
|
|
BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] = BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(tc*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0];
|
|
break;
|
|
}
|
|
}
|
|
|
|
//BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(j*2*2*16)+(o1*2*16)+(o2*16)+index] = (CGV_ERROR) k;
|
|
BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(j*2*2*16)+(o1*2*16)+(o2*16)+index] = (CGU_UINT8)k;
|
|
|
|
} //sp_idx < 0
|
|
}//i<(1 << clogBC7)
|
|
}//o2
|
|
}//o1
|
|
}//j
|
|
#endif
|
|
|
|
} //bits<BIT_RANGE
|
|
} //clogBC7<LOG_CL_RANGE
|
|
#endif
|
|
}
|
|
|
|
//----------------------------------------------------------
|
|
//====== Common BC7 ASPM Code used for SPMD (CPU/GPU) ======
|
|
//----------------------------------------------------------
|
|
#ifndef ASPM_GPU
|
|
//#define USE_ICMP
|
|
#endif
|
|
|
|
#define SOURCE_BLOCK_SIZE 16 // Size of a source block in pixels (each pixel has RGBA:8888 channels)
|
|
#define COMPRESSED_BLOCK_SIZE 16 // Size of a compressed block in bytes
|
|
#define MAX_CHANNELS 4
|
|
#define MAX_SUBSETS 3 // Maximum number of possible subsets
|
|
#define MAX_SUBSET_SIZE 16 // Largest possible size for an individual subset
|
|
|
|
#ifndef ASPM_GPU
|
|
extern "C" CGU_INT timerStart(CGU_INT id);
|
|
extern "C" CGU_INT timerEnd(CGU_INT id);
|
|
|
|
#define TIMERSTART(x) timerStart(x)
|
|
#define TIMEREND(x) timerEnd(x)
|
|
#else
|
|
#define TIMERSTART(x)
|
|
#define TIMEREND(x)
|
|
#endif
|
|
|
|
#ifdef ASPM_GPU
|
|
#define GATHER_UINT8(x,y) x[y]
|
|
#else
|
|
#define GATHER_UINT8(x,y) gather_uint8(x,y)
|
|
#endif
|
|
// INLINE CGV_BYTE gather_uint8 (CMP_CONSTANT CGU_UINT8 * __constant uniform ptr, CGV_INT idx)
|
|
// {
|
|
// return ptr[idx]; // (perf warning expected)
|
|
// }
|
|
//
|
|
// INLINE CGV_CMPOUT gather_cmpout(CMP_CONSTANT CGV_CMPOUT * __constant uniform ptr, CGU_INT idx)
|
|
// {
|
|
// return ptr[idx]; // (perf warning expected)
|
|
// }
|
|
//
|
|
//INLINE CGV_INDEX gather_index(CMP_CONSTANT varying CGV_INDEX* __constant uniform ptr, CGV_INT idx)
|
|
//{
|
|
// return ptr[idx]; // (perf warning expected)
|
|
//}
|
|
//
|
|
//INLINE void scatter_index(CGV_INDEX* ptr, CGV_INT idx, CGV_INDEX value)
|
|
//{
|
|
// ptr[idx] = value; // (perf warning expected)
|
|
//}
|
|
//
|
|
|
|
#ifdef USE_VARYING
|
|
INLINE CGV_EPOCODE gather_epocode(CMP_CONSTANT CGV_EPOCODE* ptr, CGV_TYPEINT idx)
|
|
{
|
|
return ptr[idx]; // (perf warning expected)
|
|
}
|
|
#endif
|
|
|
|
INLINE CGV_SHIFT32 gather_partid(CMP_CONSTANT CGV_SHIFT32 * uniform ptr, CGV_PARTID idx)
|
|
{
|
|
return ptr[idx]; // (perf warning expected)
|
|
}
|
|
|
|
//INLINE CGV_BYTE gather_vuint8(CMP_CONSTANT varying CGV_BYTE* __constant uniform ptr, CGV_INT idx)
|
|
//{
|
|
// return ptr[idx]; // (perf warning expected)
|
|
//}
|
|
|
|
INLINE void cmp_swap_epo(CGV_EPOCODE u[], CGV_EPOCODE v[], CGV_EPOCODE n)
|
|
{
|
|
for (CGU_INT i=0; i<n; i++)
|
|
{
|
|
CGV_EPOCODE t = u[i];
|
|
u[i] = v[i];
|
|
v[i] = t;
|
|
}
|
|
}
|
|
|
|
INLINE void cmp_swap_index(CGV_INDEX u[], CGV_INDEX v[], CGU_INT n)
|
|
{
|
|
for (CGU_INT i=0; i<n; i++)
|
|
{
|
|
CGV_INDEX t = u[i];
|
|
u[i] = v[i];
|
|
v[i] = t;
|
|
}
|
|
}
|
|
|
|
void cmp_memsetBC7(CGV_BYTE ptr[], CGV_BYTE value, CGU_UINT32 size)
|
|
{
|
|
for (CGV_SHIFT32 i=0; i<size; i++)
|
|
{
|
|
ptr[i] = value;
|
|
}
|
|
}
|
|
|
|
void cmp_memcpy(CMP_GLOBAL CGU_UINT8 dst[],CGU_UINT8 src[],CGU_UINT32 size)
|
|
{
|
|
#ifdef ASPM_GPU
|
|
for (CGV_INT i=0; i<size; i++)
|
|
{
|
|
dst[i] = src[i];
|
|
}
|
|
#else
|
|
memcpy(dst,src,size);
|
|
#endif
|
|
}
|
|
|
|
INLINE CGV_IMAGE sq_image(CGV_IMAGE v)
|
|
{
|
|
return v*v;
|
|
}
|
|
|
|
INLINE CGV_EPOCODE clampEPO(CGV_EPOCODE v, CGV_EPOCODE a, CGV_EPOCODE b)
|
|
{
|
|
if (v < a)
|
|
return a;
|
|
else
|
|
if (v > b)
|
|
return b;
|
|
return v;
|
|
}
|
|
|
|
INLINE CGV_INDEX clampIndex(CGV_INDEX v, CGV_INDEX a, CGV_INDEX b)
|
|
{
|
|
if (v < a)
|
|
return a;
|
|
else
|
|
if (v > b)
|
|
return b;
|
|
return v;
|
|
}
|
|
|
|
INLINE CGV_SHIFT32 shift_right_uint32(CGV_SHIFT32 v, CGU_INT bits)
|
|
{
|
|
return v>>bits; // (perf warning expected)
|
|
}
|
|
|
|
INLINE CGV_BYTE shift_right_uint8(CGV_BYTE v, CGU_UINT8 bits)
|
|
{
|
|
return v>>bits; // (perf warning expected)
|
|
}
|
|
|
|
INLINE CGV_BYTE shift_right_uint8V(CGV_BYTE v, CGV_UINT8 bits)
|
|
{
|
|
return v>>bits; // (perf warning expected)
|
|
}
|
|
|
|
// valid bit range is 0..8
|
|
INLINE CGV_EPOCODE expandEPObits(CGV_EPOCODE v, uniform CGV_EPOCODE bits)
|
|
{
|
|
CGV_EPOCODE vv = v<<(8-bits);
|
|
return vv + shift_right_uint32(vv, bits);
|
|
}
|
|
|
|
CGV_ERROR err_absf(CGV_ERROR a) { return a>0.0F?a:-a;}
|
|
CGV_IMAGE img_absf(CGV_IMAGE a) { return a>0.0F?a:-a;}
|
|
|
|
CGU_UINT8 min8(CGU_UINT8 a, CGU_UINT8 b) { return a<b?a:b;}
|
|
CGU_UINT8 max8(CGU_UINT8 a, CGU_UINT8 b) { return a>b?a:b;}
|
|
|
|
void pack_index(CGV_INDEXPACKED packed_index[2], CGV_INDEX src_index[MAX_SUBSET_SIZE])
|
|
{
|
|
// Converts from unpacked index to packed index
|
|
packed_index[0] = 0x0000;
|
|
packed_index[1] = 0x0000;
|
|
CGV_BYTE shift = 0; // was CGV_UINT8
|
|
for (CGU_INT k=0; k<16; k++)
|
|
{
|
|
packed_index[k/8] |= (CGV_UINT32)(src_index[k]&0x0F) << shift;
|
|
shift +=4;
|
|
}
|
|
}
|
|
|
|
void unpack_index(CGV_INDEX unpacked_index[MAX_SUBSET_SIZE],CGV_INDEXPACKED src_packed[2])
|
|
{
|
|
// Converts from packed index to unpacked index
|
|
CGV_BYTE shift = 0; // was CGV_UINT8
|
|
for (CGV_BYTE k=0; k<16; k++)
|
|
{
|
|
unpacked_index[k] = (CGV_BYTE)(src_packed[k/8] >> shift)&0xF;
|
|
if (k == 7)
|
|
shift = 0;
|
|
else
|
|
shift +=4;
|
|
}
|
|
}
|
|
|
|
//====================================== CMP MATH UTILS ============================================
|
|
CGV_ERROR err_Total(
|
|
CGV_IMAGE image_src1[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_IMAGE image_src2[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_ENTRIES numEntries, // < 16
|
|
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
|
|
{
|
|
CGV_ERROR err_t=0.0F;
|
|
for (CGU_CHANNEL ch=0;ch<channels3or4; ch++)
|
|
for (CGV_ENTRIES k=0;k<numEntries;k++)
|
|
{
|
|
err_t = err_t + sq_image(image_src1[k+ch*SOURCE_BLOCK_SIZE]-image_src2[k+ch*SOURCE_BLOCK_SIZE]);
|
|
}
|
|
return err_t;
|
|
};
|
|
|
|
void GetImageCentered(
|
|
CGV_IMAGE image_centered_out[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_IMAGE mean_out[MAX_CHANNELS],
|
|
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_ENTRIES numEntries, // < 16
|
|
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
|
|
{
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
mean_out[ch]=0.0F;
|
|
if (numEntries > 0)
|
|
{
|
|
for (CGV_ENTRIES k=0;k<numEntries;k++)
|
|
{
|
|
mean_out[ch] = mean_out[ch] + image_src[k+(ch*SOURCE_BLOCK_SIZE)];
|
|
}
|
|
mean_out[ch] /= numEntries;
|
|
for (CGV_ENTRIES k=0;k<numEntries;k++)
|
|
image_centered_out[k+(ch*SOURCE_BLOCK_SIZE)] = image_src[k+(ch*SOURCE_BLOCK_SIZE)] - mean_out[ch];
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
void GetCovarianceVector(
|
|
CGV_IMAGE covariance_out[MAX_CHANNELS*MAX_CHANNELS], // OUT: Covariance vector
|
|
CGV_IMAGE image_centered[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_ENTRIES numEntries, // < 16
|
|
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
|
|
{
|
|
for (CGU_CHANNEL ch1=0; ch1<channels3or4; ch1++)
|
|
for (CGU_CHANNEL ch2=0;ch2<=ch1;ch2++)
|
|
{
|
|
covariance_out[ch1+ch2*4]=0;
|
|
for (CGV_ENTRIES k=0;k<numEntries;k++)
|
|
covariance_out[ch1+ch2*4] += image_centered[k+(ch1*SOURCE_BLOCK_SIZE)]*image_centered[k+(ch2*SOURCE_BLOCK_SIZE)];
|
|
}
|
|
|
|
for (CGU_CHANNEL ch1=0; ch1<channels3or4; ch1++)
|
|
for (CGU_CHANNEL ch2=ch1+1;ch2<channels3or4;ch2++)
|
|
covariance_out[ch1+ch2*4] = covariance_out[ch2+ch1*4];
|
|
}
|
|
|
|
void GetProjecedImage(
|
|
CGV_IMAGE projection_out[SOURCE_BLOCK_SIZE], //output projected data
|
|
CGV_IMAGE image_centered[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_ENTRIES numEntries, // < 16
|
|
CGV_IMAGE EigenVector[MAX_CHANNELS],
|
|
CGU_CHANNEL channels3or4) // 3 = RGB or 4 = RGBA
|
|
{
|
|
projection_out[0] = 0.0F;
|
|
|
|
// EigenVector must be normalized
|
|
for (CGV_ENTRIES k=0; k<numEntries; k++)
|
|
{
|
|
projection_out[k]=0.0F;
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
projection_out[k] = projection_out[k] + (image_centered[k+(ch*SOURCE_BLOCK_SIZE)]*EigenVector[ch]);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
INLINE CGV_UINT8 get_partition_subset(CGV_INT part_id, CGU_INT maxSubsets, CGV_INT index)
|
|
{
|
|
CMP_STATIC uniform CMP_CONSTANT CGU_UINT32 subset_mask_table[] = {
|
|
// 2 subset region patterns
|
|
0x0000CCCCu, // 0 1100 1100 1100 1100 (MSB..LSB)
|
|
0x00008888u, // 1 1000 1000 1000 1000
|
|
0x0000EEEEu, // 2 1110 1110 1110 1110
|
|
0x0000ECC8u, // 3 1110 1100 1100 1000
|
|
0x0000C880u, // 4 1100 1000 1000 0000
|
|
0x0000FEECu, // 5 1111 1110 1110 1100
|
|
0x0000FEC8u, // 6 1111 1110 1100 1000
|
|
0x0000EC80u, // 7 1110 1100 1000 0000
|
|
0x0000C800u, // 8 1100 1000 0000 0000
|
|
0x0000FFECu, // 9 1111 1111 1110 1100
|
|
0x0000FE80u, // 10 1111 1110 1000 0000
|
|
0x0000E800u, // 11 1110 1000 0000 0000
|
|
0x0000FFE8u, // 12 1111 1111 1110 1000
|
|
0x0000FF00u, // 13 1111 1111 0000 0000
|
|
0x0000FFF0u, // 14 1111 1111 1111 0000
|
|
0x0000F000u, // 15 1111 0000 0000 0000
|
|
0x0000F710u, // 16 1111 0111 0001 0000
|
|
0x0000008Eu, // 17 0000 0000 1000 1110
|
|
0x00007100u, // 18 0111 0001 0000 0000
|
|
0x000008CEu, // 19 0000 1000 1100 1110
|
|
0x0000008Cu, // 20 0000 0000 1000 1100
|
|
0x00007310u, // 21 0111 0011 0001 0000
|
|
0x00003100u, // 22 0011 0001 0000 0000
|
|
0x00008CCEu, // 23 1000 1100 1100 1110
|
|
0x0000088Cu, // 24 0000 1000 1000 1100
|
|
0x00003110u, // 25 0011 0001 0001 0000
|
|
0x00006666u, // 26 0110 0110 0110 0110
|
|
0x0000366Cu, // 27 0011 0110 0110 1100
|
|
0x000017E8u, // 28 0001 0111 1110 1000
|
|
0x00000FF0u, // 29 0000 1111 1111 0000
|
|
0x0000718Eu, // 30 0111 0001 1000 1110
|
|
0x0000399Cu, // 31 0011 1001 1001 1100
|
|
0x0000AAAAu, // 32 1010 1010 1010 1010
|
|
0x0000F0F0u, // 33 1111 0000 1111 0000
|
|
0x00005A5Au, // 34 0101 1010 0101 1010
|
|
0x000033CCu, // 35 0011 0011 1100 1100
|
|
0x00003C3Cu, // 36 0011 1100 0011 1100
|
|
0x000055AAu, // 37 0101 0101 1010 1010
|
|
0x00009696u, // 38 1001 0110 1001 0110
|
|
0x0000A55Au, // 39 1010 0101 0101 1010
|
|
0x000073CEu, // 40 0111 0011 1100 1110
|
|
0x000013C8u, // 41 0001 0011 1100 1000
|
|
0x0000324Cu, // 42 0011 0010 0100 1100
|
|
0x00003BDCu, // 43 0011 1011 1101 1100
|
|
0x00006996u, // 44 0110 1001 1001 0110
|
|
0x0000C33Cu, // 45 1100 0011 0011 1100
|
|
0x00009966u, // 46 1001 1001 0110 0110
|
|
0x00000660u, // 47 0000 0110 0110 0000
|
|
0x00000272u, // 48 0000 0010 0111 0010
|
|
0x000004E4u, // 49 0000 0100 1110 0100
|
|
0x00004E40u, // 50 0100 1110 0100 0000
|
|
0x00002720u, // 51 0010 0111 0010 0000
|
|
0x0000C936u, // 52 1100 1001 0011 0110
|
|
0x0000936Cu, // 53 1001 0011 0110 1100
|
|
0x000039C6u, // 54 0011 1001 1100 0110
|
|
0x0000639Cu, // 55 0110 0011 1001 1100
|
|
0x00009336u, // 56 1001 0011 0011 0110
|
|
0x00009CC6u, // 57 1001 1100 1100 0110
|
|
0x0000817Eu, // 58 1000 0001 0111 1110
|
|
0x0000E718u, // 59 1110 0111 0001 1000
|
|
0x0000CCF0u, // 60 1100 1100 1111 0000
|
|
0x00000FCCu, // 61 0000 1111 1100 1100
|
|
0x00007744u, // 62 0111 0111 0100 0100
|
|
0x0000EE22u, // 63 1110 1110 0010 0010
|
|
// 3 Subset region patterns
|
|
0xF60008CCu,// 0 1111 0110 0000 0000 : 0000 1000 1100 1100 = 2222122011001100 (MSB...LSB)
|
|
0x73008CC8u,// 1 0111 0011 0000 0000 : 1000 1100 1100 1000 = 1222112211001000
|
|
0x3310CC80u,// 2 0011 0011 0001 0000 : 1100 1100 1000 0000 = 1122112210020000
|
|
0x00CEEC00u,// 3 0000 0000 1100 1110 : 1110 1100 0000 0000 = 1110110022002220
|
|
0xCC003300u,// 4 1100 1100 0000 0000 : 0011 0011 0000 0000 = 2211221100000000
|
|
0xCC0000CCu,// 5 1100 1100 0000 0000 : 0000 0000 1100 1100 = 2200220011001100
|
|
0x00CCFF00u,// 6 0000 0000 1100 1100 : 1111 1111 0000 0000 = 1111111122002200
|
|
0x3300CCCCu,// 7 0011 0011 0000 0000 : 1100 1100 1100 1100 = 1122112211001100
|
|
0xF0000F00u,// 8 1111 0000 0000 0000 : 0000 1111 0000 0000 = 2222111100000000
|
|
0xF0000FF0u,// 9 1111 0000 0000 0000 : 0000 1111 1111 0000 = 2222111111110000
|
|
0xFF0000F0u,// 10 1111 1111 0000 0000 : 0000 0000 1111 0000 = 2222222211110000
|
|
0x88884444u,// 11 1000 1000 1000 1000 : 0100 0100 0100 0100 = 2100210021002100
|
|
0x88886666u,// 12 1000 1000 1000 1000 : 0110 0110 0110 0110 = 2110211021102110
|
|
0xCCCC2222u,// 13 1100 1100 1100 1100 : 0010 0010 0010 0010 = 2210221022102210
|
|
0xEC80136Cu,// 14 1110 1100 1000 0000 : 0001 0011 0110 1100 = 2221221121101100
|
|
0x7310008Cu,// 15 0111 0011 0001 0000 : 0000 0000 1000 1100 = 0222002210021100
|
|
0xC80036C8u,// 16 1100 1000 0000 0000 : 0011 0110 1100 1000 = 2211211011001000
|
|
0x310008CEu,// 17 0011 0001 0000 0000 : 0000 1000 1100 1110 = 0022100211001110
|
|
0xCCC03330u,// 18 1100 1100 1100 0000 : 0011 0011 0011 0000 = 2211221122110000
|
|
0x0CCCF000u,// 19 0000 1100 1100 1100 : 1111 0000 0000 0000 = 1111220022002200
|
|
0xEE0000EEu,// 20 1110 1110 0000 0000 : 0000 0000 1110 1110 = 2220222011101110
|
|
0x77008888u,// 21 0111 0111 0000 0000 : 1000 1000 1000 1000 = 1222122210001000
|
|
0xCC0022C0u,// 22 1100 1100 0000 0000 : 0010 0010 1100 0000 = 2210221011000000
|
|
0x33004430u,// 23 0011 0011 0000 0000 : 0100 0100 0011 0000 = 0122012200110000
|
|
0x00CC0C22u,// 24 0000 0000 1100 1100 : 0000 1100 0010 0010 = 0000110022102210
|
|
0xFC880344u,// 25 1111 1100 1000 1000 : 0000 0011 0100 0100 = 2222221121002100
|
|
0x06606996u,// 26 0000 0110 0110 0000 : 0110 1001 1001 0110 = 0110122112210110
|
|
0x66009960u,// 27 0110 0110 0000 0000 : 1001 1001 0110 0000 = 1221122101100000
|
|
0xC88C0330u,// 28 1100 1000 1000 1100 : 0000 0011 0011 0000 = 2200201120112200
|
|
0xF9000066u,// 29 1111 1001 0000 0000 : 0000 0000 0110 0110 = 2222200201100110
|
|
0x0CC0C22Cu,// 30 0000 1100 1100 0000 : 1100 0010 0010 1100 = 1100221022101100
|
|
0x73108C00u,// 31 0111 0011 0001 0000 : 1000 1100 0000 0000 = 1222112200020000
|
|
0xEC801300u,// 32 1110 1100 1000 0000 : 0001 0011 0000 0000 = 2221221120000000
|
|
0x08CEC400u,// 33 0000 1000 1100 1110 : 1100 0100 0000 0000 = 1100210022002220
|
|
0xEC80004Cu,// 34 1110 1100 1000 0000 : 0000 0000 0100 1100 = 2220220021001100
|
|
0x44442222u,// 35 0100 0100 0100 0100 : 0010 0010 0010 0010 = 0210021002100210
|
|
0x0F0000F0u,// 36 0000 1111 0000 0000 : 0000 0000 1111 0000 = 0000222211110000
|
|
0x49242492u,// 37 0100 1001 0010 0100 : 0010 0100 1001 0010 = 0210210210210210
|
|
0x42942942u,// 38 0100 0010 1001 0100 : 0010 1001 0100 0010 = 0210102121020210
|
|
0x0C30C30Cu,// 39 0000 1100 0011 0000 : 1100 0011 0000 1100 = 1100221100221100
|
|
0x03C0C03Cu,// 40 0000 0011 1100 0000 : 1100 0000 0011 1100 = 1100002222111100
|
|
0xFF0000AAu,// 41 1111 1111 0000 0000 : 0000 0000 1010 1010 = 2222222210101010
|
|
0x5500AA00u,// 42 0101 0101 0000 0000 : 1010 1010 0000 0000 = 1212121200000000
|
|
0xCCCC3030u,// 43 1100 1100 1100 1100 : 0011 0000 0011 0000 = 2211220022112200
|
|
0x0C0CC0C0u,// 44 0000 1100 0000 1100 : 1100 0000 1100 0000 = 1100220011002200
|
|
0x66669090u,// 45 0110 0110 0110 0110 : 1001 0000 1001 0000 = 1221022012210220
|
|
0x0FF0A00Au,// 46 0000 1111 1111 0000 : 1010 0000 0000 1010 = 1010222222221010
|
|
0x5550AAA0u,// 47 0101 0101 0101 0000 : 1010 1010 1010 0000 = 1212121212120000
|
|
0xF0000AAAu,// 48 1111 0000 0000 0000 : 0000 1010 1010 1010 = 2222101010101010
|
|
0x0E0EE0E0u,// 49 0000 1110 0000 1110 : 1110 0000 1110 0000 = 1110222011102220
|
|
0x88887070u,// 50 1000 1000 1000 1000 : 0111 0000 0111 0000 = 2111200021112000
|
|
0x99906660u,// 51 1001 1001 1001 0000 : 0110 0110 0110 0000 = 2112211221120000
|
|
0xE00E0EE0u,// 52 1110 0000 0000 1110 : 0000 1110 1110 0000 = 2220111011102220
|
|
0x88880770u,// 53 1000 1000 1000 1000 : 0000 0111 0111 0000 = 2000211121112000
|
|
0xF0000666u,// 54 1111 0000 0000 0000 : 0000 0110 0110 0110 = 2222011001100110
|
|
0x99006600u,// 55 1001 1001 0000 0000 : 0110 0110 0000 0000 = 2112211200000000
|
|
0xFF000066u,// 56 1111 1111 0000 0000 : 0000 0000 0110 0110 = 2222222201100110
|
|
0xC00C0CC0u,// 57 1100 0000 0000 1100 : 0000 1100 1100 0000 = 2200110011002200
|
|
0xCCCC0330u,// 58 1100 1100 1100 1100 : 0000 0011 0011 0000 = 2200221122112200
|
|
0x90006000u,// 59 1001 0000 0000 0000 : 0110 0000 0000 0000 = 2112000000000000
|
|
0x08088080u,// 60 0000 1000 0000 1000 : 1000 0000 1000 0000 = 1000200010002000
|
|
0xEEEE1010u,// 61 1110 1110 1110 1110 : 0001 0000 0001 0000 = 2221222022212220
|
|
0xFFF0000Au,// 62 1111 1111 1111 0000 : 0000 0000 0000 1010 = 2222222222221010
|
|
0x731008CEu,// 63 0111 0011 0001 0000 : 0000 1000 1100 1110 = 0222102211021110
|
|
};
|
|
|
|
if (maxSubsets == 2)
|
|
{
|
|
CGV_UINT32 mask_packed = subset_mask_table[part_id];
|
|
return ((mask_packed & (0x01<<index))?1:0); // This can be moved to caller, just return mask!!
|
|
}
|
|
|
|
// 3 region subsets
|
|
part_id += 64;
|
|
CGV_UINT32 mask0 = subset_mask_table[part_id] & 0xFFFF;
|
|
CGV_UINT32 mask1 = subset_mask_table[part_id] >> 16;
|
|
CGV_UINT32 mask = 0x01 << index;
|
|
|
|
return ((mask1 & mask)?2:0 + (mask0 & mask)?1:0); // This can be moved to caller, just return mask!!
|
|
}
|
|
|
|
void GetPartitionSubSet_mode01237(
|
|
CGV_IMAGE subsets_out[MAX_SUBSETS][SOURCE_BLOCK_SIZE][MAX_CHANNELS], // OUT: Subset pattern mapped with image src colors
|
|
CGV_ENTRIES entryCount_out[MAX_SUBSETS], // OUT: Number of entries per subset
|
|
CGV_UINT8 partition, // Partition Shape 0..63
|
|
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS], // Image colors
|
|
CGU_INT blockMode, // [0,1,2,3 or 7]
|
|
CGU_CHANNEL channels3or4) // 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
|
|
{
|
|
CGU_UINT8 maxSubsets = 2; if (blockMode == 0 || blockMode == 2) maxSubsets = 3;
|
|
|
|
entryCount_out[0] = 0;
|
|
entryCount_out[1] = 0;
|
|
entryCount_out[2] = 0;
|
|
|
|
for (CGV_INT i = 0; i < MAX_SUBSET_SIZE; i++)
|
|
{
|
|
CGV_UINT8 subset = get_partition_subset(partition,maxSubsets,i);
|
|
|
|
for (CGU_INT ch = 0; ch<3; ch++)
|
|
subsets_out[subset][entryCount_out[subset]][ch] = image_src[i+(ch*SOURCE_BLOCK_SIZE)];
|
|
//subsets_out[subset*64+(entryCount_out[subset]*MAX_CHANNELS+ch)] = image_src[i+(ch*SOURCE_BLOCK_SIZE)];
|
|
|
|
// if we have only 3 channels then set the alpha subset to 0
|
|
if (channels3or4 == 3)
|
|
subsets_out[subset][entryCount_out[subset]][3] = 0.0F;
|
|
else
|
|
subsets_out[subset][entryCount_out[subset]][3] = image_src[i+(COMP_ALPHA*SOURCE_BLOCK_SIZE)];
|
|
entryCount_out[subset]++;
|
|
}
|
|
}
|
|
|
|
INLINE void GetClusterMean(
|
|
CGV_IMAGE cluster_mean_out[SOURCE_BLOCK_SIZE][MAX_CHANNELS],
|
|
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_INDEX index_in[MAX_SUBSET_SIZE],
|
|
CGV_ENTRIES numEntries, // < 16
|
|
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
|
|
{
|
|
// unused index values are underfined
|
|
CGV_INDEX i_cnt[MAX_SUBSET_SIZE];
|
|
CGV_INDEX i_comp[MAX_SUBSET_SIZE];
|
|
|
|
|
|
for (CGV_ENTRIES i=0;i< numEntries;i++)
|
|
for (CGU_CHANNEL ch=0; ch< channels3or4; ch++)
|
|
{
|
|
CGV_INDEX idx = index_in[i]&0x0F;
|
|
cluster_mean_out[idx][ch] = 0;
|
|
i_cnt[idx]=0;
|
|
}
|
|
|
|
CGV_INDEX ic = 0; // was CGV_INT
|
|
for (CGV_ENTRIES i=0;i< numEntries;i++)
|
|
{
|
|
CGV_INDEX idx = index_in[i]&0x0F;
|
|
if (i_cnt[idx]==0)
|
|
i_comp[ic++]=idx;
|
|
i_cnt[idx]++;
|
|
|
|
for (CGU_CHANNEL ch=0; ch< channels3or4; ch++)
|
|
{
|
|
cluster_mean_out[idx][ch] += image_src[i+(ch*SOURCE_BLOCK_SIZE)];
|
|
}
|
|
}
|
|
|
|
for (CGU_CHANNEL ch=0; ch< channels3or4; ch++)
|
|
for (CGU_INT i=0;i < ic;i++)
|
|
{
|
|
if (i_cnt[i_comp[i]] != 0)
|
|
{
|
|
CGV_INDEX icmp = i_comp[i];
|
|
cluster_mean_out[icmp][ch] = (CGV_IMAGE) floor( (cluster_mean_out[icmp][ch] / (CGV_IMAGE) i_cnt[icmp]) +0.5F);
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
INLINE void GetImageMean(
|
|
CGV_IMAGE image_mean_out[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_ENTRIES numEntries,
|
|
CGU_CHANNEL channels)
|
|
{
|
|
for (CGU_CHANNEL ch=0; ch< channels; ch++)
|
|
image_mean_out[ch] =0;
|
|
|
|
for (CGV_ENTRIES i=0;i< numEntries;i++)
|
|
for (CGU_CHANNEL ch=0; ch< channels; ch++)
|
|
image_mean_out[ch] += image_src[i+ch*SOURCE_BLOCK_SIZE];
|
|
|
|
for (CGU_CHANNEL ch=0; ch< channels; ch++)
|
|
image_mean_out[ch] /=(CGV_IMAGE) numEntries; // Performance Warning: Conversion from unsigned int to float is slow. Use "int" if possible
|
|
}
|
|
|
|
// calculate an eigen vector corresponding to a biggest eigen value
|
|
// will work for non-zero non-negative matricies only
|
|
void GetEigenVector(
|
|
CGV_IMAGE EigenVector_out[MAX_CHANNELS], // Normalized Eigen Vector output
|
|
CGV_IMAGE CovarianceVector[MAX_CHANNELS*MAX_CHANNELS], // Covariance Vector
|
|
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA
|
|
{
|
|
CGV_IMAGE vector_covIn[MAX_CHANNELS*MAX_CHANNELS];
|
|
CGV_IMAGE vector_covOut[MAX_CHANNELS*MAX_CHANNELS];
|
|
CGV_IMAGE vector_maxCovariance;
|
|
|
|
for (CGU_CHANNEL ch1=0; ch1<channels3or4; ch1++)
|
|
for (CGU_CHANNEL ch2=0; ch2<channels3or4; ch2++)
|
|
{
|
|
vector_covIn[ch1+ch2*4] = CovarianceVector[ch1+ch2*4];
|
|
}
|
|
|
|
vector_maxCovariance = 0;
|
|
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
if (vector_covIn[ch+ch*4] > vector_maxCovariance)
|
|
vector_maxCovariance = vector_covIn[ch+ch*4];
|
|
}
|
|
|
|
// Normalize Input Covariance Vector
|
|
for (CGU_CHANNEL ch1=0; ch1<channels3or4; ch1++)
|
|
for (CGU_CHANNEL ch2=0; ch2<channels3or4; ch2++)
|
|
{
|
|
if (vector_maxCovariance > 0)
|
|
vector_covIn[ch1+ch2*4] = vector_covIn[ch1+ch2*4] / vector_maxCovariance;
|
|
}
|
|
|
|
for (CGU_CHANNEL ch1=0; ch1<channels3or4; ch1++)
|
|
{
|
|
for (CGU_CHANNEL ch2=0; ch2<channels3or4; ch2++)
|
|
{
|
|
CGV_IMAGE vector_temp_cov=0;
|
|
for (CGU_CHANNEL ch3=0; ch3<channels3or4; ch3++)
|
|
{
|
|
vector_temp_cov = vector_temp_cov + vector_covIn[ch1+ch3*4]*vector_covIn[ch3+ch2*4];
|
|
}
|
|
vector_covOut[ch1+ch2*4] = vector_temp_cov;
|
|
}
|
|
}
|
|
|
|
vector_maxCovariance = 0;
|
|
|
|
CGV_TYPEINT maxCovariance_channel = 0;
|
|
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
if (vector_covOut[ch+ch*4] > vector_maxCovariance)
|
|
{
|
|
maxCovariance_channel = ch;
|
|
vector_maxCovariance = vector_covOut[ch+ch*4];
|
|
}
|
|
}
|
|
|
|
CGV_IMAGE vector_t = 0;
|
|
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
vector_t = vector_t + vector_covOut[maxCovariance_channel+ch*4]*vector_covOut[maxCovariance_channel+ch*4];
|
|
EigenVector_out[ch] = vector_covOut[maxCovariance_channel+ch*4];
|
|
}
|
|
|
|
// Normalize the Eigen Vector
|
|
vector_t= sqrt(vector_t);
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
if (vector_t > 0)
|
|
EigenVector_out[ch] = EigenVector_out[ch] / vector_t;
|
|
}
|
|
|
|
}
|
|
|
|
CGV_INDEX index_collapse(
|
|
CGV_INDEX index[MAX_SUBSET_SIZE],
|
|
CGV_ENTRIES numEntries)
|
|
{
|
|
CGV_INDEX minIndex=index[0];
|
|
CGV_INDEX MaxIndex=index[0];
|
|
|
|
for (CGV_ENTRIES k=1;k<numEntries;k++) {
|
|
if (index[k] < minIndex)
|
|
minIndex = index[k];
|
|
if (index[k] > MaxIndex)
|
|
MaxIndex = index[k];
|
|
}
|
|
|
|
CGV_INDEX D=1;
|
|
|
|
for (CGV_INDEX d=2; d<= MaxIndex-minIndex; d++)
|
|
{
|
|
for (CGV_ENTRIES ent=0;ent<numEntries;ent++)
|
|
{
|
|
if ((index[ent] -minIndex) % d !=0)
|
|
{
|
|
if (ent>=numEntries)
|
|
D =d;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
for (CGV_ENTRIES k=0;k<numEntries;k++)
|
|
{
|
|
index[k] = (index[k]- minIndex) / D;
|
|
}
|
|
|
|
for (CGV_ENTRIES k=1;k<numEntries;k++) {
|
|
if (index[k] > MaxIndex)
|
|
MaxIndex = index[k];
|
|
}
|
|
|
|
return (MaxIndex);
|
|
|
|
}
|
|
|
|
void sortProjected_indexs(
|
|
CGV_INDEX index_ordered[MAX_SUBSET_SIZE],
|
|
CGV_IMAGE projection[SOURCE_BLOCK_SIZE],
|
|
CGV_ENTRIES numEntries // max 16
|
|
)
|
|
{
|
|
CMP_di what[SOURCE_BLOCK_SIZE];
|
|
|
|
for (CGV_INDEX i=0; i < numEntries;i++)
|
|
{
|
|
what[i].index = i;
|
|
what[i].image = projection[i];
|
|
}
|
|
|
|
CGV_INDEX tmp_index;
|
|
CGV_IMAGE tmp_image;
|
|
|
|
for (CGV_ENTRIES i = 1; i < numEntries; i++)
|
|
{
|
|
for (CGV_ENTRIES j=i; j>0; j--)
|
|
{
|
|
if (what[j - 1].image > what[j].image)
|
|
{
|
|
tmp_index = what[j].index;
|
|
tmp_image = what[j].image;
|
|
what[j].index = what[j - 1].index;
|
|
what[j].image = what[j - 1].image;
|
|
what[j - 1].index = tmp_index;
|
|
what[j - 1].image = tmp_image;
|
|
}
|
|
}
|
|
}
|
|
|
|
for (CGV_ENTRIES i=0; i < numEntries;i++)
|
|
index_ordered[i]=what[i].index;
|
|
|
|
};
|
|
|
|
void sortPartitionProjection(
|
|
CGV_IMAGE projection[MAX_PARTITION_ENTRIES],
|
|
CGV_UINT8 order[MAX_PARTITION_ENTRIES],
|
|
CGU_UINT8 numPartitions // max 64
|
|
)
|
|
{
|
|
CMP_du what[MAX_PARTITION_ENTRIES];
|
|
|
|
for (CGU_UINT8 Parti=0; Parti < numPartitions;Parti++)
|
|
{
|
|
what[Parti].index = Parti;
|
|
what[Parti].image = projection[Parti];
|
|
}
|
|
|
|
CGV_UINT8 index;
|
|
CGV_IMAGE data;
|
|
|
|
for (CGU_UINT8 Parti = 1; Parti < numPartitions; Parti++)
|
|
{
|
|
for (CGU_UINT8 Partj=Parti; Partj>0; Partj--)
|
|
{
|
|
if (what[Partj - 1].image > what[Partj].image)
|
|
{
|
|
index = what[Partj].index;
|
|
data = what[Partj].image;
|
|
what[Partj].index = what[Partj - 1].index;
|
|
what[Partj].image = what[Partj - 1].image;
|
|
what[Partj - 1].index = index;
|
|
what[Partj - 1].image = data;
|
|
}
|
|
}
|
|
}
|
|
|
|
for (CGU_UINT8 Parti=0; Parti < numPartitions;Parti++)
|
|
order[Parti]=what[Parti].index;
|
|
|
|
};
|
|
|
|
|
|
void cmp_Write8Bit(
|
|
CGV_CMPOUT base[],
|
|
CGU_INT* uniform offset,
|
|
CGU_INT bits,
|
|
CGV_BYTE bitVal)
|
|
{
|
|
base[*offset/8] |= bitVal << (*offset%8);
|
|
if (*offset%8+bits>8)
|
|
{
|
|
base[*offset/8+1] |= shift_right_uint8(bitVal, 8-*offset%8);
|
|
}
|
|
*offset += bits;
|
|
}
|
|
|
|
void cmp_Write8BitV(
|
|
CGV_CMPOUT base[],
|
|
CGV_INT offset,
|
|
CGU_INT bits,
|
|
CGV_BYTE bitVal)
|
|
{
|
|
base[offset/8] |= bitVal << (offset%8);
|
|
if (offset%8+bits>8)
|
|
{
|
|
base[offset/8+1] |= shift_right_uint8V(bitVal, 8-offset%8);
|
|
}
|
|
}
|
|
|
|
INLINE CGV_EPOCODE ep_find_floor(
|
|
CGV_IMAGE v,
|
|
CGU_UINT8 bits,
|
|
CGV_BYTE use_par,
|
|
CGV_BYTE odd)
|
|
{
|
|
CGV_EPOCODE i1=0;
|
|
CGV_EPOCODE i2=1<<(bits-use_par);
|
|
odd = use_par ? odd : 0;
|
|
while (i2-i1>1)
|
|
{
|
|
CGV_EPOCODE j = (i1+i2)/2; // Warning in ASMP code
|
|
CGV_EPOCODE ep_d = expandEPObits((j<<use_par)+odd,bits);
|
|
if (v >= ep_d )
|
|
i1=j;
|
|
else
|
|
i2=j;
|
|
}
|
|
|
|
return (i1<<use_par)+odd;
|
|
}
|
|
|
|
|
|
//==========================================================
|
|
|
|
// Not used for Modes 4&5
|
|
INLINE CGV_IMAGE GetRamp(
|
|
CGU_INT clogBC7, // ramp bits Valid range 2..4
|
|
CGU_INT bits, // Component Valid range 5..8
|
|
CGV_EPOCODE p1, // 0..255
|
|
CGV_EPOCODE p2, // 0..255
|
|
CGV_INDEX index) // 0..15
|
|
{
|
|
#ifdef ASPM_GPU // GPU Code
|
|
CGV_FLOAT rampf = 0.0F;
|
|
CMP_CONSTANT CGV_EPOCODE rampI[5*SOURCE_BLOCK_SIZE] = {
|
|
0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , // 0 bit index
|
|
0 ,64,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , // 1 bit index
|
|
0 ,21,43,64,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , // 2 bit index
|
|
0 ,9 ,18,27,37,46,55,64,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , // 3 bit index
|
|
0 ,4 ,9 ,13,17,21,26,30,34,38,43,47,51,55,60,64 // 4 bit index
|
|
};
|
|
|
|
CGV_EPOCODE e1 = expand_epocode(p1, bits);
|
|
CGV_EPOCODE e2 = expand_epocode(p2,bits);
|
|
CGV_FLOAT ramp = gather_epocode(rampI,clogBC7*16+index)/64.0F;
|
|
rampf = floor(e1 + ramp * (e2 - e1) + 0.5F);
|
|
return rampf;
|
|
#else // CPU ASPM Code
|
|
#ifdef USE_BC7_RAMP
|
|
return BC7EncodeRamps.ramp[(CLT(clogBC7)*4*256*256*16)+(BTT(bits)*256*256*16)+(p1*256*16)+(p2*16)+index];
|
|
#else
|
|
return (CGV_IMAGE)floor((CGV_IMAGE)BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] * (CGV_IMAGE)((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1]))+ 0.5F);
|
|
#endif
|
|
#endif
|
|
}
|
|
|
|
|
|
// Not used for Modes 4&5
|
|
INLINE CGV_ERROR get_sperr(
|
|
CGU_INT clogBC7, // ramp bits Valid range 2..4
|
|
CGU_INT bits, // Component Valid range 5..8
|
|
CGV_EPOCODE p1, // 0..255
|
|
CGU_INT t1,
|
|
CGU_INT t2,
|
|
CGV_INDEX index)
|
|
{
|
|
#ifdef ASPM_GPU
|
|
return 0.0F;
|
|
#else
|
|
#ifdef USE_BC7_SP_ERR_IDX
|
|
if (BC7EncodeRamps.ramp_init)
|
|
return BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(p1*2*2*16)+(t1*2*16)+(t2*16)+index];
|
|
else
|
|
return 0.0F;
|
|
#else
|
|
return 0.0F;
|
|
#endif
|
|
#endif
|
|
}
|
|
|
|
INLINE void get_fixuptable(CGV_FIXUPINDEX fixup[3], CGV_PARTID part_id)
|
|
{
|
|
// same as CMP SDK v3.1 BC7_FIXUPINDEX1 & BC7_FIXUPINDEX2 for each partition range 0..63
|
|
// The data is saved as a packed INT = (BC7_FIXUPINDEX1 << 4 + BC7_FIXUPINDEX2)
|
|
CMP_STATIC uniform __constant CGV_FIXUPINDEX FIXUPINDEX[] = {
|
|
// 2 subset partitions 0..63
|
|
0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u,
|
|
0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0x20u, 0x20u,
|
|
0xf0u, 0xf0u, 0x60u, 0x80u, 0x20u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0x60u,
|
|
0x60u, 0x20u, 0x60u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u,
|
|
// 3 subset partitions 64..128
|
|
0x3fu, 0x38u, 0xf8u, 0xf3u, 0x8fu, 0x3fu, 0xf3u, 0xf8u, 0x8fu, 0x8fu, 0x6fu, 0x6fu, 0x6fu, 0x5fu, 0x3fu, 0x38u,
|
|
0x3fu, 0x38u, 0x8fu, 0xf3u, 0x3fu, 0x38u, 0x6fu, 0xa8u, 0x53u, 0x8fu, 0x86u, 0x6au, 0x8fu, 0x5fu, 0xfau, 0xf8u,
|
|
0x8fu, 0xf3u, 0x3fu, 0x5au, 0x6au, 0xa8u, 0x89u, 0xfau, 0xf6u, 0x3fu, 0xf8u, 0x5fu, 0xf3u, 0xf6u, 0xf6u, 0xf8u,
|
|
0x3fu, 0xf3u, 0x5fu, 0x5fu, 0x5fu, 0x8fu, 0x5fu, 0xafu, 0x5fu, 0xafu, 0x8fu, 0xdfu, 0xf3u, 0xcfu, 0x3fu, 0x38u
|
|
};
|
|
|
|
CGV_FIXUPINDEX skip_packed = FIXUPINDEX[part_id];// gather_int2(FIXUPINDEX, part_id);
|
|
fixup[0] = 0;
|
|
fixup[1] = skip_packed>>4;
|
|
fixup[2] = skip_packed&15;
|
|
}
|
|
|
|
//===================================== COMPRESS CODE =============================================
|
|
INLINE void SetDefaultIndex(CGV_INDEX index_io[MAX_SUBSET_SIZE])
|
|
{
|
|
// Use this a final call
|
|
for (CGU_INT i=0; i<MAX_SUBSET_SIZE; i++)
|
|
index_io[i] = 0;
|
|
}
|
|
|
|
INLINE void SetDefaultEPOCode(CGV_EPOCODE epo_code_io[8], CGV_EPOCODE R,CGV_EPOCODE G,CGV_EPOCODE B,CGV_EPOCODE A)
|
|
{
|
|
epo_code_io[0] = R;
|
|
epo_code_io[1] = G;
|
|
epo_code_io[2] = B;
|
|
epo_code_io[3] = A;
|
|
epo_code_io[4] = R;
|
|
epo_code_io[5] = G;
|
|
epo_code_io[6] = B;
|
|
epo_code_io[7] = A;
|
|
}
|
|
|
|
|
|
void GetProjectedIndex(
|
|
CGV_INDEX projected_index_out[MAX_SUBSET_SIZE], //output: index, uncentered, in the range 0..clusters-1
|
|
CGV_IMAGE image_projected[SOURCE_BLOCK_SIZE], // image_block points, might be uncentered
|
|
CGV_INT clusters, // clusters: number of points in the ramp (max 16)
|
|
CGV_ENTRIES numEntries) // n - number of points in v_ max 15
|
|
{
|
|
CMP_di what[SOURCE_BLOCK_SIZE];
|
|
CGV_IMAGE image_v[SOURCE_BLOCK_SIZE];
|
|
CGV_IMAGE image_z[SOURCE_BLOCK_SIZE];
|
|
CGV_IMAGE image_l;
|
|
CGV_IMAGE image_mm;
|
|
CGV_IMAGE image_r = 0.0F;
|
|
CGV_IMAGE image_dm = 0.0F;
|
|
CGV_IMAGE image_min;
|
|
CGV_IMAGE image_max;
|
|
CGV_IMAGE image_s;
|
|
|
|
SetDefaultIndex(projected_index_out);
|
|
|
|
image_min=image_projected[0];
|
|
image_max=image_projected[0];
|
|
|
|
for (CGV_ENTRIES i=1; i < numEntries;i++)
|
|
{
|
|
if (image_min < image_projected[i])
|
|
image_min = image_projected[i];
|
|
if (image_max > image_projected[i])
|
|
image_max = image_projected[i];
|
|
}
|
|
|
|
CGV_IMAGE img_diff = image_max-image_min;
|
|
|
|
if (img_diff == 0.0f) return;
|
|
if (isnan(img_diff)) return;
|
|
|
|
image_s = (clusters-1)/img_diff;
|
|
|
|
for (CGV_INDEX i=0; i < numEntries;i++)
|
|
{
|
|
|
|
image_v[i] = image_projected[i]*image_s;
|
|
image_z[i] = floor(image_v[i] + 0.5F - image_min *image_s);
|
|
projected_index_out[i] = (CGV_INDEX)image_z[i];
|
|
|
|
what[i].image = image_v[i]-image_z[i]- image_min *image_s;
|
|
what[i].index = i;
|
|
image_dm+= what[i].image;
|
|
image_r += what[i].image*what[i].image;
|
|
}
|
|
|
|
if (numEntries*image_r- image_dm*image_dm >= (CGV_IMAGE)(numEntries-1)/8)
|
|
{
|
|
|
|
image_dm /= numEntries;
|
|
|
|
for (CGV_INT i=0; i < numEntries;i++)
|
|
what[i].image -= image_dm;
|
|
|
|
CGV_INDEX tmp_index;
|
|
CGV_IMAGE tmp_image;
|
|
for (CGV_ENTRIES i = 1; i < numEntries; i++)
|
|
{
|
|
for (CGV_ENTRIES j=i; j>0; j--)
|
|
{
|
|
if (what[j - 1].image > what[j].image)
|
|
{
|
|
tmp_index = what[j].index;
|
|
tmp_image = what[j].image;
|
|
what[j].index = what[j - 1].index;
|
|
what[j].image = what[j - 1].image;
|
|
what[j - 1].index = tmp_index;
|
|
what[j - 1].image = tmp_image;
|
|
}
|
|
}
|
|
}
|
|
|
|
// got into fundamental simplex
|
|
// move coordinate system origin to its center
|
|
|
|
// i=0 < numEntries avoids varying int division by 0
|
|
for (CGV_ENTRIES i=0; i < numEntries;i++)
|
|
{
|
|
what[i].image = what[i].image - (CGV_IMAGE) (((2.0f*i+1)-numEntries)/(2.0f*numEntries));
|
|
}
|
|
|
|
image_mm=0.0F;
|
|
image_l=0.0F;
|
|
|
|
CGV_INT j = -1;
|
|
for (CGV_ENTRIES i=0; i < numEntries;i++)
|
|
{
|
|
image_l += what[i].image;
|
|
if (image_l < image_mm)
|
|
{
|
|
image_mm = image_l;
|
|
j=i;
|
|
}
|
|
}
|
|
|
|
|
|
j = j + 1;
|
|
// avoid j = j%numEntries us this
|
|
while (j > numEntries) j = j - numEntries;
|
|
|
|
for (CGV_ENTRIES i=j; i < numEntries;i++)
|
|
{
|
|
CGV_INDEX idx = what[i].index;
|
|
CGV_INDEX pidx = projected_index_out[idx] + 1; //gather_index(projected_index_out,idx)+1;
|
|
projected_index_out[idx] = pidx; // scatter_index(projected_index_out,idx,pidx);
|
|
}
|
|
}
|
|
|
|
// get minimum index
|
|
CGV_INDEX index_min=projected_index_out[0];
|
|
for (CGV_ENTRIES i=1; i < numEntries;i++)
|
|
{
|
|
if (projected_index_out[i] < index_min)
|
|
index_min = projected_index_out[i];
|
|
}
|
|
|
|
// reposition all index by min index (using min index as 0)
|
|
for (CGV_ENTRIES i=0; i < numEntries;i++)
|
|
{
|
|
projected_index_out[i] = clampIndex(projected_index_out[i] - index_min,0,15);
|
|
}
|
|
|
|
}
|
|
|
|
CGV_ERROR GetQuantizeIndex(
|
|
CGV_INDEXPACKED index_packed_out[2],
|
|
CGV_INDEX index_out[MAX_SUBSET_SIZE], // OUT:
|
|
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_ENTRIES numEntries, //IN: range 0..15 (MAX_SUBSET_SIZE)
|
|
CGU_INT numClusters,
|
|
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
|
|
{
|
|
CGV_IMAGE image_centered[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
|
|
CGV_IMAGE image_mean[MAX_CHANNELS];
|
|
CGV_IMAGE eigen_vector[MAX_CHANNELS];
|
|
CGV_IMAGE covariance_vector[MAX_CHANNELS*MAX_CHANNELS];
|
|
|
|
GetImageCentered(image_centered,image_mean, image_src, numEntries, channels3or4);
|
|
GetCovarianceVector(covariance_vector, image_centered, numEntries, channels3or4);
|
|
|
|
//-----------------------------------------------------
|
|
// check if all covariances are the same
|
|
// if so then set all index to same value 0 and return
|
|
// use EPSILON to set the limit for all same limit
|
|
//-----------------------------------------------------
|
|
|
|
CGV_IMAGE image_covt=0.0F;
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
image_covt = image_covt + covariance_vector[ch+ch*4];
|
|
|
|
if (image_covt < EPSILON)
|
|
{
|
|
SetDefaultIndex(index_out);
|
|
index_packed_out[0] = 0;
|
|
index_packed_out[1] = 0;
|
|
return 0.;
|
|
}
|
|
|
|
GetEigenVector(eigen_vector, covariance_vector,channels3or4);
|
|
|
|
CGV_IMAGE image_projected[SOURCE_BLOCK_SIZE];
|
|
|
|
GetProjecedImage(image_projected,image_centered, numEntries, eigen_vector, channels3or4);
|
|
GetProjectedIndex(index_out, image_projected, numClusters,numEntries);
|
|
|
|
//==========================================
|
|
// Refine
|
|
//==========================================
|
|
CGV_IMAGE image_q = 0.0F;
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
eigen_vector[ch]=0;
|
|
for (CGV_ENTRIES k=0;k<numEntries;k++)
|
|
eigen_vector[ch] = eigen_vector[ch] + image_centered[k+(ch*SOURCE_BLOCK_SIZE)]*index_out[k];
|
|
image_q = image_q + eigen_vector[ch]* eigen_vector[ch];
|
|
}
|
|
|
|
image_q = sqrt(image_q);
|
|
|
|
// direction needs to be normalized
|
|
if (image_q != 0.0F)
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
eigen_vector[ch] = eigen_vector[ch] / image_q;
|
|
|
|
// Get new projected data
|
|
GetProjecedImage(image_projected, image_centered, numEntries, eigen_vector, channels3or4);
|
|
GetProjectedIndex(index_out, image_projected, numClusters,numEntries);
|
|
|
|
// pack the index for use in icmp
|
|
pack_index(index_packed_out, index_out);
|
|
|
|
//===========================
|
|
// Calc Error
|
|
//===========================
|
|
// Get the new image based on new index
|
|
|
|
CGV_IMAGE image_t = 0.0F;
|
|
CGV_IMAGE index_average = 0.0F;
|
|
|
|
for (CGV_ENTRIES ik=0;ik<numEntries;ik++)
|
|
{
|
|
index_average = index_average + index_out[ik];
|
|
image_t = image_t + index_out[ik]*index_out[ik];
|
|
}
|
|
|
|
index_average = index_average / (CGV_IMAGE) numEntries;
|
|
image_t = image_t - index_average * index_average * (CGV_IMAGE) numEntries;
|
|
|
|
if (image_t != 0.0F)
|
|
image_t = 1.0F/image_t;
|
|
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
eigen_vector[ch]=0;
|
|
for (CGV_ENTRIES nk=0; nk<numEntries; nk++)
|
|
eigen_vector[ch] = eigen_vector[ch] + image_centered[nk+(ch*SOURCE_BLOCK_SIZE)]*index_out[nk];
|
|
}
|
|
|
|
CGV_IMAGE image_decomp[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
|
|
for (CGV_ENTRIES i=0;i<numEntries;i++)
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
image_decomp[i+(ch*SOURCE_BLOCK_SIZE)] = image_mean[ch] + eigen_vector[ch]*image_t*(index_out[i]-index_average);
|
|
|
|
CGV_ERROR err_1 = err_Total(image_src,image_decomp,numEntries, channels3or4);
|
|
|
|
return err_1;
|
|
// return 0.0F;
|
|
}
|
|
|
|
CGV_ERROR quant_solid_color(
|
|
CGV_INDEX index_out[MAX_SUBSET_SIZE],
|
|
CGV_EPOCODE epo_code_out[2*MAX_CHANNELS],
|
|
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_ENTRIES numEntries,
|
|
CGU_UINT8 Mi_, // last cluster
|
|
CGU_UINT8 bits[3], // including parity
|
|
CGU_INT type,
|
|
CGU_CHANNEL channels3or4 // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
|
|
)
|
|
{
|
|
|
|
CGU_INT clogBC7 = 0;
|
|
CGU_INT iv = Mi_ + 1;
|
|
while (iv >>= 1)
|
|
clogBC7++;
|
|
|
|
// init epo_0
|
|
CGV_EPOCODE epo_0[2*MAX_CHANNELS];
|
|
SetDefaultEPOCode(epo_0,0xFF,0,0,0);
|
|
|
|
CGV_INDEX image_log = 0;
|
|
CGV_INDEX image_idx = 0;
|
|
CGU_BOOL use_par = FALSE;
|
|
if (type != 0)
|
|
use_par = TRUE;
|
|
CGV_ERROR error_1 = CMP_FLOAT_MAX;
|
|
|
|
for (CGU_INT pn = 0; pn<npv_nd[channels3or4-3][type] && (error_1 != 0.0F); pn++)
|
|
{ //1
|
|
|
|
CGU_INT o1[2*MAX_CHANNELS]; // = { 0,2 };
|
|
CGU_INT o2[2*MAX_CHANNELS]; // = { 0,2 };
|
|
|
|
for (CGU_CHANNEL ch = 0; ch<channels3or4; ch++)
|
|
{ //A
|
|
o2[ ch] = o1[ ch] = 0;
|
|
o2[4+ch] = o1[4+ch] = 2;
|
|
|
|
if (use_par == TRUE)
|
|
{
|
|
if (par_vectors_nd[channels3or4-3][type][pn][0][ch])
|
|
o1[ch] = 1;
|
|
else
|
|
o1[4+ch] = 1;
|
|
if (par_vectors_nd[channels3or4-3][type][pn][1][ch])
|
|
o2[ch] = 1;
|
|
else
|
|
o2[4+ch] = 1;
|
|
}
|
|
} //A
|
|
|
|
CGV_EPOCODE image_tcr[MAX_CHANNELS];
|
|
CGV_EPOCODE epo_dr_0[MAX_CHANNELS];
|
|
CGV_ERROR error_tr;
|
|
CGV_ERROR error_0 = CMP_FLOAT_MAX;
|
|
|
|
for (CGV_INDEX iclogBC7 = 0; iclogBC7< (1 << clogBC7) && (error_0 != 0); iclogBC7++)
|
|
{ //E
|
|
CGV_ERROR error_t = 0;
|
|
CGV_EPOCODE t1o[MAX_CHANNELS], t2o[MAX_CHANNELS];
|
|
|
|
for (CGU_CHANNEL ch1 = 0; ch1<channels3or4; ch1++)
|
|
{ // D
|
|
CGV_ERROR error_ta = CMP_FLOAT_MAX;
|
|
|
|
for (CGU_INT t1 = o1[ch1]; t1<o1[4+ch1]; t1++)
|
|
{ // C
|
|
// This is needed for non-integer mean points of "collapsed" sets
|
|
for (CGU_INT t2 = o2[ch1]; t2<o2[4+ch1]; t2++)
|
|
{ // B
|
|
CGV_EPOCODE image_tf;
|
|
CGV_EPOCODE image_tc;
|
|
image_tf = (CGV_EPOCODE)floor(image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)]);
|
|
image_tc = (CGV_EPOCODE) ceil(image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)]);
|
|
|
|
#ifdef USE_BC7_SP_ERR_IDX
|
|
CGV_ERROR err_tf = get_sperr(clogBC7,bits[ch1],image_tf,t1,t2,iclogBC7);
|
|
CGV_ERROR err_tc = get_sperr(clogBC7,bits[ch1],image_tc,t1,t2,iclogBC7);
|
|
if (err_tf > err_tc)
|
|
image_tcr[ch1] = image_tc;
|
|
else if (err_tf < err_tc)
|
|
image_tcr[ch1] = image_tf;
|
|
else
|
|
image_tcr[ch1] = (CGV_EPOCODE)floor(image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)] + 0.5F);
|
|
|
|
//image_tcr[ch1] = image_tf + (image_tc - image_tf)/2;
|
|
|
|
//===============================
|
|
// Refine this for better quality!
|
|
//===============================
|
|
error_tr = get_sperr(clogBC7,bits[ch1],image_tcr[ch1],t1,t2,iclogBC7);
|
|
error_tr = (error_tr*error_tr)
|
|
+ 2 * error_tr
|
|
* img_absf(image_tcr[ch1]- image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)])
|
|
+ (image_tcr[ch1] - image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)])
|
|
* (image_tcr[ch1] - image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)]);
|
|
|
|
if (error_tr < error_ta)
|
|
{
|
|
error_ta = error_tr;
|
|
t1o[ch1] = t1;
|
|
t2o[ch1] = t2;
|
|
epo_dr_0[ch1] = clampEPO(image_tcr[ch1],0,255);
|
|
}
|
|
#else
|
|
image_tcr[ch1] = floor(image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)] + 0.5F);
|
|
error_ta = 0;
|
|
t1o[ch1] = t1;
|
|
t2o[ch1] = t2;
|
|
epo_dr_0[ch1] = clampi(image_tcr[ch1],0,255);
|
|
#endif
|
|
} // B
|
|
} //C
|
|
|
|
error_t += error_ta;
|
|
} // D
|
|
|
|
if (error_t < error_0)
|
|
{
|
|
image_log = iclogBC7;
|
|
image_idx = image_log;
|
|
CGU_BOOL srcIsWhite = FALSE;
|
|
if ((image_src[0] == 255.0f)&&(image_src[1] == 255.0f)&&(image_src[2] == 255.0f)) srcIsWhite = TRUE;
|
|
|
|
for (CGU_CHANNEL ch = 0; ch<channels3or4; ch++)
|
|
{
|
|
#ifdef ASPM_GPU
|
|
if (srcIsWhite == TRUE)
|
|
{
|
|
// Default White block!
|
|
epo_0[ ch] = 0x7F;
|
|
epo_0[4+ch] = 0x7F;
|
|
}
|
|
else
|
|
{
|
|
// Default black block!
|
|
epo_0[ ch] = 0;
|
|
epo_0[4+ch] = 0;
|
|
}
|
|
#else
|
|
#ifdef USE_BC7_SP_ERR_IDX
|
|
if (BC7EncodeRamps.ramp_init) {
|
|
CGV_EPOCODE index = (CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits[ch])*256*2*2*16*2)+(epo_dr_0[ch]*2*2*16*2)+(t1o[ch]*2*16*2)+(t2o[ch]*16*2)+(iclogBC7*2);
|
|
epo_0[ ch] = BC7EncodeRamps.sp_idx[index+0]&0xFF;// gather_epocode(u_BC7Encode->sp_idx,index+0)&0xFF;
|
|
epo_0[4+ch] = BC7EncodeRamps.sp_idx[index+1]&0xFF;// gather_epocode(u_BC7Encode->sp_idx,index+1)&0xFF;
|
|
}
|
|
else {
|
|
epo_0[ch] = 0;
|
|
epo_0[4 + ch] = 0;
|
|
}
|
|
#else
|
|
epo_0[ ch] = 0;
|
|
epo_0[4+ch] = 0;
|
|
#endif
|
|
#endif
|
|
}
|
|
error_0 = error_t;
|
|
}
|
|
//if (error_0 == 0)
|
|
// break;
|
|
} // E
|
|
|
|
if (error_0 < error_1)
|
|
{
|
|
|
|
image_idx = image_log;
|
|
for (CGU_CHANNEL chE = 0; chE<channels3or4; chE++)
|
|
{
|
|
epo_code_out[chE] = epo_0[chE];
|
|
epo_code_out[4+chE] = epo_0[4+chE];
|
|
}
|
|
error_1 = error_0;
|
|
}
|
|
|
|
} //1
|
|
|
|
// Get Image error
|
|
CGV_IMAGE image_decomp[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
|
|
for (CGV_ENTRIES i = 0; i< numEntries; i++)
|
|
{
|
|
index_out[i] = image_idx;
|
|
for (CGU_CHANNEL ch = 0; ch<channels3or4; ch++)
|
|
{
|
|
image_decomp[i+(ch*SOURCE_BLOCK_SIZE)] = GetRamp(clogBC7,bits[ch],epo_code_out[ch],epo_code_out[4+ch],image_idx);
|
|
}
|
|
}
|
|
// Do we need to do this rather then err_1 * numEntries
|
|
CGV_ERROR error_quant;
|
|
error_quant = err_Total(image_src, image_decomp, numEntries, channels3or4);
|
|
|
|
return error_quant;
|
|
//return err_1 * numEntries;
|
|
}
|
|
|
|
CGV_ERROR requantized_image_err(
|
|
CGV_INDEX index_out[MAX_SUBSET_SIZE],
|
|
CGV_EPOCODE epo_code[2*MAX_CHANNELS],
|
|
CGU_INT clogBC7,
|
|
CGU_UINT8 max_bits[MAX_CHANNELS],
|
|
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_ENTRIES numEntries, // max 16
|
|
CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
|
|
{
|
|
|
|
//=========================================
|
|
// requantized image based on new epo_code
|
|
//=========================================
|
|
CGV_IMAGE image_requantize[SOURCE_BLOCK_SIZE][MAX_CHANNELS];
|
|
CGV_ERROR err_r=0.0F;
|
|
|
|
for (CGU_CHANNEL ch = 0; ch<channels3or4; ch++)
|
|
{
|
|
for (CGU_INT k = 0; k<SOURCE_BLOCK_SIZE; k++)
|
|
{
|
|
image_requantize[k][ch] = GetRamp(clogBC7,max_bits[ch],epo_code[ch],epo_code[4+ch],(CGV_INDEX)k);
|
|
}
|
|
}
|
|
|
|
//=========================================
|
|
// Calc the error for the requantized image
|
|
//=========================================
|
|
|
|
for (CGV_ENTRIES k =0; k < numEntries; k++)
|
|
{
|
|
CGV_ERROR err_cmin = CMP_FLOAT_MAX;
|
|
CGV_TYPEINT hold_index_j = 0;
|
|
|
|
for (CGV_TYPEINT iclogBC7=0; iclogBC7 < (1<<clogBC7); iclogBC7++)
|
|
{
|
|
CGV_IMAGE image_err = 0.0F;
|
|
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
image_err+= sq_image(image_requantize[iclogBC7][ch]-image_src[k+(ch*SOURCE_BLOCK_SIZE)]);
|
|
}
|
|
|
|
if(image_err < err_cmin)
|
|
{
|
|
err_cmin = image_err;
|
|
hold_index_j = iclogBC7;
|
|
}
|
|
}
|
|
|
|
index_out[k]=(CGV_INDEX)hold_index_j;
|
|
err_r +=err_cmin;
|
|
}
|
|
|
|
return err_r;
|
|
}
|
|
|
|
CGU_BOOL get_ideal_cluster(
|
|
CGV_IMAGE image_out[2*MAX_CHANNELS],
|
|
CGV_INDEX index_in[MAX_SUBSET_SIZE],
|
|
CGU_INT Mi_,
|
|
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_ENTRIES numEntries,
|
|
CGU_CHANNEL channels3or4 )
|
|
{
|
|
// get ideal cluster centers
|
|
CGV_IMAGE image_cluster_mean[SOURCE_BLOCK_SIZE][MAX_CHANNELS];
|
|
GetClusterMean(image_cluster_mean, image_src, index_in, numEntries, channels3or4); // unrounded
|
|
|
|
CGV_IMAGE image_matrix0[2] = {0,0}; // matrix /inverse matrix
|
|
CGV_IMAGE image_matrix1[2] = {0,0}; // matrix /inverse matrix
|
|
CGV_IMAGE image_rp[2*MAX_CHANNELS]; // right part for RMS fit problem
|
|
|
|
for (CGU_INT i=0; i<2*MAX_CHANNELS; i++) image_rp[i]=0;
|
|
|
|
// weight with cnt if runnning on compacted index
|
|
for (CGV_ENTRIES k=0;k<numEntries;k++)
|
|
{
|
|
image_matrix0[0] += (Mi_- index_in[k])* (Mi_-index_in[k]);
|
|
image_matrix0[1] += index_in[k] * (Mi_-index_in[k]); // im is symmetric
|
|
image_matrix1[1] += index_in[k] * index_in[k];
|
|
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
image_rp[ ch] += (Mi_-index_in[k]) * image_cluster_mean[index_in[k]][ch];
|
|
image_rp[4+ch] += index_in[k] * image_cluster_mean[index_in[k]][ch];
|
|
}
|
|
}
|
|
|
|
CGV_IMAGE matrix_dd = image_matrix0[0]*image_matrix1[1]- image_matrix0[1]*image_matrix0[1];
|
|
|
|
// assert(matrix_dd !=0);
|
|
// matrix_dd=0 means that index_cidx[k] and (Mi_-index_cidx[k]) collinear which implies only one active index;
|
|
// taken care of separately
|
|
if (matrix_dd == 0)
|
|
{
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
image_out[ ch]=0;
|
|
image_out[4+ch]=0;
|
|
}
|
|
return FALSE;
|
|
}
|
|
image_matrix1[0] = image_matrix0[0];
|
|
image_matrix0[0] = image_matrix1[1]/matrix_dd;
|
|
image_matrix1[1] = image_matrix1[0]/matrix_dd;
|
|
image_matrix1[0] = image_matrix0[1]=-image_matrix0[1]/matrix_dd;
|
|
|
|
CGV_IMAGE Mif = (CGV_IMAGE)Mi_;
|
|
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
image_out[ ch]=(image_matrix0[0]*image_rp[ch]+image_matrix0[1]*image_rp[4+ch])*Mif;
|
|
image_out[4+ch]=(image_matrix1[0]*image_rp[ch]+image_matrix1[1]*image_rp[4+ch])*Mif;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
CGV_ERROR shake(
|
|
CGV_EPOCODE epo_code_shaker_out[2*MAX_CHANNELS],
|
|
CGV_IMAGE image_ep[2*MAX_CHANNELS],
|
|
CGV_INDEX index_cidx[MAX_SUBSET_SIZE],
|
|
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGU_INT clogBC7,
|
|
CGU_INT type,
|
|
CGU_UINT8 max_bits[MAX_CHANNELS],
|
|
CGU_UINT8 use_par,
|
|
CGV_ENTRIES numEntries, // max 16
|
|
CGU_CHANNEL channels3or4 )
|
|
{
|
|
#define SHAKESIZE1 1
|
|
#define SHAKESIZE2 2
|
|
// shake single or - cartesian
|
|
// shake odd/odd and even/even or - same parity
|
|
// shake odd/odd odd/even , even/odd and even/even - bcc
|
|
|
|
CGV_ERROR best_err = CMP_FLOAT_MAX;
|
|
|
|
CGV_ERROR err_ed[16] = {0};
|
|
CGV_EPOCODE epo_code_par[2][2][2][MAX_CHANNELS];
|
|
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
CGU_UINT8 ppA = 0;
|
|
CGU_UINT8 ppB = 0;
|
|
CGU_UINT8 rr = (use_par ? 2:1);
|
|
CGV_EPOCODE epo_code_epi[2][2]; // first/second, coord, begin rage end range
|
|
|
|
for (ppA=0; ppA<rr; ppA++) // loop max =2
|
|
{
|
|
for (ppB=0; ppB<rr; ppB++) //loop max =2
|
|
{
|
|
|
|
// set default ranges
|
|
epo_code_epi[0][0] = epo_code_epi[0][1]= ep_find_floor( image_ep[ ch],max_bits[ch], use_par, ppA);
|
|
epo_code_epi[1][0] = epo_code_epi[1][1]= ep_find_floor( image_ep[4+ch],max_bits[ch], use_par, ppB);
|
|
|
|
// set begin range
|
|
epo_code_epi[0][0] -= ( (epo_code_epi[0][0] < SHAKESIZE1 ? epo_code_epi[0][0]:SHAKESIZE1))&(~use_par);
|
|
epo_code_epi[1][0] -= ( (epo_code_epi[1][0] < SHAKESIZE1 ? epo_code_epi[1][0]:SHAKESIZE1))&(~use_par);
|
|
|
|
// set end range
|
|
epo_code_epi[0][1] += ((1<<max_bits[ch])-1 - epo_code_epi[0][1] < SHAKESIZE2 ? (1<<max_bits[ch])-1-epo_code_epi[0][1]:SHAKESIZE2)&(~use_par);
|
|
epo_code_epi[1][1] += ((1<<max_bits[ch])-1 - epo_code_epi[1][1] < SHAKESIZE2 ? (1<<max_bits[ch])-1-epo_code_epi[1][1]:SHAKESIZE2)&(~use_par);
|
|
|
|
CGV_EPOCODE step = (1<<use_par);
|
|
err_ed[(ppA*8)+(ppB*4)+ch]=CMP_FLOAT_MAX;
|
|
|
|
for (CGV_EPOCODE epo_p1=epo_code_epi[0][0]; epo_p1<=epo_code_epi[0][1]; epo_p1+=step)
|
|
for (CGV_EPOCODE epo_p2=epo_code_epi[1][0]; epo_p2<=epo_code_epi[1][1]; epo_p2+=step)
|
|
{
|
|
CGV_IMAGE image_square_diff =0.0F;
|
|
CGV_ENTRIES _mc = numEntries;
|
|
CGV_IMAGE image_ramp;
|
|
|
|
while(_mc > 0)
|
|
{
|
|
image_ramp = GetRamp(clogBC7,max_bits[ch],epo_p1,epo_p2,index_cidx[_mc-1]);
|
|
|
|
image_square_diff += sq_image(image_ramp-image_src[(_mc-1)+(ch*SOURCE_BLOCK_SIZE)]);
|
|
_mc--;
|
|
}
|
|
if (image_square_diff < err_ed[(ppA*8)+(ppB*4)+ch])
|
|
{
|
|
err_ed[(ppA*8)+(ppB*4)+ch] = image_square_diff;
|
|
epo_code_par[ppA][ppB][0][ch] = epo_p1;
|
|
epo_code_par[ppA][ppB][1][ch] = epo_p2;
|
|
}
|
|
}
|
|
} // pp1
|
|
} // pp0
|
|
} // j
|
|
|
|
//---------------------------------------------------------
|
|
for (CGU_INT pn=0; pn < npv_nd[channels3or4-3][type]; pn++)
|
|
{
|
|
CGV_ERROR err_2=0.0F;
|
|
CGU_INT d1;
|
|
CGU_INT d2;
|
|
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
d1 = par_vectors_nd[channels3or4-3][type][pn][0][ch];
|
|
d2 = par_vectors_nd[channels3or4-3][type][pn][1][ch];
|
|
err_2+=err_ed[(d1*8)+(d2*4)+ch];
|
|
}
|
|
|
|
if (err_2 < best_err)
|
|
{
|
|
best_err = err_2;
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
d1 = par_vectors_nd[channels3or4-3][type][pn][0][ch];
|
|
d2 = par_vectors_nd[channels3or4-3][type][pn][1][ch];
|
|
epo_code_shaker_out[ ch]=epo_code_par[d1][d2][0][ch];
|
|
epo_code_shaker_out[4+ch]=epo_code_par[d1][d2][1][ch];
|
|
}
|
|
}
|
|
}
|
|
|
|
return best_err;
|
|
}
|
|
|
|
CGV_ERROR optimize_IndexAndEndPoints(
|
|
CGV_INDEX index_io[MAX_SUBSET_SIZE],
|
|
CGV_EPOCODE epo_code_out[8],
|
|
CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS],
|
|
CGV_ENTRIES numEntries, // max 16
|
|
CGU_UINT8 Mi_, // last cluster , This should be no larger than 16
|
|
CGU_UINT8 bits, // total for all components
|
|
CGU_CHANNEL channels3or4, // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
|
|
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
|
|
{
|
|
CGV_ERROR err_best = CMP_FLOAT_MAX;
|
|
CGU_INT type;
|
|
CGU_CHANNEL channels2 = 2*channels3or4;
|
|
|
|
type = bits % channels2;
|
|
|
|
CGU_UINT8 use_par =(type !=0);
|
|
|
|
CGU_UINT8 max_bits[MAX_CHANNELS];
|
|
for (CGU_UINT8 ch=0; ch<channels3or4; ch++)
|
|
max_bits[ch] = (bits+channels2-1) / channels2;
|
|
|
|
CGU_INT iv;
|
|
CGU_INT clogBC7=0;
|
|
iv = Mi_;
|
|
while (iv>>=1)
|
|
clogBC7++;
|
|
|
|
CGU_INT clt_clogBC7 = CLT(clogBC7);
|
|
|
|
if (clt_clogBC7 > 3)
|
|
{
|
|
ASPM_PRINT(("Err: optimize_IndexAndEndPoints, clt_clogBC7\n"));
|
|
return CMP_FLOAT_MAX;
|
|
}
|
|
|
|
Mi_ = Mi_ - 1;
|
|
|
|
CGV_INDEX MaxIndex;
|
|
CGV_INDEX index_tmp[MAX_SUBSET_SIZE];
|
|
CGU_INT maxTry = MAX_TRY_SHAKER;
|
|
|
|
CGV_INDEX index_best[MAX_SUBSET_SIZE];
|
|
|
|
for (CGV_ENTRIES k=0;k<numEntries;k++)
|
|
{
|
|
index_best[k] = index_tmp[k] = clampIndex(index_io[k],0,15);
|
|
}
|
|
|
|
CGV_EPOCODE epo_code_best[2*MAX_CHANNELS];
|
|
|
|
SetDefaultEPOCode(epo_code_out ,0xFF,0,0,0);
|
|
SetDefaultEPOCode(epo_code_best,0,0,0,0);
|
|
|
|
CGV_ERROR err_requant = 0.0F;
|
|
|
|
MaxIndex = index_collapse(index_tmp, numEntries);
|
|
|
|
//===============================
|
|
// we have a solid color 4x4 block
|
|
//===============================
|
|
if (MaxIndex == 0)
|
|
{
|
|
|
|
return quant_solid_color(index_io, epo_code_out, image_src, numEntries, Mi_, max_bits,type, channels3or4);
|
|
}
|
|
|
|
do {
|
|
//===============================
|
|
// We have ramp colors to process
|
|
//===============================
|
|
CGV_ERROR err_cluster = CMP_FLOAT_MAX;
|
|
CGV_ERROR err_shake;
|
|
CGV_INDEX index_cluster[MAX_PARTITION_ENTRIES];
|
|
|
|
for (CGV_INDEX index_slope=1; (MaxIndex != 0) && (index_slope*MaxIndex <= Mi_); index_slope++)
|
|
{
|
|
for (CGV_INDEX index_offset=0; index_offset<=Mi_-index_slope*MaxIndex; index_offset++)
|
|
{
|
|
//-------------------------------------
|
|
// set a new index data to try
|
|
//-------------------------------------
|
|
for (CGV_ENTRIES k=0;k<numEntries;k++)
|
|
index_cluster[k] = index_tmp[k] * index_slope + index_offset;
|
|
|
|
CGV_IMAGE image_cluster[2*MAX_CHANNELS];
|
|
CGV_EPOCODE epo_code_shake[2*MAX_CHANNELS];
|
|
SetDefaultEPOCode(epo_code_shake,0,0,0xFF,0);
|
|
|
|
if (get_ideal_cluster( image_cluster,
|
|
index_cluster,
|
|
Mi_,
|
|
image_src,
|
|
numEntries,
|
|
channels3or4) == FALSE)
|
|
{
|
|
break;
|
|
}
|
|
|
|
err_shake = shake( epo_code_shake, // return new epo
|
|
image_cluster,
|
|
index_cluster,
|
|
image_src,
|
|
clogBC7,
|
|
type,
|
|
max_bits,
|
|
use_par,
|
|
numEntries, // max 16
|
|
channels3or4);
|
|
|
|
if (err_shake < err_cluster)
|
|
{
|
|
err_cluster = err_shake;
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
epo_code_best[ ch] = clampEPO(epo_code_shake[ ch], 0, 255);
|
|
epo_code_best[4+ch] = clampEPO(epo_code_shake[4+ch], 0, 255);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
CGV_TYPEINT change = 0;
|
|
CGV_TYPEINT better = 0;
|
|
|
|
if ((err_cluster != CMP_FLOAT_MAX))
|
|
{
|
|
//=========================
|
|
// test results for quality
|
|
//=========================
|
|
err_requant = requantized_image_err(
|
|
index_best, // new index results
|
|
epo_code_best, // prior result input
|
|
clogBC7,
|
|
max_bits,
|
|
image_src,
|
|
numEntries,
|
|
channels3or4);
|
|
|
|
// change/better
|
|
// Has the index values changed from that last set
|
|
for (CGV_ENTRIES k=0;k<numEntries;k++)
|
|
change = change || (index_cluster[k] != index_best[k]);
|
|
|
|
if (err_requant < err_best)
|
|
{
|
|
better = 1;
|
|
for (CGV_ENTRIES k=0;k<numEntries;k++)
|
|
{
|
|
index_io[k]=index_tmp[k]=index_best[k];
|
|
}
|
|
|
|
for (CGU_CHANNEL ch=0; ch<channels3or4; ch++)
|
|
{
|
|
epo_code_out[ ch]=epo_code_best[0*4+ch];
|
|
epo_code_out[4+ch]=epo_code_best[1*4+ch];
|
|
}
|
|
err_best=err_requant;
|
|
}
|
|
}
|
|
|
|
// Early out if we have our target err
|
|
if( err_best <= u_BC7Encode->errorThreshold)
|
|
{
|
|
break;
|
|
}
|
|
|
|
CGV_TYPEINT done;
|
|
done = !(change && better);
|
|
if ((maxTry > 0)&&(!done))
|
|
{
|
|
maxTry--;
|
|
MaxIndex = index_collapse(index_tmp, numEntries);
|
|
}
|
|
else
|
|
{
|
|
maxTry = 0;
|
|
}
|
|
|
|
} while (maxTry);
|
|
|
|
if (err_best == CMP_FLOAT_MAX)
|
|
{
|
|
ASPM_PRINT(("Err: requantized_image_err\n"));
|
|
}
|
|
|
|
return err_best;
|
|
}
|
|
|
|
CGU_UINT8 get_partitionsToTry(uniform CMP_GLOBAL BC7_Encode u_BC7Encode[],CGU_UINT8 maxPartitions)
|
|
{
|
|
CGU_FLOAT u_minPartitionSearchSize = 0.30f;
|
|
if(u_BC7Encode->quality <= BC7_qFAST_THRESHOLD) // Using this to match performance and quality of CPU code
|
|
{
|
|
u_minPartitionSearchSize = u_minPartitionSearchSize + ( u_BC7Encode->quality*BC7_qFAST_THRESHOLD);
|
|
}
|
|
else
|
|
{
|
|
u_minPartitionSearchSize = u_BC7Encode->quality;
|
|
}
|
|
return (CGU_UINT8)(maxPartitions * u_minPartitionSearchSize);
|
|
}
|
|
|
|
INLINE void cmp_encode_swap(CGV_EPOCODE endpoint[], CGU_INT channels, CGV_INDEX block_index[MAX_SUBSET_SIZE], CGU_INT bits)
|
|
{
|
|
CGU_INT levels = 1 << bits;
|
|
if (block_index[0]>=levels/2)
|
|
{
|
|
cmp_swap_epo(&endpoint[0], &endpoint[channels], channels);
|
|
for (CGU_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
|
|
block_index[k] = CGV_INDEX(levels-1) - block_index[k];
|
|
}
|
|
}
|
|
|
|
void cmp_encode_index(CGV_CMPOUT data[16], CGU_INT* uniform pPos, CGV_INDEX block_index[MAX_SUBSET_SIZE], CGU_INT bits)
|
|
{
|
|
cmp_Write8Bit(data,pPos,bits-1,block_index[0]);
|
|
for (CGU_INT j=1;j<SOURCE_BLOCK_SIZE;j++)
|
|
{
|
|
CGV_INDEX qbits = block_index[j]&0xFF;
|
|
cmp_Write8Bit(data,pPos,bits,qbits);
|
|
}
|
|
}
|
|
|
|
void encode_endpoint(CGV_CMPOUT data[16], CGU_INT* uniform pPos, CGV_BYTE block_index[16], CGU_INT bits, CGV_SHIFT32 flips)
|
|
{
|
|
CGU_INT levels = 1 << bits;
|
|
CGV_TYPEINT flips_shifted = flips;
|
|
for (CGU_INT k1=0; k1<16; k1++)
|
|
{
|
|
CGV_BYTE qbits_shifted = block_index[k1];
|
|
for (CGU_INT k2=0; k2<8; k2++)
|
|
{
|
|
CGV_TYPEINT q = qbits_shifted&15;
|
|
if ((flips_shifted&1)>0) q = (levels-1)-q;
|
|
|
|
if (k1==0 && k2==0) cmp_Write8Bit(data, pPos, bits - 1, static_cast <CGV_BYTE>(q));
|
|
else cmp_Write8Bit(data, pPos, bits, static_cast<CGV_BYTE>(q));
|
|
qbits_shifted >>= 4;
|
|
flips_shifted >>= 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
INLINE CGV_SHIFT32 pow32(CGV_SHIFT32 x)
|
|
{
|
|
return 1<<x;
|
|
}
|
|
|
|
void Encode_mode02137(
|
|
CGU_INT blockMode,
|
|
CGV_UINT8 bestPartition,
|
|
CGV_TYPEUINT32 packedEndpoints[MAX_SUBSETS*2],
|
|
CGV_BYTE index16[16],
|
|
CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE])
|
|
{
|
|
CGU_INT partitionBits;
|
|
CGU_UINT32 componentBits;
|
|
CGU_UINT8 maxSubsets;
|
|
CGU_INT channels;
|
|
CGU_BYTE indexBits;
|
|
|
|
switch(blockMode)
|
|
{
|
|
case 0:
|
|
componentBits = 4;
|
|
maxSubsets = 3;
|
|
partitionBits = 4;
|
|
channels = 3;
|
|
indexBits = 3;
|
|
break;
|
|
case 2:
|
|
componentBits = 5;
|
|
maxSubsets = 3;
|
|
partitionBits = 6;
|
|
channels = 3;
|
|
indexBits = 2;
|
|
break;
|
|
case 3:
|
|
componentBits = 7;
|
|
maxSubsets = 2;
|
|
partitionBits = 6;
|
|
channels = 3;
|
|
indexBits = 2;
|
|
break;
|
|
case 7:
|
|
componentBits = 5;
|
|
maxSubsets = 2;
|
|
partitionBits = 6;
|
|
channels = 4;
|
|
indexBits = 2;
|
|
break;
|
|
default:
|
|
case 1:
|
|
componentBits = 6;
|
|
maxSubsets = 2;
|
|
partitionBits = 6;
|
|
channels = 3;
|
|
indexBits = 3;
|
|
break;
|
|
}
|
|
|
|
CGV_BYTE blockindex[SOURCE_BLOCK_SIZE];
|
|
CGV_INT indexBitsV = indexBits;
|
|
|
|
for (CGU_INT k=0; k<COMPRESSED_BLOCK_SIZE; k++) cmp_out[k] = 0;
|
|
|
|
// mode 0 = 1, mode 1 = 01, mode 2 = 001, mode 3 = 0001, ...
|
|
CGU_INT bitPosition = blockMode;
|
|
cmp_Write8Bit(cmp_out,&bitPosition,1,1);
|
|
|
|
// Write partition bits
|
|
cmp_Write8Bit(cmp_out,&bitPosition,partitionBits,bestPartition);
|
|
|
|
// Sort out the index set and tag whether we need to flip the
|
|
// endpoints to get the correct state in the implicit index bits
|
|
// The implicitly encoded MSB of the fixup index must be 0
|
|
CGV_FIXUPINDEX fixup[3];
|
|
get_fixuptable(fixup,(maxSubsets==2?bestPartition:bestPartition+64));
|
|
|
|
// Extract indices and mark subsets that need to have their colours flipped to get the
|
|
// right state for the implicit MSB of the fixup index
|
|
CGV_INT flipColours[3] = {0, 0, 0};
|
|
|
|
for (CGV_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
|
|
{
|
|
blockindex[k] = index16[k];
|
|
for (CGU_UINT8 j=0;j<maxSubsets;j++)
|
|
{
|
|
if(k==fixup[j])
|
|
{
|
|
if(blockindex[k] & (1<<(indexBitsV-1)))
|
|
{
|
|
flipColours[j] = 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Now we must flip the endpoints where necessary so that the implicitly encoded
|
|
// index bits have the correct state
|
|
for (CGU_INT subset=0; subset<maxSubsets; subset++)
|
|
{
|
|
if(flipColours[subset] == 1)
|
|
{
|
|
CGV_TYPEUINT32 temp = packedEndpoints[subset*2+0];
|
|
packedEndpoints[subset*2+0] = packedEndpoints[subset*2+1];
|
|
packedEndpoints[subset*2+1] = temp;
|
|
}
|
|
}
|
|
|
|
// ...next flip the indices where necessary
|
|
|
|
|
|
for (CGV_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
|
|
{
|
|
CGV_UINT8 partsub = get_partition_subset(bestPartition,maxSubsets,k);
|
|
|
|
if(flipColours[partsub] == 1)
|
|
{
|
|
blockindex[k] = ((1 << indexBitsV) - 1) - blockindex[k];
|
|
}
|
|
}
|
|
|
|
// Endpoints are stored in the following order RRRR GGGG BBBB (AAAA) (PPPP)
|
|
// i.e. components are packed together
|
|
CGV_SHIFT32 unpackedColours[MAX_SUBSETS*2*MAX_CHANNELS];
|
|
CGV_BYTE parityBits[MAX_SUBSETS][2];
|
|
|
|
// Unpack the colour values for the subsets
|
|
for (CGU_INT subset=0; subset<maxSubsets; subset++)
|
|
{
|
|
CGV_SHIFT32 packedColours[2] = {packedEndpoints[subset*2+0],packedEndpoints[subset*2+1]};
|
|
|
|
if(blockMode == 0 || blockMode == 3|| blockMode == 7) // TWO_PBIT
|
|
{
|
|
parityBits[subset][0] = packedColours[0] & 1;
|
|
parityBits[subset][1] = packedColours[1] & 1;
|
|
packedColours[0] >>= 1;
|
|
packedColours[1] >>= 1;
|
|
}
|
|
else
|
|
if(blockMode == 1) // ONE_PBIT
|
|
{
|
|
parityBits[subset][0] = packedColours[1] & 1;
|
|
parityBits[subset][1] = packedColours[1] & 1;
|
|
packedColours[0] >>= 1;
|
|
packedColours[1] >>= 1;
|
|
}
|
|
else
|
|
if(blockMode == 2)
|
|
{
|
|
parityBits[subset][0] = 0;
|
|
parityBits[subset][1] = 0;
|
|
}
|
|
|
|
for (CGU_INT ch=0; ch<channels;ch++)
|
|
{
|
|
unpackedColours[(subset*2+0)*MAX_CHANNELS+ch] = packedColours[0] & ((1 << componentBits) - 1);
|
|
unpackedColours[(subset*2+1)*MAX_CHANNELS+ch] = packedColours[1] & ((1 << componentBits) - 1);
|
|
packedColours[0] >>= componentBits;
|
|
packedColours[1] >>= componentBits;
|
|
}
|
|
}
|
|
|
|
// Loop over component
|
|
for (CGU_INT ch=0; ch < channels; ch++)
|
|
{
|
|
// loop over subsets
|
|
for (CGU_INT subset=0; subset<maxSubsets; subset++)
|
|
{
|
|
cmp_Write8Bit(cmp_out,&bitPosition,componentBits,unpackedColours[(subset*2+0)*MAX_CHANNELS+ch]&0xFF);
|
|
cmp_Write8Bit(cmp_out,&bitPosition,componentBits,unpackedColours[(subset*2+1)*MAX_CHANNELS+ch]&0xFF);
|
|
}
|
|
}
|
|
|
|
|
|
// write parity bits
|
|
if (blockMode != 2)
|
|
{
|
|
for (CGV_INT subset=0; subset<maxSubsets; subset++)
|
|
{
|
|
if(blockMode == 1) // ONE_PBIT
|
|
{
|
|
cmp_Write8Bit(cmp_out,&bitPosition,1,parityBits[subset][0]&0x01);
|
|
}
|
|
else // TWO_PBIT
|
|
{
|
|
cmp_Write8Bit(cmp_out,&bitPosition,1,parityBits[subset][0]&0x01);
|
|
cmp_Write8Bit(cmp_out,&bitPosition,1,parityBits[subset][1]&0x01);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Encode the index bits
|
|
CGV_INT bitPositionV = bitPosition;
|
|
for (CGV_FIXUPINDEX k=0; k<SOURCE_BLOCK_SIZE; k++)
|
|
{
|
|
CGV_UINT8 partsub = get_partition_subset(bestPartition,maxSubsets,k);
|
|
|
|
// If this is a fixup index then drop the MSB which is implicitly 0
|
|
if(k == fixup[partsub])
|
|
{
|
|
cmp_Write8BitV(cmp_out, bitPositionV, indexBits-1,blockindex[k]&0x07F);
|
|
bitPositionV += indexBits-1;
|
|
}
|
|
else
|
|
{
|
|
cmp_Write8BitV(cmp_out,bitPositionV, indexBits,blockindex[k]);
|
|
bitPositionV += indexBits;
|
|
}
|
|
}
|
|
}
|
|
|
|
void Encode_mode4( CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE],
|
|
varying cmp_mode_parameters* uniform params )
|
|
{
|
|
CGU_INT bitPosition = 4; // Position the pointer at the LSB
|
|
|
|
for (CGU_INT k=0; k<COMPRESSED_BLOCK_SIZE; k++) cmp_out[k] = 0;
|
|
|
|
// mode 4 (5 bits) 00001
|
|
cmp_Write8Bit(cmp_out,&bitPosition,1,1);
|
|
|
|
// rotation 2 bits
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 2, static_cast <CGV_BYTE> (params->rotated_channel));
|
|
|
|
// idxMode 1 bit
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 1, static_cast <CGV_BYTE> (params->idxMode));
|
|
|
|
CGU_INT idxBits[2] = {2,3};
|
|
|
|
if(params->idxMode)
|
|
{
|
|
idxBits[0] = 3;
|
|
idxBits[1] = 2;
|
|
// Indicate if we need to fixup the index
|
|
cmp_swap_index(params->color_index,params->alpha_index,16);
|
|
cmp_encode_swap(params->alpha_qendpoint, 4, params->color_index,2);
|
|
cmp_encode_swap(params->color_qendpoint, 4, params->alpha_index,3);
|
|
}
|
|
else
|
|
{
|
|
cmp_encode_swap(params->color_qendpoint, 4, params->color_index,2);
|
|
cmp_encode_swap(params->alpha_qendpoint, 4, params->alpha_index,3);
|
|
}
|
|
|
|
// color endpoints 5 bits each
|
|
// R0 : R1
|
|
// G0 : G1
|
|
// B0 : B1
|
|
for (CGU_INT component=0; component < 3; component++)
|
|
{
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 5, static_cast<CGV_BYTE> (params->color_qendpoint[component]));
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 5, static_cast <CGV_BYTE> (params->color_qendpoint[4 + component]));
|
|
}
|
|
|
|
// alpha endpoints (6 bits each)
|
|
// A0 : A1
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 6, static_cast<CGV_BYTE> (params->alpha_qendpoint[0]));
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 6, static_cast<CGV_BYTE> (params->alpha_qendpoint[4]));
|
|
|
|
// index 2 bits each (31 bits total)
|
|
cmp_encode_index(cmp_out, &bitPosition, params->color_index, 2);
|
|
// index 3 bits each (47 bits total)
|
|
cmp_encode_index(cmp_out, &bitPosition, params->alpha_index, 3);
|
|
}
|
|
|
|
void Encode_mode5( CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE],
|
|
varying cmp_mode_parameters* uniform params)
|
|
{
|
|
for (CGU_INT k=0; k<COMPRESSED_BLOCK_SIZE; k++) cmp_out[k] = 0;
|
|
|
|
// mode 5 bits = 000001
|
|
CGU_INT bitPosition = 5; // Position the pointer at the LSB
|
|
cmp_Write8Bit(cmp_out,&bitPosition,1,1);
|
|
|
|
// Write 2 bit rotation
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 2, static_cast<CGV_BYTE> (params->rotated_channel));
|
|
|
|
cmp_encode_swap(params->color_qendpoint, 4, params->color_index,2);
|
|
cmp_encode_swap(params->alpha_qendpoint, 4, params->alpha_index,2);
|
|
|
|
// color endpoints (7 bits each)
|
|
// R0 : R1
|
|
// G0 : G1
|
|
// B0 : B1
|
|
for (CGU_INT component=0; component < 3; component++)
|
|
{
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 7, static_cast<CGV_BYTE> (params->color_qendpoint[component]));
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 7, static_cast <CGV_BYTE> (params->color_qendpoint[4 + component]));
|
|
}
|
|
|
|
// alpha endpoints (8 bits each)
|
|
// A0 : A1
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 8, static_cast<CGV_BYTE> (params->alpha_qendpoint[0]));
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 8, static_cast<CGV_BYTE> (params->alpha_qendpoint[4]));
|
|
|
|
|
|
// color index 2 bits each (31 bits total)
|
|
// alpha index 2 bits each (31 bits total)
|
|
cmp_encode_index(cmp_out, &bitPosition, params->color_index, 2);
|
|
cmp_encode_index(cmp_out, &bitPosition, params->alpha_index, 2);
|
|
}
|
|
|
|
void Encode_mode6(
|
|
CGV_INDEX index[MAX_SUBSET_SIZE],
|
|
CGV_EPOCODE epo_code[8],
|
|
CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE])
|
|
{
|
|
for (CGU_INT k=0; k<COMPRESSED_BLOCK_SIZE; k++) cmp_out[k] = 0;
|
|
|
|
cmp_encode_swap(epo_code, 4, index,4);
|
|
|
|
// Mode = 6 bits = 0000001
|
|
CGU_INT bitPosition = 6; // Position the pointer at the LSB
|
|
cmp_Write8Bit(cmp_out,&bitPosition,1, 1);
|
|
|
|
// endpoints
|
|
for (CGU_INT p=0; p<4; p++)
|
|
{
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 7, static_cast<CGV_BYTE> (epo_code[0 + p] >> 1));
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 7, static_cast<CGV_BYTE> (epo_code[4 + p] >> 1));
|
|
}
|
|
|
|
// p bits
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 1, epo_code[0]&1);
|
|
cmp_Write8Bit(cmp_out, &bitPosition, 1, epo_code[4]&1);
|
|
|
|
// quantized values
|
|
cmp_encode_index(cmp_out, &bitPosition, index, 4);
|
|
}
|
|
|
|
|
|
void Compress_mode01237(
|
|
CGU_INT blockMode,
|
|
BC7_EncodeState EncodeState[],
|
|
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
|
|
{
|
|
CGV_INDEX storedBestindex[MAX_PARTITIONS][MAX_SUBSETS][MAX_SUBSET_SIZE];
|
|
CGV_ERROR storedError[MAX_PARTITIONS];
|
|
CGV_UINT8 sortedPartition[MAX_PARTITIONS];
|
|
|
|
EncodeState->numPartitionModes = 64;
|
|
EncodeState->maxSubSets = 2;
|
|
|
|
if (blockMode == 0)
|
|
{
|
|
EncodeState->numPartitionModes = 16;
|
|
EncodeState->channels3or4 = 3;
|
|
EncodeState->bits = 26;
|
|
EncodeState->clusters = 8;
|
|
EncodeState->componentBits = 4;
|
|
EncodeState->maxSubSets = 3;
|
|
}
|
|
else
|
|
if (blockMode == 2)
|
|
{
|
|
EncodeState->channels3or4 = 3;
|
|
EncodeState->bits = 30;
|
|
EncodeState->clusters = 4;
|
|
EncodeState->componentBits = 5;
|
|
EncodeState->maxSubSets = 3;
|
|
}
|
|
else
|
|
if (blockMode == 1)
|
|
{
|
|
|
|
EncodeState->channels3or4 = 3;
|
|
EncodeState->bits = 37;
|
|
EncodeState->clusters = 8;
|
|
EncodeState->componentBits = 6;
|
|
}
|
|
else
|
|
if (blockMode == 3)
|
|
{
|
|
EncodeState->channels3or4 = 3;
|
|
EncodeState->bits = 44;
|
|
EncodeState->clusters = 4;
|
|
EncodeState->componentBits = 7;
|
|
}
|
|
else
|
|
if (blockMode == 7)
|
|
{
|
|
EncodeState->channels3or4 = 4;
|
|
EncodeState->bits = 42; // (2* (R 5 + G 5 + B 5 + A 5)) + 2 parity bits
|
|
EncodeState->clusters = 4;
|
|
EncodeState->componentBits = 5; // 5 bit components
|
|
}
|
|
|
|
CGV_IMAGE image_subsets[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_CHANNELS];
|
|
CGV_ENTRIES subset_entryCount[MAX_SUBSETS] = {0,0,0};
|
|
|
|
// Loop over the available partitions for the block mode and quantize them
|
|
// to figure out the best candidates for further refinement
|
|
CGU_UINT8 mode_partitionsToTry;
|
|
mode_partitionsToTry = get_partitionsToTry(u_BC7Encode,EncodeState->numPartitionModes);
|
|
|
|
CGV_UINT8 bestPartition = 0;
|
|
|
|
for (CGU_INT mode_blockPartition = 0; mode_blockPartition < mode_partitionsToTry; mode_blockPartition++)
|
|
{
|
|
|
|
GetPartitionSubSet_mode01237(
|
|
image_subsets,
|
|
subset_entryCount,
|
|
static_cast<CGV_UINT8>(mode_blockPartition),
|
|
EncodeState->image_src,
|
|
blockMode,
|
|
EncodeState->channels3or4);
|
|
|
|
CGV_IMAGE subset_image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
|
|
CGV_INDEX index_out1[SOURCE_BLOCK_SIZE];
|
|
CGV_ERROR err_quant = 0.0F;
|
|
|
|
// Store the quntize error for this partition to be sorted and processed later
|
|
for (CGU_INT subset=0; subset < EncodeState->maxSubSets; subset++)
|
|
{
|
|
CGV_ENTRIES numEntries = subset_entryCount[subset];
|
|
|
|
for (CGU_INT ii=0; ii<SOURCE_BLOCK_SIZE; ii++)
|
|
{
|
|
subset_image_src[ii+COMP_RED *SOURCE_BLOCK_SIZE] = image_subsets[subset][ii][0];
|
|
subset_image_src[ii+COMP_GREEN*SOURCE_BLOCK_SIZE] = image_subsets[subset][ii][1];
|
|
subset_image_src[ii+COMP_BLUE *SOURCE_BLOCK_SIZE] = image_subsets[subset][ii][2];
|
|
subset_image_src[ii+COMP_ALPHA*SOURCE_BLOCK_SIZE] = image_subsets[subset][ii][3];
|
|
}
|
|
|
|
CGV_INDEXPACKED color_index2[2];
|
|
|
|
err_quant += GetQuantizeIndex(
|
|
color_index2,
|
|
index_out1,
|
|
subset_image_src,
|
|
numEntries,
|
|
EncodeState->clusters,
|
|
EncodeState->channels3or4);
|
|
|
|
for (CGV_INT idx=0; idx < numEntries; idx++)
|
|
{
|
|
storedBestindex[mode_blockPartition][subset][idx] = index_out1[idx];
|
|
}
|
|
}
|
|
|
|
storedError[mode_blockPartition] = err_quant;
|
|
}
|
|
|
|
// Sort the results
|
|
sortPartitionProjection( storedError,
|
|
sortedPartition,
|
|
mode_partitionsToTry);
|
|
|
|
CGV_EPOCODE epo_code[MAX_SUBSETS*2*MAX_CHANNELS];
|
|
CGV_EPOCODE bestEndpoints[MAX_SUBSETS*2*MAX_CHANNELS];
|
|
CGV_BYTE bestindex[MAX_SUBSETS*MAX_SUBSET_SIZE];
|
|
CGV_ENTRIES bestEntryCount[MAX_SUBSETS];
|
|
CGV_BYTE bestindex16[MAX_SUBSET_SIZE];
|
|
|
|
// Extensive shaking is most important when the ramp is short, and
|
|
// when we have less index. On a long ramp the quality of the
|
|
// initial quantizing is relatively more important
|
|
// We modulate the shake size according to the number of ramp index
|
|
// - the more index we have the less shaking should be required to find a near
|
|
// optimal match
|
|
|
|
CGU_UINT8 numShakeAttempts = max8(1, min8((CGU_UINT8)floor(8 * u_BC7Encode->quality + 0.5), mode_partitionsToTry));
|
|
CGV_ERROR err_best = CMP_FLOAT_MAX;
|
|
|
|
// Now do the endpoint shaking
|
|
for (CGU_INT nSA =0; nSA < numShakeAttempts; nSA++)
|
|
{
|
|
|
|
CGV_ERROR err_optimized = 0.0F;
|
|
CGV_UINT8 sortedBlockPartition;
|
|
sortedBlockPartition = sortedPartition[nSA];
|
|
|
|
//********************************************
|
|
// Get the partition shape for the given mode
|
|
//********************************************
|
|
GetPartitionSubSet_mode01237(
|
|
image_subsets,
|
|
subset_entryCount,
|
|
sortedBlockPartition,
|
|
EncodeState->image_src,
|
|
blockMode,
|
|
EncodeState->channels3or4);
|
|
|
|
//*****************************
|
|
// Process the partition shape
|
|
//*****************************
|
|
for (CGU_INT subset=0; subset < EncodeState->maxSubSets; subset++)
|
|
{
|
|
CGV_ENTRIES numEntries = subset_entryCount[subset];
|
|
CGV_IMAGE src_image_block[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
|
|
CGV_INDEX index_io[MAX_SUBSET_SIZE];
|
|
CGV_EPOCODE tmp_epo_code[8];
|
|
|
|
for (CGU_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
|
|
{
|
|
src_image_block[k+COMP_RED*SOURCE_BLOCK_SIZE] = image_subsets[subset][k][0];
|
|
src_image_block[k+COMP_GREEN*SOURCE_BLOCK_SIZE] = image_subsets[subset][k][1];
|
|
src_image_block[k+COMP_BLUE*SOURCE_BLOCK_SIZE] = image_subsets[subset][k][2];
|
|
src_image_block[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] = image_subsets[subset][k][3];
|
|
}
|
|
|
|
for (CGU_INT k=0; k<MAX_SUBSET_SIZE; k++)
|
|
{
|
|
index_io[k] = storedBestindex[sortedBlockPartition][subset][k];
|
|
}
|
|
|
|
err_optimized += optimize_IndexAndEndPoints(
|
|
index_io,
|
|
tmp_epo_code,
|
|
src_image_block,
|
|
numEntries,
|
|
static_cast<CGU_INT8>(EncodeState->clusters), // Mi_
|
|
EncodeState->bits,
|
|
EncodeState->channels3or4,
|
|
u_BC7Encode);
|
|
|
|
for (CGU_INT k=0; k < MAX_SUBSET_SIZE; k++)
|
|
{
|
|
storedBestindex[sortedBlockPartition][subset][k] = index_io[k];
|
|
}
|
|
|
|
for (CGU_INT ch=0; ch<MAX_CHANNELS; ch++)
|
|
{
|
|
epo_code[(subset*2+0)*4+ch] = tmp_epo_code[ ch];
|
|
epo_code[(subset*2+1)*4+ch] = tmp_epo_code[4+ch];
|
|
}
|
|
}
|
|
|
|
//****************************************
|
|
// Check if result is better than the last
|
|
//****************************************
|
|
if(err_optimized < err_best)
|
|
{
|
|
bestPartition = sortedBlockPartition;
|
|
CGV_INT bestIndexCount = 0;
|
|
|
|
for (CGU_INT subset=0; subset < EncodeState->maxSubSets; subset++)
|
|
{
|
|
CGV_ENTRIES numEntries = subset_entryCount[subset];
|
|
bestEntryCount[subset] = numEntries;
|
|
|
|
if(numEntries)
|
|
{
|
|
for (CGU_INT ch=0; ch < EncodeState->channels3or4; ch++)
|
|
{
|
|
bestEndpoints[(subset*2+0)*4+ch] = epo_code[(subset*2+0)*4+ch];
|
|
bestEndpoints[(subset*2+1)*4+ch] = epo_code[(subset*2+1)*4+ch];
|
|
}
|
|
|
|
for (CGV_ENTRIES k=0; k< numEntries; k++)
|
|
{
|
|
bestindex[subset*MAX_SUBSET_SIZE+k] = storedBestindex[sortedBlockPartition][subset][k];
|
|
bestindex16[bestIndexCount++] = storedBestindex[sortedBlockPartition][subset][k];
|
|
}
|
|
}
|
|
}
|
|
|
|
err_best = err_optimized;
|
|
// Early out if we found we can compress with error below the quality threshold
|
|
if(err_best <= u_BC7Encode->errorThreshold)
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
if (blockMode != 7)
|
|
err_best += EncodeState->opaque_err;
|
|
|
|
if(err_best > EncodeState->best_err)
|
|
return;
|
|
|
|
//**************************
|
|
// Save the encoded block
|
|
//**************************
|
|
EncodeState->best_err = err_best;
|
|
|
|
|
|
// Now we have all the data needed to encode the block
|
|
// We need to pack the endpoints prior to encoding
|
|
CGV_TYPEUINT32 packedEndpoints[MAX_SUBSETS*2] = {0,0,0,0,0,0};
|
|
for (CGU_INT subset=0; subset<EncodeState->maxSubSets; subset++)
|
|
{
|
|
packedEndpoints[(subset*2)+0] = 0;
|
|
packedEndpoints[(subset*2)+1] = 0;
|
|
|
|
if(bestEntryCount[subset])
|
|
{
|
|
CGU_UINT32 rightAlignment = 0;
|
|
|
|
// Sort out parity bits
|
|
if(blockMode != 2)
|
|
{
|
|
// Sort out BCC parity bits
|
|
packedEndpoints[(subset*2)+0] = bestEndpoints[(subset*2+0)*4+0] & 1;
|
|
packedEndpoints[(subset*2)+1] = bestEndpoints[(subset*2+1)*4+0] & 1;
|
|
for (CGU_INT ch=0; ch<EncodeState->channels3or4; ch++)
|
|
{
|
|
bestEndpoints[(subset*2+0)*4+ch] >>= 1;
|
|
bestEndpoints[(subset*2+1)*4+ch] >>= 1;
|
|
}
|
|
rightAlignment++;
|
|
}
|
|
|
|
// Fixup endpoints
|
|
for (CGU_INT ch=0; ch<EncodeState->channels3or4; ch++)
|
|
{
|
|
packedEndpoints[(subset*2)+0] |= bestEndpoints[((subset*2)+0)*4+ch] << rightAlignment;
|
|
packedEndpoints[(subset*2)+1] |= bestEndpoints[((subset*2)+1)*4+ch] << rightAlignment;
|
|
rightAlignment += EncodeState->componentBits;
|
|
}
|
|
}
|
|
}
|
|
|
|
CGV_UINT8 idxCount[3] = {0, 0, 0};
|
|
for (CGV_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
|
|
{
|
|
CGV_UINT8 partsub = get_partition_subset(bestPartition,EncodeState->maxSubSets,k);
|
|
CGV_UINT8 idxC = idxCount[partsub];
|
|
bestindex16[k] = bestindex[partsub*MAX_SUBSET_SIZE+idxC];
|
|
idxCount[partsub] = idxC + 1;
|
|
}
|
|
|
|
Encode_mode02137(
|
|
blockMode,
|
|
bestPartition,
|
|
packedEndpoints,
|
|
bestindex16,
|
|
EncodeState->cmp_out);
|
|
}
|
|
|
|
void Compress_mode45(
|
|
CGU_INT blockMode,
|
|
BC7_EncodeState EncodeState[],
|
|
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
|
|
{
|
|
|
|
cmp_mode_parameters best_candidate;
|
|
EncodeState->channels3or4 = 4;
|
|
cmp_memsetBC7((CGV_BYTE *)&best_candidate, 0, sizeof(cmp_mode_parameters));
|
|
|
|
if (blockMode == 4)
|
|
{
|
|
EncodeState->max_idxMode = 2;
|
|
EncodeState->modeBits[0] = 30; // bits = 2 * (Red 5+ Grn 5+ blu 5)
|
|
EncodeState->modeBits[1] = 36; // bits = 2 * (Alpha 6+6+6)
|
|
EncodeState->numClusters0[0] = 4;
|
|
EncodeState->numClusters0[1] = 8;
|
|
EncodeState->numClusters1[0] = 8;
|
|
EncodeState->numClusters1[1] = 4;
|
|
}
|
|
else
|
|
{
|
|
EncodeState->max_idxMode = 1;
|
|
EncodeState->modeBits[0] = 42; // bits = 2 * (Red 7+ Grn 7+ blu 7)
|
|
EncodeState->modeBits[1] = 48; // bits = 2 * (Alpha 8+8+8) = 48
|
|
EncodeState->numClusters0[0] = 4;
|
|
EncodeState->numClusters0[1] = 4;
|
|
EncodeState->numClusters1[0] = 4;
|
|
EncodeState->numClusters1[1] = 4;
|
|
}
|
|
|
|
|
|
CGV_IMAGE src_color_Block[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
|
|
CGV_IMAGE src_alpha_Block[SOURCE_BLOCK_SIZE*MAX_CHANNELS];
|
|
|
|
// Go through each possible rotation and selection of index rotationBits)
|
|
for (CGU_CHANNEL rotated_channel = 0; rotated_channel < EncodeState->channels3or4; rotated_channel++)
|
|
{ // A
|
|
|
|
for (CGU_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
|
|
{
|
|
for (CGU_INT p=0; p<3; p++)
|
|
{
|
|
src_color_Block[k+p*SOURCE_BLOCK_SIZE] = EncodeState->image_src[k+componentRotations[rotated_channel][p+1]*SOURCE_BLOCK_SIZE];
|
|
src_alpha_Block[k+p*SOURCE_BLOCK_SIZE] = EncodeState->image_src[k+componentRotations[rotated_channel][0]*SOURCE_BLOCK_SIZE];
|
|
}
|
|
}
|
|
|
|
CGV_ERROR err_quantizer;
|
|
CGV_ERROR err_bestQuantizer = CMP_FLOAT_MAX;
|
|
|
|
for (CGU_INT idxMode = 0; idxMode < EncodeState->max_idxMode; idxMode++)
|
|
{ // B
|
|
CGV_INDEXPACKED color_index2[2]; // reserved .. Not used!
|
|
|
|
err_quantizer = GetQuantizeIndex(
|
|
color_index2,
|
|
best_candidate.color_index,
|
|
src_color_Block,
|
|
SOURCE_BLOCK_SIZE,
|
|
EncodeState->numClusters0[idxMode],
|
|
3);
|
|
|
|
err_quantizer += GetQuantizeIndex(
|
|
color_index2,
|
|
best_candidate.alpha_index,
|
|
src_alpha_Block,
|
|
SOURCE_BLOCK_SIZE,
|
|
EncodeState->numClusters1[idxMode],
|
|
3) / 3.0F;
|
|
|
|
// If quality is high then run the full shaking for this config and
|
|
// store the result if it beats the best overall error
|
|
// Otherwise only run the shaking if the error is better than the best
|
|
// quantizer error
|
|
if(err_quantizer <= err_bestQuantizer)
|
|
{
|
|
err_bestQuantizer = err_quantizer;
|
|
|
|
// Shake size gives the size of the shake cube
|
|
CGV_ERROR err_overallError;
|
|
|
|
err_overallError = optimize_IndexAndEndPoints(
|
|
best_candidate.color_index,
|
|
best_candidate.color_qendpoint,
|
|
src_color_Block,
|
|
SOURCE_BLOCK_SIZE,
|
|
EncodeState->numClusters0[idxMode],
|
|
static_cast<CGU_INT8>(EncodeState->modeBits[0]),
|
|
3,
|
|
u_BC7Encode);
|
|
|
|
// Alpha scalar block
|
|
err_overallError += optimize_IndexAndEndPoints(
|
|
best_candidate.alpha_index,
|
|
best_candidate.alpha_qendpoint,
|
|
src_alpha_Block,
|
|
SOURCE_BLOCK_SIZE,
|
|
EncodeState->numClusters1[idxMode],
|
|
static_cast<CGU_UINT8>(EncodeState->modeBits[1]),
|
|
3,
|
|
u_BC7Encode) / 3.0f;
|
|
|
|
// If we beat the previous best then encode the block
|
|
if(err_overallError < EncodeState->best_err)
|
|
{
|
|
best_candidate.idxMode = idxMode;
|
|
best_candidate.rotated_channel = rotated_channel;
|
|
if (blockMode == 4)
|
|
Encode_mode4( EncodeState->cmp_out, &best_candidate);
|
|
else
|
|
Encode_mode5( EncodeState->cmp_out, &best_candidate);
|
|
EncodeState->best_err = err_overallError;
|
|
}
|
|
}
|
|
} // B
|
|
} // A
|
|
}
|
|
|
|
|
|
void Compress_mode6( BC7_EncodeState EncodeState[],
|
|
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
|
|
{
|
|
CGV_ERROR err;
|
|
|
|
CGV_EPOCODE epo_code_out[8] = {0};
|
|
CGV_INDEX best_index_out[MAX_SUBSET_SIZE];
|
|
CGV_INDEXPACKED best_packedindex_out[2];
|
|
|
|
|
|
// CGV_IMAGE block_endpoints[8];
|
|
// icmp_get_block_endpoints(block_endpoints, EncodeState->image_src, -1, 4);
|
|
// icmp_GetQuantizedEpoCode(epo_code_out, block_endpoints, 6,4);
|
|
// err = icmp_GetQuantizeIndex(best_packedindex_out, best_index_out, EncodeState->image_src, 4, block_endpoints, 0,4);
|
|
|
|
err = GetQuantizeIndex(
|
|
best_packedindex_out,
|
|
best_index_out,
|
|
EncodeState->image_src,
|
|
16, // numEntries
|
|
16, // clusters
|
|
4); // channels3or4
|
|
|
|
//*****************************
|
|
// Process the partition shape
|
|
//*****************************
|
|
err = optimize_IndexAndEndPoints(
|
|
best_index_out,
|
|
epo_code_out,
|
|
EncodeState->image_src,
|
|
16, //numEntries
|
|
16, // Mi_ = clusters
|
|
58, // bits
|
|
4, // channels3or4
|
|
u_BC7Encode);
|
|
|
|
//**************************
|
|
// Save the encoded block
|
|
//**************************
|
|
|
|
if (err < EncodeState->best_err)
|
|
{
|
|
EncodeState->best_err = err;
|
|
Encode_mode6(
|
|
best_index_out,
|
|
epo_code_out,
|
|
EncodeState->cmp_out);
|
|
}
|
|
}
|
|
|
|
void copy_BC7_Encode_settings(BC7_EncodeState EncodeState[], uniform CMP_GLOBAL BC7_Encode settings [])
|
|
{
|
|
EncodeState->best_err = CMP_FLOAT_MAX;
|
|
EncodeState->validModeMask = settings->validModeMask;
|
|
#ifdef USE_ICMP
|
|
EncodeState->part_count = settings->part_count;
|
|
EncodeState->channels = settings->channels;
|
|
#endif
|
|
}
|
|
|
|
//===================================== ICMP CODE =========================================================
|
|
#ifdef USE_ICMP
|
|
//========================================
|
|
// Modified Intel Texture Compression Code
|
|
//========================================
|
|
|
|
void icmp_Write32Bit(CGV_CMPOUTPACKED base[], CGU_INT* uniform offset, CGU_INT bits, CGV_CMPOUTPACKED bitVal)
|
|
{
|
|
base[*offset / 32] |= ((CGV_CMPOUTPACKED)bitVal) << (*offset % 32);
|
|
if (*offset % 32 + bits > 32)
|
|
{
|
|
base[*offset / 32 + 1] |= shift_right_uint32(bitVal, 32 - *offset % 32);
|
|
}
|
|
*offset += bits;
|
|
}
|
|
|
|
//================ 32 bit cmp_out mode encoders ===============
|
|
|
|
INLINE void icmp_swap_epocode(CGV_EPOCODE u[], CGV_EPOCODE v[], CGU_INT n)
|
|
{
|
|
for (CGU_INT i = 0; i < n; i++)
|
|
{
|
|
CGV_EPOCODE t = u[i];
|
|
u[i] = v[i];
|
|
v[i] = t;
|
|
}
|
|
}
|
|
|
|
void icmp_encode_apply_swap(CGV_EPOCODE endpoint[], CGU_INT channel, CGV_INDEXPACKED block_index[2], CGU_INT bits)
|
|
{
|
|
CGU_INT levels = 1 << bits;
|
|
if ((block_index[0] & 15) >= levels / 2)
|
|
{
|
|
icmp_swap_epocode(&endpoint[0], &endpoint[channel], channel);
|
|
|
|
for (CGU_INT k = 0; k < 2; k++)
|
|
block_index[k] = (CGV_INDEXPACKED)(0x11111111 * (levels - 1)) - block_index[k];
|
|
}
|
|
}
|
|
|
|
void icmp_encode_index(CGV_CMPOUTPACKED data[5], CGU_INT* uniform pPos, CGV_INDEXPACKED block_index[2], CGU_INT bits, CGV_MASK flips)
|
|
{
|
|
CGU_INT levels = 1 << bits;
|
|
CGV_MASK flips_shifted = flips;
|
|
for (CGU_INT k1 = 0; k1 < 2; k1++)
|
|
{
|
|
CGV_CMPOUTPACKED qbits_shifted = block_index[k1];
|
|
for (CGU_INT k2 = 0; k2 < 8; k2++)
|
|
{
|
|
CGV_CMPOUTPACKED q = qbits_shifted & 15;
|
|
if ((flips_shifted & 1) > 0) q = (levels - 1) - q;
|
|
|
|
if (k1 == 0 && k2 == 0) icmp_Write32Bit(data, pPos, bits - 1, q);
|
|
else icmp_Write32Bit(data, pPos, bits, q);
|
|
qbits_shifted >>= 4;
|
|
flips_shifted >>= 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
void icmp_bc7_encode_endpoint2(CGV_CMPOUTPACKED data[5], CGU_INT* uniform pPos, CGV_INDEXPACKED color_index[2], CGU_INT bits, CGV_MASK flips)
|
|
{
|
|
CGU_INT levels = 1 << bits;
|
|
CGV_MASK flips_shifted = flips;
|
|
for (CGU_INT k1 = 0; k1 < 2; k1++)
|
|
{
|
|
CGV_INDEXPACKED qbits_shifted = color_index[k1];
|
|
for (CGU_INT k2 = 0; k2 < 8; k2++)
|
|
{
|
|
CGV_INDEXPACKED q = qbits_shifted & 15;
|
|
if ((flips_shifted & 1) > 0) q = (levels - 1) - q;
|
|
|
|
if (k1 == 0 && k2 == 0) icmp_Write32Bit(data, pPos, bits - 1, q);
|
|
else icmp_Write32Bit(data, pPos, bits, q);
|
|
qbits_shifted >>= 4;
|
|
flips_shifted >>= 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
INLINE CGV_CMPOUTPACKED icmp_pow2Packed(CGV_FIXUPINDEX x)
|
|
{
|
|
return 1 << x;
|
|
}
|
|
|
|
INLINE void icmp_encode_data_shl_1bit_from(CGV_CMPOUTPACKED data[5], CGV_FIXUPINDEX from)
|
|
{
|
|
if (from < 96)
|
|
{
|
|
//assert(from > 64+10);
|
|
|
|
CGV_CMPOUTPACKED shifted = (data[2] >> 1) | (data[3] << 31);
|
|
CGV_CMPOUTPACKED mask = (icmp_pow2Packed(from - 64) - 1) >> 1;
|
|
data[2] = (mask&data[2]) | (~mask&shifted);
|
|
data[3] = (data[3] >> 1) | (data[4] << 31);
|
|
data[4] = data[4] >> 1;
|
|
}
|
|
else if (from < 128)
|
|
{
|
|
CGV_CMPOUTPACKED shifted = (data[3] >> 1) | (data[4] << 31);
|
|
CGV_CMPOUTPACKED mask = (icmp_pow2Packed(from - 96) - 1) >> 1;
|
|
data[3] = (mask&data[3]) | (~mask&shifted);
|
|
data[4] = data[4] >> 1;
|
|
}
|
|
}
|
|
|
|
INLINE void icmp_get_fixuptable(CGV_FIXUPINDEX fixup[3], CGV_PARTID part_id)
|
|
{
|
|
// same as CMP SDK v3.1 BC7_FIXUPINDEX1 & BC7_FIXUPINDEX2 for each partition range 0..63
|
|
// The data is saved as a packed INT = (BC7_FIXUPINDEX1 << 4 + BC7_FIXUPINDEX2)
|
|
CMP_STATIC uniform __constant CGV_FIXUPINDEX FIXUPINDEX[] = {
|
|
// 2 subset partitions 0..63
|
|
0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u,
|
|
0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0x20u, 0x20u,
|
|
0xf0u, 0xf0u, 0x60u, 0x80u, 0x20u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0x60u,
|
|
0x60u, 0x20u, 0x60u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u,
|
|
// 3 subset partitions 64..128
|
|
0x3fu, 0x38u, 0xf8u, 0xf3u, 0x8fu, 0x3fu, 0xf3u, 0xf8u, 0x8fu, 0x8fu, 0x6fu, 0x6fu, 0x6fu, 0x5fu, 0x3fu, 0x38u,
|
|
0x3fu, 0x38u, 0x8fu, 0xf3u, 0x3fu, 0x38u, 0x6fu, 0xa8u, 0x53u, 0x8fu, 0x86u, 0x6au, 0x8fu, 0x5fu, 0xfau, 0xf8u,
|
|
0x8fu, 0xf3u, 0x3fu, 0x5au, 0x6au, 0xa8u, 0x89u, 0xfau, 0xf6u, 0x3fu, 0xf8u, 0x5fu, 0xf3u, 0xf6u, 0xf6u, 0xf8u,
|
|
0x3fu, 0xf3u, 0x5fu, 0x5fu, 0x5fu, 0x8fu, 0x5fu, 0xafu, 0x5fu, 0xafu, 0x8fu, 0xdfu, 0xf3u, 0xcfu, 0x3fu, 0x38u
|
|
};
|
|
|
|
CGV_FIXUPINDEX skip_packed = FIXUPINDEX[part_id];// gather_int2(FIXUPINDEX, part_id);
|
|
fixup[0] = 0;
|
|
fixup[1] = skip_packed >> 4;
|
|
fixup[2] = skip_packed & 15;
|
|
}
|
|
|
|
void icmp_bc7_encode_adjust_skip_mode01237_2(CGV_CMPOUTPACKED data[5], CGU_INT mode, CGV_PARTID part_id)
|
|
{
|
|
CGU_INT bits = 2; if (mode == 0 || mode == 1) bits = 3;
|
|
CGU_INT maxSubSets = 2; if (mode == 0 || mode == 2) maxSubSets = 3;
|
|
|
|
CGV_FIXUPINDEX fixup[3];
|
|
icmp_get_fixuptable(fixup, part_id);
|
|
|
|
if (maxSubSets > 2 && fixup[1] < fixup[2])
|
|
{
|
|
CGV_FIXUPINDEX t = fixup[1]; fixup[1] = fixup[2]; fixup[2] = t;
|
|
}
|
|
|
|
for (CGU_INT j = 1; j < maxSubSets; j++)
|
|
{
|
|
CGV_FIXUPINDEX k = fixup[j];
|
|
icmp_encode_data_shl_1bit_from(data, 128 + (maxSubSets - 1) - (15 - k)*bits);
|
|
}
|
|
}
|
|
|
|
INLINE CGV_UINT32 gather_uint32(__constant CGU_UINT32 * const uniform ptr, CGV_INT idx)
|
|
{
|
|
return ptr[idx]; // (perf warning expected)
|
|
}
|
|
|
|
INLINE CGV_MASK icmp_get_partition_mask(CGV_PARTID part_id, CGU_INT subset)
|
|
{
|
|
CMP_STATIC uniform __constant CGV_SHIFT32 pattern_mask_table[] = {
|
|
// 2 subset partitions
|
|
0xCCCC3333u, 0x88887777u, 0xEEEE1111u, 0xECC81337u, 0xC880377Fu, 0xFEEC0113u, 0xFEC80137u, 0xEC80137Fu,
|
|
0xC80037FFu, 0xFFEC0013u, 0xFE80017Fu, 0xE80017FFu, 0xFFE80017u, 0xFF0000FFu, 0xFFF0000Fu, 0xF0000FFFu,
|
|
0xF71008EFu, 0x008EFF71u, 0x71008EFFu, 0x08CEF731u, 0x008CFF73u, 0x73108CEFu, 0x3100CEFFu, 0x8CCE7331u,
|
|
0x088CF773u, 0x3110CEEFu, 0x66669999u, 0x366CC993u, 0x17E8E817u, 0x0FF0F00Fu, 0x718E8E71u, 0x399CC663u,
|
|
0xAAAA5555u, 0xF0F00F0Fu, 0x5A5AA5A5u, 0x33CCCC33u, 0x3C3CC3C3u, 0x55AAAA55u, 0x96966969u, 0xA55A5AA5u,
|
|
0x73CE8C31u, 0x13C8EC37u, 0x324CCDB3u, 0x3BDCC423u, 0x69969669u, 0xC33C3CC3u, 0x99666699u, 0x0660F99Fu,
|
|
0x0272FD8Du, 0x04E4FB1Bu, 0x4E40B1BFu, 0x2720D8DFu, 0xC93636C9u, 0x936C6C93u, 0x39C6C639u, 0x639C9C63u,
|
|
0x93366CC9u, 0x9CC66339u, 0x817E7E81u, 0xE71818E7u, 0xCCF0330Fu, 0x0FCCF033u, 0x774488BBu, 0xEE2211DDu,
|
|
|
|
// 3 subset partitions
|
|
0x08CC0133u, 0x8CC80037u, 0xCC80006Fu, 0xEC001331u, 0x330000FFu, 0x00CC3333u, 0xFF000033u, 0xCCCC0033u,
|
|
0x0F0000FFu, 0x0FF0000Fu, 0x00F0000Fu, 0x44443333u, 0x66661111u, 0x22221111u, 0x136C0013u, 0x008C8C63u,
|
|
0x36C80137u, 0x08CEC631u, 0x3330000Fu, 0xF0000333u, 0x00EE1111u, 0x88880077u, 0x22C0113Fu, 0x443088CFu,
|
|
0x0C22F311u, 0x03440033u, 0x69969009u, 0x9960009Fu, 0x03303443u, 0x00660699u, 0xC22C3113u, 0x8C0000EFu,
|
|
0x1300007Fu, 0xC4003331u, 0x004C1333u, 0x22229999u, 0x00F0F00Fu, 0x24929249u, 0x29429429u, 0xC30C30C3u,
|
|
0xC03C3C03u, 0x00AA0055u, 0xAA0000FFu, 0x30300303u, 0xC0C03333u, 0x90900909u, 0xA00A5005u, 0xAAA0000Fu,
|
|
0x0AAA0555u, 0xE0E01111u, 0x70700707u, 0x6660000Fu, 0x0EE01111u, 0x07707007u, 0x06660999u, 0x660000FFu,
|
|
0x00660099u, 0x0CC03333u, 0x03303003u, 0x60000FFFu, 0x80807777u, 0x10100101u, 0x000A0005u, 0x08CE8421u
|
|
};
|
|
|
|
CGV_MASK mask_packed = gather_uint32(pattern_mask_table, part_id);
|
|
CGV_MASK mask0 = mask_packed & 0xFFFF;
|
|
CGV_MASK mask1 = mask_packed >> 16;
|
|
|
|
CGV_MASK mask = (subset == 2) ? (~mask0)&(~mask1) : ((subset == 0) ? mask0 : mask1);
|
|
return mask;
|
|
}
|
|
|
|
#ifdef USE_VARYING
|
|
#ifdef ASPM_GPU
|
|
INLINE CGV_INDEXPACKED gather_packedindex(CGV_INDEXPACKED* ptr, CGV_FIXUPINDEX idx)
|
|
{
|
|
return ptr[idx];
|
|
}
|
|
#else
|
|
INLINE CGV_INDEXPACKED gather_packedindex(CMP_CONSTANT varying CGV_INDEXPACKED* CMP_CONSTANT uniform ptr, CGV_FIXUPINDEX idx)
|
|
{
|
|
return ptr[idx]; // (perf warning expected)
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
CGV_MASK icmp_encode_apply_swap_mode01237(CGV_EPOCODE qep[], CGV_INDEXPACKED color_index[2], CGU_INT blockMode, CGV_PARTID part_id)
|
|
{
|
|
CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3;
|
|
CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3;
|
|
|
|
CGV_MASK flips = 0;
|
|
CGU_INT levels = 1 << bits;
|
|
CGV_FIXUPINDEX fixup[3];
|
|
icmp_get_fixuptable(fixup, part_id);
|
|
|
|
for (CGU_INT j = 0; j < maxSubSets; j++)
|
|
{
|
|
CGV_FIXUPINDEX k0 = fixup[j];
|
|
|
|
#ifdef USE_VARYING
|
|
CGV_INDEXPACKED q = ((gather_packedindex(color_index, k0 >> 3) << (28 - (k0 & 7) * 4)) >> 28);
|
|
#else
|
|
CGV_INDEXPACKED q = ((color_index[k0 >> 3] << (28 - (k0 & 7) * 4)) >> 28);
|
|
#endif
|
|
|
|
if (q >= levels / 2)
|
|
{
|
|
icmp_swap_epocode(&qep[8 * j], &qep[8 * j + 4], 4);
|
|
CGV_MASK partition_mask = icmp_get_partition_mask(part_id, j);
|
|
flips |= partition_mask;
|
|
}
|
|
}
|
|
|
|
return flips;
|
|
}
|
|
|
|
void icmp_encode_mode01237(CGV_CMPOUTPACKED cmp_out[5], CGV_EPOCODE color_qendpoint[], CGV_INDEXPACKED color_index[2], CGV_PARTID part_id, CGU_INT blockMode)
|
|
{
|
|
CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3;
|
|
CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3;
|
|
CGU_INT channels = 3; if (blockMode == 7) channels = 4;
|
|
|
|
CGV_MASK flips = icmp_encode_apply_swap_mode01237(color_qendpoint, color_index, blockMode, part_id);
|
|
|
|
for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0;
|
|
CGU_INT pos = 0;
|
|
|
|
// mode 0-3, 7
|
|
icmp_Write32Bit(cmp_out, &pos, blockMode + 1, 1 << blockMode);
|
|
|
|
// partition
|
|
if (blockMode == 0)
|
|
{
|
|
icmp_Write32Bit(cmp_out, &pos, 4, part_id & 15);
|
|
}
|
|
else
|
|
{
|
|
icmp_Write32Bit(cmp_out, &pos, 6, part_id & 63);
|
|
}
|
|
|
|
// endpoints
|
|
for (CGU_INT ch = 0; ch < channels; ch++)
|
|
for (CGU_INT j = 0; j < maxSubSets * 2; j++)
|
|
{
|
|
if (blockMode == 0)
|
|
{
|
|
icmp_Write32Bit(cmp_out, &pos, 4, color_qendpoint[j * 4 + 0 + ch] >> 1);
|
|
}
|
|
else if (blockMode == 1)
|
|
{
|
|
icmp_Write32Bit(cmp_out, &pos, 6, color_qendpoint[j * 4 + 0 + ch] >> 1);
|
|
}
|
|
else if (blockMode == 2)
|
|
{
|
|
icmp_Write32Bit(cmp_out, &pos, 5, color_qendpoint[j * 4 + 0 + ch]);
|
|
}
|
|
else if (blockMode == 3)
|
|
{
|
|
icmp_Write32Bit(cmp_out, &pos, 7, color_qendpoint[j * 4 + 0 + ch] >> 1);
|
|
}
|
|
else if (blockMode == 7)
|
|
{
|
|
icmp_Write32Bit(cmp_out, &pos, 5, color_qendpoint[j * 4 + 0 + ch] >> 1);
|
|
}
|
|
//else
|
|
//{
|
|
// assert(false);
|
|
//}
|
|
}
|
|
|
|
// p bits
|
|
if (blockMode == 1)
|
|
for (CGU_INT j = 0; j < 2; j++)
|
|
{
|
|
icmp_Write32Bit(cmp_out, &pos, 1, color_qendpoint[j * 8] & 1);
|
|
}
|
|
|
|
if (blockMode == 0 || blockMode == 3 || blockMode == 7)
|
|
for (CGU_INT j = 0; j < maxSubSets * 2; j++)
|
|
{
|
|
icmp_Write32Bit(cmp_out, &pos, 1, color_qendpoint[j * 4] & 1);
|
|
}
|
|
|
|
// quantized values
|
|
icmp_bc7_encode_endpoint2(cmp_out, &pos, color_index, bits, flips);
|
|
icmp_bc7_encode_adjust_skip_mode01237_2(cmp_out, blockMode, part_id);
|
|
}
|
|
|
|
INLINE void icmp_swap_indexpacked(CGV_INDEXPACKED u[], CGV_INDEXPACKED v[], CGU_INT n)
|
|
{
|
|
for (CGU_INT i = 0; i < n; i++)
|
|
{
|
|
CGV_INDEXPACKED t = u[i];
|
|
u[i] = v[i];
|
|
v[i] = t;
|
|
}
|
|
}
|
|
|
|
|
|
void icmp_encode_mode4(CGV_CMPOUTPACKED cmp_out[5], varying cmp_mode_parameters* uniform params)
|
|
{
|
|
CGV_EPOCODE color_qendpoint[8];
|
|
CGV_INDEXPACKED color_index[2];
|
|
CGV_EPOCODE alpha_qendpoint[2];
|
|
CGV_INDEXPACKED alpha_index[2];
|
|
|
|
CGV_CMPOUTPACKED rotated_channel = params->rotated_channel;
|
|
CGV_SHIFT32 idxMode = params->idxMode;
|
|
|
|
icmp_swap_epocode(params->color_qendpoint, color_qendpoint, 8);
|
|
icmp_swap_indexpacked(params->best_color_index, color_index, 2);
|
|
icmp_swap_epocode(params->alpha_qendpoint, alpha_qendpoint, 2);
|
|
icmp_swap_indexpacked(params->best_alpha_index, alpha_index, 2);
|
|
|
|
for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0;
|
|
CGU_INT pos = 0;
|
|
|
|
// mode 4 (5 bits) 00001
|
|
icmp_Write32Bit(cmp_out, &pos, 5, 16);
|
|
|
|
// rotation channel 2 bits
|
|
icmp_Write32Bit(cmp_out, &pos, 2, (rotated_channel + 1) & 3);
|
|
|
|
// idxMode 1 bit
|
|
icmp_Write32Bit(cmp_out, &pos, 1, idxMode);
|
|
|
|
if (!idxMode)
|
|
{
|
|
icmp_encode_apply_swap(color_qendpoint, 4, color_index, 2);
|
|
icmp_encode_apply_swap(alpha_qendpoint, 1, alpha_index, 3);
|
|
}
|
|
else
|
|
{
|
|
icmp_swap_indexpacked(color_index, alpha_index, 2);
|
|
icmp_encode_apply_swap(alpha_qendpoint, 1, color_index, 2);
|
|
icmp_encode_apply_swap(color_qendpoint, 4, alpha_index, 3);
|
|
}
|
|
|
|
// color endpoints 5 bits each
|
|
// R0 : R1
|
|
// G0 : G1
|
|
// B0 : B1
|
|
for (CGU_INT p = 0; p < 3; p++)
|
|
{
|
|
CGV_EPOCODE c0 = color_qendpoint[0 + p];
|
|
CGV_EPOCODE c1 = color_qendpoint[4 + p];
|
|
icmp_Write32Bit(cmp_out, &pos, 5, c0); // 0
|
|
icmp_Write32Bit(cmp_out, &pos, 5, c1); // 1
|
|
}
|
|
|
|
// alpha endpoints (6 bits each)
|
|
// A0 : A1
|
|
icmp_Write32Bit(cmp_out, &pos, 6, alpha_qendpoint[0]);
|
|
icmp_Write32Bit(cmp_out, &pos, 6, alpha_qendpoint[1]);
|
|
|
|
// index data (color index 2 bits each) 31 bits total
|
|
icmp_encode_index(cmp_out, &pos, color_index, 2, 0);
|
|
|
|
// index data (alpha index 3 bits each) 47 bits total
|
|
icmp_encode_index(cmp_out, &pos, alpha_index, 3, 0);
|
|
}
|
|
|
|
void icmp_Encode_mode5(CGV_CMPOUTPACKED cmp_out[5], varying cmp_mode_parameters* uniform params)
|
|
{
|
|
|
|
CGV_EPOCODE qep[8];
|
|
CGV_INDEXPACKED color_index[2];
|
|
CGV_EPOCODE alpha_qendpoint[2];
|
|
CGV_INDEXPACKED alpha_index[2];
|
|
|
|
icmp_swap_epocode(params->color_qendpoint, qep, 8);
|
|
icmp_swap_indexpacked(params->best_color_index, color_index, 2);
|
|
icmp_swap_epocode(params->alpha_qendpoint, alpha_qendpoint, 2);
|
|
icmp_swap_indexpacked(params->best_alpha_index, alpha_index, 2);
|
|
|
|
CGV_CMPOUTPACKED rotated_channel = params->rotated_channel;
|
|
|
|
icmp_encode_apply_swap(qep, 4, color_index, 2);
|
|
icmp_encode_apply_swap(alpha_qendpoint, 1, alpha_index, 2);
|
|
|
|
for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0;
|
|
CGU_INT pos = 0;
|
|
|
|
// mode 5
|
|
icmp_Write32Bit(cmp_out, &pos, 6, 1 << 5);
|
|
|
|
// rotated channel
|
|
icmp_Write32Bit(cmp_out, &pos, 2, (rotated_channel + 1) & 3);
|
|
|
|
// endpoints
|
|
for (CGU_INT p = 0; p < 3; p++)
|
|
{
|
|
icmp_Write32Bit(cmp_out, &pos, 7, qep[0 + p]);
|
|
icmp_Write32Bit(cmp_out, &pos, 7, qep[4 + p]);
|
|
}
|
|
|
|
// alpha endpoints
|
|
icmp_Write32Bit(cmp_out, &pos, 8, alpha_qendpoint[0]);
|
|
icmp_Write32Bit(cmp_out, &pos, 8, alpha_qendpoint[1]);
|
|
|
|
// quantized values
|
|
icmp_encode_index(cmp_out, &pos, color_index, 2, 0);
|
|
icmp_encode_index(cmp_out, &pos, alpha_index, 2, 0);
|
|
|
|
}
|
|
|
|
void icmp_encode_mode6(CGV_CMPOUTPACKED cmp_out[5], CGV_EPOCODE qep[8], CGV_INDEXPACKED color_index[2])
|
|
{
|
|
icmp_encode_apply_swap(qep, 4, color_index, 4);
|
|
|
|
for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0;
|
|
CGU_INT pos = 0;
|
|
|
|
// mode 6
|
|
icmp_Write32Bit(cmp_out, &pos, 7, 64);
|
|
|
|
// endpoints
|
|
for (CGU_INT p = 0; p < 4; p++)
|
|
{
|
|
icmp_Write32Bit(cmp_out, &pos, 7, qep[0 + p] >> 1);
|
|
icmp_Write32Bit(cmp_out, &pos, 7, qep[4 + p] >> 1);
|
|
}
|
|
|
|
// p bits
|
|
icmp_Write32Bit(cmp_out, &pos, 1, qep[0] & 1);
|
|
icmp_Write32Bit(cmp_out, &pos, 1, qep[4] & 1);
|
|
|
|
// quantized values
|
|
icmp_encode_index(cmp_out, &pos, color_index, 4, 0);
|
|
}
|
|
|
|
///////////////////////////
|
|
// PCA helpers
|
|
|
|
INLINE void icmp_compute_stats_masked(CGV_IMAGE stats[15], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_CHANNEL channels)
|
|
{
|
|
for (CGU_INT i = 0; i < 15; i++) stats[i] = 0;
|
|
|
|
CGV_MASK mask_shifted = mask << 1;
|
|
for (CGU_INT k = 0; k < 16; k++)
|
|
{
|
|
mask_shifted >>= 1;
|
|
//if ((mask_shifted&1) == 0) continue;
|
|
CGV_MASK flag = (mask_shifted & 1);
|
|
|
|
CGV_IMAGE rgba[4];
|
|
for (CGU_CHANNEL ch = 0; ch < channels; ch++) rgba[ch] = image_src[k + ch * 16];
|
|
|
|
for (CGU_CHANNEL ch = 0; ch < channels; ch++) rgba[ch] *= flag;
|
|
stats[14] += flag;
|
|
|
|
stats[10] += rgba[0];
|
|
stats[11] += rgba[1];
|
|
stats[12] += rgba[2];
|
|
|
|
stats[0] += rgba[0] * rgba[0];
|
|
stats[1] += rgba[0] * rgba[1];
|
|
stats[2] += rgba[0] * rgba[2];
|
|
|
|
stats[4] += rgba[1] * rgba[1];
|
|
stats[5] += rgba[1] * rgba[2];
|
|
|
|
stats[7] += rgba[2] * rgba[2];
|
|
|
|
if (channels == 4)
|
|
{
|
|
stats[13] += rgba[3];
|
|
|
|
stats[3] += rgba[0] * rgba[3];
|
|
stats[6] += rgba[1] * rgba[3];
|
|
stats[8] += rgba[2] * rgba[3];
|
|
stats[9] += rgba[3] * rgba[3];
|
|
}
|
|
}
|
|
}
|
|
|
|
INLINE void icmp_covar_from_stats(CGV_IMAGE covar[10], CGV_IMAGE stats[15], CGU_CHANNEL channels3or4)
|
|
{
|
|
covar[0] = stats[0] - stats[10 + 0] * stats[10 + 0] / stats[14];
|
|
covar[1] = stats[1] - stats[10 + 0] * stats[10 + 1] / stats[14];
|
|
covar[2] = stats[2] - stats[10 + 0] * stats[10 + 2] / stats[14];
|
|
|
|
covar[4] = stats[4] - stats[10 + 1] * stats[10 + 1] / stats[14];
|
|
covar[5] = stats[5] - stats[10 + 1] * stats[10 + 2] / stats[14];
|
|
|
|
covar[7] = stats[7] - stats[10 + 2] * stats[10 + 2] / stats[14];
|
|
|
|
if (channels3or4 == 4)
|
|
{
|
|
covar[3] = stats[3] - stats[10 + 0] * stats[10 + 3] / stats[14];
|
|
covar[6] = stats[6] - stats[10 + 1] * stats[10 + 3] / stats[14];
|
|
covar[8] = stats[8] - stats[10 + 2] * stats[10 + 3] / stats[14];
|
|
covar[9] = stats[9] - stats[10 + 3] * stats[10 + 3] / stats[14];
|
|
}
|
|
}
|
|
|
|
INLINE void icmp_compute_covar_dc_masked(CGV_IMAGE covar[6], CGV_IMAGE dc[3], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_INT channels3or4)
|
|
{
|
|
CGV_IMAGE stats[15];
|
|
icmp_compute_stats_masked(stats, image_src, mask, channels3or4);
|
|
|
|
icmp_covar_from_stats(covar, stats, channels3or4);
|
|
for (CGU_INT ch = 0; ch < channels3or4; ch++) dc[ch] = stats[10 + ch] / stats[14];
|
|
}
|
|
|
|
INLINE void icmp_ssymv3(CGV_IMAGE a[4], CGV_IMAGE covar[10], CGV_IMAGE b[4])
|
|
{
|
|
a[0] = covar[0] * b[0] + covar[1] * b[1] + covar[2] * b[2];
|
|
a[1] = covar[1] * b[0] + covar[4] * b[1] + covar[5] * b[2];
|
|
a[2] = covar[2] * b[0] + covar[5] * b[1] + covar[7] * b[2];
|
|
}
|
|
|
|
INLINE void icmp_ssymv4_2(CGV_IMAGE a[4], CGV_IMAGE covar[10], CGV_IMAGE b[4])
|
|
{
|
|
a[0] = covar[0] * b[0] + covar[1] * b[1] + covar[2] * b[2] + covar[3] * b[3];
|
|
a[1] = covar[1] * b[0] + covar[4] * b[1] + covar[5] * b[2] + covar[6] * b[3];
|
|
a[2] = covar[2] * b[0] + covar[5] * b[1] + covar[7] * b[2] + covar[8] * b[3];
|
|
a[3] = covar[3] * b[0] + covar[6] * b[1] + covar[8] * b[2] + covar[9] * b[3];
|
|
}
|
|
|
|
#ifndef ASPM
|
|
// Computes inverse square root over an implementation-defined range. The maximum error is implementation-defined.
|
|
CGV_IMAGE Image_rsqrt(CGV_IMAGE f)
|
|
{
|
|
CGV_IMAGE sf = sqrt(f);
|
|
if (sf != 0)
|
|
return 1 / sqrt(f);
|
|
else
|
|
return 0.0f;
|
|
}
|
|
#endif
|
|
|
|
INLINE void icmp_compute_axis(CGV_IMAGE axis[4],
|
|
CGV_IMAGE covar[10],
|
|
#ifdef ASPM_GPU
|
|
CGV_ITTERATIONS powerIterations,
|
|
#else
|
|
uniform __constant CGV_ITTERATIONS powerIterations,
|
|
#endif
|
|
CGU_CHANNEL channels)
|
|
{
|
|
CGV_IMAGE vec[4] = { 1,1,1,1 };
|
|
|
|
for (CGU_INT i = 0; i < powerIterations; i++)
|
|
{
|
|
if (channels == 3) icmp_ssymv3(axis, covar, vec);
|
|
if (channels == 4) icmp_ssymv4_2(axis, covar, vec);
|
|
|
|
for (CGU_CHANNEL ch = 0; ch < channels; ch++) vec[ch] = axis[ch];
|
|
|
|
if (i % 2 == 1) // renormalize every other iteration
|
|
{
|
|
CGV_IMAGE norm_sq = 0;
|
|
for (CGU_CHANNEL ch = 0; ch < channels; ch++)
|
|
norm_sq += axis[ch] * axis[ch];
|
|
|
|
#ifndef ASPM
|
|
CGV_IMAGE rnorm = Image_rsqrt(norm_sq);
|
|
#else
|
|
CGV_IMAGE rnorm = rsqrt(norm_sq);
|
|
#endif
|
|
for (CGU_CHANNEL ch = 0; ch < channels; ch++) vec[ch] *= rnorm;
|
|
}
|
|
}
|
|
|
|
for (CGU_CHANNEL ch = 0; ch < channels; ch++) axis[ch] = vec[ch];
|
|
}
|
|
|
|
void icmp_block_pca_axis(CGV_IMAGE axis[4], CGV_IMAGE dc[4], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_INT channels3or4)
|
|
{
|
|
uniform __constant CGV_ITTERATIONS powerIterations = 8; // 4 not enough for HQ
|
|
|
|
CGV_IMAGE covar[10];
|
|
icmp_compute_covar_dc_masked(covar, dc, image_src, mask, channels3or4);
|
|
|
|
CGV_IMAGE inv_var = 1.0 / (256 * 256);
|
|
for (CGU_INT k = 0; k < 10; k++)
|
|
{
|
|
covar[k] *= inv_var;
|
|
}
|
|
|
|
CGV_IMAGE eps = sq_image(0.001F);
|
|
covar[0] += eps;
|
|
covar[4] += eps;
|
|
covar[7] += eps;
|
|
covar[9] += eps;
|
|
|
|
icmp_compute_axis(axis, covar, powerIterations, channels3or4);
|
|
}
|
|
|
|
CGV_IMAGE minImage(CGV_IMAGE a, CGV_IMAGE b) { return a < b ? a : b; }
|
|
CGV_IMAGE maxImage(CGV_IMAGE a, CGV_IMAGE b) { return a > b ? a : b; }
|
|
|
|
|
|
void icmp_block_segment_core(CGV_IMAGE epo_code[], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_INT channels3or4)
|
|
{
|
|
CGV_IMAGE axis[4];
|
|
CGV_IMAGE dc[4];
|
|
icmp_block_pca_axis(axis, dc, image_src, mask, channels3or4);
|
|
|
|
CGV_IMAGE ext[2];
|
|
ext[0] = +1e32;
|
|
ext[1] = -1e32;
|
|
|
|
// find min/max
|
|
CGV_MASK mask_shifted = mask << 1;
|
|
for (CGU_INT k = 0; k < 16; k++)
|
|
{
|
|
mask_shifted >>= 1;
|
|
if ((mask_shifted & 1) == 0) continue;
|
|
|
|
CGV_IMAGE dot = 0;
|
|
for (CGU_INT ch = 0; ch < channels3or4; ch++)
|
|
dot += axis[ch] * (image_src[16 * ch + k] - dc[ch]);
|
|
|
|
ext[0] = minImage(ext[0], dot);
|
|
ext[1] = maxImage(ext[1], dot);
|
|
}
|
|
|
|
// create some distance if the endpoints collapse
|
|
if (ext[1] - ext[0] < 1.0f)
|
|
{
|
|
ext[0] -= 0.5f;
|
|
ext[1] += 0.5f;
|
|
}
|
|
|
|
for (CGU_INT i = 0; i < 2; i++)
|
|
for (CGU_INT ch = 0; ch < channels3or4; ch++)
|
|
{
|
|
epo_code[4 * i + ch] = ext[i] * axis[ch] + dc[ch];
|
|
}
|
|
}
|
|
|
|
INLINE CGV_IMAGE clampf(CGV_IMAGE v, CGV_IMAGE a, CGV_IMAGE b)
|
|
{
|
|
if (v < a)
|
|
return a;
|
|
else
|
|
if (v > b)
|
|
return b;
|
|
return v;
|
|
}
|
|
|
|
|
|
void icmp_get_block_endpoints(CGV_IMAGE block_endpoints[], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_CHANNEL channels3or4)
|
|
{
|
|
icmp_block_segment_core(block_endpoints, image_src, mask, channels3or4);
|
|
|
|
for (CGU_INT i = 0; i < 2; i++)
|
|
for (CGU_INT ch = 0; ch < channels3or4; ch++)
|
|
{
|
|
block_endpoints[4 * i + ch] = clampf(block_endpoints[4 * i + ch], 0.0f, 255.0f);
|
|
}
|
|
}
|
|
|
|
void icmp_ep_quant0367_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT blockMode, CGU_INT channels)
|
|
{
|
|
CGU_INT bits = 7;
|
|
if (blockMode == 0) bits = 4;
|
|
if (blockMode == 7) bits = 5;
|
|
|
|
CGU_INT levels = 1 << bits;
|
|
CGU_INT levels2 = levels * 2 - 1;
|
|
|
|
for (CGU_INT i = 0; i < 2; i++)
|
|
{
|
|
CGV_EPOCODE qep_b[8];
|
|
|
|
for (CGU_INT b = 0; b < 2; b++)
|
|
for (CGU_INT p = 0; p < 4; p++)
|
|
{
|
|
CGV_EPOCODE v = (CGV_TYPEINT)((ep[i * 4 + p] / 255.0f*levels2 - b) / 2.0f + 0.5f) * 2 + b;
|
|
qep_b[b * 4 + p] = clampEPO(v, b, levels2 - 1 + b);
|
|
}
|
|
|
|
CGV_IMAGE ep_b[8];
|
|
for (CGU_INT j = 0; j < 8; j++)
|
|
ep_b[j] = qep_b[j];
|
|
|
|
if (blockMode == 0)
|
|
for (CGU_INT j = 0; j < 8; j++)
|
|
ep_b[j] = expandEPObits(qep_b[j], 5);
|
|
|
|
CGV_ERROR err0 = 0.0f;
|
|
CGV_ERROR err1 = 0.0f;
|
|
for (CGU_INT ch = 0; ch < channels; ch++)
|
|
{
|
|
err0 += sq_image(ep[i * 4 + ch] - ep_b[0 + ch]);
|
|
err1 += sq_image(ep[i * 4 + ch] - ep_b[4 + ch]);
|
|
}
|
|
|
|
for (CGU_INT p = 0; p < 4; p++)
|
|
qep[i * 4 + p] = (err0 < err1) ? qep_b[0 + p] : qep_b[4 + p];
|
|
}
|
|
}
|
|
|
|
void icmp_ep_quant245_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT mode)
|
|
{
|
|
CGU_INT bits = 5;
|
|
if (mode == 5) bits = 7;
|
|
CGU_INT levels = 1 << bits;
|
|
|
|
for (CGU_INT i = 0; i < 8; i++)
|
|
{
|
|
CGV_EPOCODE v = ((CGV_TYPEINT)(ep[i] / 255.0f*(levels - 1) + 0.5));
|
|
qep[i] = clampEPO(v, 0, levels - 1);
|
|
}
|
|
}
|
|
|
|
void icmp_ep_quant1_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT mode)
|
|
{
|
|
CGV_EPOCODE qep_b[16];
|
|
|
|
for (CGU_INT b = 0; b < 2; b++)
|
|
for (CGU_INT i = 0; i < 8; i++)
|
|
{
|
|
CGV_EPOCODE v = ((CGV_TYPEINT)((ep[i] / 255.0f*127.0f - b) / 2 + 0.5)) * 2 + b;
|
|
qep_b[b * 8 + i] = clampEPO(v, b, 126 + b);
|
|
}
|
|
|
|
// dequant
|
|
CGV_IMAGE ep_b[16];
|
|
for (CGU_INT k = 0; k < 16; k++)
|
|
ep_b[k] = expandEPObits(qep_b[k], 7);
|
|
|
|
CGV_ERROR err0 = 0.0f;
|
|
CGV_ERROR err1 = 0.0f;
|
|
for (CGU_INT j = 0; j < 2; j++)
|
|
for (CGU_INT p = 0; p < 3; p++)
|
|
{
|
|
err0 += sq_image(ep[j * 4 + p] - ep_b[0 + j * 4 + p]);
|
|
err1 += sq_image(ep[j * 4 + p] - ep_b[8 + j * 4 + p]);
|
|
}
|
|
|
|
for (CGU_INT i = 0; i < 8; i++)
|
|
qep[i] = (err0 < err1) ? qep_b[0 + i] : qep_b[8 + i];
|
|
|
|
}
|
|
|
|
void icmp_ep_quant2_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT blockMode, CGU_INT channels3or4)
|
|
{
|
|
//assert(mode <= 7);
|
|
CMP_STATIC uniform __constant CGV_SUBSETS SubSetTable[] = { 3,2,3,2,1,1,1,2 };
|
|
#ifndef ASPM_GPU
|
|
uniform CMP_CONSTANT
|
|
#endif
|
|
CGV_SUBSETS maxSubSets = SubSetTable[blockMode];
|
|
|
|
if (blockMode == 0 || blockMode == 3 || blockMode == 6 || blockMode == 7)
|
|
{
|
|
for (CGU_INT i = 0; i < maxSubSets; i++)
|
|
icmp_ep_quant0367_2(&qep[i * 8], &ep[i * 8], blockMode, channels3or4);
|
|
}
|
|
else
|
|
if (blockMode == 1)
|
|
{
|
|
for (CGU_INT i = 0; i < maxSubSets; i++)
|
|
icmp_ep_quant1_2(&qep[i * 8], &ep[i * 8], blockMode);
|
|
}
|
|
else
|
|
if (blockMode == 2 || blockMode == 4 || blockMode == 5)
|
|
{
|
|
for (CGU_INT i = 0; i < maxSubSets; i++)
|
|
icmp_ep_quant245_2(&qep[i * 8], &ep[i * 8], blockMode);
|
|
}
|
|
// else
|
|
// assert(false);
|
|
|
|
}
|
|
|
|
void icmp_ep_dequant2(CGV_IMAGE ep[], CGV_EPOCODE qep[], CGU_INT blockMode)
|
|
{
|
|
//assert(mode <= 7);
|
|
CMP_STATIC uniform __constant CGV_SUBSETS subSetTable[] = { 3,2,3,2,1,1,1,2 };
|
|
#ifndef ASPM_GPU
|
|
uniform CMP_CONSTANT
|
|
#endif
|
|
CGV_SUBSETS maxSubSets = subSetTable[blockMode];
|
|
|
|
// mode 3, 6 are 8-bit
|
|
if (blockMode == 3 || blockMode == 6)
|
|
{
|
|
for (CGU_INT i = 0; i < 8 * maxSubSets; i++)
|
|
ep[i] = qep[i];
|
|
}
|
|
else
|
|
if (blockMode == 1 || blockMode == 5)
|
|
{
|
|
for (CGU_INT i = 0; i < 8 * maxSubSets; i++)
|
|
ep[i] = expandEPObits(qep[i], 7);
|
|
}
|
|
else
|
|
if (blockMode == 0 || blockMode == 2 || blockMode == 4)
|
|
{
|
|
for (CGU_INT i = 0; i < 8 * maxSubSets; i++)
|
|
ep[i] = expandEPObits(qep[i], 5);
|
|
}
|
|
else
|
|
if (blockMode == 7)
|
|
{
|
|
for (CGU_INT i = 0; i < 8 * maxSubSets; i++)
|
|
ep[i] = expandEPObits(qep[i], 6);
|
|
}
|
|
//else
|
|
// assert(false);
|
|
}
|
|
|
|
void icmp_GetQuantizedEpoCode(CGV_EPOCODE epo_code_out[], CGV_IMAGE block_endpoints[], CGU_INT blockMode, CGU_CHANNEL channels3or4)
|
|
{
|
|
icmp_ep_quant2_2(epo_code_out, block_endpoints, blockMode, channels3or4);
|
|
icmp_ep_dequant2(block_endpoints, epo_code_out, blockMode);
|
|
}
|
|
|
|
void icmp_ep_quant_dequant_mode4(CGV_EPOCODE qep[], CGV_IMAGE ep[])
|
|
{
|
|
icmp_ep_quant2_2(qep, ep, 4, 3);
|
|
icmp_ep_dequant2(ep, qep, 4);
|
|
}
|
|
|
|
///////////////////////////
|
|
// pixel quantization
|
|
//========================================
|
|
// Modified Intel Texture Compression Code
|
|
//========================================
|
|
|
|
INLINE uniform __constant CGV_RAMP* uniform icmp_GetRamp(CGU_INT bits)
|
|
{
|
|
//assert(bits>=2 && bits<=4); // invalid bit size
|
|
|
|
CMP_STATIC uniform __constant CGV_RAMP unquant_table_2bits[] = { 0, 21, 43, 64 };
|
|
CMP_STATIC uniform __constant CGV_RAMP unquant_table_3bits[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
|
|
CMP_STATIC uniform __constant CGV_RAMP unquant_table_4bits[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
|
|
|
|
uniform __constant CGV_RAMP* uniform unquant_tables[] = { unquant_table_2bits, unquant_table_3bits, unquant_table_4bits };
|
|
|
|
return unquant_tables[bits - 2];
|
|
}
|
|
|
|
#ifdef USE_VARYING
|
|
INLINE CGV_IMAGE gather_image(varying CGV_IMAGE* uniform ptr, CGV_SHIFT32 idx)
|
|
{
|
|
return ptr[idx]; // (perf warning expected)
|
|
}
|
|
#endif
|
|
|
|
INLINE CGV_RAMP gather_ramp(
|
|
#ifdef ASPM_GPU
|
|
CMP_CONSTANT CGV_RAMP* ptr,
|
|
#else
|
|
CMP_CONSTANT CGV_RAMP* CMP_CONSTANT uniform ptr,
|
|
#endif
|
|
CGV_INDEX idx)
|
|
{
|
|
return ptr[idx]; // (perf warning expected)
|
|
}
|
|
|
|
CGV_ERROR icmp_GetQuantizeIndex(
|
|
CGV_INDEXPACKED index_packed_out[2],
|
|
CGV_INDEX index_out[MAX_SUBSET_SIZE],
|
|
CGV_IMAGE image_src[64],
|
|
CGU_INT bits,
|
|
CGV_IMAGE image_block[],
|
|
CGV_SHIFT32 pattern,
|
|
CGU_CHANNEL channels3or4)
|
|
{
|
|
CGV_ERROR total_err = 0;
|
|
uniform __constant CGV_RAMP* uniform Ramp = icmp_GetRamp(bits);
|
|
CGV_LEVELS levels = 1 << bits;
|
|
|
|
// 64-bit color_qendpoint: 5% overhead in this function
|
|
for (CGU_INT k = 0; k < 2; k++) index_packed_out[k] = 0;
|
|
|
|
CGV_SHIFT32 pattern_shifted = pattern;
|
|
for (CGU_INT k = 0; k < 16; k++)
|
|
{
|
|
CGV_SHIFT32 j = pattern_shifted & 3;
|
|
pattern_shifted >>= 2;
|
|
|
|
CGV_IMAGE proj = 0;
|
|
CGV_IMAGE div = 0;
|
|
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++)
|
|
{
|
|
#ifdef USE_VARYING
|
|
CGV_IMAGE ep_a = gather_image(image_block, 8 * j + 0 + ch);
|
|
CGV_IMAGE ep_b = gather_image(image_block, 8 * j + 4 + ch);
|
|
#else
|
|
CGV_IMAGE ep_a = image_block[8 * j + 0 + ch];
|
|
CGV_IMAGE ep_b = image_block[8 * j + 4 + ch];
|
|
#endif
|
|
proj += (image_src[k + ch * 16] - ep_a)*(ep_b - ep_a);
|
|
div += sq_image(ep_b - ep_a);
|
|
}
|
|
|
|
proj /= div;
|
|
|
|
CGV_INDEX index_q1 = (CGV_INDEX)(proj*levels + 0.5);
|
|
index_q1 = clampIndex(index_q1, 1, levels - 1);
|
|
|
|
CGV_ERROR err0 = 0;
|
|
CGV_ERROR err1 = 0;
|
|
CGV_RAMP ramp0 = gather_ramp(Ramp, index_q1 - 1);
|
|
CGV_RAMP ramp1 = gather_ramp(Ramp, index_q1);
|
|
|
|
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++)
|
|
{
|
|
#ifdef USE_VARYING
|
|
CGV_IMAGE ep_a = gather_image(image_block, 8 * j + 0 + ch);
|
|
CGV_IMAGE ep_b = gather_image(image_block, 8 * j + 4 + ch);
|
|
#else
|
|
CGV_IMAGE ep_a = image_block[8 * j + 0 + ch];
|
|
CGV_IMAGE ep_b = image_block[8 * j + 4 + ch];
|
|
#endif
|
|
CGV_IMAGE dec_v0 = (CGV_TYPEINT)(((64 - ramp0)*ep_a + ramp0 * ep_b + 32) / 64);
|
|
CGV_IMAGE dec_v1 = (CGV_TYPEINT)(((64 - ramp1)*ep_a + ramp1 * ep_b + 32) / 64);
|
|
err0 += sq_image(dec_v0 - image_src[k + ch * 16]);
|
|
err1 += sq_image(dec_v1 - image_src[k + ch * 16]);
|
|
}
|
|
|
|
CGV_ERROR best_err = err1;
|
|
CGV_INDEX best_index = index_q1;
|
|
if (err0 < err1)
|
|
{
|
|
best_err = err0;
|
|
best_index = index_q1 - 1;
|
|
}
|
|
|
|
index_out[k] = best_index;
|
|
index_packed_out[k / 8] += ((CGV_INDEXPACKED)best_index) << 4 * (k % 8);
|
|
total_err += best_err;
|
|
}
|
|
|
|
return total_err;
|
|
}
|
|
|
|
///////////////////////////
|
|
// LS endpoint refinement
|
|
|
|
void icmp_opt_endpoints(CGV_IMAGE ep[], CGV_IMAGE image_src[64], CGU_INT bits, CGV_INDEXPACKED color_qendpoint[2], CGV_MASK mask, CGU_CHANNEL channels3or4)
|
|
{
|
|
CGU_INT levels = 1 << bits;
|
|
|
|
CGV_IMAGE Atb1[4] = { 0,0,0,0 };
|
|
CGV_IMAGE sum_q = 0;
|
|
CGV_IMAGE sum_qq = 0;
|
|
CGV_IMAGE sum[5] = { 0,0,0,0,0 };
|
|
|
|
CGV_MASK mask_shifted = mask << 1;
|
|
for (CGU_INT k1 = 0; k1 < 2; k1++)
|
|
{
|
|
CGV_INDEXPACKED qbits_shifted = color_qendpoint[k1];
|
|
for (CGU_INT k2 = 0; k2 < 8; k2++)
|
|
{
|
|
CGU_INT k = k1 * 8 + k2;
|
|
CGV_IMAGE q = (CGV_TYPEINT)(qbits_shifted & 15);
|
|
|
|
qbits_shifted >>= 4;
|
|
|
|
mask_shifted >>= 1;
|
|
if ((mask_shifted & 1) == 0) continue;
|
|
|
|
CGV_LEVELS x = (levels - 1) - q;
|
|
CGV_LEVELS y = q;
|
|
|
|
sum_q += q;
|
|
sum_qq += q * q;
|
|
|
|
sum[4] += 1;
|
|
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) sum[ch] += image_src[k + ch * 16];
|
|
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) Atb1[ch] += x * image_src[k + ch * 16];
|
|
}
|
|
}
|
|
|
|
CGV_IMAGE Atb2[4];
|
|
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++)
|
|
{
|
|
//sum[ch] = dc[ch]*16;
|
|
Atb2[ch] = (levels - 1)*sum[ch] - Atb1[ch];
|
|
}
|
|
|
|
CGV_IMAGE Cxx = sum[4] * sq_image(levels - 1) - 2 * (levels - 1)*sum_q + sum_qq;
|
|
CGV_IMAGE Cyy = sum_qq;
|
|
CGV_IMAGE Cxy = (levels - 1)*sum_q - sum_qq;
|
|
CGV_IMAGE scale = (levels - 1) / (Cxx*Cyy - Cxy * Cxy);
|
|
|
|
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++)
|
|
{
|
|
ep[0 + ch] = (Atb1[ch] * Cyy - Atb2[ch] * Cxy)*scale;
|
|
ep[4 + ch] = (Atb2[ch] * Cxx - Atb1[ch] * Cxy)*scale;
|
|
|
|
//ep[0+ch] = clamp(ep[0+ch], 0, 255);
|
|
//ep[4+ch] = clamp(ep[4+ch], 0, 255);
|
|
}
|
|
|
|
if (img_absf(Cxx*Cyy - Cxy * Cxy) < 0.001f)
|
|
{
|
|
// flatten
|
|
for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++)
|
|
{
|
|
ep[0 + ch] = sum[ch] / sum[4];
|
|
ep[4 + ch] = ep[0 + ch];
|
|
}
|
|
}
|
|
}
|
|
|
|
//////////////////////////
|
|
// parameter estimation
|
|
|
|
void icmp_channel_quant_dequant2(CGV_EPOCODE qep[2], CGV_IMAGE ep[2], CGU_INT epbits)
|
|
{
|
|
CGV_LEVELS elevels = (1 << epbits);
|
|
|
|
for (CGU_INT i = 0; i < 2; i++)
|
|
{
|
|
CGV_EPOCODE v = ((CGV_EPOCODE)(ep[i] / 255.0f*(elevels - 1) + 0.5f));
|
|
qep[i] = clampEPO(v, 0, elevels - 1);
|
|
ep[i] = expandEPObits(qep[i], epbits);
|
|
}
|
|
}
|
|
|
|
void icmp_refineEndpoints(CGV_IMAGE ep[2], CGV_IMAGE block[16], CGU_INT bits, CGV_INDEXPACKED color_index[2])
|
|
{
|
|
CGU_INT levels = 1 << bits;
|
|
|
|
CGV_IMAGE Atb1 = 0;
|
|
CGV_IMAGE sum_q = 0;
|
|
CGV_IMAGE sum_qq = 0;
|
|
CGV_IMAGE sum = 0;
|
|
|
|
for (CGU_INT k1 = 0; k1 < 2; k1++)
|
|
{
|
|
CGV_INDEXPACKED qbits_shifted = color_index[k1];
|
|
for (CGU_INT k2 = 0; k2 < 8; k2++)
|
|
{
|
|
CGU_INT k = k1 * 8 + k2;
|
|
CGV_IMAGE q = (CGV_TYPEINT)(qbits_shifted & 15);
|
|
qbits_shifted >>= 4;
|
|
|
|
CGV_TYPEINT x = (levels - 1) - q;
|
|
CGV_TYPEINT y = q;
|
|
|
|
sum_q += q;
|
|
sum_qq += q * q;
|
|
|
|
sum += block[k];
|
|
Atb1 += x * block[k];
|
|
}
|
|
}
|
|
|
|
CGV_IMAGE Atb2 = (levels - 1)*sum - Atb1;
|
|
|
|
CGV_IMAGE Cxx = 16 * sq_image(levels - 1) - 2 * (levels - 1)*sum_q + sum_qq;
|
|
CGV_IMAGE Cyy = sum_qq;
|
|
CGV_IMAGE Cxy = (levels - 1)*sum_q - sum_qq;
|
|
CGV_IMAGE scale = (levels - 1) / (Cxx*Cyy - Cxy * Cxy);
|
|
|
|
ep[0] = (Atb1*Cyy - Atb2 * Cxy)*scale;
|
|
ep[1] = (Atb2*Cxx - Atb1 * Cxy)*scale;
|
|
|
|
ep[0] = clampf(ep[0], 0.0f, 255.0f);
|
|
ep[1] = clampf(ep[1], 0.0f, 255.0f);
|
|
|
|
if (img_absf(Cxx*Cyy - Cxy * Cxy) < 0.001)
|
|
{
|
|
ep[0] = sum / 16;
|
|
ep[1] = ep[0];
|
|
}
|
|
}
|
|
|
|
CGV_ERROR icmp_channelQuantizeIndex(CGV_INDEXPACKED color_index[2], CGV_INDEX index[MAX_SUBSET_SIZE], CGV_IMAGE block[16], CGU_INT bits, CGV_IMAGE ep[])
|
|
{
|
|
uniform __constant CGV_RAMP* uniform Ramp = icmp_GetRamp(bits);
|
|
CGV_LEVELS levels = (1 << bits);
|
|
|
|
color_index[0] = 0;
|
|
color_index[1] = 0;
|
|
|
|
CGV_ERROR total_err = 0;
|
|
|
|
for (CGU_INT k = 0; k < 16; k++)
|
|
{
|
|
CGV_IMAGE proj = (block[k] - ep[0]) / (ep[1] - ep[0] + 0.001f);
|
|
|
|
CGV_INDEX q1 = (CGV_TYPEINT)(proj*levels + 0.5);
|
|
q1 = clampEPO(q1, 1, levels - 1);
|
|
|
|
CGV_ERROR err0 = 0;
|
|
CGV_ERROR err1 = 0;
|
|
CGV_RAMP ramp0 = gather_ramp(Ramp, q1 - 1);
|
|
CGV_RAMP ramp1 = gather_ramp(Ramp, q1);
|
|
|
|
CGV_IMAGE dec_v0 = (CGV_TYPEINT)(((64 - ramp0)*ep[0] + ramp0 * ep[1] + 32) / 64);
|
|
CGV_IMAGE dec_v1 = (CGV_TYPEINT)(((64 - ramp1)*ep[0] + ramp1 * ep[1] + 32) / 64);
|
|
err0 += sq_image(dec_v0 - block[k]);
|
|
err1 += sq_image(dec_v1 - block[k]);
|
|
|
|
CGV_TYPEINT best_err = err1;
|
|
CGV_INDEX best_q = q1;
|
|
if (err0 < err1)
|
|
{
|
|
best_err = err0;
|
|
best_q = q1 - 1;
|
|
}
|
|
|
|
index[k] = best_q;
|
|
color_index[k / 8] += ((CGV_INDEXPACKED)best_q) << 4 * (k % 8);
|
|
total_err += best_err;
|
|
}
|
|
|
|
return total_err;
|
|
}
|
|
|
|
CGV_ERROR icmp_optQuantizeIndex(BC7_EncodeState EncodeState[], CGV_INDEXPACKED color_index[2], CGV_INDEX index[MAX_SUBSET_SIZE], CGV_EPOCODE qep[2], CGV_IMAGE block[16], CGU_INT bits, CGU_INT epbits)
|
|
{
|
|
CGV_IMAGE ep[2] = { 255,0 };
|
|
|
|
for (CGU_INT k = 0; k < 16; k++)
|
|
{
|
|
ep[0] = minImage(ep[0], block[k]);
|
|
ep[1] = maxImage(ep[1], block[k]);
|
|
}
|
|
|
|
icmp_channel_quant_dequant2(qep, ep, epbits);
|
|
CGV_ERROR err = icmp_channelQuantizeIndex(color_index, index, block, bits, ep);
|
|
|
|
// refine
|
|
#ifndef ASPM_GPU
|
|
uniform CMP_CONSTANT
|
|
#endif
|
|
CGV_ITTERATIONS refineIterations = EncodeState->refineIterations;
|
|
for (CGU_INT i = 0; i < refineIterations; i++)
|
|
{
|
|
icmp_refineEndpoints(ep, block, bits, color_index);
|
|
icmp_channel_quant_dequant2(qep, ep, epbits);
|
|
err = icmp_channelQuantizeIndex(color_index, index, block, bits, ep);
|
|
}
|
|
|
|
return err;
|
|
}
|
|
|
|
|
|
INLINE CGV_SHIFT32 icmp_get_pattern2(CGV_PARTID part_id)
|
|
{
|
|
CMP_STATIC uniform __constant CGV_SHIFT32 pattern_table[] = {
|
|
0x50505050u, 0x40404040u, 0x54545454u, 0x54505040u, 0x50404000u, 0x55545450u, 0x55545040u, 0x54504000u,
|
|
0x50400000u, 0x55555450u, 0x55544000u, 0x54400000u, 0x55555440u, 0x55550000u, 0x55555500u, 0x55000000u,
|
|
0x55150100u, 0x00004054u, 0x15010000u, 0x00405054u, 0x00004050u, 0x15050100u, 0x05010000u, 0x40505054u,
|
|
0x00404050u, 0x05010100u, 0x14141414u, 0x05141450u, 0x01155440u, 0x00555500u, 0x15014054u, 0x05414150u,
|
|
0x44444444u, 0x55005500u, 0x11441144u, 0x05055050u, 0x05500550u, 0x11114444u, 0x41144114u, 0x44111144u,
|
|
0x15055054u, 0x01055040u, 0x05041050u, 0x05455150u, 0x14414114u, 0x50050550u, 0x41411414u, 0x00141400u,
|
|
0x00041504u, 0x00105410u, 0x10541000u, 0x04150400u, 0x50410514u, 0x41051450u, 0x05415014u, 0x14054150u,
|
|
0x41050514u, 0x41505014u, 0x40011554u, 0x54150140u, 0x50505500u, 0x00555050u, 0x15151010u, 0x54540404u,
|
|
0xAA685050u, 0x6A5A5040u, 0x5A5A4200u, 0x5450A0A8u, 0xA5A50000u, 0xA0A05050u, 0x5555A0A0u, 0x5A5A5050u,
|
|
0xAA550000u, 0xAA555500u, 0xAAAA5500u, 0x90909090u, 0x94949494u, 0xA4A4A4A4u, 0xA9A59450u, 0x2A0A4250u,
|
|
0xA5945040u, 0x0A425054u, 0xA5A5A500u, 0x55A0A0A0u, 0xA8A85454u, 0x6A6A4040u, 0xA4A45000u, 0x1A1A0500u,
|
|
0x0050A4A4u, 0xAAA59090u, 0x14696914u, 0x69691400u, 0xA08585A0u, 0xAA821414u, 0x50A4A450u, 0x6A5A0200u,
|
|
0xA9A58000u, 0x5090A0A8u, 0xA8A09050u, 0x24242424u, 0x00AA5500u, 0x24924924u, 0x24499224u, 0x50A50A50u,
|
|
0x500AA550u, 0xAAAA4444u, 0x66660000u, 0xA5A0A5A0u, 0x50A050A0u, 0x69286928u, 0x44AAAA44u, 0x66666600u,
|
|
0xAA444444u, 0x54A854A8u, 0x95809580u, 0x96969600u, 0xA85454A8u, 0x80959580u, 0xAA141414u, 0x96960000u,
|
|
0xAAAA1414u, 0xA05050A0u, 0xA0A5A5A0u, 0x96000000u, 0x40804080u, 0xA9A8A9A8u, 0xAAAAAA44u, 0x2A4A5254u
|
|
};
|
|
|
|
return gather_uint32(pattern_table, part_id);
|
|
}
|
|
|
|
CGV_IMAGE icmp_get_pca_bound(CGV_IMAGE covar[10], CGU_CHANNEL channels)
|
|
{
|
|
uniform __constant CGV_TYPEINT powerIterations = 4; // quite approximative, but enough for bounding
|
|
|
|
CGV_IMAGE inv_var = 1.0 / (256 * 256);
|
|
for (CGU_INT k = 0; k < 10; k++)
|
|
{
|
|
covar[k] *= inv_var;
|
|
}
|
|
|
|
CGV_IMAGE eps = sq_image(0.001);
|
|
covar[0] += eps;
|
|
covar[4] += eps;
|
|
covar[7] += eps;
|
|
|
|
CGV_IMAGE axis[4];
|
|
icmp_compute_axis(axis, covar, powerIterations, channels);
|
|
|
|
CGV_IMAGE vec[4];
|
|
if (channels == 3) icmp_ssymv3(vec, covar, axis);
|
|
if (channels == 4) icmp_ssymv4_2(vec, covar, axis);
|
|
|
|
CGV_IMAGE sq_sum = 0.0f;
|
|
for (CGU_INT p = 0; p < channels; p++) sq_sum += sq_image(vec[p]);
|
|
CGV_IMAGE lambda = sqrt(sq_sum);
|
|
|
|
CGV_IMAGE bound = covar[0] + covar[4] + covar[7];
|
|
if (channels == 4) bound += covar[9];
|
|
bound -= lambda;
|
|
bound = maxImage(bound, 0.0f);
|
|
|
|
return bound;
|
|
}
|
|
|
|
CGV_IMAGE icmp_block_pca_bound_split2(CGV_IMAGE image_src[64], CGV_MASK mask, CGV_IMAGE full_stats[15], CGU_CHANNEL channels)
|
|
{
|
|
CGV_IMAGE stats[15];
|
|
icmp_compute_stats_masked(stats, image_src, mask, channels);
|
|
|
|
CGV_IMAGE covar1[10];
|
|
icmp_covar_from_stats(covar1, stats, channels);
|
|
|
|
for (CGU_INT i = 0; i < 15; i++)
|
|
stats[i] = full_stats[i] - stats[i];
|
|
|
|
CGV_IMAGE covar2[10];
|
|
icmp_covar_from_stats(covar2, stats, channels);
|
|
|
|
CGV_IMAGE bound = 0.0f;
|
|
bound += icmp_get_pca_bound(covar1, channels);
|
|
bound += icmp_get_pca_bound(covar2, channels);
|
|
|
|
return sqrt(bound) * 256;
|
|
}
|
|
|
|
|
|
#ifdef USE_VARYING
|
|
INLINE void scatter_partid(varying CGV_PARTID* uniform ptr, CGV_TYPEINT idx, CGV_PARTID value)
|
|
{
|
|
ptr[idx] = value; // (perf warning expected)
|
|
}
|
|
#endif
|
|
|
|
void icmp_sort_partlist(CGV_PARTID list[], CGU_INT length, CGU_INT partial_count)
|
|
{
|
|
for (CGU_INT k = 0; k < partial_count; k++)
|
|
{
|
|
CGV_TYPEINT best_idx = k;
|
|
CGV_PARTID best_value = list[k];
|
|
for (CGU_INT i = k + 1; i < length; i++)
|
|
{
|
|
if (best_value > list[i])
|
|
{
|
|
best_value = list[i];
|
|
best_idx = i;
|
|
}
|
|
}
|
|
|
|
// swap
|
|
#ifdef USE_VARYING
|
|
scatter_partid(list, best_idx, list[k]);
|
|
#else
|
|
list[best_idx] = list[k];
|
|
#endif
|
|
list[k] = best_value;
|
|
}
|
|
}
|
|
|
|
INLINE void copy_epocode(CGV_EPOCODE u[], CGV_EPOCODE v[], CGU_INT n)
|
|
{
|
|
for (CGU_INT i = 0; i < n; i++)
|
|
{
|
|
u[i] = v[i];
|
|
}
|
|
}
|
|
|
|
|
|
INLINE void copy_indexpacked(CGV_INDEXPACKED u[], CGV_INDEXPACKED v[], CGU_INT n)
|
|
{
|
|
for (CGU_INT i = 0; i < n; i++)
|
|
{
|
|
u[i] = v[i];
|
|
}
|
|
}
|
|
|
|
|
|
void icmp_enc_mode4_candidate(
|
|
BC7_EncodeState EncodeState[],
|
|
cmp_mode_parameters best_candidate[],
|
|
CGV_ERROR best_err[],
|
|
CGU_INT rotated_channel,
|
|
CGU_INT idxMode)
|
|
{
|
|
CGU_INT bits = 2;
|
|
CGU_INT abits = 3;
|
|
CGU_INT aepbits = 6;
|
|
|
|
if (idxMode == 1)
|
|
{
|
|
bits = 3;
|
|
abits = 2;
|
|
}
|
|
|
|
CGV_IMAGE src_block[48];
|
|
for (CGU_INT k = 0; k < 16; k++)
|
|
{
|
|
for (CGU_INT p = 0; p < 3; p++)
|
|
src_block[k + p * 16] = EncodeState->image_src[k + p * 16];
|
|
|
|
if (rotated_channel < 3)
|
|
{
|
|
// apply channel rotation
|
|
if (EncodeState->channels == 4) src_block[k + rotated_channel * 16] = EncodeState->image_src[k + 3 * 16];
|
|
if (EncodeState->channels == 3) src_block[k + rotated_channel * 16] = 255;
|
|
}
|
|
}
|
|
|
|
CGV_IMAGE block_endpoints[8];
|
|
CGV_INDEXPACKED color_index[2];
|
|
CGV_INDEX c_index[MAX_SUBSET_SIZE];
|
|
CGV_EPOCODE color_qendpoint[8];
|
|
|
|
icmp_get_block_endpoints(block_endpoints, src_block, -1, 3);
|
|
icmp_ep_quant_dequant_mode4(color_qendpoint, block_endpoints);
|
|
CGV_ERROR err = icmp_GetQuantizeIndex(color_index, c_index, src_block, bits, block_endpoints, 0, 3);
|
|
|
|
// refine
|
|
CGU_INT refineIterations = EncodeState->refineIterations;
|
|
for (CGU_INT i = 0; i < refineIterations; i++)
|
|
{
|
|
icmp_opt_endpoints(block_endpoints, src_block, bits, color_index, -1, 3);
|
|
icmp_ep_quant_dequant_mode4(color_qendpoint, block_endpoints);
|
|
err = icmp_GetQuantizeIndex(color_index, c_index, src_block, bits, block_endpoints, 0, 3);
|
|
}
|
|
|
|
// encoding selected channel
|
|
CGV_EPOCODE alpha_qendpoint[2];
|
|
CGV_INDEXPACKED alpha_index[2];
|
|
CGV_INDEX a_index[MAX_SUBSET_SIZE];
|
|
err += icmp_optQuantizeIndex(EncodeState, alpha_index, a_index, alpha_qendpoint, &EncodeState->image_src[rotated_channel * 16], abits, aepbits);
|
|
|
|
if (err < *best_err)
|
|
{
|
|
copy_epocode(best_candidate->color_qendpoint, color_qendpoint, 8);
|
|
copy_epocode(best_candidate->alpha_qendpoint, alpha_qendpoint, 2);
|
|
copy_indexpacked(best_candidate->best_color_index, color_index, 2);
|
|
copy_indexpacked(best_candidate->best_alpha_index, alpha_index, 2);
|
|
best_candidate->rotated_channel = rotated_channel;
|
|
best_candidate->idxMode = idxMode;
|
|
*best_err = err;
|
|
}
|
|
}
|
|
|
|
void icmp_mode5_candidate(
|
|
BC7_EncodeState EncodeState[],
|
|
cmp_mode_parameters best_candidate[],
|
|
CGV_ERROR best_err[],
|
|
CGU_INT rotated_channel)
|
|
{
|
|
CGU_INT bits = 2;
|
|
CGU_INT abits = 2;
|
|
CGU_INT aepbits = 8;
|
|
|
|
CGV_IMAGE block[48];
|
|
for (CGU_INT k = 0; k < 16; k++)
|
|
{
|
|
for (CGU_INT p = 0; p < 3; p++)
|
|
block[k + p * 16] = EncodeState->image_src[k + p * 16];
|
|
|
|
if (rotated_channel < 3)
|
|
{
|
|
// apply channel rotation
|
|
if (EncodeState->channels == 4) block[k + rotated_channel * 16] = EncodeState->image_src[k + 3 * 16];
|
|
if (EncodeState->channels == 3) block[k + rotated_channel * 16] = 255;
|
|
}
|
|
}
|
|
|
|
CGV_IMAGE block_endpoints[8];
|
|
CGV_EPOCODE color_qendpoint[8];
|
|
CGV_INDEXPACKED color_index[2];
|
|
CGV_INDEX c_index[MAX_SUBSET_SIZE];
|
|
|
|
icmp_get_block_endpoints(block_endpoints, block, -1, 3);
|
|
icmp_GetQuantizedEpoCode(color_qendpoint, block_endpoints, 5, 3);
|
|
CGV_ERROR err = icmp_GetQuantizeIndex(color_index, c_index, block, bits, block_endpoints, 0, 3);
|
|
|
|
// refine
|
|
CGU_INT refineIterations = EncodeState->refineIterations;
|
|
for (CGU_INT i = 0; i < refineIterations; i++)
|
|
{
|
|
icmp_opt_endpoints(block_endpoints, block, bits, color_index, -1, 3);
|
|
icmp_GetQuantizedEpoCode(color_qendpoint, block_endpoints, 5, 3);
|
|
err = icmp_GetQuantizeIndex(color_index, c_index, block, bits, block_endpoints, 0, 3);
|
|
}
|
|
|
|
// encoding selected channel
|
|
CGV_EPOCODE alpha_qendpoint[2];
|
|
CGV_INDEXPACKED alpha_index[2];
|
|
CGV_INDEX a_index[MAX_SUBSET_SIZE];
|
|
err += icmp_optQuantizeIndex(EncodeState, alpha_index, a_index, alpha_qendpoint, &EncodeState->image_src[rotated_channel * 16], abits, aepbits);
|
|
|
|
if (err < *best_err)
|
|
{
|
|
|
|
icmp_swap_epocode(best_candidate->color_qendpoint, color_qendpoint, 8);
|
|
icmp_swap_indexpacked(best_candidate->best_color_index, color_index, 2);
|
|
icmp_swap_epocode(best_candidate->alpha_qendpoint, alpha_qendpoint, 2);
|
|
icmp_swap_indexpacked(best_candidate->best_alpha_index, alpha_index, 2);
|
|
best_candidate->rotated_channel = rotated_channel;
|
|
*best_err = err;
|
|
}
|
|
}
|
|
|
|
|
|
// =============== Mode Compression
|
|
|
|
CGV_ERROR icmp_enc_mode01237_part_fast(
|
|
CGV_EPOCODE qep[24],
|
|
CGV_INDEXPACKED color_index[2],
|
|
CGV_INDEX index[MAX_SUBSET_SIZE],
|
|
CGV_IMAGE image_src[64],
|
|
CGV_PARTID part_id,
|
|
CGU_INT blockMode)
|
|
{
|
|
CGV_SHIFT32 pattern = icmp_get_pattern2(part_id);
|
|
CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3;
|
|
CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3;
|
|
CGU_CHANNEL channels = 3; if (blockMode == 7) channels = 4;
|
|
|
|
CGV_IMAGE block_endpoints[24];
|
|
for (CGU_INT subset = 0; subset < maxSubSets; subset++)
|
|
{
|
|
CGV_MASK partition_mask = icmp_get_partition_mask(part_id, subset);
|
|
icmp_get_block_endpoints(&block_endpoints[subset * 8], image_src, partition_mask, channels);
|
|
}
|
|
|
|
icmp_GetQuantizedEpoCode(qep, block_endpoints, blockMode, channels);
|
|
CGV_ERROR total_err = icmp_GetQuantizeIndex(color_index, index, image_src, bits, block_endpoints, pattern, channels);
|
|
|
|
return total_err;
|
|
}
|
|
|
|
void icmp_enc_mode01237(BC7_EncodeState EncodeState[], CGU_INT blockMode, CGV_PARTID part_list[], CGU_INT part_count)
|
|
{
|
|
if (part_count == 0) return;
|
|
CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3;
|
|
CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3;
|
|
CGU_CHANNEL channels = 3; if (blockMode == 7) channels = 4;
|
|
|
|
CGV_EPOCODE best_qep[24];
|
|
CGV_INDEXPACKED best_endpoint[2];
|
|
CGV_PARTID best_part_id = -1;
|
|
CGV_ERROR best_err = 1e99;
|
|
|
|
for (CGU_INT part = 0; part < part_count; part++)
|
|
{
|
|
CGV_PARTID part_id = part_list[part] & 63;
|
|
if (maxSubSets == 3) part_id += 64;
|
|
|
|
CGV_EPOCODE qep[24];
|
|
CGV_INDEXPACKED color_index[2];
|
|
CGV_INDEX index[MAX_SUBSET_SIZE];
|
|
CGV_ERROR err = icmp_enc_mode01237_part_fast(qep, color_index, index, EncodeState->image_src, part_id, blockMode);
|
|
|
|
if (err < best_err)
|
|
{
|
|
for (CGU_INT subset = 0; subset < 8 * maxSubSets; subset++) best_qep[subset] = qep[subset];
|
|
for (CGU_INT k = 0; k < 2; k++) best_endpoint[k] = color_index[k];
|
|
best_part_id = part_id;
|
|
best_err = err;
|
|
}
|
|
}
|
|
|
|
// refine
|
|
CGU_INT refineIterations = EncodeState->refineIterations;
|
|
for (CGU_INT _i = 0; _i < refineIterations; _i++)
|
|
{
|
|
CGV_IMAGE ep[24];
|
|
for (CGU_INT subset = 0; subset < maxSubSets; subset++)
|
|
{
|
|
CGV_SHIFT32 partition_mask = icmp_get_partition_mask(best_part_id, subset);
|
|
icmp_opt_endpoints(&ep[subset * 8], EncodeState->image_src, bits, best_endpoint, partition_mask, channels);
|
|
}
|
|
|
|
CGV_EPOCODE qep[24];
|
|
CGV_INDEXPACKED color_index[2];
|
|
CGV_INDEX index[MAX_SUBSET_SIZE];
|
|
|
|
icmp_GetQuantizedEpoCode(qep, ep, blockMode, channels);
|
|
|
|
CGV_SHIFT32 pattern = icmp_get_pattern2(best_part_id);
|
|
CGV_ERROR err = icmp_GetQuantizeIndex(color_index, index, EncodeState->image_src, bits, ep, pattern, channels);
|
|
|
|
if (err < best_err)
|
|
{
|
|
for (CGU_INT subset = 0; subset < 8 * maxSubSets; subset++) best_qep[subset] = qep[subset];
|
|
for (CGU_INT k = 0; k < 2; k++) best_endpoint[k] = color_index[k];
|
|
best_err = err;
|
|
}
|
|
}
|
|
|
|
if (blockMode != 7) best_err += EncodeState->opaque_err; // take into account alpha channel
|
|
|
|
if (best_err < EncodeState->best_err)
|
|
{
|
|
EncodeState->best_err = best_err;
|
|
icmp_encode_mode01237(EncodeState->best_cmp_out, best_qep, best_endpoint, best_part_id, blockMode);
|
|
}
|
|
}
|
|
|
|
void icmp_mode5(BC7_EncodeState EncodeState[])
|
|
{
|
|
cmp_mode_parameters best_candidate;
|
|
CGV_ERROR best_err = EncodeState->best_err;
|
|
|
|
#ifdef ASPM_GPU
|
|
cmp_memsetBC7((CGV_BYTE *)&best_candidate, 0, sizeof(cmp_mode_parameters));
|
|
#else
|
|
memset(&best_candidate, 0, sizeof(cmp_mode_parameters));
|
|
#endif
|
|
|
|
for (CGU_CHANNEL ch = 0; ch < EncodeState->channels; ch++)
|
|
{
|
|
icmp_mode5_candidate(EncodeState, &best_candidate, &best_err, ch);
|
|
}
|
|
|
|
if (best_err < EncodeState->best_err)
|
|
{
|
|
EncodeState->best_err = best_err;
|
|
EncodeState->cmp_isout16Bytes = FALSE;
|
|
icmp_Encode_mode5(EncodeState->best_cmp_out, &best_candidate);
|
|
}
|
|
}
|
|
|
|
void icmp_mode6(BC7_EncodeState EncodeState[])
|
|
{
|
|
CGV_IMAGE block_endpoints[8];
|
|
icmp_get_block_endpoints(block_endpoints, EncodeState->image_src, -1, 4);
|
|
|
|
CGV_EPOCODE epo_code[8];
|
|
icmp_GetQuantizedEpoCode(epo_code, block_endpoints, 6, 4);
|
|
|
|
CGV_INDEXPACKED color_index[2];
|
|
CGV_INDEX index[MAX_SUBSET_SIZE];
|
|
CGV_ERROR err = icmp_GetQuantizeIndex(color_index, index, EncodeState->image_src, 4, block_endpoints, 0, 4);
|
|
|
|
// refine
|
|
CGU_INT refineIterations = EncodeState->refineIterations;
|
|
for (CGU_INT i = 0; i < refineIterations; i++)
|
|
{
|
|
icmp_opt_endpoints(block_endpoints, EncodeState->image_src, 4, color_index, -1, 4);
|
|
icmp_GetQuantizedEpoCode(epo_code, block_endpoints, 6, EncodeState->channels);
|
|
err = icmp_GetQuantizeIndex(color_index, index, EncodeState->image_src, 4, block_endpoints, 0, 4);
|
|
}
|
|
|
|
if (err < EncodeState->best_err)
|
|
{
|
|
EncodeState->best_err = err;
|
|
EncodeState->cmp_isout16Bytes = FALSE;
|
|
icmp_encode_mode6(EncodeState->best_cmp_out, epo_code, color_index);
|
|
}
|
|
}
|
|
|
|
void icmp_mode02(BC7_EncodeState EncodeState[])
|
|
{
|
|
CGV_PARTID part_list[64];
|
|
for (CGU_INT part = 0; part < 64; part++)
|
|
part_list[part] = part;
|
|
|
|
if (EncodeState->validModeMask & 0x01)
|
|
icmp_enc_mode01237(EncodeState, 0, part_list, 16);
|
|
if (EncodeState->validModeMask & 0x04)
|
|
icmp_enc_mode01237(EncodeState, 2, part_list, 64); // usually not worth the time
|
|
}
|
|
|
|
void icmp_mode7(BC7_EncodeState EncodeState[])
|
|
{
|
|
CGV_IMAGE full_stats[15];
|
|
icmp_compute_stats_masked(full_stats, EncodeState->image_src, -1, EncodeState->channels);
|
|
|
|
CGV_PARTID part_list[64];
|
|
for (CGU_INT part = 0; part < 64; part++)
|
|
{
|
|
CGV_MASK partition_mask = icmp_get_partition_mask(part + 0, 0);
|
|
CGV_IMAGE bound12 = icmp_block_pca_bound_split2(EncodeState->image_src, partition_mask, full_stats, EncodeState->channels);
|
|
CGV_PARTID bound = (CGV_TYPEINT)(bound12);
|
|
part_list[part] = part + bound * 64;
|
|
}
|
|
|
|
icmp_sort_partlist(part_list, 64, EncodeState->part_count);
|
|
icmp_enc_mode01237(EncodeState, 7, part_list, EncodeState->part_count);
|
|
}
|
|
|
|
void icmp_mode13(BC7_EncodeState EncodeState[])
|
|
{
|
|
CGV_IMAGE full_stats[15];
|
|
icmp_compute_stats_masked(full_stats, EncodeState->image_src, -1, 3);
|
|
|
|
CGV_PARTID part_list[64];
|
|
for (CGU_INT part = 0; part < 64; part++)
|
|
{
|
|
CGV_MASK partition_mask = icmp_get_partition_mask(part + 0, 0);
|
|
CGV_IMAGE bound12 = icmp_block_pca_bound_split2(EncodeState->image_src, partition_mask, full_stats, 3);
|
|
CGV_PARTID bound = (CGV_TYPEINT)(bound12);
|
|
part_list[part] = part + bound * 64;
|
|
}
|
|
|
|
icmp_sort_partlist(part_list, 64, EncodeState->part_count);
|
|
|
|
if (EncodeState->validModeMask & 0x02)
|
|
icmp_enc_mode01237(EncodeState, 1, part_list, EncodeState->part_count);
|
|
if (EncodeState->validModeMask & 0x08)
|
|
icmp_enc_mode01237(EncodeState, 3, part_list, EncodeState->part_count);
|
|
}
|
|
|
|
void icmp_mode4(BC7_EncodeState EncodeState[])
|
|
{
|
|
cmp_mode_parameters best_candidate;
|
|
CGV_ERROR best_err = EncodeState->best_err;
|
|
#ifdef ASPM_GPU
|
|
cmp_memsetBC7((CGV_BYTE *)&best_candidate, 0, sizeof(cmp_mode_parameters));
|
|
#else
|
|
memset(&best_candidate, 0, sizeof(cmp_mode_parameters));
|
|
#endif
|
|
|
|
for (CGU_CHANNEL rotated_channel = 0; rotated_channel < EncodeState->channels; rotated_channel++)
|
|
{
|
|
icmp_enc_mode4_candidate(EncodeState, &best_candidate, &best_err, rotated_channel, 0);
|
|
icmp_enc_mode4_candidate(EncodeState, &best_candidate, &best_err, rotated_channel, 1);
|
|
}
|
|
|
|
// mode 4
|
|
if (best_err < EncodeState->best_err)
|
|
{
|
|
EncodeState->best_err = best_err;
|
|
icmp_encode_mode4(EncodeState->best_cmp_out, &best_candidate);
|
|
}
|
|
}
|
|
|
|
#endif
|
|
//===================================== COMPRESS CODE =============================================
|
|
|
|
bool notValidBlockForMode(
|
|
CGU_UINT32 blockMode,
|
|
CGU_BOOL blockNeedsAlpha,
|
|
CGU_BOOL blockAlphaZeroOne,
|
|
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
|
|
{
|
|
// Do we need to skip alpha processing blocks
|
|
if((blockNeedsAlpha == FALSE) && (blockMode > 3))
|
|
{
|
|
return TRUE;
|
|
}
|
|
|
|
// Optional restriction for colour-only blocks so that they
|
|
// don't use modes that have combined colour+alpha - this
|
|
// avoids the possibility that the encoder might choose an
|
|
// alpha other than 1.0 (due to parity) and cause something to
|
|
// become accidentally slightly transparent (it's possible that
|
|
// when encoding 3-component texture applications will assume that
|
|
// the 4th component can safely be assumed to be 1.0 all the time)
|
|
if ((blockNeedsAlpha == FALSE) &&
|
|
(u_BC7Encode->colourRestrict == TRUE) &&
|
|
((blockMode == 6)||(blockMode == 7))) // COMBINED_ALPHA
|
|
{
|
|
return TRUE;
|
|
}
|
|
|
|
// Optional restriction for blocks with alpha to avoid issues with
|
|
// punch-through or thresholded alpha encoding
|
|
if((blockNeedsAlpha == TRUE) &&
|
|
(u_BC7Encode->alphaRestrict == TRUE) &&
|
|
(blockAlphaZeroOne == TRUE) &&
|
|
((blockMode == 6)||(blockMode == 7))) // COMBINED_ALPHA
|
|
{
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
void BC7_CompressBlock(
|
|
BC7_EncodeState EncodeState[],
|
|
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
|
|
{
|
|
CGU_BOOL blockNeedsAlpha = FALSE;
|
|
CGU_BOOL blockAlphaZeroOne = FALSE;
|
|
|
|
CGV_ERROR alpha_err = 0.0f;
|
|
CGV_IMAGE alpha_min = 255.0F;
|
|
|
|
for (CGU_INT k=0; k<SOURCE_BLOCK_SIZE; k++)
|
|
{
|
|
if ( EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] < alpha_min)
|
|
alpha_min = EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE];
|
|
|
|
alpha_err += sq_image( EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE]-255.0F);
|
|
|
|
if (blockAlphaZeroOne == FALSE)
|
|
{
|
|
if(( EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] == 255.0F) ||
|
|
( EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] == 0.0F))
|
|
{
|
|
blockAlphaZeroOne = TRUE;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (alpha_min != 255.0F)
|
|
{
|
|
blockNeedsAlpha = TRUE;
|
|
}
|
|
|
|
EncodeState->best_err = CMP_FLOAT_MAX;
|
|
EncodeState->opaque_err = alpha_err;
|
|
|
|
#ifdef USE_ICMP
|
|
EncodeState->refineIterations = 4;
|
|
EncodeState->fastSkipTreshold = 4;
|
|
EncodeState->channels = 4;
|
|
EncodeState->part_count = 64;
|
|
EncodeState->cmp_isout16Bytes = FALSE;
|
|
#else
|
|
EncodeState->cmp_isout16Bytes = TRUE;
|
|
#endif
|
|
|
|
// We change the order in which we visit the block modes to try to maximize the chance
|
|
// that we manage to early out as quickly as possible.
|
|
// This is a significant performance optimization for the lower quality modes where the
|
|
// exit threshold is higher, and also tends to improve quality (as the generally higher quality
|
|
// modes are now enumerated earlier, so the first encoding that passes the threshold will
|
|
// tend to pass by a greater margin than if we used a dumb ordering, and thus overall error will
|
|
// be improved)
|
|
CGU_INT blockModeOrder[NUM_BLOCK_TYPES] = {4, 6, 1, 3, 0, 2, 7, 5};
|
|
|
|
for (CGU_INT block=0; block < NUM_BLOCK_TYPES; block++)
|
|
{
|
|
CGU_INT blockMode = blockModeOrder[block];
|
|
|
|
if (u_BC7Encode->quality < BC7_qFAST_THRESHOLD)
|
|
{
|
|
if ( notValidBlockForMode(blockMode,blockNeedsAlpha,blockAlphaZeroOne,u_BC7Encode) )
|
|
continue;
|
|
}
|
|
|
|
CGU_INT Mode = 0x0001 << blockMode;
|
|
if (!(u_BC7Encode->validModeMask & Mode))
|
|
continue;
|
|
switch (blockMode)
|
|
{
|
|
// image processing with no alpha
|
|
case 0:
|
|
#ifdef USE_ICMP
|
|
icmp_mode02(EncodeState);
|
|
#else
|
|
Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
|
|
#endif
|
|
break;
|
|
case 1:
|
|
#ifdef USE_ICMP
|
|
icmp_mode13(EncodeState);
|
|
#else
|
|
Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
|
|
#endif
|
|
break;
|
|
case 2:
|
|
#ifdef USE_ICMP
|
|
icmp_mode13(EncodeState);
|
|
#else
|
|
Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
|
|
#endif
|
|
break;
|
|
case 3:
|
|
#ifdef USE_ICMP
|
|
icmp_mode13(EncodeState);
|
|
#else
|
|
Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
|
|
#endif
|
|
break;
|
|
// image processing with alpha
|
|
case 4:
|
|
#ifdef USE_ICMP
|
|
icmp_mode4(EncodeState);
|
|
#else
|
|
Compress_mode45(blockMode, EncodeState, u_BC7Encode);
|
|
#endif
|
|
break;
|
|
case 5:
|
|
#ifdef USE_ICMP
|
|
icmp_mode5(EncodeState);
|
|
#else
|
|
Compress_mode45(blockMode, EncodeState, u_BC7Encode);
|
|
#endif
|
|
break;
|
|
case 6:
|
|
#ifdef USE_ICMP
|
|
icmp_mode6(EncodeState);
|
|
#else
|
|
Compress_mode6( EncodeState, u_BC7Encode);
|
|
#endif
|
|
break;
|
|
case 7:
|
|
#ifdef USE_ICMP
|
|
icmp_mode7(EncodeState);
|
|
#else
|
|
Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
|
|
#endif
|
|
break;
|
|
}
|
|
|
|
// Early out if we found we can compress with error below the quality threshold
|
|
if( EncodeState->best_err <= u_BC7Encode->errorThreshold)
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
//====================================== BC7_ENCODECLASS END =============================================
|
|
|
|
#ifndef ASPM_GPU
|
|
|
|
INLINE void load_block_interleaved_rgba2(CGV_IMAGE image_src[64], uniform texture_surface* uniform src, CGUV_BLOCKWIDTH block_xx, CGU_INT block_yy)
|
|
{
|
|
for (CGU_INT y=0; y<4; y++)
|
|
for (CGU_INT x=0; x<4; x++)
|
|
{
|
|
CGU_UINT32 * uniform src_ptr = (CGV_SHIFT32*)&src->ptr[(block_yy*4+y)*src->stride];
|
|
#ifdef USE_VARYING
|
|
CGV_SHIFT32 rgba = gather_partid(src_ptr, block_xx*4+x);
|
|
image_src[16*0+y*4+x] = (CGV_FLOAT)((rgba>> 0)&255);
|
|
image_src[16*1+y*4+x] = (CGV_FLOAT)((rgba>> 8)&255);
|
|
image_src[16*2+y*4+x] = (CGV_FLOAT)((rgba>>16)&255);
|
|
image_src[16*3+y*4+x] = (CGV_FLOAT)((rgba>>24)&255);
|
|
#else
|
|
CGV_SHIFT32 rgba = src_ptr[block_xx*4+x];
|
|
image_src[16*0+y*4+x] = (CGU_FLOAT)((rgba>> 0)&255);
|
|
image_src[16*1+y*4+x] = (CGU_FLOAT)((rgba>> 8)&255);
|
|
image_src[16*2+y*4+x] = (CGU_FLOAT)((rgba>>16)&255);
|
|
image_src[16*3+y*4+x] = (CGU_FLOAT)((rgba>>24)&255);
|
|
#endif
|
|
}
|
|
}
|
|
|
|
|
|
#if defined(CMP_USE_FOREACH_ASPM) || defined(USE_VARYING)
|
|
INLINE void scatter_uint2(CGU_UINT32 * ptr, CGUV_BLOCKWIDTH idx, CGV_SHIFT32 value)
|
|
{
|
|
ptr[idx] = value; // (perf warning expected)
|
|
}
|
|
#endif
|
|
|
|
INLINE void store_data_uint32(CGU_UINT8 dst[], CGU_INT width, CGUV_BLOCKWIDTH v_xx, CGU_INT yy, CGV_SHIFT32 data[], CGU_INT data_size)
|
|
{
|
|
for (CGU_INT k=0; k<data_size; k++)
|
|
{
|
|
CGU_UINT32 * dst_ptr = (CGV_SHIFT32*)&dst[(yy)*width*data_size];
|
|
#ifdef USE_VARYING
|
|
scatter_uint2(dst_ptr, v_xx*data_size+k, data[k]);
|
|
#else
|
|
dst_ptr[v_xx*data_size+k] = data[k];
|
|
#endif
|
|
}
|
|
}
|
|
|
|
#ifdef USE_VARYING
|
|
INLINE void scatter_uint8(CGU_UINT8* ptr, CGV_SHIFT32 idx, CGV_CMPOUT value)
|
|
{
|
|
ptr[idx] = value; // (perf warning expected)
|
|
}
|
|
#endif
|
|
|
|
INLINE void store_data_uint8(CGU_UINT8 u_dstptr[], CGU_INT src_width, CGUV_BLOCKWIDTH block_x, CGU_INT block_y, CGUV_CMPOUT data[], CGU_INT data_size)
|
|
{
|
|
for (CGU_INT k=0; k<data_size; k++)
|
|
{
|
|
#ifdef USE_VARYING
|
|
CGU_UINT8* dst_blockptr = (CGUV_DSTPTR*)&u_dstptr[(block_y*src_width*4)];
|
|
scatter_uint8(dst_blockptr,k+(block_x*data_size),data[k]);
|
|
#else
|
|
u_dstptr[(block_y*src_width*4)+k+(block_x*data_size)] = data[k];
|
|
#endif
|
|
}
|
|
}
|
|
|
|
INLINE void store_data_uint32(CGU_UINT8 dst[], CGV_SHIFT32 width, CGUV_BLOCKWIDTH v_xx, CGU_INT yy, CGV_SHIFT32 data[], CGU_INT data_size)
|
|
{
|
|
for (CGU_INT k = 0; k < data_size; k++)
|
|
{
|
|
#if defined(CMP_USE_FOREACH_ASPM) || defined(USE_VARYING)
|
|
CGU_UINT32 * dst_ptr = (CGV_SHIFT32*)&dst[(yy)*width*data_size];
|
|
scatter_uint2(dst_ptr, v_xx*data_size + k, data[k]);
|
|
#else
|
|
dst[((yy)*width*data_size) + v_xx * data_size + k] = data[k];
|
|
#endif
|
|
}
|
|
}
|
|
|
|
void CompressBlockBC7_XY(uniform texture_surface u_srcptr[], CGUV_BLOCKWIDTH block_x, CGU_INT block_y, CGU_UINT8 u_dst[], uniform BC7_Encode u_settings[])
|
|
{
|
|
BC7_EncodeState _state;
|
|
varying BC7_EncodeState* uniform state = &_state;
|
|
|
|
copy_BC7_Encode_settings(state, u_settings);
|
|
|
|
load_block_interleaved_rgba2(state->image_src,u_srcptr, block_x, block_y);
|
|
|
|
BC7_CompressBlock(state, u_settings);
|
|
|
|
if (state->cmp_isout16Bytes)
|
|
store_data_uint8(u_dst, u_srcptr->width, block_x, block_y, state->cmp_out, 16);
|
|
else
|
|
store_data_uint32(u_dst, u_srcptr->width, block_x, block_y, state->best_cmp_out, 4);
|
|
|
|
}
|
|
|
|
CMP_EXPORT void CompressBlockBC7_encode( uniform texture_surface src[], CGU_UINT8 dst[], uniform BC7_Encode settings[])
|
|
{
|
|
// bc7_isa(); ASPM_PRINT(("ASPM encode [%d,%d]\n",bc7_isa(),src->width,src->height));
|
|
|
|
for (CGU_INT u_yy = 0; u_yy<src->height/4; u_yy++)
|
|
#ifdef CMP_USE_FOREACH_ASPM
|
|
foreach (v_xx = 0 ... src->width/4)
|
|
{
|
|
#else
|
|
for (CGUV_BLOCKWIDTH v_xx = 0; v_xx<src->width/4; v_xx++)
|
|
{
|
|
#endif
|
|
CompressBlockBC7_XY(src, v_xx, u_yy, dst, settings);
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
#ifndef ASPM_GPU
|
|
#ifndef ASPM
|
|
//======================= DECOMPRESS =========================================
|
|
#ifndef USE_HIGH_PRECISION_INTERPOLATION_BC7
|
|
CGU_UINT16 aWeight2[] = { 0, 21, 43, 64 };
|
|
CGU_UINT16 aWeight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
|
|
CGU_UINT16 aWeight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
|
|
|
|
CGU_UINT8 interpolate(CGU_UINT8 e0, CGU_UINT8 e1, CGU_UINT8 index, CGU_UINT8 indexprecision)
|
|
{
|
|
if (indexprecision == 2)
|
|
return (CGU_UINT8)(((64 - aWeight2[index])*CGU_UINT16(e0) + aWeight2[index] * CGU_UINT16(e1) + 32) >> 6);
|
|
else if (indexprecision == 3)
|
|
return (CGU_UINT8)(((64 - aWeight3[index])*CGU_UINT16(e0) + aWeight3[index] * CGU_UINT16(e1) + 32) >> 6);
|
|
else // indexprecision == 4
|
|
return (CGU_UINT8)(((64 - aWeight4[index])*CGU_UINT16(e0) + aWeight4[index] * CGU_UINT16(e1) + 32) >> 6);
|
|
}
|
|
#endif
|
|
|
|
void GetBC7Ramp(CGU_UINT32 endpoint[][MAX_DIMENSION_BIG],
|
|
CGU_FLOAT ramp[MAX_DIMENSION_BIG][(1<<MAX_INDEX_BITS)],
|
|
CGU_UINT32 clusters[2],
|
|
CGU_UINT32 componentBits[MAX_DIMENSION_BIG])
|
|
{
|
|
CGU_UINT32 ep[2][MAX_DIMENSION_BIG];
|
|
CGU_UINT32 i;
|
|
|
|
// Expand each endpoint component to 8 bits by shifting the MSB to bit 7
|
|
// and then replicating the high bits to the low bits revealed by
|
|
// the shift
|
|
for(i=0; i<MAX_DIMENSION_BIG; i++)
|
|
{
|
|
ep[0][i] = 0;
|
|
ep[1][i] = 0;
|
|
if(componentBits[i])
|
|
{
|
|
ep[0][i] = (CGU_UINT32)(endpoint[0][i] << (8 - componentBits[i]));
|
|
ep[1][i] = (CGU_UINT32)(endpoint[1][i] << (8 - componentBits[i]));
|
|
ep[0][i] += (CGU_UINT32)(ep[0][i] >> componentBits[i]);
|
|
ep[1][i] += (CGU_UINT32)(ep[1][i] >> componentBits[i]);
|
|
|
|
ep[0][i] = min8(255, max8(0, static_cast<CGU_UINT8>(ep[0][i])));
|
|
ep[1][i] = min8(255, max8(0, static_cast<CGU_UINT8>(ep[1][i])));
|
|
}
|
|
}
|
|
|
|
// If this block type has no explicit alpha channel
|
|
// then make sure alpha is 1.0 for all points on the ramp
|
|
if(!componentBits[COMP_ALPHA])
|
|
{
|
|
ep[0][COMP_ALPHA] = ep[1][COMP_ALPHA] = 255;
|
|
}
|
|
|
|
CGU_UINT32 rampIndex = clusters[0];
|
|
|
|
rampIndex = (CGU_UINT32)(log((double)rampIndex) / log(2.0));
|
|
|
|
// Generate colours for the RGB ramp
|
|
for(i=0; i < clusters[0]; i++)
|
|
{
|
|
#ifdef USE_HIGH_PRECISION_INTERPOLATION_BC7
|
|
ramp[COMP_RED][i] = (CGU_FLOAT)floor((ep[0][COMP_RED] * (1.0-rampLerpWeightsBC7[rampIndex][i])) +
|
|
(ep[1][COMP_RED] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
|
|
ramp[COMP_RED][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_RED][i]));
|
|
ramp[COMP_GREEN][i] = (CGU_FLOAT)floor((ep[0][COMP_GREEN] * (1.0-rampLerpWeightsBC7[rampIndex][i])) +
|
|
(ep[1][COMP_GREEN] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
|
|
ramp[COMP_GREEN][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_GREEN][i]));
|
|
ramp[COMP_BLUE][i] = (CGU_FLOAT)floor((ep[0][COMP_BLUE] * (1.0-rampLerpWeightsBC7[rampIndex][i])) +
|
|
(ep[1][COMP_BLUE] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
|
|
ramp[COMP_BLUE][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_BLUE][i]));
|
|
#else
|
|
ramp[COMP_RED][i] = interpolate(ep[0][COMP_RED], ep[1][COMP_RED], i, rampIndex);
|
|
ramp[COMP_GREEN][i] = interpolate(ep[0][COMP_GREEN], ep[1][COMP_GREEN], i, rampIndex);
|
|
ramp[COMP_BLUE][i] = interpolate(ep[0][COMP_BLUE], ep[1][COMP_BLUE], i, rampIndex);
|
|
#endif
|
|
}
|
|
|
|
|
|
rampIndex = clusters[1];
|
|
rampIndex = (CGU_UINT32)(log((CGU_FLOAT)rampIndex) / log(2.0));
|
|
|
|
if(!componentBits[COMP_ALPHA])
|
|
{
|
|
for(i=0; i < clusters[1]; i++)
|
|
{
|
|
ramp[COMP_ALPHA][i] = 255.;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
|
|
// Generate alphas
|
|
for(i=0; i < clusters[1]; i++)
|
|
{
|
|
#ifdef USE_HIGH_PRECISION_INTERPOLATION_BC7
|
|
ramp[COMP_ALPHA][i] = (CGU_FLOAT)floor((ep[0][COMP_ALPHA] * (1.0-rampLerpWeightsBC7[rampIndex][i])) +
|
|
(ep[1][COMP_ALPHA] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
|
|
ramp[COMP_ALPHA][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_ALPHA][i]));
|
|
#else
|
|
ramp[COMP_ALPHA][i] = interpolate(ep[0][COMP_ALPHA], ep[1][COMP_ALPHA], i, rampIndex);
|
|
#endif
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
//
|
|
// Bit reader - reads one bit from a buffer at the current bit offset
|
|
// and increments the offset
|
|
//
|
|
|
|
CGU_UINT32 ReadBit(const CGU_UINT8 base[],CGU_UINT32 &m_bitPosition)
|
|
{
|
|
int byteLocation;
|
|
int remainder;
|
|
CGU_UINT32 bit = 0;
|
|
byteLocation = m_bitPosition/8;
|
|
remainder = m_bitPosition % 8;
|
|
|
|
bit = base[byteLocation];
|
|
bit >>= remainder;
|
|
bit &= 0x1;
|
|
// Increment bit position
|
|
m_bitPosition++;
|
|
return (bit);
|
|
}
|
|
|
|
void DecompressDualIndexBlock(
|
|
CGU_UINT8 out[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG],
|
|
const CGU_UINT8 in[COMPRESSED_BLOCK_SIZE],
|
|
CGU_UINT32 endpoint[2][MAX_DIMENSION_BIG],
|
|
CGU_UINT32 &m_bitPosition,
|
|
CGU_UINT32 m_rotation,
|
|
CGU_UINT32 m_blockMode,
|
|
CGU_UINT32 m_indexSwap,
|
|
CGU_UINT32 m_componentBits[MAX_DIMENSION_BIG])
|
|
{
|
|
CGU_UINT32 i, j, k;
|
|
CGU_FLOAT ramp[MAX_DIMENSION_BIG][1<<MAX_INDEX_BITS];
|
|
CGU_UINT32 blockIndices[2][MAX_SUBSET_SIZE];
|
|
|
|
CGU_UINT32 clusters[2];
|
|
clusters[0] = 1 << bti[m_blockMode].indexBits[0];
|
|
clusters[1] = 1 << bti[m_blockMode].indexBits[1];
|
|
if(m_indexSwap)
|
|
{
|
|
CGU_UINT32 temp = clusters[0];
|
|
clusters[0] = clusters[1];
|
|
clusters[1] = temp;
|
|
}
|
|
|
|
GetBC7Ramp(endpoint,
|
|
ramp,
|
|
clusters,
|
|
m_componentBits);
|
|
|
|
// Extract the indices
|
|
for(i=0;i<2;i++)
|
|
{
|
|
for(j=0;j<MAX_SUBSET_SIZE;j++)
|
|
{
|
|
blockIndices[i][j] = 0;
|
|
// If this is a fixup index then clear the implicit bit
|
|
if(j==0)
|
|
{
|
|
blockIndices[i][j] &= ~(1 << (bti[m_blockMode].indexBits[i]-1));
|
|
for(k=0;k<static_cast <CGU_UINT32>(bti[m_blockMode].indexBits[i] - 1); k++)
|
|
{
|
|
blockIndices[i][j] |= (CGU_UINT32)ReadBit(in,m_bitPosition) << k;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for(k=0;k<bti[m_blockMode].indexBits[i]; k++)
|
|
{
|
|
blockIndices[i][j] |= (CGU_UINT32)ReadBit(in,m_bitPosition) << k;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Generate block colours
|
|
for(i=0;i<MAX_SUBSET_SIZE;i++)
|
|
{
|
|
out[i][COMP_ALPHA] = (CGU_UINT8)ramp[COMP_ALPHA][blockIndices[m_indexSwap^1][i]];
|
|
out[i][COMP_RED] = (CGU_UINT8)ramp[COMP_RED][blockIndices[m_indexSwap][i]];
|
|
out[i][COMP_GREEN] = (CGU_UINT8)ramp[COMP_GREEN][blockIndices[m_indexSwap][i]];
|
|
out[i][COMP_BLUE] = (CGU_UINT8)ramp[COMP_BLUE][blockIndices[m_indexSwap][i]];
|
|
}
|
|
|
|
// Resolve the component rotation
|
|
CGU_INT8 swap;
|
|
for(i=0; i<MAX_SUBSET_SIZE; i++)
|
|
{
|
|
switch(m_rotation)
|
|
{
|
|
case 0:
|
|
// Do nothing
|
|
break;
|
|
case 1:
|
|
// Swap A and R
|
|
swap = out[i][COMP_ALPHA];
|
|
out[i][COMP_ALPHA] = out[i][COMP_RED];
|
|
out[i][COMP_RED] = swap;
|
|
break;
|
|
case 2:
|
|
// Swap A and G
|
|
swap = out[i][COMP_ALPHA];
|
|
out[i][COMP_ALPHA] = out[i][COMP_GREEN];
|
|
out[i][COMP_GREEN] = swap;
|
|
break;
|
|
case 3:
|
|
// Swap A and B
|
|
swap = out[i][COMP_ALPHA];
|
|
out[i][COMP_ALPHA] = out[i][COMP_BLUE];
|
|
out[i][COMP_BLUE] = swap;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void DecompressBC7_internal(CGU_UINT8 out[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG], const CGU_UINT8 in[COMPRESSED_BLOCK_SIZE], const BC7_Encode *u_BC7Encode)
|
|
{
|
|
if (u_BC7Encode) {}
|
|
CGU_UINT32 i, j;
|
|
CGU_UINT32 blockIndices[MAX_SUBSET_SIZE];
|
|
CGU_UINT32 endpoint[MAX_SUBSETS][2][MAX_DIMENSION_BIG];
|
|
|
|
CGU_UINT32 m_blockMode;
|
|
CGU_UINT32 m_partition;
|
|
CGU_UINT32 m_rotation;
|
|
CGU_UINT32 m_indexSwap;
|
|
|
|
CGU_UINT32 m_bitPosition;
|
|
CGU_UINT32 m_componentBits[MAX_DIMENSION_BIG];
|
|
|
|
m_blockMode = 0;
|
|
m_partition = 0;
|
|
m_rotation = 0;
|
|
m_indexSwap = 0;
|
|
|
|
// Position the read pointer at the LSB of the block
|
|
m_bitPosition = 0;
|
|
|
|
while (!ReadBit(in, m_bitPosition) && (m_blockMode < 8))
|
|
{
|
|
m_blockMode++;
|
|
}
|
|
|
|
if (m_blockMode > 7)
|
|
{
|
|
// Something really bad happened...
|
|
return;
|
|
}
|
|
|
|
for (i = 0; i < bti[m_blockMode].rotationBits; i++)
|
|
{
|
|
m_rotation |= ReadBit(in, m_bitPosition) << i;
|
|
}
|
|
for (i = 0; i < bti[m_blockMode].indexModeBits; i++)
|
|
{
|
|
m_indexSwap |= ReadBit(in, m_bitPosition) << i;
|
|
}
|
|
|
|
for (i = 0; i < bti[m_blockMode].partitionBits; i++)
|
|
{
|
|
m_partition |= ReadBit(in, m_bitPosition) << i;
|
|
}
|
|
|
|
|
|
|
|
if (bti[m_blockMode].encodingType == NO_ALPHA)
|
|
{
|
|
m_componentBits[COMP_ALPHA] = 0;
|
|
m_componentBits[COMP_RED] =
|
|
m_componentBits[COMP_GREEN] =
|
|
m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 3;
|
|
}
|
|
else if (bti[m_blockMode].encodingType == COMBINED_ALPHA)
|
|
{
|
|
m_componentBits[COMP_ALPHA] =
|
|
m_componentBits[COMP_RED] =
|
|
m_componentBits[COMP_GREEN] =
|
|
m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 4;
|
|
}
|
|
else if (bti[m_blockMode].encodingType == SEPARATE_ALPHA)
|
|
{
|
|
m_componentBits[COMP_ALPHA] = bti[m_blockMode].scalarBits;
|
|
m_componentBits[COMP_RED] =
|
|
m_componentBits[COMP_GREEN] =
|
|
m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 3;
|
|
}
|
|
|
|
CGU_UINT32 subset, ep, component;
|
|
// Endpoints are stored in the following order RRRR GGGG BBBB (AAAA) (PPPP)
|
|
// i.e. components are packed together
|
|
// Loop over components
|
|
for (component = 0; component < MAX_DIMENSION_BIG; component++)
|
|
{
|
|
// loop over subsets
|
|
for (subset = 0; subset < (int)bti[m_blockMode].subsetCount; subset++)
|
|
{
|
|
// Loop over endpoints
|
|
for (ep = 0; ep < 2; ep++)
|
|
{
|
|
endpoint[subset][ep][component] = 0;
|
|
for (j = 0; j < m_componentBits[component]; j++)
|
|
{
|
|
endpoint[subset][ep][component] |= ReadBit(in, m_bitPosition) << j;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// Now get any parity bits
|
|
if (bti[m_blockMode].pBitType != NO_PBIT)
|
|
{
|
|
for (subset = 0; subset < (int)bti[m_blockMode].subsetCount; subset++)
|
|
{
|
|
CGU_UINT32 pBit[2];
|
|
if (bti[m_blockMode].pBitType == ONE_PBIT)
|
|
{
|
|
pBit[0] = ReadBit(in, m_bitPosition);
|
|
pBit[1] = pBit[0];
|
|
}
|
|
else if (bti[m_blockMode].pBitType == TWO_PBIT)
|
|
{
|
|
pBit[0] = ReadBit(in, m_bitPosition);
|
|
pBit[1] = ReadBit(in, m_bitPosition);
|
|
}
|
|
|
|
for (component = 0; component < MAX_DIMENSION_BIG; component++)
|
|
{
|
|
if (m_componentBits[component])
|
|
{
|
|
endpoint[subset][0][component] <<= 1;
|
|
endpoint[subset][1][component] <<= 1;
|
|
endpoint[subset][0][component] |= pBit[0];
|
|
endpoint[subset][1][component] |= pBit[1];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (bti[m_blockMode].pBitType != NO_PBIT)
|
|
{
|
|
// Now that we've unpacked the parity bits, update the component size information
|
|
// for the ramp generator
|
|
for (j = 0; j < MAX_DIMENSION_BIG; j++)
|
|
{
|
|
if (m_componentBits[j])
|
|
{
|
|
m_componentBits[j] += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
// If this block has two independent sets of indices then put it to that decoder
|
|
if (bti[m_blockMode].encodingType == SEPARATE_ALPHA)
|
|
{
|
|
DecompressDualIndexBlock(out, in, endpoint[0], m_bitPosition, m_rotation, m_blockMode, m_indexSwap, m_componentBits);
|
|
return;
|
|
}
|
|
|
|
CGU_UINT32 fixup[MAX_SUBSETS] = { 0, 0, 0 };
|
|
switch (bti[m_blockMode].subsetCount)
|
|
{
|
|
case 3:
|
|
fixup[1] = BC7_FIXUPINDICES_LOCAL[2][m_partition][1];
|
|
fixup[2] = BC7_FIXUPINDICES_LOCAL[2][m_partition][2];
|
|
break;
|
|
case 2:
|
|
fixup[1] = BC7_FIXUPINDICES_LOCAL[1][m_partition][1];
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
//--------------------------------------------------------------------
|
|
// New Code : Possible replacement for BC7_PARTITIONS for CPU code
|
|
//--------------------------------------------------------------------
|
|
// Extract index bits
|
|
// for (i = 0; i < MAX_SUBSET_SIZE; i++)
|
|
// {
|
|
// CGV_UINT8 p = get_partition_subset(m_partition, bti[m_blockMode].subsetCount - 1, i);
|
|
// //CGU_UINT32 p = partitionTable[i];
|
|
// blockIndices[i] = 0;
|
|
// CGU_UINT32 bitsToRead = bti[m_blockMode].indexBits[0];
|
|
//
|
|
// // If this is a fixup index then set the implicit bit
|
|
// if (i == fixup[p])
|
|
// {
|
|
// blockIndices[i] &= ~(1 << (bitsToRead - 1));
|
|
// bitsToRead--;
|
|
// }
|
|
//
|
|
// for (j = 0; j < bitsToRead; j++)
|
|
// {
|
|
// blockIndices[i] |= ReadBit(in, m_bitPosition) << j;
|
|
// }
|
|
// }
|
|
CGU_BYTE *partitionTable = (CGU_BYTE*)BC7_PARTITIONS[bti[m_blockMode].subsetCount-1][m_partition];
|
|
|
|
// Extract index bits
|
|
for(i=0; i < MAX_SUBSET_SIZE; i++)
|
|
{
|
|
CGU_BYTE p = partitionTable[i];
|
|
blockIndices[i] = 0;
|
|
CGU_BYTE bitsToRead = bti[m_blockMode].indexBits[0];
|
|
|
|
// If this is a fixup index then set the implicit bit
|
|
if(i==fixup[p])
|
|
{
|
|
blockIndices[i] &= ~(1 << (bitsToRead-1));
|
|
bitsToRead--;
|
|
}
|
|
|
|
for(j=0;j<bitsToRead; j++)
|
|
{
|
|
blockIndices[i] |= ReadBit(in,m_bitPosition) << j;
|
|
}
|
|
}
|
|
|
|
// Get the ramps
|
|
CGU_UINT32 clusters[2];
|
|
clusters[0] = clusters[1] = 1 << bti[m_blockMode].indexBits[0];
|
|
|
|
// Colour Ramps
|
|
CGU_FLOAT c[MAX_SUBSETS][MAX_DIMENSION_BIG][1 << MAX_INDEX_BITS];
|
|
|
|
for (i = 0; i < (int)bti[m_blockMode].subsetCount; i++)
|
|
{
|
|
// Unpack the colours
|
|
GetBC7Ramp(endpoint[i],
|
|
c[i],
|
|
clusters,
|
|
m_componentBits);
|
|
}
|
|
|
|
//--------------------------------------------------------------------
|
|
// New Code : Possible replacement for BC7_PARTITIONS for CPU code
|
|
//--------------------------------------------------------------------
|
|
// Generate the block colours.
|
|
// for (i = 0; i < MAX_SUBSET_SIZE; i++)
|
|
// {
|
|
// CGV_UINT8 p = get_partition_subset(m_partition, bti[m_blockMode].subsetCount - 1, i);
|
|
// out[i][0] = c[p][0][blockIndices[i]];
|
|
// out[i][1] = c[p][1][blockIndices[i]];
|
|
// out[i][2] = c[p][2][blockIndices[i]];
|
|
// out[i][3] = c[p][3][blockIndices[i]];
|
|
// }
|
|
|
|
// Generate the block colours.
|
|
for(i=0; i<MAX_SUBSET_SIZE; i++)
|
|
{
|
|
for(j=0; j < MAX_DIMENSION_BIG; j++)
|
|
{
|
|
out[i][j] = (CGU_UINT8)c[partitionTable[i]][j][blockIndices[i]];
|
|
}
|
|
}
|
|
}
|
|
|
|
void CompressBlockBC7_Internal(
|
|
CGU_UINT8 image_src[SOURCE_BLOCK_SIZE][4],
|
|
CMP_GLOBAL CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE],
|
|
uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
|
|
{
|
|
BC7_EncodeState _state = { 0 };
|
|
varying BC7_EncodeState* uniform state = &_state;
|
|
|
|
copy_BC7_Encode_settings(state, u_BC7Encode);
|
|
|
|
CGU_UINT8 offsetR = 0;
|
|
CGU_UINT8 offsetG = 16;
|
|
CGU_UINT8 offsetB = 32;
|
|
CGU_UINT8 offsetA = 48;
|
|
for (CGU_UINT8 i = 0; i < SOURCE_BLOCK_SIZE; i++)
|
|
{
|
|
state->image_src[offsetR++] = (CGV_IMAGE)image_src[i][0];
|
|
state->image_src[offsetG++] = (CGV_IMAGE)image_src[i][1];
|
|
state->image_src[offsetB++] = (CGV_IMAGE)image_src[i][2];
|
|
state->image_src[offsetA++] = (CGV_IMAGE)image_src[i][3];
|
|
}
|
|
|
|
BC7_CompressBlock(state, u_BC7Encode);
|
|
|
|
if (state->cmp_isout16Bytes)
|
|
{
|
|
for (CGU_UINT8 i = 0; i < COMPRESSED_BLOCK_SIZE; i++)
|
|
{
|
|
cmp_out[i] = state->cmp_out[i];
|
|
}
|
|
}
|
|
else
|
|
{
|
|
#ifdef ASPM_GPU
|
|
cmp_memcpy(cmp_out, (CGU_UINT8 *)state->best_cmp_out, 16);
|
|
#else
|
|
memcpy(cmp_out, state->best_cmp_out, 16);
|
|
#endif
|
|
}
|
|
}
|
|
|
|
//======================= CPU USER INTERFACES ====================================
|
|
|
|
int CMP_CDECL CreateOptionsBC7(void **options)
|
|
{
|
|
(*options) = new BC7_Encode;
|
|
if (!options) return CGU_CORE_ERR_NEWMEM;
|
|
init_BC7ramps();
|
|
SetDefaultBC7Options((BC7_Encode *)(*options));
|
|
return CGU_CORE_OK;
|
|
}
|
|
|
|
int CMP_CDECL DestroyOptionsBC7(void *options)
|
|
{
|
|
if (!options) return CGU_CORE_ERR_INVALIDPTR;
|
|
BC7_Encode *BCOptions = reinterpret_cast <BC7_Encode *>(options);
|
|
delete BCOptions;
|
|
return CGU_CORE_OK;
|
|
}
|
|
|
|
int CMP_CDECL SetErrorThresholdBC7(void *options, CGU_FLOAT minThreshold, CGU_FLOAT maxThreshold)
|
|
{
|
|
if (!options) return CGU_CORE_ERR_INVALIDPTR;
|
|
BC7_Encode *BC7optionsDefault = (BC7_Encode *)options;
|
|
|
|
if (minThreshold < 0.0f) minThreshold = 0.0f;
|
|
if (maxThreshold < 0.0f) maxThreshold = 0.0f;
|
|
|
|
BC7optionsDefault->minThreshold = minThreshold;
|
|
BC7optionsDefault->maxThreshold = maxThreshold;
|
|
return CGU_CORE_OK;
|
|
}
|
|
|
|
int CMP_CDECL SetQualityBC7(void *options, CGU_FLOAT fquality)
|
|
{
|
|
if (!options) return CGU_CORE_ERR_INVALIDPTR;
|
|
|
|
BC7_Encode *BC7optionsDefault = (BC7_Encode *)options;
|
|
if (fquality < 0.0f) fquality = 0.0f;
|
|
else
|
|
if (fquality > 1.0f) fquality = 1.0f;
|
|
BC7optionsDefault->quality = fquality;
|
|
|
|
// Set Error Thresholds
|
|
BC7optionsDefault->errorThreshold = BC7optionsDefault->maxThreshold * (1.0f - fquality);
|
|
if(fquality > BC7_qFAST_THRESHOLD)
|
|
BC7optionsDefault->errorThreshold += BC7optionsDefault->minThreshold;
|
|
|
|
return CGU_CORE_OK;
|
|
}
|
|
|
|
int CMP_CDECL SetMaskBC7(void *options, CGU_UINT8 mask)
|
|
{
|
|
if (!options) return CGU_CORE_ERR_INVALIDPTR;
|
|
BC7_Encode *BC7options = (BC7_Encode *)options;
|
|
BC7options->validModeMask = mask;
|
|
return CGU_CORE_OK;
|
|
}
|
|
|
|
int CMP_CDECL SetAlphaOptionsBC7(void *options, CGU_BOOL imageNeedsAlpha, CGU_BOOL colourRestrict, CGU_BOOL alphaRestrict)
|
|
{
|
|
if (!options) return CGU_CORE_ERR_INVALIDPTR;
|
|
BC7_Encode *u_BC7Encode = (BC7_Encode *)options;
|
|
u_BC7Encode->imageNeedsAlpha = imageNeedsAlpha;
|
|
u_BC7Encode->colourRestrict = colourRestrict;
|
|
u_BC7Encode->alphaRestrict = alphaRestrict;
|
|
return CGU_CORE_OK;
|
|
}
|
|
|
|
int CMP_CDECL CompressBlockBC7( const unsigned char *srcBlock,
|
|
unsigned int srcStrideInBytes,
|
|
CMP_GLOBAL unsigned char cmpBlock[16],
|
|
const void* options = NULL)
|
|
{
|
|
CMP_Vec4uc inBlock[SOURCE_BLOCK_SIZE];
|
|
|
|
//----------------------------------
|
|
// Fill the inBlock with source data
|
|
//----------------------------------
|
|
CGU_INT srcpos = 0;
|
|
CGU_INT dstptr = 0;
|
|
for (CGU_UINT8 row = 0; row < 4; row++)
|
|
{
|
|
srcpos = row * srcStrideInBytes;
|
|
for (CGU_UINT8 col = 0; col < 4; col++)
|
|
{
|
|
inBlock[dstptr].x = CGU_UINT8(srcBlock[srcpos++]);
|
|
inBlock[dstptr].y = CGU_UINT8(srcBlock[srcpos++]);
|
|
inBlock[dstptr].z = CGU_UINT8(srcBlock[srcpos++]);
|
|
inBlock[dstptr].w = CGU_UINT8(srcBlock[srcpos++]);
|
|
dstptr++;
|
|
}
|
|
}
|
|
|
|
|
|
BC7_Encode *u_BC7Encode = (BC7_Encode *)options;
|
|
BC7_Encode BC7EncodeDefault = { 0 };
|
|
if (u_BC7Encode == NULL)
|
|
{
|
|
u_BC7Encode = &BC7EncodeDefault;
|
|
SetDefaultBC7Options(u_BC7Encode);
|
|
init_BC7ramps();
|
|
}
|
|
|
|
BC7_EncodeState EncodeState
|
|
#ifndef ASPM
|
|
= { 0 }
|
|
#endif
|
|
;
|
|
EncodeState.best_err = CMP_FLOAT_MAX;
|
|
EncodeState.validModeMask = u_BC7Encode->validModeMask;
|
|
EncodeState.part_count = u_BC7Encode->part_count;
|
|
EncodeState.channels = static_cast<CGU_CHANNEL>(u_BC7Encode->channels);
|
|
|
|
CGU_UINT8 offsetR = 0;
|
|
CGU_UINT8 offsetG = 16;
|
|
CGU_UINT8 offsetB = 32;
|
|
CGU_UINT8 offsetA = 48;
|
|
CGU_UINT32 offsetSRC = 0;
|
|
for (CGU_UINT8 i = 0; i < SOURCE_BLOCK_SIZE; i++)
|
|
{
|
|
EncodeState.image_src[offsetR++] = (CGV_IMAGE)inBlock[offsetSRC].x;
|
|
EncodeState.image_src[offsetG++] = (CGV_IMAGE)inBlock[offsetSRC].y;
|
|
EncodeState.image_src[offsetB++] = (CGV_IMAGE)inBlock[offsetSRC].z;
|
|
EncodeState.image_src[offsetA++] = (CGV_IMAGE)inBlock[offsetSRC].w;
|
|
offsetSRC++;
|
|
}
|
|
|
|
BC7_CompressBlock(&EncodeState, u_BC7Encode);
|
|
|
|
if (EncodeState.cmp_isout16Bytes)
|
|
{
|
|
for (CGU_UINT8 i = 0; i < COMPRESSED_BLOCK_SIZE; i++)
|
|
{
|
|
cmpBlock[i] = EncodeState.cmp_out[i];
|
|
}
|
|
}
|
|
else
|
|
{
|
|
memcpy(cmpBlock, EncodeState.best_cmp_out, 16);
|
|
}
|
|
|
|
return CGU_CORE_OK;
|
|
}
|
|
|
|
int CMP_CDECL DecompressBlockBC7(const unsigned char cmpBlock[16],
|
|
unsigned char srcBlock[64],
|
|
const void *options = NULL) {
|
|
BC7_Encode *u_BC7Encode = (BC7_Encode *)options;
|
|
BC7_Encode BC7EncodeDefault = { 0 }; // for q = 0.05
|
|
if (u_BC7Encode == NULL)
|
|
{
|
|
// set for q = 1.0
|
|
u_BC7Encode = &BC7EncodeDefault;
|
|
SetDefaultBC7Options(u_BC7Encode);
|
|
init_BC7ramps();
|
|
}
|
|
DecompressBC7_internal((CGU_UINT8(*)[4])srcBlock, (CGU_UINT8 *)cmpBlock,u_BC7Encode);
|
|
return CGU_CORE_OK;
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
//============================================== OpenCL USER INTERFACE ====================================================
|
|
#ifdef ASPM_GPU
|
|
CMP_STATIC CMP_KERNEL void CMP_GPUEncoder(uniform CMP_GLOBAL const CGU_Vec4uc ImageSource[],
|
|
CMP_GLOBAL CGV_CMPOUT ImageDestination[],
|
|
uniform CMP_GLOBAL Source_Info SourceInfo[],
|
|
uniform CMP_GLOBAL BC7_Encode BC7Encode[] )
|
|
{
|
|
CGU_INT xID=0;
|
|
CGU_INT yID=0;
|
|
|
|
xID = get_global_id(0); // ToDo: Define a size_t 32 bit and 64 bit basd on clGetDeviceInfo
|
|
yID = get_global_id(1);
|
|
|
|
CGU_INT srcWidth = SourceInfo->m_src_width;
|
|
CGU_INT srcHeight = SourceInfo->m_src_height;
|
|
if (xID >= (srcWidth / BlockX)) return;
|
|
if (yID >= (srcHeight / BlockY)) return;
|
|
|
|
CGU_INT destI = (xID*COMPRESSED_BLOCK_SIZE) + (yID*(srcWidth / BlockX)*COMPRESSED_BLOCK_SIZE);
|
|
CGU_INT srcindex = 4 * (yID * srcWidth + xID);
|
|
CGU_INT blkindex = 0;
|
|
BC7_EncodeState EncodeState;
|
|
varying BC7_EncodeState* uniform state = &EncodeState;
|
|
|
|
copy_BC7_Encode_settings(state, BC7Encode);
|
|
|
|
//Check if it is a complete 4X4 block
|
|
if (((xID + 1)*BlockX <= srcWidth) && ((yID + 1)*BlockY <= srcHeight))
|
|
{
|
|
srcWidth = srcWidth - 4;
|
|
for (CGU_INT j = 0; j < 4; j++) {
|
|
for (CGU_INT i = 0; i < 4; i++) {
|
|
state->image_src[blkindex+0*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].x;
|
|
state->image_src[blkindex+1*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].y;
|
|
state->image_src[blkindex+2*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].z;
|
|
state->image_src[blkindex+3*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].w;
|
|
blkindex++;
|
|
srcindex++;
|
|
}
|
|
|
|
srcindex += srcWidth;
|
|
}
|
|
|
|
copy_BC7_Encode_settings(state, BC7Encode);
|
|
|
|
BC7_CompressBlock(&EncodeState, BC7Encode);
|
|
|
|
for (CGU_INT i=0; i<COMPRESSED_BLOCK_SIZE; i++)
|
|
{
|
|
ImageDestination[destI+i] = state->cmp_out[i];
|
|
}
|
|
|
|
}
|
|
else
|
|
{
|
|
ASPM_PRINT(("[ASPM_GPU] Unable to process, make sure image size is divisible by 4"));
|
|
}
|
|
}
|
|
#endif
|