You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
nvidia-texture-tools/extern/CMP_Core/shaders/BC6_Encode_kernel.h

480 lines
17 KiB
C++

//=====================================================================
// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//=====================================================================
#ifndef BC6_ENCODE_KERNEL_H
#define BC6_ENCODE_KERNEL_H
#include "Common_Def.h"
#define MAX_TRACE 10
#define MAX_ENTRIES_QUANT_TRACE 16
#define BlockX 4
#define BlockY 4
#define BYTEPP 4
#define COMPRESSED_BLOCK_SIZE 16 // Size of a compressed block in bytes
#define MAX_DIMENSION_BIG 4
#define MAX_SUBSET_SIZE 16 // Largest possible size for an individual subset
#define NUM_BLOCK_TYPES 8 // Number of block types in the format
#define MAX_SUBSETS 3 // Maximum number of possible subsets
#define MAX_PARTITIONS 64 // Maximum number of partition types
#define MAX_ENTRIES 64
#define MAX_TRY 20
#define MAX_PARTITIONS_TABLE (1+64+64)
#define DIMENSION 4
#define MAX_CLUSTERS_BIG 16
#define EPSILON 0.000001
#define MAX_CLUSTERS_QUANT_TRACE 8
//# Image Quality will increase as this number gets larger and end-to-end performance time will reduce
#define MAX_INDEX_BITS 4
#define HIGHQULITY_THRESHOLD 0.7F
#define qFAST_THRESHOLD 0.5F
#define F16NEGPREC_LIMIT_VAL -2048.0f //f16 negative precision limit value
#define LOG_CL_RANGE 5
#define LOG_CL_BASE 2
#define BIT_BASE 5
#define BIT_RANGE 9
#define MAX_CLUSTERS 8
#define BTT(bits) (bits-BIT_BASE)
#define CLT(cl) (cl-LOG_CL_BASE)
#define MASK(n) ((1<<(n))-1)
#define SIGN_EXTEND_TYPELESS(x,nb) ((((x)&(1<<((nb)-1)))?((~0)<<(nb)):0)|(x))
#define CMP_HALF_MAX 65504.0f // positive half max
#ifndef ASPM_GPU
#include <bitset>
#include <assert.h>
//typedef uint8_t byte;
#else
//typedef bitset uint8_t;
//typedef uint8 byte;
#endif
#define BC6CompBlockSize 16
#define BC6BlockX 4
#define BC6BlockY 4
typedef struct
{
CGU_INT k;
CGU_FLOAT d;
} BC6H_TRACE;
#define NCHANNELS 3
#define MAX_END_POINTS 2
#define MAX_BC6H_MODES 14
#define MAX_BC6H_PARTITIONS 32
#define MAX_TWOREGION_MODES 10
#define COMPRESSED_BLOCK_SIZE 16 // Size of a compressed block in bytes
#define ONE_REGION_INDEX_OFFSET 65 // bit location to start saving color index values for single region shape
#define TWO_REGION_INDEX_OFFSET 82 // bit location to start saving color index values for two region shapes
#define MIN_MODE_FOR_ONE_REGION 11 // Two regions shapes use modes 1..9 and single use 11..14
#define R_0(ep) (ep)[0][0][i]
#define R_1(ep) (ep)[0][1][i]
#define R_2(ep) (ep)[1][0][i]
#define R_3(ep) (ep)[1][1][i]
#define FLT16_MAX 0x7bff
#ifndef ASPM_GPU
#define USE_SHAKERHD
#endif
#define USE_NEWRAMP
typedef struct
{
CGU_FLOAT A[NCHANNELS];
CGU_FLOAT B[NCHANNELS];
} END_Points;
typedef struct
{
CGU_FLOAT x, y, z;
} BC6H_Vec3f;
typedef struct
{
CGU_INT nbits; // Number of bits
CGU_INT prec[3]; // precission of the Qunatized RGB endpoints
CGU_INT transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed
CGU_INT modebits; // number of mode bits
CGU_INT IndexPrec; // Index Precision
CGU_INT mode; // Mode value to save
CGU_INT lowestPrec; // Step size of each precesion incriment
} ModePartitions;
__constant ModePartitions ModePartition[MAX_BC6H_MODES + 1] =
{
0, 0,0,0, 0, 0, 0, 0, 0, // Mode = Invaild
// Two region Partition
10, 5,5,5, 1, 2, 3, 0x00, 31, // Mode = 1
7, 6,6,6, 1, 2, 3, 0x01, 248, // Mode = 2
11, 5,4,4, 1, 5, 3, 0x02, 15, // Mode = 3
11, 4,5,4, 1, 5, 3, 0x06, 15, // Mode = 4
11, 4,4,5, 1, 5, 3, 0x0a, 15, // Mode = 5
9, 5,5,5, 1, 5, 3, 0x0e, 62, // Mode = 6
8, 6,5,5, 1, 5, 3, 0x12, 124, // Mode = 7
8, 5,6,5, 1, 5, 3, 0x16, 124, // Mode = 8
8, 5,5,6, 1, 5, 3, 0x1a, 124, // Mode = 9
6, 6,6,6, 0, 5, 3, 0x1e, 496, // Mode = 10
// One region Partition
10, 10,10,10, 0, 5, 4, 0x03, 31, // Mode = 11
11, 9,9,9, 1, 5, 4, 0x07, 15, // Mode = 12
12, 8,8,8, 1, 5, 4, 0x0b, 7, // Mode = 13
16, 4,4,4, 1, 5, 4, 0x0f, 1, // Mode = 14
};
//================================================
// Mode Pathern order to try on endpoints
// The order can be rearranged to set which modes gets processed first
// for now it is set in order.
//================================================
__constant CGU_INT8 ModeFitOrder[MAX_BC6H_MODES + 1] =
{
0, //0: N/A
// ---- 2 region lower bits ---
1, // 10 5 5 5
2, // 7 6 6 6
3, // 11 5 4 5
4, // 11 4 5 4
5, // 11 4 4 5
6, // 9 5 5 5
7, // 8 6 5 5
8, // 8 5 6 5
9, // 8 5 5 6
10, // 6 6 6 6
//------ 1 region high bits ---
11, // 10 10 10 10
12, // 11 9 9 9
13, // 12 8 8 8
14 // 16 4 4 4
};
// The Region2FixUps are for our index[subset = 2][16][3] locations
// indexed by shape region 2
__constant CGU_INT g_Region2FixUp[32] =
{
7 , 3 , 11, 7,
3 , 11, 9 , 5,
2 , 12, 7 , 3,
11, 7 , 11, 3,
7 , 1 , 0 , 1,
0 , 1 , 0 , 7,
0 , 1 , 1 , 0,
4 , 4 , 1 , 0,
};
// Indexed by all shape regions
// Partition Set Fixups for region 1 note region 0 is always at 0
// that means normally we use 3 bits to define an index value
// if its at the fix up location then its one bit less
__constant CGU_INT g_indexfixups[32] =
{
15,15,15,15,
15,15,15,15,
15,15,15,15,
15,15,15,15,
15, 2, 8, 2,
2, 8, 8,15,
2, 8, 2, 2,
8, 8, 2, 2,
};
typedef struct
{
CGU_INT8 region; // one or two
CGU_INT8 m_mode; // m
CGU_INT8 d_shape_index; // d
CGU_INT rw; // endpt[0].A[0]
CGU_INT rx; // endpt[0].B[0]
CGU_INT ry; // endpt[1].A[0]
CGU_INT rz; // endpt[1].B[0]
CGU_INT gw; // endpt[0].A[1]
CGU_INT gx; // endpt[0].B[1]
CGU_INT gy; // endpt[1].A[1]
CGU_INT gz; // endpt[1].B[1]
CGU_INT bw; // endpt[0].A[2]
CGU_INT bx; // endpt[0].B[2]
CGU_INT by; // endpt[1].A[2]
CGU_INT bz; // endpt[1].B[2]
union
{
CGU_UINT8 indices[4][4]; // Indices data after header block
CGU_UINT8 indices16[16];
};
union
{
CGU_FLOAT din[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG]; // Original data input as floats
unsigned char cdin[256]; // as uchar to match float
};
END_Points EC[MAX_END_POINTS]; // compressed endpoints expressed as endpt[0].A[] and endpt[1].B[]
END_Points E[MAX_END_POINTS]; // decompressed endpoints
CGU_BOOL issigned; // Format is 16 bit signed floating point
CGU_BOOL istransformed; // region two: all modes = true except mode=10
short wBits; // number of bits for the root endpoint
short tBits[NCHANNELS]; // number of bits used for the transformed endpoints
CGU_INT format; // floating point format are we using for decompression
BC6H_Vec3f Paletef[2][16];
CGU_INT index; // for debugging
CGU_FLOAT fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];
CGU_FLOAT cur_best_fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];
CGU_INT shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE];
CGU_INT cur_best_shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE];
CGU_INT entryCount[MAX_SUBSETS];
CGU_INT cur_best_entryCount[MAX_SUBSETS];
CGU_FLOAT partition[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
CGU_FLOAT cur_best_partition[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
CGU_BOOL optimized; // were end points optimized during final encoding
} BC6H_Encode_local;
#ifndef ASPM_GPU
using namespace std;
class BitHeader
{
public:
BitHeader(const CGU_UINT8 in[], CGU_INT sizeinbytes)
{
m_bits.reset();
m_sizeinbytes = sizeinbytes;
if ((in != NULL) && (sizeinbytes <= 16))
{
// Init bits set with given data
CGU_INT bitpos = 0;
for (CGU_INT i = 0; i < sizeinbytes; i++)
{
CGU_INT bit = 1;
for (CGU_INT j = 0; j < 8; j++)
{
m_bits[bitpos] = in[i] & bit ? 1 : 0;
bit = bit << 1;
bitpos++;
}
}
}
}
~BitHeader()
{
}
void transferbits(CGU_UINT8 in[], CGU_INT sizeinbytes)
{
if ((sizeinbytes <= m_sizeinbytes) && (in != NULL))
{
// Init bits set with given data
memset(in, 0, sizeinbytes);
CGU_INT bitpos = 0;
for (CGU_INT i = 0; i < sizeinbytes; i++)
{
CGU_INT bit = 1;
for (CGU_INT j = 0; j < 8; j++)
{
if (m_bits[bitpos]) in[i] |= bit;
bit = bit << 1;
bitpos++;
}
}
}
}
CGU_INT getvalue(CGU_INT start, CGU_INT bitsize)
{
CGU_INT value = 0;
CGU_INT end = start + bitsize - 1;
for (; end >= start; end--)
{
value |= m_bits[end] ? 1 : 0;
if (end > start) value <<= 1;
}
return value;
}
void setvalue(CGU_INT start, CGU_INT bitsize, CGU_INT value, CGU_INT maskshift = 0)
{
CGU_INT end = start + bitsize - 1;
CGU_INT mask = 0x1 << maskshift;
for (; start <= end; start++)
{
m_bits[start] = (value&mask) ? 1 : 0;
mask <<= 1;
}
}
bitset<128> m_bits; // 16 bytes max
CGU_INT m_sizeinbytes;
};
//==================== DECODER CODE ======================
#define MAXENDPOINTS 2
#define U16MAX 0xffff
#define S16MAX 0x7fff
#define SIGN_EXTEND(w,tbits) ((((signed(w))&(1<<((tbits)-1)))?((~0)<<(tbits)):0)|(signed(w)))
enum
{
UNSIGNED_F16 = 1,
SIGNED_F16 = 2
};
enum
{
BC6_ONE = 0,
BC6_TWO
};
enum
{
C_RED = 0,
C_GREEN,
C_BLUE
};
struct BC6H_Vec3
{
int x,y,z;
};
struct AMD_BC6H_Format
{
unsigned short region; // one or two
unsigned short m_mode; // m
int d_shape_index; // d
int rw; // endpt[0].A[0]
int rx; // endpt[0].B[0]
int ry; // endpt[1].A[0]
int rz; // endpt[1].B[0]
int gw; // endpt[0].A[1]
int gx; // endpt[0].B[1]
int gy; // endpt[1].A[1]
int gz; // endpt[1].B[1]
int bw; // endpt[0].A[2]
int bx; // endpt[0].B[2]
int by; // endpt[1].A[2]
int bz; // endpt[1].B[2]
union
{
CGU_UINT8 indices[4][4]; // Indices data after header block
CGU_UINT8 indices16[16];
};
float din[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG]; // Original data input
END_Points EC[MAXENDPOINTS]; // compressed endpoints expressed as endpt[0].A[] and endpt[1].B[]
END_Points E[MAXENDPOINTS]; // decompressed endpoints
bool issigned; // Format is 16 bit signed floating point
bool istransformed; // region two: all modes = true except mode=10
short wBits; // number of bits for the root endpoint
short tBits[NCHANNELS]; // number of bits used for the transformed endpoints
int format; // floating point format are we using for decompression
BC6H_Vec3 Palete[2][16];
BC6H_Vec3f Paletef[2][16];
int index; // for debugging
float fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];
float cur_best_fEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG];
int shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE];
int cur_best_shape_indices[MAX_SUBSETS][MAX_SUBSET_SIZE];
int entryCount[MAX_SUBSETS];
int cur_best_entryCount[MAX_SUBSETS];
float partition[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
float cur_best_partition[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_DIMENSION_BIG];
bool optimized; // were end points optimized during final encoding
};
// =================================== END OF DECODER CODE ========================================================
#endif
//-------------------------------------------------
// Set by Host : Read only in kernel
//-------------------------------------------------
typedef struct
{
// Setup at initialization time
CGU_FLOAT m_quality;
CGU_FLOAT m_performance;
CGU_FLOAT m_errorThreshold;
CGU_DWORD m_validModeMask;
CGU_BOOL m_imageNeedsAlpha;
CGU_BOOL m_colourRestrict;
CGU_BOOL m_alphaRestrict;
CGU_BOOL m_isSigned;
} CMP_BC6HOptions;
typedef struct
{
// These are quality parameters used to select when to use the high precision quantizer
// and shaker paths
CGU_FLOAT m_quantizerRangeThreshold;
CGU_FLOAT m_shakerRangeThreshold;
CGU_FLOAT m_partitionSearchSize;
// Setup at initialization time
CGU_FLOAT m_quality;
CGU_FLOAT m_performance;
CGU_FLOAT m_errorThreshold;
CGU_DWORD m_validModeMask;
CGU_BOOL m_imageNeedsAlpha;
CGU_BOOL m_colourRestrict;
CGU_BOOL m_alphaRestrict;
CGU_BOOL m_isSigned;
// Source image info : must be set prior to use in kernel
CGU_UINT32 m_src_width;
CGU_UINT32 m_src_height;
CGU_UINT32 m_src_stride;
} BC6H_Encode;
CMP_STATIC void SetDefaultBC6Options(BC6H_Encode *BC6Encode)
{
if (BC6Encode)
{
BC6Encode->m_quality = 1.0f;
BC6Encode->m_quantizerRangeThreshold = 0.0f;
BC6Encode->m_shakerRangeThreshold = 0.0f;
BC6Encode->m_partitionSearchSize = 0.20f;
BC6Encode->m_performance = 0.0f;
BC6Encode->m_errorThreshold = 0.0f;
BC6Encode->m_validModeMask = 0;
BC6Encode->m_imageNeedsAlpha = 0;
BC6Encode->m_colourRestrict = 0;
BC6Encode->m_alphaRestrict = 0;
BC6Encode->m_isSigned = 0;
BC6Encode->m_src_width = 4;
BC6Encode->m_src_height = 4;
BC6Encode->m_src_stride = 0;
}
}
#endif