Make BC4 encoding a bit more readable

faster-single-tables
Andrew Cassidy 3 years ago
parent 460785ee7d
commit 4217d526cf

@@ -68,6 +68,11 @@ class BC4Block {
SetSelectorBits(packed);
}
void PackSelectors(const std::array<uint8_t, 16>& unpacked) {
auto packed = Pack<uint8_t, uint64_t, 3, 16>(unpacked);
SetSelectorBits(packed);
}
inline uint32_t GetSelector(uint32_t x, uint32_t y, uint64_t selector_bits) const {
assert((x < 4U) && (y < 4U));
return (selector_bits >> (((y * 4) + x) * SelectorBits)) & (SelectorMask);

@@ -23,85 +23,44 @@
namespace rgbcx {
// Encodes the 16 values of `pixels` into `dest`: the smallest and largest values are written to
// dest->high_alpha / dest->low_alpha, then every value is assigned a selector by counting how many
// of 7 thresholds (spread between the minimum and maximum) it reaches, and the selectors are
// stored in dest.
//
// NOTE(review): this listing appears to interleave the removed and the added lines of the commit
// without +/- markers (e.g. both `bytes` and `flattened`, both `min_v` and `min`, both the
// t0..t6 / s_tran* / a0..a3 form and the thresholds / Levels / selectors form). Confirm against
// the commit before treating it as one compilable function.
void BC4Encoder::EncodeBlock(Byte4x4 pixels, BC4Block *const dest) const noexcept(ndebug) {
auto bytes = pixels.Flatten();
auto minmax = std::minmax_element(bytes.begin(), bytes.end());
auto flattened = pixels.Flatten();
auto minmax = std::minmax_element(flattened.begin(), flattened.end());
uint8_t min_v = *minmax.first;
uint8_t max_v = *minmax.second;
uint8_t min = *minmax.first;
uint8_t max = *minmax.second;
// NOTE(review): the minimum is stored in high_alpha and the maximum in low_alpha;
// confirm this matches the intended meaning of those BC4Block fields.
dest->high_alpha = min_v;
dest->low_alpha = max_v;
dest->high_alpha = min;
dest->low_alpha = max;
// All 16 values are identical: write selector bits of 0 and skip the threshold computation.
if (max_v == min_v) {
if (max == min) {
dest->SetSelectorBits(0);
return;
}
const uint32_t delta = max_v - min_v;
// min_v is now 0. Compute thresholds between values by scaling max_v. It's x14 because we're adding two x7 scale factors.
// t0..t6 are the odd multiples 13, 11, ..., 1 of delta, in descending order.
const int t0 = delta * 13;
const int t1 = delta * 11;
const int t2 = delta * 9;
const int t3 = delta * 7;
const int t4 = delta * 5;
const int t5 = delta * 3;
const int t6 = delta * 1;
std::array<uint8_t, 16> selectors = {};
// Indexed by the number of thresholds a value reaches (0..7).
const static std::array<uint8_t, 8> Levels = {1U, 7U, 6U, 5U, 4U, 3U, 2U, 0U}; // selector value options in linear order
// BC4 floors in its divisions, which we compensate for with the 4 bias.
// This function is optimal for all possible inputs (i.e. it outputs the same results as checking all 8 values and choosing the closest one).
const int bias = 4 - min_v * 14;
// s_tran0..s_tran3 hold the same 8 selector values as above, pre-shifted left by 0, 3, 6 and 9 bits,
// so the four selectors of one group of four values can be OR-ed together directly.
static const uint32_t s_tran0[8] = {1U, 7U, 6U, 5U, 4U, 3U, 2U, 0U};
static const uint32_t s_tran1[8] = {1U << 3U, 7U << 3U, 6U << 3U, 5U << 3U, 4U << 3U, 3U << 3U, 2U << 3U, 0U << 3U};
static const uint32_t s_tran2[8] = {1U << 6U, 7U << 6U, 6U << 6U, 5U << 6U, 4U << 6U, 3U << 6U, 2U << 6U, 0U << 6U};
static const uint32_t s_tran3[8] = {1U << 9U, 7U << 9U, 6U << 9U, 5U << 9U, 4U << 9U, 3U << 9U, 2U << 9U, 0U << 9U};
uint64_t a0, a1, a2, a3;
{
// Values 0..3: selectors land in bits 0..11 (no extra shift).
const int v0 = bytes[0] * 14 + bias;
const int v1 = bytes[1] * 14 + bias;
const int v2 = bytes[2] * 14 + bias;
const int v3 = bytes[3] * 14 + bias;
a0 = s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)];
a1 = s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)];
a2 = s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)];
a3 = s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)];
}
const int bias = 4 - min * 14;
const int delta = max - min;
{
// Values 4..7: same lookup, shifted up by 12 bits. The shift happens in 32 bits before the cast.
const int v0 = bytes[4] * 14 + bias;
const int v1 = bytes[5] * 14 + bias;
const int v2 = bytes[6] * 14 + bias;
const int v3 = bytes[7] * 14 + bias;
a0 |= (uint64_t)(s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)] << 12U);
a1 |= (uint64_t)(s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)] << 12U);
a2 |= (uint64_t)(s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)] << 12U);
a3 |= (uint64_t)(s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)] << 12U);
}
// min is now 0. Compute thresholds between values by scaling max. It's x14 because we're adding two x7 scale factors.
// bias is applied here
// thresholds[i] = delta * (2i + 1) - bias, i.e. the odd multiples 1, 3, ..., 13 of delta with the
// bias subtracted once here instead of being added to every value.
std::array<int, 7> thresholds = {};
for (unsigned i = 0; i < 7; i++) thresholds[i] = delta * (1 + (2 * (int)i)) - bias;
{
// Values 8..11: widened to 64 bits first, then shifted up by 24 bits.
const int v0 = bytes[8] * 14 + bias;
const int v1 = bytes[9] * 14 + bias;
const int v2 = bytes[10] * 14 + bias;
const int v3 = bytes[11] * 14 + bias;
a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 24U);
a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 24U);
a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 24U);
a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 24U);
}
// iterate over all values and calculate selectors
for (unsigned i = 0; i < 16; i++) {
int value = flattened[i] * 14; // multiply by denominator
// level = number of thresholds this value is greater than or equal to
unsigned level = 0;
for (unsigned c = 0; c < 7; c++) level += value >= thresholds[c];
{
// Values 12..15: widened to 64 bits first, then shifted up by 36 bits.
const int v0 = bytes[12] * 14 + bias;
const int v1 = bytes[13] * 14 + bias;
const int v2 = bytes[14] * 14 + bias;
const int v3 = bytes[15] * 14 + bias;
a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 36U);
a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 36U);
a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 36U);
a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 36U);
// Translate the threshold count into the stored selector value.
selectors[i] = Levels[level];
}
// The four partial words cover disjoint bit positions, so OR-ing them yields all 16 selectors.
dest->SetSelectorBits(a0 | a1 | a2 | a3);
// Pack the per-value selector array and store it in dest.
dest->PackSelectors(selectors);
}
} // namespace rgbcx
Loading…
Cancel
Save