BC4 encoding

faster-single-tables
Andrew Cassidy 3 years ago
parent b42c52030d
commit 460785ee7d

@ -44,9 +44,9 @@ void BC1Decoder::DecodeBlock(Color4x4 dest, BC1Block *const block) const noexcep
assert(selector < 4);
assert((color.a == 0 && selector == 3 && l <= h) || color.a == UINT8_MAX);
if (_write_alpha) {
dest.get(x, y).SetRGBA(color);
dest.Get(x, y).SetRGBA(color);
} else {
dest.get(x, y).SetRGB(color);
dest.Get(x, y).SetRGB(color);
}
}
}

@ -39,7 +39,7 @@ void rgbcx::BC4Decoder::DecodeBlock(Byte4x4 dest, BC4Block *const block) const n
for (unsigned x = 0; x < 4; x++) {
const auto selector = selectors[y][x];
assert(selector < 8);
dest.set(x, y, values[selector]);
dest.Set(x, y, values[selector]);
}
}
}

@ -29,7 +29,6 @@
namespace rgbcx {
class BC4Decoder : public BlockDecoder<BC4Block, 4, 4> {
public:
using Byte4x4 = BlockView<uint8_t, 4, 4>;
BC4Decoder(uint8_t channel = 3) : _channel(channel) { assert(channel < 4U); }
void DecodeBlock(Color4x4 dest, BC4Block *const block) const noexcept(ndebug) override { DecodeBlock(dest.GetChannel(_channel), block); }

@ -19,4 +19,89 @@
#include "BC4Encoder.h"
namespace rgbcx {} // namespace rgbcx
#include <algorithm>
namespace rgbcx {
// Encodes one 4x4 block of single-byte values into a BC4 block.
//
// The block's minimum and maximum become the two endpoints; every texel is then
// mapped to one of eight 3-bit selectors and the 16 selectors are packed into a
// 48-bit field, texel i (in Flatten() order) occupying bits [3*i, 3*i + 2].
void BC4Encoder::EncodeBlock(Byte4x4 pixels, BC4Block *const dest) const noexcept(ndebug) {
    const auto texels = pixels.Flatten();
    const auto extremes = std::minmax_element(texels.begin(), texels.end());
    const uint8_t lowest = *extremes.first;
    const uint8_t highest = *extremes.second;

    // NOTE(review): the minimum goes into `high_alpha` and the maximum into `low_alpha`;
    // presumably the field names refer to byte position rather than magnitude — confirm against BC4Block.
    dest->high_alpha = lowest;
    dest->low_alpha = highest;

    // A flat block needs no interpolation: every selector is zero.
    if (highest == lowest) {
        dest->SetSelectorBits(0);
        return;
    }

    // Work relative to the minimum, in units of 1/14 of a step: the thresholds between
    // adjacent interpolated values sit at the odd multiples of the endpoint range.
    // The x14 comes from adding two x7 scale factors.
    const int span = highest - lowest;
    const int thresholds[7] = {span * 13, span * 11, span * 9, span * 7, span * 5, span * 3, span * 1};

    // BC4 floors in its divisions; the +4 compensates for that. Subtracting lowest * 14
    // rebases each scaled texel so that the minimum maps to zero.
    // With this bias the result matches an exhaustive closest-of-8 search for all inputs.
    const int bias = 4 - lowest * 14;

    // Maps "number of thresholds met" (0-7) to the selector value stored in the block.
    static constexpr uint64_t kSelectorForRank[8] = {1U, 7U, 6U, 5U, 4U, 3U, 2U, 0U};

    uint64_t selector_bits = 0;
    unsigned shift = 0;
    for (const uint8_t texel : texels) {
        const int scaled = texel * 14 + bias;

        unsigned rank = 0;
        for (const int threshold : thresholds) {
            if (scaled >= threshold) { rank++; }
        }

        selector_bits |= kSelectorForRank[rank] << shift;
        shift += 3;  // three selector bits per texel
    }

    dest->SetSelectorBits(selector_bits);
}
} // namespace rgbcx

@ -19,7 +19,19 @@
#pragma once
#include "../BlockEncoder.h"
#include "BC4Block.h"
namespace rgbcx {
class BC4Encoder {};
// Block encoder producing BC4 blocks from a single channel of 4x4 color blocks.
class BC4Encoder : public BlockEncoder<BC4Block, 4, 4> {
   public:
    // channel: index of the color channel encoded by the BlockEncoder interface; must be < 4.
    BC4Encoder(const uint8_t channel) : _channel(channel) { assert(channel < 4); }

    // BlockEncoder interface: encodes the channel selected at construction.
    void EncodeBlock(Color4x4 pixels, BC4Block *dest) const override { EncodeBlock(pixels.GetChannel(_channel), dest); }

    // Encodes an explicitly chosen channel instead of the one stored at construction.
    // channel must be < 4, the same bound the constructor enforces.
    void EncodeBlock(Color4x4 pixels, BC4Block *const dest, uint8_t channel) const noexcept(ndebug) {
        assert(channel < 4);
        EncodeBlock(pixels.GetChannel(channel), dest);
    }

    // Encodes a 4x4 block of single-byte values; the other overloads forward here.
    void EncodeBlock(Byte4x4 pixels, BC4Block *const dest) const noexcept(ndebug);

   private:
    const uint8_t _channel;  // channel index used by the Color4x4 override
};
} // namespace rgbcx

@ -40,13 +40,14 @@ template <class B, size_t M, size_t N> class BlockDecoder {
virtual void DecodeBlock(DecodedBlock dest, EncodedBlock *const block) const noexcept(ndebug) = 0;
std::vector<Color> DecodeImage(uint8_t *bytes, unsigned image_width, unsigned image_height) {
unsigned block_width = maximum(1U, ((image_width + 3) / 4));
unsigned block_height = maximum(1U, ((image_height + 3) / 4));
using Row = typename DecodedBlock::Row;
void DecodeImage(uint8_t *encoded, Color *decoded, unsigned image_width, unsigned image_height) {
assert(image_width % N == 0);
assert(image_width % M == 0);
auto image = std::vector<Color>(block_width * block_height * N * M);
auto blocks = reinterpret_cast<B *>(bytes);
unsigned block_width = image_width / N;
unsigned block_height = image_height / M;
auto blocks = reinterpret_cast<B *>(encoded);
// from experimentation, multithreading this using OpenMP actually makes decoding slower
// due to thread creation/teardown taking longer than the decoding process itself.
@ -62,16 +63,11 @@ template <class B, size_t M, size_t N> class BlockDecoder {
assert(pixel_x + N <= image_width);
unsigned top_left = pixel_x + (pixel_y * image_width);
auto rows = std::array<Row *, M>();
for (unsigned i = 0; i < M; i++) { rows[i] = reinterpret_cast<Row *>(&image[top_left + i * image_width]); }
auto dest = DecodedBlock(&image[top_left], image_width);
auto dest = DecodedBlock(&decoded[top_left], image_width);
DecodeBlock(dest, &blocks[x + block_width * y]);
}
}
return image;
}
};
} // namespace rgbcx

@ -27,8 +27,42 @@ namespace rgbcx {
template <class B, size_t M, size_t N> class BlockEncoder {
public:
using DecodedBlock = BlockView<M, N>;
using DecodedBlock = ColorBlockView<M, N>;
using EncodedBlock = B;
virtual void EncodeBlock(EncodedBlock *dest, DecodedBlock *const pixels) const = 0;
BlockEncoder() noexcept = default;
virtual ~BlockEncoder() noexcept = default;
virtual void EncodeBlock(DecodedBlock pixels, EncodedBlock *dest) const = 0;
// Encodes an entire image by calling EncodeBlock on every M-row by N-column block.
//
// encoded:      output buffer, written as a row-major array of EncodedBlock (B);
//               must hold (image_width / N) * (image_height / M) blocks.
// decoded:      source pixels, indexed row-major with a row length of image_width.
// image_width:  width in pixels; must be a multiple of N.
// image_height: height in pixels; must be a multiple of M.
void EncodeImage(uint8_t *encoded, Color *decoded, unsigned image_width, unsigned image_height) {
    assert(image_width % N == 0);
    // The height, not the width, has to divide evenly into M-row blocks.
    assert(image_height % M == 0);

    const unsigned block_width = image_width / N;
    const unsigned block_height = image_height / M;

    auto blocks = reinterpret_cast<B *>(encoded);

    // From experimentation, multithreading the equivalent decode loop with OpenMP made it slower,
    // because thread creation/teardown took longer than the work itself.
    // NOTE(review): that remark was written for decoding; presumably the same holds here — TODO confirm.
    // As a result, this is left as a serial operation despite being embarrassingly parallelizable.
    for (unsigned y = 0; y < block_height; y++) {
        for (unsigned x = 0; x < block_width; x++) {
            const unsigned pixel_x = x * N;
            const unsigned pixel_y = y * M;

            // The block must lie entirely inside the image.
            assert(pixel_y + M <= image_height);
            assert(pixel_x + N <= image_width);

            // Index of the block's top-left pixel in the source image.
            const unsigned top_left = pixel_x + (pixel_y * image_width);
            auto src = DecodedBlock(&decoded[top_left], image_width);

            EncodeBlock(src, &blocks[x + block_width * y]);
        }
    }
}
};
} // namespace rgbcx

@ -42,7 +42,7 @@ template <typename S, size_t N> class RowView {
return start[index * pixel_stride];
}
constexpr int size() noexcept { return N; }
constexpr int Size() noexcept { return N; }
S *const start;
const int pixel_stride;
@ -59,29 +59,29 @@ template <typename S, size_t M, size_t N> class BlockView {
return RowView<S, N>(&start[row_stride * (int)index], pixel_stride);
}
constexpr int width() noexcept { return N; }
constexpr int height() noexcept { return M; }
constexpr int size() noexcept { return N * M; }
constexpr int Width() noexcept { return N; }
constexpr int Height() noexcept { return M; }
constexpr int Size() noexcept { return N * M; }
constexpr S &get(unsigned x, unsigned y) noexcept(ndebug) {
constexpr S &Get(unsigned x, unsigned y) noexcept(ndebug) {
assert(x < N);
assert(y < M);
return start[(row_stride * (int)y) + (pixel_stride * (int)x)];
}
constexpr S get(unsigned x, unsigned y) const noexcept(ndebug) {
constexpr S Get(unsigned x, unsigned y) const noexcept(ndebug) {
assert(x < N);
assert(y < M);
return start[(row_stride * (int)y) + (pixel_stride * (int)x)];
}
constexpr void set(unsigned x, unsigned y, S value) noexcept(ndebug) {
constexpr void Set(unsigned x, unsigned y, S value) noexcept(ndebug) {
assert(x < N);
assert(y < M);
start[(row_stride * (int)y) + (pixel_stride * (int)x)] = value;
}
constexpr std::array<S, M * N> flatten() noexcept {
constexpr std::array<S, M * N> Flatten() noexcept {
std::array<S, M * N> result;
for (int x = 0; x < N; x++) {
for (int y = 0; y < M; y++) { result[x + (N * y)] = start[(row_stride * y) + (pixel_stride * x)]; }
@ -107,9 +107,10 @@ template <size_t M, size_t N> class ColorBlockView : public BlockView<Color, M,
return ChannelView(channelStart, Base::row_stride * 4, Base::pixel_stride * 4);
}
void SetRGB(unsigned x, unsigned y, Color value) noexcept(ndebug) { Base::get(x, y).SetRGB(value); }
void SetRGB(unsigned x, unsigned y, Color value) noexcept(ndebug) { Base::Get(x, y).SetRGB(value); }
};
using Color4x4 = ColorBlockView<4, 4>;
using Byte4x4 = BlockView<uint8_t, 4, 4>;
} // namespace rgbcx

@ -17,6 +17,7 @@
#include <type_traits>
#include <vector>
#include "../BC4/BC4Encoder.h"
#include "../rgbcx.h"
#include "../rgbcxDecoders.h"
#include "../util.h"
@ -116,9 +117,7 @@ class image_u8 {
inline const color_quad_u8_vec &get_pixels() const { return m_pixels; }
inline color_quad_u8_vec &get_pixels() { return m_pixels; }
void set_pixels(const color_quad_u8_vec &pixels) {
m_pixels = pixels;
}
void set_pixels(const color_quad_u8_vec &pixels) { m_pixels = pixels; }
inline uint32_t width() const { return m_width; }
inline uint32_t height() const { return m_height; }
@ -266,7 +265,7 @@ class image_metrics {
const color_quad_u8 &cb = b(x, y);
if (!num_channels) {
// int luma_diff = ;
// int luma_diff = ;
unsigned index = iabs(ca.get_luma() - cb.get_luma());
hist[index]++;
} else {
@ -660,67 +659,75 @@ int main(int argc, char *argv[]) {
uint32_t bc7_mode_hist[8];
memset(bc7_mode_hist, 0, sizeof(bc7_mode_hist));
for (uint32_t by = 0; by < blocks_y; by++) {
for (uint32_t bx = 0; bx < blocks_x; bx++) {
color_quad_u8 pixels[16];
if (dxgi_format == DXGI_FORMAT_BC4_UNORM) {
auto bc4_encoder = BC4Encoder(bc45_channel0);
Color *src = &source_image.get_pixels()[0];
source_image.get_block(bx, by, 4, 4, pixels);
if (!has_alpha) {
for (uint32_t i = 0; i < 16; i++) {
if (pixels[i][3] < 255) {
has_alpha = true;
break;
bc4_encoder.EncodeImage(reinterpret_cast<uint8_t *>(&packed_image8[0]), src, source_image.width(), source_image.height());
} else {
for (uint32_t by = 0; by < blocks_y; by++) {
for (uint32_t bx = 0; bx < blocks_x; bx++) {
color_quad_u8 pixels[16];
source_image.get_block(bx, by, 4, 4, pixels);
if (!has_alpha) {
for (uint32_t i = 0; i < 16; i++) {
if (pixels[i][3] < 255) {
has_alpha = true;
break;
}
}
}
}
switch (dxgi_format) {
case DXGI_FORMAT_BC1_UNORM: {
block8 *pBlock = &packed_image8[bx + by * blocks_x];
switch (dxgi_format) {
case DXGI_FORMAT_BC1_UNORM: {
block8 *pBlock = &packed_image8[bx + by * blocks_x];
rgbcx::encode_bc1(bc1_quality_level, pBlock, &pixels[0][0], use_bc1_3color_mode, use_bc1_3color_mode_for_black);
break;
}
case DXGI_FORMAT_BC3_UNORM: {
BC3Block *pBlock = reinterpret_cast<BC3Block *>(&packed_image16[bx + by * blocks_x]);
rgbcx::encode_bc1(bc1_quality_level, pBlock, &pixels[0][0], use_bc1_3color_mode, use_bc1_3color_mode_for_black);
break;
}
case DXGI_FORMAT_BC3_UNORM: {
BC3Block *pBlock = reinterpret_cast<BC3Block *>(&packed_image16[bx + by * blocks_x]);
rgbcx::encode_bc3(bc1_quality_level, pBlock, &pixels[0][0]);
break;
}
case DXGI_FORMAT_BC4_UNORM: {
block8 *pBlock = &packed_image8[bx + by * blocks_x];
rgbcx::encode_bc3(bc1_quality_level, pBlock, &pixels[0][0]);
break;
}
case DXGI_FORMAT_BC4_UNORM: {
block8 *pBlock = &packed_image8[bx + by * blocks_x];
rgbcx::encode_bc4(pBlock, &pixels[0][bc45_channel0], 4);
break;
}
case DXGI_FORMAT_BC5_UNORM: {
block16 *pBlock = &packed_image16[bx + by * blocks_x];
rgbcx::encode_bc4(pBlock, &pixels[0][bc45_channel0], 4);
break;
}
case DXGI_FORMAT_BC5_UNORM: {
block16 *pBlock = &packed_image16[bx + by * blocks_x];
rgbcx::encode_bc5(reinterpret_cast<BC5Block *>(pBlock), &pixels[0][0], bc45_channel0, bc45_channel1, 4);
break;
}
case DXGI_FORMAT_BC7_UNORM: {
block16 *pBlock = &packed_image16[bx + by * blocks_x];
rgbcx::encode_bc5(reinterpret_cast<BC5Block *>(pBlock), &pixels[0][0], bc45_channel0, bc45_channel1, 4);
break;
}
case DXGI_FORMAT_BC7_UNORM: {
block16 *pBlock = &packed_image16[bx + by * blocks_x];
bc7enc_compress_block(pBlock, pixels, &pack_params);
bc7enc_compress_block(pBlock, pixels, &pack_params);
uint32_t mode = ((uint8_t *)pBlock)[0];
for (uint32_t m = 0; m <= 7; m++) {
if (mode & (1 << m)) {
bc7_mode_hist[m]++;
break;
uint32_t mode = ((uint8_t *)pBlock)[0];
for (uint32_t m = 0; m <= 7; m++) {
if (mode & (1 << m)) {
bc7_mode_hist[m]++;
break;
}
}
break;
}
default: {
assert(0);
break;
}
break;
}
default: {
assert(0);
break;
}
}
}
if ((by & 127) == 0) printf(".");
if ((by & 127) == 0) printf(".");
}
}
clock_t end_t = clock();
@ -749,23 +756,22 @@ int main(int argc, char *argv[]) {
bool punchthrough_flag = false;
auto decoder_bc1 = rgbcx::BC1Decoder();
auto decoder_bc3 = rgbcx::BC3Decoder();
auto decoder_bc4 = rgbcx::BC4Decoder();
auto decoder_bc4 = rgbcx::BC4Decoder(bc45_channel0);
auto decoder_bc5 = rgbcx::BC5Decoder();
Color *dest = &unpacked_image.get_pixels()[0];
switch (dxgi_format) {
case DXGI_FORMAT_BC1_UNORM:
unpacked_image.set_pixels(decoder_bc1.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image8[0]), source_image.width(), source_image.height()));
decoder_bc1.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image8[0]), dest, source_image.width(), source_image.height());
break;
case DXGI_FORMAT_BC3_UNORM:
unpacked_image.set_pixels(
decoder_bc3.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image16[0]), source_image.width(), source_image.height()));
decoder_bc3.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image16[0]), dest, source_image.width(), source_image.height());
break;
case DXGI_FORMAT_BC4_UNORM:
unpacked_image.set_pixels(decoder_bc4.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image8[0]), source_image.width(), source_image.height()));
decoder_bc4.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image8[0]), dest, source_image.width(), source_image.height());
break;
case DXGI_FORMAT_BC5_UNORM:
unpacked_image.set_pixels(
decoder_bc5.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image16[0]), source_image.width(), source_image.height()));
decoder_bc5.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image16[0]), dest, source_image.width(), source_image.height());
break;
default:
assert(0);
@ -777,7 +783,7 @@ int main(int argc, char *argv[]) {
// void *pBlock = (bytes_per_block == 16) ? (void *)&packed_image16[bx + by * blocks_x] : (void *)&packed_image8[bx + by * blocks_x];
//
// color_quad_u8 unpacked_pixels[16];
// for (uint32_t i = 0; i < 16; i++) unpacked_pixels[i].set(0, 0, 0, 255);
// for (uint32_t i = 0; i < 16; i++) unpacked_pixels[i].Set(0, 0, 0, 255);
//
// switch (dxgi_format) {
// case DXGI_FORMAT_BC1_UNORM:
@ -806,7 +812,6 @@ int main(int argc, char *argv[]) {
clock_t end_decode_t = clock();
printf("\nDecode time: %f secs\n", (double)(end_decode_t - start_decode_t) / CLOCKS_PER_SEC);
if ((punchthrough_flag) && (dxgi_format == DXGI_FORMAT_BC3_UNORM))
fprintf(stderr, "Warning: BC3 mode selected, but rgbcx::unpack_bc3() returned one or more blocks using 3-color mode!\n");
@ -849,7 +854,8 @@ int main(int argc, char *argv[]) {
for (uint32_t y = 0; y < unpacked_image_alpha.height(); y++)
for (uint32_t x = 0; x < unpacked_image_alpha.width(); x++) {
uint8_t alpha = unpacked_image_alpha(x, y).a;
unpacked_image_alpha(x, y).SetRGBA(alpha, alpha, alpha, 255); }
unpacked_image_alpha(x, y).SetRGBA(alpha, alpha, alpha, 255);
}
if (!save_png(png_alpha_output_filename.c_str(), unpacked_image_alpha, false))
failed = true;

Loading…
Cancel
Save