diff --git a/src/BC1/BC1Decoder.cpp b/src/BC1/BC1Decoder.cpp index 34bda12..77640fc 100644 --- a/src/BC1/BC1Decoder.cpp +++ b/src/BC1/BC1Decoder.cpp @@ -44,9 +44,9 @@ void BC1Decoder::DecodeBlock(Color4x4 dest, BC1Block *const block) const noexcep assert(selector < 4); assert((color.a == 0 && selector == 3 && l <= h) || color.a == UINT8_MAX); if (_write_alpha) { - dest.get(x, y).SetRGBA(color); + dest.Get(x, y).SetRGBA(color); } else { - dest.get(x, y).SetRGB(color); + dest.Get(x, y).SetRGB(color); } } } diff --git a/src/BC4/BC4Decoder.cpp b/src/BC4/BC4Decoder.cpp index b64e81d..c3e3675 100644 --- a/src/BC4/BC4Decoder.cpp +++ b/src/BC4/BC4Decoder.cpp @@ -39,7 +39,7 @@ void rgbcx::BC4Decoder::DecodeBlock(Byte4x4 dest, BC4Block *const block) const n for (unsigned x = 0; x < 4; x++) { const auto selector = selectors[y][x]; assert(selector < 8); - dest.set(x, y, values[selector]); + dest.Set(x, y, values[selector]); } } } diff --git a/src/BC4/BC4Decoder.h b/src/BC4/BC4Decoder.h index 1316c16..dc725cf 100644 --- a/src/BC4/BC4Decoder.h +++ b/src/BC4/BC4Decoder.h @@ -29,7 +29,6 @@ namespace rgbcx { class BC4Decoder : public BlockDecoder { public: - using Byte4x4 = BlockView; BC4Decoder(uint8_t channel = 3) : _channel(channel) { assert(channel < 4U); } void DecodeBlock(Color4x4 dest, BC4Block *const block) const noexcept(ndebug) override { DecodeBlock(dest.GetChannel(_channel), block); } diff --git a/src/BC4/BC4Encoder.cpp b/src/BC4/BC4Encoder.cpp index 295184e..da273c4 100644 --- a/src/BC4/BC4Encoder.cpp +++ b/src/BC4/BC4Encoder.cpp @@ -19,4 +19,89 @@ #include "BC4Encoder.h" -namespace rgbcx {} // namespace rgbcx \ No newline at end of file +#include + +namespace rgbcx { +void BC4Encoder::EncodeBlock(Byte4x4 pixels, BC4Block *const dest) const noexcept(ndebug) { + auto bytes = pixels.Flatten(); + auto minmax = std::minmax_element(bytes.begin(), bytes.end()); + + uint8_t min_v = *minmax.first; + uint8_t max_v = *minmax.second; + + dest->high_alpha = min_v; + 
dest->low_alpha = max_v; + + if (max_v == min_v) { + dest->SetSelectorBits(0); + return; + } + + const uint32_t delta = max_v - min_v; + + // min_v is now 0. Compute thresholds between values by scaling max_v. It's x14 because we're adding two x7 scale factors. + const int t0 = delta * 13; + const int t1 = delta * 11; + const int t2 = delta * 9; + const int t3 = delta * 7; + const int t4 = delta * 5; + const int t5 = delta * 3; + const int t6 = delta * 1; + + // BC4 floors in its divisions, which we compensate for with the 4 bias. + // This function is optimal for all possible inputs (i.e. it outputs the same results as checking all 8 values and choosing the closest one). + const int bias = 4 - min_v * 14; + + static const uint32_t s_tran0[8] = {1U, 7U, 6U, 5U, 4U, 3U, 2U, 0U}; + static const uint32_t s_tran1[8] = {1U << 3U, 7U << 3U, 6U << 3U, 5U << 3U, 4U << 3U, 3U << 3U, 2U << 3U, 0U << 3U}; + static const uint32_t s_tran2[8] = {1U << 6U, 7U << 6U, 6U << 6U, 5U << 6U, 4U << 6U, 3U << 6U, 2U << 6U, 0U << 6U}; + static const uint32_t s_tran3[8] = {1U << 9U, 7U << 9U, 6U << 9U, 5U << 9U, 4U << 9U, 3U << 9U, 2U << 9U, 0U << 9U}; + + uint64_t a0, a1, a2, a3; + { + const int v0 = bytes[0] * 14 + bias; + const int v1 = bytes[1] * 14 + bias; + const int v2 = bytes[2] * 14 + bias; + const int v3 = bytes[3] * 14 + bias; + a0 = s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]; + a1 = s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]; + a2 = s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]; + a3 = s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]; + } + + { + const int v0 = bytes[4] * 14 + bias; + const int v1 = bytes[5] * 14 + bias; + const int v2 = bytes[6] * 14 + bias; + const int v3 = bytes[7] * 14 + bias; + a0 |= (uint64_t)(s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= 
t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)] << 12U); + a1 |= (uint64_t)(s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)] << 12U); + a2 |= (uint64_t)(s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)] << 12U); + a3 |= (uint64_t)(s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)] << 12U); + } + + { + const int v0 = bytes[8] * 14 + bias; + const int v1 = bytes[9] * 14 + bias; + const int v2 = bytes[10] * 14 + bias; + const int v3 = bytes[11] * 14 + bias; + a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 24U); + a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 24U); + a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 24U); + a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 24U); + } + + { + const int v0 = bytes[12] * 14 + bias; + const int v1 = bytes[13] * 14 + bias; + const int v2 = bytes[14] * 14 + bias; + const int v3 = bytes[15] * 14 + bias; + a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 36U); + a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 36U); + a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 36U); + a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 36U); + } + + dest->SetSelectorBits(a0 | a1 | a2 | a3); +} +} // namespace rgbcx \ No newline at end of file diff --git a/src/BC4/BC4Encoder.h b/src/BC4/BC4Encoder.h index 624939b..61c3e08 100644 --- 
a/src/BC4/BC4Encoder.h +++ b/src/BC4/BC4Encoder.h @@ -19,7 +19,19 @@ #pragma once +#include "../BlockEncoder.h" +#include "BC4Block.h" namespace rgbcx { -class BC4Encoder {}; +class BC4Encoder : public BlockEncoder { + public: + BC4Encoder(const uint8_t channel) : _channel(channel) { assert(channel < 4); } + + void EncodeBlock(Color4x4 pixels, BC4Block *dest) const override { EncodeBlock(pixels.GetChannel(_channel), dest); } + void EncodeBlock(Color4x4 pixels, BC4Block *const dest, uint8_t channel) const noexcept(ndebug) { EncodeBlock(pixels.GetChannel(channel), dest); } + void EncodeBlock(Byte4x4 pixels, BC4Block *const dest) const noexcept(ndebug); + + private: + const uint8_t _channel; +}; } // namespace rgbcx diff --git a/src/BlockDecoder.h b/src/BlockDecoder.h index 4e5299f..4a8db5a 100644 --- a/src/BlockDecoder.h +++ b/src/BlockDecoder.h @@ -40,13 +40,14 @@ template class BlockDecoder { virtual void DecodeBlock(DecodedBlock dest, EncodedBlock *const block) const noexcept(ndebug) = 0; - std::vector DecodeImage(uint8_t *bytes, unsigned image_width, unsigned image_height) { - unsigned block_width = maximum(1U, ((image_width + 3) / 4)); - unsigned block_height = maximum(1U, ((image_height + 3) / 4)); - using Row = typename DecodedBlock::Row; + void DecodeImage(uint8_t *encoded, Color *decoded, unsigned image_width, unsigned image_height) { + assert(image_width % N == 0); + assert(image_height % M == 0); - auto image = std::vector(block_width * block_height * N * M); - auto blocks = reinterpret_cast(bytes); + unsigned block_width = image_width / N; + unsigned block_height = image_height / M; + + auto blocks = reinterpret_cast(encoded); // from experimentation, multithreading this using OpenMP actually makes decoding slower // due to thread creation/teardown taking longer than the decoding process itself. 
@@ -62,16 +63,11 @@ template class BlockDecoder { assert(pixel_x + N <= image_width); unsigned top_left = pixel_x + (pixel_y * image_width); - auto rows = std::array(); - for (unsigned i = 0; i < M; i++) { rows[i] = reinterpret_cast(&image[top_left + i * image_width]); } - - auto dest = DecodedBlock(&image[top_left], image_width); + auto dest = DecodedBlock(&decoded[top_left], image_width); DecodeBlock(dest, &blocks[x + block_width * y]); } } - - return image; } }; } // namespace rgbcx diff --git a/src/BlockEncoder.h b/src/BlockEncoder.h index 9966055..b153d7c 100644 --- a/src/BlockEncoder.h +++ b/src/BlockEncoder.h @@ -27,8 +27,42 @@ namespace rgbcx { template class BlockEncoder { public: - using DecodedBlock = BlockView; + using DecodedBlock = ColorBlockView; using EncodedBlock = B; - virtual void EncodeBlock(EncodedBlock *dest, DecodedBlock *const pixels) const = 0; + + BlockEncoder() noexcept = default; + virtual ~BlockEncoder() noexcept = default; + + virtual void EncodeBlock(DecodedBlock pixels, EncodedBlock *dest) const = 0; + + void EncodeImage(uint8_t *encoded, Color *decoded, unsigned image_width, unsigned image_height) { + assert(image_width % N == 0); + assert(image_height % M == 0); + + unsigned block_width = image_width / N; + unsigned block_height = image_height / M; + + auto blocks = reinterpret_cast(encoded); + + // from experimentation, multithreading this using OpenMP actually makes decoding slower + // due to thread creation/teardown taking longer than the decoding process itself. 
+ // As a result, this is left as a serial operation despite being embarrassingly parallelizable + for (unsigned y = 0; y < block_height; y++) { + for (unsigned x = 0; x < block_width; x++) { + unsigned pixel_x = x * N; + unsigned pixel_y = y * M; + + assert(pixel_x >= 0); + assert(pixel_y >= 0); + assert(pixel_y + M <= image_height); + assert(pixel_x + N <= image_width); + + unsigned top_left = pixel_x + (pixel_y * image_width); + auto src = DecodedBlock(&decoded[top_left], image_width); + + EncodeBlock(src, &blocks[x + block_width * y]); + } + } + } }; } // namespace rgbcx diff --git a/src/BlockView.h b/src/BlockView.h index a8f6525..3a9ca2c 100644 --- a/src/BlockView.h +++ b/src/BlockView.h @@ -42,7 +42,7 @@ template class RowView { return start[index * pixel_stride]; } - constexpr int size() noexcept { return N; } + constexpr int Size() noexcept { return N; } S *const start; const int pixel_stride; @@ -59,29 +59,29 @@ template class BlockView { return RowView(&start[row_stride * (int)index], pixel_stride); } - constexpr int width() noexcept { return N; } - constexpr int height() noexcept { return M; } - constexpr int size() noexcept { return N * M; } + constexpr int Width() noexcept { return N; } + constexpr int Height() noexcept { return M; } + constexpr int Size() noexcept { return N * M; } - constexpr S &get(unsigned x, unsigned y) noexcept(ndebug) { + constexpr S &Get(unsigned x, unsigned y) noexcept(ndebug) { assert(x < N); assert(y < M); return start[(row_stride * (int)y) + (pixel_stride * (int)x)]; } - constexpr S get(unsigned x, unsigned y) const noexcept(ndebug) { + constexpr S Get(unsigned x, unsigned y) const noexcept(ndebug) { assert(x < N); assert(y < M); return start[(row_stride * (int)y) + (pixel_stride * (int)x)]; } - constexpr void set(unsigned x, unsigned y, S value) noexcept(ndebug) { + constexpr void Set(unsigned x, unsigned y, S value) noexcept(ndebug) { assert(x < N); assert(y < M); start[(row_stride * (int)y) + (pixel_stride * (int)x)] = 
value; } - constexpr std::array flatten() noexcept { + constexpr std::array Flatten() noexcept { std::array result; for (int x = 0; x < N; x++) { for (int y = 0; y < M; y++) { result[x + (N * y)] = start[(row_stride * y) + (pixel_stride * x)]; } @@ -107,9 +107,10 @@ template class ColorBlockView : public BlockView; +using Byte4x4 = BlockView; } // namespace rgbcx \ No newline at end of file diff --git a/src/test/test.cpp b/src/test/test.cpp index 5c98fcd..04481b3 100644 --- a/src/test/test.cpp +++ b/src/test/test.cpp @@ -17,6 +17,7 @@ #include #include +#include "../BC4/BC4Encoder.h" #include "../rgbcx.h" #include "../rgbcxDecoders.h" #include "../util.h" @@ -116,9 +117,7 @@ class image_u8 { inline const color_quad_u8_vec &get_pixels() const { return m_pixels; } inline color_quad_u8_vec &get_pixels() { return m_pixels; } - void set_pixels(const color_quad_u8_vec &pixels) { - m_pixels = pixels; - } + void set_pixels(const color_quad_u8_vec &pixels) { m_pixels = pixels; } inline uint32_t width() const { return m_width; } inline uint32_t height() const { return m_height; } @@ -266,7 +265,7 @@ class image_metrics { const color_quad_u8 &cb = b(x, y); if (!num_channels) { -// int luma_diff = ; + // int luma_diff = ; unsigned index = iabs(ca.get_luma() - cb.get_luma()); hist[index]++; } else { @@ -660,67 +659,75 @@ int main(int argc, char *argv[]) { uint32_t bc7_mode_hist[8]; memset(bc7_mode_hist, 0, sizeof(bc7_mode_hist)); - for (uint32_t by = 0; by < blocks_y; by++) { - for (uint32_t bx = 0; bx < blocks_x; bx++) { - color_quad_u8 pixels[16]; + if (dxgi_format == DXGI_FORMAT_BC4_UNORM) { + auto bc4_encoder = BC4Encoder(bc45_channel0); + Color *src = &source_image.get_pixels()[0]; - source_image.get_block(bx, by, 4, 4, pixels); - if (!has_alpha) { - for (uint32_t i = 0; i < 16; i++) { - if (pixels[i][3] < 255) { - has_alpha = true; + bc4_encoder.EncodeImage(reinterpret_cast(&packed_image8[0]), src, source_image.width(), source_image.height()); + + } else { + for (uint32_t 
by = 0; by < blocks_y; by++) { + for (uint32_t bx = 0; bx < blocks_x; bx++) { + color_quad_u8 pixels[16]; + + source_image.get_block(bx, by, 4, 4, pixels); + if (!has_alpha) { + for (uint32_t i = 0; i < 16; i++) { + if (pixels[i][3] < 255) { + has_alpha = true; + break; + } + } + } + + switch (dxgi_format) { + case DXGI_FORMAT_BC1_UNORM: { + block8 *pBlock = &packed_image8[bx + by * blocks_x]; + + rgbcx::encode_bc1(bc1_quality_level, pBlock, &pixels[0][0], use_bc1_3color_mode, use_bc1_3color_mode_for_black); + break; + } + case DXGI_FORMAT_BC3_UNORM: { + BC3Block *pBlock = reinterpret_cast(&packed_image16[bx + by * blocks_x]); + + rgbcx::encode_bc3(bc1_quality_level, pBlock, &pixels[0][0]); + break; + } + case DXGI_FORMAT_BC4_UNORM: { + block8 *pBlock = &packed_image8[bx + by * blocks_x]; + + rgbcx::encode_bc4(pBlock, &pixels[0][bc45_channel0], 4); + break; + } + case DXGI_FORMAT_BC5_UNORM: { + block16 *pBlock = &packed_image16[bx + by * blocks_x]; + + rgbcx::encode_bc5(reinterpret_cast(pBlock), &pixels[0][0], bc45_channel0, bc45_channel1, 4); + break; + } + case DXGI_FORMAT_BC7_UNORM: { + block16 *pBlock = &packed_image16[bx + by * blocks_x]; + + bc7enc_compress_block(pBlock, pixels, &pack_params); + + uint32_t mode = ((uint8_t *)pBlock)[0]; + for (uint32_t m = 0; m <= 7; m++) { + if (mode & (1 << m)) { + bc7_mode_hist[m]++; + break; + } + } + break; + } + default: { + assert(0); break; } } } - switch (dxgi_format) { - case DXGI_FORMAT_BC1_UNORM: { - block8 *pBlock = &packed_image8[bx + by * blocks_x]; - - rgbcx::encode_bc1(bc1_quality_level, pBlock, &pixels[0][0], use_bc1_3color_mode, use_bc1_3color_mode_for_black); - break; - } - case DXGI_FORMAT_BC3_UNORM: { - BC3Block *pBlock = reinterpret_cast(&packed_image16[bx + by * blocks_x]); - - rgbcx::encode_bc3(bc1_quality_level, pBlock, &pixels[0][0]); - break; - } - case DXGI_FORMAT_BC4_UNORM: { - block8 *pBlock = &packed_image8[bx + by * blocks_x]; - - rgbcx::encode_bc4(pBlock, &pixels[0][bc45_channel0], 4); - 
break; - } - case DXGI_FORMAT_BC5_UNORM: { - block16 *pBlock = &packed_image16[bx + by * blocks_x]; - - rgbcx::encode_bc5(reinterpret_cast(pBlock), &pixels[0][0], bc45_channel0, bc45_channel1, 4); - break; - } - case DXGI_FORMAT_BC7_UNORM: { - block16 *pBlock = &packed_image16[bx + by * blocks_x]; - - bc7enc_compress_block(pBlock, pixels, &pack_params); - - uint32_t mode = ((uint8_t *)pBlock)[0]; - for (uint32_t m = 0; m <= 7; m++) { - if (mode & (1 << m)) { - bc7_mode_hist[m]++; - break; - } - } - break; - } - default: { - assert(0); - break; - } - } + if ((by & 127) == 0) printf("."); } - - if ((by & 127) == 0) printf("."); } clock_t end_t = clock(); @@ -749,23 +756,22 @@ int main(int argc, char *argv[]) { bool punchthrough_flag = false; auto decoder_bc1 = rgbcx::BC1Decoder(); auto decoder_bc3 = rgbcx::BC3Decoder(); - auto decoder_bc4 = rgbcx::BC4Decoder(); + auto decoder_bc4 = rgbcx::BC4Decoder(bc45_channel0); auto decoder_bc5 = rgbcx::BC5Decoder(); + Color *dest = &unpacked_image.get_pixels()[0]; switch (dxgi_format) { case DXGI_FORMAT_BC1_UNORM: - unpacked_image.set_pixels(decoder_bc1.DecodeImage(reinterpret_cast(&packed_image8[0]), source_image.width(), source_image.height())); + decoder_bc1.DecodeImage(reinterpret_cast(&packed_image8[0]), dest, source_image.width(), source_image.height()); break; case DXGI_FORMAT_BC3_UNORM: - unpacked_image.set_pixels( - decoder_bc3.DecodeImage(reinterpret_cast(&packed_image16[0]), source_image.width(), source_image.height())); + decoder_bc3.DecodeImage(reinterpret_cast(&packed_image16[0]), dest, source_image.width(), source_image.height()); break; case DXGI_FORMAT_BC4_UNORM: - unpacked_image.set_pixels(decoder_bc4.DecodeImage(reinterpret_cast(&packed_image8[0]), source_image.width(), source_image.height())); + decoder_bc4.DecodeImage(reinterpret_cast(&packed_image8[0]), dest, source_image.width(), source_image.height()); break; case DXGI_FORMAT_BC5_UNORM: - unpacked_image.set_pixels( - 
decoder_bc5.DecodeImage(reinterpret_cast(&packed_image16[0]), source_image.width(), source_image.height())); + decoder_bc5.DecodeImage(reinterpret_cast(&packed_image16[0]), dest, source_image.width(), source_image.height()); break; default: assert(0); @@ -777,7 +783,7 @@ int main(int argc, char *argv[]) { // void *pBlock = (bytes_per_block == 16) ? (void *)&packed_image16[bx + by * blocks_x] : (void *)&packed_image8[bx + by * blocks_x]; // // color_quad_u8 unpacked_pixels[16]; - // for (uint32_t i = 0; i < 16; i++) unpacked_pixels[i].set(0, 0, 0, 255); + // for (uint32_t i = 0; i < 16; i++) unpacked_pixels[i].Set(0, 0, 0, 255); // // switch (dxgi_format) { // case DXGI_FORMAT_BC1_UNORM: @@ -806,7 +812,6 @@ int main(int argc, char *argv[]) { clock_t end_decode_t = clock(); printf("\nDecode time: %f secs\n", (double)(end_decode_t - start_decode_t) / CLOCKS_PER_SEC); - if ((punchthrough_flag) && (dxgi_format == DXGI_FORMAT_BC3_UNORM)) fprintf(stderr, "Warning: BC3 mode selected, but rgbcx::unpack_bc3() returned one or more blocks using 3-color mode!\n"); @@ -849,7 +854,8 @@ int main(int argc, char *argv[]) { for (uint32_t y = 0; y < unpacked_image_alpha.height(); y++) for (uint32_t x = 0; x < unpacked_image_alpha.width(); x++) { uint8_t alpha = unpacked_image_alpha(x, y).a; - unpacked_image_alpha(x, y).SetRGBA(alpha, alpha, alpha, 255); } + unpacked_image_alpha(x, y).SetRGBA(alpha, alpha, alpha, 255); + } if (!save_png(png_alpha_output_filename.c_str(), unpacked_image_alpha, false)) failed = true;