BC4 encoding

2024-09-13 06:37:34 +00:00 · 2021-02-16 09:00:12 -08:00 · 2021-02-16 09:00:12 -08:00 · 460785ee7d
commit 460785ee7d
parent b42c52030d
9 changed files with 230 additions and 97 deletions
--- a/src/BC1/BC1Decoder.cpp
+++ b/src/BC1/BC1Decoder.cpp
@ -44,9 +44,9 @@ void BC1Decoder::DecodeBlock(Color4x4 dest, BC1Block *const block) const noexcep
            assert(selector < 4);
            assert((color.a == 0 && selector == 3 && l <= h) || color.a == UINT8_MAX);
            if (_write_alpha) {
-                dest.get(x, y).SetRGBA(color);
+                dest.Get(x, y).SetRGBA(color);
            } else {
-                dest.get(x, y).SetRGB(color);
+                dest.Get(x, y).SetRGB(color);
            }
        }
    }
--- a/src/BC4/BC4Decoder.cpp
+++ b/src/BC4/BC4Decoder.cpp
@ -39,7 +39,7 @@ void rgbcx::BC4Decoder::DecodeBlock(Byte4x4 dest, BC4Block *const block) const n
        for (unsigned x = 0; x < 4; x++) {
            const auto selector = selectors[y][x];
            assert(selector < 8);
-            dest.set(x, y, values[selector]);
+            dest.Set(x, y, values[selector]);
        }
    }
 }
--- a/src/BC4/BC4Decoder.h
+++ b/src/BC4/BC4Decoder.h
@ -29,7 +29,6 @@
 namespace rgbcx {
 class BC4Decoder : public BlockDecoder<BC4Block, 4, 4> {
   public:
-    using Byte4x4 = BlockView<uint8_t, 4, 4>;
    BC4Decoder(uint8_t channel = 3) : _channel(channel) { assert(channel < 4U); }

    void DecodeBlock(Color4x4 dest, BC4Block *const block) const noexcept(ndebug) override { DecodeBlock(dest.GetChannel(_channel), block); }
--- a/src/BC4/BC4Encoder.cpp
+++ b/src/BC4/BC4Encoder.cpp
@ -19,4 +19,89 @@

 #include "BC4Encoder.h"

-namespace rgbcx {}  // namespace rgbcx
+#include <algorithm>
+
+namespace rgbcx {
+// Encodes 16 single-channel values as one BC4 block: the block's smallest and largest values become the two endpoints
+// and every pixel is packed as a 3-bit selector.
+void BC4Encoder::EncodeBlock(Byte4x4 pixels, BC4Block *const dest) const noexcept(ndebug) {
+    // Flattened copy of the block, indexed as x + 4 * y.
+    const auto values = pixels.Flatten();
+    const auto extremes = std::minmax_element(values.begin(), values.end());
+
+    const uint8_t lowest = *extremes.first;
+    const uint8_t highest = *extremes.second;
+
+    dest->high_alpha = lowest;
+    dest->low_alpha = highest;
+
+    // A flat block has nothing to interpolate, so all selector bits are left at zero.
+    if (highest == lowest) {
+        dest->SetSelectorBits(0);
+        return;
+    }
+
+    const int range = highest - lowest;
+
+    // Work in a space scaled by 14 (two x7 scale factors added together) with the minimum moved to 0.
+    // The thresholds between neighbouring values are then the odd multiples of the range.
+    const int thresholds[7] = {range * 13, range * 11, range * 9, range * 7, range * 5, range * 3, range * 1};
+
+    // BC4 floors in its divisions, which we compensate for with the 4 bias.
+    // This is optimal for all possible inputs (i.e. it gives the same results as checking all 8 values and choosing the closest one).
+    const int bias = 4 - lowest * 14;
+
+    // Indexed by how many thresholds a scaled value reaches (0..7); yields the selector to store for that pixel.
+    static constexpr uint32_t kSelectorForSteps[8] = {1U, 7U, 6U, 5U, 4U, 3U, 2U, 0U};
+
+    uint64_t selector_bits = 0;
+    for (unsigned i = 0; i < 16U; i++) {
+        const int scaled = values[i] * 14 + bias;
+
+        unsigned steps = 0;
+        for (const int threshold : thresholds) {
+            if (scaled >= threshold) { steps++; }
+        }
+
+        // Pixel i occupies bits [3 * i, 3 * i + 2] of the selector word.
+        selector_bits |= static_cast<uint64_t>(kSelectorForSteps[steps]) << (3U * i);
+    }
+
+    dest->SetSelectorBits(selector_bits);
+}
+}  // namespace rgbcx
--- a/src/BC4/BC4Encoder.h
+++ b/src/BC4/BC4Encoder.h
@ -19,7 +19,19 @@

 #pragma once

+#include "../BlockEncoder.h"
+#include "BC4Block.h"
 namespace rgbcx {

-class BC4Encoder {};
+// Encodes one channel of a 4x4 pixel block into a BC4Block.
+class BC4Encoder : public BlockEncoder<BC4Block, 4, 4> {
+   public:
+    // `channel` is the Color4x4 channel encoded through the BlockEncoder interface; it must be below 4.
+    BC4Encoder(const uint8_t channel) : _channel(channel) { assert(channel < 4); }
+
+    // BlockEncoder interface: encodes the channel chosen at construction.
+    void EncodeBlock(Color4x4 pixels, BC4Block *dest) const override { EncodeBlock(pixels.GetChannel(_channel), dest); }
+
+    // Encodes an explicitly chosen channel instead of the one stored at construction.
+    void EncodeBlock(Color4x4 pixels, BC4Block *const dest, uint8_t channel) const noexcept(ndebug) {
+        assert(channel < 4);  // same bound the constructor enforces on _channel
+        EncodeBlock(pixels.GetChannel(channel), dest);
+    }
+
+    // Encodes a 4x4 view of single-byte values directly.
+    void EncodeBlock(Byte4x4 pixels, BC4Block *const dest) const noexcept(ndebug);
+
+   private:
+    const uint8_t _channel;
+};
 }  // namespace rgbcx
--- a/src/BlockDecoder.h
+++ b/src/BlockDecoder.h
@ -40,13 +40,14 @@ template <class B, size_t M, size_t N> class BlockDecoder {

    virtual void DecodeBlock(DecodedBlock dest, EncodedBlock *const block) const noexcept(ndebug) = 0;

-    std::vector<Color> DecodeImage(uint8_t *bytes, unsigned image_width, unsigned image_height) {
-        unsigned block_width = maximum(1U, ((image_width + 3) / 4));
-        unsigned block_height = maximum(1U, ((image_height + 3) / 4));
-        using Row = typename DecodedBlock::Row;
+    void DecodeImage(uint8_t *encoded, Color *decoded, unsigned image_width, unsigned image_height) {
+        assert(image_width % N == 0);
+        assert(image_width % M == 0);

-        auto image = std::vector<Color>(block_width * block_height * N * M);
-        auto blocks = reinterpret_cast<B *>(bytes);
+        unsigned block_width = image_width / N;
+        unsigned block_height = image_height / M;
+
+        auto blocks = reinterpret_cast<B *>(encoded);

        // from experimentation, multithreading this using OpenMP actually makes decoding slower
        // due to thread creation/teardown taking longer than the decoding process itself.
@ -62,16 +63,11 @@ template <class B, size_t M, size_t N> class BlockDecoder {
                assert(pixel_x + N <= image_width);

                unsigned top_left = pixel_x + (pixel_y * image_width);
-                auto rows = std::array<Row *, M>();
-                for (unsigned i = 0; i < M; i++) { rows[i] = reinterpret_cast<Row *>(&image[top_left + i * image_width]); }
-
-                auto dest = DecodedBlock(&image[top_left], image_width);
+                auto dest = DecodedBlock(&decoded[top_left], image_width);

                DecodeBlock(dest, &blocks[x + block_width * y]);
            }
        }
-
-        return image;
    }
 };
 }  // namespace rgbcx
--- a/src/BlockEncoder.h
+++ b/src/BlockEncoder.h
@ -27,8 +27,42 @@ namespace rgbcx {

 template <class B, size_t M, size_t N> class BlockEncoder {
   public:
-    using DecodedBlock = BlockView<M, N>;
+    using DecodedBlock = ColorBlockView<M, N>;
    using EncodedBlock = B;
-    virtual void EncodeBlock(EncodedBlock *dest, DecodedBlock *const pixels) const = 0;
+
+    BlockEncoder() noexcept = default;
+    virtual ~BlockEncoder() noexcept = default;
+
+    virtual void EncodeBlock(DecodedBlock pixels, EncodedBlock *dest) const = 0;
+
+    // Encodes a whole image, one EncodeBlock call per tile (N pixels wide, M pixels tall), storing blocks in row-major order.
+    void EncodeImage(uint8_t *encoded, Color *decoded, unsigned image_width, unsigned image_height) {
+        // Only whole blocks are handled, so each dimension must be a multiple of the matching block dimension.
+        assert(image_width % N == 0);
+        assert(image_height % M == 0);
+
+        const unsigned block_width = image_width / N;
+        const unsigned block_height = image_height / M;
+        auto blocks = reinterpret_cast<B *>(encoded);
+
+        // Kept serial like the decoder's image loop, where OpenMP was measured to be slower because thread
+        // creation/teardown outweighed the work. Blocks are independent, so this is embarrassingly parallelizable.
+        // NOTE(review): that measurement was taken for decoding; confirm it also holds for encoding.
+        for (unsigned y = 0; y < block_height; y++) {
+            for (unsigned x = 0; x < block_width; x++) {
+                unsigned pixel_x = x * N;
+                unsigned pixel_y = y * M;
+
+                // Coordinates are unsigned, so only the upper bounds can be violated.
+                assert(pixel_y + M <= image_height);
+                assert(pixel_x + N <= image_width);
+
+                unsigned top_left = pixel_x + (pixel_y * image_width);
+                auto src = DecodedBlock(&decoded[top_left], image_width);
+
+                EncodeBlock(src, &blocks[x + block_width * y]);
+            }
+        }
+    }
 };
 }  // namespace rgbcx
--- a/src/BlockView.h
+++ b/src/BlockView.h
@ -42,7 +42,7 @@ template <typename S, size_t N> class RowView {
        return start[index * pixel_stride];
    }

-    constexpr int size() noexcept { return N; }
+    constexpr int Size() noexcept { return N; }

    S *const start;
    const int pixel_stride;
@ -59,29 +59,29 @@ template <typename S, size_t M, size_t N> class BlockView {
        return RowView<S, N>(&start[row_stride * (int)index], pixel_stride);
    }

-    constexpr int width() noexcept { return N; }
-    constexpr int height() noexcept { return M; }
-    constexpr int size() noexcept { return N * M; }
+    constexpr int Width() noexcept { return N; }
+    constexpr int Height() noexcept { return M; }
+    constexpr int Size() noexcept { return N * M; }

-    constexpr S &get(unsigned x, unsigned y) noexcept(ndebug) {
+    constexpr S &Get(unsigned x, unsigned y) noexcept(ndebug) {
        assert(x < N);
        assert(y < M);
        return start[(row_stride * (int)y) + (pixel_stride * (int)x)];
    }

-    constexpr S get(unsigned x, unsigned y) const noexcept(ndebug) {
+    constexpr S Get(unsigned x, unsigned y) const noexcept(ndebug) {
        assert(x < N);
        assert(y < M);
        return start[(row_stride * (int)y) + (pixel_stride * (int)x)];
    }

-    constexpr void set(unsigned x, unsigned y, S value) noexcept(ndebug) {
+    constexpr void Set(unsigned x, unsigned y, S value) noexcept(ndebug) {
        assert(x < N);
        assert(y < M);
        start[(row_stride * (int)y) + (pixel_stride * (int)x)] = value;
    }

-    constexpr std::array<S, M * N> flatten() noexcept {
+    constexpr std::array<S, M * N> Flatten() noexcept {
        std::array<S, M * N> result;
        for (int x = 0; x < N; x++) {
            for (int y = 0; y < M; y++) { result[x + (N * y)] = start[(row_stride * y) + (pixel_stride * x)]; }
@ -107,9 +107,10 @@ template <size_t M, size_t N> class ColorBlockView : public BlockView<Color, M,
        return ChannelView(channelStart, Base::row_stride * 4, Base::pixel_stride * 4);
    }

-    void SetRGB(unsigned x, unsigned y, Color value) noexcept(ndebug) { Base::get(x, y).SetRGB(value); }
+    void SetRGB(unsigned x, unsigned y, Color value) noexcept(ndebug) { Base::Get(x, y).SetRGB(value); }
 };

 using Color4x4 = ColorBlockView<4, 4>;
+using Byte4x4 = BlockView<uint8_t, 4, 4>;

 }  // namespace rgbcx
--- a/src/test/test.cpp
+++ b/src/test/test.cpp
@ -17,6 +17,7 @@
 #include <type_traits>
 #include <vector>

+#include "../BC4/BC4Encoder.h"
 #include "../rgbcx.h"
 #include "../rgbcxDecoders.h"
 #include "../util.h"
@ -116,9 +117,7 @@ class image_u8 {
    inline const color_quad_u8_vec &get_pixels() const { return m_pixels; }
    inline color_quad_u8_vec &get_pixels() { return m_pixels; }

-    void set_pixels(const color_quad_u8_vec &pixels) {
-        m_pixels = pixels;
-    }
+    void set_pixels(const color_quad_u8_vec &pixels) { m_pixels = pixels; }

    inline uint32_t width() const { return m_width; }
    inline uint32_t height() const { return m_height; }
@ -266,7 +265,7 @@ class image_metrics {
                const color_quad_u8 &cb = b(x, y);

                if (!num_channels) {
-//                    int luma_diff = ;
+                    //                    int luma_diff = ;
                    unsigned index = iabs(ca.get_luma() - cb.get_luma());
                    hist[index]++;
                } else {
@ -660,67 +659,75 @@ int main(int argc, char *argv[]) {
    uint32_t bc7_mode_hist[8];
    memset(bc7_mode_hist, 0, sizeof(bc7_mode_hist));

-    for (uint32_t by = 0; by < blocks_y; by++) {
-        for (uint32_t bx = 0; bx < blocks_x; bx++) {
-            color_quad_u8 pixels[16];
+    if (dxgi_format == DXGI_FORMAT_BC4_UNORM) {
+        auto bc4_encoder = BC4Encoder(bc45_channel0);
+        Color *src = &source_image.get_pixels()[0];

-            source_image.get_block(bx, by, 4, 4, pixels);
-            if (!has_alpha) {
-                for (uint32_t i = 0; i < 16; i++) {
-                    if (pixels[i][3] < 255) {
-                        has_alpha = true;
+        bc4_encoder.EncodeImage(reinterpret_cast<uint8_t *>(&packed_image8[0]), src, source_image.width(), source_image.height());
+
+    } else {
+        for (uint32_t by = 0; by < blocks_y; by++) {
+            for (uint32_t bx = 0; bx < blocks_x; bx++) {
+                color_quad_u8 pixels[16];
+
+                source_image.get_block(bx, by, 4, 4, pixels);
+                if (!has_alpha) {
+                    for (uint32_t i = 0; i < 16; i++) {
+                        if (pixels[i][3] < 255) {
+                            has_alpha = true;
+                            break;
+                        }
+                    }
+                }
+
+                switch (dxgi_format) {
+                    case DXGI_FORMAT_BC1_UNORM: {
+                        block8 *pBlock = &packed_image8[bx + by * blocks_x];
+
+                        rgbcx::encode_bc1(bc1_quality_level, pBlock, &pixels[0][0], use_bc1_3color_mode, use_bc1_3color_mode_for_black);
+                        break;
+                    }
+                    case DXGI_FORMAT_BC3_UNORM: {
+                        BC3Block *pBlock = reinterpret_cast<BC3Block *>(&packed_image16[bx + by * blocks_x]);
+
+                        rgbcx::encode_bc3(bc1_quality_level, pBlock, &pixels[0][0]);
+                        break;
+                    }
+                    case DXGI_FORMAT_BC4_UNORM: {
+                        block8 *pBlock = &packed_image8[bx + by * blocks_x];
+
+                        rgbcx::encode_bc4(pBlock, &pixels[0][bc45_channel0], 4);
+                        break;
+                    }
+                    case DXGI_FORMAT_BC5_UNORM: {
+                        block16 *pBlock = &packed_image16[bx + by * blocks_x];
+
+                        rgbcx::encode_bc5(reinterpret_cast<BC5Block *>(pBlock), &pixels[0][0], bc45_channel0, bc45_channel1, 4);
+                        break;
+                    }
+                    case DXGI_FORMAT_BC7_UNORM: {
+                        block16 *pBlock = &packed_image16[bx + by * blocks_x];
+
+                        bc7enc_compress_block(pBlock, pixels, &pack_params);
+
+                        uint32_t mode = ((uint8_t *)pBlock)[0];
+                        for (uint32_t m = 0; m <= 7; m++) {
+                            if (mode & (1 << m)) {
+                                bc7_mode_hist[m]++;
+                                break;
+                            }
+                        }
+                        break;
+                    }
+                    default: {
+                        assert(0);
                        break;
                    }
                }
            }

-            switch (dxgi_format) {
-                case DXGI_FORMAT_BC1_UNORM: {
-                    block8 *pBlock = &packed_image8[bx + by * blocks_x];
-
-                    rgbcx::encode_bc1(bc1_quality_level, pBlock, &pixels[0][0], use_bc1_3color_mode, use_bc1_3color_mode_for_black);
-                    break;
-                }
-                case DXGI_FORMAT_BC3_UNORM: {
-                    BC3Block *pBlock = reinterpret_cast<BC3Block *>(&packed_image16[bx + by * blocks_x]);
-
-                    rgbcx::encode_bc3(bc1_quality_level, pBlock, &pixels[0][0]);
-                    break;
-                }
-                case DXGI_FORMAT_BC4_UNORM: {
-                    block8 *pBlock = &packed_image8[bx + by * blocks_x];
-
-                    rgbcx::encode_bc4(pBlock, &pixels[0][bc45_channel0], 4);
-                    break;
-                }
-                case DXGI_FORMAT_BC5_UNORM: {
-                    block16 *pBlock = &packed_image16[bx + by * blocks_x];
-
-                    rgbcx::encode_bc5(reinterpret_cast<BC5Block *>(pBlock), &pixels[0][0], bc45_channel0, bc45_channel1, 4);
-                    break;
-                }
-                case DXGI_FORMAT_BC7_UNORM: {
-                    block16 *pBlock = &packed_image16[bx + by * blocks_x];
-
-                    bc7enc_compress_block(pBlock, pixels, &pack_params);
-
-                    uint32_t mode = ((uint8_t *)pBlock)[0];
-                    for (uint32_t m = 0; m <= 7; m++) {
-                        if (mode & (1 << m)) {
-                            bc7_mode_hist[m]++;
-                            break;
-                        }
-                    }
-                    break;
-                }
-                default: {
-                    assert(0);
-                    break;
-                }
-            }
+            if ((by & 127) == 0) printf(".");
        }
-
-        if ((by & 127) == 0) printf(".");
    }

    clock_t end_t = clock();
@ -749,23 +756,22 @@ int main(int argc, char *argv[]) {
        bool punchthrough_flag = false;
        auto decoder_bc1 = rgbcx::BC1Decoder();
        auto decoder_bc3 = rgbcx::BC3Decoder();
-        auto decoder_bc4 = rgbcx::BC4Decoder();
+        auto decoder_bc4 = rgbcx::BC4Decoder(bc45_channel0);
        auto decoder_bc5 = rgbcx::BC5Decoder();
+        Color *dest = &unpacked_image.get_pixels()[0];

        switch (dxgi_format) {
            case DXGI_FORMAT_BC1_UNORM:
-                unpacked_image.set_pixels(decoder_bc1.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image8[0]), source_image.width(), source_image.height()));
+                decoder_bc1.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image8[0]), dest, source_image.width(), source_image.height());
                break;
            case DXGI_FORMAT_BC3_UNORM:
-                unpacked_image.set_pixels(
-                    decoder_bc3.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image16[0]), source_image.width(), source_image.height()));
+                decoder_bc3.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image16[0]), dest, source_image.width(), source_image.height());
                break;
            case DXGI_FORMAT_BC4_UNORM:
-                unpacked_image.set_pixels(decoder_bc4.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image8[0]), source_image.width(), source_image.height()));
+                decoder_bc4.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image8[0]), dest, source_image.width(), source_image.height());
                break;
            case DXGI_FORMAT_BC5_UNORM:
-                unpacked_image.set_pixels(
-                    decoder_bc5.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image16[0]), source_image.width(), source_image.height()));
+                decoder_bc5.DecodeImage(reinterpret_cast<uint8_t *>(&packed_image16[0]), dest, source_image.width(), source_image.height());
                break;
            default:
                assert(0);
@ -777,7 +783,7 @@ int main(int argc, char *argv[]) {
        //                void *pBlock = (bytes_per_block == 16) ? (void *)&packed_image16[bx + by * blocks_x] : (void *)&packed_image8[bx + by * blocks_x];
        //
        //                color_quad_u8 unpacked_pixels[16];
-        //                for (uint32_t i = 0; i < 16; i++) unpacked_pixels[i].set(0, 0, 0, 255);
+        //                for (uint32_t i = 0; i < 16; i++) unpacked_pixels[i].Set(0, 0, 0, 255);
        //
        //                switch (dxgi_format) {
        //                    case DXGI_FORMAT_BC1_UNORM:
@ -806,7 +812,6 @@ int main(int argc, char *argv[]) {
        clock_t end_decode_t = clock();
        printf("\nDecode time: %f secs\n", (double)(end_decode_t - start_decode_t) / CLOCKS_PER_SEC);

-
        if ((punchthrough_flag) && (dxgi_format == DXGI_FORMAT_BC3_UNORM))
            fprintf(stderr, "Warning: BC3 mode selected, but rgbcx::unpack_bc3() returned one or more blocks using 3-color mode!\n");

@ -849,7 +854,8 @@ int main(int argc, char *argv[]) {
            for (uint32_t y = 0; y < unpacked_image_alpha.height(); y++)
                for (uint32_t x = 0; x < unpacked_image_alpha.width(); x++) {
                    uint8_t alpha = unpacked_image_alpha(x, y).a;
-                    unpacked_image_alpha(x, y).SetRGBA(alpha, alpha, alpha, 255); }
+                    unpacked_image_alpha(x, y).SetRGBA(alpha, alpha, alpha, 255);
+                }

            if (!save_png(png_alpha_output_filename.c_str(), unpacked_image_alpha, false))
                failed = true;