nvidia-texture-tools/src/nvtt/CompressorDXT5_RGBM.cpp

#include "CompressorDXT5_RGBM.h"
#include "CompressorDXT1.h"

#include "OptimalCompressDXT.h"
#include "QuickCompressDXT.h"
#include "CompressorETC.h"

#include "nvimage/ColorBlock.h"
#include "nvimage/BlockDXT.h"

#include "nvmath/Color.inl"
#include "nvmath/Vector.inl"
#include "nvmath/Fitting.h"
#include "nvmath/ftoi.h"

#include "nvthread/Atomic.h"
#include <stdio.h>

using namespace nv;


// Converts a 4x4 block of colors to RGBM form.
//
// For each texel the saturated R, G, B channels are divided by a multiplier M, where M is the largest
// of R, G, B and min_m. The alpha channel stores M remapped so that min_m maps to 0 and 1 maps to 1.
//
//  input_colors   16 source colors; only x, y and z are read.
//  input_weights  16 per-texel weights.
//  min_m          lower bound for the multiplier M.
//  rgbm_colors    receives the 16 RGBM colors (r/M, g/M, b/M, remapped M).
//  rgb_weights    receives the weights for the RGB fit: input_weights[i] * M, or all ones when the
//                 input weights sum to exactly zero.
static void convert_to_rgbm(const Vector4 input_colors[16], const float input_weights[16], float min_m, Vector4 rgbm_colors[16], float rgb_weights[16]) {

    float weight_sum = 0;

    for (uint i = 0; i < 16; i++) {
        const Vector4 & c = input_colors[i];

        float R = saturate(c.x);
        float G = saturate(c.y);
        float B = saturate(c.z);

        float M = max(max(R, G), max(B, min_m));

        // M is zero only when min_m <= 0 and the texel is black. Dividing then gives 0/0 = NaN, which
        // would poison the RGB fit, so emit black explicitly in that case.
        float r = 0;
        float g = 0;
        float b = 0;
        if (M > 0) {
            r = R / M;
            g = G / M;
            b = B / M;
        }
        float a = (M - min_m) / (1 - min_m);

        rgbm_colors[i] = Vector4(r, g, b, a);

        // Scale the weight by M: an error in the normalized color is multiplied by M on reconstruction.
        rgb_weights[i] = input_weights[i] * M;
        weight_sum += input_weights[i];
    }

    // With no usable weights, fall back to uniform weighting.
    if (weight_sum == 0) {
        for (uint i = 0; i < 16; i++) rgb_weights[i] = 1;
    }

}


//static uint atomic_counter = 0;

float nv::compress_dxt5_rgbm(const Vector4 input_colors[16], const float input_weights[16], float min_m, BlockDXT5 * output) {

    // Convert to RGBM.
    Vector4 input_colors_rgbm[16]; // @@ Write over input_colors?
    float rgb_weights[16];
    convert_to_rgbm(input_colors, input_weights, min_m, input_colors_rgbm, rgb_weights);

    // Compress RGB.
    compress_dxt1(input_colors_rgbm, rgb_weights, Vector3(1), /*three_color_mode=*/false, /*hq=*/false, &output->color);

    // Decompress RGB/M block.
    nv::ColorBlock RGB;
    output->color.decodeBlock(&RGB);

    // Compute M values to compensate for RGB's error.
    AlphaBlock4x4 M;
    for (int i = 0; i < 16; i++) {
        const Vector4 & c = input_colors[i];

        float R = saturate(c.x);
        float G = saturate(c.y);
        float B = saturate(c.z);

        float rm = RGB.color(i).r / 255.0f;
        float gm = RGB.color(i).g / 255.0f;
        float bm = RGB.color(i).b / 255.0f;

        // compute m such that m * (r/M, g/M, b/M) == RGB

        // Three equations, one unknown:
        //  m * r/M == R
        //  m * g/M == G
        //  m * b/M == B

        // Solve in the least squares sense!

        // m (rm gm bm) (rm gm bm)^T == (rm gm bm) (R G B)^T

        // m == dot(rgb, RGB) / dot(rgb, rgb)

        float m = dot(Vector3(rm, gm, bm), Vector3(R, G, B)) / dot(Vector3(rm, gm, bm), Vector3(rm, gm, bm));

        m = (m - min_m) / (1 - min_m);

#if 0
        // IC: This does indeed happen. What does that mean? The best choice of m is above the available range. If this happened too often it would make sense to scale m in
        // the pixel shader to allow for more accurate reconstruction. However, that scaling would reduce the precision over the [0-1] range. I haven't measured how much
        // error is introduced by the clamping vs. how much the error would change with the increased range.
        if (m > 1.0f) {
            uint counter = atomicIncrement(&atomic_counter);
            printf("It happens %u times!", counter);
        }
#endif

        M.alpha[i] = U8(ftoi_round(saturate(m) * 255.0f));
        M.weights[i] = input_weights[i];
    }

    // Compress M.
    //if (compressionOptions.quality == Quality_Fastest) {
    //    QuickCompress::compressDXT5A(M, &output->alpha);
    /*}
    else {*/
        OptimalCompress::compressDXT5A(M, &output->alpha);
    //}


#if 0   // Multiple iterations do not seem to help.
    // Decompress M.
    output->alpha.decodeBlock(&M);

    // Feed it back to the input RGB block.
    for (uint i = 0; i < 16; i++) {
        const Vector4 & c = input_colors[i];

        float R = saturate(c.x);
        float G = saturate(c.y);
        float B = saturate(c.z);

        float m = float(M.alpha[i]) / 255.0f * (1 - min_m) + min_m;

        float r = R / m;
        float g = G / m;
        float b = B / m;
        float a = float(M.alpha[i]) / 255.0f;

        input_colors_rgbm[i] = Vector4(r, g, b, a);
        rgb_weights[i] = input_weights[i] * m;
    }
#endif

    return 0; // @@
}


// Compresses a 4x4 block of colors into an ETC2 + EAC pair in RGBM form: the normalized RGB goes to
// the ETC2 color data and the multiplier M to the EAC data.
//
//  input_colors   16 source colors; only x, y and z are read.
//  input_weights  16 per-texel weights.
//  min_m          lower bound for M; the stored value is (m - min_m) / (1 - min_m), saturated.
//  output         destination buffer. EAC data is written at its start and ETC2 color data 8 bytes
//                 in (presumably 16 bytes in total; confirm against the caller).
//
// Returns 0; the compression error is not computed yet.
float nv::compress_etc2_rgbm(Vector4 input_colors[16], float input_weights[16], float min_m, void * output) {

    // Layout of the destination: M first, color second.
    void * const eac_output = output;
    void * const etc_output = static_cast<uint8 *>(output) + 8;

    // Build the RGBM representation and the weights for the color fit.
    Vector4 rgbm_colors[16];
    float rgb_weights[16];
    convert_to_rgbm(input_colors, input_weights, min_m, rgbm_colors, rgb_weights);

    // Encode the color part, then decode it again into rgbm_colors so that M can be fitted against
    // what the color data actually reproduces.
    compress_etc2(rgbm_colors, rgb_weights, Vector3(1), etc_output);
    decompress_etc(etc_output, rgbm_colors);

    // Pick, per texel, the multiplier that best compensates for the color error.
    for (uint texel = 0; texel < 16; texel++) {
        const Vector4 & source = input_colors[texel];
        Vector4 & encoded = rgbm_colors[texel];

        const Vector3 target(saturate(source.x), saturate(source.y), saturate(source.z));
        const Vector3 decoded(encoded.x, encoded.y, encoded.z);

        // We want m * decoded == target: three equations, one unknown. The least squares solution is
        //   m == dot(decoded, target) / dot(decoded, decoded)
        float m = dot(decoded, target) / dot(decoded, decoded);

        // A zero-length decoded color makes the ratio non-finite; use m = 1 in that case.
        m = isFinite(m) ? m : 1;

        // Remap so that min_m maps to 0 and 1 maps to 1.
        m = (m - min_m) / (1 - min_m);

        // The alpha channel carries M.
        encoded.w = saturate(m);     // @@ What if we don't saturate?
    }

    // Encode M from the alpha channel.
    compress_eac(rgbm_colors, input_weights, /*input_channel=*/3, /*search_radius=*/1, /*11bit_mode*/false, eac_output);

    return 0; // @@ Compute error.
}