diff --git a/data/testsuite/waterloo/baboon.png b/data/testsuite/waterloo/baboon.png new file mode 100644 index 0000000..2b1499a Binary files /dev/null and b/data/testsuite/waterloo/baboon.png differ diff --git a/extern/libsquish-1.15/squish.h b/extern/libsquish-1.15/squish.h new file mode 100644 index 0000000..14c9bb5 --- /dev/null +++ b/extern/libsquish-1.15/squish.h @@ -0,0 +1,309 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef SQUISH_H +#define SQUISH_H + +//! All squish API functions live in this namespace. +namespace squish { + +// ----------------------------------------------------------------------------- + +//! Typedef a quantity that is a single unsigned byte. +typedef unsigned char u8; + +// ----------------------------------------------------------------------------- + +enum +{ + //! Use DXT1 compression. + kDxt1 = ( 1 << 0 ), + + //! Use DXT3 compression. + kDxt3 = ( 1 << 1 ), + + //! Use DXT5 compression. + kDxt5 = ( 1 << 2 ), + + //! Use BC4 compression. + kBc4 = ( 1 << 3 ), + + //! Use BC5 compression. + kBc5 = ( 1 << 4 ), + + //! Use a slow but high quality colour compressor (the default). + kColourClusterFit = ( 1 << 5 ), + + //! Use a fast but low quality colour compressor. + kColourRangeFit = ( 1 << 6 ), + + //! Weight the colour by alpha during cluster fit (disabled by default). + kWeightColourByAlpha = ( 1 << 7 ), + + //! Use a very slow but very high quality colour compressor. + kColourIterativeClusterFit = ( 1 << 8 ), + + //! Source is BGRA rather than RGBA + kSourceBGRA = ( 1 << 9 ) +}; + +// ----------------------------------------------------------------------------- + +/*! @brief Compresses a 4x4 block of pixels. + + @param rgba The rgba values of the 16 source pixels. + @param mask The valid pixel mask. + @param block Storage for the compressed DXT block. + @param flags Compression flags. + @param metric An optional perceptual metric. + + The source pixels should be presented as a contiguous array of 16 rgba + values, with each component as 1 byte each. In memory this should be: + + { r1, g1, b1, a1, .... , r16, g16, b16, a16 } + + The mask parameter enables only certain pixels within the block. The lowest + bit enables the first pixel and so on up to the 16th bit. Bits beyond the + 16th bit are ignored. Pixels that are not enabled are allowed to take + arbitrary colours in the output block. An example of how this can be used + is in the CompressImage function to disable pixels outside the bounds of + the image when the width or height is not divisible by 4. + + The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression, + however, DXT1 will be used by default if none is specified. When using DXT1 + compression, 8 bytes of storage are required for the compressed DXT block. + DXT3 and DXT5 compression require 16 bytes of storage per block. + + The flags parameter can also specify a preferred colour compressor to use + when fitting the RGB components of the data. Possible colour compressors + are: kColourClusterFit (the default), kColourRangeFit (very fast, low + quality) or kColourIterativeClusterFit (slowest, best quality). + + When using kColourClusterFit or kColourIterativeClusterFit, an additional + flag can be specified to weight the importance of each pixel by its alpha + value. For images that are rendered using alpha blending, this can + significantly increase the perceived quality. + + The metric parameter can be used to weight the relative importance of each + colour channel, or pass NULL to use the default uniform weight of + { 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that + allowed either uniform or "perceptual" weights with the fixed values + { 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a + contiguous array of 3 floats. +*/ +void CompressMasked( u8 const* rgba, int mask, void* block, int flags, float* metric = 0 ); + +// ----------------------------------------------------------------------------- + +/*! @brief Compresses a 4x4 block of pixels. + + @param rgba The rgba values of the 16 source pixels. + @param block Storage for the compressed DXT block. + @param flags Compression flags. + @param metric An optional perceptual metric. + + The source pixels should be presented as a contiguous array of 16 rgba + values, with each component as 1 byte each. In memory this should be: + + { r1, g1, b1, a1, .... , r16, g16, b16, a16 } + + The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression, + however, DXT1 will be used by default if none is specified. When using DXT1 + compression, 8 bytes of storage are required for the compressed DXT block. + DXT3 and DXT5 compression require 16 bytes of storage per block. + + The flags parameter can also specify a preferred colour compressor to use + when fitting the RGB components of the data. Possible colour compressors + are: kColourClusterFit (the default), kColourRangeFit (very fast, low + quality) or kColourIterativeClusterFit (slowest, best quality). + + When using kColourClusterFit or kColourIterativeClusterFit, an additional + flag can be specified to weight the importance of each pixel by its alpha + value. For images that are rendered using alpha blending, this can + significantly increase the perceived quality. + + The metric parameter can be used to weight the relative importance of each + colour channel, or pass NULL to use the default uniform weight of + { 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that + allowed either uniform or "perceptual" weights with the fixed values + { 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a + contiguous array of 3 floats. + + This method is an inline that calls CompressMasked with a mask of 0xffff, + provided for compatibility with older versions of squish. +*/ +inline void Compress( u8 const* rgba, void* block, int flags, float* metric = 0 ) +{ + CompressMasked( rgba, 0xffff, block, flags, metric ); +} + +// ----------------------------------------------------------------------------- + +/*! @brief Decompresses a 4x4 block of pixels. + + @param rgba Storage for the 16 decompressed pixels. + @param block The compressed DXT block. + @param flags Compression flags. + + The decompressed pixels will be written as a contiguous array of 16 rgba + values, with each component as 1 byte each. In memory this is: + + { r1, g1, b1, a1, .... , r16, g16, b16, a16 } + + The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression, + however, DXT1 will be used by default if none is specified. All other flags + are ignored. +*/ +void Decompress( u8* rgba, void const* block, int flags ); + +// ----------------------------------------------------------------------------- + +/*! @brief Computes the amount of compressed storage required. + + @param width The width of the image. + @param height The height of the image. + @param flags Compression flags. + + The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression, + however, DXT1 will be used by default if none is specified. All other flags + are ignored. + + Most DXT images will be a multiple of 4 in each dimension, but this + function supports arbitrary size images by allowing the outer blocks to + be only partially used. +*/ +int GetStorageRequirements( int width, int height, int flags ); + +// ----------------------------------------------------------------------------- + +/*! @brief Compresses an image in memory. + + @param rgba The pixels of the source. + @param width The width of the source image. + @param height The height of the source image. + @param pitch The pitch of the source image. + @param blocks Storage for the compressed output. + @param flags Compression flags. + @param metric An optional perceptual metric. + + The source pixels should be presented as a contiguous array of width*height + rgba values, with each component as 1 byte each. In memory this should be: + + { r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height + + The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression, + however, DXT1 will be used by default if none is specified. When using DXT1 + compression, 8 bytes of storage are required for each compressed DXT block. + DXT3 and DXT5 compression require 16 bytes of storage per block. + + The flags parameter can also specify a preferred colour compressor to use + when fitting the RGB components of the data. Possible colour compressors + are: kColourClusterFit (the default), kColourRangeFit (very fast, low + quality) or kColourIterativeClusterFit (slowest, best quality). + + When using kColourClusterFit or kColourIterativeClusterFit, an additional + flag can be specified to weight the importance of each pixel by its alpha + value. For images that are rendered using alpha blending, this can + significantly increase the perceived quality. + + The metric parameter can be used to weight the relative importance of each + colour channel, or pass NULL to use the default uniform weight of + { 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that + allowed either uniform or "perceptual" weights with the fixed values + { 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a + contiguous array of 3 floats. + + Internally this function calls squish::CompressMasked for each block, which + allows for pixels outside the image to take arbitrary values. The function + squish::GetStorageRequirements can be called to compute the amount of memory + to allocate for the compressed output. + + Note on compression quality: When compressing textures with + libsquish it is recommended to apply a gamma-correction + beforehand. This will reduce the blockiness in dark areas. The + level of necessary gamma-correction is platform dependent. For + example, a gamma correction with gamma = 0.5 before compression + and gamma = 2.0 after decompression yields good results on the + Windows platform but for other platforms like MacOS X a different + gamma value may be more suitable. +*/ +void CompressImage( u8 const* rgba, int width, int height, int pitch, void* blocks, int flags, float* metric = 0 ); +void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags, float* metric = 0 ); + +// ----------------------------------------------------------------------------- + +/*! @brief Decompresses an image in memory. + + @param rgba Storage for the decompressed pixels. + @param width The width of the source image. + @param height The height of the source image. + @param pitch The pitch of the decompressed pixels. + @param blocks The compressed DXT blocks. + @param flags Compression flags. + + The decompressed pixels will be written as a contiguous array of width*height + 16 rgba values, with each component as 1 byte each. In memory this is: + + { r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height + + The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression, + however, DXT1 will be used by default if none is specified. All other flags + are ignored. + + Internally this function calls squish::Decompress for each block. +*/ +void DecompressImage( u8* rgba, int width, int height, int pitch, void const* blocks, int flags ); +void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags ); + +// ----------------------------------------------------------------------------- + +/*! @brief Computes MSE of an compressed image in memory. + + @param rgba The original image pixels. + @param width The width of the source image. + @param height The height of the source image. + @param pitch The pitch of the source image. + @param dxt The compressed dxt blocks + @param flags Compression flags. + @param colourMSE The MSE of the colour values. + @param alphaMSE The MSE of the alpha values. + + The colour MSE and alpha MSE are computed across all pixels. The colour MSE is + averaged across all rgb values (i.e. colourMSE = sum sum_k ||dxt.k - rgba.k||/3) + + The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression, + however, DXT1 will be used by default if none is specified. All other flags + are ignored. + + Internally this function calls squish::Decompress for each block. +*/ +void ComputeMSE(u8 const *rgba, int width, int height, int pitch, u8 const *dxt, int flags, double &colourMSE, double &alphaMSE); +void ComputeMSE(u8 const *rgba, int width, int height, u8 const *dxt, int flags, double &colourMSE, double &alphaMSE); + +// ----------------------------------------------------------------------------- + +} // namespace squish + +#endif // ndef SQUISH_H diff --git a/src/nvtt/tests/bc1enc.cpp b/src/nvtt/tests/bc1enc.cpp new file mode 100644 index 0000000..a48c2c1 --- /dev/null +++ b/src/nvtt/tests/bc1enc.cpp @@ -0,0 +1,694 @@ + +#define _CRT_SECURE_NO_WARNINGS +#include +#include + +//#define STBI_ASSERT(x) +#define STB_IMAGE_IMPLEMENTATION +#include "stb_image.h" + +#define STB_DXT_IMPLEMENTATION +#include "stb_dxt.h" + +#include "../extern/libsquish-1.15/squish.h" + +#include "../extern/CMP_Core/source/CMP_Core.h" + +#include "nvtt/CompressorDXT1.h" + +#include "nvmath/Vector.h" +#include "nvmath/Color.h" + +#include "nvcore/Timer.h" +#include "nvcore/Array.inl" + +using namespace nv; + +typedef unsigned char u8; +typedef unsigned int u32; + + +// Defer statement: +#define CONCAT_INTERNAL(x, y) x##y +#define CONCAT(x, y) CONCAT_INTERNAL(x, y) + +template +struct ExitScope +{ + T lambda; + ExitScope(T lambda) + : lambda(lambda) + { + } + ~ExitScope() { lambda(); } + +private: + ExitScope& operator=(const ExitScope&); +}; + +class ExitScopeHelp +{ +public: + template + ExitScope operator+(T t) { return t; } +}; + +#define defer const auto& __attribute__((unused)) CONCAT(defer__, __LINE__) = ExitScopeHelp() + [&]() + + +static float mse_to_psnr(float mse) { + float rms = sqrtf(mse); + float psnr = rms ? (float)clamp(log10(255.0 / rms) * 20.0, 0.0, 300.0) : 1e+10f; + return psnr; +} + +/* +void image_metrics::calc(const image &a, const image &b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error, bool use_601_luma) +{ + //assert((first_chan < 4U) && (first_chan + total_chans <= 4U)); + + const uint32_t width = std::min(a.get_width(), b.get_width()); + const uint32_t height = std::min(a.get_height(), b.get_height()); + + double hist[256]; + memset(hist, 0, sizeof(hist)); + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + const color_rgba &ca = a(x, y), &cb = b(x, y); + + for (uint32_t c = 0; c < 3; c++) + hist[iabs(ca[first_chan + c] - cb[first_chan + c])]++; + } + } + + m_max = 0; + double sum = 0.0f, sum2 = 0.0f; + for (uint32_t i = 0; i < 256; i++) + { + if (hist[i]) + { + m_max = std::max(m_max, (float)i); + double v = i * hist[i]; + sum += v; + sum2 += i * v; + } + } + + double total_values = (double)width * (double)height; + if (avg_comp_error) + total_values *= (double)clamp(total_chans, 1, 4); + + m_mean = (float)clamp(sum / total_values, 0.0f, 255.0); + m_mean_squared = (float)clamp(sum2 / total_values, 0.0f, 255.0 * 255.0); + m_rms = (float)sqrt(m_mean_squared); + m_psnr = m_rms ? (float)clamp(log10(255.0 / m_rms) * 20.0, 0.0f, 300.0f) : 1e+10f; +} +*/ + +// Returns mse. +float evaluate_dxt1_mse(uint8 * rgba, uint8 * block, int block_count, int decoder = 2) { + double total = 0.0f; + for (int b = 0; b < block_count; b++) { + total += nv::evaluate_dxt1_error(rgba, (BlockDXT1 *)block, decoder) / 255.0; + rgba += 4 * 4 * 4; + block += 8; + } + return float(total / (3 * 16 * block_count)); +} + +#define MAKEFOURCC(str) (uint(str[0]) | (uint(str[1]) << 8) | (uint(str[2]) << 16) | (uint(str[3]) << 24 )) + + +bool output_dxt_dds (u32 w, u32 h, const u8* data, const char * filename) { + + const u32 DDSD_CAPS = 0x00000001; + const u32 DDSD_PIXELFORMAT = 0x00001000; + const u32 DDSD_WIDTH = 0x00000004; + const u32 DDSD_HEIGHT = 0x00000002; + const u32 DDSD_LINEARSIZE = 0x00080000; + const u32 DDPF_FOURCC = 0x00000004; + const u32 DDSCAPS_TEXTURE = 0x00001000; + + struct DDS { + u32 fourcc = MAKEFOURCC("DDS "); + u32 size = 124; + u32 flags = DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_WIDTH|DDSD_HEIGHT|DDSD_LINEARSIZE; + u32 height; + u32 width; + u32 pitch; + u32 depth; + u32 mipmapcount; + u32 reserved [11]; + struct { + u32 size = 32; + u32 flags = DDPF_FOURCC; + u32 fourcc = MAKEFOURCC("DXT1"); + u32 bitcount; + u32 rmask; + u32 gmask; + u32 bmask; + u32 amask; + } pf; + struct { + u32 caps1 = DDSCAPS_TEXTURE; + u32 caps2; + u32 caps3; + u32 caps4; + } caps; + u32 notused; + } dds; + static_assert(sizeof(DDS) == 128, "DDS size must be 128"); + + dds.width = w; + dds.height = h; + dds.pitch = 8 * ((w+3)/4 * (h+3)/4); // linear size + + FILE * fp = fopen(filename, "wb"); + if (fp == nullptr) return false; + + // Write header: + fwrite(&dds, sizeof(dds), 1, fp); + + // Write dxt data: + fwrite(data, dds.pitch, 1, fp); + + fclose(fp); + + return true; +} + +const int COMPRESSOR_COUNT = 7; +struct Stats { + const char * compressorName; + Array mseArray; + Array timeArray; +}; + + +bool test_bc1(const char * inputFileName, int index, Stats * stats) { + + int w, h, n; + unsigned char *input_data = stbi_load(inputFileName, &w, &h, &n, 4); + defer { stbi_image_free(input_data); }; + + if (input_data == nullptr) { + printf("Failed to load input image '%s'.\n", inputFileName); + return false; + } + + + int block_count = (w / 4) * (h / 4); + u8 * rgba_block_data = (u8 *)malloc(block_count * 4 * 4 * 4); + defer { free(rgba_block_data); }; + + int bw = 4 * (w / 4); // Round down. + int bh = 4 * (h / 4); + + // Convert to block layout. + for (int y = 0, b = 0; y < bh; y += 4) { + for (int x = 0; x < bw; x += 4, b++) { + for (int yy = 0; yy < 4; yy++) { + for (int xx = 0; xx < 4; xx++) { + if (x + xx < w && y + yy < h) { + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 0] = input_data[((y + yy) * w + x + xx) * 4 + 0]; + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 1] = input_data[((y + yy) * w + x + xx) * 4 + 1]; + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 2] = input_data[((y + yy) * w + x + xx) * 4 + 2]; + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 3] = input_data[((y + yy) * w + x + xx) * 4 + 3]; + } + else { + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 0] = 0; + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 1] = 0; + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 2] = 0; + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 3] = 0; + } + } + } + } + } + + u8 * block_data = (u8 *)malloc(block_count * 8); + + Timer timer; + + // Warmup. + for (int b = 0; b < block_count; b++) { + stb_compress_dxt_block(block_data + b * 8, rgba_block_data + b * 4 * 4 * 4, 0, STB_DXT_NORMAL); + } + +#if _DEBUG + const int repeat_count = 1; +#else + const int repeat_count = 1; // 8 +#endif + + { + memset(block_data, 0, block_count * 8); + + timer.start(); + for (int i = 0; i < repeat_count; i++) { + for (int b = 0; b < block_count; b++) { + stb_compress_dxt_block(block_data + b * 8, rgba_block_data + b * 4 * 4 * 4, 0, STB_DXT_NORMAL); + } + } + timer.stop(); + + float mse = evaluate_dxt1_mse(rgba_block_data, block_data, block_count); + //printf("stb_dxt \t%f\t-> %f %f\n", timer.elapsed(), sqrt(mse), mse_to_psnr(mse)); + + //output_dxt_dds(bw, bh, block_data, "stb_dxt.dds"); + stats->compressorName = "stb"; + stats->mseArray[index] = mse; + stats->timeArray[index] = timer.elapsed(); + stats++; + } + + { + memset(block_data, 0, block_count * 8); + + timer.start(); + for (int i = 0; i < repeat_count; i++) { + for (int b = 0; b < block_count; b++) { + stb_compress_dxt_block(block_data + b * 8, rgba_block_data + b * 4 * 4 * 4, 0, STB_DXT_HIGHQUAL); + } + } + timer.stop(); + + float mse = evaluate_dxt1_mse(rgba_block_data, block_data, block_count); + //printf("stb_dxt hq \t%f\t-> %f %f\n", timer.elapsed(), sqrt(mse), mse_to_psnr(mse)); + + //output_dxt_dds(bw, bh, block_data, "stb_dxt_hq.dds"); + stats->compressorName = "stb-hq"; + stats->mseArray[index] = mse; + stats->timeArray[index] = timer.elapsed(); + stats++; + } + + { + memset(block_data, 0, block_count * 8); + Vector3 color_weights(1); + + timer.start(); + for (int i = 0; i < repeat_count; i++) { + for (int b = 0; b < block_count; b++) { + Vector4 input_colors[16]; + float input_weights[16]; + for (int j = 0; j < 16; j++) { + input_colors[j].x = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 0] / 255.0f; + input_colors[j].y = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 1] / 255.0f; + input_colors[j].z = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 2] / 255.0f; + input_colors[j].w = 255.0f; + input_weights[j] = 1.0f; + } + + compress_dxt1_fast(input_colors, input_weights, color_weights, (BlockDXT1*)(block_data + b * 8)); + } + } + timer.stop(); + + float mse = evaluate_dxt1_mse(rgba_block_data, block_data, block_count); + //printf("nvtt fast \t%f\t-> %f %f\n", timer.elapsed(), sqrt(mse), mse_to_psnr(mse)); + + //output_dxt_dds(bw, bh, block_data, "nvtt_fast.dds"); + stats->compressorName = "nvtt-fast"; + stats->mseArray[index] = mse; + stats->timeArray[index] = timer.elapsed(); + stats++; + } + + { + memset(block_data, 0, block_count * 8); + + timer.start(); + for (int i = 0; i < repeat_count; i++) { + for (int b = 0; b < block_count; b++) { + //compress_dxt1_fast2(rgba_block_data + b * 4 * 4 * 4, (BlockDXT1*)(block_data + b * 8)); + compress_dxt1_fast_geld(rgba_block_data + b * 4 * 4 * 4, (BlockDXT1*)(block_data + b * 8)); + } + } + timer.stop(); + + float mse = evaluate_dxt1_mse(rgba_block_data, block_data, block_count); + //printf("nvtt fast2 \t%f\t-> %f %f\n", timer.elapsed(), sqrt(mse), mse_to_psnr(mse)); + + //output_dxt_dds(bw, bh, block_data, "nvtt_fast2.dds"); + stats->compressorName = "nvtt-geld"; + stats->mseArray[index] = mse; + stats->timeArray[index] = timer.elapsed(); + stats++; + } + + { + memset(block_data, 0, block_count * 8); + Vector3 color_weights(1); + + timer.start(); + for (int i = 0; i < repeat_count; i++) { + for (int b = 0; b < block_count; b++) { + Vector4 input_colors[16]; + float input_weights[16]; + for (int j = 0; j < 16; j++) { + input_colors[j].x = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 0] / 255.0f; + input_colors[j].y = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 1] / 255.0f; + input_colors[j].z = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 2] / 255.0f; + input_colors[j].w = 1.0f; + input_weights[j] = 1.0f; + } + + compress_dxt1(input_colors, input_weights, color_weights, false, (BlockDXT1*)(block_data + b * 8)); + } + } + timer.stop(); + + float mse = evaluate_dxt1_mse(rgba_block_data, block_data, block_count); + //printf("nvtt hq \t%f\t-> %f %f\n", timer.elapsed(), sqrt(mse), mse_to_psnr(mse)); + + //output_dxt_dds(bw, bh, block_data, "nvtt_hq.dds"); + stats->compressorName = "nvtt-hq"; + stats->mseArray[index] = mse; + stats->timeArray[index] = timer.elapsed(); + stats++; + } + + { + memset(block_data, 0, block_count * 8); + + timer.start(); + for (int i = 0; i < repeat_count; i++) { + for (int b = 0; b < block_count; b++) { + squish::Compress(rgba_block_data + b * 4 * 4 * 4, block_data + b * 8, squish::kDxt1); + } + } + timer.stop(); + + float mse = evaluate_dxt1_mse(rgba_block_data, block_data, block_count); + //printf("squish \t%f\t-> %f %f\n", timer.elapsed(), sqrt(mse), mse_to_psnr(mse)); + + //output_dxt_dds(bw, bh, block_data, "squish.dds"); + stats->compressorName = "squish"; + stats->mseArray[index] = mse; + stats->timeArray[index] = timer.elapsed(); + stats++; + } + + /*{ + memset(block_data, 0, block_count * 8); + + timer.start(); + for (int i = 0; i < repeat_count; i++) { + for (int b = 0; b < block_count; b++) { + squish::Compress(rgba_block_data + b * 4 * 4 * 4, block_data + b * 8, squish::kDxt1 | squish::kColourIterativeClusterFit); + } + } + timer.stop(); + + float mse = evaluate_dxt1_mse(rgba_block_data, block_data, block_count); + //printf("squish hq\t%f\t-> %f %f\n", timer.elapsed(), sqrt(mse), mse_to_psnr(mse)); + + //output_dxt_dds(bw, bh, block_data, "squish_hq.dds"); + stats->compressorName = "squish-hq"; + stats->mseArray[index] = mse; + stats->timeArray[index] = timer.elapsed(); + stats++; + }*/ + + { + memset(block_data, 0, block_count * 8); + + timer.start(); + for (int i = 0; i < repeat_count; i++) { + for (int b = 0; b < block_count; b++) { + CompressBlockBC1(rgba_block_data + b * 4 * 4 * 4, 16, block_data + b * 8, nullptr); + } + } + timer.stop(); + + float mse = evaluate_dxt1_mse(rgba_block_data, block_data, block_count); + //printf("squish \t%f\t-> %f %f\n", timer.elapsed(), sqrt(mse), mse_to_psnr(mse)); + + //output_dxt_dds(bw, bh, block_data, "squish.dds"); + stats->compressorName = "cmp"; + stats->mseArray[index] = mse; + stats->timeArray[index] = timer.elapsed(); + stats++; + } + + return false; +} + + + +bool analyze_bc1(const char * inputFileName) { + + int w, h, n; + unsigned char *input_data = stbi_load(inputFileName, &w, &h, &n, 4); + defer { stbi_image_free(input_data); }; + + if (input_data == nullptr) { + printf("Failed to load input image '%s'.\n", inputFileName); + return false; + } + + int block_count = (w / 4) * (h / 4); + u8 * rgba_block_data = (u8 *)malloc(block_count * 4 * 4 * 4); + defer { free(rgba_block_data); }; + + int bw = 4 * (w / 4); // Round down. + int bh = 4 * (h / 4); + + // Convert to block layout. + for (int y = 0, b = 0; y < bh; y += 4) { + for (int x = 0; x < bw; x += 4, b++) { + for (int yy = 0; yy < 4; yy++) { + for (int xx = 0; xx < 4; xx++) { + if (x + xx < w && y + yy < h) { + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 0] = input_data[((y + yy) * w + x + xx) * 4 + 0]; + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 1] = input_data[((y + yy) * w + x + xx) * 4 + 1]; + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 2] = input_data[((y + yy) * w + x + xx) * 4 + 2]; + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 3] = input_data[((y + yy) * w + x + xx) * 4 + 3]; + } + else { + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 0] = 0; + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 1] = 0; + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 2] = 0; + rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 3] = 0; + } + } + } + } + } + + u8 * block_data = (u8 *)malloc(block_count * 8); + memset(block_data, 0, block_count * 8); + + Timer timer; + + int stb_better_than_nvtt_fast = 0; + int stb_better_than_nvtt_hq = 0; + int squish_better_than_nvtt_hq = 0; + + int this_should_never_happen = 0; + int this_should_never_happen_either = 0; + + Vector3 color_weights(1); + + for (int b = 0; b < block_count; b++) { + + uint8 * rgba_block = rgba_block_data + b * 4 * 4 * 4; + uint8 * dxt_block = block_data + b * 8; + + Vector4 input_colors[16]; + float input_weights[16]; + for (int j = 0; j < 16; j++) { + input_colors[j].x = rgba_block[j * 4 + 0] / 255.0f; + input_colors[j].y = rgba_block[j * 4 + 1] / 255.0f; + input_colors[j].z = rgba_block[j * 4 + 2] / 255.0f; + input_colors[j].w = 255.0f; + input_weights[j] = 1.0f; + } + + // Compare all the different modes on the same block: + + stb_compress_dxt_block(dxt_block, rgba_block, 0, STB_DXT_NORMAL); + float mse_stb = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block); + + stb_compress_dxt_block(dxt_block, rgba_block, 0, STB_DXT_HIGHQUAL); + float mse_stb_hq = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block); + + compress_dxt1_fast(input_colors, input_weights, color_weights, (BlockDXT1*)dxt_block); + float mse_nvtt_fast = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block); + + compress_dxt1_fast2(rgba_block, (BlockDXT1*)dxt_block); + float mse_nvtt_fast2 = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block); + + compress_dxt1_fast_geld(rgba_block, (BlockDXT1*)dxt_block); + float mse_nvtt_geld = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block); + + compress_dxt1(input_colors, input_weights, color_weights, false, (BlockDXT1*)dxt_block); + float mse_nvtt_hq = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block); + + squish::Compress(rgba_block, dxt_block, squish::kDxt1); + float mse_squish = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block); + + squish::Compress(rgba_block, dxt_block, squish::kDxt1 | squish::kColourIterativeClusterFit); + float mse_squish_hq = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block); + + if (mse_stb < mse_nvtt_fast) { + stb_better_than_nvtt_fast++; + } + if (mse_stb < mse_nvtt_hq) { + stb_better_than_nvtt_hq++; + } + if (mse_squish < mse_nvtt_hq) { + squish_better_than_nvtt_hq++; + } + if (mse_nvtt_fast < mse_nvtt_hq) { + this_should_never_happen++; + } + if (mse_nvtt_fast2 < mse_nvtt_fast) { + this_should_never_happen_either++; + } + } + + return true; +} + + + + +const char * image_set[] = { + "testsuite/kodak/kodim01.png", + "testsuite/kodak/kodim02.png", + "testsuite/kodak/kodim03.png", + "testsuite/kodak/kodim04.png", + "testsuite/kodak/kodim05.png", + "testsuite/kodak/kodim06.png", + "testsuite/kodak/kodim07.png", + "testsuite/kodak/kodim08.png", + "testsuite/kodak/kodim09.png", + "testsuite/kodak/kodim10.png", + "testsuite/kodak/kodim11.png", + "testsuite/kodak/kodim12.png", + "testsuite/kodak/kodim13.png", + "testsuite/kodak/kodim14.png", + "testsuite/kodak/kodim15.png", + "testsuite/kodak/kodim16.png", + "testsuite/kodak/kodim17.png", + "testsuite/kodak/kodim18.png", + "testsuite/kodak/kodim19.png", + "testsuite/kodak/kodim20.png", + "testsuite/kodak/kodim21.png", + "testsuite/kodak/kodim22.png", + "testsuite/kodak/kodim23.png", + "testsuite/kodak/kodim24.png", + "testsuite/waterloo/clegg.png", + "testsuite/waterloo/frymire.png", + "testsuite/waterloo/lena.png", + "testsuite/waterloo/monarch.png", + "testsuite/waterloo/peppers.png", + "testsuite/waterloo/sail.png", + "testsuite/waterloo/serrano.png", + "testsuite/waterloo/tulips.png", +}; + +const char * roblox_set[] = { + "Roblox/asphalt_side/diffuse.tga", + "Roblox/asphalt_top/diffuse.tga", + "Roblox/basalt/diffuse.tga", + "Roblox/brick/diffuse.tga", + "Roblox/cobblestone_side/diffuse.tga", + "Roblox/cobblestone_top/diffuse.tga", + "Roblox/concrete_side/diffuse.tga", + "Roblox/concrete_top/diffuse.tga", + "Roblox/crackedlava/diffuse.tga", + "Roblox/glacier_bottom/diffuse.tga", + "Roblox/glacier_side/diffuse.tga", + "Roblox/glacier_top/diffuse.tga", + "Roblox/grass_bottom/diffuse.tga", + "Roblox/grass_side/diffuse.tga", + "Roblox/grass_top/diffuse.tga", + "Roblox/ground/diffuse.tga", + "Roblox/ice_side/diffuse.tga", + "Roblox/ice_top/diffuse.tga", + "Roblox/leafygrass_side/diffuse.tga", + "Roblox/leafygrass_top/diffuse.tga", + "Roblox/limestone_side/diffuse.tga", + "Roblox/limestone_top/diffuse.tga", + "Roblox/mud/diffuse.tga", + "Roblox/pavement_side/diffuse.tga", + "Roblox/pavement_top/diffuse.tga", + "Roblox/rock/diffuse.tga", + "Roblox/salt_side/diffuse.tga", + "Roblox/salt_top/diffuse.tga", + "Roblox/sand_side/diffuse.tga", + "Roblox/sand_top/diffuse.tga", + "Roblox/sandstone_bottom/diffuse.tga", + "Roblox/sandstone_side/diffuse.tga", + "Roblox/sandstone_top/diffuse.tga", + "Roblox/slate/diffuse.tga", + "Roblox/snow/diffuse.tga", + "Roblox/woodplanks/diffuse.tga", +}; + + + + +int main(int argc, char *argv[]) +{ + const char * inputFileName = "testsuite/kodak/kodim14.png"; + //const char * inputFileName = "testsuite/kodak/kodim18.png"; + //const char * inputFileName = "testsuite/kodak/kodim15.png"; + //const char * inputFileName = "testsuite/waterloo/frymire.png"; + // test_bc1(inputFileName, 0); + + analyze_bc1(inputFileName); + + //const char ** set = roblox_set; + //int count = sizeof(roblox_set) / sizeof(char*); + + const char ** set = image_set; + int count = sizeof(image_set) / sizeof(char*); + + Stats stats[COMPRESSOR_COUNT]; + + for (int i = 0; i < COMPRESSOR_COUNT; i++) { + stats[i].compressorName = nullptr; + stats[i].mseArray.resize(count, 0.0f); + stats[i].timeArray.resize(count, 0.0f); + } + + for (int i = 0; i < count; i++) { + printf("\nImage '%s'\n", set[i]); + + test_bc1(set[i], i, stats); + + for (int c = 0; c < COMPRESSOR_COUNT; c++) { + if (stats[c].compressorName) { + printf("%-16s %f\t%f\n", stats[c].compressorName, sqrtf(stats[c].mseArray[i]), stats[c].timeArray[i]); + } + } + } + + // Print stats. + printf("\nAverage Results:\n"); + for (int c = 0; c < COMPRESSOR_COUNT; c++) { + if (stats[c].compressorName) { + float sum = 0.0f; + for (float it : stats[c].mseArray) { + sum += it; + } + sum /= count; + + float time = 0.0f; + for (float it : stats[c].timeArray) { + time += it; + } + + printf("%-16s %f\t%f\n", stats[c].compressorName, sqrtf(sum), time); + } + } + + return EXIT_SUCCESS; +}