diff --git a/data/testsuite/waterloo/baboon.png b/data/testsuite/waterloo/baboon.png
new file mode 100644
index 0000000..2b1499a
Binary files /dev/null and b/data/testsuite/waterloo/baboon.png differ
diff --git a/extern/libsquish-1.15/squish.h b/extern/libsquish-1.15/squish.h
new file mode 100644
index 0000000..14c9bb5
--- /dev/null
+++ b/extern/libsquish-1.15/squish.h
@@ -0,0 +1,309 @@
+/* -----------------------------------------------------------------------------
+
+    Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+    Permission is hereby granted, free of charge, to any person obtaining
+    a copy of this software and associated documentation files (the
+    "Software"), to deal in the Software without restriction, including
+    without limitation the rights to use, copy, modify, merge, publish,
+    distribute, sublicense, and/or sell copies of the Software, and to
+    permit persons to whom the Software is furnished to do so, subject to
+    the following conditions:
+
+    The above copyright notice and this permission notice shall be included
+    in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+   -------------------------------------------------------------------------- */
+
+#ifndef SQUISH_H
+#define SQUISH_H
+
+//! All squish API functions live in this namespace.
+namespace squish {
+
+// -----------------------------------------------------------------------------
+
+//! Typedef a quantity that is a single unsigned byte.
+typedef unsigned char u8;
+
+// -----------------------------------------------------------------------------
+
+//! Flags accepted by the `flags` parameter of the functions in this header.
+//! Each value is a distinct bit, so one format flag (kDxt1 .. kBc5) can be
+//! combined with compressor and option flags using bitwise OR.
+enum
+{
+    //! Use DXT1 compression.
+    kDxt1 = ( 1 << 0 ),
+
+    //! Use DXT3 compression.
+    kDxt3 = ( 1 << 1 ),
+
+    //! Use DXT5 compression.
+    kDxt5 = ( 1 << 2 ),
+
+    //! Use BC4 compression.
+    kBc4 = ( 1 << 3 ),
+
+    //! Use BC5 compression.
+    kBc5 = ( 1 << 4 ),
+
+    //! Use a slow but high quality colour compressor (the default).
+    kColourClusterFit = ( 1 << 5 ),
+
+    //! Use a fast but low quality colour compressor.
+    kColourRangeFit = ( 1 << 6 ),
+
+    //! Weight the colour by alpha during cluster fit (disabled by default).
+    kWeightColourByAlpha = ( 1 << 7 ),
+
+    //! Use a very slow but very high quality colour compressor.
+    kColourIterativeClusterFit = ( 1 << 8 ),
+
+    //! Source is BGRA rather than RGBA
+    kSourceBGRA = ( 1 << 9 )
+};
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Compresses a 4x4 block of pixels.
+
+    @param rgba   The rgba values of the 16 source pixels.
+    @param mask   The valid pixel mask.
+    @param block  Storage for the compressed DXT block.
+    @param flags  Compression flags.
+    @param metric An optional perceptual metric.
+
+    The source pixels should be presented as a contiguous array of 16 rgba
+    values, with each component as 1 byte each. In memory this should be:
+
+        { r1, g1, b1, a1, .... , r16, g16, b16, a16 }
+
+    The mask parameter enables only certain pixels within the block. The lowest
+    bit enables the first pixel and so on up to the 16th bit. Bits beyond the
+    16th bit are ignored. Pixels that are not enabled are allowed to take
+    arbitrary colours in the output block. An example of how this can be used
+    is in the CompressImage function to disable pixels outside the bounds of
+    the image when the width or height is not divisible by 4.
+
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
+    however, DXT1 will be used by default if none is specified. When using DXT1
+    compression, 8 bytes of storage are required for the compressed DXT block.
+    DXT3 and DXT5 compression require 16 bytes of storage per block. By format
+    definition BC4 blocks occupy 8 bytes and BC5 blocks 16 bytes; when in doubt,
+    size the buffer with GetStorageRequirements.
+
+    The flags parameter can also specify a preferred colour compressor to use
+    when fitting the RGB components of the data. Possible colour compressors
+    are: kColourClusterFit (the default), kColourRangeFit (very fast, low
+    quality) or kColourIterativeClusterFit (slowest, best quality).
+
+    When using kColourClusterFit or kColourIterativeClusterFit, an additional
+    flag can be specified to weight the importance of each pixel by its alpha
+    value. For images that are rendered using alpha blending, this can
+    significantly increase the perceived quality.
+
+    The metric parameter can be used to weight the relative importance of each
+    colour channel, or pass NULL to use the default uniform weight of
+    { 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that
+    allowed either uniform or "perceptual" weights with the fixed values
+    { 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a
+    contiguous array of 3 floats.
+
+    NOTE(review): metric is declared as a non-const float*, although it is
+    described purely as an input -- confirm the implementation never writes
+    through it before passing constant data.
+*/
+void CompressMasked( u8 const* rgba, int mask, void* block, int flags, float* metric = 0 );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Compresses a 4x4 block of pixels.
+
+    @param rgba   The rgba values of the 16 source pixels.
+    @param block  Storage for the compressed DXT block.
+    @param flags  Compression flags.
+    @param metric An optional perceptual metric.
+
+    The source pixels should be presented as a contiguous array of 16 rgba
+    values, with each component as 1 byte each. In memory this should be:
+
+        { r1, g1, b1, a1, .... , r16, g16, b16, a16 }
+
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
+    however, DXT1 will be used by default if none is specified. When using DXT1
+    compression, 8 bytes of storage are required for the compressed DXT block.
+    DXT3 and DXT5 compression require 16 bytes of storage per block. By format
+    definition BC4 blocks occupy 8 bytes and BC5 blocks 16 bytes.
+
+    The flags parameter can also specify a preferred colour compressor to use
+    when fitting the RGB components of the data. Possible colour compressors
+    are: kColourClusterFit (the default), kColourRangeFit (very fast, low
+    quality) or kColourIterativeClusterFit (slowest, best quality).
+
+    When using kColourClusterFit or kColourIterativeClusterFit, an additional
+    flag can be specified to weight the importance of each pixel by its alpha
+    value. For images that are rendered using alpha blending, this can
+    significantly increase the perceived quality.
+
+    The metric parameter can be used to weight the relative importance of each
+    colour channel, or pass NULL to use the default uniform weight of
+    { 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that
+    allowed either uniform or "perceptual" weights with the fixed values
+    { 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a
+    contiguous array of 3 floats.
+
+    This method is an inline that calls CompressMasked with a mask of 0xffff,
+    provided for compatibility with older versions of squish.
+*/
+inline void Compress( u8 const* rgba, void* block, int flags, float* metric = 0 )
+{
+    // One mask bit per pixel: all 16 pixels of the block are valid.
+    int const allPixelsMask = 0xffff;
+    CompressMasked( rgba, allPixelsMask, block, flags, metric );
+}
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Decompresses a 4x4 block of pixels.
+
+    @param rgba  Storage for the 16 decompressed pixels.
+    @param block The compressed DXT block.
+    @param flags Compression flags.
+
+    The decompressed pixels will be written as a contiguous array of 16 rgba
+    values, with each component as 1 byte each, so the caller must provide
+    16 * 4 = 64 bytes of storage. In memory this is:
+
+        { r1, g1, b1, a1, .... , r16, g16, b16, a16 }
+
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
+    however, DXT1 will be used by default if none is specified. All other flags
+    are ignored.
+*/
+void Decompress( u8* rgba, void const* block, int flags );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Computes the amount of compressed storage required.
+
+    @param width  The width of the image.
+    @param height The height of the image.
+    @param flags  Compression flags.
+
+    @return The storage needed for the compressed image. Presumably in bytes,
+            matching the per-block byte counts documented for Compress --
+            confirm against the implementation.
+
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
+    however, DXT1 will be used by default if none is specified. All other flags
+    are ignored.
+
+    Most DXT images will be a multiple of 4 in each dimension, but this
+    function supports arbitrary size images by allowing the outer blocks to
+    be only partially used.
+*/
+int GetStorageRequirements( int width, int height, int flags );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Compresses an image in memory.
+
+    @param rgba   The pixels of the source.
+    @param width  The width of the source image.
+    @param height The height of the source image.
+    @param pitch  The pitch of the source image.
+    @param blocks Storage for the compressed output.
+    @param flags  Compression flags.
+    @param metric An optional perceptual metric.
+
+    The source pixels should be presented as a contiguous array of width*height
+    rgba values, with each component as 1 byte each. In memory this should be:
+
+        { r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height
+
+    Two overloads are declared: one takes an explicit pitch, the other omits
+    it. NOTE(review): the pitch is presumably the distance in bytes between
+    the starts of consecutive rows, and the overload without it presumably
+    assumes tightly packed rows (width * 4 bytes) -- confirm against the
+    implementation.
+
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
+    however, DXT1 will be used by default if none is specified. When using DXT1
+    compression, 8 bytes of storage are required for each compressed DXT block.
+    DXT3 and DXT5 compression require 16 bytes of storage per block. By format
+    definition BC4 blocks occupy 8 bytes and BC5 blocks 16 bytes.
+
+    The flags parameter can also specify a preferred colour compressor to use
+    when fitting the RGB components of the data. Possible colour compressors
+    are: kColourClusterFit (the default), kColourRangeFit (very fast, low
+    quality) or kColourIterativeClusterFit (slowest, best quality).
+
+    When using kColourClusterFit or kColourIterativeClusterFit, an additional
+    flag can be specified to weight the importance of each pixel by its alpha
+    value. For images that are rendered using alpha blending, this can
+    significantly increase the perceived quality.
+
+    The metric parameter can be used to weight the relative importance of each
+    colour channel, or pass NULL to use the default uniform weight of
+    { 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that
+    allowed either uniform or "perceptual" weights with the fixed values
+    { 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a
+    contiguous array of 3 floats.
+
+    Internally this function calls squish::CompressMasked for each block, which
+    allows for pixels outside the image to take arbitrary values. The function
+    squish::GetStorageRequirements can be called to compute the amount of memory
+    to allocate for the compressed output.
+
+    Note on compression quality: When compressing textures with
+    libsquish it is recommended to apply a gamma-correction
+    beforehand. This will reduce the blockiness in dark areas. The
+    level of necessary gamma-correction is platform dependent. For
+    example, a gamma correction with gamma = 0.5 before compression
+    and gamma = 2.0 after decompression yields good results on the
+    Windows platform but for other platforms like MacOS X a different
+    gamma value may be more suitable.
+*/
+void CompressImage( u8 const* rgba, int width, int height, int pitch, void* blocks, int flags, float* metric = 0 );
+void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags, float* metric = 0 );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Decompresses an image in memory.
+
+    @param rgba   Storage for the decompressed pixels.
+    @param width  The width of the source image.
+    @param height The height of the source image.
+    @param pitch  The pitch of the decompressed pixels.
+    @param blocks The compressed DXT blocks.
+    @param flags  Compression flags.
+
+    The decompressed pixels will be written as a contiguous array of width*height
+    rgba values, with each component as 1 byte each. In memory this is:
+
+        { r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height
+
+    Two overloads are declared: one takes an explicit pitch, the other omits
+    it. NOTE(review): the overload without a pitch presumably writes tightly
+    packed rows (width * 4 bytes) -- confirm against the implementation.
+
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
+    however, DXT1 will be used by default if none is specified. All other flags
+    are ignored.
+
+    Internally this function calls squish::Decompress for each block.
+*/
+void DecompressImage( u8* rgba, int width, int height, int pitch, void const* blocks, int flags );
+void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Computes the MSE of a compressed image in memory.
+
+    @param rgba      The original image pixels.
+    @param width     The width of the source image.
+    @param height    The height of the source image.
+    @param pitch     The pitch of the source image.
+    @param dxt       The compressed dxt blocks
+    @param flags     Compression flags.
+    @param colourMSE Output: the MSE of the colour values.
+    @param alphaMSE  Output: the MSE of the alpha values.
+
+    The colour MSE and alpha MSE are computed across all pixels. The colour MSE is
+    averaged across all rgb values (i.e. colourMSE = sum sum_k ||dxt.k - rgba.k||/3)
+
+    Both results are returned through the colourMSE and alphaMSE reference
+    parameters; the function itself returns nothing. Two overloads are declared,
+    with and without an explicit pitch.
+
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
+    however, DXT1 will be used by default if none is specified. All other flags
+    are ignored.
+
+    Internally this function calls squish::Decompress for each block.
+*/
+void ComputeMSE(u8 const *rgba, int width, int height, int pitch, u8 const *dxt, int flags, double &colourMSE, double &alphaMSE);
+void ComputeMSE(u8 const *rgba, int width, int height, u8 const *dxt, int flags, double &colourMSE, double &alphaMSE);
+
+// -----------------------------------------------------------------------------
+
+} // namespace squish
+
+#endif // ndef SQUISH_H
diff --git a/src/nvtt/tests/bc1enc.cpp b/src/nvtt/tests/bc1enc.cpp
new file mode 100644
index 0000000..a48c2c1
--- /dev/null
+++ b/src/nvtt/tests/bc1enc.cpp
@@ -0,0 +1,694 @@
+
+#define  _CRT_SECURE_NO_WARNINGS
+#include <assert.h>
+#include <stdlib.h>
+
+//#define STBI_ASSERT(x)
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb_image.h"
+
+#define STB_DXT_IMPLEMENTATION
+#include "stb_dxt.h"
+
+#include "../extern/libsquish-1.15/squish.h"
+
+#include "../extern/CMP_Core/source/CMP_Core.h"
+
+#include "nvtt/CompressorDXT1.h"
+
+#include "nvmath/Vector.h"
+#include "nvmath/Color.h"
+
+#include "nvcore/Timer.h"
+#include "nvcore/Array.inl"
+
+using namespace nv;
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+
+// Defer statement:
+//
+//     defer { cleanup(); };
+//
+// runs the braced statements when the enclosing scope exits. The macro binds a
+// uniquely named reference to an ExitScope object whose destructor invokes the
+// lambda, so several defers in one scope run in reverse order of declaration.
+// The lambda captures by reference ([&]); it sees the variables' values at
+// scope exit, not at the point where `defer` was written.
+#define CONCAT_INTERNAL(x, y) x##y
+#define CONCAT(x, y) CONCAT_INTERNAL(x, y)
+
+// `__attribute__((unused))` is GCC/Clang syntax and does not compile elsewhere
+// (e.g. MSVC), so only emit it where it is understood.
+#if defined(__GNUC__) || defined(__clang__)
+#define DEFER_UNUSED __attribute__((unused))
+#else
+#define DEFER_UNUSED
+#endif
+
+// Holds a callable and invokes it exactly once, from the destructor.
+template<typename T>
+struct ExitScope
+{
+    T lambda;
+    ExitScope(T lambda)
+        : lambda(lambda)
+    {
+    }
+    ~ExitScope() { lambda(); }
+
+private:
+    // Assignment is disabled (declared, never defined).
+    ExitScope& operator=(const ExitScope&);
+};
+
+// Helper whose operator+ wraps a callable in an ExitScope, which lets the
+// `defer` macro end with an open lambda that the user completes with `{ ... };`.
+class ExitScopeHelp
+{
+public:
+    // The return relies on the implicit T -> ExitScope<T> conversion. Before
+    // C++17 this is only correct because compilers elide the temporary; with
+    // elision disabled the lambda would run an extra time.
+    template<typename T>
+    ExitScope<T> operator+(T t) { return t; }
+};
+
+#define defer const auto& DEFER_UNUSED CONCAT(defer__, __LINE__) = ExitScopeHelp() + [&]()
+
+
+static float mse_to_psnr(float mse) {
+    float rms = sqrtf(mse);
+    float psnr = rms ? (float)clamp(log10(255.0 / rms) * 20.0, 0.0, 300.0) : 1e+10f;
+    return psnr;
+}
+
+/*
+void image_metrics::calc(const image &a, const image &b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error, bool use_601_luma)
+{
+    //assert((first_chan < 4U) && (first_chan + total_chans <= 4U));
+
+    const uint32_t width = std::min(a.get_width(), b.get_width());
+    const uint32_t height = std::min(a.get_height(), b.get_height());
+
+    double hist[256];
+    memset(hist, 0, sizeof(hist));
+
+    for (uint32_t y = 0; y < height; y++)
+    {
+        for (uint32_t x = 0; x < width; x++)
+        {
+            const color_rgba &ca = a(x, y), &cb = b(x, y);
+
+            for (uint32_t c = 0; c < 3; c++)
+                hist[iabs(ca[first_chan + c] - cb[first_chan + c])]++;
+        }
+    }
+
+    m_max = 0;
+    double sum = 0.0f, sum2 = 0.0f;
+    for (uint32_t i = 0; i < 256; i++)
+    {
+        if (hist[i])
+        {
+            m_max = std::max<float>(m_max, (float)i);
+            double v = i * hist[i];
+            sum += v;
+            sum2 += i * v;
+        }
+    }
+
+    double total_values = (double)width * (double)height;
+    if (avg_comp_error)
+        total_values *= (double)clamp<uint32_t>(total_chans, 1, 4);
+
+    m_mean = (float)clamp<double>(sum / total_values, 0.0f, 255.0);
+    m_mean_squared = (float)clamp<double>(sum2 / total_values, 0.0f, 255.0 * 255.0);
+    m_rms = (float)sqrt(m_mean_squared);
+    m_psnr = m_rms ? (float)clamp<double>(log10(255.0 / m_rms) * 20.0, 0.0f, 300.0f) : 1e+10f;
+}
+*/
+
+// Returns mse.
+//
+// Sums nv::evaluate_dxt1_error over `block_count` consecutive blocks and averages the
+// result over 3 channels * 16 texels per block.
+//   rgba        - source texels in block layout, 4*4*4 bytes per block.
+//   block       - compressed DXT1 blocks, 8 bytes per block.
+//   block_count - number of blocks; 0 (or less) yields 0 instead of a 0/0 NaN.
+//   decoder     - forwarded unchanged to nv::evaluate_dxt1_error.
+// NOTE(review): the per-block error is divided by 255.0 before accumulating; the scale
+// of nv::evaluate_dxt1_error's result is not visible here -- confirm the intended units.
+float evaluate_dxt1_mse(uint8 * rgba, uint8 * block, int block_count, int decoder = 2) {
+    if (block_count <= 0) {
+        return 0.0f;
+    }
+
+    double total = 0.0;
+    for (int b = 0; b < block_count; b++) {
+        total += nv::evaluate_dxt1_error(rgba, (BlockDXT1 *)block, decoder) / 255.0;
+        rgba += 4 * 4 * 4;
+        block += 8;
+    }
+
+    // Divisor computed in double so a large block_count cannot overflow int.
+    return float(total / (3.0 * 16.0 * block_count));
+}
+
+// Packs the first four characters of `str` into a little-endian 32-bit code
+// (str[0] in the low byte). The argument is parenthesised so that any
+// expression, not just a literal or identifier, indexes correctly.
+#define MAKEFOURCC(str) (uint((str)[0]) | (uint((str)[1]) << 8) | (uint((str)[2]) << 16) | (uint((str)[3]) << 24 ))
+
+
+// Writes `data` to `filename` as a DDS file with a DXT1 fourcc: a 128-byte header
+// (magic + 124-byte header struct) followed by the compressed blocks.
+//   w, h     - texture dimensions in texels.
+//   data     - 8 bytes per 4x4 block, ceil(w/4) * ceil(h/4) blocks.
+//   filename - destination path, opened with "wb".
+// Returns true only if the file was opened, fully written and closed successfully.
+bool output_dxt_dds (u32 w, u32 h, const u8* data, const char * filename) {
+
+    const u32 DDSD_CAPS = 0x00000001;
+    const u32 DDSD_PIXELFORMAT = 0x00001000;
+    const u32 DDSD_WIDTH = 0x00000004;
+    const u32 DDSD_HEIGHT = 0x00000002;
+    const u32 DDSD_LINEARSIZE = 0x00080000;
+    const u32 DDPF_FOURCC = 0x00000004;
+    const u32 DDSCAPS_TEXTURE = 0x00001000;
+
+    // Every field has an initializer: the struct is written to disk verbatim, so
+    // anything left indeterminate would end up as garbage in the file.
+    struct DDS {
+        u32 fourcc = MAKEFOURCC("DDS ");
+        u32 size = 124;
+        u32 flags = DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_WIDTH|DDSD_HEIGHT|DDSD_LINEARSIZE;
+        u32 height = 0;
+        u32 width = 0;
+        u32 pitch = 0;
+        u32 depth = 0;
+        u32 mipmapcount = 0;
+        u32 reserved [11] = {};
+        struct {
+            u32 size = 32;
+            u32 flags = DDPF_FOURCC;
+            u32 fourcc = MAKEFOURCC("DXT1");
+            u32 bitcount = 0;
+            u32 rmask = 0;
+            u32 gmask = 0;
+            u32 bmask = 0;
+            u32 amask = 0;
+        } pf;
+        struct {
+            u32 caps1 = DDSCAPS_TEXTURE;
+            u32 caps2 = 0;
+            u32 caps3 = 0;
+            u32 caps4 = 0;
+        } caps;
+        u32 notused = 0;
+    } dds;
+    static_assert(sizeof(DDS) == 128, "DDS size must be 128");
+
+    // Block counts are rounded up per axis *before* multiplying. The previous
+    // `(w+3)/4 * (h+3)/4` parsed as `(((w+3)/4) * (h+3)) / 4`, which is wrong for
+    // heights that are not a multiple of 4 (e.g. 12x7 gave 7 blocks instead of 6).
+    const u32 blocks_wide = (w + 3) / 4;
+    const u32 blocks_high = (h + 3) / 4;
+
+    dds.width = w;
+    dds.height = h;
+    dds.pitch = 8 * (blocks_wide * blocks_high); // linear size
+
+    FILE * fp = fopen(filename, "wb");
+    if (fp == nullptr) return false;
+
+    // Write header:
+    bool ok = fwrite(&dds, sizeof(dds), 1, fp) == 1;
+
+    // Write dxt data (skipped for an empty texture, where fwrite would report 0 items):
+    if (ok && dds.pitch != 0) {
+        ok = fwrite(data, dds.pitch, 1, fp) == 1;
+    }
+
+    // fclose flushes buffered data, so its failure is a write failure too.
+    if (fclose(fp) != 0) {
+        ok = false;
+    }
+
+    return ok;
+}
+
+// Number of compressors being compared; test_bc1 fills in seven consecutive Stats entries.
+const int COMPRESSOR_COUNT = 7;
+// Results of one compressor across the tested images.
+struct Stats {
+    const char * compressorName;    // Short label such as "stb" or "squish", set by test_bc1.
+    Array<float> mseArray;          // MSE per image, indexed by the `index` passed to test_bc1.
+    Array<float> timeArray;         // Timer::elapsed() per image, same indexing.
+};
+
+
+// Expands one block of 16 RGBA8 texels into the float inputs taken by the nvtt
+// compressors: rgb normalized by 255, alpha and per-texel weight set to 1.
+static void bc1_load_block_colors(const u8 * rgba_block, Vector4 input_colors[16], float input_weights[16]) {
+    for (int j = 0; j < 16; j++) {
+        input_colors[j].x = rgba_block[j * 4 + 0] / 255.0f;
+        input_colors[j].y = rgba_block[j * 4 + 1] / 255.0f;
+        input_colors[j].z = rgba_block[j * 4 + 2] / 255.0f;
+        input_colors[j].w = 1.0f;
+        input_weights[j] = 1.0f;
+    }
+}
+
+// State shared by all compressor runs on one image.
+struct Bc1Benchmark {
+    u8 * rgba_block_data;   // Source texels in block layout, 64 bytes per block.
+    u8 * block_data;        // Output buffer, 8 bytes per block; cleared before each run.
+    int block_count;
+    int repeat_count;       // How many times the whole image is compressed inside the timed region.
+    int index;              // Image slot written in Stats::mseArray / Stats::timeArray.
+    Stats * stats;          // Next Stats entry to fill; advanced after every run.
+    Timer timer;
+
+    // Times `compress_block(src_rgba, dst_block)` over every block, measures the
+    // resulting MSE and records both under `name` in the next Stats entry.
+    template <typename CompressBlock>
+    void run(const char * name, CompressBlock compress_block) {
+        memset(block_data, 0, block_count * 8);
+
+        timer.start();
+        for (int i = 0; i < repeat_count; i++) {
+            for (int b = 0; b < block_count; b++) {
+                compress_block(rgba_block_data + b * 4 * 4 * 4, block_data + b * 8);
+            }
+        }
+        timer.stop();
+
+        float mse = evaluate_dxt1_mse(rgba_block_data, block_data, block_count);
+
+        stats->compressorName = name;
+        stats->mseArray[index] = mse;
+        stats->timeArray[index] = timer.elapsed();
+        stats++;
+    }
+};
+
+// Loads `inputFileName`, compresses it to BC1 with each of the seven compressors and
+// stores MSE and elapsed time for image `index` in stats[0..6] (one entry per compressor,
+// in the order stb, stb-hq, nvtt-fast, nvtt-geld, nvtt-hq, squish, cmp).
+// The caller must have sized every mseArray/timeArray so that `index` is valid.
+// Returns false if the image cannot be loaded, holds no complete 4x4 block, or memory
+// runs out; returns true once all results have been recorded.
+bool test_bc1(const char * inputFileName, int index, Stats * stats) {
+
+    int w, h, n;
+    unsigned char *input_data = stbi_load(inputFileName, &w, &h, &n, 4);
+    defer { stbi_image_free(input_data); };
+
+    if (input_data == nullptr) {
+        printf("Failed to load input image '%s'.\n", inputFileName);
+        return false;
+    }
+
+    // Only whole 4x4 blocks are used; a partial right/bottom edge is dropped (round down).
+    const int blocks_wide = w / 4;
+    const int blocks_high = h / 4;
+    const int block_count = blocks_wide * blocks_high;
+    if (block_count == 0) {
+        printf("Input image '%s' is smaller than one 4x4 block.\n", inputFileName);
+        return false;
+    }
+
+    u8 * rgba_block_data = (u8 *)malloc(block_count * 4 * 4 * 4);
+    defer { free(rgba_block_data); };
+
+    // Previously leaked: released on every exit path now.
+    u8 * block_data = (u8 *)malloc(block_count * 8);
+    defer { free(block_data); };
+
+    if (rgba_block_data == nullptr || block_data == nullptr) {
+        printf("Out of memory while processing '%s'.\n", inputFileName);
+        return false;
+    }
+
+    // Convert to block layout: each block stores its four 16-byte texel rows back to back.
+    // All source rows are inside the image because the block grid is rounded down.
+    for (int by = 0, b = 0; by < blocks_high; by++) {
+        for (int bx = 0; bx < blocks_wide; bx++, b++) {
+            for (int yy = 0; yy < 4; yy++) {
+                const u8 * src_row = input_data + ((by * 4 + yy) * w + bx * 4) * 4;
+                memcpy(rgba_block_data + b * 4 * 4 * 4 + yy * 4 * 4, src_row, 4 * 4);
+            }
+        }
+    }
+
+    // Warmup.
+    for (int b = 0; b < block_count; b++) {
+        stb_compress_dxt_block(block_data + b * 8, rgba_block_data + b * 4 * 4 * 4, 0, STB_DXT_NORMAL);
+    }
+
+#if _DEBUG
+    const int repeat_count = 1;
+#else
+    const int repeat_count = 1; // 8
+#endif
+
+    Bc1Benchmark bench;
+    bench.rgba_block_data = rgba_block_data;
+    bench.block_data = block_data;
+    bench.block_count = block_count;
+    bench.repeat_count = repeat_count;
+    bench.index = index;
+    bench.stats = stats;
+
+    Vector3 color_weights(1);
+
+    bench.run("stb", [](u8 * rgba, u8 * out) {
+        stb_compress_dxt_block(out, rgba, 0, STB_DXT_NORMAL);
+    });
+
+    bench.run("stb-hq", [](u8 * rgba, u8 * out) {
+        stb_compress_dxt_block(out, rgba, 0, STB_DXT_HIGHQUAL);
+    });
+
+    // The float conversion is part of the timed work, as it was before.
+    bench.run("nvtt-fast", [&](u8 * rgba, u8 * out) {
+        Vector4 input_colors[16];
+        float input_weights[16];
+        bc1_load_block_colors(rgba, input_colors, input_weights);
+        compress_dxt1_fast(input_colors, input_weights, color_weights, (BlockDXT1*)out);
+    });
+
+    bench.run("nvtt-geld", [](u8 * rgba, u8 * out) {
+        //compress_dxt1_fast2(rgba, (BlockDXT1*)out);
+        compress_dxt1_fast_geld(rgba, (BlockDXT1*)out);
+    });
+
+    bench.run("nvtt-hq", [&](u8 * rgba, u8 * out) {
+        Vector4 input_colors[16];
+        float input_weights[16];
+        bc1_load_block_colors(rgba, input_colors, input_weights);
+        compress_dxt1(input_colors, input_weights, color_weights, false, (BlockDXT1*)out);
+    });
+
+    bench.run("squish", [](u8 * rgba, u8 * out) {
+        squish::Compress(rgba, out, squish::kDxt1);
+    });
+
+    // Disabled. Enabling it records an eighth Stats entry (COMPRESSOR_COUNT is 7).
+    //bench.run("squish-hq", [](u8 * rgba, u8 * out) {
+    //    squish::Compress(rgba, out, squish::kDxt1 | squish::kColourIterativeClusterFit);
+    //});
+
+    bench.run("cmp", [](u8 * rgba, u8 * out) {
+        CompressBlockBC1(rgba, 16, out, nullptr);
+    });
+
+    return true;
+}
+
+
+
+bool analyze_bc1(const char * inputFileName) {
+
+    int w, h, n;
+    unsigned char *input_data = stbi_load(inputFileName, &w, &h, &n, 4);
+    defer { stbi_image_free(input_data); };
+
+    if (input_data == nullptr) {
+        printf("Failed to load input image '%s'.\n", inputFileName);
+        return false;
+    }
+
+    int block_count = (w / 4) * (h / 4);
+    u8 * rgba_block_data = (u8 *)malloc(block_count * 4 * 4 * 4);
+    defer { free(rgba_block_data); };
+
+    int bw = 4 * (w / 4); // Round down.
+    int bh = 4 * (h / 4);
+
+    // Convert to block layout.
+    for (int y = 0, b = 0; y < bh; y += 4) {
+        for (int x = 0; x < bw; x += 4, b++) {
+            for (int yy = 0; yy < 4; yy++) {
+                for (int xx = 0; xx < 4; xx++) {
+                    if (x + xx < w && y + yy < h) {
+                        rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 0] = input_data[((y + yy) * w + x + xx) * 4 + 0];
+                        rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 1] = input_data[((y + yy) * w + x + xx) * 4 + 1];
+                        rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 2] = input_data[((y + yy) * w + x + xx) * 4 + 2];
+                        rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 3] = input_data[((y + yy) * w + x + xx) * 4 + 3];
+                    }
+                    else {
+                        rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 0] = 0;
+                        rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 1] = 0;
+                        rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 2] = 0;
+                        rgba_block_data[b * 4 * 4 * 4 + (yy * 4 + xx) * 4 + 3] = 0;
+                    }
+                }
+            }
+        }
+    }
+
+    u8 * block_data = (u8 *)malloc(block_count * 8);
+    memset(block_data, 0, block_count * 8);
+
+    Timer timer;
+
+    int stb_better_than_nvtt_fast = 0;
+    int stb_better_than_nvtt_hq = 0;
+    int squish_better_than_nvtt_hq = 0;
+
+    int this_should_never_happen = 0;
+    int this_should_never_happen_either = 0;
+        
+    Vector3 color_weights(1);
+
+    for (int b = 0; b < block_count; b++) {
+
+        uint8 * rgba_block = rgba_block_data + b * 4 * 4 * 4;
+        uint8 * dxt_block = block_data + b * 8;
+
+        Vector4 input_colors[16];
+        float input_weights[16];
+        for (int j = 0; j < 16; j++) {
+            input_colors[j].x = rgba_block[j * 4 + 0] / 255.0f;
+            input_colors[j].y = rgba_block[j * 4 + 1] / 255.0f;
+            input_colors[j].z = rgba_block[j * 4 + 2] / 255.0f;
+            input_colors[j].w = 255.0f;
+            input_weights[j] = 1.0f;
+        }
+
+        // Compare all the different modes on the same block:
+
+        stb_compress_dxt_block(dxt_block, rgba_block, 0, STB_DXT_NORMAL);
+        float mse_stb = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+
+        stb_compress_dxt_block(dxt_block, rgba_block, 0, STB_DXT_HIGHQUAL);
+        float mse_stb_hq = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+
+        compress_dxt1_fast(input_colors, input_weights, color_weights, (BlockDXT1*)dxt_block);
+        float mse_nvtt_fast = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+
+        compress_dxt1_fast2(rgba_block, (BlockDXT1*)dxt_block);
+        float mse_nvtt_fast2 = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+
+        compress_dxt1_fast_geld(rgba_block, (BlockDXT1*)dxt_block);
+        float mse_nvtt_geld = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+
+        compress_dxt1(input_colors, input_weights, color_weights, false, (BlockDXT1*)dxt_block);
+        float mse_nvtt_hq = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+
+        squish::Compress(rgba_block, dxt_block, squish::kDxt1);
+        float mse_squish = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+
+        squish::Compress(rgba_block, dxt_block, squish::kDxt1 | squish::kColourIterativeClusterFit);
+        float mse_squish_hq = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+
+        if (mse_stb < mse_nvtt_fast) {
+            stb_better_than_nvtt_fast++;
+        }
+        if (mse_stb < mse_nvtt_hq) {
+            stb_better_than_nvtt_hq++;
+        }
+        if (mse_squish < mse_nvtt_hq) {
+            squish_better_than_nvtt_hq++;
+        }
+        if (mse_nvtt_fast < mse_nvtt_hq) {
+            this_should_never_happen++;
+        }
+        if (mse_nvtt_fast2 < mse_nvtt_fast) {
+            this_should_never_happen_either++;
+        }
+    }
+
+    return true;
+}
+
+
+
+
+const char * image_set[] = { // Image list that main() iterates by default: 24 Kodak files, then 8 Waterloo files.
+    "testsuite/kodak/kodim01.png", // Kodak group: kodim01 .. kodim24. Paths are presumably relative to the data directory -- confirm against the loader.
+    "testsuite/kodak/kodim02.png",
+    "testsuite/kodak/kodim03.png",
+    "testsuite/kodak/kodim04.png",
+    "testsuite/kodak/kodim05.png",
+    "testsuite/kodak/kodim06.png",
+    "testsuite/kodak/kodim07.png",
+    "testsuite/kodak/kodim08.png",
+    "testsuite/kodak/kodim09.png",
+    "testsuite/kodak/kodim10.png",
+    "testsuite/kodak/kodim11.png",
+    "testsuite/kodak/kodim12.png",
+    "testsuite/kodak/kodim13.png",
+    "testsuite/kodak/kodim14.png",
+    "testsuite/kodak/kodim15.png",
+    "testsuite/kodak/kodim16.png",
+    "testsuite/kodak/kodim17.png",
+    "testsuite/kodak/kodim18.png",
+    "testsuite/kodak/kodim19.png",
+    "testsuite/kodak/kodim20.png",
+    "testsuite/kodak/kodim21.png",
+    "testsuite/kodak/kodim22.png",
+    "testsuite/kodak/kodim23.png",
+    "testsuite/kodak/kodim24.png",
+    "testsuite/waterloo/clegg.png", // Waterloo group starts here (8 entries).
+    "testsuite/waterloo/frymire.png",
+    "testsuite/waterloo/lena.png",
+    "testsuite/waterloo/monarch.png",
+    "testsuite/waterloo/peppers.png",
+    "testsuite/waterloo/sail.png",
+    "testsuite/waterloo/serrano.png",
+    "testsuite/waterloo/tulips.png",
+}; // The element count is derived with sizeof in main(), so entries can be added or removed freely.
+
+const char * roblox_set[] = { // Alternative image list (36 diffuse.tga files under Roblox/); its use in main() is currently commented out.
+    "Roblox/asphalt_side/diffuse.tga", // One entry per directory; every entry points at that directory's diffuse.tga.
+    "Roblox/asphalt_top/diffuse.tga",
+    "Roblox/basalt/diffuse.tga",
+    "Roblox/brick/diffuse.tga",
+    "Roblox/cobblestone_side/diffuse.tga",
+    "Roblox/cobblestone_top/diffuse.tga",
+    "Roblox/concrete_side/diffuse.tga",
+    "Roblox/concrete_top/diffuse.tga",
+    "Roblox/crackedlava/diffuse.tga",
+    "Roblox/glacier_bottom/diffuse.tga",
+    "Roblox/glacier_side/diffuse.tga",
+    "Roblox/glacier_top/diffuse.tga",
+    "Roblox/grass_bottom/diffuse.tga",
+    "Roblox/grass_side/diffuse.tga",
+    "Roblox/grass_top/diffuse.tga",
+    "Roblox/ground/diffuse.tga",
+    "Roblox/ice_side/diffuse.tga",
+    "Roblox/ice_top/diffuse.tga",
+    "Roblox/leafygrass_side/diffuse.tga",
+    "Roblox/leafygrass_top/diffuse.tga",
+    "Roblox/limestone_side/diffuse.tga",
+    "Roblox/limestone_top/diffuse.tga",
+    "Roblox/mud/diffuse.tga",
+    "Roblox/pavement_side/diffuse.tga",
+    "Roblox/pavement_top/diffuse.tga",
+    "Roblox/rock/diffuse.tga",
+    "Roblox/salt_side/diffuse.tga",
+    "Roblox/salt_top/diffuse.tga",
+    "Roblox/sand_side/diffuse.tga",
+    "Roblox/sand_top/diffuse.tga",
+    "Roblox/sandstone_bottom/diffuse.tga",
+    "Roblox/sandstone_side/diffuse.tga",
+    "Roblox/sandstone_top/diffuse.tga",
+    "Roblox/slate/diffuse.tga",
+    "Roblox/snow/diffuse.tga",
+    "Roblox/woodplanks/diffuse.tga",
+}; // To benchmark this list, point main()'s `set` / `count` at roblox_set instead of image_set.
+
+
+
+
+int main(int argc, char *argv[])
+{
+    // Single-image block analysis; kodim18/kodim15 or waterloo/frymire.png are
+    // reasonable alternatives to try here.
+    const char * analysisImage = "testsuite/kodak/kodim14.png";
+    analyze_bc1(analysisImage);
+
+    // Benchmark corpus. Switch both lines to roblox_set to run that set instead.
+    const char ** imagePaths = image_set;
+    const int imageCount = sizeof(image_set) / sizeof(image_set[0]);
+
+    // One Stats slot per compressor; rows are printed only for slots whose name is non-null.
+    Stats stats[COMPRESSOR_COUNT];
+    for (Stats & slot : stats) {
+        slot.compressorName = nullptr;
+        slot.mseArray.resize(imageCount, 0.0f);
+        slot.timeArray.resize(imageCount, 0.0f);
+    }
+
+    // Per-image pass: run the benchmark, then print one row per named slot.
+    for (int img = 0; img < imageCount; ++img) {
+        printf("\nImage '%s'\n", imagePaths[img]);
+
+        test_bc1(imagePaths[img], img, stats);
+
+        for (int k = 0; k < COMPRESSOR_COUNT; ++k) {
+            Stats & slot = stats[k];
+            if (!slot.compressorName) {
+                continue;
+            }
+            printf("%-16s %f\t%f\n", slot.compressorName, sqrtf(slot.mseArray[img]), slot.timeArray[img]);
+        }
+    }
+
+    // Summary row: sqrt of the mean mseArray entry, plus the summed (not averaged) timeArray.
+    printf("\nAverage Results:\n");
+    for (Stats & slot : stats) {
+        if (!slot.compressorName) {
+            continue;
+        }
+
+        float mseTotal = 0.0f;
+        for (float mse : slot.mseArray) {
+            mseTotal += mse;
+        }
+        const float meanMse = mseTotal / imageCount;
+
+        float timeTotal = 0.0f;
+        for (float elapsed : slot.timeArray) {
+            timeTotal += elapsed;
+        }
+
+        printf("%-16s %f\t%f\n", slot.compressorName, sqrtf(meanMse), timeTotal);
+    }
+
+    return EXIT_SUCCESS;
+}