Rework ranges library

Better matches the standard library, and iterators moved to their own file
Add subrange template
2024-09-13 06:37:34 +00:00 · 2022-07-05 22:51:25 -07:00 · 2022-07-03 19:08:15 -07:00 · 2022-07-03 11:56:37 -07:00 · 2022-07-02 17:14:12 -07:00 · 2022-07-02 17:02:28 -07:00
78 changed files with 3407 additions and 973 deletions
--- a/.clang-format
+++ b/.clang-format
@ -1,10 +1,11 @@
 ---
 BasedOnStyle: google
 IndentWidth: 4
-ColumnLimit: 160
+ColumnLimit: 120
 AllowShortBlocksOnASingleLine: Always
 AllowShortFunctionsOnASingleLine: All
 AlwaysBreakTemplateDeclarations: MultiLine
+#RequiresClausePositionStyle: SingleLine # requires Clang 15 :(
 #AlignConsecutiveDeclarations: true
 ---

--- a/.clang-tidy
+++ b/.clang-tidy
@ -2,15 +2,15 @@ FormatStyle: google

 Checks: '-*,clang-diagnostic-*,llvm-*,misc-*,-misc-unused-parameters,-misc-non-private-member-variables-in-classes,readability-identifier-naming,cppcoreguidelines-narrowing-conversions'
 CheckOptions:
-  - { key: readability-identifier-naming.NamespaceCase,          value: lower_case }
-  - { key: readability-identifier-naming.ClassCase,              value: CamelCase }
-  - { key: readability-identifier-naming.StructCase,             value: CamelCase }
-  - { key: readability-identifier-naming.TemplateParameterCase,  value: CamelCase }
-  - { key: readability-identifier-naming.FunctionCase,           value: aNy_CasE }
-  - { key: readability-identifier-naming.VariableCase,           value: lower_case }
-  - { key: readability-identifier-naming.MemberCase,        value: lower_case }
-  - { key: readability-identifier-naming.PrivateMemberPrefix,    value: _ }
-  - { key: readability-identifier-naming.ProtectedMemberPrefix,  value: _ }
+  - { key: readability-identifier-naming.NamespaceCase,            value: lower_case }
+  - { key: readability-identifier-naming.ClassCase,                value: CamelCase }
+  - { key: readability-identifier-naming.StructCase,               value: CamelCase }
+  - { key: readability-identifier-naming.TemplateParameterCase,    value: CamelCase }
+  - { key: readability-identifier-naming.FunctionCase,             value: lower_case }
+  - { key: readability-identifier-naming.VariableCase,             value: lower_case }
+  - { key: readability-identifier-naming.MemberCase,               value: lower_case }
+  - { key: readability-identifier-naming.PrivateMemberPrefix,      value: _ }
+  - { key: readability-identifier-naming.ProtectedMemberPrefix,    value: _ }
  - { key: readability-identifier-naming.EnumConstantCase,         value: CamelCase }
  - { key: readability-identifier-naming.ConstexprVariableCase,    value: CamelCase }
  - { key: readability-identifier-naming.GlobalConstantCase,       value: CamelCase }
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@ -6,23 +6,39 @@ name: Python Package
 on: [ push, pull_request ]

 jobs:
-  build-sdist:
-    name: Build SDist
-    runs-on: ubuntu-latest
-
+  test:
+    name: Run Unit Tests
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ macos-12, windows-latest, ubuntu-latest ]
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
+          submodules: recursive
+
+      - name: Set up GCC
+        if: runner.os == 'linux'
+        uses: egor-tensin/setup-gcc@v1
+        with:
+          version: 10
+
+      - name: Setup cmake
+        uses: jwlawson/actions-setup-cmake@v1.12
+        with:
+          cmake-version: 'latest'
+          github-api-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Python
        uses: actions/setup-python@v3.1.2
+        with:
+          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          python -m pip install flake8
-          python -m pip install setuptools twine build
+          python -m pip install flake8 pybind11

      - name: Lint with flake8
        run: |
@ -31,6 +47,35 @@ jobs:
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics

+      - name: Build C code
+        run: |
+          ls
+          cmake -S . -B build -DQUICKTEX_NOPYTHON=TRUE -DCMAKE_BUILD_TYPE=Debug
+          cmake --build build
+
+      - name: Test C code
+        run: |
+          ctest -V --test-dir build -C Debug
+
+  build-sdist:
+    name: Build SDist
+    runs-on: ubuntu-latest
+    needs: test
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          submodules: recursive
+
+      - name: Set up Python
+        uses: actions/setup-python@v3.1.2
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install setuptools twine build
+
      - name: Build SDist
        run: python -m build --sdist

@ -45,10 +90,11 @@ jobs:
  build-wheels:
    name: Build Wheels on ${{ matrix.os }} ${{ matrix.arch[0] }}
    runs-on: ${{ matrix.os }}
+    needs: test
    strategy:
      matrix:
-        os: [ macos-11, windows-latest, ubuntu-latest ]
-        arch: [ ['x86', 'x86_64', 'AMD64', 'x86_64' ] ] #[suffix, mac, windows, linux] arch names
+        os: [ macos-12, windows-latest, ubuntu-latest ]
+        arch: [ [ 'x86', 'x86_64', 'AMD64', 'x86_64' ] ] #[suffix, mac, windows, linux] arch names
        include:
          - os: ubuntu-latest
            arch: [ 'ARM', 'arm64', 'ARM64', 'aarch64' ]
@ -57,6 +103,7 @@ jobs:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
+          submodules: recursive

      - name: Install libomp
        if: runner.os == 'macOS'
@ -78,7 +125,7 @@ jobs:
      - name: Build wheels
        uses: pypa/cibuildwheel@2.5.0
        env:
-          MACOSX_DEPLOYMENT_TARGET: "10.15"
+          MACOSX_DEPLOYMENT_TARGET: "10.9"
          CIBW_ARCHS_LINUX: ${{ matrix.arch[3] }}

      - name: Upload Wheels
--- a/.gitignore
+++ b/.gitignore
@ -33,3 +33,4 @@ compile_commands.json
 CTestTestfile.cmake
 _deps
 cmake-build-*
+*.a
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
+[submodule "external/xsimd"]
+	path = external/xsimd
+	url = https://github.com/xtensor-stack/xsimd.git
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,13 @@

 All notable changes to this project will be documented in this file

+## Unreleased
+
+### Fixed
+
+- Fixed LeastSquares endpoint mode producing incorrect results
+
+
 ## 0.1.3 - 2022-04-13

 ### Fixed
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,63 +1,14 @@
 cmake_minimum_required(VERSION 3.18)
 include(tools/CompilerWarnings.cmake)
+include(tools/SIMDFlags.cmake)
 set(CMAKE_VERBOSE_MAKEFILE ON)

 project(quicktex)

-# Find dependencies
-find_package(Python COMPONENTS Interpreter Development.Module)
-find_package(pybind11 CONFIG REQUIRED)
-find_package(OpenMP)
+add_subdirectory(external/xsimd)

-# Collect source files
-file(GLOB SOURCE_FILES
-        "quicktex/*.cpp"
-        "quicktex/s3tc/*.cpp"
-        "quicktex/s3tc/bc1/*.cpp"
-        "quicktex/s3tc/bc3/*.cpp"
-        "quicktex/s3tc/bc4/*.cpp"
-        "quicktex/s3tc/bc5/*.cpp"
-        "quicktex/s3tc/interpolator/*.cpp"
-        )
+add_subdirectory(quicktex)
+add_subdirectory(tests)

-file(GLOB HEADER_FILES
-        "quicktex/*.h"
-        "quicktex/s3tc/*.h"
-        "quicktex/s3tc/bc1/*.h"
-        "quicktex/s3tc/bc3/*.h"
-        "quicktex/s3tc/bc4/*.h"
-        "quicktex/s3tc/bc5/*.h"
-        "quicktex/s3tc/interpolator/*.h"
-        )
-
-file(GLOB_RECURSE PYTHON_FILES "src/**/*.py")
-
-# Organize source files together for some IDEs
-source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SOURCE_FILES} ${HEADER_FILES} ${PYTHON_FILES})
-
-# Add python module
-pybind11_add_module(_quicktex
-        ${SOURCE_FILES}
-        ${HEADER_FILES})
-
-# Set Quicktex version info
-target_compile_definitions(_quicktex PRIVATE VERSION_INFO=${QUICKTEX_VERSION_INFO})
-
-# enable openMP if available
-if (OpenMP_CXX_FOUND)
-    target_link_libraries(_quicktex PUBLIC OpenMP::OpenMP_CXX)
-endif ()
-
-# Set module features, like C/C++ standards
-target_compile_features(_quicktex PUBLIC cxx_std_17 c_std_11)
-
-# Set compiler warnings
-set_project_warnings(_quicktex)
-
-# Clang-specific
-if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang")
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++ -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer")
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -stdlib=libc++ -fsanitize=undefined")
-    set(PROJECT_WARNINGS ${CLANG_WARNINGS})
-endif ()
+enable_testing ()
+add_test (NAME QuicktexTest COMMAND Test)
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -0,0 +1,2 @@
+graft external
+global-exclude *.afdesign # this is currently the vast majority of the repo size
--- a/external/xsimd
+++ b/external/xsimd
--- a/pyproject.toml
+++ b/pyproject.toml
@ -64,8 +64,10 @@ package-dir = { '' = '.' } # without this line, C++ source files get included in
 [tool.cibuildwheel]
 build = "cp*" # only build wheels for cpython.
 build-frontend = "build"
-test-command = "pytest {project}/tests --verbose"
+test-command = "pytest {project}/tests --verbose --full-trace --capture=tee-sys"
 test-extras = ["tests"]
+test-skip = "*-macosx_arm64 *-macosx_universal2:arm64" # skip testing on arm macOS because CIBW doesnt support it
+environment = { QUICKTEX_SIMD_MODE = "SSE4" } # SSE4 has a 99% market share and was released under the Bush administration

 [tool.cibuildwheel.macos]
 archs = ["x86_64", "universal2"] # build fat binaries, or x86-64 for python 3.7
@ -75,11 +77,16 @@ skip = ["cp{38,39,31*}-macosx_x86_64"] # skip x86-only builds where fat binaries
 archs = ["auto64"] # arm64 windows builds not yet supported

 [tool.cibuildwheel.linux]
-skip = ["cp37-musllinux*", "*musllinux_aarch64*"] # skip targets without available Pillow wheels
+skip = ["*musllinux*"]
 manylinux-x86_64-image = "manylinux2014"
 manylinux-aarch64-image = "manylinux2014"

 [tool.black]
 line-length = 120 # 80-column is stupid
 target-version = ['py37', 'py38', 'py39', 'py310']
-skip-string-normalization = true
+skip-string-normalization = true
+
+[tool.pytest.ini_options]
+minversion = "7.0"
+addopts = ["--full-trace", "--capture=tee-sys"]
+testpaths = ["tests"]
--- a/quicktex/CMakeLists.txt
+++ b/quicktex/CMakeLists.txt
@ -0,0 +1,71 @@
+
+# Find dependencies
+if (NOT QUICKTEX_NOPYTHON)
+    find_package(Python COMPONENTS Interpreter Development.Module)
+    find_package(pybind11 CONFIG REQUIRED)
+endif ()
+find_package(OpenMP)
+
+#Collect source files
+set(SOURCE_FILES
+        Matrix4x4.cpp OldColor.cpp
+        s3tc/bc1/BC1Block.cpp s3tc/bc1/BC1Decoder.cpp
+        s3tc/bc1/BC1Encoder.cpp s3tc/bc1/OrderTable.cpp s3tc/bc1/OrderTable4.cpp
+        s3tc/bc3/BC3Decoder.cpp s3tc/bc3/BC3Encoder.cpp
+        s3tc/bc4/BC4Block.cpp s3tc/bc4/BC4Decoder.cpp s3tc/bc4/BC4Encoder.cpp
+        s3tc/bc5/BC5Decoder.cpp s3tc/bc5/BC5Encoder.cpp
+        s3tc/interpolator/Interpolator.cpp
+        texture/RawTexture.cpp texture/Window.cpp test.cpp)
+
+set(BINDING_FILES
+        _bindings.cpp
+        s3tc/_bindings.cpp
+        s3tc/bc1/_bindings.cpp
+        s3tc/bc3/_bindings.cpp
+        s3tc/bc4/_bindings.cpp
+        s3tc/bc5/_bindings.cpp
+        s3tc/interpolator/_bindings.cpp)
+
+file(GLOB_RECURSE HEADER_FILES "**.h")
+file(GLOB_RECURSE PYTHON_FILES "**.py")
+
+# Organize source files together for some IDEs
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SOURCE_FILES} ${BINDING_FILES} ${HEADER_FILES} ${PYTHON_FILES})
+
+# Declare implementation module
+add_library(quicktex STATIC ${SOURCE_FILES} ${HEADER_FILES})
+
+# Link openMP if available
+if (OpenMP_CXX_FOUND)
+    target_link_libraries(quicktex PUBLIC OpenMP::OpenMP_CXX)
+endif ()
+
+# Link XSimd
+target_link_libraries(quicktex PUBLIC xsimd)
+
+# Set library features, like C/C++ standards
+target_compile_features(quicktex PUBLIC cxx_std_20 c_std_11)
+set_property(TARGET quicktex PROPERTY CXX_VISIBILITY_PRESET hidden)
+set_property(TARGET quicktex PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+# Include source root for project-relative includes
+target_include_directories(quicktex PUBLIC .)
+
+# Set compiler warnings and SIMD flags
+set_project_warnings(quicktex)
+set_simd_flags(quicktex)
+
+if (NOT QUICKTEX_NOPYTHON)
+    # Declare python module
+    pybind11_add_module(_quicktex ${BINDING_FILES} ${HEADER_FILES})
+    target_compile_definitions(_quicktex PRIVATE VERSION_INFO=${QUICKTEX_VERSION_INFO})
+
+    # Link python module with implementation
+    target_link_libraries(_quicktex PUBLIC quicktex)
+
+    if ((NOT MSVC) AND (CMAKE_BUILD_TYPE MATCHES Debug) AND ($ENV{QUICKTEX_SANITIZE}))
+        target_compile_options(_quicktex PUBLIC -fsanitize=address,undefined -fno-sanitize-recover=address,undefined -fno-omit-frame-pointer)
+        target_link_options(_quicktex PUBLIC -fsanitize=address,undefined -fno-sanitize-recover=address,undefined -fno-omit-frame-pointer)
+    endif ()
+
+endif ()
--- a/quicktex/Color.h
+++ b/quicktex/Color.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -18,82 +18,60 @@
 */

 #pragma once
-#include <cassert>  // for assert
-#include <cstddef>  // for size_t
-#include <cstdint>  // for uint8_t, uint16_t
+#include "Matrix.h"
+#include "util/bitbash.h"

 namespace quicktex {
-class Vector4;
-class Vector4Int;

-#pragma pack(push, 1)
-class Color {
+using Color = Vec<uint8_t, 4>;
+using ColorRGB = Vec<uint8_t, 3>;
+
+constexpr size_t uint5_max = (1 << 5) - 1;
+constexpr size_t uint6_max = (1 << 6) - 1;
+
+template <size_t N> struct MidpointTable {
   public:
-    uint8_t r;
-    uint8_t g;
-    uint8_t b;
-    uint8_t a;
-
-    constexpr Color() : Color(0, 0, 0, 0xFF) {}
-
-    constexpr Color(uint8_t vr, uint8_t vg, uint8_t vb, uint8_t va = 0xFF) : r(vr), g(vg), b(vb), a(va) {}
-
-    Color(Vector4Int v);
-
-    static uint16_t Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b);
-    static uint16_t Pack565(uint8_t r, uint8_t g, uint8_t b);
-
-    static Color Unpack565Unscaled(uint16_t Packed);
-    static Color Unpack565(uint16_t Packed);
-
-    static Color PreciseRound565(Vector4 &v);
-
-    static Color Min(const Color &A, const Color &B);
-    static Color Max(const Color &A, const Color &B);
-
-    bool operator==(const Color &Rhs) const;
-    bool operator!=(const Color &Rhs) const;
-
-    uint8_t operator[](size_t index) const {
-        assert(index < 4);
-        return reinterpret_cast<const uint8_t *>(this)[index];
-    }
-    uint8_t &operator[](size_t index) {
-        assert(index < 4);
-        return reinterpret_cast<uint8_t *>(this)[index];
+    constexpr MidpointTable() : _values() {
+        constexpr float fN = (float)N;
+        for (unsigned i = 0; i < N - 1; i++) { _values[i] = ((float)i / fN) + (0.5f / fN); }
+        _values[N - 1] = 1e+37f;
    }

-    operator Vector4() const;
-    operator Vector4Int() const;
-    friend Vector4Int operator-(const Color &lhs, const Color &rhs);
-
-    void SetRGB(uint8_t vr, uint8_t vg, uint8_t vb);
-    void SetRGB(const Color &other) { SetRGB(other.r, other.g, other.b); }
-
-    uint16_t Pack565() const;
-    uint16_t Pack565Unscaled() const;
-
-    Color ScaleTo565() const;
-    Color ScaleFrom565() const;
-
-    size_t MinChannelRGB();
-    size_t MaxChannelRGB();
-
-    bool IsGrayscale() const { return ((r == g) && (r == b)); }
-    bool IsBlack() const { return (r | g | b) < 4; }
-
-    int GetLuma() const { return (13938U * r + 46869U * g + 4729U * b + 32768U) >> 16U; }  // REC709 weightings
+    float operator[](size_t i) const {
+        assert(i < N);
+        return _values[i];
+    }

   private:
-    static constexpr float Midpoints5bit[32] = {.015686f, .047059f, .078431f, .111765f, .145098f, .176471f, .207843f, .241176f, .274510f, .305882f, .337255f,
-                                                .370588f, .403922f, .435294f, .466667f, .5f,      .533333f, .564706f, .596078f, .629412f, .662745f, .694118f,
-                                                .725490f, .758824f, .792157f, .823529f, .854902f, .888235f, .921569f, .952941f, .984314f, 1e+37f};
-    static constexpr float Midpoints6bit[64] = {.007843f, .023529f, .039216f, .054902f, .070588f, .086275f, .101961f, .117647f, .133333f, .149020f, .164706f,
-                                                .180392f, .196078f, .211765f, .227451f, .245098f, .262745f, .278431f, .294118f, .309804f, .325490f, .341176f,
-                                                .356863f, .372549f, .388235f, .403922f, .419608f, .435294f, .450980f, .466667f, .482353f, .500000f, .517647f,
-                                                .533333f, .549020f, .564706f, .580392f, .596078f, .611765f, .627451f, .643137f, .658824f, .674510f, .690196f,
-                                                .705882f, .721569f, .737255f, .754902f, .772549f, .788235f, .803922f, .819608f, .835294f, .850980f, .866667f,
-                                                .882353f, .898039f, .913725f, .929412f, .945098f, .960784f, .976471f, .992157f, 1e+37f};
+    float _values[N];
 };
-#pragma pack(pop)
+
+constexpr MidpointTable<32> Midpoints5bit;
+constexpr MidpointTable<64> Midpoints6bit;
+
+template <typename T> Vec<T, 3> scale_to_565(Vec<T, 3> unscaled) {
+    return Vec<T, 3>{scale_from_8<T, 5>(unscaled.r()), scale_from_8<T, 6>(unscaled.g()),
+                     scale_from_8<T, 5>(unscaled.b())};
+}
+
+template <typename T> Vec<T, 3> scale_from_565(Vec<T, 3> scaled) {
+    return Vec<T, 3>{scale_to_8<T, 5>(scaled.r()), scale_to_8<T, 6>(scaled.g()), scale_to_8<T, 5>(scaled.b())};
+}
+
+template <typename T = int16_t> Vec<T, 3> precise_round_565(Vec<float, 3> &v) {
+    auto scaled = v * Vec<float, 3>{uint5_max, uint6_max, uint5_max};       // rescale by from (0,1) to (0,int_max)
+    auto rounded = (Vec<T, 3>)scaled;                                       // downcast to integral type
+    rounded = rounded.clamp({0, 0, 0}, {uint5_max, uint6_max, uint5_max});  // clamp to avoid out of bounds float errors
+
+    // increment each channel if above the rounding point
+    if (v.r() > Midpoints5bit[rounded.r()]) rounded.r()++;
+    if (v.g() > Midpoints6bit[rounded.g()]) rounded.g()++;
+    if (v.b() > Midpoints5bit[rounded.b()]) rounded.b()++;
+
+    assert(rounded.r() <= uint5_max);
+    assert(rounded.g() <= uint6_max);
+    assert(rounded.b() <= uint5_max);
+
+    return rounded;
+}
 }  // namespace quicktex
--- a/quicktex/ColorBlock.h
+++ b/quicktex/ColorBlock.h
@ -25,7 +25,7 @@
 #include <cstring>
 #include <stdexcept>

-#include "Color.h"
+#include "OldColor.h"
 #include "Vector4Int.h"

 namespace quicktex {
@ -34,9 +34,9 @@ using Coords = std::tuple<int, int>;
 template <int N, int M> class ColorBlock  {
   public:
    struct Metrics {
-        Color min;
-        Color max;
-        Color avg;
+        OldColor min;
+        OldColor max;
+        OldColor avg;
        bool is_greyscale;
        bool has_black;
        Vector4Int sums;
@ -45,37 +45,37 @@ template <int N, int M> class ColorBlock  {
    static constexpr int Width = N;
    static constexpr int Height = M;

-    constexpr Color Get(int x, int y) const {
+    constexpr OldColor Get(int x, int y) const {
        if (x >= Width || x < 0) throw std::invalid_argument("x value out of range");
        if (y >= Height || y < 0) throw std::invalid_argument("y value out of range");

        return _pixels[x + (N * y)];
    }

-    constexpr Color Get(int i) const {
+    constexpr OldColor Get(int i) const {
        if (i >= N * M || i < 0) throw std::invalid_argument("i value out of range");
        return _pixels[i];
    }

-    void Set(int x, int y, const Color &value) {
+    void Set(int x, int y, const OldColor &value) {
        if (x >= Width || x < 0) throw std::invalid_argument("x value out of range");
        if (y >= Height || y < 0) throw std::invalid_argument("y value out of range");
        _pixels[x + (N * y)] = value;
    }

-    void Set(int i, const Color &value) {
+    void Set(int i, const OldColor &value) {
        if (i >= N * M || i < 0) throw std::invalid_argument("i value out of range");
        _pixels[i] = value;
    }

-    void GetRow(int y, Color *dst) const {
+    void GetRow(int y, OldColor *dst) const {
        if (y >= Height || y < 0) throw std::invalid_argument("y value out of range");
-        std::memcpy(dst, &_pixels[N * y], N * sizeof(Color));
+        std::memcpy(dst, &_pixels[N * y], N * sizeof(OldColor));
    }

-    void SetRow(int y, const Color *src) {
+    void SetRow(int y, const OldColor *src) {
        if (y >= Height || y < 0) throw std::invalid_argument("y value out of range");
-        std::memcpy(&_pixels[N * y], src, N * sizeof(Color));
+        std::memcpy(&_pixels[N * y], src, N * sizeof(OldColor));
    }

    bool IsSingleColor() const {
@ -88,8 +88,8 @@ template <int N, int M> class ColorBlock  {

    Metrics GetMetrics(bool ignore_black = false) const {
        Metrics metrics;
-        metrics.min = Color(UINT8_MAX, UINT8_MAX, UINT8_MAX);
-        metrics.max = Color(0, 0, 0);
+        metrics.min = OldColor(UINT8_MAX, UINT8_MAX, UINT8_MAX);
+        metrics.max = OldColor(0, 0, 0);
        metrics.has_black = false;
        metrics.is_greyscale = true;
        metrics.sums = {0, 0, 0};
@ -97,7 +97,7 @@ template <int N, int M> class ColorBlock  {
        unsigned total = 0;

        for (unsigned i = 0; i < M * N; i++) {
-            Color val = Get(i);
+            OldColor val = Get(i);
            bool is_black = val.IsBlack();

            metrics.has_black |= is_black;
@ -118,7 +118,7 @@ template <int N, int M> class ColorBlock  {
    }

   private:
-    std::array<Color, N * M> _pixels;
+    std::array<OldColor, N * M> _pixels;
 };

 }  // namespace quicktex
--- a/quicktex/Decoder.h
+++ b/quicktex/Decoder.h
@ -22,7 +22,7 @@
 #include <memory>

 #include "ColorBlock.h"
-#include "Texture.h"
+#include "texture/RawTexture.h"

 namespace quicktex {

@ -46,19 +46,19 @@ template <class T> class BlockDecoder : public Decoder<T> {
    virtual DecodedBlock DecodeBlock(const EncodedBlock &block) const = 0;

    virtual RawTexture Decode(const T &encoded) const override {
-        auto decoded = RawTexture(encoded.Width(), encoded.Height());
+        auto decoded = RawTexture(encoded.width, encoded.height);

-        int blocks_x = encoded.BlocksX();
-        int blocks_y = encoded.BlocksY();
+        int blocks_x = encoded.bwidth();
+        int blocks_y = encoded.bheight();

        // from experimentation, multithreading this using OpenMP actually makes decoding slower
        // due to thread creation/teardown taking longer than the decoding process itself.
        // As a result, this is left as a serial operation despite being embarassingly parallelizable
        for (int y = 0; y < blocks_y; y++) {
            for (int x = 0; x < blocks_x; x++) {
-                auto block = encoded.GetBlock(x, y);
+                auto block = encoded.get_block(x, y);
                auto pixels = DecodeBlock(block);
-                decoded.SetBlock<BlockWidth, BlockHeight>(x, y, pixels);
+                decoded.set_block<BlockWidth, BlockHeight>(x, y, pixels);
            }
        }

--- a/quicktex/Encoder.h
+++ b/quicktex/Encoder.h
@ -22,7 +22,7 @@
 #include <memory>

 #include "ColorBlock.h"
-#include "Texture.h"
+#include "texture/RawTexture.h"

 namespace quicktex {

@ -46,21 +46,22 @@ template <typename T> class BlockEncoder : public Encoder<T> {
    virtual EncodedBlock EncodeBlock(const DecodedBlock &block) const = 0;

    virtual T Encode(const RawTexture &decoded) const override {
-        auto encoded = T(decoded.Width(), decoded.Height());
+        auto encoded = T(decoded.width, decoded.height);

-        int blocks_x = encoded.BlocksX();
-        int blocks_y = encoded.BlocksY();
+        unsigned blocks_x = encoded.bwidth();
+        unsigned blocks_y = encoded.bheight();

        // from experimentation, multithreading this using OpenMP sometimes actually makes encoding slower
        // due to thread creation/teardown taking longer than the encoding process itself.
        // As a result, this is sometimes left as a serial operation despite being embarassingly parallelizable
        // threshold for number of blocks before multithreading is set by overriding MTThreshold()
 #pragma omp parallel for if (blocks_x * blocks_y >= MTThreshold())
-        for (int y = 0; y < blocks_y; y++) {
-            for (int x = 0; x < blocks_x; x++) {
-                auto pixels = decoded.GetBlock<BlockWidth, BlockHeight>(x, y);
+        for (int y = 0; y < (int)blocks_y; y++) {
+            for (int x = 0; x < (int)blocks_x; x++) {
+                // index variables have to be signed for MSVC for some reason
+                auto pixels = decoded.get_block<BlockWidth, BlockHeight>(x, y);
                auto block = EncodeBlock(pixels);
-                encoded.SetBlock(x, y, block);
+                encoded.set_block(x, y, block);
            }
        }

--- a/quicktex/Matrix.h
+++ b/quicktex/Matrix.h
@ -0,0 +1,457 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <numeric>
+#include <xsimd/xsimd.hpp>
+
+#include "util/iterator.h"
+#include "util/map.h"
+#include "util/math.h"
+#include "util/ranges.h"
+
+namespace quicktex {
+
+template <typename T, int M, int N> class Matrix;
+
+template <typename T, int M> using Vec = Matrix<T, M, 1>;
+
+// region helper concepts
+template <typename L, typename R, typename Op>
+concept operable = requires(L &l, R &r, Op &op) { op(l, r); };
+
+// Satisfied only by (possibly cv-qualified) Matrix specializations; width/height are static data members, not calls
+template <typename V>
+concept is_matrix = requires {
+                        typename V::value_type;
+                        V::width;
+                        V::height;
+                    } && std::same_as<Matrix<typename V::value_type, V::height, V::width>, std::remove_cvref_t<V>>;
+
+template <typename V> struct vector_stats {
+    static constexpr int width = 1;
+    static constexpr int height = 1;
+    static constexpr int dims = 0;
+};
+
+template <typename V>
+    requires is_matrix<V>
+struct vector_stats<V> {
+    static constexpr int width = V::width;
+    static constexpr int height = V::height;
+    static constexpr int dims = V::dims;
+};
+
+template <typename V> constexpr int vector_width = vector_stats<V>::width;
+template <typename V> constexpr int vector_height = vector_stats<V>::height;
+template <typename V> constexpr int vector_dims = vector_stats<V>::dims;
+
+// endregion
+
+template <typename R, typename T, int N> class VecBase {
+   public:
+    constexpr VecBase(T scalar = T()) : _c{} { _c.fill(scalar); }
+
+   protected:
+    const R &_at(int index) const { return _c.at(index); }
+    R &_at(int index) { return _c.at(index); }
+
+    constexpr auto _begin() const { return _c.data(); }
+    constexpr auto _begin() { return _c.data(); }
+    constexpr auto _end() const { return _c.data() + N; }
+    constexpr auto _end() { return _c.data() + N; }
+
+   private:
+    std::array<R, N> _c;
+};
+
+template <typename T, int N, int M> using matrix_row_type = std::conditional_t<N <= 1, T, Vec<T, N>>;
+template <typename T, int N, int M> using matrix_column_type = std::conditional_t<M <= 1, T, Vec<T, M>>;
+
+/**
+ * A matrix of values that can be operated on
+ * @tparam T Scalar type
+ * @tparam N Width of the matrix
+ * @tparam M Height of the matrix
+ */
+template <typename T, int M, int N>
+class Matrix : public VecBase<std::conditional_t<N == 1, T, VecBase<T, T, N>>, T, M> {
+   public:
+    using base = VecBase<std::conditional_t<N == 1, T, VecBase<T, T, N>>, T, M>;
+
+    using value_type = T;
+    using row_type = matrix_row_type<T, N, M>;
+    using column_type = matrix_column_type<T, N, M>;
+
+    using base::base;
+    //    using base::begin;
+    //    using base::end;
+    //    using base::operator[];
+
+    // region constructors
+    /**
+     * Create a vector from an initializer list
+     * @param il values to populate with
+     */
+    Matrix(std::initializer_list<row_type> il) : base() {
+        assert(il.size() == M);  // ensure il is of the right size
+        std::copy_n(il.begin(), M, this->begin());
+    }
+
+    /**
+     * Create a vector from a scalar value
+     * @param scalar value to populate with
+     */
+    //    constexpr Matrix(const T &scalar) { std::fill(this->begin(), this->end(), scalar); }
+
+    /**
+     * Create a vector from an iterator
+     * @tparam II input iterator type
+     * @param input_iterator iterator to copy from
+     */
+    template <typename II>
+        requires std::input_iterator<II> && std::convertible_to<std::iter_value_t<II>,
+                                                                const row_type>
+        Matrix(const II input_iterator) : base() {
+        std::copy_n(input_iterator, M, this->begin());
+    }
+
+    /**
+     * Create a vector from a range type
+     * @tparam R Range type
+     * @param input_range Range to copy from
+     */
+    template <typename R>
+    Matrix(const R &input_range)
+        requires range<R> && std::convertible_to<typename R::value_type, row_type>
+    : Matrix(input_range.begin()) {
+        assert(std::distance(input_range.begin(), input_range.end()) == M);
+    }
+
+    template <typename R = T>
+        requires(N == M)
+    static constexpr Matrix identity() {
+        Matrix result = Matrix(0);
+        for (int i = 0; i < N; i++) { result.element(i, i) = 1; }
+        return result;
+    }
+    // endregion
+
+    // region iterators and accessors
+    static constexpr int size() { return M; }
+    static constexpr int width = N;
+    static constexpr int height = M;
+    static constexpr int elements = N * M;
+    static constexpr int dims = ((width > 1) ? 1 : 0) + ((height > 1) ? 1 : 0);
+
+    const row_type &at(int index) const {
+        assert(index >= 0 && index < M);
+        return static_cast<const row_type &>(base::_at(index));
+    }
+    row_type &at(int index) {
+        assert(index >= 0 && index < M);
+        return static_cast<row_type &>(base::_at(index));
+    }
+
+    const row_type &operator[](int index) const { return at(index); }
+    row_type &operator[](int index) { return at(index); }
+
+    const row_type *begin() const { return static_cast<const row_type *>(base::_begin()); }
+    row_type *begin() { return static_cast<row_type *>(base::_begin()); }
+
+    const row_type *end() const { return static_cast<const row_type *>(base::_end()); }
+    row_type *end() { return static_cast<row_type *>(base::_end()); }
+
+    // column-wise iteration: iterator positions run from column 0 up to (not including) column N
+    auto column_begin() const { return column_iterator(this, 0); }
+    auto column_end() const { return column_iterator(this, N); }
+
+    // element-wise iteration over linear indices, starting at index 0
+    auto all_begin() const { return linear_iterator<const Matrix>(this, 0); }
+    auto all_begin() { return linear_iterator<Matrix>(this, 0); }
+
+    // end position for element-wise iteration: linear index N * M
+    auto all_end() const { return linear_iterator<const Matrix>(this, N * M); }
+    auto all_end() { return linear_iterator<Matrix>(this, N * M); }
+
+    // read row m; at() already yields a const row_type reference here
+    const row_type &get_row(int m) const { return this->at(m); }
+    // overwrite row m with anything assignable to row_type
+    template <typename R> void set_row(int m, const R &value) {
+        row_type &target = this->at(m);
+        target = value;
+    }
+
+    // gather column n; for a single-row matrix this is just the one element in that column
+    template <typename S = T> column_type get_column(int n) const {
+        if constexpr (M != 1) {
+            column_type column;
+            for (int row = 0; row < M; ++row) { column[row] = element(row, n); }
+            return column;
+        } else {
+            return element(0, n);
+        }
+    }
+
+    // scatter `value` into column n; for a single-row matrix `value` is written to the one element directly
+    void set_column(int n, const column_type &value) {
+        if constexpr (M != 1) {
+            for (int row = 0; row < M; ++row) { element(row, n) = value[row]; }
+        } else {
+            element(0, n) = value;
+        }
+    }
+
+    // n/m accessors
+    // element at row m, column n; with a single column the row itself is the element
+    const T &element(int m, int n) const {
+        if constexpr (N != 1) {
+            return this->at(m)[n];
+        } else {
+            return this->at(m);
+        }
+    }
+
+    // mutable element at row m, column n: reuses the const overload and strips the const it adds
+    T &element(int m, int n) { return const_cast<T &>(static_cast<const Matrix &>(*this).element(m, n)); }
+
+    // linear accessors
+    // linear index i maps to row i / N and column i % N
+    const T &element(int i) const {
+        const int row = i / N;
+        const int col = i % N;
+        return element(row, col);
+    }
+    T &element(int i) {
+        const int row = i / N;
+        const int col = i % N;
+        return element(row, col);
+    }
+
+    // RGBA accessors
+    // r/g/b/a name rows 0..3; g, b and a only exist when the matrix has at least 2, 3 and 4 rows respectively
+    // NOTE(review): these return T references taken from at(), so they presumably expect row_type to be T
+    const T &r() const { return (*this)[0]; }
+    T &r() { return this->at(0); }
+    template <typename S = T> std::enable_if_t<M >= 2, const S &> g() const { return this->at(1); }
+    template <typename S = T> std::enable_if_t<M >= 2, S &> g() { return this->at(1); }
+    template <typename S = T> std::enable_if_t<M >= 3, const S &> b() const { return this->at(2); }
+    template <typename S = T> std::enable_if_t<M >= 3, S &> b() { return this->at(2); }
+    template <typename S = T> std::enable_if_t<M >= 4, const S &> a() const { return this->at(3); }
+    template <typename S = T> std::enable_if_t<M >= 4, S &> a() { return this->at(3); }
+
+    // XYZW accessors
+    // x/y/z/w name rows 0..3; y, z and w only exist when the matrix has at least 2, 3 and 4 rows respectively
+    // NOTE(review): these return T references taken from at(), so they presumably expect row_type to be T
+    const T &x() const { return this->at(0); }
+    T &x() { return this->at(0); }
+    template <typename S = T> std::enable_if_t<M >= 2, const S &> y() const { return this->at(1); }
+    template <typename S = T> std::enable_if_t<M >= 2, S &> y() { return this->at(1); }
+    template <typename S = T> std::enable_if_t<M >= 3, const S &> z() const { return this->at(2); }
+    template <typename S = T> std::enable_if_t<M >= 3, S &> z() { return this->at(2); }
+    template <typename S = T> std::enable_if_t<M >= 4, const S &> w() const { return this->at(3); }
+    template <typename S = T> std::enable_if_t<M >= 4, S &> w() { return this->at(3); }
+    // endregion
+
+    // row-by-row equality against a matrix of the same shape whose element type is comparable with T
+    template <typename R>
+        requires std::equality_comparable_with<T, R>
+    bool operator==(const Matrix<R, M, N> &rhs) const {
+        if (size() != rhs.size()) { return false; }
+        return std::equal(this->begin(), this->end(), rhs.begin());
+    }
+
+    // negate every element; unavailable for unsigned integer element types
+    template <typename S = T>
+        requires(!std::unsigned_integral<T>) && requires(T &t) { -t; }
+    Matrix operator-() const {
+        return map(std::negate<>{}, *this);
+    }
+
+    // sum of two same-shaped matrices, computed by mapping std::plus over both operands
+    template <typename R>
+        requires operable<R, T, std::plus<>>
+    Matrix operator+(const Matrix<R, M, N> &rhs) const {
+        return map(std::plus<>{}, *this, rhs);
+    }
+
+    // difference of two same-shaped matrices
+    template <typename R>
+        requires operable<R, T, std::minus<>>
+    Matrix operator-(const Matrix<R, M, N> &rhs) const {
+        // subtract directly rather than adding the negation, which is invalid for int types
+        return map(std::minus<>{}, *this, rhs);
+    }
+
+    // multiply by a matrix of the same shape or by a column vector, by mapping std::multiplies over the operands
+    template <typename R, int P>
+        requires(P == 1 || P == N) && operable<R, T, std::multiplies<>>
+    Matrix operator*(const Matrix<R, M, P> &rhs) const {
+        return map(std::multiplies<>{}, *this, rhs);
+    }
+
+    // scale the matrix by a scalar
+    template <typename R>
+        requires operable<R, T, std::multiplies<>>
+    Matrix operator*(const R &rhs) const {
+        return map(std::multiplies<>{}, *this, rhs);
+    }
+
+    // divide by a matrix of the same shape or by a column vector, by mapping std::divides over the operands
+    template <typename R, int NN>
+        requires(NN == 1 || NN == N) && operable<R, T, std::divides<>>
+    Matrix operator/(const Matrix<R, M, NN> &rhs) const {
+        return map(std::divides<>{}, *this, rhs);
+    }
+
+    // divide the matrix by a scalar
+    template <typename R>
+        requires operable<R, T, std::divides<>>
+    Matrix operator/(const R &rhs) const {
+        return map(std::divides<>{}, *this, rhs);
+    }
+
+    // in-place addition, expressed through the binary operator+
+    template <typename R>
+        requires operable<Matrix, R, std::plus<>>
+    Matrix &operator+=(const R &rhs) {
+        *this = *this + rhs;
+        return *this;
+    }
+
+    // in-place subtraction, expressed through the binary operator-
+    template <typename R>
+        requires operable<Matrix, R, std::minus<>>
+    Matrix &operator-=(const R &rhs) {
+        *this = *this - rhs;
+        return *this;
+    }
+
+    // in-place multiplication by a matrix, column vector, or scalar, expressed through the binary operator*
+    template <typename R>
+        requires operable<Matrix, R, std::multiplies<>>
+    Matrix &operator*=(const R &rhs) {
+        *this = *this * rhs;
+        return *this;
+    }
+
+    // in-place division by a matrix, column vector, or scalar, expressed through the binary operator/
+    template <typename R>
+        requires operable<Matrix, R, std::divides<>>
+    Matrix &operator/=(const R &rhs) {
+        *this = *this / rhs;
+        return *this;
+    }
+
+    // decay a 1x1 matrix to a scalar on demand
+    // both conversions are constrained to N == 1 && M == 1 and hand back a reference to the item at index 0,
+    // so writes through the non-const form modify the matrix itself
+    template <typename S = T>
+        requires(N == 1 && M == 1)
+    operator S &() {
+        return this->at(0);
+    }
+    // read-only form of the 1x1 decay, usable on const matrices
+    template <typename S = T>
+        requires(N == 1 && M == 1)
+    operator const S &() const {
+        return this->at(0);
+    }
+
+    // sum up all columns, giving one total per row
+    // a single-column matrix is already its own result, and a single-row matrix collapses to its overall sum
+    column_type hsum() const {
+        // chained so that only the branch matching this shape is instantiated
+        if constexpr (N == 1) {
+            return *this;
+        } else if constexpr (M == 1) {
+            return sum();
+        } else {
+            return _map<column_type>([](auto row) { return quicktex::sum(row); }, *this);
+        }
+    }
+
+    // sum up all rows, giving one total per column
+    // a single-column matrix collapses to its overall sum; a single-row matrix is returned as-is
+    row_type vsum() const {
+        if constexpr (N == 1) { return sum(); }
+        if constexpr (M == 1) { return *this; }
+        // general case: add the rows together, starting from a value-initialized row
+        // NOTE(review): assumes row_type{} is an all-zero row — confirm against the default constructor
+        return std::accumulate(begin(), end(), row_type{});
+    }
+
+    // total of every element, accumulated in linear-index order starting from T(0)
+    T sum() const {
+        // TODO: reintroduce SIMDing for this
+        auto first = all_begin();
+        auto last = all_end();
+        return std::accumulate(first, last, T(0));
+    }
+
+    /**
+     * Matrix product of this MxN matrix with an NxP matrix
+     * @param rhs right-hand operand
+     * @return MxP matrix in which each element is the sum over n of element(m, n) * rhs.element(n, p)
+     */
+    template <typename R, int P>
+        requires operable<R, T, std::multiplies<>>
+    Matrix<T, M, P> mult(const Matrix<R, N, P> &rhs) const {
+        Matrix<T, M, P> product(0);
+        // walk the result column by column, then row by row
+        for (int col = 0; col < P; ++col) {
+            for (int row = 0; row < M; ++row) {
+                T &cell = product.element(row, col);
+                // accumulate along the shared dimension
+                for (int k = 0; k < N; ++k) { cell += element(row, k) * rhs.element(k, col); }
+            }
+        }
+        return product;
+    }
+
+    // swap rows and columns: row m of this matrix becomes column m of the NxM result
+    Matrix<T, N, M> transpose() const {
+        Matrix<T, N, M> flipped;
+        for (int row = 0; row < M; ++row) {
+            const row_type &source = get_row(row);
+            flipped.set_column(row, source);
+        }
+        return flipped;
+    }
+
+    // make a square matrix symmetric by copying each element above the diagonal onto its mirror below it
+    template <typename R = T>
+        requires(N == M)
+    Matrix mirror() const {
+        Matrix mirrored = *this;
+        for (int col = 0; col + 1 < N; ++col) {
+            for (int row = col + 1; row < M; ++row) { mirrored.element(row, col) = mirrored.element(col, row); }
+        }
+        return mirrored;
+    }
+
+    // dot product of two single-column matrices of the same height
+    template <typename R>
+        requires(N == 1) && operable<T, R, std::multiplies<>> && operable<T, T, std::plus<>>
+    inline row_type dot(const Matrix<R, M, N> &rhs) const {
+        // strictly speaking this is transpose(lhs) * rhs, but multiplying and then calling vsum is probably
+        // faster and more readable than allocating a new transposed matrix
+        const Matrix elementwise = (*this) * rhs;
+        return elementwise.vsum();
+    }
+
+    // squared magnitude: the dot product of this matrix with itself
+    inline row_type sqr_mag() const {
+        const Matrix &self = *this;
+        return dot(self);
+    }
+
+    // apply quicktex::abs to every entry visited by map
+    inline Matrix abs() const {
+        return map([](auto component) { return quicktex::abs(component); }, *this);
+    }
+
+    // clamp every entry between the same scalar bounds; const because it returns a new matrix
+    inline Matrix clamp(T low, T high) const {
+        return map([low, high](auto c) { return quicktex::clamp(c, low, high); }, *this);
+    }
+    // clamp each entry between the corresponding entries of `low` and `high`; const for the same reason
+    inline Matrix clamp(const Matrix &low, const Matrix &high) const {
+        return map([](auto c, auto l, auto h) { return quicktex::clamp(c, l, h); }, *this, low, high);
+    }
+
+   protected:
+    // Index-based iterator that yields the matrix's columns by value via get_column()
+    class column_iterator : public index_iterator_base<column_iterator, column_type> {
+        // get_column() returns a fresh object, so operator-> has nothing persistent to point at.
+        // This proxy owns the column and chains a second operator->, keeping the pointer valid for the
+        // whole `it->member` expression.
+        struct arrow_proxy {
+            column_type value;
+            const column_type *operator->() const { return &value; }
+        };
+
+       public:
+        using value_type = column_type;
+        using base = index_iterator_base<column_iterator, column_type>;
+
+        column_iterator(const Matrix *matrix = nullptr, int index = 0) : base(index), _matrix(matrix) {}
+
+        // the column at the current index, built on demand
+        column_type operator*() const { return _matrix->get_column(this->_index); }
+        // member access on the current column (previously fell off the end without returning a value)
+        arrow_proxy operator->() const { return arrow_proxy{_matrix->get_column(this->_index)}; }
+
+        // equal only when both iterators refer to the same matrix and the same column index
+        friend bool operator==(const column_iterator &lhs, const column_iterator &rhs) {
+            return (lhs._matrix == rhs._matrix) && (lhs._index == rhs._index);
+        }
+
+       private:
+        const Matrix *_matrix;
+    };
+
+    // Index-based iterator over every scalar element, addressed by linear index.
+    // V is Matrix or const Matrix, which decides whether dereferencing yields a mutable or const reference.
+    template <typename V> class linear_iterator : public index_iterator_base<linear_iterator<V>, T> {
+       public:
+        using value_type = T;
+        using base = index_iterator_base<linear_iterator<V>, T>;
+
+        linear_iterator(V *matrix = nullptr, int index = 0) : base(index), _matrix(matrix) {}
+
+        // const-qualified like operator->, so a const iterator can be dereferenced too;
+        // the constness of the result still comes from V, not from the iterator
+        auto &operator*() const { return _matrix->element(this->_index); }
+        auto *operator->() const { return &(_matrix->element(this->_index)); }
+
+        // equal only when both iterators refer to the same matrix and the same linear index
+        friend bool operator==(const linear_iterator &lhs, const linear_iterator &rhs) {
+            return (lhs._matrix == rhs._matrix) && (lhs._index == rhs._index);
+        }
+
+       private:
+        V *_matrix;
+    };
+};
+}  // namespace quicktex
--- a/quicktex/OldColor.cpp
+++ b/quicktex/OldColor.cpp
@ -16,18 +16,19 @@
    You should have received a copy of the GNU Lesser General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include "Color.h"
+#include "OldColor.h"

 #include <algorithm>
 #include <stdexcept>

 #include "Vector4.h"
 #include "Vector4Int.h"
-#include "util.h"  // for scale5To8, scale8To5, assert5bit, scale6To8
+#include "util/bitbash.h"
+#include "util/math.h"  // for scale_to_8<5>, scale_from_8<5>, assert5bit, scale_to_8<6>

 namespace quicktex {

-Color::Color(Vector4Int v) {
+OldColor::OldColor(Vector4Int v) {
    if (v.MaxAbs() > 0xFF) throw std::invalid_argument("Vector members out of range");
    for (int i = 0; i < 4; i++) {
        if (v[i] < 0) throw std::range_error("Color members cannot be negative");
@ -39,40 +40,42 @@ Color::Color(Vector4Int v) {
    a = static_cast<uint8_t>(v[3]);
 }

-uint16_t Color::Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b) {
+uint16_t OldColor::Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b) {
    assert5bit(r);
    assert6bit(g);
    assert5bit(b);
    return static_cast<uint16_t>(b | (g << 5) | (r << 11));
 }

-uint16_t Color::Pack565(uint8_t r, uint8_t g, uint8_t b) { return Pack565Unscaled(scale8To5(r), scale8To6(g), scale8To5(b)); }
+// Pack 8-bit r, g, b into one 5:6:5 word by rescaling each channel and delegating to Pack565Unscaled
+uint16_t OldColor::Pack565(uint8_t r, uint8_t g, uint8_t b) {
+    const auto r5 = scale_from_8<5>(r);
+    const auto g6 = scale_from_8<6>(g);
+    const auto b5 = scale_from_8<5>(b);
+    return Pack565Unscaled(r5, g6, b5);
+}

-Color Color::Unpack565Unscaled(uint16_t Packed) {
+OldColor OldColor::Unpack565Unscaled(uint16_t Packed) {
    uint8_t r = (Packed >> 11) & 0x1F;
    uint8_t g = (Packed >> 5) & 0x3F;
    uint8_t b = Packed & 0x1F;

-    return Color(r, g, b);
+    return OldColor(r, g, b);
 }

-Color Color::Unpack565(uint16_t Packed) {
-    uint8_t r = static_cast<uint8_t>(scale5To8((Packed >> 11) & 0x1FU));
-    uint8_t g = static_cast<uint8_t>(scale6To8((Packed >> 5) & 0x3FU));
-    uint8_t b = static_cast<uint8_t>(scale5To8(Packed & 0x1FU));
+OldColor OldColor::Unpack565(uint16_t Packed) {
+    uint8_t r = static_cast<uint8_t>(scale_to_8<5>((Packed >> 11) & 0x1FU));
+    uint8_t g = static_cast<uint8_t>(scale_to_8<6>((Packed >> 5) & 0x3FU));
+    uint8_t b = static_cast<uint8_t>(scale_to_8<5>(Packed & 0x1FU));

-    return Color(r, g, b);
+    return OldColor(r, g, b);
 }

-Color Color::PreciseRound565(Vector4 &v) {
+OldColor OldColor::PreciseRound565(Vector4 &v) {
    int trial_r = (int)(v[0] * UINT5_MAX);
    int trial_g = (int)(v[1] * UINT6_MAX);
    int trial_b = (int)(v[2] * UINT5_MAX);

    // clamp to prevent weirdness with slightly out of bounds float values
-    uint8_t r = (uint8_t)clampi(trial_r, 0, UINT5_MAX);
-    uint8_t g = (uint8_t)clampi(trial_g, 0, UINT6_MAX);
-    uint8_t b = (uint8_t)clampi(trial_b, 0, UINT5_MAX);
+    uint8_t r = (uint8_t)clamp<int>(trial_r, 0, UINT5_MAX);
+    uint8_t g = (uint8_t)clamp<int>(trial_g, 0, UINT6_MAX);
+    uint8_t b = (uint8_t)clamp<int>(trial_b, 0, UINT5_MAX);

    // increment each channel if above the rounding point
    r += v[0] > Midpoints5bit[r];
@ -83,46 +86,36 @@ Color Color::PreciseRound565(Vector4 &v) {
    assert6bit(g);
    assert5bit(b);

-    return Color(r, g, b);
+    return OldColor(r, g, b);
 }

-void Color::SetRGB(uint8_t vr, uint8_t vg, uint8_t vb) {
+void OldColor::SetRGB(uint8_t vr, uint8_t vg, uint8_t vb) {
    r = vr;
    g = vg;
    b = vb;
 }

-size_t Color::MinChannelRGB() {
-    if (r <= g && r <= b) return 0;
-    if (g <= b && g <= r) return 1;
-    return 2;
-}
-
-size_t Color::MaxChannelRGB() {
+size_t OldColor::MaxChannelRGB() {
    if (r >= g && r >= b) return 0;
    if (g >= b && g >= r) return 1;
    return 2;
 }

-Color Color::Min(const Color &A, const Color &B) { return Color(std::min(A[0], B[0]), std::min(A[1], B[1]), std::min(A[2], B[2]), std::min(A[3], B[3])); }
-
-Color Color::Max(const Color &a, const Color &b) { return Color(std::max(a[0], b[0]), std::max(a[1], b[1]), std::max(a[2], b[2]), std::max(a[3], b[3])); }
-
-Color::operator Vector4() const { return Vector4(r, g, b, a); }
-Color::operator Vector4Int() const { return Vector4Int(r, g, b, a); }
-Vector4Int operator-(const Color &lhs, const Color &rhs) {
+// conversions to the vector types: the four channels are passed on in r, g, b, a order
+OldColor::operator Vector4() const { return Vector4(r, g, b, a); }
+OldColor::operator Vector4Int() const { return Vector4Int(r, g, b, a); }
+Vector4Int operator-(const OldColor &lhs, const OldColor &rhs) {
    Vector4Int result;
    for (unsigned i = 0; i < 4; i++) { result[i] = (int)lhs[i] - rhs[i]; }
    return result;
 }

-uint16_t Color::Pack565() const { return Pack565(r, g, b); }
-uint16_t Color::Pack565Unscaled() const { return Pack565Unscaled(r, g, b); }
+// instance forms of the static packers, applied to this color's own r, g, b (alpha is not packed)
+uint16_t OldColor::Pack565() const { return Pack565(r, g, b); }
+uint16_t OldColor::Pack565Unscaled() const { return Pack565Unscaled(r, g, b); }

-Color Color::ScaleTo565() const { return Color(scale8To5(r), scale8To6(g), scale8To5(b)); }
-Color Color::ScaleFrom565() const { return Color(scale5To8(r), scale6To8(g), scale5To8(b)); }
+// ScaleTo565 rescales the 8-bit r/g/b channels to 5/6/5 bits; ScaleFrom565 rescales 5/6/5-bit channels to 8 bits.
+// Alpha is not forwarded, so the result takes the constructor's default alpha.
+OldColor OldColor::ScaleTo565() const { return OldColor(scale_from_8<5>(r), scale_from_8<6>(g), scale_from_8<5>(b)); }
+OldColor OldColor::ScaleFrom565() const { return OldColor(scale_to_8<5>(r), scale_to_8<6>(g), scale_to_8<5>(b)); }

-bool Color::operator==(const Color &Rhs) const { return r == Rhs.r && g == Rhs.g && b == Rhs.b && a == Rhs.a; }
-bool Color::operator!=(const Color &Rhs) const { return !(Rhs == *this); }
+// colors are equal only when all four channels match, alpha included; != is defined as the negation of ==
+bool OldColor::operator==(const OldColor &Rhs) const { return r == Rhs.r && g == Rhs.g && b == Rhs.b && a == Rhs.a; }
+bool OldColor::operator!=(const OldColor &Rhs) const { return !(Rhs == *this); }

 }  // namespace quicktex
--- a/quicktex/OldColor.h
+++ b/quicktex/OldColor.h
@ -0,0 +1,114 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <cassert>  // for assert
+#include <cstddef>  // for size_t
+#include <cstdint>  // for uint8_t, uint16_t
+
+#include "Matrix.h"
+
+namespace quicktex {
+class Vector4;
+class Vector4Int;
+
+#pragma pack(push, 1)
+/**
+ * Legacy RGBA color with one byte per channel.
+ * The channels are declared in r, g, b, a order and the class is byte-packed, which is what lets
+ * operator[] address them as a plain array of four bytes.
+ */
+class OldColor {
+   public:
+    uint8_t r;
+    uint8_t g;
+    uint8_t b;
+    uint8_t a;
+
+    // default color is opaque black
+    constexpr OldColor() : OldColor(0, 0, 0, 0xFF) {}
+
+    // alpha defaults to fully opaque when omitted
+    constexpr OldColor(uint8_t vr, uint8_t vg, uint8_t vb, uint8_t va = 0xFF) : r(vr), g(vg), b(vb), a(va) {}
+
+    OldColor(Vector4Int v);
+
+    static uint16_t Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b);
+    static uint16_t Pack565(uint8_t r, uint8_t g, uint8_t b);
+
+    static OldColor Unpack565Unscaled(uint16_t Packed);
+    static OldColor Unpack565(uint16_t Packed);
+
+    static OldColor PreciseRound565(Vector4 &v);
+
+    // Channel-wise minimum of two colors, alpha included.
+    // Defined inline because no out-of-line definition accompanies this declaration.
+    static OldColor Min(const OldColor &A, const OldColor &B) {
+        return OldColor(A.r < B.r ? A.r : B.r, A.g < B.g ? A.g : B.g, A.b < B.b ? A.b : B.b, A.a < B.a ? A.a : B.a);
+    }
+    // Channel-wise maximum of two colors, alpha included; inline for the same reason as Min.
+    static OldColor Max(const OldColor &A, const OldColor &B) {
+        return OldColor(A.r > B.r ? A.r : B.r, A.g > B.g ? A.g : B.g, A.b > B.b ? A.b : B.b, A.a > B.a ? A.a : B.a);
+    }
+
+    bool operator==(const OldColor &Rhs) const;
+    bool operator!=(const OldColor &Rhs) const;
+
+    // read channel `index` (0 = r .. 3 = a) by viewing the object as four consecutive bytes
+    uint8_t operator[](size_t index) const {
+        assert(index < 4);
+        return reinterpret_cast<const uint8_t *>(this)[index];
+    }
+    // mutable access to channel `index`, same layout assumption as the const overload
+    uint8_t &operator[](size_t index) {
+        assert(index < 4);
+        return reinterpret_cast<uint8_t *>(this)[index];
+    }
+
+    operator Vector4() const;
+    operator Vector4Int() const;
+    friend Vector4Int operator-(const OldColor &lhs, const OldColor &rhs);
+
+    void SetRGB(uint8_t vr, uint8_t vg, uint8_t vb);
+    // copies r, g, b from another color and leaves this color's alpha untouched
+    void SetRGB(const OldColor &other) { SetRGB(other.r, other.g, other.b); }
+
+    uint16_t Pack565() const;
+    uint16_t Pack565Unscaled() const;
+
+    OldColor ScaleTo565() const;
+    OldColor ScaleFrom565() const;
+
+    // Index (0 = r, 1 = g, 2 = b) of the smallest RGB channel; ties go to the lowest index.
+    // Defined inline because no out-of-line definition accompanies this declaration.
+    size_t MinChannelRGB() {
+        if (r <= g && r <= b) return 0;
+        if (g <= b && g <= r) return 1;
+        return 2;
+    }
+    size_t MaxChannelRGB();
+
+    bool IsGrayscale() const { return ((r == g) && (r == b)); }
+    // true when no RGB channel has any bit set above the lowest two, i.e. every channel is below 4
+    bool IsBlack() const { return (r | g | b) < 4; }
+
+    int GetLuma() const { return (13938U * r + 46869U * g + 4729U * b + 32768U) >> 16U; }  // REC709 weightings
+
+    // conversions to and from the Matrix-based 4-component byte vector
+    operator Vec<uint8_t, 4>() const { return {r, g, b, a}; }
+
+    OldColor(const Vec<uint8_t, 4> v) : r(v.r()), g(v.g()), b(v.b()), a(v.a()) {}
+
+   private:
+    // thresholds compared against the input value in PreciseRound565 to decide whether a channel rounds up;
+    // the final 1e+37f entry can never be exceeded by an in-range value
+    static constexpr float Midpoints5bit[32] = {
+        .015686f, .047059f, .078431f, .111765f, .145098f, .176471f, .207843f, .241176f, .274510f, .305882f, .337255f,
+        .370588f, .403922f, .435294f, .466667f, .5f,      .533333f, .564706f, .596078f, .629412f, .662745f, .694118f,
+        .725490f, .758824f, .792157f, .823529f, .854902f, .888235f, .921569f, .952941f, .984314f, 1e+37f};
+    static constexpr float Midpoints6bit[64] = {
+        .007843f, .023529f, .039216f, .054902f, .070588f, .086275f, .101961f, .117647f, .133333f, .149020f, .164706f,
+        .180392f, .196078f, .211765f, .227451f, .245098f, .262745f, .278431f, .294118f, .309804f, .325490f, .341176f,
+        .356863f, .372549f, .388235f, .403922f, .419608f, .435294f, .450980f, .466667f, .482353f, .500000f, .517647f,
+        .533333f, .549020f, .564706f, .580392f, .596078f, .611765f, .627451f, .643137f, .658824f, .674510f, .690196f,
+        .705882f, .721569f, .737255f, .754902f, .772549f, .788235f, .803922f, .819608f, .835294f, .850980f, .866667f,
+        .882353f, .898039f, .913725f, .929412f, .945098f, .960784f, .976471f, .992157f, 1e+37f};
+};
+#pragma pack(pop)
+}  // namespace quicktex
--- a/quicktex/Texture.h
+++ b/quicktex/Texture.h
@ -1,187 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <array>
-#include <climits>
-#include <cstdint>
-#include <cstdio>
-#include <cstring>
-#include <memory>
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#include <vector>
-
-#include "Color.h"
-#include "ColorBlock.h"
-
-namespace quicktex {
-
-class Texture {
-   public:
-    virtual ~Texture() = default;
-
-    virtual int Width() const { return _width; }
-    virtual int Height() const { return _height; }
-    virtual std::tuple<int, int> Size() const { return std::tuple<int, int>(_width, _height); }
-
-    /**
-     * The texture's total size
-     * @return The size of the texture in bytes.
-     */
-    virtual size_t NBytes() const noexcept = 0;
-
-    virtual const uint8_t *Data() const noexcept = 0;
-    virtual uint8_t *Data() noexcept = 0;
-
-   protected:
-    Texture(int width, int height) : _width(width), _height(height) {
-        if (width <= 0) throw std::invalid_argument("Texture width must be greater than 0");
-        if (height <= 0) throw std::invalid_argument("Texture height must be greater than 0");
-    }
-
-    int _width;
-    int _height;
-};
-
-class RawTexture : public Texture {
-    using Base = Texture;
-
-   public:
-    /**
-     * Create a new RawTexture
-     * @param width width of the texture in pixels
-     * @param height height of the texture in pixels
-     */
-    RawTexture(int width, int height) : Base(width, height), _pixels(_width * _height) {}
-
-    Color GetPixel(int x, int y) const {
-        if (x < 0 || x >= _width) throw std::invalid_argument("x value out of range.");
-        if (y < 0 || y >= _height) throw std::invalid_argument("y value out of range.");
-        return _pixels.at(x + (y * _width));
-    }
-
-    void SetPixel(int x, int y, Color val) {
-        if (x < 0 || x >= _width) throw std::invalid_argument("x value out of range.");
-        if (y < 0 || y >= _height) throw std::invalid_argument("y value out of range.");
-        _pixels.at(x + (y * _width)) = val;
-    }
-
-    size_t NBytes() const noexcept override { return static_cast<unsigned long>(Width() * Height()) * sizeof(Color); }
-
-    template <int N, int M> ColorBlock<N, M> GetBlock(int block_x, int block_y) const {
-        if (block_x < 0) throw std::out_of_range("x value out of range.");
-        if (block_y < 0) throw std::out_of_range("y value out of range.");
-
-        // coordinates in the image of the top-left pixel of the selected block
-        ColorBlock<N, M> block;
-        int pixel_x = block_x * N;
-        int pixel_y = block_y * M;
-
-        if (pixel_x + N < _width && pixel_y + M < _height) {
-            // fast memcpy if the block is entirely inside the bounds of the texture
-            for (int y = 0; y < M; y++) {
-                // copy each row into the ColorBlock
-                block.SetRow(y, &_pixels[pixel_x + (_width * (pixel_y + y))]);
-            }
-        } else {
-            // slower pixel-wise copy if the block goes over the edges
-            for (int x = 0; x < N; x++) {
-                for (int y = 0; y < M; y++) { block.Set(x, y, GetPixel((pixel_x + x) % _width, (pixel_y + y) % _height)); }
-            }
-        }
-
-        return block;
-    }
-
-    template <int N, int M> void SetBlock(int block_x, int block_y, const ColorBlock<N, M> &block) {
-        if (block_x < 0) throw std::out_of_range("x value out of range.");
-        if (block_y < 0) throw std::out_of_range("y value out of range.");
-
-        // coordinates in the image of the top-left pixel of the selected block
-        int pixel_x = block_x * N;
-        int pixel_y = block_y * M;
-
-        if (pixel_x + N < _width && pixel_y + M < _height) {
-            // fast row-wise memcpy if the block is entirely inside the bounds of the texture
-            for (int y = 0; y < M; y++) {
-                // copy each row out of the ColorBlock
-                block.GetRow(y, &_pixels[pixel_x + (_width * (pixel_y + y))]);
-            }
-        } else {
-            // slower pixel-wise copy if the block goes over the edges
-            for (int x = 0; x < N; x++) {
-                for (int y = 0; y < M; y++) { SetPixel((pixel_x + x) % _width, (pixel_y + y) % _height, block.Get(x, y)); }
-            }
-        }
-    }
-
-    virtual const uint8_t *Data() const noexcept override { return reinterpret_cast<const uint8_t *>(_pixels.data()); }
-    virtual uint8_t *Data() noexcept override { return reinterpret_cast<uint8_t *>(_pixels.data()); }
-
-   protected:
-    std::vector<Color> _pixels;
-};
-
-template <typename B> class BlockTexture final : public Texture {
-   private:
-    std::vector<B> _blocks;
-    int _width_b;
-    int _height_b;
-
-   public:
-    using BlockType = B;
-    using Base = Texture;
-
-    /**
-     * Create a new BlockTexture
-     * @param width width of the texture in pixels. must be divisible by B::Width
-     * @param height height of the texture in pixels. must be divisible by B::Height
-     */
-    BlockTexture(int width, int height) : Base(width, height) {
-        _width_b = (_width + B::Width - 1) / B::Width;
-        _height_b = (_height + B::Height - 1) / B::Height;
-        _blocks = std::vector<B>(_width_b * _height_b);
-    }
-
-    constexpr int BlocksX() const { return _width_b; }
-    constexpr int BlocksY() const { return _height_b; }
-    constexpr std::tuple<int, int> BlocksXY() const { return std::tuple<int, int>(_width_b, _height_b); }
-
-    B GetBlock(int x, int y) const {
-        if (x < 0 || x >= _width_b) throw std::out_of_range("x value out of range.");
-        if (y < 0 || y >= _height_b) throw std::out_of_range("y value out of range.");
-        return _blocks.at(x + (y * _width_b));
-    }
-
-    void SetBlock(int x, int y, const B &val) {
-        if (x < 0 || x >= _width_b) throw std::out_of_range("x value out of range.");
-        if (y < 0 || y >= _height_b) throw std::out_of_range("y value out of range.");
-        _blocks.at(x + (y * _width_b)) = val;
-    }
-
-    size_t NBytes() const noexcept override { return _blocks.size() * sizeof(B); }
-
-    const uint8_t *Data() const noexcept override { return reinterpret_cast<const uint8_t *>(_blocks.data()); }
-    uint8_t *Data() noexcept override { return reinterpret_cast<uint8_t *>(_blocks.data()); }
-};
-
-}  // namespace quicktex
--- a/quicktex/Vector4.h
+++ b/quicktex/Vector4.h
@ -23,7 +23,7 @@
 #include <cmath>
 #include <functional>

-#include "Color.h"
+#include "OldColor.h"

 namespace quicktex {

@ -45,11 +45,11 @@ class Vector4 {
        _c[3] = scalar;
    }

-    Vector4(const Color &c) : Vector4(c.r, c.g, c.b, c.a) {}
+    // build a vector from a color's four channels, in r, g, b, a order
+    Vector4(const OldColor &c) : Vector4(c.r, c.g, c.b, c.a) {}

-    static Vector4 FromColor(const Color &c) { return Vector4(c); }
+    // named form of the OldColor converting constructor
+    static Vector4 FromColor(const OldColor &c) { return Vector4(c); }

-    static Vector4 FromColorRGB(const Color &c) { return Vector4(c.r, c.g, c.b); }
+    // uses only r, g, b; the fourth component is whatever the three-argument constructor supplies
+    static Vector4 FromColorRGB(const OldColor &c) { return Vector4(c.r, c.g, c.b); }

    static float Dot(const Vector4 &lhs, const Vector4 &rhs) {
        float sum = 0;
--- a/quicktex/Vector4Int.h
+++ b/quicktex/Vector4Int.h
@ -22,7 +22,7 @@
 #include <array>
 #include <functional>

-#include "Color.h"
+#include "OldColor.h"
 #include "Vector4.h"

 namespace quicktex {
@ -45,11 +45,11 @@ class Vector4Int {
        _c[3] = scalar;
    }

-    Vector4Int(const Color &c) : Vector4Int(c.r, c.g, c.b, c.a) {}
+    // build an integer vector from a color's four channels, in r, g, b, a order
+    Vector4Int(const OldColor &c) : Vector4Int(c.r, c.g, c.b, c.a) {}

-    static Vector4Int FromColor(const Color &c) { return Vector4Int(c); }
+    // named form of the OldColor converting constructor
+    static Vector4Int FromColor(const OldColor &c) { return Vector4Int(c); }

-    static Vector4Int FromColorRGB(const Color &c) { return Vector4Int(c.r, c.g, c.b); }
+    // uses only r, g, b; the fourth component is whatever the three-argument constructor supplies
+    static Vector4Int FromColorRGB(const OldColor &c) { return Vector4Int(c.r, c.g, c.b); }

    static int Dot(const Vector4Int &lhs, const Vector4Int &rhs) {
        int sum = 0;
--- a/quicktex/_bindings.cpp
+++ b/quicktex/_bindings.cpp
@ -21,11 +21,12 @@

 #include <pybind11/pybind11.h>

-#include "Color.h"
 #include "Decoder.h"
 #include "Encoder.h"
-#include "Texture.h"
+#include "OldColor.h"
 #include "_bindings.h"
+#include "texture/RawTexture.h"
+#include "texture/Texture.h"

 #define STRINGIFY(x) #x
 #define MACRO_STRINGIFY(x) STRINGIFY(x)
@ -45,19 +46,26 @@ PYBIND11_MODULE(_quicktex, m) {
    m.attr("__version__") = "dev";
 #endif

+#ifdef NDEBUG
+    m.attr("_debug_build") = false;
+#else
+    m.attr("_debug_build") = true;
+#endif
+
    py::options options;

    // Texture

    py::class_<Texture> texture(m, "Texture", py::buffer_protocol());

-    texture.def_property_readonly("nbytes", &Texture::NBytes);
+    texture.def_property_readonly("nbytes", &Texture::nbytes);
    texture.def_property_readonly("size", &Texture::Size);
-    texture.def_property_readonly("width", &Texture::Width);
-    texture.def_property_readonly("height", &Texture::Height);
+    texture.def_readonly("width", &Texture::width);
+    texture.def_readonly("height", &Texture::height);

-    texture.def_buffer([](Texture &t) { return py::buffer_info(t.Data(), t.NBytes()); });
-    texture.def("tobytes", [](const Texture &t) { return py::bytes(reinterpret_cast<const char *>(t.Data()), t.NBytes()); });
+    texture.def_buffer([](Texture &t) { return py::buffer_info(t.data(), t.nbytes()); });
+    texture.def("tobytes",
+                [](const Texture &t) { return py::bytes(reinterpret_cast<const char *>(t.data()), t.nbytes()); });

    // RawTexture

@ -66,7 +74,9 @@ PYBIND11_MODULE(_quicktex, m) {
    raw_texture.def(py::init<int, int>(), "width"_a, "height"_a);
    raw_texture.def_static("frombytes", &BufferToTexture<RawTexture>, "data"_a, "width"_a, "height"_a);

-    DefSubscript2D(raw_texture, &RawTexture::GetPixel, &RawTexture::SetPixel, &RawTexture::Size);
+    DefSubscript2DRef(
+        raw_texture, [](RawTexture &self, int x, int y) -> Color { return self.pixel(x, y); },
+        [](RawTexture &self, int x, int y, Color val) { self.pixel(x, y) = val; }, &RawTexture::Size);

    InitS3TC(m);
 }
--- a/quicktex/_bindings.h
+++ b/quicktex/_bindings.h
@ -24,18 +24,66 @@

 #include <cstdint>
 #include <cstring>
-#include <memory>
 #include <stdexcept>
+#include <string>
+#include <tuple>
 #include <type_traits>
+#include <utility>
+#include <vector>

-#include "Color.h"
-#include "ColorBlock.h"
-#include "Texture.h"
-#include "util.h"
+#include "OldColor.h"
+#include "texture/BlockTexture.h"
+#include "util/math.h"

 namespace pybind11::detail {
 using namespace quicktex;
 /// Type caster for color class to allow it to be converted to and from a python tuple
+template <> struct type_caster<OldColor> {
+   public:
+    PYBIND11_TYPE_CASTER(OldColor, _("Color"));
+
+    /**
+     * Convert a Python sequence of 3 or 4 integers in [0, 255] into an OldColor.
+     * The alpha field is preset to 0xFF before the channels are copied.
+     * @param src the Python object to convert
+     * @return true on success, false if src is not a sequence, has the wrong length, or a channel is out of range
+     */
+    bool load(handle src, bool) {
+        // PySequence_Tuple returns a new reference; holding it in an object releases it on every exit path
+        object tmp = reinterpret_steal<object>(PySequence_Tuple(src.ptr()));
+
+        // if the object cannot be read as a tuple, return false
+        if (!tmp) { return false; }  // incorrect type
+
+        // check the size
+        Py_ssize_t size = PyTuple_Size(tmp.ptr());
+        if (size < 3 || size > 4) { return false; }  // incorrect size
+
+        value.a = 0xFF;
+        // now we get the contents
+        for (int i = 0; i < size; i++) {
+            PyObject* src_chan = PyTuple_GetItem(tmp.ptr(), i);  // borrowed reference, kept alive by tmp
+            object tmp_chan = reinterpret_steal<object>(PyNumber_Long(src_chan));  // new reference
+
+            if (!tmp_chan) return false;  // incorrect channel type
+
+            auto chan = PyLong_AsLong(tmp_chan.ptr());
+            if (chan > 0xFF || chan < 0) return false;  // item out of range
+            value[static_cast<unsigned>(i)] = static_cast<uint8_t>(chan);
+        }
+
+        return !PyErr_Occurred();
+    }
+
+    /**
+     * Convert an OldColor into a new 4-element Python tuple of integers, one per channel index 0..3.
+     * @param src the color to convert
+     * @return a new tuple, or a null handle if the tuple could not be allocated
+     */
+    static handle cast(OldColor src, return_value_policy, handle) {
+        PyObject* val = PyTuple_New(4);
+        if (!val) return handle();  // allocation failed; the Python error is already set
+
+        for (int i = 0; i < 4; i++) {
+            PyObject* chan = PyLong_FromLong(src[static_cast<unsigned>(i)]);
+            PyTuple_SetItem(val, i, chan);  // steals the reference to chan
+        }
+
+        return val;
+    }
+};
+
 template <> struct type_caster<Color> {
   public:
    PYBIND11_TYPE_CASTER(Color, _("Color"));
@ -52,7 +100,7 @@ template <> struct type_caster<Color> {
        Py_ssize_t size = PyTuple_Size(tmp);
        if (size < 3 || size > 4) { return false; }  // incorrect size

-        value.a = 0xFF;
+        value.a() = 0xFF;
        // now we get the contents
        for (int i = 0; i < size; i++) {
            PyObject* src_chan = PyTuple_GetItem(tmp, i);
@ -85,26 +133,49 @@ template <> struct type_caster<Color> {

 namespace py = pybind11;
 namespace quicktex::bindings {
+
 using namespace pybind11::literals;

+/**
+ * Substitute positional placeholders of the form {0}, {1}, ... in a template string.
+ * @param str the template text
+ * @param args replacement values; the i-th argument replaces every occurrence of "{i}"
+ * @return a copy of str with the placeholders for all supplied arguments replaced
+ */
+template <typename... Args> std::string Format(const char* str, const Args&... args) {
+    auto output = std::string(str);
+
+    // convert each argument on its own: a doubly-braced list of exactly two C strings would otherwise be read
+    // as a single string built from an iterator pair
+    std::vector<std::string> values = {std::string(args)...};
+
+    for (unsigned i = 0; i < values.size(); i++) {
+        const auto key = "{" + std::to_string(i) + "}";
+        const auto& value = values[i];
+
+        // resume searching after the inserted text so a value that contains its own key cannot loop forever
+        size_t where = output.find(key);
+        while (where != output.npos) {
+            output.replace(where, key.length(), value);
+            where = output.find(key, where + value.length());
+        }
+    }
+
+    return output;
+}
+
 template <typename T> T BufferToTexture(py::buffer buf, int width, int height) {
    static_assert(std::is_base_of<Texture, T>::value);
    static_assert(std::is_constructible<T, int, int>::value);

    auto info = buf.request(false);
    auto output = T(width, height);
-    auto dst_size = output.NBytes();
+    auto dst_size = output.nbytes();

-    if (info.format != py::format_descriptor<uint8_t>::format()) throw std::runtime_error("Incompatible format in python buffer: expected a byte array.");
-    if (info.size < (Py_ssize_t)dst_size) std::runtime_error("Incompatible format in python buffer: Input data is smaller than texture size.");
+    if (info.format != py::format_descriptor<uint8_t>::format())
+        throw std::runtime_error("Incompatible format in python buffer: expected a byte array.");
+    if (info.size < (Py_ssize_t)dst_size)
+        std::runtime_error("Incompatible format in python buffer: Input data is smaller than texture size.");
    if (info.ndim == 1) {
-        if (info.shape[0] < (Py_ssize_t)dst_size) throw std::runtime_error("Incompatible format in python buffer: 1-D buffer has incorrect length.");
-        if (info.strides[0] != 1) throw std::runtime_error("Incompatible format in python buffer: 1-D buffer is not contiguous.");
+        if (info.shape[0] < (Py_ssize_t)dst_size)
+            throw std::runtime_error("Incompatible format in python buffer: 1-D buffer has incorrect length.");
+        if (info.strides[0] != 1)
+            throw std::runtime_error("Incompatible format in python buffer: 1-D buffer is not contiguous.");
    } else {
        throw std::runtime_error("Incompatible format in python buffer: Incorrect number of dimensions.");
    }

-    std::memcpy(output.Data(), info.ptr, dst_size);
+    std::memcpy(output.data(), info.ptr, dst_size);

    return output;
 }
@ -114,11 +185,15 @@ template <typename T> T BufferToPOD(py::buffer buf) {

    auto info = buf.request(false);

-    if (info.format != py::format_descriptor<uint8_t>::format()) throw std::runtime_error("Incompatible format in python buffer: expected a byte array.");
-    if (info.size < (Py_ssize_t)sizeof(T)) std::runtime_error("Incompatible format in python buffer: Input data is smaller than texture size.");
+    if (info.format != py::format_descriptor<uint8_t>::format())
+        throw std::runtime_error("Incompatible format in python buffer: expected a byte array.");
+    // reject buffers that cannot hold a whole T; the exception must actually be thrown, not just constructed
+    if (info.size < (Py_ssize_t)sizeof(T))
+        throw std::runtime_error("Incompatible format in python buffer: Input data is smaller than texture size.");
    if (info.ndim == 1) {
-        if (info.shape[0] < (Py_ssize_t)sizeof(T)) throw std::runtime_error("Incompatible format in python buffer: 1-D buffer has incorrect length.");
-        if (info.strides[0] != 1) throw std::runtime_error("Incompatible format in python buffer: 1-D buffer is not contiguous.");
+        if (info.shape[0] < (Py_ssize_t)sizeof(T))
+            throw std::runtime_error("Incompatible format in python buffer: 1-D buffer has incorrect length.");
+        if (info.strides[0] != 1)
+            throw std::runtime_error("Incompatible format in python buffer: 1-D buffer is not contiguous.");
    } else {
        throw std::runtime_error("Incompatible format in python buffer: Incorrect number of dimensions.");
    }
@ -133,15 +208,18 @@ inline int PyIndex(int val, int size, std::string name = "index") {
    return val;
 }

-template <typename T, typename Getter, typename Setter, typename Extent> void DefSubscript(py::class_<T> t, Getter&& get, Setter&& set, Extent&& ext) {
+template <typename T, typename Getter, typename Setter, typename Extent>
+void DefSubscript(py::class_<T> t, Getter&& get, Setter&& set, Extent&& ext) {
    using V = typename std::invoke_result<Getter, T*, int>::type;
    t.def(
        "__getitem__", [get, ext](T& self, int index) { return (self.*get)(PyIndex(index, (self.*ext)())); }, "key"_a);
    t.def(
-        "__setitem__", [set, ext](T& self, int index, V val) { (self.*set)(PyIndex(index, (self.*ext)()), val); }, "key"_a, "value"_a);
+        "__setitem__", [set, ext](T& self, int index, V val) { (self.*set)(PyIndex(index, (self.*ext)()), val); },
+        "key"_a, "value"_a);
 }

-template <typename Tpy, typename Getter, typename Setter, typename Extent> void DefSubscript2D(Tpy t, Getter&& get, Setter&& set, Extent&& ext) {
+template <typename Tpy, typename Getter, typename Setter, typename Extent>
+void DefSubscript2D(Tpy t, Getter&& get, Setter&& set, Extent&& ext) {
    using T = typename Tpy::type;
    using V = typename std::invoke_result<Getter, T*, int, int>::type;
    using Coords = std::tuple<int, int>;
@ -165,6 +243,32 @@ template <typename Tpy, typename Getter, typename Setter, typename Extent> void
        "key"_a, "value"_a);
 }

+// TODO: untangle this mess
+/**
+ * Bind 2-D subscripting (obj[x, y]) on a pybind11 class, using free callables for element access.
+ * @param t the pybind11 class object that receives __getitem__ and __setitem__
+ * @param get callable invoked as get(self, x, y) to read an element
+ * @param set callable invoked as set(self, x, y, value) to write an element
+ * @param ext pointer to a member function of T returning the extent as an (x, y) pair
+ */
+template <typename Tpy, typename Getter, typename Setter, typename Extent>
+void DefSubscript2DRef(Tpy t, Getter&& get, Setter&& set, Extent&& ext) {
+    using T = typename Tpy::type;
+    using V = std::remove_cvref_t<std::invoke_result_t<Getter, T&, int, int>>;
+    using Coords = std::tuple<int, int>;
+
+    // shared by both accessors: run each coordinate of the key through PyIndex against the matching extent
+    auto resolve = [ext](T& self, const Coords& key) -> Coords {
+        const Coords extent = (self.*ext)();
+        const int x = PyIndex(std::get<0>(key), std::get<0>(extent), "x");
+        const int y = PyIndex(std::get<1>(key), std::get<1>(extent), "y");
+        return {x, y};
+    };
+
+    t.def(
+        "__getitem__",
+        [get, resolve](T& self, Coords pnt) {
+            const auto [x, y] = resolve(self, pnt);
+            return get(self, x, y);
+        },
+        "key"_a);
+    t.def(
+        "__setitem__",
+        [set, resolve](T& self, Coords pnt, const V& val) {
+            const auto [x, y] = resolve(self, pnt);
+            set(self, x, y, val);
+        },
+        "key"_a, "value"_a);
+}
+
 template <typename B> py::class_<B> BindBlock(py::module_& m, const char* name) {
    const char* frombytes_doc = R"doc(
        Create a new {0} by copying a bytes-like object.
@ -184,7 +288,8 @@ template <typename B> py::class_<B> BindBlock(py::module_& m, const char* name)
    block.def_readonly_static("width", &B::Width, "The width of the block in pixels.");
    block.def_readonly_static("height", &B::Height, "The height of the block in pixels.");
    block.def_property_readonly_static(
-        "size", [](py::object) { return std::make_tuple(B::Width, B::Height); }, "The dimensions of the block in pixels.");
+        "size", [](py::object) { return std::make_tuple(B::Width, B::Height); },
+        "The dimensions of the block in pixels.");
    block.def_property_readonly_static(
        "nbytes", [](py::object) { return sizeof(B); }, "The size of the block in bytes.");

@ -195,7 +300,7 @@ template <typename B> py::class_<B> BindBlock(py::module_& m, const char* name)
        "tobytes", [](const B& b) { return py::bytes(reinterpret_cast<const char*>(&b), sizeof(B)); },
        Format(tobytes_doc, name, std::to_string(sizeof(B))).c_str());

-    return std::move(block);
+    return block;
 }

 template <typename B> py::class_<BlockTexture<B>> BindBlockTexture(py::module_& m, const char* name) {
@ -223,14 +328,15 @@ template <typename B> py::class_<BlockTexture<B>> BindBlockTexture(py::module_&
    py::class_<BTex, Texture> block_texture(m, name);

    block_texture.def(py::init<int, int>(), "width"_a, "height"_a, Format(constructor_str, name).c_str());
-    block_texture.def_static("from_bytes", &BufferToTexture<BTex>, "data"_a, "width"_a, "height"_a, Format(from_bytes_str, name).c_str());
+    block_texture.def_static("from_bytes", &BufferToTexture<BTex>, "data"_a, "width"_a, "height"_a,
+                             Format(from_bytes_str, name).c_str());

-    block_texture.def_property_readonly("width_blocks", &BTex::BlocksX, "The width of the texture in blocks.");
-    block_texture.def_property_readonly("height_blocks", &BTex::BlocksY, "The height of the texture in blocks.");
-    block_texture.def_property_readonly("size_blocks", &BTex::BlocksXY, "The dimensions of the texture in blocks.");
+    block_texture.def_property_readonly("width_blocks", &BTex::bwidth, "The width of the texture in blocks.");
+    block_texture.def_property_readonly("height_blocks", &BTex::bheight, "The height of the texture in blocks.");
+    block_texture.def_property_readonly("size_blocks", &BTex::bsize, "The dimensions of the texture in blocks.");

-    DefSubscript2D(block_texture, &BTex::GetBlock, &BTex::SetBlock, &BTex::BlocksXY);
+    DefSubscript2D(block_texture, &BTex::get_block, &BTex::set_block, &BTex::bsize);

-    return std::move(block_texture);
+    return block_texture;
 }
 }  // namespace quicktex::bindings
--- a/quicktex/s3tc/bc1/BC1Block.cpp
+++ b/quicktex/s3tc/bc1/BC1Block.cpp
@ -20,28 +20,35 @@
 #include "BC1Block.h"

 #include <stdexcept>
-#include <algorithm>

-#include "../../util.h"
+#include "util/bitbash.h"
+#include "util/map.h"
+#include "util/math.h"
+#include "util/ranges.h"

 namespace quicktex::s3tc {
-uint16_t BC1Block::GetColor0Raw() const { return Pack<uint8_t, uint16_t, 8, EndpointSize>(_color0); }
-uint16_t BC1Block::GetColor1Raw() const { return Pack<uint8_t, uint16_t, 8, EndpointSize>(_color1); }

-void BC1Block::SetColor0Raw(uint16_t c) { _color0 = Unpack<uint16_t, uint8_t, 8, EndpointSize>(c); }
-void BC1Block::SetColor1Raw(uint16_t c) { _color1 = Unpack<uint16_t, uint8_t, 8, EndpointSize>(c); }
+uint16_t BC1Block::GetColor0Raw() const { return pack<uint16_t>(_color0, 8); }
+uint16_t BC1Block::GetColor1Raw() const { return pack<uint16_t>(_color1, 8); }

-BC1Block::SelectorArray BC1Block::GetSelectors() const { return MapArray(_selectors, Unpack<uint8_t, uint8_t, SelectorBits, Width>); }
+void BC1Block::SetColor0Raw(uint16_t c) { _color0 = unpack<uint8_t, EndpointSize>(c, 8); }
+void BC1Block::SetColor1Raw(uint16_t c) { _color1 = unpack<uint8_t, EndpointSize>(c, 8); }
+
+BC1Block::SelectorArray BC1Block::GetSelectors() const {
+    return map([](auto row) { return unpack<uint8_t, Width>(row, SelectorBits); }, _selectors);
+}

+/// Validate a grid of unpacked selectors and store it in packed form, one packed value per row.
+/// @param unpacked selector rows; every entry must be no greater than SelectorMax
+/// @throws std::invalid_argument if any selector exceeds SelectorMax
+// NOTE(review): std::any_of needs <algorithm>, whose direct include this change removes - confirm it still
+// arrives through one of the util/ headers
 void BC1Block::SetSelectors(const BC1Block::SelectorArray& unpacked) {
    for (unsigned y = 0; y < (unsigned)Height; y++) {
        if (std::any_of(unpacked[y].begin(), unpacked[y].end(), [](uint8_t i) { return i > SelectorMax; }))
            throw std::invalid_argument("Selector value out of bounds.");
    }
-    _selectors = MapArray(unpacked, Pack<uint8_t, uint8_t, SelectorBits, Width>);
+    // NOTE(review): pack() receives an extra `true` argument here while the unpack() call in GetSelectors
+    // passes no counterpart - confirm the two are still inverse operations
+    _selectors = map([](auto row) { return pack<uint8_t>(row, SelectorBits, true); }, unpacked);
 }

-bool BC1Block::operator==(const BC1Block& Rhs) const { return _color0 == Rhs._color0 && _color1 == Rhs._color1 && _selectors == Rhs._selectors; }
+bool BC1Block::operator==(const BC1Block& Rhs) const {
+    return _color0 == Rhs._color0 && _color1 == Rhs._color1 && _selectors == Rhs._selectors;
+}
 bool BC1Block::operator!=(const BC1Block& Rhs) const { return !(Rhs == *this); }

 }  // namespace quicktex::s3tc
--- a/quicktex/s3tc/bc1/BC1Block.h
+++ b/quicktex/s3tc/bc1/BC1Block.h
@ -24,7 +24,7 @@
 #include <cstdlib>
 #include <utility>

-#include "../../Color.h"
+#include "OldColor.h"

 namespace quicktex::s3tc {

@ -39,7 +39,7 @@ class alignas(8) BC1Block {
    static constexpr uint8_t SelectorMax = (1 << SelectorBits) - 1;  // maximum value of a selector

    using SelectorArray = std::array<std::array<uint8_t, Width>, Height>;
-    using ColorPair = std::pair<Color, Color>;
+    using ColorPair = std::pair<OldColor, OldColor>;

   private:
    std::array<uint8_t, EndpointSize> _color0;
@ -60,7 +60,7 @@ class alignas(8) BC1Block {
     * @param color1 second endpoint color
     * @param selectors the selectors as a 4x4 list of integers, between 0 and 3 inclusive.
     */
-    BC1Block(Color color0, Color color1, const SelectorArray& selectors) {
+    BC1Block(OldColor color0, OldColor color1, const SelectorArray& selectors) {
        SetColor0(color0);
        SetColor1(color1);
        SetSelectors(selectors);
@ -96,12 +96,12 @@ class alignas(8) BC1Block {
    void SetColor0Raw(uint16_t c);
    void SetColor1Raw(uint16_t c);

-    Color GetColor0() const { return Color::Unpack565(GetColor0Raw()); }
-    Color GetColor1() const { return Color::Unpack565(GetColor1Raw()); }
+    OldColor GetColor0() const { return OldColor::Unpack565(GetColor0Raw()); }
+    OldColor GetColor1() const { return OldColor::Unpack565(GetColor1Raw()); }
    ColorPair GetColors() const { return {GetColor0(), GetColor1()}; }

-    void SetColor0(Color c) { SetColor0Raw(c.Pack565()); }
-    void SetColor1(Color c) { SetColor1Raw(c.Pack565()); }
+    void SetColor0(OldColor c) { SetColor0Raw(c.Pack565()); }
+    void SetColor1(OldColor c) { SetColor1Raw(c.Pack565()); }
    void SetColors(ColorPair cs) {
        SetColor0(cs.first);
        SetColor1(cs.second);
--- a/quicktex/s3tc/bc1/BC1Decoder.cpp
+++ b/quicktex/s3tc/bc1/BC1Decoder.cpp
@ -23,9 +23,9 @@
 #include <cassert>
 #include <cstdint>

-#include "../../Color.h"
-#include "../../ColorBlock.h"
-#include "BC1Block.h"
+#include "ColorBlock.h"
+#include "OldColor.h"
+#include "s3tc/bc1/BC1Block.h"

 namespace quicktex::s3tc {

--- a/quicktex/s3tc/bc1/BC1Decoder.h
+++ b/quicktex/s3tc/bc1/BC1Decoder.h
@ -21,18 +21,19 @@

 #include <memory>

-#include "../../ColorBlock.h"
-#include "../../Decoder.h"
-#include "../../Texture.h"
-#include "../interpolator/Interpolator.h"
-#include "BC1Block.h"
+#include "ColorBlock.h"
+#include "Decoder.h"
+#include "s3tc/bc1/BC1Block.h"
+#include "s3tc/interpolator/Interpolator.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {
 class BC1Decoder final : public BlockDecoder<BlockTexture<BC1Block>> {
   public:
    using InterpolatorPtr = std::shared_ptr<Interpolator>;

-    BC1Decoder(bool vwrite_alpha, InterpolatorPtr interpolator) : write_alpha(vwrite_alpha), _interpolator(interpolator) {}
+    BC1Decoder(bool vwrite_alpha, InterpolatorPtr interpolator)
+        : write_alpha(vwrite_alpha), _interpolator(interpolator) {}

    BC1Decoder(bool vwrite_alpha = false) : BC1Decoder(vwrite_alpha, std::make_shared<Interpolator>()) {}

--- a/quicktex/s3tc/bc1/BC1Encoder.cpp
+++ b/quicktex/s3tc/bc1/BC1Encoder.cpp
@ -29,24 +29,28 @@
 #include <stdexcept>
 #include <type_traits>

-#include "../../Color.h"
-#include "../../ColorBlock.h"
-#include "../../Matrix4x4.h"
-#include "../../Texture.h"
-#include "../../Vector4.h"
-#include "../../Vector4Int.h"
-#include "../../bitwiseEnums.h"
-#include "../../util.h"
+#include "ColorBlock.h"
 #include "Histogram.h"
-#include "OrderTable.h"
-#include "SingleColorTable.h"
+#include "Matrix4x4.h"
+#include "OldColor.h"
+#include "Vector4.h"
+#include "Vector4Int.h"
+#include "s3tc/bc1/BC1Block.h"
+#include "s3tc/bc1/OrderTable.h"
+#include "s3tc/bc1/SingleColorTable.h"
+#include "texture/Texture.h"
+#include "util/bitbash.h"
+#include "util/bitwiseEnums.h"
+#include "util/math.h"

 namespace quicktex::s3tc {

 // constructors

-BC1Encoder::BC1Encoder(unsigned int level, ColorMode color_mode, InterpolatorPtr interpolator) : _interpolator(interpolator), _color_mode(color_mode) {
-    if (color_mode != ColorMode::FourColor && color_mode != ColorMode::ThreeColor && color_mode != ColorMode::ThreeColorBlack) {
+BC1Encoder::BC1Encoder(unsigned int level, ColorMode color_mode, InterpolatorPtr interpolator)
+    : _interpolator(interpolator), _color_mode(color_mode) {
+    if (color_mode != ColorMode::FourColor && color_mode != ColorMode::ThreeColor &&
+        color_mode != ColorMode::ThreeColorBlack) {
        throw std::invalid_argument("Encoder color mode must be FourColor, ThreeColor, or ThreeColorBlack");
    }

@ -73,7 +77,9 @@ BC1Encoder::BC1Encoder(unsigned int level, ColorMode color_mode, InterpolatorPtr

 // Getters and Setters
 void BC1Encoder::SetLevel(unsigned level) {
-    if (level > 19) throw std::invalid_argument("Level out of range, bust be between 0 and 18 inclusive");  // theres a secret level 19 but shhhhhh
+    // levels above 19 are rejected; the message advertises 0-18 because level 19 is intentionally undocumented
+    if (level > 19)
+        throw std::invalid_argument(
+            "Level out of range, must be between 0 and 18 inclusive");  // theres a secret level 19 but shhhhhh

    two_ls_passes = false;
    two_ep_passes = false;
@ -249,14 +255,20 @@ void BC1Encoder::SetLevel(unsigned level) {
    _orderings3 = clamp(_orderings3, 1U, OrderTable<3>::BestOrderCount);
 }

-void BC1Encoder::SetOrderings4(unsigned orderings4) { _orderings4 = clamp(orderings4, 1U, OrderTable<4>::BestOrderCount); }
-void BC1Encoder::SetOrderings3(unsigned orderings3) { _orderings3 = clamp(orderings3, 1U, OrderTable<3>::BestOrderCount); }
+void BC1Encoder::SetOrderings4(unsigned orderings4) {
+    _orderings4 = clamp(orderings4, 1U, OrderTable<4>::BestOrderCount);
+}
+void BC1Encoder::SetOrderings3(unsigned orderings3) {
+    _orderings3 = clamp(orderings3, 1U, OrderTable<3>::BestOrderCount);
+}
 void BC1Encoder::SetOrderings(OrderingPair orderings) {
    SetOrderings4(std::get<0>(orderings));
    SetOrderings3(std::get<1>(orderings));
 }

-void BC1Encoder::SetPowerIterations(unsigned int power_iters) { _power_iterations = clamp(power_iters, min_power_iterations, max_power_iterations); }
+void BC1Encoder::SetPowerIterations(unsigned int power_iters) {
+    _power_iterations = clamp(power_iters, min_power_iterations, max_power_iterations);
+}

 // Public methods
 BC1Block BC1Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
@ -304,7 +316,9 @@ BC1Block BC1Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {

    // First refinement pass using ordered cluster fit
    if (result.error > 0 && use_likely_orderings) {
-        for (unsigned iter = 0; iter < total_cf_passes; iter++) { RefineBlockCF<ColorMode::FourColor>(result, pixels, metrics, _error_mode, _orderings4); }
+        for (unsigned iter = 0; iter < total_cf_passes; iter++) {
+            RefineBlockCF<ColorMode::FourColor>(result, pixels, metrics, _error_mode, _orderings4);
+        }
    }

    // try for 3-color block
@ -325,13 +339,15 @@ BC1Block BC1Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
    }

    // try for 3-color block with black
-    if (result.error > 0 && (_color_mode == ColorMode::ThreeColorBlack) && metrics.has_black && !metrics.max.IsBlack()) {
+    if (result.error > 0 && (_color_mode == ColorMode::ThreeColorBlack) && metrics.has_black &&
+        !metrics.max.IsBlack()) {
        EncodeResults trial_result;
        BlockMetrics metrics_no_black = pixels.GetMetrics(true);

        FindEndpoints(trial_result, pixels, metrics_no_black, EndpointMode::PCA, true);
        FindSelectors<ColorMode::ThreeColorBlack>(trial_result, pixels, ErrorMode::Full);
-        RefineBlockLS<ColorMode::ThreeColorBlack>(trial_result, pixels, metrics_no_black, ErrorMode::Full, total_ls_passes);
+        RefineBlockLS<ColorMode::ThreeColorBlack>(trial_result, pixels, metrics_no_black, ErrorMode::Full,
+                                                  total_ls_passes);

        if (trial_result.error < result.error) { result = trial_result; }
    }
@ -343,7 +359,7 @@ BC1Block BC1Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
 }

 // Private methods
-BC1Block BC1Encoder::WriteBlockSolid(Color color) const {
+BC1Block BC1Encoder::WriteBlockSolid(OldColor color) const {
    uint8_t mask = 0xAA;  // 2222
    uint16_t min16, max16;

@ -441,7 +457,7 @@ BC1Block BC1Encoder::WriteBlock(EncodeResults &result) const {
    return BC1Block(ep0, ep1, selectors);
 }

-void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, Color color, bool is_3color) const {
+void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, OldColor color, bool is_3color) const {
    auto &match5 = is_3color ? _single_match5_half : _single_match5;
    auto &match6 = is_3color ? _single_match6_half : _single_match6;

@ -451,13 +467,14 @@ void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, Color color, bo

    result.color_mode = is_3color ? ColorMode::ThreeColor : ColorMode::FourColor;
    result.error = match_r.error + match_g.error + match_b.error;
-    result.low = Color(match_r.low, match_g.low, match_b.low);
-    result.high = Color(match_r.high, match_g.high, match_b.high);
+    result.low = OldColor(match_r.low, match_g.low, match_b.low);
+    result.high = OldColor(match_r.high, match_g.high, match_b.high);
    // selectors decided when writing, no point deciding them now
 }

-void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, const CBlock &pixels, Color color, bool is_3color) const {
-    std::array<Color, 4> colors = _interpolator->InterpolateBC1(result.low, result.high, is_3color);
+void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, const CBlock &pixels, OldColor color,
+                                          bool is_3color) const {
+    std::array<OldColor, 4> colors = _interpolator->InterpolateBC1(result.low, result.high, is_3color);
    Vector4Int result_vector = (Vector4Int)colors[2];

    FindEndpointsSingleColor(result, color, is_3color);
@ -471,40 +488,43 @@ void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, const CBlock &p
    }
 }

-void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, EndpointMode endpoint_mode, bool ignore_black) const {
+void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics,
+                               EndpointMode endpoint_mode, bool ignore_black) const {
    if (metrics.is_greyscale) {
        // specialized greyscale case
        const unsigned fr = pixels.Get(0, 0).r;

        if (metrics.max.r - metrics.min.r < 2) {
            // single color block
-            uint8_t fr5 = (uint8_t)scale8To5(fr);
-            uint8_t fr6 = (uint8_t)scale8To6(fr);
+            uint8_t fr5 = (uint8_t)scale_from_8<5>(fr);
+            uint8_t fr6 = (uint8_t)scale_from_8<6>(fr);

-            result.low = Color(fr5, fr6, fr5);
+            result.low = OldColor(fr5, fr6, fr5);
            result.high = result.low;
        } else {
-            uint8_t lr5 = scale8To5(metrics.min.r);
-            uint8_t lr6 = scale8To6(metrics.min.r);
+            uint8_t lr5 = scale_from_8<5>(metrics.min.r);
+            uint8_t lr6 = scale_from_8<6>(metrics.min.r);

-            uint8_t hr5 = scale8To5(metrics.max.r);
-            uint8_t hr6 = scale8To6(metrics.max.r);
+            uint8_t hr5 = scale_from_8<5>(metrics.max.r);
+            uint8_t hr6 = scale_from_8<6>(metrics.max.r);

-            result.low = Color(lr5, lr6, lr5);
-            result.high = Color(hr5, hr6, hr5);
+            result.low = OldColor(lr5, lr6, lr5);
+            result.high = OldColor(hr5, hr6, hr5);
        }
    } else if (endpoint_mode == EndpointMode::LeastSquares) {
        //  2D Least Squares approach from Humus's example, with added inset and optimal rounding.
-        Color diff = Color(metrics.max.r - metrics.min.r, metrics.max.g - metrics.min.g, metrics.max.b - metrics.min.b);
+        OldColor diff =
+            OldColor(metrics.max.r - metrics.min.r, metrics.max.g - metrics.min.g, metrics.max.b - metrics.min.b);
        Vector4 l = {0, 0, 0};
        Vector4 h = {0, 0, 0};

        auto &sums = metrics.sums;
        auto &min = metrics.min;
+        auto &max = metrics.max;

        unsigned chan0 = (unsigned)diff.MaxChannelRGB();  // primary axis of the bounding box
        l[chan0] = (float)min[chan0];
-        h[chan0] = (float)min[chan0];
+        h[chan0] = (float)max[chan0];

        assert((diff[chan0] >= diff[(chan0 + 1) % 3]) && (diff[chan0] >= diff[(chan0 + 2) % 3]));

@ -521,7 +541,7 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
        float denominator = (float)(16 * sum_xx) - (float)(sum_x * sum_x);

        // once per secondary axis, calculate high and low using least squares
-        if (fabs(denominator) > 1e-8f) {
+        if (abs(denominator) > 1e-8f) {
            for (unsigned i = 1; i < 3; i++) {
                /* each secondary axis is fitted with a linear formula of the form
                 *  y = ax + b
@ -549,8 +569,8 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
            h[c] = ((h[c] - inset) / 255.0f);
        }

-        result.low = Color::PreciseRound565(l);
-        result.high = Color::PreciseRound565(h);
+        result.low = OldColor::PreciseRound565(l);
+        result.high = OldColor::PreciseRound565(h);
    } else if (endpoint_mode == EndpointMode::BoundingBox) {
        // Algorithm from icbc.h compress_dxt1_fast()
        Vector4 l, h;
@ -577,19 +597,20 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
        if (icov_xz < 0) std::swap(l[0], h[0]);
        if (icov_yz < 0) std::swap(l[1], h[1]);

-        result.low = Color::PreciseRound565(l);
-        result.high = Color::PreciseRound565(h);
+        result.low = OldColor::PreciseRound565(l);
+        result.high = OldColor::PreciseRound565(h);
    } else if (endpoint_mode == EndpointMode::BoundingBoxInt) {
        // Algorithm from icbc.h compress_dxt1_fast(), but converted to integer.
+        // TODO: handle constant blue channel better

-        Color min, max;
+        OldColor min, max;

        // rescale and inset values
        for (unsigned c = 0; c < 3; c++) {
            int inset = ((int)(metrics.max[c] - metrics.min[c]) - 8) >> 4;  // 1/16 of delta, with bias

-            min[c] = clamp255(metrics.min[c] + inset);
-            max[c] = clamp255(metrics.max[c] - inset);
+            min[c] = clamp(metrics.min[c] + inset, 0, 255);
+            max[c] = clamp(metrics.max[c] - inset, 0, 255);
        }

        int icov_xz = 0, icov_yz = 0;
@ -607,19 +628,21 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
    } else if (endpoint_mode == EndpointMode::PCA) {
        // the slow way
        // Select 2 colors along the principle axis. (There must be a faster/simpler way.)
-        auto min = Vector4::FromColorRGB(metrics.min);
-        auto max = Vector4::FromColorRGB(metrics.max);
-        auto avg = Vector4::FromColorRGB(metrics.avg);

-        Vector4 axis = {306, 601, 117};  // Luma vector
-        Matrix4x4 covariance = Matrix4x4::Identity();
+        // TODO: handle constant blue channel better
+
+        Color min = metrics.min;
+        Color max = metrics.max;
+        Color avg = metrics.avg;
+
+        Vec<float, 4> axis = {306, 601, 117, 0};  // Luma vector
+        auto covariance = Matrix<float, 4, 4>::identity();

        for (int i = 0; i < 16; i++) {
            auto val = pixels.Get(i);
            if (ignore_black && val.IsBlack()) continue;

-            auto color_vec = Vector4::FromColorRGB(val);
-            Vector4 diff = color_vec - avg;
+            auto diff = val - avg;
            for (unsigned c1 = 0; c1 < 3; c1++) {
                for (unsigned c2 = c1; c2 < 3; c2++) {
                    covariance[c1][c2] += (diff[c1] * diff[c2]);
@ -629,20 +652,24 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
        }

        covariance /= 255.0f;
-        covariance.Mirror();
+        covariance = covariance.mirror();

-        Vector4 delta = max - min;
+        Vec<float, 4> delta = max - min;

        // realign r and g axes to match
        if (covariance[0][2] < 0) delta[0] = -delta[0];  // r vs b
        if (covariance[1][2] < 0) delta[1] = -delta[1];  // g vs b

-        // using the covariance matrix, stretch the delta vector towards the primary axis of the data using power iteration
-        // the end result of this may actually be the same as the least squares approach, will have to do more research
-        for (unsigned power_iter = 0; power_iter < _power_iterations; power_iter++) { delta = covariance * delta; }
+        // using the covariance matrix, stretch the delta vector towards the primary axis of the data using power
+        // iteration. the end result of this may actually be the same as the least squares approach, will have to do
+        // more research
+        for (unsigned power_iter = 0; power_iter < _power_iterations; power_iter++) {
+            delta = covariance.mult(delta);
+        }

        // if we found any correlation, then this is our new axis. otherwise we fallback to the luma vector
-        float k = delta.MaxAbs(3);
+        auto delta_abs = delta.abs();
+        float k = *std::max_element(delta_abs.begin(), delta_abs.end());
        if (k >= 2) { axis = delta * (2048.0f / k); }

        axis *= 16;
@ -653,13 +680,12 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
        int min_index = 0, max_index = 0;

        for (int i = 0; i < 16; i++) {
-            auto val = pixels.Get(i);
-            if (ignore_black && val.IsBlack()) continue;
+            Color val = pixels.Get(i); //todo: fix this mess
+            if (ignore_black && (val.r() | val.g() | val.b()) < 4) continue;

-            auto color_vec = Vector4::FromColorRGB(val);
            // since axis is constant here, I don't think its magnitude actually matters,
            // since we only care about the min or max dot product
-            float dot = color_vec.Dot(axis);
+            float dot = (Vec<float,4>(val)).dot(axis);
            if (dot > max_dot) {
                max_dot = dot;
                max_index = i;
@ -677,20 +703,21 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
    result.color_mode = ColorMode::Incomplete;
 }

-template <BC1Encoder::ColorMode M> void BC1Encoder::FindSelectors(EncodeResults &result, const CBlock &pixels, ErrorMode error_mode) const {
+template <BC1Encoder::ColorMode M>
+void BC1Encoder::FindSelectors(EncodeResults &result, const CBlock &pixels, ErrorMode error_mode) const {
    assert(!((error_mode != ErrorMode::Full) && (bool)(M & ColorMode::ThreeColor)));

    const int color_count = (unsigned)M & 0x0F;

-    std::array<Color, 4> colors = _interpolator->InterpolateBC1(result.low, result.high, color_count == 3);
+    std::array<OldColor, 4> colors = _interpolator->InterpolateBC1(result.low, result.high, color_count == 3);
    std::array<Vector4Int, 4> color_vectors;

    if (color_count == 4) {
-        color_vectors = {Vector4Int::FromColorRGB(colors[0]), Vector4Int::FromColorRGB(colors[2]), Vector4Int::FromColorRGB(colors[3]),
-                         Vector4Int::FromColorRGB(colors[1])};
+        color_vectors = {Vector4Int::FromColorRGB(colors[0]), Vector4Int::FromColorRGB(colors[2]),
+                         Vector4Int::FromColorRGB(colors[3]), Vector4Int::FromColorRGB(colors[1])};
    } else {
-        color_vectors = {Vector4Int::FromColorRGB(colors[0]), Vector4Int::FromColorRGB(colors[2]), Vector4Int::FromColorRGB(colors[1]),
-                         Vector4Int::FromColorRGB(colors[3])};
+        color_vectors = {Vector4Int::FromColorRGB(colors[0]), Vector4Int::FromColorRGB(colors[2]),
+                         Vector4Int::FromColorRGB(colors[1]), Vector4Int::FromColorRGB(colors[3])};
    }

    unsigned total_error = 0;
@ -714,7 +741,8 @@ template <BC1Encoder::ColorMode M> void BC1Encoder::FindSelectors(EncodeResults
                // llvm is just going to unswitch this anyways so it's not an issue
                auto diff = pixel_vector - color_vectors[selector];
                total_error += diff.SqrMag();
-                if (i % 4 != 0 && total_error >= result.error) break;  // check only once per row if we're generating too much error
+                if (i % 4 != 0 && total_error >= result.error)
+                    break;  // check only once per row if we're generating too much error
            }

            result.selectors[i] = selector;
@ -727,7 +755,7 @@ template <BC1Encoder::ColorMode M> void BC1Encoder::FindSelectors(EncodeResults
            Vector4Int pixel_vector = Vector4Int::FromColorRGB(pixels.Get(i));
            auto diff = pixel_vector - color_vectors[0];
            float sel_f = (float)diff.Dot(axis) * f + 0.5f;
-            uint8_t sel = (uint8_t)clampi((int)sel_f, 1, 3);
+            uint8_t sel = (uint8_t)clamp<int>((int)sel_f, 1, 3);

            unsigned err0 = (color_vectors[sel - 1] - pixel_vector).SqrMag();
            unsigned err1 = (color_vectors[sel] - pixel_vector).SqrMag();
@ -779,7 +807,8 @@ template <BC1Encoder::ColorMode M> void BC1Encoder::FindSelectors(EncodeResults
    result.color_mode = M;
 }

-template <BC1Encoder::ColorMode M> bool BC1Encoder::RefineEndpointsLS(EncodeResults &result, const CBlock &pixels, BlockMetrics metrics) const {
+template <BC1Encoder::ColorMode M>
+bool BC1Encoder::RefineEndpointsLS(EncodeResults &result, const CBlock &pixels, BlockMetrics metrics) const {
    const int color_count = (unsigned)M & 0x0F;
    static_assert(color_count == 3 || color_count == 4);
    assert(result.color_mode != ColorMode::Incomplete);
@ -790,11 +819,12 @@ template <BC1Encoder::ColorMode M> bool BC1Encoder::RefineEndpointsLS(EncodeResu
    Vector4 matrix = Vector4(0);

    for (int i = 0; i < 16; i++) {
-        const Color color = pixels.Get(i);
+        const OldColor color = pixels.Get(i);
        const uint8_t sel = result.selectors[i];

        if ((bool)(M & ColorMode::ThreeColorBlack) && color.IsBlack()) continue;
-        if ((bool)(M & ColorMode::ThreeColor) && sel == 3U) continue;  // NOTE: selectors for 3-color are in linear order here, but not in original
+        if ((bool)(M & ColorMode::ThreeColor) && sel == 3U)
+            continue;  // NOTE: selectors for 3-color are in linear order here, but not in original
        assert(sel < color_count);

        const Vector4Int color_vector = Vector4Int::FromColorRGB(color);
@ -805,7 +835,7 @@ template <BC1Encoder::ColorMode M> bool BC1Encoder::RefineEndpointsLS(EncodeResu

    // invert matrix
    float det = matrix.Determinant2x2();  // z00 * z11 - z01 * z10;
-    if (fabs(det) < 1e-8f) {
+    if (abs(det) < 1e-8f) {
        result.color_mode = ColorMode::Incomplete;
        return false;
    }
@ -820,12 +850,14 @@ template <BC1Encoder::ColorMode M> bool BC1Encoder::RefineEndpointsLS(EncodeResu
    Vector4 high = (matrix[2] * q00) + (matrix[3] * q10);

    result.color_mode = M;
-    result.low = Color::PreciseRound565(low);
-    result.high = Color::PreciseRound565(high);
+    result.low = OldColor::PreciseRound565(low);
+    result.high = OldColor::PreciseRound565(high);
    return true;
 }

-template <BC1Encoder::ColorMode M> void BC1Encoder::RefineEndpointsLS(EncodeResults &result, std::array<Vector4, 17> &sums, Vector4 &matrix, Hash hash) const {
+template <BC1Encoder::ColorMode M>
+void BC1Encoder::RefineEndpointsLS(EncodeResults &result, std::array<Vector4, 17> &sums, Vector4 &matrix,
+                                   Hash hash) const {
    const int color_count = (unsigned)M & 0x0F;
    static_assert(color_count == 3 || color_count == 4);
    assert(result.color_mode != ColorMode::Incomplete);
@ -846,12 +878,13 @@ template <BC1Encoder::ColorMode M> void BC1Encoder::RefineEndpointsLS(EncodeResu
    Vector4 high = (matrix[2] * q00) + (matrix[3] * q10);

    result.color_mode = M;
-    result.low = Color::PreciseRound565(low);
-    result.high = Color::PreciseRound565(high);
+    result.low = OldColor::PreciseRound565(low);
+    result.high = OldColor::PreciseRound565(high);
 }

 template <BC1Encoder::ColorMode M>
-void BC1Encoder::RefineBlockLS(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode, unsigned passes) const {
+void BC1Encoder::RefineBlockLS(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics,
+                               ErrorMode error_mode, unsigned passes) const {
    assert(error_mode != ErrorMode::None || passes == 1);

    for (unsigned pass = 0; pass < passes; pass++) {
@ -876,7 +909,8 @@ void BC1Encoder::RefineBlockLS(EncodeResults &result, const CBlock &pixels, cons
 }

 template <BC1Encoder::ColorMode M>
-void BC1Encoder::RefineBlockCF(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode, unsigned orderings) const {
+void BC1Encoder::RefineBlockCF(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics,
+                               ErrorMode error_mode, unsigned orderings) const {
    const int color_count = (unsigned)M & 0x0F;
    static_assert(color_count == 3 || color_count == 4);
    assert(result.color_mode != ColorMode::Incomplete);
@ -955,7 +989,8 @@ void BC1Encoder::EndpointSearch(EncodeResults &result, const CBlock &pixels) con

    for (unsigned i = 0; i < _search_rounds; i++) {
        const unsigned voxel_index = (unsigned)(i & 15);
-        assert((unsigned)Voxels[(unsigned)Voxels[voxel_index][3]][3] == voxel_index);  // make sure voxels are symmetrical
+        assert((unsigned)Voxels[(unsigned)Voxels[voxel_index][3]][3] ==
+               voxel_index);  // make sure voxels are symmetrical

        if ((int)(i & 31) == forbidden_direction) continue;

--- a/quicktex/s3tc/bc1/BC1Encoder.h
+++ b/quicktex/s3tc/bc1/BC1Encoder.h
@ -26,13 +26,13 @@
 #include <memory>
 #include <tuple>

-#include "../../Color.h"
-#include "../../ColorBlock.h"
-#include "../../Encoder.h"
-#include "../../Texture.h"
-#include "../interpolator/Interpolator.h"
-#include "BC1Block.h"
-#include "SingleColorTable.h"
+#include "ColorBlock.h"
+#include "Encoder.h"
+#include "OldColor.h"
+#include "s3tc/bc1/BC1Block.h"
+#include "s3tc/bc1/SingleColorTable.h"
+#include "s3tc/interpolator/Interpolator.h"
+#include "texture/BlockTexture.h"

 namespace quicktex {
 class Vector4;
@ -79,7 +79,8 @@ class BC1Encoder final : public BlockEncoder<BlockTexture<BC1Block>> {
    };

    enum class EndpointMode {
-        // Use 2D least squares+inset+optimal rounding (the method used in Humus's GPU texture encoding demo), instead of PCA.
+        // Use 2D least squares+inset+optimal rounding (the method used in Humus's GPU texture encoding demo), instead
+        // of PCA.
        // Around 18% faster; average quality ranges from very slightly lower to better (depends on the content).
        LeastSquares,

@ -101,7 +102,8 @@ class BC1Encoder final : public BlockEncoder<BlockTexture<BC1Block>> {

    BC1Encoder(unsigned level, ColorMode color_mode, InterpolatorPtr interpolator);

-    BC1Encoder(unsigned int level = 5, ColorMode color_mode = ColorMode::FourColor) : BC1Encoder(level, color_mode, std::make_shared<Interpolator>()) {}
+    BC1Encoder(unsigned int level = 5, ColorMode color_mode = ColorMode::FourColor)
+        : BC1Encoder(level, color_mode, std::make_shared<Interpolator>()) {}

    // Getters and Setters
    void SetLevel(unsigned level);
@ -141,8 +143,8 @@ class BC1Encoder final : public BlockEncoder<BlockTexture<BC1Block>> {

    // Unpacked BC1 block with metadata
    struct EncodeResults {
-        Color low;
-        Color high;
+        OldColor low;
+        OldColor high;
        std::array<uint8_t, 16> selectors = {0};
        ColorMode color_mode = ColorMode::Incomplete;
        bool solid = false;
@ -169,24 +171,29 @@ class BC1Encoder final : public BlockEncoder<BlockTexture<BC1Block>> {
    unsigned _orderings4;
    unsigned _orderings3;

-    BC1Block WriteBlockSolid(Color color) const;
+    BC1Block WriteBlockSolid(OldColor color) const;
    BC1Block WriteBlock(EncodeResults &result) const;

-    void FindEndpoints(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, EndpointMode endpoint_mode, bool ignore_black = false) const;
-    void FindEndpointsSingleColor(EncodeResults &result, Color color, bool is_3color = false) const;
-    void FindEndpointsSingleColor(EncodeResults &result, const CBlock &pixels, Color color, bool is_3color) const;
+    void FindEndpoints(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics,
+                       EndpointMode endpoint_mode, bool ignore_black = false) const;
+    void FindEndpointsSingleColor(EncodeResults &result, OldColor color, bool is_3color = false) const;
+    void FindEndpointsSingleColor(EncodeResults &result, const CBlock &pixels, OldColor color, bool is_3color) const;

    template <ColorMode M> void FindSelectors(EncodeResults &result, const CBlock &pixels, ErrorMode error_mode) const;

-    template <ColorMode M> bool RefineEndpointsLS(EncodeResults &result, const CBlock &pixels, BlockMetrics metrics) const;
-
-    template <ColorMode M> void RefineEndpointsLS(EncodeResults &result, std::array<Vector4, 17> &sums, Vector4 &matrix, Hash hash) const;
+    template <ColorMode M>
+    bool RefineEndpointsLS(EncodeResults &result, const CBlock &pixels, BlockMetrics metrics) const;

    template <ColorMode M>
-    void RefineBlockLS(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode, unsigned passes) const;
+    void RefineEndpointsLS(EncodeResults &result, std::array<Vector4, 17> &sums, Vector4 &matrix, Hash hash) const;

    template <ColorMode M>
-    void RefineBlockCF(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode, unsigned orderings) const;
+    void RefineBlockLS(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode,
+                       unsigned passes) const;
+
+    template <ColorMode M>
+    void RefineBlockCF(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode,
+                       unsigned orderings) const;

    void EndpointSearch(EncodeResults &result, const CBlock &pixels) const;
 };
--- a/quicktex/s3tc/bc1/Histogram.h
+++ b/quicktex/s3tc/bc1/Histogram.h
@ -27,10 +27,10 @@
 #include <mutex>
 #include <numeric>

-#include "../../Vector4.h"
-#include "../../util.h"
+#include "Vector4.h"
+#include "util/math.h"

-namespace quicktex::s3tc  {
+namespace quicktex::s3tc {
 template <size_t N> class Histogram {
   public:
    using Hash = uint16_t;
@ -71,7 +71,7 @@ template <size_t N> class Histogram {
    unsigned GetPacked() const {
        Hash packed = 0;

-        for (unsigned i = 0; i < (N-1); i++) {
+        for (unsigned i = 0; i < (N - 1); i++) {
            assert(_bins[i] <= (1U << 4) - 1U);
            packed |= static_cast<uint16_t>(_bins[i]) << (i * 4U);
        }
--- a/quicktex/s3tc/bc1/OrderTable.cpp
+++ b/quicktex/s3tc/bc1/OrderTable.cpp
@ -21,7 +21,7 @@

 #include <array>

-#include "../../Vector4.h"
+#include "Vector4.h"

 namespace quicktex::s3tc  {
 using Hash = uint16_t;
--- a/quicktex/s3tc/bc1/OrderTable.h
+++ b/quicktex/s3tc/bc1/OrderTable.h
@ -29,8 +29,9 @@
 #include <mutex>
 #include <type_traits>

-#include "../../Vector4.h"
 #include "Histogram.h"
+#include "Vector4.h"
+#include "util/math.h"

 namespace quicktex::s3tc {
 template <size_t N> class OrderTable {
@ -73,7 +74,7 @@ template <size_t N> class OrderTable {
                for (unsigned sel = 0; sel < N; sel++) factor_matrix += (Weights[sel] * h[sel]);

                float det = factor_matrix.Determinant2x2();
-                if (fabs(det) < 1e-8f) {
+                if (abs(det) < 1e-8f) {
                    factors->at(i) = Vector4(0);
                } else {
                    std::swap(factor_matrix[0], factor_matrix[3]);
@ -113,7 +114,9 @@ template <size_t N> class OrderTable {
        return factors->at(hash);
    }

-    static bool IsSingleColor(Hash hash) { return (std::find(SingleColorHashes.begin(), SingleColorHashes.end(), hash) != SingleColorHashes.end()); }
+    static bool IsSingleColor(Hash hash) {
+        return (std::find(SingleColorHashes.begin(), SingleColorHashes.end(), hash) != SingleColorHashes.end());
+    }

   private:
    static std::mutex table_mutex;
--- a/quicktex/s3tc/bc1/SingleColorTable.h
+++ b/quicktex/s3tc/bc1/SingleColorTable.h
@ -23,10 +23,11 @@
 #include <cstdint>
 #include <memory>

-#include "../../util.h"
-#include "../interpolator/Interpolator.h"
+#include "s3tc/interpolator/Interpolator.h"
+#include "util/bitbash.h"
+#include "util/math.h"

-namespace quicktex::s3tc  {
+namespace quicktex::s3tc {

 struct BC1MatchEntry {
    uint8_t high;
@ -59,10 +60,10 @@ template <size_t B, size_t N> MatchListPtr SingleColorTable(InterpolatorPtr inte
        // TODO: Can probably avoid testing for values that definitely won't yield good results,
        // e.g. low8 and high8 both much smaller or larger than index
        for (uint8_t low = 0; low < Size; low++) {
-            uint8_t low8 = (B == 5) ? scale5To8(low) : scale6To8(low);
+            uint8_t low8 = scale_to_8<B>(low);

            for (uint8_t high = 0; high < Size; high++) {
-                uint8_t high8 = (B == 5) ? scale5To8(high) : scale6To8(high);
+                uint8_t high8 = scale_to_8<B>(high);
                uint8_t value;

                if (use_8bit) {
@ -71,10 +72,10 @@ template <size_t B, size_t N> MatchListPtr SingleColorTable(InterpolatorPtr inte
                    value = (B == 5) ? interpolator->Interpolate5(high, low) : interpolator->Interpolate6(high, low);
                }

-                unsigned new_error = iabs(value - (int)i);
+                unsigned new_error = abs(value - (int)i);

                // We only need to factor in 3% error in BC1 ideal mode.
-                if (ideal) new_error += (iabs(high8 - (int)low8) * 3) / 100;
+                if (ideal) new_error += (abs(high8 - (int)low8) * 3) / 100;

                if ((new_error < error) || (new_error == error && low == high)) {
                    assert(new_error <= UINT8_MAX);
--- a/quicktex/s3tc/bc1/_bindings.cpp
+++ b/quicktex/s3tc/bc1/_bindings.cpp
@ -23,16 +23,12 @@
 #include <pybind11/stl.h>

 #include <array>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <string>
+#include <memory>

-#include "../../Decoder.h"
-#include "../../Encoder.h"
-#include "../interpolator/Interpolator.h"
-#include "BC1Decoder.h"
-#include "BC1Encoder.h"
+#include "s3tc/bc1/BC1Block.h"
+#include "s3tc/bc1/BC1Decoder.h"
+#include "s3tc/bc1/BC1Encoder.h"
+#include "s3tc/interpolator/Interpolator.h"

 namespace py = pybind11;
 namespace quicktex::bindings {
@ -50,7 +46,7 @@ void InitBC1(py::module_ &s3tc) {
    bc1_block.doc() = "A single BC1 block.";

    bc1_block.def(py::init<>());
-    bc1_block.def(py::init<Color, Color, BC1Block::SelectorArray>(), "color0"_a, "color1"_a, "selectors"_a, R"doc(
+    bc1_block.def(py::init<OldColor, OldColor, BC1Block::SelectorArray>(), "color0"_a, "color1"_a, "selectors"_a, R"doc(
        Create a new BC1Block with the specified endpoints and selectors

        :param color0: The first endpoint
@ -58,7 +54,8 @@ void InitBC1(py::module_ &s3tc) {
        :param selectors: the selectors as a 4x4 list of integers, between 0 and 3 inclusive.
    )doc");

-    bc1_block.def_property("endpoints", &BC1Block::GetColors, &BC1Block::SetColors, "The block's endpoint colors as a 2-tuple.");
+    bc1_block.def_property("endpoints", &BC1Block::GetColors, &BC1Block::SetColors,
+                           "The block's endpoint colors as a 2-tuple.");
    bc1_block.def_property("selectors", &BC1Block::GetSelectors, &BC1Block::SetSelectors, R"doc(
        The block's selectors as a 4x4 list of integers between 0 and 3 inclusive.

@ -83,27 +80,42 @@ void InitBC1(py::module_ &s3tc) {
    // region BC1Encoder
    py::class_<BC1Encoder> bc1_encoder(bc1, "BC1Encoder", "Encodes RGB textures to BC1.");

-    py::enum_<BC1Encoder::EndpointMode>(bc1_encoder, "EndpointMode", "Enum representing various methods of finding endpoints in a block.")
-        .value("LeastSquares", BC1Encoder::EndpointMode::LeastSquares, "Find endpoints using a 2D least squares approach.")
-        .value("BoundingBox", BC1Encoder::EndpointMode::BoundingBox, "Find endpoints using a simple bounding box. Fast but inaccurate.")
-        .value("BoundingBoxInt", BC1Encoder::EndpointMode::BoundingBoxInt, "Same as BoundingBox but using integers, slightly faster.")
-        .value("PCA", BC1Encoder::EndpointMode::PCA, "Find endpoints using Principle Component Analysis. Slowest but highest quality method.");
+    py::enum_<BC1Encoder::EndpointMode>(bc1_encoder, "EndpointMode",
+                                        "Enum representing various methods of finding endpoints in a block.")
+        .value("LeastSquares", BC1Encoder::EndpointMode::LeastSquares,
+               "Find endpoints using a 2D least squares approach.")
+        .value("BoundingBox", BC1Encoder::EndpointMode::BoundingBox,
+               "Find endpoints using a simple bounding box. Fast but inaccurate.")
+        .value("BoundingBoxInt", BC1Encoder::EndpointMode::BoundingBoxInt,
+               "Same as BoundingBox but using integers, slightly faster.")
+        .value("PCA", BC1Encoder::EndpointMode::PCA,
+               "Find endpoints using Principle Component Analysis. Slowest but highest quality method.");

-    py::enum_<BC1Encoder::ErrorMode>(bc1_encoder, "ErrorMode", "Enum representing various methods of finding selectors in a block.")
-        .value("None", BC1Encoder::ErrorMode::None, "The same as Faster but error is not calculated. This disables any cluster-fit options")
-        .value("Faster", BC1Encoder::ErrorMode::Faster, "Use a slightly lower quality, but ~30% faster MSE evaluation function for 4-color blocks.")
+    py::enum_<BC1Encoder::ErrorMode>(bc1_encoder, "ErrorMode",
+                                     "Enum representing various methods of finding selectors in a block.")
+        .value("None", BC1Encoder::ErrorMode::None,
+               "The same as Faster but error is not calculated. This disables any cluster-fit options")
+        .value("Faster", BC1Encoder::ErrorMode::Faster,
+               "Use a slightly lower quality, but ~30% faster MSE evaluation function for 4-color blocks.")
        .value("Check2", BC1Encoder::ErrorMode::Check2, "Default error-checking method.")
-        .value("Full", BC1Encoder::ErrorMode::Full, "Examine all colors to compute selectors/MSE. Slower but slightly higher quality.");
+        .value("Full", BC1Encoder::ErrorMode::Full,
+               "Examine all colors to compute selectors/MSE. Slower but slightly higher quality.");

-    py::enum_<BC1Encoder::ColorMode>(bc1_encoder, "ColorMode", "Enum representing various methods of writing BC1 blocks.")
-        .value("FourColor", BC1Encoder::ColorMode::FourColor, "Default color mode. Only 4-color blocks will be output, where color0 > color1")
-        .value("ThreeColor", BC1Encoder::ColorMode::ThreeColor, "Additionally use 3-color blocks when they have a lower error, where color0 <= color1")
+    py::enum_<BC1Encoder::ColorMode>(bc1_encoder, "ColorMode",
+                                     "Enum representing various methods of writing BC1 blocks.")
+        .value("FourColor", BC1Encoder::ColorMode::FourColor,
+               "Default color mode. Only 4-color blocks will be output, where color0 > color1")
+        .value("ThreeColor", BC1Encoder::ColorMode::ThreeColor,
+               "Additionally use 3-color blocks when they have a lower error, where color0 <= color1")
        .value("ThreeColorBlack", BC1Encoder::ColorMode::ThreeColorBlack,
-               "Additionally use 3-color blocks with black pixels (selector 3). Note that this requires your shader/engine to not sample the alpha channel "
+               "Additionally use 3-color blocks with black pixels (selector 3). Note that this requires your "
+               "shader/engine to not sample the alpha channel "
               "when using a BC1 texture.");

-    bc1_encoder.def(py::init<unsigned, BC1Encoder::ColorMode>(), "level"_a = 5, "color_mode"_a = BC1Encoder::ColorMode::FourColor);
-    bc1_encoder.def(py::init<unsigned, BC1Encoder::ColorMode, InterpolatorPtr>(), "level"_a, "color_mode"_a, "interpolator"_a, R"doc(
+    bc1_encoder.def(py::init<unsigned, BC1Encoder::ColorMode>(), "level"_a = 5,
+                    "color_mode"_a = BC1Encoder::ColorMode::FourColor);
+    bc1_encoder.def(py::init<unsigned, BC1Encoder::ColorMode, InterpolatorPtr>(), "level"_a, "color_mode"_a,
+                    "interpolator"_a, R"doc(
        Create a new BC1 encoder with the specified preset level, color mode, and interpolator.

        :param int level: The preset level of the resulting encoder, between 0 and 18 inclusive. See :py:meth:`set_level` for more information. Default: 5.
@ -125,44 +137,56 @@ void InitBC1(py::module_ &s3tc) {
        :param int level: The preset level of the resulting encoder, between 0 and 18 inclusive. Default: 5.
    )doc");

-    bc1_encoder.def_property_readonly("interpolator", &BC1Encoder::GetInterpolator,
-                                      "The :py:class:`~quicktex.s3tc.interpolator.Interpolator` used by this encoder. This is a readonly property.");
-    bc1_encoder.def_property_readonly("color_mode", &BC1Encoder::GetColorMode,
-                                      "The :py:class:`~quicktex.s3tc.bc1.BC1Encoder.ColorMode` used by this encoder. This is a readonly property.");
+    bc1_encoder.def_property_readonly(
+        "interpolator", &BC1Encoder::GetInterpolator,
+        "The :py:class:`~quicktex.s3tc.interpolator.Interpolator` used by this encoder. This is a readonly property.");
+    bc1_encoder.def_property_readonly(
+        "color_mode", &BC1Encoder::GetColorMode,
+        "The :py:class:`~quicktex.s3tc.bc1.BC1Encoder.ColorMode` used by this encoder. This is a readonly property.");

    // Advanced API

-    bc1_encoder.def_property("error_mode", &BC1Encoder::GetErrorMode, &BC1Encoder::SetErrorMode, "The error mode used by this encoder for finding selectors.");
-    bc1_encoder.def_property("endpoint_mode", &BC1Encoder::GetEndpointMode, &BC1Encoder::SetEndpointMode, "The endpoint mode used by this encoder.");
+    bc1_encoder.def_property("error_mode", &BC1Encoder::GetErrorMode, &BC1Encoder::SetErrorMode,
+                             "The error mode used by this encoder for finding selectors.");
+    bc1_encoder.def_property("endpoint_mode", &BC1Encoder::GetEndpointMode, &BC1Encoder::SetEndpointMode,
+                             "The endpoint mode used by this encoder.");

    bc1_encoder.def_readwrite("two_ls_passes", &BC1Encoder::two_ls_passes,
                              "Use 2 least squares pass, instead of one (same as stb_dxt's HIGHQUAL option).\n"
                              "Recommended if you're setting the orderings settings greater than 0.");

-    bc1_encoder.def_readwrite("two_ep_passes", &BC1Encoder::two_ep_passes, "Try 2 different ways of choosing the initial endpoints.");
+    bc1_encoder.def_readwrite("two_ep_passes", &BC1Encoder::two_ep_passes,
+                              "Try 2 different ways of choosing the initial endpoints.");

-    bc1_encoder.def_readwrite("two_cf_passes", &BC1Encoder::two_cf_passes,
-                              "Greatly increase encode time, with very slightly higher quality.\n"
-                              "Same as squish's iterative cluster fit option. Not really worth the tiny boost in quality, "
-                              "unless you just don't care about performance at all.");
+    bc1_encoder.def_readwrite(
+        "two_cf_passes", &BC1Encoder::two_cf_passes,
+        "Greatly increase encode time, with very slightly higher quality.\n"
+        "Same as squish's iterative cluster fit option. Not really worth the tiny boost in quality, "
+        "unless you just don't care about performance at all.");

-    bc1_encoder.def_readwrite("exhaustive", &BC1Encoder::exhaustive,
-                              "Check all total orderings - *very* slow. The encoder is not designed to be used in this way");
+    bc1_encoder.def_readwrite(
+        "exhaustive", &BC1Encoder::exhaustive,
+        "Check all total orderings - *very* slow. The encoder is not designed to be used in this way");

    bc1_encoder.def_property("search_rounds", &BC1Encoder::GetSearchRounds, &BC1Encoder::SetSearchRounds,
-                             "Setting search rounds > 0 enables refining the final endpoints by examining nearby colors. A higher value has a higher quality "
+                             "Setting search rounds > 0 enables refining the final endpoints by examining nearby "
+                             "colors. A higher value has a higher quality "
                             "at the expense of performance.");

-    bc1_encoder.def_property("orderings", &BC1Encoder::GetOrderings, &BC1Encoder::SetOrderings,
-                             "setting the orderings > 0 enables ordered cluster fit using a lookup table of similar blocks. Value is a tuple of (4 color "
-                             "orders, 3 color orders), where higher values have a higher quality at the expense of performance.");
+    bc1_encoder.def_property(
+        "orderings", &BC1Encoder::GetOrderings, &BC1Encoder::SetOrderings,
+        "setting the orderings > 0 enables ordered cluster fit using a lookup table of similar blocks. Value is a "
+        "tuple of (4 color "
+        "orders, 3 color orders), where higher values have a higher quality at the expense of performance.");

    bc1_encoder.def_readonly_static("max_power_iterations", &BC1Encoder::max_power_iterations);
    bc1_encoder.def_readonly_static("min_power_iterations", &BC1Encoder::min_power_iterations);

-    bc1_encoder.def_property("power_iterations", &BC1Encoder::GetPowerIterations, &BC1Encoder::SetPowerIterations,
-                             "Number of power iterations used with the PCA endpoint mode. Value should be around 4 to 6. "
-                             "Automatically clamped to between :py:const:`BC1Encoder.min_power_iterations` and :py:const:`BC1Encoder.max_power_iterations`");
+    bc1_encoder.def_property(
+        "power_iterations", &BC1Encoder::GetPowerIterations, &BC1Encoder::SetPowerIterations,
+        "Number of power iterations used with the PCA endpoint mode. Value should be around 4 to 6. "
+        "Automatically clamped to between :py:const:`BC1Encoder.min_power_iterations` and "
+        ":py:const:`BC1Encoder.max_power_iterations`");
    // endregion

    // region BC1Decoder
@ -185,8 +209,10 @@ void InitBC1(py::module_ &s3tc) {
        :returns: A new RawTexture with the same dimensions as the input
    )doc");

-    bc1_decoder.def_property_readonly("interpolator", &BC1Decoder::GetInterpolator, "The interpolator used by this decoder. This is a readonly property.");
-    bc1_decoder.def_readwrite("write_alpha", &BC1Decoder::write_alpha, "Determines if the alpha channel of the output is written to.");
+    bc1_decoder.def_property_readonly("interpolator", &BC1Decoder::GetInterpolator,
+                                      "The interpolator used by this decoder. This is a readonly property.");
+    bc1_decoder.def_readwrite("write_alpha", &BC1Decoder::write_alpha,
+                              "Determines if the alpha channel of the output is written to.");
    // endregion
 }
 }  // namespace quicktex::bindings
--- a/quicktex/s3tc/bc3/BC3Block.h
+++ b/quicktex/s3tc/bc3/BC3Block.h
@ -21,8 +21,8 @@

 #include <utility>

-#include "../bc1/BC1Block.h"
-#include "../bc4/BC4Block.h"
+#include "s3tc/bc1/BC1Block.h"
+#include "s3tc/bc4/BC4Block.h"

 namespace quicktex::s3tc {

@ -54,7 +54,9 @@ class alignas(8) BC3Block {
        color_block = blocks.second;
    }

-    bool operator==(const BC3Block &Rhs) const { return alpha_block == Rhs.alpha_block && color_block == Rhs.color_block; }
+    bool operator==(const BC3Block &Rhs) const {
+        return alpha_block == Rhs.alpha_block && color_block == Rhs.color_block;
+    }
    bool operator!=(const BC3Block &Rhs) const { return !(Rhs == *this); }
 };
 }  // namespace quicktex::s3tc
--- a/quicktex/s3tc/bc3/BC3Decoder.h
+++ b/quicktex/s3tc/bc3/BC3Decoder.h
@ -21,13 +21,13 @@

 #include <memory>

-#include "../../ColorBlock.h"
-#include "../../Decoder.h"
-#include "../../Texture.h"
-#include "../bc1/BC1Decoder.h"
-#include "../bc4/BC4Decoder.h"
-#include "../interpolator/Interpolator.h"
-#include "BC3Block.h"
+#include "ColorBlock.h"
+#include "Decoder.h"
+#include "s3tc/bc1/BC1Decoder.h"
+#include "s3tc/bc3/BC3Block.h"
+#include "s3tc/bc4/BC4Decoder.h"
+#include "s3tc/interpolator/Interpolator.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {

@ -37,7 +37,8 @@ class BC3Decoder : public BlockDecoder<BlockTexture<BC3Block>> {
    using BC4DecoderPtr = std::shared_ptr<BC4Decoder>;
    using InterpolatorPtr = std::shared_ptr<Interpolator>;

-    BC3Decoder(InterpolatorPtr interpolator) : _bc1_decoder(std::make_shared<BC1Decoder>(interpolator)), _bc4_decoder(std::make_shared<BC4Decoder>(3)) {}
+    BC3Decoder(InterpolatorPtr interpolator)
+        : _bc1_decoder(std::make_shared<BC1Decoder>(interpolator)), _bc4_decoder(std::make_shared<BC4Decoder>(3)) {}

    BC3Decoder() : BC3Decoder(std::make_shared<Interpolator>()) {}

--- a/quicktex/s3tc/bc3/BC3Encoder.cpp
+++ b/quicktex/s3tc/bc3/BC3Encoder.cpp
@ -19,10 +19,8 @@

 #include "BC3Encoder.h"

-#include "../../ColorBlock.h"
-#include "../bc1/BC1Block.h"
-#include "../bc4/BC4Block.h"
-#include "BC3Block.h"
+#include "ColorBlock.h"
+#include "s3tc/bc3/BC3Block.h"

 namespace quicktex::s3tc {
 BC3Block BC3Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
--- a/quicktex/s3tc/bc3/BC3Encoder.h
+++ b/quicktex/s3tc/bc3/BC3Encoder.h
@ -21,13 +21,13 @@

 #include <memory>

-#include "../../ColorBlock.h"
-#include "../../Encoder.h"
-#include "../../Texture.h"
-#include "../bc1/BC1Encoder.h"
-#include "../bc4/BC4Encoder.h"
-#include "../interpolator/Interpolator.h"
-#include "BC3Block.h"
+#include "ColorBlock.h"
+#include "Encoder.h"
+#include "s3tc/bc1/BC1Encoder.h"
+#include "s3tc/bc3/BC3Block.h"
+#include "s3tc/bc4/BC4Encoder.h"
+#include "s3tc/interpolator/Interpolator.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {

@ -38,7 +38,8 @@ class BC3Encoder : public BlockEncoder<BlockTexture<BC3Block>> {
    using InterpolatorPtr = std::shared_ptr<Interpolator>;

    BC3Encoder(unsigned level, InterpolatorPtr interpolator)
-        : _bc1_encoder(std::make_shared<BC1Encoder>(level, BC1Encoder::ColorMode::FourColor, interpolator)), _bc4_encoder(std::make_shared<BC4Encoder>(3)) {}
+        : _bc1_encoder(std::make_shared<BC1Encoder>(level, BC1Encoder::ColorMode::FourColor, interpolator)),
+          _bc4_encoder(std::make_shared<BC4Encoder>(3)) {}

    BC3Encoder(unsigned level = 5) : BC3Encoder(level, std::make_shared<Interpolator>()) {}

--- a/quicktex/s3tc/bc3/_bindings.cpp
+++ b/quicktex/s3tc/bc3/_bindings.cpp
@ -22,16 +22,14 @@
 #include <pybind11/pybind11.h>

 #include <array>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <string>
+#include <memory>

-#include "../../Decoder.h"
-#include "../../Encoder.h"
-#include "../interpolator/Interpolator.h"
-#include "BC3Decoder.h"
-#include "BC3Encoder.h"
+#include "s3tc/bc1/BC1Block.h"
+#include "s3tc/bc3/BC3Block.h"
+#include "s3tc/bc3/BC3Decoder.h"
+#include "s3tc/bc3/BC3Encoder.h"
+#include "s3tc/bc4/BC4Block.h"
+#include "s3tc/interpolator/Interpolator.h"

 namespace py = pybind11;
 namespace quicktex::bindings {
@ -59,7 +57,8 @@ void InitBC3(py::module_ &s3tc) {

    bc3_block.def_readwrite("alpha_block", &BC3Block::alpha_block, "The BC4 block used for alpha data.");
    bc3_block.def_readwrite("color_block", &BC3Block::color_block, "The BC1 block used for rgb data.");
-    bc3_block.def_property("blocks", &BC3Block::GetBlocks, &BC3Block::SetBlocks, "The BC4 and BC1 blocks that make up this block as a 2-tuple.");
+    bc3_block.def_property("blocks", &BC3Block::GetBlocks, &BC3Block::SetBlocks,
+                           "The BC4 and BC1 blocks that make up this block as a 2-tuple.");
    // endregion

    // region BC3Texture
@ -88,10 +87,12 @@ void InitBC3(py::module_ &s3tc) {
        :returns: A new BC3Texture with the same dimension as the input.
    )doc");

-    bc3_encoder.def_property_readonly("bc1_encoder", &BC3Encoder::GetBC1Encoder,
-                                      "Internal :py:class:`~quicktex.s3tc.bc1.BC1Encoder` used for RGB data. Readonly.");
-    bc3_encoder.def_property_readonly("bc4_encoder", &BC3Encoder::GetBC4Encoder,
-                                      "Internal :py:class:`~quicktex.s3tc.bc4.BC4Encoder` used for alpha data. Readonly.");
+    bc3_encoder.def_property_readonly(
+        "bc1_encoder", &BC3Encoder::GetBC1Encoder,
+        "Internal :py:class:`~quicktex.s3tc.bc1.BC1Encoder` used for RGB data. Readonly.");
+    bc3_encoder.def_property_readonly(
+        "bc4_encoder", &BC3Encoder::GetBC4Encoder,
+        "Internal :py:class:`~quicktex.s3tc.bc4.BC4Encoder` used for alpha data. Readonly.");
    // endregion

    // region BC3Decoder
@ -113,10 +114,12 @@ void InitBC3(py::module_ &s3tc) {
        :returns: A new RawTexture with the same dimensions as the input
    )doc");

-    bc3_decoder.def_property_readonly("bc1_decoder", &BC3Decoder::GetBC1Decoder,
-                                      "Internal :py:class:`~quicktex.s3tc.bc1.BC1Decoder` used for RGB data. Readonly.");
-    bc3_decoder.def_property_readonly("bc4_decoder", &BC3Decoder::GetBC4Decoder,
-                                      "Internal :py:class:`~quicktex.s3tc.bc4.BC4Decoder` used for alpha data. Readonly.");
+    bc3_decoder.def_property_readonly(
+        "bc1_decoder", &BC3Decoder::GetBC1Decoder,
+        "Internal :py:class:`~quicktex.s3tc.bc1.BC1Decoder` used for RGB data. Readonly.");
+    bc3_decoder.def_property_readonly(
+        "bc4_decoder", &BC3Decoder::GetBC4Decoder,
+        "Internal :py:class:`~quicktex.s3tc.bc4.BC4Decoder` used for alpha data. Readonly.");
    // endregion
 }
 }  // namespace quicktex::bindings
--- a/quicktex/s3tc/bc4/BC4Block.cpp
+++ b/quicktex/s3tc/bc4/BC4Block.cpp
@ -22,14 +22,17 @@
 #include <algorithm>
 #include <stdexcept>

-#include "../../util.h"
+#include "util/bitbash.h"
+#include "util/map.h"
+#include "util/math.h"
+#include "util/ranges.h"

 namespace quicktex::s3tc {

 BC4Block::SelectorArray BC4Block::GetSelectors() const {
-    auto packed = Pack<uint8_t, uint64_t, 8, SelectorSize>(_selectors);
-    auto rows = Unpack<uint64_t, uint16_t, SelectorBits * Width, Height>(packed);
-    return MapArray(rows, Unpack<uint16_t, uint8_t, SelectorBits, Width>);
+    auto packed = pack<uint64_t>(_selectors, 8);
+    auto rows = unpack<uint16_t, Height>(packed, SelectorBits * Width);
+    return map([](auto row) { return unpack<uint8_t, Width>(row, SelectorBits); }, rows);
 }

 void BC4Block::SetSelectors(const BC4Block::SelectorArray& unpacked) {
@ -37,9 +40,9 @@ void BC4Block::SetSelectors(const BC4Block::SelectorArray& unpacked) {
        if (std::any_of(unpacked[y].begin(), unpacked[y].end(), [](uint8_t i) { return i > SelectorMax; }))
            throw std::invalid_argument("Selector value out of bounds.");
    }
-    auto rows = MapArray(unpacked, Pack<uint8_t, uint16_t, SelectorBits, Width>);
-    auto packed = Pack<uint16_t, uint64_t, SelectorBits * Width, Height>(rows);
-    _selectors = Unpack<uint64_t, uint8_t, 8, SelectorSize>(packed);
+    auto rows = map([](auto r) { return pack<uint16_t>(r, SelectorBits); }, unpacked);
+    auto packed = pack<uint64_t>(rows, SelectorBits * Width);
+    _selectors = unpack<uint8_t, SelectorSize>(packed, 8);
 }

 std::array<uint8_t, 8> BC4Block::GetValues6() const {
@ -64,6 +67,8 @@ std::array<uint8_t, 8> BC4Block::GetValues8() const {
            static_cast<uint8_t>((alpha0 + alpha1 * 6) / 7)};
 }

-bool BC4Block::operator==(const BC4Block& Rhs) const { return alpha0 == Rhs.alpha0 && alpha1 == Rhs.alpha1 && _selectors == Rhs._selectors; }
+bool BC4Block::operator==(const BC4Block& Rhs) const {
+    return alpha0 == Rhs.alpha0 && alpha1 == Rhs.alpha1 && _selectors == Rhs._selectors;
+}
 bool BC4Block::operator!=(const BC4Block& Rhs) const { return !(Rhs == *this); }
 }  // namespace quicktex::s3tc
--- a/quicktex/s3tc/bc4/BC4Decoder.cpp
+++ b/quicktex/s3tc/bc4/BC4Decoder.cpp
@ -22,8 +22,8 @@
 #include <array>    // for array
 #include <cassert>  // for assert

-#include "../../Color.h"
 #include "../../ColorBlock.h"
+#include "../../OldColor.h"
 #include "BC4Block.h"

 namespace quicktex::s3tc {
--- a/quicktex/s3tc/bc4/BC4Decoder.h
+++ b/quicktex/s3tc/bc4/BC4Decoder.h
@ -22,10 +22,10 @@
 #include <cstdint>
 #include <stdexcept>

-#include "../../ColorBlock.h"
-#include "../../Decoder.h"
-#include "../../Texture.h"
-#include "BC4Block.h"
+#include "ColorBlock.h"
+#include "Decoder.h"
+#include "s3tc/bc4/BC4Block.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {

--- a/quicktex/s3tc/bc4/BC4Encoder.cpp
+++ b/quicktex/s3tc/bc4/BC4Encoder.cpp
@ -23,8 +23,8 @@
 #include <array>
 #include <cstdint>

-#include "../../Color.h"
 #include "../../ColorBlock.h"
+#include "../../OldColor.h"
 #include "BC4Block.h"

 namespace quicktex::s3tc {
--- a/quicktex/s3tc/bc4/BC4Encoder.h
+++ b/quicktex/s3tc/bc4/BC4Encoder.h
@ -22,10 +22,10 @@
 #include <cstdint>
 #include <stdexcept>

-#include "../../ColorBlock.h"
-#include "../../Encoder.h"
-#include "../../Texture.h"
-#include "BC4Block.h"
+#include "ColorBlock.h"
+#include "Encoder.h"
+#include "s3tc/bc4/BC4Block.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {

--- a/quicktex/s3tc/bc4/_bindings.cpp
+++ b/quicktex/s3tc/bc4/_bindings.cpp
@ -23,15 +23,11 @@
 #include <pybind11/stl.h>

 #include <array>
-#include <cstddef>
 #include <cstdint>
-#include <stdexcept>
-#include <string>

-#include "../../Decoder.h"
-#include "../../Encoder.h"
-#include "BC4Decoder.h"
-#include "BC4Encoder.h"
+#include "s3tc/bc4/BC4Block.h"
+#include "s3tc/bc4/BC4Decoder.h"
+#include "s3tc/bc4/BC4Encoder.h"

 namespace py = pybind11;
 namespace quicktex::bindings {
@ -46,7 +42,8 @@ void InitBC4(py::module_ &s3tc) {
    bc4_block.doc() = "A single BC4 block.";

    bc4_block.def(py::init<>());
-    bc4_block.def(py::init<uint8_t, uint8_t, BC4Block::SelectorArray>(), "endpoint0"_a, "endpoint1"_a, "selectors"_a, R"doc(
+    bc4_block.def(py::init<uint8_t, uint8_t, BC4Block::SelectorArray>(), "endpoint0"_a, "endpoint1"_a, "selectors"_a,
+                  R"doc(
        Create a new BC4Block with the specified endpoints and selectors.

        :param int endpoint0: The first endpoint.
@ -54,7 +51,8 @@ void InitBC4(py::module_ &s3tc) {
        :param selectors: the selectors as a 4x4 list of integers, between 0 and 7 inclusive.
    )doc");

-    bc4_block.def_property("endpoints", &BC4Block::GetAlphas, &BC4Block::SetAlphas, "The block's endpoint values as a 2-tuple.");
+    bc4_block.def_property("endpoints", &BC4Block::GetAlphas, &BC4Block::SetAlphas,
+                           "The block's endpoint values as a 2-tuple.");
    bc4_block.def_property("selectors", &BC4Block::GetSelectors, &BC4Block::SetSelectors, R"doc(
        The block's selectors as a 4x4 list of integers between 0 and 7 inclusive.

@ -96,8 +94,9 @@ void InitBC4(py::module_ &s3tc) {
        :param RawTexture texture: Input texture to encode.
        :returns: A new BC4Texture with the same dimension as the input.
    )doc");
-    
-    bc4_encoder.def_property_readonly("channel", &BC4Encoder::GetChannel, "The channel that will be read from. 0 to 3 inclusive. Readonly.");
+
+    bc4_encoder.def_property_readonly("channel", &BC4Encoder::GetChannel,
+                                      "The channel that will be read from. 0 to 3 inclusive. Readonly.");
    // endregion

    // region BC4Decoder
@ -117,8 +116,9 @@ void InitBC4(py::module_ &s3tc) {
        :param RawTexture texture: Input texture to encode.
        :returns: A new RawTexture with the same dimensions as the input
    )doc");
-    
-    bc4_decoder.def_property_readonly("channel", &BC4Decoder::GetChannel, "The channel that will be written to. 0 to 3 inclusive. Readonly.");
+
+    bc4_decoder.def_property_readonly("channel", &BC4Decoder::GetChannel,
+                                      "The channel that will be written to. 0 to 3 inclusive. Readonly.");
    // endregion
 }

--- a/quicktex/s3tc/bc5/BC5Block.h
+++ b/quicktex/s3tc/bc5/BC5Block.h
@ -19,9 +19,7 @@

 #pragma once

-#include <utility>
-
-#include "../bc4/BC4Block.h"
+#include "s3tc/bc4/BC4Block.h"

 namespace quicktex::s3tc {

@ -53,7 +51,9 @@ class alignas(8) BC5Block {
        chan1_block = pair.second;
    }

-    bool operator==(const BC5Block &Rhs) const { return chan0_block == Rhs.chan0_block && chan1_block == Rhs.chan1_block; }
+    bool operator==(const BC5Block &Rhs) const {
+        return chan0_block == Rhs.chan0_block && chan1_block == Rhs.chan1_block;
+    }
    bool operator!=(const BC5Block &Rhs) const { return !(Rhs == *this); }
 };
 }  // namespace quicktex::s3tc
--- a/quicktex/s3tc/bc5/BC5Decoder.cpp
+++ b/quicktex/s3tc/bc5/BC5Decoder.cpp
@ -19,8 +19,8 @@

 #include "BC5Decoder.h"

-#include "../../ColorBlock.h"
-#include "BC5Block.h"
+#include "ColorBlock.h"
+#include "s3tc/bc5/BC5Block.h"

 namespace quicktex::s3tc {
 ColorBlock<4, 4> BC5Decoder::DecodeBlock(const BC5Block &block) const {
--- a/quicktex/s3tc/bc5/BC5Decoder.h
+++ b/quicktex/s3tc/bc5/BC5Decoder.h
@ -24,11 +24,11 @@
 #include <tuple>
 #include <type_traits>

-#include "../../ColorBlock.h"
-#include "../../Decoder.h"
-#include "../../Texture.h"
-#include "../bc4/BC4Decoder.h"
-#include "BC5Block.h"
+#include "ColorBlock.h"
+#include "Decoder.h"
+#include "s3tc/bc4/BC4Decoder.h"
+#include "s3tc/bc5/BC5Block.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {

@ -38,8 +38,10 @@ class BC5Decoder : public BlockDecoder<BlockTexture<BC5Block>> {
    using BC4DecoderPtr = std::shared_ptr<BC4Decoder>;
    using BC4DecoderPair = std::tuple<BC4DecoderPtr, BC4DecoderPtr>;

-    BC5Decoder(uint8_t chan0 = 0, uint8_t chan1 = 1) : BC5Decoder(std::make_shared<BC4Decoder>(chan0), std::make_shared<BC4Decoder>(chan1)) {}
-    BC5Decoder(BC4DecoderPtr chan0_decoder, BC4DecoderPtr chan1_decoder) : _chan0_decoder(chan0_decoder), _chan1_decoder(chan1_decoder) {}
+    BC5Decoder(uint8_t chan0 = 0, uint8_t chan1 = 1)
+        : BC5Decoder(std::make_shared<BC4Decoder>(chan0), std::make_shared<BC4Decoder>(chan1)) {}
+    BC5Decoder(BC4DecoderPtr chan0_decoder, BC4DecoderPtr chan1_decoder)
+        : _chan0_decoder(chan0_decoder), _chan1_decoder(chan1_decoder) {}

    ColorBlock<4, 4> DecodeBlock(const BC5Block &block) const override;

--- a/quicktex/s3tc/bc5/BC5Encoder.cpp
+++ b/quicktex/s3tc/bc5/BC5Encoder.cpp
@ -19,8 +19,8 @@

 #include "BC5Encoder.h"

-#include "../../ColorBlock.h"
-#include "../bc4/BC4Block.h"
+#include "ColorBlock.h"
+#include "s3tc/bc4/BC4Block.h"

 namespace quicktex::s3tc {
 BC5Block BC5Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
--- a/quicktex/s3tc/bc5/BC5Encoder.h
+++ b/quicktex/s3tc/bc5/BC5Encoder.h
@ -24,11 +24,11 @@
 #include <tuple>
 #include <type_traits>

-#include "../../ColorBlock.h"
-#include "../../Encoder.h"
-#include "../../Texture.h"
-#include "../bc4/BC4Encoder.h"
-#include "BC5Block.h"
+#include "ColorBlock.h"
+#include "Encoder.h"
+#include "s3tc/bc4/BC4Encoder.h"
+#include "s3tc/bc5/BC5Block.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {
 class BC5Encoder : public BlockEncoder<BlockTexture<BC5Block>> {
@ -37,8 +37,10 @@ class BC5Encoder : public BlockEncoder<BlockTexture<BC5Block>> {
    using BC4EncoderPtr = std::shared_ptr<BC4Encoder>;
    using BC4EncoderPair = std::tuple<BC4EncoderPtr, BC4EncoderPtr>;

-    BC5Encoder(uint8_t chan0 = 0, uint8_t chan1 = 1) : BC5Encoder(std::make_shared<BC4Encoder>(chan0), std::make_shared<BC4Encoder>(chan1)) {}
-    BC5Encoder(BC4EncoderPtr chan0_encoder, BC4EncoderPtr chan1_encoder) : _chan0_encoder(chan0_encoder), _chan1_encoder(chan1_encoder) {}
+    BC5Encoder(uint8_t chan0 = 0, uint8_t chan1 = 1)
+        : BC5Encoder(std::make_shared<BC4Encoder>(chan0), std::make_shared<BC4Encoder>(chan1)) {}
+    BC5Encoder(BC4EncoderPtr chan0_encoder, BC4EncoderPtr chan1_encoder)
+        : _chan0_encoder(chan0_encoder), _chan1_encoder(chan1_encoder) {}

    BC5Block EncodeBlock(const ColorBlock<4, 4> &pixels) const override;

--- a/quicktex/s3tc/bc5/_bindings.cpp
+++ b/quicktex/s3tc/bc5/_bindings.cpp
@ -24,10 +24,10 @@
 #include <array>
 #include <cstdint>

-#include "../../Decoder.h"
-#include "../../Encoder.h"
-#include "BC5Decoder.h"
-#include "BC5Encoder.h"
+#include "s3tc/bc4/BC4Block.h"
+#include "s3tc/bc5/BC5Block.h"
+#include "s3tc/bc5/BC5Decoder.h"
+#include "s3tc/bc5/BC5Encoder.h"

 namespace py = pybind11;
 namespace quicktex::bindings {
@ -52,7 +52,8 @@ void InitBC5(py::module_ &s3tc) {

    bc5_block.def_readwrite("chan0_block", &BC5Block::chan0_block, "The BC4 block used for the first channel.");
    bc5_block.def_readwrite("chan1_block", &BC5Block::chan1_block, "The BC4 block used for the second channel.");
-    bc5_block.def_property("blocks", &BC5Block::GetBlocks, &BC5Block::SetBlocks, "The BC4 and BC1 blocks that make up this block as a 2-tuple.");
+    bc5_block.def_property("blocks", &BC5Block::GetBlocks, &BC5Block::SetBlocks,
+                           "The two BC4 blocks that make up this block as a 2-tuple.");
    // endregion

    // region BC5Texture
@ -79,9 +80,11 @@ void InitBC5(py::module_ &s3tc) {
        :returns: A new BC5Texture with the same dimension as the input.
    )doc");

-    bc5_encoder.def_property_readonly("channels", &BC5Encoder::GetChannels, "A 2-tuple of channels that will be read from. 0 to 3 inclusive. Readonly.");
-    bc5_encoder.def_property_readonly("bc4_encoders", &BC5Encoder::GetBC4Encoders,
-                                      "2-tuple of internal :py:class:`~quicktex.s3tc.bc4.BC4Encoder` s used for each channel. Readonly.");
+    bc5_encoder.def_property_readonly("channels", &BC5Encoder::GetChannels,
+                                      "A 2-tuple of channels that will be read from. 0 to 3 inclusive. Readonly.");
+    bc5_encoder.def_property_readonly(
+        "bc4_encoders", &BC5Encoder::GetBC4Encoders,
+        "2-tuple of internal :py:class:`~quicktex.s3tc.bc4.BC4Encoder` s used for each channel. Readonly.");
    // endregion

    // region BC5Decoder
@ -103,9 +106,11 @@ void InitBC5(py::module_ &s3tc) {
        :returns: A new RawTexture with the same dimensions as the input
    )doc");

-    bc5_decoder.def_property_readonly("channels", &BC5Decoder::GetChannels, "A 2-tuple of channels that will be written to. 0 to 3 inclusive. Readonly.");
-    bc5_decoder.def_property_readonly("bc4_decoders", &BC5Decoder::GetBC4Decoders,
-                                      "2-tuple of internal :py:class:`~quicktex.s3tc.bc4.BC4Decoder` s used for each channel. Readonly.");
+    bc5_decoder.def_property_readonly("channels", &BC5Decoder::GetChannels,
+                                      "A 2-tuple of channels that will be written to. 0 to 3 inclusive. Readonly.");
+    bc5_decoder.def_property_readonly(
+        "bc4_decoders", &BC5Decoder::GetBC4Decoders,
+        "2-tuple of internal :py:class:`~quicktex.s3tc.bc4.BC4Decoder` s used for each channel. Readonly.");
    // endregion
 }
 }  // namespace quicktex::bindings
--- a/quicktex/s3tc/interpolator/Interpolator.cpp
+++ b/quicktex/s3tc/interpolator/Interpolator.cpp
@ -24,8 +24,8 @@
 #include <cstdint>
 #include <stdexcept>

-#include "../../util.h"
-#include "../../Color.h"
+#include "OldColor.h"
+#include "util/bitbash.h"

 namespace quicktex::s3tc {

@ -45,25 +45,33 @@ std::unique_ptr<Interpolator> Interpolator::MakeInterpolator(Interpolator::Type
    }
 }

-uint8_t Interpolator::Interpolate5(uint8_t v0, uint8_t v1) const { return Interpolate8(scale5To8(v0), scale5To8(v1)); }
-uint8_t Interpolator::Interpolate6(uint8_t v0, uint8_t v1) const { return Interpolate8(scale6To8(v0), scale6To8(v1)); }
-uint8_t Interpolator::InterpolateHalf5(uint8_t v0, uint8_t v1) const { return InterpolateHalf8(scale5To8(v0), scale5To8(v1)); }
-uint8_t Interpolator::InterpolateHalf6(uint8_t v0, uint8_t v1) const { return InterpolateHalf8(scale6To8(v0), scale6To8(v1)); }
-
-std::array<Color, 4> Interpolator::Interpolate565BC1(uint16_t low, uint16_t high, bool allow_3color) const {
-    bool use_3color = allow_3color && (high >= low);
-    return InterpolateBC1(Color::Unpack565Unscaled(low), Color::Unpack565Unscaled(high), use_3color);
+uint8_t Interpolator::Interpolate5(uint8_t v0, uint8_t v1) const {
+    return Interpolate8(scale_to_8<5>(v0), scale_to_8<5>(v1));
+}
+uint8_t Interpolator::Interpolate6(uint8_t v0, uint8_t v1) const {
+    return Interpolate8(scale_to_8<6>(v0), scale_to_8<6>(v1));
+}
+uint8_t Interpolator::InterpolateHalf5(uint8_t v0, uint8_t v1) const {
+    return InterpolateHalf8(scale_to_8<5>(v0), scale_to_8<5>(v1));
+}
+uint8_t Interpolator::InterpolateHalf6(uint8_t v0, uint8_t v1) const {
+    return InterpolateHalf8(scale_to_8<6>(v0), scale_to_8<6>(v1));
 }

-std::array<Color, 4> Interpolator::InterpolateBC1(Color low, Color high, bool use_3color) const {
-    auto colors = std::array<Color, 4>();
+std::array<OldColor, 4> Interpolator::Interpolate565BC1(uint16_t low, uint16_t high, bool allow_3color) const {
+    bool use_3color = allow_3color && (high >= low);
+    return InterpolateBC1(OldColor::Unpack565Unscaled(low), OldColor::Unpack565Unscaled(high), use_3color);
+}
+
+std::array<OldColor, 4> Interpolator::InterpolateBC1(OldColor low, OldColor high, bool use_3color) const {
+    auto colors = std::array<OldColor, 4>();
    colors[0] = low.ScaleFrom565();
    colors[1] = high.ScaleFrom565();

    if (use_3color) {
        // 3-color mode
        colors[2] = InterpolateHalfColor24(colors[0], colors[1]);
-        colors[3] = Color(0, 0, 0, 0);  // transparent black
+        colors[3] = OldColor(0, 0, 0, 0);  // transparent black
    } else {
        // 4-color mode
        colors[2] = InterpolateColor24(colors[0], colors[1]);
@ -79,8 +87,12 @@ uint8_t Interpolator::InterpolateHalf8(uint8_t v0, uint8_t v1) const { return (v
 // endregion

 // region InterpolatorRound implementation
-uint8_t InterpolatorRound::Interpolate5(uint8_t v0, uint8_t v1) const { return Interpolate8(scale5To8(v0), scale5To8(v1)); }
-uint8_t InterpolatorRound::Interpolate6(uint8_t v0, uint8_t v1) const { return Interpolate8(scale6To8(v0), scale6To8(v1)); }
+uint8_t InterpolatorRound::Interpolate5(uint8_t v0, uint8_t v1) const {
+    return Interpolate8(scale_to_8<5>(v0), scale_to_8<5>(v1));
+}
+uint8_t InterpolatorRound::Interpolate6(uint8_t v0, uint8_t v1) const {
+    return Interpolate8(scale_to_8<6>(v0), scale_to_8<6>(v1));
+}

 uint8_t InterpolatorRound::Interpolate8(uint8_t v0, uint8_t v1) const { return (v0 * 2 + v1 + 1) / 3; }
 // endregion
@ -108,9 +120,9 @@ uint8_t InterpolatorNvidia::InterpolateHalf6(uint8_t v0, uint8_t v1) const {
    return static_cast<uint8_t>((256 * v0 + gdiff / 4 + 128 + gdiff * 128) >> 8);
 }

-std::array<Color, 4> InterpolatorNvidia::InterpolateBC1(Color low, Color high, bool use_3color) const {
+std::array<OldColor, 4> InterpolatorNvidia::InterpolateBC1(OldColor low, OldColor high, bool use_3color) const {
    // Nvidia is special and interpolation cant be done with 8-bit values, so we need to override the default behavior
-    std::array<Color, 4> colors;
+    std::array<OldColor, 4> colors;
    colors[0] = low.ScaleFrom565();
    colors[1] = high.ScaleFrom565();

@ -121,7 +133,7 @@ std::array<Color, 4> InterpolatorNvidia::InterpolateBC1(Color low, Color high, b
    } else {
        // 3-color mode
        colors[2] = InterpolateHalfColor565(low, high);
-        colors[3] = Color(0, 0, 0, 0);  // transparent black
+        colors[3] = OldColor(0, 0, 0, 0);  // transparent black
    }

    return colors;
@ -129,10 +141,18 @@ std::array<Color, 4> InterpolatorNvidia::InterpolateBC1(Color low, Color high, b
 // endregion

 // region InterpolatorAMD implementation
-uint8_t InterpolatorAMD::Interpolate5(uint8_t v0, uint8_t v1) const { return Interpolate8(scale5To8(v0), scale5To8(v1)); }
-uint8_t InterpolatorAMD::Interpolate6(uint8_t v0, uint8_t v1) const { return Interpolate8(scale6To8(v0), scale6To8(v1)); }
-uint8_t InterpolatorAMD::InterpolateHalf5(uint8_t v0, uint8_t v1) const { return InterpolateHalf8(scale5To8(v0), scale5To8(v1)); }
-uint8_t InterpolatorAMD::InterpolateHalf6(uint8_t v0, uint8_t v1) const { return InterpolateHalf8(scale6To8(v0), scale6To8(v1)); }
+uint8_t InterpolatorAMD::Interpolate5(uint8_t v0, uint8_t v1) const {
+    return Interpolate8(scale_to_8<5>(v0), scale_to_8<5>(v1));
+}
+uint8_t InterpolatorAMD::Interpolate6(uint8_t v0, uint8_t v1) const {
+    return Interpolate8(scale_to_8<6>(v0), scale_to_8<6>(v1));
+}
+uint8_t InterpolatorAMD::InterpolateHalf5(uint8_t v0, uint8_t v1) const {
+    return InterpolateHalf8(scale_to_8<5>(v0), scale_to_8<5>(v1));
+}
+uint8_t InterpolatorAMD::InterpolateHalf6(uint8_t v0, uint8_t v1) const {
+    return InterpolateHalf8(scale_to_8<6>(v0), scale_to_8<6>(v1));
+}

 uint8_t InterpolatorAMD::Interpolate8(uint8_t v0, uint8_t v1) const { return (v0 * 43 + v1 * 21 + 32) >> 6; }

--- a/quicktex/s3tc/interpolator/Interpolator.h
+++ b/quicktex/s3tc/interpolator/Interpolator.h
@ -22,7 +22,7 @@
 #include <cstdint>  // for uint8_t, uint16_t
 #include <memory>   // for unique_ptr

-#include "../../Color.h"  // for Color
+#include "OldColor.h"  // for OldColor

 namespace quicktex::s3tc {

@ -97,7 +97,7 @@ class Interpolator {
     * @param allow_3color if true, a different interpolation mode will be used if high >= low
     * @return an array of 4 Color values, with indices matching BC1 selectors
     */
-    std::array<Color, 4> Interpolate565BC1(uint16_t low, uint16_t high, bool allow_3color = true) const;
+    std::array<OldColor, 4> Interpolate565BC1(uint16_t low, uint16_t high, bool allow_3color = true) const;

    /**
     * Generates the 4 colors for a BC1 block from the given
@ -106,7 +106,7 @@ class Interpolator {
     * @param use_3color if the 3-color interpolation mode should be used
     * @return an array of 4 Color values, with indices matching BC1 selectors
     */
-    virtual std::array<Color, 4> InterpolateBC1(Color low, Color high, bool use_3color) const;
+    virtual std::array<OldColor, 4> InterpolateBC1(OldColor low, OldColor high, bool use_3color) const;

    /**
     * Gets the type of an interpolator
@ -126,12 +126,12 @@ class Interpolator {
    }

   private:
-    Color InterpolateColor24(const Color &c0, const Color &c1) const {
-        return Color(Interpolate8(c0.r, c1.r), Interpolate8(c0.g, c1.g), Interpolate8(c0.b, c1.b));
+    OldColor InterpolateColor24(const OldColor &c0, const OldColor &c1) const {
+        return OldColor(Interpolate8(c0.r, c1.r), Interpolate8(c0.g, c1.g), Interpolate8(c0.b, c1.b));
    }

-    Color InterpolateHalfColor24(const Color &c0, const Color &c1) const {
-        return Color(InterpolateHalf8(c0.r, c1.r), InterpolateHalf8(c0.g, c1.g), InterpolateHalf8(c0.b, c1.b));
+    OldColor InterpolateHalfColor24(const OldColor &c0, const OldColor &c1) const {
+        return OldColor(InterpolateHalf8(c0.r, c1.r), InterpolateHalf8(c0.g, c1.g), InterpolateHalf8(c0.b, c1.b));
    }
 };

@ -152,18 +152,18 @@ class InterpolatorNvidia final : public Interpolator {
    virtual uint8_t InterpolateHalf5(uint8_t v0, uint8_t v1) const override;
    virtual uint8_t InterpolateHalf6(uint8_t v0, uint8_t v1) const override;

-    virtual std::array<Color, 4> InterpolateBC1(Color low, Color high, bool use_3color) const override;
+    virtual std::array<OldColor, 4> InterpolateBC1(OldColor low, OldColor high, bool use_3color) const override;

    virtual Type GetType() const noexcept override { return Type::Nvidia; }
    virtual bool CanInterpolate8Bit() const noexcept override { return false; }

   private:
-    Color InterpolateColor565(const Color &c0, const Color &c1) const {
-        return Color(Interpolate5(c0.r, c1.r), Interpolate6(c0.g, c1.g), Interpolate5(c0.b, c1.b));
+    OldColor InterpolateColor565(const OldColor &c0, const OldColor &c1) const {
+        return OldColor(Interpolate5(c0.r, c1.r), Interpolate6(c0.g, c1.g), Interpolate5(c0.b, c1.b));
    }

-    Color InterpolateHalfColor565(const Color &c0, const Color &c1) const {
-        return Color(InterpolateHalf5(c0.r, c1.r), InterpolateHalf6(c0.g, c1.g), InterpolateHalf5(c0.b, c1.b));
+    OldColor InterpolateHalfColor565(const OldColor &c0, const OldColor &c1) const {
+        return OldColor(InterpolateHalf5(c0.r, c1.r), InterpolateHalf6(c0.g, c1.g), InterpolateHalf5(c0.b, c1.b));
    }
 };

--- a/quicktex/test.cpp
+++ b/quicktex/test.cpp
@ -0,0 +1,31 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <array>
+#include <cstdint>
+#include <xsimd/xsimd.hpp>
+
+#include "Matrix.h"
+
+// Scratch function: returns `a * 7`; `b` and `c` are accepted but currently unused.
+namespace quicktex {
+auto test(Matrix<float, 4, 1> a, Matrix<float, 4, 1> b, Matrix<float, 4, 1> c) {
+    return a * 7;
+}
+}  // namespace quicktex
--- a/quicktex/texture/BlockTexture.h
+++ b/quicktex/texture/BlockTexture.h
@ -0,0 +1,70 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <vector>
+
+#include "Texture.h"
+
+namespace quicktex {
+/**
+ * A texture stored as a grid of blocks of type B, held in one contiguous vector.
+ * @tparam B block type; must provide B::Width and B::Height and be default-constructible
+ */
+template <typename B> class BlockTexture final : public Texture {
+   private:
+    std::vector<B> _blocks;  // blocks in row-major order: index = x + y * _width_b
+    unsigned _width_b;       // texture width measured in blocks
+    unsigned _height_b;      // texture height measured in blocks
+
+   public:
+    using BlockType = B;
+    using Base = Texture;
+
+    /**
+     * Create a new BlockTexture filled with default-constructed blocks.
+     * The block grid is sized by rounding each dimension up to a whole number of blocks.
+     * @param w width of the texture in pixels, forwarded to the Texture base
+     * @param h height of the texture in pixels, forwarded to the Texture base
+     */
+    BlockTexture(int w, int h) : Base(w, h) {
+        // NOTE(review): width/height are not declared in this class; presumably Texture members set by Base(w, h)
+        _width_b = (width + B::Width - 1) / B::Width;      // ceiling division
+        _height_b = (height + B::Height - 1) / B::Height;  // ceiling division
+        _blocks = std::vector<B>(_width_b * _height_b);
+    }
+
+    /** @return the texture width in blocks */
+    constexpr unsigned bwidth() const { return _width_b; }
+    /** @return the texture height in blocks */
+    constexpr unsigned bheight() const { return _height_b; }
+    /** @return the block dimensions as (width, height); the unsigned counts are converted to int */
+    constexpr std::tuple<int, int> bsize() const { return std::tuple<int, int>(_width_b, _height_b); }
+
+    /**
+     * Get a copy of the block at the given block coordinates
+     * @param x block column, must be less than bwidth()
+     * @param y block row, must be less than bheight()
+     * @return a copy of the stored block
+     * @throws std::out_of_range if x or y is outside the block grid
+     */
+    B get_block(unsigned x, unsigned y) const {
+        if (x >= _width_b) throw std::out_of_range("x value out of range.");
+        if (y >= _height_b) throw std::out_of_range("y value out of range.");
+        return _blocks.at(x + (y * _width_b));
+    }
+
+    /**
+     * Overwrite the block at the given block coordinates
+     * @param x block column, must be less than bwidth()
+     * @param y block row, must be less than bheight()
+     * @param val block value to store (copied)
+     * @throws std::out_of_range if x or y is outside the block grid
+     */
+    void set_block(unsigned x, unsigned y, const B &val) {
+        if (x >= _width_b) throw std::out_of_range("x value out of range.");
+        if (y >= _height_b) throw std::out_of_range("y value out of range.");
+        _blocks.at(x + (y * _width_b)) = val;
+    }
+
+    /** @return the size of the block storage in bytes (block count times sizeof(B)) */
+    size_t nbytes() const noexcept override { return _blocks.size() * sizeof(B); }
+
+    /** @return a byte pointer to the start of the block storage */
+    const uint8_t *data() const noexcept override { return reinterpret_cast<const uint8_t *>(_blocks.data()); }
+    uint8_t *data() noexcept override { return reinterpret_cast<uint8_t *>(_blocks.data()); }
+};
+}  // namespace quicktex
--- a/quicktex/texture/RawTexture.cpp
+++ b/quicktex/texture/RawTexture.cpp
@ -0,0 +1,33 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "RawTexture.h"
+
+namespace quicktex {
+// Read-only pixel access: returns a copy of the pixel at (x, y).
+// Throws std::invalid_argument when either coordinate lies outside the texture.
+Color RawTexture::pixel(unsigned x, unsigned y) const {
+    if (!(x < width)) { throw std::invalid_argument("x value out of range."); }
+    if (!(y < height)) { throw std::invalid_argument("y value out of range."); }
+
+    const auto index = x + (y * width);  // row-major position
+    return _pixels.at(index);
+}
+// Mutable pixel access: returns a reference to the pixel at (x, y).
+// Throws std::invalid_argument when either coordinate lies outside the texture.
+quicktex::Color& RawTexture::pixel(unsigned x, unsigned y) {
+    if (!(x < width)) { throw std::invalid_argument("x value out of range."); }
+    if (!(y < height)) { throw std::invalid_argument("y value out of range."); }
+
+    const auto index = x + (y * width);  // row-major position
+    return _pixels.at(index);
+}
+}  // namespace quicktex
--- a/quicktex/texture/RawTexture.h
+++ b/quicktex/texture/RawTexture.h
@ -0,0 +1,97 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <climits>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#include "Color.h"
+#include "ColorBlock.h"
+#include "OldColor.h"
+#include "texture/Texture.h"
+
+namespace quicktex {
+/**
+ * Uncompressed texture stored as a row-major array of Color pixels
+ */
+class RawTexture : public Texture {
+    using Base = Texture;
+
+   public:
+    /**
+     * Create a new RawTexture
+     * @param w width of the texture in pixels
+     * @param h height of the texture in pixels
+     */
+    RawTexture(int w, int h) : Base(w, h), _pixels(static_cast<size_t>(w) * static_cast<size_t>(h)) {}
+
+    /**
+     * Read a pixel by value
+     * @throws std::invalid_argument if x or y lies outside the texture
+     */
+    quicktex::Color pixel(unsigned x, unsigned y) const;
+
+    /**
+     * Access a pixel by reference
+     * @throws std::invalid_argument if x or y lies outside the texture
+     */
+    quicktex::Color &pixel(unsigned x, unsigned y);
+
+    /// Read a pixel with both coordinates wrapped modulo the texture dimensions
+    quicktex::Color pixel_wrapped(unsigned x, unsigned y) const { return pixel(x % width, y % height); }
+
+    /// Access a pixel by reference with both coordinates wrapped modulo the texture dimensions
+    quicktex::Color &pixel_wrapped(unsigned x, unsigned y) { return pixel(x % width, y % height); }
+
+    /// Size of the pixel data in bytes. The product is widened first so it cannot wrap in 32-bit arithmetic
+    size_t nbytes() const noexcept override { return static_cast<size_t>(width) * height * sizeof(quicktex::Color); }
+
+    /**
+     * Copy an N x M block of pixels out of the texture
+     * @tparam N block width in pixels
+     * @tparam M block height in pixels
+     * @param block_x horizontal position of the block, in blocks
+     * @param block_y vertical position of the block, in blocks
+     * @return the pixels of the block. Pixels past the right or bottom edge wrap around to the opposite edge
+     * @throws std::out_of_range if block_x or block_y is negative
+     */
+    template <int N, int M> quicktex::ColorBlock<N, M> get_block(int block_x, int block_y) const {
+        if (block_x < 0) throw std::out_of_range("x value out of range.");
+        if (block_y < 0) throw std::out_of_range("y value out of range.");
+
+        // coordinates in the image of the top-left pixel of the selected block
+        quicktex::ColorBlock<N, M> block;
+        int pixel_x = block_x * N;
+        int pixel_y = block_y * M;
+
+        // pixel-wise copy; the modulo wraps coordinates that run over the texture edges
+        for (int x = 0; x < N; x++) {
+            for (int y = 0; y < M; y++) { block.Set(x, y, pixel((pixel_x + x) % width, (pixel_y + y) % height)); }
+        }
+
+        return block;
+    }
+
+    /**
+     * Copy an N x M block of pixels into the texture
+     * @tparam N block width in pixels
+     * @tparam M block height in pixels
+     * @param block_x horizontal position of the block, in blocks
+     * @param block_y vertical position of the block, in blocks
+     * @param block pixels to write. Pixels past the right or bottom edge wrap around to the opposite edge
+     * @throws std::out_of_range if block_x or block_y is negative
+     */
+    template <int N, int M> void set_block(int block_x, int block_y, const quicktex::ColorBlock<N, M> &block) {
+        if (block_x < 0) throw std::out_of_range("x value out of range.");
+        if (block_y < 0) throw std::out_of_range("y value out of range.");
+
+        // coordinates in the image of the top-left pixel of the selected block
+        int pixel_x = block_x * N;
+        int pixel_y = block_y * M;
+
+        // pixel-wise copy; the modulo wraps coordinates that run over the texture edges
+        for (int x = 0; x < N; x++) {
+            for (int y = 0; y < M; y++) { pixel((pixel_x + x) % width, (pixel_y + y) % height) = block.Get(x, y); }
+        }
+    }
+
+    /// Raw bytes of the pixel storage
+    const uint8_t *data() const noexcept override { return reinterpret_cast<const uint8_t *>(_pixels.data()); }
+    uint8_t *data() noexcept override { return reinterpret_cast<uint8_t *>(_pixels.data()); }
+
+   protected:
+    std::vector<quicktex::Color> _pixels;  // pixels in row-major order
+};
+}  // namespace quicktex
--- a/quicktex/texture/Texture.h
+++ b/quicktex/texture/Texture.h
@ -0,0 +1,62 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <climits>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#include "Color.h"
+#include "ColorBlock.h"
+#include "OldColor.h"
+#include "Window.h"
+
+namespace quicktex {
+
+/**
+ * Abstract base class for textures. Stores the immutable dimensions and
+ * leaves the storage of the actual data to the derived classes.
+ */
+class Texture {
+   public:
+    const unsigned width;   // fixed at construction
+    const unsigned height;  // fixed at construction
+
+    virtual ~Texture() = default;
+
+    /**
+     * The texture's dimensions
+     * @return a (width, height) tuple
+     */
+    virtual std::tuple<unsigned, unsigned> Size() const { return std::make_tuple(width, height); }
+
+    /**
+     * The texture's total size
+     * @return The size of the texture in bytes.
+     */
+    virtual size_t nbytes() const noexcept = 0;
+
+    /**
+     * The texture's raw contents
+     * @return pointer to the first byte of the texture's data
+     */
+    virtual const uint8_t *data() const noexcept = 0;
+    virtual uint8_t *data() noexcept = 0;
+
+   protected:
+    /// Only derived classes may construct a Texture
+    Texture(unsigned w, unsigned h) : width{w}, height{h} {}
+};
+
+}  // namespace quicktex
--- a/quicktex/texture/Window.cpp
+++ b/quicktex/texture/Window.cpp
@ -0,0 +1,90 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "Window.h"
+
+#include "texture/RawTexture.h"
+
+namespace quicktex {
+
+// Window
+// Creates a w x h window whose origin sits at (px, py) in the given texture.
+// The origin is asserted to lie inside the texture.
+Window::Window(RawTexture& texture, unsigned w, unsigned h, unsigned px, unsigned py)
+    : width{w}, height{h}, x{px}, y{py}, _texture{texture} {
+    assert(x < texture.width);
+    assert(y < texture.height);
+}
+
+// Mutable access to the pixel at window-relative coordinates (px, py)
+Color& Window::pixel(unsigned px, unsigned py) {
+    assert(px < width && py < height);
+
+    // translate from window space to texture space
+    const unsigned texture_x = x + px;
+    const unsigned texture_y = y + py;
+    return _texture.pixel(texture_x, texture_y);
+}
+
+// Copy of the pixel at window-relative coordinates (px, py)
+Color Window::pixel(unsigned px, unsigned py) const {
+    assert(px < width && py < height);
+
+    // translate from window space to texture space
+    const unsigned texture_x = x + px;
+    const unsigned texture_y = y + py;
+    return _texture.pixel(texture_x, texture_y);
+}
+
+// Iterator factories. Positions are (column, row) within the window;
+// the position (0, height) is the past-the-end sentinel.
+WindowIterator Window::begin() { return {*this, 0, 0}; }
+WindowIterator Window::end() { return {*this, 0, height}; }
+WindowIterator Window::row_begin(unsigned int row) { return {*this, 0, row}; }
+WindowIterator Window::row_end(unsigned int row) { return {*this, 0, row + 1}; }
+
+// Two windows are equal when they cover the same region of the very same texture object
+bool Window::operator==(const Window& rhs) const {
+    if (width != rhs.width || height != rhs.height) { return false; }
+    if (x != rhs.x || y != rhs.y) { return false; }
+    return &_texture == &rhs._texture;
+}
+
+// WindowIterator
+
+// Creates an iterator pointing at (px, py) inside the given window.
+// The position (0, view.height) is accepted as the end-of-iteration sentinel; it cannot be dereferenced.
+WindowIterator::WindowIterator(Window& view, unsigned px, unsigned py) : x{px}, y{py}, _view{&view} {
+    assert(x < view.width);
+    assert(y < view.height || (y == view.height && x == 0));
+}
+
+// Prefix increment: step one pixel to the right, wrapping to the start of the next row
+WindowIterator& quicktex::WindowIterator::operator++() {
+    if (++x < _view->width) { return *this; }
+
+    // ran off the end of the row
+    x = 0;
+    ++y;
+    return *this;
+}
+
+// Postfix increment: advance, but hand back the position held before the step
+WindowIterator WindowIterator::operator++(int) {
+    auto previous = *this;
+    operator++();
+    return previous;
+}
+
+// Dereference: the pixel of the window at the current position.
+// Asserts that the iterator is attached to a window and is not the end sentinel.
+Color& WindowIterator::operator*() const {
+    assert(_view != nullptr);
+    assert(x < _view->width && y < _view->height);
+
+    Window& window = *_view;
+    return window.pixel(x, y);
+}
+
+// Member access: address of the pixel that operator* refers to
+Color* WindowIterator::operator->() {
+    Color& target = operator*();
+    return &target;
+}
+
+// Iterators are equal when they belong to the same window and sit at the same position
+bool WindowIterator::operator==(const WindowIterator& rhs) const {
+    const bool same_window = (_view == rhs._view);
+    const bool same_position = (x == rhs.x) && (y == rhs.y);
+    return same_position && same_window;
+}
+
+// compile-time check that WindowIterator satisfies the std::forward_iterator concept
+static_assert(std::forward_iterator<WindowIterator>);
+// static_assert(sized_range<Window>);
+
+}  // namespace quicktex
--- a/quicktex/texture/Window.h
+++ b/quicktex/texture/Window.h
@ -0,0 +1,82 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include "Color.h"
+#include "util/ranges.h"
+
+namespace quicktex {
+
+// forward declarations
+class WindowIterator;
+class RawTexture;
+
+/**
+ * Class representing a window into a RawTexture
+ */
+class Window {
+   public:
+    typedef Color value_type;
+
+    const unsigned width, height;  // extent of the window
+    const unsigned x, y;           // position of the window's origin within the texture
+
+    /**
+     * Create a window into a texture
+     * @param texture texture to look into. Held by reference
+     * @param w width of the window
+     * @param h height of the window
+     * @param x horizontal position of the window's origin within the texture
+     * @param y vertical position of the window's origin within the texture
+     */
+    Window(RawTexture &texture, unsigned w, unsigned h, unsigned x, unsigned y);
+
+    /// Access the pixel at window-relative coordinates (px, py)
+    Color &pixel(unsigned px, unsigned py);
+    Color pixel(unsigned px, unsigned py) const;
+
+    /// Iterator to the first pixel of the window
+    WindowIterator begin();
+    /// Past-the-end iterator for the window
+    WindowIterator end();
+    /// Iterator to the first pixel of the given row
+    WindowIterator row_begin(unsigned row);
+    /// Iterator to the first pixel of the row after the given one
+    WindowIterator row_end(unsigned row);
+
+    /// Number of pixels in the window. Widened before multiplying so the product cannot wrap in 32-bit arithmetic
+    size_t size() const { return static_cast<size_t>(width) * height; }
+
+    /// True when both windows cover the same region of the same texture object
+    bool operator==(const Window &rhs) const;
+
+   private:
+    RawTexture &_texture;
+};
+
+/**
+ * Forward iterator over the pixels of a Window, as returned by Window::begin() and friends.
+ * Advancing moves along a row and wraps to the start of the next row.
+ */
+class WindowIterator {
+   public:
+    using difference_type = long long;
+    using value_type = Color;
+
+    unsigned x, y;  // current position within the window
+
+    /// Create an iterator at position (x, y) of the given window
+    WindowIterator(Window &view, unsigned x, unsigned y);
+    /// Create an iterator that is not attached to any window
+    WindowIterator() : x{0}, y{0}, _view{nullptr} {}
+
+    Color &operator*() const;  // pixel at the current position
+    Color *operator->();       // pointer to the pixel at the current position
+
+    WindowIterator &operator++();    // advance, return the new position
+    WindowIterator operator++(int);  // advance, return the old position
+    bool operator==(const WindowIterator &rhs) const;
+
+   private:
+    Window *_view;  // window being iterated, nullptr when default-constructed
+};
+
+}  // namespace quicktex
--- a/quicktex/util.h
+++ b/quicktex/util.h
@ -1,178 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-#include <cassert>
-#include <cstdint>
-#include <limits>
-#include <string>
-#include <type_traits>
-#include <functional>
-#include <vector>
-
-#define UINT5_MAX 0x1FU  // 31
-#define UINT6_MAX 0x3FU  // 63
-
-#define assert5bit(x) assert(x <= UINT5_MAX)
-#define assert6bit(x) assert(x <= UINT6_MAX)
-
-template <typename S> constexpr auto iabs(S i) {
-    static_assert(!std::is_unsigned<S>::value);
-    using O = typename std::make_unsigned<S>::type;
-    return (i < 0) ? static_cast<O>(-i) : static_cast<O>(i);
-}
-
-/**
- * Unpacks an unsigned integer into an array of smaller integers.
- * @tparam I Input data type. Must be an unsigned integral type large enough to hold C * N bits.
- * @tparam O Output data type. must be an unsigned integral type large enough to hold C bits..
- * @tparam S Number of bits in each value.
- * @tparam C Number of values to unpack.
- * @param packed Packed integer input of type I.
- * @return Unpacked std::array of type O and size C.
- */
-template <typename I, typename O, size_t S, size_t C> constexpr std::array<O, C> Unpack(I packed) {
-    // type checking
-    static_assert(std::is_unsigned<I>::value, "Packed input type must be unsigned");
-    static_assert(std::is_unsigned<O>::value, "Unpacked output type must be unsigned");
-    static_assert(std::numeric_limits<I>::digits >= (C * S), "Packed input type must be big enough to represent the number of bits multiplied by count");
-    static_assert(std::numeric_limits<O>::digits >= S, "Unpacked output type must be big enough to represent the number of bits");
-
-    constexpr O mask = (1U << S) - 1U;  // maximum value representable by N bits
-    std::array<O, C> vals;              // output values array of size C
-
-    for (unsigned i = 0; i < C; i++) {
-        vals[i] = static_cast<O>(packed >> (i * S)) & mask;
-        assert(vals[i] <= mask);
-    }
-
-    return vals;
-}
-
-/**
- * Packs an array of unsigned integers into a single integer.
- * @tparam I Input data type. Must be an unsigned integral type large enough to hold C bits.
- * @tparam O Output data type. must be an unsigned integral type large enough to hold C * N bits.
- * @tparam S Number of bits in each value.
- * @tparam C Number of values to unpack.
- * @param vals Unpacked std::array of type I and size C.
- * @return Packed integer input of type O.
- */
-template <typename I, typename O, size_t S, size_t C> constexpr O Pack(const std::array<I, C> &vals) {
-    // type checking
-    static_assert(std::is_unsigned<I>::value, "Unpacked input type must be unsigned");
-    static_assert(std::is_unsigned<O>::value, "Packed output type must be unsigned");
-    static_assert(std::numeric_limits<I>::digits >= S, "Unpacked input type must be big enough to represent the number of bits");
-    static_assert(std::numeric_limits<O>::digits >= (C * S), "Packed output type must be big enough to represent the number of bits multiplied by count");
-
-    O packed = 0;  // output value of type O
-
-    for (unsigned i = 0; i < C; i++) {
-        assert(vals[i] <= (1U << S) - 1U);
-        packed |= static_cast<O>(vals[i]) << (i * S);
-    }
-
-    assert(packed <= (static_cast<O>(1U) << (C * S)) - 1U);
-    return packed;
-}
-
-template <size_t Size, int Op(int)> constexpr std::array<uint8_t, Size> ExpandArray() {
-    std::array<uint8_t, Size> res;
-    for (int i = 0; i < Size; i++) { res[i] = Op(i); }
-    return res;
-}
-
-template <typename Seq, typename Fn> constexpr auto MapArray(const Seq &input, Fn op) {
-    using I = typename Seq::value_type;
-    using O = decltype(op(std::declval<I>()));
-    constexpr size_t N = std::tuple_size<Seq>::value;
-
-    std::array<O, N> output;
-    for (unsigned i = 0; i < N; i++) { output[i] = op(input[i]); }
-    return output;
-}
-
-template <typename S> constexpr S scale8To5(S v) {
-    auto v2 = v * 31 + 128;
-    return static_cast<S>((v2 + (v2 >> 8)) >> 8);
-}
-template <typename S> constexpr S scale8To6(S v) {
-    auto v2 = v * 63 + 128;
-    return static_cast<S>((v2 + (v2 >> 8)) >> 8);
-}
-
-template <typename S> constexpr S scale5To8(S v) {
-    assert5bit(v);
-    return static_cast<S>((v << 3) | (v >> 2));
-}
-template <typename S> constexpr S scale6To8(S v) {
-    assert6bit(v);
-    return static_cast<S>((v << 2) | (v >> 4));
-}
-
-template <typename S> constexpr S maximum(S a, S b) { return (a > b) ? a : b; }
-template <typename S> constexpr S maximum(S a, S b, S c) { return maximum(maximum(a, b), c); }
-template <typename S> constexpr S maximum(S a, S b, S c, S d) { return maximum(maximum(maximum(a, b), c), d); }
-
-template <typename S> constexpr S minimum(S a, S b) { return (a < b) ? a : b; }
-template <typename S> constexpr S minimum(S a, S b, S c) { return minimum(minimum(a, b), c); }
-template <typename S> constexpr S minimum(S a, S b, S c, S d) { return minimum(minimum(minimum(a, b), c), d); }
-
-template <typename T> constexpr T square(T a) { return a * a; }
-
-constexpr float clampf(float value, float low = 0.0f, float high = 1.0f) {
-    if (value < low)
-        value = low;
-    else if (value > high)
-        value = high;
-    return value;
-}
-constexpr uint8_t clamp255(int32_t i) { return static_cast<uint8_t>((static_cast<unsigned int>(i) & 0xFFFFFF00U) ? (~(i >> 31)) : i); }
-
-template <typename S> constexpr S clamp(S value, S low, S high) { return (value < low) ? low : ((value > high) ? high : value); }
-constexpr int32_t clampi(int32_t value, int32_t low, int32_t high) {
-    if (value < low)
-        value = low;
-    else if (value > high)
-        value = high;
-    return value;
-}
-
-constexpr int squarei(int a) { return a * a; }
-constexpr int absi(int a) { return (a < 0) ? -a : a; }
-
-template <typename F> constexpr F lerp(F a, F b, F s) { return a + (b - a) * s; }
-
-template <typename... Args> std::string Format(const char *str, const Args &...args) {
-    auto output = std::string(str);
-
-    std::vector<std::string> values = {{args...}};
-
-    for (unsigned i = 0; i < values.size(); i++) {
-        auto key = "{" + std::to_string(i) + "}";
-        auto value = values[i];
-        while (true) {
-            size_t where = output.find(key);
-            if (where == output.npos) break;
-            output.replace(where, key.length(), value);
-        }
-    }
-
-    return output;
-}
--- a/quicktex/util/bitbash.h
+++ b/quicktex/util/bitbash.h
@ -0,0 +1,313 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <concepts>
+#include <limits>
+#include <numeric>
+#include <type_traits>
+
+#include "iterator.h"
+#include "util/math.h"
+#include "util/ranges.h"
+
+#define UINT5_MAX 0x1FU  // 31
+#define UINT6_MAX 0x3FU  // 63
+
+// The argument is parenthesised so that an expression containing operators that bind
+// less tightly than `<=` (e.g. `a & b` or `c ? a : b`) is compared as a whole.
+#define assert5bit(x) assert((x) <= UINT5_MAX)
+#define assert6bit(x) assert((x) <= UINT6_MAX)
+
+namespace quicktex {
+
+/**
+ * Rescale an 8-bit value to N bits with rounding
+ * @tparam N number of bits in the result. Must be less than 8
+ * @tparam S value type
+ * @param v value to rescale. Asserted to be below 256
+ * @return v rescaled to the range of an N-bit integer
+ */
+template <size_t N, typename S> S scale_from_8(S v) {
+    static_assert(N < 8);
+    assert(v < (1 << 8));
+
+    const unsigned top = (1 << N) - 1;  // largest N-bit value
+    const unsigned scaled = (v * top) + 128;
+    const unsigned rounded = scaled + (scaled >> 8);
+    const S result = static_cast<S>(rounded >> 8);
+
+    assert(result < (1 << N));
+
+    return result;
+}
+
+/**
+ * Expand an N-bit value to 8 bits by replicating its bit pattern down the byte
+ * @tparam N number of bits in the input. Must be between 1 and 7
+ * @tparam S value type
+ * @param v value to expand. Asserted to fit in N bits
+ * @return v expanded to 8 bits, so that 0 maps to 0 and the largest N-bit value maps to 255
+ */
+template <size_t N, typename S> S scale_to_8(S v) {
+    static_assert(N > 0 && N < 8);
+    assert(v < (1 << N));
+
+    constexpr int Bits = static_cast<int>(N);
+    const unsigned input = static_cast<unsigned>(v);
+
+    // Place copies of the input at bit offsets 8-N, 8-2N, ... until the byte is full.
+    // A negative offset is a copy that only partially fits, so only its top bits are kept.
+    // For N >= 4 this is exactly (v << (8 - N)) | (v >> (2N - 8)); smaller N need more than two copies.
+    unsigned expanded = 0;
+    for (int shift = 8 - Bits; shift > -Bits; shift -= Bits) {
+        expanded |= (shift >= 0) ? (input << shift) : (input >> -shift);
+    }
+
+    S result = static_cast<S>(expanded);
+
+    assert(result < (1 << 8));
+
+    return result;
+}
+
+/**
+ * Unpacks an unsigned integer into a range of smaller integers.
+ * @param packed value to unpack
+ * @param begin destination start iterator
+ * @param end destination end iterator
+ * @param widths widths iterator. values are in bits. Read once per destination element
+ * @param little_endian if the input has the first element in the least significant place
+ * @return the total number of bits unpacked
+ */
+template <typename P, typename OI, typename WI>
+    requires std::unsigned_integral<P> && std::output_iterator<OI, P> && std::forward_iterator<WI>
+size_t unpack_into(P packed, OI begin, OI end, WI widths, bool little_endian = true) {
+    using U = std::remove_cvref_t<decltype(*begin)>;
+    constexpr unsigned unpacked_digits = std::numeric_limits<U>::digits;
+    constexpr unsigned packed_digits = std::numeric_limits<P>::digits;
+
+    // Mask with the least significant w bits set. Built in type P, and with the full-width case
+    // handled separately, so that widths of 31 bits or more neither overflow int nor shift by a whole word.
+    auto low_bits = [](unsigned w) -> P {
+        if (w >= static_cast<unsigned>(std::numeric_limits<P>::digits)) { return static_cast<P>(~P{0}); }
+        return static_cast<P>((P{1} << w) - 1);
+    };
+
+    if (little_endian) {
+        // first element is in the least significant place of packed
+
+        unsigned offset = 0;
+        while (begin < end) {
+            const auto w = static_cast<unsigned>(*(widths++));
+            assert(w <= unpacked_digits);  // each value must fit in the destination type
+
+            *(begin++) = (packed >> offset) & low_bits(w);  // write to output
+
+            offset += w;  // increment offset
+        }
+
+        assert(offset <= packed_digits);  // detect an overflow condition
+        return offset;
+    } else {
+        // first element is in the most significant place of packed
+
+        // with non-constant width, we either need to iterate backwards or
+        // add up all the widths beforehand to know where to begin
+        // NOTE: `widths + n` needs more than the forward iterator named in the requires-clause
+        unsigned total_offset = std::accumulate(widths, widths + std::distance(begin, end), 0);
+        assert(total_offset <= packed_digits);  // detect an overflow condition
+
+        unsigned offset = total_offset;
+        while (begin < end) {
+            const auto w = static_cast<unsigned>(*(widths++));
+            offset -= w;                   // decrement offset
+            assert(w <= unpacked_digits);  // same limit as the little endian branch
+
+            *(begin++) = (packed >> offset) & low_bits(w);  // write to output
+        }
+
+        return total_offset;
+    }
+}
+
+/**
+ * Unpacks an unsigned integer into a range of smaller integers, taking one width per element.
+ * @param packed value to unpack
+ * @param dest destination range
+ * @param widths widths range. values are in bits. Asserted to be the same size as dest
+ * @param little_endian if the input has the first element in the least significant place
+ * @return the total number of bits unpacked
+ */
+template <typename P, typename OR, typename WR>
+    requires std::unsigned_integral<P> && range<OR> && range<WR>
+size_t unpack_into(P packed, OR &dest, const WR &widths, bool little_endian = true) {
+    assert(size(widths) == size(dest));
+
+    // forward to the iterator overload
+    auto first = dest.begin();
+    auto last = dest.end();
+    auto first_width = widths.begin();
+    return unpack_into(packed, first, last, first_width, little_endian);
+}
+
+/**
+ * Unpacks an unsigned integer into a range of smaller integers that all share one width.
+ * @param packed value to unpack
+ * @param begin destination start iterator
+ * @param end destination end iterator
+ * @param width width of each packed element in bits
+ * @param little_endian if the input has the first element in the least significant place
+ * @return the total number of bits unpacked
+ */
+template <typename P, typename OI>
+    requires std::unsigned_integral<P> && std::output_iterator<OI, P>
+size_t unpack_into(P packed, OI begin, OI end, size_t width, bool little_endian = true) {
+    // the single width is passed on as a widths iterator
+    auto widths = const_iterator(width);
+    return unpack_into(packed, begin, end, widths, little_endian);
+}
+
+/**
+ * Unpacks an unsigned integer into a range of smaller integers that all share one width.
+ * @param packed value to unpack
+ * @param dest destination range
+ * @param width width of each packed element in bits
+ * @param little_endian if the input has the first element in the least significant place
+ * @return the total number of bits unpacked
+ */
+template <typename P, typename OR>
+    requires std::unsigned_integral<P> && range<OR>
+size_t unpack_into(P packed, OR &dest, size_t width, bool little_endian = true) {
+    auto first = dest.begin();
+    auto last = dest.end();
+    // the single width is passed on as a widths iterator
+    auto widths = const_iterator(width);
+    return unpack_into(packed, first, last, widths, little_endian);
+}
+
+/**
+ * Unpacks an unsigned integer into an array of smaller integers
+ * @tparam U unpacked data type
+ * @tparam N number of values to unpack
+ * @param packed value to unpack
+ * @param widths widths iterator. values are in bits. N widths are read from it
+ * @param little_endian if the input has the first element in the least significant place
+ * @return an array of unpacked values
+ */
+template <typename U, size_t N, typename P, typename WI>
+    requires std::unsigned_integral<P> && std::forward_iterator<WI>
+std::array<U, N> unpack(P packed, WI widths, bool little_endian = true) {
+    std::array<U, N> unpacked;
+    // The destination is passed as an iterator pair: widths is an iterator, so only the
+    // iterator overload of unpack_into can accept it (the range overloads take a widths range or a size_t).
+    unpack_into(packed, unpacked.begin(), unpacked.end(), widths, little_endian);
+    return unpacked;
+}
+
+/**
+ * Unpacks an unsigned integer into an array of smaller integers, with the widths given as an array
+ * @tparam U unpacked data type
+ * @tparam N number of values to unpack
+ * @param packed value to unpack
+ * @param widths widths array. values are in bits
+ * @param little_endian if the input has the first element in the least significant place
+ * @return an array of unpacked values
+ */
+template <typename U, size_t N, typename P>
+    requires std::unsigned_integral<P>
+std::array<U, N> unpack(P packed, const std::array<size_t, N> &widths, bool little_endian = true) {
+    // forward to the widths-iterator overload
+    const auto first_width = widths.begin();
+    return unpack<U, N>(packed, first_width, little_endian);
+}
+
+/**
+ * Unpacks an unsigned integer into an array of smaller integers, with the widths given as a range
+ * @tparam U unpacked data type
+ * @tparam N number of values to unpack
+ * @param packed value to unpack
+ * @param widths widths range. values are in bits. Asserted to hold exactly N widths
+ * @param little_endian if the input has the first element in the least significant place
+ * @return an array of unpacked values
+ */
+template <typename U, size_t N, typename P, typename WR>
+    requires std::unsigned_integral<P> && range<WR>
+std::array<U, N> unpack(P packed, const WR &widths, bool little_endian = true) {
+    assert(size(widths) == N);
+
+    // forward to the widths-iterator overload
+    const auto first_width = widths.begin();
+    return unpack<U, N>(packed, first_width, little_endian);
+}
+
+/**
+ * Unpacks an unsigned integer into an array of smaller integers that all share one width
+ * @tparam U unpacked data type
+ * @tparam N number of values to unpack
+ * @param packed value to unpack
+ * @param width width of each packed element in bits
+ * @param little_endian if the input has the first element in the least significant place
+ * @return an array of unpacked values
+ */
+template <typename U, size_t N, typename P>
+    requires std::unsigned_integral<P>
+std::array<U, N> unpack(P packed, size_t width, bool little_endian = true) {
+    std::array<U, N> result;
+
+    // fill the array in place; the bit count returned by unpack_into is not needed here
+    unpack_into(packed, result, width, little_endian);
+
+    return result;
+}
+
+/**
+ * Packs an iterable of integers into a single integer.
+ * @tparam II input iterator type
+ * @tparam WI width iterator type
+ * @tparam P Output data type. must be an unsigned integral type large enough to hold all input values
+ * @param start start iterator
+ * @param end end iterator
+ * @param widths width iterator. must be at least as large as the input data
+ * @param little_endian if the output value should have the first element in the least significant place
+ * of the output or not
+ * @return Packed integer of type P. Bits of an input value beyond its width are discarded.
+ */
+template <typename P, typename II, typename WI>
+    requires std::unsigned_integral<P> && std::input_iterator<II> && std::input_iterator<WI>
+inline constexpr P pack(II start, II end, WI widths, bool little_endian = true) {
+    constexpr unsigned digits = std::numeric_limits<P>::digits;
+
+    P packed = 0;
+    unsigned offset = 0;
+    while (start < end) {
+        P val = static_cast<P>(*(start++));
+        const auto w = static_cast<unsigned>(*(widths++));
+
+        // Keep only the low w bits. The mask is built in type P, and the full-width case is handled
+        // separately, so that widths of 31 bits or more neither overflow int nor shift by a whole word.
+        const P mask = (w >= digits) ? static_cast<P>(~P{0}) : static_cast<P>((P{1} << w) - 1);
+        val &= mask;
+
+        if (little_endian) {
+            packed |= static_cast<P>(val << offset);  // first element is in the least significant place of packed
+        } else {
+            // first element is in the most significant place of packed.
+            // a full-width value replaces everything, which avoids shifting by the whole word
+            packed = (w >= digits) ? val : static_cast<P>((packed << w) | val);
+        }
+
+        offset += w;  // increment offset
+    }
+
+    assert(offset <= digits);  // detect an overflow condition
+    return packed;
+}
+
+/**
+ * Packs an iterable of integers into a single integer.
+ * @tparam IR input range type
+ * @tparam WR width range type
+ * @tparam P Output data type. must be an unsigned integral type large enough to hold all input values
+ * @param r range of values to pack
+ * @param widths range of widths to pack with. Asserted to be the same size as r
+ * @param little_endian if the output value should have the first element in the least significant place
+ * of the output or not
+ * @return Packed integer of type P.
+ */
+template <typename P, typename IR, typename WR>
+    requires std::unsigned_integral<P> && range<IR> && range<WR>
+inline constexpr P pack(IR r, WR widths, bool little_endian = true) {
+    assert(size(widths) == size(r));
+    // ranges expose begin()/end(); there is no start() member to call on widths
+    return pack<P>(r.begin(), r.end(), widths.begin(), little_endian);
+}
+
+/**
+ * Packs an iterable of integers that all share one width into a single integer.
+ * @tparam II input iterator type
+ * @tparam P Output data type. must be an unsigned integral type large enough to hold all input values
+ * @param start start iterator
+ * @param end end iterator
+ * @param width Number of bits in each value
+ * @param little_endian if the output value should have the first element in the least significant place
+ * of the output or not
+ * @return Packed integer of type P.
+ */
+template <typename P, typename II>
+    requires std::unsigned_integral<P> && std::input_iterator<II>
+inline constexpr P pack(II start, II end, size_t width, bool little_endian = true) {
+    // the single width is passed on as a widths iterator
+    auto widths = const_iterator(width);
+    return pack<P>(start, end, widths, little_endian);
+}
+
+/**
+ * Packs a range of integers that all share one width into a single integer.
+ * @tparam IR range type
+ * @tparam P Output data type. must be an unsigned integral type large enough to hold all input values
+ * @param r range of values to pack
+ * @param width Number of bits in each value
+ * @param little_endian if the output value should have the first element in the least significant place
+ * of the output or not
+ * @return Packed integer of type P.
+ */
+template <typename P, typename IR>
+    requires std::unsigned_integral<P> && range<IR>
+inline constexpr P pack(IR r, size_t width, bool little_endian = true) {
+    auto first = r.begin();
+    auto last = r.end();
+    // the single width is passed on as a widths iterator
+    auto widths = const_iterator(width);
+    return pack<P>(first, last, widths, little_endian);
+}
+}  // namespace quicktex
--- a/quicktex/util/bitwiseEnums.h
+++ b/quicktex/util/bitwiseEnums.h
@ -21,38 +21,48 @@

 #include <type_traits>

+namespace quicktex {
+
 // Thanks dkavolis
-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator~(E a) noexcept -> E {
+// Bitwise NOT of an enumerator, computed on the enum's underlying integer type.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator~(E a) noexcept -> E {
    using Base = std::underlying_type_t<E>;
    return static_cast<E>(~static_cast<Base>(a));
 }

-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator|(E a, E b) noexcept -> E {
+// Bitwise OR of two enumerators of the same enum type, computed on the underlying integer type.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator|(E a, E b) noexcept -> E {
    using Base = std::underlying_type_t<E>;
    return static_cast<E>(static_cast<Base>(a) | static_cast<Base>(b));
 }

-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator&(E a, E b) noexcept -> E {
+// Bitwise AND of two enumerators of the same enum type, computed on the underlying integer type.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator&(E a, E b) noexcept -> E {
    using Base = std::underlying_type_t<E>;
    return static_cast<E>(static_cast<Base>(a) & static_cast<Base>(b));
 }

-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator^(E a, E b) noexcept -> E {
+// Bitwise XOR of two enumerators of the same enum type, computed on the underlying integer type.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator^(E a, E b) noexcept -> E {
    using Base = std::underlying_type_t<E>;
    return static_cast<E>(static_cast<Base>(a) ^ static_cast<Base>(b));
 }

-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator|=(E& a, E b) noexcept -> E& {
+// Compound OR: stores a | b in a and returns a reference to a.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator|=(E& a, E b) noexcept -> E& {
    a = a | b;
    return a;
 }

-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator&=(E& a, E b) noexcept -> E& {
+// Compound AND: stores a & b in a and returns a reference to a.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator&=(E& a, E b) noexcept -> E& {
    a = a & b;
    return a;
 }

-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator^=(E& a, E b) noexcept -> E& {
+// Compound XOR: stores a ^ b in a and returns a reference to a.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator^=(E& a, E b) noexcept -> E& {
    a = a ^ b;
    return a;
-}
+}
+}  // namespace quicktex
--- a/quicktex/util/iterator.h
+++ b/quicktex/util/iterator.h
@ -0,0 +1,146 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <compare>
+#include <cstddef>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+
+namespace quicktex {
+
+namespace detail {
+// Type obtained by subscripting an lvalue of R with an integer, with any reference removed.
+template <class R> using subs_value_t = std::remove_reference_t<decltype(std::declval<R &>()[0])>;
+}
+
+/**
+ * CRTP base class that implements index arithmetic for iterators addressed by an integer position.
+ * The derived type D supplies operator*, operator-> and operator==; this base adds increment/decrement,
+ * offsetting, distance, ordering and subscripting on top of a stored index.
+ * @tparam D derived iterator type (CRTP)
+ * @tparam T type a dereferenced iterator refers to
+ */
+template <typename D, typename T> class index_iterator_base {
+   public:
+    using value_type = T;
+    using size_type = int;
+    using difference_type = int;
+
+    D &operator++() {
+        _index++;
+        return static_cast<D &>(*this);
+    }
+    D operator++(int) {
+        D old = static_cast<D &>(*this);
+        _index++;
+        return old;
+    }
+    D &operator--() {
+        _index--;
+        return static_cast<D &>(*this);
+    }
+    D operator--(int) {
+        D old = static_cast<D &>(*this);
+        _index--;
+        return old;
+    }
+
+    // Returns a copy of the derived iterator moved forward by rhs positions
+    D operator+(difference_type rhs) const {
+        D d = static_cast<const D &>(*this);
+        d._index += rhs;
+        return d;
+    }
+
+    // Returns a copy of the derived iterator moved backward by rhs positions
+    D operator-(difference_type rhs) const {
+        D d = static_cast<const D &>(*this);
+        d._index -= rhs;
+        return d;
+    }
+
+    D &operator+=(difference_type rhs) {
+        // `*this` is the base sub-object here and cannot bind to D&, so cast to the derived type before returning
+        _index += rhs;
+        return static_cast<D &>(*this);
+    }
+
+    D &operator-=(difference_type rhs) {
+        _index -= rhs;
+        return static_cast<D &>(*this);
+    }
+
+    // Signed distance between two iterators, measured in index positions
+    difference_type operator-(const D &rhs) const { return (difference_type)_index - rhs._index; }
+
+    friend D operator+(difference_type lhs, const D &rhs) { return rhs + lhs; }
+
+    // Ordering compares the stored indices only
+    friend auto operator<=>(const D &lhs, const D &rhs) { return lhs._index <=> rhs._index; }
+
+    // NOTE(review): these dereference a temporary iterator, so the returned reference is only valid when
+    // D::operator* refers to storage that lives outside the iterator object itself
+    T &operator[](difference_type i) { return *(static_cast<D &>(*this) + i); }
+    T &operator[](difference_type i) const { return *(static_cast<const D &>(*this) + i); }
+
+   protected:
+    int _index;
+
+   private:
+    // Only the derived type may construct the base (enforces correct CRTP usage)
+    friend D;
+    // Takes the index as a signed value so negative positions are not routed through an unsigned type
+    index_iterator_base(difference_type index = 0) : _index(index) {}
+};
+
+/**
+ * Iterator over any object that can be subscripted with an integer.
+ * It stores the address of the range plus an index; dereferencing forwards to the range's operator[].
+ * @tparam R subscriptable range type
+ */
+template <typename R>
+    requires requires(const R &r) { r[0]; }
+class index_iterator : public index_iterator_base<index_iterator<R>, detail::subs_value_t<R>> {
+   public:
+    using base = index_iterator_base<index_iterator<R>, detail::subs_value_t<R>>;
+    using typename base::difference_type;
+    using typename base::size_type;
+    using typename base::value_type;
+
+    // Detached iterator: index 0 and no range attached, so it must not be dereferenced
+    index_iterator() : base(0), _range(nullptr) {}
+
+    // Iterator at position `index` of `range`; only the address of `range` is kept
+    index_iterator(R &range, int index) : base(index), _range(&range) {}
+
+    value_type &operator*() const {
+        // debug-only checks: a range is attached and the index lies inside [0, size)
+        assert(_range != nullptr);
+        assert(this->_index >= 0);
+        assert(this->_index < (size_type)_range->size());
+        R &target = *_range;
+        return target[this->_index];
+    }
+    value_type *operator->() const { return &**this; }
+
+    // Equal only when both iterators point into the same range object at the same index
+    friend bool operator==(const index_iterator &lhs, const index_iterator &rhs) {
+        if (lhs._range != rhs._range) return false;
+        return lhs._index == rhs._index;
+    }
+
+   private:
+    R *_range;
+};
+
+/**
+ * Iterator that yields the same stored value at every position; only its index changes when it moves.
+ * @tparam T type of the repeated value
+ */
+template <typename T> class const_iterator : public index_iterator_base<const_iterator<T>, const T> {
+   public:
+    using base = index_iterator_base<const_iterator<T>, const T>;
+    using typename base::difference_type;
+    using typename base::size_type;
+    using typename base::value_type;
+
+    // Default: repeats a value-initialized T, starting at index 0
+    const_iterator() : base(0), _value(T{}) {}
+    // Repeats `value`, starting at position `index`
+    const_iterator(T value, int index = 0) : base(index), _value(value) {}
+
+    value_type &operator*() const { return _value; }
+    value_type *operator->() const { return &_value; }
+
+    // Every position holds the same value, so subscripting returns this iterator's own copy.
+    // The inherited operator[] dereferences a temporary iterator; because the value lives inside the
+    // iterator object, that would hand back a dangling reference.
+    value_type &operator[](difference_type) const { return _value; }
+
+    // Equal when both the repeated value and the index match
+    friend bool operator==(const const_iterator &lhs, const const_iterator &rhs) {
+        return (lhs._value == rhs._value) && (lhs._index == rhs._index);
+    }
+
+   private:
+    T _value;
+};
+
+// const_iterator is guaranteed to be a random access iterator. it is not writable for obvious reasons
+static_assert(std::random_access_iterator<const_iterator<int>>);
+
+// index_iterator over a subscriptable container satisfies random_access_iterator
+static_assert(std::random_access_iterator<index_iterator<std::array<int, 4>>>);
+}  // namespace quicktex
--- a/quicktex/util/map.h
+++ b/quicktex/util/map.h
@ -0,0 +1,178 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <tuple>
+#include <xsimd/xsimd.hpp>
+
+#include "util/ranges.h"
+
+namespace quicktex {
+
+namespace detail {
+
+// A type can be processed in SIMD batches when it is a random access range, its begin() iterator is
+// contiguous, and its elements are arithmetic.
+template <typename T>
+concept simdable = random_access_range<T> && std::contiguous_iterator<decltype(std::declval<T>().begin())> &&
+                   std::is_arithmetic_v<range_value_t<T>>;
+// Primary template is empty; the constrained partial specializations of chunker_impl describe how a value of
+// type T is split into chunks. `serial` selects one-element chunks even for SIMDable ranges.
+template <typename T, bool serial = false> struct chunker_impl {};
+
+// Chunking for SIMDable ranges when serial processing is not requested:
+// step 0 walks whole xsimd batches, step 1 walks the leftover elements one at a time.
+template <typename T, bool serial>
+    requires simdable<T> && (!serial)
+struct chunker_impl<T, serial> {
+    // range with contiguous, SIMDable data
+
+    static constexpr size_t steps = 2;
+    // chunk type per step: a batch for step 0, a single element for step 1
+    using chunk_types = std::tuple<xsimd::batch<range_value_t<T>>, range_value_t<T>>;
+
+    template <size_t step> using chunk_type = std::tuple_element_t<step, chunk_types>;
+    // number of range elements covered by one chunk of each step
+    static constexpr std::array<size_t, 2> chunk_sizes = {chunk_type<0>::size, 1};
+
+    // step 0: number of full batches that fit in r; otherwise: number of elements left after those batches
+    template <size_t step> static constexpr size_t chunk_count(const T& r) {
+        if constexpr (step == 0) {
+            return std::size(r) / chunk_sizes[0];
+        } else {
+            return std::size(r) % chunk_sizes[0];
+        }
+    }
+
+    // Reads chunk i of the given step: an unaligned batch load for step 0, a single trailing element otherwise
+    template <size_t step> static constexpr auto get_chunk(const T& r, size_t i) {
+        assert(i < chunk_count<step>(r));
+        if constexpr (step == 0) {
+            return xsimd::load_unaligned(&r[chunk_sizes[0] * i]);
+        } else {
+            // trailing elements start right after the last full batch
+            return r[chunk_sizes[0] * chunk_count<0>(r) + i];
+        }
+    }
+
+    // Writes chunk i of the given step back into r, mirroring the addressing used by get_chunk
+    template <size_t step>
+    static constexpr void set_chunk(T& r, size_t i, const std::tuple_element_t<step, chunk_types>& c) {
+        assert(i < chunk_count<step>(r));
+        if constexpr (step == 0) {
+            xsimd::store_unaligned(&r[chunk_sizes[0] * i], c);
+        } else {
+            r[chunk_sizes[0] * chunk_count<0>(r) + i] = c;
+        }
+    }
+};
+
+// Chunking for random access ranges that are not SIMDable, or for which serial processing was requested:
+// a single step in which every chunk is one element.
+template <typename T, bool serial>
+    requires random_access_range<T> && (!simdable<T> || serial)
+struct chunker_impl<T, serial> {
+    // range with data that cant be SIMDed
+    static constexpr size_t steps = 1;
+    // the element type is the chunk type regardless of the step index
+    template <size_t step> using chunk_type = range_value_t<T>;
+    static constexpr std::array<size_t, 1> chunk_sizes = {1};
+
+    // one chunk per element; the step index is not consulted
+    template <size_t step> static constexpr size_t chunk_count(const T& r) { return r.size(); }
+    template <size_t step> static constexpr auto get_chunk(const T& r, size_t i) { return r[i]; }
+    template <size_t step> static constexpr void set_chunk(T& r, size_t i, const chunk_type<0>& c) { r[i] = c; }
+};
+
+// Chunking for anything that is not a sized range: the whole value is the one and only chunk,
+// and it is returned unchanged for every chunk index.
+template <typename T, bool serial>
+    requires(!sized_range<T>)
+struct chunker_impl<T, serial> {
+    static constexpr size_t steps = 1;
+    using chunk_types = std::tuple<T>;
+    template <size_t step> using chunk_type = T;
+
+    static constexpr std::array<size_t, 1> chunk_sizes = {1};
+
+    // the index argument is ignored by get_chunk and set_chunk
+    template <size_t step> static constexpr size_t chunk_count(const T&) { return 1; }
+    template <size_t step> static constexpr auto get_chunk(const T& r, size_t) { return r; }
+    template <size_t step> static constexpr void set_chunk(T& r, size_t, const T& c) { r = c; }
+};
+
+// Shorthand for the chunk type that chunker_impl<T, serial> uses at the given step.
+template <typename T, bool serial = false, size_t step = 0>
+using chunk_type = typename chunker_impl<T, serial>::template chunk_type<step>;
+
+// True when Op can be invoked with the step-th chunk type of every argument and its result converts to the
+// step-th chunk type of T.
+template <typename T, bool serial, typename Op, std::size_t step, typename... Args>
+static constexpr bool callable_step() {
+    return std::is_invocable_r_v<typename chunker_impl<T, serial>::template chunk_type<step>, Op,
+                                 typename chunker_impl<Args, serial>::template chunk_type<step>...>;
+}
+
+// True when callable_step holds for every step index in the given sequence.
+template <typename T, bool serial, typename Op, typename... Args, std::size_t... steps>
+static constexpr bool callable_steps(std::index_sequence<steps...>) {
+    return (callable_step<T, serial, Op, steps, Args...>() && ...);
+}
+
+// True when Op is usable for mapping Args into T with the given chunking mode.
+// Only step 0 is checked: the index sequence passed below has length 1.
+template <typename T, bool serial, typename Op, typename... Args> static constexpr bool callable() {
+    //    if constexpr (!(std::same_as<T, Args> && ...)) return false;
+    //    return callable_steps<T, serial, Op>(std::make_index_sequence<chunker_impl<T, serial>::steps>());
+    return callable_steps<T, serial, Op, Args...>(std::make_index_sequence<1>());
+}
+
+// Runs one chunking step: for every chunk index, reads that chunk from each argument, applies f to them,
+// and stores the outcome in the matching chunk of `result`.
+template <typename T, bool serial, size_t step, typename... Args>
+    requires((std::is_scalar_v<Args> || std::same_as<T, Args>) && ...)
+inline void do_map_step(auto f, T& result, const Args&... args) {
+    using out_chunker = chunker_impl<T, serial>;
+    using out_chunk_t = typename out_chunker::template chunk_type<step>;
+
+    // the output decides how many chunks this step has
+    const size_t total = out_chunker::template chunk_count<step>(result);
+    for (unsigned n = 0; n < total; ++n) {
+        out_chunk_t mapped = f(chunker_impl<Args, serial>::template get_chunk<step>(args, n)...);
+        out_chunker::template set_chunk<step>(result, n, mapped);
+    }
+}
+
+// Runs do_map_step once for each step index in the sequence, in order.
+template <typename T, bool serial, typename Op, std::size_t... steps, typename... Args>
+    requires((std::is_scalar_v<Args> || std::same_as<T, Args>) && ...)
+inline void do_map_steps(Op f, T& result, std::index_sequence<steps...>, const Args&... args) {
+    //    static_assert(callable<T, serial, Op, Args...>());
+
+    (do_map_step<T, serial, steps>(f, result, args...), ...);
+}
+
+// Maps f over args into `result`, using SIMD chunks when f accepts them and one-element chunks otherwise.
+template <typename T, bool serial, typename Op, typename... Args>
+    requires((std::is_scalar_v<Args> || std::same_as<T, Args>) && ...)
+inline void do_map_all(Op f, T& result, const Args&... args) {
+    // fall back to serial chunks when f cannot be invoked on the non-serial (possibly SIMD) chunk types
+    constexpr bool must_serialize = serial || !callable<T, false, Op, Args...>();
+    // The step count has to come from the chunker that actually runs. Taking it from chunker_impl<T, serial>
+    // would run the one-step serial chunker twice whenever the fallback kicks in for a SIMDable range.
+    using impl = chunker_impl<T, must_serialize>;
+    do_map_steps<T, must_serialize>(f, result, std::make_index_sequence<impl::steps>(), args...);
+}
+}  // namespace detail
+
+/**
+ * Applies f element by element and collects the results in a value-initialized R.
+ * @tparam R result type; must be subscriptable and hold at least in.size() elements
+ * @param f operation applied to the i-th element of `in` and of every extra argument
+ * @param in primary input range; its size decides how many elements are produced
+ * @param args additional input ranges, indexed in lock-step with `in`
+ * @return R with the first in.size() positions overwritten by the results of f
+ */
+template <typename R, typename T, bool serial = false, typename Op, typename... Args>
+    requires sized_range<T> && (sized_range<Args> && ...)
+inline R map_to(Op f, const T& in, const Args&... args) {
+    // input and output types differ here, so no chunked/SIMD path is attempted
+    R mapped{};
+    const auto count = in.size();
+    for (unsigned idx = 0; idx < count; ++idx) {
+        mapped[idx] = f(in[idx], args[idx]...);
+    }
+    return mapped;
+}
+
+/**
+ * Applies f across `in` (together with any extra arguments) and returns the collected results.
+ * When every extra argument is a scalar or has type T, and f applied to the element types yields something
+ * convertible to T's element type, the result is a T filled through the chunked path in detail::do_map_all.
+ * Otherwise the result is a std::array of f's result type with std::tuple_size_v<T> elements, filled by map_to.
+ * @tparam T input range type
+ * @tparam serial forwarded to the chunked path to request one-element chunks
+ * @param f operation to apply
+ * @param in primary input range
+ * @param args extra arguments passed alongside each element of `in`
+ */
+template <typename T, bool serial = false, typename Op, typename... Args>
+    requires sized_range<T>
+inline auto map(Op f, const T& in, const Args&... args) {
+    //    assert(((in.size() == args.size())) && ...);
+
+    if constexpr (((std::is_scalar_v<Args> || std::same_as<T, Args>)&&...) &&
+                  (detail::callable<T, true, Op, T, Args...>())) {
+        // the input and result types are all the same type and size, so we can attempt chunking
+        T result{};
+        detail::do_map_all<T, serial>(f, result, in, args...);
+        return result;
+    } else {
+        // result element type is whatever f returns for one element of `in` and of each extra argument
+        using result_type = std::invoke_result_t<Op, typename detail::chunk_type<T, true>, range_value_t<Args>...>;
+        return map_to<std::array<result_type, std::tuple_size_v<T>>, T, serial>(f, in, args...);
+    }
+}
+
+}  // namespace quicktex
--- a/quicktex/util/math.h
+++ b/quicktex/util/math.h
@ -0,0 +1,84 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <numeric>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "util/ranges.h"
+#include "xsimd/xsimd.hpp"
+
+namespace quicktex {
+
+using std::abs;    // abs overload for builtin types
+using xsimd::abs;  // abs overload for xsimd buffers
+
+// abs overload for types that provide their own abs() member function.
+template <typename S>
+    requires requires(S &s) { s.abs(); }
+constexpr S abs(S value) {
+    return value.abs();
+}
+
+// clamp overload for types that provide their own clamp(low, high) member function.
+template <typename S, typename R>
+    requires requires(S s, R r) { s.clamp(r, r); }
+constexpr S clamp(S value, R low, R high) {
+    return value.clamp(low, high);
+}
+
+// clamp overload for scalar types: yields low when value < low, high when value > high, otherwise value.
+// The bounds must satisfy low <= high (checked by assert in debug builds).
+template <typename S>
+    requires std::is_scalar_v<S>
+constexpr S clamp(S value, S low, S high) {
+    assert(low <= high);
+    return (value < low) ? low : ((value > high) ? high : value);
+}
+
+// clamp overload for xsimd batches with batch bounds; forwards to xsimd::clip.
+template <typename S, typename A>
+constexpr xsimd::batch<S, A> clamp(xsimd::batch<S, A> value, const xsimd::batch<S, A> &low,
+                                   const xsimd::batch<S, A> &high) {
+    return xsimd::clip(value, low, high);
+}
+
+// clamp overload for xsimd batches with scalar bounds: each bound is broadcast to a full batch first.
+template <typename S, typename A>
+constexpr xsimd::batch<S, A> clamp(xsimd::batch<S, A> value, const S &low, const S &high) {
+    // broadcast for architecture A explicitly; without the template arguments xsimd::broadcast produces a
+    // default-architecture batch, which does not match batch<S, A> when A is not the default
+    return clamp(value, xsimd::broadcast<S, A>(low), xsimd::broadcast<S, A>(high));
+}
+
+// sum overload for types that provide their own sum() member function.
+template <typename S>
+    requires requires(S &s) { s.sum(); }
+constexpr auto sum(S value) {
+    return value.sum();
+}
+
+// sum overload for scalars: returns the value unchanged.
+template <typename S>
+    requires std::is_scalar_v<S>
+constexpr auto sum(S value) {
+    return value;
+    // horizontally adding a scalar is a noop
+}
+
+// sum overload for xsimd batches; forwards to xsimd::hadd.
+template <typename S, typename A> constexpr auto sum(xsimd::batch<S, A> value) { return xsimd::hadd(value); }
+}  // namespace quicktex
--- a/quicktex/util/ranges.h
+++ b/quicktex/util/ranges.h
@ -0,0 +1,74 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <string>
+#include <type_traits>
+
+namespace quicktex {
+
+// std::ranges is not usable by default in libc++ 13
+// A range is any type whose lvalues provide begin() and end() member functions.
+template <class T>
+concept range = requires(T &t) {
+                    t.begin();
+                    t.end();
+                };
+
+// std::size is made visible here alongside the overload below, so an unqualified size(x) can find both.
+using std::size;
+// Size of any range, computed as the distance from begin() to end().
+template <range T> constexpr auto size(const T &range) { return std::distance(range.begin(), range.end()); }
+
+// A range for which an unqualified size(t) call is well-formed.
+template <class T>
+concept sized_range = range<T> && requires(T &t) { size(t); };
+
+// Associated types of a range R, derived from the begin()/end() members of an lvalue R and from size().
+template <class R> using iterator_t = decltype(std::declval<R &>().begin());
+template <class R> using sentinel_t = decltype(std::declval<R &>().end());
+template <class R> using range_size_t = decltype(size(std::declval<R &>()));
+// The remaining aliases forward to the matching std::iter_* trait of the range's iterator type.
+template <class R> using range_difference_t = std::iter_difference_t<iterator_t<R>>;
+template <class R> using range_value_t = std::iter_value_t<iterator_t<R>>;
+template <class R> using range_reference_t = std::iter_reference_t<iterator_t<R>>;
+template <class R> using range_rvalue_reference_t = std::iter_rvalue_reference_t<iterator_t<R>>;
+
+// Range categories: each concept requires `range` plus the matching standard iterator concept on iterator_t<R>.
+template <class R>
+concept input_range = range<R> && std::input_iterator<iterator_t<R>>;
+
+// Range whose iterator can be written through with values of type T.
+template <class R, typename T>
+concept output_range = range<R> && (std::output_iterator<iterator_t<R>, T>);
+
+template <class R>
+concept forward_range = range<R> && std::forward_iterator<iterator_t<R>>;
+
+template <class R>
+concept bidirectional_range = range<R> && std::bidirectional_iterator<iterator_t<R>>;
+
+template <class R>
+concept random_access_range = range<R> && std::random_access_iterator<iterator_t<R>>;
+
+template <class R>
+concept contiguous_range = range<R> && std::contiguous_iterator<iterator_t<R>>;
+
+}  // namespace quicktex
--- a/quicktex/util/simd.h
+++ b/quicktex/util/simd.h
@ -0,0 +1,97 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <type_traits>
+
+#include "util/math.h"
+#include "util/types.h"
+#include "xsimd/xsimd.hpp"
+
+// Shorthand for xsimd's architecture tag type, used as the last parameter of the whadd kernel overloads.
+// NOTE(review): this alias is declared at global scope, outside namespace quicktex.
+template <typename T> using requires_arch = xsimd::kernel::requires_arch<T>;
+
+namespace quicktex::simd {
+
+namespace kernel {
+
+#if XSIMD_WITH_NEON64
+// NEON64 kernels: one overload per 8/16/32-bit signed and unsigned lane type, each forwarding to the
+// matching vaddlvq_* intrinsic and returning the next wider integer type.
+template <class A> inline int16_t whadd(xsimd::batch<int8_t, A> const& arg, requires_arch<xsimd::neon64>) {
+    return vaddlvq_s8(arg);
+}
+
+template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::neon64>) {
+    return vaddlvq_s16(arg);
+}
+
+template <class A> inline int64_t whadd(xsimd::batch<int32_t, A> const& arg, requires_arch<xsimd::neon64>) {
+    return vaddlvq_s32(arg);
+}
+
+template <class A> inline uint16_t whadd(xsimd::batch<uint8_t, A> const& arg, requires_arch<xsimd::neon64>) {
+    return vaddlvq_u8(arg);
+}
+
+template <class A> inline uint32_t whadd(xsimd::batch<uint16_t, A> const& arg, requires_arch<xsimd::neon64>) {
+    return vaddlvq_u16(arg);
+}
+
+template <class A> inline uint64_t whadd(xsimd::batch<uint32_t, A> const& arg, requires_arch<xsimd::neon64>) {
+    return vaddlvq_u32(arg);
+}
+#endif
+
+#if XSIMD_WITH_SSE2
+// SSE2 kernel for int16 lanes, returning the total as int32.
+template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::sse2>) {
+    // Pairwise widening sum with multiply by 1, then sum all N/2 widened lanes
+    xsimd::batch<int32_t, A> paired = _mm_madd_epi16(arg, _mm_set1_epi16(1));
+    return xsimd::hadd(paired);
+}
+#endif
+
+#if XSIMD_WITH_AVX2
+// AVX2 kernel for int16 lanes, returning the total as int32; same approach as SSE2 with 256-bit intrinsics.
+template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::avx2>) {
+    // Pairwise widening sum with multiply by 1, then sum all N/2 widened lanes
+    xsimd::batch<int32_t, A> paired = _mm256_madd_epi16(arg, _mm256_set1_epi16(1));
+    return xsimd::hadd(paired);
+}
+#endif
+
+// Generic kernel: spills the batch into an aligned stack buffer and adds the lanes one at a time,
+// accumulating in the next wider integer type.
+template <class A, class T> inline next_size_t<T> whadd(xsimd::batch<T, A> const& arg, requires_arch<xsimd::generic>) {
+    // Generic implementation that should work everywhere
+    using batch_t = xsimd::batch<T, A>;
+    using wide_t = next_size_t<T>;
+    const auto lanes = batch_t::size;
+
+    alignas(A::alignment()) T lane_values[lanes];
+    arg.store_aligned(lane_values);
+
+    wide_t total = 0;
+    for (std::size_t i = 0; i < lanes; ++i) { total += static_cast<wide_t>(lane_values[i]); }
+    return total;
+}
+}  // namespace kernel
+
+// Widening horizontal add: sums all lanes of a batch into the next wider integer type.
+// Dispatches to a kernel overload by passing an instance of the batch's architecture tag A.
+template <class A, class T> inline next_size_t<T> whadd(xsimd::batch<T, A> const& arg) {
+    return kernel::whadd(arg, A{});
+}
+
+}  // namespace quicktex::simd
--- a/quicktex/util/subrange.h
+++ b/quicktex/util/subrange.h
@ -0,0 +1,97 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <concepts>
+#include <iterator>
+
+#include "util/ranges.h"
+
+namespace quicktex {
+
+/**
+ * Non-owning view over the elements between an iterator and a sentinel.
+ * Copies of the iterator and sentinel are stored; the underlying elements are not.
+ * @tparam I iterator type marking the first element
+ * @tparam S sentinel type marking the end (defaults to I)
+ */
+template <std::input_or_output_iterator I, std::sentinel_for<I> S = I> struct subrange {
+   public:
+    using iterator_type = I;
+    using sentinel_type = S;
+    using value_type = std::iter_value_t<I>;
+    using reference_type = std::iter_reference_t<I>;
+    using difference_type = std::iter_difference_t<I>;
+
+    constexpr subrange(const I& b, const S& e) : _begin(b), _end(e) {}
+
+    constexpr I begin() const { return _begin; }
+    constexpr S end() const { return _end; }
+    constexpr bool empty() const { return _begin == _end; }
+    // number of elements from begin to end; the arguments must be in this order to get a non-negative count
+    constexpr difference_type size() const { return std::distance(_begin, _end); }
+
+    // true when the subrange still has elements
+    explicit constexpr operator bool() const { return !empty(); }
+
+    /**
+     * Moves the start of the subrange by n positions and returns *this.
+     * Forward movement stops early once the start reaches the end.
+     * Backward movement requires a bidirectional iterator and is not bounds checked.
+     */
+    constexpr subrange& advance(difference_type n) {
+        assert(n >= 0 || std::bidirectional_iterator<I>);  // forward iterators cannot be decremented
+
+        if (n > 0) {
+            for (difference_type i = 0; i < n && _begin != _end; i++) { _begin++; }
+        } else if constexpr (std::bidirectional_iterator<I>) {
+            // only compiled for iterators that can step back, so forward-only iterators can still use advance.
+            // No `_begin != _end` test here: it would block stepping back from an exhausted subrange.
+            for (difference_type i = 0; i > n; i--) { _begin--; }
+        }
+        return *this;
+    }
+
+    // copy of this subrange with its start moved forward by n positions
+    constexpr subrange next(difference_type n = 1) const {
+        auto tmp = *this;
+        return tmp.advance(n);
+    }
+
+    // copy of this subrange with its start moved backward by n positions
+    template <typename _ = I>
+        requires std::bidirectional_iterator<I>
+    constexpr subrange prev(difference_type n = 1) const {
+        return next(-n);
+    }
+
+    // element i counted from the current start; the index is range checked by assert in debug builds
+    template <typename _ = I>
+        requires std::random_access_iterator<I>
+    constexpr reference_type operator[](difference_type i) {
+        assert(i >= 0 && i < size());
+        return _begin[i];
+    }
+
+    // same as above for const subranges; `const` on a reference type has no effect, so it is not spelled out
+    template <typename _ = I>
+        requires std::random_access_iterator<I>
+    constexpr reference_type operator[](difference_type i) const {
+        assert(i >= 0 && i < size());
+        return _begin[i];
+    }
+
+    // pointer to the first element; only available for contiguous iterators
+    // NOTE(review): std::to_address is declared in <memory>, which this header does not include directly
+    template <typename _ = I>
+        requires std::contiguous_iterator<I>
+    constexpr value_type* data() {
+        return std::to_address(_begin);
+    }
+    template <typename _ = I>
+        requires std::contiguous_iterator<I>
+    constexpr value_type const* data() const {
+        return std::to_address(_begin);
+    }
+
+   private:
+    I _begin;
+    S _end;
+};
+}  // namespace quicktex
--- a/quicktex/util/types.h
+++ b/quicktex/util/types.h
@ -0,0 +1,49 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <cstdint>
+
+namespace quicktex {
+// next_size<T>::type is the integer type with twice the width of T and the same signedness.
+// Only the 8, 16 and 32 bit fixed-width types are specialized; the primary template is left undefined.
+template <class> struct next_size;
+template <class T> using next_size_t = typename next_size<T>::type;
+// Helper base that exposes T as a nested `type` member.
+template <class T> struct type_tag { using type = T; };
+
+template <> struct next_size<int8_t> : type_tag<int16_t> {};
+template <> struct next_size<int16_t> : type_tag<int32_t> {};
+template <> struct next_size<int32_t> : type_tag<int64_t> {};
+
+template <> struct next_size<uint8_t> : type_tag<uint16_t> {};
+template <> struct next_size<uint16_t> : type_tag<uint32_t> {};
+template <> struct next_size<uint32_t> : type_tag<uint64_t> {};
+
+// Smallest of uint8_t/uint16_t/uint32_t/uint64_t with at least bitCount bits; void when bitCount exceeds 64.
+// NOTE(review): std::conditional_t comes from <type_traits>, which this header does not include directly.
+template <auto bitCount>
+using unsigned_bits =
+    std::conditional_t<bitCount <= 8, std::uint8_t,
+                       std::conditional_t<bitCount <= 16, std::uint16_t,
+                                          std::conditional_t<bitCount <= 32, std::uint32_t,
+                                                             std::conditional_t<bitCount <= 64, std::uint64_t, void>>>>;
+
+// Smallest of int8_t/int16_t/int32_t/int64_t with at least bitCount bits; void when bitCount exceeds 64.
+template <auto bitCount>
+using signed_bits =
+    std::conditional_t<bitCount <= 8, std::int8_t,
+                       std::conditional_t<bitCount <= 16, std::int16_t,
+                                          std::conditional_t<bitCount <= 32, std::int32_t,
+                                                             std::conditional_t<bitCount <= 64, std::int64_t, void>>>>;
+}  // namespace quicktex
--- a/setup.py
+++ b/setup.py
@ -46,7 +46,6 @@ class CMakeBuild(build_ext):
            "-DPython_EXECUTABLE={}".format(sys.executable),
            "-DPython_ROOT_DIR={}".format(os.path.dirname(sys.executable)),
            "-DQUICKTEX_VERSION_INFO={}".format(version),  # include version info in module
-            "-DQUICKTEX_MODULE_ONLY=TRUE",  # only build the module, not the wrapper
            "-DCMAKE_BUILD_TYPE={}".format(cfg),  # not used on MSVC, but no harm
            # clear cached make program binary, see https://github.com/pypa/setuptools/issues/2912
            "-U",
@ -54,6 +53,9 @@ class CMakeBuild(build_ext):
        ]
        build_args = []

+        if self.verbose:
+            build_args += ["--verbose"]
+
        if self.compiler.compiler_type != "msvc":
            # Using Ninja-build since it a) is available as a wheel and b)
            # multithreads automatically. MSVC would require all variables be
@ -64,6 +66,9 @@ class CMakeBuild(build_ext):
                cmake_args += ["-GNinja"]

        else:
+            # if 'CC' in os.environ and 'clang-cl' in os.environ['CC']:
+            #     cmake_args += ["-T", 'ClangCL']  # https://stackoverflow.com/a/64189112/7645957
+
            # Single config generators are handled "normally"
            single_config = any(x in cmake_generator for x in {"NMake", "Ninja"})

--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -0,0 +1,28 @@
+# Builds the C++ unit test executable ("Test") against GoogleTest and registers
+# its test cases with CTest.
+
+include(FetchContent)
+FetchContent_Declare(
+        googletest
+        URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip
+)
+# For Windows: Prevent overriding the parent project's compiler/linker settings
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(googletest)
+
+# CONFIGURE_DEPENDS re-checks the globs on every build, so test files that are
+# added or removed are picked up without re-running CMake by hand
+file(GLOB_RECURSE TEST_HEADER_FILES CONFIGURE_DEPENDS "**.h")
+file(GLOB_RECURSE TEST_SOURCE_FILES CONFIGURE_DEPENDS "**.cpp")
+file(GLOB_RECURSE TEST_PYTHON_FILES CONFIGURE_DEPENDS "**.py")
+
+# Organize source files together for some IDEs
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${TEST_SOURCE_FILES} ${TEST_HEADER_FILES} ${TEST_PYTHON_FILES})
+
+add_executable(Test ${TEST_SOURCE_FILES} ${TEST_HEADER_FILES})
+
+# Sanitizers are only enabled for non-MSVC debug builds.
+# PRIVATE: nothing links against the test executable, so there is nothing to propagate to.
+if ((NOT MSVC) AND (CMAKE_BUILD_TYPE MATCHES Debug))
+    target_compile_options(Test PRIVATE -fsanitize=address,undefined -fno-sanitize-recover=address,undefined -fno-omit-frame-pointer)
+    target_link_options(Test PRIVATE -fsanitize=address,undefined -fno-sanitize-recover=address,undefined -fno-omit-frame-pointer)
+endif ()
+
+target_link_libraries(Test PRIVATE quicktex gtest_main)
+
+# Let CTest discover the individual test cases from the built executable
+include(GoogleTest)
+gtest_discover_tests(Test)
+
--- a/tests/ctest/TestMatrix.cpp
+++ b/tests/ctest/TestMatrix.cpp
@ -0,0 +1,227 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <Matrix.h>
+#include <gtest/gtest.h>
+#include <util/math.h>
+
+#include <array>
+#include <cstdlib>
+
+namespace quicktex::tests {
+
+#define EXPECT_MATRIX_EQ(value, expected)                                           \
+    {                                                                               \
+        auto v = value;                                                             \
+        auto e = expected;                                                          \
+        if constexpr (std::is_floating_point_v<typename decltype(v)::value_type>) { \
+            for (unsigned i = 0; i < v.elements; i++) {                             \
+                EXPECT_FLOAT_EQ(v.element(i), e.element(i)) << "At index " << i;    \
+            }                                                                       \
+        } else {                                                                    \
+            EXPECT_EQ(v, e);                                                        \
+        }                                                                           \
+    }
+
+// Returns the n-th Fibonacci number (fibn(0) == 0, fibn(1) == 1).
+// Computed iteratively in O(n) steps instead of by double recursion, which needed an
+// exponential number of calls. Unsigned overflow wraps, so results are unchanged.
+constexpr size_t fibn(size_t n) {
+    size_t current = 0;
+    size_t next = 1;
+    for (size_t step = 0; step < n; step++) {
+        const size_t sum = current + next;
+        current = next;
+        next = sum;
+    }
+    return current;
+}
+
+// Returns `n` multiplied by itself, converted back to T.
+template <typename T>
+constexpr T sqr(T n) {
+    return n * n;
+}
+
+// Calls `f` once for every argument in `args`, left to right, discarding any results.
+template <typename Op, typename... Args>
+constexpr void foreach (Op f, Args... args) {
+    (f(args), ...);
+}
+
+// Typed test fixture for Matrix/Vec arithmetic over the scalar type T.
+// Provides generators for simple element patterns and a helper that runs a callable
+// once for each matrix/vector shape under test.
+template <typename T>
+class MatrixTest : public testing::Test {
+   public:
+    using Scalar = T;
+    template <size_t M>
+    using Vec = quicktex::Vec<T, M>;
+    template <size_t M, size_t N>
+    using Matrix = quicktex::Matrix<T, M, N>;
+
+    // Builds an M whose i-th element is (i + start) * stride
+    template <typename M>
+    constexpr M iota(T start = 0, T stride = 1) {
+        M filled(0);
+        for (unsigned idx = 0; idx < M::elements; ++idx) {
+            const auto offset = static_cast<T>(idx) + start;
+            filled.element(idx) = offset * stride;
+        }
+        return filled;
+    }
+
+    // Builds an M whose i-th element is (i + start)^2 * stride
+    template <typename M>
+    constexpr M sqr(T start = 0, T stride = 1) {
+        M filled(0);
+        for (unsigned idx = 0; idx < M::elements; ++idx) {
+            const auto base = static_cast<T>(idx) + start;
+            filled.element(idx) = base * base * stride;
+        }
+        return filled;
+    }
+
+    // Builds an M whose i-th element is the (i + start)-th Fibonacci number
+    template <typename M>
+    constexpr M fib(T start = 0) {
+        M filled(0);
+        for (unsigned idx = 0; idx < M::elements; ++idx) {
+            filled.element(idx) = fibn(idx + start);
+        }
+        return filled;
+    }
+
+    // One zero-filled instance of every shape the tests run against:
+    // 4- and 7-element vectors, and 4x4 and 5x6 matrices
+    static constexpr auto sizes = std::make_tuple(Vec<4>(0), Vec<7>(0), Matrix<4, 4>(0), Matrix<5, 6>(0));
+
+    // Calls `f` once with each entry of `sizes`, in order
+    template <typename Op>
+    constexpr void foreach_size(Op f) {
+        auto call_on_each = [f]<typename... Shapes>(Shapes... shapes) { (f(shapes), ...); };
+        std::apply(call_on_each, sizes);
+    }
+};
+
+// Scalar element types that every MatrixTest case is instantiated for
+using Scalars = ::testing::Types<uint8_t, int8_t, uint16_t, int16_t, uint32_t, int32_t, float, double>;
+TYPED_TEST_SUITE(MatrixTest, Scalars);
+
+// Shorthands for calling the fixture's pattern generators (iota, sqr, fib) with matrix
+// type M from inside a TYPED_TEST body. They expand to a call through `this`, so they
+// can only be used where `this` refers to the test fixture.
+#define IOTA(M, start, stride) this->TestFixture::template iota<M>(start, stride)
+#define SQR(M, start, stride) this->TestFixture::template sqr<M>(start, stride)
+#define FIB(M, start) this->TestFixture::template fib<M>(start)
+
+// Unary minus: negating an iota pattern must equal the iota pattern with the opposite
+// stride. Skipped for unsigned scalar types, which cannot hold negative strides.
+TYPED_TEST(MatrixTest, negate) {
+    if constexpr (std::unsigned_integral<typename TestFixture::Scalar>) {
+        GTEST_SKIP();
+    } else {
+        TestFixture::foreach_size([&]<typename M>(M) {
+            EXPECT_MATRIX_EQ(-IOTA(M, 0, 1), IOTA(M, 0, -1));
+            EXPECT_MATRIX_EQ(-IOTA(M, 0, -1), IOTA(M, 0, 1));
+        });
+    }
+}
+
+// Element-wise addition: adding two iota patterns must give the iota pattern whose
+// stride is the sum of the two strides. The negative-stride case is only compiled
+// for element types that are not unsigned integers.
+TYPED_TEST(MatrixTest, add) {
+    TestFixture::foreach_size([&]<typename M>(M) {
+        EXPECT_MATRIX_EQ(IOTA(M, 0, 1) + IOTA(M, 0, 3), IOTA(M, 0, 4));
+        EXPECT_MATRIX_EQ(IOTA(M, 0, 2) + IOTA(M, 0, 2), IOTA(M, 0, 4));
+        if constexpr (!std::unsigned_integral<typename M::value_type>) {
+            EXPECT_MATRIX_EQ(IOTA(M, 0, 3) + IOTA(M, 0, -1), IOTA(M, 0, 2));
+        }
+    });
+}
+
+// Element-wise subtraction: subtracting two iota patterns must give the iota pattern
+// whose stride is the difference of the two strides. Cases with a negative operand or
+// a negative result are only compiled for element types that are not unsigned integers.
+TYPED_TEST(MatrixTest, subtract) {
+    TestFixture::foreach_size([&]<typename M>(M) {
+        EXPECT_MATRIX_EQ(IOTA(M, 0, 4) - IOTA(M, 0, 1), IOTA(M, 0, 3));
+        EXPECT_MATRIX_EQ(IOTA(M, 0, 2) - IOTA(M, 0, 2), IOTA(M, 0, 0));
+        if constexpr (!std::unsigned_integral<typename M::value_type>) {
+            EXPECT_MATRIX_EQ(IOTA(M, 0, 3) - IOTA(M, 0, -1), IOTA(M, 0, 4));
+            EXPECT_MATRIX_EQ(IOTA(M, 0, 1) - IOTA(M, 0, 3), IOTA(M, 0, -2));
+        }
+    });
+}
+
+// Multiplication by a scalar and element-wise multiplication of two matrices.
+// The product of two iota patterns starting at 0 is a sqr pattern whose stride is the
+// product of the two strides.
+TYPED_TEST(MatrixTest, multiply) {
+    TestFixture::foreach_size([&]<typename M>(M) {
+        EXPECT_MATRIX_EQ(IOTA(M, 0, 2) * 2, IOTA(M, 0, 4));
+        EXPECT_MATRIX_EQ(IOTA(M, 0, 2) * 0, M(0));
+
+        if constexpr (!std::is_unsigned_v<typename M::value_type>) {
+            EXPECT_MATRIX_EQ(IOTA(M, 0, 2) * -2, IOTA(M, 0, -4));
+        }
+
+        // Each guard below skips its cases when the largest expected element,
+        // (elements - 1)^2 times the stride, does not fit in the element type
+        if constexpr (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements - 1)) {
+            EXPECT_MATRIX_EQ(IOTA(M, 0, 1) * IOTA(M, 0, 1), SQR(M, 0, 1));
+        }
+
+        if constexpr (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements - 1) * 3) {
+            EXPECT_MATRIX_EQ(IOTA(M, 0, 1) * IOTA(M, 0, 3), SQR(M, 0, 3));
+            EXPECT_MATRIX_EQ(IOTA(M, 0, 0) * IOTA(M, 0, 3), SQR(M, 0, 0));
+        }
+
+        if constexpr (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements - 1) * 4) {
+            EXPECT_MATRIX_EQ(IOTA(M, 0, 2) * IOTA(M, 0, 2), SQR(M, 0, 4));
+            if constexpr (!std::is_unsigned_v<typename M::value_type>) {
+                EXPECT_MATRIX_EQ(IOTA(M, 0, 4) * IOTA(M, 0, -1), SQR(M, 0, -4));
+                EXPECT_MATRIX_EQ(IOTA(M, 0, -4) * IOTA(M, 0, -1), SQR(M, 0, 4));
+            }
+        }
+    });
+}
+
+// Division by a scalar and element-wise division of two matrices.
+// Matrix divisors use patterns that start at 1 so that no element of the divisor is 0.
+TYPED_TEST(MatrixTest, divide) {
+    TestFixture::foreach_size([&]<typename M>(M) {
+        EXPECT_MATRIX_EQ(IOTA(M, 0, 4) / 2, IOTA(M, 0, 2));
+        EXPECT_MATRIX_EQ(IOTA(M, 0, 2) / 1, IOTA(M, 0, 2));
+
+        if constexpr (!std::is_unsigned_v<typename M::value_type>) {
+            EXPECT_MATRIX_EQ(IOTA(M, 0, 4) / -2, IOTA(M, 0, -2));
+            EXPECT_MATRIX_EQ(IOTA(M, 0, -4) / -2, IOTA(M, 0, 2));
+        }
+
+        // Each guard below skips its cases when the largest dividend element,
+        // elements^2 times the stride, does not fit in the element type
+        if constexpr (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements)) {
+            EXPECT_MATRIX_EQ(SQR(M, 1, 1) / IOTA(M, 1, 1), IOTA(M, 1, 1));
+        }
+
+        if constexpr (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements) * 3) {
+            EXPECT_MATRIX_EQ(SQR(M, 1, 3) / IOTA(M, 1, 1), IOTA(M, 1, 3));
+            EXPECT_MATRIX_EQ(SQR(M, 1, 3) / IOTA(M, 1, 3), IOTA(M, 1, 1));
+        }
+
+        if constexpr (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements) * 4) {
+            EXPECT_MATRIX_EQ(SQR(M, 1, 4) / IOTA(M, 1, 2), IOTA(M, 1, 2));
+            if constexpr (!std::is_unsigned_v<typename M::value_type>) {
+                EXPECT_MATRIX_EQ(SQR(M, 1, -4) / IOTA(M, 1, -1), IOTA(M, 1, 4));
+                EXPECT_MATRIX_EQ(SQR(M, 1, 4) / IOTA(M, 1, -1), IOTA(M, 1, -4));
+            }
+        }
+    });
+}
+
+// abs(): a negative-stride iota pattern must map to its positive counterpart, and an
+// already non-negative pattern must be unchanged. Skipped for unsigned scalar types.
+TYPED_TEST(MatrixTest, abs) {
+    if constexpr (std::unsigned_integral<typename TestFixture::Scalar>) {
+        GTEST_SKIP();
+    } else {
+        TestFixture::foreach_size([&]<typename M>(M) {
+            EXPECT_MATRIX_EQ(IOTA(M, 0, -1).abs(), IOTA(M, 0, 1));
+            EXPECT_MATRIX_EQ(IOTA(M, 0, 1).abs(), IOTA(M, 0, 1));
+        });
+    }
+}
+
+// clamp() with scalar bounds and with per-element (matrix) bounds: values already in
+// range are kept, values above the upper bound are pulled down to it.
+TYPED_TEST(MatrixTest, clamp) {
+    TestFixture::foreach_size([&]<typename M>(M) {
+        EXPECT_MATRIX_EQ(IOTA(M, 0, 1).clamp(0, M::elements - 1), IOTA(M, 0, 1));
+        EXPECT_MATRIX_EQ(IOTA(M, 0, 1).clamp(M(0), IOTA(M, 0, 1)), IOTA(M, 0, 1));
+        EXPECT_MATRIX_EQ(IOTA(M, 0, 2).clamp(IOTA(M, 0, 1), IOTA(M, 0, 3)), IOTA(M, 0, 2));
+        EXPECT_MATRIX_EQ(IOTA(M, 0, 3).clamp(IOTA(M, 0, 1), IOTA(M, 0, 2)), IOTA(M, 0, 2));
+        EXPECT_MATRIX_EQ(IOTA(M, 0, 1).clamp(M(0), M(0)), M(0));
+        // The Fibonacci and square inputs are only checked at runtime when their
+        // largest element fits in the element type
+        if (std::numeric_limits<typename M::value_type>::max() >= fibn(M::elements)) {
+            EXPECT_MATRIX_EQ(FIB(M, 1).clamp(M(0), IOTA(M, 0, 1)), IOTA(M, 0, 1));
+        }
+        if (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements - 1)) {
+            EXPECT_MATRIX_EQ(SQR(M, 0, 1).clamp(M(0), IOTA(M, 0, 1)), IOTA(M, 0, 1));
+        }
+    });
+}
+
+// Matrix product: multiplying a height x height identity matrix with an iota pattern
+// via mult() must return the pattern unchanged.
+TYPED_TEST(MatrixTest, matrix_multiply) {
+    TestFixture::foreach_size([&]<typename M>(M) {
+        auto identity = Matrix<typename M::value_type, M::height, M::height>::identity();
+        EXPECT_MATRIX_EQ(identity.mult(IOTA(M, 0, 1)), IOTA(M, 0, 1));
+    });
+}
+
+// sum(): checks all-ones, all-zeros, the sequence 1..elements and (for element types
+// that are not unsigned integers) all-minus-ones. EXPECT_FLOAT_EQ is used for every
+// scalar type.
+TYPED_TEST(MatrixTest, sum) {
+    TestFixture::foreach_size([&]<typename M>(M) {
+        EXPECT_FLOAT_EQ(M(1).sum(), M::elements);
+        EXPECT_FLOAT_EQ(M(0).sum(), 0);
+
+        // 1 + 2 + ... + elements == elements * (elements + 1) / 2; only checked when
+        // that total does not exceed the element type's maximum
+        if (std::numeric_limits<typename M::value_type>::max() >= M::elements * (M::elements + 1) / 2) {
+            EXPECT_FLOAT_EQ(IOTA(M, 1, 1).sum(), M::elements * (M::elements + 1) / 2);
+        }
+
+        if constexpr (!std::unsigned_integral<typename M::value_type>) {
+            EXPECT_FLOAT_EQ(M(-1).sum(), -1 * (int)M::elements);
+        }
+    });
+}
+// endregion
+}  // namespace quicktex::tests
--- a/tests/ctest/TestSIMD.cpp
+++ b/tests/ctest/TestSIMD.cpp
@ -0,0 +1,81 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <gtest/gtest.h>
+#include <util/math.h>
+#include <util/simd.h>
+#include <util/types.h>
+
+#include <array>
+#include <cstdint>
+#include <limits>
+#include <numeric>
+#include <vector>
+#include <xsimd/xsimd.hpp>
+
+namespace quicktex::tests {
+
+// Builds the input data for the SIMD tests: a list of arrays, each exactly as wide as
+// one xsimd batch of T. Every scalar type gets an ascending 1..N run, all ones, all
+// zeros and all max(); signed types additionally get an ascending -N..-1 run, all -1
+// and all min().
+template <typename T> constexpr auto make_arrays() {
+    std::vector<std::array<T, xsimd::batch<T>::size>> arrays;
+    std::array<T, xsimd::batch<T>::size> buffer{};
+
+    std::iota(buffer.begin(), buffer.end(), 1);
+    arrays.push_back(buffer);
+
+    buffer.fill(1);
+    arrays.push_back(buffer);
+
+    buffer.fill(0);
+    arrays.push_back(buffer);
+
+    buffer.fill(std::numeric_limits<T>::max());
+    arrays.push_back(buffer);
+
+    // `if constexpr` keeps the negative-value cases from being instantiated at all for
+    // unsigned T, instead of compiling them and relying on a runtime branch
+    if constexpr (std::is_signed_v<T>) {
+        std::iota(buffer.begin(), buffer.end(), -1 * static_cast<int>(xsimd::batch<T>::size));
+        arrays.push_back(buffer);
+
+        buffer.fill(-1);
+        arrays.push_back(buffer);
+
+        buffer.fill(std::numeric_limits<T>::min());
+        arrays.push_back(buffer);
+    }
+
+    return arrays;
+}
+
+// Defines a test `simd.whadd_<TYPE>`: for every array from make_arrays<TYPE>(), loads
+// it into an xsimd batch and expects simd::whadd of the batch to equal the scalar sum
+// from std::accumulate, accumulated in next_size_t<TYPE>.
+// NOTE(review): next_size_t is presumably the next wider integer type, so the
+// reference sum cannot overflow - confirm against util/types.h.
+#define TEST_WHADD(TYPE)                                                                            \
+    TEST(simd, whadd_##TYPE) {                                                                      \
+        for (auto arr : make_arrays<TYPE>()) {                                                      \
+            auto v = xsimd::load_unaligned(&arr[0]);                                                \
+            auto vsum = simd::whadd(v);                                                             \
+            auto ssum = std::accumulate(arr.begin(), arr.end(), static_cast<next_size_t<TYPE>>(0)); \
+            EXPECT_EQ(vsum, ssum);                                                                  \
+        }                                                                                           \
+    }
+
+// Instantiate the whadd test for each 8-, 16- and 32-bit integer type
+TEST_WHADD(int8_t)
+TEST_WHADD(uint8_t)
+TEST_WHADD(int16_t)
+TEST_WHADD(uint16_t)
+TEST_WHADD(int32_t)
+TEST_WHADD(uint32_t)
+
+}  // namespace quicktex::tests
--- a/tests/test_bc1.py
+++ b/tests/test_bc1.py
@ -138,20 +138,22 @@ class TestBC1Texture:
 class TestBC1Encoder:
    """Test BC1Encoder"""

-    def test_block_4color(self, color_mode):
+    @pytest.mark.parametrize('level', range(18))
+    def test_block_4color(self, level, color_mode):
        """Test encoder output with 4 color greyscale test block"""
-        encoder = BC1Encoder(color_mode=color_mode)
+        encoder = BC1Encoder(level, color_mode)
        out_tex = encoder.encode(BC1Blocks.greyscale.texture)
        out_block = out_tex[0, 0]

        assert out_tex.size_blocks == (1, 1)

        assert not out_block.is_3color
-        assert out_block == BC1Blocks.greyscale.block
+        assert out_block.tobytes() == BC1Blocks.greyscale.block.tobytes()

-    def test_block_3color(self, color_mode):
+    @pytest.mark.parametrize('level', range(2, 18))  # lowest 2 levels can be improved, but right now choke on this test
+    def test_block_3color(self, level, color_mode):
        """Test encoder output with 3 color test block"""
-        encoder = BC1Encoder(color_mode=color_mode)
+        encoder = BC1Encoder(level, color_mode)
        out_tex = encoder.encode(BC1Blocks.three_color.texture)
        out_block = out_tex[0, 0]

@ -160,13 +162,14 @@ class TestBC1Encoder:
        if encoder.color_mode != BC1Encoder.ColorMode.FourColor:
            # we only care about the selectors if we are in 3 color mode
            assert out_block.is_3color
-            assert out_block == BC1Blocks.three_color.block
+            assert out_block.tobytes() == BC1Blocks.three_color.block.tobytes()
        else:
            assert not out_block.is_3color

-    def test_block_3color_black(self, color_mode):
+    @pytest.mark.parametrize('level', range(2, 18))  # lowest 2 levels can be improved, but right now choke on this test
+    def test_block_3color_black(self, level, color_mode):
        """Test encoder output with 3 color test block with black pixels"""
-        encoder = BC1Encoder(color_mode=color_mode)
+        encoder = BC1Encoder(level, color_mode)
        out_tex = encoder.encode(BC1Blocks.three_color_black.texture)
        out_block = out_tex[0, 0]

@ -178,7 +181,7 @@ class TestBC1Encoder:
            # we only care about the selectors if we are in 3 color black mode
            assert out_block.is_3color
            assert has_black
-            assert out_block == BC1Blocks.three_color_black.block
+            assert out_block.tobytes() == BC1Blocks.three_color_black.block.tobytes()
        elif color_mode == BC1Encoder.ColorMode.ThreeColor:
            assert not (has_black and out_block.is_3color)
        else:
--- a/tests/test_install.py
+++ b/tests/test_install.py
@ -1,9 +1,12 @@
 """Test if everything is installed correctly"""
+import _quicktex
+import pytest

 import quicktex


 class TestInstall:
+    @pytest.mark.skipif(_quicktex._debug_build, reason="Debug builds dont have valid version strings")
    def test_version(self):
        """Test if the extension module version matches what setuptools returns"""
        try:
@ -16,4 +19,4 @@ class TestInstall:

        version = metadata.version('quicktex')

-        assert version == quicktex.__version__, 'incorrect version string from extension module'
+        assert version == quicktex.__version__
--- a/tools/CompilerWarnings.cmake
+++ b/tools/CompilerWarnings.cmake
@ -37,6 +37,7 @@ function(set_project_warnings project_name)
            /w14928 # illegal copy-initialization; more than one user-defined
            # conversion has been implicitly applied
            /permissive- # standards conformance mode for MSVC compiler.
+            /wd4701 # uninitialized variable checker is trigger-happy
            )

    set(CLANG_WARNINGS
@ -52,13 +53,14 @@ function(set_project_warnings project_name)
            -Wunused         # warn on anything being unused
            -Woverloaded-virtual # warn if you overload (not override) a virtual
            # function
-            -Wpedantic   # warn if non-standard C++ is used
+            # -Wpedantic   # warn if non-standard C++ is used
            #-Wconversion # warn on type conversions that may lose data
            #-Wsign-conversion  # warn on sign conversions
            -Wnull-dereference # warn if a null dereference is detected
            -Wdouble-promotion # warn if float is implicit promoted to double
            -Wformat=2 # warn on security issues around functions that format output
            # (ie printf)
+            -Wsign-compare
            )

    if (${PROJECT_NAME}_WARNINGS_AS_ERRORS)
@ -74,7 +76,7 @@ function(set_project_warnings project_name)
            -Wduplicated-branches # warn if if / else branches have duplicated code
            -Wlogical-op   # warn about logical operations being used where bitwise were
            # probably wanted
-            -Wuseless-cast # warn if you perform a cast to the same type
+            # -Wuseless-cast # warn if you perform a cast to the same type
            )

    if (MSVC)
--- a/tools/SIMDFlags.cmake
+++ b/tools/SIMDFlags.cmake
@ -0,0 +1,68 @@
+# set_simd_flags(<target>)
+#
+# Adds the compiler flags that select the SIMD instruction set for <target>.
+# The mode is read from the QUICKTEX_SIMD_MODE environment variable and defaults to
+# AUTO when the variable is not set. Recognized modes: AUTO, SCALAR, SSSE3, SSE4, AVX2.
+# SCALAR applies to every architecture; the other modes only add flags on x86.
+function(set_simd_flags target_name)
+    if (DEFINED ENV{QUICKTEX_SIMD_MODE})
+        set(simd_mode $ENV{QUICKTEX_SIMD_MODE})
+        message("SIMD mode is ${simd_mode}")
+    else ()
+        message("Defaulting to AUTO SIMD mode. Resulting binary is not fit for distributing to other computers!")
+        set(simd_mode "AUTO")
+    endif ()
+
+    # A misspelled or wrongly-cased mode would otherwise silently fall through every
+    # branch below and produce a build without the requested flags
+    if (NOT simd_mode MATCHES "^(AUTO|SCALAR|SSSE3|SSE4|AVX2)$")
+        message(WARNING "Unrecognized QUICKTEX_SIMD_MODE '${simd_mode}', no SIMD compiler flags will be set. "
+                "Expected one of AUTO, SCALAR, SSSE3, SSE4 or AVX2.")
+    endif ()
+
+    if ((CMAKE_OSX_ARCHITECTURES MATCHES "x86_64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)"))
+        set(X86 TRUE)
+        message("X86 Detected")
+    else ()
+        set(X86 FALSE)
+    endif ()
+
+    if ((CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "(arm64)|(ARM64)|(aarch64)"))
+        set(ARM TRUE)
+        message("ARM Detected")
+    else ()
+        set(ARM FALSE)
+    endif ()
+
+    if (simd_mode STREQUAL "SCALAR")
+        # force xsimd to use scalar ops. This should really only be used for testing,
+        # since SSE2 and NEON are guaranteed on 64-bit platforms
+        if (MSVC)
+            target_compile_options(${target_name} PUBLIC /DXSIMD_NO_SUPPORTED_ARCHITECTURE=1)
+        else ()
+            target_compile_options(${target_name} PUBLIC -DXSIMD_NO_SUPPORTED_ARCHITECTURE=1)
+        endif ()
+        return()
+    endif ()
+
+    if (X86)
+        if (simd_mode STREQUAL "AUTO")
+            if (MSVC)
+                # MSVC has no -march=native equivalent. womp
+            elseif (NOT ARM)
+                # setting -march=native on an M1 causes Clang to freak out,
+                # and arm64 is pretty samey instruction set wise (arm9 and SVE2 notwithstanding)
+
+                # Currently AVX512 will cause problems with buffer overruns,
+                # and I don't have good test hardware for it anyway
+
+                target_compile_options(${target_name} PUBLIC -march=native -mno-avx512f)
+            endif ()
+        elseif (simd_mode STREQUAL "SSSE3")
+            if (MSVC)
+                target_compile_options(${target_name} PUBLIC /DXSIMD_WITH_SSSE3)
+            else ()
+                target_compile_options(${target_name} PUBLIC -mssse3)
+            endif ()
+        elseif (simd_mode STREQUAL "SSE4")
+            if (MSVC)
+                target_compile_options(${target_name} PUBLIC /DXSIMD_WITH_SSE4_2 /d2archSSE42)
+            else ()
+                target_compile_options(${target_name} PUBLIC -msse4)
+            endif ()
+        elseif (simd_mode STREQUAL "AVX2")
+            if (MSVC)
+                target_compile_options(${target_name} PUBLIC /arch:AVX2)
+            else ()
+                target_compile_options(${target_name} PUBLIC -mavx2)
+            endif ()
+        endif ()
+    endif ()
+endfunction()
Author	SHA1	Message	Date
Andrew Cassidy	c92d58d115	Rework ranges library Better matches the standard library, and iterators moved to their own file	2022-07-05 22:51:25 -07:00
Andrew Cassidy	9b3c1d0ca3	Add subrange template	2022-07-03 19:08:15 -07:00
Andrew Cassidy	db24af730e	Use int for sizes in matrix type	2022-07-03 11:56:37 -07:00
Andrew Cassidy	f77ea3be0f	MSVC is a joke For some reason index variables need to be signed?	2022-07-02 17:14:12 -07:00
Andrew Cassidy	6afe4851bd	Address some of the more annoying gcc warnings	2022-07-02 17:02:28 -07:00
Andrew Cassidy	3a27a89155	oopsz	2022-07-01 20:11:30 -07:00
Andrew Cassidy	768248c20d	Add option to enable sanitization in python module this is off by default, since it requires juggling some platform-specific environment variables	2022-07-01 19:30:21 -07:00
Andrew Cassidy	0bd0c6846f	you win this time, GCC	2022-06-30 23:46:24 -07:00
Andrew Cassidy	dcd9bf4287	Enable sanitizers in tests These all throw a fit when pointed at cpython unfortunately	2022-06-30 23:39:42 -07:00
Andrew Cassidy	32a411634e	Fix mistaken use of max() instead of max_element()	2022-06-30 21:54:55 -07:00
Andrew Cassidy	bac61eb0fe	Try to make testing return better errors	2022-06-28 17:47:37 -07:00
Andrew Cassidy	2cfcd26a90	Sum and matrix multiply tests	2022-06-28 17:08:30 -07:00
Andrew Cassidy	3119ba1a6c	oops	2022-06-28 16:28:04 -07:00
Andrew Cassidy	bfba3228f0	Last attempt	2022-06-28 16:22:29 -07:00
Andrew Cassidy	2d7aeeb2d8	attempt to make MSVC happy	2022-06-28 16:06:45 -07:00
Andrew Cassidy	3849303a9b	Workaround for GCC Technically this syntax isnt required, but GCC has a bug that hasnt been fixed since 2015. see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67274	2022-06-28 15:48:24 -07:00
Andrew Cassidy	c41e023735	Fix VecBase constructor to only take the scalar type	2022-06-28 15:04:08 -07:00
Andrew Cassidy	0ee45ba966	Add improved, generalized arithmetic matrix tests	2022-06-28 14:57:55 -07:00
Andrew Cassidy	73441d1ed3	MSVC stop being annoying	2022-06-26 20:17:56 -07:00
Andrew Cassidy	54d61e0bd3	Fix iterators maybe	2022-06-26 19:50:36 -07:00
Andrew Cassidy	487f05c90a	Pass config to ctest for windows	2022-06-26 19:33:43 -07:00
Andrew Cassidy	b5a55f606c	attempt 2	2022-06-26 19:27:25 -07:00
Andrew Cassidy	3ab354db74	Run ctest from build directory	2022-06-26 19:21:04 -07:00
Andrew Cassidy	ef8a41fe03	Run tests verbosely	2022-06-26 19:08:14 -07:00
Andrew Cassidy	598175739f	try using ctest im so close to giving up on windows support	2022-06-26 19:03:25 -07:00
Andrew Cassidy	963d985572	make MSVC shut up	2022-06-26 18:50:19 -07:00
Andrew Cassidy	143bde78d6	Fix running tests	2022-06-26 18:43:53 -07:00
Andrew Cassidy	a96aadc867	use gcc 10	2022-06-26 18:34:06 -07:00
Andrew Cassidy	29741447cd	syntax error	2022-06-26 18:13:38 -07:00
Andrew Cassidy	9011718c09	add NOPYTHON option	2022-06-26 18:09:40 -07:00
Andrew Cassidy	8b2c240094	syntax error	2022-06-26 18:02:40 -07:00
Andrew Cassidy	c97daa21ec	would help if i cloned the repo	2022-06-26 18:00:30 -07:00
Andrew Cassidy	f5defd2817	Move testing to its own step	2022-06-26 17:57:41 -07:00
Andrew Cassidy	49ba7e26b7	Run C tests in CI	2022-06-26 17:40:05 -07:00
Andrew Cassidy	bdd75ddddf	oops	2022-06-26 17:27:47 -07:00
Andrew Cassidy	e528c12b2d	Introduce custom map function	2022-06-26 17:16:02 -07:00
Andrew Cassidy	aec31a2fdc	improved clamp and sum	2022-06-24 21:41:43 -07:00
Andrew Cassidy	17663f4871	First trial run using the Matrix type	2022-06-22 22:32:19 -07:00
Andrew Cassidy	f2352f10fd	Smarter map function using variadics and chunking	2022-06-22 20:39:40 -07:00
Andrew Cassidy	3ceb028907	Attempt to batch some matrix ops	2022-06-22 00:39:36 -07:00
Andrew Cassidy	10ba6b2bd6	Remove utest	2022-06-20 18:42:31 -07:00
Andrew Cassidy	1c06cccd5c	More vector unit tests	2022-06-19 18:33:54 -07:00
Andrew Cassidy	232fb6cb41	use position independent code	2022-06-18 17:14:48 -07:00
Andrew Cassidy	19df5df68d	Rework project layout and tests	2022-06-18 17:06:17 -07:00
Andrew Cassidy	3756f31e20	matrix multiplication and transposition	2022-06-13 22:55:41 -07:00
Andrew Cassidy	2c59419bf0	Cleanup and replace Matrix.h with Vec.h Mysteriously this also (perhaps temporarily) fixed a CPU usage issue in Clion? I guess I'll take it	2022-06-12 20:01:56 -07:00
Andrew Cassidy	f767525aa1	Improved matrix/vector class	2022-06-12 17:06:53 -07:00
Andrew Cassidy	59fefae3f7	A	2022-06-09 22:48:52 -07:00
Andrew Cassidy	0bcfd50a44	I s2g I will make Stallman have a Nice Time	2022-06-08 23:15:51 -07:00
Andrew Cassidy	527067839f	GCC has a very toxic view on friendship as a concept tbh	2022-06-08 23:12:42 -07:00
Andrew Cassidy	a33cb8ea67	Add new vector ops with smarter type deduction	2022-06-08 23:07:43 -07:00
Andrew Cassidy	d293687424	Flesh out ranges library	2022-06-06 22:31:41 -07:00
Andrew Cassidy	6f075b6c1d	Reorganize and add Window class	2022-06-05 17:12:04 -07:00
Andrew Cassidy	f88212af85	ColorSet extension/alias for Matrix	2022-06-02 23:45:53 -07:00
Andrew Cassidy	d3515c1db8	Add matrix template	2022-06-02 23:15:57 -07:00
Andrew Cassidy	20305d2ea9	tweaks and formatting	2022-06-02 22:10:38 -07:00
Andrew Cassidy	abfe0b8d10	Header file reorganization	2022-06-01 23:50:35 -07:00
Andrew Cassidy	f097f71ba9	Assorted cleanup	2022-05-31 23:09:20 -07:00
Andrew Cassidy	961c2b7134	Build on macOS 12 This should build on older versions, so long as you have llvm 13. But the homebrew version of llvm the macos 11 runner has only includes x86 dylibs which cant be linked against when building for arm.	2022-05-31 01:16:18 -07:00
Andrew Cassidy	9388406769	oops	2022-05-31 01:13:21 -07:00
Andrew Cassidy	7430dccd5b	im going to break MSVC's kneecaps	2022-05-31 01:03:52 -07:00
Andrew Cassidy	fa0579ff03	thats not a dollar sign	2022-05-31 00:53:15 -07:00
Andrew Cassidy	9f7eb5fe57	Target LLVM 13	2022-05-31 00:49:30 -07:00
Andrew Cassidy	3b7164ffba	Refactor pack() and unpack()	2022-05-30 22:41:17 -07:00
Andrew Cassidy	dae507acc9	Don't build wheels for musl I don't understand the reason these don't compile	2022-05-29 20:37:49 -07:00
Andrew Cassidy	7eac371064	less iterators	2022-05-29 19:03:47 -07:00
Andrew Cassidy	53a6427dcc	iterators are confusing	2022-05-29 16:51:35 -07:00
Andrew Cassidy	b9c7c7cf6e	Fix broken constructor	2022-05-29 15:58:18 -07:00
Andrew Cassidy	debaa6b54d	Add Vector template class Also experimentally bump to C++20 just to see if it works on GCC 9.3	2022-05-29 15:54:55 -07:00
Andrew Cassidy	c96450b5fe	Rename Color to prepare for refactor	2022-05-25 23:42:06 -07:00
Andrew Cassidy	fffa291765	Fix LeastSquares mode and add tests for every quality level	2022-05-24 22:57:51 -07:00
Andrew Cassidy	829b5312b5	Tweak compiler warnings	2022-05-24 21:33:39 -07:00
Andrew Cassidy	c57106e3b2	Skip version number checking with debug builds Because I found a way to build the extension module directly which helps speed up development immensely	2022-05-24 20:47:55 -07:00
Andrew Cassidy	468414f339	Add arm whadd instructions for all sizes of integer	2022-05-23 23:42:14 -07:00
Andrew Cassidy	f9831b1f61	:unsmilebeale:	2022-05-23 00:46:23 -07:00
Andrew Cassidy	0046bef9d3	Cmake why	2022-05-23 00:41:46 -07:00
Andrew Cassidy	8f19ad6a1d	I cant spell This reverts commit `ed10899601`.	2022-05-23 00:37:28 -07:00
Andrew Cassidy	ed10899601	just never include windows.h tis a silly OS	2022-05-23 00:35:12 -07:00
Andrew Cassidy	2588ebcaa3	its late ok?	2022-05-23 00:28:01 -07:00
Andrew Cassidy	345344eef3	cleanup and prevent windows macros from stepping on everything	2022-05-23 00:21:33 -07:00
Andrew Cassidy	bf35983b2d	Add tests for c-level code	2022-05-22 22:20:30 -07:00
Andrew Cassidy	c2d4e9be4d	Merge branch 'feature/pytest' into feature/simd	2022-05-22 21:02:34 -07:00
Andrew Cassidy	c05879f1c1	Fixes and tweaks to whadd	2022-05-22 16:38:54 -07:00
Andrew Cassidy	aa6bd9602d	Add widening hadd	2022-05-21 21:23:22 -07:00
Andrew Cassidy	8c77356aca	Begin integrating xsimd	2022-05-20 20:46:45 -07:00
Andrew Cassidy	79f77a24b2	Remove Highway	2022-05-20 20:18:18 -07:00
Andrew Cassidy	04fece2771	Move clang-cl detection to cmake and allow it when setting flags	2022-05-16 23:41:42 -07:00
Andrew Cassidy	7ba2225644	Fix typo	2022-05-16 23:18:54 -07:00
Andrew Cassidy	4b3e236275	Tell Cmake to use clang-cl when requested	2022-05-16 23:13:40 -07:00
Andrew Cassidy	b2523dbe19	Try something	2022-05-16 23:05:09 -07:00
Andrew Cassidy	bcdfcb95fb	h	2022-05-16 21:33:33 -07:00
Andrew Cassidy	e5f1a45c6b	raspberry noise	2022-05-16 21:32:01 -07:00
Andrew Cassidy	014f7063fd	Build wheels for SSE4 and using clang on windows	2022-05-16 21:29:45 -07:00
Andrew Cassidy	74aaac00d7	Fix AVX2 bug and add cmake SIMD ISA selection By default, quicktex builds with -march=native on x86, unless an environment variable requesting a specific ISA is set. This doesnt work on MSVC though because it's a shit compiler, so it just falls back to no flags.	2022-05-16 20:52:31 -07:00
Andrew Cassidy	8168d6e249	Add emulated 128-bit support and fix x86	2022-05-15 20:03:05 -07:00
Andrew Cassidy	f7b0cbe76b	add widening horizontal add for s16 vectors	2022-05-15 18:08:36 -07:00
Andrew Cassidy	bc925d3949	Include Highway in build	2022-05-15 00:43:28 -07:00
Andrew Cassidy	7f75104d18	Spell external correctly and exclude all affinity designer files	2022-05-13 00:51:39 -07:00
Andrew Cassidy	643276660a	Include submodules in sdists and github actions	2022-05-13 00:36:00 -07:00
Andrew Cassidy	9789ecd159	Add Highway dependency	2022-05-13 00:32:35 -07:00