Release 0.2.1

### Fixed

- Fixed broken transparency on palettized PNG files

### Changed

- Changed which wheels are built by the CI. There are no changes to OS or Python version compatibility if you compile from source.
  - Stopped building Python 3.7 wheels
  - Stopped building macOS universal wheels
  - Wheels for macOS now require macOS 12 or later
  - Included macOS ARM wheels
  - Included Python 3.12 wheels
Skip Python 3.7 because it's EOL
2024-09-13 06:37:34 +00:00 · 2024-06-02 18:58:48 -07:00 · 2024-06-02 17:56:13 -07:00 · 2024-06-02 17:43:33 -07:00 · 2024-06-02 17:32:00 -07:00 · 2023-06-21 15:46:13 -07:00
83 changed files with 1059 additions and 3424 deletions
--- a/.clang-format
+++ b/.clang-format
@ -1,11 +1,10 @@
 ---
 BasedOnStyle: google
 IndentWidth: 4
-ColumnLimit: 120
+ColumnLimit: 160
 AllowShortBlocksOnASingleLine: Always
 AllowShortFunctionsOnASingleLine: All
 AlwaysBreakTemplateDeclarations: MultiLine
-#RequiresClausePositionStyle: SingleLine # requires Clang 15 :(
 #AlignConsecutiveDeclarations: true
 ---

--- a/.clang-tidy
+++ b/.clang-tidy
@ -2,15 +2,15 @@ FormatStyle: google

 Checks: '-*,clang-diagnostic-*,llvm-*,misc-*,-misc-unused-parameters,-misc-non-private-member-variables-in-classes,readability-identifier-naming,cppcoreguidelines-narrowing-conversions'
 CheckOptions:
-  - { key: readability-identifier-naming.NamespaceCase,            value: lower_case }
-  - { key: readability-identifier-naming.ClassCase,                value: CamelCase }
-  - { key: readability-identifier-naming.StructCase,               value: CamelCase }
-  - { key: readability-identifier-naming.TemplateParameterCase,    value: CamelCase }
-  - { key: readability-identifier-naming.FunctionCase,             value: lower_case }
-  - { key: readability-identifier-naming.VariableCase,             value: lower_case }
-  - { key: readability-identifier-naming.MemberCase,               value: lower_case }
-  - { key: readability-identifier-naming.PrivateMemberPrefix,      value: _ }
-  - { key: readability-identifier-naming.ProtectedMemberPrefix,    value: _ }
+  - { key: readability-identifier-naming.NamespaceCase,          value: lower_case }
+  - { key: readability-identifier-naming.ClassCase,              value: CamelCase }
+  - { key: readability-identifier-naming.StructCase,             value: CamelCase }
+  - { key: readability-identifier-naming.TemplateParameterCase,  value: CamelCase }
+  - { key: readability-identifier-naming.FunctionCase,           value: aNy_CasE }
+  - { key: readability-identifier-naming.VariableCase,           value: lower_case }
+  - { key: readability-identifier-naming.MemberCase,        value: lower_case }
+  - { key: readability-identifier-naming.PrivateMemberPrefix,    value: _ }
+  - { key: readability-identifier-naming.ProtectedMemberPrefix,  value: _ }
  - { key: readability-identifier-naming.EnumConstantCase,         value: CamelCase }
  - { key: readability-identifier-naming.ConstexprVariableCase,    value: CamelCase }
  - { key: readability-identifier-naming.GlobalConstantCase,       value: CamelCase }
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -7,4 +7,10 @@ updates:
    target-branch: "dev"
    schedule:
      # Check for updates to GitHub Actions every weekday
-      interval: "daily"
+      interval: "daily"
+      
+   # Maintain dependencies for pip
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "weekly"
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@ -6,39 +6,25 @@ name: Python Package
 on: [ push, pull_request ]

 jobs:
-  test:
-    name: Run Unit Tests
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ macos-12, windows-latest, ubuntu-latest ]
+  build-sdist:
+    name: Build SDist
+    runs-on: ubuntu-latest
+
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
-          submodules: recursive
-
-      - name: Set up GCC
-        if: runner.os == 'linux'
-        uses: egor-tensin/setup-gcc@v1
-        with:
-          version: 10
-
-      - name: Setup cmake
-        uses: jwlawson/actions-setup-cmake@v1.12
-        with:
-          cmake-version: 'latest'
-          github-api-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Python
-        uses: actions/setup-python@v3.1.2
+        uses: actions/setup-python@v4.3.0
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          python -m pip install flake8 pybind11
+          python -m pip install flake8
+          python -m pip install setuptools twine build

      - name: Lint with flake8
        run: |
@ -47,35 +33,6 @@ jobs:
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics

-      - name: Build C code
-        run: |
-          ls
-          cmake -S . -B build -DQUICKTEX_NOPYTHON=TRUE -DCMAKE_BUILD_TYPE=Debug
-          cmake --build build
-
-      - name: Test C code
-        run: |
-          ctest -V --test-dir build -C Debug
-
-  build-sdist:
-    name: Build SDist
-    runs-on: ubuntu-latest
-    needs: test
-
-    steps:
-      - uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
-          submodules: recursive
-
-      - name: Set up Python
-        uses: actions/setup-python@v3.1.2
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          python -m pip install setuptools twine build
-
      - name: Build SDist
        run: python -m build --sdist

@ -88,22 +45,18 @@ jobs:
          path: dist/*.tar.gz

  build-wheels:
-    name: Build Wheels on ${{ matrix.os }} ${{ matrix.arch[0] }}
+    name: Build Wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
-    needs: test
    strategy:
      matrix:
-        os: [ macos-12, windows-latest, ubuntu-latest ]
-        arch: [ [ 'x86', 'x86_64', 'AMD64', 'x86_64' ] ] #[suffix, mac, windows, linux] arch names
-        include:
-          - os: ubuntu-latest
-            arch: [ 'ARM', 'arm64', 'ARM64', 'aarch64' ]
+        # macos-13 is an intel runner, macos-14 is apple silicon
+        os: [ubuntu-latest, windows-latest, macos-13, macos-14]
+        linux_arch: [ 'x86_64' ] #[suffix, mac, windows, linux] arch names

    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
-          submodules: recursive

      - name: Install libomp
        if: runner.os == 'macOS'
@ -116,17 +69,19 @@ jobs:
          sudo tar fvxz openmp-*.tar.gz -C /

      - name: Install QEMU
-        # install QEMU if building for arm linux
+        # install QEMU if building for linux
        uses: docker/setup-qemu-action@v2
-        if: runner.os == 'linux' && matrix.arch[3] == 'aarch64'
+        if: runner.os == 'linux' 
        with:
          platforms: arm64

      - name: Build wheels
-        uses: pypa/cibuildwheel@2.5.0
+        uses: pypa/cibuildwheel@v2.18.1
        env:
-          MACOSX_DEPLOYMENT_TARGET: "10.9"
-          CIBW_ARCHS_LINUX: ${{ matrix.arch[3] }}
+          MACOSX_DEPLOYMENT_TARGET: "12"
+          CIBW_ARCHS_LINUX: 'x86_64 aarch64'
+          CIBW_ARCHS_MACOS: 'native'
+          CIBW_SKIP: 'cp37*'

      - name: Upload Wheels
        uses: actions/upload-artifact@v3
@ -143,7 +98,9 @@ jobs:
      - uses: actions/checkout@v3 # just need the changelog

      - name: Set up Python
-        uses: actions/setup-python@v3.1.2
+        uses: actions/setup-python@v4.3.0
+        with:
+          python-version: '3.x'

      - name: Install dependencies
        run: |
@ -182,4 +139,4 @@ jobs:
          name: ${{ env.VERSION_TITLE }}
          body_path: RELEASE.md
        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@ -33,4 +33,3 @@ compile_commands.json
 CTestTestfile.cmake
 _deps
 cmake-build-*
-*.a
--- a/.gitmodules
+++ b/.gitmodules
@ -1,3 +0,0 @@
-[submodule "external/xsimd"]
-	path = external/xsimd
-	url = https://github.com/xtensor-stack/xsimd.git
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,11 +2,33 @@

 All notable changes to this project will be documented in this file

-## Unreleased
+## 0.2.1 - 2024-06-03

 ### Fixed

- Fixed LeastSquares endpoint mode producint incorrect results
+- Fixed broken transparency on palettized PNG files
+
+### Changed
+
+- Changed which wheels are built by the CI. There are no changes to OS or Python version compatibility if you compile from source.
+	- Stopped building Python 3.7 wheels
+	- Stopped building macOS universal wheels
+	- Wheels for macOS now require macOS 12 or later
+	- Included macOS ARM wheels 
+	- Included Python 3.12 wheels
+
+
+## 0.2.0 - 2023-06-21
+
+### Changed
+
+- Updated Pybind11 to version 3.10, adding Python 3.11 support
+- Updated install instructions in readme to reflect availability on PyPI
+- Encode now skips .dds files in its input to prevent needless re-encoding
+
+### Added
+
+- Added the `-n` option for bc3 encoding to perform a BC3nm swizzle


 ## 0.1.3 - 2022-04-13
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,14 +1,63 @@
 cmake_minimum_required(VERSION 3.18)
 include(tools/CompilerWarnings.cmake)
-include(tools/SIMDFlags.cmake)
 set(CMAKE_VERBOSE_MAKEFILE ON)

 project(quicktex)

-add_subdirectory(external/xsimd)
+# Find dependencies
+find_package(Python COMPONENTS Interpreter Development.Module)
+find_package(pybind11 CONFIG REQUIRED)
+find_package(OpenMP)

-add_subdirectory(quicktex)
-add_subdirectory(tests)
+# Collect source files
+file(GLOB SOURCE_FILES
+        "quicktex/*.cpp"
+        "quicktex/s3tc/*.cpp"
+        "quicktex/s3tc/bc1/*.cpp"
+        "quicktex/s3tc/bc3/*.cpp"
+        "quicktex/s3tc/bc4/*.cpp"
+        "quicktex/s3tc/bc5/*.cpp"
+        "quicktex/s3tc/interpolator/*.cpp"
+        )

-enable_testing ()
-add_test (NAME QuicktexTest COMMAND Test)
+file(GLOB HEADER_FILES
+        "quicktex/*.h"
+        "quicktex/s3tc/*.h"
+        "quicktex/s3tc/bc1/*.h"
+        "quicktex/s3tc/bc3/*.h"
+        "quicktex/s3tc/bc4/*.h"
+        "quicktex/s3tc/bc5/*.h"
+        "quicktex/s3tc/interpolator/*.h"
+        )
+
+file(GLOB_RECURSE PYTHON_FILES "src/**/*.py")
+
+# Organize source files together for some IDEs
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SOURCE_FILES} ${HEADER_FILES} ${PYTHON_FILES})
+
+# Add python module
+pybind11_add_module(_quicktex
+        ${SOURCE_FILES}
+        ${HEADER_FILES})
+
+# Set Quicktex version info
+target_compile_definitions(_quicktex PRIVATE VERSION_INFO=${QUICKTEX_VERSION_INFO})
+
+# enable openMP if available
+if (OpenMP_CXX_FOUND)
+    target_link_libraries(_quicktex PUBLIC OpenMP::OpenMP_CXX)
+endif ()
+
+# Set module features, like C/C++ standards
+target_compile_features(_quicktex PUBLIC cxx_std_17 c_std_11)
+
+# Set compiler warnings
+set_project_warnings(_quicktex)
+
+# Clang-specific
+if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++ -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer")
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -stdlib=libc++ -fsanitize=undefined")
+    set(PROJECT_WARNINGS ${CLANG_WARNINGS})
+endif ()
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,2 +0,0 @@
-graft external
-global-exclude *.afdesign # this is currently the vast majority of the repo size
--- a/README.md
+++ b/README.md
@ -9,15 +9,32 @@ comparable to the original library.

 ## Installation

-To install, first clone this repo and cd into it, then run:
+### From Wheel (Easiest)
+
+To install, run
+
+```shell
+pip install quicktex
+```
+
+If you are on macOS, You need to install openMP from homebrew:
+
+```shell
+brew install libomp
+```
+
+### From Source
+
+To build from source, first clone this repo and cd into it, then run:

 ```shell
 git submodule update --init
 pip install .
 ```
+
 and setuptools will take care of any dependencies for you.

-If you are on macOS, it is recommended to first install openMP from homebrew to enable 
+If you are on macOS, it is recommended to first install openMP from homebrew to enable
 multithreading, since it is not included in the default Apple Clang install:

 ```shell
@ -31,8 +48,6 @@ required dependencies for them, install with options like so:
 pip install .[tests,stubs,docs]
 ```

-Quicktex will be available on Pypi once it is out of alpha.
-
 ## Usage

 ```
--- a/external/xsimd
+++ b/external/xsimd
--- a/pyproject.toml
+++ b/pyproject.toml
@ -4,7 +4,7 @@ requires = [
    "setuptools_scm>=6.2",
    "wheel",
    "cmake>=3.18",
-    "pybind11~=2.6.1",
+    "pybind11~=2.10",
    "ninja; sys_platform != 'win32'",
 ]
 build-backend = "setuptools.build_meta"
@ -46,7 +46,7 @@ docs = [
 stubs = ["pybind11-stubgen"]

 [project.urls]
-Docs = "https://quicktex.readthedocs.io/en/"
+Docs = "https://quicktex.readthedocs.io/en/latest/"
 Source = "https://github.com/drewcassidy/quicktex"
 Changelog = "https://github.com/drewcassidy/quicktex/blob/main/CHANGELOG.md"

@ -64,10 +64,8 @@ package-dir = { '' = '.' } # without this line, C++ source files get included in
 [tool.cibuildwheel]
 build = "cp*" # only build wheels for cpython.
 build-frontend = "build"
-test-command = "pytest {project}/tests --verbose --full-trace --capture=tee-sys"
+test-command = "pytest {project}/tests --verbose"
 test-extras = ["tests"]
-test-skip = "*-macosx_arm64 *-macosx_universal2:arm64" # skip testing on arm macOS because CIBW doesnt support it
-environment = { QUICKTEX_SIMD_MODE = "SSE4" } # SSE4 has a 99% market share and was released under the Bush administration

 [tool.cibuildwheel.macos]
 archs = ["x86_64", "universal2"] # build fat binaries, or x86-64 for python 3.7
@ -77,16 +75,11 @@ skip = ["cp{38,39,31*}-macosx_x86_64"] # skip x86-only builds where fat binaries
 archs = ["auto64"] # arm64 windows builds not yet supported

 [tool.cibuildwheel.linux]
-skip = ["*musllinux*"]
+skip = ["cp37-musllinux*", "*musllinux_aarch64*"] # skip targets without available Pillow wheels
 manylinux-x86_64-image = "manylinux2014"
 manylinux-aarch64-image = "manylinux2014"

 [tool.black]
 line-length = 120 # 80-column is stupid
-target-version = ['py37', 'py38', 'py39', 'py310']
-skip-string-normalization = true
-
-[tool.pytest.ini_options]
-minversion = "7.0"
-addopts = ["--full-trace", "--capture=tee-sys"]
-testpaths = ["tests"]
+target-version = ['py37', 'py38', 'py39', 'py310', 'py310']
+skip-string-normalization = true
--- a/quicktex/CMakeLists.txt
+++ b/quicktex/CMakeLists.txt
@ -1,71 +0,0 @@
-
-# Find dependencies
-if (NOT QUICKTEX_NOPYTHON)
-    find_package(Python COMPONENTS Interpreter Development.Module)
-    find_package(pybind11 CONFIG REQUIRED)
-endif ()
-find_package(OpenMP)
-
-#Collect source files
-set(SOURCE_FILES
-        Matrix4x4.cpp OldColor.cpp
-        s3tc/bc1/BC1Block.cpp s3tc/bc1/BC1Decoder.cpp
-        s3tc/bc1/BC1Encoder.cpp s3tc/bc1/OrderTable.cpp s3tc/bc1/OrderTable4.cpp
-        s3tc/bc3/BC3Decoder.cpp s3tc/bc3/BC3Encoder.cpp
-        s3tc/bc4/BC4Block.cpp s3tc/bc4/BC4Decoder.cpp s3tc/bc4/BC4Encoder.cpp
-        s3tc/bc5/BC5Decoder.cpp s3tc/bc5/BC5Encoder.cpp
-        s3tc/interpolator/Interpolator.cpp
-        texture/RawTexture.cpp texture/Window.cpp test.cpp)
-
-set(BINDING_FILES
-        _bindings.cpp
-        s3tc/_bindings.cpp
-        s3tc/bc1/_bindings.cpp
-        s3tc/bc3/_bindings.cpp
-        s3tc/bc4/_bindings.cpp
-        s3tc/bc5/_bindings.cpp
-        s3tc/interpolator/_bindings.cpp)
-
-file(GLOB_RECURSE HEADER_FILES "**.h")
-file(GLOB_RECURSE PYTHON_FILES "**.py")
-
-# Organize source files together for some IDEs
-source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SOURCE_FILES} ${BINDING_FILES} ${HEADER_FILES} ${PYTHON_FILES})
-
-# Declare implementation module
-add_library(quicktex STATIC ${SOURCE_FILES} ${HEADER_FILES})
-
-# Link openMP if available
-if (OpenMP_CXX_FOUND)
-    target_link_libraries(quicktex PUBLIC OpenMP::OpenMP_CXX)
-endif ()
-
-# Link XSimd
-target_link_libraries(quicktex PUBLIC xsimd)
-
-# Set library features, like C/C++ standards
-target_compile_features(quicktex PUBLIC cxx_std_20 c_std_11)
-set_property(TARGET quicktex PROPERTY CXX_VISIBILITY_PRESET hidden)
-set_property(TARGET quicktex PROPERTY POSITION_INDEPENDENT_CODE ON)
-
-# Include source root for project-relative includes
-target_include_directories(quicktex PUBLIC .)
-
-# Set compiler warnings and SIMD flags
-set_project_warnings(quicktex)
-set_simd_flags(quicktex)
-
-if (NOT QUICKTEX_NOPYTHON)
-    # Declare python module
-    pybind11_add_module(_quicktex ${BINDING_FILES} ${HEADER_FILES})
-    target_compile_definitions(_quicktex PRIVATE VERSION_INFO=${QUICKTEX_VERSION_INFO})
-
-    # Link python module with implementation
-    target_link_libraries(_quicktex PUBLIC quicktex)
-
-    if ((NOT MSVC) AND (CMAKE_BUILD_TYPE MATCHES Debug) AND ($ENV{QUICKTEX_SANITIZE}))
-        target_compile_options(_quicktex PUBLIC -fsanitize=address,undefined -fno-sanitize-recover=address,undefined -fno-omit-frame-pointer)
-        target_link_options(_quicktex PUBLIC -fsanitize=address,undefined -fno-sanitize-recover=address,undefined -fno-omit-frame-pointer)
-    endif ()
-
-endif ()
--- a/quicktex/OldColor.cpp
+++ b/quicktex/OldColor.cpp
@ -16,19 +16,18 @@
    You should have received a copy of the GNU Lesser General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include "OldColor.h"
+#include "Color.h"

 #include <algorithm>
 #include <stdexcept>

 #include "Vector4.h"
 #include "Vector4Int.h"
-#include "util/bitbash.h"
-#include "util/math.h"  // for scale_to_8<5>, scale_from_8<5>, assert5bit, scale_to_8<6>
+#include "util.h"  // for scale5To8, scale8To5, assert5bit, scale6To8

 namespace quicktex {

-OldColor::OldColor(Vector4Int v) {
+Color::Color(Vector4Int v) {
    if (v.MaxAbs() > 0xFF) throw std::invalid_argument("Vector members out of range");
    for (int i = 0; i < 4; i++) {
        if (v[i] < 0) throw std::range_error("Color members cannot be negative");
@ -40,42 +39,40 @@ OldColor::OldColor(Vector4Int v) {
    a = static_cast<uint8_t>(v[3]);
 }

-uint16_t OldColor::Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b) {
+uint16_t Color::Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b) {
    assert5bit(r);
    assert6bit(g);
    assert5bit(b);
    return static_cast<uint16_t>(b | (g << 5) | (r << 11));
 }

-uint16_t OldColor::Pack565(uint8_t r, uint8_t g, uint8_t b) {
-    return Pack565Unscaled(scale_from_8<5>(r), scale_from_8<6>(g), scale_from_8<5>(b));
-}
+uint16_t Color::Pack565(uint8_t r, uint8_t g, uint8_t b) { return Pack565Unscaled(scale8To5(r), scale8To6(g), scale8To5(b)); }

-OldColor OldColor::Unpack565Unscaled(uint16_t Packed) {
+Color Color::Unpack565Unscaled(uint16_t Packed) {
    uint8_t r = (Packed >> 11) & 0x1F;
    uint8_t g = (Packed >> 5) & 0x3F;
    uint8_t b = Packed & 0x1F;

-    return OldColor(r, g, b);
+    return Color(r, g, b);
 }

-OldColor OldColor::Unpack565(uint16_t Packed) {
-    uint8_t r = static_cast<uint8_t>(scale_to_8<5>((Packed >> 11) & 0x1FU));
-    uint8_t g = static_cast<uint8_t>(scale_to_8<6>((Packed >> 5) & 0x3FU));
-    uint8_t b = static_cast<uint8_t>(scale_to_8<5>(Packed & 0x1FU));
+Color Color::Unpack565(uint16_t Packed) {
+    uint8_t r = static_cast<uint8_t>(scale5To8((Packed >> 11) & 0x1FU));
+    uint8_t g = static_cast<uint8_t>(scale6To8((Packed >> 5) & 0x3FU));
+    uint8_t b = static_cast<uint8_t>(scale5To8(Packed & 0x1FU));

-    return OldColor(r, g, b);
+    return Color(r, g, b);
 }

-OldColor OldColor::PreciseRound565(Vector4 &v) {
+Color Color::PreciseRound565(Vector4 &v) {
    int trial_r = (int)(v[0] * UINT5_MAX);
    int trial_g = (int)(v[1] * UINT6_MAX);
    int trial_b = (int)(v[2] * UINT5_MAX);

    // clamp to prevent weirdness with slightly out of bounds float values
-    uint8_t r = (uint8_t)clamp<int>(trial_r, 0, UINT5_MAX);
-    uint8_t g = (uint8_t)clamp<int>(trial_g, 0, UINT6_MAX);
-    uint8_t b = (uint8_t)clamp<int>(trial_b, 0, UINT5_MAX);
+    uint8_t r = (uint8_t)clampi(trial_r, 0, UINT5_MAX);
+    uint8_t g = (uint8_t)clampi(trial_g, 0, UINT6_MAX);
+    uint8_t b = (uint8_t)clampi(trial_b, 0, UINT5_MAX);

    // increment each channel if above the rounding point
    r += v[0] > Midpoints5bit[r];
@ -86,36 +83,46 @@ OldColor OldColor::PreciseRound565(Vector4 &v) {
    assert6bit(g);
    assert5bit(b);

-    return OldColor(r, g, b);
+    return Color(r, g, b);
 }

-void OldColor::SetRGB(uint8_t vr, uint8_t vg, uint8_t vb) {
+void Color::SetRGB(uint8_t vr, uint8_t vg, uint8_t vb) {
    r = vr;
    g = vg;
    b = vb;
 }

-size_t OldColor::MaxChannelRGB() {
+size_t Color::MinChannelRGB() {
+    if (r <= g && r <= b) return 0;
+    if (g <= b && g <= r) return 1;
+    return 2;
+}
+
+size_t Color::MaxChannelRGB() {
    if (r >= g && r >= b) return 0;
    if (g >= b && g >= r) return 1;
    return 2;
 }

-OldColor::operator Vector4() const { return Vector4(r, g, b, a); }
-OldColor::operator Vector4Int() const { return Vector4Int(r, g, b, a); }
-Vector4Int operator-(const OldColor &lhs, const OldColor &rhs) {
+Color Color::Min(const Color &A, const Color &B) { return Color(std::min(A[0], B[0]), std::min(A[1], B[1]), std::min(A[2], B[2]), std::min(A[3], B[3])); }
+
+Color Color::Max(const Color &a, const Color &b) { return Color(std::max(a[0], b[0]), std::max(a[1], b[1]), std::max(a[2], b[2]), std::max(a[3], b[3])); }
+
+Color::operator Vector4() const { return Vector4(r, g, b, a); }
+Color::operator Vector4Int() const { return Vector4Int(r, g, b, a); }
+Vector4Int operator-(const Color &lhs, const Color &rhs) {
    Vector4Int result;
    for (unsigned i = 0; i < 4; i++) { result[i] = (int)lhs[i] - rhs[i]; }
    return result;
 }

-uint16_t OldColor::Pack565() const { return Pack565(r, g, b); }
-uint16_t OldColor::Pack565Unscaled() const { return Pack565Unscaled(r, g, b); }
+uint16_t Color::Pack565() const { return Pack565(r, g, b); }
+uint16_t Color::Pack565Unscaled() const { return Pack565Unscaled(r, g, b); }

-OldColor OldColor::ScaleTo565() const { return OldColor(scale_from_8<5>(r), scale_from_8<6>(g), scale_from_8<5>(b)); }
-OldColor OldColor::ScaleFrom565() const { return OldColor(scale_to_8<5>(r), scale_to_8<6>(g), scale_to_8<5>(b)); }
+Color Color::ScaleTo565() const { return Color(scale8To5(r), scale8To6(g), scale8To5(b)); }
+Color Color::ScaleFrom565() const { return Color(scale5To8(r), scale6To8(g), scale5To8(b)); }

-bool OldColor::operator==(const OldColor &Rhs) const { return r == Rhs.r && g == Rhs.g && b == Rhs.b && a == Rhs.a; }
-bool OldColor::operator!=(const OldColor &Rhs) const { return !(Rhs == *this); }
+bool Color::operator==(const Color &Rhs) const { return r == Rhs.r && g == Rhs.g && b == Rhs.b && a == Rhs.a; }
+bool Color::operator!=(const Color &Rhs) const { return !(Rhs == *this); }

 }  // namespace quicktex
--- a/quicktex/Color.h
+++ b/quicktex/Color.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -18,60 +18,82 @@
 */

 #pragma once
-#include "Matrix.h"
-#include "util/bitbash.h"
+#include <cassert>  // for assert
+#include <cstddef>  // for size_t
+#include <cstdint>  // for uint8_t, uint16_t

 namespace quicktex {
+class Vector4;
+class Vector4Int;

-using Color = Vec<uint8_t, 4>;
-using ColorRGB = Vec<uint8_t, 3>;
-
-constexpr size_t uint5_max = (1 << 5) - 1;
-constexpr size_t uint6_max = (1 << 6) - 1;
-
-template <size_t N> struct MidpointTable {
+#pragma pack(push, 1)
+class Color {
   public:
-    constexpr MidpointTable() : _values() {
-        constexpr float fN = (float)N;
-        for (unsigned i = 0; i < N - 1; i++) { _values[i] = ((float)i / fN) + (0.5f / fN); }
-        _values[N - 1] = 1e+37f;
+    uint8_t r;
+    uint8_t g;
+    uint8_t b;
+    uint8_t a;
+
+    constexpr Color() : Color(0, 0, 0, 0xFF) {}
+
+    constexpr Color(uint8_t vr, uint8_t vg, uint8_t vb, uint8_t va = 0xFF) : r(vr), g(vg), b(vb), a(va) {}
+
+    Color(Vector4Int v);
+
+    static uint16_t Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b);
+    static uint16_t Pack565(uint8_t r, uint8_t g, uint8_t b);
+
+    static Color Unpack565Unscaled(uint16_t Packed);
+    static Color Unpack565(uint16_t Packed);
+
+    static Color PreciseRound565(Vector4 &v);
+
+    static Color Min(const Color &A, const Color &B);
+    static Color Max(const Color &A, const Color &B);
+
+    bool operator==(const Color &Rhs) const;
+    bool operator!=(const Color &Rhs) const;
+
+    uint8_t operator[](size_t index) const {
+        assert(index < 4);
+        return reinterpret_cast<const uint8_t *>(this)[index];
+    }
+    uint8_t &operator[](size_t index) {
+        assert(index < 4);
+        return reinterpret_cast<uint8_t *>(this)[index];
    }

-    float operator[](size_t i) const {
-        assert(i < N);
-        return _values[i];
-    }
+    operator Vector4() const;
+    operator Vector4Int() const;
+    friend Vector4Int operator-(const Color &lhs, const Color &rhs);
+
+    void SetRGB(uint8_t vr, uint8_t vg, uint8_t vb);
+    void SetRGB(const Color &other) { SetRGB(other.r, other.g, other.b); }
+
+    uint16_t Pack565() const;
+    uint16_t Pack565Unscaled() const;
+
+    Color ScaleTo565() const;
+    Color ScaleFrom565() const;
+
+    size_t MinChannelRGB();
+    size_t MaxChannelRGB();
+
+    bool IsGrayscale() const { return ((r == g) && (r == b)); }
+    bool IsBlack() const { return (r | g | b) < 4; }
+
+    int GetLuma() const { return (13938U * r + 46869U * g + 4729U * b + 32768U) >> 16U; }  // REC709 weightings

   private:
-    float _values[N];
+    static constexpr float Midpoints5bit[32] = {.015686f, .047059f, .078431f, .111765f, .145098f, .176471f, .207843f, .241176f, .274510f, .305882f, .337255f,
+                                                .370588f, .403922f, .435294f, .466667f, .5f,      .533333f, .564706f, .596078f, .629412f, .662745f, .694118f,
+                                                .725490f, .758824f, .792157f, .823529f, .854902f, .888235f, .921569f, .952941f, .984314f, 1e+37f};
+    static constexpr float Midpoints6bit[64] = {.007843f, .023529f, .039216f, .054902f, .070588f, .086275f, .101961f, .117647f, .133333f, .149020f, .164706f,
+                                                .180392f, .196078f, .211765f, .227451f, .245098f, .262745f, .278431f, .294118f, .309804f, .325490f, .341176f,
+                                                .356863f, .372549f, .388235f, .403922f, .419608f, .435294f, .450980f, .466667f, .482353f, .500000f, .517647f,
+                                                .533333f, .549020f, .564706f, .580392f, .596078f, .611765f, .627451f, .643137f, .658824f, .674510f, .690196f,
+                                                .705882f, .721569f, .737255f, .754902f, .772549f, .788235f, .803922f, .819608f, .835294f, .850980f, .866667f,
+                                                .882353f, .898039f, .913725f, .929412f, .945098f, .960784f, .976471f, .992157f, 1e+37f};
 };
-
-constexpr MidpointTable<32> Midpoints5bit;
-constexpr MidpointTable<64> Midpoints6bit;
-
-template <typename T> Vec<T, 3> scale_to_565(Vec<T, 3> unscaled) {
-    return Vec<T, 3>{scale_from_8<T, 5>(unscaled.r()), scale_from_8<T, 6>(unscaled.g()),
-                     scale_from_8<T, 5>(unscaled.b())};
-}
-
-template <typename T> Vec<T, 3> scale_from_565(Vec<T, 3> scaled) {
-    return Vec<T, 3>{scale_to_8<T, 5>(scaled.r()), scale_to_8<T, 6>(scaled.g()), scale_to_8<T, 5>(scaled.b())};
-}
-
-template <typename T = int16_t> Vec<T, 3> precise_round_565(Vec<float, 3> &v) {
-    auto scaled = v * Vec<float, 3>{uint5_max, uint6_max, uint5_max};       // rescale by from (0,1) to (0,int_max)
-    auto rounded = (Vec<T, 3>)scaled;                                       // downcast to integral type
-    rounded = rounded.clamp({0, 0, 0}, {uint5_max, uint6_max, uint5_max});  // clamp to avoid out of bounds float errors
-
-    // increment each channel if above the rounding point
-    if (v.r() > Midpoints5bit[rounded.r()]) rounded.r()++;
-    if (v.g() > Midpoints6bit[rounded.g()]) rounded.g()++;
-    if (v.b() > Midpoints5bit[rounded.b()]) rounded.b()++;
-
-    assert(rounded.r() <= uint5_max);
-    assert(rounded.g() <= uint6_max);
-    assert(rounded.b() <= uint5_max);
-
-    return rounded;
-}
+#pragma pack(pop)
 }  // namespace quicktex
--- a/quicktex/ColorBlock.h
+++ b/quicktex/ColorBlock.h
@ -25,7 +25,7 @@
 #include <cstring>
 #include <stdexcept>

-#include "OldColor.h"
+#include "Color.h"
 #include "Vector4Int.h"

 namespace quicktex {
@ -34,9 +34,9 @@ using Coords = std::tuple<int, int>;
 template <int N, int M> class ColorBlock  {
   public:
    struct Metrics {
-        OldColor min;
-        OldColor max;
-        OldColor avg;
+        Color min;
+        Color max;
+        Color avg;
        bool is_greyscale;
        bool has_black;
        Vector4Int sums;
@ -45,37 +45,37 @@ template <int N, int M> class ColorBlock  {
    static constexpr int Width = N;
    static constexpr int Height = M;

-    constexpr OldColor Get(int x, int y) const {
+    constexpr Color Get(int x, int y) const {
        if (x >= Width || x < 0) throw std::invalid_argument("x value out of range");
        if (y >= Height || y < 0) throw std::invalid_argument("y value out of range");

        return _pixels[x + (N * y)];
    }

-    constexpr OldColor Get(int i) const {
+    constexpr Color Get(int i) const {
        if (i >= N * M || i < 0) throw std::invalid_argument("i value out of range");
        return _pixels[i];
    }

-    void Set(int x, int y, const OldColor &value) {
+    void Set(int x, int y, const Color &value) {
        if (x >= Width || x < 0) throw std::invalid_argument("x value out of range");
        if (y >= Height || y < 0) throw std::invalid_argument("y value out of range");
        _pixels[x + (N * y)] = value;
    }

-    void Set(int i, const OldColor &value) {
+    void Set(int i, const Color &value) {
        if (i >= N * M || i < 0) throw std::invalid_argument("i value out of range");
        _pixels[i] = value;
    }

-    void GetRow(int y, OldColor *dst) const {
+    void GetRow(int y, Color *dst) const {
        if (y >= Height || y < 0) throw std::invalid_argument("y value out of range");
-        std::memcpy(dst, &_pixels[N * y], N * sizeof(OldColor));
+        std::memcpy(dst, &_pixels[N * y], N * sizeof(Color));
    }

-    void SetRow(int y, const OldColor *src) {
+    void SetRow(int y, const Color *src) {
        if (y >= Height || y < 0) throw std::invalid_argument("y value out of range");
-        std::memcpy(&_pixels[N * y], src, N * sizeof(OldColor));
+        std::memcpy(&_pixels[N * y], src, N * sizeof(Color));
    }

    bool IsSingleColor() const {
@ -88,8 +88,8 @@ template <int N, int M> class ColorBlock  {

    Metrics GetMetrics(bool ignore_black = false) const {
        Metrics metrics;
-        metrics.min = OldColor(UINT8_MAX, UINT8_MAX, UINT8_MAX);
-        metrics.max = OldColor(0, 0, 0);
+        metrics.min = Color(UINT8_MAX, UINT8_MAX, UINT8_MAX);
+        metrics.max = Color(0, 0, 0);
        metrics.has_black = false;
        metrics.is_greyscale = true;
        metrics.sums = {0, 0, 0};
@ -97,7 +97,7 @@ template <int N, int M> class ColorBlock  {
        unsigned total = 0;

        for (unsigned i = 0; i < M * N; i++) {
-            OldColor val = Get(i);
+            Color val = Get(i);
            bool is_black = val.IsBlack();

            metrics.has_black |= is_black;
@ -118,7 +118,7 @@ template <int N, int M> class ColorBlock  {
    }

   private:
-    std::array<OldColor, N * M> _pixels;
+    std::array<Color, N * M> _pixels;
 };

 }  // namespace quicktex
--- a/quicktex/Decoder.h
+++ b/quicktex/Decoder.h
@ -22,7 +22,7 @@
 #include <memory>

 #include "ColorBlock.h"
-#include "texture/RawTexture.h"
+#include "Texture.h"

 namespace quicktex {

@ -46,19 +46,19 @@ template <class T> class BlockDecoder : public Decoder<T> {
    virtual DecodedBlock DecodeBlock(const EncodedBlock &block) const = 0;

    virtual RawTexture Decode(const T &encoded) const override {
-        auto decoded = RawTexture(encoded.width, encoded.height);
+        auto decoded = RawTexture(encoded.Width(), encoded.Height());

-        int blocks_x = encoded.bwidth();
-        int blocks_y = encoded.bheight();
+        int blocks_x = encoded.BlocksX();
+        int blocks_y = encoded.BlocksY();

        // from experimentation, multithreading this using OpenMP actually makes decoding slower
        // due to thread creation/teardown taking longer than the decoding process itself.
        // As a result, this is left as a serial operation despite being embarassingly parallelizable
        for (int y = 0; y < blocks_y; y++) {
            for (int x = 0; x < blocks_x; x++) {
-                auto block = encoded.get_block(x, y);
+                auto block = encoded.GetBlock(x, y);
                auto pixels = DecodeBlock(block);
-                decoded.set_block<BlockWidth, BlockHeight>(x, y, pixels);
+                decoded.SetBlock<BlockWidth, BlockHeight>(x, y, pixels);
            }
        }

--- a/quicktex/Encoder.h
+++ b/quicktex/Encoder.h
@ -22,7 +22,7 @@
 #include <memory>

 #include "ColorBlock.h"
-#include "texture/RawTexture.h"
+#include "Texture.h"

 namespace quicktex {

@ -46,22 +46,21 @@ template <typename T> class BlockEncoder : public Encoder<T> {
    virtual EncodedBlock EncodeBlock(const DecodedBlock &block) const = 0;

    virtual T Encode(const RawTexture &decoded) const override {
-        auto encoded = T(decoded.width, decoded.height);
+        auto encoded = T(decoded.Width(), decoded.Height());

-        unsigned blocks_x = encoded.bwidth();
-        unsigned blocks_y = encoded.bheight();
+        int blocks_x = encoded.BlocksX();
+        int blocks_y = encoded.BlocksY();

        // from experimentation, multithreading this using OpenMP sometimes actually makes encoding slower
        // due to thread creation/teardown taking longer than the encoding process itself.
        // As a result, this is sometimes left as a serial operation despite being embarassingly parallelizable
        // threshold for number of blocks before multithreading is set by overriding MTThreshold()
 #pragma omp parallel for if (blocks_x * blocks_y >= MTThreshold())
-        for (int y = 0; y < (int)blocks_y; y++) {
-            for (int x = 0; x < (int)blocks_x; x++) {
-                // index variables have to be signed for MSVC for some reason
-                auto pixels = decoded.get_block<BlockWidth, BlockHeight>(x, y);
+        for (int y = 0; y < blocks_y; y++) {
+            for (int x = 0; x < blocks_x; x++) {
+                auto pixels = decoded.GetBlock<BlockWidth, BlockHeight>(x, y);
                auto block = EncodeBlock(pixels);
-                encoded.set_block(x, y, block);
+                encoded.SetBlock(x, y, block);
            }
        }

--- a/quicktex/Matrix.h
+++ b/quicktex/Matrix.h
@ -1,457 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <algorithm>
-#include <cstdint>
-#include <numeric>
-#include <xsimd/xsimd.hpp>
-
-#include "util/iterator.h"
-#include "util/map.h"
-#include "util/math.h"
-#include "util/ranges.h"
-
-namespace quicktex {
-
-template <typename T, int M, int N> class Matrix;
-
-template <typename T, int M> using Vec = Matrix<T, M, 1>;
-
-// region helper concepts
-template <typename L, typename R, typename Op>
-concept operable = requires(L &l, R &r, Op &op) { op(l, r); };
-
-template <typename V>
-concept is_matrix = requires(V &v) {
-                        V::width();
-                        V::height();
-                        V::value_type;
-                    } && std::same_as < Matrix<typename V::value_type, V::height(), V::width()>,
-std::remove_cvref_t < V >> ;
-
-template <typename V> struct vector_stats {
-    static constexpr int width = 1;
-    static constexpr int height = 1;
-    static constexpr int dims = 0;
-};
-
-template <typename V>
-    requires is_matrix<V>
-struct vector_stats<V> {
-    static constexpr int width = V::width;
-    static constexpr int height = V::height;
-    static constexpr int dims = V::dims;
-};
-
-template <typename V> constexpr int vector_width = vector_stats<V>::width;
-template <typename V> constexpr int vector_height = vector_stats<V>::height;
-template <typename V> constexpr int vector_dims = vector_stats<V>::dims;
-
-// endregion
-
-template <typename R, typename T, int N> class VecBase {
-   public:
-    constexpr VecBase(T scalar = T()) : _c{} { _c.fill(scalar); }
-
-   protected:
-    const R &_at(int index) const { return _c.at(index); }
-    R &_at(int index) { return _c.at(index); }
-
-    constexpr auto _begin() const { return _c.data(); }
-    constexpr auto _begin() { return _c.data(); }
-    constexpr auto _end() const { return _c.data() + N; }
-    constexpr auto _end() { return _c.data() + N; }
-
-   private:
-    std::array<R, N> _c;
-};
-
-template <typename T, int N, int M> using matrix_row_type = std::conditional_t<N <= 1, T, Vec<T, N>>;
-template <typename T, int N, int M> using matrix_column_type = std::conditional_t<M <= 1, T, Vec<T, M>>;
-
-/**
- * A matrix of values that can be operated on
- * @tparam T Scalar type
- * @tparam N Width of the matrix
- * @tparam M Height of the matrix
- */
-template <typename T, int M, int N>
-class Matrix : public VecBase<std::conditional_t<N == 1, T, VecBase<T, T, N>>, T, M> {
-   public:
-    using base = VecBase<std::conditional_t<N == 1, T, VecBase<T, T, N>>, T, M>;
-
-    using value_type = T;
-    using row_type = matrix_row_type<T, N, M>;
-    using column_type = matrix_column_type<T, N, M>;
-
-    using base::base;
-    //    using base::begin;
-    //    using base::end;
-    //    using base::operator[];
-
-    // region constructors
-    /**
-     * Create a vector from an intializer list
-     * @param il values to populate with
-     */
-    Matrix(std::initializer_list<row_type> il) : base() {
-        assert(il.size() == M);  // ensure il is of the right size
-        std::copy_n(il.begin(), M, this->begin());
-    }
-
-    /**
-     * Create a vector from a scalar value
-     * @param scalar value to populate with
-     */
-    //    constexpr Matrix(const T &scalar) { std::fill(this->begin(), this->end(), scalar); }
-
-    /**
-     * Create a vector from an iterator
-     * @tparam II input iterator type
-     * @param input_iterator iterator to copy from
-     */
-    template <typename II>
-        requires std::input_iterator<II> && std::convertible_to<std::iter_value_t<II>,
-                                                                const row_type>
-        Matrix(const II input_iterator) : base() {
-        std::copy_n(input_iterator, M, this->begin());
-    }
-
-    /**
-     * Create a vector from a range type
-     * @tparam R Range type
-     * @param input_range Range to copy from
-     */
-    template <typename R>
-    Matrix(const R &input_range)
-        requires range<R> && std::convertible_to<typename R::value_type, row_type>
-    : Matrix(input_range.begin()) {
-        assert(std::distance(input_range.begin(), input_range.end()) == M);
-    }
-
-    template <typename R = T>
-        requires(N == M)
-    static constexpr Matrix identity() {
-        Matrix result = Matrix(0);
-        for (int i = 0; i < N; i++) { result.element(i, i) = 1; }
-        return result;
-    }
-    // endregion
-
-    // region iterators and accessors
-    static constexpr int size() { return M; }
-    static constexpr int width = N;
-    static constexpr int height = M;
-    static constexpr int elements = N * M;
-    static constexpr int dims = ((width > 1) ? 1 : 0) + ((height > 1) ? 1 : 0);
-
-    const row_type &at(int index) const {
-        assert(index >= 0 && index < M);
-        return static_cast<const row_type &>(base::_at(index));
-    }
-    row_type &at(int index) {
-        assert(index >= 0 && index < M);
-        return static_cast<row_type &>(base::_at(index));
-    }
-
-    const row_type &operator[](int index) const { return at(index); }
-    row_type &operator[](int index) { return at(index); }
-
-    const row_type *begin() const { return static_cast<const row_type *>(base::_begin()); }
-    row_type *begin() { return static_cast<row_type *>(base::_begin()); }
-
-    const row_type *end() const { return static_cast<const row_type *>(base::_end()); }
-    row_type *end() { return static_cast<row_type *>(base::_end()); }
-
-    auto column_begin() const { return column_iterator(this, 0); }
-    auto column_end() const { return column_iterator(this, N); }
-
-    auto all_begin() const { return linear_iterator<const Matrix>(this, 0); }
-    auto all_begin() { return linear_iterator<Matrix>(this, 0); }
-
-    auto all_end() const { return linear_iterator<const Matrix>(this, N * M); }
-    auto all_end() { return linear_iterator<Matrix>(this, N * M); }
-
-    const row_type &get_row(int m) const { return static_cast<const row_type &>(this->at(m)); }
-    template <typename R> void set_row(int m, const R &value) { this->at(m) = value; }
-
-    template <typename S = T> column_type get_column(int n) const {
-        if constexpr (M == 1) {
-            return element(0, n);
-        } else {
-            column_type ret;
-            for (int m = 0; m < M; m++) { ret[m] = element(m, n); }
-            return ret;
-        }
-    }
-
-    void set_column(int n, const column_type &value) {
-        if constexpr (M == 1) {
-            element(0, n) = value;
-        } else {
-            for (int m = 0; m < M; m++) { element(m, n) = value[m]; }
-        }
-    }
-
-    // n/m accessors
-    const T &element(int m, int n) const {
-        if constexpr (N == 1) {
-            return this->at(m);
-        } else {
-            return this->at(m)[n];
-        }
-    }
-
-    T &element(int n, int m) { return const_cast<T &>(static_cast<const Matrix &>(*this).element(n, m)); }
-
-    // linear accessors
-    const T &element(int i) const { return element(i / N, i % N); }
-    T &element(int i) { return element(i / N, i % N); }
-
-    // RGBA accessors
-    const T &r() const { return (*this)[0]; }
-    T &r() { return this->at(0); }
-    template <typename S = T> std::enable_if_t<M >= 2, const S &> g() const { return this->at(1); }
-    template <typename S = T> std::enable_if_t<M >= 2, S &> g() { return this->at(1); }
-    template <typename S = T> std::enable_if_t<M >= 3, const S &> b() const { return this->at(2); }
-    template <typename S = T> std::enable_if_t<M >= 3, S &> b() { return this->at(2); }
-    template <typename S = T> std::enable_if_t<M >= 4, const S &> a() const { return this->at(3); }
-    template <typename S = T> std::enable_if_t<M >= 4, S &> a() { return this->at(3); }
-
-    // XYZW accessors
-    const T &x() const { return this->at(0); }
-    T &x() { return this->at(0); }
-    template <typename S = T> std::enable_if_t<M >= 2, const S &> y() const { return this->at(1); }
-    template <typename S = T> std::enable_if_t<M >= 2, S &> y() { return this->at(1); }
-    template <typename S = T> std::enable_if_t<M >= 3, const S &> z() const { return this->at(2); }
-    template <typename S = T> std::enable_if_t<M >= 3, S &> z() { return this->at(2); }
-    template <typename S = T> std::enable_if_t<M >= 4, const S &> w() const { return this->at(3); }
-    template <typename S = T> std::enable_if_t<M >= 4, S &> w() { return this->at(3); }
-    // endregion
-
-    template <typename R>
-        requires std::equality_comparable_with<T, R> bool
-    operator==(const Matrix<R, M, N> &rhs) const {
-        return size() == rhs.size() && std::equal(this->begin(), this->end(), rhs.begin());
-    };
-
-    // unary vector negation
-    template <typename S = T>
-        requires(!std::unsigned_integral<T>) && requires(T &t) { -t; }
-    Matrix operator-() const {
-        return map(std::negate(), *this);
-    };
-
-    // add vectors
-    template <typename R>
-        requires operable<R, T, std::plus<>>
-    Matrix operator+(const Matrix<R, M, N> &rhs) const {
-        return map(std::plus(), *this, rhs);
-    };
-
-    // subtract vectors
-    template <typename R>
-        requires operable<R, T, std::minus<>>
-    Matrix operator-(const Matrix<R, M, N> &rhs) const {
-        // we can't just add the negation because that's invalid for int types
-        return map(std::minus(), *this, rhs);
-    };
-
-    // multiply matrix with a matrix or column vector
-    template <typename R, int P>
-        requires(P == 1 || P == N) && operable<R, T, std::multiplies<>>
-    Matrix operator*(const Matrix<R, M, P> &rhs) const {
-        return map(std::multiplies(), *this, rhs);
-    };
-
-    // multiply matrix with a scalar
-    template <typename R>
-        requires operable<R, T, std::multiplies<>>
-    Matrix operator*(const R &rhs) const {
-        return map(std::multiplies(), *this, rhs);
-    };
-
-    // divides a matrix by a matrix or column vector
-    template <typename R, int NN>
-        requires(NN == 1 || NN == N) && operable<R, T, std::divides<>>
-    Matrix operator/(const Matrix<R, M, NN> &rhs) const {
-        return map(std::divides(), *this, rhs);
-    };
-
-    // divides a matrix by a scalar
-    template <typename R>
-        requires operable<R, T, std::divides<>>
-    Matrix operator/(const R &rhs) const {
-        return map(std::divides(), *this, rhs);
-    };
-
-    // add-assigns a matrix with a matrix
-    template <typename R>
-        requires operable<Matrix, R, std::plus<>>
-    Matrix &operator+=(const R &rhs) {
-        return *this = *this + rhs;
-    }
-
-    // subtract-assigns a matrix with a matrix
-    template <typename R>
-        requires operable<Matrix, R, std::minus<>>
-    Matrix &operator-=(const R &rhs) {
-        return *this = *this - rhs;
-    }
-
-    // multiply-assigns a matrix with a matrix, column vector, or a scalar
-    template <typename R>
-        requires operable<Matrix, R, std::multiplies<>>
-    Matrix &operator*=(const R &rhs) {
-        return *this = *this * rhs;
-    }
-
-    // divide-assigns a matrix by a matrix, column vector, or a scalar
-    template <typename R>
-        requires operable<Matrix, R, std::divides<>>
-    Matrix &operator/=(const R &rhs) {
-        return *this = *this / rhs;
-    }
-
-    // decay a 1x1 matrix to a scalar on demand
-    template <typename S = T>
-        requires(N == 1 && M == 1)
-    operator S &() {
-        return this->at(0);
-    }
-    template <typename S = T>
-        requires(N == 1 && M == 1)
-    operator const S &() const {
-        return this->at(0);
-    }
-
-    // sum up all columns
-    column_type hsum() const {
-        if constexpr (N == 1) { return *this; }
-        if constexpr (M == 1) { return sum(); }
-        for (int i = 0; i < M; i++) {}
-        return _map<column_type>([](auto row) { return quicktex::sum(row); }, *this);
-    }
-
-    // sum up all rows
-    row_type vsum() const {
-        if constexpr (N == 1) { return sum(); }
-        if constexpr (M == 1) { return *this; }
-        return std::accumulate(begin(), end(), row_type{});
-    }
-
-    // sum up all values
-    T sum() const {
-        // TODO: reintroduce SIMDing for this
-        return std::accumulate(all_begin(), all_end(), T(0));
-    }
-
-    template <typename R, int P>
-        requires operable<R, T, std::multiplies<>>
-    Matrix<T, M, P> mult(const Matrix<R, N, P> &rhs) const {
-        Matrix<T, M, P> res(0);
-        for (int p = 0; p < P; p++) {
-            // for each column of the RHS/Result
-            for (int m = 0; m < M; m++) {
-                // for each row of the LHS/Result
-                for (int n = 0; n < N; n++) { res.element(m, p) += element(m, n) * rhs.element(n, p); }
-            }
-        }
-        return res;
-    }
-
-    Matrix<T, N, M> transpose() const {
-        Matrix<T, N, M> res;
-        for (int m = 0; m < M; m++) { res.set_column(m, get_row(m)); }
-        return res;
-    }
-
-    template <typename R = T>
-        requires(N == M)
-    Matrix mirror() const {
-        Matrix result = *this;
-        for (int n = 0; n < N - 1; n++) {
-            for (int m = (n + 1); m < M; m++) { result.element(m, n) = result.element(n, m); }
-        }
-        return result;
-    }
-
-    // dot product of two compatible matrices
-    template <typename R>
-        requires(N == 1) && operable<T, R, std::multiplies<>> && operable<T, T, std::plus<>>
-    inline row_type dot(const Matrix<R, M, N> &rhs) const {
-        // technically this is Lt * R, but the vsum method is probably faster/more readable
-        // than allocationg a new transpose matrix
-        Matrix product = *this * rhs;
-        return product.vsum();
-    }
-
-    inline row_type sqr_mag() const { return dot(*this); }
-
-    inline Matrix abs() const {
-        return map([](auto c) { return quicktex::abs(c); }, *this);
-    }
-
-    inline Matrix clamp(T low, T high) {
-        return map([low, high](auto c) { return quicktex::clamp(c, low, high); }, *this);
-    }
-    inline Matrix clamp(const Matrix &low, const Matrix &high) {
-        return map([](auto c, auto l, auto h) { return quicktex::clamp(c, l, h); }, *this, low, high);
-    }
-
-   protected:
-    class column_iterator : public index_iterator_base<column_iterator, column_type> {
-       public:
-        using value_type = column_type;
-        using base = index_iterator_base<column_iterator, column_type>;
-
-        column_iterator(const Matrix *matrix = nullptr, int index = 0) : base(index), _matrix(matrix){};
-
-        column_type operator*() const { return _matrix->get_column(this->_index); }
-        const column_type *operator->() const { &(_matrix->get_column(this->_index)); }
-
-        friend bool operator==(const column_iterator &lhs, const column_iterator &rhs) {
-            return (lhs._matrix == rhs._matrix) && (lhs._index == rhs._index);
-        }
-
-       private:
-        const Matrix *_matrix;
-    };
-
-    template <typename V> class linear_iterator : public index_iterator_base<linear_iterator<V>, T> {
-       public:
-        using value_type = T;
-        using base = index_iterator_base<linear_iterator<V>, T>;
-
-        linear_iterator(V *matrix = nullptr, int index = 0) : base(index), _matrix(matrix){};
-
-        auto &operator*() { return _matrix->element(this->_index); }
-        auto *operator->() const { return &(_matrix->element(this->_index)); }
-
-        friend bool operator==(const linear_iterator &lhs, const linear_iterator &rhs) {
-            return (lhs._matrix == rhs._matrix) && (lhs._index == rhs._index);
-        }
-
-       private:
-        V *_matrix;
-    };
-};
-}  // namespace quicktex
--- a/quicktex/OldColor.h
+++ b/quicktex/OldColor.h
@ -1,114 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-#include <cassert>  // for assert
-#include <cstddef>  // for size_t
-#include <cstdint>  // for uint8_t, uint16_t
-
-#include "Matrix.h"
-
-namespace quicktex {
-class Vector4;
-class Vector4Int;
-
-#pragma pack(push, 1)
-class OldColor {
-   public:
-    uint8_t r;
-    uint8_t g;
-    uint8_t b;
-    uint8_t a;
-
-    constexpr OldColor() : OldColor(0, 0, 0, 0xFF) {}
-
-    constexpr OldColor(uint8_t vr, uint8_t vg, uint8_t vb, uint8_t va = 0xFF) : r(vr), g(vg), b(vb), a(va) {}
-
-    OldColor(Vector4Int v);
-
-    static uint16_t Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b);
-    static uint16_t Pack565(uint8_t r, uint8_t g, uint8_t b);
-
-    static OldColor Unpack565Unscaled(uint16_t Packed);
-    static OldColor Unpack565(uint16_t Packed);
-
-    static OldColor PreciseRound565(Vector4 &v);
-
-    static OldColor Min(const OldColor &A, const OldColor &B);
-    static OldColor Max(const OldColor &A, const OldColor &B);
-
-    bool operator==(const OldColor &Rhs) const;
-    bool operator!=(const OldColor &Rhs) const;
-
-    uint8_t operator[](size_t index) const {
-        assert(index < 4);
-        return reinterpret_cast<const uint8_t *>(this)[index];
-    }
-    uint8_t &operator[](size_t index) {
-        assert(index < 4);
-        return reinterpret_cast<uint8_t *>(this)[index];
-    }
-
-    operator Vector4() const;
-    operator Vector4Int() const;
-    friend Vector4Int operator-(const OldColor &lhs, const OldColor &rhs);
-
-    void SetRGB(uint8_t vr, uint8_t vg, uint8_t vb);
-    void SetRGB(const OldColor &other) { SetRGB(other.r, other.g, other.b); }
-
-    uint16_t Pack565() const;
-    uint16_t Pack565Unscaled() const;
-
-    OldColor ScaleTo565() const;
-    OldColor ScaleFrom565() const;
-
-    size_t MinChannelRGB();
-    size_t MaxChannelRGB();
-
-    bool IsGrayscale() const { return ((r == g) && (r == b)); }
-    bool IsBlack() const { return (r | g | b) < 4; }
-
-    int GetLuma() const { return (13938U * r + 46869U * g + 4729U * b + 32768U) >> 16U; }  // REC709 weightings
-
-    operator Vec<uint8_t, 4>() const { return {r, g, b, a}; }
-
-    OldColor(const Vec<uint8_t, 4> v) {
-        r = v.r();
-        g = v.g();
-        b = v.b();
-        a = v.a();
-    }
-
-   private:
-    static constexpr float Midpoints5bit[32] = {
-        .015686f, .047059f, .078431f, .111765f, .145098f, .176471f, .207843f, .241176f, .274510f, .305882f, .337255f,
-        .370588f, .403922f, .435294f, .466667f, .5f,      .533333f, .564706f, .596078f, .629412f, .662745f, .694118f,
-        .725490f, .758824f, .792157f, .823529f, .854902f, .888235f, .921569f, .952941f, .984314f, 1e+37f};
-    static constexpr float Midpoints6bit[64] = {
-        .007843f, .023529f, .039216f, .054902f, .070588f, .086275f, .101961f, .117647f, .133333f, .149020f, .164706f,
-        .180392f, .196078f, .211765f, .227451f, .245098f, .262745f, .278431f, .294118f, .309804f, .325490f, .341176f,
-        .356863f, .372549f, .388235f, .403922f, .419608f, .435294f, .450980f, .466667f, .482353f, .500000f, .517647f,
-        .533333f, .549020f, .564706f, .580392f, .596078f, .611765f, .627451f, .643137f, .658824f, .674510f, .690196f,
-        .705882f, .721569f, .737255f, .754902f, .772549f, .788235f, .803922f, .819608f, .835294f, .850980f, .866667f,
-        .882353f, .898039f, .913725f, .929412f, .945098f, .960784f, .976471f, .992157f, 1e+37f};
-
-
-};
-#pragma pack(pop)
-}  // namespace quicktex
--- a/quicktex/Texture.h
+++ b/quicktex/Texture.h
@ -0,0 +1,187 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <climits>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#include "Color.h"
+#include "ColorBlock.h"
+
+namespace quicktex {
+
+class Texture {
+   public:
+    virtual ~Texture() = default;
+
+    virtual int Width() const { return _width; }
+    virtual int Height() const { return _height; }
+    virtual std::tuple<int, int> Size() const { return std::tuple<int, int>(_width, _height); }
+
+    /**
+     * The texture's total size
+     * @return The size of the texture in bytes.
+     */
+    virtual size_t NBytes() const noexcept = 0;
+
+    virtual const uint8_t *Data() const noexcept = 0;
+    virtual uint8_t *Data() noexcept = 0;
+
+   protected:
+    Texture(int width, int height) : _width(width), _height(height) {
+        if (width <= 0) throw std::invalid_argument("Texture width must be greater than 0");
+        if (height <= 0) throw std::invalid_argument("Texture height must be greater than 0");
+    }
+
+    int _width;
+    int _height;
+};
+
+class RawTexture : public Texture {
+    using Base = Texture;
+
+   public:
+    /**
+     * Create a new RawTexture. The pixel count is computed in size_t so that width * height cannot overflow int
+     * @param width width of the texture in pixels. The Texture base constructor throws std::invalid_argument if it is <= 0
+     * @param height height of the texture in pixels. The Texture base constructor throws std::invalid_argument if it is <= 0
+     */
+    RawTexture(int width, int height) : Base(width, height), _pixels(static_cast<size_t>(_width) * static_cast<size_t>(_height)) {}
+
+    Color GetPixel(int x, int y) const {
+        if (x < 0 || x >= _width) throw std::invalid_argument("x value out of range.");
+        if (y < 0 || y >= _height) throw std::invalid_argument("y value out of range.");
+        return _pixels.at(x + (y * _width));
+    }
+
+    void SetPixel(int x, int y, Color val) {
+        if (x < 0 || x >= _width) throw std::invalid_argument("x value out of range.");
+        if (y < 0 || y >= _height) throw std::invalid_argument("y value out of range.");
+        _pixels.at(x + (y * _width)) = val;
+    }
+
+    size_t NBytes() const noexcept override { return _pixels.size() * sizeof(Color); }  // vector size is already size_t: no int overflow
+
+    /** Copy the N x M block at block coordinates (block_x, block_y) out of the texture. Throws std::out_of_range for negative coordinates. */
+    template <int N, int M> ColorBlock<N, M> GetBlock(int block_x, int block_y) const {
+        if (block_x < 0) throw std::out_of_range("x value out of range.");
+        if (block_y < 0) throw std::out_of_range("y value out of range.");
+        // coordinates in the image of the top-left pixel of the selected block
+        ColorBlock<N, M> block;
+        int pixel_x = block_x * N;
+        int pixel_y = block_y * M;
+
+        if (pixel_x + N <= _width && pixel_y + M <= _height) {
+            // fast memcpy if the block is entirely inside the bounds of the texture (<= so blocks flush with the right/bottom edge qualify)
+            for (int y = 0; y < M; y++) {
+                // copy each row into the ColorBlock
+                block.SetRow(y, &_pixels[pixel_x + (_width * (pixel_y + y))]);
+            }
+        } else {
+            // slower pixel-wise copy if the block goes over the edges; out-of-range coordinates wrap around via modulo
+            for (int x = 0; x < N; x++) {
+                for (int y = 0; y < M; y++) { block.Set(x, y, GetPixel((pixel_x + x) % _width, (pixel_y + y) % _height)); }
+            }
+        }
+
+        return block;
+    }
+
+    /** Write an N x M block into the texture at block coordinates (block_x, block_y). Throws std::out_of_range for negative coordinates. */
+    template <int N, int M> void SetBlock(int block_x, int block_y, const ColorBlock<N, M> &block) {
+        if (block_x < 0) throw std::out_of_range("x value out of range.");
+        if (block_y < 0) throw std::out_of_range("y value out of range.");
+        // coordinates in the image of the top-left pixel of the selected block
+        int pixel_x = block_x * N;
+        int pixel_y = block_y * M;
+
+        if (pixel_x + N <= _width && pixel_y + M <= _height) {
+            // fast row-wise memcpy if the block is entirely inside the bounds of the texture (<= so edge-flush blocks qualify)
+            for (int y = 0; y < M; y++) {
+                // copy each row out of the ColorBlock
+                block.GetRow(y, &_pixels[pixel_x + (_width * (pixel_y + y))]);
+            }
+        } else {
+            // slower pixel-wise copy if the block goes over the edges; out-of-range coordinates wrap around via modulo
+            for (int x = 0; x < N; x++) {
+                for (int y = 0; y < M; y++) { SetPixel((pixel_x + x) % _width, (pixel_y + y) % _height, block.Get(x, y)); }
+            }
+        }
+    }
+
+    virtual const uint8_t *Data() const noexcept override { return reinterpret_cast<const uint8_t *>(_pixels.data()); }
+    virtual uint8_t *Data() noexcept override { return reinterpret_cast<uint8_t *>(_pixels.data()); }
+
+   protected:
+    std::vector<Color> _pixels;
+};
+
+template <typename B> class BlockTexture final : public Texture {
+   private:
+    std::vector<B> _blocks;
+    int _width_b;
+    int _height_b;
+
+   public:
+    using BlockType = B;
+    using Base = Texture;
+
+    /**
+     * Create a new BlockTexture
+     * @param width width of the texture in pixels. Not required to be a multiple of B::Width here: the block count is rounded up
+     * @param height height of the texture in pixels. Not required to be a multiple of B::Height here: the block count is rounded up
+     */
+    BlockTexture(int width, int height) : Base(width, height) {
+        _width_b = (_width + B::Width - 1) / B::Width;  // ceiling division: a partial block at the edge still gets a slot
+        _height_b = (_height + B::Height - 1) / B::Height;
+        _blocks = std::vector<B>(_width_b * _height_b);
+    }
+
+    constexpr int BlocksX() const { return _width_b; }
+    constexpr int BlocksY() const { return _height_b; }
+    constexpr std::tuple<int, int> BlocksXY() const { return std::tuple<int, int>(_width_b, _height_b); }
+
+    B GetBlock(int x, int y) const {
+        if (x < 0 || x >= _width_b) throw std::out_of_range("x value out of range.");
+        if (y < 0 || y >= _height_b) throw std::out_of_range("y value out of range.");
+        return _blocks.at(x + (y * _width_b));
+    }
+
+    void SetBlock(int x, int y, const B &val) {
+        if (x < 0 || x >= _width_b) throw std::out_of_range("x value out of range.");
+        if (y < 0 || y >= _height_b) throw std::out_of_range("y value out of range.");
+        _blocks.at(x + (y * _width_b)) = val;
+    }
+
+    size_t NBytes() const noexcept override { return _blocks.size() * sizeof(B); }
+
+    const uint8_t *Data() const noexcept override { return reinterpret_cast<const uint8_t *>(_blocks.data()); }
+    uint8_t *Data() noexcept override { return reinterpret_cast<uint8_t *>(_blocks.data()); }
+};
+
+}  // namespace quicktex
--- a/quicktex/Vector4.h
+++ b/quicktex/Vector4.h
@ -23,7 +23,7 @@
 #include <cmath>
 #include <functional>

-#include "OldColor.h"
+#include "Color.h"

 namespace quicktex {

@ -45,11 +45,11 @@ class Vector4 {
        _c[3] = scalar;
    }

-    Vector4(const OldColor &c) : Vector4(c.r, c.g, c.b, c.a) {}
+    Vector4(const Color &c) : Vector4(c.r, c.g, c.b, c.a) {}

-    static Vector4 FromColor(const OldColor &c) { return Vector4(c); }
+    static Vector4 FromColor(const Color &c) { return Vector4(c); }

-    static Vector4 FromColorRGB(const OldColor &c) { return Vector4(c.r, c.g, c.b); }
+    static Vector4 FromColorRGB(const Color &c) { return Vector4(c.r, c.g, c.b); }

    static float Dot(const Vector4 &lhs, const Vector4 &rhs) {
        float sum = 0;
--- a/quicktex/Vector4Int.h
+++ b/quicktex/Vector4Int.h
@ -22,7 +22,7 @@
 #include <array>
 #include <functional>

-#include "OldColor.h"
+#include "Color.h"
 #include "Vector4.h"

 namespace quicktex {
@ -45,11 +45,11 @@ class Vector4Int {
        _c[3] = scalar;
    }

-    Vector4Int(const OldColor &c) : Vector4Int(c.r, c.g, c.b, c.a) {}
+    Vector4Int(const Color &c) : Vector4Int(c.r, c.g, c.b, c.a) {}

-    static Vector4Int FromColor(const OldColor &c) { return Vector4Int(c); }
+    static Vector4Int FromColor(const Color &c) { return Vector4Int(c); }

-    static Vector4Int FromColorRGB(const OldColor &c) { return Vector4Int(c.r, c.g, c.b); }
+    static Vector4Int FromColorRGB(const Color &c) { return Vector4Int(c.r, c.g, c.b); }

    static int Dot(const Vector4Int &lhs, const Vector4Int &rhs) {
        int sum = 0;
--- a/quicktex/_bindings.cpp
+++ b/quicktex/_bindings.cpp
@ -21,12 +21,11 @@

 #include <pybind11/pybind11.h>

+#include "Color.h"
 #include "Decoder.h"
 #include "Encoder.h"
-#include "OldColor.h"
+#include "Texture.h"
 #include "_bindings.h"
-#include "texture/RawTexture.h"
-#include "texture/Texture.h"

 #define STRINGIFY(x) #x
 #define MACRO_STRINGIFY(x) STRINGIFY(x)
@ -46,26 +45,19 @@ PYBIND11_MODULE(_quicktex, m) {
    m.attr("__version__") = "dev";
 #endif

-#ifdef NDEBUG
-    m.attr("_debug_build") = false;
-#else
-    m.attr("_debug_build") = true;
-#endif
-
    py::options options;

    // Texture

    py::class_<Texture> texture(m, "Texture", py::buffer_protocol());

-    texture.def_property_readonly("nbytes", &Texture::nbytes);
+    texture.def_property_readonly("nbytes", &Texture::NBytes);
    texture.def_property_readonly("size", &Texture::Size);
-    texture.def_readonly("width", &Texture::width);
-    texture.def_readonly("height", &Texture::height);
+    texture.def_property_readonly("width", &Texture::Width);
+    texture.def_property_readonly("height", &Texture::Height);

-    texture.def_buffer([](Texture &t) { return py::buffer_info(t.data(), t.nbytes()); });
-    texture.def("tobytes",
-                [](const Texture &t) { return py::bytes(reinterpret_cast<const char *>(t.data()), t.nbytes()); });
+    texture.def_buffer([](Texture &t) { return py::buffer_info(t.Data(), t.NBytes()); });
+    texture.def("tobytes", [](const Texture &t) { return py::bytes(reinterpret_cast<const char *>(t.Data()), t.NBytes()); });

    // RawTexture

@ -74,9 +66,7 @@ PYBIND11_MODULE(_quicktex, m) {
    raw_texture.def(py::init<int, int>(), "width"_a, "height"_a);
    raw_texture.def_static("frombytes", &BufferToTexture<RawTexture>, "data"_a, "width"_a, "height"_a);

-    DefSubscript2DRef(
-        raw_texture, [](RawTexture &self, int x, int y) -> Color { return self.pixel(x, y); },
-        [](RawTexture &self, int x, int y, Color val) { self.pixel(x, y) = val; }, &RawTexture::Size);
+    DefSubscript2D(raw_texture, &RawTexture::GetPixel, &RawTexture::SetPixel, &RawTexture::Size);

    InitS3TC(m);
 }
--- a/quicktex/_bindings.h
+++ b/quicktex/_bindings.h
@ -24,66 +24,18 @@

 #include <cstdint>
 #include <cstring>
+#include <memory>
 #include <stdexcept>
-#include <string>
-#include <tuple>
 #include <type_traits>
-#include <utility>
-#include <vector>

-#include "OldColor.h"
-#include "texture/BlockTexture.h"
-#include "util/math.h"
+#include "Color.h"
+#include "ColorBlock.h"
+#include "Texture.h"
+#include "util.h"

 namespace pybind11::detail {
 using namespace quicktex;
 /// Type caster for color class to allow it to be converted to and from a python tuple
-template <> struct type_caster<OldColor> {
-   public:
-    PYBIND11_TYPE_CASTER(OldColor, _("Color"));
-
-    bool load(handle src, bool) {
-        PyObject* source = src.ptr();
-
-        PyObject* tmp = PySequence_Tuple(source);
-
-        // if the object is not a tuple, return false
-        if (!tmp) { return false; }  // incorrect type
-
-        // check the size
-        Py_ssize_t size = PyTuple_Size(tmp);
-        if (size < 3 || size > 4) { return false; }  // incorrect size
-
-        value.a = 0xFF;
-        // now we get the contents
-        for (int i = 0; i < size; i++) {
-            PyObject* src_chan = PyTuple_GetItem(tmp, i);
-            PyObject* tmp_chan = PyNumber_Long(src_chan);
-
-            if (!tmp_chan) return false;  // incorrect channel type
-
-            auto chan = PyLong_AsLong(tmp_chan);
-            if (chan > 0xFF || chan < 0) return false;  // item out of range
-            value[static_cast<unsigned>(i)] = static_cast<uint8_t>(chan);
-            Py_DECREF(tmp_chan);
-        }
-        Py_DECREF(tmp);
-
-        return !PyErr_Occurred();
-    }
-
-    static handle cast(OldColor src, return_value_policy, handle) {
-        PyObject* val = PyTuple_New(4);
-
-        for (int i = 0; i < 4; i++) {
-            PyObject* chan = PyLong_FromLong(src[static_cast<unsigned>(i)]);
-            PyTuple_SetItem(val, i, chan);
-        }
-
-        return val;
-    }
-};
-
 template <> struct type_caster<Color> {
   public:
    PYBIND11_TYPE_CASTER(Color, _("Color"));
@ -100,7 +52,7 @@ template <> struct type_caster<Color> {
        Py_ssize_t size = PyTuple_Size(tmp);
        if (size < 3 || size > 4) { return false; }  // incorrect size

-        value.a() = 0xFF;
+        value.a = 0xFF;
        // now we get the contents
        for (int i = 0; i < size; i++) {
            PyObject* src_chan = PyTuple_GetItem(tmp, i);
@ -133,49 +85,26 @@ template <> struct type_caster<Color> {

 namespace py = pybind11;
 namespace quicktex::bindings {
-
 using namespace pybind11::literals;

-template <typename... Args> std::string Format(const char* str, const Args&... args) {
-    auto output = std::string(str);
-
-    std::vector<std::string> values = {{args...}};
-
-    for (unsigned i = 0; i < values.size(); i++) {
-        auto key = "{" + std::to_string(i) + "}";
-        auto value = values[i];
-        while (true) {
-            size_t where = output.find(key);
-            if (where == output.npos) break;
-            output.replace(where, key.length(), value);
-        }
-    }
-
-    return output;
-}
-
 template <typename T> T BufferToTexture(py::buffer buf, int width, int height) {
+    // Builds a T(width, height) and fills its storage by copying NBytes() bytes from a python buffer.
+    // Throws std::runtime_error if the buffer is not a byte array, holds fewer bytes than the texture
+    // needs, is not one-dimensional, or is not contiguous.
    static_assert(std::is_base_of<Texture, T>::value);
    static_assert(std::is_constructible<T, int, int>::value);

    auto info = buf.request(false);
    auto output = T(width, height);
-    auto dst_size = output.nbytes();
+    auto dst_size = output.NBytes();

-    if (info.format != py::format_descriptor<uint8_t>::format())
-        throw std::runtime_error("Incompatible format in python buffer: expected a byte array.");
-    if (info.size < (Py_ssize_t)dst_size)
-        std::runtime_error("Incompatible format in python buffer: Input data is smaller than texture size.");
+    if (info.format != py::format_descriptor<uint8_t>::format()) throw std::runtime_error("Incompatible format in python buffer: expected a byte array.");
+    // the exception must actually be thrown; constructing it without `throw` silently discards the check
+    if (info.size < (Py_ssize_t)dst_size) throw std::runtime_error("Incompatible format in python buffer: Input data is smaller than texture size.");
    if (info.ndim == 1) {
-        if (info.shape[0] < (Py_ssize_t)dst_size)
-            throw std::runtime_error("Incompatible format in python buffer: 1-D buffer has incorrect length.");
-        if (info.strides[0] != 1)
-            throw std::runtime_error("Incompatible format in python buffer: 1-D buffer is not contiguous.");
+        if (info.shape[0] < (Py_ssize_t)dst_size) throw std::runtime_error("Incompatible format in python buffer: 1-D buffer has incorrect length.");
+        if (info.strides[0] != 1) throw std::runtime_error("Incompatible format in python buffer: 1-D buffer is not contiguous.");
    } else {
        throw std::runtime_error("Incompatible format in python buffer: Incorrect number of dimensions.");
    }

-    std::memcpy(output.data(), info.ptr, dst_size);
+    std::memcpy(output.Data(), info.ptr, dst_size);

    return output;
 }
@ -185,15 +114,11 @@ template <typename T> T BufferToPOD(py::buffer buf) {

    auto info = buf.request(false);

-    if (info.format != py::format_descriptor<uint8_t>::format())
-        throw std::runtime_error("Incompatible format in python buffer: expected a byte array.");
-    if (info.size < (Py_ssize_t)sizeof(T))
-        std::runtime_error("Incompatible format in python buffer: Input data is smaller than texture size.");
+    if (info.format != py::format_descriptor<uint8_t>::format()) throw std::runtime_error("Incompatible format in python buffer: expected a byte array.");
+    if (info.size < (Py_ssize_t)sizeof(T)) std::runtime_error("Incompatible format in python buffer: Input data is smaller than texture size.");
    if (info.ndim == 1) {
-        if (info.shape[0] < (Py_ssize_t)sizeof(T))
-            throw std::runtime_error("Incompatible format in python buffer: 1-D buffer has incorrect length.");
-        if (info.strides[0] != 1)
-            throw std::runtime_error("Incompatible format in python buffer: 1-D buffer is not contiguous.");
+        if (info.shape[0] < (Py_ssize_t)sizeof(T)) throw std::runtime_error("Incompatible format in python buffer: 1-D buffer has incorrect length.");
+        if (info.strides[0] != 1) throw std::runtime_error("Incompatible format in python buffer: 1-D buffer is not contiguous.");
    } else {
        throw std::runtime_error("Incompatible format in python buffer: Incorrect number of dimensions.");
    }
@ -208,18 +133,15 @@ inline int PyIndex(int val, int size, std::string name = "index") {
    return val;
 }

-template <typename T, typename Getter, typename Setter, typename Extent>
-void DefSubscript(py::class_<T> t, Getter&& get, Setter&& set, Extent&& ext) {
+template <typename T, typename Getter, typename Setter, typename Extent> void DefSubscript(py::class_<T> t, Getter&& get, Setter&& set, Extent&& ext) {
    using V = typename std::invoke_result<Getter, T*, int>::type;
    t.def(
        "__getitem__", [get, ext](T& self, int index) { return (self.*get)(PyIndex(index, (self.*ext)())); }, "key"_a);
    t.def(
-        "__setitem__", [set, ext](T& self, int index, V val) { (self.*set)(PyIndex(index, (self.*ext)()), val); },
-        "key"_a, "value"_a);
+        "__setitem__", [set, ext](T& self, int index, V val) { (self.*set)(PyIndex(index, (self.*ext)()), val); }, "key"_a, "value"_a);
 }

-template <typename Tpy, typename Getter, typename Setter, typename Extent>
-void DefSubscript2D(Tpy t, Getter&& get, Setter&& set, Extent&& ext) {
+template <typename Tpy, typename Getter, typename Setter, typename Extent> void DefSubscript2D(Tpy t, Getter&& get, Setter&& set, Extent&& ext) {
    using T = typename Tpy::type;
    using V = typename std::invoke_result<Getter, T*, int, int>::type;
    using Coords = std::tuple<int, int>;
@ -243,32 +165,6 @@ void DefSubscript2D(Tpy t, Getter&& get, Setter&& set, Extent&& ext) {
        "key"_a, "value"_a);
 }

-// TODO: untangle this mess
-template <typename Tpy, typename Getter, typename Setter, typename Extent>
-void DefSubscript2DRef(Tpy t, Getter&& get, Setter&& set, Extent&& ext) {
-    using T = typename Tpy::type;
-    using V = typename std::remove_cvref_t<std::invoke_result_t<Getter, T&, int, int>>;
-    using Coords = std::tuple<int, int>;
-    t.def(
-        "__getitem__",
-        [get, ext](T& self, Coords pnt) {
-            Coords s = (self.*ext)();
-            int x = PyIndex(std::get<0>(pnt), std::get<0>(s), "x");
-            int y = PyIndex(std::get<1>(pnt), std::get<1>(s), "y");
-            return get(self, x, y);
-        },
-        "key"_a);
-    t.def(
-        "__setitem__",
-        [set, ext](T& self, Coords pnt, const V& val) {
-            Coords s = (self.*ext)();
-            int x = PyIndex(std::get<0>(pnt), std::get<0>(s), "x");
-            int y = PyIndex(std::get<1>(pnt), std::get<1>(s), "y");
-            set(self, x, y, val);
-        },
-        "key"_a, "value"_a);
-}
-
 template <typename B> py::class_<B> BindBlock(py::module_& m, const char* name) {
    const char* frombytes_doc = R"doc(
        Create a new {0} by copying a bytes-like object.
@ -288,8 +184,7 @@ template <typename B> py::class_<B> BindBlock(py::module_& m, const char* name)
    block.def_readonly_static("width", &B::Width, "The width of the block in pixels.");
    block.def_readonly_static("height", &B::Height, "The height of the block in pixels.");
    block.def_property_readonly_static(
-        "size", [](py::object) { return std::make_tuple(B::Width, B::Height); },
-        "The dimensions of the block in pixels.");
+        "size", [](py::object) { return std::make_tuple(B::Width, B::Height); }, "The dimensions of the block in pixels.");
    block.def_property_readonly_static(
        "nbytes", [](py::object) { return sizeof(B); }, "The size of the block in bytes.");

@ -300,7 +195,7 @@ template <typename B> py::class_<B> BindBlock(py::module_& m, const char* name)
        "tobytes", [](const B& b) { return py::bytes(reinterpret_cast<const char*>(&b), sizeof(B)); },
        Format(tobytes_doc, name, std::to_string(sizeof(B))).c_str());

-    return block;
+    return std::move(block);
 }

 template <typename B> py::class_<BlockTexture<B>> BindBlockTexture(py::module_& m, const char* name) {
@ -328,15 +223,14 @@ template <typename B> py::class_<BlockTexture<B>> BindBlockTexture(py::module_&
    py::class_<BTex, Texture> block_texture(m, name);

    block_texture.def(py::init<int, int>(), "width"_a, "height"_a, Format(constructor_str, name).c_str());
-    block_texture.def_static("from_bytes", &BufferToTexture<BTex>, "data"_a, "width"_a, "height"_a,
-                             Format(from_bytes_str, name).c_str());
+    block_texture.def_static("from_bytes", &BufferToTexture<BTex>, "data"_a, "width"_a, "height"_a, Format(from_bytes_str, name).c_str());

-    block_texture.def_property_readonly("width_blocks", &BTex::bwidth, "The width of the texture in blocks.");
-    block_texture.def_property_readonly("height_blocks", &BTex::bheight, "The height of the texture in blocks.");
-    block_texture.def_property_readonly("size_blocks", &BTex::bsize, "The dimensions of the texture in blocks.");
+    block_texture.def_property_readonly("width_blocks", &BTex::BlocksX, "The width of the texture in blocks.");
+    block_texture.def_property_readonly("height_blocks", &BTex::BlocksY, "The height of the texture in blocks.");
+    block_texture.def_property_readonly("size_blocks", &BTex::BlocksXY, "The dimensions of the texture in blocks.");

-    DefSubscript2D(block_texture, &BTex::get_block, &BTex::set_block, &BTex::bsize);
+    DefSubscript2D(block_texture, &BTex::GetBlock, &BTex::SetBlock, &BTex::BlocksXY);

-    return block_texture;
+    return std::move(block_texture);
 }
 }  // namespace quicktex::bindings
--- a/quicktex/util/bitwiseEnums.h
+++ b/quicktex/util/bitwiseEnums.h
@ -21,48 +21,38 @@

 #include <type_traits>

-namespace quicktex {
-
 // Thanks dkavolis
-template <typename E> requires std::is_enum_v<E>
-constexpr inline auto operator~(E a) noexcept -> E {
+template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator~(E a) noexcept -> E {
    using Base = std::underlying_type_t<E>;
    return static_cast<E>(~static_cast<Base>(a));
 }

-template <typename E> requires std::is_enum_v<E>
-constexpr inline auto operator|(E a, E b) noexcept -> E {
+template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator|(E a, E b) noexcept -> E {
    using Base = std::underlying_type_t<E>;
    return static_cast<E>(static_cast<Base>(a) | static_cast<Base>(b));
 }

-template <typename E> requires std::is_enum_v<E>
-constexpr inline auto operator&(E a, E b) noexcept -> E {
+template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator&(E a, E b) noexcept -> E {
    using Base = std::underlying_type_t<E>;
    return static_cast<E>(static_cast<Base>(a) & static_cast<Base>(b));
 }

-template <typename E> requires std::is_enum_v<E>
-constexpr inline auto operator^(E a, E b) noexcept -> E {
+template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator^(E a, E b) noexcept -> E {
    using Base = std::underlying_type_t<E>;
    return static_cast<E>(static_cast<Base>(a) ^ static_cast<Base>(b));
 }

-template <typename E> requires std::is_enum_v<E>
-constexpr inline auto operator|=(E& a, E b) noexcept -> E& {
+template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator|=(E& a, E b) noexcept -> E& {
    a = a | b;
    return a;
 }

-template <typename E> requires std::is_enum_v<E>
-constexpr inline auto operator&=(E& a, E b) noexcept -> E& {
+template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator&=(E& a, E b) noexcept -> E& {
    a = a & b;
    return a;
 }

-template <typename E> requires std::is_enum_v<E>
-constexpr inline auto operator^=(E& a, E b) noexcept -> E& {
+template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator^=(E& a, E b) noexcept -> E& {
    a = a ^ b;
    return a;
-}
-}  // namespace quicktex
+}
--- a/quicktex/cli/common.py
+++ b/quicktex/cli/common.py
@ -40,7 +40,7 @@ def path_pairs(inputs, output, suffix, extension):
    """

    if len(inputs) < 1:
-        raise click.BadArgumentUsage('No input files were provided.')
+        raise click.BadArgumentUsage('No valid input files were provided.')

    inpaths = [pathlib.Path(i) for i in inputs]

--- a/quicktex/cli/encode.py
+++ b/quicktex/cli/encode.py
@ -36,7 +36,8 @@ def encode():
    help="Output file or directory. If outputting to a file, input filenames must be only a single item. By default, files are decoded in place.",
 )
@click.argument('filenames', nargs=-1, type=click.Path(exists=True, readable=True, dir_okay=False))
-def encode_format(encoder, four_cc, flip, remove, suffix, output, filenames):
+def encode_format(encoder, four_cc, flip, remove, suffix, output, filenames, swizzle=False):
+    filenames = [f for f in filenames if not f.endswith('.dds')]
    path_pairs = common.path_pairs(filenames, output, suffix, '.dds')

    with click.progressbar(
@ -48,6 +49,11 @@ def encode_format(encoder, four_cc, flip, remove, suffix, output, filenames):
            if flip:
                image = image.transpose(Image.FLIP_TOP_BOTTOM)

+            if swizzle:
+                bands = image.split()
+                one = Image.new('L', image.size, 0xFF)
+                image = Image.merge('RGBA', (one, bands[1], bands[1], bands[0]))
+
            dds.encode(image, encoder, four_cc).save(outpath)

            if remove:
@ -107,8 +113,11 @@ def encode_auto(level, black, threecolor, flip, remove, suffix, output, filename

    bc1_encoder = quicktex.s3tc.bc1.BC1Encoder(level, mode)
    bc3_encoder = quicktex.s3tc.bc3.BC3Encoder(level)
+    filenames = [f for f in filenames if not f.endswith('.dds')]
    path_pairs = common.path_pairs(filenames, output, suffix, '.dds')

+    assert len(filenames) > 0
+
    with click.progressbar(
        path_pairs, show_eta=False, show_pos=True, item_show_func=lambda x: str(x[0]) if x else ''
    ) as bar:
@ -175,9 +184,16 @@ def encode_bc1(level, black, threecolor, **kwargs):
    default=18,
    help='Quality level to use. Higher values = higher quality, but slower.',
 )
-def encode_bc3(level, **kwargs):
+@click.option(
+    '-n/-N',
+    '--normal/--no-normal',
+    type=bool,
+    default=False,
+    help='Perform a BC3nm swizzle, copying the red channel into the alpha [default: no-normal]',
+)
+def encode_bc3(level, normal, **kwargs):
    """Encode images to BC4 (RGBA, 8-bit interpolated alpha)."""
-    encode_format.callback(quicktex.s3tc.bc3.BC3Encoder(level), 'DXT5', **kwargs)
+    encode_format.callback(quicktex.s3tc.bc3.BC3Encoder(level), 'DXT5', swizzle=normal, **kwargs)


@click.command('bc4')
--- a/quicktex/dds.py
+++ b/quicktex/dds.py
@ -275,6 +275,7 @@ def read(path: os.PathLike) -> DDSFile:
 def encode(image: Image.Image, encoder, four_cc: str, mip_count: typing.Optional[int] = None) -> DDSFile:
    if image.mode != 'RGBA' or image.mode != 'RGBX':
        mode = 'RGBA' if 'A' in image.mode else 'RGBX'
+        image.apply_transparency()  # why is this necessary what
        image = image.convert(mode)

    sizes = quicktex.image_utils.mip_sizes(image.size, mip_count)
--- a/quicktex/s3tc/bc1/BC1Block.cpp
+++ b/quicktex/s3tc/bc1/BC1Block.cpp
@ -20,35 +20,28 @@
 #include "BC1Block.h"

 #include <stdexcept>
+#include <algorithm>

-#include "util/bitbash.h"
-#include "util/map.h"
-#include "util/math.h"
-#include "util/ranges.h"
+#include "../../util.h"

 namespace quicktex::s3tc {
+uint16_t BC1Block::GetColor0Raw() const { return Pack<uint8_t, uint16_t, 8, EndpointSize>(_color0); }
+uint16_t BC1Block::GetColor1Raw() const { return Pack<uint8_t, uint16_t, 8, EndpointSize>(_color1); }

-uint16_t BC1Block::GetColor0Raw() const { return pack<uint16_t>(_color0, 8); }
-uint16_t BC1Block::GetColor1Raw() const { return pack<uint16_t>(_color1, 8); }
+void BC1Block::SetColor0Raw(uint16_t c) { _color0 = Unpack<uint16_t, uint8_t, 8, EndpointSize>(c); }
+void BC1Block::SetColor1Raw(uint16_t c) { _color1 = Unpack<uint16_t, uint8_t, 8, EndpointSize>(c); }

-void BC1Block::SetColor0Raw(uint16_t c) { _color0 = unpack<uint8_t, EndpointSize>(c, 8); }
-void BC1Block::SetColor1Raw(uint16_t c) { _color1 = unpack<uint8_t, EndpointSize>(c, 8); }
-
-BC1Block::SelectorArray BC1Block::GetSelectors() const {
-    return map([](auto row) { return unpack<uint8_t, Width>(row, SelectorBits); }, _selectors);
-}
+BC1Block::SelectorArray BC1Block::GetSelectors() const { return MapArray(_selectors, Unpack<uint8_t, uint8_t, SelectorBits, Width>); }

 void BC1Block::SetSelectors(const BC1Block::SelectorArray& unpacked) {
    for (unsigned y = 0; y < (unsigned)Height; y++) {
        if (std::any_of(unpacked[y].begin(), unpacked[y].end(), [](uint8_t i) { return i > SelectorMax; }))
            throw std::invalid_argument("Selector value out of bounds.");
    }
-    _selectors = map([](auto row) { return pack<uint8_t>(row, SelectorBits, true); }, unpacked);
+    _selectors = MapArray(unpacked, Pack<uint8_t, uint8_t, SelectorBits, Width>);
 }

-bool BC1Block::operator==(const BC1Block& Rhs) const {
-    return _color0 == Rhs._color0 && _color1 == Rhs._color1 && _selectors == Rhs._selectors;
-}
+bool BC1Block::operator==(const BC1Block& Rhs) const { return _color0 == Rhs._color0 && _color1 == Rhs._color1 && _selectors == Rhs._selectors; }
 bool BC1Block::operator!=(const BC1Block& Rhs) const { return !(Rhs == *this); }

 }  // namespace quicktex::s3tc
--- a/quicktex/s3tc/bc1/BC1Block.h
+++ b/quicktex/s3tc/bc1/BC1Block.h
@ -24,7 +24,7 @@
 #include <cstdlib>
 #include <utility>

-#include "OldColor.h"
+#include "../../Color.h"

 namespace quicktex::s3tc {

@ -39,7 +39,7 @@ class alignas(8) BC1Block {
    static constexpr uint8_t SelectorMax = (1 << SelectorBits) - 1;  // maximum value of a selector

    using SelectorArray = std::array<std::array<uint8_t, Width>, Height>;
-    using ColorPair = std::pair<OldColor, OldColor>;
+    using ColorPair = std::pair<Color, Color>;

   private:
    std::array<uint8_t, EndpointSize> _color0;
@ -60,7 +60,7 @@ class alignas(8) BC1Block {
     * @param color1 second endpoint color
     * @param selectors the selectors as a 4x4 list of integers, between 0 and 3 inclusive.
     */
-    BC1Block(OldColor color0, OldColor color1, const SelectorArray& selectors) {
+    BC1Block(Color color0, Color color1, const SelectorArray& selectors) {
        SetColor0(color0);
        SetColor1(color1);
        SetSelectors(selectors);
@ -96,12 +96,12 @@ class alignas(8) BC1Block {
    void SetColor0Raw(uint16_t c);
    void SetColor1Raw(uint16_t c);

-    OldColor GetColor0() const { return OldColor::Unpack565(GetColor0Raw()); }
-    OldColor GetColor1() const { return OldColor::Unpack565(GetColor1Raw()); }
+    Color GetColor0() const { return Color::Unpack565(GetColor0Raw()); }
+    Color GetColor1() const { return Color::Unpack565(GetColor1Raw()); }
    ColorPair GetColors() const { return {GetColor0(), GetColor1()}; }

-    void SetColor0(OldColor c) { SetColor0Raw(c.Pack565()); }
-    void SetColor1(OldColor c) { SetColor1Raw(c.Pack565()); }
+    void SetColor0(Color c) { SetColor0Raw(c.Pack565()); }
+    void SetColor1(Color c) { SetColor1Raw(c.Pack565()); }
    void SetColors(ColorPair cs) {
        SetColor0(cs.first);
        SetColor1(cs.second);
--- a/quicktex/s3tc/bc1/BC1Decoder.cpp
+++ b/quicktex/s3tc/bc1/BC1Decoder.cpp
@ -23,9 +23,9 @@
 #include <cassert>
 #include <cstdint>

-#include "ColorBlock.h"
-#include "OldColor.h"
-#include "s3tc/bc1/BC1Block.h"
+#include "../../Color.h"
+#include "../../ColorBlock.h"
+#include "BC1Block.h"

 namespace quicktex::s3tc {

--- a/quicktex/s3tc/bc1/BC1Decoder.h
+++ b/quicktex/s3tc/bc1/BC1Decoder.h
@ -21,19 +21,18 @@

 #include <memory>

-#include "ColorBlock.h"
-#include "Decoder.h"
-#include "s3tc/bc1/BC1Block.h"
-#include "s3tc/interpolator/Interpolator.h"
-#include "texture/BlockTexture.h"
+#include "../../ColorBlock.h"
+#include "../../Decoder.h"
+#include "../../Texture.h"
+#include "../interpolator/Interpolator.h"
+#include "BC1Block.h"

 namespace quicktex::s3tc {
 class BC1Decoder final : public BlockDecoder<BlockTexture<BC1Block>> {
   public:
    using InterpolatorPtr = std::shared_ptr<Interpolator>;

-    BC1Decoder(bool vwrite_alpha, InterpolatorPtr interpolator)
-        : write_alpha(vwrite_alpha), _interpolator(interpolator) {}
+    BC1Decoder(bool vwrite_alpha, InterpolatorPtr interpolator) : write_alpha(vwrite_alpha), _interpolator(interpolator) {}

    BC1Decoder(bool vwrite_alpha = false) : BC1Decoder(vwrite_alpha, std::make_shared<Interpolator>()) {}

--- a/quicktex/s3tc/bc1/BC1Encoder.cpp
+++ b/quicktex/s3tc/bc1/BC1Encoder.cpp
@ -29,28 +29,24 @@
 #include <stdexcept>
 #include <type_traits>

-#include "ColorBlock.h"
+#include "../../Color.h"
+#include "../../ColorBlock.h"
+#include "../../Matrix4x4.h"
+#include "../../Texture.h"
+#include "../../Vector4.h"
+#include "../../Vector4Int.h"
+#include "../../bitwiseEnums.h"
+#include "../../util.h"
 #include "Histogram.h"
-#include "Matrix4x4.h"
-#include "OldColor.h"
-#include "Vector4.h"
-#include "Vector4Int.h"
-#include "s3tc/bc1/BC1Block.h"
-#include "s3tc/bc1/OrderTable.h"
-#include "s3tc/bc1/SingleColorTable.h"
-#include "texture/Texture.h"
-#include "util/bitbash.h"
-#include "util/bitwiseEnums.h"
-#include "util/math.h"
+#include "OrderTable.h"
+#include "SingleColorTable.h"

 namespace quicktex::s3tc {

 // constructors

-BC1Encoder::BC1Encoder(unsigned int level, ColorMode color_mode, InterpolatorPtr interpolator)
-    : _interpolator(interpolator), _color_mode(color_mode) {
-    if (color_mode != ColorMode::FourColor && color_mode != ColorMode::ThreeColor &&
-        color_mode != ColorMode::ThreeColorBlack) {
+BC1Encoder::BC1Encoder(unsigned int level, ColorMode color_mode, InterpolatorPtr interpolator) : _interpolator(interpolator), _color_mode(color_mode) {
+    if (color_mode != ColorMode::FourColor && color_mode != ColorMode::ThreeColor && color_mode != ColorMode::ThreeColorBlack) {
        throw std::invalid_argument("Encoder color mode must be FourColor, ThreeColor, or ThreeColorBlack");
    }

@ -77,9 +73,7 @@ BC1Encoder::BC1Encoder(unsigned int level, ColorMode color_mode, InterpolatorPtr

 // Getters and Setters
 void BC1Encoder::SetLevel(unsigned level) {
-    if (level > 19)
-        throw std::invalid_argument(
-            "Level out of range, bust be between 0 and 18 inclusive");  // theres a secret level 19 but shhhhhh
+    if (level > 19) throw std::invalid_argument("Level out of range, bust be between 0 and 18 inclusive");  // theres a secret level 19 but shhhhhh

    two_ls_passes = false;
    two_ep_passes = false;
@ -255,20 +249,14 @@ void BC1Encoder::SetLevel(unsigned level) {
    _orderings3 = clamp(_orderings3, 1U, OrderTable<3>::BestOrderCount);
 }

-void BC1Encoder::SetOrderings4(unsigned orderings4) {
-    _orderings4 = clamp(orderings4, 1U, OrderTable<4>::BestOrderCount);
-}
-void BC1Encoder::SetOrderings3(unsigned orderings3) {
-    _orderings3 = clamp(orderings3, 1U, OrderTable<3>::BestOrderCount);
-}
+void BC1Encoder::SetOrderings4(unsigned orderings4) { _orderings4 = clamp(orderings4, 1U, OrderTable<4>::BestOrderCount); }
+void BC1Encoder::SetOrderings3(unsigned orderings3) { _orderings3 = clamp(orderings3, 1U, OrderTable<3>::BestOrderCount); }
 void BC1Encoder::SetOrderings(OrderingPair orderings) {
    SetOrderings4(std::get<0>(orderings));
    SetOrderings3(std::get<1>(orderings));
 }

-void BC1Encoder::SetPowerIterations(unsigned int power_iters) {
-    _power_iterations = clamp(power_iters, min_power_iterations, max_power_iterations);
-}
+void BC1Encoder::SetPowerIterations(unsigned int power_iters) { _power_iterations = clamp(power_iters, min_power_iterations, max_power_iterations); }

 // Public methods
 BC1Block BC1Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
@ -316,9 +304,7 @@ BC1Block BC1Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {

    // First refinement pass using ordered cluster fit
    if (result.error > 0 && use_likely_orderings) {
-        for (unsigned iter = 0; iter < total_cf_passes; iter++) {
-            RefineBlockCF<ColorMode::FourColor>(result, pixels, metrics, _error_mode, _orderings4);
-        }
+        for (unsigned iter = 0; iter < total_cf_passes; iter++) { RefineBlockCF<ColorMode::FourColor>(result, pixels, metrics, _error_mode, _orderings4); }
    }

    // try for 3-color block
@ -339,15 +325,13 @@ BC1Block BC1Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
    }

    // try for 3-color block with black
-    if (result.error > 0 && (_color_mode == ColorMode::ThreeColorBlack) && metrics.has_black &&
-        !metrics.max.IsBlack()) {
+    if (result.error > 0 && (_color_mode == ColorMode::ThreeColorBlack) && metrics.has_black && !metrics.max.IsBlack()) {
        EncodeResults trial_result;
        BlockMetrics metrics_no_black = pixels.GetMetrics(true);

        FindEndpoints(trial_result, pixels, metrics_no_black, EndpointMode::PCA, true);
        FindSelectors<ColorMode::ThreeColorBlack>(trial_result, pixels, ErrorMode::Full);
-        RefineBlockLS<ColorMode::ThreeColorBlack>(trial_result, pixels, metrics_no_black, ErrorMode::Full,
-                                                  total_ls_passes);
+        RefineBlockLS<ColorMode::ThreeColorBlack>(trial_result, pixels, metrics_no_black, ErrorMode::Full, total_ls_passes);

        if (trial_result.error < result.error) { result = trial_result; }
    }
@ -359,7 +343,7 @@ BC1Block BC1Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
 }

 // Private methods
-BC1Block BC1Encoder::WriteBlockSolid(OldColor color) const {
+BC1Block BC1Encoder::WriteBlockSolid(Color color) const {
    uint8_t mask = 0xAA;  // 2222
    uint16_t min16, max16;

@ -457,7 +441,7 @@ BC1Block BC1Encoder::WriteBlock(EncodeResults &result) const {
    return BC1Block(ep0, ep1, selectors);
 }

-void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, OldColor color, bool is_3color) const {
+void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, Color color, bool is_3color) const {
    auto &match5 = is_3color ? _single_match5_half : _single_match5;
    auto &match6 = is_3color ? _single_match6_half : _single_match6;

@ -467,14 +451,13 @@ void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, OldColor color,

    result.color_mode = is_3color ? ColorMode::ThreeColor : ColorMode::FourColor;
    result.error = match_r.error + match_g.error + match_b.error;
-    result.low = OldColor(match_r.low, match_g.low, match_b.low);
-    result.high = OldColor(match_r.high, match_g.high, match_b.high);
+    result.low = Color(match_r.low, match_g.low, match_b.low);
+    result.high = Color(match_r.high, match_g.high, match_b.high);
    // selectors decided when writing, no point deciding them now
 }

-void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, const CBlock &pixels, OldColor color,
-                                          bool is_3color) const {
-    std::array<OldColor, 4> colors = _interpolator->InterpolateBC1(result.low, result.high, is_3color);
+void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, const CBlock &pixels, Color color, bool is_3color) const {
+    std::array<Color, 4> colors = _interpolator->InterpolateBC1(result.low, result.high, is_3color);
    Vector4Int result_vector = (Vector4Int)colors[2];

    FindEndpointsSingleColor(result, color, is_3color);
@ -488,43 +471,40 @@ void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, const CBlock &p
    }
 }

-void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics,
-                               EndpointMode endpoint_mode, bool ignore_black) const {
+void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, EndpointMode endpoint_mode, bool ignore_black) const {
    if (metrics.is_greyscale) {
        // specialized greyscale case
        const unsigned fr = pixels.Get(0, 0).r;

        if (metrics.max.r - metrics.min.r < 2) {
            // single color block
-            uint8_t fr5 = (uint8_t)scale_from_8<5>(fr);
-            uint8_t fr6 = (uint8_t)scale_from_8<6>(fr);
+            uint8_t fr5 = (uint8_t)scale8To5(fr);
+            uint8_t fr6 = (uint8_t)scale8To6(fr);

-            result.low = OldColor(fr5, fr6, fr5);
+            result.low = Color(fr5, fr6, fr5);
            result.high = result.low;
        } else {
-            uint8_t lr5 = scale_from_8<5>(metrics.min.r);
-            uint8_t lr6 = scale_from_8<6>(metrics.min.r);
+            uint8_t lr5 = scale8To5(metrics.min.r);
+            uint8_t lr6 = scale8To6(metrics.min.r);

-            uint8_t hr5 = scale_from_8<5>(metrics.max.r);
-            uint8_t hr6 = scale_from_8<6>(metrics.max.r);
+            uint8_t hr5 = scale8To5(metrics.max.r);
+            uint8_t hr6 = scale8To6(metrics.max.r);

-            result.low = OldColor(lr5, lr6, lr5);
-            result.high = OldColor(hr5, hr6, hr5);
+            result.low = Color(lr5, lr6, lr5);
+            result.high = Color(hr5, hr6, hr5);
        }
    } else if (endpoint_mode == EndpointMode::LeastSquares) {
        //  2D Least Squares approach from Humus's example, with added inset and optimal rounding.
-        OldColor diff =
-            OldColor(metrics.max.r - metrics.min.r, metrics.max.g - metrics.min.g, metrics.max.b - metrics.min.b);
+        Color diff = Color(metrics.max.r - metrics.min.r, metrics.max.g - metrics.min.g, metrics.max.b - metrics.min.b);
        Vector4 l = {0, 0, 0};
        Vector4 h = {0, 0, 0};

        auto &sums = metrics.sums;
        auto &min = metrics.min;
-        auto &max = metrics.max;

        unsigned chan0 = (unsigned)diff.MaxChannelRGB();  // primary axis of the bounding box
        l[chan0] = (float)min[chan0];
-        h[chan0] = (float)max[chan0];
+        h[chan0] = (float)metrics.max[chan0];  // high endpoint starts at the primary channel's maximum (l holds the minimum)

        assert((diff[chan0] >= diff[(chan0 + 1) % 3]) && (diff[chan0] >= diff[(chan0 + 2) % 3]));

@ -541,7 +521,7 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
        float denominator = (float)(16 * sum_xx) - (float)(sum_x * sum_x);

        // once per secondary axis, calculate high and low using least squares
-        if (abs(denominator) > 1e-8f) {
+        if (fabs(denominator) > 1e-8f) {
            for (unsigned i = 1; i < 3; i++) {
                /* each secondary axis is fitted with a linear formula of the form
                 *  y = ax + b
@ -569,8 +549,8 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
            h[c] = ((h[c] - inset) / 255.0f);
        }

-        result.low = OldColor::PreciseRound565(l);
-        result.high = OldColor::PreciseRound565(h);
+        result.low = Color::PreciseRound565(l);
+        result.high = Color::PreciseRound565(h);
    } else if (endpoint_mode == EndpointMode::BoundingBox) {
        // Algorithm from icbc.h compress_dxt1_fast()
        Vector4 l, h;
@ -597,20 +577,19 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
        if (icov_xz < 0) std::swap(l[0], h[0]);
        if (icov_yz < 0) std::swap(l[1], h[1]);

-        result.low = OldColor::PreciseRound565(l);
-        result.high = OldColor::PreciseRound565(h);
+        result.low = Color::PreciseRound565(l);
+        result.high = Color::PreciseRound565(h);
    } else if (endpoint_mode == EndpointMode::BoundingBoxInt) {
        // Algorithm from icbc.h compress_dxt1_fast(), but converted to integer.
-        // TODO: handle constant blue channel better

-        OldColor min, max;
+        Color min, max;

        // rescale and inset values
        for (unsigned c = 0; c < 3; c++) {
            int inset = ((int)(metrics.max[c] - metrics.min[c]) - 8) >> 4;  // 1/16 of delta, with bias

-            min[c] = clamp(metrics.min[c] + inset, 0, 255);
-            max[c] = clamp(metrics.max[c] - inset, 0, 255);
+            min[c] = clamp255(metrics.min[c] + inset);
+            max[c] = clamp255(metrics.max[c] - inset);
        }

        int icov_xz = 0, icov_yz = 0;
@ -628,21 +607,19 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
    } else if (endpoint_mode == EndpointMode::PCA) {
        // the slow way
        // Select 2 colors along the principle axis. (There must be a faster/simpler way.)
+        auto min = Vector4::FromColorRGB(metrics.min);
+        auto max = Vector4::FromColorRGB(metrics.max);
+        auto avg = Vector4::FromColorRGB(metrics.avg);

-        // TODO: handle constant blue channel better
-
-        Color min = metrics.min;
-        Color max = metrics.max;
-        Color avg = metrics.avg;
-
-        Vec<float, 4> axis = {306, 601, 117, 0};  // Luma vector
-        auto covariance = Matrix<float, 4, 4>::identity();
+        Vector4 axis = {306, 601, 117};  // Luma vector
+        Matrix4x4 covariance = Matrix4x4::Identity();

        for (int i = 0; i < 16; i++) {
            auto val = pixels.Get(i);
            if (ignore_black && val.IsBlack()) continue;

-            auto diff = val - avg;
+            auto color_vec = Vector4::FromColorRGB(val);
+            Vector4 diff = color_vec - avg;
            for (unsigned c1 = 0; c1 < 3; c1++) {
                for (unsigned c2 = c1; c2 < 3; c2++) {
                    covariance[c1][c2] += (diff[c1] * diff[c2]);
@ -652,24 +629,20 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
        }

        covariance /= 255.0f;
-        covariance = covariance.mirror();
+        covariance.Mirror();

-        Vec<float, 4> delta = max - min;
+        Vector4 delta = max - min;

        // realign r and g axes to match
        if (covariance[0][2] < 0) delta[0] = -delta[0];  // r vs b
        if (covariance[1][2] < 0) delta[1] = -delta[1];  // g vs b

-        // using the covariance matrix, stretch the delta vector towards the primary axis of the data using power
-        // iteration the end result of this may actually be the same as the least squares approach, will have to do more
-        // research
-        for (unsigned power_iter = 0; power_iter < _power_iterations; power_iter++) {
-            delta = covariance.mult(delta);
-        }
+        // using the covariance matrix, stretch the delta vector towards the primary axis of the data using power iteration
+        // the end result of this may actually be the same as the least squares approach, will have to do more research
+        for (unsigned power_iter = 0; power_iter < _power_iterations; power_iter++) { delta = covariance * delta; }

        // if we found any correlation, then this is our new axis. otherwise we fallback to the luma vector
-        auto delta_abs = delta.abs();
-        float k = *std::max_element(delta_abs.begin(), delta_abs.end());
+        float k = delta.MaxAbs(3);
        if (k >= 2) { axis = delta * (2048.0f / k); }

        axis *= 16;
@ -680,12 +653,13 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
        int min_index = 0, max_index = 0;

        for (int i = 0; i < 16; i++) {
-            Color val = pixels.Get(i); //todo: fix this mess
-            if (ignore_black && (val.r() | val.g() | val.b()) < 4) continue;
+            auto val = pixels.Get(i);
+            if (ignore_black && val.IsBlack()) continue;

+            auto color_vec = Vector4::FromColorRGB(val);
            // since axis is constant here, I dont think its magnitude actually matters,
            // since we only care about the min or max dot product
-            float dot = (Vec<float,4>(val)).dot(axis);
+            float dot = color_vec.Dot(axis);
            if (dot > max_dot) {
                max_dot = dot;
                max_index = i;
@ -703,21 +677,20 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
    result.color_mode = ColorMode::Incomplete;
 }

-template <BC1Encoder::ColorMode M>
-void BC1Encoder::FindSelectors(EncodeResults &result, const CBlock &pixels, ErrorMode error_mode) const {
+template <BC1Encoder::ColorMode M> void BC1Encoder::FindSelectors(EncodeResults &result, const CBlock &pixels, ErrorMode error_mode) const {
    assert(!((error_mode != ErrorMode::Full) && (bool)(M & ColorMode::ThreeColor)));

    const int color_count = (unsigned)M & 0x0F;

-    std::array<OldColor, 4> colors = _interpolator->InterpolateBC1(result.low, result.high, color_count == 3);
+    std::array<Color, 4> colors = _interpolator->InterpolateBC1(result.low, result.high, color_count == 3);
    std::array<Vector4Int, 4> color_vectors;

    if (color_count == 4) {
-        color_vectors = {Vector4Int::FromColorRGB(colors[0]), Vector4Int::FromColorRGB(colors[2]),
-                         Vector4Int::FromColorRGB(colors[3]), Vector4Int::FromColorRGB(colors[1])};
+        color_vectors = {Vector4Int::FromColorRGB(colors[0]), Vector4Int::FromColorRGB(colors[2]), Vector4Int::FromColorRGB(colors[3]),
+                         Vector4Int::FromColorRGB(colors[1])};
    } else {
-        color_vectors = {Vector4Int::FromColorRGB(colors[0]), Vector4Int::FromColorRGB(colors[2]),
-                         Vector4Int::FromColorRGB(colors[1]), Vector4Int::FromColorRGB(colors[3])};
+        color_vectors = {Vector4Int::FromColorRGB(colors[0]), Vector4Int::FromColorRGB(colors[2]), Vector4Int::FromColorRGB(colors[1]),
+                         Vector4Int::FromColorRGB(colors[3])};
    }

    unsigned total_error = 0;
@ -741,8 +714,7 @@ void BC1Encoder::FindSelectors(EncodeResults &result, const CBlock &pixels, Erro
                // llvm is just going to unswitch this anyways so its not an issue
                auto diff = pixel_vector - color_vectors[selector];
                total_error += diff.SqrMag();
-                if (i % 4 != 0 && total_error >= result.error)
-                    break;  // check only once per row if we're generating too much error
+                if (i % 4 != 0 && total_error >= result.error) break;  // check only once per row if we're generating too much error
            }

            result.selectors[i] = selector;
@ -755,7 +727,7 @@ void BC1Encoder::FindSelectors(EncodeResults &result, const CBlock &pixels, Erro
            Vector4Int pixel_vector = Vector4Int::FromColorRGB(pixels.Get(i));
            auto diff = pixel_vector - color_vectors[0];
            float sel_f = (float)diff.Dot(axis) * f + 0.5f;
-            uint8_t sel = (uint8_t)clamp<int>((int)sel_f, 1, 3);
+            uint8_t sel = (uint8_t)clampi((int)sel_f, 1, 3);

            unsigned err0 = (color_vectors[sel - 1] - pixel_vector).SqrMag();
            unsigned err1 = (color_vectors[sel] - pixel_vector).SqrMag();
@ -807,8 +779,7 @@ void BC1Encoder::FindSelectors(EncodeResults &result, const CBlock &pixels, Erro
    result.color_mode = M;
 }

-template <BC1Encoder::ColorMode M>
-bool BC1Encoder::RefineEndpointsLS(EncodeResults &result, const CBlock &pixels, BlockMetrics metrics) const {
+template <BC1Encoder::ColorMode M> bool BC1Encoder::RefineEndpointsLS(EncodeResults &result, const CBlock &pixels, BlockMetrics metrics) const {
    const int color_count = (unsigned)M & 0x0F;
    static_assert(color_count == 3 || color_count == 4);
    assert(result.color_mode != ColorMode::Incomplete);
@ -819,12 +790,11 @@ bool BC1Encoder::RefineEndpointsLS(EncodeResults &result, const CBlock &pixels,
    Vector4 matrix = Vector4(0);

    for (int i = 0; i < 16; i++) {
-        const OldColor color = pixels.Get(i);
+        const Color color = pixels.Get(i);
        const uint8_t sel = result.selectors[i];

        if ((bool)(M & ColorMode::ThreeColorBlack) && color.IsBlack()) continue;
-        if ((bool)(M & ColorMode::ThreeColor) && sel == 3U)
-            continue;  // NOTE: selectors for 3-color are in linear order here, but not in original
+        if ((bool)(M & ColorMode::ThreeColor) && sel == 3U) continue;  // NOTE: selectors for 3-color are in linear order here, but not in original
        assert(sel < color_count);

        const Vector4Int color_vector = Vector4Int::FromColorRGB(color);
@ -835,7 +805,7 @@ bool BC1Encoder::RefineEndpointsLS(EncodeResults &result, const CBlock &pixels,

    // invert matrix
    float det = matrix.Determinant2x2();  // z00 * z11 - z01 * z10;
-    if (abs(det) < 1e-8f) {
+    if (fabs(det) < 1e-8f) {
        result.color_mode = ColorMode::Incomplete;
        return false;
    }
@ -850,14 +820,12 @@ bool BC1Encoder::RefineEndpointsLS(EncodeResults &result, const CBlock &pixels,
    Vector4 high = (matrix[2] * q00) + (matrix[3] * q10);

    result.color_mode = M;
-    result.low = OldColor::PreciseRound565(low);
-    result.high = OldColor::PreciseRound565(high);
+    result.low = Color::PreciseRound565(low);
+    result.high = Color::PreciseRound565(high);
    return true;
 }

-template <BC1Encoder::ColorMode M>
-void BC1Encoder::RefineEndpointsLS(EncodeResults &result, std::array<Vector4, 17> &sums, Vector4 &matrix,
-                                   Hash hash) const {
+template <BC1Encoder::ColorMode M> void BC1Encoder::RefineEndpointsLS(EncodeResults &result, std::array<Vector4, 17> &sums, Vector4 &matrix, Hash hash) const {
    const int color_count = (unsigned)M & 0x0F;
    static_assert(color_count == 3 || color_count == 4);
    assert(result.color_mode != ColorMode::Incomplete);
@ -878,13 +846,12 @@ void BC1Encoder::RefineEndpointsLS(EncodeResults &result, std::array<Vector4, 17
    Vector4 high = (matrix[2] * q00) + (matrix[3] * q10);

    result.color_mode = M;
-    result.low = OldColor::PreciseRound565(low);
-    result.high = OldColor::PreciseRound565(high);
+    result.low = Color::PreciseRound565(low);
+    result.high = Color::PreciseRound565(high);
 }

 template <BC1Encoder::ColorMode M>
-void BC1Encoder::RefineBlockLS(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics,
-                               ErrorMode error_mode, unsigned passes) const {
+void BC1Encoder::RefineBlockLS(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode, unsigned passes) const {
    assert(error_mode != ErrorMode::None || passes == 1);

    for (unsigned pass = 0; pass < passes; pass++) {
@ -909,8 +876,7 @@ void BC1Encoder::RefineBlockLS(EncodeResults &result, const CBlock &pixels, cons
 }

 template <BC1Encoder::ColorMode M>
-void BC1Encoder::RefineBlockCF(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics,
-                               ErrorMode error_mode, unsigned orderings) const {
+void BC1Encoder::RefineBlockCF(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode, unsigned orderings) const {
    const int color_count = (unsigned)M & 0x0F;
    static_assert(color_count == 3 || color_count == 4);
    assert(result.color_mode != ColorMode::Incomplete);
@ -989,8 +955,7 @@ void BC1Encoder::EndpointSearch(EncodeResults &result, const CBlock &pixels) con

    for (unsigned i = 0; i < _search_rounds; i++) {
        const unsigned voxel_index = (unsigned)(i & 15);
-        assert((unsigned)Voxels[(unsigned)Voxels[voxel_index][3]][3] ==
-               voxel_index);  // make sure voxels are symmetrical
+        assert((unsigned)Voxels[(unsigned)Voxels[voxel_index][3]][3] == voxel_index);  // make sure voxels are symmetrical

        if ((int)(i & 31) == forbidden_direction) continue;

--- a/quicktex/s3tc/bc1/BC1Encoder.h
+++ b/quicktex/s3tc/bc1/BC1Encoder.h
@ -26,13 +26,13 @@
 #include <memory>
 #include <tuple>

-#include "ColorBlock.h"
-#include "Encoder.h"
-#include "OldColor.h"
-#include "s3tc/bc1/BC1Block.h"
-#include "s3tc/bc1/SingleColorTable.h"
-#include "s3tc/interpolator/Interpolator.h"
-#include "texture/BlockTexture.h"
+#include "../../Color.h"
+#include "../../ColorBlock.h"
+#include "../../Encoder.h"
+#include "../../Texture.h"
+#include "../interpolator/Interpolator.h"
+#include "BC1Block.h"
+#include "SingleColorTable.h"

 namespace quicktex {
 class Vector4;
@ -79,8 +79,7 @@ class BC1Encoder final : public BlockEncoder<BlockTexture<BC1Block>> {
    };

    enum class EndpointMode {
-        // Use 2D least squares+inset+optimal rounding (the method used in Humus's GPU texture encoding demo), instead
-        // of PCA.
+        // Use 2D least squares+inset+optimal rounding (the method used in Humus's GPU texture encoding demo), instead of PCA.
        // Around 18% faster, very slightly lower average quality to better (depends on the content).
        LeastSquares,

@ -102,8 +101,7 @@ class BC1Encoder final : public BlockEncoder<BlockTexture<BC1Block>> {

    BC1Encoder(unsigned level, ColorMode color_mode, InterpolatorPtr interpolator);

-    BC1Encoder(unsigned int level = 5, ColorMode color_mode = ColorMode::FourColor)
-        : BC1Encoder(level, color_mode, std::make_shared<Interpolator>()) {}
+    BC1Encoder(unsigned int level = 5, ColorMode color_mode = ColorMode::FourColor) : BC1Encoder(level, color_mode, std::make_shared<Interpolator>()) {}

    // Getters and Setters
    void SetLevel(unsigned level);
@ -143,8 +141,8 @@ class BC1Encoder final : public BlockEncoder<BlockTexture<BC1Block>> {

    // Unpacked BC1 block with metadata
    struct EncodeResults {
-        OldColor low;
-        OldColor high;
+        Color low;
+        Color high;
        std::array<uint8_t, 16> selectors = {0};
        ColorMode color_mode = ColorMode::Incomplete;
        bool solid = false;
@ -171,29 +169,24 @@ class BC1Encoder final : public BlockEncoder<BlockTexture<BC1Block>> {
    unsigned _orderings4;
    unsigned _orderings3;

-    BC1Block WriteBlockSolid(OldColor color) const;
+    BC1Block WriteBlockSolid(Color color) const;
    BC1Block WriteBlock(EncodeResults &result) const;

-    void FindEndpoints(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics,
-                       EndpointMode endpoint_mode, bool ignore_black = false) const;
-    void FindEndpointsSingleColor(EncodeResults &result, OldColor color, bool is_3color = false) const;
-    void FindEndpointsSingleColor(EncodeResults &result, const CBlock &pixels, OldColor color, bool is_3color) const;
+    void FindEndpoints(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, EndpointMode endpoint_mode, bool ignore_black = false) const;
+    void FindEndpointsSingleColor(EncodeResults &result, Color color, bool is_3color = false) const;
+    void FindEndpointsSingleColor(EncodeResults &result, const CBlock &pixels, Color color, bool is_3color) const;

    template <ColorMode M> void FindSelectors(EncodeResults &result, const CBlock &pixels, ErrorMode error_mode) const;

-    template <ColorMode M>
-    bool RefineEndpointsLS(EncodeResults &result, const CBlock &pixels, BlockMetrics metrics) const;
+    template <ColorMode M> bool RefineEndpointsLS(EncodeResults &result, const CBlock &pixels, BlockMetrics metrics) const;
+
+    template <ColorMode M> void RefineEndpointsLS(EncodeResults &result, std::array<Vector4, 17> &sums, Vector4 &matrix, Hash hash) const;

    template <ColorMode M>
-    void RefineEndpointsLS(EncodeResults &result, std::array<Vector4, 17> &sums, Vector4 &matrix, Hash hash) const;
+    void RefineBlockLS(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode, unsigned passes) const;

    template <ColorMode M>
-    void RefineBlockLS(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode,
-                       unsigned passes) const;
-
-    template <ColorMode M>
-    void RefineBlockCF(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode,
-                       unsigned orderings) const;
+    void RefineBlockCF(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode, unsigned orderings) const;

    void EndpointSearch(EncodeResults &result, const CBlock &pixels) const;
 };
--- a/quicktex/s3tc/bc1/Histogram.h
+++ b/quicktex/s3tc/bc1/Histogram.h
@ -27,10 +27,10 @@
 #include <mutex>
 #include <numeric>

-#include "Vector4.h"
-#include "util/math.h"
+#include "../../Vector4.h"
+#include "../../util.h"

-namespace quicktex::s3tc {
+namespace quicktex::s3tc  {
 template <size_t N> class Histogram {
   public:
    using Hash = uint16_t;
@ -71,7 +71,7 @@ template <size_t N> class Histogram {
    unsigned GetPacked() const {
        Hash packed = 0;

-        for (unsigned i = 0; i < (N - 1); i++) {
+        for (unsigned i = 0; i < (N-1); i++) {
            assert(_bins[i] <= (1U << 4) - 1U);
            packed |= static_cast<uint16_t>(_bins[i]) << (i * 4U);
        }
--- a/quicktex/s3tc/bc1/OrderTable.cpp
+++ b/quicktex/s3tc/bc1/OrderTable.cpp
@ -21,7 +21,7 @@

 #include <array>

-#include "Vector4.h"
+#include "../../Vector4.h"

 namespace quicktex::s3tc  {
 using Hash = uint16_t;
--- a/quicktex/s3tc/bc1/OrderTable.h
+++ b/quicktex/s3tc/bc1/OrderTable.h
@ -29,9 +29,8 @@
 #include <mutex>
 #include <type_traits>

+#include "../../Vector4.h"
 #include "Histogram.h"
-#include "Vector4.h"
-#include "util/math.h"

 namespace quicktex::s3tc {
 template <size_t N> class OrderTable {
@ -59,7 +58,7 @@ template <size_t N> class OrderTable {
    static bool Generate() {
        static_assert(N == 4 || N == 3);

-        table_mutex.lock();
+        std::scoped_lock lock{table_mutex};  // named guard: an unnamed temporary would unlock at the end of this statement
        if (!generated) {
            hashes = new std::array<Hash, HashCount>();
            factors = new std::array<Vector4, OrderCount>();
@ -74,7 +73,7 @@ template <size_t N> class OrderTable {
                for (unsigned sel = 0; sel < N; sel++) factor_matrix += (Weights[sel] * h[sel]);

                float det = factor_matrix.Determinant2x2();
-                if (abs(det) < 1e-8f) {
+                if (fabs(det) < 1e-8f) {
                    factors->at(i) = Vector4(0);
                } else {
                    std::swap(factor_matrix[0], factor_matrix[3]);
@ -86,8 +85,6 @@ template <size_t N> class OrderTable {

            generated = true;
        }
-        table_mutex.unlock();
-
        assert(generated);
        return true;
    }
@ -114,9 +111,7 @@ template <size_t N> class OrderTable {
        return factors->at(hash);
    }

-    static bool IsSingleColor(Hash hash) {
-        return (std::find(SingleColorHashes.begin(), SingleColorHashes.end(), hash) != SingleColorHashes.end());
-    }
+    static bool IsSingleColor(Hash hash) { return (std::find(SingleColorHashes.begin(), SingleColorHashes.end(), hash) != SingleColorHashes.end()); }

   private:
    static std::mutex table_mutex;
--- a/quicktex/s3tc/bc1/SingleColorTable.h
+++ b/quicktex/s3tc/bc1/SingleColorTable.h
@ -23,11 +23,10 @@
 #include <cstdint>
 #include <memory>

-#include "s3tc/interpolator/Interpolator.h"
-#include "util/bitbash.h"
-#include "util/math.h"
+#include "../../util.h"
+#include "../interpolator/Interpolator.h"

-namespace quicktex::s3tc {
+namespace quicktex::s3tc  {

 struct BC1MatchEntry {
    uint8_t high;
@ -60,10 +59,10 @@ template <size_t B, size_t N> MatchListPtr SingleColorTable(InterpolatorPtr inte
        // TODO: Can probably avoid testing for values that definitely wont yield good results,
        // e.g. low8 and high8 both much smaller or larger than index
        for (uint8_t low = 0; low < Size; low++) {
-            uint8_t low8 = scale_to_8<B>(low);
+            uint8_t low8 = (B == 5) ? scale5To8(low) : scale6To8(low);

            for (uint8_t high = 0; high < Size; high++) {
-                uint8_t high8 = scale_to_8<B>(high);
+                uint8_t high8 = (B == 5) ? scale5To8(high) : scale6To8(high);
                uint8_t value;

                if (use_8bit) {
@ -72,10 +71,10 @@ template <size_t B, size_t N> MatchListPtr SingleColorTable(InterpolatorPtr inte
                    value = (B == 5) ? interpolator->Interpolate5(high, low) : interpolator->Interpolate6(high, low);
                }

-                unsigned new_error = abs(value - (int)i);
+                unsigned new_error = iabs(value - (int)i);

                // We only need to factor in 3% error in BC1 ideal mode.
-                if (ideal) new_error += (abs(high8 - (int)low8) * 3) / 100;
+                if (ideal) new_error += (iabs(high8 - (int)low8) * 3) / 100;

                if ((new_error < error) || (new_error == error && low == high)) {
                    assert(new_error <= UINT8_MAX);
--- a/quicktex/s3tc/bc1/_bindings.cpp
+++ b/quicktex/s3tc/bc1/_bindings.cpp
@ -23,12 +23,16 @@
 #include <pybind11/stl.h>

 #include <array>
-#include <memory>
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <string>

-#include "s3tc/bc1/BC1Block.h"
-#include "s3tc/bc1/BC1Decoder.h"
-#include "s3tc/bc1/BC1Encoder.h"
-#include "s3tc/interpolator/Interpolator.h"
+#include "../../Decoder.h"
+#include "../../Encoder.h"
+#include "../interpolator/Interpolator.h"
+#include "BC1Decoder.h"
+#include "BC1Encoder.h"

 namespace py = pybind11;
 namespace quicktex::bindings {
@ -46,7 +50,7 @@ void InitBC1(py::module_ &s3tc) {
    bc1_block.doc() = "A single BC1 block.";

    bc1_block.def(py::init<>());
-    bc1_block.def(py::init<OldColor, OldColor, BC1Block::SelectorArray>(), "color0"_a, "color1"_a, "selectors"_a, R"doc(
+    bc1_block.def(py::init<Color, Color, BC1Block::SelectorArray>(), "color0"_a, "color1"_a, "selectors"_a, R"doc(
        Create a new BC1Block with the specified endpoints and selectors

        :param color0: The first endpoint
@ -54,8 +58,7 @@ void InitBC1(py::module_ &s3tc) {
        :param selectors: the selectors as a 4x4 list of integers, between 0 and 3 inclusive.
    )doc");

-    bc1_block.def_property("endpoints", &BC1Block::GetColors, &BC1Block::SetColors,
-                           "The block's endpoint colors as a 2-tuple.");
+    bc1_block.def_property("endpoints", &BC1Block::GetColors, &BC1Block::SetColors, "The block's endpoint colors as a 2-tuple.");
    bc1_block.def_property("selectors", &BC1Block::GetSelectors, &BC1Block::SetSelectors, R"doc(
        The block's selectors as a 4x4 list of integers between 0 and 3 inclusive.

@ -80,42 +83,27 @@ void InitBC1(py::module_ &s3tc) {
    // region BC1Encoder
    py::class_<BC1Encoder> bc1_encoder(bc1, "BC1Encoder", "Encodes RGB textures to BC1.");

-    py::enum_<BC1Encoder::EndpointMode>(bc1_encoder, "EndpointMode",
-                                        "Enum representing various methods of finding endpoints in a block.")
-        .value("LeastSquares", BC1Encoder::EndpointMode::LeastSquares,
-               "Find endpoints using a 2D least squares approach.")
-        .value("BoundingBox", BC1Encoder::EndpointMode::BoundingBox,
-               "Find endpoints using a simple bounding box. Fast but inaccurate.")
-        .value("BoundingBoxInt", BC1Encoder::EndpointMode::BoundingBoxInt,
-               "Same as BoundingBox but using integers, slightly faster.")
-        .value("PCA", BC1Encoder::EndpointMode::PCA,
-               "Find endpoints using Principle Component Analysis. Slowest but highest quality method.");
+    py::enum_<BC1Encoder::EndpointMode>(bc1_encoder, "EndpointMode", "Enum representing various methods of finding endpoints in a block.")
+        .value("LeastSquares", BC1Encoder::EndpointMode::LeastSquares, "Find endpoints using a 2D least squares approach.")
+        .value("BoundingBox", BC1Encoder::EndpointMode::BoundingBox, "Find endpoints using a simple bounding box. Fast but inaccurate.")
+        .value("BoundingBoxInt", BC1Encoder::EndpointMode::BoundingBoxInt, "Same as BoundingBox but using integers, slightly faster.")
+        .value("PCA", BC1Encoder::EndpointMode::PCA, "Find endpoints using Principle Component Analysis. Slowest but highest quality method.");

-    py::enum_<BC1Encoder::ErrorMode>(bc1_encoder, "ErrorMode",
-                                     "Enum representing various methods of finding selectors in a block.")
-        .value("None", BC1Encoder::ErrorMode::None,
-               "The same as Faster but error is not calculated. This disables any cluster-fit options")
-        .value("Faster", BC1Encoder::ErrorMode::Faster,
-               "Use a slightly lower quality, but ~30% faster MSE evaluation function for 4-color blocks.")
+    py::enum_<BC1Encoder::ErrorMode>(bc1_encoder, "ErrorMode", "Enum representing various methods of finding selectors in a block.")
+        .value("None", BC1Encoder::ErrorMode::None, "The same as Faster but error is not calculated. This disables any cluster-fit options")
+        .value("Faster", BC1Encoder::ErrorMode::Faster, "Use a slightly lower quality, but ~30% faster MSE evaluation function for 4-color blocks.")
        .value("Check2", BC1Encoder::ErrorMode::Check2, "Default error-checking method.")
-        .value("Full", BC1Encoder::ErrorMode::Full,
-               "Examine all colors to compute selectors/MSE. Slower but slightly higher quality.");
+        .value("Full", BC1Encoder::ErrorMode::Full, "Examine all colors to compute selectors/MSE. Slower but slightly higher quality.");

-    py::enum_<BC1Encoder::ColorMode>(bc1_encoder, "ColorMode",
-                                     "Enum representing various methods of writing BC1 blocks.")
-        .value("FourColor", BC1Encoder::ColorMode::FourColor,
-               "Default color mode. Only 4-color blocks will be output, where color0 > color1")
-        .value("ThreeColor", BC1Encoder::ColorMode::ThreeColor,
-               "Additionally use 3-color blocks when they have a lower error, where color0 <= color1")
+    py::enum_<BC1Encoder::ColorMode>(bc1_encoder, "ColorMode", "Enum representing various methods of writing BC1 blocks.")
+        .value("FourColor", BC1Encoder::ColorMode::FourColor, "Default color mode. Only 4-color blocks will be output, where color0 > color1")
+        .value("ThreeColor", BC1Encoder::ColorMode::ThreeColor, "Additionally use 3-color blocks when they have a lower error, where color0 <= color1")
        .value("ThreeColorBlack", BC1Encoder::ColorMode::ThreeColorBlack,
-               "Additionally use 3-color blocks with black pixels (selector 3). Note that this requires your "
-               "shader/engine to not sample the alpha channel "
+               "Additionally use 3-color blocks with black pixels (selector 3). Note that this requires your shader/engine to not sample the alpha channel "
               "when using a BC1 texture.");

-    bc1_encoder.def(py::init<unsigned, BC1Encoder::ColorMode>(), "level"_a = 5,
-                    "color_mode"_a = BC1Encoder::ColorMode::FourColor);
-    bc1_encoder.def(py::init<unsigned, BC1Encoder::ColorMode, InterpolatorPtr>(), "level"_a, "color_mode"_a,
-                    "interpolator"_a, R"doc(
+    bc1_encoder.def(py::init<unsigned, BC1Encoder::ColorMode>(), "level"_a = 5, "color_mode"_a = BC1Encoder::ColorMode::FourColor);
+    bc1_encoder.def(py::init<unsigned, BC1Encoder::ColorMode, InterpolatorPtr>(), "level"_a, "color_mode"_a, "interpolator"_a, R"doc(
        Create a new BC1 encoder with the specified preset level, color mode, and interpolator.

        :param int level: The preset level of the resulting encoder, between 0 and 18 inclusive. See :py:meth:`set_level` for more information. Default: 5.
@ -137,56 +125,44 @@ void InitBC1(py::module_ &s3tc) {
        :param int level: The preset level of the resulting encoder, between 0 and 18 inclusive. Default: 5.
    )doc");

-    bc1_encoder.def_property_readonly(
-        "interpolator", &BC1Encoder::GetInterpolator,
-        "The :py:class:`~quicktex.s3tc.interpolator.Interpolator` used by this encoder. This is a readonly property.");
-    bc1_encoder.def_property_readonly(
-        "color_mode", &BC1Encoder::GetColorMode,
-        "The :py:class:`~quicktex.s3tc.bc1.BC1Encoder.ColorMode` used by this encoder. This is a readonly property.");
+    bc1_encoder.def_property_readonly("interpolator", &BC1Encoder::GetInterpolator,
+                                      "The :py:class:`~quicktex.s3tc.interpolator.Interpolator` used by this encoder. This is a readonly property.");
+    bc1_encoder.def_property_readonly("color_mode", &BC1Encoder::GetColorMode,
+                                      "The :py:class:`~quicktex.s3tc.bc1.BC1Encoder.ColorMode` used by this encoder. This is a readonly property.");

    // Advanced API

-    bc1_encoder.def_property("error_mode", &BC1Encoder::GetErrorMode, &BC1Encoder::SetErrorMode,
-                             "The error mode used by this encoder for finding selectors.");
-    bc1_encoder.def_property("endpoint_mode", &BC1Encoder::GetEndpointMode, &BC1Encoder::SetEndpointMode,
-                             "The endpoint mode used by this encoder.");
+    bc1_encoder.def_property("error_mode", &BC1Encoder::GetErrorMode, &BC1Encoder::SetErrorMode, "The error mode used by this encoder for finding selectors.");
+    bc1_encoder.def_property("endpoint_mode", &BC1Encoder::GetEndpointMode, &BC1Encoder::SetEndpointMode, "The endpoint mode used by this encoder.");

    bc1_encoder.def_readwrite("two_ls_passes", &BC1Encoder::two_ls_passes,
                              "Use 2 least squares pass, instead of one (same as stb_dxt's HIGHQUAL option).\n"
                              "Recommended if you're setting the orderings settings greater than 0.");

-    bc1_encoder.def_readwrite("two_ep_passes", &BC1Encoder::two_ep_passes,
-                              "Try 2 different ways of choosing the initial endpoints.");
+    bc1_encoder.def_readwrite("two_ep_passes", &BC1Encoder::two_ep_passes, "Try 2 different ways of choosing the initial endpoints.");

-    bc1_encoder.def_readwrite(
-        "two_cf_passes", &BC1Encoder::two_cf_passes,
-        "Greatly increase encode time, with very slightly higher quality.\n"
-        "Same as squish's iterative cluster fit option. Not really worth the tiny boost in quality, "
-        "unless you just don't care about performance at all.");
+    bc1_encoder.def_readwrite("two_cf_passes", &BC1Encoder::two_cf_passes,
+                              "Greatly increase encode time, with very slightly higher quality.\n"
+                              "Same as squish's iterative cluster fit option. Not really worth the tiny boost in quality, "
+                              "unless you just don't care about performance at all.");

-    bc1_encoder.def_readwrite(
-        "exhaustive", &BC1Encoder::exhaustive,
-        "Check all total orderings - *very* slow. The encoder is not designed to be used in this way");
+    bc1_encoder.def_readwrite("exhaustive", &BC1Encoder::exhaustive,
+                              "Check all total orderings - *very* slow. The encoder is not designed to be used in this way");

    bc1_encoder.def_property("search_rounds", &BC1Encoder::GetSearchRounds, &BC1Encoder::SetSearchRounds,
-                             "Setting search rounds > 0 enables refining the final endpoints by examining nearby "
-                             "colors. A higher value has a higher quality "
+                             "Setting search rounds > 0 enables refining the final endpoints by examining nearby colors. A higher value has a higher quality "
                             "at the expense of performance.");

-    bc1_encoder.def_property(
-        "orderings", &BC1Encoder::GetOrderings, &BC1Encoder::SetOrderings,
-        "setting the orderings > 0 enables ordered cluster fit using a lookup table of similar blocks. Value is a "
-        "tuple of (4 color "
-        "orders, 3 color orders), where higher values have a higher quality at the expense of performance.");
+    bc1_encoder.def_property("orderings", &BC1Encoder::GetOrderings, &BC1Encoder::SetOrderings,
+                             "setting the orderings > 0 enables ordered cluster fit using a lookup table of similar blocks. Value is a tuple of (4 color "
+                             "orders, 3 color orders), where higher values have a higher quality at the expense of performance.");

    bc1_encoder.def_readonly_static("max_power_iterations", &BC1Encoder::max_power_iterations);
    bc1_encoder.def_readonly_static("min_power_iterations", &BC1Encoder::min_power_iterations);

-    bc1_encoder.def_property(
-        "power_iterations", &BC1Encoder::GetPowerIterations, &BC1Encoder::SetPowerIterations,
-        "Number of power iterations used with the PCA endpoint mode. Value should be around 4 to 6. "
-        "Automatically clamped to between :py:const:`BC1Encoder.min_power_iterations` and "
-        ":py:const:`BC1Encoder.max_power_iterations`");
+    bc1_encoder.def_property("power_iterations", &BC1Encoder::GetPowerIterations, &BC1Encoder::SetPowerIterations,
+                             "Number of power iterations used with the PCA endpoint mode. Value should be around 4 to 6. "
+                             "Automatically clamped to between :py:const:`BC1Encoder.min_power_iterations` and :py:const:`BC1Encoder.max_power_iterations`");
    // endregion

    // region BC1Decoder
@ -209,10 +185,8 @@ void InitBC1(py::module_ &s3tc) {
        :returns: A new RawTexture with the same dimensions as the input
    )doc");

-    bc1_decoder.def_property_readonly("interpolator", &BC1Decoder::GetInterpolator,
-                                      "The interpolator used by this decoder. This is a readonly property.");
-    bc1_decoder.def_readwrite("write_alpha", &BC1Decoder::write_alpha,
-                              "Determines if the alpha channel of the output is written to.");
+    bc1_decoder.def_property_readonly("interpolator", &BC1Decoder::GetInterpolator, "The interpolator used by this decoder. This is a readonly property.");
+    bc1_decoder.def_readwrite("write_alpha", &BC1Decoder::write_alpha, "Determines if the alpha channel of the output is written to.");
    // endregion
 }
 }  // namespace quicktex::bindings
--- a/quicktex/s3tc/bc3/BC3Block.h
+++ b/quicktex/s3tc/bc3/BC3Block.h
@ -21,8 +21,8 @@

 #include <utility>

-#include "s3tc/bc1/BC1Block.h"
-#include "s3tc/bc4/BC4Block.h"
+#include "../bc1/BC1Block.h"
+#include "../bc4/BC4Block.h"

 namespace quicktex::s3tc {

@ -54,9 +54,7 @@ class alignas(8) BC3Block {
        color_block = blocks.second;
    }

-    bool operator==(const BC3Block &Rhs) const {
-        return alpha_block == Rhs.alpha_block && color_block == Rhs.color_block;
-    }
+    bool operator==(const BC3Block &Rhs) const { return alpha_block == Rhs.alpha_block && color_block == Rhs.color_block; }
    bool operator!=(const BC3Block &Rhs) const { return !(Rhs == *this); }
 };
 }  // namespace quicktex::s3tc
--- a/quicktex/s3tc/bc3/BC3Decoder.h
+++ b/quicktex/s3tc/bc3/BC3Decoder.h
@ -21,13 +21,13 @@

 #include <memory>

-#include "ColorBlock.h"
-#include "Decoder.h"
-#include "s3tc/bc1/BC1Decoder.h"
-#include "s3tc/bc3/BC3Block.h"
-#include "s3tc/bc4/BC4Decoder.h"
-#include "s3tc/interpolator/Interpolator.h"
-#include "texture/BlockTexture.h"
+#include "../../ColorBlock.h"
+#include "../../Decoder.h"
+#include "../../Texture.h"
+#include "../bc1/BC1Decoder.h"
+#include "../bc4/BC4Decoder.h"
+#include "../interpolator/Interpolator.h"
+#include "BC3Block.h"

 namespace quicktex::s3tc {

@ -37,8 +37,7 @@ class BC3Decoder : public BlockDecoder<BlockTexture<BC3Block>> {
    using BC4DecoderPtr = std::shared_ptr<BC4Decoder>;
    using InterpolatorPtr = std::shared_ptr<Interpolator>;

-    BC3Decoder(InterpolatorPtr interpolator)
-        : _bc1_decoder(std::make_shared<BC1Decoder>(interpolator)), _bc4_decoder(std::make_shared<BC4Decoder>(3)) {}
+    BC3Decoder(InterpolatorPtr interpolator) : _bc1_decoder(std::make_shared<BC1Decoder>(interpolator)), _bc4_decoder(std::make_shared<BC4Decoder>(3)) {}

    BC3Decoder() : BC3Decoder(std::make_shared<Interpolator>()) {}

--- a/quicktex/s3tc/bc3/BC3Encoder.cpp
+++ b/quicktex/s3tc/bc3/BC3Encoder.cpp
@ -19,8 +19,10 @@

 #include "BC3Encoder.h"

-#include "ColorBlock.h"
-#include "s3tc/bc3/BC3Block.h"
+#include "../../ColorBlock.h"
+#include "../bc1/BC1Block.h"
+#include "../bc4/BC4Block.h"
+#include "BC3Block.h"

 namespace quicktex::s3tc {
 BC3Block BC3Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
--- a/quicktex/s3tc/bc3/BC3Encoder.h
+++ b/quicktex/s3tc/bc3/BC3Encoder.h
@ -21,13 +21,13 @@

 #include <memory>

-#include "ColorBlock.h"
-#include "Encoder.h"
-#include "s3tc/bc1/BC1Encoder.h"
-#include "s3tc/bc3/BC3Block.h"
-#include "s3tc/bc4/BC4Encoder.h"
-#include "s3tc/interpolator/Interpolator.h"
-#include "texture/BlockTexture.h"
+#include "../../ColorBlock.h"
+#include "../../Encoder.h"
+#include "../../Texture.h"
+#include "../bc1/BC1Encoder.h"
+#include "../bc4/BC4Encoder.h"
+#include "../interpolator/Interpolator.h"
+#include "BC3Block.h"

 namespace quicktex::s3tc {

@ -38,8 +38,7 @@ class BC3Encoder : public BlockEncoder<BlockTexture<BC3Block>> {
    using InterpolatorPtr = std::shared_ptr<Interpolator>;

    BC3Encoder(unsigned level, InterpolatorPtr interpolator)
-        : _bc1_encoder(std::make_shared<BC1Encoder>(level, BC1Encoder::ColorMode::FourColor, interpolator)),
-          _bc4_encoder(std::make_shared<BC4Encoder>(3)) {}
+        : _bc1_encoder(std::make_shared<BC1Encoder>(level, BC1Encoder::ColorMode::FourColor, interpolator)), _bc4_encoder(std::make_shared<BC4Encoder>(3)) {}

    BC3Encoder(unsigned level = 5) : BC3Encoder(level, std::make_shared<Interpolator>()) {}

--- a/quicktex/s3tc/bc3/_bindings.cpp
+++ b/quicktex/s3tc/bc3/_bindings.cpp
@ -22,14 +22,16 @@
 #include <pybind11/pybind11.h>

 #include <array>
-#include <memory>
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <string>

-#include "s3tc/bc1/BC1Block.h"
-#include "s3tc/bc3/BC3Block.h"
-#include "s3tc/bc3/BC3Decoder.h"
-#include "s3tc/bc3/BC3Encoder.h"
-#include "s3tc/bc4/BC4Block.h"
-#include "s3tc/interpolator/Interpolator.h"
+#include "../../Decoder.h"
+#include "../../Encoder.h"
+#include "../interpolator/Interpolator.h"
+#include "BC3Decoder.h"
+#include "BC3Encoder.h"

 namespace py = pybind11;
 namespace quicktex::bindings {
@ -57,8 +59,7 @@ void InitBC3(py::module_ &s3tc) {

    bc3_block.def_readwrite("alpha_block", &BC3Block::alpha_block, "The BC4 block used for alpha data.");
    bc3_block.def_readwrite("color_block", &BC3Block::color_block, "The BC1 block used for rgb data.");
-    bc3_block.def_property("blocks", &BC3Block::GetBlocks, &BC3Block::SetBlocks,
-                           "The BC4 and BC1 blocks that make up this block as a 2-tuple.");
+    bc3_block.def_property("blocks", &BC3Block::GetBlocks, &BC3Block::SetBlocks, "The BC4 and BC1 blocks that make up this block as a 2-tuple.");
    // endregion

    // region BC3Texture
@ -87,12 +88,10 @@ void InitBC3(py::module_ &s3tc) {
        :returns: A new BC3Texture with the same dimension as the input.
    )doc");

-    bc3_encoder.def_property_readonly(
-        "bc1_encoder", &BC3Encoder::GetBC1Encoder,
-        "Internal :py:class:`~quicktex.s3tc.bc1.BC1Encoder` used for RGB data. Readonly.");
-    bc3_encoder.def_property_readonly(
-        "bc4_encoder", &BC3Encoder::GetBC4Encoder,
-        "Internal :py:class:`~quicktex.s3tc.bc4.BC4Encoder` used for alpha data. Readonly.");
+    bc3_encoder.def_property_readonly("bc1_encoder", &BC3Encoder::GetBC1Encoder,
+                                      "Internal :py:class:`~quicktex.s3tc.bc1.BC1Encoder` used for RGB data. Readonly.");
+    bc3_encoder.def_property_readonly("bc4_encoder", &BC3Encoder::GetBC4Encoder,
+                                      "Internal :py:class:`~quicktex.s3tc.bc4.BC4Encoder` used for alpha data. Readonly.");
    // endregion

    // region BC3Decoder
@ -114,12 +113,10 @@ void InitBC3(py::module_ &s3tc) {
        :returns: A new RawTexture with the same dimensions as the input
    )doc");

-    bc3_decoder.def_property_readonly(
-        "bc1_decoder", &BC3Decoder::GetBC1Decoder,
-        "Internal :py:class:`~quicktex.s3tc.bc1.BC1Decoder` used for RGB data. Readonly.");
-    bc3_decoder.def_property_readonly(
-        "bc4_decoder", &BC3Decoder::GetBC4Decoder,
-        "Internal :py:class:`~quicktex.s3tc.bc4.BC4Decoder` used for alpha data. Readonly.");
+    bc3_decoder.def_property_readonly("bc1_decoder", &BC3Decoder::GetBC1Decoder,
+                                      "Internal :py:class:`~quicktex.s3tc.bc1.BC1Decoder` used for RGB data. Readonly.");
+    bc3_decoder.def_property_readonly("bc4_decoder", &BC3Decoder::GetBC4Decoder,
+                                      "Internal :py:class:`~quicktex.s3tc.bc4.BC4Decoder` used for alpha data. Readonly.");
    // endregion
 }
 }  // namespace quicktex::bindings
--- a/quicktex/s3tc/bc4/BC4Block.cpp
+++ b/quicktex/s3tc/bc4/BC4Block.cpp
@ -22,17 +22,14 @@
 #include <algorithm>
 #include <stdexcept>

-#include "util/bitbash.h"
-#include "util/map.h"
-#include "util/math.h"
-#include "util/ranges.h"
+#include "../../util.h"

 namespace quicktex::s3tc {

 BC4Block::SelectorArray BC4Block::GetSelectors() const {
-    auto packed = pack<uint64_t>(_selectors, 8);
-    auto rows = unpack<uint16_t, Height>(packed, SelectorBits * Width);
-    return map([](auto row) { return unpack<uint8_t, Width>(row, SelectorBits); }, rows);
+    auto packed = Pack<uint8_t, uint64_t, 8, SelectorSize>(_selectors);
+    auto rows = Unpack<uint64_t, uint16_t, SelectorBits * Width, Height>(packed);
+    return MapArray(rows, Unpack<uint16_t, uint8_t, SelectorBits, Width>);
 }

 void BC4Block::SetSelectors(const BC4Block::SelectorArray& unpacked) {
@ -40,9 +37,9 @@ void BC4Block::SetSelectors(const BC4Block::SelectorArray& unpacked) {
        if (std::any_of(unpacked[y].begin(), unpacked[y].end(), [](uint8_t i) { return i > SelectorMax; }))
            throw std::invalid_argument("Selector value out of bounds.");
    }
-    auto rows = map([](auto r) { return pack<uint16_t>(r, SelectorBits); }, unpacked);
-    auto packed = pack<uint64_t>(rows, SelectorBits * Width);
-    _selectors = unpack<uint8_t, SelectorSize>(packed, 8);
+    auto rows = MapArray(unpacked, Pack<uint8_t, uint16_t, SelectorBits, Width>);
+    auto packed = Pack<uint16_t, uint64_t, SelectorBits * Width, Height>(rows);
+    _selectors = Unpack<uint64_t, uint8_t, 8, SelectorSize>(packed);
 }

 std::array<uint8_t, 8> BC4Block::GetValues6() const {
@ -67,8 +64,6 @@ std::array<uint8_t, 8> BC4Block::GetValues8() const {
            static_cast<uint8_t>((alpha0 + alpha1 * 6) / 7)};
 }

-bool BC4Block::operator==(const BC4Block& Rhs) const {
-    return alpha0 == Rhs.alpha0 && alpha1 == Rhs.alpha1 && _selectors == Rhs._selectors;
-}
+bool BC4Block::operator==(const BC4Block& Rhs) const { return alpha0 == Rhs.alpha0 && alpha1 == Rhs.alpha1 && _selectors == Rhs._selectors; }
 bool BC4Block::operator!=(const BC4Block& Rhs) const { return !(Rhs == *this); }
 }  // namespace quicktex::s3tc
--- a/quicktex/s3tc/bc4/BC4Decoder.cpp
+++ b/quicktex/s3tc/bc4/BC4Decoder.cpp
@ -22,8 +22,8 @@
 #include <array>    // for array
 #include <cassert>  // for assert

+#include "../../Color.h"
 #include "../../ColorBlock.h"
-#include "../../OldColor.h"
 #include "BC4Block.h"

 namespace quicktex::s3tc {
--- a/quicktex/s3tc/bc4/BC4Decoder.h
+++ b/quicktex/s3tc/bc4/BC4Decoder.h
@ -22,10 +22,10 @@
 #include <cstdint>
 #include <stdexcept>

-#include "ColorBlock.h"
-#include "Decoder.h"
-#include "s3tc/bc4/BC4Block.h"
-#include "texture/BlockTexture.h"
+#include "../../ColorBlock.h"
+#include "../../Decoder.h"
+#include "../../Texture.h"
+#include "BC4Block.h"

 namespace quicktex::s3tc {

--- a/quicktex/s3tc/bc4/BC4Encoder.cpp
+++ b/quicktex/s3tc/bc4/BC4Encoder.cpp
@ -23,8 +23,8 @@
 #include <array>
 #include <cstdint>

+#include "../../Color.h"
 #include "../../ColorBlock.h"
-#include "../../OldColor.h"
 #include "BC4Block.h"

 namespace quicktex::s3tc {
--- a/quicktex/s3tc/bc4/BC4Encoder.h
+++ b/quicktex/s3tc/bc4/BC4Encoder.h
@ -22,10 +22,10 @@
 #include <cstdint>
 #include <stdexcept>

-#include "ColorBlock.h"
-#include "Encoder.h"
-#include "s3tc/bc4/BC4Block.h"
-#include "texture/BlockTexture.h"
+#include "../../ColorBlock.h"
+#include "../../Encoder.h"
+#include "../../Texture.h"
+#include "BC4Block.h"

 namespace quicktex::s3tc {

--- a/quicktex/s3tc/bc4/_bindings.cpp
+++ b/quicktex/s3tc/bc4/_bindings.cpp
@ -23,11 +23,15 @@
 #include <pybind11/stl.h>

 #include <array>
+#include <cstddef>
 #include <cstdint>
+#include <stdexcept>
+#include <string>

-#include "s3tc/bc4/BC4Block.h"
-#include "s3tc/bc4/BC4Decoder.h"
-#include "s3tc/bc4/BC4Encoder.h"
+#include "../../Decoder.h"
+#include "../../Encoder.h"
+#include "BC4Decoder.h"
+#include "BC4Encoder.h"

 namespace py = pybind11;
 namespace quicktex::bindings {
@ -42,8 +46,7 @@ void InitBC4(py::module_ &s3tc) {
    bc4_block.doc() = "A single BC4 block.";

    bc4_block.def(py::init<>());
-    bc4_block.def(py::init<uint8_t, uint8_t, BC4Block::SelectorArray>(), "endpoint0"_a, "endpoint1"_a, "selectors"_a,
-                  R"doc(
+    bc4_block.def(py::init<uint8_t, uint8_t, BC4Block::SelectorArray>(), "endpoint0"_a, "endpoint1"_a, "selectors"_a, R"doc(
        Create a new BC4Block with the specified endpoints and selectors.

        :param int endpoint0: The first endpoint.
@ -51,8 +54,7 @@ void InitBC4(py::module_ &s3tc) {
        :param selectors: the selectors as a 4x4 list of integers, between 0 and 7 inclusive.
    )doc");

-    bc4_block.def_property("endpoints", &BC4Block::GetAlphas, &BC4Block::SetAlphas,
-                           "The block's endpoint values as a 2-tuple.");
+    bc4_block.def_property("endpoints", &BC4Block::GetAlphas, &BC4Block::SetAlphas, "The block's endpoint values as a 2-tuple.");
    bc4_block.def_property("selectors", &BC4Block::GetSelectors, &BC4Block::SetSelectors, R"doc(
        The block's selectors as a 4x4 list of integers between 0 and 7 inclusive.

@ -94,9 +96,8 @@ void InitBC4(py::module_ &s3tc) {
        :param RawTexture texture: Input texture to encode.
        :returns: A new BC4Texture with the same dimension as the input.
    )doc");
-
-    bc4_encoder.def_property_readonly("channel", &BC4Encoder::GetChannel,
-                                      "The channel that will be read from. 0 to 3 inclusive. Readonly.");
+    
+    bc4_encoder.def_property_readonly("channel", &BC4Encoder::GetChannel, "The channel that will be read from. 0 to 3 inclusive. Readonly.");
    // endregion

    // region BC4Decoder
@ -116,9 +117,8 @@ void InitBC4(py::module_ &s3tc) {
        :param RawTexture texture: Input texture to encode.
        :returns: A new RawTexture with the same dimensions as the input
    )doc");
-
-    bc4_decoder.def_property_readonly("channel", &BC4Decoder::GetChannel,
-                                      "The channel that will be written to. 0 to 3 inclusive. Readonly.");
+    
+    bc4_decoder.def_property_readonly("channel", &BC4Decoder::GetChannel, "The channel that will be written to. 0 to 3 inclusive. Readonly.");
    // endregion
 }

--- a/quicktex/s3tc/bc5/BC5Block.h
+++ b/quicktex/s3tc/bc5/BC5Block.h
@ -19,7 +19,9 @@

 #pragma once

-#include "s3tc/bc4/BC4Block.h"
+#include <utility>
+
+#include "../bc4/BC4Block.h"

 namespace quicktex::s3tc {

@ -51,9 +53,7 @@ class alignas(8) BC5Block {
        chan1_block = pair.second;
    }

-    bool operator==(const BC5Block &Rhs) const {
-        return chan0_block == Rhs.chan0_block && chan1_block == Rhs.chan1_block;
-    }
+    bool operator==(const BC5Block &Rhs) const { return chan0_block == Rhs.chan0_block && chan1_block == Rhs.chan1_block; }
    bool operator!=(const BC5Block &Rhs) const { return !(Rhs == *this); }
 };
 }  // namespace quicktex::s3tc
--- a/quicktex/s3tc/bc5/BC5Decoder.cpp
+++ b/quicktex/s3tc/bc5/BC5Decoder.cpp
@ -19,8 +19,8 @@

 #include "BC5Decoder.h"

-#include "ColorBlock.h"
-#include "s3tc/bc5/BC5Block.h"
+#include "../../ColorBlock.h"
+#include "BC5Block.h"

 namespace quicktex::s3tc {
 ColorBlock<4, 4> BC5Decoder::DecodeBlock(const BC5Block &block) const {
--- a/quicktex/s3tc/bc5/BC5Decoder.h
+++ b/quicktex/s3tc/bc5/BC5Decoder.h
@ -24,11 +24,11 @@
 #include <tuple>
 #include <type_traits>

-#include "ColorBlock.h"
-#include "Decoder.h"
-#include "s3tc/bc4/BC4Decoder.h"
-#include "s3tc/bc5/BC5Block.h"
-#include "texture/BlockTexture.h"
+#include "../../ColorBlock.h"
+#include "../../Decoder.h"
+#include "../../Texture.h"
+#include "../bc4/BC4Decoder.h"
+#include "BC5Block.h"

 namespace quicktex::s3tc {

@ -38,10 +38,8 @@ class BC5Decoder : public BlockDecoder<BlockTexture<BC5Block>> {
    using BC4DecoderPtr = std::shared_ptr<BC4Decoder>;
    using BC4DecoderPair = std::tuple<BC4DecoderPtr, BC4DecoderPtr>;

-    BC5Decoder(uint8_t chan0 = 0, uint8_t chan1 = 1)
-        : BC5Decoder(std::make_shared<BC4Decoder>(chan0), std::make_shared<BC4Decoder>(chan1)) {}
-    BC5Decoder(BC4DecoderPtr chan0_decoder, BC4DecoderPtr chan1_decoder)
-        : _chan0_decoder(chan0_decoder), _chan1_decoder(chan1_decoder) {}
+    BC5Decoder(uint8_t chan0 = 0, uint8_t chan1 = 1) : BC5Decoder(std::make_shared<BC4Decoder>(chan0), std::make_shared<BC4Decoder>(chan1)) {}
+    BC5Decoder(BC4DecoderPtr chan0_decoder, BC4DecoderPtr chan1_decoder) : _chan0_decoder(chan0_decoder), _chan1_decoder(chan1_decoder) {}

    ColorBlock<4, 4> DecodeBlock(const BC5Block &block) const override;

--- a/quicktex/s3tc/bc5/BC5Encoder.cpp
+++ b/quicktex/s3tc/bc5/BC5Encoder.cpp
@ -19,8 +19,8 @@

 #include "BC5Encoder.h"

-#include "ColorBlock.h"
-#include "s3tc/bc4/BC4Block.h"
+#include "../../ColorBlock.h"
+#include "../bc4/BC4Block.h"

 namespace quicktex::s3tc {
 BC5Block BC5Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
--- a/quicktex/s3tc/bc5/BC5Encoder.h
+++ b/quicktex/s3tc/bc5/BC5Encoder.h
@ -24,11 +24,11 @@
 #include <tuple>
 #include <type_traits>

-#include "ColorBlock.h"
-#include "Encoder.h"
-#include "s3tc/bc4/BC4Encoder.h"
-#include "s3tc/bc5/BC5Block.h"
-#include "texture/BlockTexture.h"
+#include "../../ColorBlock.h"
+#include "../../Encoder.h"
+#include "../../Texture.h"
+#include "../bc4/BC4Encoder.h"
+#include "BC5Block.h"

 namespace quicktex::s3tc {
 class BC5Encoder : public BlockEncoder<BlockTexture<BC5Block>> {
@ -37,10 +37,8 @@ class BC5Encoder : public BlockEncoder<BlockTexture<BC5Block>> {
    using BC4EncoderPtr = std::shared_ptr<BC4Encoder>;
    using BC4EncoderPair = std::tuple<BC4EncoderPtr, BC4EncoderPtr>;

-    BC5Encoder(uint8_t chan0 = 0, uint8_t chan1 = 1)
-        : BC5Encoder(std::make_shared<BC4Encoder>(chan0), std::make_shared<BC4Encoder>(chan1)) {}
-    BC5Encoder(BC4EncoderPtr chan0_encoder, BC4EncoderPtr chan1_encoder)
-        : _chan0_encoder(chan0_encoder), _chan1_encoder(chan1_encoder) {}
+    BC5Encoder(uint8_t chan0 = 0, uint8_t chan1 = 1) : BC5Encoder(std::make_shared<BC4Encoder>(chan0), std::make_shared<BC4Encoder>(chan1)) {}
+    BC5Encoder(BC4EncoderPtr chan0_encoder, BC4EncoderPtr chan1_encoder) : _chan0_encoder(chan0_encoder), _chan1_encoder(chan1_encoder) {}

    BC5Block EncodeBlock(const ColorBlock<4, 4> &pixels) const override;

--- a/quicktex/s3tc/bc5/_bindings.cpp
+++ b/quicktex/s3tc/bc5/_bindings.cpp
@ -24,10 +24,10 @@
 #include <array>
 #include <cstdint>

-#include "s3tc/bc4/BC4Block.h"
-#include "s3tc/bc5/BC5Block.h"
-#include "s3tc/bc5/BC5Decoder.h"
-#include "s3tc/bc5/BC5Encoder.h"
+#include "../../Decoder.h"
+#include "../../Encoder.h"
+#include "BC5Decoder.h"
+#include "BC5Encoder.h"

 namespace py = pybind11;
 namespace quicktex::bindings {
@ -52,8 +52,7 @@ void InitBC5(py::module_ &s3tc) {

    bc5_block.def_readwrite("chan0_block", &BC5Block::chan0_block, "The BC4 block used for the first channel.");
    bc5_block.def_readwrite("chan1_block", &BC5Block::chan1_block, "The BC4 block used for the second channel.");
-    bc5_block.def_property("blocks", &BC5Block::GetBlocks, &BC5Block::SetBlocks,
-                           "The BC4 and BC1 blocks that make up this block as a 2-tuple.");
+    bc5_block.def_property("blocks", &BC5Block::GetBlocks, &BC5Block::SetBlocks, "The BC4 and BC1 blocks that make up this block as a 2-tuple.");
    // endregion

    // region BC5Texture
@ -80,11 +79,9 @@ void InitBC5(py::module_ &s3tc) {
        :returns: A new BC5Texture with the same dimension as the input.
    )doc");

-    bc5_encoder.def_property_readonly("channels", &BC5Encoder::GetChannels,
-                                      "A 2-tuple of channels that will be read from. 0 to 3 inclusive. Readonly.");
-    bc5_encoder.def_property_readonly(
-        "bc4_encoders", &BC5Encoder::GetBC4Encoders,
-        "2-tuple of internal :py:class:`~quicktex.s3tc.bc4.BC4Encoder` s used for each channel. Readonly.");
+    bc5_encoder.def_property_readonly("channels", &BC5Encoder::GetChannels, "A 2-tuple of channels that will be read from. 0 to 3 inclusive. Readonly.");
+    bc5_encoder.def_property_readonly("bc4_encoders", &BC5Encoder::GetBC4Encoders,
+                                      "2-tuple of internal :py:class:`~quicktex.s3tc.bc4.BC4Encoder` s used for each channel. Readonly.");
    // endregion

    // region BC5Decoder
@ -106,11 +103,9 @@ void InitBC5(py::module_ &s3tc) {
        :returns: A new RawTexture with the same dimensions as the input
    )doc");

-    bc5_decoder.def_property_readonly("channels", &BC5Decoder::GetChannels,
-                                      "A 2-tuple of channels that will be written to. 0 to 3 inclusive. Readonly.");
-    bc5_decoder.def_property_readonly(
-        "bc4_decoders", &BC5Decoder::GetBC4Decoders,
-        "2-tuple of internal :py:class:`~quicktex.s3tc.bc4.BC4Decoder` s used for each channel. Readonly.");
+    bc5_decoder.def_property_readonly("channels", &BC5Decoder::GetChannels, "A 2-tuple of channels that will be written to. 0 to 3 inclusive. Readonly.");
+    bc5_decoder.def_property_readonly("bc4_decoders", &BC5Decoder::GetBC4Decoders,
+                                      "2-tuple of internal :py:class:`~quicktex.s3tc.bc4.BC4Decoder` s used for each channel. Readonly.");
    // endregion
 }
 }  // namespace quicktex::bindings
--- a/quicktex/s3tc/interpolator/Interpolator.cpp
+++ b/quicktex/s3tc/interpolator/Interpolator.cpp
@ -24,8 +24,8 @@
 #include <cstdint>
 #include <stdexcept>

-#include "OldColor.h"
-#include "util/bitbash.h"
+#include "../../util.h"
+#include "../../Color.h"

 namespace quicktex::s3tc {

@ -45,33 +45,25 @@ std::unique_ptr<Interpolator> Interpolator::MakeInterpolator(Interpolator::Type
    }
 }

-uint8_t Interpolator::Interpolate5(uint8_t v0, uint8_t v1) const {
-    return Interpolate8(scale_to_8<5>(v0), scale_to_8<5>(v1));
-}
-uint8_t Interpolator::Interpolate6(uint8_t v0, uint8_t v1) const {
-    return Interpolate8(scale_to_8<6>(v0), scale_to_8<6>(v1));
-}
-uint8_t Interpolator::InterpolateHalf5(uint8_t v0, uint8_t v1) const {
-    return InterpolateHalf8(scale_to_8<5>(v0), scale_to_8<5>(v1));
-}
-uint8_t Interpolator::InterpolateHalf6(uint8_t v0, uint8_t v1) const {
-    return InterpolateHalf8(scale_to_8<6>(v0), scale_to_8<6>(v1));
-}
+uint8_t Interpolator::Interpolate5(uint8_t v0, uint8_t v1) const { return Interpolate8(scale5To8(v0), scale5To8(v1)); }
+uint8_t Interpolator::Interpolate6(uint8_t v0, uint8_t v1) const { return Interpolate8(scale6To8(v0), scale6To8(v1)); }
+uint8_t Interpolator::InterpolateHalf5(uint8_t v0, uint8_t v1) const { return InterpolateHalf8(scale5To8(v0), scale5To8(v1)); }
+uint8_t Interpolator::InterpolateHalf6(uint8_t v0, uint8_t v1) const { return InterpolateHalf8(scale6To8(v0), scale6To8(v1)); }

-std::array<OldColor, 4> Interpolator::Interpolate565BC1(uint16_t low, uint16_t high, bool allow_3color) const {
+std::array<Color, 4> Interpolator::Interpolate565BC1(uint16_t low, uint16_t high, bool allow_3color) const {
    bool use_3color = allow_3color && (high >= low);
-    return InterpolateBC1(OldColor::Unpack565Unscaled(low), OldColor::Unpack565Unscaled(high), use_3color);
+    return InterpolateBC1(Color::Unpack565Unscaled(low), Color::Unpack565Unscaled(high), use_3color);
 }

-std::array<OldColor, 4> Interpolator::InterpolateBC1(OldColor low, OldColor high, bool use_3color) const {
-    auto colors = std::array<OldColor, 4>();
+std::array<Color, 4> Interpolator::InterpolateBC1(Color low, Color high, bool use_3color) const {
+    auto colors = std::array<Color, 4>();
    colors[0] = low.ScaleFrom565();
    colors[1] = high.ScaleFrom565();

    if (use_3color) {
        // 3-color mode
        colors[2] = InterpolateHalfColor24(colors[0], colors[1]);
-        colors[3] = OldColor(0, 0, 0, 0);  // transparent black
+        colors[3] = Color(0, 0, 0, 0);  // transparent black
    } else {
        // 4-color mode
        colors[2] = InterpolateColor24(colors[0], colors[1]);
@ -87,12 +79,8 @@ uint8_t Interpolator::InterpolateHalf8(uint8_t v0, uint8_t v1) const { return (v
 // endregion

 // region InterpolatorRound implementation
-uint8_t InterpolatorRound::Interpolate5(uint8_t v0, uint8_t v1) const {
-    return Interpolate8(scale_to_8<5>(v0), scale_to_8<5>(v1));
-}
-uint8_t InterpolatorRound::Interpolate6(uint8_t v0, uint8_t v1) const {
-    return Interpolate8(scale_to_8<6>(v0), scale_to_8<6>(v1));
-}
+uint8_t InterpolatorRound::Interpolate5(uint8_t v0, uint8_t v1) const { return Interpolate8(scale5To8(v0), scale5To8(v1)); }
+uint8_t InterpolatorRound::Interpolate6(uint8_t v0, uint8_t v1) const { return Interpolate8(scale6To8(v0), scale6To8(v1)); }

 uint8_t InterpolatorRound::Interpolate8(uint8_t v0, uint8_t v1) const { return (v0 * 2 + v1 + 1) / 3; }
 // endregion
@ -120,9 +108,9 @@ uint8_t InterpolatorNvidia::InterpolateHalf6(uint8_t v0, uint8_t v1) const {
    return static_cast<uint8_t>((256 * v0 + gdiff / 4 + 128 + gdiff * 128) >> 8);
 }

-std::array<OldColor, 4> InterpolatorNvidia::InterpolateBC1(OldColor low, OldColor high, bool use_3color) const {
+std::array<Color, 4> InterpolatorNvidia::InterpolateBC1(Color low, Color high, bool use_3color) const {
    // Nvidia is special and interpolation cant be done with 8-bit values, so we need to override the default behavior
-    std::array<OldColor, 4> colors;
+    std::array<Color, 4> colors;
    colors[0] = low.ScaleFrom565();
    colors[1] = high.ScaleFrom565();

@ -133,7 +121,7 @@ std::array<OldColor, 4> InterpolatorNvidia::InterpolateBC1(OldColor low, OldColo
    } else {
        // 3-color mode
        colors[2] = InterpolateHalfColor565(low, high);
-        colors[3] = OldColor(0, 0, 0, 0);  // transparent black
+        colors[3] = Color(0, 0, 0, 0);  // transparent black
    }

    return colors;
@ -141,18 +129,10 @@ std::array<OldColor, 4> InterpolatorNvidia::InterpolateBC1(OldColor low, OldColo
 // endregion

 // region InterpolatorAMD implementation
-uint8_t InterpolatorAMD::Interpolate5(uint8_t v0, uint8_t v1) const {
-    return Interpolate8(scale_to_8<5>(v0), scale_to_8<5>(v1));
-}
-uint8_t InterpolatorAMD::Interpolate6(uint8_t v0, uint8_t v1) const {
-    return Interpolate8(scale_to_8<6>(v0), scale_to_8<6>(v1));
-}
-uint8_t InterpolatorAMD::InterpolateHalf5(uint8_t v0, uint8_t v1) const {
-    return InterpolateHalf8(scale_to_8<5>(v0), scale_to_8<5>(v1));
-}
-uint8_t InterpolatorAMD::InterpolateHalf6(uint8_t v0, uint8_t v1) const {
-    return InterpolateHalf8(scale_to_8<6>(v0), scale_to_8<6>(v1));
-}
+uint8_t InterpolatorAMD::Interpolate5(uint8_t v0, uint8_t v1) const { return Interpolate8(scale5To8(v0), scale5To8(v1)); }
+uint8_t InterpolatorAMD::Interpolate6(uint8_t v0, uint8_t v1) const { return Interpolate8(scale6To8(v0), scale6To8(v1)); }
+uint8_t InterpolatorAMD::InterpolateHalf5(uint8_t v0, uint8_t v1) const { return InterpolateHalf8(scale5To8(v0), scale5To8(v1)); }
+uint8_t InterpolatorAMD::InterpolateHalf6(uint8_t v0, uint8_t v1) const { return InterpolateHalf8(scale6To8(v0), scale6To8(v1)); }

 uint8_t InterpolatorAMD::Interpolate8(uint8_t v0, uint8_t v1) const { return (v0 * 43 + v1 * 21 + 32) >> 6; }

--- a/quicktex/s3tc/interpolator/Interpolator.h
+++ b/quicktex/s3tc/interpolator/Interpolator.h
@ -22,7 +22,7 @@
 #include <cstdint>  // for uint8_t, uint16_t
 #include <memory>   // for unique_ptr

-#include "OldColor.h"  // for Color
+#include "../../Color.h"  // for Color

 namespace quicktex::s3tc {

@ -97,7 +97,7 @@ class Interpolator {
     * @param allow_3color if true, a different interpolation mode will be used if high >= low
     * @return an array of 4 Color values, with indices matching BC1 selectors
     */
-    std::array<OldColor, 4> Interpolate565BC1(uint16_t low, uint16_t high, bool allow_3color = true) const;
+    std::array<Color, 4> Interpolate565BC1(uint16_t low, uint16_t high, bool allow_3color = true) const;

    /**
     * Generates the 4 colors for a BC1 block from the given
@ -106,7 +106,7 @@ class Interpolator {
     * @param use_3color if the 3-color interpolation mode should be used
     * @return an array of 4 Color values, with indices matching BC1 selectors
     */
-    virtual std::array<OldColor, 4> InterpolateBC1(OldColor low, OldColor high, bool use_3color) const;
+    virtual std::array<Color, 4> InterpolateBC1(Color low, Color high, bool use_3color) const;

    /**
     * Gets the type of an interpolator
@ -126,12 +126,12 @@ class Interpolator {
    }

   private:
-    OldColor InterpolateColor24(const OldColor &c0, const OldColor &c1) const {
-        return OldColor(Interpolate8(c0.r, c1.r), Interpolate8(c0.g, c1.g), Interpolate8(c0.b, c1.b));
+    Color InterpolateColor24(const Color &c0, const Color &c1) const {
+        return Color(Interpolate8(c0.r, c1.r), Interpolate8(c0.g, c1.g), Interpolate8(c0.b, c1.b));
    }

-    OldColor InterpolateHalfColor24(const OldColor &c0, const OldColor &c1) const {
-        return OldColor(InterpolateHalf8(c0.r, c1.r), InterpolateHalf8(c0.g, c1.g), InterpolateHalf8(c0.b, c1.b));
+    Color InterpolateHalfColor24(const Color &c0, const Color &c1) const {
+        return Color(InterpolateHalf8(c0.r, c1.r), InterpolateHalf8(c0.g, c1.g), InterpolateHalf8(c0.b, c1.b));
    }
 };

@ -152,18 +152,18 @@ class InterpolatorNvidia final : public Interpolator {
    virtual uint8_t InterpolateHalf5(uint8_t v0, uint8_t v1) const override;
    virtual uint8_t InterpolateHalf6(uint8_t v0, uint8_t v1) const override;

-    virtual std::array<OldColor, 4> InterpolateBC1(OldColor low, OldColor high, bool use_3color) const override;
+    virtual std::array<Color, 4> InterpolateBC1(Color low, Color high, bool use_3color) const override;

    virtual Type GetType() const noexcept override { return Type::Nvidia; }
    virtual bool CanInterpolate8Bit() const noexcept override { return false; }

   private:
-    OldColor InterpolateColor565(const OldColor &c0, const OldColor &c1) const {
-        return OldColor(Interpolate5(c0.r, c1.r), Interpolate6(c0.g, c1.g), Interpolate5(c0.b, c1.b));
+    Color InterpolateColor565(const Color &c0, const Color &c1) const {
+        return Color(Interpolate5(c0.r, c1.r), Interpolate6(c0.g, c1.g), Interpolate5(c0.b, c1.b));
    }

-    OldColor InterpolateHalfColor565(const OldColor &c0, const OldColor &c1) const {
-        return OldColor(InterpolateHalf5(c0.r, c1.r), InterpolateHalf6(c0.g, c1.g), InterpolateHalf5(c0.b, c1.b));
+    Color InterpolateHalfColor565(const Color &c0, const Color &c1) const {
+        return Color(InterpolateHalf5(c0.r, c1.r), InterpolateHalf6(c0.g, c1.g), InterpolateHalf5(c0.b, c1.b));
    }
 };

--- a/quicktex/test.cpp
+++ b/quicktex/test.cpp
@ -1,31 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <array>
-#include <cstdint>
-#include <xsimd/xsimd.hpp>
-
-#include "Matrix.h"
-
-// Type your code here, or load an example.
-namespace quicktex {
-auto test(Matrix<float, 4, 1> a, Matrix<float, 4, 1> b, Matrix<float, 4, 1> c) {
-    return a * 7;
-};
-}  // namespace quicktex
--- a/quicktex/texture/BlockTexture.h
+++ b/quicktex/texture/BlockTexture.h
@ -1,70 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <array>
-#include <vector>
-
-#include "Texture.h"
-
-namespace quicktex {
-template <typename B> class BlockTexture final : public Texture {
-   private:
-    std::vector<B> _blocks;
-    unsigned _width_b;
-    unsigned _height_b;
-
-   public:
-    using BlockType = B;
-    using Base = Texture;
-
-    /**
-     * Create a new BlockTexture
-     * @param width width of the texture in pixels. must be divisible by B::width
-     * @param height height of the texture in pixels. must be divisible by B::height
-     */
-    BlockTexture(int w, int h) : Base(w, h) {
-        _width_b = (width + B::Width - 1) / B::Width;
-        _height_b = (height + B::Height - 1) / B::Height;
-        _blocks = std::vector<B>(_width_b * _height_b);
-    }
-
-    constexpr unsigned bwidth() const { return _width_b; }
-    constexpr unsigned bheight() const { return _height_b; }
-    constexpr std::tuple<int, int> bsize() const { return std::tuple<int, int>(_width_b, _height_b); }
-
-    B get_block(unsigned x, unsigned y) const {
-        if (x >= _width_b) throw std::out_of_range("x value out of range.");
-        if (y >= _height_b) throw std::out_of_range("y value out of range.");
-        return _blocks.at(x + (y * _width_b));
-    }
-
-    void set_block(unsigned x, unsigned y, const B &val) {
-        if (x >= _width_b) throw std::out_of_range("x value out of range.");
-        if (y >= _height_b) throw std::out_of_range("y value out of range.");
-        _blocks.at(x + (y * _width_b)) = val;
-    }
-
-    size_t nbytes() const noexcept override { return _blocks.size() * sizeof(B); }
-
-    const uint8_t *data() const noexcept override { return reinterpret_cast<const uint8_t *>(_blocks.data()); }
-    uint8_t *data() noexcept override { return reinterpret_cast<uint8_t *>(_blocks.data()); }
-};
-}  // namespace quicktex
--- a/quicktex/texture/RawTexture.cpp
+++ b/quicktex/texture/RawTexture.cpp
@ -1,33 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "RawTexture.h"
-
-namespace quicktex {
-Color RawTexture::pixel(unsigned x, unsigned y) const {
-    if (x >= width) throw std::invalid_argument("x value out of range.");
-    if (y >= height) throw std::invalid_argument("y value out of range.");
-    return _pixels.at(x + (y * width));
-}
-quicktex::Color& RawTexture::pixel(unsigned x, unsigned y) {
-    if (x >= width) throw std::invalid_argument("x value out of range.");
-    if (y >= height) throw std::invalid_argument("y value out of range.");
-    return _pixels.at(x + (y * width));
-}
-}  // namespace quicktex
--- a/quicktex/texture/RawTexture.h
+++ b/quicktex/texture/RawTexture.h
@ -1,97 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <array>
-#include <climits>
-#include <cstdint>
-#include <cstdio>
-#include <cstring>
-#include <memory>
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#include <vector>
-
-#include "Color.h"
-#include "ColorBlock.h"
-#include "OldColor.h"
-#include "texture/Texture.h"
-
-namespace quicktex {
-class RawTexture : public Texture {
-    using Base = Texture;
-
-   public:
-    /**
-     * Create a new RawTexture
-     * @param width width of the texture in pixels
-     * @param height height of the texture in pixels
-     */
-    RawTexture(int w, int h) : Base(w, h), _pixels(w* h) {}
-
-    quicktex::Color pixel(unsigned x, unsigned y) const;
-
-    quicktex::Color &pixel(unsigned x, unsigned y);
-
-    quicktex::Color pixel_wrapped(unsigned x, unsigned y) const { return pixel(x % width, y % height); }
-
-    quicktex::Color &pixel_wrapped(unsigned x, unsigned y) { return pixel(x % width, y % height); }
-
-    size_t nbytes() const noexcept override { return static_cast<size_t>(width * height) * sizeof(quicktex::Color); }
-
-    template <int N, int M> quicktex::ColorBlock<N, M> get_block(int block_x, int block_y) const {
-        if (block_x < 0) throw std::out_of_range("x value out of range.");
-        if (block_y < 0) throw std::out_of_range("y value out of range.");
-
-        // coordinates in the image of the top-left pixel of the selected block
-        quicktex::ColorBlock<N, M> block;
-        int pixel_x = block_x * N;
-        int pixel_y = block_y * M;
-
-        // slower pixel-wise copy if the block goes over the edges
-        for (int x = 0; x < N; x++) {
-            for (int y = 0; y < M; y++) { block.Set(x, y, pixel((pixel_x + x) % width, (pixel_y + y) % height)); }
-        }
-
-        return block;
-    }
-
-    template <int N, int M> void set_block(int block_x, int block_y, const quicktex::ColorBlock<N, M> &block) {
-        if (block_x < 0) throw std::out_of_range("x value out of range.");
-        if (block_y < 0) throw std::out_of_range("y value out of range.");
-
-        // coordinates in the image of the top-left pixel of the selected block
-        int pixel_x = block_x * N;
-        int pixel_y = block_y * M;
-
-        // slower pixel-wise copy if the block goes over the edges
-        for (int x = 0; x < N; x++) {
-            for (int y = 0; y < M; y++) { pixel((pixel_x + x) % width, (pixel_y + y) % height) = block.Get(x, y); }
-        }
-    }
-
-    virtual const uint8_t *data() const noexcept override { return reinterpret_cast<const uint8_t *>(_pixels.data()); }
-    virtual uint8_t *data() noexcept override { return reinterpret_cast<uint8_t *>(_pixels.data()); }
-
-   protected:
-    std::vector<quicktex::Color> _pixels;
-};
-}  // namespace quicktex
--- a/quicktex/texture/Texture.h
+++ b/quicktex/texture/Texture.h
@ -1,62 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <array>
-#include <climits>
-#include <cstdint>
-#include <cstdio>
-#include <cstring>
-#include <memory>
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#include <vector>
-
-#include "Color.h"
-#include "ColorBlock.h"
-#include "OldColor.h"
-#include "Window.h"
-
-namespace quicktex {
-
-class Texture {
-   public:
-    const unsigned width;
-    const unsigned height;
-
-    virtual ~Texture() = default;
-
-    virtual std::tuple<unsigned, unsigned> Size() const { return {width, height}; }
-
-    /**
-     * The texture's total size
-     * @return The size of the texture in bytes.
-     */
-    virtual size_t nbytes() const noexcept = 0;
-
-    virtual const uint8_t *data() const noexcept = 0;
-    virtual uint8_t *data() noexcept = 0;
-
-   protected:
-    Texture(unsigned w, unsigned h) : width(w), height(h) {}
-};
-
-}  // namespace quicktex
--- a/quicktex/texture/Window.cpp
+++ b/quicktex/texture/Window.cpp
@ -1,90 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "Window.h"
-
-#include "texture/RawTexture.h"
-
-namespace quicktex {
-
-// Window
-Window::Window(RawTexture& texture, unsigned w, unsigned h, unsigned px, unsigned py)
-    : width(w), height(h), x(px), y(py), _texture(texture) {
-    assert(x < texture.width);
-    assert(y < texture.height);
-}
-
-Color& Window::pixel(unsigned px, unsigned py) {
-    assert(px < width && py < height);
-    return _texture.pixel(x + px, y + py);
-}
-
-Color Window::pixel(unsigned px, unsigned py) const {
-    assert(px < width && py < height);
-    return _texture.pixel(x + px, y + py);
-}
-
-WindowIterator Window::begin() { return WindowIterator(*this, 0, 0); }
-WindowIterator Window::end() { return WindowIterator(*this, 0, height); }
-WindowIterator Window::row_begin(unsigned int row) { return WindowIterator(*this, 0, row); }
-WindowIterator Window::row_end(unsigned int row) { return WindowIterator(*this, 0, row + 1); }
-
-bool Window::operator==(const Window& rhs) const {
-    return width == rhs.width && height == rhs.height && x == rhs.x && y == rhs.y && &_texture == &rhs._texture;
-}
-
-// WindowIterator
-
-WindowIterator::WindowIterator(Window& view, unsigned px, unsigned py) : x(px), y(py), _view(&view) {
-    assert(x < view.width);
-    assert(y < view.height || (y == view.height && x == 0));
-    // if y == the height, and x == 0, then this is a sentinel for the end of iteration, and cannot be dereferenced
-}
-
-WindowIterator& quicktex::WindowIterator::operator++() {  // prefix increment
-    x++;
-    if (x >= _view->width) {
-        x = 0;
-        y++;
-    }
-    return *this;
-}
-
-WindowIterator WindowIterator::operator++(int) {  // postfix increment
-    WindowIterator old = *this;
-    ++(*this);
-    return old;
-}
-
-Color& WindowIterator::operator*() const {  // dereference operator
-    assert(_view != nullptr);
-    assert(x < _view->width && y < _view->height);
-    return _view->pixel(x, y);
-}
-
-Color* WindowIterator::operator->() { return &(**this); }  // returns a pointer to what's returned by operator*
-
-bool WindowIterator::operator==(const WindowIterator& rhs) const {
-    return x == rhs.x && y == rhs.y && _view == rhs._view;
-}
-
-static_assert(std::forward_iterator<WindowIterator>);
-// static_assert(sized_range<Window>);
-
-}  // namespace quicktex
--- a/quicktex/texture/Window.h
+++ b/quicktex/texture/Window.h
@ -1,82 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-#include "Color.h"
-#include "util/ranges.h"
-
-namespace quicktex {
-
-// forward declarations
-class WindowIterator;
-class RawTexture;
-
-/**
- * Class representing a window into a RawTexture
- */
-class Window {
-   public:
-    typedef Color value_type;
-
-    const unsigned width, height;
-    const unsigned x, y;
-
-    Window(RawTexture &texture, unsigned w, unsigned h, unsigned x, unsigned y);
-
-    Color &pixel(unsigned px, unsigned py);
-    Color pixel(unsigned px, unsigned py) const;
-
-    WindowIterator begin();
-    WindowIterator end();
-    WindowIterator row_begin(unsigned row);
-    WindowIterator row_end(unsigned row);
-
-    size_t size() const { return width * height; }
-
-    bool operator==(const Window &rhs) const;
-
-   private:
-    RawTexture &_texture;
-};
-
-/**
- * Iterator returned by Window
- */
-class WindowIterator {
-   public:
-    typedef long long difference_type;
-    typedef Color value_type;
-
-    unsigned x, y;
-
-    WindowIterator(Window &view, unsigned x, unsigned y);
-    WindowIterator() : x(0), y(0), _view(nullptr) {}
-
-    Color &operator*() const;  // dereference
-    Color *operator->();       // member access
-
-    WindowIterator &operator++();    // prefix increment
-    WindowIterator operator++(int);  // postfix increment
-    bool operator==(const WindowIterator &rhs) const;
-
-   private:
-    Window *_view;
-};
-
-}  // namespace quicktex
--- a/quicktex/util.h
+++ b/quicktex/util.h
@ -0,0 +1,178 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <type_traits>
+#include <functional>
+#include <vector>
+
+#define UINT5_MAX 0x1FU  // 31, largest value that fits in 5 bits
+#define UINT6_MAX 0x3FU  // 63, largest value that fits in 6 bits
+
+// Debug-only range checks. The argument is parenthesized so that a compound
+// expression such as `a & b` is compared as a whole rather than being split by operator precedence.
+#define assert5bit(x) assert((x) <= UINT5_MAX)
+#define assert6bit(x) assert((x) <= UINT6_MAX)
+
+/**
+ * Absolute value of a signed integer, returned as the matching unsigned type.
+ * @tparam S Signed integral input type.
+ * @param i Value to take the magnitude of.
+ * @return Magnitude of i as std::make_unsigned<S>::type. Well defined for every input, including the most negative value of S.
+ */
+template <typename S> constexpr auto iabs(S i) {
+    static_assert(!std::is_unsigned<S>::value);
+    using O = typename std::make_unsigned<S>::type;
+    // Negate in the unsigned domain: `-i` is a signed overflow when i is the minimum value of int or a wider type.
+    const O magnitude = static_cast<O>(i);
+    return (i < 0) ? static_cast<O>(O{0} - magnitude) : magnitude;
+}
+
+/**
+ * Unpacks an unsigned integer into an array of smaller integers.
+ * @tparam I Input data type. Must be an unsigned integral type large enough to hold C * S bits.
+ * @tparam O Output data type. Must be an unsigned integral type large enough to hold S bits.
+ * @tparam S Number of bits in each value.
+ * @tparam C Number of values to unpack.
+ * @param packed Packed integer input of type I. Value 0 is read from the least significant S bits.
+ * @return Unpacked std::array of type O and size C.
+ */
+template <typename I, typename O, size_t S, size_t C> constexpr std::array<O, C> Unpack(I packed) {
+    // type checking
+    static_assert(std::is_unsigned<I>::value, "Packed input type must be unsigned");
+    static_assert(std::is_unsigned<O>::value, "Unpacked output type must be unsigned");
+    static_assert(std::numeric_limits<I>::digits >= (C * S), "Packed input type must be big enough to represent the number of bits multiplied by count");
+    static_assert(std::numeric_limits<O>::digits >= S, "Unpacked output type must be big enough to represent the number of bits");
+
+    // maximum value representable by S bits. Derived by shifting an all-ones O to the right,
+    // because `1U << S` overflows as soon as S reaches the width of unsigned int.
+    constexpr O mask = (S == 0) ? O{0} : static_cast<O>(static_cast<O>(~O{0}) >> (std::numeric_limits<O>::digits - S));
+    std::array<O, C> vals{};  // output values array of size C
+
+    for (size_t i = 0; i < C; i++) {
+        vals[i] = static_cast<O>(static_cast<O>(packed >> (i * S)) & mask);
+        assert(vals[i] <= mask);
+    }
+
+    return vals;
+}
+
+/**
+ * Packs an array of unsigned integers into a single integer.
+ * @tparam I Input data type. Must be an unsigned integral type large enough to hold S bits.
+ * @tparam O Output data type. Must be an unsigned integral type large enough to hold C * S bits.
+ * @tparam S Number of bits in each value.
+ * @tparam C Number of values to pack.
+ * @param vals Unpacked std::array of type I and size C. Value 0 is written to the least significant S bits.
+ * @return Packed integer output of type O.
+ */
+template <typename I, typename O, size_t S, size_t C> constexpr O Pack(const std::array<I, C> &vals) {
+    // type checking
+    static_assert(std::is_unsigned<I>::value, "Unpacked input type must be unsigned");
+    static_assert(std::is_unsigned<O>::value, "Packed output type must be unsigned");
+    static_assert(std::numeric_limits<I>::digits >= S, "Unpacked input type must be big enough to represent the number of bits");
+    static_assert(std::numeric_limits<O>::digits >= (C * S), "Packed output type must be big enough to represent the number of bits multiplied by count");
+
+    // Bounds used by the debug assertions. Both are derived by shifting an all-ones value to the right:
+    // a left shift of 1 by S or C * S bits overflows when the value fills its type completely.
+    [[maybe_unused]] constexpr I val_max = (S == 0) ? I{0} : static_cast<I>(static_cast<I>(~I{0}) >> (std::numeric_limits<I>::digits - S));
+    [[maybe_unused]] constexpr O packed_max = (C * S == 0) ? O{0} : static_cast<O>(static_cast<O>(~O{0}) >> (std::numeric_limits<O>::digits - C * S));
+
+    O packed = 0;  // output value of type O
+
+    for (size_t i = 0; i < C; i++) {
+        assert(vals[i] <= val_max);
+        packed |= static_cast<O>(vals[i]) << (i * S);
+    }
+
+    assert(packed <= packed_max);
+    return packed;
+}
+
+/**
+ * Builds a lookup table by evaluating Op at every index.
+ * @tparam Size Number of table entries.
+ * @tparam Op Function called with each index from 0 to Size - 1; its result is stored as a uint8_t.
+ * @return std::array whose element i holds Op(i).
+ */
+template <size_t Size, int Op(int)> constexpr std::array<uint8_t, Size> ExpandArray() {
+    std::array<uint8_t, Size> table;
+    int index = 0;
+    for (uint8_t &entry : table) {
+        entry = Op(index);
+        ++index;
+    }
+    return table;
+}
+
+/**
+ * Applies a function to every element of a fixed-size sequence.
+ * @tparam Seq Sequence type exposing value_type, std::tuple_size and operator[].
+ * @tparam Fn Callable taking one element of the sequence.
+ * @param input Sequence to read from.
+ * @param op Function applied to each element.
+ * @return std::array of the same length holding op(input[i]) at index i.
+ */
+template <typename Seq, typename Fn> constexpr auto MapArray(const Seq &input, Fn op) {
+    using Elem = typename Seq::value_type;
+    using Result = decltype(op(std::declval<Elem>()));
+    constexpr size_t Count = std::tuple_size<Seq>::value;
+
+    std::array<Result, Count> mapped;
+    unsigned at = 0;
+    while (at < Count) {
+        mapped[at] = op(input[at]);
+        ++at;
+    }
+    return mapped;
+}
+
+// Rescales an 8-bit value to 5 bits (0 stays 0, 255 becomes 31).
+// The two shifts by 8 stand in for a rounded division by 255.
+template <typename S> constexpr S scale8To5(S v) {
+    const auto biased = v * 31 + 128;
+    const auto folded = biased + (biased >> 8);
+    return static_cast<S>(folded >> 8);
+}
+// Rescales an 8-bit value to 6 bits (0 stays 0, 255 becomes 63).
+// The two shifts by 8 stand in for a rounded division by 255.
+template <typename S> constexpr S scale8To6(S v) {
+    const auto biased = v * 63 + 128;
+    const auto folded = biased + (biased >> 8);
+    return static_cast<S>(folded >> 8);
+}
+
+// Expands a 5-bit value to 8 bits by replicating its top bits into the vacated low bits, so 31 maps to 255.
+template <typename S> constexpr S scale5To8(S v) {
+    assert5bit(v);
+    const auto upper = v << 3;  // the 5 input bits moved to the top of the byte
+    const auto lower = v >> 2;  // the 3 most significant input bits fill the bottom
+    return static_cast<S>(upper | lower);
+}
+// Expands a 6-bit value to 8 bits by replicating its top bits into the vacated low bits, so 63 maps to 255.
+template <typename S> constexpr S scale6To8(S v) {
+    assert6bit(v);
+    const auto upper = v << 2;  // the 6 input bits moved to the top of the byte
+    const auto lower = v >> 4;  // the 2 most significant input bits fill the bottom
+    return static_cast<S>(upper | lower);
+}
+
+// Largest of two, three or four values, compared with operator>. The wider overloads fold left to right.
+template <typename S> constexpr S maximum(S a, S b) {
+    if (a > b) return a;
+    return b;
+}
+template <typename S> constexpr S maximum(S a, S b, S c) {
+    const S first_two = maximum(a, b);
+    return maximum(first_two, c);
+}
+template <typename S> constexpr S maximum(S a, S b, S c, S d) {
+    const S first_three = maximum(a, b, c);
+    return maximum(first_three, d);
+}
+
+// Smallest of two, three or four values, compared with operator<. The wider overloads fold left to right.
+template <typename S> constexpr S minimum(S a, S b) {
+    if (a < b) return a;
+    return b;
+}
+template <typename S> constexpr S minimum(S a, S b, S c) {
+    const S first_two = minimum(a, b);
+    return minimum(first_two, c);
+}
+template <typename S> constexpr S minimum(S a, S b, S c, S d) {
+    const S first_three = minimum(a, b, c);
+    return minimum(first_three, d);
+}
+
+// Returns a multiplied by itself.
+template <typename T> constexpr T square(T a) {
+    const T squared = a * a;
+    return squared;
+}
+
+// Limits value to the range [low, high], which defaults to [0, 1]. The lower bound is tested first.
+constexpr float clampf(float value, float low = 0.0f, float high = 1.0f) {
+    return (value < low) ? low : ((value > high) ? high : value);
+}
+// Saturates a 32-bit signed integer to the range of a byte: negatives become 0, values above 255 become 255.
+constexpr uint8_t clamp255(int32_t i) {
+    if (i < 0) return 0;
+    if (i > 255) return 255;
+    return static_cast<uint8_t>(i);
+}
+
+// Limits value to the range [low, high]. The lower bound is tested first.
+template <typename S> constexpr S clamp(S value, S low, S high) {
+    if (value < low) return low;
+    if (value > high) return high;
+    return value;
+}
+// Limits a 32-bit signed integer to the range [low, high]. The lower bound is tested first.
+constexpr int32_t clampi(int32_t value, int32_t low, int32_t high) {
+    return (value < low) ? low : ((value > high) ? high : value);
+}
+
+// Returns a multiplied by itself.
+constexpr int squarei(int a) {
+    const int product = a * a;
+    return product;
+}
+// Returns the magnitude of a as an int.
+constexpr int absi(int a) {
+    if (a >= 0) return a;
+    return -a;
+}
+
+// Linear interpolation: returns a when s is 0 and moves towards b as s grows.
+template <typename F> constexpr F lerp(F a, F b, F s) {
+    const auto span = b - a;
+    return a + span * s;
+}
+
+/**
+ * Substitutes positional placeholders in a template string.
+ * Every occurrence of "{0}" is replaced by the first argument, "{1}" by the second, and so on.
+ * With no arguments the text is returned unchanged.
+ * @param str Template text containing the placeholders.
+ * @param args Replacement values; each one must be usable to brace-initialize a std::string.
+ * @return Copy of str with the placeholders substituted.
+ */
+template <typename... Args> std::string Format(const char *str, const Args &...args) {
+    auto output = std::string(str);
+
+    // Convert each argument on its own. Wrapping the whole pack in a second pair of braces
+    // would let two C-string arguments select std::string's iterator-range constructor.
+    std::vector<std::string> values = {std::string{args}...};
+
+    for (unsigned i = 0; i < values.size(); i++) {
+        const auto key = "{" + std::to_string(i) + "}";
+        const auto &value = values[i];
+
+        // Resume the search after the inserted text, so a value that contains its own key cannot loop forever
+        size_t where = output.find(key);
+        while (where != output.npos) {
+            output.replace(where, key.length(), value);
+            where = output.find(key, where + value.length());
+        }
+    }
+
+    return output;
+}
--- a/quicktex/util/bitbash.h
+++ b/quicktex/util/bitbash.h
@ -1,313 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <array>
-#include <cassert>
-#include <concepts>
-#include <limits>
-#include <numeric>
-#include <type_traits>
-
-#include "iterator.h"
-#include "util/math.h"
-#include "util/ranges.h"
-
-#define UINT5_MAX 0x1FU  // 31
-#define UINT6_MAX 0x3FU  // 63
-
-// The argument is parenthesized so that an expression built from lower-precedence operators
-// (e.g. `a & b`) is compared as a whole instead of `<=` binding to its last operand only.
-#define assert5bit(x) assert((x) <= UINT5_MAX)
-#define assert6bit(x) assert((x) <= UINT6_MAX)
-
-namespace quicktex {
-
-// Rescales an 8-bit value to N bits.
-template <size_t N, typename S> S scale_from_8(S v) {
-    static_assert(N < 8);
-    assert(v < (1 << 8));
-
-    // multiply by the N-bit maximum, then divide by 255 with rounding via the
-    // add-high-byte trick instead of an actual division
-    const unsigned max = (1 << N) - 1;
-    const unsigned scaled = (v * max) + 128;
-    const auto result = static_cast<S>((scaled + (scaled >> 8)) >> 8);
-
-    assert(result < (1 << N));
-    return result;
-}
-
-/**
- * Expands an N-bit value to 8 bits by replicating its high bits into the low bits.
- * @tparam N width of the input in bits; must be below 8
- * @param v value to expand; must fit in N bits
- * @return the 8-bit expansion of v
- */
-template <size_t N, typename S> S scale_to_8(S v) {
-    static_assert(N < 8);
-    assert(v < (1 << N));
-
-    // NOTE(review): Rshift is N - (8 - N) in unsigned arithmetic, so it wraps for N < 4;
-    // callers appear to be expected to use N >= 4 -- confirm.
-    constexpr unsigned Lshift = 8 - N;
-    constexpr unsigned Rshift = N - Lshift;
-    S result = static_cast<S>((v << Lshift) | (v >> Rshift));
-
-    // postcondition: the expanded value (not the input) must fit in 8 bits
-    assert(result < (1 << 8));
-
-    return result;
-}
-
-/**
- * Unpacks an unsigned integer into a range of smaller integers.
- * @param packed value to unpack
- * @param begin destination start iterator
- * @param end destination end iterator
- * @param widths widths iterator. values are in bits
- * @param little_endian if the input has the first element in the least significant place
- * @return the total number of bits unpacked
- */
-template <typename P, typename OI, typename WI>
-    requires std::unsigned_integral<P> && std::output_iterator<OI, P> && std::forward_iterator<WI>
-size_t unpack_into(P packed, OI begin, OI end, WI widths, bool little_endian = true) {
-    using U = std::remove_cvref_t<decltype(*begin)>;
-
-    // Mask of the low w bits, built in the packed type so that widths of 31 bits or more
-    // neither overflow an int nor truncate a 64-bit input. A full-width field gets all ones.
-    auto low_bits = [](size_t w) -> P {
-        if (w >= static_cast<size_t>(std::numeric_limits<P>::digits)) return std::numeric_limits<P>::max();
-        return static_cast<P>((P{1} << w) - 1);
-    };
-
-    if (little_endian) {
-        // first element is in the least significant place of packed
-
-        unsigned offset = 0;
-        while (begin < end) {
-            auto w = *(widths++);
-            assert(static_cast<size_t>(w) <= static_cast<size_t>(std::numeric_limits<U>::digits));
-
-            *(begin++) = (packed >> offset) & low_bits(w);  // write to output
-
-            offset += w;  // increment offset
-        }
-
-        assert(offset <= std::numeric_limits<P>::digits);  // detect an overflow condition
-        return offset;
-    } else {
-        // first element is in the most significant place of packed
-
-        // with non-constant width, we either need to iterate backwards or
-        // add up all the widths beforehand to know where to begin
-        unsigned total_offset = std::accumulate(widths, widths + std::distance(begin, end), 0);
-        assert(total_offset <= std::numeric_limits<P>::digits);  // detect an overflow condition
-
-        unsigned offset = total_offset;
-        while (begin < end) {
-            auto w = *(widths++);
-            offset -= w;  // decrement offset
-            // same bound as the little-endian branch: a field may be as wide as the output type
-            assert(static_cast<size_t>(w) <= static_cast<size_t>(std::numeric_limits<U>::digits));
-
-            *(begin++) = (packed >> offset) & low_bits(w);  // write to output
-        }
-
-        return total_offset;
-    }
-}
-
-/**
- * Unpacks an unsigned integer into a range of smaller integers.
- * @param packed value to unpack
- * @param dest destination range
- * @param widths widths range. values are in bits
- * @param little_endian if the input has the first element in the least significant place
- * @return the total number of bits unpacked
- */
-template <typename P, typename OR, typename WR>
-    requires std::unsigned_integral<P> && range<OR> && range<WR>
-size_t unpack_into(P packed, OR &dest, const WR &widths, bool little_endian = true) {
-    // one width per destination element
-    assert(size(widths) == size(dest));
-
-    auto out_first = dest.begin();
-    auto out_last = dest.end();
-    return unpack_into(packed, out_first, out_last, widths.begin(), little_endian);
-}
-
-/**
- * Unpacks an unsigned integer into a range of smaller integers.
- * @param packed value to unpack
- * @param begin destination start iterator
- * @param end destination end iterator
- * @param width width of each packed element in bits
- * @param little_endian if the input has the first element in the least significant place
- * @return the total number of bits unpacked
- */
-template <typename P, typename OI>
-    requires std::unsigned_integral<P> && std::output_iterator<OI, P>
-size_t unpack_into(P packed, OI begin, OI end, size_t width, bool little_endian = true) {
-    // a const_iterator yields the same width for every element
-    auto uniform_widths = const_iterator(width);
-    return unpack_into(packed, begin, end, uniform_widths, little_endian);
-}
-
-/**
- * Unpacks an unsigned integer into a range of smaller integers.
- * @param packed value to unpack
- * @param dest destination range
- * @param width width of each packed element in bits
- * @param little_endian if the input has the first element in the least significant place
- * @return the total number of bits unpacked
- */
-template <typename P, typename OR>
-    requires std::unsigned_integral<P> && range<OR>
-size_t unpack_into(P packed, OR &dest, size_t width, bool little_endian = true) {
-    // a const_iterator yields the same width for every element
-    auto uniform_widths = const_iterator(width);
-    auto out_first = dest.begin();
-    auto out_last = dest.end();
-    return unpack_into(packed, out_first, out_last, uniform_widths, little_endian);
-}
-
-/**
- * Unpacks an unsigned integer into an array of smaller integers
- * @tparam U unpacked data type
- * @tparam N number of values to unpack
- * @param packed value to unpack
- * @param widths widths iterator. values are in bits
- * @param little_endian if the input has the first element in the least significant place
- * @return an array of unpacked values
- */
-template <typename U, size_t N, typename P, typename WI>
-    requires std::unsigned_integral<P> && std::forward_iterator<WI>
-std::array<U, N> unpack(P packed, WI widths, bool little_endian = true) {
-    std::array<U, N> unpacked;
-    // `widths` is an iterator, not a range or a single width, so none of the unpack_into
-    // overloads taking a destination range accept it. Pass the destination as an explicit
-    // begin/end pair to select the iterator-based overload.
-    unpack_into(packed, unpacked.begin(), unpacked.end(), widths, little_endian);
-    return unpacked;
-}
-
-/**
- * Unpacks an unsigned integer into an array of smaller integers
- * @tparam U unpacked data type
- * @param packed value to unpack
- * @param widths widths array. values are in bits
- * @param little_endian if the input has the first element in the least significant place
- * @return an array of unpacked values
- */
-template <typename U, size_t N, typename P>
-    requires std::unsigned_integral<P>
-std::array<U, N> unpack(P packed, const std::array<size_t, N> &widths, bool little_endian = true) {
-    // the array length already matches N, so hand over its first-element iterator
-    auto first_width = widths.begin();
-    return unpack<U, N>(packed, first_width, little_endian);
-}
-
-/**
- * Unpacks an unsigned integer into an array of smaller integers
- * @tparam U unpacked data type
- * @tparam N number of values to unpack
- * @param packed value to unpack
- * @param widths widths range. values are in bits
- * @param little_endian if the input has the first element in the least significant place
- * @return an array of unpacked values
- */
-template <typename U, size_t N, typename P, typename WR>
-    requires std::unsigned_integral<P> && range<WR>
-std::array<U, N> unpack(P packed, const WR &widths, bool little_endian = true) {
-    // one width per unpacked element
-    assert(size(widths) == N);
-
-    auto first_width = widths.begin();
-    return unpack<U, N>(packed, first_width, little_endian);
-}
-
-/**
- * Unpacks an unsigned integer into an array of smaller integers
- * @tparam U unpacked data type
- * @tparam N number of values to unpack
- * @param packed value to unpack
- * @param width width of each packed element in bits
- * @param little_endian if the input has the first element in the least significant place
- * @return an array of unpacked values
- */
-template <typename U, size_t N, typename P>
-    requires std::unsigned_integral<P>
-std::array<U, N> unpack(P packed, size_t width, bool little_endian = true) {
-    // filled in place by the uniform-width unpack_into overload
-    std::array<U, N> result;
-    unpack_into(packed, result, width, little_endian);
-    return result;
-}
-
-/**
- * Packs an iterable of integers into a single integer.
- * @tparam II input iterator type
- * @tparam WI width iterator type
- * @tparam P Output data type. must be an unsigned integral type large enough to hold all input values
- * @param start start iterator
- * @param end end iterator
- * @param widths width iterator. must be at least as large as the input data
- * @param little_endian if the output value should have the first element in the least significant place
- * of the output or not
- * @return Packed integer of type P.
- */
-template <typename P, typename II, typename WI>
-    requires std::unsigned_integral<P> && std::input_iterator<II> && std::input_iterator<WI>
-inline constexpr P pack(II start, II end, WI widths, bool little_endian = true) {
-    constexpr size_t packed_bits = std::numeric_limits<P>::digits;
-
-    P packed = 0;
-    unsigned offset = 0;
-    while (start < end) {
-        P val = static_cast<P>(*(start++));
-        const size_t w = *(widths++);
-
-        // Keep only the low w bits. The mask is built in P so that widths of 31 bits or more
-        // do not overflow an int; a full-width field needs no masking at all.
-        const bool full_width = (w >= packed_bits);
-        if (!full_width) val &= static_cast<P>((P{1} << w) - 1);
-
-        if (little_endian) {
-            packed |= static_cast<P>(val << offset);  // first element is in the least significant place of packed
-        } else {
-            // first element is in the most significant place of packed;
-            // shifting by the full width of P would be undefined, so a full-width field replaces packed
-            packed = full_width ? val : static_cast<P>(static_cast<P>(packed << w) | val);
-        }
-
-        offset += w;  // increment offset
-    }
-
-    assert(offset <= packed_bits);  // detect an overflow condition
-    return packed;
-}
-
-/**
- * Packs an iterable of integers into a single integer.
- * @tparam IR input range type
- * @tparam WR width range type
- * @tparam P Output data type. must be an unsigned integral type large enough to hold all input values
- * @param r range of values to pack
- * @param widths range of widths to pack with. must be at least as large as r
- * @param little_endian if the output value should have the first element in the least significant place
- * of the output or not
- * @return Packed integer of type P.
- */
-template <typename P, typename IR, typename WR>
-    requires std::unsigned_integral<P> && range<IR> && range<WR>
-inline constexpr P pack(IR r, WR widths, bool little_endian = true) {
-    // one width per input element
-    assert(size(widths) == size(r));
-    // the range concept guarantees begin()/end(); there is no start() member to call
-    return pack<P>(r.begin(), r.end(), widths.begin(), little_endian);
-}
-
-/**
- * Packs an iterable of integers into a single integer.
- * @tparam II input iterator type
- * @tparam P Output data type. must be an unsigned integral type large enough to hold all input values
- * @param start start iterator
- * @param end end iterator
- * @param width Number of bits in each value
- * @param little_endian if the output value should have the first element in the least significant place
- * of the output or not
- * @return Packed integer of type P.
- */
-template <typename P, typename II>
-    requires std::unsigned_integral<P> && std::input_iterator<II>
-inline constexpr P pack(II start, II end, size_t width, bool little_endian = true) {
-    // a const_iterator yields the same width for every element
-    auto uniform_widths = const_iterator(width);
-    return pack<P>(start, end, uniform_widths, little_endian);
-}
-
-/**
- * Packs a range of integers into a single integer.
- * @tparam IR range type
- * @tparam P Output data type. must be an unsigned integral type large enough to hold all input values
- * @param r range of values to pack
- * @param width Number of bits in each value
- * @param little_endian if the output value should have the first element in the least significant place
- * of the output or not
- * @return Packed integer of type P.
- */
-template <typename P, typename IR>
-    requires std::unsigned_integral<P> && range<IR>
-inline constexpr P pack(IR r, size_t width, bool little_endian = true) {
-    // a const_iterator yields the same width for every element
-    auto uniform_widths = const_iterator(width);
-    auto first = r.begin();
-    auto last = r.end();
-    return pack<P>(first, last, uniform_widths, little_endian);
-}
-}  // namespace quicktex
--- a/quicktex/util/iterator.h
+++ b/quicktex/util/iterator.h
@ -1,146 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-namespace quicktex {
-
-namespace detail {
-template <class R> using subs_value_t = std::remove_reference_t<decltype(std::declval<R &>()[0])>;
-}
-
-/**
- * CRTP base providing index arithmetic for iterators that are identified by an integer position.
- * The derived class D supplies dereferencing; this base supplies increment, decrement,
- * offsetting, distance and ordering, all in terms of the stored index.
- * @tparam D derived iterator type
- * @tparam T value type exposed by the iterator
- */
-template <typename D, typename T> class index_iterator_base {
-   public:
-    using value_type = T;
-    using size_type = int;
-    using difference_type = int;
-
-    D &operator++() {
-        _index++;
-        return static_cast<D &>(*this);
-    }
-    D operator++(int) {
-        D old = static_cast<D &>(*this);
-        _index++;
-        return old;
-    }
-    D &operator--() {
-        _index--;
-        return static_cast<D &>(*this);
-    }
-    D operator--(int) {
-        D old = static_cast<D &>(*this);
-        _index--;
-        return old;
-    }
-
-    D operator+(difference_type rhs) const {
-        D d = static_cast<const D &>(*this);
-        d._index += rhs;
-        return d;
-    }
-
-    D operator-(difference_type rhs) const {
-        D d = static_cast<const D &>(*this);
-        d._index -= rhs;
-        return d;
-    }
-
-    // The compound forms adjust the index in place and return the derived object.
-    // Returning *this directly would yield a base reference, which does not convert to D &.
-    D &operator+=(difference_type rhs) {
-        _index += rhs;
-        return static_cast<D &>(*this);
-    }
-
-    D &operator-=(difference_type rhs) {
-        _index -= rhs;
-        return static_cast<D &>(*this);
-    }
-
-    difference_type operator-(const D &rhs) const { return (difference_type)_index - rhs._index; }
-
-    friend D operator+(difference_type lhs, const D &rhs) { return rhs + lhs; }
-
-    friend auto operator<=>(const D &lhs, const D &rhs) { return lhs._index <=> rhs._index; }
-
-    T &operator[](difference_type i) { return *(static_cast<D &>(*this) + i); }
-    T &operator[](difference_type i) const { return *(static_cast<const D &>(*this) + i); }
-
-   protected:
-    int _index;
-
-   private:
-    // only the derived type may construct the base
-    friend D;
-    index_iterator_base(size_t index = 0) : _index(index) {}
-};
-
-// Iterator over any object that supports operator[], addressing elements by index.
-// Holds a pointer to the range plus the index inherited from index_iterator_base.
-template <typename R>
-    requires requires(const R &r) { r[0]; }
-class index_iterator : public index_iterator_base<index_iterator<R>, detail::subs_value_t<R>> {
-   public:
-    using base = index_iterator_base<index_iterator<R>, detail::subs_value_t<R>>;
-    using typename base::difference_type;
-    using typename base::size_type;
-    using typename base::value_type;
-
-    // default-constructed iterators refer to no range and must not be dereferenced
-    index_iterator() : base(0), _range(nullptr) {}
-    index_iterator(R &range, int index) : base(index), _range(&range) {}
-
-    // bounds are checked by assertion only
-    value_type &operator*() const {
-        assert(_range != nullptr);
-        assert(this->_index >= 0);
-        assert(this->_index < (size_type)_range->size());
-        return (*_range)[this->_index];
-    }
-    value_type *operator->() const { return &(this->operator*()); }
-
-    // equal only when both the range and the index match
-    friend bool operator==(const index_iterator &lhs, const index_iterator &rhs) {
-        return (lhs._range == rhs._range) && (lhs._index == rhs._index);
-    }
-
-   private:
-    R *_range;
-};
-
-// Iterator that yields the same stored value at every position; only the index changes
-// when it is advanced.
-template <typename T> class const_iterator : public index_iterator_base<const_iterator<T>, const T> {
-   public:
-    using base = index_iterator_base<const_iterator<T>, const T>;
-    using typename base::difference_type;
-    using typename base::size_type;
-    using typename base::value_type;
-
-    const_iterator() : base(0), _value(T{}) {}
-    const_iterator(T value, int index = 0) : base(index), _value(value) {}
-
-    // dereferencing ignores the index and always refers to the stored value
-    value_type &operator*() const { return _value; }
-    value_type *operator->() const { return &_value; }
-
-    // equal only when both the value and the index match
-    friend bool operator==(const const_iterator &lhs, const const_iterator &rhs) {
-        return (lhs._value == rhs._value) && (lhs._index == rhs._index);
-    }
-
-   private:
-    T _value;
-};
-
-// const_iterator is guaranteed to be a random access iterator. it is not writable for obvious reasons
-static_assert(std::random_access_iterator<const_iterator<int>>);
-
-// index_iterator over an indexable container satisfies random_access_iterator
-static_assert(std::random_access_iterator<index_iterator<std::array<int, 4>>>);
-}  // namespace quicktex
--- a/quicktex/util/map.h
+++ b/quicktex/util/map.h
@ -1,178 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <array>
-#include <tuple>
-#include <xsimd/xsimd.hpp>
-
-#include "util/ranges.h"
-
-namespace quicktex {
-
-namespace detail {
-
-template <typename T>
-concept simdable = random_access_range<T> && std::contiguous_iterator<decltype(std::declval<T>().begin())> &&
-                   std::is_arithmetic_v<range_value_t<T>>;
-template <typename T, bool serial = false> struct chunker_impl {};
-
-template <typename T, bool serial>
-    requires simdable<T> && (!serial)
-struct chunker_impl<T, serial> {
-    // range with contiguous, SIMDable data
-
-    static constexpr size_t steps = 2;
-    using chunk_types = std::tuple<xsimd::batch<range_value_t<T>>, range_value_t<T>>;
-
-    template <size_t step> using chunk_type = std::tuple_element_t<step, chunk_types>;
-    static constexpr std::array<size_t, 2> chunk_sizes = {chunk_type<0>::size, 1};
-
-    template <size_t step> static constexpr size_t chunk_count(const T& r) {
-        if constexpr (step == 0) {
-            return std::size(r) / chunk_sizes[0];
-        } else {
-            return std::size(r) % chunk_sizes[0];
-        }
-    }
-
-    template <size_t step> static constexpr auto get_chunk(const T& r, size_t i) {
-        assert(i < chunk_count<step>(r));
-        if constexpr (step == 0) {
-            return xsimd::load_unaligned(&r[chunk_sizes[0] * i]);
-        } else {
-            return r[chunk_sizes[0] * chunk_count<0>(r) + i];
-        }
-    }
-
-    template <size_t step>
-    static constexpr void set_chunk(T& r, size_t i, const std::tuple_element_t<step, chunk_types>& c) {
-        assert(i < chunk_count<step>(r));
-        if constexpr (step == 0) {
-            xsimd::store_unaligned(&r[chunk_sizes[0] * i], c);
-        } else {
-            r[chunk_sizes[0] * chunk_count<0>(r) + i] = c;
-        }
-    }
-};
-
-template <typename T, bool serial>
-    requires random_access_range<T> && (!simdable<T> || serial)
-struct chunker_impl<T, serial> {
-    // range with data that cant be SIMDed
-    static constexpr size_t steps = 1;
-    template <size_t step> using chunk_type = range_value_t<T>;
-    static constexpr std::array<size_t, 1> chunk_sizes = {1};
-
-    template <size_t step> static constexpr size_t chunk_count(const T& r) { return r.size(); }
-    template <size_t step> static constexpr auto get_chunk(const T& r, size_t i) { return r[i]; }
-    template <size_t step> static constexpr void set_chunk(T& r, size_t i, const chunk_type<0>& c) { r[i] = c; }
-};
-
-template <typename T, bool serial>
-    requires(!sized_range<T>)
-struct chunker_impl<T, serial> {
-    static constexpr size_t steps = 1;
-    using chunk_types = std::tuple<T>;
-    template <size_t step> using chunk_type = T;
-
-    static constexpr std::array<size_t, 1> chunk_sizes = {1};
-
-    template <size_t step> static constexpr size_t chunk_count(const T&) { return 1; }
-    template <size_t step> static constexpr auto get_chunk(const T& r, size_t) { return r; }
-    template <size_t step> static constexpr void set_chunk(T& r, size_t, const T& c) { r = c; }
-};
-
-template <typename T, bool serial = false, size_t step = 0>
-using chunk_type = typename chunker_impl<T, serial>::template chunk_type<step>;
-
-template <typename T, bool serial, typename Op, std::size_t step, typename... Args>
-static constexpr bool callable_step() {
-    return std::is_invocable_r_v<typename chunker_impl<T, serial>::template chunk_type<step>, Op,
-                                 typename chunker_impl<Args, serial>::template chunk_type<step>...>;
-}
-
-template <typename T, bool serial, typename Op, typename... Args, std::size_t... steps>
-static constexpr bool callable_steps(std::index_sequence<steps...>) {
-    return (callable_step<T, serial, Op, steps, Args...>() && ...);
-}
-
-template <typename T, bool serial, typename Op, typename... Args> static constexpr bool callable() {
-    //    if constexpr (!(std::same_as<T, Args> && ...)) return false;
-    //    return callable_steps<T, serial, Op>(std::make_index_sequence<chunker_impl<T, serial>::steps>());
-    return callable_steps<T, serial, Op, Args...>(std::make_index_sequence<1>());
-}
-
-template <typename T, bool serial, size_t step, typename... Args>
-    requires((std::is_scalar_v<Args> || std::same_as<T, Args>) && ...)
-inline void do_map_step(auto f, T& result, const Args&... args) {
-    using impl = chunker_impl<T, serial>;
-    using chunk_type = typename impl::template chunk_type<step>;
-    size_t chunk_count = impl::template chunk_count<step>(result);
-
-    for (unsigned i = 0; i < chunk_count; i++) {
-        chunk_type out_chunk = f(chunker_impl<Args, serial>::template get_chunk<step>(args, i)...);
-        impl::template set_chunk<step>(result, i, out_chunk);
-    }
-}
-
-template <typename T, bool serial, typename Op, std::size_t... steps, typename... Args>
-    requires((std::is_scalar_v<Args> || std::same_as<T, Args>) && ...)
-inline void do_map_steps(Op f, T& result, std::index_sequence<steps...>, const Args&... args) {
-    //    static_assert(callable<T, serial, Op, Args...>());
-
-    (do_map_step<T, serial, steps>(f, result, args...), ...);
-}
-
-// Runs f over every chunk of the inputs and stores the results in `result`.
-// Falls back to the serial chunker when f cannot be invoked on the non-serial chunk types.
-template <typename T, bool serial, typename Op, typename... Args>
-    requires((std::is_scalar_v<Args> || std::same_as<T, Args>) && ...)
-inline void do_map_all(Op f, T& result, const Args&... args) {
-    constexpr bool must_serialize = serial || !callable<T, false, Op, Args...>();
-
-    // The step count has to come from the same chunker that do_map_step will use. Taking it
-    // from chunker_impl<T, serial> while running serially would visit every element once per
-    // step of the non-serial chunker.
-    using impl = chunker_impl<T, must_serialize>;
-    do_map_steps<T, must_serialize>(f, result, std::make_index_sequence<impl::steps>(), args...);
-}
-}  // namespace detail
-
-// Element-wise map whose result type R differs from the input type, so it always runs one
-// element at a time instead of attempting chunking.
-template <typename R, typename T, bool serial = false, typename Op, typename... Args>
-    requires sized_range<T> && (sized_range<Args> && ...)
-inline R map_to(Op f, const T& in, const Args&... args) {
-    R out{};
-    const auto count = in.size();
-    for (unsigned idx = 0; idx < count; idx++) {
-        out[idx] = f(in[idx], args[idx]...);
-    }
-    return out;
-}
-
-template <typename T, bool serial = false, typename Op, typename... Args>
-    requires sized_range<T>
-inline auto map(Op f, const T& in, const Args&... args) {
-    //    assert(((in.size() == args.size())) && ...);
-
-    if constexpr (((std::is_scalar_v<Args> || std::same_as<T, Args>)&&...) &&
-                  (detail::callable<T, true, Op, T, Args...>())) {
-        // the input and result types are all the same type and size, so we can attempt chunking
-        T result{};
-        detail::do_map_all<T, serial>(f, result, in, args...);
-        return result;
-    } else {
-        using result_type = std::invoke_result_t<Op, typename detail::chunk_type<T, true>, range_value_t<Args>...>;
-        return map_to<std::array<result_type, std::tuple_size_v<T>>, T, serial>(f, in, args...);
-    }
-}
-
-}  // namespace quicktex
--- a/quicktex/util/math.h
+++ b/quicktex/util/math.h
@ -1,84 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-#include <cassert>
-#include <cstdint>
-#include <functional>
-#include <limits>
-#include <numeric>
-#include <string>
-#include <type_traits>
-#include <vector>
-
-#include "util/ranges.h"
-#include "xsimd/xsimd.hpp"
-
-namespace quicktex {
-
-using std::abs;    // abs overload for builtin types
-using xsimd::abs;  // abs overload for xsimd buffers
-
-template <typename S>
-    requires requires(S &s) { s.abs(); }
-constexpr S abs(S value) {
-    return value.abs();
-}
-
-template <typename S, typename R>
-    requires requires(S s, R r) { s.clamp(r, r); }
-constexpr S clamp(S value, R low, R high) {
-    return value.clamp(low, high);
-}
-
-// Scalar clamp to [low, high]; the bounds must be ordered.
-template <typename S>
-    requires std::is_scalar_v<S>
-constexpr S clamp(S value, S low, S high) {
-    assert(low <= high);
-    return (value < low) ? low : ((value > high) ? high : value);
-}
-
-template <typename S, typename A>
-constexpr xsimd::batch<S, A> clamp(xsimd::batch<S, A> value, const xsimd::batch<S, A> &low,
-                                   const xsimd::batch<S, A> &high) {
-    return xsimd::clip(value, low, high);
-}
-
-template <typename S, typename A>
-constexpr xsimd::batch<S, A> clamp(xsimd::batch<S, A> value, const S &low, const S &high) {
-    return clamp(value, xsimd::broadcast(low), xsimd::broadcast(high));
-}
-
-template <typename S>
-    requires requires(S &s) { s.sum(); }
-constexpr auto sum(S value) {
-    return value.sum();
-}
-
-template <typename S>
-    requires std::is_scalar_v<S>
-constexpr auto sum(S value) {
-    return value;
-    // horizontally adding a scalar is a noop
-}
-
-template <typename S, typename A> constexpr auto sum(xsimd::batch<S, A> value) { return xsimd::hadd(value); }
-}  // namespace quicktex
--- a/quicktex/util/ranges.h
+++ b/quicktex/util/ranges.h
@ -1,74 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <algorithm>
-#include <array>
-#include <cassert>
-#include <cstdint>
-#include <functional>
-#include <iterator>
-#include <limits>
-#include <numeric>
-#include <string>
-#include <type_traits>
-
-namespace quicktex {
-
-// std::ranges is not usable by default in libc++ 13
-template <class T>
-concept range = requires(T &t) {
-                    t.begin();
-                    t.end();
-                };
-
-using std::size;
-template <range T> constexpr auto size(const T &range) { return std::distance(range.begin(), range.end()); }
-
-template <class T>
-concept sized_range = range<T> && requires(T &t) { size(t); };
-
-template <class R> using iterator_t = decltype(std::declval<R &>().begin());
-template <class R> using sentinel_t = decltype(std::declval<R &>().end());
-template <class R> using range_size_t = decltype(size(std::declval<R &>()));
-template <class R> using range_difference_t = std::iter_difference_t<iterator_t<R>>;
-template <class R> using range_value_t = std::iter_value_t<iterator_t<R>>;
-template <class R> using range_reference_t = std::iter_reference_t<iterator_t<R>>;
-template <class R> using range_rvalue_reference_t = std::iter_rvalue_reference_t<iterator_t<R>>;
-
-template <class R>
-concept input_range = range<R> && std::input_iterator<iterator_t<R>>;
-
-template <class R, typename T>
-concept output_range = range<R> && (std::output_iterator<iterator_t<R>, T>);
-
-template <class R>
-concept forward_range = range<R> && std::forward_iterator<iterator_t<R>>;
-
-template <class R>
-concept bidirectional_range = range<R> && std::bidirectional_iterator<iterator_t<R>>;
-
-template <class R>
-concept random_access_range = range<R> && std::random_access_iterator<iterator_t<R>>;
-
-template <class R>
-concept contiguous_range = range<R> && std::contiguous_iterator<iterator_t<R>>;
-
-}  // namespace quicktex
--- a/quicktex/util/simd.h
+++ b/quicktex/util/simd.h
@ -1,97 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <array>
-#include <type_traits>
-
-#include "util/math.h"
-#include "util/types.h"
-#include "xsimd/xsimd.hpp"
-
-// Shorthand for xsimd's architecture-tag parameter type, used for overload dispatch below.
-// NOTE(review): this alias is declared at global scope, so it is visible to every includer.
-template <typename T> using requires_arch = xsimd::kernel::requires_arch<T>;
-
-namespace quicktex::simd {
-
-// Architecture-specific implementations of whadd ("widening horizontal add"): every lane of
-// the batch is summed into a single scalar of the next larger integer type. The overload is
-// selected by the trailing architecture-tag argument.
-namespace kernel {
-
-#if XSIMD_WITH_NEON64
-// NEON64: each overload forwards to the matching vaddlvq_* intrinsic.
-template <class A> inline int16_t whadd(xsimd::batch<int8_t, A> const& arg, requires_arch<xsimd::neon64>) {
-    return vaddlvq_s8(arg);
-}
-
-template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::neon64>) {
-    return vaddlvq_s16(arg);
-}
-
-template <class A> inline int64_t whadd(xsimd::batch<int32_t, A> const& arg, requires_arch<xsimd::neon64>) {
-    return vaddlvq_s32(arg);
-}
-
-template <class A> inline uint16_t whadd(xsimd::batch<uint8_t, A> const& arg, requires_arch<xsimd::neon64>) {
-    return vaddlvq_u8(arg);
-}
-
-template <class A> inline uint32_t whadd(xsimd::batch<uint16_t, A> const& arg, requires_arch<xsimd::neon64>) {
-    return vaddlvq_u16(arg);
-}
-
-template <class A> inline uint64_t whadd(xsimd::batch<uint32_t, A> const& arg, requires_arch<xsimd::neon64>) {
-    return vaddlvq_u32(arg);
-}
-#endif
-
-#if XSIMD_WITH_SSE2
-// SSE2: only int16_t lanes have a dedicated implementation here.
-template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::sse2>) {
-    // Pairwise widening sum with multiply by 1, then sum all N/2 widened lanes
-    xsimd::batch<int32_t, A> paired = _mm_madd_epi16(arg, _mm_set1_epi16(1));
-    return xsimd::hadd(paired);
-}
-#endif
-
-#if XSIMD_WITH_AVX2
-// AVX2: same approach as the SSE2 overload, using the 256-bit intrinsics.
-template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::avx2>) {
-    // Pairwise widening sum with multiply by 1, then sum all N/2 widened lanes
-    xsimd::batch<int32_t, A> paired = _mm256_madd_epi16(arg, _mm256_set1_epi16(1));
-    return xsimd::hadd(paired);
-}
-#endif
-
-// Fallback for every lane type / architecture pair without a dedicated overload above.
-template <class A, class T> inline next_size_t<T> whadd(xsimd::batch<T, A> const& arg, requires_arch<xsimd::generic>) {
-    // Generic implementation that should work everywhere
-    using b_type = xsimd::batch<T, A>;
-    using r_type = next_size_t<T>;
-    const auto len = b_type::size;
-
-    // Spill the batch to an aligned scratch buffer and accumulate in the wider type.
-    alignas(A::alignment()) T buffer[len];
-    r_type sum = 0;
-
-    arg.store_aligned(buffer);
-    for (T val : buffer) { sum += static_cast<r_type>(val); }
-
-    return sum;
-}
-}  // namespace kernel
-
-// Public entry point: sums all lanes of `arg` into next_size_t<T>, dispatching on the batch's
-// architecture tag A to one of the kernel overloads.
-template <class A, class T> inline next_size_t<T> whadd(xsimd::batch<T, A> const& arg) {
-    return kernel::whadd(arg, A{});
-}
-
-}  // namespace quicktex::simd
--- a/quicktex/util/subrange.h
+++ b/quicktex/util/subrange.h
@ -1,97 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <concepts>
-#include <iterator>
-
-#include "util/ranges.h"
-
-namespace quicktex {
-
-/**
- * Lightweight view over the half-open iterator range [begin, end).
- *
- * Stores a copy of the iterator and the sentinel; it never owns the underlying elements.
- *
- * @tparam I iterator type
- * @tparam S sentinel type, defaults to I
- */
-template <std::input_or_output_iterator I, std::sentinel_for<I> S = I> struct subrange {
-   public:
-    using iterator_type = I;
-    using sentinel_type = S;
-    using value_type = std::iter_value_t<I>;
-    using reference_type = std::iter_reference_t<I>;
-    using difference_type = std::iter_difference_t<I>;
-
-    /// Construct a view over [b, e).
-    constexpr subrange(const I& b, const S& e) : _begin(b), _end(e) {}
-
-    constexpr I begin() const { return _begin; }
-    constexpr S end() const { return _end; }
-
-    /// True when the view contains no elements.
-    constexpr bool empty() const { return _begin == _end; }
-
-    /// Number of elements in the view, measured from begin to end (never the other way round).
-    constexpr difference_type size() const { return std::distance(_begin, _end); }
-
-    /// A subrange is truthy when it is non-empty.
-    explicit constexpr operator bool() const { return !empty(); }
-
-    /**
-     * Move the start of the view by n positions, in place.
-     *
-     * Stepping stops as soon as the start reaches the end, so a forward advance never
-     * overshoots the view. Negative n steps backwards and needs a bidirectional iterator.
-     *
-     * @param n number of positions to move; negative values move backwards
-     * @return *this
-     */
-    constexpr subrange& advance(difference_type n) {
-        assert(n >= 0 || std::bidirectional_iterator<I>);  // forward iterators cannot be decremented
-
-        if (n > 0) {
-            for (difference_type i = 0; i < n && _begin != _end; i++) { ++_begin; }
-        } else if constexpr (std::bidirectional_iterator<I>) {
-            // Only compiled for iterators that support decrement, so advance() and next()
-            // remain usable with forward-only iterators.
-            for (difference_type i = 0; i > n && _begin != _end; i--) { --_begin; }
-        }
-        return *this;
-    }
-
-    /// Copy of this view with its start advanced by n (see advance()).
-    constexpr subrange next(difference_type n = 1) const {
-        auto tmp = *this;
-        return tmp.advance(n);
-    }
-
-    /// Copy of this view with its start moved back by n.
-    template <typename _ = I>
-        requires std::bidirectional_iterator<I>
-    constexpr subrange prev(difference_type n = 1) const {
-        return next(-n);
-    }
-
-    /// Bounds-asserted element access, relative to the start of the view.
-    template <typename _ = I>
-        requires std::random_access_iterator<I>
-    constexpr reference_type operator[](difference_type i) {
-        assert(i >= 0 && i < size());
-        return _begin[i];
-    }
-
-    template <typename _ = I>
-        requires std::random_access_iterator<I>
-    constexpr const reference_type operator[](difference_type i) const {
-        assert(i >= 0 && i < size());
-        return _begin[i];
-    }
-
-    /// Pointer to the first element of the view (contiguous iterators only).
-    template <typename _ = I>
-        requires std::contiguous_iterator<I>
-    constexpr value_type* data() {
-        return std::to_address(_begin);
-    }
-    template <typename _ = I>
-        requires std::contiguous_iterator<I>
-    constexpr value_type const* data() const {
-        return std::to_address(_begin);
-    }
-
-   private:
-    I _begin;
-    S _end;
-};
-}  // namespace quicktex
--- a/quicktex/util/types.h
+++ b/quicktex/util/types.h
@ -1,49 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-#include <cstdint>
-#include <type_traits>  // std::conditional_t
-
-namespace quicktex {
-// next_size<T> maps a fixed-width integer type to the type of the same signedness that is
-// twice as wide. It is only specialized for the 8-, 16- and 32-bit types listed below.
-template <class> struct next_size;
-template <class T> using next_size_t = typename next_size<T>::type;
-
-// Helper base that exposes T as a nested `type` member.
-template <class T> struct type_tag { using type = T; };
-
-template <> struct next_size<int8_t> : type_tag<int16_t> {};
-template <> struct next_size<int16_t> : type_tag<int32_t> {};
-template <> struct next_size<int32_t> : type_tag<int64_t> {};
-
-template <> struct next_size<uint8_t> : type_tag<uint16_t> {};
-template <> struct next_size<uint16_t> : type_tag<uint32_t> {};
-template <> struct next_size<uint32_t> : type_tag<uint64_t> {};
-
-// Smallest unsigned fixed-width integer with at least bitCount bits; void above 64 bits.
-template <auto bitCount>
-using unsigned_bits =
-    std::conditional_t<bitCount <= 8, std::uint8_t,
-                       std::conditional_t<bitCount <= 16, std::uint16_t,
-                                          std::conditional_t<bitCount <= 32, std::uint32_t,
-                                                             std::conditional_t<bitCount <= 64, std::uint64_t, void>>>>;
-
-// Smallest signed fixed-width integer with at least bitCount bits; void above 64 bits.
-template <auto bitCount>
-using signed_bits =
-    std::conditional_t<bitCount <= 8, std::int8_t,
-                       std::conditional_t<bitCount <= 16, std::int16_t,
-                                          std::conditional_t<bitCount <= 32, std::int32_t,
-                                                             std::conditional_t<bitCount <= 64, std::int64_t, void>>>>;
-}  // namespace quicktex
--- a/setup.py
+++ b/setup.py
@ -46,6 +46,7 @@ class CMakeBuild(build_ext):
            "-DPython_EXECUTABLE={}".format(sys.executable),
            "-DPython_ROOT_DIR={}".format(os.path.dirname(sys.executable)),
            "-DQUICKTEX_VERSION_INFO={}".format(version),  # include version info in module
+            "-DQUICKTEX_MODULE_ONLY=TRUE",  # only build the module, not the wrapper
            "-DCMAKE_BUILD_TYPE={}".format(cfg),  # not used on MSVC, but no harm
            # clear cached make program binary, see https://github.com/pypa/setuptools/issues/2912
            "-U",
@ -53,9 +54,6 @@ class CMakeBuild(build_ext):
        ]
        build_args = []

-        if self.verbose:
-            build_args += ["--verbose"]
-
        if self.compiler.compiler_type != "msvc":
            # Using Ninja-build since it a) is available as a wheel and b)
            # multithreads automatically. MSVC would require all variables be
@ -66,9 +64,6 @@ class CMakeBuild(build_ext):
                cmake_args += ["-GNinja"]

        else:
-            # if 'CC' in os.environ and 'clang-cl' in os.environ['CC']:
-            #     cmake_args += ["-T", 'ClangCL']  # https://stackoverflow.com/a/64189112/7645957
-
            # Single config generators are handled "normally"
            single_config = any(x in cmake_generator for x in {"NMake", "Ninja"})

--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -1,28 +0,0 @@
-# Build script for the C++ unit tests: fetches googletest, builds one `Test` executable from
-# every source file under this directory, and registers its tests with CTest.
-include(FetchContent)
-# googletest is pinned to a specific commit archive.
-FetchContent_Declare(
-        googletest
-        URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip
-)
-# For Windows: Prevent overriding the parent project's compiler/linker settings
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-FetchContent_MakeAvailable(googletest)
-
-# Collect test headers, sources and Python files recursively from this directory.
-file(GLOB_RECURSE TEST_HEADER_FILES "**.h")
-file(GLOB_RECURSE TEST_SOURCE_FILES "**.cpp")
-file(GLOB_RECURSE TEST_PYTHON_FILES "**.py")
-
-# Organize source files together for some IDEs
-source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${TEST_SOURCE_FILES} ${TEST_HEADER_FILES} ${TEST_PYTHON_FILES})
-
-add_executable(Test ${TEST_SOURCE_FILES} ${TEST_HEADER_FILES})
-
-# Non-MSVC Debug builds compile and link the tests with the address and undefined-behaviour
-# sanitizers, with recovery disabled.
-if ((NOT MSVC) AND (CMAKE_BUILD_TYPE MATCHES Debug))
-    target_compile_options(Test PUBLIC -fsanitize=address,undefined -fno-sanitize-recover=address,undefined -fno-omit-frame-pointer)
-    target_link_options(Test PUBLIC -fsanitize=address,undefined -fno-sanitize-recover=address,undefined -fno-omit-frame-pointer)
-endif ()
-
-# gtest_main supplies the executable's entry point.
-target_link_libraries(Test PUBLIC quicktex gtest_main)
-
-# Register each discovered gtest case as an individual CTest test.
-include(GoogleTest)
-gtest_discover_tests(Test)
-
--- a/tests/ctest/TestMatrix.cpp
+++ b/tests/ctest/TestMatrix.cpp
@ -1,227 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <Matrix.h>
-#include <gtest/gtest.h>
-#include <util/math.h>
-
-#include <array>
-#include <cstdlib>
-
-namespace quicktex::tests {
-
-// Compare two matrices/vectors. Floating-point element types are compared element by element
-// with EXPECT_FLOAT_EQ (reporting the failing index); all other types use a single EXPECT_EQ.
-#define EXPECT_MATRIX_EQ(value, expected)                                           \
-    {                                                                               \
-        auto v = value;                                                             \
-        auto e = expected;                                                          \
-        if constexpr (std::is_floating_point_v<typename decltype(v)::value_type>) { \
-            for (unsigned i = 0; i < v.elements; i++) {                             \
-                EXPECT_FLOAT_EQ(v.element(i), e.element(i)) << "At index " << i;    \
-            }                                                                       \
-        } else {                                                                    \
-            EXPECT_EQ(v, e);                                                        \
-        }                                                                           \
-    }
-
-// n-th Fibonacci number, computed by naive recursion.
-constexpr size_t fibn(size_t n) { return (n < 2) ? n : fibn(n - 1) + fibn(n - 2); }
-
-// Square of n.
-template <typename T> constexpr T sqr(T n) { return n * n; }
-
-// Invoke f once on each of the remaining arguments.
-template <typename Op, typename... Args> constexpr void foreach (Op f, Args... args) { (f(args), ...); }
-
-// Typed test fixture, instantiated once per scalar type T. It provides generators for
-// matrices with predictable contents and a way to run a test body over several shapes.
-template <typename T> class MatrixTest : public testing::Test {
-   public:
-    using Scalar = T;
-    template <size_t M> using Vec = quicktex::Vec<T, M>;
-    template <size_t M, size_t N> using Matrix = quicktex::Matrix<T, M, N>;
-
-    // Element i is (i + start) * stride.
-    template <typename M> constexpr M iota(T start = 0, T stride = 1) {
-        M result(0);
-        for (unsigned i = 0; i < M::elements; i++) { result.element(i) = (static_cast<T>(i) + start) * stride; }
-        return result;
-    }
-
-    // Element i is (i + start)^2 * stride.
-    template <typename M> constexpr M sqr(T start = 0, T stride = 1) {
-        M result(0);
-        for (unsigned i = 0; i < M::elements; i++) {
-            result.element(i) = (static_cast<T>(i) + start) * (static_cast<T>(i) + start) * stride;
-        }
-        return result;
-    }
-
-    // Element i is fibn(i + start).
-    template <typename M> constexpr M fib(T start = 0) {
-        M result(0);
-        for (unsigned i = 0; i < M::elements; i++) { result.element(i) = fibn(i + start); }
-        return result;
-    }
-
-    // One zero-initialised instance of every shape the tests are run against.
-    static constexpr auto sizes = std::make_tuple(Vec<4>(0), Vec<7>(0), Matrix<4, 4>(0), Matrix<5, 6>(0));
-
-    // Call f once with each element of `sizes`; the argument's type selects the shape under test.
-    template <typename Op> constexpr void foreach_size(Op f) {
-        auto foreach = [f]<typename... Args>(Args... args) { (f(args), ...); };
-        std::apply(foreach, sizes);
-    }
-};
-
-using Scalars = ::testing::Types<uint8_t, int8_t, uint16_t, int16_t, uint32_t, int32_t, float, double>;
-TYPED_TEST_SUITE(MatrixTest, Scalars);
-
-// Shorthands for calling the fixture's generator templates from inside a TYPED_TEST body.
-#define IOTA(M, start, stride) this->TestFixture::template iota<M>(start, stride)
-#define SQR(M, start, stride) this->TestFixture::template sqr<M>(start, stride)
-#define FIB(M, start) this->TestFixture::template fib<M>(start)
-
-// Unary minus; skipped for unsigned scalar types.
-TYPED_TEST(MatrixTest, negate) {
-    if constexpr (std::unsigned_integral<typename TestFixture::Scalar>) {
-        GTEST_SKIP();
-    } else {
-        TestFixture::foreach_size([&]<typename M>(M) {
-            EXPECT_MATRIX_EQ(-IOTA(M, 0, 1), IOTA(M, 0, -1));
-            EXPECT_MATRIX_EQ(-IOTA(M, 0, -1), IOTA(M, 0, 1));
-        });
-    }
-}
-
-// Element-wise addition; the negative-stride case is excluded for unsigned types.
-TYPED_TEST(MatrixTest, add) {
-    TestFixture::foreach_size([&]<typename M>(M) {
-        EXPECT_MATRIX_EQ(IOTA(M, 0, 1) + IOTA(M, 0, 3), IOTA(M, 0, 4));
-        EXPECT_MATRIX_EQ(IOTA(M, 0, 2) + IOTA(M, 0, 2), IOTA(M, 0, 4));
-        if constexpr (!std::unsigned_integral<typename M::value_type>) {
-            EXPECT_MATRIX_EQ(IOTA(M, 0, 3) + IOTA(M, 0, -1), IOTA(M, 0, 2));
-        }
-    });
-}
-
-// Element-wise subtraction; cases with negative operands or results are excluded for unsigned types.
-TYPED_TEST(MatrixTest, subtract) {
-    TestFixture::foreach_size([&]<typename M>(M) {
-        EXPECT_MATRIX_EQ(IOTA(M, 0, 4) - IOTA(M, 0, 1), IOTA(M, 0, 3));
-        EXPECT_MATRIX_EQ(IOTA(M, 0, 2) - IOTA(M, 0, 2), IOTA(M, 0, 0));
-        if constexpr (!std::unsigned_integral<typename M::value_type>) {
-            EXPECT_MATRIX_EQ(IOTA(M, 0, 3) - IOTA(M, 0, -1), IOTA(M, 0, 4));
-            EXPECT_MATRIX_EQ(IOTA(M, 0, 1) - IOTA(M, 0, 3), IOTA(M, 0, -2));
-        }
-    });
-}
-
-// Scalar and element-wise multiplication. The element-wise cases are gated on the scalar
-// type's maximum being at least the largest expected product (presumably to avoid overflow).
-TYPED_TEST(MatrixTest, multiply) {
-    TestFixture::foreach_size([&]<typename M>(M) {
-        EXPECT_MATRIX_EQ(IOTA(M, 0, 2) * 2, IOTA(M, 0, 4));
-        EXPECT_MATRIX_EQ(IOTA(M, 0, 2) * 0, M(0));
-
-        if constexpr (!std::is_unsigned_v<typename M::value_type>) {
-            EXPECT_MATRIX_EQ(IOTA(M, 0, 2) * -2, IOTA(M, 0, -4));
-        }
-
-        if constexpr (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements - 1)) {
-            EXPECT_MATRIX_EQ(IOTA(M, 0, 1) * IOTA(M, 0, 1), SQR(M, 0, 1));
-        }
-
-        if constexpr (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements - 1) * 3) {
-            EXPECT_MATRIX_EQ(IOTA(M, 0, 1) * IOTA(M, 0, 3), SQR(M, 0, 3));
-            EXPECT_MATRIX_EQ(IOTA(M, 0, 0) * IOTA(M, 0, 3), SQR(M, 0, 0));
-        }
-
-        if constexpr (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements - 1) * 4) {
-            EXPECT_MATRIX_EQ(IOTA(M, 0, 2) * IOTA(M, 0, 2), SQR(M, 0, 4));
-            if constexpr (!std::is_unsigned_v<typename M::value_type>) {
-                EXPECT_MATRIX_EQ(IOTA(M, 0, 4) * IOTA(M, 0, -1), SQR(M, 0, -4));
-                EXPECT_MATRIX_EQ(IOTA(M, 0, -4) * IOTA(M, 0, -1), SQR(M, 0, 4));
-            }
-        }
-    });
-}
-
-// Scalar and element-wise division. Divisor matrices start at 1, so no element is zero.
-TYPED_TEST(MatrixTest, divide) {
-    TestFixture::foreach_size([&]<typename M>(M) {
-        EXPECT_MATRIX_EQ(IOTA(M, 0, 4) / 2, IOTA(M, 0, 2));
-        EXPECT_MATRIX_EQ(IOTA(M, 0, 2) / 1, IOTA(M, 0, 2));
-
-        if constexpr (!std::is_unsigned_v<typename M::value_type>) {
-            EXPECT_MATRIX_EQ(IOTA(M, 0, 4) / -2, IOTA(M, 0, -2));
-            EXPECT_MATRIX_EQ(IOTA(M, 0, -4) / -2, IOTA(M, 0, 2));
-        }
-
-        if constexpr (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements)) {
-            EXPECT_MATRIX_EQ(SQR(M, 1, 1) / IOTA(M, 1, 1), IOTA(M, 1, 1));
-        }
-
-        if constexpr (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements) * 3) {
-            EXPECT_MATRIX_EQ(SQR(M, 1, 3) / IOTA(M, 1, 1), IOTA(M, 1, 3));
-            EXPECT_MATRIX_EQ(SQR(M, 1, 3) / IOTA(M, 1, 3), IOTA(M, 1, 1));
-        }
-
-        if constexpr (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements) * 4) {
-            EXPECT_MATRIX_EQ(SQR(M, 1, 4) / IOTA(M, 1, 2), IOTA(M, 1, 2));
-            if constexpr (!std::is_unsigned_v<typename M::value_type>) {
-                EXPECT_MATRIX_EQ(SQR(M, 1, -4) / IOTA(M, 1, -1), IOTA(M, 1, 4));
-                EXPECT_MATRIX_EQ(SQR(M, 1, 4) / IOTA(M, 1, -1), IOTA(M, 1, -4));
-            }
-        }
-    });
-}
-
-// Element-wise absolute value; skipped for unsigned scalar types.
-TYPED_TEST(MatrixTest, abs) {
-    if constexpr (std::unsigned_integral<typename TestFixture::Scalar>) {
-        GTEST_SKIP();
-    } else {
-        TestFixture::foreach_size([&]<typename M>(M) {
-            EXPECT_MATRIX_EQ(IOTA(M, 0, -1).abs(), IOTA(M, 0, 1));
-            EXPECT_MATRIX_EQ(IOTA(M, 0, 1).abs(), IOTA(M, 0, 1));
-        });
-    }
-}
-
-// clamp() with scalar bounds and with per-element (matrix) bounds.
-TYPED_TEST(MatrixTest, clamp) {
-    TestFixture::foreach_size([&]<typename M>(M) {
-        EXPECT_MATRIX_EQ(IOTA(M, 0, 1).clamp(0, M::elements - 1), IOTA(M, 0, 1));
-        EXPECT_MATRIX_EQ(IOTA(M, 0, 1).clamp(M(0), IOTA(M, 0, 1)), IOTA(M, 0, 1));
-        EXPECT_MATRIX_EQ(IOTA(M, 0, 2).clamp(IOTA(M, 0, 1), IOTA(M, 0, 3)), IOTA(M, 0, 2));
-        EXPECT_MATRIX_EQ(IOTA(M, 0, 3).clamp(IOTA(M, 0, 1), IOTA(M, 0, 2)), IOTA(M, 0, 2));
-        EXPECT_MATRIX_EQ(IOTA(M, 0, 1).clamp(M(0), M(0)), M(0));
-        // These two guards are ordinary runtime `if`s, unlike the `if constexpr` guards above.
-        if (std::numeric_limits<typename M::value_type>::max() >= fibn(M::elements)) {
-            EXPECT_MATRIX_EQ(FIB(M, 1).clamp(M(0), IOTA(M, 0, 1)), IOTA(M, 0, 1));
-        }
-        if (std::numeric_limits<typename M::value_type>::max() >= sqr(M::elements - 1)) {
-            EXPECT_MATRIX_EQ(SQR(M, 0, 1).clamp(M(0), IOTA(M, 0, 1)), IOTA(M, 0, 1));
-        }
-    });
-}
-
-// Multiplying by an identity matrix of matching height must return the operand unchanged.
-TYPED_TEST(MatrixTest, matrix_multiply) {
-    TestFixture::foreach_size([&]<typename M>(M) {
-        auto identity = Matrix<typename M::value_type, M::height, M::height>::identity();
-        EXPECT_MATRIX_EQ(identity.mult(IOTA(M, 0, 1)), IOTA(M, 0, 1));
-    });
-}
-
-// sum() over constant matrices and over 1..elements (expected value: n * (n + 1) / 2).
-TYPED_TEST(MatrixTest, sum) {
-    TestFixture::foreach_size([&]<typename M>(M) {
-        EXPECT_FLOAT_EQ(M(1).sum(), M::elements);
-        EXPECT_FLOAT_EQ(M(0).sum(), 0);
-
-        if (std::numeric_limits<typename M::value_type>::max() >= M::elements * (M::elements + 1) / 2) {
-            EXPECT_FLOAT_EQ(IOTA(M, 1, 1).sum(), M::elements * (M::elements + 1) / 2);
-        }
-
-        if constexpr (!std::unsigned_integral<typename M::value_type>) {
-            EXPECT_FLOAT_EQ(M(-1).sum(), -1 * (int)M::elements);
-        }
-    });
-}
-// endregion
-}  // namespace quicktex::tests
--- a/tests/ctest/TestSIMD.cpp
+++ b/tests/ctest/TestSIMD.cpp
@ -1,81 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <gtest/gtest.h>
-#include <util/math.h>
-#include <util/simd.h>
-#include <util/types.h>
-
-#include <array>
-#include <cstdint>
-#include <limits>
-#include <numeric>
-#include <vector>
-#include <xsimd/xsimd.hpp>
-
-namespace quicktex::tests {
-
-// Build the input arrays for the whadd tests. Each array holds exactly one xsimd batch of T.
-// Every type gets: 1..N, all ones, all zeros and all max(). Signed types additionally get
-// a run starting at -N, all -1 and all min().
-template <typename T> constexpr auto make_arrays() {
-    std::vector<std::array<T, xsimd::batch<T>::size>> arrays;
-    std::array<T, xsimd::batch<T>::size> buffer;
-
-    std::iota(buffer.begin(), buffer.end(), 1);
-    arrays.push_back(buffer);
-
-    buffer.fill(1);
-    arrays.push_back(buffer);
-
-    buffer.fill(0);
-    arrays.push_back(buffer);
-
-    buffer.fill(std::numeric_limits<T>::max());
-    arrays.push_back(buffer);
-
-    // Signedness is a compile-time property of T, so the negative-value cases are discarded
-    // entirely for unsigned types instead of being compiled behind a runtime branch.
-    if constexpr (std::is_signed_v<T>) {
-        std::iota(buffer.begin(), buffer.end(), -1 * (int)xsimd::batch<T>::size);
-        arrays.push_back(buffer);
-
-        buffer.fill(-1);
-        arrays.push_back(buffer);
-
-        buffer.fill(std::numeric_limits<T>::min());
-        arrays.push_back(buffer);
-    }
-
-    return arrays;
-}
-
-// Defines a test that checks simd::whadd against a scalar std::accumulate performed in the
-// next larger integer type, for every array produced by make_arrays<TYPE>().
-#define TEST_WHADD(TYPE)                                                                            \
-    TEST(simd, whadd_##TYPE) {                                                                      \
-        for (auto arr : make_arrays<TYPE>()) {                                                      \
-            auto v = xsimd::load_unaligned(&arr[0]);                                                \
-            auto vsum = simd::whadd(v);                                                             \
-            auto ssum = std::accumulate(arr.begin(), arr.end(), static_cast<next_size_t<TYPE>>(0)); \
-            EXPECT_EQ(vsum, ssum);                                                                  \
-        }                                                                                           \
-    }
-
-TEST_WHADD(int8_t)
-TEST_WHADD(uint8_t)
-TEST_WHADD(int16_t)
-TEST_WHADD(uint16_t)
-TEST_WHADD(int32_t)
-TEST_WHADD(uint32_t)
-
-}  // namespace quicktex::tests
--- a/tests/test_bc1.py
+++ b/tests/test_bc1.py
@ -138,22 +138,20 @@ class TestBC1Texture:
 class TestBC1Encoder:
    """Test BC1Encoder"""

-    @pytest.mark.parametrize('level', range(18))
-    def test_block_4color(self, level, color_mode):
+    def test_block_4color(self, color_mode):
        """Test encoder output with 4 color greyscale test block"""
-        encoder = BC1Encoder(level, color_mode)
+        encoder = BC1Encoder(color_mode=color_mode)
        out_tex = encoder.encode(BC1Blocks.greyscale.texture)
        out_block = out_tex[0, 0]

        assert out_tex.size_blocks == (1, 1)

        assert not out_block.is_3color
-        assert out_block.tobytes() == BC1Blocks.greyscale.block.tobytes()
+        assert out_block == BC1Blocks.greyscale.block

-    @pytest.mark.parametrize('level', range(2, 18))  # lowest 2 levels can be improved, but right now choke on this test
-    def test_block_3color(self, level, color_mode):
+    def test_block_3color(self, color_mode):
        """Test encoder output with 3 color test block"""
-        encoder = BC1Encoder(level, color_mode)
+        encoder = BC1Encoder(color_mode=color_mode)
        out_tex = encoder.encode(BC1Blocks.three_color.texture)
        out_block = out_tex[0, 0]

@ -162,14 +160,13 @@ class TestBC1Encoder:
        if encoder.color_mode != BC1Encoder.ColorMode.FourColor:
            # we only care about the selectors if we are in 3 color mode
            assert out_block.is_3color
-            assert out_block.tobytes() == BC1Blocks.three_color.block.tobytes()
+            assert out_block == BC1Blocks.three_color.block
        else:
            assert not out_block.is_3color

-    @pytest.mark.parametrize('level', range(2, 18))  # lowest 2 levels can be improved, but right now choke on this test
-    def test_block_3color_black(self, level, color_mode):
+    def test_block_3color_black(self, color_mode):
        """Test encoder output with 3 color test block with black pixels"""
-        encoder = BC1Encoder(level, color_mode)
+        encoder = BC1Encoder(color_mode=color_mode)
        out_tex = encoder.encode(BC1Blocks.three_color_black.texture)
        out_block = out_tex[0, 0]

@ -181,7 +178,7 @@ class TestBC1Encoder:
            # we only care about the selectors if we are in 3 color black mode
            assert out_block.is_3color
            assert has_black
-            assert out_block.tobytes() == BC1Blocks.three_color_black.block.tobytes()
+            assert out_block == BC1Blocks.three_color_black.block
        elif color_mode == BC1Encoder.ColorMode.ThreeColor:
            assert not (has_black and out_block.is_3color)
        else:
--- a/tests/test_install.py
+++ b/tests/test_install.py
@ -1,12 +1,9 @@
 """Test if everything is installed correctly"""
-import _quicktex
-import pytest

 import quicktex


 class TestInstall:
-    @pytest.mark.skipif(_quicktex._debug_build, reason="Debug builds dont have valid version strings")
    def test_version(self):
        """Test if the extension module version matches what setuptools returns"""
        try:
@ -19,4 +16,4 @@ class TestInstall:

        version = metadata.version('quicktex')

-        assert version == quicktex.__version__
+        assert version == quicktex.__version__, 'incorrect version string from extension module'
--- a/tools/CompilerWarnings.cmake
+++ b/tools/CompilerWarnings.cmake
@ -37,7 +37,6 @@ function(set_project_warnings project_name)
            /w14928 # illegal copy-initialization; more than one user-defined
            # conversion has been implicitly applied
            /permissive- # standards conformance mode for MSVC compiler.
-            /wd4701 # uninitialized variable checker is trigger-happy
            )

    set(CLANG_WARNINGS
@ -53,14 +52,13 @@ function(set_project_warnings project_name)
            -Wunused         # warn on anything being unused
            -Woverloaded-virtual # warn if you overload (not override) a virtual
            # function
-            # -Wpedantic   # warn if non-standard C++ is used
+            -Wpedantic   # warn if non-standard C++ is used
            #-Wconversion # warn on type conversions that may lose data
            #-Wsign-conversion  # warn on sign conversions
            -Wnull-dereference # warn if a null dereference is detected
            -Wdouble-promotion # warn if float is implicit promoted to double
            -Wformat=2 # warn on security issues around functions that format output
            # (ie printf)
-            -Wsign-compare
            )

    if (${PROJECT_NAME}_WARNINGS_AS_ERRORS)
@ -76,7 +74,7 @@ function(set_project_warnings project_name)
            -Wduplicated-branches # warn if if / else branches have duplicated code
            -Wlogical-op   # warn about logical operations being used where bitwise were
            # probably wanted
-            # -Wuseless-cast # warn if you perform a cast to the same type
+            -Wuseless-cast # warn if you perform a cast to the same type
            )

    if (MSVC)
--- a/tools/SIMDFlags.cmake
+++ b/tools/SIMDFlags.cmake
@ -1,68 +0,0 @@
-function(set_simd_flags target_name)
-    if (DEFINED ENV{QUICKTEX_SIMD_MODE})
-        set(simd_mode $ENV{QUICKTEX_SIMD_MODE})
-        message("SIMD mode is ${simd_mode}")
-    else ()
-        message("Defaulting to AUTO SIMD mode. Resulting binary is not fit for distributing to other computers!")
-        set(simd_mode "AUTO")
-    endif ()
-
-    if ((CMAKE_OSX_ARCHITECTURES MATCHES "x86_64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)"))
-        set(X86 TRUE)
-        message("X86 Detected")
-    else ()
-        set(X86 FALSE)
-    endif ()
-
-    if ((CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "(arm64)|(ARM64)|(aarch64)"))
-        set(ARM TRUE)
-        message("ARM Detected")
-    else ()
-        set(ARM FALSE)
-    endif ()
-
-    if (simd_mode STREQUAL "SCALAR")
-        # force xsimd to use scalar ops. This should really only be used for testing,
-        # since SSE2 and NEON are guaranteed on 64-bit platforms
-        if (MSVC)
-            target_compile_options(${target_name} PUBLIC /DXSIMD_NO_SUPPORTED_ARCHITECTURE=1)
-        else ()
-            target_compile_options(${target_name} PUBLIC -DXSIMD_NO_SUPPORTED_ARCHITECTURE=1)
-        endif ()
-        return()
-    endif ()
-
-    if (X86)
-        if (simd_mode STREQUAL "AUTO")
-            if (MSVC)
-                #MSVC has no -march=native equivalent. womp
-            elseif (NOT ARM)
-                # setting -march=native on an M1 causes Clang to freak out,
-                # and arm64 is pretty samey instruction-set-wise (ARMv9 and SVE2 notwithstanding)
-
-                # Currently AVX512 will cause problems with buffer overruns,
-                # and I don't have good test hardware for it anyways
-
-                target_compile_options(${target_name} PUBLIC -march=native -mno-avx512f)
-            endif ()
-        elseif (simd_mode STREQUAL "SSSE3")
-            if (MSVC)
-                target_compile_options(${target_name} PUBLIC /DXSIMD_WITH_SSSE3)
-            else ()
-                target_compile_options(${target_name} PUBLIC -mssse3)
-            endif ()
-        elseif (simd_mode STREQUAL "SSE4")
-            if (MSVC)
-                target_compile_options(${target_name} PUBLIC /DXSIMD_WITH_SSE4_2 /d2archSSE42)
-            else ()
-                target_compile_options(${target_name} PUBLIC -msse4)
-            endif ()
-        elseif (simd_mode STREQUAL "AVX2")
-            if (MSVC)
-                target_compile_options(${target_name} PUBLIC /arch:AVX2)
-            else ()
-                target_compile_options(${target_name} PUBLIC -mavx2)
-            endif ()
-        endif ()
-    endif ()
-endfunction()
Author	SHA1	Message	Date
Andrew Cassidy	a05c1e352e	Release 0.2.1 ### Fixed - Fixed broken transparency on palettized PNG files ### Changed - Changed which wheels are built by the CI. There are no changes to OS or Python version compatibility if you compile from source. - Stopped building Python 3.7 wheels - Stopped building macOS universal wheels - Wheels for macOS now require macOS 12 or later - Included macOS ARM wheels - Included Python 3.12 wheels	2024-06-02 18:58:48 -07:00
Andrew Cassidy	1f7aad7218	skip python3.7 because its EOL Still supported for now, just no wheels provided	2024-06-02 17:56:13 -07:00
Andrew Cassidy	23133eb802	Fix build matrix and macOS target	2024-06-02 17:43:33 -07:00
Andrew Cassidy	0448dbe6e1	Update CIBuildWheel	2024-06-02 17:32:00 -07:00
Andrew Cassidy	ec7953dcff	Release 0.2.0 ### Changed - Updated Pybind11 to version 3.10, adding Python 3.11 support - Updated install instructions in readme to reflect availability on PyPI - Encode now skips .dds files in its input to prevent needless re-encoding ### Added - Added the `-n` option for bc3 encoding to perform a BC3nm swizzle	2023-06-21 15:46:13 -07:00
Andrew Cassidy	3280fc74be	Merge branch 'dev'	2023-06-21 15:45:35 -07:00
Andrew Cassidy	cbec93ed55	Skip any dds files when encoding	2023-06-21 15:41:08 -07:00
Andrew Cassidy	8509384bff	Add -n option to `encode bc3` which performs a BC3nm swizzle	2023-06-21 15:32:24 -07:00
Andrew Cassidy	1c86b09ca0	Fix docs link	2023-06-21 15:31:47 -07:00
Andrew Cassidy	d4eada16f9	Update readme with easier install directions	2023-06-21 15:11:17 -07:00
Andrew Cassidy	aed575edc6	Release 0.1.4 ### Changed - Updated Pybind11 to version 3.10, adding Python 3.11 support	2022-10-29 23:32:05 -07:00
Andrew Cassidy	4cdcb65f3a	Merge branch 'dev'	2022-10-29 23:31:45 -07:00
Andrew Cassidy	0a66fcca20	Merge pull request #35 from drewcassidy/dependabot/github_actions/dev/pypa/cibuildwheel-2.11.2 Bump pypa/cibuildwheel from 2.5.0 to 2.11.2	2022-10-29 22:13:20 -07:00
Andrew Cassidy	37f0673e95	Merge pull request #33 from drewcassidy/dependabot/github_actions/dev/actions/setup-python-4.3.0 Bump actions/setup-python from 4.0.0 to 4.3.0	2022-10-29 22:12:59 -07:00
Andrew Cassidy	b81df96990	track python dependencies	2022-10-29 21:55:53 -07:00
dependabot[bot]	38beffef05	Bump pypa/cibuildwheel from 2.5.0 to 2.11.2 Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.5.0 to 2.11.2. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/2.5.0...v2.11.2) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2022-10-29 21:05:59 -07:00
Andrew Cassidy	0dccd1cd07	Update pybind to 3.10 to allow Python 3.11 support	2022-10-29 21:03:12 -07:00
dependabot[bot]	7ea104f712	Bump actions/setup-python from 4.0.0 to 4.3.0 Bumps [actions/setup-python](https://github.com/actions/setup-python) from 4.0.0 to 4.3.0. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v4.0.0...v4.3.0) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2022-10-10 12:46:28 +00:00
Andrew Cassidy	9cb60f0ce2	Merge pull request #21 from drewcassidy/dependabot/github_actions/dev/actions/setup-python-4.0.0 Bump actions/setup-python from 3.1.2 to 4.0.0	2022-06-12 18:02:11 -07:00
Andrew Cassidy	15e0c68df6	Merge branch 'dev' into dependabot/github_actions/dev/actions/setup-python-4.0.0	2022-06-12 17:18:18 -07:00
Andrew Cassidy	9f54349556	Specify python versions	2022-06-12 17:16:18 -07:00
dependabot[bot]	71c069d30c	Bump actions/setup-python from 3.1.2 to 4.0.0 Bumps [actions/setup-python](https://github.com/actions/setup-python) from 3.1.2 to 4.0.0. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v3.1.2...v4.0.0) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2022-06-09 11:32:38 +00:00
Andrew Cassidy	661536e6f6	use scoped lock	2022-05-22 20:59:37 -07:00