Rework ranges library

Better matches the standard library, and iterators moved to their own file
Add subrange template
2024-09-13 06:37:34 +00:00 · 2022-07-05 22:51:25 -07:00 · 2022-07-03 19:08:15 -07:00 · 2022-07-03 11:56:37 -07:00 · 2022-07-02 17:14:12 -07:00 · 2022-07-02 17:02:28 -07:00
128 changed files with 4310 additions and 1529 deletions
--- a/.clang-format
+++ b/.clang-format
@ -1,10 +1,11 @@
 ---
 BasedOnStyle: google
 IndentWidth: 4
-ColumnLimit: 160
+ColumnLimit: 120
 AllowShortBlocksOnASingleLine: Always
 AllowShortFunctionsOnASingleLine: All
 AlwaysBreakTemplateDeclarations: MultiLine
+#RequiresClausePositionStyle: SingleLine # requires Clang 15 :(
 #AlignConsecutiveDeclarations: true
 ---

--- a/.clang-tidy
+++ b/.clang-tidy
@ -2,15 +2,15 @@ FormatStyle: google

 Checks: '-*,clang-diagnostic-*,llvm-*,misc-*,-misc-unused-parameters,-misc-non-private-member-variables-in-classes,readability-identifier-naming,cppcoreguidelines-narrowing-conversions'
 CheckOptions:
-  - { key: readability-identifier-naming.NamespaceCase,          value: lower_case }
-  - { key: readability-identifier-naming.ClassCase,              value: CamelCase }
-  - { key: readability-identifier-naming.StructCase,             value: CamelCase }
-  - { key: readability-identifier-naming.TemplateParameterCase,  value: CamelCase }
-  - { key: readability-identifier-naming.FunctionCase,           value: aNy_CasE }
-  - { key: readability-identifier-naming.VariableCase,           value: lower_case }
-  - { key: readability-identifier-naming.MemberCase,        value: lower_case }
-  - { key: readability-identifier-naming.PrivateMemberPrefix,    value: _ }
-  - { key: readability-identifier-naming.ProtectedMemberPrefix,  value: _ }
+  - { key: readability-identifier-naming.NamespaceCase,            value: lower_case }
+  - { key: readability-identifier-naming.ClassCase,                value: CamelCase }
+  - { key: readability-identifier-naming.StructCase,               value: CamelCase }
+  - { key: readability-identifier-naming.TemplateParameterCase,    value: CamelCase }
+  - { key: readability-identifier-naming.FunctionCase,             value: lower_case }
+  - { key: readability-identifier-naming.VariableCase,             value: lower_case }
+  - { key: readability-identifier-naming.MemberCase,               value: lower_case }
+  - { key: readability-identifier-naming.PrivateMemberPrefix,      value: _ }
+  - { key: readability-identifier-naming.ProtectedMemberPrefix,    value: _ }
  - { key: readability-identifier-naming.EnumConstantCase,         value: CamelCase }
  - { key: readability-identifier-naming.ConstexprVariableCase,    value: CamelCase }
  - { key: readability-identifier-naming.GlobalConstantCase,       value: CamelCase }
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@ -0,0 +1,8 @@
+# git-blame ignored revisions
+# To configure, run
+#   git config blame.ignoreRevsFile .git-blame-ignore-revs
+# Requires Git >= 2.23
+# See https://git-scm.com/docs/git-blame#Documentation/git-blame.txt---ignore-revs-fileltfilegt
+
+# Migrate code style to Black
+cb84f32edab717389d03a3855aa5bd4d0db1ae3c
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -0,0 +1,10 @@
+# Set update schedule for GitHub Actions
+version: 2
+updates:
+
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    target-branch: "dev"
+    schedule:
+      # Check for updates to GitHub Actions every weekday
+      interval: "daily"
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@ -6,25 +6,39 @@ name: Python Package
 on: [ push, pull_request ]

 jobs:
-  build-sdist:
-    name: Build SDist
-    runs-on: ubuntu-latest
-
+  test:
+    name: Run Unit Tests
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ macos-12, windows-latest, ubuntu-latest ]
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
        with:
-          # Whether to checkout submodules: `true` to checkout submodules or `recursive` to
-          # recursively checkout submodules.
-          submodules: 'true'
+          fetch-depth: 0
+          submodules: recursive
+
+      - name: Set up GCC
+        if: runner.os == 'linux'
+        uses: egor-tensin/setup-gcc@v1
+        with:
+          version: 10
+
+      - name: Setup cmake
+        uses: jwlawson/actions-setup-cmake@v1.12
+        with:
+          cmake-version: 'latest'
+          github-api-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v3.1.2
+        with:
+          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          python -m pip install flake8
-          python -m pip install setuptools twine build
+          python -m pip install flake8 pybind11

      - name: Lint with flake8
        run: |
@ -33,6 +47,35 @@ jobs:
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics

+      - name: Build C code
+        run: |
+          ls
+          cmake -S . -B build -DQUICKTEX_NOPYTHON=TRUE -DCMAKE_BUILD_TYPE=Debug
+          cmake --build build
+
+      - name: Test C code
+        run: |
+          ctest -V --test-dir build -C Debug
+
+  build-sdist:
+    name: Build SDist
+    runs-on: ubuntu-latest
+    needs: test
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          submodules: recursive
+
+      - name: Set up Python
+        uses: actions/setup-python@v3.1.2
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install setuptools twine build
+
      - name: Build SDist
        run: python -m build --sdist

@ -40,80 +83,67 @@ jobs:
        run: python -m twine check dist/*

      - name: Upload SDist
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
        with:
          path: dist/*.tar.gz

  build-wheels:
-    name: Build Wheels for ${{ matrix.os }}-${{ matrix.arch }}
+    name: Build Wheels on ${{ matrix.os }} ${{ matrix.arch[0] }}
    runs-on: ${{ matrix.os }}
+    needs: test
    strategy:
      matrix:
-        os: [ macos-latest, windows-latest, ubuntu-latest ]
-        arch: [ x86_64 ]
+        os: [ macos-12, windows-latest, ubuntu-latest ]
+        arch: [ [ 'x86', 'x86_64', 'AMD64', 'x86_64' ] ] #[suffix, mac, windows, linux] arch names
        include:
-          - arch: arm64
-            os: macos-latest
-          - arch: aarch64
-            os: ubuntu-latest
-            qemu: true
+          - os: ubuntu-latest
+            arch: [ 'ARM', 'arm64', 'ARM64', 'aarch64' ]

    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
        with:
-          # Whether to checkout submodules: `true` to checkout submodules or `recursive` to
-          # recursively checkout submodules.
-          submodules: 'true'
+          fetch-depth: 0
+          submodules: recursive

      - name: Install libomp
        if: runner.os == 'macOS'
        # openMP isnt part of core apple clang for some reason?
-        run: brew install libomp
+        # libomp is in homebrew, which works for end users but it's not a fat binary
+        # so we have to install it manually
+        # compiled dylibs courtesy of https://mac.r-project.org/openmp/ and mirrored on my own server
+        run: |
+          wget https://pileof.rocks/openmp-13.0.0-darwin21-Release.tar.gz
+          sudo tar fvxz openmp-*.tar.gz -C /

      - name: Install QEMU
        # install QEMU if building for arm linux
-        uses: docker/setup-qemu-action@v1
-        if: matrix.qemu
+        uses: docker/setup-qemu-action@v2
+        if: runner.os == 'linux' && matrix.arch[3] == 'aarch64'
        with:
          platforms: arm64

-      - name: Install test images
-        run: git clone https://git.pileof.rocks/drewcassidy/quicktex-test-images.git tests/images
-
      - name: Build wheels
-        uses: joerick/cibuildwheel@v2.3.1
+        uses: pypa/cibuildwheel@2.5.0
        env:
-          MACOSX_DEPLOYMENT_TARGET: "10.15"
-          CIBW_BUILD: "cp{37,38,39,310}-*"
-          CIBW_ARCHS_MACOS: ${{ matrix.arch }}
-          CIBW_ARCHS_LINUX: ${{ matrix.arch }}
-          CIBW_ARCHS_WINDOWS: "auto64"
-          CIBW_MANYLINUX_X86_64_IMAGE: "manylinux2014"
-          CIBW_MANYLINUX_AARCH64_IMAGE: "manylinux2014"
-          CIBW_TEST_EXTRAS: "tests"
-          CIBW_TEST_COMMAND: nosetests {project}/tests -d
-          CIBW_TEST_SKIP: "*-macosx_arm64"
+          MACOSX_DEPLOYMENT_TARGET: "10.9"
+          CIBW_ARCHS_LINUX: ${{ matrix.arch[3] }}

      - name: Upload Wheels
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
        with:
          path: ./wheelhouse/*.whl

  publish:
    name: Publish to PyPI and Github
-    needs: [build-wheels, build-sdist]
+    needs: [ build-wheels, build-sdist ]
    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
    runs-on: ubuntu-latest

    steps:
-      - uses: actions/checkout@v2
-        with:
-          # Whether to checkout submodules: `true` to checkout submodules or `recursive` to
-          # recursively checkout submodules.
-          submodules: 'true'
+      - uses: actions/checkout@v3 # just need the changelog

      - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v3.1.2

      - name: Install dependencies
        run: |
@ -126,7 +156,7 @@ jobs:
          echo "$(yaclog show -mb)" >> RELEASE.md

      - name: Download Artifacts
-        uses: actions/download-artifact@v2
+        uses: actions/download-artifact@v3
        with:
          name: artifact
          path: dist
--- a/.gitignore
+++ b/.gitignore
@ -5,6 +5,8 @@ build/
 *.egg-info
 *.pyc
 *.pyi
+*.whl
+*.tar.gz

 #sphinx
 docs/_build/
@ -12,9 +14,6 @@ docs/_build/
 #mypy
 out

-# Test images
-tests/images/
-
 # IDEs
 **/.idea

@ -34,4 +33,4 @@ compile_commands.json
 CTestTestfile.cmake
 _deps
 cmake-build-*
-_skbuild
+*.a
--- a/.gitmodules
+++ b/.gitmodules
@ -1,4 +1,3 @@
-[submodule "extern/pybind11"]
-	path = extern/pybind11
-	url = https://github.com/pybind/pybind11.git
-	branch = stable
+[submodule "external/xsimd"]
+	path = external/xsimd
+	url = https://github.com/xtensor-stack/xsimd.git
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -0,0 +1,28 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the version of Python and other tools you might need
+build:
+  os: ubuntu-20.04
+  tools:
+    python: "3.10"
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+  configuration: docs/conf.py
+
+# If using Sphinx, optionally build your docs in additional formats such as PDF
+formats:
+  - pdf
+
+# Give python build instructions
+python:
+  install:
+    - method: pip
+      path: .
+      extra_requirements:
+        - docs
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,26 @@

 All notable changes to this project will be documented in this file

+## Unreleased
+
+### Fixed
+
+- Fixed LeastSquares endpoint mode producing incorrect results
+
+
+## 0.1.3 - 2022-04-13
+
+### Fixed
+
+- Fixed quicktex not compiling for python 3.10 on Windows
+
+### Changed
+
+- Reworked CI job, adding wheels for ARM macOS, ARM Linux, and x86 musl Linux.
+- Added wheels for python 3.10
+- Added a more useful error message when importing quicktex on macOS when libomp.dylib isn't installed
+
+
 ## 0.1.2 - 2022-03-27

 ### Fixed
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,89 +1,14 @@
-cmake_minimum_required(VERSION 3.17)
-include(CheckIPOSupported)
+cmake_minimum_required(VERSION 3.18)
 include(tools/CompilerWarnings.cmake)
+include(tools/SIMDFlags.cmake)
+set(CMAKE_VERBOSE_MAKEFILE ON)

 project(quicktex)

-if (SKBUILD)
-    # Scikit-Build does not add your site-packages to the search path
-    # automatically, so we need to add it _or_ the pybind11 specific directory
-    # here.
-    execute_process(
-            COMMAND "${PYTHON_EXECUTABLE}" -c
-            "import pybind11; print(pybind11.get_cmake_dir())"
-            OUTPUT_VARIABLE _tmp_dir
-            OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ECHO STDOUT)
-    list(APPEND CMAKE_PREFIX_PATH "${_tmp_dir}")
-endif ()
+add_subdirectory(external/xsimd)

-# Find dependencies
-find_package(Python COMPONENTS Interpreter Development)
-find_package(pybind11 CONFIG REQUIRED)
-find_package(OpenMP)
+add_subdirectory(quicktex)
+add_subdirectory(tests)

-# Collect source files
-file(GLOB SOURCE_FILES
-        "quicktex/*.cpp"
-        "quicktex/s3tc/*.cpp"
-        "quicktex/s3tc/bc1/*.cpp"
-        "quicktex/s3tc/bc3/*.cpp"
-        "quicktex/s3tc/bc4/*.cpp"
-        "quicktex/s3tc/bc5/*.cpp"
-        "quicktex/s3tc/interpolator/*.cpp"
-        )
-
-file(GLOB HEADER_FILES
-        "quicktex/*.h"
-        "quicktex/s3tc/*.h"
-        "quicktex/s3tc/bc1/*.h"
-        "quicktex/s3tc/bc3/*.h"
-        "quicktex/s3tc/bc4/*.h"
-        "quicktex/s3tc/bc5/*.h"
-        "quicktex/s3tc/interpolator/*.h"
-        )
-
-file(GLOB TEST_FILES "tests/*.cpp")
-
-file(GLOB_RECURSE PYTHON_FILES "src/**/*.py")
-
-# Organize source files together for some IDEs
-source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SOURCE_FILES} ${HEADER_FILES} ${PYTHON_FILES})
-
-# Add python module
-pybind11_add_module(_quicktex
-        ${SOURCE_FILES}
-        ${HEADER_FILES})
-
-add_executable(test_quicktex
-        ${SOURCE_FILES}
-        ${HEADER_FILES}
-        ${TEST_FILES})
-
-target_link_libraries(test_quicktex PRIVATE pybind11::embed)
-
-target_compile_definitions(test_quicktex PRIVATE -DCUSTOM_SYS_PATH="${CMAKE_HOME_DIRECTORY}/env/lib/python3.9/site-packages")
-target_compile_definitions(_quicktex PRIVATE VERSION_INFO=${QUICKTEX_VERSION_INFO})
-
-# enable openMP if available
-if (OpenMP_CXX_FOUND)
-    target_link_libraries(_quicktex PUBLIC OpenMP::OpenMP_CXX)
-    target_link_libraries(test_quicktex PUBLIC OpenMP::OpenMP_CXX)
-endif ()
-
-# Set module features, like C/C++ standards
-target_compile_features(_quicktex PUBLIC cxx_std_17 c_std_11)
-target_compile_features(test_quicktex PUBLIC cxx_std_17 c_std_11)
-
-set_project_warnings(_quicktex)
-set_project_warnings(test_quicktex)
-
-set(CMAKE_VERBOSE_MAKEFILE ON)
-
-if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang")
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -stdlib=libc++ -fsanitize=undefined")
-    set(PROJECT_WARNINGS ${CLANG_WARNINGS})
-endif ()
-
-install(TARGETS _quicktex LIBRARY DESTINATION quicktex)
+enable_testing ()
+add_test (NAME QuicktexTest COMMAND Test)
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@ -0,0 +1,25 @@
+# Development
+
+This document outlines how to set up a development environment for Quicktex. Documentation on writing Python extension modules is sparse, so I hope this document is useful for other projects as well. The [Coding Patterns for Python Extensions](https://pythonextensionpatterns.readthedocs.io/en/latest/) site has some useful information and will be linked to often in this document. 
+
+## Setting up Debug Python
+
+Many development tools require debug symbols to function, and since the front-end for accessing an extension module is Python, that usually means adding debug symbols to Python. [This Page](https://pythonextensionpatterns.readthedocs.io/en/latest/debugging/debug_python.html) has some instructions on building python with debug symbols. 
+
+If you plan to use DTrace, enable the `--with-dtrace` flag when running `configure`. 
+
+It's useful for this debug python to have SSL enabled so that packages can be installed using pip. Enable SSL with the `--with-openssl` flag when running `configure`. If you are on macOS and installed OpenSSL through Homebrew, you may need to use `--with-openssl=$(brew --prefix openssl)` to help the compiler find it. 
+
+### Installing Debug Python
+
+You can keep the resulting binary in your local copy of the cpython repo and symlink to it, but I like to install it somewhere like `/opt/python-debug/`. The install location is set in the `configure` tool using the `--prefix` flag, and installation is done by running `make install`
+
+### Mixing Debug and Release Python
+
+The debug build of python is slow (It may be possible to build with debug symbols but full optimization, I have not looked into it). If you already have a venv setup for your project, you can just symlink the debug python binary into `env/bin` with a full name like `python3.9d`. Make sure that the debug build has the same minor version (e.g '3.9') as the version you made the virtual environment with to maintain ABI compatibility. 
+
+## Profiling with DTrace
+
+DTrace is the default program profiler on macOS and other Unix systems, but it's also available for use on Windows and Linux. Using DTrace requires building Python with DTrace hooks as seen above. 
+
+Your extension module does not need a full debug build to profile, but it does need frame pointers to see the stack trace at each sample, as well as debug symbols to give functions names. The cmake build type `RelWithDebInfo` handles this automatically. 
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1 +1,2 @@
-graft extern
+graft external
+global-exclude *.afdesign # this is currently the vast majority of the repo size
--- a/docs/changelog.md
+++ b/docs/changelog.md
@ -0,0 +1,2 @@
+```{include} ../CHANGELOG.md
+```
--- a/docs/conf.py
+++ b/docs/conf.py
@ -13,12 +13,13 @@
 # import os
 # import sys
 # sys.path.insert(0, os.path.abspath('..'))
+from datetime import date


 # -- Project information -----------------------------------------------------

 project = 'Quicktex'
-copyright = '2021, Andrew Cassidy'
+copyright = f'{date.today().year}, Andrew Cassidy'
 author = 'Andrew Cassidy'

 # -- General configuration ---------------------------------------------------
@ -28,11 +29,14 @@ author = 'Andrew Cassidy'
 # ones.
 extensions = [
    'myst_parser',
+    'sphinx_click',
    'sphinx_rtd_theme',
    'sphinx.ext.autodoc',
    'sphinx.ext.intersphinx',
 ]

+myst_heading_anchors = 2
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']

@ -70,5 +74,5 @@ autodoc_default_options = {
 # should be linked to in this documentation.
 intersphinx_mapping = {
    'python': ('https://docs.python.org/3', None),
-    'PIL': ('https://pillow.readthedocs.io/en/stable/', None)
+    'PIL': ('https://pillow.readthedocs.io/en/stable/', None),
 }
--- a/docs/development.md
+++ b/docs/development.md
@ -0,0 +1,2 @@
+```{include} ../DEVELOPMENT.md
+```
--- a/docs/handbook/commands.md
+++ b/docs/handbook/commands.md
@ -0,0 +1,7 @@
+# Command Reference
+
+```{eval-rst}
+.. click:: quicktex.__main__:main
+   :prog: quicktex
+   :nested: full
+```
--- a/docs/handbook/getting_started.md
+++ b/docs/handbook/getting_started.md
@ -0,0 +1,84 @@
+# Getting Started
+
+## Installation
+
+Install and update using [pip](https://pip.pypa.io/en/stable/quickstart/):
+
+```shell
+pip install -U quicktex
+```
+
+If you are on macOS, you need to install openMP to allow multithreading, since it does not ship with the built-in Clang.
+This can be done easily
+using [homebrew](https://brew.sh). This is not required if building from source, but highly recommended.
+
+```shell
+brew install libomp
+```
+
+If you want, you can also install from source. First clone the [git repo](https://github.com/drewcassidy/quicktex) and
+install it with:
+
+```shell
+pip install .
+```
+
+and setuptools will take care of any dependencies for you.
+
+The package also makes tests, stub generation, and docs available. To install the
+required dependencies for them, install with options like so:
+
+```shell
+pip install quicktex[tests,stubs,docs]
+```
+
+## Usage
+
+For detailed documentation on the {command}`quicktex` command and its subcommands see the {doc}`commands`.
+
+### Examples
+
+#### Encoding a file
+
+To encode a file in place, use the {command}`encode` command
+
+```shell
+quicktex encode auto bun.png # chooses format based on alpha
+quicktex encode bc3 bun.png # encodes as bc3
+```
+
+The `auto` subcommand automatically chooses between bc1 and bc3 for your image depending on the contents of its alpha
+channel. Quicktex supports reading from all image formats supported by [pillow](https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html).
+
+By default, Quicktex converts in place, meaning the above command will produce a `bun.dds` file alongside the png. If
+you want to replace the png, use the `-r` flag to remove it after converting.
+
+If you want to specify an output filename or directory, use the `-o` flag.
+
+```shell
+quicktex encode auto -o rabbit.dds bun.png # produces rabbit.dds
+quicktex encode auto -o textures/ bun.png # produces textures/bun.dds, if textures/ exists
+```
+
+#### Encoding multiple files
+
+Quicktex is also able to convert multiple files at once. For example, to encode every png file in the images folder,
+use:
+
+```shell
+quicktex encode auto images/*.png # encodes in-place
+quicktex encode auto -o textures/ images/*.png # encodes to the textures/ directory
+```
+
+Please note that globbing is an operation performed by your shell and is not supported by the built-in Windows `cmd.exe`.
+If you are on Windows, please use PowerShell or any POSIX-ish shell like [fish](https://fishshell.com).
+
+#### Decoding files
+
+Decoding is performed exactly the same as encoding, except without having to specify a format. The output image format
+is set using the `-x` flag, and defaults to png. Quicktex supports writing to all image formats supported by [pillow](https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html).
+
+```shell
+quicktex decode bun.dds # produces bun.png
+quicktex decode -x .tga bun2.dds # produces bun2.tga
+```
--- a/docs/handbook/index.md
+++ b/docs/handbook/index.md
@ -0,0 +1,10 @@
+# Handbook
+
+```{toctree}
+---
+maxdepth: 3
+---
+
+commands
+getting_started
+```
--- a/docs/index.md
+++ b/docs/index.md
@ -0,0 +1,35 @@
+# Welcome to Quicktex's Documentation
+
+[![Documentation Status](https://readthedocs.org/projects/quicktex/badge/?version=latest)](https://quicktex.readthedocs.io/en/latest/?badge=latest)
+[![Python Package](https://github.com/drewcassidy/quicktex/actions/workflows/python-package.yml/badge.svg)](https://github.com/drewcassidy/quicktex/actions/workflows/python-package.yml)
+[![PyPI version](https://badge.fury.io/py/quicktex.svg)](https://badge.fury.io/py/quicktex)
+
+Quicktex is a Python library for encoding, decoding, and manipulating compressed textures. It uses a backend written in
+C++ for superior performance, as well as an extensive API for low-level access to the texture data. The compression
+engine is based on [rgbcx](https://github.com/richgel999/bc7enc).
+
+```{toctree}
+---
+maxdepth: 2
+caption: Contents
+---
+
+handbook/index
+reference/index
+```
+
+```{toctree}
+---
+maxdepth: 1
+---
+
+development
+Changelog <changelog>
+License <license>
+```
+
+## Indices and tables
+
+* {ref}`genindex`
+* {ref}`modindex`
+* {ref}`search`
--- a/docs/index.rst
+++ b/docs/index.rst
@ -1,15 +0,0 @@
-Welcome to Quicktex's documentation!
-========================================
-
-.. toctree::
-   :maxdepth: 2
-
-   reference/index.rst
-
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
--- a/docs/license.md
+++ b/docs/license.md
@ -0,0 +1,2 @@
+```{include} ../LICENSE.md
+```
--- a/docs/reference/conversion.rst
+++ b/docs/reference/conversion.rst
@ -1,10 +0,0 @@
-.. py:currentmodule:: quicktex
-
-Conversion
-============
-
-.. autoclass:: BlockEncoder
-    :members:
-
-.. autoclass:: BlockDecoder
-    :members:
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@ -1,10 +1,9 @@
-Reference
-=========
+API Reference
+=============

 .. toctree::
    :maxdepth: 2

-    conversion.rst
    dds.rst
    image_utils.rst
    formats/index.rst
--- a/external/xsimd
+++ b/external/xsimd
--- a/pyproject.toml
+++ b/pyproject.toml
@ -3,9 +3,8 @@ requires = [
    "setuptools>=61",
    "setuptools_scm>=6.2",
    "wheel",
-    "pybind11>=2.9.0",
-    "cmake>=3.22",
-    "scikit-build>0.13",
+    "cmake>=3.18",
+    "pybind11~=2.6.1",
    "ninja; sys_platform != 'win32'",
 ]
 build-backend = "setuptools.build_meta"
@ -14,7 +13,6 @@ build-backend = "setuptools.build_meta"
 name = "quicktex"
 description = "A fast block compression library for python"
 readme = "README.md"
-license = { file = "LICENSE.md" }
 authors = [{ name = "Andrew Cassidy", email = "drewcassidy@me.com" }]

 classifiers = [
@ -38,13 +36,19 @@ dependencies = ["Pillow", "click"]
 dynamic = ["version"]

 [project.optional-dependencies]
-tests = ["nose", "parameterized"]
-docs = ["sphinx", "myst-parser", "sphinx-rtd-theme"]
+tests = ["parameterized", "pytest"]
+docs = [
+    "Sphinx >= 3.5",
+    "sphinx-click >= 2.7",
+    "sphinx-rtd-theme",
+    "myst-parser >= 0.14",
+]
 stubs = ["pybind11-stubgen"]

 [project.urls]
-repository = "https://github.com/drewcassidy/quicktex"
-changelog = "https://github.com/drewcassidy/quicktex/blob/main/CHANGELOG.md"
+Docs = "https://quicktex.readthedocs.io/en/"
+Source = "https://github.com/drewcassidy/quicktex"
+Changelog = "https://github.com/drewcassidy/quicktex/blob/main/CHANGELOG.md"

 [project.scripts]
 quicktex = "quicktex.__main__:main"
@ -56,3 +60,33 @@ package-data = { '*' = ['py.typed', '*.pyi'] } # include stubs
 package-dir = { '' = '.' } # without this line, C++ source files get included in the bdist

 [tool.setuptools_scm]
+
+[tool.cibuildwheel]
+build = "cp*" # only build wheels for cpython.
+build-frontend = "build"
+test-command = "pytest {project}/tests --verbose --full-trace --capture=tee-sys"
+test-extras = ["tests"]
+test-skip = "*-macosx_arm64 *-macosx_universal2:arm64" # skip testing on arm macOS because CIBW doesnt support it
+environment = { QUICKTEX_SIMD_MODE = "SSE4" } # SSE4 has a 99% market share and was released under the Bush administration
+
+[tool.cibuildwheel.macos]
+archs = ["x86_64", "universal2"] # build fat binaries, or x86-64 for python 3.7
+skip = ["cp{38,39,31*}-macosx_x86_64"] # skip x86-only builds where fat binaries are supported
+
+[tool.cibuildwheel.windows]
+archs = ["auto64"] # arm64 windows builds not yet supported
+
+[tool.cibuildwheel.linux]
+skip = ["*musllinux*"]
+manylinux-x86_64-image = "manylinux2014"
+manylinux-aarch64-image = "manylinux2014"
+
+[tool.black]
+line-length = 120 # 80-column is stupid
+target-version = ['py37', 'py38', 'py39', 'py310']
+skip-string-normalization = true
+
+[tool.pytest.ini_options]
+minversion = "7.0"
+addopts = ["--full-trace", "--capture=tee-sys"]
+testpaths = ["tests"]
--- a/quicktex/CMakeLists.txt
+++ b/quicktex/CMakeLists.txt
@ -0,0 +1,71 @@
+
+# Find dependencies
+if (NOT QUICKTEX_NOPYTHON)
+    find_package(Python COMPONENTS Interpreter Development.Module)
+    find_package(pybind11 CONFIG REQUIRED)
+endif ()
+find_package(OpenMP)
+
+#Collect source files
+set(SOURCE_FILES
+        Matrix4x4.cpp OldColor.cpp
+        s3tc/bc1/BC1Block.cpp s3tc/bc1/BC1Decoder.cpp
+        s3tc/bc1/BC1Encoder.cpp s3tc/bc1/OrderTable.cpp s3tc/bc1/OrderTable4.cpp
+        s3tc/bc3/BC3Decoder.cpp s3tc/bc3/BC3Encoder.cpp
+        s3tc/bc4/BC4Block.cpp s3tc/bc4/BC4Decoder.cpp s3tc/bc4/BC4Encoder.cpp
+        s3tc/bc5/BC5Decoder.cpp s3tc/bc5/BC5Encoder.cpp
+        s3tc/interpolator/Interpolator.cpp
+        texture/RawTexture.cpp texture/Window.cpp test.cpp)
+
+set(BINDING_FILES
+        _bindings.cpp
+        s3tc/_bindings.cpp
+        s3tc/bc1/_bindings.cpp
+        s3tc/bc3/_bindings.cpp
+        s3tc/bc4/_bindings.cpp
+        s3tc/bc5/_bindings.cpp
+        s3tc/interpolator/_bindings.cpp)
+
+file(GLOB_RECURSE HEADER_FILES "**.h")
+file(GLOB_RECURSE PYTHON_FILES "**.py")
+
+# Organize source files together for some IDEs
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SOURCE_FILES} ${BINDING_FILES} ${HEADER_FILES} ${PYTHON_FILES})
+
+# Declare implementation module
+add_library(quicktex STATIC ${SOURCE_FILES} ${HEADER_FILES})
+
+# Link openMP if available
+if (OpenMP_CXX_FOUND)
+    target_link_libraries(quicktex PUBLIC OpenMP::OpenMP_CXX)
+endif ()
+
+# Link XSimd
+target_link_libraries(quicktex PUBLIC xsimd)
+
+# Set library features, like C/C++ standards
+target_compile_features(quicktex PUBLIC cxx_std_20 c_std_11)
+set_property(TARGET quicktex PROPERTY CXX_VISIBILITY_PRESET hidden)
+set_property(TARGET quicktex PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+# Include source root for project-relative includes
+target_include_directories(quicktex PUBLIC .)
+
+# Set compiler warnings and SIMD flags
+set_project_warnings(quicktex)
+set_simd_flags(quicktex)
+
+if (NOT QUICKTEX_NOPYTHON)
+    # Declare python module
+    pybind11_add_module(_quicktex ${BINDING_FILES} ${HEADER_FILES})
+    target_compile_definitions(_quicktex PRIVATE VERSION_INFO=${QUICKTEX_VERSION_INFO})
+
+    # Link python module with implementation
+    target_link_libraries(_quicktex PUBLIC quicktex)
+
+    if ((NOT MSVC) AND (CMAKE_BUILD_TYPE MATCHES Debug) AND ($ENV{QUICKTEX_SANITIZE}))
+        target_compile_options(_quicktex PUBLIC -fsanitize=address,undefined -fno-sanitize-recover=address,undefined -fno-omit-frame-pointer)
+        target_link_options(_quicktex PUBLIC -fsanitize=address,undefined -fno-sanitize-recover=address,undefined -fno-omit-frame-pointer)
+    endif ()
+
+endif ()
--- a/quicktex/Color.h
+++ b/quicktex/Color.h
@ -18,82 +18,60 @@
 */

 #pragma once
-#include <cassert>  // for assert
-#include <cstddef>  // for size_t
-#include <cstdint>  // for uint8_t, uint16_t
+#include "Matrix.h"
+#include "util/bitbash.h"

 namespace quicktex {
-class Vector4;
-class Vector4Int;

-#pragma pack(push, 1)
-class Color {
+using Color = Vec<uint8_t, 4>;
+using ColorRGB = Vec<uint8_t, 3>;
+
+constexpr size_t uint5_max = (1 << 5) - 1;
+constexpr size_t uint6_max = (1 << 6) - 1;
+
+template <size_t N> struct MidpointTable {
   public:
-    uint8_t r;
-    uint8_t g;
-    uint8_t b;
-    uint8_t a;
-
-    constexpr Color() : Color(0, 0, 0, 0xFF) {}
-
-    constexpr Color(uint8_t vr, uint8_t vg, uint8_t vb, uint8_t va = 0xFF) : r(vr), g(vg), b(vb), a(va) {}
-
-    Color(Vector4Int v);
-
-    static uint16_t Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b);
-    static uint16_t Pack565(uint8_t r, uint8_t g, uint8_t b);
-
-    static Color Unpack565Unscaled(uint16_t Packed);
-    static Color Unpack565(uint16_t Packed);
-
-    static Color PreciseRound565(Vector4 &v);
-
-    static Color Min(const Color &A, const Color &B);
-    static Color Max(const Color &A, const Color &B);
-
-    bool operator==(const Color &Rhs) const;
-    bool operator!=(const Color &Rhs) const;
-
-    uint8_t operator[](size_t index) const {
-        assert(index < 4);
-        return reinterpret_cast<const uint8_t *>(this)[index];
-    }
-    uint8_t &operator[](size_t index) {
-        assert(index < 4);
-        return reinterpret_cast<uint8_t *>(this)[index];
+    constexpr MidpointTable() : _values() {
+        constexpr float fN = (float)N;
+        for (unsigned i = 0; i < N - 1; i++) { _values[i] = ((float)i / fN) + (0.5f / fN); }
+        _values[N - 1] = 1e+37f;
    }

-    operator Vector4() const;
-    operator Vector4Int() const;
-    friend Vector4Int operator-(const Color &lhs, const Color &rhs);
-
-    void SetRGB(uint8_t vr, uint8_t vg, uint8_t vb);
-    void SetRGB(const Color &other) { SetRGB(other.r, other.g, other.b); }
-
-    uint16_t Pack565() const;
-    uint16_t Pack565Unscaled() const;
-
-    Color ScaleTo565() const;
-    Color ScaleFrom565() const;
-
-    size_t MinChannelRGB();
-    size_t MaxChannelRGB();
-
-    bool IsGrayscale() const { return ((r == g) && (r == b)); }
-    bool IsBlack() const { return (r | g | b) < 4; }
-
-    int GetLuma() const { return (13938U * r + 46869U * g + 4729U * b + 32768U) >> 16U; }  // REC709 weightings
+    float operator[](size_t i) const {
+        assert(i < N);
+        return _values[i];
+    }

   private:
-    static constexpr float Midpoints5bit[32] = {.015686f, .047059f, .078431f, .111765f, .145098f, .176471f, .207843f, .241176f, .274510f, .305882f, .337255f,
-                                                .370588f, .403922f, .435294f, .466667f, .5f,      .533333f, .564706f, .596078f, .629412f, .662745f, .694118f,
-                                                .725490f, .758824f, .792157f, .823529f, .854902f, .888235f, .921569f, .952941f, .984314f, 1e+37f};
-    static constexpr float Midpoints6bit[64] = {.007843f, .023529f, .039216f, .054902f, .070588f, .086275f, .101961f, .117647f, .133333f, .149020f, .164706f,
-                                                .180392f, .196078f, .211765f, .227451f, .245098f, .262745f, .278431f, .294118f, .309804f, .325490f, .341176f,
-                                                .356863f, .372549f, .388235f, .403922f, .419608f, .435294f, .450980f, .466667f, .482353f, .500000f, .517647f,
-                                                .533333f, .549020f, .564706f, .580392f, .596078f, .611765f, .627451f, .643137f, .658824f, .674510f, .690196f,
-                                                .705882f, .721569f, .737255f, .754902f, .772549f, .788235f, .803922f, .819608f, .835294f, .850980f, .866667f,
-                                                .882353f, .898039f, .913725f, .929412f, .945098f, .960784f, .976471f, .992157f, 1e+37f};
+    float _values[N];
 };
-#pragma pack(pop)
+
+constexpr MidpointTable<32> Midpoints5bit;
+constexpr MidpointTable<64> Midpoints6bit;
+
+template <typename T> Vec<T, 3> scale_to_565(Vec<T, 3> unscaled) {
+    return Vec<T, 3>{scale_from_8<T, 5>(unscaled.r()), scale_from_8<T, 6>(unscaled.g()),
+                     scale_from_8<T, 5>(unscaled.b())};
+}
+
+template <typename T> Vec<T, 3> scale_from_565(Vec<T, 3> scaled) {
+    return Vec<T, 3>{scale_to_8<T, 5>(scaled.r()), scale_to_8<T, 6>(scaled.g()), scale_to_8<T, 5>(scaled.b())};
+}
+
+template <typename T = int16_t> Vec<T, 3> precise_round_565(Vec<float, 3> &v) {
+    auto scaled = v * Vec<float, 3>{uint5_max, uint6_max, uint5_max};       // rescale from (0,1) to (0,int_max)
+    auto rounded = (Vec<T, 3>)scaled;                                       // downcast to integral type
+    rounded = rounded.clamp({0, 0, 0}, {uint5_max, uint6_max, uint5_max});  // clamp to avoid out of bounds float errors
+
+    // increment each channel if above the rounding point
+    if (v.r() > Midpoints5bit[rounded.r()]) rounded.r()++;
+    if (v.g() > Midpoints6bit[rounded.g()]) rounded.g()++;
+    if (v.b() > Midpoints5bit[rounded.b()]) rounded.b()++;
+
+    assert(rounded.r() <= uint5_max);
+    assert(rounded.g() <= uint6_max);
+    assert(rounded.b() <= uint5_max);
+
+    return rounded;
+}
 }  // namespace quicktex
--- a/quicktex/ColorBlock.h
+++ b/quicktex/ColorBlock.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -25,7 +25,7 @@
 #include <cstring>
 #include <stdexcept>

-#include "Color.h"
+#include "OldColor.h"
 #include "Vector4Int.h"

 namespace quicktex {
@ -34,9 +34,9 @@ using Coords = std::tuple<int, int>;
 template <int N, int M> class ColorBlock  {
   public:
    struct Metrics {
-        Color min;
-        Color max;
-        Color avg;
+        OldColor min;
+        OldColor max;
+        OldColor avg;
        bool is_greyscale;
        bool has_black;
        Vector4Int sums;
@ -45,37 +45,37 @@ template <int N, int M> class ColorBlock  {
    static constexpr int Width = N;
    static constexpr int Height = M;

-    constexpr Color Get(int x, int y) const {
+    constexpr OldColor Get(int x, int y) const {
        if (x >= Width || x < 0) throw std::invalid_argument("x value out of range");
        if (y >= Height || y < 0) throw std::invalid_argument("y value out of range");

        return _pixels[x + (N * y)];
    }

-    constexpr Color Get(int i) const {
+    constexpr OldColor Get(int i) const {
        if (i >= N * M || i < 0) throw std::invalid_argument("i value out of range");
        return _pixels[i];
    }

-    void Set(int x, int y, const Color &value) {
+    void Set(int x, int y, const OldColor &value) {
        if (x >= Width || x < 0) throw std::invalid_argument("x value out of range");
        if (y >= Height || y < 0) throw std::invalid_argument("y value out of range");
        _pixels[x + (N * y)] = value;
    }

-    void Set(int i, const Color &value) {
+    void Set(int i, const OldColor &value) {
        if (i >= N * M || i < 0) throw std::invalid_argument("i value out of range");
        _pixels[i] = value;
    }

-    void GetRow(int y, Color *dst) const {
+    void GetRow(int y, OldColor *dst) const {
        if (y >= Height || y < 0) throw std::invalid_argument("y value out of range");
-        std::memcpy(dst, &_pixels[N * y], N * sizeof(Color));
+        std::memcpy(dst, &_pixels[N * y], N * sizeof(OldColor));
    }

-    void SetRow(int y, const Color *src) {
+    void SetRow(int y, const OldColor *src) {
        if (y >= Height || y < 0) throw std::invalid_argument("y value out of range");
-        std::memcpy(&_pixels[N * y], src, N * sizeof(Color));
+        std::memcpy(&_pixels[N * y], src, N * sizeof(OldColor));
    }

    bool IsSingleColor() const {
@ -88,8 +88,8 @@ template <int N, int M> class ColorBlock  {

    Metrics GetMetrics(bool ignore_black = false) const {
        Metrics metrics;
-        metrics.min = Color(UINT8_MAX, UINT8_MAX, UINT8_MAX);
-        metrics.max = Color(0, 0, 0);
+        metrics.min = OldColor(UINT8_MAX, UINT8_MAX, UINT8_MAX);
+        metrics.max = OldColor(0, 0, 0);
        metrics.has_black = false;
        metrics.is_greyscale = true;
        metrics.sums = {0, 0, 0};
@ -97,7 +97,7 @@ template <int N, int M> class ColorBlock  {
        unsigned total = 0;

        for (unsigned i = 0; i < M * N; i++) {
-            Color val = Get(i);
+            OldColor val = Get(i);
            bool is_black = val.IsBlack();

            metrics.has_black |= is_black;
@ -118,7 +118,7 @@ template <int N, int M> class ColorBlock  {
    }

   private:
-    std::array<Color, N * M> _pixels;
+    std::array<OldColor, N * M> _pixels;
 };

 }  // namespace quicktex
--- a/quicktex/Decoder.h
+++ b/quicktex/Decoder.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich 2020 <richgel99@gmail.com>
    and licenced under the public domain

@ -22,7 +22,7 @@
 #include <memory>

 #include "ColorBlock.h"
-#include "Texture.h"
+#include "texture/RawTexture.h"

 namespace quicktex {

@ -46,19 +46,19 @@ template <class T> class BlockDecoder : public Decoder<T> {
    virtual DecodedBlock DecodeBlock(const EncodedBlock &block) const = 0;

    virtual RawTexture Decode(const T &encoded) const override {
-        auto decoded = RawTexture(encoded.Width(), encoded.Height());
+        auto decoded = RawTexture(encoded.width, encoded.height);

-        int blocks_x = encoded.BlocksX();
-        int blocks_y = encoded.BlocksY();
+        int blocks_x = encoded.bwidth();
+        int blocks_y = encoded.bheight();

        // from experimentation, multithreading this using OpenMP actually makes decoding slower
        // due to thread creation/teardown taking longer than the decoding process itself.
        // As a result, this is left as a serial operation despite being embarassingly parallelizable
        for (int y = 0; y < blocks_y; y++) {
            for (int x = 0; x < blocks_x; x++) {
-                auto block = encoded.GetBlock(x, y);
+                auto block = encoded.get_block(x, y);
                auto pixels = DecodeBlock(block);
-                decoded.SetBlock<BlockWidth, BlockHeight>(x, y, pixels);
+                decoded.set_block<BlockWidth, BlockHeight>(x, y, pixels);
            }
        }

--- a/quicktex/Encoder.h
+++ b/quicktex/Encoder.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich 2020 <richgel99@gmail.com>
    and licenced under the public domain

@ -22,7 +22,7 @@
 #include <memory>

 #include "ColorBlock.h"
-#include "Texture.h"
+#include "texture/RawTexture.h"

 namespace quicktex {

@ -46,21 +46,22 @@ template <typename T> class BlockEncoder : public Encoder<T> {
    virtual EncodedBlock EncodeBlock(const DecodedBlock &block) const = 0;

    virtual T Encode(const RawTexture &decoded) const override {
-        auto encoded = T(decoded.Width(), decoded.Height());
+        auto encoded = T(decoded.width, decoded.height);

-        int blocks_x = encoded.BlocksX();
-        int blocks_y = encoded.BlocksY();
+        unsigned blocks_x = encoded.bwidth();
+        unsigned blocks_y = encoded.bheight();

        // from experimentation, multithreading this using OpenMP sometimes actually makes encoding slower
        // due to thread creation/teardown taking longer than the encoding process itself.
        // As a result, this is sometimes left as a serial operation despite being embarassingly parallelizable
        // threshold for number of blocks before multithreading is set by overriding MTThreshold()
 #pragma omp parallel for if (blocks_x * blocks_y >= MTThreshold())
-        for (int y = 0; y < blocks_y; y++) {
-            for (int x = 0; x < blocks_x; x++) {
-                auto pixels = decoded.GetBlock<BlockWidth, BlockHeight>(x, y);
+        for (int y = 0; y < (int)blocks_y; y++) {
+            for (int x = 0; x < (int)blocks_x; x++) {
+                // index variables have to be signed for MSVC for some reason
+                auto pixels = decoded.get_block<BlockWidth, BlockHeight>(x, y);
                auto block = EncodeBlock(pixels);
-                encoded.SetBlock(x, y, block);
+                encoded.set_block(x, y, block);
            }
        }

--- a/quicktex/Matrix.h
+++ b/quicktex/Matrix.h
@ -0,0 +1,457 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <numeric>
+#include <xsimd/xsimd.hpp>
+
+#include "util/iterator.h"
+#include "util/map.h"
+#include "util/math.h"
+#include "util/ranges.h"
+
+namespace quicktex {
+
+template <typename T, int M, int N> class Matrix;
+
+template <typename T, int M> using Vec = Matrix<T, M, 1>;
+
+// region helper concepts
+template <typename L, typename R, typename Op>
+concept operable = requires(L &l, R &r, Op &op) { op(l, r); };
+
+// Intended to be satisfied only by (possibly cv/ref-qualified) instantiations of Matrix.
+// NOTE(review): Matrix declares width and height as static data members rather than functions, and
+// value_type is a type, so the V::width(), V::height() and V::value_type requirements below look like
+// they can never be satisfied, which would leave this concept false for every V -- confirm, and
+// consider `V::width; V::height; typename V::value_type;` (and V::height / V::width in the same_as).
+template <typename V>
+concept is_matrix = requires(V &v) {
+                        V::width();
+                        V::height();
+                        V::value_type;
+                    } && std::same_as < Matrix<typename V::value_type, V::height(), V::width()>,
+std::remove_cvref_t < V >> ;
+
+template <typename V> struct vector_stats {
+    static constexpr int width = 1;
+    static constexpr int height = 1;
+    static constexpr int dims = 0;
+};
+
+template <typename V>
+    requires is_matrix<V>
+struct vector_stats<V> {
+    static constexpr int width = V::width;
+    static constexpr int height = V::height;
+    static constexpr int dims = V::dims;
+};
+
+template <typename V> constexpr int vector_width = vector_stats<V>::width;
+template <typename V> constexpr int vector_height = vector_stats<V>::height;
+template <typename V> constexpr int vector_dims = vector_stats<V>::dims;
+
+// endregion
+
+/**
+ * Storage base for Matrix: a fixed-size array holding N entries of type R.
+ * @tparam R type of each stored entry
+ * @tparam T scalar type used to fill the storage on construction
+ * @tparam N number of stored entries
+ */
+template <typename R, typename T, int N> class VecBase {
+   public:
+    // fill every entry with the given scalar (converted to R); defaults to a value-initialized T
+    constexpr VecBase(T scalar = T()) : _c{} { _c.fill(scalar); }
+
+   protected:
+    // bounds-checked entry access, forwarded to std::array::at
+    const R &_at(int index) const { return _c.at(index); }
+    R &_at(int index) { return _c.at(index); }
+
+    // raw pointers to the first entry and to one past the last entry
+    constexpr auto _begin() const { return _c.data(); }
+    constexpr auto _begin() { return _c.data(); }
+    constexpr auto _end() const { return _c.data() + N; }
+    constexpr auto _end() { return _c.data() + N; }
+
+   private:
+    std::array<R, N> _c;
+};
+
+template <typename T, int N, int M> using matrix_row_type = std::conditional_t<N <= 1, T, Vec<T, N>>;
+template <typename T, int N, int M> using matrix_column_type = std::conditional_t<M <= 1, T, Vec<T, M>>;
+
+/**
+ * A matrix of values that can be operated on
+ * @tparam T Scalar type
+ * @tparam M Height of the matrix (number of rows)
+ * @tparam N Width of the matrix (number of columns)
+ */
+template <typename T, int M, int N>
+class Matrix : public VecBase<std::conditional_t<N == 1, T, VecBase<T, T, N>>, T, M> {
+   public:
+    using base = VecBase<std::conditional_t<N == 1, T, VecBase<T, T, N>>, T, M>;
+
+    using value_type = T;
+    using row_type = matrix_row_type<T, N, M>;
+    using column_type = matrix_column_type<T, N, M>;
+
+    using base::base;
+    //    using base::begin;
+    //    using base::end;
+    //    using base::operator[];
+
+    // region constructors
+    /**
+     * Create a matrix from an initializer list of rows (plain scalars when the matrix has a single column)
+     * @param il rows to populate with; must contain exactly M entries, which is checked with an assert
+     */
+    Matrix(std::initializer_list<row_type> il) : base() {
+        assert(il.size() == M);  // ensure il is of the right size
+        std::copy_n(il.begin(), M, this->begin());
+    }
+
+    /**
+     * Create a vector from a scalar value
+     * @param scalar value to populate with
+     */
+    //    constexpr Matrix(const T &scalar) { std::fill(this->begin(), this->end(), scalar); }
+
+    /**
+     * Create a vector from an iterator
+     * @tparam II input iterator type
+     * @param input_iterator iterator to copy from
+     */
+    template <typename II>
+        requires std::input_iterator<II> && std::convertible_to<std::iter_value_t<II>,
+                                                                const row_type>
+        Matrix(const II input_iterator) : base() {
+        std::copy_n(input_iterator, M, this->begin());
+    }
+
+    /**
+     * Create a vector from a range type
+     * @tparam R Range type
+     * @param input_range Range to copy from
+     */
+    template <typename R>
+    Matrix(const R &input_range)
+        requires range<R> && std::convertible_to<typename R::value_type, row_type>
+    : Matrix(input_range.begin()) {
+        assert(std::distance(input_range.begin(), input_range.end()) == M);
+    }
+
+    template <typename R = T>
+        requires(N == M)
+    static constexpr Matrix identity() {
+        Matrix result = Matrix(0);
+        for (int i = 0; i < N; i++) { result.element(i, i) = 1; }
+        return result;
+    }
+    // endregion
+
+    // region iterators and accessors
+    static constexpr int size() { return M; }
+    static constexpr int width = N;
+    static constexpr int height = M;
+    static constexpr int elements = N * M;
+    static constexpr int dims = ((width > 1) ? 1 : 0) + ((height > 1) ? 1 : 0);
+
+    const row_type &at(int index) const {
+        assert(index >= 0 && index < M);
+        return static_cast<const row_type &>(base::_at(index));
+    }
+    row_type &at(int index) {
+        assert(index >= 0 && index < M);
+        return static_cast<row_type &>(base::_at(index));
+    }
+
+    const row_type &operator[](int index) const { return at(index); }
+    row_type &operator[](int index) { return at(index); }
+
+    const row_type *begin() const { return static_cast<const row_type *>(base::_begin()); }
+    row_type *begin() { return static_cast<row_type *>(base::_begin()); }
+
+    const row_type *end() const { return static_cast<const row_type *>(base::_end()); }
+    row_type *end() { return static_cast<row_type *>(base::_end()); }
+
+    auto column_begin() const { return column_iterator(this, 0); }
+    auto column_end() const { return column_iterator(this, N); }
+
+    auto all_begin() const { return linear_iterator<const Matrix>(this, 0); }
+    auto all_begin() { return linear_iterator<Matrix>(this, 0); }
+
+    auto all_end() const { return linear_iterator<const Matrix>(this, N * M); }
+    auto all_end() { return linear_iterator<Matrix>(this, N * M); }
+
+    const row_type &get_row(int m) const { return static_cast<const row_type &>(this->at(m)); }
+    template <typename R> void set_row(int m, const R &value) { this->at(m) = value; }
+
+    template <typename S = T> column_type get_column(int n) const {
+        if constexpr (M == 1) {
+            return element(0, n);
+        } else {
+            column_type ret;
+            for (int m = 0; m < M; m++) { ret[m] = element(m, n); }
+            return ret;
+        }
+    }
+
+    void set_column(int n, const column_type &value) {
+        if constexpr (M == 1) {
+            element(0, n) = value;
+        } else {
+            for (int m = 0; m < M; m++) { element(m, n) = value[m]; }
+        }
+    }
+
+    // m/n accessors: m selects the row and n selects the column
+    const T &element(int m, int n) const {
+        if constexpr (N == 1) {
+            // a single-column matrix stores scalars directly, so n is not used
+            return this->at(m);
+        } else {
+            return this->at(m)[n];
+        }
+    }
+
+    // non-const overload: forwards to the const overload above and strips constness from the result
+    T &element(int m, int n) { return const_cast<T &>(static_cast<const Matrix &>(*this).element(m, n)); }
+
+    // linear accessors
+    const T &element(int i) const { return element(i / N, i % N); }
+    T &element(int i) { return element(i / N, i % N); }
+
+    // RGBA accessors
+    const T &r() const { return (*this)[0]; }
+    T &r() { return this->at(0); }
+    template <typename S = T> std::enable_if_t<M >= 2, const S &> g() const { return this->at(1); }
+    template <typename S = T> std::enable_if_t<M >= 2, S &> g() { return this->at(1); }
+    template <typename S = T> std::enable_if_t<M >= 3, const S &> b() const { return this->at(2); }
+    template <typename S = T> std::enable_if_t<M >= 3, S &> b() { return this->at(2); }
+    template <typename S = T> std::enable_if_t<M >= 4, const S &> a() const { return this->at(3); }
+    template <typename S = T> std::enable_if_t<M >= 4, S &> a() { return this->at(3); }
+
+    // XYZW accessors
+    const T &x() const { return this->at(0); }
+    T &x() { return this->at(0); }
+    template <typename S = T> std::enable_if_t<M >= 2, const S &> y() const { return this->at(1); }
+    template <typename S = T> std::enable_if_t<M >= 2, S &> y() { return this->at(1); }
+    template <typename S = T> std::enable_if_t<M >= 3, const S &> z() const { return this->at(2); }
+    template <typename S = T> std::enable_if_t<M >= 3, S &> z() { return this->at(2); }
+    template <typename S = T> std::enable_if_t<M >= 4, const S &> w() const { return this->at(3); }
+    template <typename S = T> std::enable_if_t<M >= 4, S &> w() { return this->at(3); }
+    // endregion
+
+    template <typename R>
+        requires std::equality_comparable_with<T, R> bool
+    operator==(const Matrix<R, M, N> &rhs) const {
+        return size() == rhs.size() && std::equal(this->begin(), this->end(), rhs.begin());
+    };
+
+    // unary vector negation
+    template <typename S = T>
+        requires(!std::unsigned_integral<T>) && requires(T &t) { -t; }
+    Matrix operator-() const {
+        return map(std::negate(), *this);
+    };
+
+    // add vectors
+    template <typename R>
+        requires operable<R, T, std::plus<>>
+    Matrix operator+(const Matrix<R, M, N> &rhs) const {
+        return map(std::plus(), *this, rhs);
+    };
+
+    // subtract vectors
+    template <typename R>
+        requires operable<R, T, std::minus<>>
+    Matrix operator-(const Matrix<R, M, N> &rhs) const {
+        // we can't just add the negation because that's invalid for int types
+        return map(std::minus(), *this, rhs);
+    };
+
+    // multiply matrix with a matrix or column vector
+    template <typename R, int P>
+        requires(P == 1 || P == N) && operable<R, T, std::multiplies<>>
+    Matrix operator*(const Matrix<R, M, P> &rhs) const {
+        return map(std::multiplies(), *this, rhs);
+    };
+
+    // multiply matrix with a scalar
+    template <typename R>
+        requires operable<R, T, std::multiplies<>>
+    Matrix operator*(const R &rhs) const {
+        return map(std::multiplies(), *this, rhs);
+    };
+
+    // divides a matrix by a matrix or column vector
+    template <typename R, int NN>
+        requires(NN == 1 || NN == N) && operable<R, T, std::divides<>>
+    Matrix operator/(const Matrix<R, M, NN> &rhs) const {
+        return map(std::divides(), *this, rhs);
+    };
+
+    // divides a matrix by a scalar
+    template <typename R>
+        requires operable<R, T, std::divides<>>
+    Matrix operator/(const R &rhs) const {
+        return map(std::divides(), *this, rhs);
+    };
+
+    // add-assigns a matrix with a matrix
+    template <typename R>
+        requires operable<Matrix, R, std::plus<>>
+    Matrix &operator+=(const R &rhs) {
+        return *this = *this + rhs;
+    }
+
+    // subtract-assigns a matrix with a matrix
+    template <typename R>
+        requires operable<Matrix, R, std::minus<>>
+    Matrix &operator-=(const R &rhs) {
+        return *this = *this - rhs;
+    }
+
+    // multiply-assigns a matrix with a matrix, column vector, or a scalar
+    template <typename R>
+        requires operable<Matrix, R, std::multiplies<>>
+    Matrix &operator*=(const R &rhs) {
+        return *this = *this * rhs;
+    }
+
+    // divide-assigns a matrix by a matrix, column vector, or a scalar
+    template <typename R>
+        requires operable<Matrix, R, std::divides<>>
+    Matrix &operator/=(const R &rhs) {
+        return *this = *this / rhs;
+    }
+
+    // decay a 1x1 matrix to a scalar on demand
+    template <typename S = T>
+        requires(N == 1 && M == 1)
+    operator S &() {
+        return this->at(0);
+    }
+    template <typename S = T>
+        requires(N == 1 && M == 1)
+    operator const S &() const {
+        return this->at(0);
+    }
+
+    // sum up all columns, producing one total per row
+    column_type hsum() const {
+        if constexpr (N == 1) {
+            // a single column is already its own horizontal sum
+            return *this;
+        } else if constexpr (M == 1) {
+            // a single row collapses to the sum of all of its values
+            return sum();
+        } else {
+            // general case: sum each row individually
+            return _map<column_type>([](auto row) { return quicktex::sum(row); }, *this);
+        }
+    }
+
+    // sum up all rows, producing one total per column
+    row_type vsum() const {
+        if constexpr (N == 1) {
+            // a single column collapses to the sum of all of its values
+            return sum();
+        } else if constexpr (M == 1) {
+            // the only row is already the sum; *this is a 1xN Matrix, which is not row_type
+            return this->at(0);
+        } else {
+            return std::accumulate(begin(), end(), row_type{});
+        }
+    }
+
+    // sum up all values
+    T sum() const {
+        // TODO: reintroduce SIMDing for this
+        return std::accumulate(all_begin(), all_end(), T(0));
+    }
+
+    template <typename R, int P>
+        requires operable<R, T, std::multiplies<>>
+    Matrix<T, M, P> mult(const Matrix<R, N, P> &rhs) const {
+        Matrix<T, M, P> res(0);
+        for (int p = 0; p < P; p++) {
+            // for each column of the RHS/Result
+            for (int m = 0; m < M; m++) {
+                // for each row of the LHS/Result
+                for (int n = 0; n < N; n++) { res.element(m, p) += element(m, n) * rhs.element(n, p); }
+            }
+        }
+        return res;
+    }
+
+    Matrix<T, N, M> transpose() const {
+        Matrix<T, N, M> res;
+        for (int m = 0; m < M; m++) { res.set_column(m, get_row(m)); }
+        return res;
+    }
+
+    template <typename R = T>
+        requires(N == M)
+    Matrix mirror() const {
+        Matrix result = *this;
+        for (int n = 0; n < N - 1; n++) {
+            for (int m = (n + 1); m < M; m++) { result.element(m, n) = result.element(n, m); }
+        }
+        return result;
+    }
+
+    // dot product of two compatible matrices
+    // only available for column vectors (N == 1); multiplies the operands with operator* and sums the result
+    template <typename R>
+        requires(N == 1) && operable<T, R, std::multiplies<>> && operable<T, T, std::plus<>>
+    inline row_type dot(const Matrix<R, M, N> &rhs) const {
+        // technically this is Lt * R, but the vsum method is probably faster/more readable
+        // than allocating a new transpose matrix
+        Matrix product = *this * rhs;
+        return product.vsum();
+    }
+
+    inline row_type sqr_mag() const { return dot(*this); }
+
+    inline Matrix abs() const {
+        return map([](auto c) { return quicktex::abs(c); }, *this);
+    }
+
+    // returns a copy with quicktex::clamp(c, low, high) applied through map; this matrix is not modified
+    inline Matrix clamp(T low, T high) const {
+        return map([low, high](auto c) { return quicktex::clamp(c, low, high); }, *this);
+    }
+    // same as above, but low and high are matrices passed to map as additional operands
+    inline Matrix clamp(const Matrix &low, const Matrix &high) const {
+        return map([](auto c, auto l, auto h) { return quicktex::clamp(c, l, h); }, *this, low, high);
+    }
+
+   protected:
+    /**
+     * Iterator over the columns of a matrix.
+     * Columns are not stored contiguously, so dereferencing builds each column on demand with get_column.
+     */
+    class column_iterator : public index_iterator_base<column_iterator, column_type> {
+       public:
+        using value_type = column_type;
+        using base = index_iterator_base<column_iterator, column_type>;
+
+        column_iterator(const Matrix *matrix = nullptr, int index = 0) : base(index), _matrix(matrix) {}
+
+        // returns the current column by value
+        column_type operator*() const { return _matrix->get_column(this->_index); }
+
+        // get_column returns a temporary, so the column is stored inside the iterator to give the
+        // returned pointer something to refer to. The pointer is only valid while this iterator is
+        // alive and until the next call to operator->.
+        const column_type *operator->() const {
+            _column = _matrix->get_column(this->_index);
+            return &_column;
+        }
+
+        // iterators are equal when they refer to the same matrix and the same column index
+        friend bool operator==(const column_iterator &lhs, const column_iterator &rhs) {
+            return (lhs._matrix == rhs._matrix) && (lhs._index == rhs._index);
+        }
+
+       private:
+        const Matrix *_matrix;
+        mutable column_type _column = column_type();  // backing storage for operator->
+    };
+
+    /**
+     * Iterator over every element of a matrix, addressed through element(int) with a single linear index.
+     * @tparam V matrix type being iterated, const-qualified for read-only iteration
+     */
+    template <typename V> class linear_iterator : public index_iterator_base<linear_iterator<V>, T> {
+       public:
+        using value_type = T;
+        using base = index_iterator_base<linear_iterator<V>, T>;
+
+        linear_iterator(V *matrix = nullptr, int index = 0) : base(index), _matrix(matrix) {}
+
+        // const like operator->, so that a const iterator can also be dereferenced;
+        // constness of the returned reference follows V, not the iterator
+        auto &operator*() const { return _matrix->element(this->_index); }
+        auto *operator->() const { return &(_matrix->element(this->_index)); }
+
+        // iterators are equal when they refer to the same matrix and the same linear index
+        friend bool operator==(const linear_iterator &lhs, const linear_iterator &rhs) {
+            return (lhs._matrix == rhs._matrix) && (lhs._index == rhs._index);
+        }
+
+       private:
+        V *_matrix;
+    };
+};
+}  // namespace quicktex
--- a/quicktex/Matrix4x4.cpp
+++ b/quicktex/Matrix4x4.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

--- a/quicktex/Matrix4x4.h
+++ b/quicktex/Matrix4x4.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

--- a/quicktex/OldColor.cpp
+++ b/quicktex/OldColor.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -16,18 +16,19 @@
    You should have received a copy of the GNU Lesser General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include "Color.h"
+#include "OldColor.h"

 #include <algorithm>
 #include <stdexcept>

 #include "Vector4.h"
 #include "Vector4Int.h"
-#include "util.h"  // for scale5To8, scale8To5, assert5bit, scale6To8
+#include "util/bitbash.h"
+#include "util/math.h"  // for scale_to_8<5>, scale_from_8<5>, assert5bit, scale_to_8<6>

 namespace quicktex {

-Color::Color(Vector4Int v) {
+OldColor::OldColor(Vector4Int v) {
    if (v.MaxAbs() > 0xFF) throw std::invalid_argument("Vector members out of range");
    for (int i = 0; i < 4; i++) {
        if (v[i] < 0) throw std::range_error("Color members cannot be negative");
@ -39,40 +40,42 @@ Color::Color(Vector4Int v) {
    a = static_cast<uint8_t>(v[3]);
 }

-uint16_t Color::Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b) {
+uint16_t OldColor::Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b) {
    assert5bit(r);
    assert6bit(g);
    assert5bit(b);
    return static_cast<uint16_t>(b | (g << 5) | (r << 11));
 }

-uint16_t Color::Pack565(uint8_t r, uint8_t g, uint8_t b) { return Pack565Unscaled(scale8To5(r), scale8To6(g), scale8To5(b)); }
+uint16_t OldColor::Pack565(uint8_t r, uint8_t g, uint8_t b) {
+    return Pack565Unscaled(scale_from_8<5>(r), scale_from_8<6>(g), scale_from_8<5>(b));
+}

-Color Color::Unpack565Unscaled(uint16_t Packed) {
+OldColor OldColor::Unpack565Unscaled(uint16_t Packed) {
    uint8_t r = (Packed >> 11) & 0x1F;
    uint8_t g = (Packed >> 5) & 0x3F;
    uint8_t b = Packed & 0x1F;

-    return Color(r, g, b);
+    return OldColor(r, g, b);
 }

-Color Color::Unpack565(uint16_t Packed) {
-    uint8_t r = static_cast<uint8_t>(scale5To8((Packed >> 11) & 0x1FU));
-    uint8_t g = static_cast<uint8_t>(scale6To8((Packed >> 5) & 0x3FU));
-    uint8_t b = static_cast<uint8_t>(scale5To8(Packed & 0x1FU));
+OldColor OldColor::Unpack565(uint16_t Packed) {
+    uint8_t r = static_cast<uint8_t>(scale_to_8<5>((Packed >> 11) & 0x1FU));
+    uint8_t g = static_cast<uint8_t>(scale_to_8<6>((Packed >> 5) & 0x3FU));
+    uint8_t b = static_cast<uint8_t>(scale_to_8<5>(Packed & 0x1FU));

-    return Color(r, g, b);
+    return OldColor(r, g, b);
 }

-Color Color::PreciseRound565(Vector4 &v) {
+OldColor OldColor::PreciseRound565(Vector4 &v) {
    int trial_r = (int)(v[0] * UINT5_MAX);
    int trial_g = (int)(v[1] * UINT6_MAX);
    int trial_b = (int)(v[2] * UINT5_MAX);

    // clamp to prevent weirdness with slightly out of bounds float values
-    uint8_t r = (uint8_t)clampi(trial_r, 0, UINT5_MAX);
-    uint8_t g = (uint8_t)clampi(trial_g, 0, UINT6_MAX);
-    uint8_t b = (uint8_t)clampi(trial_b, 0, UINT5_MAX);
+    uint8_t r = (uint8_t)clamp<int>(trial_r, 0, UINT5_MAX);
+    uint8_t g = (uint8_t)clamp<int>(trial_g, 0, UINT6_MAX);
+    uint8_t b = (uint8_t)clamp<int>(trial_b, 0, UINT5_MAX);

    // increment each channel if above the rounding point
    r += v[0] > Midpoints5bit[r];
@ -83,46 +86,36 @@ Color Color::PreciseRound565(Vector4 &v) {
    assert6bit(g);
    assert5bit(b);

-    return Color(r, g, b);
+    return OldColor(r, g, b);
 }

-void Color::SetRGB(uint8_t vr, uint8_t vg, uint8_t vb) {
+void OldColor::SetRGB(uint8_t vr, uint8_t vg, uint8_t vb) {
    r = vr;
    g = vg;
    b = vb;
 }

-size_t Color::MinChannelRGB() {
-    if (r <= g && r <= b) return 0;
-    if (g <= b && g <= r) return 1;
-    return 2;
-}
-
-size_t Color::MaxChannelRGB() {
+size_t OldColor::MaxChannelRGB() {
    if (r >= g && r >= b) return 0;
    if (g >= b && g >= r) return 1;
    return 2;
 }

-Color Color::Min(const Color &A, const Color &B) { return Color(std::min(A[0], B[0]), std::min(A[1], B[1]), std::min(A[2], B[2]), std::min(A[3], B[3])); }
-
-Color Color::Max(const Color &a, const Color &b) { return Color(std::max(a[0], b[0]), std::max(a[1], b[1]), std::max(a[2], b[2]), std::max(a[3], b[3])); }
-
-Color::operator Vector4() const { return Vector4(r, g, b, a); }
-Color::operator Vector4Int() const { return Vector4Int(r, g, b, a); }
-Vector4Int operator-(const Color &lhs, const Color &rhs) {
+OldColor::operator Vector4() const { return Vector4(r, g, b, a); }
+OldColor::operator Vector4Int() const { return Vector4Int(r, g, b, a); }
+Vector4Int operator-(const OldColor &lhs, const OldColor &rhs) {
    Vector4Int result;
    for (unsigned i = 0; i < 4; i++) { result[i] = (int)lhs[i] - rhs[i]; }
    return result;
 }

-uint16_t Color::Pack565() const { return Pack565(r, g, b); }
-uint16_t Color::Pack565Unscaled() const { return Pack565Unscaled(r, g, b); }
+uint16_t OldColor::Pack565() const { return Pack565(r, g, b); }
+uint16_t OldColor::Pack565Unscaled() const { return Pack565Unscaled(r, g, b); }

-Color Color::ScaleTo565() const { return Color(scale8To5(r), scale8To6(g), scale8To5(b)); }
-Color Color::ScaleFrom565() const { return Color(scale5To8(r), scale6To8(g), scale5To8(b)); }
+OldColor OldColor::ScaleTo565() const { return OldColor(scale_from_8<5>(r), scale_from_8<6>(g), scale_from_8<5>(b)); }
+OldColor OldColor::ScaleFrom565() const { return OldColor(scale_to_8<5>(r), scale_to_8<6>(g), scale_to_8<5>(b)); }

-bool Color::operator==(const Color &Rhs) const { return r == Rhs.r && g == Rhs.g && b == Rhs.b && a == Rhs.a; }
-bool Color::operator!=(const Color &Rhs) const { return !(Rhs == *this); }
+bool OldColor::operator==(const OldColor &Rhs) const { return r == Rhs.r && g == Rhs.g && b == Rhs.b && a == Rhs.a; }
+bool OldColor::operator!=(const OldColor &Rhs) const { return !(Rhs == *this); }

 }  // namespace quicktex
--- a/quicktex/OldColor.h
+++ b/quicktex/OldColor.h
@ -0,0 +1,114 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <cassert>  // for assert
+#include <cstddef>  // for size_t
+#include <cstdint>  // for uint8_t, uint16_t
+
+#include "Matrix.h"
+
+namespace quicktex {
+class Vector4;
+class Vector4Int;
+
+#pragma pack(push, 1)
+class OldColor {
+    // 8-bit-per-channel RGBA colour. The four channel bytes are declared in r, g, b, a order and operator[]
+    // below addresses them by reinterpreting `this` as a byte array, so the member order must not change.
+   public:
+    uint8_t r;
+    uint8_t g;
+    uint8_t b;
+    uint8_t a;
+
+    // Default colour: all colour channels zero, alpha 0xFF
+    constexpr OldColor() : OldColor(0, 0, 0, 0xFF) {}
+
+    // Construct from individual channels; alpha falls back to 0xFF when omitted
+    constexpr OldColor(uint8_t vr, uint8_t vg, uint8_t vb, uint8_t va = 0xFF) : r(vr), g(vg), b(vb), a(va) {}
+
+    // Declared here, defined out of line
+    OldColor(Vector4Int v);
+
+    // 5:6:5 packing helpers, defined out of line.
+    // NOTE(review): presumably the "Unscaled" variants expect channels already reduced to 5/6/5 bits - confirm
+    static uint16_t Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b);
+    static uint16_t Pack565(uint8_t r, uint8_t g, uint8_t b);
+
+    static OldColor Unpack565Unscaled(uint16_t Packed);
+    static OldColor Unpack565(uint16_t Packed);
+
+    static OldColor PreciseRound565(Vector4 &v);
+
+    static OldColor Min(const OldColor &A, const OldColor &B);
+    static OldColor Max(const OldColor &A, const OldColor &B);
+
+    // Equality compares all four channels, alpha included; operator!= is its negation
+    bool operator==(const OldColor &Rhs) const;
+    bool operator!=(const OldColor &Rhs) const;
+
+    // Channel access by index: 0 = r, 1 = g, 2 = b, 3 = a. The index is only checked by assert.
+    uint8_t operator[](size_t index) const {
+        assert(index < 4);
+        return reinterpret_cast<const uint8_t *>(this)[index];
+    }
+    uint8_t &operator[](size_t index) {
+        assert(index < 4);
+        return reinterpret_cast<uint8_t *>(this)[index];
+    }
+
+    // Conversions to the vector types copy (r, g, b, a) component-wise
+    operator Vector4() const;
+    operator Vector4Int() const;
+    // Per-channel signed difference lhs[i] - rhs[i], computed in int
+    friend Vector4Int operator-(const OldColor &lhs, const OldColor &rhs);
+
+    void SetRGB(uint8_t vr, uint8_t vg, uint8_t vb);
+    // Copies only r, g and b from `other`, forwarding to the three-argument overload
+    void SetRGB(const OldColor &other) { SetRGB(other.r, other.g, other.b); }
+
+    // Instance forms forward this colour's r, g, b to the static helpers of the same name
+    uint16_t Pack565() const;
+    uint16_t Pack565Unscaled() const;
+
+    // Rescale r/g/b between 8-bit and 5/6/5-bit ranges; the result is built from three channels only,
+    // so its alpha is the constructor default 0xFF
+    OldColor ScaleTo565() const;
+    OldColor ScaleFrom565() const;
+
+    size_t MinChannelRGB();
+    size_t MaxChannelRGB();
+
+    bool IsGrayscale() const { return ((r == g) && (r == b)); }
+    // True when no bit above the two lowest is set in any of r, g, b, i.e. every colour channel is below 4
+    bool IsBlack() const { return (r | g | b) < 4; }
+
+    // 16.16 fixed-point weighted sum: the three weights add up to 65536 and + 32768 rounds to nearest
+    int GetLuma() const { return (13938U * r + 46869U * g + 4729U * b + 32768U) >> 16U; }  // REC709 weightings
+
+    // Interop with the Vec<uint8_t, 4> colour representation: channels are copied in r, g, b, a order
+    operator Vec<uint8_t, 4>() const { return {r, g, b, a}; }
+
+    OldColor(const Vec<uint8_t, 4> v) {
+        r = v.r();
+        g = v.g();
+        b = v.b();
+        a = v.a();
+    }
+
+   private:
+    // Lookup tables with one entry per 5-bit (32) and 6-bit (64) level; the last entry of each is the
+    // sentinel 1e+37f. NOTE(review): presumably consumed by PreciseRound565 - confirm in the .cpp
+    static constexpr float Midpoints5bit[32] = {
+        .015686f, .047059f, .078431f, .111765f, .145098f, .176471f, .207843f, .241176f, .274510f, .305882f, .337255f,
+        .370588f, .403922f, .435294f, .466667f, .5f,      .533333f, .564706f, .596078f, .629412f, .662745f, .694118f,
+        .725490f, .758824f, .792157f, .823529f, .854902f, .888235f, .921569f, .952941f, .984314f, 1e+37f};
+    static constexpr float Midpoints6bit[64] = {
+        .007843f, .023529f, .039216f, .054902f, .070588f, .086275f, .101961f, .117647f, .133333f, .149020f, .164706f,
+        .180392f, .196078f, .211765f, .227451f, .245098f, .262745f, .278431f, .294118f, .309804f, .325490f, .341176f,
+        .356863f, .372549f, .388235f, .403922f, .419608f, .435294f, .450980f, .466667f, .482353f, .500000f, .517647f,
+        .533333f, .549020f, .564706f, .580392f, .596078f, .611765f, .627451f, .643137f, .658824f, .674510f, .690196f,
+        .705882f, .721569f, .737255f, .754902f, .772549f, .788235f, .803922f, .819608f, .835294f, .850980f, .866667f,
+        .882353f, .898039f, .913725f, .929412f, .945098f, .960784f, .976471f, .992157f, 1e+37f};
+
+
+};
+#pragma pack(pop)
+}  // namespace quicktex
--- a/quicktex/Texture.h
+++ b/quicktex/Texture.h
@ -1,187 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <array>
-#include <climits>
-#include <cstdint>
-#include <cstdio>
-#include <cstring>
-#include <memory>
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#include <vector>
-
-#include "Color.h"
-#include "ColorBlock.h"
-
-namespace quicktex {
-
-class Texture {
-   public:
-    virtual ~Texture() = default;
-
-    virtual int Width() const { return _width; }
-    virtual int Height() const { return _height; }
-    virtual std::tuple<int, int> Size() const { return std::tuple<int, int>(_width, _height); }
-
-    /**
-     * The texture's total size
-     * @return The size of the texture in bytes.
-     */
-    virtual size_t NBytes() const noexcept = 0;
-
-    virtual const uint8_t *Data() const noexcept = 0;
-    virtual uint8_t *Data() noexcept = 0;
-
-   protected:
-    Texture(int width, int height) : _width(width), _height(height) {
-        if (width <= 0) throw std::invalid_argument("Texture width must be greater than 0");
-        if (height <= 0) throw std::invalid_argument("Texture height must be greater than 0");
-    }
-
-    int _width;
-    int _height;
-};
-
-class RawTexture : public Texture {
-    using Base = Texture;
-
-   public:
-    /**
-     * Create a new RawTexture
-     * @param width width of the texture in pixels
-     * @param height height of the texture in pixels
-     */
-    RawTexture(int width, int height) : Base(width, height), _pixels(_width * _height) {}
-
-    Color GetPixel(int x, int y) const {
-        if (x < 0 || x >= _width) throw std::invalid_argument("x value out of range.");
-        if (y < 0 || y >= _height) throw std::invalid_argument("y value out of range.");
-        return _pixels.at(x + (y * _width));
-    }
-
-    void SetPixel(int x, int y, Color val) {
-        if (x < 0 || x >= _width) throw std::invalid_argument("x value out of range.");
-        if (y < 0 || y >= _height) throw std::invalid_argument("y value out of range.");
-        _pixels.at(x + (y * _width)) = val;
-    }
-
-    size_t NBytes() const noexcept override { return static_cast<unsigned long>(Width() * Height()) * sizeof(Color); }
-
-    template <int N, int M> ColorBlock<N, M> GetBlock(int block_x, int block_y) const {
-        if (block_x < 0) throw std::out_of_range("x value out of range.");
-        if (block_y < 0) throw std::out_of_range("y value out of range.");
-
-        // coordinates in the image of the top-left pixel of the selected block
-        ColorBlock<N, M> block;
-        int pixel_x = block_x * N;
-        int pixel_y = block_y * M;
-
-        if (pixel_x + N < _width && pixel_y + M < _height) {
-            // fast memcpy if the block is entirely inside the bounds of the texture
-            for (int y = 0; y < M; y++) {
-                // copy each row into the ColorBlock
-                block.SetRow(y, &_pixels[pixel_x + (_width * (pixel_y + y))]);
-            }
-        } else {
-            // slower pixel-wise copy if the block goes over the edges
-            for (int x = 0; x < N; x++) {
-                for (int y = 0; y < M; y++) { block.Set(x, y, GetPixel((pixel_x + x) % _width, (pixel_y + y) % _height)); }
-            }
-        }
-
-        return block;
-    }
-
-    template <int N, int M> void SetBlock(int block_x, int block_y, const ColorBlock<N, M> &block) {
-        if (block_x < 0) throw std::out_of_range("x value out of range.");
-        if (block_y < 0) throw std::out_of_range("y value out of range.");
-
-        // coordinates in the image of the top-left pixel of the selected block
-        int pixel_x = block_x * N;
-        int pixel_y = block_y * M;
-
-        if (pixel_x + N < _width && pixel_y + M < _height) {
-            // fast row-wise memcpy if the block is entirely inside the bounds of the texture
-            for (int y = 0; y < M; y++) {
-                // copy each row out of the ColorBlock
-                block.GetRow(y, &_pixels[pixel_x + (_width * (pixel_y + y))]);
-            }
-        } else {
-            // slower pixel-wise copy if the block goes over the edges
-            for (int x = 0; x < N; x++) {
-                for (int y = 0; y < M; y++) { SetPixel((pixel_x + x) % _width, (pixel_y + y) % _height, block.Get(x, y)); }
-            }
-        }
-    }
-
-    virtual const uint8_t *Data() const noexcept override { return reinterpret_cast<const uint8_t *>(_pixels.data()); }
-    virtual uint8_t *Data() noexcept override { return reinterpret_cast<uint8_t *>(_pixels.data()); }
-
-   protected:
-    std::vector<Color> _pixels;
-};
-
-template <typename B> class BlockTexture final : public Texture {
-   private:
-    std::vector<B> _blocks;
-    int _width_b;
-    int _height_b;
-
-   public:
-    using BlockType = B;
-    using Base = Texture;
-
-    /**
-     * Create a new BlockTexture
-     * @param width width of the texture in pixels. must be divisible by B::Width
-     * @param height height of the texture in pixels. must be divisible by B::Height
-     */
-    BlockTexture(int width, int height) : Base(width, height) {
-        _width_b = (_width + B::Width - 1) / B::Width;
-        _height_b = (_height + B::Height - 1) / B::Height;
-        _blocks = std::vector<B>(_width_b * _height_b);
-    }
-
-    constexpr int BlocksX() const { return _width_b; }
-    constexpr int BlocksY() const { return _height_b; }
-    constexpr std::tuple<int, int> BlocksXY() const { return std::tuple<int, int>(_width_b, _height_b); }
-
-    B GetBlock(int x, int y) const {
-        if (x < 0 || x >= _width_b) throw std::out_of_range("x value out of range.");
-        if (y < 0 || y >= _height_b) throw std::out_of_range("y value out of range.");
-        return _blocks.at(x + (y * _width_b));
-    }
-
-    void SetBlock(int x, int y, const B &val) {
-        if (x < 0 || x >= _width_b) throw std::out_of_range("x value out of range.");
-        if (y < 0 || y >= _height_b) throw std::out_of_range("y value out of range.");
-        _blocks.at(x + (y * _width_b)) = val;
-    }
-
-    size_t NBytes() const noexcept override { return _blocks.size() * sizeof(B); }
-
-    const uint8_t *Data() const noexcept override { return reinterpret_cast<const uint8_t *>(_blocks.data()); }
-    uint8_t *Data() noexcept override { return reinterpret_cast<uint8_t *>(_blocks.data()); }
-};
-
-}  // namespace quicktex
--- a/quicktex/Vector4.h
+++ b/quicktex/Vector4.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -23,7 +23,7 @@
 #include <cmath>
 #include <functional>

-#include "Color.h"
+#include "OldColor.h"

 namespace quicktex {

@ -45,11 +45,11 @@ class Vector4 {
        _c[3] = scalar;
    }

-    Vector4(const Color &c) : Vector4(c.r, c.g, c.b, c.a) {}
+    Vector4(const OldColor &c) : Vector4(c.r, c.g, c.b, c.a) {}

-    static Vector4 FromColor(const Color &c) { return Vector4(c); }
+    static Vector4 FromColor(const OldColor &c) { return Vector4(c); }

-    static Vector4 FromColorRGB(const Color &c) { return Vector4(c.r, c.g, c.b); }
+    static Vector4 FromColorRGB(const OldColor &c) { return Vector4(c.r, c.g, c.b); }

    static float Dot(const Vector4 &lhs, const Vector4 &rhs) {
        float sum = 0;
--- a/quicktex/Vector4Int.h
+++ b/quicktex/Vector4Int.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -22,7 +22,7 @@
 #include <array>
 #include <functional>

-#include "Color.h"
+#include "OldColor.h"
 #include "Vector4.h"

 namespace quicktex {
@ -45,11 +45,11 @@ class Vector4Int {
        _c[3] = scalar;
    }

-    Vector4Int(const Color &c) : Vector4Int(c.r, c.g, c.b, c.a) {}
+    Vector4Int(const OldColor &c) : Vector4Int(c.r, c.g, c.b, c.a) {}

-    static Vector4Int FromColor(const Color &c) { return Vector4Int(c); }
+    static Vector4Int FromColor(const OldColor &c) { return Vector4Int(c); }

-    static Vector4Int FromColorRGB(const Color &c) { return Vector4Int(c.r, c.g, c.b); }
+    static Vector4Int FromColorRGB(const OldColor &c) { return Vector4Int(c.r, c.g, c.b); }

    static int Dot(const Vector4Int &lhs, const Vector4Int &rhs) {
        int sum = 0;
--- a/quicktex/init.py
+++ b/quicktex/init.py
@ -1,2 +1,8 @@
-from _quicktex import *
-from _quicktex import __version__
+try:
+    from _quicktex import *
+    from _quicktex import __version__
+except ImportError as e:
+    # Give macOS users an actionable hint when the native extension failed to load because the OpenMP
+    # runtime is missing. str(e) is used instead of e.msg because msg is None (or a non-string) when the
+    # exception was raised without a plain message, and `in` would then raise TypeError and hide the
+    # real import failure.
+    if 'libomp.dylib' in str(e):
+        print('\033[41m\033[01mERROR: LIBOMP NOT FOUND! PLEASE INSTALL IT WITH \033[04m`brew install libomp`\033[0m')
+        print('original error message:')
+    # bare raise re-raises the active ImportError with its original traceback
+    raise
--- a/quicktex/main.py
+++ b/quicktex/main.py
@ -1,6 +1,7 @@
 import click
-from quicktex.cli.encode import encode
+
 from quicktex.cli.decode import decode
+from quicktex.cli.encode import encode


@click.group()
--- a/quicktex/_bindings.cpp
+++ b/quicktex/_bindings.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -21,11 +21,12 @@

 #include <pybind11/pybind11.h>

-#include "Color.h"
 #include "Decoder.h"
 #include "Encoder.h"
-#include "Texture.h"
+#include "OldColor.h"
 #include "_bindings.h"
+#include "texture/RawTexture.h"
+#include "texture/Texture.h"

 #define STRINGIFY(x) #x
 #define MACRO_STRINGIFY(x) STRINGIFY(x)
@ -45,19 +46,26 @@ PYBIND11_MODULE(_quicktex, m) {
    m.attr("__version__") = "dev";
 #endif

+#ifdef NDEBUG
+    m.attr("_debug_build") = false;
+#else
+    m.attr("_debug_build") = true;
+#endif
+
    py::options options;

    // Texture

    py::class_<Texture> texture(m, "Texture", py::buffer_protocol());

-    texture.def_property_readonly("nbytes", &Texture::NBytes);
+    texture.def_property_readonly("nbytes", &Texture::nbytes);
    texture.def_property_readonly("size", &Texture::Size);
-    texture.def_property_readonly("width", &Texture::Width);
-    texture.def_property_readonly("height", &Texture::Height);
+    texture.def_readonly("width", &Texture::width);
+    texture.def_readonly("height", &Texture::height);

-    texture.def_buffer([](Texture &t) { return py::buffer_info(t.Data(), t.NBytes()); });
-    texture.def("tobytes", [](const Texture &t) { return py::bytes(reinterpret_cast<const char *>(t.Data()), t.NBytes()); });
+    texture.def_buffer([](Texture &t) { return py::buffer_info(t.data(), t.nbytes()); });
+    texture.def("tobytes",
+                [](const Texture &t) { return py::bytes(reinterpret_cast<const char *>(t.data()), t.nbytes()); });

    // RawTexture

@ -66,7 +74,9 @@ PYBIND11_MODULE(_quicktex, m) {
    raw_texture.def(py::init<int, int>(), "width"_a, "height"_a);
    raw_texture.def_static("frombytes", &BufferToTexture<RawTexture>, "data"_a, "width"_a, "height"_a);

-    DefSubscript2D(raw_texture, &RawTexture::GetPixel, &RawTexture::SetPixel, &RawTexture::Size);
+    DefSubscript2DRef(
+        raw_texture, [](RawTexture &self, int x, int y) -> Color { return self.pixel(x, y); },
+        [](RawTexture &self, int x, int y, Color val) { self.pixel(x, y) = val; }, &RawTexture::Size);

    InitS3TC(m);
 }
--- a/quicktex/_bindings.h
+++ b/quicktex/_bindings.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -24,18 +24,66 @@

 #include <cstdint>
 #include <cstring>
-#include <memory>
 #include <stdexcept>
+#include <string>
+#include <tuple>
 #include <type_traits>
+#include <utility>
+#include <vector>

-#include "Color.h"
-#include "ColorBlock.h"
-#include "Texture.h"
-#include "util.h"
+#include "OldColor.h"
+#include "texture/BlockTexture.h"
+#include "util/math.h"

 namespace pybind11::detail {
 using namespace quicktex;
 /// Type caster for color class to allow it to be converted to and from a python tuple
+template <> struct type_caster<OldColor> {
+   public:
+    PYBIND11_TYPE_CASTER(OldColor, _("Color"));
+
+    /**
+     * Convert a Python sequence of 3 or 4 integers into an OldColor.
+     * Channels are read in r, g, b[, a] order and must each lie in [0, 255]; alpha defaults to 0xFF.
+     * @param src the Python object to convert
+     * @return true if `value` was filled in, false if src is not a suitable sequence. No Python error is
+     * left pending on failure, so pybind11 can go on to try other overloads.
+     */
+    bool load(handle src, bool) {
+        if (!src) { return false; }
+
+        // PySequence_Tuple returns a new reference; owning it through `object` releases it on every exit path
+        auto seq = reinterpret_steal<object>(PySequence_Tuple(src.ptr()));
+        if (!seq) {
+            PyErr_Clear();  // not iterable: report a plain conversion failure
+            return false;
+        }
+
+        // check the size
+        Py_ssize_t size = PyTuple_Size(seq.ptr());
+        if (size < 3 || size > 4) { return false; }  // incorrect size
+
+        value.a = 0xFF;
+        // now we get the contents
+        for (Py_ssize_t i = 0; i < size; i++) {
+            // PyTuple_GetItem yields a borrowed reference, PyNumber_Long a new one that `num` owns
+            auto num = reinterpret_steal<object>(PyNumber_Long(PyTuple_GetItem(seq.ptr(), i)));
+            if (!num) {
+                PyErr_Clear();  // incorrect channel type
+                return false;
+            }
+
+            long chan = PyLong_AsLong(num.ptr());
+            if (chan == -1 && PyErr_Occurred()) {
+                PyErr_Clear();  // value does not fit in a long, so it cannot be a valid channel either
+                return false;
+            }
+            if (chan > 0xFF || chan < 0) return false;  // item out of range
+            value[static_cast<unsigned>(i)] = static_cast<uint8_t>(chan);
+        }
+
+        return true;
+    }
+
+    /**
+     * Convert an OldColor into a Python 4-tuple (r, g, b, a) of ints.
+     * @return a new reference to the tuple, or a null handle (with the Python error set) if allocation failed
+     */
+    static handle cast(OldColor src, return_value_policy, handle) {
+        PyObject* val = PyTuple_New(4);
+        if (!val) { return handle(); }
+
+        for (int i = 0; i < 4; i++) {
+            PyObject* chan = PyLong_FromLong(src[static_cast<unsigned>(i)]);
+            // PyTuple_SetItem steals the reference to chan, so no DECREF is needed here
+            PyTuple_SetItem(val, i, chan);
+        }
+
+        return val;
+    }
+};
+
 template <> struct type_caster<Color> {
   public:
    PYBIND11_TYPE_CASTER(Color, _("Color"));
@ -52,7 +100,7 @@ template <> struct type_caster<Color> {
        Py_ssize_t size = PyTuple_Size(tmp);
        if (size < 3 || size > 4) { return false; }  // incorrect size

-        value.a = 0xFF;
+        value.a() = 0xFF;
        // now we get the contents
        for (int i = 0; i < size; i++) {
            PyObject* src_chan = PyTuple_GetItem(tmp, i);
@ -85,26 +133,49 @@ template <> struct type_caster<Color> {

 namespace py = pybind11;
 namespace quicktex::bindings {
+
 using namespace pybind11::literals;

+/**
+ * Substitute positional placeholders in a format string.
+ * Every occurrence of "{i}" in str is replaced by the i-th argument converted to std::string.
+ * Placeholders are processed in index order, so text inserted for an earlier argument is still
+ * scanned for the placeholders of later arguments.
+ * @param str format string containing "{0}", "{1}", ... placeholders
+ * @param args values to insert; each must be convertible to std::string
+ * @return the formatted string
+ */
+template <typename... Args> std::string Format(const char* str, const Args&... args) {
+    auto output = std::string(str);
+
+    // Single braces with an explicit conversion: every argument becomes its own element. A nested
+    // {{a, b}} with two pointer-like arguments would instead build ONE string from the iterator pair (a, b).
+    std::vector<std::string> values = {std::string(args)...};
+
+    for (size_t i = 0; i < values.size(); i++) {
+        const std::string key = "{" + std::to_string(i) + "}";
+        const std::string& value = values[i];
+        // resume the search after the inserted text so a value containing its own key cannot loop forever
+        for (size_t where = output.find(key); where != std::string::npos;
+             where = output.find(key, where + value.length())) {
+            output.replace(where, key.length(), value);
+        }
+    }
+
+    return output;
+}
+
 template <typename T> T BufferToTexture(py::buffer buf, int width, int height) {
+    // Builds a T(width, height) and fills it by copying output.nbytes() bytes out of a Python buffer.
+    // The buffer must be a contiguous 1-D array of unsigned bytes holding at least that many bytes;
+    // anything else is rejected with std::runtime_error before the copy.
    static_assert(std::is_base_of<Texture, T>::value);
    static_assert(std::is_constructible<T, int, int>::value);

    auto info = buf.request(false);
    auto output = T(width, height);
-    auto dst_size = output.NBytes();
+    auto dst_size = output.nbytes();

-    if (info.format != py::format_descriptor<uint8_t>::format()) throw std::runtime_error("Incompatible format in python buffer: expected a byte array.");
-    if (info.size < (ssize_t)dst_size) std::runtime_error("Incompatible format in python buffer: Input data is smaller than texture size.");
+    if (info.format != py::format_descriptor<uint8_t>::format())
+        throw std::runtime_error("Incompatible format in python buffer: expected a byte array.");
+    if (info.size < (Py_ssize_t)dst_size)
+        throw std::runtime_error("Incompatible format in python buffer: Input data is smaller than texture size.");
    if (info.ndim == 1) {
-        if (info.shape[0] < (ssize_t)dst_size) throw std::runtime_error("Incompatible format in python buffer: 1-D buffer has incorrect length.");
-        if (info.strides[0] != 1) throw std::runtime_error("Incompatible format in python buffer: 1-D buffer is not contiguous.");
+        if (info.shape[0] < (Py_ssize_t)dst_size)
+            throw std::runtime_error("Incompatible format in python buffer: 1-D buffer has incorrect length.");
+        if (info.strides[0] != 1)
+            throw std::runtime_error("Incompatible format in python buffer: 1-D buffer is not contiguous.");
    } else {
        throw std::runtime_error("Incompatible format in python buffer: Incorrect number of dimensions.");
    }

-    std::memcpy(output.Data(), info.ptr, dst_size);
+    std::memcpy(output.data(), info.ptr, dst_size);

    return output;
 }
@ -114,11 +185,15 @@ template <typename T> T BufferToPOD(py::buffer buf) {

    auto info = buf.request(false);

-    if (info.format != py::format_descriptor<uint8_t>::format()) throw std::runtime_error("Incompatible format in python buffer: expected a byte array.");
-    if (info.size < (ssize_t)sizeof(T)) std::runtime_error("Incompatible format in python buffer: Input data is smaller than texture size.");
+    // only raw unsigned-byte buffers are accepted
+    if (info.format != py::format_descriptor<uint8_t>::format())
+        throw std::runtime_error("Incompatible format in python buffer: expected a byte array.");
+    // reject buffers holding fewer bytes than one T
+    if (info.size < (Py_ssize_t)sizeof(T))
+        throw std::runtime_error("Incompatible format in python buffer: Input data is smaller than texture size.");
    if (info.ndim == 1) {
-        if (info.shape[0] < (ssize_t)sizeof(T)) throw std::runtime_error("Incompatible format in python buffer: 1-D buffer has incorrect length.");
-        if (info.strides[0] != 1) throw std::runtime_error("Incompatible format in python buffer: 1-D buffer is not contiguous.");
+        if (info.shape[0] < (Py_ssize_t)sizeof(T))
+            throw std::runtime_error("Incompatible format in python buffer: 1-D buffer has incorrect length.");
+        if (info.strides[0] != 1)
+            throw std::runtime_error("Incompatible format in python buffer: 1-D buffer is not contiguous.");
    } else {
        throw std::runtime_error("Incompatible format in python buffer: Incorrect number of dimensions.");
    }
@ -133,15 +208,18 @@ inline int PyIndex(int val, int size, std::string name = "index") {
    return val;
 }

-template <typename T, typename Getter, typename Setter, typename Extent> void DefSubscript(py::class_<T> t, Getter&& get, Setter&& set, Extent&& ext) {
+template <typename T, typename Getter, typename Setter, typename Extent>
+void DefSubscript(py::class_<T> t, Getter&& get, Setter&& set, Extent&& ext) {
    using V = typename std::invoke_result<Getter, T*, int>::type;
    t.def(
        "__getitem__", [get, ext](T& self, int index) { return (self.*get)(PyIndex(index, (self.*ext)())); }, "key"_a);
    t.def(
-        "__setitem__", [set, ext](T& self, int index, V val) { (self.*set)(PyIndex(index, (self.*ext)()), val); }, "key"_a, "value"_a);
+        "__setitem__", [set, ext](T& self, int index, V val) { (self.*set)(PyIndex(index, (self.*ext)()), val); },
+        "key"_a, "value"_a);
 }

-template <typename Tpy, typename Getter, typename Setter, typename Extent> void DefSubscript2D(Tpy t, Getter&& get, Setter&& set, Extent&& ext) {
+template <typename Tpy, typename Getter, typename Setter, typename Extent>
+void DefSubscript2D(Tpy t, Getter&& get, Setter&& set, Extent&& ext) {
    using T = typename Tpy::type;
    using V = typename std::invoke_result<Getter, T*, int, int>::type;
    using Coords = std::tuple<int, int>;
@ -165,6 +243,32 @@ template <typename Tpy, typename Getter, typename Setter, typename Extent> void
        "key"_a, "value"_a);
 }

+// TODO: untangle this mess
+/// Bind `__getitem__` / `__setitem__` taking an (x, y) tuple key on the pybind11 class `t`.
+/// `get` and `set` are callables invoked as get(self, x, y) and set(self, x, y, val); `ext` is a pointer to a
+/// member function returning the (x, y) extent, against which both coordinates are passed through PyIndex.
+template <typename Tpy, typename Getter, typename Setter, typename Extent>
+void DefSubscript2DRef(Tpy t, Getter&& get, Setter&& set, Extent&& ext) {
+    using T = typename Tpy::type;
+    using V = std::remove_cvref_t<std::invoke_result_t<Getter, T&, int, int>>;
+    using Coords = std::tuple<int, int>;
+
+    // shared by both accessors: run the x index, then the y index, through PyIndex against the current extent
+    auto resolve = [ext](T& self, const Coords& key) {
+        const Coords bounds = (self.*ext)();
+        const int x = PyIndex(std::get<0>(key), std::get<0>(bounds), "x");
+        const int y = PyIndex(std::get<1>(key), std::get<1>(bounds), "y");
+        return Coords(x, y);
+    };
+
+    t.def(
+        "__getitem__",
+        [get, resolve](T& self, Coords pnt) {
+            const Coords at = resolve(self, pnt);
+            return get(self, std::get<0>(at), std::get<1>(at));
+        },
+        "key"_a);
+    t.def(
+        "__setitem__",
+        [set, resolve](T& self, Coords pnt, const V& val) {
+            const Coords at = resolve(self, pnt);
+            set(self, std::get<0>(at), std::get<1>(at), val);
+        },
+        "key"_a, "value"_a);
+}
+
 template <typename B> py::class_<B> BindBlock(py::module_& m, const char* name) {
    const char* frombytes_doc = R"doc(
        Create a new {0} by copying a bytes-like object.
@ -184,7 +288,8 @@ template <typename B> py::class_<B> BindBlock(py::module_& m, const char* name)
    block.def_readonly_static("width", &B::Width, "The width of the block in pixels.");
    block.def_readonly_static("height", &B::Height, "The height of the block in pixels.");
    block.def_property_readonly_static(
-        "size", [](py::object) { return std::make_tuple(B::Width, B::Height); }, "The dimensions of the block in pixels.");
+        "size", [](py::object) { return std::make_tuple(B::Width, B::Height); },
+        "The dimensions of the block in pixels.");
    block.def_property_readonly_static(
        "nbytes", [](py::object) { return sizeof(B); }, "The size of the block in bytes.");

@ -195,7 +300,7 @@ template <typename B> py::class_<B> BindBlock(py::module_& m, const char* name)
        "tobytes", [](const B& b) { return py::bytes(reinterpret_cast<const char*>(&b), sizeof(B)); },
        Format(tobytes_doc, name, std::to_string(sizeof(B))).c_str());

-    return std::move(block);
+    return block;
 }

 template <typename B> py::class_<BlockTexture<B>> BindBlockTexture(py::module_& m, const char* name) {
@ -223,14 +328,15 @@ template <typename B> py::class_<BlockTexture<B>> BindBlockTexture(py::module_&
    py::class_<BTex, Texture> block_texture(m, name);

    block_texture.def(py::init<int, int>(), "width"_a, "height"_a, Format(constructor_str, name).c_str());
-    block_texture.def_static("from_bytes", &BufferToTexture<BTex>, "data"_a, "width"_a, "height"_a, Format(from_bytes_str, name).c_str());
+    block_texture.def_static("from_bytes", &BufferToTexture<BTex>, "data"_a, "width"_a, "height"_a,
+                             Format(from_bytes_str, name).c_str());

-    block_texture.def_property_readonly("width_blocks", &BTex::BlocksX, "The width of the texture in blocks.");
-    block_texture.def_property_readonly("height_blocks", &BTex::BlocksY, "The height of the texture in blocks.");
-    block_texture.def_property_readonly("size_blocks", &BTex::BlocksXY, "The dimensions of the texture in blocks.");
+    block_texture.def_property_readonly("width_blocks", &BTex::bwidth, "The width of the texture in blocks.");
+    block_texture.def_property_readonly("height_blocks", &BTex::bheight, "The height of the texture in blocks.");
+    block_texture.def_property_readonly("size_blocks", &BTex::bsize, "The dimensions of the texture in blocks.");

-    DefSubscript2D(block_texture, &BTex::GetBlock, &BTex::SetBlock, &BTex::BlocksXY);
+    DefSubscript2D(block_texture, &BTex::get_block, &BTex::set_block, &BTex::bsize);

-    return std::move(block_texture);
+    return block_texture;
 }
 }  // namespace quicktex::bindings
--- a/quicktex/cli/common.py
+++ b/quicktex/cli/common.py
@ -1,7 +1,8 @@
-from PIL import Image
-from typing import List
 import pathlib
+from typing import List
+
 import click
+from PIL import Image


 def get_decoded_extensions(feature: str = 'open') -> List[str]:
--- a/quicktex/cli/decode.py
+++ b/quicktex/cli/decode.py
@ -1,28 +1,49 @@
-import click
 import os.path
-import quicktex.dds as dds
-import quicktex.cli.common as common
+
+import click
 from PIL import Image

+import quicktex.cli.common as common
+import quicktex.dds as dds
+

@click.command()
-@click.option('-f/-F', '--flip/--no-flip', default=True, show_default=True, help="Vertically flip image after converting.")
+@click.option(
+    '-f/-F', '--flip/--no-flip', default=True, show_default=True, help="Vertically flip image after converting."
+)
@click.option('-r', '--remove', is_flag=True, help="Remove input images after converting.")
-@click.option('-s', '--suffix', type=str, default='', help="Suffix to append to output file(s). Ignored if output is a single file.")
-@click.option('-x', '--extension',
-              callback=common.validate_decoded_extension,
-              type=str, default='.png', show_default=True,
-              help="Extension to use for output. Ignored if output is a single file. Output filetype is deduced from this")
-@click.option('-o', '--output',
-              type=click.Path(writable=True), default=None,
-              help="Output file or directory. If outputting to a file, input filenames must be only a single item. By default, files are decoded in place.")
+@click.option(
+    '-s',
+    '--suffix',
+    type=str,
+    default='',
+    help="Suffix to append to output file(s). Ignored if output is a single file.",
+)
+@click.option(
+    '-x',
+    '--extension',
+    callback=common.validate_decoded_extension,
+    type=str,
+    default='.png',
+    show_default=True,
+    help="Extension to use for output. Ignored if output is a single file. Output filetype is deduced from this",
+)
+@click.option(
+    '-o',
+    '--output',
+    type=click.Path(writable=True),
+    default=None,
+    help="Output file or directory. If outputting to a file, input filenames must be only a single item. By default, files are decoded in place.",
+)
@click.argument('filenames', nargs=-1, type=click.Path(exists=True, readable=True, dir_okay=False))
 def decode(flip, remove, suffix, extension, output, filenames):
    """Decode DDS files to images."""

    path_pairs = common.path_pairs(filenames, output, suffix, extension)

-    with click.progressbar(path_pairs, show_eta=False, show_pos=True, item_show_func=lambda x: str(x[0]) if x else '') as bar:
+    with click.progressbar(
+        path_pairs, show_eta=False, show_pos=True, item_show_func=lambda x: str(x[0]) if x else ''
+    ) as bar:
        for inpath, outpath in bar:
            if inpath.suffix != '.dds':
                raise click.BadArgumentUsage(f"Input file '{inpath}' is not a DDS file.")
--- a/quicktex/cli/encode.py
+++ b/quicktex/cli/encode.py
@ -1,13 +1,14 @@
-import click
 import os
-import pathlib
+
+import click
+from PIL import Image
+
+import quicktex.cli.common as common
+import quicktex.dds as dds
 import quicktex.s3tc.bc1
 import quicktex.s3tc.bc3
 import quicktex.s3tc.bc4
 import quicktex.s3tc.bc5
-import quicktex.dds as dds
-import quicktex.cli.common as common
-from PIL import Image


@click.group()
@ -16,17 +17,31 @@ def encode():


@click.command()
-@click.option('-f/-F', '--flip/--no-flip', default=True, show_default=True, help="Vertically flip image before converting.")
+@click.option(
+    '-f/-F', '--flip/--no-flip', default=True, show_default=True, help="Vertically flip image before converting."
+)
@click.option('-r', '--remove', is_flag=True, help="Remove input images after converting.")
-@click.option('-s', '--suffix', type=str, default='', help="Suffix to append to output file(s). Ignored if output is a single file.")
-@click.option('-o', '--output',
-              type=click.Path(writable=True), default=None,
-              help="Output file or directory. If outputting to a file, input filenames must be only a single item. By default, files are decoded in place.")
+@click.option(
+    '-s',
+    '--suffix',
+    type=str,
+    default='',
+    help="Suffix to append to output file(s). Ignored if output is a single file.",
+)
+@click.option(
+    '-o',
+    '--output',
+    type=click.Path(writable=True),
+    default=None,
+    help="Output file or directory. If outputting to a file, input filenames must be only a single item. By default, files are encoded in place.",
+)
@click.argument('filenames', nargs=-1, type=click.Path(exists=True, readable=True, dir_okay=False))
 def encode_format(encoder, four_cc, flip, remove, suffix, output, filenames):
    path_pairs = common.path_pairs(filenames, output, suffix, '.dds')

-    with click.progressbar(path_pairs, show_eta=False, show_pos=True, item_show_func=lambda x: str(x[0]) if x else '') as bar:
+    with click.progressbar(
+        path_pairs, show_eta=False, show_pos=True, item_show_func=lambda x: str(x[0]) if x else ''
+    ) as bar:
        for inpath, outpath in bar:
            image = Image.open(inpath)

@ -40,17 +55,44 @@ def encode_format(encoder, four_cc, flip, remove, suffix, output, filenames):


@click.command('auto')
-@click.option('-l', '--level', type=click.IntRange(0, 18), default=18, help='Quality level to use. Higher values = higher quality, but slower.')
-@click.option('-b/-B', '--black/--no-black',
-              help='[BC1 only] Enable 3-color mode for blocks containing black or very dark pixels. --3color must also be enabled for this to work.'
-                   ' (Important: engine/shader MUST ignore decoded texture alpha if this flag is enabled!)')
-@click.option('-3/-4', '--3color/--4color', 'threecolor', default=True, help='[BC1 only] Enable 3-color mode for non-black pixels. Higher quality, but slightly slower.')
-@click.option('-f/-F', '--flip/--no-flip', default=True, show_default=True, help="Vertically flip image before converting.")
+@click.option(
+    '-l',
+    '--level',
+    type=click.IntRange(0, 18),
+    default=18,
+    help='Quality level to use. Higher values = higher quality, but slower.',
+)
+@click.option(
+    '-b/-B',
+    '--black/--no-black',
+    help='[BC1 only] Enable 3-color mode for blocks containing black or very dark pixels. --3color must also be enabled for this to work.'
+    ' (Important: engine/shader MUST ignore decoded texture alpha if this flag is enabled!)',
+)
+@click.option(
+    '-3/-4',
+    '--3color/--4color',
+    'threecolor',
+    default=True,
+    help='[BC1 only] Enable 3-color mode for non-black pixels. Higher quality, but slightly slower.',
+)
+@click.option(
+    '-f/-F', '--flip/--no-flip', default=True, show_default=True, help="Vertically flip image before converting."
+)
@click.option('-r', '--remove', is_flag=True, help="Remove input images after converting.")
-@click.option('-s', '--suffix', type=str, default='', help="Suffix to append to output file(s). Ignored if output is a single file.")
-@click.option('-o', '--output',
-              type=click.Path(writable=True), default=None,
-              help="Output file or directory. If outputting to a file, input filenames must be only a single item. By default, files are decoded in place.")
+@click.option(
+    '-s',
+    '--suffix',
+    type=str,
+    default='',
+    help="Suffix to append to output file(s). Ignored if output is a single file.",
+)
+@click.option(
+    '-o',
+    '--output',
+    type=click.Path(writable=True),
+    default=None,
+    help="Output file or directory. If outputting to a file, input filenames must be only a single item. By default, files are encoded in place.",
+)
@click.argument('filenames', nargs=-1, type=click.Path(exists=True, readable=True, dir_okay=False))
 def encode_auto(level, black, threecolor, flip, remove, suffix, output, filenames):
    """Encode images to BC1 or BC3, with the format chosen based on each image's alpha channel."""
@ -67,7 +109,9 @@ def encode_auto(level, black, threecolor, flip, remove, suffix, output, filename
    bc3_encoder = quicktex.s3tc.bc3.BC3Encoder(level)
    path_pairs = common.path_pairs(filenames, output, suffix, '.dds')

-    with click.progressbar(path_pairs, show_eta=False, show_pos=True, item_show_func=lambda x: str(x[0]) if x else '') as bar:
+    with click.progressbar(
+        path_pairs, show_eta=False, show_pos=True, item_show_func=lambda x: str(x[0]) if x else ''
+    ) as bar:
        for inpath, outpath in bar:
            image = Image.open(inpath)

@ -90,11 +134,26 @@ def encode_auto(level, black, threecolor, flip, remove, suffix, output, filename


@click.command('bc1')
-@click.option('-l', '--level', type=click.IntRange(0, 18), default=18, help='Quality level to use. Higher values = higher quality, but slower.')
-@click.option('-b/-B', '--black/--no-black',
-              help='Enable 3-color mode for blocks containing black or very dark pixels. --3color must also be enabled for this to work.'
-                   ' (Important: engine/shader MUST ignore decoded texture alpha if this flag is enabled!)')
-@click.option('-3/-4', '--3color/--4color', 'threecolor', default=True, help='Enable 3-color mode for non-black pixels. Higher quality, but slightly slower.')
+@click.option(
+    '-l',
+    '--level',
+    type=click.IntRange(0, 18),
+    default=18,
+    help='Quality level to use. Higher values = higher quality, but slower.',
+)
+@click.option(
+    '-b/-B',
+    '--black/--no-black',
+    help='Enable 3-color mode for blocks containing black or very dark pixels. --3color must also be enabled for this to work.'
+    ' (Important: engine/shader MUST ignore decoded texture alpha if this flag is enabled!)',
+)
+@click.option(
+    '-3/-4',
+    '--3color/--4color',
+    'threecolor',
+    default=True,
+    help='Enable 3-color mode for non-black pixels. Higher quality, but slightly slower.',
+)
 def encode_bc1(level, black, threecolor, **kwargs):
    """Encode images to BC1 (RGB, no alpha)."""
    color_mode = quicktex.s3tc.bc1.BC1Encoder.ColorMode
@ -109,7 +168,13 @@ def encode_bc1(level, black, threecolor, **kwargs):


@click.command('bc3')
-@click.option('-l', '--level', type=click.IntRange(0, 18), default=18, help='Quality level to use. Higher values = higher quality, but slower.')
+@click.option(
+    '-l',
+    '--level',
+    type=click.IntRange(0, 18),
+    default=18,
+    help='Quality level to use. Higher values = higher quality, but slower.',
+)
 def encode_bc3(level, **kwargs):
-    """Encode images to BC4 (RGBA, 8-bit interpolated alpha)."""
+    """Encode images to BC3 (RGBA, 8-bit interpolated alpha)."""
    encode_format.callback(quicktex.s3tc.bc3.BC3Encoder(level), 'DXT5', **kwargs)
--- a/quicktex/dds.py
+++ b/quicktex/dds.py
@ -4,12 +4,14 @@ import enum
 import os
 import struct
 import typing
+
+from PIL import Image
+
 import quicktex.image_utils
 import quicktex.s3tc.bc1 as bc1
 import quicktex.s3tc.bc3 as bc3
 import quicktex.s3tc.bc4 as bc4
 import quicktex.s3tc.bc5 as bc5
-from PIL import Image


 class DDSFormat:
@ -165,8 +167,28 @@ class DDSFile:
            file.write(DDSFile.magic)

            # WRITE HEADER
-            file.write(struct.pack('<7I44x', DDSFile.header_bytes, int(self.flags), self.size[1], self.size[0], self.pitch, self.depth, self.mipmap_count))
-            file.write(struct.pack('<2I4s5I', 32, int(self.pf_flags), bytes(self.four_cc, 'ascii'), self.pixel_size, *self.pixel_bitmasks))
+            file.write(
+                struct.pack(
+                    '<7I44x',
+                    DDSFile.header_bytes,
+                    int(self.flags),
+                    self.size[1],
+                    self.size[0],
+                    self.pitch,
+                    self.depth,
+                    self.mipmap_count,
+                )
+            )
+            file.write(
+                struct.pack(
+                    '<2I4s5I',
+                    32,
+                    int(self.pf_flags),
+                    bytes(self.four_cc, 'ascii'),
+                    self.pixel_size,
+                    *self.pixel_bitmasks,
+                )
+            )
            file.write(struct.pack('<4I4x', *self.caps))

            assert file.tell() == 4 + DDSFile.header_bytes, 'error writing file: incorrect header size'
--- a/quicktex/image_utils.py
+++ b/quicktex/image_utils.py
@ -1,8 +1,9 @@
 """Various utilities for working with Pillow images"""

-from PIL import Image
-from typing import List, Tuple, Optional
 import math
+from typing import List, Tuple, Optional
+
+from PIL import Image


 def mip_sizes(dimensions: Tuple[int, int], mip_count: Optional[int] = None) -> List[Tuple[int, int]]:
--- a/quicktex/s3tc/_bindings.cpp
+++ b/quicktex/s3tc/_bindings.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

--- a/quicktex/s3tc/bc1/BC1Block.cpp
+++ b/quicktex/s3tc/bc1/BC1Block.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -20,28 +20,35 @@
 #include "BC1Block.h"

 #include <stdexcept>
-#include <algorithm>

-#include "../../util.h"
+#include "util/bitbash.h"
+#include "util/map.h"
+#include "util/math.h"
+#include "util/ranges.h"

 namespace quicktex::s3tc {
-uint16_t BC1Block::GetColor0Raw() const { return Pack<uint8_t, uint16_t, 8, EndpointSize>(_color0); }
-uint16_t BC1Block::GetColor1Raw() const { return Pack<uint8_t, uint16_t, 8, EndpointSize>(_color1); }

-void BC1Block::SetColor0Raw(uint16_t c) { _color0 = Unpack<uint16_t, uint8_t, 8, EndpointSize>(c); }
-void BC1Block::SetColor1Raw(uint16_t c) { _color1 = Unpack<uint16_t, uint8_t, 8, EndpointSize>(c); }
+uint16_t BC1Block::GetColor0Raw() const { return pack<uint16_t>(_color0, 8); }
+uint16_t BC1Block::GetColor1Raw() const { return pack<uint16_t>(_color1, 8); }

-BC1Block::SelectorArray BC1Block::GetSelectors() const { return MapArray(_selectors, Unpack<uint8_t, uint8_t, SelectorBits, Width>); }
+void BC1Block::SetColor0Raw(uint16_t c) { _color0 = unpack<uint8_t, EndpointSize>(c, 8); }
+void BC1Block::SetColor1Raw(uint16_t c) { _color1 = unpack<uint8_t, EndpointSize>(c, 8); }
+
+BC1Block::SelectorArray BC1Block::GetSelectors() const {
+    return map([](auto row) { return unpack<uint8_t, Width>(row, SelectorBits); }, _selectors);
+}

 void BC1Block::SetSelectors(const BC1Block::SelectorArray& unpacked) {
    for (unsigned y = 0; y < (unsigned)Height; y++) {
        if (std::any_of(unpacked[y].begin(), unpacked[y].end(), [](uint8_t i) { return i > SelectorMax; }))
            throw std::invalid_argument("Selector value out of bounds.");
    }
-    _selectors = MapArray(unpacked, Pack<uint8_t, uint8_t, SelectorBits, Width>);
+    _selectors = map([](auto row) { return pack<uint8_t>(row, SelectorBits, true); }, unpacked);
 }

-bool BC1Block::operator==(const BC1Block& Rhs) const { return _color0 == Rhs._color0 && _color1 == Rhs._color1 && _selectors == Rhs._selectors; }
+bool BC1Block::operator==(const BC1Block& Rhs) const {
+    return _color0 == Rhs._color0 && _color1 == Rhs._color1 && _selectors == Rhs._selectors;
+}
 bool BC1Block::operator!=(const BC1Block& Rhs) const { return !(Rhs == *this); }

 }  // namespace quicktex::s3tc
--- a/quicktex/s3tc/bc1/BC1Block.h
+++ b/quicktex/s3tc/bc1/BC1Block.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -24,7 +24,7 @@
 #include <cstdlib>
 #include <utility>

-#include "../../Color.h"
+#include "OldColor.h"

 namespace quicktex::s3tc {

@ -39,7 +39,7 @@ class alignas(8) BC1Block {
    static constexpr uint8_t SelectorMax = (1 << SelectorBits) - 1;  // maximum value of a selector

    using SelectorArray = std::array<std::array<uint8_t, Width>, Height>;
-    using ColorPair = std::pair<Color, Color>;
+    using ColorPair = std::pair<OldColor, OldColor>;

   private:
    std::array<uint8_t, EndpointSize> _color0;
@ -60,7 +60,7 @@ class alignas(8) BC1Block {
     * @param color1 second endpoint color
     * @param selectors the selectors as a 4x4 list of integers, between 0 and 3 inclusive.
     */
-    BC1Block(Color color0, Color color1, const SelectorArray& selectors) {
+    BC1Block(OldColor color0, OldColor color1, const SelectorArray& selectors) {
        SetColor0(color0);
        SetColor1(color1);
        SetSelectors(selectors);
@ -96,12 +96,12 @@ class alignas(8) BC1Block {
    void SetColor0Raw(uint16_t c);
    void SetColor1Raw(uint16_t c);

-    Color GetColor0() const { return Color::Unpack565(GetColor0Raw()); }
-    Color GetColor1() const { return Color::Unpack565(GetColor1Raw()); }
+    OldColor GetColor0() const { return OldColor::Unpack565(GetColor0Raw()); }
+    OldColor GetColor1() const { return OldColor::Unpack565(GetColor1Raw()); }
    ColorPair GetColors() const { return {GetColor0(), GetColor1()}; }

-    void SetColor0(Color c) { SetColor0Raw(c.Pack565()); }
-    void SetColor1(Color c) { SetColor1Raw(c.Pack565()); }
+    void SetColor0(OldColor c) { SetColor0Raw(c.Pack565()); }
+    void SetColor1(OldColor c) { SetColor1Raw(c.Pack565()); }
    void SetColors(ColorPair cs) {
        SetColor0(cs.first);
        SetColor1(cs.second);
--- a/quicktex/s3tc/bc1/BC1Decoder.cpp
+++ b/quicktex/s3tc/bc1/BC1Decoder.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -23,9 +23,9 @@
 #include <cassert>
 #include <cstdint>

-#include "../../Color.h"
-#include "../../ColorBlock.h"
-#include "BC1Block.h"
+#include "ColorBlock.h"
+#include "OldColor.h"
+#include "s3tc/bc1/BC1Block.h"

 namespace quicktex::s3tc {

--- a/quicktex/s3tc/bc1/BC1Decoder.h
+++ b/quicktex/s3tc/bc1/BC1Decoder.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -21,18 +21,19 @@

 #include <memory>

-#include "../../ColorBlock.h"
-#include "../../Decoder.h"
-#include "../../Texture.h"
-#include "../interpolator/Interpolator.h"
-#include "BC1Block.h"
+#include "ColorBlock.h"
+#include "Decoder.h"
+#include "s3tc/bc1/BC1Block.h"
+#include "s3tc/interpolator/Interpolator.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {
 class BC1Decoder final : public BlockDecoder<BlockTexture<BC1Block>> {
   public:
    using InterpolatorPtr = std::shared_ptr<Interpolator>;

-    BC1Decoder(bool vwrite_alpha, InterpolatorPtr interpolator) : write_alpha(vwrite_alpha), _interpolator(interpolator) {}
+    BC1Decoder(bool vwrite_alpha, InterpolatorPtr interpolator)
+        : write_alpha(vwrite_alpha), _interpolator(interpolator) {}

    BC1Decoder(bool vwrite_alpha = false) : BC1Decoder(vwrite_alpha, std::make_shared<Interpolator>()) {}

--- a/quicktex/s3tc/bc1/BC1Encoder.cpp
+++ b/quicktex/s3tc/bc1/BC1Encoder.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -29,24 +29,28 @@
 #include <stdexcept>
 #include <type_traits>

-#include "../../Color.h"
-#include "../../ColorBlock.h"
-#include "../../Matrix4x4.h"
-#include "../../Texture.h"
-#include "../../Vector4.h"
-#include "../../Vector4Int.h"
-#include "../../bitwiseEnums.h"
-#include "../../util.h"
+#include "ColorBlock.h"
 #include "Histogram.h"
-#include "OrderTable.h"
-#include "SingleColorTable.h"
+#include "Matrix4x4.h"
+#include "OldColor.h"
+#include "Vector4.h"
+#include "Vector4Int.h"
+#include "s3tc/bc1/BC1Block.h"
+#include "s3tc/bc1/OrderTable.h"
+#include "s3tc/bc1/SingleColorTable.h"
+#include "texture/Texture.h"
+#include "util/bitbash.h"
+#include "util/bitwiseEnums.h"
+#include "util/math.h"

 namespace quicktex::s3tc {

 // constructors

-BC1Encoder::BC1Encoder(unsigned int level, ColorMode color_mode, InterpolatorPtr interpolator) : _interpolator(interpolator), _color_mode(color_mode) {
-    if (color_mode != ColorMode::FourColor && color_mode != ColorMode::ThreeColor && color_mode != ColorMode::ThreeColorBlack) {
+BC1Encoder::BC1Encoder(unsigned int level, ColorMode color_mode, InterpolatorPtr interpolator)
+    : _interpolator(interpolator), _color_mode(color_mode) {
+    if (color_mode != ColorMode::FourColor && color_mode != ColorMode::ThreeColor &&
+        color_mode != ColorMode::ThreeColorBlack) {
        throw std::invalid_argument("Encoder color mode must be FourColor, ThreeColor, or ThreeColorBlack");
    }

@ -73,7 +77,9 @@ BC1Encoder::BC1Encoder(unsigned int level, ColorMode color_mode, InterpolatorPtr

 // Getters and Setters
 void BC1Encoder::SetLevel(unsigned level) {
-    if (level > 19) throw std::invalid_argument("Level out of range, bust be between 0 and 18 inclusive");  // theres a secret level 19 but shhhhhh
+    if (level > 19)
+        throw std::invalid_argument(
+            "Level out of range, must be between 0 and 18 inclusive");  // there's a secret level 19 but shhhhhh

    two_ls_passes = false;
    two_ep_passes = false;
@ -249,14 +255,20 @@ void BC1Encoder::SetLevel(unsigned level) {
    _orderings3 = clamp(_orderings3, 1U, OrderTable<3>::BestOrderCount);
 }

-void BC1Encoder::SetOrderings4(unsigned orderings4) { _orderings4 = clamp(orderings4, 1U, OrderTable<4>::BestOrderCount); }
-void BC1Encoder::SetOrderings3(unsigned orderings3) { _orderings3 = clamp(orderings3, 1U, OrderTable<3>::BestOrderCount); }
+void BC1Encoder::SetOrderings4(unsigned orderings4) {
+    _orderings4 = clamp(orderings4, 1U, OrderTable<4>::BestOrderCount);
+}
+void BC1Encoder::SetOrderings3(unsigned orderings3) {
+    _orderings3 = clamp(orderings3, 1U, OrderTable<3>::BestOrderCount);
+}
 void BC1Encoder::SetOrderings(OrderingPair orderings) {
    SetOrderings4(std::get<0>(orderings));
    SetOrderings3(std::get<1>(orderings));
 }

-void BC1Encoder::SetPowerIterations(unsigned int power_iters) { _power_iterations = clamp(power_iters, min_power_iterations, max_power_iterations); }
+void BC1Encoder::SetPowerIterations(unsigned int power_iters) {
+    _power_iterations = clamp(power_iters, min_power_iterations, max_power_iterations);
+}

 // Public methods
 BC1Block BC1Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
@ -304,7 +316,9 @@ BC1Block BC1Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {

    // First refinement pass using ordered cluster fit
    if (result.error > 0 && use_likely_orderings) {
-        for (unsigned iter = 0; iter < total_cf_passes; iter++) { RefineBlockCF<ColorMode::FourColor>(result, pixels, metrics, _error_mode, _orderings4); }
+        for (unsigned iter = 0; iter < total_cf_passes; iter++) {
+            RefineBlockCF<ColorMode::FourColor>(result, pixels, metrics, _error_mode, _orderings4);
+        }
    }

    // try for 3-color block
@ -325,13 +339,15 @@ BC1Block BC1Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
    }

    // try for 3-color block with black
-    if (result.error > 0 && (_color_mode == ColorMode::ThreeColorBlack) && metrics.has_black && !metrics.max.IsBlack()) {
+    if (result.error > 0 && (_color_mode == ColorMode::ThreeColorBlack) && metrics.has_black &&
+        !metrics.max.IsBlack()) {
        EncodeResults trial_result;
        BlockMetrics metrics_no_black = pixels.GetMetrics(true);

        FindEndpoints(trial_result, pixels, metrics_no_black, EndpointMode::PCA, true);
        FindSelectors<ColorMode::ThreeColorBlack>(trial_result, pixels, ErrorMode::Full);
-        RefineBlockLS<ColorMode::ThreeColorBlack>(trial_result, pixels, metrics_no_black, ErrorMode::Full, total_ls_passes);
+        RefineBlockLS<ColorMode::ThreeColorBlack>(trial_result, pixels, metrics_no_black, ErrorMode::Full,
+                                                  total_ls_passes);

        if (trial_result.error < result.error) { result = trial_result; }
    }
@ -343,7 +359,7 @@ BC1Block BC1Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
 }

 // Private methods
-BC1Block BC1Encoder::WriteBlockSolid(Color color) const {
+BC1Block BC1Encoder::WriteBlockSolid(OldColor color) const {
    uint8_t mask = 0xAA;  // 2222
    uint16_t min16, max16;

@ -441,7 +457,7 @@ BC1Block BC1Encoder::WriteBlock(EncodeResults &result) const {
    return BC1Block(ep0, ep1, selectors);
 }

-void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, Color color, bool is_3color) const {
+void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, OldColor color, bool is_3color) const {
    auto &match5 = is_3color ? _single_match5_half : _single_match5;
    auto &match6 = is_3color ? _single_match6_half : _single_match6;

@ -451,13 +467,14 @@ void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, Color color, bo

    result.color_mode = is_3color ? ColorMode::ThreeColor : ColorMode::FourColor;
    result.error = match_r.error + match_g.error + match_b.error;
-    result.low = Color(match_r.low, match_g.low, match_b.low);
-    result.high = Color(match_r.high, match_g.high, match_b.high);
+    result.low = OldColor(match_r.low, match_g.low, match_b.low);
+    result.high = OldColor(match_r.high, match_g.high, match_b.high);
    // selectors decided when writing, no point deciding them now
 }

-void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, const CBlock &pixels, Color color, bool is_3color) const {
-    std::array<Color, 4> colors = _interpolator->InterpolateBC1(result.low, result.high, is_3color);
+void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, const CBlock &pixels, OldColor color,
+                                          bool is_3color) const {
+    std::array<OldColor, 4> colors = _interpolator->InterpolateBC1(result.low, result.high, is_3color);
    Vector4Int result_vector = (Vector4Int)colors[2];

    FindEndpointsSingleColor(result, color, is_3color);
@ -471,40 +488,43 @@ void BC1Encoder::FindEndpointsSingleColor(EncodeResults &result, const CBlock &p
    }
 }

-void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, EndpointMode endpoint_mode, bool ignore_black) const {
+void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics,
+                               EndpointMode endpoint_mode, bool ignore_black) const {
    if (metrics.is_greyscale) {
        // specialized greyscale case
        const unsigned fr = pixels.Get(0, 0).r;

        if (metrics.max.r - metrics.min.r < 2) {
            // single color block
-            uint8_t fr5 = (uint8_t)scale8To5(fr);
-            uint8_t fr6 = (uint8_t)scale8To6(fr);
+            uint8_t fr5 = (uint8_t)scale_from_8<5>(fr);
+            uint8_t fr6 = (uint8_t)scale_from_8<6>(fr);

-            result.low = Color(fr5, fr6, fr5);
+            result.low = OldColor(fr5, fr6, fr5);
            result.high = result.low;
        } else {
-            uint8_t lr5 = scale8To5(metrics.min.r);
-            uint8_t lr6 = scale8To6(metrics.min.r);
+            uint8_t lr5 = scale_from_8<5>(metrics.min.r);
+            uint8_t lr6 = scale_from_8<6>(metrics.min.r);

-            uint8_t hr5 = scale8To5(metrics.max.r);
-            uint8_t hr6 = scale8To6(metrics.max.r);
+            uint8_t hr5 = scale_from_8<5>(metrics.max.r);
+            uint8_t hr6 = scale_from_8<6>(metrics.max.r);

-            result.low = Color(lr5, lr6, lr5);
-            result.high = Color(hr5, hr6, hr5);
+            result.low = OldColor(lr5, lr6, lr5);
+            result.high = OldColor(hr5, hr6, hr5);
        }
    } else if (endpoint_mode == EndpointMode::LeastSquares) {
        //  2D Least Squares approach from Humus's example, with added inset and optimal rounding.
-        Color diff = Color(metrics.max.r - metrics.min.r, metrics.max.g - metrics.min.g, metrics.max.b - metrics.min.b);
+        OldColor diff =
+            OldColor(metrics.max.r - metrics.min.r, metrics.max.g - metrics.min.g, metrics.max.b - metrics.min.b);
        Vector4 l = {0, 0, 0};
        Vector4 h = {0, 0, 0};

        auto &sums = metrics.sums;
        auto &min = metrics.min;
+        auto &max = metrics.max;

        unsigned chan0 = (unsigned)diff.MaxChannelRGB();  // primary axis of the bounding box
        l[chan0] = (float)min[chan0];
-        h[chan0] = (float)min[chan0];
+        h[chan0] = (float)max[chan0];

        assert((diff[chan0] >= diff[(chan0 + 1) % 3]) && (diff[chan0] >= diff[(chan0 + 2) % 3]));

@ -521,7 +541,7 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
        float denominator = (float)(16 * sum_xx) - (float)(sum_x * sum_x);

        // once per secondary axis, calculate high and low using least squares
-        if (fabs(denominator) > 1e-8f) {
+        if (abs(denominator) > 1e-8f) {
            for (unsigned i = 1; i < 3; i++) {
                /* each secondary axis is fitted with a linear formula of the form
                 *  y = ax + b
@ -549,8 +569,8 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
            h[c] = ((h[c] - inset) / 255.0f);
        }

-        result.low = Color::PreciseRound565(l);
-        result.high = Color::PreciseRound565(h);
+        result.low = OldColor::PreciseRound565(l);
+        result.high = OldColor::PreciseRound565(h);
    } else if (endpoint_mode == EndpointMode::BoundingBox) {
        // Algorithm from icbc.h compress_dxt1_fast()
        Vector4 l, h;
@ -577,19 +597,20 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
        if (icov_xz < 0) std::swap(l[0], h[0]);
        if (icov_yz < 0) std::swap(l[1], h[1]);

-        result.low = Color::PreciseRound565(l);
-        result.high = Color::PreciseRound565(h);
+        result.low = OldColor::PreciseRound565(l);
+        result.high = OldColor::PreciseRound565(h);
    } else if (endpoint_mode == EndpointMode::BoundingBoxInt) {
        // Algorithm from icbc.h compress_dxt1_fast(), but converted to integer.
+        // TODO: handle constant blue channel better

-        Color min, max;
+        OldColor min, max;

        // rescale and inset values
        for (unsigned c = 0; c < 3; c++) {
            int inset = ((int)(metrics.max[c] - metrics.min[c]) - 8) >> 4;  // 1/16 of delta, with bias

-            min[c] = clamp255(metrics.min[c] + inset);
-            max[c] = clamp255(metrics.max[c] - inset);
+            min[c] = clamp(metrics.min[c] + inset, 0, 255);
+            max[c] = clamp(metrics.max[c] - inset, 0, 255);
        }

        int icov_xz = 0, icov_yz = 0;
@ -607,19 +628,21 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
    } else if (endpoint_mode == EndpointMode::PCA) {
        // the slow way
        // Select 2 colors along the principle axis. (There must be a faster/simpler way.)
-        auto min = Vector4::FromColorRGB(metrics.min);
-        auto max = Vector4::FromColorRGB(metrics.max);
-        auto avg = Vector4::FromColorRGB(metrics.avg);

-        Vector4 axis = {306, 601, 117};  // Luma vector
-        Matrix4x4 covariance = Matrix4x4::Identity();
+        // TODO: handle constant blue channel better
+
+        Color min = metrics.min;
+        Color max = metrics.max;
+        Color avg = metrics.avg;
+
+        Vec<float, 4> axis = {306, 601, 117, 0};  // Luma vector
+        auto covariance = Matrix<float, 4, 4>::identity();

        for (int i = 0; i < 16; i++) {
            auto val = pixels.Get(i);
            if (ignore_black && val.IsBlack()) continue;

-            auto color_vec = Vector4::FromColorRGB(val);
-            Vector4 diff = color_vec - avg;
+            auto diff = val - avg;
            for (unsigned c1 = 0; c1 < 3; c1++) {
                for (unsigned c2 = c1; c2 < 3; c2++) {
                    covariance[c1][c2] += (diff[c1] * diff[c2]);
@ -629,20 +652,24 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
        }

        covariance /= 255.0f;
-        covariance.Mirror();
+        covariance = covariance.mirror();

-        Vector4 delta = max - min;
+        Vec<float, 4> delta = max - min;

        // realign r and g axes to match
        if (covariance[0][2] < 0) delta[0] = -delta[0];  // r vs b
        if (covariance[1][2] < 0) delta[1] = -delta[1];  // g vs b

-        // using the covariance matrix, stretch the delta vector towards the primary axis of the data using power iteration
-        // the end result of this may actually be the same as the least squares approach, will have to do more research
-        for (unsigned power_iter = 0; power_iter < _power_iterations; power_iter++) { delta = covariance * delta; }
+        // using the covariance matrix, stretch the delta vector towards the primary axis of the data using power
+        // iteration. The end result of this may actually be the same as the least squares approach; will have to do
+        // more research
+        for (unsigned power_iter = 0; power_iter < _power_iterations; power_iter++) {
+            delta = covariance.mult(delta);
+        }

        // if we found any correlation, then this is our new axis. otherwise we fallback to the luma vector
-        float k = delta.MaxAbs(3);
+        auto delta_abs = delta.abs();
+        float k = *std::max_element(delta_abs.begin(), delta_abs.end());
        if (k >= 2) { axis = delta * (2048.0f / k); }

        axis *= 16;
@ -653,13 +680,12 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
        int min_index = 0, max_index = 0;

        for (int i = 0; i < 16; i++) {
-            auto val = pixels.Get(i);
-            if (ignore_black && val.IsBlack()) continue;
+            Color val = pixels.Get(i); //todo: fix this mess
+            if (ignore_black && (val.r() | val.g() | val.b()) < 4) continue;

-            auto color_vec = Vector4::FromColorRGB(val);
            // since axis is constant here, I dont think its magnitude actually matters,
            // since we only care about the min or max dot product
-            float dot = color_vec.Dot(axis);
+            float dot = (Vec<float,4>(val)).dot(axis);
            if (dot > max_dot) {
                max_dot = dot;
                max_index = i;
@ -677,20 +703,21 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
    result.color_mode = ColorMode::Incomplete;
 }

-template <BC1Encoder::ColorMode M> void BC1Encoder::FindSelectors(EncodeResults &result, const CBlock &pixels, ErrorMode error_mode) const {
+template <BC1Encoder::ColorMode M>
+void BC1Encoder::FindSelectors(EncodeResults &result, const CBlock &pixels, ErrorMode error_mode) const {
    assert(!((error_mode != ErrorMode::Full) && (bool)(M & ColorMode::ThreeColor)));

    const int color_count = (unsigned)M & 0x0F;

-    std::array<Color, 4> colors = _interpolator->InterpolateBC1(result.low, result.high, color_count == 3);
+    std::array<OldColor, 4> colors = _interpolator->InterpolateBC1(result.low, result.high, color_count == 3);
    std::array<Vector4Int, 4> color_vectors;

    if (color_count == 4) {
-        color_vectors = {Vector4Int::FromColorRGB(colors[0]), Vector4Int::FromColorRGB(colors[2]), Vector4Int::FromColorRGB(colors[3]),
-                         Vector4Int::FromColorRGB(colors[1])};
+        color_vectors = {Vector4Int::FromColorRGB(colors[0]), Vector4Int::FromColorRGB(colors[2]),
+                         Vector4Int::FromColorRGB(colors[3]), Vector4Int::FromColorRGB(colors[1])};
    } else {
-        color_vectors = {Vector4Int::FromColorRGB(colors[0]), Vector4Int::FromColorRGB(colors[2]), Vector4Int::FromColorRGB(colors[1]),
-                         Vector4Int::FromColorRGB(colors[3])};
+        color_vectors = {Vector4Int::FromColorRGB(colors[0]), Vector4Int::FromColorRGB(colors[2]),
+                         Vector4Int::FromColorRGB(colors[1]), Vector4Int::FromColorRGB(colors[3])};
    }

    unsigned total_error = 0;
@ -714,7 +741,8 @@ template <BC1Encoder::ColorMode M> void BC1Encoder::FindSelectors(EncodeResults
                // llvm is just going to unswitch this anyway, so it's not an issue
                auto diff = pixel_vector - color_vectors[selector];
                total_error += diff.SqrMag();
-                if (i % 4 != 0 && total_error >= result.error) break;  // check only once per row if we're generating too much error
+                if (i % 4 != 0 && total_error >= result.error)
+                    break;  // stop early if we're generating too much error (not checked when i % 4 == 0)
            }

            result.selectors[i] = selector;
@ -727,7 +755,7 @@ template <BC1Encoder::ColorMode M> void BC1Encoder::FindSelectors(EncodeResults
            Vector4Int pixel_vector = Vector4Int::FromColorRGB(pixels.Get(i));
            auto diff = pixel_vector - color_vectors[0];
            float sel_f = (float)diff.Dot(axis) * f + 0.5f;
-            uint8_t sel = (uint8_t)clampi((int)sel_f, 1, 3);
+            uint8_t sel = (uint8_t)clamp<int>((int)sel_f, 1, 3);

            unsigned err0 = (color_vectors[sel - 1] - pixel_vector).SqrMag();
            unsigned err1 = (color_vectors[sel] - pixel_vector).SqrMag();
@ -779,7 +807,8 @@ template <BC1Encoder::ColorMode M> void BC1Encoder::FindSelectors(EncodeResults
    result.color_mode = M;
 }

-template <BC1Encoder::ColorMode M> bool BC1Encoder::RefineEndpointsLS(EncodeResults &result, const CBlock &pixels, BlockMetrics metrics) const {
+template <BC1Encoder::ColorMode M>
+bool BC1Encoder::RefineEndpointsLS(EncodeResults &result, const CBlock &pixels, BlockMetrics metrics) const {
    const int color_count = (unsigned)M & 0x0F;
    static_assert(color_count == 3 || color_count == 4);
    assert(result.color_mode != ColorMode::Incomplete);
@ -790,11 +819,12 @@ template <BC1Encoder::ColorMode M> bool BC1Encoder::RefineEndpointsLS(EncodeResu
    Vector4 matrix = Vector4(0);

    for (int i = 0; i < 16; i++) {
-        const Color color = pixels.Get(i);
+        const OldColor color = pixels.Get(i);
        const uint8_t sel = result.selectors[i];

        if ((bool)(M & ColorMode::ThreeColorBlack) && color.IsBlack()) continue;
-        if ((bool)(M & ColorMode::ThreeColor) && sel == 3U) continue;  // NOTE: selectors for 3-color are in linear order here, but not in original
+        if ((bool)(M & ColorMode::ThreeColor) && sel == 3U)
+            continue;  // NOTE: selectors for 3-color are in linear order here, but not in original
        assert(sel < color_count);

        const Vector4Int color_vector = Vector4Int::FromColorRGB(color);
@ -805,7 +835,7 @@ template <BC1Encoder::ColorMode M> bool BC1Encoder::RefineEndpointsLS(EncodeResu

    // invert matrix
    float det = matrix.Determinant2x2();  // z00 * z11 - z01 * z10;
-    if (fabs(det) < 1e-8f) {
+    if (abs(det) < 1e-8f) {
        result.color_mode = ColorMode::Incomplete;
        return false;
    }
@ -820,12 +850,14 @@ template <BC1Encoder::ColorMode M> bool BC1Encoder::RefineEndpointsLS(EncodeResu
    Vector4 high = (matrix[2] * q00) + (matrix[3] * q10);

    result.color_mode = M;
-    result.low = Color::PreciseRound565(low);
-    result.high = Color::PreciseRound565(high);
+    result.low = OldColor::PreciseRound565(low);
+    result.high = OldColor::PreciseRound565(high);
    return true;
 }

-template <BC1Encoder::ColorMode M> void BC1Encoder::RefineEndpointsLS(EncodeResults &result, std::array<Vector4, 17> &sums, Vector4 &matrix, Hash hash) const {
+template <BC1Encoder::ColorMode M>
+void BC1Encoder::RefineEndpointsLS(EncodeResults &result, std::array<Vector4, 17> &sums, Vector4 &matrix,
+                                   Hash hash) const {
    const int color_count = (unsigned)M & 0x0F;
    static_assert(color_count == 3 || color_count == 4);
    assert(result.color_mode != ColorMode::Incomplete);
@ -846,12 +878,13 @@ template <BC1Encoder::ColorMode M> void BC1Encoder::RefineEndpointsLS(EncodeResu
    Vector4 high = (matrix[2] * q00) + (matrix[3] * q10);

    result.color_mode = M;
-    result.low = Color::PreciseRound565(low);
-    result.high = Color::PreciseRound565(high);
+    result.low = OldColor::PreciseRound565(low);
+    result.high = OldColor::PreciseRound565(high);
 }

 template <BC1Encoder::ColorMode M>
-void BC1Encoder::RefineBlockLS(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode, unsigned passes) const {
+void BC1Encoder::RefineBlockLS(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics,
+                               ErrorMode error_mode, unsigned passes) const {
    assert(error_mode != ErrorMode::None || passes == 1);

    for (unsigned pass = 0; pass < passes; pass++) {
@ -876,7 +909,8 @@ void BC1Encoder::RefineBlockLS(EncodeResults &result, const CBlock &pixels, cons
 }

 template <BC1Encoder::ColorMode M>
-void BC1Encoder::RefineBlockCF(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode, unsigned orderings) const {
+void BC1Encoder::RefineBlockCF(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics,
+                               ErrorMode error_mode, unsigned orderings) const {
    const int color_count = (unsigned)M & 0x0F;
    static_assert(color_count == 3 || color_count == 4);
    assert(result.color_mode != ColorMode::Incomplete);
@ -955,7 +989,8 @@ void BC1Encoder::EndpointSearch(EncodeResults &result, const CBlock &pixels) con

    for (unsigned i = 0; i < _search_rounds; i++) {
        const unsigned voxel_index = (unsigned)(i & 15);
-        assert((unsigned)Voxels[(unsigned)Voxels[voxel_index][3]][3] == voxel_index);  // make sure voxels are symmetrical
+        assert((unsigned)Voxels[(unsigned)Voxels[voxel_index][3]][3] ==
+               voxel_index);  // make sure voxels are symmetrical

        if ((int)(i & 31) == forbidden_direction) continue;

--- a/quicktex/s3tc/bc1/BC1Encoder.h
+++ b/quicktex/s3tc/bc1/BC1Encoder.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -26,13 +26,13 @@
 #include <memory>
 #include <tuple>

-#include "../../Color.h"
-#include "../../ColorBlock.h"
-#include "../../Encoder.h"
-#include "../../Texture.h"
-#include "../interpolator/Interpolator.h"
-#include "BC1Block.h"
-#include "SingleColorTable.h"
+#include "ColorBlock.h"
+#include "Encoder.h"
+#include "OldColor.h"
+#include "s3tc/bc1/BC1Block.h"
+#include "s3tc/bc1/SingleColorTable.h"
+#include "s3tc/interpolator/Interpolator.h"
+#include "texture/BlockTexture.h"

 namespace quicktex {
 class Vector4;
@ -79,7 +79,8 @@ class BC1Encoder final : public BlockEncoder<BlockTexture<BC1Block>> {
    };

    enum class EndpointMode {
-        // Use 2D least squares+inset+optimal rounding (the method used in Humus's GPU texture encoding demo), instead of PCA.
+        // Use 2D least squares+inset+optimal rounding (the method used in Humus's GPU texture encoding demo), instead
+        // of PCA.
        // Around 18% faster; average quality ranges from very slightly lower to better (depends on the content).
        LeastSquares,

@ -101,7 +102,8 @@ class BC1Encoder final : public BlockEncoder<BlockTexture<BC1Block>> {

    BC1Encoder(unsigned level, ColorMode color_mode, InterpolatorPtr interpolator);

-    BC1Encoder(unsigned int level = 5, ColorMode color_mode = ColorMode::FourColor) : BC1Encoder(level, color_mode, std::make_shared<Interpolator>()) {}
+    BC1Encoder(unsigned int level = 5, ColorMode color_mode = ColorMode::FourColor)
+        : BC1Encoder(level, color_mode, std::make_shared<Interpolator>()) {}

    // Getters and Setters
    void SetLevel(unsigned level);
@ -141,8 +143,8 @@ class BC1Encoder final : public BlockEncoder<BlockTexture<BC1Block>> {

    // Unpacked BC1 block with metadata
    struct EncodeResults {
-        Color low;
-        Color high;
+        OldColor low;
+        OldColor high;
        std::array<uint8_t, 16> selectors = {0};
        ColorMode color_mode = ColorMode::Incomplete;
        bool solid = false;
@ -169,24 +171,29 @@ class BC1Encoder final : public BlockEncoder<BlockTexture<BC1Block>> {
    unsigned _orderings4;
    unsigned _orderings3;

-    BC1Block WriteBlockSolid(Color color) const;
+    BC1Block WriteBlockSolid(OldColor color) const;
    BC1Block WriteBlock(EncodeResults &result) const;

-    void FindEndpoints(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, EndpointMode endpoint_mode, bool ignore_black = false) const;
-    void FindEndpointsSingleColor(EncodeResults &result, Color color, bool is_3color = false) const;
-    void FindEndpointsSingleColor(EncodeResults &result, const CBlock &pixels, Color color, bool is_3color) const;
+    void FindEndpoints(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics,
+                       EndpointMode endpoint_mode, bool ignore_black = false) const;
+    void FindEndpointsSingleColor(EncodeResults &result, OldColor color, bool is_3color = false) const;
+    void FindEndpointsSingleColor(EncodeResults &result, const CBlock &pixels, OldColor color, bool is_3color) const;

    template <ColorMode M> void FindSelectors(EncodeResults &result, const CBlock &pixels, ErrorMode error_mode) const;

-    template <ColorMode M> bool RefineEndpointsLS(EncodeResults &result, const CBlock &pixels, BlockMetrics metrics) const;
-
-    template <ColorMode M> void RefineEndpointsLS(EncodeResults &result, std::array<Vector4, 17> &sums, Vector4 &matrix, Hash hash) const;
+    template <ColorMode M>
+    bool RefineEndpointsLS(EncodeResults &result, const CBlock &pixels, BlockMetrics metrics) const;

    template <ColorMode M>
-    void RefineBlockLS(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode, unsigned passes) const;
+    void RefineEndpointsLS(EncodeResults &result, std::array<Vector4, 17> &sums, Vector4 &matrix, Hash hash) const;

    template <ColorMode M>
-    void RefineBlockCF(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode, unsigned orderings) const;
+    void RefineBlockLS(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode,
+                       unsigned passes) const;
+
+    template <ColorMode M>
+    void RefineBlockCF(EncodeResults &result, const CBlock &pixels, const BlockMetrics &metrics, ErrorMode error_mode,
+                       unsigned orderings) const;

    void EndpointSearch(EncodeResults &result, const CBlock &pixels) const;
 };
--- a/quicktex/s3tc/bc1/Histogram.h
+++ b/quicktex/s3tc/bc1/Histogram.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -27,10 +27,10 @@
 #include <mutex>
 #include <numeric>

-#include "../../Vector4.h"
-#include "../../util.h"
+#include "Vector4.h"
+#include "util/math.h"

-namespace quicktex::s3tc  {
+namespace quicktex::s3tc {
 template <size_t N> class Histogram {
   public:
    using Hash = uint16_t;
@ -71,7 +71,7 @@ template <size_t N> class Histogram {
    unsigned GetPacked() const {
        Hash packed = 0;

-        for (unsigned i = 0; i < (N-1); i++) {
+        for (unsigned i = 0; i < (N - 1); i++) {
            assert(_bins[i] <= (1U << 4) - 1U);
            packed |= static_cast<uint16_t>(_bins[i]) << (i * 4U);
        }
--- a/quicktex/s3tc/bc1/OrderTable.cpp
+++ b/quicktex/s3tc/bc1/OrderTable.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -21,7 +21,7 @@

 #include <array>

-#include "../../Vector4.h"
+#include "Vector4.h"

 namespace quicktex::s3tc  {
 using Hash = uint16_t;
--- a/quicktex/s3tc/bc1/OrderTable.h
+++ b/quicktex/s3tc/bc1/OrderTable.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -29,8 +29,9 @@
 #include <mutex>
 #include <type_traits>

-#include "../../Vector4.h"
 #include "Histogram.h"
+#include "Vector4.h"
+#include "util/math.h"

 namespace quicktex::s3tc {
 template <size_t N> class OrderTable {
@ -73,7 +74,7 @@ template <size_t N> class OrderTable {
                for (unsigned sel = 0; sel < N; sel++) factor_matrix += (Weights[sel] * h[sel]);

                float det = factor_matrix.Determinant2x2();
-                if (fabs(det) < 1e-8f) {
+                if (abs(det) < 1e-8f) {
                    factors->at(i) = Vector4(0);
                } else {
                    std::swap(factor_matrix[0], factor_matrix[3]);
@ -113,7 +114,9 @@ template <size_t N> class OrderTable {
        return factors->at(hash);
    }

-    static bool IsSingleColor(Hash hash) { return (std::find(SingleColorHashes.begin(), SingleColorHashes.end(), hash) != SingleColorHashes.end()); }
+    static bool IsSingleColor(Hash hash) {
+        return (std::find(SingleColorHashes.begin(), SingleColorHashes.end(), hash) != SingleColorHashes.end());
+    }

   private:
    static std::mutex table_mutex;
--- a/quicktex/s3tc/bc1/OrderTable4.cpp
+++ b/quicktex/s3tc/bc1/OrderTable4.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

--- a/quicktex/s3tc/bc1/SingleColorTable.h
+++ b/quicktex/s3tc/bc1/SingleColorTable.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -23,10 +23,11 @@
 #include <cstdint>
 #include <memory>

-#include "../../util.h"
-#include "../interpolator/Interpolator.h"
+#include "s3tc/interpolator/Interpolator.h"
+#include "util/bitbash.h"
+#include "util/math.h"

-namespace quicktex::s3tc  {
+namespace quicktex::s3tc {

 struct BC1MatchEntry {
    uint8_t high;
@ -59,10 +60,10 @@ template <size_t B, size_t N> MatchListPtr SingleColorTable(InterpolatorPtr inte
        // TODO: Can probably avoid testing for values that definitely won't yield good results,
        // e.g. low8 and high8 both much smaller or larger than index
        for (uint8_t low = 0; low < Size; low++) {
-            uint8_t low8 = (B == 5) ? scale5To8(low) : scale6To8(low);
+            uint8_t low8 = scale_to_8<B>(low);

            for (uint8_t high = 0; high < Size; high++) {
-                uint8_t high8 = (B == 5) ? scale5To8(high) : scale6To8(high);
+                uint8_t high8 = scale_to_8<B>(high);
                uint8_t value;

                if (use_8bit) {
@ -71,10 +72,10 @@ template <size_t B, size_t N> MatchListPtr SingleColorTable(InterpolatorPtr inte
                    value = (B == 5) ? interpolator->Interpolate5(high, low) : interpolator->Interpolate6(high, low);
                }

-                unsigned new_error = iabs(value - (int)i);
+                unsigned new_error = abs(value - (int)i);

                // We only need to factor in 3% error in BC1 ideal mode.
-                if (ideal) new_error += (iabs(high8 - (int)low8) * 3) / 100;
+                if (ideal) new_error += (abs(high8 - (int)low8) * 3) / 100;

                if ((new_error < error) || (new_error == error && low == high)) {
                    assert(new_error <= UINT8_MAX);
--- a/quicktex/s3tc/bc1/_bindings.cpp
+++ b/quicktex/s3tc/bc1/_bindings.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -23,16 +23,12 @@
 #include <pybind11/stl.h>

 #include <array>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <string>
+#include <memory>

-#include "../../Decoder.h"
-#include "../../Encoder.h"
-#include "../interpolator/Interpolator.h"
-#include "BC1Decoder.h"
-#include "BC1Encoder.h"
+#include "s3tc/bc1/BC1Block.h"
+#include "s3tc/bc1/BC1Decoder.h"
+#include "s3tc/bc1/BC1Encoder.h"
+#include "s3tc/interpolator/Interpolator.h"

 namespace py = pybind11;
 namespace quicktex::bindings {
@ -50,7 +46,7 @@ void InitBC1(py::module_ &s3tc) {
    bc1_block.doc() = "A single BC1 block.";

    bc1_block.def(py::init<>());
-    bc1_block.def(py::init<Color, Color, BC1Block::SelectorArray>(), "color0"_a, "color1"_a, "selectors"_a, R"doc(
+    bc1_block.def(py::init<OldColor, OldColor, BC1Block::SelectorArray>(), "color0"_a, "color1"_a, "selectors"_a, R"doc(
        Create a new BC1Block with the specified endpoints and selectors

        :param color0: The first endpoint
@ -58,7 +54,8 @@ void InitBC1(py::module_ &s3tc) {
        :param selectors: the selectors as a 4x4 list of integers, between 0 and 3 inclusive.
    )doc");

-    bc1_block.def_property("endpoints", &BC1Block::GetColors, &BC1Block::SetColors, "The block's endpoint colors as a 2-tuple.");
+    bc1_block.def_property("endpoints", &BC1Block::GetColors, &BC1Block::SetColors,
+                           "The block's endpoint colors as a 2-tuple.");
    bc1_block.def_property("selectors", &BC1Block::GetSelectors, &BC1Block::SetSelectors, R"doc(
        The block's selectors as a 4x4 list of integers between 0 and 3 inclusive.

@ -83,27 +80,42 @@ void InitBC1(py::module_ &s3tc) {
    // region BC1Encoder
    py::class_<BC1Encoder> bc1_encoder(bc1, "BC1Encoder", "Encodes RGB textures to BC1.");

-    py::enum_<BC1Encoder::EndpointMode>(bc1_encoder, "EndpointMode", "Enum representing various methods of finding endpoints in a block.")
-        .value("LeastSquares", BC1Encoder::EndpointMode::LeastSquares, "Find endpoints using a 2D least squares approach.")
-        .value("BoundingBox", BC1Encoder::EndpointMode::BoundingBox, "Find endpoints using a simple bounding box. Fast but inaccurate.")
-        .value("BoundingBoxInt", BC1Encoder::EndpointMode::BoundingBoxInt, "Same as BoundingBox but using integers, slightly faster.")
-        .value("PCA", BC1Encoder::EndpointMode::PCA, "Find endpoints using Principle Component Analysis. Slowest but highest quality method.");
+    py::enum_<BC1Encoder::EndpointMode>(bc1_encoder, "EndpointMode",
+                                        "Enum representing various methods of finding endpoints in a block.")
+        .value("LeastSquares", BC1Encoder::EndpointMode::LeastSquares,
+               "Find endpoints using a 2D least squares approach.")
+        .value("BoundingBox", BC1Encoder::EndpointMode::BoundingBox,
+               "Find endpoints using a simple bounding box. Fast but inaccurate.")
+        .value("BoundingBoxInt", BC1Encoder::EndpointMode::BoundingBoxInt,
+               "Same as BoundingBox but using integers, slightly faster.")
+        .value("PCA", BC1Encoder::EndpointMode::PCA,
+               "Find endpoints using Principle Component Analysis. Slowest but highest quality method.");

-    py::enum_<BC1Encoder::ErrorMode>(bc1_encoder, "ErrorMode", "Enum representing various methods of finding selectors in a block.")
-        .value("None", BC1Encoder::ErrorMode::None, "The same as Faster but error is not calculated. This disables any cluster-fit options")
-        .value("Faster", BC1Encoder::ErrorMode::Faster, "Use a slightly lower quality, but ~30% faster MSE evaluation function for 4-color blocks.")
+    py::enum_<BC1Encoder::ErrorMode>(bc1_encoder, "ErrorMode",
+                                     "Enum representing various methods of finding selectors in a block.")
+        .value("None", BC1Encoder::ErrorMode::None,
+               "The same as Faster but error is not calculated. This disables any cluster-fit options")
+        .value("Faster", BC1Encoder::ErrorMode::Faster,
+               "Use a slightly lower quality, but ~30% faster MSE evaluation function for 4-color blocks.")
        .value("Check2", BC1Encoder::ErrorMode::Check2, "Default error-checking method.")
-        .value("Full", BC1Encoder::ErrorMode::Full, "Examine all colors to compute selectors/MSE. Slower but slightly higher quality.");
+        .value("Full", BC1Encoder::ErrorMode::Full,
+               "Examine all colors to compute selectors/MSE. Slower but slightly higher quality.");

-    py::enum_<BC1Encoder::ColorMode>(bc1_encoder, "ColorMode", "Enum representing various methods of writing BC1 blocks.")
-        .value("FourColor", BC1Encoder::ColorMode::FourColor, "Default color mode. Only 4-color blocks will be output, where color0 > color1")
-        .value("ThreeColor", BC1Encoder::ColorMode::ThreeColor, "Additionally use 3-color blocks when they have a lower error, where color0 <= color1")
+    py::enum_<BC1Encoder::ColorMode>(bc1_encoder, "ColorMode",
+                                     "Enum representing various methods of writing BC1 blocks.")
+        .value("FourColor", BC1Encoder::ColorMode::FourColor,
+               "Default color mode. Only 4-color blocks will be output, where color0 > color1")
+        .value("ThreeColor", BC1Encoder::ColorMode::ThreeColor,
+               "Additionally use 3-color blocks when they have a lower error, where color0 <= color1")
        .value("ThreeColorBlack", BC1Encoder::ColorMode::ThreeColorBlack,
-               "Additionally use 3-color blocks with black pixels (selector 3). Note that this requires your shader/engine to not sample the alpha channel "
+               "Additionally use 3-color blocks with black pixels (selector 3). Note that this requires your "
+               "shader/engine to not sample the alpha channel "
               "when using a BC1 texture.");

-    bc1_encoder.def(py::init<unsigned, BC1Encoder::ColorMode>(), "level"_a = 5, "color_mode"_a = BC1Encoder::ColorMode::FourColor);
-    bc1_encoder.def(py::init<unsigned, BC1Encoder::ColorMode, InterpolatorPtr>(), "level"_a, "color_mode"_a, "interpolator"_a, R"doc(
+    bc1_encoder.def(py::init<unsigned, BC1Encoder::ColorMode>(), "level"_a = 5,
+                    "color_mode"_a = BC1Encoder::ColorMode::FourColor);
+    bc1_encoder.def(py::init<unsigned, BC1Encoder::ColorMode, InterpolatorPtr>(), "level"_a, "color_mode"_a,
+                    "interpolator"_a, R"doc(
        Create a new BC1 encoder with the specified preset level, color mode, and interpolator.

        :param int level: The preset level of the resulting encoder, between 0 and 18 inclusive. See :py:meth:`set_level` for more information. Default: 5.
@ -125,44 +137,56 @@ void InitBC1(py::module_ &s3tc) {
        :param int level: The preset level of the resulting encoder, between 0 and 18 inclusive. Default: 5.
    )doc");

-    bc1_encoder.def_property_readonly("interpolator", &BC1Encoder::GetInterpolator,
-                                      "The :py:class:`~quicktex.s3tc.interpolator.Interpolator` used by this encoder. This is a readonly property.");
-    bc1_encoder.def_property_readonly("color_mode", &BC1Encoder::GetColorMode,
-                                      "The :py:class:`~quicktex.s3tc.bc1.BC1Encoder.ColorMode` used by this encoder. This is a readonly property.");
+    bc1_encoder.def_property_readonly(
+        "interpolator", &BC1Encoder::GetInterpolator,
+        "The :py:class:`~quicktex.s3tc.interpolator.Interpolator` used by this encoder. This is a readonly property.");
+    bc1_encoder.def_property_readonly(
+        "color_mode", &BC1Encoder::GetColorMode,
+        "The :py:class:`~quicktex.s3tc.bc1.BC1Encoder.ColorMode` used by this encoder. This is a readonly property.");

    // Advanced API

-    bc1_encoder.def_property("error_mode", &BC1Encoder::GetErrorMode, &BC1Encoder::SetErrorMode, "The error mode used by this encoder for finding selectors.");
-    bc1_encoder.def_property("endpoint_mode", &BC1Encoder::GetEndpointMode, &BC1Encoder::SetEndpointMode, "The endpoint mode used by this encoder.");
+    bc1_encoder.def_property("error_mode", &BC1Encoder::GetErrorMode, &BC1Encoder::SetErrorMode,
+                             "The error mode used by this encoder for finding selectors.");
+    bc1_encoder.def_property("endpoint_mode", &BC1Encoder::GetEndpointMode, &BC1Encoder::SetEndpointMode,
+                             "The endpoint mode used by this encoder.");

    bc1_encoder.def_readwrite("two_ls_passes", &BC1Encoder::two_ls_passes,
                              "Use 2 least squares pass, instead of one (same as stb_dxt's HIGHQUAL option).\n"
                              "Recommended if you're setting the orderings settings greater than 0.");

-    bc1_encoder.def_readwrite("two_ep_passes", &BC1Encoder::two_ep_passes, "Try 2 different ways of choosing the initial endpoints.");
+    bc1_encoder.def_readwrite("two_ep_passes", &BC1Encoder::two_ep_passes,
+                              "Try 2 different ways of choosing the initial endpoints.");

-    bc1_encoder.def_readwrite("two_cf_passes", &BC1Encoder::two_cf_passes,
-                              "Greatly increase encode time, with very slightly higher quality.\n"
-                              "Same as squish's iterative cluster fit option. Not really worth the tiny boost in quality, "
-                              "unless you just don't care about performance at all.");
+    bc1_encoder.def_readwrite(
+        "two_cf_passes", &BC1Encoder::two_cf_passes,
+        "Greatly increase encode time, with very slightly higher quality.\n"
+        "Same as squish's iterative cluster fit option. Not really worth the tiny boost in quality, "
+        "unless you just don't care about performance at all.");

-    bc1_encoder.def_readwrite("exhaustive", &BC1Encoder::exhaustive,
-                              "Check all total orderings - *very* slow. The encoder is not designed to be used in this way");
+    bc1_encoder.def_readwrite(
+        "exhaustive", &BC1Encoder::exhaustive,
+        "Check all total orderings - *very* slow. The encoder is not designed to be used in this way");

    bc1_encoder.def_property("search_rounds", &BC1Encoder::GetSearchRounds, &BC1Encoder::SetSearchRounds,
-                             "Setting search rounds > 0 enables refining the final endpoints by examining nearby colors. A higher value has a higher quality "
+                             "Setting search rounds > 0 enables refining the final endpoints by examining nearby "
+                             "colors. A higher value has a higher quality "
                             "at the expense of performance.");

-    bc1_encoder.def_property("orderings", &BC1Encoder::GetOrderings, &BC1Encoder::SetOrderings,
-                             "setting the orderings > 0 enables ordered cluster fit using a lookup table of similar blocks. Value is a tuple of (4 color "
-                             "orders, 3 color orders), where higher values have a higher quality at the expense of performance.");
+    bc1_encoder.def_property(
+        "orderings", &BC1Encoder::GetOrderings, &BC1Encoder::SetOrderings,
+        "setting the orderings > 0 enables ordered cluster fit using a lookup table of similar blocks. Value is a "
+        "tuple of (4 color "
+        "orders, 3 color orders), where higher values have a higher quality at the expense of performance.");

    bc1_encoder.def_readonly_static("max_power_iterations", &BC1Encoder::max_power_iterations);
    bc1_encoder.def_readonly_static("min_power_iterations", &BC1Encoder::min_power_iterations);

-    bc1_encoder.def_property("power_iterations", &BC1Encoder::GetPowerIterations, &BC1Encoder::SetPowerIterations,
-                             "Number of power iterations used with the PCA endpoint mode. Value should be around 4 to 6. "
-                             "Automatically clamped to between :py:const:`BC1Encoder.min_power_iterations` and :py:const:`BC1Encoder.max_power_iterations`");
+    bc1_encoder.def_property(
+        "power_iterations", &BC1Encoder::GetPowerIterations, &BC1Encoder::SetPowerIterations,
+        "Number of power iterations used with the PCA endpoint mode. Value should be around 4 to 6. "
+        "Automatically clamped to between :py:const:`BC1Encoder.min_power_iterations` and "
+        ":py:const:`BC1Encoder.max_power_iterations`");
    // endregion

    // region BC1Decoder
@ -185,8 +209,10 @@ void InitBC1(py::module_ &s3tc) {
        :returns: A new RawTexture with the same dimensions as the input
    )doc");

-    bc1_decoder.def_property_readonly("interpolator", &BC1Decoder::GetInterpolator, "The interpolator used by this decoder. This is a readonly property.");
-    bc1_decoder.def_readwrite("write_alpha", &BC1Decoder::write_alpha, "Determines if the alpha channel of the output is written to.");
+    bc1_decoder.def_property_readonly("interpolator", &BC1Decoder::GetInterpolator,
+                                      "The interpolator used by this decoder. This is a readonly property.");
+    bc1_decoder.def_readwrite("write_alpha", &BC1Decoder::write_alpha,
+                              "Determines if the alpha channel of the output is written to.");
    // endregion
 }
 }  // namespace quicktex::bindings
--- a/quicktex/s3tc/bc3/BC3Block.h
+++ b/quicktex/s3tc/bc3/BC3Block.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -21,8 +21,8 @@

 #include <utility>

-#include "../bc1/BC1Block.h"
-#include "../bc4/BC4Block.h"
+#include "s3tc/bc1/BC1Block.h"
+#include "s3tc/bc4/BC4Block.h"

 namespace quicktex::s3tc {

@ -54,7 +54,9 @@ class alignas(8) BC3Block {
        color_block = blocks.second;
    }

-    bool operator==(const BC3Block &Rhs) const { return alpha_block == Rhs.alpha_block && color_block == Rhs.color_block; }
+    bool operator==(const BC3Block &Rhs) const {
+        return alpha_block == Rhs.alpha_block && color_block == Rhs.color_block;
+    }
    bool operator!=(const BC3Block &Rhs) const { return !(Rhs == *this); }
 };
 }  // namespace quicktex::s3tc
--- a/quicktex/s3tc/bc3/BC3Decoder.cpp
+++ b/quicktex/s3tc/bc3/BC3Decoder.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

--- a/quicktex/s3tc/bc3/BC3Decoder.h
+++ b/quicktex/s3tc/bc3/BC3Decoder.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -21,13 +21,13 @@

 #include <memory>

-#include "../../ColorBlock.h"
-#include "../../Decoder.h"
-#include "../../Texture.h"
-#include "../bc1/BC1Decoder.h"
-#include "../bc4/BC4Decoder.h"
-#include "../interpolator/Interpolator.h"
-#include "BC3Block.h"
+#include "ColorBlock.h"
+#include "Decoder.h"
+#include "s3tc/bc1/BC1Decoder.h"
+#include "s3tc/bc3/BC3Block.h"
+#include "s3tc/bc4/BC4Decoder.h"
+#include "s3tc/interpolator/Interpolator.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {

@ -37,7 +37,8 @@ class BC3Decoder : public BlockDecoder<BlockTexture<BC3Block>> {
    using BC4DecoderPtr = std::shared_ptr<BC4Decoder>;
    using InterpolatorPtr = std::shared_ptr<Interpolator>;

-    BC3Decoder(InterpolatorPtr interpolator) : _bc1_decoder(std::make_shared<BC1Decoder>(interpolator)), _bc4_decoder(std::make_shared<BC4Decoder>(3)) {}
+    BC3Decoder(InterpolatorPtr interpolator)
+        : _bc1_decoder(std::make_shared<BC1Decoder>(interpolator)), _bc4_decoder(std::make_shared<BC4Decoder>(3)) {}

    BC3Decoder() : BC3Decoder(std::make_shared<Interpolator>()) {}

--- a/quicktex/s3tc/bc3/BC3Encoder.cpp
+++ b/quicktex/s3tc/bc3/BC3Encoder.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -19,10 +19,8 @@

 #include "BC3Encoder.h"

-#include "../../ColorBlock.h"
-#include "../bc1/BC1Block.h"
-#include "../bc4/BC4Block.h"
-#include "BC3Block.h"
+#include "ColorBlock.h"
+#include "s3tc/bc3/BC3Block.h"

 namespace quicktex::s3tc {
 BC3Block BC3Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
--- a/quicktex/s3tc/bc3/BC3Encoder.h
+++ b/quicktex/s3tc/bc3/BC3Encoder.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -21,13 +21,13 @@

 #include <memory>

-#include "../../ColorBlock.h"
-#include "../../Encoder.h"
-#include "../../Texture.h"
-#include "../bc1/BC1Encoder.h"
-#include "../bc4/BC4Encoder.h"
-#include "../interpolator/Interpolator.h"
-#include "BC3Block.h"
+#include "ColorBlock.h"
+#include "Encoder.h"
+#include "s3tc/bc1/BC1Encoder.h"
+#include "s3tc/bc3/BC3Block.h"
+#include "s3tc/bc4/BC4Encoder.h"
+#include "s3tc/interpolator/Interpolator.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {

@ -38,7 +38,8 @@ class BC3Encoder : public BlockEncoder<BlockTexture<BC3Block>> {
    using InterpolatorPtr = std::shared_ptr<Interpolator>;

    BC3Encoder(unsigned level, InterpolatorPtr interpolator)
-        : _bc1_encoder(std::make_shared<BC1Encoder>(level, BC1Encoder::ColorMode::FourColor, interpolator)), _bc4_encoder(std::make_shared<BC4Encoder>(3)) {}
+        : _bc1_encoder(std::make_shared<BC1Encoder>(level, BC1Encoder::ColorMode::FourColor, interpolator)),
+          _bc4_encoder(std::make_shared<BC4Encoder>(3)) {}

    BC3Encoder(unsigned level = 5) : BC3Encoder(level, std::make_shared<Interpolator>()) {}

--- a/quicktex/s3tc/bc3/_bindings.cpp
+++ b/quicktex/s3tc/bc3/_bindings.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -22,16 +22,14 @@
 #include <pybind11/pybind11.h>

 #include <array>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <string>
+#include <memory>

-#include "../../Decoder.h"
-#include "../../Encoder.h"
-#include "../interpolator/Interpolator.h"
-#include "BC3Decoder.h"
-#include "BC3Encoder.h"
+#include "s3tc/bc1/BC1Block.h"
+#include "s3tc/bc3/BC3Block.h"
+#include "s3tc/bc3/BC3Decoder.h"
+#include "s3tc/bc3/BC3Encoder.h"
+#include "s3tc/bc4/BC4Block.h"
+#include "s3tc/interpolator/Interpolator.h"

 namespace py = pybind11;
 namespace quicktex::bindings {
@ -59,7 +57,8 @@ void InitBC3(py::module_ &s3tc) {

    bc3_block.def_readwrite("alpha_block", &BC3Block::alpha_block, "The BC4 block used for alpha data.");
    bc3_block.def_readwrite("color_block", &BC3Block::color_block, "The BC1 block used for rgb data.");
-    bc3_block.def_property("blocks", &BC3Block::GetBlocks, &BC3Block::SetBlocks, "The BC4 and BC1 blocks that make up this block as a 2-tuple.");
+    bc3_block.def_property("blocks", &BC3Block::GetBlocks, &BC3Block::SetBlocks,
+                           "The BC4 and BC1 blocks that make up this block as a 2-tuple.");
    // endregion

    // region BC3Texture
@ -88,10 +87,12 @@ void InitBC3(py::module_ &s3tc) {
        :returns: A new BC3Texture with the same dimension as the input.
    )doc");

-    bc3_encoder.def_property_readonly("bc1_encoder", &BC3Encoder::GetBC1Encoder,
-                                      "Internal :py:class:`~quicktex.s3tc.bc1.BC1Encoder` used for RGB data. Readonly.");
-    bc3_encoder.def_property_readonly("bc4_encoder", &BC3Encoder::GetBC4Encoder,
-                                      "Internal :py:class:`~quicktex.s3tc.bc4.BC4Encoder` used for alpha data. Readonly.");
+    bc3_encoder.def_property_readonly(
+        "bc1_encoder", &BC3Encoder::GetBC1Encoder,
+        "Internal :py:class:`~quicktex.s3tc.bc1.BC1Encoder` used for RGB data. Readonly.");
+    bc3_encoder.def_property_readonly(
+        "bc4_encoder", &BC3Encoder::GetBC4Encoder,
+        "Internal :py:class:`~quicktex.s3tc.bc4.BC4Encoder` used for alpha data. Readonly.");
    // endregion

    // region BC3Decoder
@ -113,10 +114,12 @@ void InitBC3(py::module_ &s3tc) {
        :returns: A new RawTexture with the same dimensions as the input
    )doc");

-    bc3_decoder.def_property_readonly("bc1_decoder", &BC3Decoder::GetBC1Decoder,
-                                      "Internal :py:class:`~quicktex.s3tc.bc1.BC1Decoder` used for RGB data. Readonly.");
-    bc3_decoder.def_property_readonly("bc4_decoder", &BC3Decoder::GetBC4Decoder,
-                                      "Internal :py:class:`~quicktex.s3tc.bc4.BC4Decoder` used for alpha data. Readonly.");
+    bc3_decoder.def_property_readonly(
+        "bc1_decoder", &BC3Decoder::GetBC1Decoder,
+        "Internal :py:class:`~quicktex.s3tc.bc1.BC1Decoder` used for RGB data. Readonly.");
+    bc3_decoder.def_property_readonly(
+        "bc4_decoder", &BC3Decoder::GetBC4Decoder,
+        "Internal :py:class:`~quicktex.s3tc.bc4.BC4Decoder` used for alpha data. Readonly.");
    // endregion
 }
 }  // namespace quicktex::bindings
--- a/quicktex/s3tc/bc4/BC4Block.cpp
+++ b/quicktex/s3tc/bc4/BC4Block.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -22,14 +22,17 @@
 #include <algorithm>
 #include <stdexcept>

-#include "../../util.h"
+#include "util/bitbash.h"
+#include "util/map.h"
+#include "util/math.h"
+#include "util/ranges.h"

 namespace quicktex::s3tc {

 BC4Block::SelectorArray BC4Block::GetSelectors() const {
-    auto packed = Pack<uint8_t, uint64_t, 8, SelectorSize>(_selectors);
-    auto rows = Unpack<uint64_t, uint16_t, SelectorBits * Width, Height>(packed);
-    return MapArray(rows, Unpack<uint16_t, uint8_t, SelectorBits, Width>);
+    auto packed = pack<uint64_t>(_selectors, 8);
+    auto rows = unpack<uint16_t, Height>(packed, SelectorBits * Width);
+    return map([](auto row) { return unpack<uint8_t, Width>(row, SelectorBits); }, rows);
 }

 void BC4Block::SetSelectors(const BC4Block::SelectorArray& unpacked) {
@ -37,9 +40,9 @@ void BC4Block::SetSelectors(const BC4Block::SelectorArray& unpacked) {
        if (std::any_of(unpacked[y].begin(), unpacked[y].end(), [](uint8_t i) { return i > SelectorMax; }))
            throw std::invalid_argument("Selector value out of bounds.");
    }
-    auto rows = MapArray(unpacked, Pack<uint8_t, uint16_t, SelectorBits, Width>);
-    auto packed = Pack<uint16_t, uint64_t, SelectorBits * Width, Height>(rows);
-    _selectors = Unpack<uint64_t, uint8_t, 8, SelectorSize>(packed);
+    auto rows = map([](auto r) { return pack<uint16_t>(r, SelectorBits); }, unpacked);
+    auto packed = pack<uint64_t>(rows, SelectorBits * Width);
+    _selectors = unpack<uint8_t, SelectorSize>(packed, 8);
 }

 std::array<uint8_t, 8> BC4Block::GetValues6() const {
@ -64,6 +67,8 @@ std::array<uint8_t, 8> BC4Block::GetValues8() const {
            static_cast<uint8_t>((alpha0 + alpha1 * 6) / 7)};
 }

-bool BC4Block::operator==(const BC4Block& Rhs) const { return alpha0 == Rhs.alpha0 && alpha1 == Rhs.alpha1 && _selectors == Rhs._selectors; }
+bool BC4Block::operator==(const BC4Block& Rhs) const {
+    return alpha0 == Rhs.alpha0 && alpha1 == Rhs.alpha1 && _selectors == Rhs._selectors;
+}
 bool BC4Block::operator!=(const BC4Block& Rhs) const { return !(Rhs == *this); }
 }  // namespace quicktex::s3tc
--- a/quicktex/s3tc/bc4/BC4Block.h
+++ b/quicktex/s3tc/bc4/BC4Block.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

--- a/quicktex/s3tc/bc4/BC4Decoder.cpp
+++ b/quicktex/s3tc/bc4/BC4Decoder.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -22,8 +22,8 @@
 #include <array>    // for array
 #include <cassert>  // for assert

-#include "../../Color.h"
 #include "../../ColorBlock.h"
+#include "../../OldColor.h"
 #include "BC4Block.h"

 namespace quicktex::s3tc {
--- a/quicktex/s3tc/bc4/BC4Decoder.h
+++ b/quicktex/s3tc/bc4/BC4Decoder.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -22,10 +22,10 @@
 #include <cstdint>
 #include <stdexcept>

-#include "../../ColorBlock.h"
-#include "../../Decoder.h"
-#include "../../Texture.h"
-#include "BC4Block.h"
+#include "ColorBlock.h"
+#include "Decoder.h"
+#include "s3tc/bc4/BC4Block.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {

--- a/quicktex/s3tc/bc4/BC4Encoder.cpp
+++ b/quicktex/s3tc/bc4/BC4Encoder.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -23,8 +23,8 @@
 #include <array>
 #include <cstdint>

-#include "../../Color.h"
 #include "../../ColorBlock.h"
+#include "../../OldColor.h"
 #include "BC4Block.h"

 namespace quicktex::s3tc {
--- a/quicktex/s3tc/bc4/BC4Encoder.h
+++ b/quicktex/s3tc/bc4/BC4Encoder.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -22,10 +22,10 @@
 #include <cstdint>
 #include <stdexcept>

-#include "../../ColorBlock.h"
-#include "../../Encoder.h"
-#include "../../Texture.h"
-#include "BC4Block.h"
+#include "ColorBlock.h"
+#include "Encoder.h"
+#include "s3tc/bc4/BC4Block.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {

--- a/quicktex/s3tc/bc4/_bindings.cpp
+++ b/quicktex/s3tc/bc4/_bindings.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -23,15 +23,11 @@
 #include <pybind11/stl.h>

 #include <array>
-#include <cstddef>
 #include <cstdint>
-#include <stdexcept>
-#include <string>

-#include "../../Decoder.h"
-#include "../../Encoder.h"
-#include "BC4Decoder.h"
-#include "BC4Encoder.h"
+#include "s3tc/bc4/BC4Block.h"
+#include "s3tc/bc4/BC4Decoder.h"
+#include "s3tc/bc4/BC4Encoder.h"

 namespace py = pybind11;
 namespace quicktex::bindings {
@ -46,7 +42,8 @@ void InitBC4(py::module_ &s3tc) {
    bc4_block.doc() = "A single BC4 block.";

    bc4_block.def(py::init<>());
-    bc4_block.def(py::init<uint8_t, uint8_t, BC4Block::SelectorArray>(), "endpoint0"_a, "endpoint1"_a, "selectors"_a, R"doc(
+    bc4_block.def(py::init<uint8_t, uint8_t, BC4Block::SelectorArray>(), "endpoint0"_a, "endpoint1"_a, "selectors"_a,
+                  R"doc(
        Create a new BC4Block with the specified endpoints and selectors.

        :param int endpoint0: The first endpoint.
@ -54,7 +51,8 @@ void InitBC4(py::module_ &s3tc) {
        :param selectors: the selectors as a 4x4 list of integers, between 0 and 7 inclusive.
    )doc");

-    bc4_block.def_property("endpoints", &BC4Block::GetAlphas, &BC4Block::SetAlphas, "The block's endpoint values as a 2-tuple.");
+    bc4_block.def_property("endpoints", &BC4Block::GetAlphas, &BC4Block::SetAlphas,
+                           "The block's endpoint values as a 2-tuple.");
    bc4_block.def_property("selectors", &BC4Block::GetSelectors, &BC4Block::SetSelectors, R"doc(
        The block's selectors as a 4x4 list of integers between 0 and 7 inclusive.

@ -96,8 +94,9 @@ void InitBC4(py::module_ &s3tc) {
        :param RawTexture texture: Input texture to encode.
        :returns: A new BC4Texture with the same dimension as the input.
    )doc");
-    
-    bc4_encoder.def_property_readonly("channel", &BC4Encoder::GetChannel, "The channel that will be read from. 0 to 3 inclusive. Readonly.");
+
+    bc4_encoder.def_property_readonly("channel", &BC4Encoder::GetChannel,
+                                      "The channel that will be read from. 0 to 3 inclusive. Readonly.");
    // endregion

    // region BC4Decoder
@ -117,8 +116,9 @@ void InitBC4(py::module_ &s3tc) {
        :param RawTexture texture: Input texture to encode.
        :returns: A new RawTexture with the same dimensions as the input
    )doc");
-    
-    bc4_decoder.def_property_readonly("channel", &BC4Decoder::GetChannel, "The channel that will be written to. 0 to 3 inclusive. Readonly.");
+
+    bc4_decoder.def_property_readonly("channel", &BC4Decoder::GetChannel,
+                                      "The channel that will be written to. 0 to 3 inclusive. Readonly.");
    // endregion
 }

--- a/quicktex/s3tc/bc5/BC5Block.h
+++ b/quicktex/s3tc/bc5/BC5Block.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -19,9 +19,7 @@

 #pragma once

-#include <utility>
-
-#include "../bc4/BC4Block.h"
+#include "s3tc/bc4/BC4Block.h"

 namespace quicktex::s3tc {

@ -53,7 +51,9 @@ class alignas(8) BC5Block {
        chan1_block = pair.second;
    }

-    bool operator==(const BC5Block &Rhs) const { return chan0_block == Rhs.chan0_block && chan1_block == Rhs.chan1_block; }
+    bool operator==(const BC5Block &Rhs) const {
+        return chan0_block == Rhs.chan0_block && chan1_block == Rhs.chan1_block;
+    }
    bool operator!=(const BC5Block &Rhs) const { return !(Rhs == *this); }
 };
 }  // namespace quicktex::s3tc
--- a/quicktex/s3tc/bc5/BC5Decoder.cpp
+++ b/quicktex/s3tc/bc5/BC5Decoder.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -19,8 +19,8 @@

 #include "BC5Decoder.h"

-#include "../../ColorBlock.h"
-#include "BC5Block.h"
+#include "ColorBlock.h"
+#include "s3tc/bc5/BC5Block.h"

 namespace quicktex::s3tc {
 ColorBlock<4, 4> BC5Decoder::DecodeBlock(const BC5Block &block) const {
--- a/quicktex/s3tc/bc5/BC5Decoder.h
+++ b/quicktex/s3tc/bc5/BC5Decoder.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -24,11 +24,11 @@
 #include <tuple>
 #include <type_traits>

-#include "../../ColorBlock.h"
-#include "../../Decoder.h"
-#include "../../Texture.h"
-#include "../bc4/BC4Decoder.h"
-#include "BC5Block.h"
+#include "ColorBlock.h"
+#include "Decoder.h"
+#include "s3tc/bc4/BC4Decoder.h"
+#include "s3tc/bc5/BC5Block.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {

@ -38,8 +38,10 @@ class BC5Decoder : public BlockDecoder<BlockTexture<BC5Block>> {
    using BC4DecoderPtr = std::shared_ptr<BC4Decoder>;
    using BC4DecoderPair = std::tuple<BC4DecoderPtr, BC4DecoderPtr>;

-    BC5Decoder(uint8_t chan0 = 0, uint8_t chan1 = 1) : BC5Decoder(std::make_shared<BC4Decoder>(chan0), std::make_shared<BC4Decoder>(chan1)) {}
-    BC5Decoder(BC4DecoderPtr chan0_decoder, BC4DecoderPtr chan1_decoder) : _chan0_decoder(chan0_decoder), _chan1_decoder(chan1_decoder) {}
+    BC5Decoder(uint8_t chan0 = 0, uint8_t chan1 = 1)
+        : BC5Decoder(std::make_shared<BC4Decoder>(chan0), std::make_shared<BC4Decoder>(chan1)) {}
+    BC5Decoder(BC4DecoderPtr chan0_decoder, BC4DecoderPtr chan1_decoder)
+        : _chan0_decoder(chan0_decoder), _chan1_decoder(chan1_decoder) {}

    ColorBlock<4, 4> DecodeBlock(const BC5Block &block) const override;

--- a/quicktex/s3tc/bc5/BC5Encoder.cpp
+++ b/quicktex/s3tc/bc5/BC5Encoder.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -19,8 +19,8 @@

 #include "BC5Encoder.h"

-#include "../../ColorBlock.h"
-#include "../bc4/BC4Block.h"
+#include "ColorBlock.h"
+#include "s3tc/bc4/BC4Block.h"

 namespace quicktex::s3tc {
 BC5Block BC5Encoder::EncodeBlock(const ColorBlock<4, 4> &pixels) const {
--- a/quicktex/s3tc/bc5/BC5Encoder.h
+++ b/quicktex/s3tc/bc5/BC5Encoder.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -24,11 +24,11 @@
 #include <tuple>
 #include <type_traits>

-#include "../../ColorBlock.h"
-#include "../../Encoder.h"
-#include "../../Texture.h"
-#include "../bc4/BC4Encoder.h"
-#include "BC5Block.h"
+#include "ColorBlock.h"
+#include "Encoder.h"
+#include "s3tc/bc4/BC4Encoder.h"
+#include "s3tc/bc5/BC5Block.h"
+#include "texture/BlockTexture.h"

 namespace quicktex::s3tc {
 class BC5Encoder : public BlockEncoder<BlockTexture<BC5Block>> {
@ -37,8 +37,10 @@ class BC5Encoder : public BlockEncoder<BlockTexture<BC5Block>> {
    using BC4EncoderPtr = std::shared_ptr<BC4Encoder>;
    using BC4EncoderPair = std::tuple<BC4EncoderPtr, BC4EncoderPtr>;

-    BC5Encoder(uint8_t chan0 = 0, uint8_t chan1 = 1) : BC5Encoder(std::make_shared<BC4Encoder>(chan0), std::make_shared<BC4Encoder>(chan1)) {}
-    BC5Encoder(BC4EncoderPtr chan0_encoder, BC4EncoderPtr chan1_encoder) : _chan0_encoder(chan0_encoder), _chan1_encoder(chan1_encoder) {}
+    BC5Encoder(uint8_t chan0 = 0, uint8_t chan1 = 1)
+        : BC5Encoder(std::make_shared<BC4Encoder>(chan0), std::make_shared<BC4Encoder>(chan1)) {}
+    BC5Encoder(BC4EncoderPtr chan0_encoder, BC4EncoderPtr chan1_encoder)
+        : _chan0_encoder(chan0_encoder), _chan1_encoder(chan1_encoder) {}

    BC5Block EncodeBlock(const ColorBlock<4, 4> &pixels) const override;

--- a/quicktex/s3tc/bc5/_bindings.cpp
+++ b/quicktex/s3tc/bc5/_bindings.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -24,10 +24,10 @@
 #include <array>
 #include <cstdint>

-#include "../../Decoder.h"
-#include "../../Encoder.h"
-#include "BC5Decoder.h"
-#include "BC5Encoder.h"
+#include "s3tc/bc4/BC4Block.h"
+#include "s3tc/bc5/BC5Block.h"
+#include "s3tc/bc5/BC5Decoder.h"
+#include "s3tc/bc5/BC5Encoder.h"

 namespace py = pybind11;
 namespace quicktex::bindings {
@ -52,7 +52,8 @@ void InitBC5(py::module_ &s3tc) {

    bc5_block.def_readwrite("chan0_block", &BC5Block::chan0_block, "The BC4 block used for the first channel.");
    bc5_block.def_readwrite("chan1_block", &BC5Block::chan1_block, "The BC4 block used for the second channel.");
-    bc5_block.def_property("blocks", &BC5Block::GetBlocks, &BC5Block::SetBlocks, "The BC4 and BC1 blocks that make up this block as a 2-tuple.");
+    bc5_block.def_property("blocks", &BC5Block::GetBlocks, &BC5Block::SetBlocks,
+                           "The two BC4 blocks that make up this block as a 2-tuple.");
    // endregion

    // region BC5Texture
@ -79,9 +80,11 @@ void InitBC5(py::module_ &s3tc) {
        :returns: A new BC5Texture with the same dimension as the input.
    )doc");

-    bc5_encoder.def_property_readonly("channels", &BC5Encoder::GetChannels, "A 2-tuple of channels that will be read from. 0 to 3 inclusive. Readonly.");
-    bc5_encoder.def_property_readonly("bc4_encoders", &BC5Encoder::GetBC4Encoders,
-                                      "2-tuple of internal :py:class:`~quicktex.s3tc.bc4.BC4Encoder` s used for each channel. Readonly.");
+    bc5_encoder.def_property_readonly("channels", &BC5Encoder::GetChannels,
+                                      "A 2-tuple of channels that will be read from. 0 to 3 inclusive. Readonly.");
+    bc5_encoder.def_property_readonly(
+        "bc4_encoders", &BC5Encoder::GetBC4Encoders,
+        "2-tuple of internal :py:class:`~quicktex.s3tc.bc4.BC4Encoder` s used for each channel. Readonly.");
    // endregion

    // region BC5Decoder
@ -103,9 +106,11 @@ void InitBC5(py::module_ &s3tc) {
        :returns: A new RawTexture with the same dimensions as the input
    )doc");

-    bc5_decoder.def_property_readonly("channels", &BC5Decoder::GetChannels, "A 2-tuple of channels that will be written to. 0 to 3 inclusive. Readonly.");
-    bc5_decoder.def_property_readonly("bc4_decoders", &BC5Decoder::GetBC4Decoders,
-                                      "2-tuple of internal :py:class:`~quicktex.s3tc.bc4.BC4Decoder` s used for each channel. Readonly.");
+    bc5_decoder.def_property_readonly("channels", &BC5Decoder::GetChannels,
+                                      "A 2-tuple of channels that will be written to. 0 to 3 inclusive. Readonly.");
+    bc5_decoder.def_property_readonly(
+        "bc4_decoders", &BC5Decoder::GetBC4Decoders,
+        "2-tuple of internal :py:class:`~quicktex.s3tc.bc4.BC4Decoder` s used for each channel. Readonly.");
    // endregion
 }
 }  // namespace quicktex::bindings
--- a/quicktex/s3tc/interpolator/Interpolator.cpp
+++ b/quicktex/s3tc/interpolator/Interpolator.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -24,8 +24,8 @@
 #include <cstdint>
 #include <stdexcept>

-#include "../../util.h"
-#include "../../Color.h"
+#include "OldColor.h"
+#include "util/bitbash.h"

 namespace quicktex::s3tc {

@ -45,25 +45,33 @@ std::unique_ptr<Interpolator> Interpolator::MakeInterpolator(Interpolator::Type
    }
 }

-uint8_t Interpolator::Interpolate5(uint8_t v0, uint8_t v1) const { return Interpolate8(scale5To8(v0), scale5To8(v1)); }
-uint8_t Interpolator::Interpolate6(uint8_t v0, uint8_t v1) const { return Interpolate8(scale6To8(v0), scale6To8(v1)); }
-uint8_t Interpolator::InterpolateHalf5(uint8_t v0, uint8_t v1) const { return InterpolateHalf8(scale5To8(v0), scale5To8(v1)); }
-uint8_t Interpolator::InterpolateHalf6(uint8_t v0, uint8_t v1) const { return InterpolateHalf8(scale6To8(v0), scale6To8(v1)); }
-
-std::array<Color, 4> Interpolator::Interpolate565BC1(uint16_t low, uint16_t high, bool allow_3color) const {
-    bool use_3color = allow_3color && (high >= low);
-    return InterpolateBC1(Color::Unpack565Unscaled(low), Color::Unpack565Unscaled(high), use_3color);
+uint8_t Interpolator::Interpolate5(uint8_t v0, uint8_t v1) const {
+    return Interpolate8(scale_to_8<5>(v0), scale_to_8<5>(v1));
+}
+uint8_t Interpolator::Interpolate6(uint8_t v0, uint8_t v1) const {
+    return Interpolate8(scale_to_8<6>(v0), scale_to_8<6>(v1));
+}
+uint8_t Interpolator::InterpolateHalf5(uint8_t v0, uint8_t v1) const {
+    return InterpolateHalf8(scale_to_8<5>(v0), scale_to_8<5>(v1));
+}
+uint8_t Interpolator::InterpolateHalf6(uint8_t v0, uint8_t v1) const {
+    return InterpolateHalf8(scale_to_8<6>(v0), scale_to_8<6>(v1));
 }

-std::array<Color, 4> Interpolator::InterpolateBC1(Color low, Color high, bool use_3color) const {
-    auto colors = std::array<Color, 4>();
+std::array<OldColor, 4> Interpolator::Interpolate565BC1(uint16_t low, uint16_t high, bool allow_3color) const {
+    bool use_3color = allow_3color && (high >= low);
+    return InterpolateBC1(OldColor::Unpack565Unscaled(low), OldColor::Unpack565Unscaled(high), use_3color);
+}
+
+std::array<OldColor, 4> Interpolator::InterpolateBC1(OldColor low, OldColor high, bool use_3color) const {
+    auto colors = std::array<OldColor, 4>();
    colors[0] = low.ScaleFrom565();
    colors[1] = high.ScaleFrom565();

    if (use_3color) {
        // 3-color mode
        colors[2] = InterpolateHalfColor24(colors[0], colors[1]);
-        colors[3] = Color(0, 0, 0, 0);  // transparent black
+        colors[3] = OldColor(0, 0, 0, 0);  // transparent black
    } else {
        // 4-color mode
        colors[2] = InterpolateColor24(colors[0], colors[1]);
@ -79,8 +87,12 @@ uint8_t Interpolator::InterpolateHalf8(uint8_t v0, uint8_t v1) const { return (v
 // endregion

 // region InterpolatorRound implementation
-uint8_t InterpolatorRound::Interpolate5(uint8_t v0, uint8_t v1) const { return Interpolate8(scale5To8(v0), scale5To8(v1)); }
-uint8_t InterpolatorRound::Interpolate6(uint8_t v0, uint8_t v1) const { return Interpolate8(scale6To8(v0), scale6To8(v1)); }
+uint8_t InterpolatorRound::Interpolate5(uint8_t v0, uint8_t v1) const {
+    return Interpolate8(scale_to_8<5>(v0), scale_to_8<5>(v1));
+}
+uint8_t InterpolatorRound::Interpolate6(uint8_t v0, uint8_t v1) const {
+    return Interpolate8(scale_to_8<6>(v0), scale_to_8<6>(v1));
+}

 uint8_t InterpolatorRound::Interpolate8(uint8_t v0, uint8_t v1) const { return (v0 * 2 + v1 + 1) / 3; }
 // endregion
@ -108,9 +120,9 @@ uint8_t InterpolatorNvidia::InterpolateHalf6(uint8_t v0, uint8_t v1) const {
    return static_cast<uint8_t>((256 * v0 + gdiff / 4 + 128 + gdiff * 128) >> 8);
 }

-std::array<Color, 4> InterpolatorNvidia::InterpolateBC1(Color low, Color high, bool use_3color) const {
+std::array<OldColor, 4> InterpolatorNvidia::InterpolateBC1(OldColor low, OldColor high, bool use_3color) const {
    // Nvidia is special and interpolation cant be done with 8-bit values, so we need to override the default behavior
-    std::array<Color, 4> colors;
+    std::array<OldColor, 4> colors;
    colors[0] = low.ScaleFrom565();
    colors[1] = high.ScaleFrom565();

@ -121,7 +133,7 @@ std::array<Color, 4> InterpolatorNvidia::InterpolateBC1(Color low, Color high, b
    } else {
        // 3-color mode
        colors[2] = InterpolateHalfColor565(low, high);
-        colors[3] = Color(0, 0, 0, 0);  // transparent black
+        colors[3] = OldColor(0, 0, 0, 0);  // transparent black
    }

    return colors;
@ -129,10 +141,18 @@ std::array<Color, 4> InterpolatorNvidia::InterpolateBC1(Color low, Color high, b
 // endregion

 // region InterpolatorAMD implementation
-uint8_t InterpolatorAMD::Interpolate5(uint8_t v0, uint8_t v1) const { return Interpolate8(scale5To8(v0), scale5To8(v1)); }
-uint8_t InterpolatorAMD::Interpolate6(uint8_t v0, uint8_t v1) const { return Interpolate8(scale6To8(v0), scale6To8(v1)); }
-uint8_t InterpolatorAMD::InterpolateHalf5(uint8_t v0, uint8_t v1) const { return InterpolateHalf8(scale5To8(v0), scale5To8(v1)); }
-uint8_t InterpolatorAMD::InterpolateHalf6(uint8_t v0, uint8_t v1) const { return InterpolateHalf8(scale6To8(v0), scale6To8(v1)); }
+uint8_t InterpolatorAMD::Interpolate5(uint8_t v0, uint8_t v1) const {
+    return Interpolate8(scale_to_8<5>(v0), scale_to_8<5>(v1));
+}
+uint8_t InterpolatorAMD::Interpolate6(uint8_t v0, uint8_t v1) const {
+    return Interpolate8(scale_to_8<6>(v0), scale_to_8<6>(v1));
+}
+uint8_t InterpolatorAMD::InterpolateHalf5(uint8_t v0, uint8_t v1) const {
+    return InterpolateHalf8(scale_to_8<5>(v0), scale_to_8<5>(v1));
+}
+uint8_t InterpolatorAMD::InterpolateHalf6(uint8_t v0, uint8_t v1) const {
+    return InterpolateHalf8(scale_to_8<6>(v0), scale_to_8<6>(v1));
+}

 uint8_t InterpolatorAMD::Interpolate8(uint8_t v0, uint8_t v1) const { return (v0 * 43 + v1 * 21 + 32) >> 6; }

--- a/quicktex/s3tc/interpolator/Interpolator.h
+++ b/quicktex/s3tc/interpolator/Interpolator.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
    and licenced under the public domain

@ -22,7 +22,7 @@
 #include <cstdint>  // for uint8_t, uint16_t
 #include <memory>   // for unique_ptr

-#include "../../Color.h"  // for Color
+#include "OldColor.h"  // for OldColor

 namespace quicktex::s3tc {

@ -97,7 +97,7 @@ class Interpolator {
     * @param allow_3color if true, a different interpolation mode will be used if high >= low
     * @return an array of 4 Color values, with indices matching BC1 selectors
     */
-    std::array<Color, 4> Interpolate565BC1(uint16_t low, uint16_t high, bool allow_3color = true) const;
+    std::array<OldColor, 4> Interpolate565BC1(uint16_t low, uint16_t high, bool allow_3color = true) const;

    /**
     * Generates the 4 colors for a BC1 block from the given
@ -106,7 +106,7 @@ class Interpolator {
     * @param use_3color if the 3-color interpolation mode should be used
     * @return an array of 4 Color values, with indices matching BC1 selectors
     */
-    virtual std::array<Color, 4> InterpolateBC1(Color low, Color high, bool use_3color) const;
+    virtual std::array<OldColor, 4> InterpolateBC1(OldColor low, OldColor high, bool use_3color) const;

    /**
     * Gets the type of an interpolator
@ -126,12 +126,12 @@ class Interpolator {
    }

   private:
-    Color InterpolateColor24(const Color &c0, const Color &c1) const {
-        return Color(Interpolate8(c0.r, c1.r), Interpolate8(c0.g, c1.g), Interpolate8(c0.b, c1.b));
+    OldColor InterpolateColor24(const OldColor &c0, const OldColor &c1) const {
+        return OldColor(Interpolate8(c0.r, c1.r), Interpolate8(c0.g, c1.g), Interpolate8(c0.b, c1.b));
    }

-    Color InterpolateHalfColor24(const Color &c0, const Color &c1) const {
-        return Color(InterpolateHalf8(c0.r, c1.r), InterpolateHalf8(c0.g, c1.g), InterpolateHalf8(c0.b, c1.b));
+    OldColor InterpolateHalfColor24(const OldColor &c0, const OldColor &c1) const {
+        return OldColor(InterpolateHalf8(c0.r, c1.r), InterpolateHalf8(c0.g, c1.g), InterpolateHalf8(c0.b, c1.b));
    }
 };

@ -152,18 +152,18 @@ class InterpolatorNvidia final : public Interpolator {
    virtual uint8_t InterpolateHalf5(uint8_t v0, uint8_t v1) const override;
    virtual uint8_t InterpolateHalf6(uint8_t v0, uint8_t v1) const override;

-    virtual std::array<Color, 4> InterpolateBC1(Color low, Color high, bool use_3color) const override;
+    virtual std::array<OldColor, 4> InterpolateBC1(OldColor low, OldColor high, bool use_3color) const override;

    virtual Type GetType() const noexcept override { return Type::Nvidia; }
    virtual bool CanInterpolate8Bit() const noexcept override { return false; }

   private:
-    Color InterpolateColor565(const Color &c0, const Color &c1) const {
-        return Color(Interpolate5(c0.r, c1.r), Interpolate6(c0.g, c1.g), Interpolate5(c0.b, c1.b));
+    OldColor InterpolateColor565(const OldColor &c0, const OldColor &c1) const {
+        return OldColor(Interpolate5(c0.r, c1.r), Interpolate6(c0.g, c1.g), Interpolate5(c0.b, c1.b));
    }

-    Color InterpolateHalfColor565(const Color &c0, const Color &c1) const {
-        return Color(InterpolateHalf5(c0.r, c1.r), InterpolateHalf6(c0.g, c1.g), InterpolateHalf5(c0.b, c1.b));
+    OldColor InterpolateHalfColor565(const OldColor &c0, const OldColor &c1) const {
+        return OldColor(InterpolateHalf5(c0.r, c1.r), InterpolateHalf6(c0.g, c1.g), InterpolateHalf5(c0.b, c1.b));
    }
 };

--- a/quicktex/s3tc/interpolator/_bindings.cpp
+++ b/quicktex/s3tc/interpolator/_bindings.cpp
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com> and licenced under the public domain

    This program is free software: you can redistribute it and/or modify
--- a/tests/run_tests.cpp
+++ b/tests/run_tests.cpp
@ -17,28 +17,15 @@
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

-// This file allows for easy debugging in CLion or other IDEs that dont natively support cross-debugging between Python and C++
-
-#include <pybind11/embed.h>
-
 #include <array>
-#include <string>
+#include <cstdint>
+#include <xsimd/xsimd.hpp>

-namespace py = pybind11;
-using namespace pybind11::literals;
+#include "Matrix.h"

-#define STRINGIFY(x) #x
-#define MACRO_STRINGIFY(x) STRINGIFY(x)
-
-int main() {
-    py::scoped_interpreter guard{};
-
-    py::module_ site = py::module_::import("site");
-
-    site.attr("addsitedir")(CUSTOM_SYS_PATH);
-
-    py::module_ nose = py::module_::import("nose");
-    py::module_ tests = py::module_::import("tests");
-    py::list argv(1);
-    nose.attr("runmodule")("name"_a = "tests.test_bc1", "exit"_a = false);
-}
+// NOTE(review): this file used to embed a Python interpreter to run the test suite; it now only
+// holds the scratch function below. Confirm whether real C++ tests are meant to replace it.
+namespace quicktex {
+/**
+ * Multiplies a 4x1 float matrix by the scalar 7.
+ * @param a matrix to scale
+ * @param b accepted but not used by the current body
+ * @param c accepted but not used by the current body
+ * @return the result of `a * 7`; its type is whatever Matrix's operator* yields
+ */
+auto test(Matrix<float, 4, 1> a, Matrix<float, 4, 1> b, Matrix<float, 4, 1> c) {
+    return a * 7;
+}
+}  // namespace quicktex
--- a/quicktex/texture/BlockTexture.h
+++ b/quicktex/texture/BlockTexture.h
@ -0,0 +1,70 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <vector>
+
+#include "Texture.h"
+
+namespace quicktex {
+/**
+ * A texture stored as a grid of fixed-size blocks of type B.
+ * @tparam B block type; must expose B::Width and B::Height (the block dimensions in pixels)
+ */
+template <typename B> class BlockTexture final : public Texture {
+   private:
+    std::vector<B> _blocks;  // blocks stored row by row, _width_b blocks per row
+    unsigned _width_b;       // texture width measured in blocks
+    unsigned _height_b;      // texture height measured in blocks
+
+   public:
+    using BlockType = B;
+    using Base = Texture;
+
+    /**
+     * Create a new BlockTexture
+     * @param w width of the texture in pixels. Rounded up to a whole number of blocks
+     * @param h height of the texture in pixels. Rounded up to a whole number of blocks
+     */
+    BlockTexture(int w, int h) : Base(w, h) {
+        // ceiling division so a partially covered block at the right/bottom edge still gets storage
+        _width_b = (width + B::Width - 1) / B::Width;
+        _height_b = (height + B::Height - 1) / B::Height;
+        // widen before multiplying so the block count cannot wrap around in 32-bit arithmetic
+        _blocks = std::vector<B>(static_cast<size_t>(_width_b) * _height_b);
+    }
+
+    /// @return the width of the texture in blocks
+    constexpr unsigned bwidth() const { return _width_b; }
+    /// @return the height of the texture in blocks
+    constexpr unsigned bheight() const { return _height_b; }
+    /// @return the (width, height) of the texture in blocks
+    constexpr std::tuple<int, int> bsize() const { return std::tuple<int, int>(_width_b, _height_b); }
+
+    /**
+     * Read a block
+     * @param x block column, must be less than bwidth()
+     * @param y block row, must be less than bheight()
+     * @return a copy of the block at (x, y)
+     * @throws std::out_of_range if either coordinate is outside the texture
+     */
+    B get_block(unsigned x, unsigned y) const {
+        if (x >= _width_b) throw std::out_of_range("x value out of range.");
+        if (y >= _height_b) throw std::out_of_range("y value out of range.");
+        return _blocks.at(static_cast<size_t>(y) * _width_b + x);
+    }
+
+    /**
+     * Overwrite a block
+     * @param x block column, must be less than bwidth()
+     * @param y block row, must be less than bheight()
+     * @param val block to store at (x, y)
+     * @throws std::out_of_range if either coordinate is outside the texture
+     */
+    void set_block(unsigned x, unsigned y, const B &val) {
+        if (x >= _width_b) throw std::out_of_range("x value out of range.");
+        if (y >= _height_b) throw std::out_of_range("y value out of range.");
+        _blocks.at(static_cast<size_t>(y) * _width_b + x) = val;
+    }
+
+    /// @return the size of the block storage in bytes
+    size_t nbytes() const noexcept override { return _blocks.size() * sizeof(B); }
+
+    /// Raw byte view of the block storage
+    const uint8_t *data() const noexcept override { return reinterpret_cast<const uint8_t *>(_blocks.data()); }
+    uint8_t *data() noexcept override { return reinterpret_cast<uint8_t *>(_blocks.data()); }
+};
+}  // namespace quicktex
--- a/quicktex/texture/RawTexture.cpp
+++ b/quicktex/texture/RawTexture.cpp
@ -0,0 +1,33 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "RawTexture.h"
+
+namespace quicktex {
+Color RawTexture::pixel(unsigned x, unsigned y) const {
+    // reject coordinates outside the texture, one axis at a time
+    if (!(x < width)) { throw std::invalid_argument("x value out of range."); }
+    if (!(y < height)) { throw std::invalid_argument("y value out of range."); }
+
+    // pixels are stored row by row
+    const auto index = x + (y * width);
+    return _pixels.at(index);
+}
+quicktex::Color& RawTexture::pixel(unsigned x, unsigned y) {
+    // reject coordinates outside the texture, one axis at a time
+    if (!(x < width)) { throw std::invalid_argument("x value out of range."); }
+    if (!(y < height)) { throw std::invalid_argument("y value out of range."); }
+
+    // pixels are stored row by row; hand back a reference so the caller can write through it
+    const auto index = x + (y * width);
+    return _pixels.at(index);
+}
+}  // namespace quicktex
--- a/quicktex/texture/RawTexture.h
+++ b/quicktex/texture/RawTexture.h
@ -0,0 +1,97 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <climits>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#include "Color.h"
+#include "ColorBlock.h"
+#include "OldColor.h"
+#include "texture/Texture.h"
+
+namespace quicktex {
+/**
+ * A texture that stores one Color per pixel, row by row.
+ */
+class RawTexture : public Texture {
+    using Base = Texture;
+
+   public:
+    /**
+     * Create a new RawTexture
+     * @param w width of the texture in pixels
+     * @param h height of the texture in pixels
+     */
+    // Base is initialized before _pixels, so the storage is sized from the same unsigned dimensions
+    // every accessor checks against, and the product is widened so it cannot wrap in 32 bits
+    RawTexture(int w, int h) : Base(w, h), _pixels(static_cast<size_t>(width) * height) {}
+
+    /**
+     * Read a pixel
+     * @param x column of the pixel, must be less than width
+     * @param y row of the pixel, must be less than height
+     * @return a copy of the pixel's color
+     * @throws std::invalid_argument if either coordinate is outside the texture
+     */
+    quicktex::Color pixel(unsigned x, unsigned y) const;
+
+    /**
+     * Access a pixel for writing
+     * @param x column of the pixel, must be less than width
+     * @param y row of the pixel, must be less than height
+     * @return a reference to the stored color
+     * @throws std::invalid_argument if either coordinate is outside the texture
+     */
+    quicktex::Color &pixel(unsigned x, unsigned y);
+
+    /// Same as pixel(), but coordinates past the edges wrap around to the opposite side
+    quicktex::Color pixel_wrapped(unsigned x, unsigned y) const { return pixel(x % width, y % height); }
+
+    /// Same as pixel(), but coordinates past the edges wrap around to the opposite side
+    quicktex::Color &pixel_wrapped(unsigned x, unsigned y) { return pixel(x % width, y % height); }
+
+    /// @return the size of the pixel data in bytes
+    size_t nbytes() const noexcept override {
+        // widen before multiplying so large textures do not wrap in 32-bit arithmetic
+        return static_cast<size_t>(width) * height * sizeof(quicktex::Color);
+    }
+
+    /**
+     * Copy an N x M block of pixels out of the texture
+     * @tparam N block width in pixels
+     * @tparam M block height in pixels
+     * @param block_x column of the block, in units of N pixels
+     * @param block_y row of the block, in units of M pixels
+     * @return the pixels of the block; pixels past the texture edges wrap around
+     * @throws std::out_of_range if either block coordinate is negative
+     */
+    template <int N, int M> quicktex::ColorBlock<N, M> get_block(int block_x, int block_y) const {
+        if (block_x < 0) throw std::out_of_range("x value out of range.");
+        if (block_y < 0) throw std::out_of_range("y value out of range.");
+
+        quicktex::ColorBlock<N, M> block;
+
+        // coordinates in the image of the top-left pixel of the selected block
+        int pixel_x = block_x * N;
+        int pixel_y = block_y * M;
+
+        // pixel-wise copy; the modulo wraps blocks that extend over the right/bottom edges
+        for (int x = 0; x < N; x++) {
+            for (int y = 0; y < M; y++) { block.Set(x, y, pixel((pixel_x + x) % width, (pixel_y + y) % height)); }
+        }
+
+        return block;
+    }
+
+    /**
+     * Copy an N x M block of pixels into the texture
+     * @tparam N block width in pixels
+     * @tparam M block height in pixels
+     * @param block_x column of the block, in units of N pixels
+     * @param block_y row of the block, in units of M pixels
+     * @param block pixels to write; pixels past the texture edges wrap around
+     * @throws std::out_of_range if either block coordinate is negative
+     */
+    template <int N, int M> void set_block(int block_x, int block_y, const quicktex::ColorBlock<N, M> &block) {
+        if (block_x < 0) throw std::out_of_range("x value out of range.");
+        if (block_y < 0) throw std::out_of_range("y value out of range.");
+
+        // coordinates in the image of the top-left pixel of the selected block
+        int pixel_x = block_x * N;
+        int pixel_y = block_y * M;
+
+        // pixel-wise copy; the modulo wraps blocks that extend over the right/bottom edges
+        for (int x = 0; x < N; x++) {
+            for (int y = 0; y < M; y++) { pixel((pixel_x + x) % width, (pixel_y + y) % height) = block.Get(x, y); }
+        }
+    }
+
+    /// Raw byte view of the pixel storage
+    virtual const uint8_t *data() const noexcept override { return reinterpret_cast<const uint8_t *>(_pixels.data()); }
+    virtual uint8_t *data() noexcept override { return reinterpret_cast<uint8_t *>(_pixels.data()); }
+
+   protected:
+    std::vector<quicktex::Color> _pixels;  // one entry per pixel, row by row
+};
+}  // namespace quicktex
--- a/quicktex/texture/Texture.h
+++ b/quicktex/texture/Texture.h
@ -0,0 +1,62 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <climits>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#include "Color.h"
+#include "ColorBlock.h"
+#include "OldColor.h"
+#include "Window.h"
+
+namespace quicktex {
+
+/**
+ * Abstract base for textures: holds the dimensions and exposes the stored bytes.
+ */
+class Texture {
+   public:
+    // dimensions, fixed at construction
+    const unsigned width;
+    const unsigned height;
+
+    virtual ~Texture() = default;
+
+    /**
+     * The texture's dimensions
+     * @return a (width, height) tuple
+     */
+    virtual std::tuple<unsigned, unsigned> Size() const { return std::make_tuple(width, height); }
+
+    /**
+     * The texture's total size
+     * @return The size of the texture in bytes.
+     */
+    virtual size_t nbytes() const noexcept = 0;
+
+    /**
+     * Raw access to the texture's storage
+     * @return a pointer to the first byte of the texture data
+     */
+    virtual const uint8_t *data() const noexcept = 0;
+    virtual uint8_t *data() noexcept = 0;
+
+   protected:
+    // only derived classes may construct a Texture
+    Texture(unsigned w, unsigned h) : width(w), height(h) {}
+};
+
+}  // namespace quicktex
--- a/quicktex/texture/Window.cpp
+++ b/quicktex/texture/Window.cpp
@ -0,0 +1,90 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "Window.h"
+
+#include "texture/RawTexture.h"
+
+namespace quicktex {
+
+// Window
+// Binds a w-by-h window to `texture`, with its top-left corner at (px, py) in texture coordinates.
+// Only the corner is checked here (debug builds); the window's extent is not validated.
+Window::Window(RawTexture& texture, unsigned w, unsigned h, unsigned px, unsigned py)
+    : width(w), height(h), x(px), y(py), _texture(texture) {
+    assert(px < texture.width);
+    assert(py < texture.height);
+}
+
+Color& Window::pixel(unsigned px, unsigned py) {
+    assert(px < width && py < height);
+
+    // translate window-relative coordinates into texture coordinates
+    const unsigned texture_x = x + px;
+    const unsigned texture_y = y + py;
+    return _texture.pixel(texture_x, texture_y);
+}
+
+Color Window::pixel(unsigned px, unsigned py) const {
+    assert(px < width && py < height);
+
+    // translate window-relative coordinates into texture coordinates
+    const unsigned texture_x = x + px;
+    const unsigned texture_y = y + py;
+    return _texture.pixel(texture_x, texture_y);
+}
+
+// iteration starts at the window's top-left pixel
+WindowIterator Window::begin() {
+    return WindowIterator(*this, 0, 0);
+}
+// end sentinel: column 0 of the row just past the last one
+WindowIterator Window::end() {
+    return WindowIterator(*this, 0, height);
+}
+// first pixel of the given row
+WindowIterator Window::row_begin(unsigned int row) {
+    return WindowIterator(*this, 0, row);
+}
+// one past the last pixel of the given row, i.e. the first pixel of the next row
+WindowIterator Window::row_end(unsigned int row) {
+    return WindowIterator(*this, 0, row + 1);
+}
+
+bool Window::operator==(const Window& rhs) const {
+    // windows are equal only when they view the very same texture object...
+    if (&_texture != &rhs._texture) { return false; }
+    // ...from the same origin...
+    if (x != rhs.x || y != rhs.y) { return false; }
+    // ...with the same extent
+    return width == rhs.width && height == rhs.height;
+}
+
+// WindowIterator
+
+WindowIterator::WindowIterator(Window& view, unsigned px, unsigned py) : x(px), y(py), _view(&view) {
+    assert(px < view.width);
+    // (0, height) is allowed as well: it is the sentinel for the end of iteration
+    // and cannot be dereferenced
+    assert(py < view.height || (py == view.height && px == 0));
+}
+
+WindowIterator& WindowIterator::operator++() {  // prefix increment
+    // step right; past the last column, wrap to the start of the next row
+    if (++x >= _view->width) {
+        x = 0;
+        ++y;
+    }
+    return *this;
+}
+
+WindowIterator WindowIterator::operator++(int) {  // postfix increment
+    // remember the current position, advance, then hand back the remembered copy
+    WindowIterator previous(*this);
+    operator++();
+    return previous;
+}
+
+Color& WindowIterator::operator*() const {  // dereference operator
+    // a default-constructed iterator has no window to read from
+    assert(_view != nullptr);
+    // the end sentinel (y == height) must not be dereferenced
+    assert(x < _view->width && y < _view->height);
+
+    Window& window = *_view;
+    return window.pixel(x, y);
+}
+
+Color* WindowIterator::operator->() {  // member access
+    // the address of whatever operator* refers to
+    Color& target = operator*();
+    return &target;
+}
+
+bool WindowIterator::operator==(const WindowIterator& rhs) const {
+    // iterators over different windows never compare equal
+    if (_view != rhs._view) { return false; }
+    return x == rhs.x && y == rhs.y;
+}
+
+static_assert(std::forward_iterator<WindowIterator>);
+// static_assert(sized_range<Window>);
+
+}  // namespace quicktex
--- a/quicktex/texture/Window.h
+++ b/quicktex/texture/Window.h
@ -0,0 +1,82 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include "Color.h"
+#include "util/ranges.h"
+
+namespace quicktex {
+
+// forward declarations
+class WindowIterator;
+class RawTexture;
+
+/**
+ * Class representing a window into a RawTexture
+ */
+class Window {
+   public:
+    typedef Color value_type;
+
+    // extent of the window
+    const unsigned width, height;
+    // position of the window's top-left corner within the texture
+    const unsigned x, y;
+
+    /**
+     * Create a window into a texture
+     * @param texture texture to view; the window keeps a reference to it
+     * @param w width of the window
+     * @param h height of the window
+     * @param x column in the texture of the window's top-left corner
+     * @param y row in the texture of the window's top-left corner
+     */
+    Window(RawTexture &texture, unsigned w, unsigned h, unsigned x, unsigned y);
+
+    /**
+     * Access a pixel by window-relative coordinates
+     * @param px column within the window, must be less than width
+     * @param py row within the window, must be less than height
+     */
+    Color &pixel(unsigned px, unsigned py);
+    Color pixel(unsigned px, unsigned py) const;
+
+    // iteration over every pixel of the window, row by row
+    WindowIterator begin();
+    WindowIterator end();
+    // iteration over a single row of the window
+    WindowIterator row_begin(unsigned row);
+    WindowIterator row_end(unsigned row);
+
+    /// @return the number of pixels covered by the window
+    size_t size() const {
+        // widen before multiplying so the count cannot wrap in 32-bit arithmetic
+        return static_cast<size_t>(width) * height;
+    }
+
+    /// Two windows are equal when they view the same texture object at the same position and extent
+    bool operator==(const Window &rhs) const;
+
+   private:
+    RawTexture &_texture;
+};
+
+/**
+ * Iterator returned by Window
+ */
+class WindowIterator {
+   public:
+    using difference_type = long long;
+    using value_type = Color;
+
+    // current position, relative to the window's top-left corner
+    unsigned x, y;
+
+    /// Iterator positioned at (x, y) within `view`
+    WindowIterator(Window &view, unsigned x, unsigned y);
+    /// Detached iterator that refers to no window; it must not be dereferenced
+    WindowIterator() : x(0), y(0), _view(nullptr) {}
+
+    Color &operator*() const;  // dereference
+    Color *operator->();       // member access
+
+    WindowIterator &operator++();    // prefix increment
+    WindowIterator operator++(int);  // postfix increment
+    bool operator==(const WindowIterator &rhs) const;
+
+   private:
+    // window being iterated; null only for a default-constructed iterator
+    Window *_view;
+};
+
+}  // namespace quicktex
--- a/quicktex/util.h
+++ b/quicktex/util.h
@ -1,178 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-#include <cassert>
-#include <cstdint>
-#include <limits>
-#include <string>
-#include <type_traits>
-#include <functional>
-#include <vector>
-
-#define UINT5_MAX 0x1FU  // 31
-#define UINT6_MAX 0x3FU  // 63
-
-#define assert5bit(x) assert(x <= UINT5_MAX)
-#define assert6bit(x) assert(x <= UINT6_MAX)
-
-template <typename S> constexpr auto iabs(S i) {
-    static_assert(!std::is_unsigned<S>::value);
-    using O = typename std::make_unsigned<S>::type;
-    return (i < 0) ? static_cast<O>(-i) : static_cast<O>(i);
-}
-
-/**
- * Unpacks an unsigned integer into an array of smaller integers.
- * @tparam I Input data type. Must be an unsigned integral type large enough to hold C * N bits.
- * @tparam O Output data type. must be an unsigned integral type large enough to hold C bits..
- * @tparam S Number of bits in each value.
- * @tparam C Number of values to unpack.
- * @param packed Packed integer input of type I.
- * @return Unpacked std::array of type O and size C.
- */
-template <typename I, typename O, size_t S, size_t C> constexpr std::array<O, C> Unpack(I packed) {
-    // type checking
-    static_assert(std::is_unsigned<I>::value, "Packed input type must be unsigned");
-    static_assert(std::is_unsigned<O>::value, "Unpacked output type must be unsigned");
-    static_assert(std::numeric_limits<I>::digits >= (C * S), "Packed input type must be big enough to represent the number of bits multiplied by count");
-    static_assert(std::numeric_limits<O>::digits >= S, "Unpacked output type must be big enough to represent the number of bits");
-
-    constexpr O mask = (1U << S) - 1U;  // maximum value representable by N bits
-    std::array<O, C> vals;              // output values array of size C
-
-    for (unsigned i = 0; i < C; i++) {
-        vals[i] = static_cast<O>(packed >> (i * S)) & mask;
-        assert(vals[i] <= mask);
-    }
-
-    return vals;
-}
-
-/**
- * Packs an array of unsigned integers into a single integer.
- * @tparam I Input data type. Must be an unsigned integral type large enough to hold C bits.
- * @tparam O Output data type. must be an unsigned integral type large enough to hold C * N bits.
- * @tparam S Number of bits in each value.
- * @tparam C Number of values to unpack.
- * @param vals Unpacked std::array of type I and size C.
- * @return Packed integer input of type O.
- */
-template <typename I, typename O, size_t S, size_t C> constexpr O Pack(const std::array<I, C> &vals) {
-    // type checking
-    static_assert(std::is_unsigned<I>::value, "Unpacked input type must be unsigned");
-    static_assert(std::is_unsigned<O>::value, "Packed output type must be unsigned");
-    static_assert(std::numeric_limits<I>::digits >= S, "Unpacked input type must be big enough to represent the number of bits");
-    static_assert(std::numeric_limits<O>::digits >= (C * S), "Packed output type must be big enough to represent the number of bits multiplied by count");
-
-    O packed = 0;  // output value of type O
-
-    for (unsigned i = 0; i < C; i++) {
-        assert(vals[i] <= (1U << S) - 1U);
-        packed |= static_cast<O>(vals[i]) << (i * S);
-    }
-
-    assert(packed <= (static_cast<O>(1U) << (C * S)) - 1U);
-    return packed;
-}
-
-template <size_t Size, int Op(int)> constexpr std::array<uint8_t, Size> ExpandArray() {
-    std::array<uint8_t, Size> res;
-    for (int i = 0; i < Size; i++) { res[i] = Op(i); }
-    return res;
-}
-
-template <typename Seq, typename Fn> constexpr auto MapArray(const Seq &input, Fn op) {
-    using I = typename Seq::value_type;
-    using O = decltype(op(std::declval<I>()));
-    constexpr size_t N = std::tuple_size<Seq>::value;
-
-    std::array<O, N> output;
-    for (unsigned i = 0; i < N; i++) { output[i] = op(input[i]); }
-    return output;
-}
-
-template <typename S> constexpr S scale8To5(S v) {
-    auto v2 = v * 31 + 128;
-    return static_cast<S>((v2 + (v2 >> 8)) >> 8);
-}
-template <typename S> constexpr S scale8To6(S v) {
-    auto v2 = v * 63 + 128;
-    return static_cast<S>((v2 + (v2 >> 8)) >> 8);
-}
-
-template <typename S> constexpr S scale5To8(S v) {
-    assert5bit(v);
-    return static_cast<S>((v << 3) | (v >> 2));
-}
-template <typename S> constexpr S scale6To8(S v) {
-    assert6bit(v);
-    return static_cast<S>((v << 2) | (v >> 4));
-}
-
-template <typename S> constexpr S maximum(S a, S b) { return (a > b) ? a : b; }
-template <typename S> constexpr S maximum(S a, S b, S c) { return maximum(maximum(a, b), c); }
-template <typename S> constexpr S maximum(S a, S b, S c, S d) { return maximum(maximum(maximum(a, b), c), d); }
-
-template <typename S> constexpr S minimum(S a, S b) { return (a < b) ? a : b; }
-template <typename S> constexpr S minimum(S a, S b, S c) { return minimum(minimum(a, b), c); }
-template <typename S> constexpr S minimum(S a, S b, S c, S d) { return minimum(minimum(minimum(a, b), c), d); }
-
-template <typename T> constexpr T square(T a) { return a * a; }
-
-constexpr float clampf(float value, float low = 0.0f, float high = 1.0f) {
-    if (value < low)
-        value = low;
-    else if (value > high)
-        value = high;
-    return value;
-}
-constexpr uint8_t clamp255(int32_t i) { return static_cast<uint8_t>((static_cast<unsigned int>(i) & 0xFFFFFF00U) ? (~(i >> 31)) : i); }
-
-template <typename S> constexpr S clamp(S value, S low, S high) { return (value < low) ? low : ((value > high) ? high : value); }
-constexpr int32_t clampi(int32_t value, int32_t low, int32_t high) {
-    if (value < low)
-        value = low;
-    else if (value > high)
-        value = high;
-    return value;
-}
-
-constexpr int squarei(int a) { return a * a; }
-constexpr int absi(int a) { return (a < 0) ? -a : a; }
-
-template <typename F> constexpr F lerp(F a, F b, F s) { return a + (b - a) * s; }
-
-template <typename... Args> std::string Format(const char *str, const Args &...args) {
-    auto output = std::string(str);
-
-    std::vector<std::string> values = {{args...}};
-
-    for (unsigned i = 0; i < values.size(); i++) {
-        auto key = "{" + std::to_string(i) + "}";
-        auto value = values[i];
-        while (true) {
-            size_t where = output.find(key);
-            if (where == output.npos) break;
-            output.replace(where, key.length(), value);
-        }
-    }
-
-    return output;
-}
--- a/quicktex/util/bitbash.h
+++ b/quicktex/util/bitbash.h
@ -0,0 +1,313 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <concepts>
+#include <limits>
+#include <numeric>
+#include <type_traits>
+
+#include "iterator.h"
+#include "util/math.h"
+#include "util/ranges.h"
+
+#define UINT5_MAX 0x1FU  // 31
+#define UINT6_MAX 0x3FU  // 63
+
+// Debug-only range checks for 5- and 6-bit values. The argument is parenthesized so that
+// an expression such as `a & b` is compared as a whole instead of being split by the
+// higher precedence of `<=`.
+#define assert5bit(x) assert((x) <= UINT5_MAX)
+#define assert6bit(x) assert((x) <= UINT6_MAX)
+
+namespace quicktex {
+
+/**
+ * Rescales an 8-bit value to N bits.
+ * Computes v * (2^N - 1) + 128 and then applies (x + (x >> 8)) >> 8, the usual shortcut
+ * for a rounded division by 255.
+ * @tparam N number of bits in the result, must be less than 8
+ * @tparam S integer type of the input and the result
+ * @param v value to rescale, asserted to be below 256
+ * @return the rescaled value, asserted to be below 2^N
+ */
+template <size_t N, typename S> S scale_from_8(S v) {
+    static_assert(N < 8);
+    assert(v < (1 << 8));
+
+    const unsigned top = (1 << N) - 1;        // largest N-bit value
+    const unsigned biased = (v * top) + 128;  // +128 rounds to nearest
+    const S scaled = static_cast<S>((biased + (biased >> 8)) >> 8);
+
+    assert(scaled < (1 << N));
+
+    return scaled;
+}
+
+/**
+ * Expands an N-bit value to 8 bits by shifting it to the top of the byte and refilling
+ * the vacated low bits with the value's own high bits.
+ * Rshift is N - (8 - N) in unsigned arithmetic, so the computation is only meaningful
+ * for N >= 4; smaller N wraps the subtraction.
+ * @tparam N number of significant bits in the input, must be less than 8
+ * @tparam S integer type of the input and the result
+ * @param v value to expand, asserted to be below 2^N
+ * @return the expanded value, asserted to be below 256
+ */
+template <size_t N, typename S> S scale_to_8(S v) {
+    static_assert(N < 8);
+    assert(v < (1 << N));
+
+    constexpr unsigned Lshift = 8 - N;
+    constexpr unsigned Rshift = N - Lshift;
+    S result = static_cast<S>((v << Lshift) | (v >> Rshift));
+
+    assert(result < (1 << 8));  // check the output; the input was already checked above
+
+    return result;
+}
+
+/**
+ * Unpacks an unsigned integer into a range of smaller integers.
+ * @param packed value to unpack
+ * @param begin destination start iterator
+ * @param end destination end iterator
+ * @param widths widths iterator. values are in bits
+ * @param little_endian if the input has the first element in the least significant place
+ * @return the total number of bits unpacked
+ *
+ * Field masks are built in P rather than int, so a field may be as wide as P itself.
+ * Debug builds assert that every width fits the destination element type and that the
+ * widths together fit in P.
+ */
+template <typename P, typename OI, typename WI>
+    requires std::unsigned_integral<P> && std::output_iterator<OI, P> && std::forward_iterator<WI>
+size_t unpack_into(P packed, OI begin, OI end, WI widths, bool little_endian = true) {
+    using U = std::remove_cvref_t<decltype(*begin)>;
+
+    // least significant w bits all 1, computed in P. A full-width field is special-cased
+    // because shifting by the whole width of the type is undefined.
+    auto low_bits = [](auto w) {
+        if (w >= std::numeric_limits<P>::digits) return static_cast<P>(~P{0});
+        return static_cast<P>((P{1} << w) - 1);
+    };
+
+    if (little_endian) {
+        // first element is in the least significant place of packed
+
+        unsigned offset = 0;
+        while (begin < end) {
+            auto w = *(widths++);
+            assert(w <= std::numeric_limits<U>::digits);
+
+            *(begin++) = (packed >> offset) & low_bits(w);  // write to output
+
+            offset += w;  // increment offset
+        }
+
+        assert(offset <= std::numeric_limits<P>::digits);  // detect an overflow condition
+        return offset;
+    } else {
+        // first element is in the most significant place of packed
+
+        // with non-constant width, we either need to iterate backwards or
+        // add up all the widths beforehand to know where to begin
+        unsigned total_offset = std::accumulate(widths, widths + std::distance(begin, end), 0);
+        assert(total_offset <= std::numeric_limits<P>::digits);  // detect an overflow condition
+
+        unsigned offset = total_offset;
+        while (begin < end) {
+            auto w = *(widths++);
+            offset -= w;                                  // decrement offset
+            assert(w <= std::numeric_limits<U>::digits);  // same bound as the little-endian path
+
+            *(begin++) = (packed >> offset) & low_bits(w);  // write to output
+        }
+
+        return total_offset;
+    }
+}
+
+/**
+ * Unpacks an unsigned integer into a range of smaller integers.
+ * Range front end for the iterator overload: forwards dest.begin(), dest.end() and
+ * widths.begin(). Debug builds assert that both ranges report the same size.
+ * @tparam P packed data type, an unsigned integral type
+ * @tparam OR destination range type
+ * @tparam WR widths range type
+ * @param packed value to unpack
+ * @param dest destination range
+ * @param widths widths range. values are in bits
+ * @param little_endian if the input has the first element in the least significant place
+ * @return the total number of bits unpacked
+ */
+template <typename P, typename OR, typename WR>
+    requires std::unsigned_integral<P> && range<OR> && range<WR>
+size_t unpack_into(P packed, OR &dest, const WR &widths, bool little_endian = true) {
+    assert(size(widths) == size(dest));
+    return unpack_into(packed, dest.begin(), dest.end(), widths.begin(), little_endian);
+}
+
+/**
+ * Unpacks an unsigned integer into a range of smaller integers.
+ * Every element uses the same width: the width is wrapped in a const_iterator and
+ * handed to the width-iterator overload.
+ * @tparam P packed data type, an unsigned integral type
+ * @tparam OI destination iterator type
+ * @param packed value to unpack
+ * @param begin destination start iterator
+ * @param end destination end iterator
+ * @param width width of each packed element in bits
+ * @param little_endian if the input has the first element in the least significant place
+ * @return the total number of bits unpacked
+ */
+template <typename P, typename OI>
+    requires std::unsigned_integral<P> && std::output_iterator<OI, P>
+size_t unpack_into(P packed, OI begin, OI end, size_t width, bool little_endian = true) {
+    return unpack_into(packed, begin, end, const_iterator(width), little_endian);
+}
+
+/**
+ * Unpacks an unsigned integer into a range of smaller integers.
+ * Every element uses the same width: forwards dest.begin(), dest.end() and a
+ * const_iterator over the width to the width-iterator overload.
+ * @tparam P packed data type, an unsigned integral type
+ * @tparam OR destination range type
+ * @param packed value to unpack
+ * @param dest destination range
+ * @param width width of each packed element in bits
+ * @param little_endian if the input has the first element in the least significant place
+ * @return the total number of bits unpacked
+ */
+template <typename P, typename OR>
+    requires std::unsigned_integral<P> && range<OR>
+size_t unpack_into(P packed, OR &dest, size_t width, bool little_endian = true) {
+    return unpack_into(packed, dest.begin(), dest.end(), const_iterator(width), little_endian);
+}
+
+/**
+ * Unpacks an unsigned integer into an array of smaller integers
+ * @tparam U unpacked data type
+ * @tparam N number of values to unpack
+ * @tparam P packed data type, an unsigned integral type
+ * @tparam WI widths iterator type
+ * @param packed value to unpack
+ * @param widths widths iterator. values are in bits
+ * @param little_endian if the input has the first element in the least significant place
+ * @return an array of unpacked values
+ */
+template <typename U, size_t N, typename P, typename WI>
+    requires std::unsigned_integral<P> && std::forward_iterator<WI>
+std::array<U, N> unpack(P packed, WI widths, bool little_endian = true) {
+    std::array<U, N> unpacked;
+    // widths is an iterator, not a range, so the destination has to be passed as an
+    // iterator pair to reach the width-iterator overload of unpack_into
+    unpack_into(packed, unpacked.begin(), unpacked.end(), widths, little_endian);
+    return unpacked;
+}
+
+/**
+ * Unpacks an unsigned integer into an array of smaller integers
+ * Forwards widths.begin() to the width-iterator overload; the array extent fixes the
+ * number of widths to N.
+ * @tparam U unpacked data type
+ * @tparam N number of values to unpack, equal to the number of widths
+ * @tparam P packed data type, an unsigned integral type
+ * @param packed value to unpack
+ * @param widths widths array. values are in bits
+ * @param little_endian if the input has the first element in the least significant place
+ * @return an array of unpacked values
+ */
+template <typename U, size_t N, typename P>
+    requires std::unsigned_integral<P>
+std::array<U, N> unpack(P packed, const std::array<size_t, N> &widths, bool little_endian = true) {
+    return unpack<U, N>(packed, widths.begin(), little_endian);
+}
+
+/**
+ * Unpacks an unsigned integer into an array of smaller integers
+ * Forwards widths.begin() to the width-iterator overload. Debug builds assert that the
+ * range holds exactly N widths.
+ * @tparam U unpacked data type
+ * @tparam N number of values to unpack
+ * @tparam P packed data type, an unsigned integral type
+ * @tparam WR widths range type
+ * @param packed value to unpack
+ * @param widths widths range. values are in bits
+ * @param little_endian if the input has the first element in the least significant place
+ * @return an array of unpacked values
+ */
+template <typename U, size_t N, typename P, typename WR>
+    requires std::unsigned_integral<P> && range<WR>
+std::array<U, N> unpack(P packed, const WR &widths, bool little_endian = true) {
+    assert(size(widths) == N);
+    return unpack<U, N>(packed, widths.begin(), little_endian);
+}
+
+/**
+ * Unpacks an unsigned integer into an array of smaller integers
+ * Every element uses the same width; the work is done by unpack_into on a local array.
+ * @tparam U unpacked data type
+ * @tparam N number of values to unpack
+ * @tparam P packed data type, an unsigned integral type
+ * @param packed value to unpack
+ * @param width width of each packed element in bits
+ * @param little_endian if the input has the first element in the least significant place
+ * @return an array of unpacked values
+ */
+template <typename U, size_t N, typename P>
+    requires std::unsigned_integral<P>
+std::array<U, N> unpack(P packed, size_t width, bool little_endian = true) {
+    std::array<U, N> unpacked;  // left uninitialized; unpack_into writes begin() to end()
+    unpack_into(packed, unpacked, width, little_endian);
+    return unpacked;
+}
+
+/**
+ * Packs an iterable of integers into a single integer.
+ * @tparam II input iterator type
+ * @tparam WI width iterator type
+ * @tparam P Output data type. must be an unsigned integral type large enough to hold all input values
+ * @param start start iterator
+ * @param end end iterator
+ * @param widths width iterator. must be at least as large as the input data
+ * @param little_endian if the output value should have the first element in the least significant place
+ * of the output or not
+ * @return Packed integer of type P.
+ *
+ * Each value is truncated to its width before it is merged. Masks are built in P rather
+ * than int, so a field may be as wide as P itself. Debug builds assert that the widths
+ * together fit in P.
+ */
+template <typename P, typename II, typename WI>
+    requires std::unsigned_integral<P> && std::input_iterator<II> && std::input_iterator<WI>
+inline constexpr P pack(II start, II end, WI widths, bool little_endian = true) {
+    P packed = 0;
+    unsigned offset = 0;
+    while (start < end) {
+        P val = static_cast<P>(*(start++));
+        auto w = *(widths++);
+
+        // a field as wide as P cannot be expressed as a shift of P{1}, which would be
+        // undefined, so it is handled separately
+        const bool full_width = (w >= std::numeric_limits<P>::digits);
+        const P mask = full_width ? static_cast<P>(~P{0}) : static_cast<P>((P{1} << w) - 1);
+
+        val &= mask;
+        assert(val <= mask);  // ensure value can fit in W bits
+
+        if (little_endian) {
+            packed |= static_cast<P>(val) << offset;  // first element is in the least significant place of packed
+        } else {
+            // first element is in the most significant place of packed; a full-width
+            // field replaces everything, which also avoids shifting by the width of P
+            packed = full_width ? val : static_cast<P>((packed << w) | static_cast<P>(val));
+        }
+
+        offset += w;  // increment offset
+    }
+
+    assert(offset <= std::numeric_limits<P>::digits);  // detect an overflow condition
+    return packed;
+}
+
+/**
+ * Packs an iterable of integers into a single integer.
+ * Range front end for the iterator overload: forwards r.begin(), r.end() and
+ * widths.begin(). Debug builds assert that both ranges report the same size.
+ * @tparam IR input range type
+ * @tparam WR width range type
+ * @tparam P Output data type. must be an unsigned integral type large enough to hold all input values
+ * @param r range of values to pack
+ * @param widths range of widths to pack with. must be at least as large as r
+ * @param little_endian if the output value should have the first element in the least significant place
+ * of the output or not
+ * @return Packed integer of type P.
+ */
+template <typename P, typename IR, typename WR>
+    requires std::unsigned_integral<P> && range<IR> && range<WR>
+inline constexpr P pack(IR r, WR widths, bool little_endian = true) {
+    assert(size(widths) == size(r));
+    // the range concept guarantees begin()/end(); there is no start() member to call
+    return pack<P>(r.begin(), r.end(), widths.begin(), little_endian);
+}
+
+/**
+ * Packs an iterable of integers into a single integer.
+ * Every element uses the same width: the width is wrapped in a const_iterator and
+ * handed to the width-iterator overload.
+ * @tparam II input iterator type
+ * @tparam P Output data type. must be an unsigned integral type large enough to hold all input values
+ * @param start start iterator
+ * @param end end iterator
+ * @param width Number of bits in each value
+ * @param little_endian if the output value should have the first element in the least significant place
+ * of the output or not
+ * @return Packed integer of type P.
+ */
+template <typename P, typename II>
+    requires std::unsigned_integral<P> && std::input_iterator<II>
+inline constexpr P pack(II start, II end, size_t width, bool little_endian = true) {
+    return pack<P>(start, end, const_iterator(width), little_endian);
+}
+
+/**
+ * Packs a range of integers into a single integer.
+ * Every element uses the same width: forwards r.begin(), r.end() and a const_iterator
+ * over the width to the width-iterator overload. The range is taken by value.
+ * @tparam IR range type
+ * @tparam P Output data type. must be an unsigned integral type large enough to hold all input values
+ * @param r range of values to pack
+ * @param width Number of bits in each value
+ * @param little_endian if the output value should have the first element in the least significant place
+ * of the output or not
+ * @return Packed integer of type P.
+ */
+template <typename P, typename IR>
+    requires std::unsigned_integral<P> && range<IR>
+inline constexpr P pack(IR r, size_t width, bool little_endian = true) {
+    return pack<P>(r.begin(), r.end(), const_iterator(width), little_endian);
+}
+}  // namespace quicktex
--- a/quicktex/util/bitwiseEnums.h
+++ b/quicktex/util/bitwiseEnums.h
@ -1,5 +1,5 @@
 /*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
    Partially derived from rgbcx.h written by Richard Geldreich 2020 <richgel99@gmail.com>
    and licenced under the public domain

@ -21,38 +21,48 @@

 #include <type_traits>

+namespace quicktex {
+
 // Thanks dkavolis
-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator~(E a) noexcept -> E {
+// Bitwise complement of an enum value, computed on its underlying integer type.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator~(E a) noexcept -> E {
    using Base = std::underlying_type_t<E>;
    return static_cast<E>(~static_cast<Base>(a));
 }

-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator|(E a, E b) noexcept -> E {
+// Bitwise OR of two values of the same enum type, computed on the underlying integer type.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator|(E a, E b) noexcept -> E {
    using Base = std::underlying_type_t<E>;
    return static_cast<E>(static_cast<Base>(a) | static_cast<Base>(b));
 }

-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator&(E a, E b) noexcept -> E {
+// Bitwise AND of two values of the same enum type, computed on the underlying integer type.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator&(E a, E b) noexcept -> E {
    using Base = std::underlying_type_t<E>;
    return static_cast<E>(static_cast<Base>(a) & static_cast<Base>(b));
 }

-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator^(E a, E b) noexcept -> E {
+// Bitwise XOR of two values of the same enum type, computed on the underlying integer type.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator^(E a, E b) noexcept -> E {
    using Base = std::underlying_type_t<E>;
    return static_cast<E>(static_cast<Base>(a) ^ static_cast<Base>(b));
 }

-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator|=(E& a, E b) noexcept -> E& {
+// In-place OR: stores a | b in a and returns a reference to a.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator|=(E& a, E b) noexcept -> E& {
    a = a | b;
    return a;
 }

-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator&=(E& a, E b) noexcept -> E& {
+// In-place AND: stores a & b in a and returns a reference to a.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator&=(E& a, E b) noexcept -> E& {
    a = a & b;
    return a;
 }

-template <typename E, typename = std::enable_if_t<std::is_enum_v<E>>> constexpr inline auto operator^=(E& a, E b) noexcept -> E& {
+// In-place XOR: stores a ^ b in a and returns a reference to a.
+template <typename E> requires std::is_enum_v<E>
+constexpr inline auto operator^=(E& a, E b) noexcept -> E& {
    a = a ^ b;
    return a;
-}
+}
+}  // namespace quicktex
--- a/quicktex/util/iterator.h
+++ b/quicktex/util/iterator.h
@ -0,0 +1,146 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+namespace quicktex {
+
+namespace detail {
+// Type produced by subscripting an lvalue R with 0, with any reference removed.
+// cv-qualifiers are kept, so a const R whose operator[] returns const X& gives const X.
+template <class R> using subs_value_t = std::remove_reference_t<decltype(std::declval<R &>()[0])>;
+}
+
+/**
+ * CRTP base implementing the index arithmetic shared by the iterators in this file.
+ * The derived type D supplies operator* (and operator==); this base supplies increment,
+ * decrement, offsetting, distance, three-way ordering and subscripting, all expressed in
+ * terms of a single integer position.
+ * @tparam D derived iterator type (CRTP)
+ * @tparam T type that operator[] returns a reference to
+ */
+template <typename D, typename T> class index_iterator_base {
+   public:
+    using value_type = T;
+    using size_type = int;
+    using difference_type = int;
+
+    D &operator++() {
+        _index++;
+        return static_cast<D &>(*this);
+    }
+    D operator++(int) {
+        D old = static_cast<D &>(*this);
+        _index++;
+        return old;
+    }
+    D &operator--() {
+        _index--;
+        return static_cast<D &>(*this);
+    }
+    D operator--(int) {
+        D old = static_cast<D &>(*this);
+        _index--;
+        return old;
+    }
+
+    D operator+(difference_type rhs) const {
+        D d = static_cast<const D &>(*this);
+        d._index += rhs;
+        return d;
+    }
+
+    D operator-(difference_type rhs) const {
+        D d = static_cast<const D &>(*this);
+        d._index -= rhs;
+        return d;
+    }
+
+    // The compound forms adjust the index in place and hand back the derived object.
+    // *this is the base here, so it has to be cast down before it can bind to D &.
+    D &operator+=(difference_type rhs) {
+        _index += rhs;
+        return static_cast<D &>(*this);
+    }
+
+    D &operator-=(difference_type rhs) {
+        _index -= rhs;
+        return static_cast<D &>(*this);
+    }
+
+    // distance between two iterators, measured in index positions
+    difference_type operator-(const D &rhs) const { return static_cast<difference_type>(_index) - rhs._index; }
+
+    friend D operator+(difference_type lhs, const D &rhs) { return rhs + lhs; }
+
+    // ordering looks at the index only
+    friend auto operator<=>(const D &lhs, const D &rhs) { return lhs._index <=> rhs._index; }
+
+    // it[i] is *(it + i), dereferenced through D::operator*
+    T &operator[](difference_type i) { return *(static_cast<D &>(*this) + i); }
+    T &operator[](difference_type i) const { return *(static_cast<const D &>(*this) + i); }
+
+   protected:
+    int _index;
+
+   private:
+    // only the derived type may construct the base
+    friend D;
+    index_iterator_base(difference_type index = 0) : _index(index) {}
+};
+
+/**
+ * Iterator over anything subscriptable: holds a pointer to the container and an index,
+ * and dereferences by subscripting the container with that index.
+ * @tparam R container type, must support r[0] on a const reference
+ */
+template <typename R>
+    requires requires(const R &r) { r[0]; }
+class index_iterator : public index_iterator_base<index_iterator<R>, detail::subs_value_t<R>> {
+   public:
+    using base = index_iterator_base<index_iterator<R>, detail::subs_value_t<R>>;
+    using typename base::difference_type;
+    using typename base::size_type;
+    using typename base::value_type;
+
+    // a default-constructed iterator refers to no container and must not be dereferenced
+    index_iterator() : base(0), _range(nullptr) {}
+    index_iterator(R &container, int position) : base(position), _range(&container) {}
+
+    value_type &operator*() const {
+        // debug builds check for a container and an index inside [0, size())
+        assert(_range != nullptr);
+        assert(this->_index >= 0);
+        assert(this->_index < static_cast<size_type>(_range->size()));
+        return (*_range)[this->_index];
+    }
+    value_type *operator->() const { return &(**this); }
+
+    // equal only when both the container and the index agree
+    friend bool operator==(const index_iterator &lhs, const index_iterator &rhs) {
+        if (lhs._range != rhs._range) return false;
+        return lhs._index == rhs._index;
+    }
+
+   private:
+    R *_range;
+};
+
+/**
+ * Iterator that yields the same stored value at every position. The index only takes
+ * part in arithmetic and comparison.
+ * @tparam T type of the repeated value
+ */
+template <typename T> class const_iterator : public index_iterator_base<const_iterator<T>, const T> {
+   public:
+    using base = index_iterator_base<const_iterator<T>, const T>;
+    using typename base::difference_type;
+    using typename base::size_type;
+    using typename base::value_type;
+
+    const_iterator() : base(0), _value(T{}) {}
+    const_iterator(T value, int index = 0) : base(index), _value(value) {}
+
+    value_type &operator*() const { return _value; }
+    value_type *operator->() const { return &_value; }
+
+    // Every position holds the same value, so subscripting returns this iterator's own
+    // copy. The inherited operator[] dereferences a temporary iterator, and for this type
+    // that would return a reference into the temporary.
+    value_type &operator[](difference_type) const { return _value; }
+
+    // equal only when both the stored value and the index agree
+    friend bool operator==(const const_iterator &lhs, const const_iterator &rhs) {
+        return (lhs._value == rhs._value) && (lhs._index == rhs._index);
+    }
+
+   private:
+    T _value;
+};
+
+// const_iterator is guaranteed to be a random access iterator. it is not writable for obvious reasons
+static_assert(std::random_access_iterator<const_iterator<int>>);
+
+// index_iterator over a subscriptable container is a random access iterator as well
+static_assert(std::random_access_iterator<index_iterator<std::array<int, 4>>>);
+}  // namespace quicktex
--- a/quicktex/util/map.h
+++ b/quicktex/util/map.h
@ -0,0 +1,178 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <tuple>
+#include <xsimd/xsimd.hpp>
+
+#include "util/ranges.h"
+
+namespace quicktex {
+
+namespace detail {
+
+// A range qualifies for SIMD processing when it is a random_access_range, its begin()
+// iterator is contiguous and its element type is arithmetic.
+// NOTE(review): random_access_range is not declared in the part of util/ranges.h shown
+// alongside this file; confirm where it comes from.
+template <typename T>
+concept simdable = random_access_range<T> && std::contiguous_iterator<decltype(std::declval<T>().begin())> &&
+                   std::is_arithmetic_v<range_value_t<T>>;
+// Splits a value of type T into "chunks" that map operations work on. The primary
+// template is empty; the constrained partial specializations below provide the API.
+template <typename T, bool serial = false> struct chunker_impl {};
+
+// SIMD chunker, chosen when T is simdable and serial processing was not requested.
+// Work is split into two steps: step 0 covers whole xsimd batches, step 1 covers the
+// elements left over after the last whole batch.
+template <typename T, bool serial>
+    requires simdable<T> && (!serial)
+struct chunker_impl<T, serial> {
+    // range with contiguous, SIMDable data
+
+    static constexpr size_t steps = 2;
+    // chunk type per step: a batch of elements, then a single element
+    using chunk_types = std::tuple<xsimd::batch<range_value_t<T>>, range_value_t<T>>;
+
+    template <size_t step> using chunk_type = std::tuple_element_t<step, chunk_types>;
+    // elements per chunk for each step
+    static constexpr std::array<size_t, 2> chunk_sizes = {chunk_type<0>::size, 1};
+
+    // number of chunks in r for the given step: whole batches for step 0, the remainder otherwise
+    template <size_t step> static constexpr size_t chunk_count(const T& r) {
+        if constexpr (step == 0) {
+            return std::size(r) / chunk_sizes[0];
+        } else {
+            return std::size(r) % chunk_sizes[0];
+        }
+    }
+
+    // reads chunk i of the given step: an unaligned batch load for step 0, otherwise the
+    // i-th element after the last whole batch
+    template <size_t step> static constexpr auto get_chunk(const T& r, size_t i) {
+        assert(i < chunk_count<step>(r));
+        if constexpr (step == 0) {
+            return xsimd::load_unaligned(&r[chunk_sizes[0] * i]);
+        } else {
+            return r[chunk_sizes[0] * chunk_count<0>(r) + i];
+        }
+    }
+
+    // writes chunk i of the given step, using the same addressing as get_chunk
+    template <size_t step>
+    static constexpr void set_chunk(T& r, size_t i, const std::tuple_element_t<step, chunk_types>& c) {
+        assert(i < chunk_count<step>(r));
+        if constexpr (step == 0) {
+            xsimd::store_unaligned(&r[chunk_sizes[0] * i], c);
+        } else {
+            r[chunk_sizes[0] * chunk_count<0>(r) + i] = c;
+        }
+    }
+};
+
+// Serial chunker, chosen for random access ranges that are not simdable or when serial
+// processing was requested. There is a single step and every element is its own chunk.
+// The step template parameter is ignored by all three accessors.
+template <typename T, bool serial>
+    requires random_access_range<T> && (!simdable<T> || serial)
+struct chunker_impl<T, serial> {
+    // range with data that cant be SIMDed
+    static constexpr size_t steps = 1;
+    template <size_t step> using chunk_type = range_value_t<T>;
+    static constexpr std::array<size_t, 1> chunk_sizes = {1};
+
+    template <size_t step> static constexpr size_t chunk_count(const T& r) { return r.size(); }
+    template <size_t step> static constexpr auto get_chunk(const T& r, size_t i) { return r[i]; }
+    template <size_t step> static constexpr void set_chunk(T& r, size_t i, const chunk_type<0>& c) { r[i] = c; }
+};
+
+// Chunker for types that are not sized ranges. The whole value is treated as one chunk:
+// get_chunk returns it unchanged for any index, which lets such an argument be fed to
+// every iteration of a map.
+template <typename T, bool serial>
+    requires(!sized_range<T>)
+struct chunker_impl<T, serial> {
+    static constexpr size_t steps = 1;
+    using chunk_types = std::tuple<T>;
+    template <size_t step> using chunk_type = T;
+
+    static constexpr std::array<size_t, 1> chunk_sizes = {1};
+
+    template <size_t step> static constexpr size_t chunk_count(const T&) { return 1; }
+    template <size_t step> static constexpr auto get_chunk(const T& r, size_t) { return r; }
+    template <size_t step> static constexpr void set_chunk(T& r, size_t, const T& c) { r = c; }
+};
+
+// Shorthand for the chunk type that chunker_impl<T, serial> uses at the given step.
+template <typename T, bool serial = false, size_t step = 0>
+using chunk_type = typename chunker_impl<T, serial>::template chunk_type<step>;
+
+// True when Op can be invoked with the step-th chunk type of every Args and its result
+// converts to the step-th chunk type of T.
+template <typename T, bool serial, typename Op, std::size_t step, typename... Args>
+static constexpr bool callable_step() {
+    return std::is_invocable_r_v<typename chunker_impl<T, serial>::template chunk_type<step>, Op,
+                                 typename chunker_impl<Args, serial>::template chunk_type<step>...>;
+}
+
+// True when callable_step holds for every step index listed in the index_sequence.
+template <typename T, bool serial, typename Op, typename... Args, std::size_t... steps>
+static constexpr bool callable_steps(std::index_sequence<steps...>) {
+    return (callable_step<T, serial, Op, steps, Args...>() && ...);
+}
+
+// True when Op is usable with the chunk types of T and Args.
+// Only step 0 is checked (the index sequence has length 1); the commented-out lines
+// show an earlier version that required all Args to be T and checked every step.
+template <typename T, bool serial, typename Op, typename... Args> static constexpr bool callable() {
+    //    if constexpr (!(std::same_as<T, Args> && ...)) return false;
+    //    return callable_steps<T, serial, Op>(std::make_index_sequence<chunker_impl<T, serial>::steps>());
+    return callable_steps<T, serial, Op, Args...>(std::make_index_sequence<1>());
+}
+
+// Runs one step of a map: for each chunk index of result, calls f with the matching
+// chunk of every argument and stores what it returns into result.
+// The chunk count is taken from result, so the arguments are indexed with result's layout.
+template <typename T, bool serial, size_t step, typename... Args>
+    requires((std::is_scalar_v<Args> || std::same_as<T, Args>) && ...)
+inline void do_map_step(auto f, T& result, const Args&... args) {
+    using impl = chunker_impl<T, serial>;
+    using chunk_type = typename impl::template chunk_type<step>;
+    size_t chunk_count = impl::template chunk_count<step>(result);
+
+    for (unsigned i = 0; i < chunk_count; i++) {
+        chunk_type out_chunk = f(chunker_impl<Args, serial>::template get_chunk<step>(args, i)...);
+        impl::template set_chunk<step>(result, i, out_chunk);
+    }
+}
+
+// Runs do_map_step once for every step index in the index_sequence, in order.
+template <typename T, bool serial, typename Op, std::size_t... steps, typename... Args>
+    requires((std::is_scalar_v<Args> || std::same_as<T, Args>) && ...)
+inline void do_map_steps(Op f, T& result, std::index_sequence<steps...>, const Args&... args) {
+    //    static_assert(callable<T, serial, Op, Args...>());
+
+    // comma fold: the steps run left to right
+    (do_map_step<T, serial, steps>(f, result, args...), ...);
+}
+
+// Maps f over args into result, chunk by chunk.
+// Falls back to the serial chunker when serial was requested or when f cannot be called
+// with the non-serial chunk types.
+template <typename T, bool serial, typename Op, typename... Args>
+    requires((std::is_scalar_v<Args> || std::same_as<T, Args>) && ...)
+inline void do_map_all(Op f, T& result, const Args&... args) {
+    constexpr bool must_serialize = serial || !callable<T, false, Op, Args...>();
+
+    // The step count has to come from the chunker that actually does the work. Taking it
+    // from chunker_impl<T, serial> after a forced fallback would drive the one-step serial
+    // chunker for two steps and map the whole range twice.
+    using impl = chunker_impl<T, must_serialize>;
+    do_map_steps<T, must_serialize>(f, result, std::make_index_sequence<impl::steps>(), args...);
+}
+}  // namespace detail
+
+// Element-wise map into a result of a different type R.
+// result[i] = f(in[i], args[i]...) for every index of in; no chunking is attempted.
+// The serial parameter is accepted but not used here.
+template <typename R, typename T, bool serial = false, typename Op, typename... Args>
+    requires sized_range<T> && (sized_range<Args> && ...)
+inline R map_to(Op f, const T& in, const Args&... args) {
+    // the input and result types are not the same, so attempting chunking is unsafe
+    R out{};
+    unsigned idx = 0;
+    while (idx < in.size()) {
+        out[idx] = f(in[idx], args[idx]...);
+        idx++;
+    }
+    return out;
+}
+
+// Applies f element-wise to in and args and returns the results.
+// When every extra argument is a scalar or has type T, and f is callable on the serial
+// chunk types, the result is a T filled through the chunked path (do_map_all).
+// Otherwise the result is a std::array of f's result type, sized by std::tuple_size_v<T>
+// and filled by map_to.
+template <typename T, bool serial = false, typename Op, typename... Args>
+    requires sized_range<T>
+inline auto map(Op f, const T& in, const Args&... args) {
+    //    assert(((in.size() == args.size())) && ...);
+
+    if constexpr (((std::is_scalar_v<Args> || std::same_as<T, Args>)&&...) &&
+                  (detail::callable<T, true, Op, T, Args...>())) {
+        // the input and result types are all the same type and size, so we can attempt chunking
+        T result{};
+        detail::do_map_all<T, serial>(f, result, in, args...);
+        return result;
+    } else {
+        // NOTE(review): this branch names range_value_t<Args> and map_to requires every
+        // Args to be a sized_range, so scalar extra arguments look unsupported here - confirm
+        using result_type = std::invoke_result_t<Op, typename detail::chunk_type<T, true>, range_value_t<Args>...>;
+        return map_to<std::array<result_type, std::tuple_size_v<T>>, T, serial>(f, in, args...);
+    }
+}
+
+}  // namespace quicktex
--- a/quicktex/util/math.h
+++ b/quicktex/util/math.h
@ -0,0 +1,84 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021-2022 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <numeric>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "util/ranges.h"
+#include "xsimd/xsimd.hpp"
+
+namespace quicktex {
+
+using std::abs;    // abs overload for builtin types
+using xsimd::abs;  // abs overload for xsimd buffers
+
+// abs overload for types that provide their own abs() member; forwards to it.
+template <typename S>
+    requires requires(S &s) { s.abs(); }
+constexpr S abs(S value) {
+    return value.abs();
+}
+
+// clamp overload for types that provide their own clamp(low, high) member; forwards to it.
+template <typename S, typename R>
+    requires requires(S s, R r) { s.clamp(r, r); }
+constexpr S clamp(S value, R low, R high) {
+    return value.clamp(low, high);
+}
+
+// clamp overload for scalar types: returns low when value is below it, high when value
+// is above it, and value otherwise. Debug builds assert that low <= high.
+template <typename S>
+    requires std::is_scalar_v<S>
+constexpr S clamp(S value, S low, S high) {
+    assert(low <= high);
+    return (value < low) ? low : ((value > high) ? high : value);
+}
+
+// clamp overload for xsimd batches with batch bounds; forwards to xsimd::clip.
+template <typename S, typename A>
+constexpr xsimd::batch<S, A> clamp(xsimd::batch<S, A> value, const xsimd::batch<S, A> &low,
+                                   const xsimd::batch<S, A> &high) {
+    return xsimd::clip(value, low, high);
+}
+
+// clamp overload for an xsimd batch with scalar bounds: broadcasts both bounds and
+// calls clamp again with the broadcast values.
+// NOTE(review): presumably this resolves to the batch/batch overload above; that needs
+// xsimd::broadcast to return a batch of the same architecture A - confirm.
+template <typename S, typename A>
+constexpr xsimd::batch<S, A> clamp(xsimd::batch<S, A> value, const S &low, const S &high) {
+    return clamp(value, xsimd::broadcast(low), xsimd::broadcast(high));
+}
+
+// sum overload for types that provide their own sum() member; forwards to it.
+template <typename S>
+    requires requires(S &s) { s.sum(); }
+constexpr auto sum(S value) {
+    return value.sum();
+}
+
+// sum overload for scalar types: returns the value unchanged.
+template <typename S>
+    requires std::is_scalar_v<S>
+constexpr auto sum(S value) {
+    return value;
+    // horizontally adding a scalar is a noop
+}
+
+// sum overload for xsimd batches; forwards to xsimd::hadd.
+template <typename S, typename A> constexpr auto sum(xsimd::batch<S, A> value) { return xsimd::hadd(value); }
+}  // namespace quicktex
--- a/quicktex/util/ranges.h
+++ b/quicktex/util/ranges.h
@ -0,0 +1,74 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <string>
+#include <type_traits>
+
+namespace quicktex {
+
+// std::ranges is not usable by default in libc++ 13
+// Minimal stand-in for a range concept: satisfied by any type whose lvalue
+// provides member begin() and end(). Only member functions are checked, so
+// types relying on free begin()/end() (e.g. raw arrays) do not qualify.
+template <class T>
+concept range = requires(T &t) {
+                    t.begin();
+                    t.end();
+                };
+
+using std::size;
+
+// Element count of a range, measured as the distance from begin() to end().
+template <range T>
+constexpr auto size(const T &r) {
+    return std::distance(r.begin(), r.end());
+}
+
+// A range for which the unqualified call size(t) is well-formed on an lvalue.
+template <class T>
+concept sized_range = range<T> && requires(T &t) { size(t); };
+
+// Associated-type aliases for a range R.
+// iterator_t / sentinel_t are the types returned by member begin() / end() on an
+// lvalue R; range_size_t is the result type of size(); the remaining aliases are
+// the std::iter_* traits applied to iterator_t<R>.
+template <class R> using iterator_t = decltype(std::declval<R &>().begin());
+template <class R> using sentinel_t = decltype(std::declval<R &>().end());
+template <class R> using range_size_t = decltype(size(std::declval<R &>()));
+template <class R> using range_difference_t = std::iter_difference_t<iterator_t<R>>;
+template <class R> using range_value_t = std::iter_value_t<iterator_t<R>>;
+template <class R> using range_reference_t = std::iter_reference_t<iterator_t<R>>;
+template <class R> using range_rvalue_reference_t = std::iter_rvalue_reference_t<iterator_t<R>>;
+
+// Iterator-category refinements of `range`. Each one requires range<R> plus the
+// matching standard iterator concept on iterator_t<R>.
+
+// Range whose iterator models std::input_iterator.
+template <class R>
+concept input_range = range<R> && std::input_iterator<iterator_t<R>>;
+
+// Range whose iterator can be written through with values of type T.
+template <class R, typename T>
+concept output_range = range<R> && (std::output_iterator<iterator_t<R>, T>);
+
+// Range whose iterator models std::forward_iterator.
+template <class R>
+concept forward_range = range<R> && std::forward_iterator<iterator_t<R>>;
+
+// Range whose iterator models std::bidirectional_iterator.
+template <class R>
+concept bidirectional_range = range<R> && std::bidirectional_iterator<iterator_t<R>>;
+
+// Range whose iterator models std::random_access_iterator.
+template <class R>
+concept random_access_range = range<R> && std::random_access_iterator<iterator_t<R>>;
+
+// Range whose iterator models std::contiguous_iterator.
+template <class R>
+concept contiguous_range = range<R> && std::contiguous_iterator<iterator_t<R>>;
+
+}  // namespace quicktex
--- a/quicktex/util/simd.h
+++ b/quicktex/util/simd.h
@ -0,0 +1,97 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <type_traits>
+
+#include "util/math.h"
+#include "util/types.h"
+#include "xsimd/xsimd.hpp"
+
+// Shorthand for xsimd's kernel architecture-tag parameter type, used below to
+// select a whadd overload per instruction set.
+// NOTE(review): this alias is declared at global scope, outside namespace quicktex.
+template <typename T> using requires_arch = xsimd::kernel::requires_arch<T>;
+
+namespace quicktex::simd {
+
+namespace kernel {
+
+#if XSIMD_WITH_NEON64
+// NEON64 kernels: one overload per 8/16/32-bit signed and unsigned lane type.
+// Each forwards to the matching vaddlvq_* intrinsic and returns an integer of
+// twice the lane width.
+template <class A> inline int16_t whadd(xsimd::batch<int8_t, A> const& arg, requires_arch<xsimd::neon64>) {
+    return vaddlvq_s8(arg);
+}
+
+template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::neon64>) {
+    return vaddlvq_s16(arg);
+}
+
+template <class A> inline int64_t whadd(xsimd::batch<int32_t, A> const& arg, requires_arch<xsimd::neon64>) {
+    return vaddlvq_s32(arg);
+}
+
+template <class A> inline uint16_t whadd(xsimd::batch<uint8_t, A> const& arg, requires_arch<xsimd::neon64>) {
+    return vaddlvq_u8(arg);
+}
+
+template <class A> inline uint32_t whadd(xsimd::batch<uint16_t, A> const& arg, requires_arch<xsimd::neon64>) {
+    return vaddlvq_u16(arg);
+}
+
+template <class A> inline uint64_t whadd(xsimd::batch<uint32_t, A> const& arg, requires_arch<xsimd::neon64>) {
+    return vaddlvq_u32(arg);
+}
+#endif
+
+#if XSIMD_WITH_SSE2
+// SSE2 kernel for int16 lanes: widening horizontal add returning int32.
+template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::sse2>) {
+    // Multiplying by 1 with madd sums adjacent int16 pairs into int32 lanes...
+    const auto ones = _mm_set1_epi16(1);
+    xsimd::batch<int32_t, A> widened = _mm_madd_epi16(arg, ones);
+    // ...which are then reduced to a single value.
+    return xsimd::hadd(widened);
+}
+#endif
+
+#if XSIMD_WITH_AVX2
+// AVX2 kernel for int16 lanes: widening horizontal add returning int32.
+template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::avx2>) {
+    // Multiplying by 1 with madd sums adjacent int16 pairs into int32 lanes...
+    const auto ones = _mm256_set1_epi16(1);
+    xsimd::batch<int32_t, A> widened = _mm256_madd_epi16(arg, ones);
+    // ...which are then reduced to a single value.
+    return xsimd::hadd(widened);
+}
+#endif
+
+// Portable fallback kernel: spill the batch to an aligned buffer and add the
+// lanes one at a time in the widened type.
+template <class A, class T> inline next_size_t<T> whadd(xsimd::batch<T, A> const& arg, requires_arch<xsimd::generic>) {
+    using wide_t = next_size_t<T>;
+    constexpr std::size_t lane_count = xsimd::batch<T, A>::size;
+
+    alignas(A::alignment()) T lanes[lane_count];
+    arg.store_aligned(lanes);
+
+    wide_t total = 0;
+    for (std::size_t i = 0; i < lane_count; i++) { total += static_cast<wide_t>(lanes[i]); }
+    return total;
+}
+}  // namespace kernel
+
+// Public entry point: widening horizontal add over all lanes of `arg`, returned
+// as next_size_t<T>. Dispatches to a kernel overload by passing an instance of
+// the batch's architecture tag A as the second argument.
+template <class A, class T> inline next_size_t<T> whadd(xsimd::batch<T, A> const& arg) {
+    return kernel::whadd(arg, A{});
+}
+
+}  // namespace quicktex::simd
--- a/quicktex/util/subrange.h
+++ b/quicktex/util/subrange.h
@ -0,0 +1,97 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <concepts>
+#include <iterator>
+
+#include "util/ranges.h"
+
+namespace quicktex {
+
+// Lightweight, non-owning view over the iterator/sentinel pair [begin, end),
+// modelled on std::ranges::subrange. Only copies of the two iterators are stored.
+template <std::input_or_output_iterator I, std::sentinel_for<I> S = I> struct subrange {
+   public:
+    using iterator_type = I;
+    using sentinel_type = S;
+    using value_type = std::iter_value_t<I>;
+    using reference_type = std::iter_reference_t<I>;
+    using difference_type = std::iter_difference_t<I>;
+
+    // Build a view over [b, e).
+    constexpr subrange(const I& b, const S& e) : _begin(b), _end(e) {}
+
+    constexpr I begin() const { return _begin; }
+    constexpr S end() const { return _end; }
+    constexpr bool empty() const { return _begin == _end; }
+
+    // Number of elements in [begin, end). std::distance takes (first, last); with
+    // the arguments swapped the result is negative for random-access iterators
+    // and undefined for weaker categories.
+    constexpr difference_type size() const { return std::distance(_begin, _end); }
+
+    // True when the view still contains at least one element.
+    explicit constexpr operator bool() const { return !empty(); }
+
+    // Move the start of the view by n elements and return *this.
+    //  - n > 0: steps forward, stopping early once the end is reached.
+    //  - n < 0: steps backward by -n; only available for bidirectional iterators.
+    //    The backward walk is not limited by `end`: the start of the underlying
+    //    sequence is unknown here, so the caller must not step before it.
+    constexpr subrange& advance(difference_type n) {
+        if (n >= 0) {
+            for (difference_type i = 0; i < n && _begin != _end; ++i) { ++_begin; }
+        } else if constexpr (std::bidirectional_iterator<I>) {
+            for (difference_type i = 0; i > n; --i) { --_begin; }
+        } else {
+            // keeps advance()/next() instantiable for forward-only iterators
+            assert(false && "forward iterators cannot be decremented");
+        }
+        return *this;
+    }
+
+    // Copy of this view with its start advanced by n (see advance()).
+    constexpr subrange next(difference_type n = 1) const {
+        auto tmp = *this;
+        return tmp.advance(n);
+    }
+
+    // Copy of this view with its start moved back by n.
+    template <typename _ = I>
+        requires std::bidirectional_iterator<I>
+    constexpr subrange prev(difference_type n = 1) const {
+        return next(-n);
+    }
+
+    // Element access relative to begin(); the index is bounds-checked by assert.
+    template <typename _ = I>
+        requires std::random_access_iterator<I>
+    constexpr reference_type operator[](difference_type i) {
+        assert(i >= 0 && i < size());
+        return _begin[i];
+    }
+
+    template <typename _ = I>
+        requires std::random_access_iterator<I>
+    constexpr const reference_type operator[](difference_type i) const {
+        assert(i >= 0 && i < size());
+        return _begin[i];
+    }
+
+    // Pointer to the first element. The return type is deduced so that views over
+    // const iterators yield a pointer-to-const instead of failing to convert to
+    // value_type*.
+    template <typename _ = I>
+        requires std::contiguous_iterator<I>
+    constexpr auto data() {
+        return std::to_address(_begin);
+    }
+    template <typename _ = I>
+        requires std::contiguous_iterator<I>
+    constexpr value_type const* data() const {
+        return std::to_address(_begin);
+    }
+
+   private:
+    I _begin;
+    S _end;
+};
+}  // namespace quicktex
--- a/quicktex/util/types.h
+++ b/quicktex/util/types.h
@ -0,0 +1,49 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <cstdint>
+#include <type_traits>
+
+namespace quicktex {
+// next_size<T>::type (alias next_size_t<T>) maps an 8/16/32-bit fixed-width
+// integer type to the type of the same signedness with twice the width.
+// The primary template is only declared, so no `type` exists for any other T.
+template <class> struct next_size;
+template <class T> using next_size_t = typename next_size<T>::type;
+// Helper base that exposes its template argument as the member alias `type`.
+template <class T> struct type_tag { using type = T; };
+
+template <> struct next_size<int8_t> : type_tag<int16_t> {};
+template <> struct next_size<int16_t> : type_tag<int32_t> {};
+template <> struct next_size<int32_t> : type_tag<int64_t> {};
+
+template <> struct next_size<uint8_t> : type_tag<uint16_t> {};
+template <> struct next_size<uint16_t> : type_tag<uint32_t> {};
+template <> struct next_size<uint32_t> : type_tag<uint64_t> {};
+
+// Smallest of uint8_t/uint16_t/uint32_t/uint64_t whose width is at least
+// bitCount bits; resolves to void when bitCount exceeds 64.
+template <auto bitCount>
+using unsigned_bits =
+    std::conditional_t<bitCount <= 8, std::uint8_t,
+                       std::conditional_t<bitCount <= 16, std::uint16_t,
+                                          std::conditional_t<bitCount <= 32, std::uint32_t,
+                                                             std::conditional_t<bitCount <= 64, std::uint64_t, void>>>>;
+
+// Signed counterpart of unsigned_bits: smallest of int8_t/int16_t/int32_t/int64_t
+// whose width is at least bitCount bits; resolves to void when bitCount exceeds 64.
+template <auto bitCount>
+using signed_bits =
+    std::conditional_t<bitCount <= 8, std::int8_t,
+                       std::conditional_t<bitCount <= 16, std::int16_t,
+                                          std::conditional_t<bitCount <= 32, std::int32_t,
+                                                             std::conditional_t<bitCount <= 64, std::int64_t, void>>>>;
+}  // namespace quicktex
--- a/setup.py
+++ b/setup.py
@ -1,15 +1,116 @@
+import os
+import re
+import subprocess
 import sys

-try:
-    from skbuild import setup
-except ImportError:
-    print(
-        "Please update pip, you need pip 10 or greater,\n"
-        " or you need to install the PEP 518 requirements in pyproject.toml yourself",
-        file=sys.stderr,
-    )
-    raise
+import pybind11
+from setuptools import setup, Extension
+from setuptools.command.build_ext import build_ext

-setup(
-    cmake_install_dir='.'
-)
+# Directory containing this setup.py, with symlinks resolved.
+project_path = os.path.dirname(os.path.realpath(__file__))
+
+
+# A CMakeExtension needs a sourcedir instead of a file list.
+# The name must be the _single_ output extension from the CMake build.
+# If you need multiple extensions, see scikit-build.
+class CMakeExtension(Extension):
+    def __init__(self, name, sourcedir=""):
+        Extension.__init__(self, name, sources=[])
+        self.sourcedir = os.path.abspath(sourcedir)
+
+
+class CMakeBuild(build_ext):
+    def build_extension(self, ext):
+        from setuptools_scm import get_version
+
+        version = get_version(root='.', relative_to=__file__)
+
+        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
+
+        # required for auto-detection of auxiliary "native" libs
+        if not extdir.endswith(os.path.sep):
+            extdir += os.path.sep
+
+        cfg = "Debug" if self.debug else "RelWithDebInfo"
+        if 'QUICKTEX_DEBUG' in os.environ:
+            cfg = "Debug"
+
+        # CMake lets you override the generator - we need to check this.
+        # Can be set with Conda-Build, for example.
+        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")
+
+        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
+        cmake_args = [
+            "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}".format(extdir),
+            "-Dpybind11_DIR={}".format(pybind11.get_cmake_dir()),
+            "-DPython_EXECUTABLE={}".format(sys.executable),
+            "-DPython_ROOT_DIR={}".format(os.path.dirname(sys.executable)),
+            "-DQUICKTEX_VERSION_INFO={}".format(version),  # include version info in module
+            "-DCMAKE_BUILD_TYPE={}".format(cfg),  # not used on MSVC, but no harm
+            # clear cached make program binary, see https://github.com/pypa/setuptools/issues/2912
+            "-U",
+            "CMAKE_MAKE_PROGRAM",
+        ]
+        build_args = []
+
+        if self.verbose:
+            build_args += ["--verbose"]
+
+        if self.compiler.compiler_type != "msvc":
+            # Using Ninja-build since it a) is available as a wheel and b)
+            # multithreads automatically. MSVC would require all variables be
+            # exported for Ninja to pick it up, which is a little tricky to do.
+            # Users can override the generator with CMAKE_GENERATOR in CMake
+            # 3.15+.
+            if not cmake_generator:
+                cmake_args += ["-GNinja"]
+
+        else:
+            # if 'CC' in os.environ and 'clang-cl' in os.environ['CC']:
+            #     cmake_args += ["-T", 'ClangCL']  # https://stackoverflow.com/a/64189112/7645957
+
+            # Single config generators are handled "normally"
+            single_config = any(x in cmake_generator for x in {"NMake", "Ninja"})
+
+            # CMake allows an arch-in-generator style for backward compatibility
+            contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})
+
+            # Convert distutils Windows platform specifiers to CMake -A arguments
+            plat_to_cmake = {"win32": "Win32", "win-amd64": "x64", "win-arm32": "ARM", "win-arm64": "ARM64"}
+
+            # Specify the arch if using MSVC generator, but only if it doesn't
+            # contain a backward-compatibility arch spec already in the
+            # generator name.
+            if not single_config and not contains_arch:
+                cmake_args += ["-A", plat_to_cmake[self.plat_name]]
+
+            # Multi-config generators have a different way to specify configs
+            if not single_config:
+                cmake_args += ["-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), extdir)]
+                build_args += ["--config", cfg]
+
+        if sys.platform.startswith("darwin"):
+            # Cross-compile support for macOS - respect ARCHFLAGS if set
+            archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
+            if archs:
+                cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]
+
+        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
+        # across all generators.
+        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
+            # self.parallel is a Python 3 only way to set parallel jobs by hand
+            # using -j in the build_ext call, not supported by pip or PyPA-build.
+            if hasattr(self, "parallel") and self.parallel:
+                # CMake 3.12+ only.
+                build_args += ["-j{}".format(self.parallel)]
+
+        if not os.path.exists(self.build_temp):
+            os.makedirs(self.build_temp)
+
+        subprocess.check_call(["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp)
+        subprocess.check_call(["cmake", "--build", ".", "--target", ext.name] + build_args, cwd=self.build_temp)
+
+
+# The information here can also be placed in setup.cfg - better separation of
+# logic and declaration, and simpler if you include description/version in a file.
+setup(use_scm_version=True, ext_modules=[CMakeExtension("_quicktex")], cmdclass={"build_ext": CMakeBuild})
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Andrew Cassidy	c92d58d115	Rework ranges library Better matches the standard library, and iterators moved to their own file	2022-07-05 22:51:25 -07:00
Andrew Cassidy	9b3c1d0ca3	Add subrange template	2022-07-03 19:08:15 -07:00
Andrew Cassidy	db24af730e	Use int for sizes in matrix type	2022-07-03 11:56:37 -07:00
Andrew Cassidy	f77ea3be0f	MSVC is a joke For some reason index variables need to be signed?	2022-07-02 17:14:12 -07:00
Andrew Cassidy	6afe4851bd	Address some of the more annoying gcc warnings	2022-07-02 17:02:28 -07:00
Andrew Cassidy	3a27a89155	oopsz	2022-07-01 20:11:30 -07:00
Andrew Cassidy	768248c20d	Add option to enable sanitization in python module this is off by default, since it requires juggling some platform-specific environment variables	2022-07-01 19:30:21 -07:00
Andrew Cassidy	0bd0c6846f	you win this time, GCC	2022-06-30 23:46:24 -07:00
Andrew Cassidy	dcd9bf4287	Enable sanitizers in tests These all throw a fit when pointed at cpython unfortunately	2022-06-30 23:39:42 -07:00
Andrew Cassidy	32a411634e	Fix mistaken use of max() instead of max_element()	2022-06-30 21:54:55 -07:00
Andrew Cassidy	bac61eb0fe	Try to make testing return better errors	2022-06-28 17:47:37 -07:00
Andrew Cassidy	2cfcd26a90	Sum and matrix multiply tests	2022-06-28 17:08:30 -07:00
Andrew Cassidy	3119ba1a6c	oops	2022-06-28 16:28:04 -07:00
Andrew Cassidy	bfba3228f0	Last attempt	2022-06-28 16:22:29 -07:00
Andrew Cassidy	2d7aeeb2d8	attempt to make MSVC happy	2022-06-28 16:06:45 -07:00
Andrew Cassidy	3849303a9b	Workaround for GCC Technically this syntax isnt required, but GCC has a bug that hasnt been fixed since 2015. see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67274	2022-06-28 15:48:24 -07:00
Andrew Cassidy	c41e023735	Fix VecBase constructor to only take the scalar type	2022-06-28 15:04:08 -07:00
Andrew Cassidy	0ee45ba966	Add improved, generalized arithmetic matrix tests	2022-06-28 14:57:55 -07:00
Andrew Cassidy	73441d1ed3	MSVC stop being annoying	2022-06-26 20:17:56 -07:00
Andrew Cassidy	54d61e0bd3	Fix iterators maybe	2022-06-26 19:50:36 -07:00
Andrew Cassidy	487f05c90a	Pass config to ctest for windows	2022-06-26 19:33:43 -07:00
Andrew Cassidy	b5a55f606c	attempt 2	2022-06-26 19:27:25 -07:00
Andrew Cassidy	3ab354db74	Run ctest from build directory	2022-06-26 19:21:04 -07:00
Andrew Cassidy	ef8a41fe03	Run tests verbosely	2022-06-26 19:08:14 -07:00
Andrew Cassidy	598175739f	try using ctest im so close to giving up on windows support	2022-06-26 19:03:25 -07:00
Andrew Cassidy	963d985572	make MSVC shut up	2022-06-26 18:50:19 -07:00
Andrew Cassidy	143bde78d6	Fix running tests	2022-06-26 18:43:53 -07:00
Andrew Cassidy	a96aadc867	use gcc 10	2022-06-26 18:34:06 -07:00
Andrew Cassidy	29741447cd	syntax error	2022-06-26 18:13:38 -07:00
Andrew Cassidy	9011718c09	add NOPYTHON option	2022-06-26 18:09:40 -07:00
Andrew Cassidy	8b2c240094	syntax error	2022-06-26 18:02:40 -07:00
Andrew Cassidy	c97daa21ec	would help if i cloned the repo	2022-06-26 18:00:30 -07:00
Andrew Cassidy	f5defd2817	Move testing to its own step	2022-06-26 17:57:41 -07:00
Andrew Cassidy	49ba7e26b7	Run C tests in CI	2022-06-26 17:40:05 -07:00
Andrew Cassidy	bdd75ddddf	oops	2022-06-26 17:27:47 -07:00
Andrew Cassidy	e528c12b2d	Introduce custom map function	2022-06-26 17:16:02 -07:00
Andrew Cassidy	aec31a2fdc	improved clamp and sum	2022-06-24 21:41:43 -07:00
Andrew Cassidy	17663f4871	First trial run using the Matrix type	2022-06-22 22:32:19 -07:00
Andrew Cassidy	f2352f10fd	Smarter map function using variadics and chunking	2022-06-22 20:39:40 -07:00
Andrew Cassidy	3ceb028907	Attempt to batch some matrix ops	2022-06-22 00:39:36 -07:00
Andrew Cassidy	10ba6b2bd6	Remove utest	2022-06-20 18:42:31 -07:00
Andrew Cassidy	1c06cccd5c	More vector unit tests	2022-06-19 18:33:54 -07:00
Andrew Cassidy	232fb6cb41	use position independent code	2022-06-18 17:14:48 -07:00
Andrew Cassidy	19df5df68d	Rework project layout and tests	2022-06-18 17:06:17 -07:00
Andrew Cassidy	3756f31e20	matrix multiplication and transposition	2022-06-13 22:55:41 -07:00
Andrew Cassidy	2c59419bf0	Cleanup and replace Matrix.h with Vec.h Mysteriously this also (perhaps temporarily) fixed a CPU usage issue in Clion? I guess I'll take it	2022-06-12 20:01:56 -07:00
Andrew Cassidy	f767525aa1	Improved matrix/vector class	2022-06-12 17:06:53 -07:00
Andrew Cassidy	59fefae3f7	A	2022-06-09 22:48:52 -07:00
Andrew Cassidy	0bcfd50a44	I s2g I will make Stallman have a Nice Time	2022-06-08 23:15:51 -07:00
Andrew Cassidy	527067839f	GCC has a very toxic view on friendship as a concept tbh	2022-06-08 23:12:42 -07:00
Andrew Cassidy	a33cb8ea67	Add new vector ops with smarter type deduction	2022-06-08 23:07:43 -07:00
Andrew Cassidy	d293687424	Flesh out ranges library	2022-06-06 22:31:41 -07:00
Andrew Cassidy	6f075b6c1d	Reorganize and add Window class	2022-06-05 17:12:04 -07:00
Andrew Cassidy	f88212af85	ColorSet extension/alias for Matrix	2022-06-02 23:45:53 -07:00
Andrew Cassidy	d3515c1db8	Add matrix template	2022-06-02 23:15:57 -07:00
Andrew Cassidy	20305d2ea9	tweaks and formatting	2022-06-02 22:10:38 -07:00
Andrew Cassidy	abfe0b8d10	Header file reorganization	2022-06-01 23:50:35 -07:00
Andrew Cassidy	f097f71ba9	Assorted cleanup	2022-05-31 23:09:20 -07:00
Andrew Cassidy	961c2b7134	Build on macOS 12 This should build on older versions, so long as you have llvm 13. But the homebrew version of llvm the macos 11 runner has only includes x86 dylibs which cant be linked against when building for arm.	2022-05-31 01:16:18 -07:00
Andrew Cassidy	9388406769	oops	2022-05-31 01:13:21 -07:00
Andrew Cassidy	7430dccd5b	im going to break MSVC's kneecaps	2022-05-31 01:03:52 -07:00
Andrew Cassidy	fa0579ff03	thats not a dollar sign	2022-05-31 00:53:15 -07:00
Andrew Cassidy	9f7eb5fe57	Target LLVM 13	2022-05-31 00:49:30 -07:00
Andrew Cassidy	3b7164ffba	Refactor pack() and unpack()	2022-05-30 22:41:17 -07:00
Andrew Cassidy	dae507acc9	Don't build wheels for musl I don't understand the reason these don't compile	2022-05-29 20:37:49 -07:00
Andrew Cassidy	7eac371064	less iterators	2022-05-29 19:03:47 -07:00
Andrew Cassidy	53a6427dcc	iterators are confusing	2022-05-29 16:51:35 -07:00
Andrew Cassidy	b9c7c7cf6e	Fix broken constructor	2022-05-29 15:58:18 -07:00
Andrew Cassidy	debaa6b54d	Add Vector template class Also experimentally bump to C++20 just to see if it works on GCC 9.3	2022-05-29 15:54:55 -07:00
Andrew Cassidy	c96450b5fe	Rename Color to prepare for refactor	2022-05-25 23:42:06 -07:00
Andrew Cassidy	fffa291765	Fix LeastSquares mode and add tests for every quality level	2022-05-24 22:57:51 -07:00
Andrew Cassidy	829b5312b5	Tweak compiler warnings	2022-05-24 21:33:39 -07:00
Andrew Cassidy	c57106e3b2	Skip version number checking with debug builds Because I found a way to build the extension module directly which helps speed up development immensely	2022-05-24 20:47:55 -07:00
Andrew Cassidy	468414f339	Add arm whadd instructions for all sizes of integer	2022-05-23 23:42:14 -07:00
Andrew Cassidy	f9831b1f61	:unsmilebeale:	2022-05-23 00:46:23 -07:00
Andrew Cassidy	0046bef9d3	Cmake why	2022-05-23 00:41:46 -07:00
Andrew Cassidy	8f19ad6a1d	I cant spell This reverts commit `ed10899601`.	2022-05-23 00:37:28 -07:00
Andrew Cassidy	ed10899601	just never include windows.h tis a silly OS	2022-05-23 00:35:12 -07:00
Andrew Cassidy	2588ebcaa3	its late ok?	2022-05-23 00:28:01 -07:00
Andrew Cassidy	345344eef3	cleanup and prevent windows macros from stepping on everything	2022-05-23 00:21:33 -07:00
Andrew Cassidy	bf35983b2d	Add tests for c-level code	2022-05-22 22:20:30 -07:00
Andrew Cassidy	c2d4e9be4d	Merge branch 'feature/pytest' into feature/simd	2022-05-22 21:02:34 -07:00
Andrew Cassidy	920059bea1	Migrate tests to pytest	2022-05-22 18:40:13 -07:00
Andrew Cassidy	daae86cf50	Switch to pytest	2022-05-22 16:50:24 -07:00
Andrew Cassidy	c05879f1c1	Fixes and tweaks to whadd	2022-05-22 16:38:54 -07:00
Andrew Cassidy	aa6bd9602d	Add widening hadd	2022-05-21 21:23:22 -07:00
Andrew Cassidy	8c77356aca	Begin integrating xsimd	2022-05-20 20:46:45 -07:00
Andrew Cassidy	79f77a24b2	Remove Highway	2022-05-20 20:18:18 -07:00
Andrew Cassidy	04fece2771	Move clang-cl detection to cmake and allow it when setting flags	2022-05-16 23:41:42 -07:00
Andrew Cassidy	7ba2225644	Fix typo	2022-05-16 23:18:54 -07:00
Andrew Cassidy	4b3e236275	Tell Cmake to use clang-cl when requested	2022-05-16 23:13:40 -07:00
Andrew Cassidy	b2523dbe19	Try something	2022-05-16 23:05:09 -07:00
Andrew Cassidy	bcdfcb95fb	h	2022-05-16 21:33:33 -07:00
Andrew Cassidy	e5f1a45c6b	raspberry noise	2022-05-16 21:32:01 -07:00
Andrew Cassidy	014f7063fd	Build wheels for SSE4 and using clang on windows	2022-05-16 21:29:45 -07:00
Andrew Cassidy	74aaac00d7	Fix AVX2 bug and add cmake SIMD ISA selection By default, quicktex builds with -march=native on x86, unless an environment variable requesting a specific ISA is set. This doesnt work on MSVC though because it's a shit compiler, so it just falls back to no flags.	2022-05-16 20:52:31 -07:00
Andrew Cassidy	8168d6e249	Add emulated 128-bit support and fix x86	2022-05-15 20:03:05 -07:00
Andrew Cassidy	f7b0cbe76b	add widening horizontal add for s16 vectors	2022-05-15 18:08:36 -07:00
Andrew Cassidy	bc925d3949	Include Highway in build	2022-05-15 00:43:28 -07:00
Andrew Cassidy	7f75104d18	Spell external correctly and exclude all affinity designer files	2022-05-13 00:51:39 -07:00
Andrew Cassidy	643276660a	Include submodules in sdists and github actions	2022-05-13 00:36:00 -07:00
Andrew Cassidy	9789ecd159	Add Highway dependency	2022-05-13 00:32:35 -07:00
Andrew Cassidy	5c87c82702	Add file documenting development environment setup	2022-05-11 23:22:51 -07:00
Andrew Cassidy	ddbeff43cb	Update copyright year	2022-05-11 20:51:35 -07:00
Andrew Cassidy	5c94782876	Remove debug wrapper, now that I know how to use a debug python build	2022-05-10 22:08:01 -07:00
Andrew Cassidy	9eaaf901f3	Fix compilation of test wrapper	2022-05-08 16:06:36 -07:00
dependabot[bot]	c79ffc8794	Bump docker/setup-qemu-action from 1 to 2 Bumps [docker/setup-qemu-action](https://github.com/docker/setup-qemu-action) from 1 to 2. - [Release notes](https://github.com/docker/setup-qemu-action/releases) - [Commits](https://github.com/docker/setup-qemu-action/compare/v1...v2) --- updated-dependencies: - dependency-name: docker/setup-qemu-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2022-05-06 18:33:39 -07:00
Andrew Cassidy	55f0ced229	getting started instructions	2022-05-04 23:29:28 -07:00
Andrew Cassidy	eb7b259d53	Autogenerated command documentation from helpstrings	2022-05-04 23:03:06 -07:00
Andrew Cassidy	03801e2e1b	New index page and remove broken page	2022-05-04 22:51:40 -07:00
Andrew Cassidy	3a28ec690c	Merge remote-tracking branch 'origin/dependabot/github_actions/pypa/cibuildwheel-2.5.0' into dev	2022-05-02 22:42:11 -07:00
Andrew Cassidy	697f7243a0	Documentation nice-to-haves	2022-05-02 22:41:39 -07:00
Andrew Cassidy	22e1455ceb	tell dependabot to target the dev branch	2022-05-02 22:26:05 -07:00
dependabot[bot]	c13f64828f	Bump pypa/cibuildwheel from 2.4.0 to 2.5.0 Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.4.0 to 2.5.0. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.4.0...2.5.0) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2022-05-02 11:41:47 +00:00
Andrew Cassidy	9a57b096f5	Set min version for sphinx tools	2022-04-21 09:34:21 -07:00
Andrew Cassidy	82f079f1b6	Enable RTD building	2022-04-19 21:55:00 -07:00
Andrew Cassidy	2c72b7ad22	Ignore previous commit in blames	2022-04-18 19:56:41 -07:00
Andrew Cassidy	cb84f32eda	Migrate code style to Black	2022-04-18 19:53:26 -07:00
Andrew Cassidy	b34fdf2316	Ignore wheels and sdists	2022-04-16 22:40:54 -07:00
Andrew Cassidy	ac4e5b2679	Release 0.1.3 ### Fixed - Fixed quicktex not compiling for python 3.10 on Windows ### Changed - Reworked CI job, adding wheels for ARM macOS, ARM Linux, and x86 musl Linux. - Added wheels for python 3.10 - Added a more useful error message when importing quicktex on macOS when libomp.dylib isn't installed	2022-04-12 19:21:06 -07:00
dependabot[bot]	25e74b9b08	Bump actions/setup-python from 3.1.1 to 3.1.2 Bumps [actions/setup-python](https://github.com/actions/setup-python) from 3.1.1 to 3.1.2. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v3.1.1...v3.1.2) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2022-04-12 18:24:37 -07:00
Andrew Cassidy	a881a0a36b	Add more helpful error when importing without libomp installed Also use non-shallow clones in ci	2022-04-11 23:05:20 -07:00
Andrew Cassidy	3fdfc3ecaa	Pretty job names	2022-04-11 21:48:22 -07:00
Andrew Cassidy	b440543de3	why	2022-04-11 21:18:43 -07:00
Andrew Cassidy	23ed54c7a2	fix conditional	2022-04-11 21:15:15 -07:00
Andrew Cassidy	e7e8657100	fix syntax error	2022-04-11 21:11:19 -07:00
Andrew Cassidy	2a07db8c8f	Run arm64 linux builds on their own job for faster CI	2022-04-11 21:09:50 -07:00
dependabot[bot]	b8a80235f8	Bump actions/upload-artifact from 2 to 3 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 2 to 3. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2022-04-11 10:26:24 -07:00
dependabot[bot]	4cac24798e	Bump actions/download-artifact from 2 to 3 Bumps [actions/download-artifact](https://github.com/actions/download-artifact) from 2 to 3. - [Release notes](https://github.com/actions/download-artifact/releases) - [Commits](https://github.com/actions/download-artifact/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/download-artifact dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2022-04-11 10:25:24 -07:00
Andrew Cassidy	9b6097373e	Another attempt	2022-04-10 22:03:17 -07:00
Andrew Cassidy	b954ac6ccc	Attempt to hint to cmake what python install to use correctly Should fix my linux building issue?	2022-04-10 21:51:18 -07:00
Andrew Cassidy	593a0c3f46	Fix windows test command and install fat binaries for libomp	2022-04-10 21:08:07 -07:00
Andrew Cassidy	3b73bc8bce	use Py_ssize_t to make msvc happy	2022-04-10 01:06:25 -07:00
Andrew Cassidy	abeb08fc81	Upgrade cibuildwheel to 2.4.0	2022-04-10 00:26:00 -07:00
Andrew Cassidy	77637f6abd	Run tests in the right directory and skip linux for now	2022-04-10 00:22:13 -07:00
Andrew Cassidy	df6d5b1848	Use latest setuptools instead of pinning it	2022-04-10 00:09:44 -07:00
Andrew Cassidy	b5aea803d5	Use relative imports in tests	2022-04-10 00:04:08 -07:00
Andrew Cassidy	b80a6d2229	Fix arch selection and test command	2022-04-09 23:15:23 -07:00
Andrew Cassidy	dac7f07db4	Build for musl linux for platforms supported by Pillow Specifically x64 for cpython 3.8-3.10	2022-04-09 22:54:06 -07:00
Andrew Cassidy	7dfefa3007	Skip musl linux wheel builds Pillow appears to fail to compile on these without installing a bunch of dependencies, so... sorry alpine users	2022-04-09 22:40:05 -07:00
Andrew Cassidy	eaca455a08	syntax error	2022-04-09 22:24:35 -07:00
Andrew Cassidy	e5ccdbb4f4	Don't try to download test images	2022-04-09 22:23:00 -07:00
Andrew Cassidy	94d88c7e00	Remove nose dependence Also move test images into the base repo because they're not very big anyways	2022-04-09 22:20:09 -07:00
Andrew Cassidy	9421a6d372	Merge branch 'main' into build-modernization	2022-04-09 20:39:43 -07:00
Andrew Cassidy	cab0eeebae	Merge pull request #10 from drewcassidy/dependabot/github_actions/actions/checkout-3 Bump actions/checkout from 2 to 3	2022-04-09 20:21:43 -07:00
Andrew Cassidy	3d98b37a37	Merge pull request #13 from drewcassidy/dependabot/github_actions/actions/setup-python-3.1.1 Bump actions/setup-python from 2 to 3.1.1	2022-04-09 20:21:33 -07:00
Andrew Cassidy	654b6d628a	Slightly modernize how project is compiled	2022-04-09 19:43:40 -07:00
dependabot[bot]	1502c5318c	Bump actions/setup-python from 2 to 3.1.1 Bumps [actions/setup-python](https://github.com/actions/setup-python) from 2 to 3.1.1. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v2...v3.1.1) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2022-04-07 11:45:36 +00:00
dependabot[bot]	8b4e3c5746	Bump actions/checkout from 2 to 3 Bumps [actions/checkout](https://github.com/actions/checkout) from 2 to 3. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2022-04-03 20:15:09 +00:00
Andrew Cassidy	8e7b95609c	Dependabot for gh actions	2022-04-03 13:12:29 -07:00