diff --git a/src/BC1/BC1Encoder.cpp b/src/BC1/BC1Encoder.cpp
index ba3faf6..c9803a3 100644
--- a/src/BC1/BC1Encoder.cpp
+++ b/src/BC1/BC1Encoder.cpp
@@ -19,11 +19,15 @@
 
 #include "BC1Encoder.h"
 
+#include <algorithm>
+#include <array>
 #include <cstdint>
 #include <memory>
 
 #include "../BlockView.h"
 #include "../Color.h"
+#include "../Matrix4x4.h"
+#include "../Vector4.h"
 #include "../bitwiseEnums.h"
 
 namespace rgbcx {
@@ -98,14 +102,15 @@ void BC1Encoder::EncodeBlock(Color4x4 pixels, BC1Block *dest) const {
     auto g_view = pixels.GetChannel(1);
     auto b_view = pixels.GetChannel(2);
 
-    if (pixels.IsSingleColor() || true) {  // for now assume (wrongly) everything is a single-color block
+    Color first = pixels.Get(0, 0);
+
+    if (pixels.IsSingleColor()) {  // NOTE: blocks that are not single-color are not encoded yet (the function ends after GetMetrics())
         // single-color pixel block, do it the fast way
-        EncodeBlockSingleColor(pixels.Get(0, 0), dest);
+        EncodeBlockSingleColor(first, dest);
         return;
     }
 
-    Color min, max, avg;
-    pixels.GetMinMaxAvgRGB(min, max, avg);
+    auto metrics = pixels.GetMetrics();
 }
 
 void BC1Encoder::EncodeBlockSingleColor(Color color, BC1Block *dest) const {
@@ -155,7 +160,7 @@ void BC1Encoder::EncodeBlockSingleColor(Color color, BC1Block *dest) const {
                 assert(min16 == 0 && max16 == 0);
                 max16 = 1;
                 min16 = 0;
-                mask = 0x55;  // 1111 (min value only, max is ignored)
+                mask = 0x55;  // 1111 (Min value only, max is ignored)
             }
         } else if (max16 < min16) {
             std::swap(min16, max16);
@@ -172,4 +177,193 @@ void BC1Encoder::EncodeBlockSingleColor(Color color, BC1Block *dest) const {
     dest->selectors[3] = mask;
 }
 
+// Picks the initial low/high endpoint colors (already scaled to 5:6:5) for a block and writes them to low/high.
+// Strategy, in priority order: greyscale shortcut when metrics.is_greyscale; otherwise the first of Use2DLS,
+// BoundingBox or BoundingBoxInt that is set in flags; otherwise a covariance power-iteration fit.
+void BC1Encoder::FindEndpoints(Color4x4 pixels, BC1Encoder::Flags flags, const BC1Encoder::BlockMetrics metrics, Color &low, Color &high) const {
+    if (metrics.is_greyscale) {
+        // specialized greyscale case
+        const unsigned fr = pixels.Get(0).r;
+
+        if (metrics.max.r - metrics.min.r < 2) {
+            // single color block
+            low.r = high.r = scale8To5(fr);
+            low.g = high.g = scale8To6(fr);
+            low.b = high.b = low.r;
+        } else {
+            low.r = low.b = scale8To5(metrics.min.r);
+            low.g = scale8To6(metrics.min.r);
+
+            high.r = high.b = scale8To5(metrics.max.r);
+            high.g = scale8To6(metrics.max.r);
+        }
+    } else if ((flags & Flags::Use2DLS) != Flags::None) {
+        //  2D Least Squares approach from Humus's example, with added inset and optimal rounding.
+        Color diff = Color(metrics.max.r - metrics.min.r, metrics.max.g - metrics.min.g, metrics.max.b - metrics.min.b);
+        Vector4 l = {0, 0, 0};
+        Vector4 h = {0, 0, 0};
+
+        auto &sums = metrics.sums;
+        auto &min = metrics.min;
+        auto &max = metrics.max;
+
+        unsigned chan0 = diff.MaxChannelRGB();  // primary axis of the bounding box
+        l[chan0] = (float)min[chan0];
+        h[chan0] = (float)max[chan0];  // the high endpoint starts at the maximum of the primary axis
+
+        assert(diff[chan0] >= diff[(chan0 + 1) % 3] && diff[chan0] >= diff[(chan0 + 2) % 3]);
+
+        std::array<unsigned, 3> sums_xy = {0, 0, 0};  // must start at zero: it is only ever accumulated into
+
+        for (unsigned i = 0; i < 16; i++) {
+            auto val = pixels.Get(i);
+            for (unsigned c = 0; c < 3; c++) { sums_xy[c] += val[chan0] * val[c]; }
+        }
+
+        auto &sum_x = sums[chan0];
+        auto &sum_xx = sums_xy[chan0];
+
+        float denominator = (float)(16 * sum_xx) - (float)(sum_x * sum_x);
+
+        // once per secondary axis, calculate high and low using least squares
+        if (fabs(denominator) > 1e-8f) {
+            for (unsigned i = 1; i < 3; i++) {
+                /* each secondary axis is fitted with a linear formula of the form
+                 *  y = ax + b
+                 * where y is the secondary axis, x is the primary axis and m = 16 pixels
+                 *  a = (m∑xy - ∑x∑y) / (m∑x² - (∑x)²)
+                 *  b = (∑x²∑y - ∑xy∑x) / (m∑x² - (∑x)²)
+                 * see Giordano/Weir pg.103 */
+                const unsigned chan = (chan0 + i) % 3;
+                const unsigned &sum_y = sums[chan];
+                const unsigned &sum_xy = sums_xy[chan];
+
+                float a = (float)((int64_t)(16 * sum_xy) - (int64_t)(sum_x * sum_y)) / denominator;  // signed: the numerator may be negative
+                float b = (float)((int64_t)(sum_xx * sum_y) - (int64_t)(sum_xy * sum_x)) / denominator;  // signed for the same reason
+
+                l[chan] = b + (a * l[chan0]);
+                h[chan] = b + (a * h[chan0]);
+            }
+        }
+
+        // once per axis, inset towards the center by 1/16 of the delta and scale
+        for (unsigned c = 0; c < 3; c++) {
+            float inset = (h[c] - l[c]) / 16.0f;
+
+            l[c] = ((l[c] + inset) / 255.0f);
+            h[c] = ((h[c] - inset) / 255.0f);
+        }
+
+        low = Color::PreciseRound565(l);
+        high = Color::PreciseRound565(h);
+    } else if ((flags & Flags::BoundingBox) != Flags::None) {
+        // Algorithm from icbc.h compress_dxt1_fast()
+        Vector4 l, h;
+        const float bias = 8.0f / 255.0f;
+
+        // rescale and inset values
+        for (unsigned c = 0; c < 3; c++) {  // heh, c++
+            l[c] = (float)metrics.min[c] / 255.0f;
+            h[c] = (float)metrics.max[c] / 255.0f;
+
+            float inset = (h[c] - l[c] - bias) / 16.0f;
+            l[c] += inset;
+            h[c] -= inset;
+        }
+
+        // Select the correct diagonal across the bounding box
+        int icov_xz = 0, icov_yz = 0;
+        for (unsigned i = 0; i < 16; i++) {
+            int b = (int)pixels.Get(i).b - metrics.avg.b;
+            icov_xz += b * ((int)pixels.Get(i).r - metrics.avg.r);  // product of the two centered values
+            icov_yz += b * ((int)pixels.Get(i).g - metrics.avg.g);
+        }
+
+        if (icov_xz < 0) std::swap(l[0], h[0]);
+        if (icov_yz < 0) std::swap(l[1], h[1]);
+
+        low = Color::PreciseRound565(l);
+        high = Color::PreciseRound565(h);
+    } else if ((flags & Flags::BoundingBoxInt) != Flags::None) {
+        // Algorithm from icbc.h compress_dxt1_fast(), but converted to integer.
+        Color min, max;
+
+        // rescale and inset values
+        for (unsigned c = 0; c < 3; c++) {
+            int inset = ((int)(metrics.max[c] - metrics.min[c]) - 8) >> 4;  // 1/16 of delta, with bias
+
+            min[c] = clamp255(metrics.min[c] + inset);
+            max[c] = clamp255(metrics.max[c] - inset);
+        }
+
+        int icov_xz = 0, icov_yz = 0;
+        for (unsigned i = 0; i < 16; i++) {
+            int b = (int)pixels.Get(i).b - metrics.avg.b;
+            icov_xz += b * ((int)pixels.Get(i).r - metrics.avg.r);  // product of the two centered values
+            icov_yz += b * ((int)pixels.Get(i).g - metrics.avg.g);
+        }
+
+        if (icov_xz < 0) std::swap(min.r, max.r);
+        if (icov_yz < 0) std::swap(min.g, max.g);
+
+        low = min.ScaleTo565();
+        high = max.ScaleTo565();
+    } else {
+        // the slow way
+        // Select 2 colors along the principle axis. (There must be a faster/simpler way.)
+        auto min = Vector4::FromColorRGB(metrics.min);
+        auto max = Vector4::FromColorRGB(metrics.max);
+        auto avg = Vector4::FromColorRGB(metrics.avg);
+
+        std::array<Vector4, 16> colors;
+
+        Vector4 axis = {306, 601, 117};  // I think this is luma?
+        Matrix4x4 covariance;
+        const unsigned total_power_iters = (flags & Flags::Use6PowerIters) != Flags::None ? 6 : 4;
+
+        for (unsigned i = 0; i < 16; i++) {
+            colors[i] = Vector4::FromColorRGB(pixels.Get(i));
+            Vector4 diff = colors[i] - avg;
+            for (unsigned c1 = 0; c1 < 3; c1++) {
+                for (unsigned c2 = c1; c2 < 3; c2++) {
+                    covariance[c1][c2] += (diff[c1] * diff[c2]);
+                    assert(c1 <= c2);
+                }
+            }
+        }
+
+        covariance /= 255.0f;
+        covariance.Mirror();
+
+        Vector4 delta = max - min;
+
+        // realign r and g axes to match
+        if (covariance[0][2] < 0) delta[0] = -delta[0];  // r vs b
+        if (covariance[1][2] < 0) delta[1] = -delta[1];  // g vs b
+
+        for (unsigned power_iter = 0; power_iter < total_power_iters; power_iter++) { delta = covariance * delta; }
+
+        float k = delta.MaxAbs(3);
+        if (k > 2) { axis = delta * (2048.0f / k); }
+
+        float min_dot = INFINITY;
+        float max_dot = -INFINITY;
+        unsigned min_index = 0, max_index = 0;  // always valid indices, even if every projection is equal
+
+        for (unsigned i = 0; i < 16; i++) {
+            float dot = colors[i].Dot(axis);
+            if (dot > max_dot) {
+                max_dot = dot;
+                max_index = i;
+            }
+            if (dot < min_dot) {  // tested independently: a pixel that raises the max can also be the min so far
+                min_dot = dot;
+                min_index = i;
+            }
+        }
+
+        low = pixels.Get(min_index).ScaleTo565();
+        high = pixels.Get(max_index).ScaleTo565();
+    }
+}
 }  // namespace rgbcx
\ No newline at end of file
diff --git a/src/BC1/BC1Encoder.h b/src/BC1/BC1Encoder.h
index 4b20885..9266002 100644
--- a/src/BC1/BC1Encoder.h
+++ b/src/BC1/BC1Encoder.h
@@ -29,8 +29,8 @@
 #include "../Interpolator.h"
 #include "../bitwiseEnums.h"
 #include "../ndebug.h"
-#include "../tables.h"
 #include "BC1Block.h"
+#include "tables.h"
 
 namespace rgbcx {
 
@@ -43,6 +43,7 @@ struct BC1MatchEntry {
 class BC1Encoder : public BlockEncoder<BC1Block, 4, 4> {
    public:
     using InterpolatorPtr = std::shared_ptr<Interpolator>;
+    using BlockMetrics = Color4x4::BlockMetrics;
 
     enum class Flags : uint32_t {
         None = 0,
@@ -105,10 +106,11 @@ class BC1Encoder : public BlockEncoder<BC1Block, 4, 4> {
 
     BC1Encoder(InterpolatorPtr interpolator);
 
-
     void EncodeBlock(Color4x4 pixels, BC1Block *dest) const override;
 
    private:
+    using Vec3F = std::array<float, 3>;
+
     const InterpolatorPtr _interpolator;
 
     Flags _flags;
@@ -116,7 +118,16 @@ class BC1Encoder : public BlockEncoder<BC1Block, 4, 4> {
     unsigned _orderings4;
     unsigned _orderings3;
 
+    // Unpacked BC1 block with metadata
+    struct EncodeResults {
+        Color low;   // NOTE(review): presumably the low endpoint color - confirm against the code that fills this in
+        Color high;  // NOTE(review): presumably the high endpoint color - confirm
+        std::array<uint8_t, 16> selectors;  // 16 entries; presumably one selector per pixel of the 4x4 block - confirm
+        bool is_3_color;  // NOTE(review): presumably marks the 3-color BC1 mode - confirm
+    };
+
     void EncodeBlockSingleColor(Color color, BC1Block *dest) const;
+    void FindEndpoints(Color4x4 pixels, Flags flags, BlockMetrics const metrics, Color &low, Color &high) const;
 
     // match tables used for single-color blocks
     // Each entry includes a high and low pair that best reproduces the 8-bit index as well as possible,
diff --git a/src/table4.h b/src/BC1/table4.h
similarity index 100%
rename from src/table4.h
rename to src/BC1/table4.h
diff --git a/src/tables.cpp b/src/BC1/tables.cpp
similarity index 99%
rename from src/tables.cpp
rename to src/BC1/tables.cpp
index e53e973..b69b252 100644
--- a/src/tables.cpp
+++ b/src/BC1/tables.cpp
@@ -3,6 +3,15 @@
 
 #include "tables.h"
 
+const float g_midpoint5[32] = {.015686f, .047059f, .078431f, .111765f, .145098f, .176471f, .207843f, .241176f, .274510f, .305882f, .337255f,
+                               .370588f, .403922f, .435294f, .466667f, .5f,      .533333f, .564706f, .596078f, .629412f, .662745f, .694118f,
+                               .725490f, .758824f, .792157f, .823529f, .854902f, .888235f, .921569f, .952941f, .984314f, 1e+37f};
+const float g_midpoint6[64] = {.007843f, .023529f, .039216f, .054902f, .070588f, .086275f, .101961f, .117647f, .133333f, .149020f, .164706f, .180392f, .196078f,
+                               .211765f, .227451f, .245098f, .262745f, .278431f, .294118f, .309804f, .325490f, .341176f, .356863f, .372549f, .388235f, .403922f,
+                               .419608f, .435294f, .450980f, .466667f, .482353f, .500000f, .517647f, .533333f, .549020f, .564706f, .580392f, .596078f, .611765f,
+                               .627451f, .643137f, .658824f, .674510f, .690196f, .705882f, .721569f, .737255f, .754902f, .772549f, .788235f, .803922f, .819608f,
+                               .835294f, .850980f, .866667f, .882353f, .898039f, .913725f, .929412f, .945098f, .960784f, .976471f, .992157f, 1e+37f};
+
 // All total orderings for 16 pixels 2-bit selectors.
 // BC1 selector order 0, 2, 3, 1 (i.e. the selectors are reordered into linear order).
 const uint8_t g_unique_total_orders4[NUM_UNIQUE_TOTAL_ORDERINGS4][4] = {
diff --git a/src/tables.h b/src/BC1/tables.h
similarity index 97%
rename from src/tables.h
rename to src/BC1/tables.h
index c7228e2..f306935 100644
--- a/src/tables.h
+++ b/src/BC1/tables.h
@@ -13,6 +13,9 @@ const uint32_t MAX_TOTAL_ORDERINGS4 = 32;
 const uint32_t MAX_TOTAL_ORDERINGS4 = 128;
 #endif
 
+extern const float g_midpoint5[32];
+extern const float g_midpoint6[64];
+
 const uint32_t NUM_UNIQUE_TOTAL_ORDERINGS4 = 969;
 extern const uint8_t g_unique_total_orders4[NUM_UNIQUE_TOTAL_ORDERINGS4][4];
 
diff --git a/src/BC4/BC4Encoder.cpp b/src/BC4/BC4Encoder.cpp
index 0319914..17c5dd0 100644
--- a/src/BC4/BC4Encoder.cpp
+++ b/src/BC4/BC4Encoder.cpp
@@ -50,7 +50,7 @@ void BC4Encoder::EncodeBlock(Byte4x4 pixels, BC4Block *const dest) const noexcep
     const int bias = 4 - min * 14;
     const int delta = max - min;
 
-    // min is now 0. Compute thresholds between values by scaling max. It's x14 because we're adding two x7 scale factors.
+    // Min is now 0. Compute thresholds between values by scaling max. It's x14 because we're adding two x7 scale factors.
     // bias is applied here
     std::array<int, 7> thresholds = {};
     for (unsigned i = 0; i < 7; i++) thresholds[i] = delta * (1 + (2 * (int)i)) - bias;
diff --git a/src/BlockView.h b/src/BlockView.h
index 973660c..c495354 100644
--- a/src/BlockView.h
+++ b/src/BlockView.h
@@ -102,6 +102,15 @@ template <size_t M, size_t N> class ColorBlockView : public BlockView<Color, M,
     using Base = BlockView<Color, M, N>;
     using ChannelView = BlockView<uint8_t, M, N>;
 
+    struct BlockMetrics {  // per-block statistics returned by GetMetrics()
+        Color min;  // per-channel minimum over the block's pixels (RGB only)
+        Color max;  // per-channel maximum over the block's pixels (RGB only)
+        Color avg;  // per-channel sum divided by the pixel count M * N
+        bool is_greyscale;  // true when every pixel has r == g == b
+        bool has_black;  // set when a pixel falls under GetMetrics()'s black_threshold test
+        std::array<unsigned, 3> sums;  // per-channel RGB sums over the block
+    };
+
     ColorBlockView(Color *start, int row_stride = N, int pixel_stride = 1) : Base(start, row_stride, pixel_stride) {}
 
     constexpr ChannelView GetChannel(uint8_t index) noexcept(ndebug) {
@@ -120,24 +129,32 @@ template <size_t M, size_t N> class ColorBlockView : public BlockView<Color, M,
         return true;
     }
 
-    void GetMinMaxAvgRGB(Color &min, Color &max, Color &avg) {
-        min = Base::Get(0, 0);
-        max = Base::Get(0, 0);
-        std::array<unsigned, 3> sums;
+    // Scans every pixel of the block once and returns the per-channel RGB minimum, maximum, sum and average,
+    // whether every pixel has r == g == b (is_greyscale), and whether any pixel has (r | g | b) below
+    // black_threshold (has_black).
+    BlockMetrics GetMetrics(unsigned black_threshold = 4) {
+        BlockMetrics metrics;
+        metrics.min = Color(UINT8_MAX, UINT8_MAX, UINT8_MAX);
+        metrics.max = Color(0, 0, 0);
+        metrics.has_black = false;
+        metrics.is_greyscale = true;
+        metrics.sums = {0, 0, 0};  // accumulate straight into the returned struct, starting from zero
 
-        for (unsigned i = 1; i < M * N; i++) {
+        for (unsigned i = 0; i < M * N; i++) {
             auto val = Base::Get(i);
             for (unsigned c = 0; c < 3; c++) {
-                if (val[c] < min[c]) {
-                    min[c] = val[c];
-                } else {
-                    max[c] = val[c];
-                }
-                sums[c] += val[c];
+                // min and max are tracked independently; a value may update both (e.g. the first pixel)
+                if (val[c] < metrics.min[c]) { metrics.min[c] = val[c]; }
+                if (val[c] > metrics.max[c]) { metrics.max[c] = val[c]; }
+                metrics.sums[c] += val[c];
             }
+            metrics.is_greyscale &= ((val.r == val.g) && (val.r == val.b));
+            metrics.has_black |= ((unsigned)(val.r | val.g | val.b) < black_threshold);  // parenthesized: '<' binds tighter than '|'
         }
 
-        for (unsigned c = 0; c < 3; c++) { avg[c] = (uint8_t)(sums[c] / (M * N)); }
+        for (unsigned c = 0; c < 3; c++) { metrics.avg[c] = (uint8_t)(metrics.sums[c] / (M * N)); }
+
+        return metrics;
     }
 };
 
diff --git a/src/Color.cpp b/src/Color.cpp
index a6ed99c..88a53c4 100644
--- a/src/Color.cpp
+++ b/src/Color.cpp
@@ -16,14 +16,15 @@
     You should have received a copy of the GNU Lesser General Public License
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-
 #include "Color.h"
 
-#include <algorithm>  // for max, min
+#include <algorithm>  // for max, min
 
+#include "Vector4.h"
 #include "util.h"  // for scale5To8, scale8To5, assert5bit, scale6To8
 
-// region Color implementation
+namespace rgbcx {
+
 Color::Color() { SetRGBA(0, 0, 0, 0xFF); }
 
 Color::Color(uint8_t r, uint8_t g, uint8_t b, uint8_t a) { SetRGBA(r, g, b, a); }
@@ -37,6 +38,14 @@ uint16_t Color::Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b) {
 
 uint16_t Color::Pack565(uint8_t r, uint8_t g, uint8_t b) { return Pack565Unscaled(scale8To5(r), scale8To6(g), scale8To5(b)); }
 
+Color Color::Unpack565Unscaled(uint16_t Packed) {  // splits a packed 5:6:5 value into its raw channel values, without rescaling
+    const auto red = static_cast<uint8_t>((Packed >> 11) & 0x1F);   // top 5 bits
+    const auto green = static_cast<uint8_t>((Packed >> 5) & 0x3F);  // middle 6 bits
+    const auto blue = static_cast<uint8_t>(Packed & 0x1F);          // bottom 5 bits
+
+    return Color(red, green, blue);
+}
+
 Color Color::Unpack565(uint16_t Packed) {
     uint8_t r = static_cast<uint8_t>(scale5To8((Packed >> 11) & 0x1FU));
     uint8_t g = static_cast<uint8_t>(scale6To8((Packed >> 5) & 0x3FU));
@@ -45,10 +54,24 @@ Color Color::Unpack565(uint16_t Packed) {
     return Color(r, g, b);
 }
 
-Color Color::Unpack565Unscaled(uint16_t Packed) {
-    uint8_t r = (Packed >> 11) & 0x1F;
-    uint8_t g = (Packed >> 5) & 0x3F;
-    uint8_t b = Packed & 0x1F;
+Color Color::PreciseRound565(Vector4 &v) {  // rounds v's first three components to 5/6/5-bit channel values using the midpoint tables
+    const int floor_r = static_cast<int>(v[0] * UINT5_MAX);  // truncated toward zero, not yet rounded
+    const int floor_g = static_cast<int>(v[1] * UINT6_MAX);
+    const int floor_b = static_cast<int>(v[2] * UINT5_MAX);
+
+    // keep slightly out-of-range float inputs inside the valid channel range
+    uint8_t r = static_cast<uint8_t>(clampi(floor_r, 0, UINT5_MAX));
+    uint8_t g = static_cast<uint8_t>(clampi(floor_g, 0, UINT6_MAX));
+    uint8_t b = static_cast<uint8_t>(clampi(floor_b, 0, UINT5_MAX));
+
+    // step up by one wherever the input lies above that channel's rounding midpoint
+    if (v[0] > Midpoints5bit[r]) { r++; }
+    if (v[1] > Midpoints6bit[g]) { g++; }
+    if (v[2] > Midpoints5bit[b]) { b++; }
+
+    assert5bit(r);
+    assert6bit(g);
+    assert5bit(b);
 
     return Color(r, g, b);
 }
@@ -66,15 +89,26 @@ void Color::SetRGB(uint8_t vr, uint8_t vg, uint8_t vb) {
     b = vb;
 }
 
-Color Color::min(const Color &a, const Color &b) { return Color(std::min(a[0], b[0]), std::min(a[1], b[1]), std::min(a[2], b[2]), std::min(a[3], b[3])); }
+size_t Color::MinChannelRGB() {  // index (0 for r, 1 for g, 2 for b) of the smallest RGB channel; ties pick the lowest index
+    if (r <= g && r <= b) return 0;
+    if (g <= b) return 1;  // r is not a minimum here, so only g and b compete
+    return 2;
+}
 
-Color Color::max(const Color &a, const Color &b) { return Color(std::max(a[0], b[0]), std::max(a[1], b[1]), std::max(a[2], b[2]), std::max(a[3], b[3])); }
+size_t Color::MaxChannelRGB() {  // index (0 for r, 1 for g, 2 for b) of the largest RGB channel; ties pick the lowest index
+    if (r >= g && r >= b) return 0;
+    if (g >= b) return 1;  // r is not a maximum here, so only g and b compete
+    return 2;
+}
+
+Color Color::Min(const Color &A, const Color &B) { return Color(std::min(A[0], B[0]), std::min(A[1], B[1]), std::min(A[2], B[2]), std::min(A[3], B[3])); }
+
+Color Color::Max(const Color &a, const Color &b) { return Color(std::max(a[0], b[0]), std::max(a[1], b[1]), std::max(a[2], b[2]), std::max(a[3], b[3])); }
 
 uint16_t Color::pack565() { return Pack565(r, g, b); }
-
 uint16_t Color::pack565Unscaled() { return Pack565Unscaled(r, g, b); }
 
 Color Color::ScaleTo565() const { return Color(scale8To5(r), scale8To6(g), scale8To5(b)); }
 Color Color::ScaleFrom565() const { return Color(scale5To8(r), scale6To8(g), scale5To8(b)); }
 
-// endregion
\ No newline at end of file
+}  // namespace rgbcx
\ No newline at end of file
diff --git a/src/Color.h b/src/Color.h
index 6544633..b5db7bd 100644
--- a/src/Color.h
+++ b/src/Color.h
@@ -23,7 +23,11 @@
 
 #include <cstdint>  // for uint8_t, uint16_t
 
+namespace rgbcx {
+class Vector4;
+
 #pragma pack(push, 1)
+
 class Color {
    public:
     uint8_t r;
@@ -41,6 +45,11 @@ class Color {
     static Color Unpack565Unscaled(uint16_t Packed);
     static Color Unpack565(uint16_t Packed);
 
+    static Color PreciseRound565(Vector4 &v);
+
+    static Color Min(const Color &A, const Color &B);
+    static Color Max(const Color &A, const Color &B);
+
     bool operator==(const Color &Rhs) const { return r == Rhs.r && g == Rhs.g && b == Rhs.b && a == Rhs.a; }
 
     uint8_t operator[](size_t index) const {
@@ -64,9 +73,23 @@ class Color {
     Color ScaleTo565() const;
     Color ScaleFrom565() const;
 
-    static Color min(const Color &A, const Color &B);
-    static Color max(const Color &A, const Color &B);
+    size_t MinChannelRGB();
+    size_t MaxChannelRGB();
 
-    int get_luma() const { return (13938U * r + 46869U * g + 4729U * b + 32768U) >> 16U; }  // REC709 weightings
+    bool IsGrayscale() const { return ((r == g) && (r == b)); }
+
+    int GetLuma() const { return (13938U * r + 46869U * g + 4729U * b + 32768U) >> 16U; }  // REC709 weightings
+
+   private:
+    static constexpr float Midpoints5bit[32] = {.015686f, .047059f, .078431f, .111765f, .145098f, .176471f, .207843f, .241176f, .274510f, .305882f, .337255f,
+                                                .370588f, .403922f, .435294f, .466667f, .5f,      .533333f, .564706f, .596078f, .629412f, .662745f, .694118f,
+                                                .725490f, .758824f, .792157f, .823529f, .854902f, .888235f, .921569f, .952941f, .984314f, 1e+37f};
+    static constexpr float Midpoints6bit[64] = {.007843f, .023529f, .039216f, .054902f, .070588f, .086275f, .101961f, .117647f, .133333f, .149020f, .164706f,
+                                                .180392f, .196078f, .211765f, .227451f, .245098f, .262745f, .278431f, .294118f, .309804f, .325490f, .341176f,
+                                                .356863f, .372549f, .388235f, .403922f, .419608f, .435294f, .450980f, .466667f, .482353f, .500000f, .517647f,
+                                                .533333f, .549020f, .564706f, .580392f, .596078f, .611765f, .627451f, .643137f, .658824f, .674510f, .690196f,
+                                                .705882f, .721569f, .737255f, .754902f, .772549f, .788235f, .803922f, .819608f, .835294f, .850980f, .866667f,
+                                                .882353f, .898039f, .913725f, .929412f, .945098f, .960784f, .976471f, .992157f, 1e+37f};
 };
-#pragma pack(pop)
\ No newline at end of file
+#pragma pack(pop)
+}  // namespace rgbcx
\ No newline at end of file
diff --git a/src/Image.cpp b/src/Image.cpp
deleted file mode 100644
index c80973b..0000000
--- a/src/Image.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-/*  Python-rgbcx Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "Image.h"
-
-namespace rgbcx {}  // namespace rgbcx
\ No newline at end of file
diff --git a/src/Image.h b/src/Matrix4x4.cpp
similarity index 59%
rename from src/Image.h
rename to src/Matrix4x4.cpp
index 4de17a4..8013f50 100644
--- a/src/Image.h
+++ b/src/Matrix4x4.cpp
@@ -17,9 +17,34 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#pragma once
+#include "Matrix4x4.h"
 
 namespace rgbcx {
 
-class Image {};
+Matrix4x4 operator*(const Matrix4x4& lhs, const Matrix4x4& rhs) {  // matrix product; const refs so this defines the friend declared in Matrix4x4.h
+    Matrix4x4 trans_rhs = Matrix4x4(rhs).Transpose();  // transpose a copy, so rhs itself can stay const
+    Matrix4x4 result;
+    for (unsigned r = 0; r < 4; r++) {
+        for (unsigned c = 0; c < 4; c++) { result[r][c] = lhs[r].Dot(trans_rhs[c]); }
+    }
+
+    return result;
+}
+
+Vector4 operator*(const Matrix4x4& lhs, const Vector4& rhs) {  // matrix-vector product; const refs so this defines the friend declared in Matrix4x4.h
+    Vector4 result;
+
+    for (unsigned r = 0; r < 4; r++) { result[r] = lhs[r].Dot(rhs); }  // component r is (row r of lhs) dot rhs
+
+    return result;
+}
+
+void Matrix4x4::Mirror() {  // copies every entry above the diagonal onto its mirrored position below the diagonal
+    for (unsigned col = 1; col < 4; col++) {
+        for (unsigned row = 0; row < col; row++) {
+            _r[col][row] = _r[row][col];
+        }
+    }
+}
+
 }  // namespace rgbcx
diff --git a/src/Matrix4x4.h b/src/Matrix4x4.h
new file mode 100644
index 0000000..aa70c32
--- /dev/null
+++ b/src/Matrix4x4.h
@@ -0,0 +1,93 @@
+/*  Python-rgbcx Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <functional>
+
+#include "Vector4.h"
+
+namespace rgbcx {
+
+class Matrix4x4 {  // 4x4 float matrix stored as four Vector4 rows
+   public:
+    static Matrix4x4 Identity() {  // ones on the diagonal; the other entries keep Vector4's default of 0
+        Matrix4x4 result;
+        for (unsigned i = 0; i < 4; i++) { result[i][i] = 1; }
+        return result;
+    }
+
+    static Matrix4x4 Transpose(const Matrix4x4 &val) {  // returns a copy of val with rows and columns swapped
+        Matrix4x4 result;
+        for (unsigned r = 0; r < 4; r++) {  // all four rows and columns, not only the upper-left 3x3
+            for (unsigned c = 0; c < 4; c++) { result[r][c] = val[c][r]; }
+        }
+        return result;
+    }
+
+    Vector4 operator[](size_t index) const {  // read access: returns a copy of row `index`
+        assert(index < 4);
+        return _r[index];
+    }
+    Vector4 &operator[](size_t index) {  // write access: returns a reference to row `index`
+        assert(index < 4);
+        return _r[index];
+    }
+
+    friend Matrix4x4 operator*(const Matrix4x4 &lhs, const Matrix4x4 &rhs);  // defined in Matrix4x4.cpp
+    friend Vector4 operator*(const Matrix4x4 &lhs, const Vector4 &rhs);      // defined in Matrix4x4.cpp
+
+    friend Matrix4x4 operator+(const Matrix4x4 &lhs, const Matrix4x4 &rhs) { return DoOp(lhs, rhs, std::plus()); }
+    friend Matrix4x4 operator-(const Matrix4x4 &lhs, const Matrix4x4 &rhs) { return DoOp(lhs, rhs, std::minus()); }
+
+    friend Matrix4x4 operator+(const Matrix4x4 &lhs, const float &rhs) { return DoOp(lhs, rhs, std::plus()); }
+    friend Matrix4x4 operator-(const Matrix4x4 &lhs, const float &rhs) { return DoOp(lhs, rhs, std::minus()); }
+    friend Matrix4x4 operator*(const Matrix4x4 &lhs, const float &rhs) { return DoOp(lhs, rhs, std::multiplies()); }
+    friend Matrix4x4 operator/(const Matrix4x4 &lhs, const float &rhs) { return DoOp(lhs, rhs, std::divides()); }
+
+    friend Matrix4x4 &operator+=(Matrix4x4 &lhs, const Matrix4x4 &rhs) { return lhs = lhs + rhs; }
+    friend Matrix4x4 &operator-=(Matrix4x4 &lhs, const Matrix4x4 &rhs) { return lhs = lhs - rhs; }
+    friend Matrix4x4 &operator*=(Matrix4x4 &lhs, const Matrix4x4 &rhs) { return lhs = lhs * rhs; }
+
+    friend Matrix4x4 &operator+=(Matrix4x4 &lhs, const float &rhs) { return lhs = lhs + rhs; }
+    friend Matrix4x4 &operator-=(Matrix4x4 &lhs, const float &rhs) { return lhs = lhs - rhs; }
+    friend Matrix4x4 &operator*=(Matrix4x4 &lhs, const float &rhs) { return lhs = lhs * rhs; }
+    friend Matrix4x4 &operator/=(Matrix4x4 &lhs, const float &rhs) { return lhs = lhs / rhs; }
+
+    Matrix4x4 Transpose() const { return Transpose(*this); }  // transposed copy; *this is left unchanged
+
+    void Mirror();  // defined in Matrix4x4.cpp
+
+   private:
+    template <typename Op> friend Matrix4x4 DoOp(const Matrix4x4 &lhs, const Matrix4x4 &rhs, Op f) {  // applies f to matching rows
+        Matrix4x4 result;
+        for (unsigned r = 0; r < 4; r++) { result[r] = f(lhs[r], rhs[r]); }
+        return result;
+    }
+
+    template <typename Op> friend Matrix4x4 DoOp(const Matrix4x4 &lhs, const float &rhs, Op f) {  // applies f to each row and the scalar
+        Matrix4x4 result;
+        for (unsigned r = 0; r < 4; r++) { result[r] = f(lhs[r], rhs); }
+        return result;
+    }
+
+    std::array<Vector4, 4> _r;  // the four rows
+};
+}  // namespace rgbcx
diff --git a/src/Vector4.h b/src/Vector4.h
new file mode 100644
index 0000000..7b73fb9
--- /dev/null
+++ b/src/Vector4.h
@@ -0,0 +1,114 @@
+/*  Python-rgbcx Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <functional>
+
+#include "Color.h"
+
+namespace rgbcx {
+
+// Four-component float vector with element-wise arithmetic operators.
+class Vector4 {
+   public:
+    // Zero vector.
+    Vector4() : _c{0, 0, 0, 0} {}
+
+    // Component-wise construction; z and w default to 0. Two or more components are required so that a
+    // single-float call cannot be confused with the splat constructor below (the two used to be ambiguous).
+    Vector4(float x, float y, float z = 0, float w = 0) : _c{x, y, z, w} {}
+
+    // Splat: every component is set to scalar. Explicit, so a float never silently converts to a vector.
+    explicit Vector4(float scalar) : _c{scalar, scalar, scalar, scalar} {}
+
+    // Copies r, g, b, a into x, y, z, w.
+    Vector4(const Color &c) : Vector4(c.r, c.g, c.b, c.a) {}
+
+    static Vector4 FromColor(const Color &c) { return Vector4(c); }
+
+    static Vector4 FromColorRGB(const Color &c) { return Vector4(c.r, c.g, c.b); }
+
+    // Dot product over all four components.
+    static float Dot(const Vector4 &lhs, const Vector4 &rhs) {
+        float sum = 0;
+        for (unsigned i = 0; i < 4; i++) { sum += lhs[i] * rhs[i]; }
+        return sum;
+    }
+
+    float operator[](size_t index) const {
+        assert(index < 4);
+        return _c[index];
+    }
+    float &operator[](size_t index) {
+        assert(index < 4);
+        return _c[index];
+    }
+
+    friend Vector4 operator+(const Vector4 &lhs, const Vector4 &rhs) { return DoOp(lhs, rhs, std::plus()); }
+    friend Vector4 operator-(const Vector4 &lhs, const Vector4 &rhs) { return DoOp(lhs, rhs, std::minus()); }
+    friend Vector4 operator*(const Vector4 &lhs, const Vector4 &rhs) { return DoOp(lhs, rhs, std::multiplies()); }
+    friend Vector4 operator/(const Vector4 &lhs, const Vector4 &rhs) { return DoOp(lhs, rhs, std::divides()); }
+
+    friend Vector4 operator+(const Vector4 &lhs, const float &rhs) { return DoOp(lhs, rhs, std::plus()); }
+    friend Vector4 operator-(const Vector4 &lhs, const float &rhs) { return DoOp(lhs, rhs, std::minus()); }
+    friend Vector4 operator*(const Vector4 &lhs, const float &rhs) { return DoOp(lhs, rhs, std::multiplies()); }
+    friend Vector4 operator/(const Vector4 &lhs, const float &rhs) { return DoOp(lhs, rhs, std::divides()); }
+
+    friend Vector4 &operator+=(Vector4 &lhs, const Vector4 &rhs) { return lhs = lhs + rhs; }
+    friend Vector4 &operator-=(Vector4 &lhs, const Vector4 &rhs) { return lhs = lhs - rhs; }
+    friend Vector4 &operator*=(Vector4 &lhs, const Vector4 &rhs) { return lhs = lhs * rhs; }
+    friend Vector4 &operator/=(Vector4 &lhs, const Vector4 &rhs) { return lhs = lhs / rhs; }
+
+    friend Vector4 &operator+=(Vector4 &lhs, const float &rhs) { return lhs = lhs + rhs; }
+    friend Vector4 &operator-=(Vector4 &lhs, const float &rhs) { return lhs = lhs - rhs; }
+    friend Vector4 &operator*=(Vector4 &lhs, const float &rhs) { return lhs = lhs * rhs; }
+    friend Vector4 &operator/=(Vector4 &lhs, const float &rhs) { return lhs = lhs / rhs; }
+
+    float Dot(Vector4 other) const { return Dot(*this, other); }
+    // Largest absolute value among the first `channels` components (1 to 4).
+    float MaxAbs(unsigned channels = 4) const {
+        assert(channels < 5);
+        assert(channels > 0);
+        float max = 0;
+        for (unsigned i = 0; i < channels; i++) {
+            float a = fabs((*this)[i]);
+            if (a > max) max = a;
+        }
+        return max;
+    }
+
+   private:
+    template <typename Op> friend Vector4 DoOp(const Vector4 &lhs, const Vector4 &rhs, Op f) {  // element-wise f(lhs[i], rhs[i])
+        Vector4 r;
+        for (unsigned i = 0; i < 4; i++) { r[i] = f(lhs[i], rhs[i]); }
+        return r;
+    }
+
+    template <typename Op> friend Vector4 DoOp(const Vector4 &lhs, const float &rhs, Op f) {  // element-wise f(lhs[i], rhs)
+        Vector4 r;
+        for (unsigned i = 0; i < 4; i++) { r[i] = f(lhs[i], rhs); }
+        return r;
+    }
+
+    std::array<float, 4> _c;  // components in x, y, z, w order
+};
+
+}  // namespace rgbcx
diff --git a/src/rgbcx.cpp b/src/rgbcx.cpp
index cb6a6e4..987d0f7 100644
--- a/src/rgbcx.cpp
+++ b/src/rgbcx.cpp
@@ -13,8 +13,8 @@
 #include <type_traits>
 
 #include "BC1/BC1Block.h"
+#include "BC1/tables.h"
 #include "Color.h"
-#include "tables.h"
 #include "util.h"
 
 namespace rgbcx {
@@ -1703,12 +1703,12 @@ static inline void encode_bc1_pick_initial(const Color *pSrc_pixels, uint32_t fl
             int r = (int)pSrc_pixels[i].r - avg_r;
             int g = (int)pSrc_pixels[i].g - avg_g;
             int b = (int)pSrc_pixels[i].b - avg_b;
-            icov[0] += r * r;
-            icov[1] += r * g;
-            icov[2] += r * b;
-            icov[3] += g * g;
-            icov[4] += g * b;
-            icov[5] += b * b;
+            icov[0] += r * r; //0, 0, 0
+            icov[1] += r * g; //1, 0, 1
+            icov[2] += r * b; //2, 0, 2
+            icov[3] += g * g; //3, 1, 1
+            icov[4] += g * b; //4, 1, 2
+            icov[5] += b * b; //5, 2, 2
         }
 
         int saxis_r = 306, saxis_g = 601, saxis_b = 117;
diff --git a/src/test/test.cpp b/src/test/test.cpp
index 8e17cce..712721f 100644
--- a/src/test/test.cpp
+++ b/src/test/test.cpp
@@ -104,7 +104,7 @@ struct color_quad_u8 {
         return m_c[i];
     }
 
-    inline int get_luma() const { return (13938U * m_c[0] + 46869U * m_c[1] + 4729U * m_c[2] + 32768U) >> 16U; }  // REC709 weightings
+    inline int GetLuma() const { return (13938U * m_c[0] + 46869U * m_c[1] + 4729U * m_c[2] + 32768U) >> 16U; }  // REC709 weightings
 };*/
 using color_quad_u8 = Color;
 typedef std::vector<color_quad_u8> color_quad_u8_vec;
@@ -267,7 +267,7 @@ class image_metrics {
 
                 if (!num_channels) {
                     //                    int luma_diff = ;
-                    unsigned index = iabs(ca.get_luma() - cb.get_luma());
+                    unsigned index = iabs(ca.GetLuma() - cb.GetLuma());
                     hist[index]++;
                 } else {
                     for (uint32_t c = 0; c < num_channels; c++) hist[iabs(ca[first_channel + c] - cb[first_channel + c])]++;
diff --git a/src/util.h b/src/util.h
index b6f7d1c..4da4af4 100644
--- a/src/util.h
+++ b/src/util.h
@@ -125,7 +125,7 @@ template <typename S> constexpr S minimum(S a, S b, S c, S d) { return minimum(m
 
 template <typename T> constexpr T square(T a) { return a * a; }
 
-constexpr float clampf(float value, float low, float high) {
+constexpr float clampf(float value, float low = 0.0f, float high = 1.0f) {
     if (value < low)
         value = low;
     else if (value > high)