From fd6b8449bf0cbb23787c873fcb541c6a03fc4982 Mon Sep 17 00:00:00 2001
From: castano <castano@95f4ed2b-212e-0410-8b90-d31948207fce>
Date: Sat, 29 May 2010 02:47:57 +0000
Subject: [PATCH] Add bc6 and bc7 compressors from nvidia.

---
 src/nvtt/bc6h/arvo/ArvoMath.cpp |  342 +++++++++
 src/nvtt/bc6h/arvo/ArvoMath.h   |  187 +++++
 src/nvtt/bc6h/arvo/Char.cpp     |  420 +++++++++++
 src/nvtt/bc6h/arvo/Char.h       |  245 +++++++
 src/nvtt/bc6h/arvo/Complex.cpp  |   76 ++
 src/nvtt/bc6h/arvo/Complex.h    |  187 +++++
 src/nvtt/bc6h/arvo/Matrix.cpp   | 1201 ++++++++++++++++++++++++++++++
 src/nvtt/bc6h/arvo/Matrix.h     |  142 ++++
 src/nvtt/bc6h/arvo/Perm.cpp     |  503 +++++++++++++
 src/nvtt/bc6h/arvo/Perm.h       |  111 +++
 src/nvtt/bc6h/arvo/Rand.cpp     |  230 ++++++
 src/nvtt/bc6h/arvo/Rand.h       |  114 +++
 src/nvtt/bc6h/arvo/SI_units.h   |  232 ++++++
 src/nvtt/bc6h/arvo/SVD.cpp      |  398 ++++++++++
 src/nvtt/bc6h/arvo/SVD.h        |   54 ++
 src/nvtt/bc6h/arvo/SphTri.cpp   |  292 ++++++++
 src/nvtt/bc6h/arvo/SphTri.h     |  124 ++++
 src/nvtt/bc6h/arvo/Token.cpp    |  913 +++++++++++++++++++++++
 src/nvtt/bc6h/arvo/Token.h      |  203 +++++
 src/nvtt/bc6h/arvo/Vec2.cpp     |   94 +++
 src/nvtt/bc6h/arvo/Vec2.h       |  358 +++++++++
 src/nvtt/bc6h/arvo/Vec3.cpp     |  119 +++
 src/nvtt/bc6h/arvo/Vec3.h       |  517 +++++++++++++
 src/nvtt/bc6h/arvo/Vector.cpp   |  366 +++++++++
 src/nvtt/bc6h/arvo/Vector.h     |  103 +++
 src/nvtt/bc6h/arvo/form.h       |   26 +
 src/nvtt/bc6h/bits.h            |   73 ++
 src/nvtt/bc6h/exr.cpp           |   51 ++
 src/nvtt/bc6h/exr.h             |   37 +
 src/nvtt/bc6h/shapes_two.h      |  133 ++++
 src/nvtt/bc6h/tile.h            |  115 +++
 src/nvtt/bc6h/utils.cpp         |  466 ++++++++++++
 src/nvtt/bc6h/utils.h           |   79 ++
 src/nvtt/bc6h/zoh.cpp           |  205 ++++++
 src/nvtt/bc6h/zoh.h             |   78 ++
 src/nvtt/bc6h/zoh.sln           |   21 +
 src/nvtt/bc6h/zoh.vcproj        |  281 +++++++
 src/nvtt/bc6h/zohc.cpp          |  301 ++++++++
 src/nvtt/bc6h/zohone.cpp        |  804 ++++++++++++++++++++
 src/nvtt/bc6h/zohtwo.cpp        |  892 ++++++++++++++++++++++
 src/nvtt/bc7/ImfArray.h         |  261 +++++++
 src/nvtt/bc7/arvo/ArvoMath.cpp  |  342 +++++++++
 src/nvtt/bc7/arvo/ArvoMath.h    |  212 ++++++
 src/nvtt/bc7/arvo/Char.cpp      |  420 +++++++++++
 src/nvtt/bc7/arvo/Char.h        |  245 +++++++
 src/nvtt/bc7/arvo/Complex.cpp   |   76 ++
 src/nvtt/bc7/arvo/Complex.h     |  187 +++++
 src/nvtt/bc7/arvo/Matrix.cpp    | 1201 ++++++++++++++++++++++++++++++
 src/nvtt/bc7/arvo/Matrix.h      |  142 ++++
 src/nvtt/bc7/arvo/Perm.cpp      |  503 +++++++++++++
 src/nvtt/bc7/arvo/Perm.h        |  111 +++
 src/nvtt/bc7/arvo/Rand.cpp      |  230 ++++++
 src/nvtt/bc7/arvo/Rand.h        |  114 +++
 src/nvtt/bc7/arvo/SI_units.h    |  232 ++++++
 src/nvtt/bc7/arvo/SVD.cpp       |  398 ++++++++++
 src/nvtt/bc7/arvo/SVD.h         |   54 ++
 src/nvtt/bc7/arvo/SphTri.cpp    |  292 ++++++++
 src/nvtt/bc7/arvo/SphTri.h      |  124 ++++
 src/nvtt/bc7/arvo/Token.cpp     |  913 +++++++++++++++++++++++
 src/nvtt/bc7/arvo/Token.h       |  203 +++++
 src/nvtt/bc7/arvo/Vec2.cpp      |   94 +++
 src/nvtt/bc7/arvo/Vec2.h        |  358 +++++++++
 src/nvtt/bc7/arvo/Vec3.cpp      |  119 +++
 src/nvtt/bc7/arvo/Vec3.h        |  517 +++++++++++++
 src/nvtt/bc7/arvo/Vec4.cpp      |   79 ++
 src/nvtt/bc7/arvo/Vec4.h        |  238 ++++++
 src/nvtt/bc7/arvo/Vector.cpp    |  366 +++++++++
 src/nvtt/bc7/arvo/Vector.h      |  103 +++
 src/nvtt/bc7/arvo/form.h        |   26 +
 src/nvtt/bc7/avpcl.cpp          |  263 +++++++
 src/nvtt/bc7/avpcl.h            |  107 +++
 src/nvtt/bc7/avpcl.sln          |   21 +
 src/nvtt/bc7/avpcl.vcproj       |  314 ++++++++
 src/nvtt/bc7/avpcl_mode0.cpp    | 1068 +++++++++++++++++++++++++++
 src/nvtt/bc7/avpcl_mode1.cpp    | 1049 ++++++++++++++++++++++++++
 src/nvtt/bc7/avpcl_mode2.cpp    | 1005 +++++++++++++++++++++++++
 src/nvtt/bc7/avpcl_mode3.cpp    | 1061 +++++++++++++++++++++++++++
 src/nvtt/bc7/avpcl_mode4.cpp    | 1220 ++++++++++++++++++++++++++++++
 src/nvtt/bc7/avpcl_mode5.cpp    | 1222 +++++++++++++++++++++++++++++++
 src/nvtt/bc7/avpcl_mode6.cpp    | 1059 ++++++++++++++++++++++++++
 src/nvtt/bc7/avpcl_mode7.cpp    | 1098 +++++++++++++++++++++++++++
 src/nvtt/bc7/avpclc.cpp         |  348 +++++++++
 src/nvtt/bc7/bits.h             |   73 ++
 src/nvtt/bc7/endpts.h           |   80 ++
 src/nvtt/bc7/rgba.h             |   27 +
 src/nvtt/bc7/shapes_three.h     |  132 ++++
 src/nvtt/bc7/shapes_two.h       |  133 ++++
 src/nvtt/bc7/targa.cpp          |  179 +++++
 src/nvtt/bc7/targa.h            |   30 +
 src/nvtt/bc7/tile.h             |   67 ++
 src/nvtt/bc7/utils.cpp          |  391 ++++++++++
 src/nvtt/bc7/utils.h            |   69 ++
 92 files changed, 30269 insertions(+)
 create mode 100755 src/nvtt/bc6h/arvo/ArvoMath.cpp
 create mode 100755 src/nvtt/bc6h/arvo/ArvoMath.h
 create mode 100755 src/nvtt/bc6h/arvo/Char.cpp
 create mode 100755 src/nvtt/bc6h/arvo/Char.h
 create mode 100755 src/nvtt/bc6h/arvo/Complex.cpp
 create mode 100755 src/nvtt/bc6h/arvo/Complex.h
 create mode 100755 src/nvtt/bc6h/arvo/Matrix.cpp
 create mode 100755 src/nvtt/bc6h/arvo/Matrix.h
 create mode 100755 src/nvtt/bc6h/arvo/Perm.cpp
 create mode 100755 src/nvtt/bc6h/arvo/Perm.h
 create mode 100755 src/nvtt/bc6h/arvo/Rand.cpp
 create mode 100755 src/nvtt/bc6h/arvo/Rand.h
 create mode 100755 src/nvtt/bc6h/arvo/SI_units.h
 create mode 100755 src/nvtt/bc6h/arvo/SVD.cpp
 create mode 100755 src/nvtt/bc6h/arvo/SVD.h
 create mode 100755 src/nvtt/bc6h/arvo/SphTri.cpp
 create mode 100755 src/nvtt/bc6h/arvo/SphTri.h
 create mode 100755 src/nvtt/bc6h/arvo/Token.cpp
 create mode 100755 src/nvtt/bc6h/arvo/Token.h
 create mode 100755 src/nvtt/bc6h/arvo/Vec2.cpp
 create mode 100755 src/nvtt/bc6h/arvo/Vec2.h
 create mode 100755 src/nvtt/bc6h/arvo/Vec3.cpp
 create mode 100755 src/nvtt/bc6h/arvo/Vec3.h
 create mode 100755 src/nvtt/bc6h/arvo/Vector.cpp
 create mode 100755 src/nvtt/bc6h/arvo/Vector.h
 create mode 100755 src/nvtt/bc6h/arvo/form.h
 create mode 100755 src/nvtt/bc6h/bits.h
 create mode 100755 src/nvtt/bc6h/exr.cpp
 create mode 100755 src/nvtt/bc6h/exr.h
 create mode 100755 src/nvtt/bc6h/shapes_two.h
 create mode 100755 src/nvtt/bc6h/tile.h
 create mode 100755 src/nvtt/bc6h/utils.cpp
 create mode 100755 src/nvtt/bc6h/utils.h
 create mode 100755 src/nvtt/bc6h/zoh.cpp
 create mode 100755 src/nvtt/bc6h/zoh.h
 create mode 100755 src/nvtt/bc6h/zoh.sln
 create mode 100755 src/nvtt/bc6h/zoh.vcproj
 create mode 100755 src/nvtt/bc6h/zohc.cpp
 create mode 100755 src/nvtt/bc6h/zohone.cpp
 create mode 100755 src/nvtt/bc6h/zohtwo.cpp
 create mode 100644 src/nvtt/bc7/ImfArray.h
 create mode 100644 src/nvtt/bc7/arvo/ArvoMath.cpp
 create mode 100644 src/nvtt/bc7/arvo/ArvoMath.h
 create mode 100644 src/nvtt/bc7/arvo/Char.cpp
 create mode 100644 src/nvtt/bc7/arvo/Char.h
 create mode 100644 src/nvtt/bc7/arvo/Complex.cpp
 create mode 100644 src/nvtt/bc7/arvo/Complex.h
 create mode 100644 src/nvtt/bc7/arvo/Matrix.cpp
 create mode 100644 src/nvtt/bc7/arvo/Matrix.h
 create mode 100644 src/nvtt/bc7/arvo/Perm.cpp
 create mode 100644 src/nvtt/bc7/arvo/Perm.h
 create mode 100644 src/nvtt/bc7/arvo/Rand.cpp
 create mode 100644 src/nvtt/bc7/arvo/Rand.h
 create mode 100644 src/nvtt/bc7/arvo/SI_units.h
 create mode 100644 src/nvtt/bc7/arvo/SVD.cpp
 create mode 100644 src/nvtt/bc7/arvo/SVD.h
 create mode 100644 src/nvtt/bc7/arvo/SphTri.cpp
 create mode 100644 src/nvtt/bc7/arvo/SphTri.h
 create mode 100644 src/nvtt/bc7/arvo/Token.cpp
 create mode 100644 src/nvtt/bc7/arvo/Token.h
 create mode 100644 src/nvtt/bc7/arvo/Vec2.cpp
 create mode 100644 src/nvtt/bc7/arvo/Vec2.h
 create mode 100644 src/nvtt/bc7/arvo/Vec3.cpp
 create mode 100644 src/nvtt/bc7/arvo/Vec3.h
 create mode 100644 src/nvtt/bc7/arvo/Vec4.cpp
 create mode 100644 src/nvtt/bc7/arvo/Vec4.h
 create mode 100644 src/nvtt/bc7/arvo/Vector.cpp
 create mode 100644 src/nvtt/bc7/arvo/Vector.h
 create mode 100644 src/nvtt/bc7/arvo/form.h
 create mode 100644 src/nvtt/bc7/avpcl.cpp
 create mode 100644 src/nvtt/bc7/avpcl.h
 create mode 100644 src/nvtt/bc7/avpcl.sln
 create mode 100644 src/nvtt/bc7/avpcl.vcproj
 create mode 100644 src/nvtt/bc7/avpcl_mode0.cpp
 create mode 100644 src/nvtt/bc7/avpcl_mode1.cpp
 create mode 100644 src/nvtt/bc7/avpcl_mode2.cpp
 create mode 100644 src/nvtt/bc7/avpcl_mode3.cpp
 create mode 100644 src/nvtt/bc7/avpcl_mode4.cpp
 create mode 100644 src/nvtt/bc7/avpcl_mode5.cpp
 create mode 100644 src/nvtt/bc7/avpcl_mode6.cpp
 create mode 100644 src/nvtt/bc7/avpcl_mode7.cpp
 create mode 100644 src/nvtt/bc7/avpclc.cpp
 create mode 100644 src/nvtt/bc7/bits.h
 create mode 100644 src/nvtt/bc7/endpts.h
 create mode 100644 src/nvtt/bc7/rgba.h
 create mode 100644 src/nvtt/bc7/shapes_three.h
 create mode 100644 src/nvtt/bc7/shapes_two.h
 create mode 100644 src/nvtt/bc7/targa.cpp
 create mode 100644 src/nvtt/bc7/targa.h
 create mode 100644 src/nvtt/bc7/tile.h
 create mode 100644 src/nvtt/bc7/utils.cpp
 create mode 100644 src/nvtt/bc7/utils.h

diff --git a/src/nvtt/bc6h/arvo/ArvoMath.cpp b/src/nvtt/bc6h/arvo/ArvoMath.cpp
new file mode 100755
index 0000000..95d1a7d
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/ArvoMath.cpp
@@ -0,0 +1,342 @@
+/***************************************************************************
+* Math.C                                                                   *
+*                                                                          *
+* Some basic math functions.                                               *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    06/21/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <math.h>
+#include <stdlib.h>
+#include <iostream>
+#include <assert.h>
+#include "ArvoMath.h"
+#include "form.h"
+
+namespace ArvoMath {
+	static const float  Epsilon = 1.0E-5;      // ArcQuad/ArcTan treat |value| <= Epsilon as zero.
+	static const double LogTwo  = log( 2.0 );  // Natural log of 2; used by LogDoubleFact.
+
+#define BinCoeffMax 500  // Largest n accepted by the caching BinomialCoeffs( int n ).
+
+	// Signed relative error (x - y) / max(|x|,|y|); returns 0 when x == y.
+	double RelErr( double x, double y )
+	{
+		const double diff = x - y;
+		if( diff == 0.0 ) return 0.0;  // x == y == 0 previously evaluated 0/0 = NaN.
+		return diff / ( fabs( x ) > fabs( y ) ? fabs( x ) : fabs( y ) );
+	}
+
+	/***************************************************************************
+	*  A R C   Q U A D                                                         *
+	*                                                                          *
+	* Returns the theta / ( 2*PI ) where the input variables x and y are       *
+	* such that  x == COS( theta ) and  y == SIN( theta ).                     *
+	*                                                                          *
+	***************************************************************************/
+	float ArcQuad( float x, float y )
+	{
+		if( !( Abs( x ) > Epsilon ) )  // x ~ 0 (or NaN): only y decides the result.
+		{
+			if( y >  Epsilon ) return 0.25;  // Straight up: a quarter turn.
+			if( y < -Epsilon ) return 0.75;  // Straight down: three quarters.
+			return 0.0;                      // y is ~ 0 as well (or NaN).
+		}
+		float turns = OverTwoPi * atan( Abs( y ) / Abs( x ) );  // First-quadrant fraction.
+		if( x < 0.0 ) turns = 0.5 - turns;  // Reflect across the y axis.
+		if( y < 0.0 ) turns = 1.0 - turns;  // Reflect across the x axis.
+		return turns;
+	}
+
+	/***************************************************************************
+	*  A R C   T A N                                                           *
+	*                                                                          *
+	* Returns the angle theta such that x = COS( theta ) & y = SIN( theta ).   *
+	*                                                                          *
+	***************************************************************************/
+	float ArcTan( float x, float y )
+	{
+		if( !( Abs( x ) > Epsilon ) )  // x ~ 0 (or NaN): only y decides the result.
+		{
+			if( y >  Epsilon ) return PiOverTwo;
+			if( y < -Epsilon ) return 3 * PiOverTwo;
+			return 0.0;                // y is ~ 0 as well (or NaN).
+		}
+		float angle = atan( Abs( y ) / Abs( x ) );  // First-quadrant angle.
+		if( x < 0.0 ) angle = Pi    - angle;        // Reflect across the y axis.
+		if( y < 0.0 ) angle = TwoPi - angle;        // Reflect across the x axis.
+		return angle;
+	}
+
+	/***************************************************************************
+	*  M A C H I N E   E P S I L O N                                           *
+	*                                                                          *
+	* Returns the machine epsilon.                                             *
+	*                                                                          *
+	***************************************************************************/
+	float MachineEpsilon()
+	{
+		float eps  = 1.0;  // Last step for which 1 + step still exceeded 1.
+		float step = 1.0;
+		float sum  = 1.0 + step;
+		while( sum > 1.0 )
+		{
+			eps   = step;
+			step /= 2.0;
+			sum   = (float)( 1.0 + (float)step );  // Avoid double precision!
+		}
+		return eps;
+	}
+
+	/***************************************************************************
+	*  L O G   G A M M A                                                       *
+	*                                                                          *
+	*  Computes the natural log of the gamma function using the Lanczos        *
+	*  approximation formula.  Gamma is defined by                             *
+	*                                                                          *
+	*                                 ( z - 1 )   -t                           *
+	*         gamma( z ) = Integral[ t           e    dt ]                     *
+	*                                                                          *
+	*                                                                          *
+	*  where the integral ranges from 0 to infinity.  The gamma function       *
+	*  satisfies                                                               *
+	*                    gamma( n + 1 ) = n!                                   *
+	*                                                                          *
+	*  This algorithm has been adapted from "Numerical Recipes", p. 157.       *
+	*                                                                          *
+	***************************************************************************/
+	double LogGamma( double x )
+	{
+		// Lanczos series coefficients; term i is divided by ( x + i ).
+		static const double lanczos[ 6 ] = {
+			 7.61800917300E+1,
+			-8.65053203300E+1,
+			 2.40140982200E+1,
+			-1.23173951600E+0,
+			 1.20858003000E-3,
+			-5.36382000000E-6
+		};
+		static const double stp = 2.50662827465E+0;
+
+		// Accumulate left to right so rounding matches the written-out sum.
+		double series = 0.0;
+		for( int i = 0; i < 6; i++ )
+		{
+			series += lanczos[ i ] / ( x + (double)i );
+		}
+
+		const double shifted = x + 4.5;
+		const double base    = ( x - 0.5 ) * log( shifted ) - shifted;
+		return base + log( stp * ( series + 1.0 ) );
+	}
+
+	/***************************************************************************
+	*  L O G   F A C T                                                         *
+	*                                                                          *
+	*  Returns the natural logarithm of n factorial.  For efficiency, some     *
+	*  of the values are cached, so they need be computed only once.           *
+	*                                                                          *
+	***************************************************************************/
+	double LogFact( int n )
+	{
+		static const int kCached = 100;
+		static double memo[ kCached ] = { 0.0 };  // 0.0 marks "not computed yet".
+		if( n <= 1 ) return 0.0;                   // Also the answer for any n below 2.
+		if( n >= kCached ) return LogGamma( (double)( n + 1 ) );  // gamma(n+1) == n!
+
+		// log(n!) > 0 for n >= 2, so a zero slot can only mean "empty".
+		double &slot = memo[ n ];
+		if( slot == 0.0 ) slot = LogGamma( (double)( n + 1 ) );
+		return slot;
+	}
+
+	/***************************************************************************
+	*  M U L T I N O M I A L    C O E F F                                      *
+	*                                                                          *
+	*  Returns the multinomial coefficient ( n; X1 X2 ... Xk ) which is        *
+	*  defined to be n! / ( X1! X2! ... Xk! ).  This is done by computing      *
+	*  exp( log(n!) - log(X1!) - log(X2!) - ... - log(Xk!) ).  The value of    *
+	*  n is obtained by summing the Xi's.                                      *
+	*                                                                          *
+	***************************************************************************/
+	// Returns n! / ( X[0]! X[1]! ... X[k-1]! ) with n = X[0] + ... + X[k-1].
+	// X must hold k entries; k <= 0 yields 1 (the empty product) without reading X.
+	double MultinomialCoeff( int k, int X[] )
+	{
+		// Find n by summing the coefficients.  Starting from zero (rather than
+		// from X[0]) keeps an empty argument list from touching X at all.
+		int n = 0;
+		for( int i = 0; i < k; i++ ) n += X[i];
+
+		// Compute log(n!) then subtract log(X!) for each X.
+		double LogCoeff = LogFact( n );
+		for( int i = 0; i < k; i++ ) LogCoeff -= LogFact( X[i] );
+
+		// Round the exponential of the result to the nearest integer; working in
+		// log space avoids overflowing the intermediate factorials.
+		return floor( exp( LogCoeff ) + 0.5 );
+	}
+
+
+	double MultinomialCoeff( int i, int j, int k )
+	{
+		// (i+j+k)! / ( i! j! k! ), evaluated in log space and rounded to nearest.
+		const double logCoeff = LogFact( i + j + k ) - LogFact( i ) - LogFact( j ) - LogFact( k );
+		return floor( exp( logCoeff ) + 0.5 );
+	}
+
+	/***************************************************************************
+	*  B I N O M I A L    C O E F F S                                          *
+	*                                                                          *
+	*  Generate all n+1 binomial coefficients for a given n.  This is done by  *
+	*  computing the n'th row of Pascal's triangle, starting from the top.     *
+	*  No additional storage is required.                                      *
+	*                                                                          *
+	***************************************************************************/
+	// Fills coeff[0..n] with row n of Pascal's triangle, i.e. C(n,0) ... C(n,n).
+	// coeff must have room for n + 1 entries; when n <= 0 only coeff[0] is set.
+	void BinomialCoeffs( int n, long *coeff )
+	{
+		coeff[0] = 1;
+		for( int i = 1; i <= n; i++ )
+		{
+			coeff[i] = 1;  // The last entry in any row is always 1.
+
+			// Turn row i-1 into row i in place.  Sweeping right to left keeps
+			// coeff[j-1] at its previous-row value until it has been consumed,
+			// and never reads an entry that has not been written yet (the old
+			// left-to-right sweep loaded coeff[1] and coeff[i] uninitialized).
+			for( int j = i - 1; j >= 1; j-- ) coeff[j] += coeff[j-1];
+		}
+	}
+
+	// Fills coeff[0..n] with row n of Pascal's triangle, i.e. C(n,0) ... C(n,n).
+	// coeff must have room for n + 1 entries; when n <= 0 only coeff[0] is set.
+	void BinomialCoeffs( int n, double *coeff )
+	{
+		coeff[0] = 1.0;
+		for( int i = 1; i <= n; i++ )
+		{
+			coeff[i] = 1.0;  // The last entry in any row is always 1.
+
+			// Turn row i-1 into row i in place.  Sweeping right to left keeps
+			// coeff[j-1] at its previous-row value until it has been consumed,
+			// and never reads an entry that has not been written yet (the old
+			// left-to-right sweep loaded coeff[1] and coeff[i] uninitialized).
+			for( int j = i - 1; j >= 1; j-- ) coeff[j] += coeff[j-1];
+		}
+	}
+
+	const double *BinomialCoeffs( int n )
+	{
+		// One lazily built row of Pascal's triangle per n; rows are never freed.
+		static double *rows[ BinCoeffMax + 1 ] = { 0 };
+		if( n < 0 || n > BinCoeffMax )
+		{
+			std::cerr << form( "%d is outside of (0,%d) in BinomialCoeffs", n, BinCoeffMax );
+			return NULL;
+		}
+		if( rows[n] != NULL ) return rows[n];  // Already cached.
+
+		double *fresh = new double[ n + 1 ];
+		if( fresh == NULL )  // Kept for parity; a plain new[] throws instead of returning NULL.
+		{
+			std::cerr << form( "Could not allocate for BinomialCoeffs(%d)", n );
+			return NULL;
+		}
+		BinomialCoeffs( n, fresh );
+		rows[n] = fresh;
+		return fresh;
+	}
+
+	/***************************************************************************
+	*  B I N O M I A L    C O E F F                                            *
+	*                                                                          *
+	*  Compute a given binomial coefficient.  Several rows of Pascal's         *
+	*  triangle are stored for efficiently computing the small coefficients.   *
+	*  Higher-order terms are computed using LogFact.                          *
+	*                                                                          *
+	***************************************************************************/
+	double BinomialCoeff( int n, int k )
+	{
+		// n, k : identify the coefficient C(n,k) to evaluate.
+		// Returns 1 when k or n-k is 0, n when k or n-k is 1, a table entry
+		// for n <= 9, and a rounded exp/LogFact value otherwise.  Any other
+		// argument pair with k <= 1 or n-k <= 1 (i.e. a negative index) prints
+		// a diagnostic to std::cerr and yields 0.
+		const int rest = n - k;  // C(n,k) == C(n,rest).
+
+		if( k <= 1 || rest <= 1 )  // Check for errors and special cases.
+		{
+			if( k == 0 || rest == 0 ) return 1;
+			if( k == 1 || rest == 1 ) return n;
+			std::cerr << form( "BinomialCoeff(%d,%d) is undefined", n, k );
+			return 0;
+		}
+
+		// From here on k >= 2 and rest >= 2, so n >= 4 and rows 0..3 of the
+		// table below are never consulted; unused cells are zero-filled.
+		static const int pascal[ 10 ][ 10 ] =  // Pascal's triangle, rows 0..9.
+		{
+			{ 1 },
+			{ 1, 1 },
+			{ 1, 2, 1 },
+			{ 1, 3, 3, 1 },
+			{ 1, 4, 6, 4, 1 },
+			{ 1, 5, 10, 10, 5, 1 },
+			{ 1, 6, 15, 20, 15, 6, 1 },
+			{ 1, 7, 21, 35, 35, 21, 7, 1 },
+			{ 1, 8, 28, 56, 70, 56, 28, 8, 1 },
+			{ 1, 9, 36, 84, 126, 126, 84, 36, 9, 1 }
+		};
+		if( n <= 9 )
+		{
+			return pascal[ n ][ k ];
+		}
+
+		// Higher-order terms: n! / ( k! (n-k)! ) through LogFact, rounded to
+		// the nearest integer.
+		const double logCoeff = LogFact( n ) - LogFact( rest ) - LogFact( k );
+		return floor( exp( logCoeff ) + 0.5 );
+	}
+
+
+	/***************************************************************************
+	*  L O G   D O U B L E   F A C T   (Log of double factorial)               *
+	*                                                                          *
+	*  Return log( n!! ) where the double factorial is defined by              *
+	*                                                                          *
+	*      (2 n + 1)!! = 1 * 3 * 5 * ... * (2n + 1)    (Odd integers)          *
+	*                                                                          *
+	*      (2 n)!!     = 2 * 4 * 6 * ... * 2n          (Even integers)         *
+	*                                                                          *
+	*  and is related to the single factorial via                              *
+	*                                                                          *
+	*      (2 n + 1)!! = (2 n + 1)! / ( 2^n n! )       (Odd integers)          *
+	*                                                                          *
+	*      (2 n)!!     = 2^n n!                        (Even integers)         *
+	*                                                                          *
+	***************************************************************************/
+	double LogDoubleFact( int n )   // log( n!! )
+	{
+		const int    half     = n / 2;
+		const double evenPart = LogFact( half ) + half * LogTwo;  // log( 2^half * half! )
+		// Even n: n!! == 2^half * half!.  Odd n: n!! == n! / ( 2^half * half! ).
+		return Odd(n) ? LogFact( n ) - evenPart : evenPart;
+	}
+};
diff --git a/src/nvtt/bc6h/arvo/ArvoMath.h b/src/nvtt/bc6h/arvo/ArvoMath.h
new file mode 100755
index 0000000..957cb25
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/ArvoMath.h
@@ -0,0 +1,187 @@
+/***************************************************************************
+* Math.h                                                                   *
+*                                                                          *
+* Convenient constants, macros, and inline functions for basic math        *
+* functions.                                                               *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    06/17/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __MATH_INCLUDED__
+#define __MATH_INCLUDED__
+
+#include <math.h>
+#include <stdlib.h>
+
+namespace ArvoMath {
+
+#ifndef MAXFLOAT
+#define MAXFLOAT 1.0E+20  // Fallback, used only when no MAXFLOAT macro is already defined.
+#endif
+
+	static const double
+		Pi            = 3.14159265358979,
+		PiSquared     = Pi * Pi,
+		TwoPi         = 2.0 * Pi,
+		FourPi        = 4.0 * Pi,
+		PiOverTwo     = Pi / 2.0,
+		PiOverFour    = Pi / 4.0,
+		OverPi        = 1.0 / Pi,        // 1 / Pi
+		OverTwoPi     = 1.0 / TwoPi,     // 1 / (2 Pi)
+		OverFourPi    = 1.0 / FourPi,    // 1 / (4 Pi)
+		Infinity      = MAXFLOAT,        // With the fallback above: 1.0E+20, a finite sentinel rather than an IEEE infinity.
+		Tiny          = 1.0 / MAXFLOAT,  // Reciprocal of the sentinel above.
+		DegreesToRad  = Pi / 180.0,      // Multiply degrees by this to get radians.
+		RadToDegrees  = 180.0 / Pi;      // Multiply radians by this to get degrees.
+
+	inline int    Odd   ( int    k           ) { return k & 1; }  // Low bit of k.
+	inline int    Even  ( int    k           ) { return !(k & 1); }  // 1 when the low bit of k is clear.
+	inline float  Abs   ( int    x           ) { return x > 0  ? x : -x; }  // Note: int result is converted to float.
+	inline float  Abs   ( float  x           ) { return x > 0. ? x : -x; }
+	inline float  Abs   ( double x           ) { return x > 0. ? x : -x; }  // Note: double result is narrowed to float.
+	inline float  Min   ( float  x, float  y ) { return x < y ? x : y; }
+	inline float  Max   ( float  x, float  y ) { return x > y ? x : y; }
+	inline double dMin  ( double x, double y ) { return x < y ? x : y; }  // Double-precision Min.
+	inline double dMax  ( double x, double y ) { return x > y ? x : y; }  // Double-precision Max.
+	inline float  Sqr   ( int    x           ) { return x * x; }  // Squared in int arithmetic, then converted.
+	inline float  Sqr   ( float  x           ) { return x * x; }
+	inline float  Sqr   ( double x           ) { return x * x; }  // Note: double result is narrowed to float.
+	inline float  Sqrt  ( double x           ) { return x > 0. ? sqrt(x) : 0.; }  // Anything not > 0 (NaN included) yields 0.
+	inline float  Cubed ( float  x           ) { return x * x * x; }
+	inline int    Sign  ( float  x           ) { return x > 0. ? 1 : (x < 0. ? -1 : 0); }  // +1, -1, or 0.
+	inline void   Swap  ( float &a, float &b ) { float c = a; a = b; b = c; }
+	inline void   Swap  ( int   &a, int   &b ) { int   c = a; a = b; b = c; }
+	inline double Sin   ( double x, int    n ) { return pow( sin(x), n ); }  // sin(x) raised to the n'th power.
+	inline double Cos   ( double x, int    n ) { return pow( cos(x), n ); }  // cos(x) raised to the n'th power.
+	inline float  ToSin ( double x           ) { return Sqrt( 1.0 - Sqr(x) ); }  // sqrt(1 - x^2); Sqr(double) squares at float precision.
+	inline float  ToCos ( double x           ) { return Sqrt( 1.0 - Sqr(x) ); }  // Same formula as ToSin.
+	inline float  MaxAbs( float  x, float  y ) { return Max( Abs(x), Abs(y) ); }
+	inline float  MinAbs( float  x, float  y ) { return Min( Abs(x), Abs(y) ); }
+	inline float  Pythag( double x, double y ) { return Sqrt( x*x + y*y ); }  // Hypotenuse, narrowed to float by Sqrt.
+
+	// acos() with out-of-range input clamped: x > 1 gives 0, x < -1 gives Pi.
+	// NaN is forwarded to acos() rather than returning an uninitialized local.
+	inline double ArcCos( double x )
+	{
+		if( x >  1.0 ) return 0.0;
+		if( x < -1.0 ) return Pi;
+		return acos( x );
+	}
+
+	inline double ArcSin( double x )
+	{
+		// Clamp into asin()'s domain [-1,1]; NaN is passed through unchanged.
+		const double clamped = x < -1.0 ? -1.0 : ( x > 1.0 ? 1.0 : x );
+		return asin( clamped );
+	}
+
+	inline float Clamp( float min, float &x, float max )
+	{
+		// x is a reference: it is clamped in place and the new value is returned.
+		x = ( x < min ) ? min : ( ( x > max ) ? max : x );
+		return x;
+	}
+
+	inline double Clamp( float min, double &x, float max )
+	{
+		// x is a reference: it is clamped in place and the new value is returned.
+		x = ( x < min ) ? min : ( ( x > max ) ? max : x );
+		return x;
+	}
+
+	// Largest of x, y and z.  All tests use >=, so a NaN operand makes its
+	// comparisons false and the later branches decide the result.
+	inline float Max( float x, float y, float z )
+	{
+		if( x >= y && x >= z ) return x;
+		if( y >= z ) return y;
+		return z;
+	}
+
+	// Smallest of x, y and z.  All tests use <=, so a NaN operand makes its
+	// comparisons false and the later branches decide the result.
+	inline float Min( float x, float y, float z )
+	{
+		if( x <= y && x <= z ) return x;
+		if( y <= z ) return y;
+		return z;
+	}
+
+	// Largest of x, y and z in double precision.  All tests use >=, so a NaN
+	// operand makes its comparisons false and the later branches decide.
+	inline double dMax( double x, double y, double z )
+	{
+		if( x >= y && x >= z ) return x;
+		if( y >= z ) return y;
+		return z;
+	}
+
+	// Smallest of x, y and z in double precision.  All tests use <=, so a NaN
+	// operand makes its comparisons false and the later branches decide.
+	inline double dMin( double x, double y, double z )
+	{
+		if( x <= y && x <= z ) return x;
+		if( y <= z ) return y;
+		return z;
+	}
+
+	// Largest magnitude among x, y and z: each value goes through Abs and the
+	// three results are handed to the three-argument Max.
+	inline float MaxAbs( float x, float y, float z )
+	{ return Max( Abs( x ), Abs( y ), Abs( z ) ); }
+
+	// Length of the 3-vector (x, y, z).  The squares are summed without any
+	// rescaling, so extreme component magnitudes can overflow or underflow.
+	inline float Pythag( float x, float y, float z )
+	{ return sqrt( x * x  +  y * y  +  z * z ); }
+
+	extern float  ArcTan          ( float x, float y      );   // Angle of the point (x, y), in radians.
+	extern float  ArcQuad         ( float x, float y      );   // Same angle as a fraction of a full turn.
+	extern float  MachineEpsilon  (                       );
+	extern double LogGamma        ( double x              );   // log( gamma( x ) )
+	extern double LogFact         ( int n                 );   // log( n! )
+	extern double LogDoubleFact   ( int n                 );   // log( n!! )
+	extern double BinomialCoeff   ( int n, int k          );
+	extern void   BinomialCoeffs  ( int n, long   *coeffs );   // Writes n + 1 entries into coeffs.
+	extern void   BinomialCoeffs  ( int n, double *coeffs );   // Writes n + 1 entries into coeffs.
+	extern double MultinomialCoeff( int i, int j, int k   );   // (i+j+k)! / ( i! j! k! )
+	extern double MultinomialCoeff( int k, int N[]        );   // k is the number of entries in N.
+	extern double RelErr          ( double x, double y    );   // (x - y) / max(|x|,|y|)
+
+#ifndef ABS
+#define ABS( x ) ((x) > 0 ? (x) : -(x))  // Macro: x is evaluated twice.
+#endif
+
+#ifndef MAX
+#define MAX( x, y ) ((x) > (y) ? (x) : (y))  // Macro: the selected argument is evaluated twice.
+#endif
+
+#ifndef MIN
+#define MIN( x, y ) ((x) < (y) ? (x) : (y))  // Macro: the selected argument is evaluated twice.
+#endif
+
+};
+
+#endif
+
+
+
+
+
+
+
diff --git a/src/nvtt/bc6h/arvo/Char.cpp b/src/nvtt/bc6h/arvo/Char.cpp
new file mode 100755
index 0000000..cc450a5
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Char.cpp
@@ -0,0 +1,420 @@
+/***************************************************************************
+* Char.cpp                                                                 *
+*                                                                          *
+* Convenient constants, macros, and inline functions for manipulation of   *
+* characters and strings.                                                  *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    07/01/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "Char.h"
+
+namespace ArvoMath {
+
+	typedef char *charPtr;  // element type of the pointer array allocated in SortedList::Expand
+
+	// Copy the directory portion of the file name "str" into "buff":
+	// everything up to and including the last "/".  When "str" has
+	// no "/" at all, "buff" receives the empty string.
+	const char *getPath( const char *str, char *buff )
+	{
+		// Scan backwards for the last slash; "last" ends at -1 if none.
+		int last = strlen( str ) - 1;
+		while( last >= 0 && str[last] != Slash ) last--;
+		// Copy the first last+1 characters, then terminate.
+		int n = 0;
+		for( ; n <= last; n++ ) buff[n] = str[n];
+		buff[n] = NullChar;
+		return buff;
+	}
+
+	// Copy the leaf-name portion of the file name "str" into "buff":
+	// everything after the last "/" (all of "str" if it has none),
+	// including the terminating null character.
+	const char *getFile( const char *str, char *buff )
+	{
+		// Index of the last slash, or -1 when there is none.
+		int last = strlen( str ) - 1;
+		while( last >= 0 && str[last] != Slash ) last--;
+		// Copy from just past the slash through the terminator.
+		const char *src = str + last + 1;
+		char *dst = buff;
+		while( ( *dst++ = *src++ ) != NullChar ) {}
+		return buff;
+	}
+
+	// Copy the characters of "str" that precede its first period (or
+	// all of it if there is none) into "buff"; return how many were copied.
+	int getPrefix( const char *str, char *buff ) {
+		int n = 0;
+		for( ; str[n] != NullChar && str[n] != Period; n++ )
+		{
+			buff[n] = str[n];
+		}
+		buff[n] = NullChar;  // terminate right after the copied prefix
+		return n;
+	}
+
+	// Copy what follows the last period of "str" into "buff" and return
+	// its length.  Without any period the whole string is copied.
+	int getSuffix( const char *str, char *buff ) {
+		const int len = strlen( str );
+		int start = len;  // ends one past the last period, or at 0 if there is none
+		while( start > 0 && str[start - 1] != Period ) start--;
+		for( int i = start; i <= len; i++ ) buff[i - start] = str[i];  // copies the null too
+		return len - start;
+	}
+
+	// Decimal text of "number", in "buff" or (if "buff" is NULL) in a reused static buffer.
+	const char* toString( int number, char *buff ) {
+		static char local_buff[32];
+		char *out = ( buff != NULL ) ? buff : local_buff;
+		sprintf( out, "%d", number );
+		return out;
+	}
+
+	// "%g" text of "number", in "buff" or (if "buff" is NULL) in a reused static buffer.
+	const char* toString( float number, char *buff ) {
+		static char local_buff[32];
+		char *out = ( buff != NULL ) ? buff : local_buff;
+		sprintf( out, "%g", number );
+		return out;
+	}
+
+	// Return 1 if "str" consists solely of digits, plus or minus signs,
+	// and spaces -- in any order and number -- and 0 otherwise.  This is
+	// a character filter rather than a syntax check; "" also passes.
+	int isInteger( const char *str ) {
+		const int len = strlen( str );
+		for( int i = 0; i < len; i++ )
+		{
+			const char c = str[i];
+			const int allowed = isDigit(c) || c == Plus || c == Minus || c == Space;
+			if( !allowed ) return 0;
+		}
+		return 1;
+	}
+
+	// Test whether "string" ends with "suffix" and is strictly longer than
+	// it.  Returns 1 for a NULL or empty suffix, 0 for no match (or a NULL
+	// string), and otherwise the index at which the suffix begins.
+	int hasSuffix( const char *string, const char *suffix )
+	{
+		if( suffix == NULL ) return 1;  // A null suffix matches anything.
+		if( string == NULL ) return 0;  // A null string has no real suffix.
+		const int str_len = strlen( string );
+		const int suf_len = strlen( suffix );
+		if( suf_len <= 0 ) return 1;        // So does an empty suffix.
+		if( str_len <= suf_len ) return 0;  // Must leave a non-empty stem.
+
+		// Compare the tail of the string against the suffix.
+		const int start = str_len - suf_len;
+		return strncmp( string + start, suffix, suf_len ) == 0 ? start : 0;
+	}
+
+	// Test whether "string" begins with "prefix"; returns 1 or 0.
+	int hasPrefix( const char *string, const char *prefix )
+	{
+		if( prefix == NULL ) return 1; // The null prefix always matches.
+		if( string == NULL ) return 0; // The null string can only have a null prefix.
+		// A premature end of "string" shows up as a mismatch against the
+		// pending prefix character, so no separate length check is needed.
+		for( int i = 0; prefix[i] != '\0'; i++ )
+			if( string[i] != prefix[i] ) return 0;
+		return 1;
+	}
+
+	// Return 1 if the character "c" occurs in "str" (before its
+	// terminator), else 0.  A NULL "str" contains nothing.
+	int inString( char c, const char *str ) {
+		if( str == NULL ) return 0;
+		for( ; *str != NullChar; str++ )
+			if( *str == c ) return 1;
+		return 0;
+	}
+
+	// 1 for a NULL pointer or an empty string, 0 otherwise.
+	int nullString( const char *str ) {
+		return !( str != NULL && str[0] != NullChar );
+	}
+
+	// Copy "string" minus its trailing "suffix" into "buff" (a 256-byte static buffer
+	// if "buff" is NULL).  Returns NULL if hasSuffix() fails or that buffer is too small.
+	const char *stripSuffix( const char *string, const char *suffix, char *buff ) {
+		static char local_buff[256];
+		const int limit = ( buff == NULL ) ? (int)sizeof( local_buff ) : -1;  // -1: capacity unknown
+		if( buff == NULL ) buff = local_buff;
+		buff[0] = NullChar;
+		if( string == NULL || !hasSuffix( string, suffix ) ) return NULL;
+		const int s = strlen( string ) - ( suffix ? strlen( suffix ) : 0 );  // a NULL suffix strips nothing
+		if( limit >= 0 && s >= limit ) return NULL;  // would overrun local_buff
+		for( int i = 0; i < s; i++ ) buff[i] = string[i];
+		buff[s] = NullChar;
+		return buff;
+	}
+
+	// Return the index in "str" at which the first occurrence of "pat"
+	// begins, or -1 if there is none.  An empty pattern or an empty
+	// string never matches.
+	int getIndex( const char *pat, const char *str )
+	{
+		const int p_len = strlen( pat );
+		const int s_len = strlen( str );
+		if( p_len == 0 || s_len == 0 ) return -1;
+
+		// Only positions that leave room for the whole pattern are tried;
+		// "last" is negative (no iterations) when the pattern is longer.
+		const int last = s_len - p_len;
+		for( int i = 0; i <= last; i++ )
+			if( strncmp( str + i, pat, p_len ) == 0 ) return i;
+		return -1;
+	}
+
+	// Copy the part of "str" that follows the first occurrence of "pat"
+	// into "buff", terminator included.  Returns the number of characters
+	// written (counting the terminator), or -1 if "pat" is not found.
+	int getSubstringAfter( const char *pat, const char *str, char *buff ) {
+		const int ind = getIndex( pat, str );
+		if( ind < 0 ) return -1;
+		const char *src = str + ind + strlen( pat );
+		int k = 0;
+		do {
+			buff[k] = src[k];
+		} while( src[k++] != NullChar );
+		return k;
+	}
+
+	// Text after the first "pat" in "str", or "str" itself when "pat" does not occur.
+	const char *SubstringAfter( const char *pat, const char *str, char *user_buff ) {
+		static char temp[128];  // holds the copy when the caller supplies no buffer
+		const int ind = getIndex( pat, str );
+		const char *rest = str + ( ind < 0 ? 0 : ind + strlen( pat ) );
+		if( ind < 0 || ( user_buff == NULL && strlen( rest ) >= sizeof( temp ) ) ) return rest;  // too long for temp: alias "str"
+		return strcpy( user_buff != NULL ? user_buff : temp, rest );
+	}
+
+	// Wrap "str" in double quotes.  The internal buffer truncates; a user buffer does not.
+	const char *metaString( const char *str, char *user_buff ) {
+		static char temp[128];  // room for 125 characters, two quotes and the null
+		char *buff = ( user_buff != NULL ) ? user_buff : temp;
+		sprintf( buff, ( buff == temp ) ? "\"%.125s\"" : "\"%s\"", str );
+		return buff;
+	}
+
+	// This is the opposite of metaString: copy "str" with every double
+	// quote removed.  With no "user_buff" the copy goes to a 128-byte
+	// static buffer and is truncated to fit it.
+	const char *stripQuotes( const char *str, char *user_buff ) {
+		static char temp[128];
+		char *buff = ( user_buff != NULL ) ? user_buff : temp;
+		// Only the internal buffer has a known capacity to enforce.
+		const char *end = ( buff == temp ) ? temp + sizeof( temp ) - 1 : NULL;
+		char *b = buff;
+		for( ; *str != NullChar && b != end; str++ )
+			if( *str != DoubleQuote ) *b++ = *str;
+		*b = NullChar;
+		return buff;
+	}
+
+	// Look through "flags" for the text "flag" immediately followed by
+	// '=' and store the integer after it (as parsed by atoi) in "value".
+	// Returns 1 on success; returns 0, leaving "value" untouched, if no
+	// such assignment exists.  A match may begin anywhere in "flags".
+	int getIntFlag( const char *flags, const char *flag, int &value )
+	{
+		for( const char *p = flags; *p; p++ )
+		{
+			if( !hasPrefix( p, flag ) ) continue;
+			// "flag" matches here; it still needs its '=' sign.
+			const int k = strlen( flag );
+			if( p[k] != '=' ) continue;
+			value = atoi( p + k + 1 );
+			return 1;
+		}
+		return 0;
+	}
+
+	// Look through "flags" for the text "flag" immediately followed by
+	// '=' and store the number after it (as parsed by atof) in "value".
+	// Returns 1 on success; returns 0, leaving "value" untouched, if no
+	// such assignment exists.  A match may begin anywhere in "flags".
+	int getFloatFlag( const char *flags, const char *flag, float &value )
+	{
+		for( const char *p = flags; *p; p++ )
+		{
+			if( !hasPrefix( p, flag ) ) continue;
+			// "flag" matches here; it still needs its '=' sign.
+			const int k = strlen( flag );
+			if( p[k] != '=' ) continue;
+			value = atof( p + k + 1 );
+			return 1;
+		}
+		return 0;
+	}
+
+	// Start out empty, with no storage allocated; an empty list counts as sorted.
+	SortedList::SortedList( sort_type type_, int ascend_ )
+		: num_elements( 0 ),
+		  max_elements( 0 ),
+		  sorted( 1 ),
+		  ascend( ascend_ ),
+		  type( type_ ),
+		  list( NULL )
+	{}
+
+	// Release the stored strings (via Clear) and then the pointer array.
+	SortedList::~SortedList() {
+		Clear();
+		delete[] list;
+	}
+
+	// Release every stored string and mark the list empty (and therefore
+	// sorted).  The pointer array itself is kept for reuse.
+	void SortedList::Clear() {
+		for( int i = 0; i < num_elements; i++ ) {
+			// The strings come from strdup() in operator<<, so they must be
+			// released with free(); "delete" on malloc'ed memory is undefined.
+			free( list[i] );
+			list[i] = NULL;
+		}
+		num_elements = 0;
+		sorted       = 1;
+	}
+
+	// Append a private copy of "str" to the list, growing the array first
+	// if it is full.  The list is flagged unsorted so that the next element
+	// access re-sorts it.
+	SortedList &SortedList::operator<<( const char *str ) {
+		if( num_elements == max_elements ) Expand();
+		list[ num_elements ] = strdup( str );
+		num_elements++;
+		sorted = 0;
+		return *this;
+	}
+
+	// Return the i'th element in sorted order, sorting first if necessary.
+	// An out-of-range index (or an empty list) yields the empty string.
+	const char *SortedList::operator()( int i ) {
+		// String literals are const, so return one directly rather than via a "char *".
+		if( i < 0 || i >= num_elements ) return "";
+		if( !sorted ) Sort();
+		return list[i];
+	}
+
+	// Grow the pointer array: a capacity of 0 becomes 2, otherwise it doubles.
+	// Existing entries carry over, new slots are NULL, "num_elements" is unchanged.
+	void SortedList::Expand() {
+		if( max_elements == 0 ) max_elements = 1;
+		max_elements *= 2;
+		charPtr *grown = new charPtr[ max_elements ];
+		int i = 0;
+		for( ; i < num_elements; i++ ) grown[i] = list[i];
+		for( ; i < max_elements; i++ ) grown[i] = NULL;
+		delete[] list;
+		list = grown;
+	}
+
+	// Exchange the string pointers stored at positions i and j.
+	void SortedList::Swap( int i, int j ) {
+		char *held = list[j];
+		list[j] = list[i];
+		list[i] = held;
+	}
+
+	// Nonzero if element p may precede element q.  Alphabetic order is plain
+	// strcmp; otherwise shorter strings come first and strcmp breaks length
+	// ties.  For a descending list the ascending answer is negated.
+	int SortedList::inOrder( int p, int q ) const {
+		const char *a = list[p];
+		const char *b = list[q];
+		int test = ( strcmp( a, b ) <= 0 );
+		if( type != sort_alphabetic )
+		{
+			const int len_a = strlen( a );
+			const int len_b = strlen( b );
+			if( len_a != len_b ) test = ( len_a < len_b );
+		}
+		return ascend ? test : !test;
+	}
+
+	// Insertion sort over the subsequence start, start+step, start+2*step, ...
+	// whose offsets from "start" stay below "size".
+	void SortedList::InsertionSort( int start, int size, int step ) {
+		for( int i = 0; i + step < size; i += step )
+		{
+			// Sink the element at offset i+step toward the front until
+			// its predecessor in the subsequence is in order with it.
+			int j = i;
+			while( j >= 0 && !inOrder( start + j, start + j + step ) )
+			{
+				Swap( start + j, start + j + step );
+				j -= step;
+			}
+		}
+	}
+
+	// Shell sort: insertion-sort the interleaved subsequences for gaps
+	// n/2, n/4, ... (while above 1), then finish with a plain pass at gap 1.
+	void SortedList::Sort() {
+		for( int gap = num_elements / 2; gap > 1; gap /= 2 )
+			for( int first = 0; first < gap; first++ )
+				InsertionSort( first, num_elements - first, gap );
+		InsertionSort( 0, num_elements, 1 );
+		sorted = 1;
+	}
+
+	// Change the sort criterion and/or direction.  The list is marked
+	// unsorted only if something actually changed, so that the next
+	// element access re-sorts it under the new ordering.
+	void SortedList::SetOrder( sort_type type_, int ascend_ ) {
+		if( type_ == type && ascend_ == ascend ) return;
+		type   = type_;
+		ascend = ascend_;
+		sorted = 0;
+	}
+
+	// Read one character from "in" (formatted input, so white space is skipped)
+	// for each character of "str".  Returns 1 if all match or "str" is NULL,
+	// and 0 at the first mismatch or when the stream cannot supply a character.
+	int getstring( std::istream &in, const char *str ) {
+		if( str == NULL ) return 1;
+		char ch;
+		// Test the read itself: after a failed read "ch" does not hold
+		// input, so comparing it would use a meaningless value.
+		for( ; *str != NullChar; str++ )
+			if( !( in >> ch ) || ch != *str ) return 0;
+		return 1;
+	}
+
+	// Consume leading white space (as defined by isWhite) from "in".  The
+	// first non-white character is put back so the caller reads it next.
+	std::istream &skipWhite( std::istream &in ) {
+		char c;
+		for(;;)
+		{
+			if( !in.get(c) ) break;          // End of input or stream error.
+			if( isWhite( c ) ) continue;     // Still in the white space.
+			in.putback(c);
+			break;
+		}
+		return in;
+	}
+};
diff --git a/src/nvtt/bc6h/arvo/Char.h b/src/nvtt/bc6h/arvo/Char.h
new file mode 100755
index 0000000..2742c1d
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Char.h
@@ -0,0 +1,245 @@
+/***************************************************************************
+* Char.h                                                                   *
+*                                                                          *
+* Convenient constants, macros, and inline functions for manipulation of   *
+* characters and strings.                                                  *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    07/01/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __CHAR_INCLUDED__
+#define __CHAR_INCLUDED__
+
+#include <string.h>
+#include <string>
+#include <iostream>
+namespace ArvoMath {
+
+	static const char  // Named character constants; "static" gives each including file its own copy.
+		Apostrophe  = '\'' ,
+		Asterisk    = '*'  ,  // Same character as Star.
+		Atsign      = '@'  ,
+		Backslash   = '\\' ,
+		Bell        = '\7' ,  // Character code 7.
+		Colon       = ':'  ,
+		Comma       = ','  ,
+		Dash        = '-'  ,  // Same character as Minus.
+		DoubleQuote = '"'  ,
+		EqualSign   = '='  ,
+		Exclamation = '!'  ,
+		GreaterThan = '>'  ,
+		Hash        = '#'  ,  // Same character as Pound.
+		Lbrack      = '['  ,
+		Lcurley     = '{'  ,
+		LessThan    = '<'  ,
+		Lparen      = '('  ,
+		Minus       = '-'  ,
+		NewLine     = '\n' ,
+		NullChar    = '\0' ,  // String terminator.
+		Percent     = '%'  ,
+		Period      = '.'  ,
+		Pound       = '#'  ,
+		Plus        = '+'  ,
+		Rbrack      = ']'  ,
+		Rcurley     = '}'  ,
+		Rparen      = ')'  ,
+		Semicolon   = ';'  ,
+		Space       = ' '  ,
+		Slash       = '/'  ,  // The only path separator getPath/getFile look for.
+		Star        = '*'  ,
+		Tab         = '\t' ,
+		Tilde       = '~'  ,
+		Underscore  = '_'  ;
+
+	inline int  isWhite( char c ) { return c == Space || c == Tab || c == NewLine; }  // no other characters count
+	inline int  isUcase( char c ) { return c >= 'A' && c <= 'Z'; }  // 'A'..'Z' only
+	inline int  isLcase( char c ) { return c >= 'a' && c <= 'z'; }  // 'a'..'z' only
+	inline int  isAlpha( char c ) { return isLcase( c ) || isUcase( c ); }
+	inline int  isDigit( char c ) { return c >= '0' && c <= '9'; }  // '0'..'9' only
+	inline char ToLower( char c ) { return isUcase( c ) ? c - 'A' + 'a' : c; }  // others pass through
+	inline char ToUpper( char c ) { return isLcase( c ) ? c - 'a' + 'A' : c; }  // others pass through
+
+	extern const char *getPath(  // Directory part of a file name, final "/" included.
+		const char *str, 
+		char *buff           // Receives the result; also the return value.
+		);
+
+	extern const char *getFile(  // Leaf name: what follows the last "/".
+		const char *str, 
+		char *buff           // Receives the result; also the return value.
+		);
+
+	extern int getPrefix(        // Text before the first period; returns its length.
+		const char *str, 
+		char *buff 
+		);
+
+	extern int getSuffix(        // Text after the last period; returns its length.
+		const char *str, 
+		char *buff 
+		);
+
+	extern int isInteger(        // 1 if only digits, signs and spaces occur, else 0.
+		const char *str
+		);
+
+	extern int hasSuffix(        // Nonzero on a match (see Char.cpp for the exact value), else 0.
+		const char *string, 
+		const char *suffix 
+		);
+
+	extern int hasPrefix(        // 1 if "string" begins with "prefix", else 0.
+		const char *string, 
+		const char *prefix 
+		);
+
+	extern int inString(         // 1 if "c" occurs in "str", else 0.
+		char c, 
+		const char *str 
+		);
+
+	extern int nullString(       // 1 if "str" is NULL or empty, else 0.
+		const char *str 
+		);
+
+	extern const char *stripSuffix(  // Return NULL if unsuccessful.
+		const char *string,  // The string to truncate.
+		const char *suffix,  // The suffix to remove.
+		char  *buff = NULL   // Defaults to internal buffer.
+		);
+
+	extern const char* toString( 
+		int  n,            // An integer to convert to a string.
+		char *buff = NULL  // Defaults to internal buffer.
+		);
+
+	extern const char* toString( 
+		float x,           // A float to convert to a string.
+		char *buff = NULL  // Defaults to internal buffer.
+		);
+
+	extern int getIndex( // The index of the start of a pattern in a string.
+		const char *pat, // The pattern to look for.
+		const char *str  // The string to search.
+		);
+
+	extern int getSubstringAfter(  // Copies what follows "pat"; returns the count copied, or -1.
+		const char *pat, 
+		const char *str, 
+		char *buff 
+		);
+
+	extern const char *SubstringAfter(  // Returns "str" itself if "pat" is not found.
+		const char *pat, 
+		const char *str,
+		char *buff = NULL  // Defaults to internal buffer.
+		);
+
+	extern const char *metaString(
+		const char *str,   // Make this a string within a string.
+		char *buff = NULL  // Defaults to internal buffer.
+		);
+
+	extern const char *stripQuotes(
+		const char *str,   // This is the opposite of metaString.
+		char *buff = NULL  // Defaults to internal buffer.
+		);
+
+	extern int getIntFlag( 
+		const char *flags, // List of assignment statements.
+		const char *flag,  // A specific flag to look for.
+		int &value         // The variable to assign the value to.
+		);
+
+	extern int getFloatFlag( 
+		const char *flags, // List of assignment statements.
+		const char *flag,  // A specific flag to look for.
+		float &value       // The variable to assign the value to.
+		);
+
+	extern int getstring(  // 1 if the characters read from "in" match "str", else 0.
+		std::istream &in, 
+		const char *str 
+		);
+
+	enum sort_type {        // Orderings that SortedList can apply.
+		sort_alphabetic,    // Standard dictionary ordering.
+		sort_lexicographic  // Sort first by length, then alphabetically.
+	};
+
+	class SortedList {
+		// Growable list of privately copied strings that is sorted lazily.
+	public:
+		SortedList( sort_type = sort_alphabetic, int ascending = 1 );
+		~SortedList();  // NOTE(review): no copy constructor/assignment is declared; a copy would share "list".
+		SortedList &operator<<( const char * );  // Append a copy of the string.
+		int Size() const { return num_elements; }
+		const char *operator()( int i );  // i'th element in sorted order ("" if out of range).
+		void Clear();
+		void SetOrder( sort_type = sort_alphabetic, int ascending = 1 );
+
+	private:
+		void Sort();
+		void InsertionSort( int start, int size, int step );
+		void Swap( int i, int j );
+		void Expand();
+		int  inOrder( int i, int j ) const;
+		int  num_elements;  // Strings currently stored.
+		int  max_elements;  // Capacity of "list".
+		int  sorted;        // Nonzero while the stored order is up to date.
+		int  ascend;        // Nonzero for ascending order.
+		sort_type type;
+		char **list;
+	};
+
+
+	// 1 if both strings are non-NULL and compare equal, else 0.
+	inline int Match( const char *s, const char *t ) {
+		if( s == NULL || t == NULL ) return 0;
+		return strcmp( s, t ) == 0;
+	}
+
+	// 1 if "s" is non-NULL and equals either (non-NULL) candidate, else 0.
+	inline int Match( const char *s, const char *t1, const char *t2 ) {
+		if( s == NULL ) return 0;
+		if( t1 != NULL && strcmp( s, t1 ) == 0 ) return 1;
+		return t2 != NULL && strcmp( s, t2 ) == 0;
+	}
+
+	// Overlay for viewing a float's bits as an integer; "long" may be wider than "float".
+	union long_union_float {
+		long  i;
+		float f;
+	};
+
+	inline long float_as_long( float x ) {
+		long_union_float u;
+		u.i = 0;  // Define the bytes "f" does not cover where long is the wider type.
+		u.f = x;
+		return u.i;
+	}
+
+	inline float long_as_float( long i ) {
+		long_union_float u;
+		u.i = i;
+		return u.f;
+	}
+
+	extern std::istream &skipWhite( std::istream &in );  // Skips characters for which isWhite() holds.
+};
+#endif
diff --git a/src/nvtt/bc6h/arvo/Complex.cpp b/src/nvtt/bc6h/arvo/Complex.cpp
new file mode 100755
index 0000000..468704f
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Complex.cpp
@@ -0,0 +1,76 @@
+/***************************************************************************
+* Complex.cpp                                                              *
+*                                                                          *
+* Complex numbers, complex arithmetic, and functions of a complex          *
+* variable.                                                                *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    03/02/2000  Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include "Complex.h"
+#include "form.h"
+
+namespace ArvoMath {
+	const Complex Complex::i( 0.0, 1.0 );  // The imaginary unit: real part 0, imaginary part 1.
+
+	// Write z as "(real,imag) " using the "%f" layout passed to form() (declared in form.h).
+	std::ostream &operator<<( std::ostream &out, const Complex &z ) {
+		out << form( "(%f,%f) ", z.Real(), z.Imag() );
+		return out;
+	}
+
+	// Cosine of a complex argument:
+	//   cos(a + bi) = cos(a) cosh(b) - i sin(a) sinh(b)
+	Complex cos( const Complex &z ) {
+		const float a = z.Real();
+		const float b = z.Imag();
+		return Complex( ::cos( a ) * ::cosh( b ), -::sin( a ) * ::sinh( b ) );
+	}
+
+	// Sine of a complex argument:
+	//   sin(a + bi) = sin(a) cosh(b) + i cos(a) sinh(b)
+	Complex sin( const Complex &z ) {
+		const float a = z.Real();
+		const float b = z.Imag();
+		return Complex( ::sin( a ) * ::cosh( b ), ::cos( a ) * ::sinh( b ) );
+	}
+
+	// Hyperbolic cosine of a complex argument:
+	//   cosh(a + bi) = cosh(a) cos(b) + i sinh(a) sin(b)
+	Complex cosh( const Complex &z ) {
+		const float a = z.Real();
+		const float b = z.Imag();
+		return Complex( ::cosh( a ) * ::cos( b ), ::sinh( a ) * ::sin( b ) );
+	}
+
+	// Hyperbolic sine of a complex argument:
+	//   sinh(a + bi) = sinh(a) cos(b) + i cosh(a) sin(b)
+	Complex sinh( const Complex &z ) {
+		const float a = z.Real();
+		const float b = z.Imag();
+		return Complex( ::sinh( a ) * ::cos( b ), ::cosh( a ) * ::sin( b ) );
+	}
+
+	// log(z) = ( log|z|, angle of z ), the angle running from 0 up to 2*pi.
+	Complex log( const Complex &z ) {
+		float r = ::sqrt( z.Real() * z.Real() + z.Imag() * z.Imag() );
+		float t = ( r != 0 ) ? ::acos( z.Real() / r ) : 0;  // angle 0 at z = 0, not acos(0/0) = NaN
+		if( z.Imag() < 0.0 ) t = 2.0 * 3.1415926 - t;
+		return Complex( ::log(r), t );
+	}
+};
diff --git a/src/nvtt/bc6h/arvo/Complex.h b/src/nvtt/bc6h/arvo/Complex.h
new file mode 100755
index 0000000..671fd57
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Complex.h
@@ -0,0 +1,187 @@
+/***************************************************************************
+* Complex.h                                                                *
+*                                                                          *
+* Complex numbers, complex arithmetic, and functions of a complex          *
+* variable.                                                                *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    03/02/2000  Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __COMPLEX_INCLUDED__
+#define __COMPLEX_INCLUDED__
+
+#include <math.h>
+#include <iostream>
+
+namespace ArvoMath {
+
+	class Complex {  // Single-precision complex number: x is the real part, y the imaginary part.
+	public:
+		Complex()                   : x( 0 ),   y( 0 )   {}
+		Complex( float a          ) : x( a ),   y( 0 )   {}  // also an implicit float -> Complex conversion
+		Complex( float a, float b ) : x( a ),   y( b )   {}
+		Complex( const Complex &z ) : x( z.x ), y( z.y ) {}
+		float &Real() { return x; }  // writable access to the real part
+		float &Imag() { return y; }  // writable access to the imaginary part
+		float Real() const { return x; }
+		float Imag() const { return y; }
+		inline Complex &operator=( const Complex &z );
+		static const Complex i;      // defined in Complex.cpp as ( 0, 1 )
+	private:
+		float x;
+		float y;
+	};
+
+	// Member-wise assignment; returns *this so that assignments can be chained.
+	inline Complex &Complex::operator=( const Complex &z ) {
+		x = z.x;
+		y = z.y;
+		return *this;
+	}
+
+	// Free-function spelling of z.Real().
+	inline float Real( const Complex &z ) {
+		return z.Real();
+	}
+
+	// Free-function spelling of z.Imag().
+	inline float Imag( const Complex &z ) {
+		return z.Imag();
+	}
+
+	// Complex conjugate: same real part, negated imaginary part.
+	inline Complex conj( const Complex &z ) {
+		return Complex( z.Real(), -z.Imag() );
+	}
+
+	inline double modsqr( const Complex &z ) {
+		const float re = z.Real(), im = z.Imag();
+		return re * re + im * im;  // squared modulus, summed in float before widening
+	}
+
+	inline double modulus( const Complex &z ) {
+		const float re = z.Real(), im = z.Imag();
+		return sqrt( re * re + im * im );  // |z|
+	}
+
+	inline double arg( const Complex &z ) {  // angle from the positive real axis, 0 up to 2*pi
+		const double m = modulus(z);
+		float t = ( m != 0 ) ? acos( z.Real() / m ) : 0;  // arg(0) is reported as 0, not acos(0/0) = NaN
+		if( z.Imag() < 0.0 ) t = 2.0 * 3.1415926 - t;
+		return t;
+	}
+
+	// Scale a complex number by a real factor (factor on the right).
+	inline Complex operator*( const Complex &z, float a ) {
+		return Complex( z.Real() * a, z.Imag() * a );
+	}
+
+	// Scale a complex number by a real factor (factor on the left).
+	inline Complex operator*( float a, const Complex &z ) {
+		return Complex( z.Real() * a, z.Imag() * a );
+	}
+
+	// Complex product: (a + bi)(c + di) = (ac - bd) + (ad + bc)i.
+	inline Complex operator*( const Complex &z, const Complex &w ) {
+		const float a = z.Real(), b = z.Imag();
+		const float c = w.Real(), d = w.Imag();
+		// Both parts are computed from the saved operands only.
+		return Complex( a * c - b * d, a * d + b * c );
+	}
+
+	// Component-wise sum.
+	inline Complex operator+( const Complex &z, const Complex &w ) {
+		return Complex( z.Real() + w.Real(), z.Imag() + w.Imag() );
+	}
+
+	// Component-wise difference z - w.
+	inline Complex operator-( const Complex &z, const Complex &w ) {
+		return Complex( z.Real() - w.Real(), z.Imag() - w.Imag() );
+	}
+
+	// Unary minus: negate both components.
+	inline Complex operator-( const Complex &z ) {
+		return Complex( -z.Real(), -z.Imag() );
+	}
+
+	// Divide both components by the real number w (no check for w == 0).
+	inline Complex operator/( const Complex &z, float w ) {
+		return Complex( z.Real() / w, z.Imag() / w );
+	}
+
+	inline Complex operator/( const Complex &z, const Complex &w ) {
+		const Complex numer = z * conj(w);  // z / w = z conj(w) / |w|^2
+		return numer / modsqr(w);
+	}
+
+	// Real number over complex number: a / w = conj(w) * ( a / |w|^2 ).
+	inline Complex operator/( float a, const Complex &w ) {
+		return conj(w) * ( a / modsqr(w) );
+	}
+
+	// Add w to z in place and return z.
+	inline Complex &operator+=( Complex &z, const Complex &w ) {
+		z.Real() = z.Real() + w.Real();
+		z.Imag() = z.Imag() + w.Imag();
+		return z;
+	}
+
+	inline Complex &operator*=( Complex &z, const Complex &w ) {
+		z = z * w;  // the product is fully formed before z is overwritten
+		return z;
+	}
+
+	// Subtract w from z in place and return z.
+	inline Complex &operator-=( Complex &z, const Complex &w ) {
+		z.Real() = z.Real() - w.Real();
+		z.Imag() = z.Imag() - w.Imag();
+		return z;
+	}
+
+	// exp(a + bi) = e^a * ( cos(b) + i sin(b) )
+	inline Complex exp( const Complex &z ) {
+		const float r = ::exp( z.Real() );
+		return Complex( r * cos( z.Imag() ), r * sin( z.Imag() ) );
+	}
+
+	// Integer power in polar form: z^n = |z|^n * ( cos(n t) + i sin(n t) ), t = arg(z).
+	inline Complex pow( const Complex &z, int n ) {
+		const float r = ::pow( modulus( z ), (double)n );
+		const float t = arg( z );
+		return Complex( r * cos( n * t ), r * sin( n * t ) );
+	}
+
+	// Build a complex number from modulus r and angle theta.
+	inline Complex polar( float r, float theta ) {
+		return Complex( r * cos( theta ), r * sin( theta ) );
+	}
+
+
+	extern Complex cos ( const Complex &z );  // These five are defined in Complex.cpp.
+	extern Complex sin ( const Complex &z );
+	extern Complex cosh( const Complex &z );
+	extern Complex sinh( const Complex &z );
+	extern Complex log ( const Complex &z );  // Imaginary part of the result is the angle of z.
+
+	extern std::ostream &operator<<(  // Defined in Complex.cpp; writes "(real,imag) ".
+		std::ostream &out, 
+		const Complex & 
+		);
+};
+#endif
+
diff --git a/src/nvtt/bc6h/arvo/Matrix.cpp b/src/nvtt/bc6h/arvo/Matrix.cpp
new file mode 100755
index 0000000..d84b7ef
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Matrix.cpp
@@ -0,0 +1,1201 @@
+/***************************************************************************
+* Matrix.C                                                                 *
+*                                                                          *
+* General Vector and Matrix classes, with all the associated methods.      *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    08/16/2000    Revamped for CIT tools.                       *
+*      arvo    10/31/1994    Combined RowVec & ColVec into Vector.         *
+*      arvo    06/30/1993    Added singular value decomposition class.     *
+*      arvo    06/25/1993    Major revisions.                              *
+*      arvo    09/08/1991    Initial implementation.                       *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <iostream>
+#include <assert.h>
+#include <math.h>
+#include "ArvoMath.h"
+#include "Vector.h"
+#include "Matrix.h"
+#include "form.h"
+
+namespace ArvoMath {
+	const Matrix Matrix::Null(0);
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  C O N S T R U C T O R S                                                *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+
+	// Construct an n_rows x n_cols matrix with every entry set to "value"
+	// (zero by default).  Passing n_cols == 0 (the default) asks for a square
+	// n_rows x n_rows matrix; SetSize applies that rule.
+	Matrix::Matrix( int n_rows, int n_cols, float value )
+	{
+		assert( n_rows >= 0 && n_cols >= 0 );
+		elem = NULL;
+		rows = 0;
+		cols = 0;
+		SetSize( n_rows, n_cols );
+		const int count = rows * cols;
+		for( int k = 0; k < count; k++ ) elem[k] = value;
+	}
+
+	// Copy constructor: allocates fresh storage and duplicates M's entries.
+	Matrix::Matrix( const Matrix &M )
+	{
+		elem = NULL;
+		rows = 0;
+		cols = 0;
+		SetSize( M.Rows(), M.Cols() );
+		const float *src = M.Array();
+		const int count = rows * cols;
+		for( int k = 0; k < count; k++ ) elem[k] = src[k];
+	}
+
+	Matrix::~Matrix() 
+	{
+		SetSize( 0, 0 );  // Resizing a non-empty matrix to 0 x 0 deletes its element array.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  M I S C E L L A N E O U S   M E T H O D S                              *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+
+	// Re-shape the matrix to new_rows x new_cols.  Storage is reallocated only
+	// when the total element count changes; in that case the old contents are
+	// discarded and the new array is left uninitialized.  If new_cols is zero
+	// (the default), the matrix is made square: new_rows x new_rows.
+	void Matrix::SetSize( int new_rows, int new_cols )
+	{
+		if( new_cols == 0 ) new_cols = new_rows;
+		const int count = new_rows * new_cols;
+		if( count != rows * cols )
+		{
+			delete[] elem;  // delete[] on a null pointer is a no-op.
+			elem = ( count != 0 ) ? new float[ count ] : NULL;
+		}
+		rows = new_rows;
+		cols = new_cols;
+	}
+
+	Vector Matrix::GetCol( int j ) const
+	{
+		// Copy column j (no bounds check on j) into a new vector of length rows.
+		Vector column( rows );
+		float *dst = column.Array();
+		for( int i = 0; i < rows; i++ )
+		{
+			// Row-major storage: element (i,j) lives at offset i * cols + j.
+			dst[i] = elem[ i * cols + j ];
+		}
+		return column;
+	}
+
+	Vector Matrix::GetRow( int i ) const
+	{
+		Vector row( cols );
+		float *dst = row.Array();
+		const float *src = elem + ( i * cols );  // Row i starts at offset i * cols (i is not range-checked).
+		for( int j = 0; j < cols; j++ ) dst[j] = src[j];
+		return row;
+	}
+
+	void Matrix::SetCol( int j, const Vector &C )
+	{
+		// Overwrite column j with C, which must have exactly "rows" entries.
+		assert( rows == C.Size() );
+		const float *src = C.Array();
+		for( int i = 0; i < rows; i++ )
+		{
+			// Row-major storage: element (i,j) lives at offset i * cols + j.
+			elem[ i * cols + j ] = src[i];
+		}
+	}
+
+	void Matrix::SetRow( int i, const Vector &R )
+	{
+		assert( cols == R.Size() );  // R must supply one value per column.
+		const float *src = R.Array();
+		float *dst = elem + ( i * cols );
+		for( int j = 0; j < cols; j++ ) dst[j] = src[j];
+	}
+
+	Matrix Matrix::GetBlock( int imin, int imax, int jmin, int jmax ) const
+	{
+		// An inverted range yields the empty (0 x 0) matrix.
+		if( imax < imin || jmax < jmin ) return Matrix(0,0);
+		const int n_i = imax - imin + 1;
+		const int n_j = jmax - jmin + 1;
+		Matrix block( n_i, n_j );
+		for( int di = 0; di < n_i; di++ )
+			for( int dj = 0; dj < n_j; dj++ ) block( di, dj ) = (*this)( imin + di, jmin + dj );
+		return block;
+	}
+
+	void Matrix::SetBlock( int imin, int imax, int jmin, int jmax, const Matrix &B )
+	{
+		// B must match the inclusive index range [imin,imax] x [jmin,jmax] exactly.
+		const int n_i = imax - imin + 1;
+		const int n_j = jmax - jmin + 1;
+		assert( n_i == B.Rows() );
+		assert( n_j == B.Cols() );
+		for( int i = 0; i < n_i; i++ )
+		{
+			float *dst = elem + ( imin + i ) * cols + jmin;  // Start of row i of the block.
+			for( int j = 0; j < n_j; j++ ) dst[j] = B(i,j);
+		}
+	}
+
+	void Matrix::SetBlock( int imin, int imax, int jmin, int jmax, const Vector &V )
+	{
+		// Only a single row or a single column can be filled from a vector;
+		// any other block shape is an error and trips this assertion.
+		assert( imin == imax || jmin == jmax );
+		const int base = imin * cols + jmin;  // Offset of element (imin,jmin).
+		if( imin == imax )
+		{
+			// Row block: consecutive elements.
+			const int n_j = jmax - jmin + 1;
+			assert( n_j == V.Size() );
+			for( int j = 0; j < n_j; j++ ) elem[ base + j ] = V(j);
+		}
+		else if( jmin == jmax )
+		{
+			// Column block: elements are one row stride (cols) apart.
+			const int n_i = imax - imin + 1;
+			assert( n_i == V.Size() );
+			for( int i = 0; i < n_i; i++ ) elem[ base + i * cols ] = V(i);
+		}
+	}
+
+	Matrix &Matrix::SwapRows( int i1, int i2 )
+	{
+		// Exchange rows i1 and i2 element by element; returns *this so that
+		// calls can be chained.  The row indices are not range-checked.
+		float *row_a = elem + ( i1 * cols );
+		float *row_b = elem + ( i2 * cols );
+		for( int j = 0; j < cols; j++ )
+		{
+			const float held = row_a[j];
+			row_a[j] = row_b[j];
+			row_b[j] = held;
+		}
+
+		return *this;
+	}
+
+	Matrix &Matrix::SwapCols( int j1, int j2 )
+	{
+		// Exchange columns j1 and j2; returns *this so that calls can be
+		// chained.  The column indices are not range-checked.
+		float *col_a = elem + j1;
+		float *col_b = elem + j2;
+		for( int i = 0; i < rows; i++ )
+		{
+			const int at = i * cols;  // Offset of row i.
+			const float held = col_a[at];
+			col_a[at] = col_b[at];
+			col_b[at] = held;
+		}
+		return *this;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  A S S I G N M E N T    O P E R A T O R S                               *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Matrix& Matrix::operator=( const Matrix &M )
+	{
+		SetSize( M.Rows(), M.Cols() );  // Reallocates only if the element count differs.
+		const float *src = M.Array();
+		const int count = rows * cols;
+		for( int k = 0; k < count; k++ ) elem[k] = src[k];
+		return *this;
+	}
+
+	Matrix& Matrix::operator=( float s )
+	{
+		const int count = rows * cols;  // Fill every entry with s; the shape is unchanged.
+		for( int k = 0; k < count; k++ ) elem[k] = s;
+		return *this;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  O P E R A T O R S                                                      *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Vector operator*( const Matrix &M, const Vector &A )
+	{
+		// Special case: a 4x4 matrix applied to a 3-vector treats the vector as
+		// the point (A0, A1, A2, 1), so the last column acts as a translation.
+		if( M.Cols() == 4 && M.Rows() == 4 && A.Size() == 3 )
+		{
+			Vector P(3);
+			for( int r = 0; r < 3; r++ )
+				P(r) = M(r,0) * A(0) + M(r,1) * A(1) + M(r,2) * A(2) + M(r,3);
+			return P;
+		}
+		assert( M.Cols() == A.Size() );
+		// General case: ordinary matrix-vector product, accumulated in double.
+		Vector C( M.Rows() );
+		const float *row = M.Array();
+		const float *vec = A.Array();
+		for( int i = 0; i < M.Rows(); i++, row += M.Cols() )
+		{
+			double sum = row[0] * vec[0];
+			for( int j = 1; j < M.Cols(); j++ ) sum += row[j] * vec[j];
+			C(i) = sum;
+		}
+		return C;
+	}
+
+	Vector operator*( const Vector &A, const Matrix &M )
+	{
+		// Row vector times matrix: C(j) = sum over i of A(i) * M(i,j), accumulated in double.
+		assert( A.Size() == M.Rows() );
+		Vector C( M.Cols() );
+		const float *vec = A.Array();
+		for( int j = 0; j < M.Cols(); j++ )
+		{
+			double sum = 0.0;
+			for( int i = 0; i < M.Rows(); i++ ) sum += vec[i] * M(i,j);
+			C(j) = sum;
+		}
+		return C;
+	}
+
+	Vector& operator*=( Vector &A, const Matrix &M )
+	{
+		// Replaces A with M * A (the matrix is applied on the left).  A 4x4
+		// matrix applied to a 3-vector treats the vector as the point
+		// (A0, A1, A2, 1), so the last column acts as a translation.
+		if( M.Cols() == 4 && M.Rows() == 4 && A.Size() == 3 )
+		{
+			float moved[3];
+			for( int r = 0; r < 3; r++ )
+				moved[r] = M(r,0) * A(0) + M(r,1) * A(1) + M(r,2) * A(2) + M(r,3);
+			for( int r = 0; r < 3; r++ ) A(r) = moved[r];
+			return A;
+		}
+		assert( M.Cols() == A.Size() );
+		Vector C( M.Rows() );
+		const float *row = M.Array();
+		for( int i = 0; i < M.Rows(); i++, row += A.Size() )
+		{
+			double sum = 0.0;
+			for( int j = 0; j < A.Size(); j++ ) sum += row[j] * A(j);
+			C(i) = sum;
+		}
+		A = C;
+		return A;
+	}
+
+	Matrix& operator*=( Matrix &M, float s )
+	{
+		float *entry = M.Array();  // Scale every entry of M by s, in place.
+		for( int left = M.Rows() * M.Cols(); left > 0; left-- ) *entry++ *= s;
+		return M;
+	}
+
+	Matrix& operator/=( Matrix &M, float s )
+	{
+		assert( s != 0.0 );  // Division by zero is a caller error.
+		float *entry = M.Array();
+		for( int left = M.Rows() * M.Cols(); left > 0; left-- ) *entry++ /= s;
+		return M;
+	}
+
+	Matrix operator+( const Matrix &A, const Matrix &B )
+	{
+		assert( A.Rows() == B.Rows() );  // Element-wise sum: shapes must agree.
+		assert( A.Cols() == B.Cols() );
+		Matrix C( A.Rows(), A.Cols() );
+		const float *lhs = A.Array();
+		const float *rhs = B.Array();
+		float *out = C.Array();
+		for( int k = 0; k < A.Rows() * A.Cols(); k++ ) out[k] = lhs[k] + rhs[k];
+		return C;
+	}
+
+	Matrix operator-( const Matrix &A, const Matrix &B )
+	{
+		assert( A.Rows() == B.Rows() );  // Element-wise difference: shapes must agree.
+		assert( A.Cols() == B.Cols() );
+		Matrix C( A.Rows(), A.Cols() );
+		const float *lhs = A.Array();
+		const float *rhs = B.Array();
+		float *out = C.Array();
+		for( int k = 0; k < A.Rows() * A.Cols(); k++ ) out[k] = lhs[k] - rhs[k];
+		return C;
+	}
+
+	Matrix operator-( const Matrix &A )
+	{
+		// Element-wise negation.  The result must have A's own shape, so the
+		// dimensions are passed as (rows, cols) -- not swapped.
+		Matrix B( A.Rows(), A.Cols() );
+		const float *a = A.Array();
+		float *b = B.Array();
+		const int count = A.Rows() * A.Cols();
+		for( int i = 0; i < count; i++ ) b[i] = -a[i];
+		return B;
+	}
+
+	Matrix& operator+=( Matrix &A, const Matrix &B )
+	{
+		assert( A.Rows() == B.Rows() );  // In-place element-wise sum: shapes must agree.
+		assert( A.Cols() == B.Cols() );
+		float *acc = A.Array();
+		const float *add = B.Array();
+		for( int k = 0; k < A.Rows() * A.Cols(); k++ ) acc[k] += add[k];
+		return A;
+	}
+
+	Matrix operator*( const Matrix &A, const Matrix &B )
+	{
+		assert( A.Cols() == B.Rows() );  // Inner dimensions must agree.
+		Matrix product( A.Rows(), B.Cols() );
+		for( int row = 0; row < A.Rows(); row++ )
+			for( int col = 0; col < B.Cols(); col++ )
+			{
+				double dot = 0.0;  // Accumulated in double, stored as float.
+				for( int t = 0; t < A.Cols(); t++ ) dot += A(row,t) * B(t,col);
+				product(row,col) = dot;
+			}
+		return product;
+	}
+
+	Matrix operator*( float s, const Matrix &A )
+	{
+		// Scalar times matrix.  The result must have A's own shape, so the
+		// dimensions are passed as (rows, cols) -- not swapped.
+		Matrix B( A.Rows(), A.Cols() );
+		const float *a = A.Array();
+		float *b = B.Array();
+		const int count = A.Rows() * A.Cols();
+		for( int i = 0; i < count; i++ ) b[i] = s * a[i];
+		return B;
+	}
+
+	Matrix operator*( const Matrix &A, float s )
+	{
+		// Matrix times scalar.  The result must have A's own shape, so the
+		// dimensions are passed as (rows, cols) -- not swapped.
+		Matrix B( A.Rows(), A.Cols() );
+		const float *a = A.Array();
+		float *b = B.Array();
+		const int count = A.Rows() * A.Cols();
+		for( int i = 0; i < count; i++ ) b[i] = s * a[i];
+		return B;
+	}
+
+	Matrix operator/( const Matrix &A, float s )
+	{
+		assert( s != 0.0 );  // Division by zero is a caller error.
+		// The quotient must have A's own shape, so the dimensions are passed
+		// as (rows, cols) -- not swapped.
+		Matrix B( A.Rows(), A.Cols() );
+		const float *a = A.Array();
+		float *b = B.Array();
+		const int count = A.Rows() * A.Cols();
+		for( int i = 0; i < count; i++ ) b[i] = a[i] / s;
+		return B;
+	}
+
+	Matrix& operator*=( Matrix &A, const Matrix &B )
+	{
+		// A keeps its shape, so B must be square (B.Rows() == B.Cols() == A.Cols()); otherwise R or A is overrun.
+		assert( A.Cols() == B.Rows() && B.Rows() == B.Cols() );
+		Vector R( B.Cols() );  // Scratch row: row i of A is still being read while row i of A * B is built.
+		for( int i = 0; i < A.Rows(); i++ )
+		{
+			for( int j = 0; j < B.Cols(); j++ )  // Compute the ith row of A * B.
+			{
+				double sum = A(i,0) * B(0,j);
+				for( int k = 1; k < A.Cols(); k++ ) sum += A(i,k) * B(k,j);
+				R(j) = sum;
+			}
+			for( int k = 0; k < A.Cols(); k++ ) A(i,k) = R(k);  // Copy the new i'th row back into A.
+		}
+		return A;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  M I S C E L L A N E O U S   F U N C T I O N S                          *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Matrix Transp( const Matrix &M )
+	{
+		// The transpose has the swapped shape (cols x rows), with T(j,i) = M(i,j).
+		Matrix T( M.Cols(), M.Rows() );
+		for( int i = 0; i < M.Rows(); i++ )
+			for( int j = 0; j < M.Cols(); j++ ) T(j,i) = M(i,j);
+		return T;
+	}
+
+	// Computes A * Transp(A), an n x n matrix where n = A.Rows().
+	Matrix AATransp( const Matrix &A )
+	{
+		const int n = A.Rows();
+		const int inner = A.Cols();
+		Matrix B( n, n );
+		for( int i = 0; i < n; i++ )
+			for( int j = 0; j < n; j++ )
+			{
+				double dot = 0.0;  // Dot product of rows i and j, accumulated in double.
+				for( int k = 0; k < inner; k++ ) dot += A(i,k) * A(j,k);
+				B(i,j) = dot;
+			}
+		return B;
+	}
+
+	// Computes Transp(A) * A, an n x n matrix where n = A.Cols().
+	Matrix ATranspA( const Matrix &A )
+	{
+		const int n = A.Cols();
+		const int inner = A.Rows();
+		Matrix B( n, n );
+		for( int i = 0; i < n; i++ )
+			for( int j = 0; j < n; j++ )
+			{
+				double dot = 0.0;  // Dot product of columns i and j, accumulated in double.
+				for( int k = 0; k < inner; k++ ) dot += A(k,i) * A(k,j);
+				B(i,j) = dot;
+			}
+		return B;
+	}
+
+	// Computes the outer product of the vectors A and B: the A.Size() x B.Size()
+	// matrix with entries M(i,j) = A(i) * B(j).
+	Matrix Outer( const Vector &A, const Vector &B )
+	{
+		const int n_rows = A.Size();
+		const int n_cols = B.Size();
+		Matrix M( n_rows, n_cols );
+		for( int i = 0; i < n_rows; i++ )
+			for( int j = 0; j < n_cols; j++ ) M(i,j) = A(i) * B(j);
+		return M;
+	}
+
+	// Computes the 1-norm of the matrix A (the norm induced by the vector
+	// 1-norm), which is the maximum absolute column sum.
+	double OneNorm( const Matrix &A )
+	{
+		double norm = 0.0;
+		for( int j = 0; j < A.Cols(); j++ )
+		{
+			double sum = 0.0;
+			for( int i = 0; i < A.Rows(); i++ ) sum += Abs( A(i,j) );
+			if( sum > norm ) norm = sum;
+		}
+		return norm;
+	}
+
+	// Computes the infinity-norm (sup-norm) of the matrix A, which is the
+	// maximum absolute row sum.
+	double SupNorm( const Matrix &A )
+	{
+		double norm = 0.0;
+		for( int i = 0; i < A.Rows(); i++ )
+		{
+			double sum = 0.0;
+			for( int j = 0; j < A.Cols(); j++ ) sum += Abs( A(i,j) );
+			if( sum > norm ) norm = sum;
+		}
+		return norm;
+	}
+
+	// Returns the square matrix with the elements of the vector d along
+	// its diagonal; the other entries keep the constructor's fill value.
+	Matrix Diag( const Vector &d )
+	{
+		Matrix result( d.Size() );
+		for( int k = 0; k < d.Size(); k++ ) result(k,k) = d(k);
+		return result;
+	}
+
+	// Returns the 3 x 3 diagonal matrix with x, y, and z as its diagonal
+	// elements; the other entries keep the constructor's fill value.
+	Matrix Diag( float x, float y, float z )
+	{
+		const float diagonal[3] = { x, y, z };
+		Matrix D(3,3);
+		// Only the three diagonal entries are written.
+		for( int k = 0; k < 3; k++ ) D(k,k) = diagonal[k];
+		return D;
+	}
+
+	// Returns the vector consisting of the diagonal elements of the
+	// matrix M, which need not be square.
+	Vector Diag( const Matrix &M )
+	{
+		const int length = Min( M.Rows(), M.Cols() );  // The diagonal stops at the shorter dimension.
+		Vector diagonal( length );
+		for( int k = 0; k < length; k++ ) diagonal(k) = M(k,k);
+		return diagonal;
+	}
+
+	// Returns the n x n identity matrix.
+	Matrix Ident( int n )
+	{
+		Matrix identity( n );  // Square n x n; off-diagonal entries keep the constructor's fill value.
+		for( int k = 0; k < n; k++ ) identity(k,k) = 1.0;
+		return identity;
+	}
+
+	// Determines whether the matrix M is "Null" -- i.e. has zero rows
+	// or columns.
+	int Null( const Matrix &M )
+	{  // 1 when the matrix has no rows or no columns, 0 otherwise.
+		return ( M.Rows() == 0 || M.Cols() == 0 ) ? 1 : 0;
+	}
+
+	int Square( const Matrix &M )
+	{  // 1 when the row and column counts are equal, 0 otherwise.
+		return ( M.Rows() == M.Cols() ) ? 1 : 0;
+	}
+
+	// Convert a "vector-shaped" matrix to a vector.  That is, represent a
+	// matrix with a single row or a single column as a vector.
+	Vector ToVector( const Matrix &M )
+	{
+		if( M.Rows() == 1 )
+		{
+			const int n = M.Cols();
+			Vector row( n );
+			for( int j = 0; j < n; j++ ) row(j) = M(0,j);
+			return row;
+		}
+		if( M.Cols() == 1 )
+		{
+			const int n = M.Rows();
+			Vector col( n );
+			for( int i = 0; i < n; i++ ) col(i) = M(i,0);
+			return col;
+		}
+		// Neither a row nor a column: report an error.  When assertions are
+		// compiled out, fall back to a default-constructed vector.
+		assert( M.Rows() == 1 || M.Cols() == 1 );
+		return Vector();
+	}
+
+	std::ostream &operator<<( std::ostream &out, const Matrix &M )
+	{
+		if( M.Rows() == 0 || M.Cols() == 0 )  // An empty matrix prints as "NULL".
+		{
+			out << "NULL" << std::endl;
+			return out;
+		}
+		for( int i = 0; i < M.Rows(); i++ )  // One output line per row, prefixed by the row index.
+		{
+			out << form( "%3d: ", i );
+			for( int j = 0; j < M.Cols(); j++ ) out << form( " %10.5g", M(i,j) );
+			out << std::endl;
+		}
+		return out;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* R O T A T I O N                                                         *
+	*                                                                         * 
+	* Builds a 3x3 modeling matrix that performs a rotation about an          *
+	* arbitrary axis.  The rotation is right-handed about this axis and       *
+	* "angle" is taken to be in radians.  The only error that can occur is    *
+	* when "axis" is the zero-vector.                                         *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	Matrix Rotation( const Vector &Axis, float angle )
+	{
+		// Compute a unit quaternion (a,b,c,d) that performs the rotation.
+		// A zero axis has no direction; Matrix(3,3) is returned in that case.
+		float t = TwoNormSqr( Axis );
+		if( t == 0.0 ) return Matrix(3,3);
+		t = sin( angle * 0.5 ) / sqrt( t );
+
+		// Fill in the entries of the quaternion.
+
+		float a = cos( angle * 0.5 );
+		float b = t * Axis(0);
+		float c = t * Axis(1);
+		float d = t * Axis(2);
+
+		// Compute all the double products of a, b, c, and d, except a * a.
+
+		float bb = b * b;
+		float cc = c * c;
+		float dd = d * d;
+		float ab = a * b;
+		float ac = a * c;
+		float ad = a * d;
+		float bc = b * c;
+		float bd = b * d;
+		float cd = c * d;
+
+		// Fill in the entries of the rotation matrix.  The signs of the "a"
+		// terms give a right-handed rotation when the matrix is applied as R * v.
+		Matrix R(3,3);
+
+		R(0,0) = 1.0 - 2.0 * ( cc + dd );
+		R(0,1) =       2.0 * ( bc - ad );
+		R(0,2) =       2.0 * ( bd + ac );
+
+		R(1,0) =       2.0 * ( bc + ad );
+		R(1,1) = 1.0 - 2.0 * ( bb + dd );
+		R(1,2) =       2.0 * ( cd - ab );
+
+		R(2,0) =       2.0 * ( bd - ac );
+		R(2,1) =       2.0 * ( cd + ab );
+		R(2,2) = 1.0 - 2.0 * ( bb + cc );
+
+		return R;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* R O T A T I O N                                                         *
+	*                                                                         * 
+	* Builds a 4x4 modeling matrix that performs a rotation about an          *
+	* arbitrary axis through an arbitrary point.  The rotation is             *
+	* right-handed about this axis and "angle" is taken to be in radians.     *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	Matrix Rotation( const Vector &Axis, const Vector &Origin, float angle )
+	{
+		const Matrix R = Rotation( Axis, angle );  // A simple 3x3 rotation.
+		Matrix M = Ident(4);                       // A 4x4 including translation.
+
+		// Compute the last column of the matrix (the translation) using the
+		// 3x3 rotation matrix.  The 4x4 matrix first translates by -Origin,
+		// then rotates, then translates back by Origin:
+		//
+		//       | I   p | | R   0 | | I  -p |   | R   p - Rp |
+		//       |       | |       | |       | = |            |
+		//       | 0   1 | | 0   1 | | 0   1 |   | 0      1   |
+		//
+		// So, the desired column is  p - R p.
+
+		const Vector shift( Origin - R * Origin );
+		for( int row = 0; row < 3; row++ )
+		{
+			for( int col = 0; col < 3; col++ )
+				M(row,col) = R(row,col);
+			M(row,3) = shift(row);
+		}
+		return M;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* X  R O T A T I O N                                                      *
+	*                                                                         * 
+	* Builds a 3x3 modeling matrix that performs a rotation about the X-axis. *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	Matrix Xrotation( float angle )
+	{
+		const float c = cos( angle );
+		const float s = sin( angle );
+		Matrix M = Ident(3);  // Row and column 0 (the X-axis) stay as in the identity.
+		M(1,1) =  c;  M(1,2) = -s;
+		M(2,1) =  s;  M(2,2) =  c;
+		return M;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Y  R O T A T I O N                                                      *
+	*                                                                         * 
+	* Builds a 3x3 modeling matrix that performs a rotation about the Y-axis. *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	Matrix Yrotation( float angle )
+	{
+		const float c = cos( angle );
+		const float s = sin( angle );
+		Matrix M = Ident(3);  // Row and column 1 (the Y-axis) stay as in the identity.
+		M(0,0) =  c;  M(0,2) =  s;  // Right-handed about +Y when applied as M * v: +sin goes in
+		M(2,0) = -s;  M(2,2) =  c;  // the upper-right corner (cyclic axis order z -> x).
+		return M;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Z  R O T A T I O N                                                      *
+	*                                                                         * 
+	* Builds a 3x3 modeling matrix that performs a rotation about the Z-axis. *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	Matrix Zrotation( float angle )
+	{
+		const float c = cos( angle );
+		const float s = sin( angle );
+		Matrix M = Ident(3);  // Row and column 2 (the Z-axis) stay as in the identity.
+		M(0,0) =  c;  M(0,1) = -s;
+		M(1,0) =  s;  M(1,1) =  c;
+		return M;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* H O U S E H O L D E R                                                   *
+	*                                                                         * 
+	* Returns the Householder reflection matrix that reflects through the     *  
+	* plane orthogonal to V.  The vector V is not assumed to be normalized.   *  
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	Matrix Householder( const Vector &V )
+	{
+		// H = I - c * Outer( V, V ) with c = 2 / ( V * V ); V * V is used as a divisor, so V must be nonzero.
+		const float c = 2.0 / ( V * V );
+		const Matrix I = Ident( V.Size() );
+		return I - Outer( c * V, V );
+	}
+
+	/*=========================================================================*
+	*  R O T A T I O N                Author: Jim Arvo, 1991                  *
+	*                                                                         *
+	*  This routine maps three values (x1, x2, x3) in the range [0,1] into    *
+	*  a 3x3 rotation matrix, M.  Uniformly distributed random variables      *
+	*  x1, x2, and x3 create uniformly distributed random rotation matrices.  *
+	*  To create small uniformly distributed "perturbations", supply          *
+	*  samples in the following ranges                                        *
+	*                                                                         *
+	*      x1 in [ 0, d ]                                                     *
+	*      x2 in [ 0, 1 ]                                                     *
+	*      x3 in [ 0, d ]                                                     *
+	*                                                                         *
+	* where 0 < d < 1 controls the size of the perturbation.  Any of the      *
+	* random variables may be stratified (or "jittered") for a slightly more  *
+	* even distribution.                                                      *
+	*                                                                         *
+	*=========================================================================*/
+	Matrix Rotation( float x1, float x2, float x3 )
+	{
+		Matrix M(3,3);
+		float theta = x1 * TwoPi; // Rotation about the pole (Z). 
+		float phi   = x2 * TwoPi; // For direction of pole deflection.
+		float z     = x3 * 2.0;   // For magnitude of pole deflection.
+
+		// Compute a vector V used for distributing points over the sphere
+		// via the reflection I - V Transpose(V).  V is built from phi and z,
+		// i.e. from x2 and x3; if those are uniformly distributed,
+		// the reflected points will be uniform on the sphere.  Note that V
+		// has length sqrt(2) to eliminate the 2 in the Householder matrix.
+
+		float r  = sqrt( z );
+		float Vx = sin( phi ) * r;
+		float Vy = cos( phi ) * r;
+		float Vz = sqrt( 2.0 - z );    
+
+		// Compute the row vector S = Transpose(V) * R, where R is a simple
+		// rotation by theta about the z-axis.  No need to compute Sz since
+		// it's just Vz.
+
+		float st = sin( theta );
+		float ct = cos( theta );
+		float Sx = Vx * ct - Vy * st;
+		float Sy = Vx * st + Vy * ct;
+
+		// Construct the rotation matrix  ( V Transpose(V) - I ) R, which
+		// is equivalent to V S - R.
+
+		M(0,0) = Vx * Sx - ct;
+		M(0,1) = Vx * Sy - st;
+		M(0,2) = Vx * Vz;
+
+		M(1,0) = Vy * Sx + st;
+		M(1,1) = Vy * Sy - ct;
+		M(1,2) = Vy * Vz;
+
+		M(2,0) = Vz * Sx;
+		M(2,1) = Vz * Sy;
+		M(2,2) = 1.0 - z;   // This equals Vz * Vz - 1.0 
+
+		return M;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* P A R T I A L   P I V O T                                               *
+	*                                                                         * 
+	* Look for the element with the largest magnitude on or below the         *
+	* diagonal in column "col" of the matrix A.  Bring this element to the    *
+	* diagonal by a row interchange.  Perform the same row interchange on b.  *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	static int PartialPivot( int col, Matrix &A, Vector &b )
+	{
+		// Returns 0 if every candidate pivot in the column is zero (nothing is
+		// swapped in that case), and 1 otherwise.
+		const int n = A.Cols();
+		int   best_row = col;
+		float best_mag = Abs( A( col, col ) );
+		for( int row = col + 1; row < n; row++ )
+		{
+			const float mag = Abs( A( row, col ) );
+			if( !( mag > best_mag ) ) continue;
+			best_mag = mag;
+			best_row = row;
+		}
+		if( best_mag == 0.0 ) return 0;
+		if( best_row != col )
+		{
+			A.SwapRows( col, best_row );
+			b.Swap    ( col, best_row );
+		}
+		return 1;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* G A U S S I A N   E L I M I N A T I O N                                 *
+	*                                                                         * 
+	* Solves the linear system A x = b using Gaussian elimination, with or    *
+	* without partial pivoting.                                               *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	int GaussElimination( const Matrix &A, const Vector &b, Vector &x, pivot_type pivot )
+	{
+		assert( Square( A ) );
+		assert( A.Rows() == b.Size() );
+		Matrix B( A );  // Work on copies so that A and b are left untouched.
+		Vector c( b );
+		x.SetSize( A.Cols() );
+		const int m = B.Rows();
+		int i, j, k;
+
+		// Forward elimination on the copies B and c.  Returns 0 (failure) as soon
+		// as a zero pivot is found; x then does not hold a valid solution.
+		for( i = 0; i < m; i++ )
+		{
+			if( pivot == pivot_partial ) PartialPivot( i, B, c );
+			if( B(i,i) == 0.0 ) return 0;  // Singular matrix, or a zero pivot with pivoting disabled.
+			for( j = i + 1; j < m; j++ )
+			{
+				double scale = -B(j,i) / B(i,i);
+				for( k = i; k < m; k++ )
+					B(j,k) += scale * B(i,k);
+				B(j,i) = 0.0;
+				c(j) += scale * c(i);
+			}
+		}
+
+		// Back substitution; every B(i,i) was checked to be nonzero above.
+
+		for( i = m - 1; i >= 0; i-- )
+		{
+			double a = 0.0;
+			for( j = i + 1; j < m; j++ ) a += B(i,j) * x(j);
+			x(i) = ( c(i) - a ) / B(i,i);
+		}
+
+		return 1;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  L E A S T   S Q U A R E S                                              *
+	*                                                                         *
+	* Solves the normal equations associated with the system A x = b, which   *
+	* are given by  Transp(A) A x = Transp(A) b.                              *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int LeastSquares( const Matrix &A, const Vector &b, Vector &x )
+	{
+		//
+		// Set up and solve the normal equations Transp(A) A x = Transp(A) b.
+		// Note that Transp(A) * b is computed here as b * A.  The status
+		// reported by GaussElimination is returned rather than discarded.
+		//
+		return GaussElimination( ATranspA(A), b * A, x );
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  D E T E R M I N A N T                                                  *
+	*                                                                         *
+	* Computes the determinant of the n by n matrix M using Householder       *
+	* transformations.                                                        *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	double Determinant( const Matrix &M )
+	{
+		static const float MachEps = MachineEpsilon();
+		assert( Square(M) );
+
+		double dot;
+		int    k;
+		Matrix A    = M;    // Make a copy that we can destroy.
+		double det  = 1.0;  // Multiply diagonal elements as they are generated.
+		int    sign = 1;    // Keep track of sign (each reflection has det -1).
+		int    n    = M.Cols();
+		if( n == 0 ) return 1.0;  // Empty matrix: avoid indexing A(-1,-1) below.
+		for( int i = 0; i < n - 1; i++ )
+		{
+			// Compute the 2-norm of the first column of the (n-i)x(n-i) submatrix.
+
+			dot = 0.0;
+			for( k = i; k < n; k++ ) dot += Sqr( A(k,i) );
+
+			double Xnorm = sqrt( dot );
+			if( Xnorm == 0.0 ) return 0.0;
+
+			// This norm is another diagonal element of the upper triangular
+			// matrix, so we multiply it into the running product for det.
+
+			det *= Xnorm;
+
+			// If X is already (+/-Xnorm, 0, ..., 0) the reflection is skipped
+			// (for +Xnorm, V would be zero).  With a negative leading entry the
+			// diagonal element is really -Xnorm, so the sign must be flipped.
+			float x1   = Abs( A(i,i) );
+			float diff = Abs( Xnorm - x1 );
+			if( diff < MachEps * Max( Xnorm, x1 ) ) { if( A(i,i) < 0.0 ) sign = -sign; continue; }
+
+			// Each Householder transformation has a determinant of -1,
+			// so we must keep track of how many we apply.
+
+			sign *= -1;
+
+			// Compute the V vector, which will define the Householder
+			// transformation via  H = I - V transp(V).  Leave it in the
+			// i'th column of A.  V = sqrt(2) * Normalized( X - ( Xnorm, 0, 0,... ) ).
+
+			float scale = 1.0 / sqrt( Xnorm * Abs( A(i,i) - Xnorm ) );  // sqrt(2) / || p ||
+			A(i,i) = ( A(i,i) - Xnorm ) * scale;
+			for( k = i + 1; k < n; k++ ) A(k,i) *= scale;
+
+			// Now apply the transformation I - V Transp(V) to all the remaining columns, 
+			// except for the first row.
+
+			for( int j = i + 1; j < n; j++ ) 
+			{
+				// Compute Y dot V.
+
+				dot = 0.0;
+				for( k = i; k < n; k++ ) dot += A(k,i) * A(k,j);
+
+				// Subtract V ( V dot A(*,j) ) from A(*,j), ignoring the first row.
+
+				for( k = i + 1; k < n; k++ ) A(k,j) -= A(k,i) * dot;
+
+			} // for j
+
+		} // for i
+
+		// Now multiply in the very last element of the matrix and
+		// the accumulated sign.
+
+		return det * A(n-1,n-1) * sign;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  C O F A C T O R                                                        *
+	*                                                                         *
+	* Returns the determinant of the matrix left after deleting row omit_i    *
+	* and column omit_j.  The (-1)^(i+j) sign is NOT applied here.            *
+	*-------------------------------------------------------------------------*/
+	double Matrix::Cofactor( int omit_i, int omit_j ) const
+	{
+		assert( Square( *this ) );
+		assert( omit_i >= 0 && omit_j >= 0 );
+		assert( omit_i < Rows() );
+		assert( omit_j < Cols() );
+
+		// Build the smaller matrix by stepping over its own indices; a source
+		// index at or beyond the omitted row/column is shifted up by one.
+
+		const int sub_rows = Rows() - 1;
+		const int sub_cols = Cols() - 1;
+		Matrix Minor( sub_rows, sub_cols );
+		for( int r = 0; r < sub_rows; r++ )
+		{
+			const int src_r = r + ( ( r >= omit_i ) ? 1 : 0 );
+			for( int c = 0; c < sub_cols; c++ )
+			{
+				const int src_c = c + ( ( c >= omit_j ) ? 1 : 0 );
+				Minor( r, c ) = (*this)( src_r, src_c );
+			}
+		}
+
+		return Determinant( Minor );  // Determinant of the reduced matrix.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  A D J O I N T                                                          *
+	*                                                                         *
+	* Computes the adjoint of a square matrix as laid out by this code: entry *
+	* (i,j) is the signed cofactor of M(i,j), with no transpose applied.      *
+	*-------------------------------------------------------------------------*/
+	Matrix Adjoint( const Matrix &M )
+	{
+		double unused_det;
+		return Adjoint( M, unused_det );  // Discard the determinant.
+	}
+
+	Matrix Adjoint( const Matrix &M, double &det )
+	{
+		assert( Square(M) );  // Checked before anything is allocated or read.
+		int n = M.Rows();
+		det   = 0.0;
+		Matrix A( n, n );
+		if( n == 3 )  // Closed form for the 3 by 3 case.
+		{
+			A(0,0) = M(1,1) * M(2,2) - M(1,2) * M(2,1);
+			A(0,1) = M(1,2) * M(2,0) - M(1,0) * M(2,2);
+			A(0,2) = M(1,0) * M(2,1) - M(1,1) * M(2,0);
+
+			A(1,0) = M(0,2) * M(2,1) - M(0,1) * M(2,2);
+			A(1,1) = M(0,0) * M(2,2) - M(0,2) * M(2,0);
+			A(1,2) = M(0,1) * M(2,0) - M(0,0) * M(2,1);
+
+			A(2,0) = M(0,1) * M(1,2) - M(0,2) * M(1,1);
+			A(2,1) = M(0,2) * M(1,0) - M(0,0) * M(1,2);
+			A(2,2) = M(0,0) * M(1,1) - M(0,1) * M(1,0);
+
+			det = A(0,0) * M(0,0) + A(1,0) * M(1,0) + A(2,0) * M(2,0);  // Expand along column 0.
+		}
+		else  // General case: one Cofactor() call per entry.
+		{
+			for( int i = 0; i < n; i++ )  // 'register' dropped: not valid in C++17.
+			{
+				for( int j = 0; j < n; j++ )
+				{
+					if( Odd( i + j ) )
+						A(i,j) = -M.Cofactor(i,j);
+					else A(i,j) =  M.Cofactor(i,j);
+				}
+				det += M(i,0) * A(i,0);  // Accumulate the column 0 expansion.
+			}
+		}
+		return A;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  I N V E R S E                                                          *
+	*                                                                         *
+	* Computes the inverse of a square matrix one column at a time by         *
+	* solving M x = e_i with GaussElimination (default pivoting mode).        *
+	*-------------------------------------------------------------------------*/
+	Matrix Inverse( const Matrix &M )
+	{
+		assert( Square( M ) );
+		const int order = M.Cols();
+		Matrix Result( order, order );
+		Vector rhs( order ), column( order );
+		// NOTE(review): relies on Vector(n) starting out all zero -- confirm.
+		for( int col = 0; col < order; col++ )
+		{
+			rhs( col ) = 1.0;                     // rhs is now the col'th unit vector.
+			GaussElimination( M, rhs, column );   // The returned status is not examined.
+			Result.SetCol( col, column );
+			rhs( col ) = 0.0;                     // Clear it again for the next column.
+		}
+		return Result;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  T R A C E                                                              *
+	*                                                                         *
+	* Computes the trace (sum of the diagonal) of a square matrix; the        *
+	* trace of an empty matrix is 0.                                          *
+	*-------------------------------------------------------------------------*/
+	extern double Trace( const Matrix &M )
+	{
+		assert( Square(M) );
+		double trace = 0.0;  // Empty sum, so a 0 by 0 matrix is never indexed.
+		for( int i = 0; i < M.Cols(); i++ ) trace += M(i,i);
+		return trace;
+	}
+};
+
+
+
+/*
+
+C
+C  Subroutine GAUSS solves the system Ax = b by using Gaussian elimination.
+C
+
+SUBROUTINE GAUSS( A, B, X, LDA, N, IFLAG )
+REAL A( LDA, N ), B( N ), X( N )
+
+DO 300 I = 1 , N - 1
+I2 = I
+CALL PIVOT( A, B, LDA, N, I2, IFLAG )
+IF ( IFLAG .LT. 0 ) RETURN
+DO 200 J = I + 1 , N
+TEMP = A( J , I ) / A( I , I )
+A( J , I ) = 0.0
+B( J ) = B( J ) - TEMP * B( I )
+DO 100 K = I + 1 , N
+A( J , K ) = A( J , K ) - TEMP * A( I , K )
+100           CONTINUE
+200       CONTINUE
+300   CONTINUE
+
+X( N ) = B( N ) / A( N , N )
+DO 500 I = N - 1 , 1 , -1
+TEMP = 0.0
+DO 400 J = I + 1 , N
+TEMP = TEMP + A( I , J ) * X( J )
+400       CONTINUE
+X( I ) = ( B( I ) - TEMP ) / A( I , I )
+500   CONTINUE
+
+RETURN
+END
+
+
+
+SUBROUTINE PIVOT( A, B, LDA, N, J, IFLAG )
+REAL A( LDA, N ), B( N ), AMAX, TEMP
+DATA TOL / 1.0E-6 /
+
+IFLAG = -1
+IF ( J .GT. N ) RETURN
+IF ( J .EQ. N .AND. ABS( A(N,N) ) .LT. TOL ) RETURN
+IF ( J .EQ. N ) GO TO 40
+
+AMAX  = ABS( A( J , J ) )
+INDEX = J
+10   DO 20 I = J + 1 , N
+IF ( ABS( A( I , J ) ) .LE. AMAX ) GO TO 20
+AMAX = ABS( A( I , J ) )
+INDEX = I
+20   CONTINUE
+
+IF ( AMAX .LT. TOL ) RETURN
+
+TEMP = B( J )
+B( J ) = B( INDEX )
+B( INDEX ) = TEMP
+
+DO 30 K = 1 , N
+TEMP = A( J , K )
+A( J , K ) = A( INDEX , K )
+A( INDEX , K ) = TEMP
+30   CONTINUE
+
+40   IFLAG = 1
+RETURN
+END
+
+
+*/
+
+
+
+
+
diff --git a/src/nvtt/bc6h/arvo/Matrix.h b/src/nvtt/bc6h/arvo/Matrix.h
new file mode 100755
index 0000000..1832c8f
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Matrix.h
@@ -0,0 +1,142 @@
+/***************************************************************************
+* Matrix.h                                                                 *
+*                                                                          *
+* General Vector and Matrix classes, with all the associated methods.      *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    08/16/2000    Revamped for CIT tools.                       *
+*      arvo    10/31/1994    Combined RowVec & ColVec into Vector.         *
+*      arvo    06/30/1993    Added singular value decomposition class.     *
+*      arvo    06/25/1993    Major revisions.                              *
+*      arvo    09/08/1991    Initial implementation.                       *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __MATRIX_INCLUDED__
+#define __MATRIX_INCLUDED__
+
+#include <iostream>
+#include "Vector.h"
+
+namespace ArvoMath {
+
+	class Matrix { // Matrix of floats kept in one array, indexed row by row (see operator()).
+	public:
+		Matrix( const Matrix & );
+		Matrix( int num_rows = 0, int num_cols = 0, float value = 0.0 ); // Every argument is defaulted.
+		~Matrix();
+		Matrix &operator=( const Matrix &M );
+		Matrix &operator=( float s );
+		Vector  GetCol( int col ) const;
+		Vector  GetRow( int row ) const;
+		void    SetCol( int col, const Vector & );
+		void    SetRow( int row, const Vector & );
+		Matrix  GetBlock( int imin, int imax, int jmin, int jmax ) const;
+		void    SetBlock( int imin, int imax, int jmin, int jmax, const Matrix & );
+		void    SetBlock( int imin, int imax, int jmin, int jmax, const Vector & );
+		Matrix &SwapRows( int i1, int i2 );
+		Matrix &SwapCols( int j1, int j2 );
+		void    SetSize( int rows, int cols = 0 );
+		double  Cofactor( int i, int j ) const; // Determinant with row i and column j removed; no sign applied.
+		static  const Matrix Null;
+
+	public: // Inlined functions.  None of them checks its indices.
+		inline float  operator()( int i, int j ) const { return elem[ i * cols + j ]; } // Read element (i,j).
+		inline float &operator()( int i, int j )       { return elem[ i * cols + j ]; } // Writable element (i,j).
+		inline int    Rows  () const { return rows; }
+		inline int    Cols  () const { return cols; }
+		inline float *Array () const { return elem; } // Raw storage; writable even through a const Matrix.
+
+	private:
+		int    rows; // Number of rows in the matrix.
+		int    cols; // Number of columns in the matrix.
+		float *elem; // Pointer to the actual data; element (i,j) is elem[ i * cols + j ].
+	};
+
+
+	extern Vector  operator *  ( const Matrix &, const Vector & );
+	extern Vector  operator *  ( const Vector &, const Matrix & );  // LeastSquares uses b * A for Transp(A) * b.
+	extern Vector& operator *= (       Vector &, const Matrix & );
+	extern Matrix  Outer       ( const Vector &, const Vector & );  // Outer product.
+	extern Matrix  operator +  ( const Matrix &, const Matrix & );
+	extern Matrix  operator -  ( const Matrix &                 );
+	extern Matrix  operator -  ( const Matrix &, const Matrix & );
+	extern Matrix  operator *  ( const Matrix &, const Matrix & );
+	extern Matrix  operator *  ( const Matrix &,       float    );
+	extern Matrix  operator *  (       float  ,  const Matrix & );
+	extern Matrix  operator /  ( const Matrix &,       float    );
+	extern Matrix& operator += (       Matrix &, const Matrix & );
+	extern Matrix& operator *= (       Matrix &,       float    );
+	extern Matrix& operator *= (       Matrix &, const Matrix & );
+	extern Matrix& operator /= (       Matrix &,       float    );
+	extern Matrix  Ident       (       int    n );
+	extern Matrix  Householder ( const Vector & );
+	extern Matrix  Rotation    ( const Vector &Axis, float angle );
+	extern Matrix  Rotation    ( const Vector &Axis, const Vector &Origin, float angle );
+	extern Matrix  Rotation    (       float, float, float ); // For random 3D rotations.
+	extern Matrix  Xrotation   (       float    );
+	extern Matrix  Yrotation   (       float    );
+	extern Matrix  Zrotation   (       float    );
+	extern Matrix  Diag        ( const Vector & );
+	extern Vector  Diag        ( const Matrix & );
+	extern Matrix  Diag        ( float, float, float );
+	extern Matrix  Adjoint     ( const Matrix & );              // Same as below, determinant discarded.
+	extern Matrix  Adjoint     ( const Matrix &, double &det ); // Signed cofactors; det receives the determinant.
+	extern Matrix  AATransp    ( const Matrix & );
+	extern Matrix  ATranspA    ( const Matrix & );  // Left side of the normal equations in LeastSquares.
+	extern double  OneNorm     ( const Matrix & );
+	extern double  SupNorm     ( const Matrix & );
+	extern double  Determinant ( const Matrix & );  // Square matrices only; uses Householder transformations.
+	extern double  Trace       ( const Matrix & );  // Sum of the diagonal; square matrices only.
+	extern Matrix  Transp      ( const Matrix & );
+	extern Matrix  Inverse     ( const Matrix & );  // Built column by column with GaussElimination.
+	extern int     Null        ( const Matrix & );
+	extern int     Square      ( const Matrix & );  // Used as the precondition in the square-only routines.
+	extern Vector  ToVector    ( const Matrix & ); // Only for vector-shaped matrices.
+
+	enum pivot_type {  // Selects the pivoting mode passed to GaussElimination.
+		pivot_off,     // The default mode; presumably no exchanges -- confirm in Matrix.cpp.
+		pivot_partial, // NOTE(review): presumably row exchanges only -- confirm.
+		pivot_total    // NOTE(review): presumably row and column exchanges -- confirm.
+	};
+
+	extern int GaussElimination( 
+		const Matrix &A, 
+		const Vector &b, // This is the right-hand side.
+		Vector       &x, // This is the vector we are solving for.
+		pivot_type = pivot_off // Pivoting is off unless the caller asks for it.
+		);
+
+	extern int LeastSquares( // Solves Transp(A) A x = Transp(A) b via GaussElimination.
+		const Matrix &A, 
+		const Vector &b, 
+		Vector       &x
+		);
+
+	extern int WeightedLeastSquares( 
+		const Matrix &A, 
+		const Vector &b, 
+		const Vector &w, // NOTE(review): presumably one weight per equation -- confirm.
+		Vector       &x 
+		);
+
+	std::ostream &operator<<( 
+		std::ostream &out, 
+		const Matrix &
+		);
+};
+
+#endif
diff --git a/src/nvtt/bc6h/arvo/Perm.cpp b/src/nvtt/bc6h/arvo/Perm.cpp
new file mode 100755
index 0000000..87e98e3
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Perm.cpp
@@ -0,0 +1,503 @@
+/***************************************************************************
+* Perm.C                                                                   *
+*                                                                          *
+* This file defines permutation class: that is, a class for creating and   *
+* manipulating finite sequences of distinct integers.  The main feature    *
+* of the class is the "++" operator that can be used to step through all   *
+* N! permutations of a sequence of N integers.  As the set of permutations *
+* forms a multiplicative group, a multiplication operator and an           *
+* exponentiation operator are also defined.                                *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    07/01/93    Added the Partition class.                      *
+*      arvo    03/23/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <stdio.h>
+#include <string.h>
+#include "Perm.h"
+#include "ArvoMath.h"
+#include "Char.h"
+
+namespace ArvoMath {
+
+	/***************************************************************************
+	*                                                              
+	*  L O C A L   F U N C T I O N S
+	*
+	***************************************************************************/
+
+	static void Reverse( int *p, int n ) // Reverses p[0..n-1] in place.
+	{
+		// Exchange mirrored pairs, working inward from both ends.
+		for( int lo = 0, hi = n - 1; lo < hi; lo++, hi-- )
+			Swap( p[lo], p[hi] );
+	}
+
+	static void Error( char *msg )
+	{
+		fprintf( stderr, "ERROR: Perm, %s.\n", msg );
+	}
+
+	/***************************************************************************
+	**
+	**  M E M B E R   F U N C T I O N S
+	**
+	***************************************************************************/
+
+	Perm::Perm( int Left, int Right ) // Bounds may come in either order.
+	{
+		if( Left < Right ) { a = Left;  b = Right; }
+		else               { a = Right; b = Left;  }
+		p = new int[ Size() ];
+		Reset( *this ); // Fill with Min(), Min()+1, ... in order.
+	}
+
+	Perm::Perm( const Perm &Q ) // Deep copy: same bounds, separately allocated entries.
+	{
+		const int count = Q.Size();
+		p = new int[ count ];
+		a = Q.Min();  b = Q.Max();
+		for( int i = 0; i < count; i++ ) p[i] = Q[i];
+	}
+
+	Perm::Perm( const char *str ) : p( NULL ) // operator= tests and deletes p, so start it null.
+	{
+		(*this) = str; // All the parsing is done by operator=( const char * ).
+	}
+
+	Perm &Perm::operator=( const char *str )
+	{
+		// Replaces the contents with the runs of decimal digits found in str
+		// (any non-digit separates numbers).  On malformed input an error is
+		// printed and the object is left holding whatever was parsed so far.
+		// If str holds no digits at all, a and b keep their previous values.
+		int  k, m = 0, n = 0;
+		char dig[10];   // Room for 9 digits plus the terminator.
+		char c;
+		if( p != NULL ) delete[] p;
+		p = new int[ strlen(str)/2 + 1 ];  // Separators cap the count at len/2 + 1.
+		for(;;)
+		{
+			c = *str++;
+			if( isDigit(c) && m == 9 ) // A tenth digit would overrun dig.
+				{ Error( "number in string has too many digits" ); return *this; }
+			if( isDigit(c) ) dig[m++] = c;
+			else if( m > 0 ) // A run of digits just ended; convert and store it.
+			{
+				dig[m] = NullChar;
+				sscanf( dig, "%d", &k );
+				if( n == 0 ) a = k; else if( k < a ) a = k;
+				if( n == 0 ) b = k; else if( k > b ) b = k;
+				p[n++] = k;
+				m = 0;
+			}
+			if( c == NullChar ) break;
+		}
+		for( int i = 0; i < n; i++ ) // Each of a, a+1, ..., a+n-1 must be present.
+		{
+			int okay = 0;
+			for( int j = 0; j < n && !okay; j++ ) okay = ( p[j] == i + a );
+			if( !okay ) { Error( "string is not a valid permutation" ); return *this; }
+		}
+		return *this;
+	}
+
+	void Perm::Get( char *str ) const // Writes every entry followed by one space.
+	{
+		char *out = str;   // The buffer size is not checked; the caller must supply enough room.
+		for( int i = 0; i < Size(); i++ ) out += sprintf( out, "%d ", p[i] );
+		*out = NullChar;
+	}
+
+	int Perm::Next()
+	{
+		// Advances to the following permutation in an ordering that ends with
+		// the fully descending arrangement.  Returns 1 on success, or 0 when
+		// the entries are already descending, in which case nothing is
+		// modified.
+
+		// Find the first position whose entry is larger than the one before it.
+
+		const int len = Size();
+		int rise = 1;
+		while( rise < len && p[rise] <= p[rise-1] ) rise++;
+		if( rise >= len ) return 0;
+
+		// Everything before 'rise' is descending.  Of those entries, choose the
+		// largest one that is still smaller than p[rise]; p[rise-1] qualifies,
+		// so it serves as the starting candidate.
+
+		const int target = p[rise];
+		int pick = rise - 1;
+		for( int i = 0; i < rise - 1; i++ )
+		{
+			if( p[i] < target && p[i] > p[pick] ) pick = i;
+		}
+
+		Swap( p[pick], p[rise] ); // The first 'rise' entries remain descending.
+		Reverse( p, rise );       // Flip them so that they ascend.
+		return 1;
+	}
+
+	int Perm::Prev()
+	{
+		// Steps back to the preceding permutation; the mirror image of Next().
+		// Returns 1 on success, or 0 when the entries are already ascending,
+		// in which case nothing is modified.
+
+
+		// Find the first position whose entry is smaller than the one before it.
+
+		const int len = Size();
+		int drop = 1;
+		while( drop < len && p[drop] >= p[drop-1] ) drop++;
+		if( drop >= len ) return 0;
+
+		// Everything before 'drop' is ascending.  Of those entries, choose the
+		// smallest one that is still larger than p[drop]; p[drop-1] qualifies,
+		// so it serves as the starting candidate.
+
+		const int target = p[drop];
+		int pick = drop - 1;
+		for( int i = 0; i < drop - 1; i++ )
+		{
+			if( p[i] > target && p[i] < p[pick] ) pick = i;
+		}
+
+		Swap( p[pick], p[drop] ); // The first 'drop' entries remain ascending.
+		Reverse( p, drop );       // Flip them so that they descend.
+		return 1;
+	}
+
+
+	/***************************************************************************
+	**
+	**  O P E R A T O R S
+	**
+	***************************************************************************/
+
+	int Perm::operator++() // Prefix ++: one step forward; forwards Next()'s result.
+	{
+		return Next(); // 1 if the permutation advanced, 0 if it was already the last.
+	}
+
+	int Perm::operator--() // Prefix --: one step backward; forwards Prev()'s result.
+	{
+		return Prev(); // 1 if the permutation moved back, 0 if it was already the first.
+	}
+
+	Perm &Perm::operator+=( int n ) // Step forward n times (backward when n < 0).
+	{
+		int steps = n;  // Stepping stops early as soon as Next()/Prev() reports 0.
+		while( steps > 0 && Next() ) steps--;
+		while( steps < 0 && Prev() ) steps++;
+		return *this;
+	}
+
+	Perm &Perm::operator-=( int n ) // Step backward n times (forward when n < 0).
+	{
+		int steps = n;  // Stepping stops early as soon as Prev()/Next() reports 0.
+		while( steps > 0 && Prev() ) steps--;
+		while( steps < 0 && Next() ) steps++;
+		return *this;
+	}
+
+	int Perm::operator[]( int n ) const // Entry at zero-based position n.
+	{
+		if( 0 <= n && n < Size() ) return p[ n ];
+
+		// Out of range: report it and hand back 0 as a placeholder.
+
+		Error( "permutation index[] out of range" );
+		return 0;
+	}
+
+	int Perm::operator()( int n ) const // Entry for the value n, Min() <= n <= Max().
+	{
+		if( Min() <= n && n <= Max() ) return p[ n - Min() ];
+
+		// Out of range: report it and hand back 0 as a placeholder.
+
+		Error( "permutation index() out of range" );
+		return 0;
+	}
+
+	Perm &Perm::operator=( const Perm &Q )
+	{
+		const int count = Q.Size();
+		if( Size() != count )  // Keep the current storage when the size is unchanged.
+		{
+			delete[] p;
+			p = new int[ count ];
+		}
+		a = Q.Min();  b = Q.Max();
+		for( int i = 0; i < count; i++ ) p[i] = Q[i];
+		return *this;
+	}
+
+	Perm Perm::operator*( const Perm &Q ) const // Composition: entry i is P applied to Q[i].
+	{
+		// Operands over different ranges are not composed; Perm(0) comes back.
+		if( Min() != Q.Min() || Max() != Q.Max() ) return Perm(0);
+		Perm Product( Min(), Max() );
+		for( int i = 0; i < Size(); i++ ) Product.Elem(i) = p[ Q[i] - Min() ];
+		return Product;
+	}
+
+	Perm Perm::operator^( int n ) const
+	{
+		// Group exponentiation: P^0 is the identity, and a negative exponent
+		// raises the inverse of P to the power |n|.
+		Perm A( Min(), Max() );  // The constructor leaves A as the identity.
+		if( n == 0 ) return A;
+		Perm Base( *this );      // The permutation that is applied repeatedly.
+		if( n < 0 )              // Overwrite Base with the inverse of P.
+			for( int i = 0; i < Size(); i++ ) Base.Elem( p[i] - Min() ) = i + Min();
+		const int pn = ( n < 0 ) ? -n : n;
+		for( int i = 0; i < Size(); i++ )
+		{
+			int k = Base[i];
+			for( int j = 1; j < pn; j++ ) k = Base[ k - Min() ];
+			A.Elem(i) = k;
+		}
+		return A;
+	}
+
+	Perm &Perm::operator()( int i, int j ) // Exchange the entries stored for values i and j.
+	{
+		Swap( p[ i - Min() ], p[ j - Min() ] ); // Offsets from Min(); neither index is range-checked.
+		return *this;
+	}
+
+	int Perm::operator==( const Perm &Q ) const // 1 if the ranges and all entries agree.
+	{
+		// Different ranges can never compare equal.
+		if( Min() != Q.Min() || Max() != Q.Max() ) return 0;
+		int i = 0;
+		while( i < Size() && p[i] == Q[i] ) i++;
+		return i >= Size();
+	}
+
+	int Perm::operator<=( const Perm &Q ) const // Lexicographic <= over the entries.
+	{
+		// Permutations over different ranges are never ordered.
+		if( Min() != Q.Min() || Max() != Q.Max() ) return 0;
+		int i = 0;
+		while( i < Size() && p[i] == Q[i] ) i++;
+		return ( i < Size() ) ? ( p[i] < Q[i] ) : 1;
+	}
+
+	void Reset( Perm &P ) // Restore the identity: position i holds Min() + i.
+	{
+		for( int i = 0, value = P.Min(); i < P.Size(); i++, value++ ) P.Elem(i) = value;
+	}
+
+	int End( const Perm &P ) // 1 when no entry exceeds the one before it.
+	{
+		// That is exactly the state in which Next() reports 0.
+		int prev = P[0];
+		for( int i = 1; i < P.Size(); prev = P[i++] )
+		{
+			if( prev < P[i] ) return 0;
+		}
+		return 1;
+	}
+
+	void Print( const Perm &P ) // Entries on one line, separated by single spaces.
+	{
+		if( P.Size() <= 0 ) return;  // Nothing at all is printed when Size() <= 0.
+		for( int i = 0; i < P.Size(); i++ )
+		{
+			printf( ( i == 0 ) ? "%d" : " %d", P[i] );
+		}
+		printf( "\n" );
+	}
+
+	int Even( const Perm &P ) // Logical negation of Odd( P ).
+	{
+		return !Odd( P ); // 1 when Odd reports 0, and 0 otherwise.
+	}
+
+	int Odd( const Perm &P )
+	{
+		// Sort a scratch copy into the identity by exchanges.  Every exchange is
+		// one transposition, so each flips the parity once, however far apart.
+		int count = 0;
+		Perm Q( P );
+		for( int i = P.Min(); i < P.Max(); i++ )
+		{
+			if( Q(i) == i ) continue;  // Position i already holds i.
+			for( int j = P.Min(); j <= P.Max(); j++ )
+			{
+				if( j == i || Q(j) != i ) continue;
+				Q(i,j);   // Bring the value i home to position i.
+				count++;
+				break;
+			}
+		}
+		return count % 2;  // 1 for an odd permutation, 0 for an even one.
+	}
+
+
+	/***************************************************************************
+	**
+	**  P A R T I T I O N S
+	**
+	***************************************************************************/
+
+	Partition::Partition( ) // An empty partition: no bins, no balls, no storage.
+	{
+		balls = 0;
+		bins  = 0;
+		Bin   = NULL;
+	}
+
+	Partition::Partition( const Partition &Q ) // Deep copy of Q's bin counts.
+	{
+		const int count = Q.Bins();
+		Bin   = new int[ count ];
+		bins  = count;
+		balls = Q.Balls();
+		for( int i = 0; i < bins; i++ ) Bin[i] = Q[i];
+	}
+
+	Partition::Partition( int bins_, int balls_ ) // Every ball starts out in bin 0.
+	{
+		Bin   = new int[ bins_ ];
+		bins  = bins_;
+		balls = balls_;
+		Reset( *this );  // Bin[0] = balls, every other bin = 0.
+	}
+
+	void Partition::operator+=( int bin )  // Add a ball to this bin.
+	{
+		// A bad bin number is reported and then ignored; nothing is written.
+		if( bin < 0 || bin >= bins ) fprintf( stderr, "ERROR -- bin number out of range.\n" );
+		else { balls++; Bin[ bin ]++; }
+	}
+
+	int Partition::operator==( const Partition &P ) const  // Compare two partitions.
+	{
+		// Equal means: same ball total, same number of bins, and the same
+		// count in every bin.
+		if( Balls() != P.Balls() || Bins() != P.Bins() ) return 0;
+
+		int i = 0;
+		while( i < bins && Bin[i] == P[i] ) i++;
+		return i >= bins;
+	}
+
+	void Partition::operator=( int n )  // Set to the n'th configuration.
+	{
+		Reset( *this );  // Configuration 0: everything in bin 0.
+		for( int left = n; left > 0; left-- ) ++(*this);
+	}
+
+	int Partition::operator!=( const Partition &P ) const // Logical negation of operator==.
+	{
+		return !( *this == P ); // 1 when the two differ in balls, bins, or any bin count.
+	}
+
+	void Partition::operator=( const Partition &Q )
+	{
+		const int count = Q.Bins();
+		if( bins != count )  // Keep the current storage unless the bin count changes.
+		{
+			delete[] Bin;
+			Bin = new int[ count ];
+		}
+		bins = count;  balls = Q.Balls();
+		for( int i = 0; i < count; i++ ) Bin[i] = Q[i];
+	}
+
+	void Partition::Get( char *str ) const // Writes every bin count followed by one space.
+	{
+		char *out = str;   // The buffer size is not checked; the caller must supply enough room.
+		for( int i = 0; i < bins; i++ ) out += sprintf( out, "%d ", Bin[i] );
+		*out = NullChar;
+	}
+
+	int Partition::operator[]( int i ) const // Count in bin i; 0 when i is out of range.
+	{
+		const int valid = ( 0 <= i && i < bins );
+		return valid ? Bin[i] : 0;
+	}
+
+	long Partition::NumCombinations() const  // How many distinct configurations.
+	{
+		// Treat the k bins as k - 1 dividers mixed in among the n balls.  Were
+		// all n + k - 1 objects distinguishable there would be (n + k - 1)!
+		// arrangements; since the balls are interchangeable and so are the
+		// dividers, divide by n! (k - 1)!, which is the binomial coefficient
+		// ( n + k - 1, n ).  The result is rounded to the nearest integer.
+		// Two cases are answered directly: no balls gives 0, one bin gives 1
+		// (the zero-ball case takes precedence).
+		if( balls != 0 && bins != 1 )
+			return (long)floor( BinomialCoeff( balls + bins - 1, balls ) + 0.5 );
+		return ( balls == 0 ) ? 0 : 1;
+	}
+
+	/***************************************************************************
+	*  O P E R A T O R + +   (Next Partition)                                  *
+	*                                                                          *
+	*  Rearranges the n "balls" in k "bins" into the next configuration.       *
+	*  The first config is assumed to be all balls in the first bin -- i.e.    *
+	*  Bin[0].  All possible groupings are generated, each exactly once.  The  *
+	*  function returns 1 if successful, 0 if the last config has already been *
+	*  reached or if no move exists (fewer than two bins, or every bin empty). *
+	*  (Algorithm by Harold Zatz)                                              *
+	***************************************************************************/
+	int Partition::operator++()
+	{
+		if( bins < 2 ) return 0;  // There is no second bin to move a ball into.
+		if( Bin[0] > 0 )          // Shift one ball from bin 0 to bin 1.
+		{
+			Bin[1] += 1;
+			Bin[0] -= 1;
+			return 1;
+		}
+		// Bin 0 is empty: find the first occupied bin, never scanning past the end.
+		int i = 1;
+		while( i < bins && Bin[i] == 0 ) i++;
+		if( i >= bins - 1 ) return 0;  // Only the last bin is occupied, or none is.
+		Bin[i+1] += 1;
+		Bin[0] = Bin[i] - 1;
+		Bin[i] = 0;
+		return 1;
+	}
+
+	void Reset( Partition &P ) // Back to the first configuration: all balls in bin 0.
+	{
+		for( int i = P.Bins() - 1; i > 0; i-- ) P.Bin[i] = 0;
+		P.Bin[0] = P.Balls();
+	}
+
+	int End( const Partition &P ) // 1 once the last bin holds every ball.
+	{
+		return P[ P.Bins() - 1 ] == P.Balls(); // operator[] yields 0 for an out-of-range index.
+	}
+
+	void Print( const Partition &P ) // Bin counts on one line, separated by single spaces.
+	{
+		if( P.Bins() <= 0 ) return;  // Nothing at all is printed without bins.
+		for( int i = 0; i < P.Bins(); i++ )
+		{
+			printf( ( i == 0 ) ? "%d" : " %d", P[i] );
+		}
+		printf( "\n" );
+	}
+};
diff --git a/src/nvtt/bc6h/arvo/Perm.h b/src/nvtt/bc6h/arvo/Perm.h
new file mode 100755
index 0000000..2af4776
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Perm.h
@@ -0,0 +1,111 @@
+/***************************************************************************
+* Perm.h                                                                   *
+*                                                                          *
+* This file defines permutation class: that is, a class for creating and   *
+* manipulating finite sequences of distinct integers.  The main feature    *
+* of the class is the "++" operator that can be used to step through all   *
+* N! permutations of a sequence of N integers.  As the set of permutations *
+* forms a multiplicative group, a multiplication operator and an           *
+* exponentiation operator are also defined.                                *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    07/01/93    Added the Partition class.                      *
+*      arvo    03/23/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __PERM_INCLUDED__
+#define __PERM_INCLUDED__
+
+namespace ArvoMath {
+
+	class Perm {
+	public:
+		Perm( const Perm & );                   // Initialize from a permutation.
+		Perm( int a = 0, int b = 0 );           // Create permutation of ints a...b.
+		Perm( const char * );                   // Create from string of numbers.
+		~Perm() { delete[] p; }                 // Destructor; p is used as an array.
+		void  Get( char * ) const;              // Gets a string representation.
+		int   Size() const { return b - a + 1;} // The number of elements.
+		int   Min () const { return a; }        // The smallest value.
+		int   Max () const { return b; }        // The largest value.
+		int   operator++();                     // Make "next" permutation.
+		int   operator--();                     // Make "previous" permutation.
+		Perm &operator+=( int n );              // Advances by n permutations.
+		Perm &operator-=( int n );              // Decrement by n permutations.
+		Perm &operator =( const char * ) ;      // Resets from string of numbers.
+		Perm &operator =( const Perm & ) ;      // Copy from another permutation.
+		Perm &operator()( int i, int j ) ;      // Swap entries i and j.
+		int   operator()( int n        ) const; // Index from Min() to Max().
+		int   operator[]( int n        ) const; // Index from 0 to Size() - 1.
+		Perm  operator ^( int n        ) const; // Exponentiation: -1 means inverse.
+		Perm  operator *( const Perm & ) const; // Multiplication means composition.
+		int   operator==( const Perm & ) const; // True if all elements match.
+		int   operator<=( const Perm & ) const; // Lexicographic order relation.
+	private:
+		int& Elem( int i ) { return p[i]; }     // Unchecked access to element i.
+		int  Next();
+		int  Prev();
+		int  a, b;                              // Bounds reported by Min() and Max().
+		int  *p;    // Elements, indexed p[i]. NOTE(review): confirm allocation uses new[].
+		friend void Reset( Perm & );
+	};
+
+
+	// A "Partition" is a collection of k indistinguishable "balls" in n "bins".  
+	// The Partition class encapsulates this notion and provides a convenient means 
+	// of generating all possible partitions of k objects among n bins exactly once.  
+	// Starting with all objects in bin zero, the ++ operator creates new and distinct
+	// distributions among the bins until all objects are in the last bin.
+
+	class Partition {
+	public:
+		Partition( );                              // Creates a null partition.
+		Partition( const Partition & );            // Initialize from another partition.
+		Partition( int bins, int balls );          // Specify # of bins & balls.
+		~Partition() { delete[] Bin; }             // Destructor; Bin is used as an array.
+		void Get( char * ) const;                  // Gets a string representation.
+		int  Bins () const { return bins;  }       // The number of bins.
+		int  Balls() const { return balls; }       // The number of balls.
+		void operator+=( int bin );                // Add a ball to this bin.
+		void operator =( int n   );                // Set to the n'th configuration.
+		void operator =( const Partition& );       // Copy from another partition.
+		int  operator==( const Partition& ) const; // Compare two partitions.
+		int  operator!=( const Partition& ) const; // Compare two partitions.
+		int  operator++();                         // Make "next" partition.
+		int  operator[]( int i ) const;            // Return # of balls in bin i.
+		long NumCombinations() const;              // Number of distinct configurations.
+	private:
+		int  bins;                                 // Value returned by Bins().
+		int  balls;                                // Value returned by Balls().
+		int* Bin;   // Per-bin counts, indexed Bin[i]. NOTE(review): confirm allocation uses new[].
+		friend void Reset( Partition & );
+	};
+
+
+	// Predicates for determining when a permutation or partition is the last of
+	// the sequence, functions for printing, resetting, and miscellaneous operations.
+
+	int  End  ( const Partition & );  // True once all balls are in the last bin.
+	int  End  ( const Perm      & );  // True if descending.
+	int  Even ( const Perm      & );  // True if even # of 2-cycles.
+	int  Odd  ( const Perm      & );  // True if odd # of 2-cycles.
+	void Print( const Partition & );  // Write the bin counts to standard out.
+	void Print( const Perm      & );  // Write to standard out.
+	void Reset(       Partition & );  // Put all balls back in bin 0.
+	void Reset(       Perm      & );  // Reset to ascending order.
+};
+#endif
diff --git a/src/nvtt/bc6h/arvo/Rand.cpp b/src/nvtt/bc6h/arvo/Rand.cpp
new file mode 100755
index 0000000..5f3025b
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Rand.cpp
@@ -0,0 +1,230 @@
+/***************************************************************************
+* Rand.C  (Random Number Generators)                                       *
+*                                                                          *
+* Source file for pseudo-random number utilities.  Rand is the             *
+* base class for several different algorithms for generating pseudo-random *
+* numbers.  Any method can generate individual samples or arrays of        *
+* samples using "Eval".  The random seed can be reset at any time by       *
+* calling "Seed" with any integer.  Random permutations of the integers    *
+* 0,1,...(n-1) are generated by "Perm(n,P)".                               *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    08/04/97    Changed to virtual functions.                   *
+*      arvo    06/06/93    Optimization, especially for array evaluators.  *
+*      arvo    10/06/91    Converted to C++                                *
+*      arvo    11/20/89    Added "gen_seed" function to handle.            *
+*      arvo    10/30/89    "state" allocation now done in rand_alloc.      *
+*      arvo    07/08/89    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1989, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <stdio.h>
+#include <math.h>
+#include "Rand.h"
+
+namespace ArvoMath {
+#ifndef ABS
+#define ABS( x ) ((x) > 0 ? (x) : -(x))
+#endif
+
+	/*-------------------------------------------------------------------------*
+	* M E T H O D 1                                                           *
+	*                                                                         *
+	* From "Numerical Recipes," by William H. Press, Brian P. Flannery,       *
+	* Saul A. Teukolsky, and William T. Vetterling, p. 197.                   *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	static const long   M1 = 714025;      // Modulus of the congruential update.
+	static const long   IA =   1366;      // Multiplier of the congruential update.
+	static const long   IC = 150889;      // Increment of the congruential update.
+	static const double RM = 1.400512E-6; // Scales a table value to a float; about 1/M1.
+
+	float RandGen_1::Eval()
+	{
+		// Choose a shuffle-table slot in 1..97 from the previously returned value.
+		long slot = 1 + ( 97 * index ) / M1;
+		if( slot > 97 ) slot = 97;
+		if( slot <  1 ) slot =  1;
+		// Hand out the value stored there, then refill the slot from the generator.
+		index = shuffle[ slot ];
+		const float sample = index * RM;
+		seed = ( IA * seed + IC ) % M1;
+		shuffle[ slot ] = seed;
+		return sample;
+	}
+
+	void RandGen_1::Eval( int n, float *array )
+	{
+		for( int k = 0; k < n; k++ )
+		{
+			// Choose a table slot in 1..97 from the previously produced value.
+			long slot = 1 + ( 97 * index ) / M1;
+			if( slot > 97 ) slot = 97;
+			if( slot <  1 ) slot =  1;
+			// Emit the stored value, then refill the slot from the generator.
+			index    = shuffle[ slot ];
+			array[k] = index * RM;
+			seed     = ( IA * seed + IC ) % M1;
+			shuffle[ slot ] = seed;
+		}
+	}
+
+	void RandGen_1::Seed( long seed )
+	{
+		long t = ( IC + ABS( seed % M1 ) + 1 ) % M1;  // Reduce first: no overflow.
+		for( int k = 1; k <= 97; k++ )                // Slot 0 is never filled or read.
+		{
+			t = ( IA * t + IC ) % M1;
+			shuffle[k] = ABS( t );
+		}
+		t = ( IA * t + IC ) % M1;
+		// The parameter hides the member of the same name, so name it via this.
+		this->seed = index = ABS( t );
+	}
+
+	/*-------------------------------------------------------------------------*
+	* M E T H O D 2                                                           *
+	*                                                                         *
+	* From "The Multiple Prime Random Number Generator," by Alexander Haas,   *
+	* ACM Transactions on Mathematical Software, Vol. 13, No. 4, December     *
+	* 1987, pp. 368-381.                                                      *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	float RandGen_2::Eval()
+	{
+		m += 7;     if( m >=   9973 ) m -=  9871;  // Step the three counters,
+		i += 1907;  if( i >=  99991 ) i -= 89989;  // pulling each one back when
+		j += 73939; if( j >= 224729 ) j -= 96233;  // it reaches its own bound.
+		r = ( ( r * m + i + j ) % 100000 ) / 10;   // Keep five digits, drop the lowest.
+		return r * 1.00010001E-4;
+	}
+
+	void RandGen_2::Eval( int n, float *array )
+	{
+		for( int s = 0; s < n; s++ )
+		{
+			m += 7;     if( m >=   9973 ) m -=  9871;  // Step the three counters,
+			i += 1907;  if( i >=  99991 ) i -= 89989;  // pulling each one back when
+			j += 73939; if( j >= 224729 ) j -= 96233;  // it reaches its own bound.
+			r = ( ( r * m + i + j ) % 100000 ) / 10;
+			array[s] = r * 1.00010001E-4;
+		}
+	}
+
+	void RandGen_2::Seed( long seed )
+	{
+		// Take the state from |seed| and raise the three counters to at least
+		// 100, 10000 and 128000.  NOTE(review): nothing here bounds the state
+		// from above -- confirm that callers only pass small seeds.
+		r = ABS( seed      );
+		m = ABS( seed *  7 );  if( m < 100    ) m += 100;
+		i = ABS( seed * 11 );  if( i < 10000  ) i += 10000;
+		j = ABS( seed * 13 );  if( j < 128000 ) j += 128000;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* M E T H O D 3                                                           *
+	*                                                                         *
+	* From "A More Portable Fortran Random Number Generator," by Linus        *
+	* Schrage, ACM Transactions on Mathematical Software, Vol. 5, No. 2,      *
+	* June 1979, pp. 132-138.                                                 *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	static const long A3 = 16807;       // Multiplier, 7^5.
+	static const long P3 = 2147483647;  // Modulus, 2^31 - 1.
+
+	float RandGen_3::Eval()
+	{
+		long xhi    = ix >> 16;                  // Bits of ix above the low 16.
+		long xalo   = ( ix & 0xFFFF ) * A3;      // Low 16 bits of ix, times A3.
+		long leftlo = xalo >> 16;                // Part of xalo above its low 16 bits.
+		long fhi    = xhi * A3 + leftlo;         // A3 * ix == fhi * 2^16 + low 16 bits of xalo.
+		long k      = fhi >> 15;                 // Part of the product at 2^31 and above.
+		ix          = ( ((xalo - (leftlo << 16)) - P3) +   // Low 31 bits of the product,
+			((fhi - (k << 15)) << 16) ) + k;               // plus k, minus P3.
+		if( ix < 0 ) ix += P3;                   // Undo the subtraction if it went negative.
+		return ix * 4.656612875E-10;             // The scale factor is about 1/P3.
+	}
+
+	void RandGen_3::Eval( int n, float *array )
+	{
+		for( int s = 0; s < n; s++ )
+		{
+			// One step ix <- A3 * ix (mod P3), assembled from 16-bit pieces.
+			const long hi    = ix >> 16;
+			const long lo    = ( ix & 0xFFFF ) * A3;
+			const long carry = lo >> 16;
+			const long upper = hi * A3 + carry;
+			const long wrap  = upper >> 15;
+			const long low16 = lo - ( carry << 16 );
+			const long mid15 = upper - ( wrap << 15 );
+			ix = ( ( low16 - P3 ) + ( mid15 << 16 ) ) + wrap;
+			if( ix < 0 ) ix += P3;
+			array[s] = ix * 4.656612875E-10;
+		}
+	}
+
+	void RandGen_3::Seed( long seed )
+	{
+		ix = ( seed % P3 == 0 ) ? 1 : ABS( seed % P3 );  // Never 0: Eval maps 0 to 0.
+	}
+
+	/*-------------------------------------------------------------------------*
+	* R A N D : : P E R M        (Permutation)                                *
+	*                                                                         *
+	* This routine fills an integer array of length "len" with a random       *
+	* permutation of the integers 0, 1, 2, ... (len-1).                       *
+	*                                                                         *
+	* For efficiency, the random numbers are generated in batches of up to    *
+	* "Nmax" at a time.  The constant Nmax can be set to any value >= 1.      *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	static const int Nmax = 20;       // Largest batch requested from Eval.
+
+	void RandGen::Perm( int len, int perm[] )
+	{
+		float batch[ Nmax ];          // Holds the current batch of random numbers.
+		int   pending = len - 1;      // Random numbers not yet requested.
+		int   filled  = 0;            // Size of the current batch.
+		int   next    = 0;            // Next unused entry of the batch.
+
+		// Begin with the identity permutation.
+		for( int j = 0; j < len; j++ ) perm[j] = j;
+
+		// Visit the elements from the back to the front.  Each element i may
+		// trade places with an element k < i picked by the next random number;
+		// once i reaches 0 there is nothing left to exchange.
+		for( int i = len - 1; i > 0; i-- )
+		{
+			if( next == filled )      // Batch used up: fetch another one.
+			{
+				filled   = ( pending < Nmax ) ? pending : Nmax;
+				Eval( filled, batch );
+				pending -= filled;
+				next     = 0;
+			}
+
+			// Scale the random number by i+1 and truncate it to an index.
+			float r = ( i + 1 ) * batch[ next++ ];
+			int   k = (int)r;
+			if( k >= i ) continue;    // k == i or k == i+1: element i stays put.
+
+			int held = perm[i];       // Exchange elements i and k.
+			perm[i]  = perm[k];
+			perm[k]  = held;
+		}
+	}
+};
diff --git a/src/nvtt/bc6h/arvo/Rand.h b/src/nvtt/bc6h/arvo/Rand.h
new file mode 100755
index 0000000..a8ef5d9
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Rand.h
@@ -0,0 +1,114 @@
+/***************************************************************************
+* Rand.h  (Random Number Generators)                                       *
+*                                                                          *
+* Header file for Rand.C, pseudo-random number utilities.  Rand is the     *
+* base class for several different algorithms for generating pseudo-random *
+* numbers.  Any method can generate individual samples or arrays of        *
+* samples using "Eval".  The random seed can be reset at any time by       *
+* calling "Seed" with any integer.  Random permutations of the integers    *
+* 0,1,...(n-1) are generated by "Perm(n,P)".                               *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    08/04/97    Changed to virtual functions.                   *
+*      arvo    06/06/93    Optimization, especially for array evaluators.  *
+*      arvo    10/06/91    Converted to C++                                *
+*      arvo    11/20/89    Added "gen_seed" function to handle.            *
+*      arvo    10/30/89    "state" allocation now done in rand_alloc.      *
+*      arvo    07/08/89    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1989, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __RAND_INCLUDED__
+#define __RAND_INCLUDED__
+
+namespace ArvoMath {
+
+	// Base class for random number generators.  This class contains
+	// several pure virtual functions, so it cannot be instanced directly.
+
+	class RandGen {
+	public:
+		RandGen() {}
+		virtual ~RandGen() {}                       // Allows delete through a RandGen pointer.
+		virtual float Eval(                  ) = 0; // Returns one sample.
+		virtual void  Eval( int n, float x[] ) = 0; // Writes n samples to x.
+		virtual void  Seed( long seed        ) = 0; // Resets the generator state.
+		void  Perm( int n, int P[] );               // Random permutation of 0..n-1 in P.
+		float Interval( float a, float b );         // One sample scaled between a and b.
+		void  Eval( float &x ) { x = Eval(); }      // One sample, returned through x.
+	};
+
+
+	// Method 1: From "Numerical Recipes," by William H. Press, Brian P. 
+	// Flannery, Saul A. Teukolsky, and William T. Vetterling, p. 197.
+
+	class RandGen_1 : public RandGen {
+	public:
+		RandGen_1( long seed = 1 ) { Seed( seed ); } // Seeds with 1 unless told otherwise.
+		using RandGen::Eval;                          // Keeps Eval( float & ) callable here.
+		virtual float Eval(                  );
+		virtual void  Eval( int n, float x[] );
+		virtual void  Seed( long seed        );
+	private:
+		long index;          // Last value handed out; selects the next table slot.
+		long seed;           // State of the congruential generator.
+		long shuffle[ 98 ];  // Shuffle table; Seed and Eval use slots 1..97 only.
+	};
+
+
+	// Method 2: From "The Multiple Prime Random Number Generator," by 
+	// Alexander Haas, ACM Transactions on Mathematical Software, 
+	// Vol. 13, No. 4, December 1987, pp. 368-381.
+
+	class RandGen_2 : public RandGen {
+	public:
+		RandGen_2( long seed = 1 ) { Seed( seed ); } // Seeds with 1 unless told otherwise.
+		using RandGen::Eval;                          // Keeps Eval( float & ) callable here.
+		virtual float Eval(                  );
+		virtual void  Eval( int n, float x[] );
+		virtual void  Seed( long seed        );
+	private:
+		long r;  // Current value; Eval returns it scaled to a float.
+		long m;  // Counter stepped by 7 in Eval.
+		long i;  // Counter stepped by 1907 in Eval.
+		long j;  // Counter stepped by 73939 in Eval.
+	};
+
+
+	// Method 3: From "A More Portable Fortran Random Number Generator," 
+	// by Linus Schrage, ACM Transactions on Mathematical Software, 
+	// Vol. 5, No. 2, June 1979, pp. 132-138.
+
+	class RandGen_3 : public RandGen {
+	public:
+		RandGen_3( long seed = 1 ) { Seed( seed ); } // Seeds with 1 unless told otherwise.
+		using RandGen::Eval;                          // Keeps Eval( float & ) callable here.
+		virtual float Eval(                  );
+		virtual void  Eval( int n, float x[] );
+		virtual void  Seed( long seed        );
+	private:
+		long ix;  // Generator state; Eval returns it scaled to a float.
+	};
+
+
+	inline float RandGen::Interval( float a, float b )
+	{
+		const float lo = ( a < b ) ? a : b;   // The endpoint the sample starts from.
+		const float hi = ( a < b ) ? b : a;   // The endpoint it moves towards.
+		return lo + Eval() * ( hi - lo );
+	}
+};
+#endif
diff --git a/src/nvtt/bc6h/arvo/SI_units.h b/src/nvtt/bc6h/arvo/SI_units.h
new file mode 100755
index 0000000..69cc8cc
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/SI_units.h
@@ -0,0 +1,232 @@
+/*****************************************************************************
+** 
+**   MODULE NAME  SI_units.h       International System of Units (SI)
+**
+**   DESCRIPTION
+**       The purpose of this header file is to provide a simple and efficient
+**       mechanism for associating physically meaningful units with floating
+**       point numbers.  No extra space is required, and no runtime overhead
+**       is introduced; all type-checking occurs at compile time.
+**
+**
+**   HISTORY
+**      Name	Date	    Description
+**
+**      arvo    02/09/92    Replaced conversion macros with inline functions.
+**      arvo    10/16/91    Initial implementation.
+**
+**
+**   (c) Copyright 1991, 1992
+**       Program of Computer Graphics, Cornell University, Ithaca, NY
+**       ALL RIGHTS RESERVED
+**
+*****************************************************************************/
+
+#ifndef SI_UNITS_H
+#define SI_UNITS_H
+
+#include <iostream.h>
+
+namespace ArvoMath {
+
+	// Decimal scale factors named after the SI prefixes.
+	const float SI_deci  = 1.0E-1;
+	const float SI_centi = 1.0E-2;
+	const float SI_milli = 1.0E-3;
+	const float SI_micro = 1.0E-6;
+	const float SI_nano  = 1.0E-9;
+	const float SI_kilo  = 1.0E+3;
+	const float SI_mega  = 1.0E+6;
+	const float SI_giga  = 1.0E+9;
+	const float SI_tera  = 1.0E+12;
+
+	/*******************************************************************************
+	*                                                                              *
+	*   I N T E R N A T I O N A L    S Y S T E M    O F    U N I T S               *
+	*                                                                              *
+	********************************************************************************
+	*                                                                              *
+	* DIMENSION           CLASS           INITIALIZER     SYMBOL   BASE UNITS      *
+	*                                                                              *
+	* length              SI_length        meter            m        m             *
+	* time                SI_time          second           s        s             *
+	* mass                SI_mass          kilogram         kg       kg            *
+	* angle               SI_angle         radian           rad      rad           *
+	* solid angle         SI_solid_angle   steradian        sr       sr            *
+	* temperature         SI_temperature   kelvin           K        K             *
+	* luminous intensity  SI_lum_inten     candela          cd       cd            *
+	* area                SI_area          meter2           m2       m2            *
+	* volume              SI_volume        meter3           m3       m3            *
+	* frequency           SI_frequency     hertz            Hz       1/s           *
+	* force               SI_force         newton           N        m kg/s2       *
+	* energy              SI_energy        joule            J        m2 kg/s2      *
+	* power               SI_power         watt             W        m2 kg/s3      *
+	* radiance            SI_radiance      watts_per_m2sr   W/m2sr   kg/(s3 sr)    *
+	* irradiance          SI_irradiance    watts_per_m2     W/m2     kg/s3         *
+	* radiant intensity   SI_rad_inten     watts_per_sr     W/sr     m2 kg/(s3 sr) *
+	* luminance           SI_luminance     candela_per_m2   cd/m2    cd/m2         *
+	* illuminance         SI_illuminance   lux              lx       cd sr/m2      *
+	* luminous flux       SI_lum_flux      lumen            lm       cd sr         *
+	* luminous energy     SI_lum_energy    talbot           tb       cd sr s       *
+	*                                                                              *
+	*******************************************************************************/
+
+	class SI_dimensionless {
+	public:
+		float Value() const { return value; }
+		// Writes the value, one space and the symbol a (may be a string literal) to s.
+		ostream& Put( ostream &s, const char *a ) const { return s << value << " " << a; }
+	protected:
+		SI_dimensionless( float x = 0 ) { value = x; }  // Zero unless a value is given.
+		float value;
+	};
+
+	/*******************************************************************************
+	* SI_Make( C, Initializer, Symbol ) defines a quantity struct C derived from   *
+	* SI_dimensionless, plus operator<< (prints the value and Symbol), the         *
+	* function Initializer( float ) and float * C scaling.  Assignment operators   *
+	* return a copy, not a reference.  Not intended for use outside this file.     *
+	*******************************************************************************/
+
+#define SI_Make( C, Initializer, Symbol )                                  \
+	struct C : SI_dimensionless {                                          \
+	C                 (         ) : SI_dimensionless(   ) {};          \
+	C                 ( float x ) : SI_dimensionless( x ) {};          \
+	C     operator *  ( float x ) { return C( value *  x         ); }  \
+	C     operator /  ( float x ) { return C( value /  x         ); }  \
+	C     operator /= ( float x ) { return C( value /= x         ); }  \
+	C     operator *= ( float x ) { return C( value *= x         ); }  \
+	C     operator +  ( C     x ) { return C( value +  x.Value() ); }  \
+	C     operator -  (         ) { return C(-value              ); }  \
+	C     operator -  ( C     x ) { return C( value -  x.Value() ); }  \
+	C     operator += ( C     x ) { return C( value += x.Value() ); }  \
+	C     operator -= ( C     x ) { return C( value -= x.Value() ); }  \
+	C     operator =  ( C     x ) { return C( value =  x.Value() ); }  \
+	int   operator >  ( C     x ) { return  ( value >  x.Value() ); }  \
+	int   operator <  ( C     x ) { return  ( value <  x.Value() ); }  \
+	int   operator >= ( C     x ) { return  ( value >= x.Value() ); }  \
+	int   operator <= ( C     x ) { return  ( value <= x.Value() ); }  \
+	float operator /  ( C     x ) { return  ( value /  x.Value() ); }  \
+	};                                                                 \
+	inline ostream& operator<<(ostream &s, C x) {return x.Put(s,Symbol);}  \
+	inline C Initializer( float x      )   { return C( x );             }  \
+	inline C operator * ( float x, C y )   { return C( x * y.Value() ); }
+
+	/*******************************************************************************
+	* These macros define the permitted arithmetic between different quantities:   *
+	* SI_Square(A,B): A*A=B, B/A=A.  SI_Times(A,B,C): A*B=B*A=C, C/B=A, C/A=B.     *
+	* SI_Recip(A,B): f/A=B, f/B=A, A*B=B*A=float, where f is a plain float.        *
+	*******************************************************************************/
+
+#define SI_Square( A, B )                                                  \
+	inline B operator*( A x, A y ) { return B( x.Value() * y.Value() ); }  \
+	inline A operator/( B x, A y ) { return A( x.Value() / y.Value() ); }
+
+#define SI_Recip( A, B )                                                   \
+	inline B operator/( float x, A y ) { return B( x / y.Value() ); }      \
+	inline A operator/( float x, B y ) { return A( x / y.Value() ); }      \
+	inline float operator*( A x, B y ) { return x.Value() * y.Value(); }   \
+	inline float operator*( B x, A y ) { return x.Value() * y.Value(); }
+
+#define SI_Times( A, B, C )                                                \
+	inline C operator*( A x, B y ) { return C( x.Value() * y.Value() ); }  \
+	inline C operator*( B x, A y ) { return C( x.Value() * y.Value() ); }  \
+	inline A operator/( C x, B y ) { return A( x.Value() / y.Value() ); }  \
+	inline B operator/( C x, A y ) { return B( x.Value() / y.Value() ); }
+
+	/*******************************************************************************
+	* The following macros create classes for a variety of quantities.  These      *
+	* include base quantities such as "time" and "length" as well as derived       *
+	* quantities such as "power" and "volume".  Each quantity is provided with     *
+	* an initialization function in SI units and an abbreviation for printing.     *
+	*******************************************************************************/
+
+	SI_Make( SI_length         , meter           , "m"      ); // Base Units:
+	SI_Make( SI_mass           , kilogram        , "kg"     );
+	SI_Make( SI_time           , second          , "s"      );
+	SI_Make( SI_lum_inten      , candela         , "cd"     );
+	SI_Make( SI_temperature    , kelvin          , "K"      );
+	SI_Make( SI_angle          , radian          , "rad"    ); // Supplementary:
+	SI_Make( SI_solid_angle    , steradian       , "sr"     );
+	SI_Make( SI_area           , meter2          , "m2"     ); // Derived units:
+	SI_Make( SI_volume         , meter3          , "m3"     ); 
+	SI_Make( SI_frequency      , hertz           , "Hz"     ); 
+	SI_Make( SI_force          , newton          , "N"      );
+	SI_Make( SI_energy         , joule           , "J"      );
+	SI_Make( SI_power          , watt            , "W"      );
+	SI_Make( SI_radiance       , watts_per_m2sr  , "W/m2sr" );
+	SI_Make( SI_irradiance     , watts_per_m2    , "W/m2"   );
+	SI_Make( SI_rad_inten      , watts_per_sr    , "W/sr"   );
+	SI_Make( SI_luminance      , candela_per_m2  , "cd/m2"  );
+	SI_Make( SI_illuminance    , lux             , "lx"     );
+	SI_Make( SI_lum_flux       , lumen           , "lm"     );
+	SI_Make( SI_lum_energy     , talbot          , "tb"     );
+	SI_Make( SI_time2          , second2         , "s2"     ); // Intermediate: 
+	SI_Make( SI_sa_area        , meter2_sr       , "m2sr"   );
+	SI_Make( SI_inv_area       , inv_meter2      , "1/m2"   ); 
+	SI_Make( SI_inv_solid_angle, inv_steradian   , "1/sr"   );
+	SI_Make( SI_length_temp    , meters_kelvin   , "m K"    );
+	SI_Make( SI_power_area     , watts_m2        , "W m2"   );
+	SI_Make( SI_power_per_volume, watts_per_m3   , "W/m3"   );
+
+	SI_Square( SI_length       , SI_area            );
+	SI_Square( SI_time         , SI_time2           );
+	SI_Recip ( SI_time         , SI_frequency       );
+	SI_Recip ( SI_area         , SI_inv_area        );
+	SI_Recip ( SI_solid_angle  , SI_inv_solid_angle );
+
+	SI_Times( SI_area          , SI_length         , SI_volume      );
+	SI_Times( SI_force         , SI_length         , SI_energy      );
+	SI_Times( SI_power         , SI_time           , SI_energy      );
+	SI_Times( SI_lum_flux      , SI_time           , SI_lum_energy  );
+	SI_Times( SI_lum_inten     , SI_solid_angle    , SI_lum_flux    );
+	SI_Times( SI_radiance      , SI_solid_angle    , SI_irradiance  );
+	SI_Times( SI_rad_inten     , SI_solid_angle    , SI_power       );
+	SI_Times( SI_irradiance    , SI_area           , SI_power       );
+	SI_Times( SI_illuminance   , SI_area           , SI_lum_flux    );
+	SI_Times( SI_solid_angle   , SI_area           , SI_sa_area     );
+	SI_Times( SI_radiance      , SI_sa_area        , SI_power       );
+	SI_Times( SI_irradiance    , SI_inv_solid_angle, SI_radiance    );
+	SI_Times( SI_power         , SI_inv_solid_angle, SI_rad_inten   );
+	SI_Times( SI_length        , SI_temperature    , SI_length_temp );
+	SI_Times( SI_power         , SI_area           , SI_power_area  );
+
+	/*******************************************************************************
+	* Following are some useful non-SI units.  These units can be used in place of *
+	* the unit-initializers above.  Thus, a variable of type SI_length, for example*
+	* may be initialized in "meters", "inches", or "centimeters".  In all cases,   *
+	* however, the value is converted to the underlying SI unit (e.g. meters).     *
+	*******************************************************************************/
+
+#define SI_Convert( SI, New, Old ) inline SI New( float x ) { return x * Old; }
+
+	SI_Convert( SI_time        , minute     ,         second(     60.0 ) );
+	SI_Convert( SI_time        , hour       ,         minute(     60.0 ) );
+	SI_Convert( SI_force       , dyne       ,         newton(   1.0E-5 ) );
+	SI_Convert( SI_energy      , erg        ,          joule(   1.0E-7 ) );
+	SI_Convert( SI_power       , kilowatt   ,           watt(  SI_kilo ) );
+	SI_Convert( SI_mass        , gram       ,       kilogram( SI_milli ) );
+	SI_Convert( SI_length      , inch       ,          meter(  2.54E-2 ) );
+	SI_Convert( SI_length      , foot       ,           inch(     12.0 ) );
+	SI_Convert( SI_length      , centimeter ,          meter( SI_centi ) );
+	SI_Convert( SI_length      , micron     ,          meter( SI_micro ) );
+	SI_Convert( SI_length      , angstrom   ,          meter(  1.0E-10 ) );
+	SI_Convert( SI_area        , barn       ,         meter2(  1.0E-28 ) );
+	SI_Convert( SI_angle       , degree     ,     radian( 0.0174532925 ) ); // pi / 180.
+	SI_Convert( SI_illuminance , phot       ,            lux(   1.0E+4 ) );
+	SI_Convert( SI_illuminance , footcandle ,            lux(  10.7639 ) ); // 1 fc = 1 lm/ft2.
+	SI_Convert( SI_luminance   , stilb      , candela_per_m2(   1.0E+4 ) );
+
+	/*******************************************************************************
+	* Often there are multiple names for a single quantity.  Below are some        *
+	* synonyms for the quantities defined above.  These can be used in place of    *
+	* the original quantities and may be clearer in some contexts.                 *
+	*******************************************************************************/
+
+	typedef SI_power       SI_radiant_flux;
+	typedef SI_irradiance  SI_radiant_flux_density;
+	typedef SI_irradiance  SI_radiant_exitance;
+	typedef SI_radiance    SI_intensity;
+	typedef SI_irradiance  SI_radiosity;
+};
+#endif
\ No newline at end of file
diff --git a/src/nvtt/bc6h/arvo/SVD.cpp b/src/nvtt/bc6h/arvo/SVD.cpp
new file mode 100755
index 0000000..36f0ea6
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/SVD.cpp
@@ -0,0 +1,398 @@
+/***************************************************************************
+* SVD.C                                                                    *
+*                                                                          *
+* Singular Value Decomposition.                                            *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date          Description                                   *
+*                                                                          *
+*      arvo    08/22/2000    Copied to CIT library.                        *
+*      arvo    06/28/1993    Rewritten from "Numerical Recipes" C-code.    *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <math.h>
+#include <assert.h>
+#include "ArvoMath.h"
+#include "Vector.h"
+#include "Matrix.h"
+#include "SVD.h"
+
+namespace ArvoMath {
+	static const int MaxIterations = 30;  // Cap on the sweeps Decompose spends on each singular value.
+
+	static double svd_pythag( double a, double b )
+	{
+		// sqrt( a*a + b*b ), formed by scaling with the larger of the two magnitudes.
+		const double abs_a = Abs(a), abs_b = Abs(b);
+		if( abs_a > abs_b )
+			return abs_a * sqrt( 1.0 + Sqr( abs_b / abs_a ) );
+		if( abs_b > 0.0 )
+			return abs_b * sqrt( 1.0 + Sqr( abs_a / abs_b ) );
+		return 0.0;  // Neither magnitude is positive.
+	}
+
+	static inline double SameSign( double a, double b )
+	{
+		// The magnitude of a, made non-negative when b >= 0.0 and
+		// negated otherwise.
+		const double magnitude = Abs( a );
+		return ( b >= 0.0 ) ? magnitude : -magnitude;
+	}
+
+	static int ComputeRank( const Matrix &D, double epsilon )
+	{
+		int count = 0;  // Diagonal entries whose magnitude exceeds epsilon.
+		for( int d = 0; d < D.Rows(); ++d )
+			count += ( Abs(D(d,d)) > epsilon ) ? 1 : 0;
+		return count;
+	}
+
+	SVD::SVD( ) : Q_(0), D_(0), R_(0), error(0)
+	{   // Empty decomposition: P_ is default-constructed and error starts at 0 rather than uninitialized.
+	}
+
+	SVD::SVD( const Matrix &M ) : Q_(0), D_(0), R_(0), error(0)
+	{
+		(*this) = M;  // operator=( const Matrix & ) performs the decomposition.
+	}
+
+	void SVD::operator=( const Matrix &A )  // Decomposes A, replacing any earlier results.
+	{
+		if( A.Rows() >= A.Cols() ) Q_ = A;
+		else  // Fewer rows than columns: copy A into a Matrix( A.Cols() ) first.
+		{
+			Q_ = Matrix( A.Cols() );
+			for( int i = 0; i < A.Rows(); i++ )
+				for( int j = 0; j < A.Cols(); j++ ) Q_(i,j) = A(i,j);
+		}
+		R_ = Matrix( A.Cols() );  P_ = Matrix();  // P_: drop the pseudo-inverse cached for a previous matrix (assumes Null() holds for a default Matrix -- confirm).
+		error = Decompose( Q_, D_, R_ ) ? 0 : 1;   // Decompose returns 0 when it fails to converge.
+	}
+
+	const Matrix &SVD::Q( double epsilon ) const
+	{
+		// Returns the stored Q factor as is; epsilon has no effect on the result.
+		(void) epsilon;
+		return Q_;
+	}
+
+	const Matrix &SVD::D( double epsilon ) const
+	{
+		// Returns the stored diagonal factor as is; epsilon has no effect on the result.
+		(void) epsilon;
+		return D_;
+	}
+
+	const Matrix &SVD::R( double epsilon ) const
+	{
+		// Returns the stored R factor as is; epsilon has no effect on the result.
+		(void) epsilon;
+		return R_;
+	}
+
+	int SVD::Rank( double epsilon ) const
+	{
+		return ComputeRank( D_, epsilon );  // Number of diagonal entries of D_ with magnitude above epsilon.
+	}
+
+	int SVD::Decompose( Matrix &Q, Matrix &D, Matrix &R )
+	{   // In-place SVD: Q is overwritten, R and D are filled in.  Returns 1 on success, or 0 (with D unassigned) if the iteration cap is hit.
+		int    i, j, k, l, m, n, p, q, iter;
+		double c, f, h, s, x, y, z;
+		double norm  = 0.0;
+		double g     = 0.0;
+		double scale = 0.0;
+		// m x n is the shape of the matrix held in Q.
+		m = Q.Rows();
+		n = Q.Cols();
+		// Temp collects the off-diagonal terms and diag the diagonal terms.
+		Vector Temp( n );
+		Vector diag( n );
+		// Stage 1: Householder reduction to bidiagonal form; norm tracks the largest |diag(i)| + |Temp(i)|.
+		for( i = 0; i < n; i++ ) 
+		{
+
+			Temp(i) = scale * g;
+			scale   = 0.0;
+			g       = 0.0;
+			s       = 0.0;
+			l       = i + 1;
+
+			if( i < m )
+			{
+				for( k = i; k < m; k++ ) scale += Abs( Q(k,i) );
+				if( scale != 0.0 ) 
+				{
+					for( k = i; k < m; k++ ) 
+					{
+						Q(k,i) /= scale;
+						s += Sqr( Q(k,i) );
+					}
+					f = Q(i,i);
+					g = -SameSign( sqrt(s), f );
+					h = f * g - s;
+					Q(i,i) = f - g;
+					if( i != n - 1 )
+					{
+						for( j = l; j < n; j++ ) 
+						{
+							s = 0.0;
+							for( k = i; k < m; k++ ) s += Q(k,i) * Q(k,j);
+							f = s / h;
+							for( k = i; k < m; k++ ) Q(k,j) += f * Q(k,i);
+						}
+					}
+					for( k = i; k < m; k++ ) Q(k,i) *= scale;
+				}
+			}
+
+			diag(i) = scale * g;
+			g       = 0.0;
+			s       = 0.0;
+			scale   = 0.0;
+
+			if( i < m && i != n - 1 ) 
+			{
+				for( k = l; k < n; k++ ) scale += Abs( Q(i,k) );
+				if( scale != 0.0 ) 
+				{
+					for( k = l; k < n; k++ ) 
+					{
+						Q(i,k) /= scale;
+						s += Sqr( Q(i,k) );
+					}
+					f = Q(i,l);
+					g = -SameSign( sqrt(s), f );
+					h = f * g - s;
+					Q(i,l) = f - g;
+					for( k = l; k < n; k++ ) Temp(k) = Q(i,k) / h;
+					if( i != m - 1 ) 
+					{
+						for( j = l; j < m; j++ ) 
+						{
+							s = 0.0;
+							for( k = l; k < n; k++ ) s += Q(j,k) * Q(i,k);
+							for( k = l; k < n; k++ ) Q(j,k) += s * Temp(k);
+						}
+					}
+					for( k = l; k < n; k++ ) Q(i,k) *= scale;
+				}
+			}
+			norm = Max( norm, Abs( diag(i) ) + Abs( Temp(i) ) );
+		}
+
+		// Stage 2: accumulate the right-hand transformations in R.
+		for( i = n - 1; i >= 0; i-- ) 
+		{
+			if( i < n - 1 ) 
+			{
+				if( g != 0.0 ) 
+				{
+					for( j = l; j < n; j++ ) R(i,j) = ( Q(i,j) / Q(i,l) ) / g;
+					for( j = l; j < n; j++ ) 
+					{
+						s = 0.0;
+						for( k = l; k < n; k++ ) s += Q(i,k) * R(j,k);
+						for( k = l; k < n; k++ ) R(j,k) += s * R(i,k);
+					}
+				}
+				for( j = l; j < n; j++ ) 
+				{
+					R(i,j) = 0.0;
+					R(j,i) = 0.0;
+				}
+			}
+			R(i,i) = 1.0;
+			g = Temp(i);
+			l = i;
+		}
+
+		// Stage 3: accumulate the left-hand transformations in Q.
+		for( i = n - 1; i >= 0; i-- ) 
+		{
+			l = i + 1;
+			g = diag(i);
+			if( i < n - 1 ) for( j = l; j < n; j++ ) Q(i,j) = 0.0;
+			if( g != 0.0 ) 
+			{
+				g = 1.0 / g;
+				if( i != n - 1 ) 
+				{
+					for( j = l; j < n; j++ ) 
+					{
+						s = 0.0;
+						for( k = l; k < m; k++ ) s += Q(k,i) * Q(k,j);
+						f = ( s / Q(i,i) ) * g;
+						for( k = i; k < m; k++ ) Q(k,j) += f * Q(k,i);
+					}
+				}
+				for( j = i; j < m; j++ ) Q(j,i) *= g;
+			} 
+			else 
+			{
+				for( j = i; j < m; j++ ) Q(j,i) = 0.0;
+			}
+			Q(i,i) += 1.0;
+		}
+
+		// Stage 4: diagonalize the bidiagonal form, one singular value (index k) at a time.
+		for( k = n - 1; k >= 0; k-- ) 
+		{
+			for( iter = 1; iter <= MaxIterations; iter++ ) 
+			{
+				int jump;
+				// Scan for a split: jump = 1 if Temp(l) is negligible, jump = 0 if diag(l-1) is.
+				for( l = k; l >= 0; l-- )
+				{
+					q = l - 1;
+					if( Abs( Temp(l) ) + norm == norm ) { jump = 1; break; }
+					if( Abs( diag(q) ) + norm == norm ) { jump = 0; break; }
+				}
+				// NOTE(review): jump is assigned only when the scan breaks; this appears to rely on Temp(0) == 0.
+				if( !jump )
+				{
+					c = 0.0;
+					s = 1.0;
+					for( i = l; i <= k; i++ )
+					{
+						f = s * Temp(i);
+						Temp(i) *= c;
+						if( Abs( f ) + norm == norm ) break;
+						g = diag(i);
+						h = svd_pythag( f, g );
+						diag(i) = h;
+						h = 1.0 / h;
+						c = g * h;
+						s = -f * h;
+						for( j = 0; j < m; j++ ) 
+						{
+							y = Q(j,q);
+							z = Q(j,i);
+							Q(j,q) = y * c + z * s;
+							Q(j,i) = z * c - y * s;
+						}
+					}
+				}
+				// l == k: this singular value has converged; flip signs so it is non-negative.
+				z = diag(k);
+				if( l == k ) 
+				{
+					if( z < 0.0 ) 
+					{
+						diag(k) = -z;
+						for( j = 0; j < n; j++ ) R(k,j) *= -1.0; 
+					}
+					break;
+				}
+				if( iter >= MaxIterations ) return 0;  // No convergence within MaxIterations sweeps.
+				x = diag(l);
+				q = k - 1;
+				y = diag(q);
+				g = Temp(q);
+				h = Temp(k);
+				f = ( ( y - z ) * ( y + z ) + ( g - h ) * ( g + h ) ) / ( 2.0 * h * y );
+				g = svd_pythag( f, 1.0 );
+				f = ( ( x - z ) * ( x + z ) + h * ( ( y / ( f + SameSign( g, f ) ) ) - h ) ) / x;
+				c = 1.0;
+				s = 1.0;
+				for( j = l; j <= q; j++ ) 
+				{
+					i = j + 1;
+					g = Temp(i);
+					y = diag(i);
+					h = s * g;
+					g = c * g;
+					z = svd_pythag( f, h );
+					Temp(j) = z;
+					c = f / z;
+					s = h / z;
+					f = x * c + g * s;
+					g = g * c - x * s;
+					h = y * s;
+					y = y * c;
+					for( p = 0; p < n; p++ ) 
+					{
+						x = R(j,p);
+						z = R(i,p);
+						R(j,p) = x * c + z * s;
+						R(i,p) = z * c - x * s;
+					}
+					z = svd_pythag( f, h );
+					diag(j) = z;
+					if( z != 0.0 ) 
+					{
+						z = 1.0 / z;
+						c = f * z;
+						s = h * z;
+					}
+					f = c * g + s * y;
+					x = c * y - s * g;
+					for( p = 0; p < m; p++ ) 
+					{
+						y = Q(p,j);
+						z = Q(p,i);
+						Q(p,j) = y * c + z * s;
+						Q(p,i) = z * c - y * s;
+					}
+				}
+				Temp(l) = 0.0;
+				Temp(k) = f;
+				diag(k) = x;
+			}
+		}
+
+		// Sort the singular values into descending order.
+
+		for( i = 0; i < n - 1; i++ )
+		{
+			double biggest = diag(i);  // Biggest singular value so far.
+			int    bindex  = i;        // The row/col it occurred in.
+			for( j = i + 1; j < n; j++ )
+			{
+				if( diag(j) > biggest ) 
+				{
+					biggest = diag(j);
+					bindex  = j;
+				}            
+			}
+			if( bindex != i )  // Need to swap rows and columns.
+			{
+				Q.SwapCols( i, bindex );  // Swap columns in Q.
+				R.SwapRows( i, bindex );  // Swap rows in R.
+				diag.Swap ( i, bindex );  // Swap elements in diag.
+			}
+		}
+		// D is built from the sorted values via Diag().
+		D = Diag( diag );
+		return 1;
+	}
+
+
+	const Matrix &SVD::PseudoInverse( double epsilon )
+	{
+		// Rebuilt on every call: a cached P_ would ignore a changed epsilon or a new
+		// decomposition.  NOTE(review): if A = Q*D*R, R^T * D^-1 * Q^T is expected here -- confirm.
+		Matrix D_Inverse( D_ );
+		for( int i = 0; i < D_Inverse.Rows(); i++ )
+		{
+			// Invert only the diagonal entries above epsilon; the rest become zero.
+			if( Abs( D_Inverse(i,i) ) > epsilon )
+				D_Inverse(i,i) = 1.0 / D_Inverse(i,i);
+			else D_Inverse(i,i) = 0.0;
+		}
+		P_ = Q_ * D_Inverse * R_;
+		return P_;
+	}
+};
diff --git a/src/nvtt/bc6h/arvo/SVD.h b/src/nvtt/bc6h/arvo/SVD.h
new file mode 100755
index 0000000..d6bf850
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/SVD.h
@@ -0,0 +1,54 @@
+/***************************************************************************
+* SVD.h                                                                    *
+*                                                                          *
+* Singular Value Decomposition.                                            *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date          Description                                   *
+*                                                                          *
+*      arvo    08/22/2000    Split off from Matrix.h                       *
+*      arvo    06/28/1993    Rewritten from "Numerical Recipes" C-code.    *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __SVD_INCLUDED__
+#define __SVD_INCLUDED__
+
+#include "Vector.h"
+#include "Matrix.h"
+
+namespace ArvoMath {
+
+	class SVD {
+	public:
+		SVD( );
+		SVD( const SVD &S ) : Q_(S.Q_), D_(S.D_), R_(S.R_), P_(S.P_), error(S.error) {}  // Copies the decomposition.
+		SVD( const Matrix & );  // Performs the decomposition.
+		~SVD() {}
+		const Matrix &Q( double epsilon = 0.0 ) const;  // Stored Q factor.
+		const Matrix &D( double epsilon = 0.0 ) const;  // Stored diagonal factor.
+		const Matrix &R( double epsilon = 0.0 ) const;  // Stored R factor.
+		const Matrix &PseudoInverse( double epsilon = 0.0 );  // Diagonal entries <= epsilon are treated as zero.
+		int   Rank( double epsilon = 0.0 ) const;  // Count of diagonal entries of D above epsilon.
+		void  operator=( const Matrix & );  // Performs the decomposition.
+	private:
+		int Decompose( Matrix &Q, Matrix &D, Matrix &R );  // Returns 1 on success, 0 on failure.
+		Matrix Q_;
+		Matrix D_;
+		Matrix R_;
+		Matrix P_; // Pseudo inverse.
+		int    error;  // NOTE(review): presumably a failure flag for Decompose -- confirm.
+	};
+};
+#endif
diff --git a/src/nvtt/bc6h/arvo/SphTri.cpp b/src/nvtt/bc6h/arvo/SphTri.cpp
new file mode 100755
index 0000000..40de956
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/SphTri.cpp
@@ -0,0 +1,292 @@
+/***************************************************************************
+* SphTri.C                                                                 *
+*                                                                          *
+* This file defines the SphericalTriangle class definition, which          *
+* supports member functions for Monte Carlo sampling, point containment,   *
+* and other basic operations on spherical triangles.                       *
+*                                                                          *
+*   Changes:                                                               *
+*     01/01/2000  arvo  Added New_{Alpha,Beta,Gamma} methods.              *
+*     12/30/1999  arvo  Added VecIrrad method for "Vector Irradiance".     *
+*     04/08/1995  arvo  Further optimized sampling algorithm.              *
+*     10/11/1994  arvo  Added analytic sampling algorithm.                 *
+*     06/14/1994  arvo  Initial implementation.                            *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1995, 2000, James Arvo                                     *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <iostream>
+#include <math.h>
+#include "SphTri.h"
+#include "form.h"
+namespace ArvoMath {
+	/*-------------------------------------------------------------------------*
+	* Constructor                                                             *
+	*                                                                         *
+	* Construct a spherical triangle from three (non-zero) vectors.  The      *
+	* vectors needn't be of unit length.                                      *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	SphericalTriangle::SphericalTriangle( const Vec3 &A0, const Vec3 &B0, const Vec3 &C0 )
+	{
+		Init( A0, B0, C0 );  // Normalizes the inputs and fills in every data member.
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Init                                                                    *
+	*                                                                         *
+	* Construct the spherical triangle from three vertices.  Assume that the  *
+	* sphere is centered at the origin.  The vectors A, B, and C need not     *
+	* be normalized.                                                          *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	void SphericalTriangle::Init( const Vec3 &A0, const Vec3 &B0, const Vec3 &C0 )
+	{
+		// Vertices: the three inputs scaled to unit length.
+
+		A_ = Unit( A0 );
+		B_ = Unit( B0 );
+		C_ = Unit( C0 );
+
+		// Edge a joins B and C: keep its cosine, then its arc length.
+
+		cos_a = B_ * C_;
+		a_    = ArcCos( cos_a );
+
+		// Edge b joins A and C.
+
+		cos_b = A_ * C_;
+		b_    = ArcCos( cos_b );
+
+		// Edge c joins A and B.
+
+		cos_c = A_ * B_;
+		c_    = ArcCos( cos_c );
+
+		// Internal (dihedral) angle at each vertex: cosine first, then the angle.
+
+		cos_alpha = CosDihedralAngle( C_, A_, B_ );
+		alpha     = ArcCos( cos_alpha );
+		cos_beta  = CosDihedralAngle( A_, B_, C_ );
+		beta      = ArcCos( cos_beta  );
+		cos_gamma = CosDihedralAngle( A_, C_, B_ );
+		gamma     = ArcCos( cos_gamma );
+
+		// Solid angle: the amount by which the angle sum exceeds Pi.
+
+		area = alpha + beta + gamma - Pi;
+
+		// Orientation: the sign of A * ( B ^ C ).
+
+		orient = Sign( A_ * ( B_ ^ C_ ) );
+
+		// Three precomputed terms used when sampling the triangle.
+
+		sin_alpha = sin( alpha );
+		product   = sin_alpha * cos_c;
+		U         = Unit( C_ / A_ );  // In plane of AC orthogonal to A.
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Init                                                                    *
+	*                                                                         *
+	* Initialize all fields.  Create a null spherical triangle.               *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	void SphericalTriangle::Init()
+	{
+		// Null triangle: every vertex, length, angle and cached term is set to zero.
+		A_ = 0;  B_ = 0;  C_ = 0;  U = 0;
+		a_ = 0;  b_ = 0;  c_ = 0;
+		cos_a = 0;  cos_b = 0;  cos_c = 0;
+		alpha = 0;  beta  = 0;  gamma = 0;
+		cos_alpha = 0;  cos_beta = 0;  cos_gamma = 0;
+		sin_alpha = 0;  product = 0;  area = 0;
+		orient = 0;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* "( A, B, C )" operator.                                                 *
+	*                                                                         *
+	* Construct the spherical triangle from three vertices.  Assume that the  *
+	* sphere is centered at the origin.  The vectors A, B, and C need not     *
+	* be normalized.                                                          *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	SphericalTriangle & SphericalTriangle::operator()( 
+		const Vec3 &A0, 
+		const Vec3 &B0, 
+		const Vec3 &C0 )
+	{
+		Init( A0, B0, C0 );  // Recompute every data member from the new vertices.
+		return *this;        // Returns this triangle by reference.
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Inside                                                                  *
+	*                                                                         *
+	* Determine if the vector W is inside the triangle.  W need not be a      *
+	* unit vector                                                             *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int SphericalTriangle::Inside( const Vec3 &W ) const
+	{
+		Vec3 Z = Orient() * W;  // W scaled by the triangle's orientation.
+		const bool outside = Z * ( A() ^ B() ) < 0.0    // Negative against edge AB,
+		                  || Z * ( B() ^ C() ) < 0.0    // or against edge BC,
+		                  || Z * ( C() ^ A() ) < 0.0;   // or against edge CA.
+		return outside ? 0 : 1;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Chart                                                                   *
+	*                                                                         *
+	* Generate samples from the current spherical triangle.  If x1 and x2 are *
+	* random variables uniformly distributed over [0,1], then the returned    *
+	* points are uniformly distributed over the solid angle.                  *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Vec3 SphericalTriangle::Chart( float x1, float x2 ) const
+	{
+		// Use one random variable to select the area of the sub-triangle.
+		// Save the sine and cosine of the angle phi.  These are plain locals:
+		// the "register" specifier is not permitted from C++17 onward.
+		float phi = x1 * area - Alpha();
+		float s   = sin( phi );
+		float t   = cos( phi );
+
+		// Compute the pair (u,v) that determines the new angle beta.
+
+		float u = t - cos_alpha;
+		float v = s + product  ;  // sin_alpha * cos_c
+
+		// Compute the cosine of the new edge b.
+
+		float q = ( cos_alpha * ( v * t - u * s ) - v ) / 
+			( sin_alpha * ( u * t + v * s )     );
+
+		// Compute the third vertex of the sub-triangle.
+
+		Vec3 C_new = q * A() + Sqrt( 1.0 - q * q ) * U;
+
+		// Use the other random variable to select the height z.
+
+		float z = 1.0 - x2 * ( 1.0 - C_new * B() );
+
+		// Construct the corresponding point on the sphere.
+
+		Vec3 D = C_new / B();  // Remove B component of C_new.
+		return z * B() + Sqrt( ( 1.0 - z * z ) / ( D * D ) ) * D;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Coord                                                                   *
+	*                                                                         *
+	* Compute the two coordinates (x1,x2) corresponding to a point in the     *
+	* spherical triangle.  This is the inverse of "Chart".                    *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Vec2 SphericalTriangle::Coord( const Vec3 &P1 ) const
+	{
+		Vec3 P = Unit( P1 );
+
+		// Find where the arc through B and P meets the arc through A and C;
+		// that point serves as the C vertex of the sub-triangle.
+
+		Vec3 C_new = Unit( ( B() ^ P ) ^ ( C() ^ A() ) );
+
+		// Flip C_new if needed so that it lies on the arc between A and C.
+
+		if( C_new * ( A() + C() ) < 0.0 ) C_new = -C_new;
+
+		// First coordinate: the sub-triangle's area over the original area.
+
+		float cos_at_B = CosDihedralAngle( A(), B(), C_new  );
+		float cos_at_C = CosDihedralAngle( A(), C_new , B() );
+		float sub_area = Alpha() + acos( cos_at_B ) + acos( cos_at_C ) - Pi;
+		float x1       = sub_area / SolidAngle();
+
+		// Second coordinate, measured against the new C vertex.
+
+		float z  = P * B();
+		float x2 = ( 1.0 - z ) / ( 1.0 - C_new * B() );
+		// Clamp both coordinates to [0,1].
+		if( x1 < 0.0 ) x1 = 0.0; else if( x1 > 1.0 ) x1 = 1.0;
+		if( x2 < 0.0 ) x2 = 0.0; else if( x2 > 1.0 ) x2 = 1.0;
+		return Vec2( x1, x2 );
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Dual                                                                    *
+	*                                                                         *
+	* Construct the dual triangle of the current triangle, which is another   *
+	* spherical triangle.                                                     *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	SphericalTriangle SphericalTriangle::Dual() const
+	{   // Each dual vertex is the ^-product of the other two vertices, negated when its product with its own vertex is negative.
+		Vec3 dual_A = B() ^ C();  if( dual_A * A() < 0.0 ) dual_A *= -1.0;
+		Vec3 dual_B = A() ^ C();  if( dual_B * B() < 0.0 ) dual_B *= -1.0;
+		Vec3 dual_C = A() ^ B();  if( dual_C * C() < 0.0 ) dual_C *= -1.0;
+		return SphericalTriangle( dual_A, dual_B, dual_C );  // The constructor normalizes the three vectors.
+	}
+
+	/*-------------------------------------------------------------------------*
+	* VecIrrad                                                                *
+	*                                                                         *
+	* Return the "vector irradiance" due to a light source of unit brightness *
+	* whose spherical projection is this spherical triangle.  The negative of *
+	* this vector dotted with the surface normal gives the (scalar)           *
+	* irradiance at the origin.                                               *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Vec3 SphericalTriangle::VecIrrad() const
+	{
+		Vec3 Phi =  // Sum over the edges: edge length times the unit ^-product of the edge's two vertices.
+			a() * Unit( B() ^ C() ) +
+			b() * Unit( C() ^ A() ) +
+			c() * Unit( A() ^ B() ) ;
+		if( Orient() ) Phi *= -1.0;  // NOTE(review): true for any nonzero orient, not one particular sign -- confirm intent.
+		return Phi;    
+	}
+
+	/*-------------------------------------------------------------------------*
+	* New_Alpha                                                               *
+	*                                                                         *
+	* Returns a new spherical triangle derived from the original one by       *
+	* moving the "C" vertex along the edge "BC" until the new "alpha" angle   *
+	* equals the given argument.                                              *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	SphericalTriangle SphericalTriangle::New_Alpha( float alpha ) const
+	{   // Here "alpha" is the parameter; it hides the data member of the same name.
+		Vec3 V1( A() ), V2( B() ), V3( C() );
+		Vec3 E1 = Unit( V2 ^ V1 );
+		Vec3 E2 = E1 ^ V1;
+		Vec3 G  = ( cos(alpha) * E1 ) + ( sin(alpha) * E2 );  // Combination of E1 and E2 weighted by cos/sin of alpha.
+		Vec3 D  = Unit( V3 / V2 );  // V3 with its V2 component removed, normalized.
+		Vec3 C2 = ((G * D) * V2) - ((G * V2) * D);
+		if( Triple( V1, V2, C2 ) > 0.0 ) C2 *= -1.0;  // Negate C2 when Triple( V1, V2, C2 ) is positive.
+		return SphericalTriangle( V1, V2, C2 );
+	}
+
+	std::ostream &operator<<( std::ostream &out, const SphericalTriangle &T )
+	{   // A header line, then one indented vertex per line; std::endl ends the last line.
+		out << "SphericalTriangle:\n";
+		out << "  " << T.A() << "\n";
+		out << "  " << T.B() << "\n";
+		out << "  " << T.C() << std::endl;
+		return out;
+	}
+
+};
diff --git a/src/nvtt/bc6h/arvo/SphTri.h b/src/nvtt/bc6h/arvo/SphTri.h
new file mode 100755
index 0000000..7336dc7
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/SphTri.h
@@ -0,0 +1,124 @@
+/***************************************************************************
+* SphTri.h                                                                 *
+*                                                                          *
+* This file defines the SphericalTriangle class definition, which          *
+* supports member functions for Monte Carlo sampling, point containment,   *
+* and other basic operations on spherical triangles.                       *
+*                                                                          *
+*   Changes:                                                               *
+*     01/01/2000  arvo  Added New_{Alpha,Beta,Gamma} methods.              *
+*     12/30/1999  arvo  Added VecIrrad method for "Vector Irradiance".     *
+*     04/08/1995  arvo  Further optimized sampling algorithm.              *
+*     10/11/1994  arvo  Added analytic sampling algorithm.                 *
+*     06/14/1994  arvo  Initial implementation.                            *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1995, 2000, James Arvo                                     *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __SPHTRI_INCLUDED__
+#define __SPHTRI_INCLUDED__
+
+#include "Vec3.h"
+#include "Vec2.h"
+
+namespace ArvoMath {
+
+	/*
+	*  The (Oblique) Spherical Triangle ABC.  Edge lengths (segments of great 
+	*  circles) are a, b, and c.  The (dihedral) angles are Alpha, Beta, and Gamma.
+	*
+	*                      B
+	*                      o
+	*                     / \
+	*                    /   \
+	*                   /Beta \
+	*                  /       \
+	*               c /         \ a
+	*                /           \ 
+	*               /             \
+	*              /               \
+	*             /                 \
+	*            /                   \
+	*           /Alpha          Gamma \
+	*          o-----------------------o
+	*         A            b            C
+	*
+	*/
+
+	class SphericalTriangle {
+
+	public: // methods
+		SphericalTriangle() { Init(); }
+		SphericalTriangle( const Vec3 &, const Vec3 &, const Vec3 & );
+		SphericalTriangle & operator()( const Vec3 &, const Vec3 &, const Vec3 & );
+		~SphericalTriangle( ) {}
+		// Copying and assignment use the compiler-generated member-wise versions.  (A
+		// hand-written operator= whose body is "*this = T" would call itself forever.)
+		Vec3   Chart    ( float x, float y ) const;  // Const-Jacobian map from square.
+		Vec2   Coord    ( const Vec3 &P    ) const;  // Get 2D coords of a point.
+		int    Orient( ) const { return orient; }
+		int    Inside( const Vec3 & ) const;
+		float  SolidAngle() const { return area; }
+		float  SignedSolidAngle() const { return -orient * area; } // CC is pos.
+		const  Vec3 &A()  const { return A_       ; }
+		const  Vec3 &B()  const { return B_       ; }
+		const  Vec3 &C()  const { return C_       ; }
+		float  a()        const { return a_       ; }
+		float  b()        const { return b_       ; }
+		float  c()        const { return c_       ; }
+		float  Cos_a()    const { return cos_a    ; }
+		float  Cos_b()    const { return cos_b    ; }
+		float  Cos_c()    const { return cos_c    ; }
+		float  Alpha()    const { return alpha    ; }
+		float  Beta ()    const { return beta     ; }
+		float  Gamma()    const { return gamma    ; }
+		float  CosAlpha() const { return cos_alpha; }
+		float  CosBeta () const { return cos_beta ; }
+		float  CosGamma() const { return cos_gamma; }
+		Vec3   VecIrrad() const; // Returns the vector irradiance.
+		SphericalTriangle Dual() const;
+		SphericalTriangle New_Alpha( float alpha ) const;
+		SphericalTriangle New_Beta ( float beta  ) const;  // NOTE(review): no definition appears in SphTri.cpp.
+		SphericalTriangle New_Gamma( float gamma ) const;  // NOTE(review): no definition appears in SphTri.cpp.
+
+	private: // methods
+		void Init( );  // Sets every data member to zero.
+		void Init( const Vec3 &A, const Vec3 &B, const Vec3 &C );  // Builds the triangle from three vectors.
+
+	private: // data
+		Vec3  A_, B_, C_, U;       // The vertices (and a temp vector).
+		float a_, b_, c_;          // The edge lengths.
+		float alpha, beta, gamma;  // The angles.
+		float cos_a, cos_b, cos_c;
+		float cos_alpha, cos_beta, cos_gamma;
+		float area;
+		float sin_alpha, product;  // Used in sampling algorithm.
+		int   orient;              // Orientation.
+	};
+
+	inline double CosDihedralAngle( const Vec3 &A, const Vec3 &B, const Vec3 &C )
+	{
+		const float cosine = Unit( A ^ B ) * Unit( C ^ B );  // Held in single precision; limited to [-1,1] below.
+		if( cosine < -1.0 ) return -1.0;
+		if( cosine >  1.0 ) return  1.0;
+		return cosine;
+	}
+
+	inline double DihedralAngle( const Vec3 &A, const Vec3 &B, const Vec3 &C )
+	{
+		return acos( CosDihedralAngle( A, B, C ) );  // The cosine is already clamped to [-1,1] by CosDihedralAngle.
+	}
+
+	extern std::ostream &operator<<( std::ostream &out, const SphericalTriangle & );
+};
+#endif
diff --git a/src/nvtt/bc6h/arvo/Token.cpp b/src/nvtt/bc6h/arvo/Token.cpp
new file mode 100755
index 0000000..9575d92
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Token.cpp
@@ -0,0 +1,913 @@
+/***************************************************************************
+* Token.cpp                                                                *
+*                                                                          *
+* The Token class encapsulates a lexical analyzer for C++-like syntax.     *
+* A token instance is associated with one or more text files, and          *
+* grabs C++ tokens from them sequentially.  There are many member          *
+* functions designed to make parsing easy, such as "==" operators for      *
+* strings and characters, and automatic conversion of numeric tokens       *
+* into numeric values.                                                     *
+*                                                                          *
+* Files can be nested via #include directives, and both styles of C++      *
+* comments are supported.                                                  *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    10/05/99    Fixed bug in TokFrame string allocation.        *
+*      arvo    01/15/95    Added ifdef, ifndef, else, and endif.           *
+*      arvo    02/13/94    Added Debug() member function.                  *
+*      arvo    01/22/94    Several sections rewritten.                     *
+*      arvo    06/19/93    Converted to C++                                *
+*      arvo    07/15/89    Rewritten for scene description parser.         *
+*      arvo    01/22/89    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "Token.h"
+#include "Char.h"
+
+namespace ArvoMath {
+
+	FILE*  Token::debug = NULL;  // Static data member of Token class; set by Token::Debug().
+	int    Token::argc  = 0;     // Command line recorded by Token::Args() and
+	char** Token::argv  = NULL;  // scanned for "-macro" by Token::SearchArgs().
+
+	typedef TokMacro *TokMacroPtr;  // Element type of the macro hash table.
+
+	static const int True      = 1;
+	static const int False     = 0;
+	static const int HashConst = 217;  // Size of hash-table for macros.
+
+
+	TokFrame::TokFrame()  // An empty frame: no parent, no stream, no name.
+		: next  ( NULL ),
+		  source( NULL ),
+		  fname ( NULL ),
+		  line  ( 0 ),
+		  column( 0 )
+	{
+	}
+
+	TokFrame::~TokFrame()
+	{
+		free( fname );  // fname comes from strdup(), so free() and not delete[]; NULL is allowed.
+		if( source != NULL ) fclose( source );  // The frame closes the stream it holds.
+	}
+
+	void TokFrame::operator=( const TokFrame &frame )
+	{
+		// Copy the name first: safe for self-assignment and for a frame with no name.
+		char *copy = ( frame.fname != NULL ) ? strdup( frame.fname ) : NULL;
+		free( fname );  fname = copy;  // Release the old name instead of leaking it.
+		next   = frame.next;    source = frame.source;  // Shallow: both frames refer to one stream.
+		line   = frame.line;    column = frame.column;
+	}
+
+	static int HashName( const char *str )
+	{
+		// Weighted sum of the characters; the weights cycle through five primes.
+		static int prime[5] = { 7, 11, 17, 23, 3 };
+		int sum = 0;
+		for( int i = 0; str[i] != NullChar; i++ )
+		{
+			sum += str[i] * prime[ i % 5 ];
+		}
+		// A negative sum is mapped to bucket 0 so the index is never negative.
+		if( sum < 0 ) sum = 0;
+		return sum % HashConst;
+	}
+
+	TokMacro *Token::MacroLookup( const char *str ) const
+	{
+		if( table == NULL ) return NULL;  // The table only exists once a macro has been added.
+		TokMacro *entry = table[ HashName( str ) ];
+		while( entry != NULL && strcmp( str, entry->macro ) != 0 )
+		{
+			entry = entry->next;  // Keep walking this bucket's chain.
+		}
+		return entry;  // The matching macro, or NULL when the chain is exhausted.
+	}
+
+	int Token::MacroReplace( char *str, int &length, TokType &type ) const
+	{
+		TokMacro *m = MacroLookup( str );
+		if( m == NULL ) return 0;  // Not a macro: str, length and type are left untouched.
+		strcpy( str, m->repl );    // NOTE(review): unbounded copy; assumes the replacement fits in str.
+		length = strlen( str );
+		type   = m->type;
+		return 1;                  // str now holds the replacement text.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  D e b u g  P r i n t                                                   *
+	*                                                                         *
+	*  This routine is used to record the entire token stream in a file to    *
+	*  use as a debugging aid.  It does not affect the action of the lexer;   *
+	*  it merely records a "shadow" copy of all the tokens that are read by   *
+	*  ANY Token instance.  The data that is written to the file is           *
+	*                                                                         *
+	*  <Line number>  <Column number>  <File name>  <Token>                   *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	static void DebugPrint( const Token &tok, FILE *fp )
+	{
+		fprintf( fp, "%3d %3d  ", tok.Line(), tok.Column() );  // Position of the token.
+		fprintf( fp, "%s  "     , tok.FileName() ); 
+		fprintf( fp, "%s\n"     , tok.Spelling() );
+		fflush ( fp );  // Each record is flushed as soon as it is written.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  T o k e n   (Constructors)                                             *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Token::Token( const char *file_name )
+	{
+		Init();
+		Open( file_name );  // Open() returns silently if the file cannot be opened.
+	}
+
+	Token::Token( FILE *fp )
+	{
+		Init();
+		Open( fp );  // Read from an already-open stream; the frame gets no file name.
+	}
+
+	Token::Token( )
+	{
+		Init();  // No source yet; one of the Open() overloads must supply it.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  T o k e n   (Destructor)                                               *
+	*                                                                         *
+	*  Close all files and deletes all frames and paths.                      *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Token::~Token( )
+	{
+		// Don't try to delete "frame" as it's a member of this class, not 
+		// something that we've allocated.
+		TokFrame *f = frame.next;
+		while( f != NULL )
+		{
+			TokFrame *n = f->next;  // Fetch the link before the node is destroyed.
+			delete f;               // ~TokFrame also closes the stream held by that frame.
+			f = n;
+		}
+		ClearPaths();  // NOTE(review): the macro table built by AddMacro() is not released here.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  O p e n                                                                *
+	*                                                                         *
+	*  Establish a new file to read from, either by name, or by pointer.      *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	void Token::Open( const char *file_name )
+	{
+		FILE *fp = fopen( file_name, "r" );
+		if( fp == NULL ) return;  // Failure is silent: the token simply stays unopened.
+		Open( fp );
+		frame.fname = strdup( file_name );  // NOTE(review): a name from an earlier Open() is overwritten, not freed.
+	}
+
+	void Token::Open( FILE *fp )
+	{
+		frame.source = fp;
+		frame.line   = 1;         // Lines are counted from 1,
+		frame.column = 0;         // columns from 0 (Getc increments before use).
+		pushed       = NullChar;  // Discard any pending put-back character.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  O p e r a t o r  ==                                                    *
+	*                                                                         *
+	*  A token can be compared with a string, a single character, or a type.  *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::operator==( const char *s ) const
+	{
+		// Walk the argument and the current spelling in lock step.  The
+		// strings match when no pair of characters differs before the end
+		// of either one is reached; both sides are folded to upper case
+		// first when the token is not case sensitive.
+		for( const char *t = spelling; ; s++, t++ )
+		{
+			const bool differ = case_sensitive
+				? ( *s != *t ) : ( ToUpper(*s) != ToUpper(*t) );
+			if( differ ) return False;
+			if( !*s || !*t ) break;
+		}
+		return True;
+	}
+
+	int Token::operator==( char c ) const
+	{
+		if( length != 1 ) return False;  // Only a one-character token can match.
+		const char first = spelling[0];
+		return case_sensitive ? ( first == c ) : ( ToUpper(first) == ToUpper(c) );
+	}
+
+	int Token::operator==( TokType _type_ ) const
+	{
+		// T_char and T_numeric are category tests rather than exact types:
+		// a one-character string, and either kind of number, respectively.
+		if( _type_ == T_char )
+			return type == T_string && Len() == 1;
+		if( _type_ == T_numeric )
+			return type == T_integer || type == T_float;
+		// Every other request must equal the stored type exactly.
+		return type == _type_;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  O p e r a t o r  !=                                                    *
+	*                                                                         *
+	*  Define negations of the three types of "==" tests.                     *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::operator!=( const char *s ) const { return ( *this == s ) ? False : True; }
+	int Token::operator!=( char        c ) const { return ( *this == c ) ? False : True; }
+	int Token::operator!=( TokType     t ) const { return ( *this == t ) ? False : True; }
+
+	/*-------------------------------------------------------------------------*
+	*  E r r o r                                                              *
+	*                                                                         *
+	*  Print error message to "stderr" followed by optional "name".           *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	void Token::Error( TokError error, const char *name )
+	{
+		const char *s;  // const: string literals may not be bound to a plain char*.
+		switch( error )
+		{
+		case T_malformed_float   : s = "malformed real number   "; break;
+		case T_unterm_string     : s = "unterminated string     "; break;
+		case T_unterm_comment    : s = "unterminated comment    "; break;
+		case T_file_not_found    : s = "include file not found: "; break;
+		case T_unknown_directive : s = "unknown # directive     "; break;
+		case T_string_expected   : s = "string expected         "; break;
+		case T_putback_error     : s = "putback overflow        "; break;
+		case T_name_too_long     : s = "file name is too long   "; break;
+		case T_no_endif          : s = "#endif directive missing"; break;
+		case T_extra_endif       : s = "#endif with no #ifdef   "; break;
+		case T_extra_else        : s = "#else with no #ifdef    "; break;
+		default                  : s = "unknown error type      "; break;
+		}
+		fprintf( stderr, "LEXICAL ERROR, line %d, column %d: %s", 
+			frame.line, frame.column, s );  // Position in the file currently being read.
+		if( name == NULL )
+			fprintf( stderr, "  \n"       );
+		else fprintf( stderr, "%s\n", name );
+		exit( 1 );  // Every lexical error is fatal: this function never returns.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  G e t c                                                                *
+	*                                                                         *
+	*  This routine fetches one character at a time from the current file     *
+	*  being read.  It is responsible for keeping track of the column number  *
+	*  and for handling single characters that have been "put back".          *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::Getc( int &c )
+	{
+		if( pushed == NullChar )  // Nothing pending: read from the source file.
+		{
+			c = getc( frame.source );
+			frame.column++;
+		}
+		else  // Hand back the pending character and clear the slot.
+		{
+			c = pushed;
+			pushed = NullChar;
+		}
+		return c;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  N o n W h i t e                                                        *
+	*                                                                         *
+	*  This routine implements a simple finite state machine that skips       *
+	*  white space and recognizes the two styles of comments used in C++.     *
+	*  It returns the first non-white character not part of a comment.        *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::NonWhite( int &c )
+	{
+start_state:  // Skipping blanks between tokens.
+		Getc( c );
+		if( c == Space   ) goto start_state;
+		if( c == Tab     ) goto start_state;
+		if( c == NewLine ) goto start_new_line;
+		if( c == Slash   ) goto start_comment;
+		goto return_char;
+
+start_comment:  // Seen one '/': a comment only if '*' or '/' follows.
+		Getc( c );
+		if( c == Star    ) goto in_comment1;  
+		if( c == Slash   ) goto in_comment2;  
+		Unget( c );  // Not a comment: the '/' itself is the character to return.
+		c = Slash;
+		goto return_char;
+
+in_comment1:  // Inside a "/* ... */" comment.
+		Getc( c );
+		if( c == Star    ) goto end_comment1;
+		if( c == NewLine ) goto newline_in_comment;
+		if( c == EOF     ) goto unterm_comment;  // EOF before the closing "*/" is an error.
+		goto in_comment1;
+
+end_comment1:  // Seen '*' inside a comment: a '/' now closes it.
+		Getc( c );
+		if( c == Slash   ) goto start_state;
+		if( c == Star    ) goto end_comment1;  // In a run of stars the last one may still close it.
+		if( c == NewLine ) goto newline_in_comment;
+		if( c == EOF     ) goto unterm_comment;
+		goto in_comment1;
+
+in_comment2:  // Inside a "//" comment, which runs to the end of the line.
+		Getc( c );
+		if( c == NewLine ) goto start_new_line;
+		if( c == EOF     ) goto return_char;
+		goto in_comment2;
+
+unterm_comment:  // Only reached with c == EOF.
+		Error( T_unterm_comment );
+		goto return_char;
+
+start_new_line:
+		frame.line++;
+		frame.column = 0;
+		goto start_state;
+
+newline_in_comment:
+		frame.line++;
+		frame.column = 0;
+		goto in_comment1;
+
+return_char:
+		Tcolumn = frame.column;  // This is where the token starts.
+		return c;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  N e x t R a w T o k                                                    *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::NextRawTok( )
+	{
+		static int Trans0[] = { 0, 1, 3, 3, 3 };  // Found a digit.
+		static int Trans1[] = { 5, 6, 4, 6, 7 };  // Found a sign.
+		static int Trans2[] = { 1, 6, 7, 6, 7 };  // Found decimal point.
+		static int Trans3[] = { 2, 2, 7, 6, 7 };  // Found an exponent.
+		static int Trans4[] = { 5, 6, 7, 6, 7 };  // Found something else.
+		char       *tok     = spelling;  // Write cursor into the spelling buffer.
+		int        state;                // Number-scanner state; 5, 6 and 7 are terminal (see switch below).
+		int        c;
+
+		length = 0;
+		type   = T_null;  // Stays T_null if end of file is reached before any token.
+
+		// Skip comments and whitespace.
+
+		if( NonWhite( c ) == EOF ) goto endtok;
+
+		// Is this the beginning of an identifier?  If so, get the rest. 
+
+		if( isAlpha( c ) )
+		{
+			type = T_ident;
+			do  {
+				*tok++ = c;  // NOTE(review): no MaxTokenLen check here, unlike the string branch below.
+				length++;
+				if( Getc( c ) == EOF ) goto endtok;
+			}
+			while( isAlpha( c ) || isDigit( c ) || c == Underscore );
+			Unget( c );  // The character that ended the identifier belongs to the next token.
+			goto endtok;
+		}
+
+		// Is this the beginning of a number?
+
+		else if( isDigit( c ) || c == Minus || c == Period )
+		{
+			char c1 = c;  // The first character decides integer vs. float in terminal state 5.
+			state = 0;
+			for(;;)
+			{
+				*tok++ = c;
+				length++;
+				switch( Getc( c ) )  // Index the table for this character class by the current state.
+				{
+				case '0':
+				case '1':
+				case '2':
+				case '3':
+				case '4':
+				case '5':
+				case '6':
+				case '7':
+				case '8':
+				case '9': state = Trans0[ state ]; break;
+				case '+': 
+				case '-': state = Trans1[ state ]; break;
+				case '.': state = Trans2[ state ]; break;
+				case 'e':
+				case 'E': state = Trans3[ state ]; break;
+				default : state = Trans4[ state ]; break;
+				}
+				switch( state )
+				{
+				case 5 : Unget( c ); 
+					type = ( c1 == Period ) ? T_float : T_integer; 
+					goto endtok;
+				case 6 : Unget( c ); type = T_float  ; goto endtok;
+				case 7 : Error( T_malformed_float ) ; break;  // Error() calls exit(), so this does not return.
+				default: continue;
+				}
+			} // for
+		} // if numeric 
+
+		// Is this the beginning of an operator?
+		// NOTE(review): if Minus is '-', the number branch above already claims it, so '-' never gets here.
+		if( c == '*' || c == '>' || c == '<' || c == '+' || c == '-' || c == '!' )
+		{
+			char oldc = c;
+			type = T_other;
+			*tok++ = c;
+			length++;
+			if( Getc( c ) == EOF ) goto endtok;
+			if( c == oldc || c == EqualSign )  // Doubled operator, or operator followed by '='.
+			{
+				*tok++ = c;
+				length++;
+			}
+			else Unget( c );
+			goto endtok;
+		}
+
+		// Is this the beginning of a string?
+
+		else if( c == DoubleQuote )
+		{
+			type = T_string;
+			while( Getc( c ) != EOF && length < MaxTokenLen )
+			{
+				if( c == DoubleQuote ) goto endtok;  // The quotes are not part of the spelling.
+				*tok++ = c;
+				length++;
+			}
+			Error( T_unterm_string );  // Reached on EOF and also when the length limit is hit.
+		}
+
+		// Is this the beginning of a "#" directive?
+
+		else if( c == Hash )
+		{
+			type = T_directive;
+			NonWhite( c );  // Blanks between '#' and the directive name are skipped.
+			while( isAlpha( c ) )
+			{
+				*tok++ = c;
+				length++;
+				Getc( c );
+			}
+			Unget( c );
+			goto endtok;
+		}
+
+		// This must be a one-character token. 
+
+		else
+		{
+			*tok++ = c;
+			length = 1;
+			type   = T_other;
+		}
+
+endtok: // Jump to here when token is completed.
+
+		*tok = NullChar;  // Terminate the string.
+		if( debug != NULL ) DebugPrint( *this, debug );
+
+		return length;  // Zero when no token was found (end of file).
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  N e x t T o k                                                          *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::NextTok( )
+	{
+		NextRawTok();
+
+		// If the token is an identifier, see if it's a macro.
+		// If the macro substitution is null, get another token.
+
+		if( type == T_ident )
+		{
+			if( table != NULL )
+			{
+				if( MacroReplace( spelling, length, type ) && debug != NULL ) 
+					DebugPrint( *this, debug );  // Log the substituted token as well.
+			}
+			if( type == T_nullmacro ) NextTok();  // Recurse until a token with content is found.
+		}
+		return length;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  O p e r a t o r  - -                                                   *
+	*                                                                         *
+	*  Puts back the last token found.  Only one token can be put back.       *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Token & Token::operator--( )  // Put the last token back. 
+	{
+		if( put_back ) Error( T_putback_error );  // Can only handle one putback.
+		put_back = 1;  // The next ++ returns the current token again instead of reading.
+		return *this;
+	}
+
+	Token & Token::operator--( int )  // Postfix decrement.
+	{
+		fprintf( stderr, "Postfix decrement is not implemented for the Token class.\n" );
+		return *this;  // Only the warning is printed; the token state is unchanged.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  H a n d l e   D i r e c t i v e                                        *
+	*                                                                         *
+	*  Directive beginning with "#" must be handled by the lexer, as they     *
+	*  determine the current source file via "#include", etc.                 *
+	*                                                                         *
+	*  Returns 1 if, after handling this directive, we now have the next      *
+	*  token.                                                                 *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::HandleDirective( )
+	{
+		FILE *fp;
+		char name[128];  // Receives the file name of an #include <name>.
+		if( *this == "define" )
+		{
+			NextRawTok();  // Raw read: the macro name itself must not be macro-expanded.
+			strcpy( tempbuff, Spelling() );  // This is the macro name.
+			int line = Line();
+			NextRawTok();
+			if( Line() == line )
+				AddMacro( tempbuff, Spelling(), Type() );  // The replacement is a single token.
+			else
+			{
+				// If next token is on a different line; we went too far.
+				AddMacro( tempbuff, "", T_nullmacro );
+				return 1;  // Signal that we already have the next token.
+			}
+		}
+		else if( *this == "include" )
+		{
+			NextRawTok();
+			if( *this == "<" )
+			{
+				GetName( name, sizeof(name) );  // Reads up to the closing '>'.
+				PushFrame( ResolveName( name ), name );  // Searched along the paths given to AddPath().
+			}
+			else if( type == T_string )
+			{
+				fp = fopen( spelling, "r" );  // A quoted name is opened exactly as written.
+				if( fp == NULL ) Error( T_file_not_found, spelling );
+				else PushFrame( fp, spelling );
+			}
+			else Error( T_string_expected );
+		}
+		else if( *this == "ifdef" )
+		{
+			NextRawTok();
+			TokMacro *m = MacroLookup( Spelling() );
+			if( m == NULL )  // Skip until else or endif.
+			{
+				while( *this != T_null )  // NOTE(review): stops at the first #endif/#else; nested #ifdefs are not tracked.
+				{
+					NextRawTok();
+					if( *this != T_directive ) continue;
+					if( *this == "endif" ) break;
+					if( *this == "else"  ) { if_nesting++; break; }  // Like m != NULL.
+				}
+				if( *this == T_null ) Error( T_no_endif );
+				return 0; // Ready to get the next token.
+			}
+			else if_nesting++;  // Macro is defined: read on until the matching #else or #endif.
+		}
+		else if( *this == "ifndef" )
+		{
+			NextRawTok();
+			TokMacro *m = MacroLookup( Spelling() );
+			if( m != NULL )  // Skip until else or endif.
+			{
+				while( *this != T_null )  // Same skipping loop as for #ifdef above.
+				{
+					NextRawTok();
+					if( *this != T_directive ) continue;
+					if( *this == "endif" ) break;
+					if( *this == "else"  ) { if_nesting++; break; }  // Like m == NULL.
+				}
+				if( *this == T_null ) Error( T_no_endif );
+				return 0; // Ready to get the next token.
+			}
+			else if_nesting++;
+		}
+		else if( *this == "else" )  // Skip until #endif.
+		{
+			if( if_nesting == 0 ) Error( T_extra_else );
+			while( *this != T_null )
+			{
+				NextRawTok();
+				if( *this == T_directive && *this == "endif" ) break;
+			}
+			if( *this == T_null ) Error( T_no_endif );
+			if_nesting--;  // The #endif just consumed closes this conditional.
+			return 0; // Ready to get next token.
+		}
+		else if( *this == "endif" )
+		{
+			if( if_nesting == 0 ) Error( T_extra_endif );
+			if_nesting--;
+			return 0; // Ready to get next token.
+		}
+		else if( *this == "error" )
+		{
+			int line = Line();
+			NextTok(); // Allow macro substitution.
+			if( Line() == line )
+			{
+				fprintf( stderr, "(preprocessor, line %d) %s\n", line, Spelling() );
+				return 0; // Ready to get next token.
+			}
+			else
+			{
+				// If next token is on a different line; we went too far.
+				fprintf( stderr, "(null preprocessor message, line %d)\n", line );
+				return 1;  // Signal that we already have the next token.
+			}
+		}
+		return 0;  // Also reached for directive names not listed above; they are ignored.
+	}
+
+
+	/*-------------------------------------------------------------------------*
+	*  O p e r a t o r  + +                                                   *
+	*                                                                         *
+	*  Grab the next token from the current source file.  If at end of file,  *
+	*  pick up where we left off in the previous file.  If there is no        *
+	*  previous file, return "T_null".                                        *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Token & Token::operator++( )
+	{
+		if( put_back ) 
+		{
+			put_back = 0;  // Deliver the token that was put back; nothing is read.
+			return *this;
+		}
+
+		// Read until a real token turns up.  Directives are consumed here, and
+		// at the end of an include file we pop back to the including file.
+		for(;;)
+		{
+			NextTok();  
+			if( type == T_directive ) 
+			{
+				if( HandleDirective() ) break;
+			}
+			else if( type == T_null ) 
+			{
+				if( frame.source != NULL ) fclose( frame.source );
+				frame.source = NULL;  // Closed here, so ~TokFrame must not close it again.
+				if( !PopFrame() ) break;
+			}
+			else break;  // We have a real token.
+		}
+
+		// Now fill in the value fields if the token is a number. 
+		switch( type )
+		{
+		case T_integer : ivalue = atoi( spelling ); break;
+		case T_float   : fvalue = atof( spelling ); break;
+		case T_null    : if( if_nesting > 0 ) Error( T_no_endif ); break;
+		default        : break;
+		}
+
+		return *this;
+	}
+
+	Token & Token::operator++( int )  // Postfix increment.
+	{
+		fprintf( stderr, "Postfix increment is not implemented for the Token class.\n" );
+		return *this;  // Only the warning is printed; no token is read.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  T o k e n   Push & Pop Frame                                           *
+	*                                                                         *
+	*  These functions are used to create and destroy the context "frames"    *
+	*  that are used to handle nested files (via "include").                  *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	void Token::PushFrame( FILE *fp, char *fname )
+	{
+		// Save the state of the current (including) file in a new stack node.
+		TokFrame *n = new TokFrame;
+		*n = frame;
+
+		// Now overwrite the top-level frame with the state of the new file.
+		char *copy   = strdup( fname );  // Duplicate before freeing, in case the two alias.
+		free( frame.fname );             // "n" made its own copy; this one would otherwise leak.
+		frame.next   = n;
+		frame.source = fp;
+		frame.line   = 1;
+		frame.column = 0;
+		frame.fname  = copy;
+		pushed       = NullChar;  // A pending put-back character belonged to the old file.
+	}
+
+	int Token::PopFrame()
+	{
+		TokFrame *old = frame.next;
+		if( old == NULL ) return 0;      // Already at the outermost file.
+		frame = *old;                    // Resume the including file where it left off.
+		old->source = NULL; delete old;  // "frame" now holds that stream; ~TokFrame must not close it.
+		return 1;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  Miscellaneous Functions                                                *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	void Token::Init()
+	{
+		case_sensitive = 1;
+		put_back       = 0;
+		pushed         = NullChar;
+		if_nesting     = 0;
+		frame.source   = NULL;
+		frame.next     = NULL;
+		frame.fname    = NULL;
+		first          = NULL;  // Empty include-path list.
+		last           = NULL;
+		table          = NULL;  // The macro table is created on the first AddMacro().
+		length = 0;  type = T_null;  // Defined values for queries made before the first token is read.
+		SearchArgs();  // Search command-line args for macro definitions.
+	}
+
+	const char* Token::Spelling() const 
+	{ 
+		return spelling;  // The token's own buffer; it is overwritten when the next token is read.
+	}
+
+	char Token::Char() const 
+	{ 
+		return spelling[0];  // First character of the current spelling.
+	}
+
+	const char* Token::FileName() const
+	{ 
+		// Name of the file being read, or "" when the frame has none; never NULL.
+		static const char *null_string = "";  // const: a literal may not be bound to a plain char*.
+		return ( frame.fname == NULL ) ? null_string : frame.fname;
+	}
+
+	float Token::Fvalue() const
+	{
+		// Value of a numeric token as a float; any other token yields zero.
+		if( type == T_float   ) return fvalue;
+		if( type == T_integer ) return ivalue;
+		return 0.0;
+	}
+
+	void Token::GetName( char *name, int max )
+	{
+		int c;
+		for( int i = 1; i < max; i++ )  // At most max-1 characters, leaving room for the terminator.
+		{
+			if( NonWhite(c) == '>' )  // Read via NonWhite(), so blanks and comments are skipped.
+			{ 
+				*name = NullChar; 
+				return; 
+			}
+			*name++ = c;  // NOTE(review): EOF is not checked; it would be stored like any other character.
+		}
+		Error( T_name_too_long );  // No '>' found within the buffer size.
+	}
+
+	void Token::AddPath( const char *new_path )
+	{
+		char *name = strdup( new_path );  // The list keeps its own copy of the string.
+		if( name == NULL ) return;        // Out of memory: the path is silently dropped.
+		TokPath *p = new TokPath;
+		p->next = NULL;
+		p->path = name;
+		if( first == NULL ) first = p;    // Append at the tail, so paths are
+		else last->next = p;              // searched in the order they were added.
+		last = p;
+	}
+
+	void Token::ClearPaths()
+	{
+		TokPath *p = first;
+		while( p != NULL )
+		{
+			TokPath *q = p->next;  // Fetch the link before the node is destroyed.
+			free( p->path );       // The string came from strdup() in AddPath(), so free() it.
+			delete   p;            // delete the path structure.
+			p = q;
+		}
+		first = NULL;
+		last  = NULL;
+	}
+
+	FILE *Token::ResolveName( const char *name )
+	{
+		char resolved[128];  // Holds "path/name" for the candidate being tried.
+		for( const TokPath *p = first; p != NULL; p = p->next )
+		{
+			if( strlen( p->path ) + strlen( name ) + 2 > sizeof(resolved) ) continue;  // Would overflow.
+			strcat( strcpy( resolved, p->path ), "/" );
+			strcat( resolved, name );
+			FILE *fp = fopen( resolved, "r" );
+			if( fp != NULL ) return fp;  // The first path that has the file wins.
+		}
+		Error( T_file_not_found, name );  // Fatal: Error() does not return.
+		return NULL;
+	}
+
+	void Token::CaseSensitive( int on_off = 1 )  // NOTE(review): default repeated here; confirm it is not also in Token.h.
+	{ 
+		case_sensitive = on_off;  // Non-zero makes the == comparisons exact; zero folds case.
+	}
+
+	void Token::Debug( FILE *fp ) 
+	{ 
+		debug = fp;  // Static member: applies to every Token; NULL turns logging off.
+	}
+
+	void Token::AddMacro( const char *macro, const char *repl, TokType t )
+	{
+		if( table == NULL ) // Create and initialize the table.
+		{
+			table = new TokMacroPtr[ HashConst ];
+			for( int j = 0; j < HashConst; j++ ) table[j] = NULL;
+		}
+		int i = HashName( macro );  // Bucket for this name.
+		TokMacro *m = new TokMacro;
+		m->next   = table[i];        // Insert at the head of the chain, so a
+		m->macro  = strdup( macro ); // later definition of the same name is
+		m->repl   = strdup( repl  ); // the one MacroLookup() finds first.
+		m->type   = t;
+		table[i]  = m;
+	}
+
+	void Token::Args( int argc_, char *argv_[] )
+	{
+		argc = argc_;  // Set the static variables.
+		argv = argv_;  // Only the pointer is kept; the strings are not copied.
+	}
+
+	void Token::SearchArgs( )
+	{
+		// Each "-macro NAME REPL" triple on the recorded command line defines a macro.
+		int i = 1;
+		while( i < argc )
+		{
+			if( strcmp( argv[i], "-macro" ) != 0 ) { i++; continue; }
+			if( i+2 >= argc )
+			{
+				fprintf( stderr, "(Token) ERROR macro argument(s) missing\n" );
+				return;
+			}
+			char *macro = argv[i+1];
+			char *repl  = argv[i+2];
+			TokType kind;  // Token type given to the replacement text.
+			if     ( isAlpha  ( repl[0] ) ) kind = T_ident;
+			else if( isInteger( repl    ) ) kind = T_integer;
+			else                            kind = T_float;
+			AddMacro( macro, repl, kind );
+			i += 3;  // Step past the flag and its two arguments.
+		}
+	}
+};
diff --git a/src/nvtt/bc6h/arvo/Token.h b/src/nvtt/bc6h/arvo/Token.h
new file mode 100755
index 0000000..eabdacc
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Token.h
@@ -0,0 +1,203 @@
+/***************************************************************************
+* Token.h                                                                  *
+*                                                                          *
+* The Token class encapsulates a lexical analyzer for C++-like syntax.     *
+* A token instance is associated with one or more text files, and          *
+* grabs C++ tokens from them sequentially.  There are many member          *
+* functions designed to make parsing easy, such as "==" operators for      *
+* strings and characters, and automatic conversion of numeric tokens       *
+* into numeric values.                                                     *
+*                                                                          *
+* Files can be nested via #include directives, and both styles of C++      *
+* comments are supported.                                                  *
+*                                                                          *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    10/05/99    Fixed bug in TokFrame string allocation.        *
+*      arvo    01/15/95    Added ifdef, ifndef, else, and endif.           *
+*      arvo    02/13/94    Added Debug() member function.                  *
+*      arvo    01/22/94    Several sections rewritten.                     *
+*      arvo    06/19/93    Converted to C++                                *
+*      arvo    07/15/89    Rewritten for scene description parser.         *
+*      arvo    01/22/89    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __TOKEN_INCLUDED__
+#define __TOKEN_INCLUDED__
+
+#include <iostream>
+#include <stdio.h>
+
+namespace ArvoMath {
+
+	const int MaxTokenLen = 128;  // Size of the Token::spelling and Token::tempbuff arrays.
+
+	enum TokType {        // Token classes; values are consecutive from zero.
+		T_null      = 0,
+		T_char      = 1,  // A string of length 1.
+		T_string    = 2,
+		T_integer   = 3,
+		T_float     = 4,
+		T_ident     = 5,
+		T_other     = 6,
+		T_numeric   = 7,  // Either T_float or T_integer (use with == operator).
+		T_directive = 8,  // Directives like #include are not returned to the user.
+		T_nullmacro = 9
+	};
+
+	typedef enum {
+		T_malformed_float,
+		T_unterm_string,
+		T_unterm_comment,
+		T_file_not_found,
+		T_unknown_directive,
+		T_string_expected,
+		T_putback_error,
+		T_name_too_long,
+		T_no_endif,
+		T_extra_endif,
+		T_extra_else
+	} TokError;
+
+	class TokFrame {  // Per-file lexing state: stream, file name and position; chained via next.
+	public:
+		TokFrame();
+		// Define every member before delegating, so operator= never sees indeterminate pointers.
+		TokFrame( const TokFrame &frame ) : next( NULL ), source( NULL ), fname( NULL ), line( 0 ), column( 0 ) { *this = frame; }
+		~TokFrame();
+		void operator=( const TokFrame & );
+		TokFrame *next;
+		FILE     *source;
+		char     *fname;
+		int       line;    
+		int       column;  
+	};
+
+	struct TokPath {    // Node in Token's singly linked list of search paths.
+		char    *path;  // Directory; ResolveName() appends "/" and the file name to it.
+		TokPath *next;  // Next path to try, or NULL at the end of the list.
+	};
+
+	struct TokMacro {     // One macro definition stored in Token's hash table.
+		char     *macro;  // Macro name (a strdup() copy made by Token::AddMacro).
+		char     *repl;   // Replacement text (also a strdup() copy).
+		TokType   type;   // Token type supplied along with the replacement.
+		TokMacro *next;   // Next entry in the same hash bucket.
+	};
+
+	class Token {
+
+	public:
+		// Constructors and destructor.
+
+		Token();
+		Token( const char *file_name );
+		Token( FILE *file_pointer    );
+		~Token();
+
+		// Const member functions for querying token information.
+
+		TokType Type()    const { return type;       }  // The type of token found.
+		int     Len()     const { return length;     }  // The length of the token.
+		int     Line()    const { return frame.line; }  // The line it was found on.
+		int     Column()  const { return Tcolumn;    }  // The column it began in.
+		long    Ivalue()  const { return ivalue;     }  // Token value if an integer.
+		float   Fvalue()  const;                        // Token value if int or float.
+		char    Char()    const;                        // The token (if Len() == 1).
+
+		// Operators.
+
+		int     operator == ( const char* ) const;      // 1 if strings match.
+		int     operator != ( const char* ) const;      // 0 if strings match.
+		int     operator == ( char        ) const;      // 1 if token is this char.
+		int     operator != ( char        ) const;      // 0 if token is this char.
+		int     operator == ( TokType     ) const;      // 1 if token is of this type.
+		int     operator != ( TokType     ) const;      // 0 if token is of this type.
+		Token & operator ++ (             );            // (prefix) Get the next token.
+		Token & operator -- (             );            // (prefix) Put back one token.
+		Token & operator ++ ( int         );            // (postfix) Undefined.
+		Token & operator -- ( int         );            // (postfix) Undefined.
+
+		// State-setting member functions.
+
+		void Open( FILE * );                            // Read already opened file.
+		void Open( const char * );                      // Open the named file.
+		void CaseSensitive( int on_off );               // Applies to == and != operators.
+		void AddPath( const char * );                   // Adds path for <...> includes.
+		void ClearPaths();                              // Remove all search paths.
+
+		// Miscellaneous.
+
+		const char* Spelling() const;                   // The token itself.
+		const char* FileName() const;                   // Current file being lexed.
+		static void Debug( FILE * );                    // Write all token streams to a file.
+		static void Args ( int argc, char *argv[] );    // Search args for macro settings.
+		void AddMacro( const char*, const char*, TokType type );  // Define macro (name, replacement, type).
+		void SearchArgs();                              // Add a macro for each "-macro name repl" in the saved args.
+
+	private:
+
+		// Private member functions.
+
+		void     Init();
+		int      Getc ( int & );
+		void     Unget( int c ) { pushed = c; }        // Remembers a single character in "pushed".
+		void     Error( TokError error, const char *name = NULL );
+		int      NonWhite( int & );
+		int      HandleDirective();
+		int      NextRawTok();  // No macro substitutions.
+		int      NextTok();
+		void     PushFrame( FILE *fp, char *fname = NULL );
+		int      PopFrame();
+		void     GetName( char *name, int max );
+		FILE     *ResolveName( const char *name );     // Tries each search path in turn; NULL on failure.
+		TokMacro *MacroLookup( const char *str ) const;
+		int      MacroReplace( char *str, int &length, TokType &type ) const;
+
+		// Private data members.
+
+		TokPath  *first;            // Head of the search-path list walked by ResolveName().
+		TokPath  *last;             // Presumably the tail of that list (reset together with first).
+		TokMacro **table;           // Macro hash table; allocated on the first AddMacro() call.
+		TokFrame frame;             // Line() reports frame.line.
+		TokType  type;              // Returned by Type().
+		long     ivalue;            // Returned by Ivalue().
+		float    fvalue;  
+		int      length;            // Returned by Len().
+		int      Tcolumn;           // Returned by Column().
+		int      put_back;    
+		int      case_sensitive;    // Set by CaseSensitive().
+		int      pushed;            // Character stored by Unget().
+		int      if_nesting;
+		char     spelling[ MaxTokenLen ];
+		char     tempbuff[ MaxTokenLen ];
+
+		// Static data members.
+
+		static int  argc;           // Set by Args().
+		static char **argv;         // Set by Args(); the strings themselves are not copied.
+		static FILE *debug;         // Set by Debug().
+	};
+
+
+	// Predicate-style functions for testing token types.
+
+	inline int Null   ( const Token &t ) { return t.Type() == T_null;    }  // Stored type is T_null.
+	inline int Numeric( const Token &t ) { return t == T_numeric;        }  // Via Token::operator==: T_numeric is documented as "use with == operator".
+	inline int StringP( const Token &t ) { return t.Type() == T_string;  }  // Stored type is T_string.
+};
+#endif
diff --git a/src/nvtt/bc6h/arvo/Vec2.cpp b/src/nvtt/bc6h/arvo/Vec2.cpp
new file mode 100755
index 0000000..cca6723
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Vec2.cpp
@@ -0,0 +1,94 @@
+/***************************************************************************
+* Vec2.C                                                                   *
+*                                                                          *
+* Basic operations on 2-dimensional vectors.  This special case is useful  *
+* because nearly all operations are performed inline.                      *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    05/22/98    Added TimedVec2, extending Vec2.                *
+*      arvo    06/17/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <math.h>
+#include "ArvoMath.h"
+#include "Vec2.h"
+#include "form.h"
+
+namespace ArvoMath {
+
+	const Vec2 Vec2::Zero;           // Default-constructed, i.e. (0, 0).
+	const Vec2 Vec2::Xaxis( 1, 0 );  // Unit vector along the x axis.
+	const Vec2 Vec2::Yaxis( 0, 1 );  // Unit vector along the y axis.
+
+	// Most routines are now inline.
+
+	float Normalize( Vec2 &A )
+	{
+		// Scale A to unit length in place and hand back the length it had on
+		// entry.  A vector whose length is exactly zero is left untouched.
+		const float length = Len( A );
+		if( length == 0.0 ) return length;
+		A.X() /= length;
+		A.Y() /= length;
+		return length;
+	}
+
+	Vec2 Min( const Vec2 &A, const Vec2 &B )  // Componentwise: applies the scalar Min to each coordinate pair.
+	{
+		return Vec2( Min( A.X(), B.X() ), Min( A.Y(), B.Y() ) );
+	}
+
+	Vec2 Max( const Vec2 &A, const Vec2 &B )  // Componentwise: applies the scalar Max to each coordinate pair.
+	{
+		return Vec2( Max( A.X(), B.X() ), Max( A.Y(), B.Y() ) );
+	}
+
+	std::ostream &operator<<( std::ostream &out, const Vec2 &A )  // Writes "x y" with a trailing newline.
+	{
+		out << form( " %9.5f %9.5f\n", A.X(), A.Y() );  // The newline is part of the format string.
+		return out;
+	}
+
+	std::ostream &operator<<( std::ostream &out, const Mat2x2 &M )  // Writes M as two rows, then a blank line.
+	{
+		// One form() per statement: if form() reuses a buffer, chained calls could print a row twice.
+		out << form( " %9.5f %9.5f\n", M(0,0), M(0,1) );
+		out << form( " %9.5f %9.5f\n", M(1,0), M(1,1) ) << std::endl;
+		return out;
+	}
+
+	Mat2x2::Mat2x2( const Vec2 &c1, const Vec2 &c2 )  // Build the matrix whose columns are c1 and c2.
+	{
+		// Set() takes the entries row by row, so the two columns interleave:
+		// row 0 = ( c1.X, c2.X ) and row 1 = ( c1.Y, c2.Y ).
+		Set( c1.X(), c2.X(),
+		     c1.Y(), c2.Y() );
+	}
+
+	// Return solution x of the system Ax = b.
+	Vec2 Solve( const Mat2x2 &A, const Vec2 &b )
+	{
+		// x = adj(A) b / det(A); Vec2::Zero is returned when A is zero or its
+		// determinant is negligible relative to Norm1( A ).
+		const float  eps   = MachineEpsilon();
+		const double delta = det( A );
+		const double scale = Norm1( A );
+		if( scale <= eps || Abs( delta ) <= eps * scale ) return Vec2::Zero;
+		const Vec2 adj_b(  A(1,1) * b.X() - A(0,1) * b.Y(),
+		                  -A(1,0) * b.X() + A(0,0) * b.Y() );
+		return adj_b / delta;
+	}
+};
diff --git a/src/nvtt/bc6h/arvo/Vec2.h b/src/nvtt/bc6h/arvo/Vec2.h
new file mode 100755
index 0000000..7aca458
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Vec2.h
@@ -0,0 +1,358 @@
+/***************************************************************************
+* Vec2.h                                                                   *
+*                                                                          *
+* Basic operations on 2-dimensional vectors.  This special case is useful  *
+* because nearly all operations are performed inline.                      *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    05/22/98    Added TimedVec2, extending Vec2.                *
+*      arvo    06/17/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __VEC2_INCLUDED__
+#define __VEC2_INCLUDED__
+
+#include <math.h>
+#include <iostream>
+#include "ArvoMath.h"
+
+namespace ArvoMath {
+
+	class Vec2;       // 2-D floating-point vector (two float coordinates).
+	class TimedVec2;  // Vec2 plus a long time stamp.
+	class Mat2x2;     // 2x2 floating-point matrix, indexed as (row, column).
+
+	class Vec2 {  // 2-D vector of floats with accessor, indexing and Set() helpers.
+	public:
+		Vec2(                  ) { x = 0.0;   y = 0.0;   }  // The zero vector.
+		Vec2( float a, float b ) { x = a;     y = b;     }
+		Vec2( const Vec2 &A    ) { x = A.X(); y = A.Y(); }
+		~Vec2() {}
+		Vec2 &operator=( float s       ) { return Set(     s,     s ); }  // Both coordinates become s.
+		Vec2 &operator=( const Vec2 &A ) { return Set( A.X(), A.Y() ); }
+		float  X() const { return x; }
+		float  Y() const { return y; }
+		float &X()       { return x; }
+		float &Y()       { return y; }
+		float  operator[]( int i ) const { return i == 0 ? x : y; }  // Index 0 is x, 1 is y; no pointer
+		float &operator[]( int i )       { return i == 0 ? x : y; }  // arithmetic across separate members.
+		Vec2  &Set( float a, float b ) { x = a; y = b; return *this; }
+		Vec2  &Set( const Vec2 &A    ) { return Set( A.X(), A.Y() ); }
+		// Shared constants; only declared here, the definitions live in Vec2.cpp.
+		static const Vec2 Zero;
+		static const Vec2 Xaxis;
+		static const Vec2 Yaxis;
+	protected:
+		float x, y;
+	};
+
+	// This class simply adds a time field to the Vec2 class so that time-stamped
+	// coordinates can be easily inserted into objects such as Polylines.
+
+	class TimedVec2 : public Vec2 {  // A Vec2 that also carries a long time stamp.
+	public:
+		TimedVec2() : time( 0 ) {}
+		TimedVec2( const Vec2 &p   , long u = 0 ) : Vec2( p    ), time( u ) {}
+		TimedVec2( float x, float y, long u = 0 ) : Vec2( x, y ), time( u ) {}
+		~TimedVec2() {}
+		Vec2 &Coord()       { return *this; }  // The coordinate part, by reference.
+		Vec2  Coord() const { return *this; }  // A copy of the coordinate part.
+		long  Time () const { return  time; }
+		void  SetTime( long u ) { time = u; }
+	protected:
+		long time;
+	};
+
+	class Mat2x2 {  // 2x2 matrix of floats stored as m[row][column].
+	public:
+		Mat2x2( ) { m[0][0] = m[0][1] = m[1][0] = m[1][1] = 0; }  // The zero matrix.
+		Mat2x2( float a, float b, float c, float d ) { m[0][0] = a; m[0][1] = b; m[1][0] = c; m[1][1] = d; }
+		Mat2x2( const Vec2 &c1, const Vec2 &c2 );  // c1 and c2 become the two columns.
+		~Mat2x2( ) {}
+		Mat2x2 &operator*=( float scale );
+		Mat2x2  operator* ( float scale ) const;
+		void Set( float a, float b, float c, float d )  // Entries are given row by row.
+		{ m[0][0] = a; m[0][1] = b; m[1][0] = c; m[1][1] = d; }
+		float  operator()( int i, int j ) const { return m[i][j]; }  // Row i, column j; unchecked.
+		float &operator()( int i, int j )       { return m[i][j]; }
+	private:
+		float m[2][2];
+	};
+
+
+	//==========================================
+	//===  Miscellaneous external functions  ===                        
+	//==========================================
+
+	extern float Normalize( Vec2 &A );                  // Scales A to unit length in place; returns its prior length.
+	extern Vec2  Min ( const Vec2 &A, const Vec2 &B );  // Componentwise minimum.
+	extern Vec2  Max ( const Vec2 &A, const Vec2 &B );  // Componentwise maximum.
+
+
+	//==========================================
+	//===  Norm-related functions           ===                        
+	//==========================================
+
+	inline double LenSqr ( const Vec2 &A ) { return Sqr(A[0]) + Sqr(A[1]); }        // Sum of the squared coordinates.
+	inline double Len    ( const Vec2 &A ) { return sqrt( LenSqr( A ) ); }          // Euclidean length.
+	inline double OneNorm( const Vec2 &A ) { return Abs( A.X() ) + Abs( A.Y() ); }  // Sum of absolute values.
+	inline double TwoNorm( const Vec2 &A ) { return Len(A); }                       // Same value as Len().
+	inline float  SupNorm( const Vec2 &A ) { return MaxAbs( A.X(), A.Y() ); }       // Presumably max(|x|,|y|); MaxAbs is defined elsewhere.
+
+
+	//==========================================
+	//===  Addition                          ===                        
+	//==========================================
+
+	inline Vec2 operator+( const Vec2 &A, const Vec2 &B )  // Componentwise sum as a new vector.
+	{
+		return Vec2( A.X() + B.X(), A.Y() + B.Y() );
+	}
+
+	inline Vec2& operator+=( Vec2 &A, const Vec2 &B )  // Adds B to A in place and returns A.
+	{
+		A.X() += B.X();
+		A.Y() += B.Y();
+		return A;
+	}
+
+
+	//==========================================
+	//===  Subtraction                       ===                        
+	//==========================================
+
+	inline Vec2 operator-( const Vec2 &A, const Vec2 &B )  // Componentwise difference as a new vector.
+	{
+		return Vec2( A.X() - B.X(), A.Y() - B.Y() );
+	}
+
+	inline Vec2 operator-( const Vec2 &A )  // Unary minus: both coordinates negated.
+	{
+		return Vec2( -A.X(), -A.Y() );
+	}
+
+	inline Vec2& operator-=( Vec2 &A, const Vec2 &B )  // Subtracts B from A in place and returns A.
+	{
+		A.X() -= B.X();
+		A.Y() -= B.Y();
+		return A;
+	}
+
+
+	//==========================================
+	//===  Multiplication                    ===                        
+	//==========================================
+
+	inline Vec2 operator*( float c, const Vec2 &A )  // Scalar times vector.
+	{
+		return Vec2( c * A.X(), c * A.Y() );
+	}
+
+	inline Vec2 operator*( const Vec2 &A, float c )  // Vector times scalar; same result as c * A.
+	{
+		return Vec2( c * A.X(), c * A.Y() );
+	}
+
+	inline float operator*( const Vec2 &A, const Vec2 &B )  // Inner (dot) product; operator| is an alias.
+	{
+		return A.X() * B.X() + A.Y() * B.Y();
+	}
+
+	inline Vec2& operator*=( Vec2 &A, float c )  // Scales A in place and returns A.
+	{
+		A.X() *= c;
+		A.Y() *= c;
+		return A;
+	}
+
+	//==========================================
+	//===  Division                          ===                        
+	//==========================================
+
+	inline Vec2 operator/( const Vec2 &A, float c )  // Divides each coordinate by c; c is not checked for zero.
+	{
+		return Vec2( A.X() / c, A.Y() / c );
+	}
+
+	inline Vec2 operator/( const Vec2 &A, const Vec2 &B )  // A minus its projection onto B (not a componentwise divide).
+	{
+		return A - B * (( A * B ) / LenSqr( B ));  // NOTE(review): a zero B makes this 0/0.
+	}
+
+
+	//==========================================
+	//===  Comparison                        ===                        
+	//==========================================
+
+	inline int operator==( const Vec2 &A, const Vec2 &B )  // Exact equality of both coordinates.
+	{ 
+		return A.X() == B.X() && A.Y() == B.Y(); 
+	}
+
+	inline int operator!=( const Vec2 &A, const Vec2 &B )  // True when either coordinate differs.
+	{ 
+		return A.X() != B.X() || A.Y() != B.Y(); 
+	}
+
+	inline int operator<=( const Vec2 &A, const Vec2 &B )  // Holds only if BOTH coordinates satisfy <=.
+	{ 
+		return A.X() <= B.X() && A.Y() <= B.Y(); 
+	}
+
+	inline int operator<( const Vec2 &A, const Vec2 &B )  // Both coordinates strictly less, so !(A < B) does not imply A >= B.
+	{ 
+		return A.X() < B.X() && A.Y() < B.Y(); 
+	}
+
+	inline int operator>=( const Vec2 &A, const Vec2 &B )  // Holds only if BOTH coordinates satisfy >=.
+	{ 
+		return A.X() >= B.X() && A.Y() >= B.Y(); 
+	}
+
+	inline int operator>( const Vec2 &A, const Vec2 &B )  // Both coordinates strictly greater.
+	{ 
+		return A.X() > B.X() && A.Y() > B.Y();
+	}
+
+	//==========================================
+	//===  Miscellaneous                     ===                        
+	//==========================================
+
+	inline float operator|( const Vec2 &A, const Vec2 &B )  // Inner product; forwards to operator*( Vec2, Vec2 ).
+	{
+		return A * B;
+	}
+
+	inline Vec2 Unit( const Vec2 &A )  // A scaled by 1/|A|; if the squared length is not positive the factor is that value itself.
+	{
+		const float sq = LenSqr( A );
+		const float factor = ( sq > 0.0 ) ? float( 1.0 / sqrt( sq ) ) : sq;
+		return factor * A;
+	}
+
+	inline Vec2 Unit( const Vec2 &A, float &len )
+	{
+		// Returns A divided by its length and stores that length in len.
+		// When the squared length is not positive, len is set to zero and
+		// A itself is returned.
+		const float sq = LenSqr( A );
+		if( !( sq > 0.0 ) ) { len = 0.0; return A; }
+
+		len = sqrt( sq );
+		return A / len;
+	}
+
+	inline Vec2 Unit( float x, float y )  // Convenience overload: Unit( Vec2( x, y ) ).
+	{
+		return Unit( Vec2( x, y ) );
+	}
+
+	inline double dist( const Vec2 &A, const Vec2 &B )  // Euclidean distance: the length of A - B.
+	{ 
+		return Len( A - B ); 
+	}
+
+	inline float operator^( const Vec2 &A, const Vec2 &B )  // Scalar 2-D cross product: determinant of the columns [A B].
+	{
+		return A.X() * B.Y() - A.Y() * B.X();
+	}
+
+	inline int Quadrant( const Vec2 &A )  // Quadrant 1..4, counter-clockwise; a zero coordinate counts as non-negative.
+	{
+		const bool right = ( A.X() >= 0.0 ), upper = ( A.Y() >= 0.0 );
+		return upper ? ( right ? 1 : 2 ) : ( right ? 4 : 3 );
+	}
+
+	inline Vec2 OrthogonalTo( const Vec2 &A ) // A vector orthogonal to that given: (-y, x), same length as A.
+	{
+		return Vec2( -A.Y(), A.X() );
+	}
+
+	inline Vec2 Interpolate( const Vec2 &A, const Vec2 &B, float t )
+	{
+		// Compute a point along the segment joining points A and B according to the
+		// normalized parameter t in [0,1]: t = 0 gives A, t = 1 gives B.  t is not clamped.
+		return ( 1.0 - t ) * A + t * B;
+	}
+
+	//==========================================
+	//===  Operations involving Matrices     ===                        
+	//==========================================
+
+	inline Mat2x2 Outer( const Vec2 &A, const Vec2 &B )  // Outer product.
+	{
+		// Entry (i,j) of the result is A[i] * B[j]; the four-argument
+		// constructor takes the entries row by row.
+		return Mat2x2(
+			A.X() * B.X(), A.X() * B.Y(),
+			A.Y() * B.X(), A.Y() * B.Y()
+			);
+	}
+
+	inline Vec2 operator*( const Mat2x2 &M, const Vec2 &A )  // Matrix-vector product, A taken as a column.
+	{
+		const float ax = A.X(), ay = A.Y();
+		const float top    = M(0,0) * ax + M(0,1) * ay;  // Row 0 of M dotted with A.
+		const float bottom = M(1,0) * ax + M(1,1) * ay;  // Row 1 of M dotted with A.
+		return Vec2( top, bottom );
+	}
+
+	inline Mat2x2 &Mat2x2::operator*=( float scale )  // Multiplies every entry by scale, in place.
+	{
+		for( int row = 0; row < 2; row++ )
+		{
+			for( int col = 0; col < 2; col++ ) m[row][col] *= scale;
+		}
+		return *this;
+	}
+
+	inline Mat2x2 Mat2x2::operator*( float scale ) const  // A scaled copy; *this is unchanged.
+	{
+		// Fill a fresh matrix entry by entry, in row order.
+		Mat2x2 scaled;
+		scaled.Set( scale * m[0][0], scale * m[0][1], scale * m[1][0], scale * m[1][1] );
+		return scaled;
+	}
+
+	inline Mat2x2 operator*( float scale, const Mat2x2 &M )  // Scalar on the left; forwards to M * scale.
+	{
+		return M * scale;
+	}
+
+	inline float Norm1( const Mat2x2 &A )  // NOTE(review): computes the largest absolute ROW sum, usually called the infinity norm.
+	{
+		return Max( Abs(A(0,0)) + Abs(A(0,1)), Abs(A(1,0)) + Abs(A(1,1)) );
+	}
+
+	inline double det( const Mat2x2 &A )  // Determinant: A(0,0) A(1,1) - A(1,0) A(0,1).
+	{
+		return A(0,0) * A(1,1) - A(1,0) * A(0,1);
+	}
+
+	extern Vec2 Solve(  // Return solution x of the system Ax = b.
+		const Mat2x2 &A,  // Coefficient matrix; Vec2::Zero is returned if it is judged singular.
+		const Vec2 &b     // Right-hand side.
+		);
+
+	//==========================================
+	//===  Output routines                   ===                        
+	//==========================================
+
+	extern std::ostream &operator<<( std::ostream &out, const Vec2   & );  // One line: the two coordinates.
+	extern std::ostream &operator<<( std::ostream &out, const Mat2x2 & );  // Two rows followed by a blank line.
+};
+#endif
diff --git a/src/nvtt/bc6h/arvo/Vec3.cpp b/src/nvtt/bc6h/arvo/Vec3.cpp
new file mode 100755
index 0000000..1033f84
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Vec3.cpp
@@ -0,0 +1,119 @@
+/***************************************************************************
+* Vec3.C                                                                   *
+*                                                                          *
+* Basic operations on 3-dimensional vectors.  This special case is useful  *
+* because many operations are performed inline.                            *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    10/27/94    Reorganized (removed Col & Row distinction).    *
+*      arvo    06/14/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1994, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <stdio.h>
+#include <math.h>
+#include "ArvoMath.h"
+#include "Vec3.h"
+#include "form.h"
+
+namespace ArvoMath {
+
+	float Normalize( Vec3 &A )
+	{
+		// Scale A to unit length in place; the length on entry is returned.
+		const float length = Len( A );
+		// Nothing is scaled unless the length is strictly positive.
+		if( !( length > 0.0 ) ) return length;
+		// Multiply by the reciprocal, computed once as a double.
+		const double inverse = 1.0 / length;
+		A.X() *= inverse;
+		A.Y() *= inverse;
+		A.Z() *= inverse;
+		return length;
+	}
+
+	double Angle( const Vec3 &A, const Vec3 &B )
+	{
+		// ArcCos of the normalized inner product; 0 if the product of squared lengths is not positive.
+		const double len_product_sq = LenSqr(A) * LenSqr(B);
+		return ( len_product_sq <= 0.0 ) ? 0.0 : ArcCos( (A * B) / sqrt(len_product_sq) );
+	}
+
+	/*-------------------------------------------------------------------------*
+	* O R T H O N O R M A L                                                   *
+	*                                                                         *
+	* On Input  A, B....: Two linearly independent 3-space vectors.           *
+	*                                                                         *
+	* On Return A.......: Unit vector pointing in original A direction.       *
+	*           B.......: Unit vector orthogonal to A and in subspace spanned *
+	*                     by original A and B vectors.                        *
+	*           C.......: Unit vector orthogonal to both A and B, chosen so   *
+	*                     that A-B-C forms a right-handed coordinate system.  *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Orthonormal( Vec3 &A, Vec3 &B, Vec3 &C )  // Returns 0 on success, 1 if A or the reduced B has zero length.
+	{
+		if( Normalize( A ) != 0.0 )
+		{
+			B /= A;  // Presumably removes B's component along A, as the header comment describes.
+			if( Normalize( B ) != 0.0 ) { C = A ^ B; return 0; }
+		}
+		return 1;
+	}
+
+	int Orthonormal( Vec3 &A, Vec3 &B )  // Two-vector form: returns 0 on success, 1 on a zero-length vector.
+	{
+		const bool a_ok = ( Normalize( A ) != 0.0 );
+		if( a_ok ) B /= A;  // B is left untouched when A could not be normalized.
+		const bool both_ok = a_ok && ( Normalize( B ) != 0.0 );
+		return both_ok ? 0 : 1;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* O R T H O G O N A L  T O                                                *
+	*                                                                         *
+	* Returns a vector that is orthogonal to A (but of arbitrary length).     *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Vec3 OrthogonalTo( const Vec3 &A )
+	{
+		const float half_sup = 0.5 * SupNorm( A );  // A coordinate at least this large is used as the pivot.
+		if( half_sup == 0.0 ) return Vec3( 1.0, 0.0, 0.0 );  // Fixed answer when SupNorm( A ) is zero.
+		if( Abs( A.X() ) >= half_sup ) return Vec3( -A.Y(),  A.X(),    0.0 );
+		if( Abs( A.Y() ) >= half_sup ) return Vec3(    0.0, -A.Z(),  A.Y() );
+		return Vec3(  A.Z(),    0.0, -A.X() );
+	}
+
+	Vec3 Min( const Vec3 &A, const Vec3 &B )  // Componentwise: the scalar Min of each coordinate pair.
+	{
+		return Vec3( 
+			Min( A.X(), B.X() ),
+			Min( A.Y(), B.Y() ),
+			Min( A.Z(), B.Z() ));
+	}
+
+	Vec3 Max( const Vec3 &A, const Vec3 &B )  // Componentwise: the scalar Max of each coordinate pair.
+	{
+		return Vec3( 
+			Max( A.X(), B.X() ),
+			Max( A.Y(), B.Y() ),
+			Max( A.Z(), B.Z() ));
+	}
+
+	std::ostream &operator<<( std::ostream &out, const Vec3 &A )  // Writes the three coordinates on one line.
+	{
+		out << form( " %9.5f %9.5f %9.5f", A.X(), A.Y(), A.Z() ) << std::endl;  // endl also flushes the stream.
+		return out;
+	}
+};
diff --git a/src/nvtt/bc6h/arvo/Vec3.h b/src/nvtt/bc6h/arvo/Vec3.h
new file mode 100755
index 0000000..b9d539f
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Vec3.h
@@ -0,0 +1,517 @@
+/***************************************************************************
+* Vec3.h                                                                   *
+*                                                                          *
+* Basic operations on 3-dimensional vectors.  This special case is useful  *
+* because many operations are performed inline.                            *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    10/27/94    Reorganized (removed Col & Row distinction).    *
+*      arvo    06/14/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1994, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __VEC3_INCLUDED__
+#define __VEC3_INCLUDED__
+
+#include <math.h>
+#include <iostream>
+#include "Vec2.h"
+
+namespace ArvoMath {
+
+	class Vec3 {  // Three float components x, y, z with accessor and index access.
+	public:
+		Vec3( float c = 0.0             ) { x =     c; y =     c; z =     c; }  // All components set to c.
+		Vec3( float a, float b, float c ) { x =     a; y =     b; z =     c; }
+		Vec3( const Vec3 &A             ) { x = A.X(); y = A.Y(); z = A.Z(); }
+		Vec3 & operator=( float c       ) { x =     c; y =     c; z =     c; return *this; }
+		Vec3 & operator=( const Vec3 &A ) { x = A.X(); y = A.Y(); z = A.Z(); return *this; }
+		Vec3 & operator=( const Vec2 &A ) { x = A.X(); y = A.Y(); z =   0.0; return *this; }  // z is zeroed.
+		~Vec3() {}
+		float   X() const { return x; }
+		float   Y() const { return y; }
+		float   Z() const { return z; }
+		float & X()       { return x; }  // Mutable access, used by the compound operators.
+		float & Y()       { return y; }
+		float & Z()       { return z; }
+		float   operator[]( int i ) const { return i == 0 ? x : ( i == 1 ? y : z ); }  // i is expected to be 0, 1 or 2.
+		float & operator[]( int i )       { return i == 0 ? x : ( i == 1 ? y : z ); }  // No pointer arithmetic across members.
+	private:
+		float x, y, z;
+	};
+
+	//class Mat3x3 {
+	//public:
+	//	inline Mat3x3( );
+	//	Mat3x3( const Mat3x3 &M ) { *this = M; }
+	//	Mat3x3( const Vec3 &, const Vec3 &, const Vec3 & );  // Three columns.
+	//	~Mat3x3( ) {}
+	//	float    operator()( int i, int j ) const { return m[i][j]; }
+	//	float  & operator()( int i, int j )       { return m[i][j]; }
+	//	Mat3x3 & operator=( float          );
+	//	Mat3x3 & operator=( const Mat3x3 & );
+	//	inline   void ScaleRows( float, float, float );
+	//	inline   void ScaleCols( float, float, float );
+	//	void     Col( int n, const Vec3 & );
+	//	const    float *Base() const { return &(m[0][0]); }
+	//private:
+	//	float m[3][3];
+	//};
+
+	//class Mat4x4 {
+	//public:
+	//	Mat4x4( );
+	//	Mat4x4( const Mat4x4 &M ) { *this = M; }
+	//	Mat4x4( const Mat3x3 &M ) ;
+	//	~Mat4x4( ) {}
+	//	float    operator()( int i, int j ) const { return m[i][j]; }
+	//	float  & operator()( int i, int j )       { return m[i][j]; }
+	//	Mat4x4 & operator=( float          );
+	//	Mat4x4 & operator=( const Mat4x4 & );
+	//	void     Row( int i, int j, const Vec3 & );
+	//	void     Col( int i, int j, const Vec3 & );
+	//	void     ScaleRows( float, float, float, float );
+	//	void     ScaleCols( float, float, float, float );
+	//	const    float *Base() const { return &(m[0][0]); }
+	//private:
+	//	float m[4][4];
+	//};
+
+
+	//==========================================
+	//===  External operators                ===                        
+	//==========================================
+
+	//extern Vec3     operator * ( const Mat4x4 &, const Vec3   & );
+	//extern Vec3     operator * ( const Vec3   &, const Mat4x4 & );
+	//extern Mat3x3   operator * (        float  , const Mat3x3 & );
+	//extern Mat3x3   operator * ( const Mat3x3 &,       float    );
+	//extern Mat3x3   operator / ( const Mat3x3 &,       double   );
+	//extern Mat3x3 & operator *=(       Mat3x3 &,       float    );
+	//extern Mat3x3 & operator *=(       Mat3x3 &, const Mat3x3 & );
+	//extern Mat3x3   operator * ( const Mat3x3 &, const Mat3x3 & );
+	//extern Mat3x3   operator + ( const Mat3x3 &, const Mat3x3 & );
+	//extern Mat3x3 & operator +=(       Mat3x3 &, const Mat3x3 & );
+	//extern Mat3x3   operator - ( const Mat3x3 &, const Mat3x3 & );
+	//extern Mat3x3 & operator -=(       Mat3x3 &, const Mat3x3 & );
+	//extern Mat4x4   operator * (        float  , const Mat4x4 & );
+	//extern Mat4x4   operator * ( const Mat4x4 &,       float    );
+	//extern Mat4x4   operator / ( const Mat4x4 &,       float    );
+	//extern Mat4x4 & operator *=(       Mat4x4 &,       float    );
+	//extern Mat4x4   operator * ( const Mat4x4 &, const Mat4x4 & );
+	//extern Mat4x4   operator + ( const Mat4x4 &, const Mat4x4 & );
+	//extern Mat4x4 & operator +=(       Mat4x4 &, const Mat4x4 & );
+	//extern Mat4x4   operator - ( const Mat4x4 &, const Mat4x4 & );
+	//extern Mat4x4 & operator -=(       Mat4x4 &, const Mat4x4 & );
+
+
+	//==========================================
+	//===  Miscellaneous external functions  ===                        
+	//==========================================
+
+	//extern Vec3   OrthogonalTo( const Vec3   & ); // A vector orthogonal to that given.
+	//extern Vec3   Min         ( const Vec3   &, const Vec3 &         );
+	//extern Vec3   Max         ( const Vec3   &, const Vec3 &         );
+	//extern double Angle       ( const Vec3   &, const Vec3 &         );
+	//extern int    Orthonormal (       Vec3   &,       Vec3 &         );
+	//extern int    Orthonormal (       Vec3   &,       Vec3 &, Vec3 & );
+	//extern float  Trace       ( const Mat3x3 & );
+	//extern float  Normalize   (       Vec3   & );
+	//extern float  Norm1       ( const Mat3x3 & );
+	//extern float  SupNorm     ( const Mat3x3 & );
+	//extern double Determinant ( const Mat3x3 & );
+	//extern Mat3x3 Transp      ( const Mat3x3 & );
+	//extern Mat3x3 Householder ( const Vec3   &, const Vec3 & );
+	//extern Mat3x3 Householder ( const Vec3   & );
+	//extern Mat3x3 Rotation3x3 (       float, float, float ); // Values in [0,1].
+	//extern Mat3x3 Inverse     ( const Mat3x3 & );
+	//extern Mat3x3 Diag3x3     ( const Vec3   & );
+	//extern Mat3x3 Diag3x3     (       float, float, float );
+	//extern Mat3x3 Rotation3x3 ( const Vec3   &Axis,                     float angle );
+	//extern Mat4x4 Rotation4x4 ( const Vec3   &Axis, const Vec3 &Origin, float angle );
+
+
+	//==========================================
+	//===      Norm-related functions        ===                        
+	//==========================================
+
+	inline double LenSqr ( const Vec3 &A ) { return Sqr(A.X()) + Sqr(A.Y()) + Sqr(A.Z()); }  // Squared Euclidean length.
+	inline double Len    ( const Vec3 &A ) { const double sq = LenSqr( A ); return Sqrt( sq ); }
+	inline double Norm1  ( const Vec3 &A ) { return Abs(A.X()) + Abs(A.Y()) + Abs(A.Z()); }  // Sum of absolute components.
+	inline double Norm2  ( const Vec3 &A ) { return Sqrt( LenSqr( A ) ); }                   // Same value as Len.
+	inline float  SupNorm( const Vec3 &A ) { return MaxAbs( A.X(), A.Y(), A.Z() ); }         // MaxAbs over the three components.
+
+
+	//==========================================
+	//===            Addition                ===                        
+	//==========================================
+
+	inline Vec3 operator+( const Vec3 &A, const Vec3 &B )
+	{
+		return Vec3( A[0] + B[0], A[1] + B[1], A[2] + B[2] );  // Component-wise sum.
+	}
+
+	inline Vec3& operator+=( Vec3 &A, const Vec3 &B )
+	{
+		// In-place component-wise addition; A is handed back so the
+		// result can take part in a larger expression.
+		for( int i = 0; i < 3; i++ ) A[i] += B[i];
+		return A;
+	}
+
+
+	//==========================================
+	//===            Subtraction             ===                        
+	//==========================================
+
+	inline Vec3 operator-( const Vec3 &A, const Vec3 &B )
+	{
+		return Vec3( A[0] - B[0], A[1] - B[1], A[2] - B[2] );  // Component-wise difference.
+	}
+
+	inline Vec3 operator-( const Vec3 &A )
+	{
+		return Vec3( -A[0], -A[1], -A[2] );  // Every component negated.
+	}
+
+	inline Vec3& operator-=( Vec3 &A, const Vec3 &B )
+	{
+		// In-place component-wise subtraction; A is handed back so the
+		// result can take part in a larger expression.
+		for( int i = 0; i < 3; i++ ) A[i] -= B[i];
+		return A;
+	}
+
+
+	//==========================================
+	//===         Multiplication             ===                        
+	//==========================================
+
+	inline Vec3 operator*( float a, const Vec3 &x )
+	{
+		return Vec3( a * x[0], a * x[1], a * x[2] );  // Scalar on the left: every component scaled by a.
+	}
+
+	inline Vec3 operator*( const Vec3 &x, float a )
+	{
+		return Vec3( a * x[0], a * x[1], a * x[2] );  // Scalar on the right: every component scaled by a.
+	}
+
+	inline float operator*( const Vec3 &A, const Vec3 &B )  // Inner product.
+	{
+		return A[0] * B[0] + A[1] * B[1] + A[2] * B[2];  // Sum of the pairwise component products.
+	}
+
+	inline Vec3& operator*=( Vec3 &A, float a )
+	{
+		// In-place scaling of every component by a; A is handed back
+		// so the result can take part in a larger expression.
+		for( int i = 0; i < 3; i++ ) A[i] *= a;
+		return A;
+	}
+
+	//inline Vec3& operator*=( Vec3 &A, const Mat3x3 &M )  // A = M * A
+	//{
+	//	float x = M(0,0) * A.X() + M(0,1) * A.Y() + M(0,2) * A.Z();
+	//	float y = M(1,0) * A.X() + M(1,1) * A.Y() + M(1,2) * A.Z();
+	//	float z = M(2,0) * A.X() + M(2,1) * A.Y() + M(2,2) * A.Z();
+	//	A.X() = x;
+	//	A.Y() = y;
+	//	A.Z() = z;
+	//	return A;
+	//}
+
+	//inline Vec3& operator*=( Vec3 &A, const Mat4x4 &M )  // A = M * A
+	//{
+	//	float x = M(0,0) * A.X() + M(0,1) * A.Y() + M(0,2) * A.Z() + M(0,3);
+	//	float y = M(1,0) * A.X() + M(1,1) * A.Y() + M(1,2) * A.Z() + M(1,3);
+	//	float z = M(2,0) * A.X() + M(2,1) * A.Y() + M(2,2) * A.Z() + M(2,3);
+	//	A.X() = x;
+	//	A.Y() = y;
+	//	A.Z() = z;
+	//	return A;
+	//}
+
+
+	//==========================================
+	//===             Division               ===                        
+	//==========================================
+
+	inline Vec3 operator/( const Vec3 &A, double c )
+	{
+		const double inv = 1.0 / c;  // One division, then three multiplications; c is not checked for zero.
+		return Vec3( A[0] * inv, A[1] * inv, A[2] * inv );
+	}
+
+	inline Vec3& operator/=( Vec3 &A, double a )
+	{
+		// In-place division of every component by a (a is not checked
+		// for zero); A is handed back for use in larger expressions.
+		for( int i = 0; i < 3; i++ ) A[i] /= a;
+		return A;
+	}
+
+	inline Vec3 operator/( const Vec3 &A, const Vec3 &B )  // Remove component parallel to B.
+	{
+		// A minus its projection onto B; A itself when |B|^2 is not positive.
+		const double len_sqr = LenSqr( B );
+		if( len_sqr > 0.0 ) return A - B * (( A * B ) / len_sqr);
+		return A;
+	}
+
+	inline void operator/=( Vec3 &A, const Vec3 &B ) // Remove component parallel to B.
+	{
+		const double len_sqr = LenSqr( B );  // A is left untouched when this is not positive.
+		if( len_sqr > 0.0 ) A = A - B * (( A * B ) / len_sqr);
+	}
+
+
+	//==========================================
+	//===          Miscellaneous             ===                        
+	//==========================================
+
+	inline float operator|( const Vec3 &A, const Vec3 &B )  // Inner product.
+	{
+		return A.X() * B.X() + A.Y() * B.Y() + A.Z() * B.Z();  // Same expression as operator*( Vec3, Vec3 ).
+	}
+
+	inline Vec3 Unit( const Vec3 &A )
+	{
+		const double len_sqr = LenSqr( A );  // A scaled to unit length, or the zero vector if |A|^2 is not positive.
+		if( len_sqr > 0.0 ) return A / sqrt( len_sqr ); else return Vec3( 0, 0, 0 );
+	}
+
+	inline Vec3 Unit( float x, float y, float z )
+	{
+		const Vec3 V( x, y, z );  return Unit( V );  // Convenience form taking the three components.
+	}
+
+	inline Vec3 Ortho( const Vec3 &A, const Vec3 &B )
+	{
+		const Vec3 Rejected = A / B;  return Unit( Rejected );  // Unit vector along the part of A not parallel to B.
+	}
+
+	inline int operator==( const Vec3 &A, float x )
+	{
+		return A.X() == x && A.Y() == x && A.Z() == x;  // Non-zero only if all three components equal x exactly.
+	}
+
+	inline Vec3 operator^( const Vec3 &A, const Vec3 &B )
+	{
+		const float cx = A.Y() * B.Z() - A.Z() * B.Y();  // Cross product A x B, one component per line.
+		const float cy = A.Z() * B.X() - A.X() * B.Z();
+		const float cz = A.X() * B.Y() - A.Y() * B.X();
+		return Vec3( cx, cy, cz );
+	}
+
+	inline double dist( const Vec3 &A, const Vec3 &B )
+	{
+		const Vec3 Delta = A - B;  return Len( Delta );  // Length of the difference vector.
+	}
+
+	inline double Dihedral( const Vec3 &A, const Vec3 &B, const Vec3 &C )
+	{
+		const float cosine = Unit( A ^ B ) * Unit( C ^ B );  return ArcCos( cosine );  // Angle between the unit normals A^B and C^B.
+	}
+
+	inline Vec3 operator>>( const Vec3 &A, const Vec3 &B )  // Project A onto B.
+	{
+		const double len_sqr = LenSqr( B );
+		// The zero vector is the result when |B|^2 is not positive.
+		if( !( len_sqr > 0.0 ) ) return Vec3();
+		return B * (( A * B ) / len_sqr);
+	}
+
+	inline Vec3 operator<<( const Vec3 &A, const Vec3 &B ) // Project B onto A.
+	{
+		return operator>>( B, A );  // Same projection with the operands swapped.
+	}
+
+	inline double Triple( const Vec3 &A, const Vec3 &B, const Vec3 &C )
+	{
+		const Vec3 Cross = A ^ B;  return Cross * C;  // Scalar triple product ( A x B ) . C.
+	}
+
+
+	//==========================================
+	//===  Operations involving Matrices     ===                        
+	//==========================================
+
+	//inline Mat3x3 Outer( const Vec3 &A, const Vec3 &B )  // Outer product.
+	//{
+	//	Mat3x3 C;
+	//	C(0,0) = A.X() * B.X();
+	//	C(0,1) = A.X() * B.Y();
+	//	C(0,2) = A.X() * B.Z();
+	//	C(1,0) = A.Y() * B.X();
+	//	C(1,1) = A.Y() * B.Y();
+	//	C(1,2) = A.Y() * B.Z();
+	//	C(2,0) = A.Z() * B.X();
+	//	C(2,1) = A.Z() * B.Y();
+	//	C(2,2) = A.Z() * B.Z();
+	//	return C;
+	//}
+
+	//inline Vec3 operator*( const Mat3x3 &M, const Vec3 &A )
+	//{
+	//	return Vec3(
+	//		M(0,0) * A[0] + M(0,1) * A[1] + M(0,2) * A[2],
+	//		M(1,0) * A[0] + M(1,1) * A[1] + M(1,2) * A[2],
+	//		M(2,0) * A[0] + M(2,1) * A[1] + M(2,2) * A[2]);
+	//}
+
+	//inline Vec3 operator*( const Vec3 &A, const Mat3x3 &M )
+	//{
+	//	return Vec3( 
+	//		A[0] * M(0,0) + A[1] * M(1,0) + A[2] * M(2,0),
+	//		A[0] * M(0,1) + A[1] * M(1,1) + A[2] * M(2,1),
+	//		A[0] * M(0,2) + A[1] * M(1,2) + A[2] * M(2,2));
+	//}
+
+	////==========================================
+	////===      Operations on Matrices        ===                        
+	////==========================================
+
+	//inline Mat3x3 operator+( const Mat3x3 &A, const Mat3x3 &B )
+	//{
+	//	Mat3x3 C;
+	//	C(0,0) = A(0,0) + B(0,0);  C(0,1) = A(0,1) + B(0,1);  C(0,2) = A(0,2) + B(0,2);
+	//	C(1,0) = A(1,0) + B(1,0);  C(1,1) = A(1,1) + B(1,1);  C(1,2) = A(1,2) + B(1,2);
+	//	C(2,0) = A(2,0) + B(2,0);  C(2,1) = A(2,1) + B(2,1);  C(2,2) = A(2,2) + B(2,2);
+	//	return C;
+	//}
+
+	//inline Mat3x3 operator-( const Mat3x3 &A, const Mat3x3 &B )
+	//{
+	//	Mat3x3 C;
+	//	C(0,0) = A(0,0) - B(0,0);  C(0,1) = A(0,1) - B(0,1);  C(0,2) = A(0,2) - B(0,2);
+	//	C(1,0) = A(1,0) - B(1,0);  C(1,1) = A(1,1) - B(1,1);  C(1,2) = A(1,2) - B(1,2);
+	//	C(2,0) = A(2,0) - B(2,0);  C(2,1) = A(2,1) - B(2,1);  C(2,2) = A(2,2) - B(2,2);
+	//	return C;
+	//}
+
+	//inline Mat3x3 operator*( const Mat3x3 &A, const Mat3x3 &B )
+	//{
+	//	Mat3x3 C;
+	//	C(0,0) = A(0,0) * B(0,0) + A(0,1) * B(1,0) + A(0,2) * B(2,0);
+	//	C(0,1) = A(0,0) * B(0,1) + A(0,1) * B(1,1) + A(0,2) * B(2,1);
+	//	C(0,2) = A(0,0) * B(0,2) + A(0,1) * B(1,2) + A(0,2) * B(2,2);
+	//	C(1,0) = A(1,0) * B(0,0) + A(1,1) * B(1,0) + A(1,2) * B(2,0);
+	//	C(1,1) = A(1,0) * B(0,1) + A(1,1) * B(1,1) + A(1,2) * B(2,1);
+	//	C(1,2) = A(1,0) * B(0,2) + A(1,1) * B(1,2) + A(1,2) * B(2,2);
+	//	C(2,0) = A(2,0) * B(0,0) + A(2,1) * B(1,0) + A(2,2) * B(2,0);
+	//	C(2,1) = A(2,0) * B(0,1) + A(2,1) * B(1,1) + A(2,2) * B(2,1);
+	//	C(2,2) = A(2,0) * B(0,2) + A(2,1) * B(1,2) + A(2,2) * B(2,2);
+	//	return C;
+	//}
+
+	//inline void Mat3x3::ScaleRows( float a, float b, float c )
+	//{
+	//	m[0][0] *= a;  m[0][1] *= a;  m[0][2] *= a;
+	//	m[1][0] *= b;  m[1][1] *= b;  m[1][2] *= b;
+	//	m[2][0] *= c;  m[2][1] *= c;  m[2][2] *= c;
+	//}
+
+	//inline void Mat3x3::ScaleCols( float a, float b, float c )
+	//{
+	//	m[0][0] *= a;  m[0][1] *= b;  m[0][2] *= c;
+	//	m[1][0] *= a;  m[1][1] *= b;  m[1][2] *= c;
+	//	m[2][0] *= a;  m[2][1] *= b;  m[2][2] *= c;
+	//}
+
+
+	//==========================================
+	//===       Special Matrices             ===                        
+	//==========================================
+
+	//inline Mat3x3::Mat3x3() 
+	//{
+	//	m[0][0] = 0;  m[0][1] = 0;  m[0][2] = 0;
+	//	m[1][0] = 0;  m[1][1] = 0;  m[1][2] = 0;
+	//	m[2][0] = 0;  m[2][1] = 0;  m[2][2] = 0; 
+	//}
+
+	//inline Mat3x3 Ident3x3()
+	//{
+	//	Mat3x3 I;
+	//	I(0,0) = 1.0;
+	//	I(1,1) = 1.0;
+	//	I(2,2) = 1.0;
+	//	return I;
+	//}
+
+	//inline Mat4x4 Ident4x4()
+	//{
+	//	Mat4x4 I;
+	//	I(0,0) = 1.0;
+	//	I(1,1) = 1.0;
+	//	I(2,2) = 1.0;
+	//	I(3,3) = 1.0;
+	//	return I;
+	//}
+
+	//inline void Adjoint( const Mat3x3 &M, Mat3x3 &A )
+	//{
+	//	A(0,0) = M(1,1) * M(2,2) - M(1,2) * M(2,1);
+	//	A(0,1) = M(1,2) * M(2,0) - M(1,0) * M(2,2);
+	//	A(0,2) = M(1,0) * M(2,1) - M(1,1) * M(2,0);
+
+	//	A(1,0) = M(0,2) * M(2,1) - M(0,1) * M(2,2);
+	//	A(1,1) = M(0,0) * M(2,2) - M(0,2) * M(2,0);
+	//	A(1,2) = M(0,1) * M(2,0) - M(0,0) * M(2,1);
+
+	//	A(2,0) = M(0,1) * M(1,2) - M(0,2) * M(1,1);
+	//	A(2,1) = M(0,2) * M(1,0) - M(0,0) * M(1,2);
+	//	A(2,2) = M(0,0) * M(1,1) - M(0,1) * M(1,0);
+	//}
+
+	//inline void TranspAdjoint( const Mat3x3 &M, Mat3x3 &A )
+	//{
+	//	A(0,0) = M(1,1) * M(2,2) - M(1,2) * M(2,1);
+	//	A(1,0) = M(1,2) * M(2,0) - M(1,0) * M(2,2);
+	//	A(2,0) = M(1,0) * M(2,1) - M(1,1) * M(2,0);
+
+	//	A(0,1) = M(0,2) * M(2,1) - M(0,1) * M(2,2);
+	//	A(1,1) = M(0,0) * M(2,2) - M(0,2) * M(2,0);
+	//	A(2,1) = M(0,1) * M(2,0) - M(0,0) * M(2,1);
+
+	//	A(0,2) = M(0,1) * M(1,2) - M(0,2) * M(1,1);
+	//	A(1,2) = M(0,2) * M(1,0) - M(0,0) * M(1,2);
+	//	A(2,2) = M(0,0) * M(1,1) - M(0,1) * M(1,0);
+	//}
+
+	//inline void Adjoint( const Mat3x3 &M, Mat3x3 &A, double &det )
+	//{
+	//	Adjoint( M, A );
+	//	det = A(0,0) * M(0,0) + A(1,0) * M(1,0) + A(2,0) * M(2,0);
+	//}
+
+	//inline void TranspAdjoint( const Mat3x3 &M, Mat3x3 &A, double &det )
+	//{
+	//	TranspAdjoint( M, A );
+	//	det = A(0,0) * M(0,0) + A(0,1) * M(1,0) + A(0,2) * M(2,0);
+	//}
+
+
+	//==========================================
+	//===  Output routines                   ===                        
+	//==========================================
+
+	extern std::ostream &operator<<( std::ostream &out, const Vec3   & );  // Defined in Vec3.cpp.
+	//extern std::ostream &operator<<( std::ostream &out, const Mat3x3 & );
+	//extern std::ostream &operator<<( std::ostream &out, const Mat4x4 & );
+};
+#endif
diff --git a/src/nvtt/bc6h/arvo/Vector.cpp b/src/nvtt/bc6h/arvo/Vector.cpp
new file mode 100755
index 0000000..af3bc11
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Vector.cpp
@@ -0,0 +1,366 @@
+/***************************************************************************
+* Vector.C                                                                 *
+*                                                                          *
+* General Vector and Matrix classes, with all the associated methods.      *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    08/16/2000    Revamped for CIT tools.                       *
+*      arvo    10/31/1994    Combined RowVec & ColVec into Vector.         *
+*      arvo    06/30/1993    Added singular value decomposition class.     *
+*      arvo    06/25/1993    Major revisions.                              *
+*      arvo    09/08/1991    Initial implementation.                       *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <iostream>
+#include <assert.h>
+#include "ArvoMath.h"
+#include "Vector.h"
+#include "form.h"
+
+namespace ArvoMath {
+
+	const Vector Vector::Null(0);  // Static member constructed with argument 0; operator<< prints size-0 vectors as "NULL".
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  C O N S T R U C T O R S                                                *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Vector::Vector( const float *x, int n )
+	{
+		Create( n );  // Create is relied on to set up elem and size for n entries.
+		for( int i = 0; i < size; i++ ) elem[i] = x[i];  // Copy the caller's array; x must hold at least size floats.
+	}
+
+	Vector::Vector( const Vector &A )
+	{
+		Create( A.Size() );  // Create is relied on to set up elem for A.Size() entries.
+		for( int i = 0; i < A.Size(); i++ ) elem[i] = A(i);  // Copy A element by element.
+	}
+
+	Vector::Vector( int n )
+	{
+		Create( n );  // Create is relied on to set up elem for n entries.
+		for( int i = 0; i < n; i++ ) elem[i] = 0.0;  // A freshly sized vector starts out as all zeros.
+	}
+
+	Vector::Vector( float x, float y )
+	{
+		// Two-element vector holding ( x, y ).
+		Create( 2 );
+		elem[1] = y;  elem[0] = x;
+	}
+
+	Vector::Vector( float x, float y, float z )
+	{
+		// Three-element vector holding ( x, y, z ).
+		Create( 3 );
+		const float init[3] = { x, y, z };
+		for( int k = 0; k < 3; k++ ) elem[k] = init[k];
+	}
+
+	void Vector::SetSize( int new_size )
+	{
+		// Resizing discards the old contents: the storage is released,
+		// recreated at the new length and zero-filled.  Same size: no-op.
+		if( size == new_size ) return;
+		delete[] elem;
+		Create( new_size );
+		for( int i = 0; i < new_size; i++ ) elem[i] = 0.0;
+	}
+
+	Vector &Vector::Swap( int i, int j )
+	{
+		const float held = elem[j];   // Exchange elements i and j; the indices are not range-checked.
+		elem[j]          = elem[i];
+		elem[i]          = held;
+		return *this;
+	}
+
+	Vector Vector::GetBlock( int i, int j ) const
+	{
+		assert( 0 <= i && i <= j && j < size );  // Inclusive index range [i, j] inside this vector.
+		int n = j - i + 1;                       // Number of elements in the block.
+		Vector V( n );
+		float       *v = V.Array();              // Destination: start of the new vector.
+		const float *e = elem + i;               // Source: element i of this vector (read only).
+		for( int k = 0; k < n; k++ ) *v++ = *e++;
+		return V;
+	}
+
+	void Vector::SetBlock( int i, int j, const Vector &V )
+	{
+		assert( 0 <= i && i <= j && j < size );  // Inclusive index range [i, j] inside this vector.
+		int n = j - i + 1;
+		assert( n == V.Size() );                 // V must supply exactly one value per slot.
+		const float *v = V.Array();              // Source: V is only read.
+		float       *e = elem + i;               // Destination: element i of this vector.
+		for( int k = 0; k < n; k++ ) *e++ = *v++;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  O P E R A T O R S                                                      *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	double operator*( const Vector &A, const Vector &B )  // Inner product; 0.0 for empty vectors.
+	{
+		assert( A.Size() == B.Size() );
+		double sum = ( A.Size() > 0 ) ? A(0) * B(0) : 0.0;  // Element 0 is only read when it exists.
+		for( int i = 1; i < A.Size(); i++ ) sum += A(i) * B(i);
+		return sum;
+	}
+
+	void Vector::operator=( float c )  // Fill: every element becomes c, the size is unchanged.
+	{
+		for( int i = 0; i < size; i++ ) elem[i] = c;
+	}
+
+	Vector operator*( const Vector &A, float s )  // Scalar on the right.
+	{
+		Vector C( A.Size() );
+		for( int i = 0; i < A.Size(); i++ ) C(i) = A(i) * s;  // Each element scaled by s.
+		return C;
+	}
+
+	Vector operator*( float s, const Vector &A )  // Scalar on the left.
+	{
+		Vector C( A.Size() );
+		for( int i = 0; i < A.Size(); i++ ) C(i) = A(i) * s;  // Each element scaled by s.
+		return C;
+	}
+
+	Vector operator/( const Vector &A, float s )  // Element-wise division by a scalar.
+	{
+		assert( s != 0.0 );  // Division by zero is treated as a caller error.
+		Vector C( A.Size() );
+		for( int i = 0; i < A.Size(); i++ ) C(i) = A(i) / s;
+		return C;
+	}
+
+	Vector& operator+=( Vector &A, const Vector &B )  // In-place element-wise addition.
+	{
+		assert( A.Size() == B.Size() );
+		for( int i = 0; i < A.Size(); i++ ) A(i) += B(i);
+		return A;  // Returned so the result can be used in a larger expression.
+	}
+
+	Vector& operator*=( Vector &A, float scale )  // In-place scaling of every element.
+	{
+		for( int i = 0; i < A.Size(); i++ ) A(i) *= scale;
+		return A;
+	}
+
+	Vector& operator/=( Vector &A, float scale )  // In-place division; scale is not checked for zero here.
+	{
+		for( int i = 0; i < A.Size(); i++ ) A(i) /= scale;
+		return A;
+	}
+
+	Vector& Vector::operator=( const Vector &A )
+	{
+		SetSize( A.Size() );  // Reallocates (and zero-fills) only when the sizes differ.
+		for( int i = 0; i < size; i++ ) elem[i] = A(i);  // Then copy A element by element.
+		return *this;
+	}
+
+	Vector operator+( const Vector &A, const Vector &B )  // Element-wise sum of equally sized vectors.
+	{
+		assert( A.Size() == B.Size() );
+		Vector C( A.Size() );
+		for( int i = 0; i < A.Size(); i++ ) C(i) = A(i) + B(i);
+		return C;
+	}
+
+	Vector operator-( const Vector &A, const Vector &B )  // Element-wise difference of equally sized vectors.
+	{
+		assert( A.Size() == B.Size() );
+		Vector C( A.Size() );
+		for( int i = 0; i < A.Size(); i++ ) C(i) = A(i) - B(i);
+		return C;
+	}
+
+	Vector operator-( const Vector &A )  // Unary minus.
+	{
+		Vector B( A.Size() );
+		for( int i = 0; i < A.Size(); i++ ) B(i) = -A(i);  // Every element negated.
+		return B;
+	}
+
+	Vector operator^( const Vector &A, const Vector &B )
+	{
+		// Cross product.  The result always has three elements; for
+		// two-element inputs the z components are taken to be zero.
+		Vector C(3);
+		assert( A.Size() == B.Size() );
+		if( A.Size() == 2 )
+		{
+			C(0) = 0.0;
+			C(1) = 0.0;
+			C(2) = A(0) * B(1) - A(1) * B(0);
+			return C;
+		}
+		assert( A.Size() == 3 );
+		C(0) = A(1) * B(2) - A(2) * B(1);
+		C(1) = A(2) * B(0) - A(0) * B(2);
+		C(2) = A(0) * B(1) - A(1) * B(0);
+		return C;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  M I S C E L L A N E O U S   F U N C T I O N S                          *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Vector Min( const Vector &A, const Vector &B )  // Element-wise minimum of equally sized vectors.
+	{
+		assert( A.Size() == B.Size() );
+		Vector C( A.Size() );
+		for( int i = 0; i < A.Size(); i++ ) C(i) = Min( A(i), B(i) );
+		return C;
+	}
+
+	Vector Max( const Vector &A, const Vector &B )  // Element-wise maximum of equally sized vectors.
+	{
+		assert( A.Size() == B.Size() );
+		Vector C( A.Size() );
+		for( int i = 0; i < A.Size(); i++ ) C(i) = Max( A(i), B(i) );
+		return C;
+	}
+
+	Vector Unit( const Vector &A )
+	{
+		const double length = TwoNorm( A );   // Euclidean norm of A.
+		assert( length > 0.0 );               // A zero-length vector cannot be scaled to unit length.
+		return ( 1.0 / length ) * A;
+	}
+
+	double Normalize( Vector &A )  // Scales A to unit length in place.
+	{
+		double norm = TwoNorm( A );
+		assert( norm > 0.0 );  // A zero-length vector cannot be scaled to unit length.
+		for( int i = 0; i < A.Size(); i++ ) A(i) /= norm;
+		return norm;  // The length A had before scaling.
+	}
+
+	int Null( const Vector &A )
+	{
+		return ( A.Size() == 0 ) ? 1 : 0;  // 1 exactly when A has no elements.
+	}
+
+	double TwoNormSqr( const Vector &A )  // Sum of squared elements; 0.0 for an empty vector.
+	{
+		double sum = ( A.Size() > 0 ) ? A(0) * A(0) : 0.0;  // Element 0 is only read when it exists.
+		for( int i = 1; i < A.Size(); i++ ) sum += A(i) * A(i);
+		return sum;
+	}
+
+	double TwoNorm( const Vector &A )
+	{
+		const double sq = TwoNormSqr( A );  return sqrt( sq );  // Euclidean length.
+	}
+
+	double dist( const Vector &A, const Vector &B )
+	{
+		const Vector Delta = A - B;  return TwoNorm( Delta );  // Euclidean length of the difference.
+	}
+
+	double OneNorm( const Vector &A )  // Sum of absolute values; 0.0 for an empty vector.
+	{
+		double norm = ( A.Size() > 0 ) ? Abs( A(0) ) : 0.0;  // Element 0 is only read when it exists.
+		for( int i = 1; i < A.Size(); i++ ) norm += Abs( A(i) );
+		return norm;
+	}
+
+	double SupNorm( const Vector &A )  // Largest absolute element; 0.0 for an empty vector.
+	{
+		double norm = ( A.Size() > 0 ) ? Abs( A(0) ) : 0.0;  // Element 0 is only read when it exists.
+		for( int i = 1; i < A.Size(); i++ )
+		{
+			double a = Abs( A(i) );
+			if( a > norm ) norm = a;  // Keep the running maximum.
+		}
+		return norm;
+	}
+
+	Vec2 ToVec2( const Vector &V )
+	{
+		assert( V.Size() == 2 );  // Only two-element vectors convert.
+		const float x = V(0), y = V(1);  return Vec2( x, y );
+	}
+
+	Vec3 ToVec3( const Vector &V )
+	{
+		assert( V.Size() == 3 );  // Only three-element vectors convert.
+		const float x = V(0), y = V(1), z = V(2);  return Vec3( x, y, z );
+	}
+
+	Vector ToVector( const Vec2 &V )
+	{
+		Vector Out( V.X(), V.Y() );  return Out;  // Two-element Vector with the same components.
+	}
+
+	Vector ToVector( const Vec3 &V )
+	{
+		Vector Out( V.X(), V.Y(), V.Z() );  return Out;  // Three-element Vector with the same components.
+	}
+
+	//
+	// Returns a vector that is orthogonal to A (but of arbitrary length).
+	//
+	Vector OrthogonalTo( const Vector &A )
+	{
+		Vector B( A.Size() );                 // Zero-filled by the Vector( int ) constructor.
+		if( A.Size() < 2 ) return B;          // Checked first so SupNorm never sees an empty vector.
+
+		const double c = 0.5 * SupNorm( A );  // Half of the largest absolute element.
+		if( c == 0.0 ) 
+		{
+			B(0) = 1.0;
+			return B;
+		}
+		for( int i = 0; i < A.Size(); i++ )
+		{
+			if( Abs( A(i)) > c )
+			{
+				// Swap A(i) with a neighbouring element and negate one
+				// of them, so that the two products cancel in A * B.
+				int k = ( i > 0 ) ? i - 1 : i + 1;
+				B(k) = -A(i);
+				B(i) =  A(k);
+				break;
+			}
+		}
+		return B;
+	}
+
+	std::ostream &operator<<( std::ostream &out, const Vector &A )  // Text dump of A.
+	{
+		if( A.Size() == 0 )
+		{
+			out << "NULL";  // Marker printed for a vector with no elements.
+		}
+		else for( int i = 0; i < A.Size(); i++ )
+		{
+			out << form( "%3d:  %10.5g\n", i, A(i) );  // One "index:  value" line per element.
+		}
+		out << std::endl;  // Trailing newline plus flush in both cases.
+		return out;
+	}
+
+
+};
diff --git a/src/nvtt/bc6h/arvo/Vector.h b/src/nvtt/bc6h/arvo/Vector.h
new file mode 100755
index 0000000..01e66df
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/Vector.h
@@ -0,0 +1,103 @@
+/***************************************************************************
+* Vector.h                                                                 *
+*                                                                          *
+* General Vector and Matrix classes, with all the associated methods.      *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    08/16/2000    Revamped for CIT tools.                       *
+*      arvo    10/31/1994    Combined RowVec & ColVec into Vector.         *
+*      arvo    06/30/1993    Added singular value decomposition class.     *
+*      arvo    06/25/1993    Major revisions.                              *
+*      arvo    09/08/1991    Initial implementation.                       *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __VECTOR_INCLUDED__
+#define __VECTOR_INCLUDED__
+
+#include <istream>
+#include "Vec2.h"
+#include "Vec3.h"
+
+namespace ArvoMath {
+	class Vector { // Array of floats whose length is chosen at run time; storage is released in the destructor.
+	public:
+		Vector( int size = 0   );
+		Vector( const Vector & );
+		Vector( float, float );
+		Vector( float, float, float );
+		Vector( const float *x, int n );
+		Vector &operator=( const Vector & );
+		void    operator=( float );
+		void    SetSize( int );
+		Vector &Swap( int i, int j );
+		Vector  GetBlock( int i, int j ) const;
+		void    SetBlock( int i, int j, const Vector & );
+		static  const Vector Null;
+
+	public: // Inlined functions.
+		inline float  operator()( int i ) const { return elem[i]; } // Element read; the index is not range-checked.
+		inline float& operator()( int i )       { return elem[i]; } // Element write access; the index is not range-checked.
+		inline float* Array() const { return elem; } // Direct pointer to the storage; still owned by this Vector.
+		inline int    Size () const { return size; } // Number of elements.
+		inline ~Vector() { delete[] elem; }
+
+	private:
+		void   Create( int n = 0 ) { size = n; elem = new float[n]; } // Allocates n floats without initializing them; does not free any previous elem.
+		int    size; // Element count.
+		float* elem; // Storage freed with delete[] in the destructor.
+	};
+
+	extern Vector  operator +  ( const Vector &, const Vector & );
+	extern Vector  operator -  ( const Vector &, const Vector & ); // Binary minus.
+	extern Vector  operator -  ( const Vector &                 ); // Unary minus.
+	extern Vector  operator *  ( const Vector &,        float   );
+	extern Vector  operator *  (       float   , const Vector & );
+	extern Vector  operator /  ( const Vector &,        float   );
+	extern Vector  operator /  ( const Vector &, const Vector & );
+	extern Vector  operator ^  ( const Vector &, const Vector & );
+	extern Vector& operator += (       Vector &, const Vector & );
+	extern Vector& operator *= (       Vector &,        float   );
+	extern Vector& operator /= (       Vector &,        float   );
+	extern Vector  Min         ( const Vector &, const Vector & );
+	extern Vector  Max         ( const Vector &, const Vector & );
+	extern double  operator *  ( const Vector &, const Vector & );  // Inner product.
+	extern double  dist        ( const Vector &, const Vector & );
+	extern Vector  OrthogonalTo( const Vector & );  // Returns some orthogonal vector.
+	extern Vector  Unit        ( const Vector & );
+	extern double  Normalize   (       Vector & );
+	extern double  OneNorm     ( const Vector & );
+	extern double  TwoNorm     ( const Vector & );
+	extern double  TwoNormSqr  ( const Vector & );
+	extern double  SupNorm     ( const Vector & );
+	extern int     Null        ( const Vector & );
+	extern Vec2    ToVec2      ( const Vector & );
+	extern Vec3    ToVec3      ( const Vector & );
+	extern Vector  ToVector    ( const Vec2   & );
+	extern Vector  ToVector    ( const Vec3   & );
+
+	std::ostream &operator<<( 
+		std::ostream &out, 
+		const Vector &
+		);
+};
+#endif
+
+
+
+
+
+
diff --git a/src/nvtt/bc6h/arvo/form.h b/src/nvtt/bc6h/arvo/form.h
new file mode 100755
index 0000000..48aef94
--- /dev/null
+++ b/src/nvtt/bc6h/arvo/form.h
@@ -0,0 +1,27 @@
+#ifndef __FORM_INCLUDED__
+#define __FORM_INCLUDED__
+
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+namespace ArvoMath {
+
+	// printf-style formatting into one shared static buffer; the text is overwritten by the next call.
+	inline const char *form(const char *fmt, ...)
+	{
+		static char printbfr[65536];
+		va_list arglist;
+
+		va_start(arglist,fmt);
+		// Bounded write: vsnprintf never writes past printbfr and returns the length it needed.
+		const int length = vsnprintf(printbfr,sizeof(printbfr),fmt,arglist);
+		va_end(arglist);
+		assert(length >= 0 && length < (int)sizeof(printbfr));	// formatted without error or truncation
+		return printbfr;
+	}
+};
+
+#endif
diff --git a/src/nvtt/bc6h/bits.h b/src/nvtt/bc6h/bits.h
new file mode 100755
index 0000000..bf177ca
--- /dev/null
+++ b/src/nvtt/bc6h/bits.h
@@ -0,0 +1,73 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _BITS_H
+#define _BITS_H
+
+// read/write a bitstream
+
+#include <assert.h>
+
+class Bits	// bit-granular reader/writer over a caller-owned byte buffer; bit 0 of each value goes first
+{
+public:
+	// first form: writable stream that starts empty; second form: read-only stream whose availdatabits are already valid
+	Bits(char *data, int maxdatabits) { assert (data && maxdatabits > 0); bptr = bend = 0; bits = data; cbits = data; maxbits = maxdatabits; readonly = 0;}
+	Bits(const char *data, int availdatabits) { assert (data && availdatabits > 0); bptr = 0; bend = availdatabits; bits = 0; cbits = data; maxbits = availdatabits; readonly = 1;}
+
+	void write(int value, int nbits) {	// store the low nbits of value at the current position
+		assert (nbits >= 0 && nbits < 32);
+		assert (sizeof(int)>= 4);
+		for (int i=0; i<nbits; ++i)
+			writeone(value>>i);
+	}
+	int read(int nbits) {	// fetch nbits from the current position; the first bit read becomes bit 0
+		assert (nbits >= 0 && nbits < 32);
+		assert (sizeof(int)>= 4);
+		int out = 0;
+		for (int i=0; i<nbits; ++i)
+			out |= readone() << i;
+		return out;
+	}
+	int getptr() { return bptr; }
+	int setptr(int ptr) { assert (ptr >= 0 && ptr < maxbits); bptr = ptr; return bptr; }	// returns the new position
+	int getsize() { return bend; }
+
+private:
+	int	bptr;		// next bit to read
+	int bend;		// last written bit + 1
+	char *bits;		// ptr to user bit stream
+	const char *cbits;	// ptr to const user bit stream
+	int maxbits;	// max size of user bit stream
+	char readonly;	// 1 if this is a read-only stream
+
+	int readone() {	// returns the next bit as 0 or 1; yields 0 once past the end when asserts are off
+		assert (bptr < bend);
+		if (bptr >= bend) return 0;
+		int bit = (readonly ? cbits[bptr>>3] : bits[bptr>>3]) & (1 << (bptr & 7));
+		++bptr;
+		return bit != 0;
+	}
+	void writeone(int bit) {	// stores bit&1; silently drops the bit once the buffer is full when asserts are off
+		if (readonly)
+			throw "Writing a read-only bit stream";
+		assert (bptr < maxbits);
+		if (bptr >= maxbits) return;
+		if (bit&1)
+			bits[bptr>>3] |= 1 << (bptr & 7);
+		else
+			bits[bptr>>3] &= ~(1 << (bptr & 7));
+		if (bptr++ >= bend) bend = bptr;	// grow the valid length when writing past the old end
+	}
+};
+
+#endif
\ No newline at end of file
diff --git a/src/nvtt/bc6h/exr.cpp b/src/nvtt/bc6h/exr.cpp
new file mode 100755
index 0000000..9321734
--- /dev/null
+++ b/src/nvtt/bc6h/exr.cpp
@@ -0,0 +1,51 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Simple .exr file reader/writer
+
+#include <string>
+
+#include <ImfRgbaFile.h>
+#include <ImfArray.h>
+
+#include "exr.h"
+
+using namespace std;
+using namespace Imf;
+using namespace Imath;
+
+void Exr::fileinfo(const string inf, int &width, int &height)	// report the data-window size of file inf
+{
+    RgbaInputFile exrfile (inf.c_str());
+    const Box2i &window = exrfile.dataWindow();
+    // the window bounds are inclusive, hence the +1
+    width  = window.max.x - window.min.x + 1;
+    height = window.max.y - window.min.y + 1;
+}
+
+void Exr::readRgba(const string inf, Array2D<Rgba> &pix, int &w, int &h)	// load file inf into pix; w and h receive its size
+{
+    RgbaInputFile file (inf.c_str());
+    Box2i dw = file.dataWindow();
+    w  = dw.max.x - dw.min.x + 1;	// data-window bounds are inclusive
+    h = dw.max.y - dw.min.y + 1;
+    pix.resizeErase (h, w);	// h rows of w pixels; previous contents are discarded
+    file.setFrameBuffer (&pix[0][0] - dw.min.x - dw.min.y * w, 1, w);	// shift the base so pixel (min.x, min.y) lands on pix[0][0]; strides are 1 and w
+    file.readPixels (dw.min.y, dw.max.y);	// every scanline of the data window
+}
+
+void Exr::writeRgba(const string outf, const Array2D<Rgba> &pix, int w, int h)	// write the w x h image in pix to file outf
+{
+	RgbaOutputFile file (outf.c_str(), w, h, WRITE_RGBA);
+	file.setFrameBuffer (&pix[0][0], 1, w);	// strides 1 and w: pix is read as h consecutive rows of w pixels
+	file.writePixels (h);	// h scanlines
+}
\ No newline at end of file
diff --git a/src/nvtt/bc6h/exr.h b/src/nvtt/bc6h/exr.h
new file mode 100755
index 0000000..b2568ee
--- /dev/null
+++ b/src/nvtt/bc6h/exr.h
@@ -0,0 +1,37 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _EXR_H
+#define _EXR_H
+
+// exr-friendly routines
+
+#include <string>
+
+#include "ImfArray.h"
+#include "ImfRgba.h"
+
+using namespace std;
+using namespace Imf;
+
+class Exr	// holder for the static .exr helpers implemented in exr.cpp; carries no state
+{
+public:
+	Exr() {};
+	~Exr() {};
+
+	static void fileinfo(const string inf, int &width, int &height);	// image size only, taken from the file's data window
+	static void readRgba(const string inf, Array2D<Rgba> &pix, int &w, int &h);	// resizes pix and fills it from the file
+	static void writeRgba(const string outf, const Array2D<Rgba> &pix, int w, int h);	// writes the w x h pixels of pix
+};
+
+#endif
\ No newline at end of file
diff --git a/src/nvtt/bc6h/shapes_two.h b/src/nvtt/bc6h/shapes_two.h
new file mode 100755
index 0000000..d9a52ef
--- /dev/null
+++ b/src/nvtt/bc6h/shapes_two.h
@@ -0,0 +1,133 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _SHAPES_TWO_H
+#define _SHAPES_TWO_H
+
+// shapes for two regions
+
+#define NREGIONS 2
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+static int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   0, 0, 0, 1,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   1, 1, 1, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 0,   0, 1, 1, 1,   
+1, 0, 0, 0,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 1, 1,   
+1, 1, 1, 0,   0, 0, 0, 0,   1, 0, 0, 0,   0, 0, 0, 1,   
+1, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 0, 0, 0,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 0, 0,   1, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   0, 0, 0, 1,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 0,   1, 1, 0, 0,   0, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 1,   1, 0, 0, 1,   
+1, 1, 1, 0,   1, 1, 1, 1,   1, 0, 0, 0,   1, 0, 0, 1,   
+1, 0, 0, 0,   0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   0, 0, 1, 1,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   0, 0, 1, 1,   
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   1, 1, 0, 0,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 1, 1, 0,   0, 1, 0, 1,   
+1, 1, 0, 0,   0, 1, 0, 1,   1, 0, 0, 1,   1, 0, 1, 0,   
+0, 0, 1, 1,   1, 0, 1, 0,   0, 1, 1, 0,   1, 0, 1, 0,   
+1, 1, 0, 0,   1, 0, 1, 0,   1, 0, 0, 1,   0, 1, 0, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   1, 0, 1, 1,   
+1, 1, 0, 0,   1, 1, 0, 0,   0, 1, 0, 0,   1, 1, 0, 1,   
+1, 1, 1, 0,   1, 0, 0, 0,   1, 1, 0, 0,   1, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 0, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   0, 1, 1, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   0, 0, 0, 0,   
+
+0, 1, 0, 0,   0, 0, 1, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+1, 1, 1, 0,   0, 1, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 1, 0, 0,   0, 0, 1, 0,   0, 1, 1, 1,   1, 1, 1, 0,   
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 0,   0, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 1, 1,   
+1, 1, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   1, 1, 0, 0,   
+0, 0, 1, 1,   1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   
+
+0, 1, 1, 0,   0, 1, 1, 0,   0, 1, 1, 1,   0, 0, 0, 1,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 1, 1, 0,   1, 0, 0, 0,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 0, 0, 0,   1, 1, 1, 0,   
+1, 0, 0, 1,   1, 0, 0, 1,   0, 0, 0, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+1, 1, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 0,   0, 1, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 1, 1, 1,   
+
+};
+
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
+
+static int shapeindex_to_compressed_indices[NSHAPES*2] = 
+{
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+
+	0,15,  0, 2,  0, 8,  0, 2,
+	0, 2,  0, 8,  0, 8,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 8,  0, 8,  0, 2,  0, 2,
+
+	0,15,  0,15,  0, 6,  0, 8,
+	0, 2,  0, 8,  0,15,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 2,  0,15,  0,15,  0, 6,
+
+	0, 6,  0, 2,  0, 6,  0, 8,
+	0,15,  0,15,  0, 2,  0, 2,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0, 2,  0, 2,  0,15
+
+};
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*2+(region)]
+
+#endif
diff --git a/src/nvtt/bc6h/tile.h b/src/nvtt/bc6h/tile.h
new file mode 100755
index 0000000..f3bd2d6
--- /dev/null
+++ b/src/nvtt/bc6h/tile.h
@@ -0,0 +1,115 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _TILE_H
+#define _TILE_H
+
+#include <ImfArray.h>
+#include <ImfRgba.h>
+#include <half.h>
+#include <math.h>
+#include "arvo/Vec3.h"
+
+#include "utils.h"
+
+#define	DBL_MAX	(1.0e37)		// doesn't have to be really dblmax, just bigger than any possible squared error
+
+using namespace Imf;
+using namespace ArvoMath;
+
+//#define	USE_IMPORTANCE_MAP	1		// define this if you want to increase importance of some pixels in tile
+class Tile	// one block of up to TILE_W x TILE_H pixels held as Vec3 of integral half bit patterns
+{
+private:
+	// NOTE: this returns the appropriately-clamped BIT PATTERN of the half as an INTEGRAL float value
+	static float half2float(half h)
+	{
+		return (float) Utils::ushort_to_format(h.bits());
+	}
+	// NOTE: this is the inverse of the above operation
+	static half float2half(float f)
+	{
+		half h;
+		h.setBits(Utils::format_to_ushort((int)f));
+		return h;
+	}
+	// look for adjacent pixels that are identical. if there are enough of them, increase their importance
+	void generate_importance_map()
+	{
+		// initialize
+		for (int y=0; y<size_y; ++y)
+		for (int x=0; x<size_x; ++x)
+		{
+			// my importance is increased if I am identical to any of my 4-neighbors
+			importance_map[y][x] = match_4_neighbor(x,y) ? 5.0f : 1.0f;
+		}
+	}
+	bool is_equal(int x, int y, int xn, int yn)	// true when pixel (xn,yn) lies inside the tile and matches pixel (x,y) in all three channels
+	{
+		if (xn < 0 || xn >= size_x || yn < 0 || yn >= size_y)
+			return false;
+		return( (data[y][x].X() == data[yn][xn].X()) &&
+				(data[y][x].Y() == data[yn][xn].Y()) &&
+				(data[y][x].Z() == data[yn][xn].Z()) );
+	}
+#ifdef USE_IMPORTANCE_MAP
+	bool match_4_neighbor(int x, int y)
+	{
+		return is_equal(x,y,x-1,y) || is_equal(x,y,x+1,y) || is_equal(x,y,x,y-1) || is_equal(x,y,x,y+1);
+	}
+#else
+	bool match_4_neighbor(int x, int y)	// importance weighting disabled: every pixel gets weight 1
+	{
+		return false;
+	}
+#endif
+
+public:
+	Tile() : size_x(0), size_y(0) {};	// start empty so insert/extract never loop on indeterminate sizes
+	~Tile(){};
+	Tile(int xs, int ys) {size_x = xs; size_y = ys;}	// xs, ys are not checked here; data holds at most TILE_W x TILE_H
+
+	static const int TILE_H = 4;
+	static const int TILE_W = 4;
+	static const int TILE_TOTAL = TILE_H * TILE_W;
+	Vec3 data[TILE_H][TILE_W];
+	float importance_map[TILE_H][TILE_W];
+	int	size_x, size_y;			// actual size of tile
+
+	// pixels -> tile
+	void inline insert(const Array2D<Rgba> &pixels, int x, int y)	// (x,y) is the tile's top-left pixel in the image
+	{
+		for (int y0=0; y0<size_y; ++y0)
+		for (int x0=0; x0<size_x; ++x0)
+		{
+			data[y0][x0].X() = half2float((pixels[y+y0][x+x0]).r);
+			data[y0][x0].Y() = half2float((pixels[y+y0][x+x0]).g);
+			data[y0][x0].Z() = half2float((pixels[y+y0][x+x0]).b);
+		}
+		generate_importance_map();
+	}
+
+	// tile -> pixels
+	void inline extract(Array2D<Rgba> &pixels, int x, int y)	// (x,y) is the tile's top-left pixel in the image
+	{
+		for (int y0=0; y0<size_y; ++y0)
+		for (int x0=0; x0<size_x; ++x0)
+		{
+			pixels[y+y0][x+x0].r = float2half(data[y0][x0].X());
+			pixels[y+y0][x+x0].g = float2half(data[y0][x0].Y());
+			pixels[y+y0][x+x0].b = float2half(data[y0][x0].Z());
+			pixels[y+y0][x+x0].a = 0;		// set it to a known value
+		}
+	}
+};
+
+#endif
\ No newline at end of file
diff --git a/src/nvtt/bc6h/utils.cpp b/src/nvtt/bc6h/utils.cpp
new file mode 100755
index 0000000..be99d01
--- /dev/null
+++ b/src/nvtt/bc6h/utils.cpp
@@ -0,0 +1,466 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Utility and common routines
+
+#include "utils.h"
+#include <half.h>
+#include <math.h>
+#include <assert.h>
+
+static int denom7_weights_64[] = {0, 9, 18, 27, 37, 46, 55, 64};										// divided by 64
+static int denom15_weights_64[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};		// divided by 64
+
+int Utils::lerp(int a, int b, int i, int denom)	// blend a and b at step i of denom using weights in 64ths
+{
+	assert (denom == 3 || denom == 7 || denom == 15);
+	assert (i >= 0 && i <= denom);
+
+	const int half_unit = 32, frac_bits = 6;	// adding 32 before the shift by 6 rounds to nearest
+	int *table;
+
+	// thirds reuse the fifteenths table: step i of 3 is step 5*i of 15
+	if (denom == 3) { denom = 15; i *= 5; }
+
+	if (denom == 15)		table = denom15_weights_64;
+	else if (denom == 7)	table = denom7_weights_64;
+	else					assert(0);
+
+	return (a*table[denom-i] +b*table[i] + half_unit) >> frac_bits;
+}
+
+Vec3 Utils::lerp(const Vec3& a, const Vec3 &b, int i, int denom)	// blend vectors a and b at step i of denom
+{
+	assert (denom == 3 || denom == 7 || denom == 15);
+	assert (i >= 0 && i <= denom);
+
+	const int frac_bits = 6;	// weights are expressed in 64ths
+	int *table;
+
+	// thirds reuse the fifteenths table: step i of 3 is step 5*i of 15
+	if (denom == 3) { denom = 15; i *= 5; }
+
+	if (denom == 15)		table = denom15_weights_64;
+	else if (denom == 7)	table = denom7_weights_64;
+	else					assert(0);
+
+	// no rounding term is needed because the weighted sum is divided, not shifted
+	return (a*table[denom-i] +b*table[i]) / float(1 << frac_bits);
+}
+
+
+/*
+	For unsigned f16, clamp the input to [0,F16MAX]. Thus u15.
+	For signed f16, clamp the input to [-F16MAX,F16MAX]. Thus s16.
+
+	The conversions proceed as follows:
+
+	unsigned f16: get bits. if high bit set, clamp to 0, else clamp to F16MAX.
+	signed f16: get bits. extract exp+mantissa and clamp to F16MAX. return -value if sign bit was set, else value
+	unsigned int: get bits. return as a positive value.
+	signed int. get bits. return as a value in -32768..32767.
+
+	The inverse conversions are just the inverse of the above.
+*/
+
+// clamp the 3 channels of the input vector to the allowable range based on FORMAT
+// note that each channel is a float storing the allowable range as a bit pattern converted to float
+// that is, for unsigned f16 say, we would clamp each channel to the range [0, F16MAX]
+
+void Utils::clamp(Vec3 &v)
+{
+	const Format fmt = Utils::FORMAT;
+
+	// only the two half formats are meaningful; anything else trips the
+	// assert in debug builds and leaves v unchanged otherwise
+	const bool known = (fmt == UNSIGNED_F16 || fmt == SIGNED_F16);
+	assert (known);
+	if (!known)
+		return;
+
+	// unsigned values bottom out at 0, signed ones at -F16MAX;
+	// both top out at F16MAX
+	const int floor_bits = (fmt == UNSIGNED_F16) ? 0 : -F16MAX;
+
+	for (int ch=0; ch<3; ++ch)
+	{
+		if (v[ch] < floor_bits) v[ch] = floor_bits;
+		else if (v[ch] > F16MAX) v[ch] = F16MAX;
+	}
+}
+
+// convert a u16 value to s17 (represented as an int) based on the format expected
+int Utils::ushort_to_format(unsigned short input)	// input is a raw 16-bit pattern; the result is clamped per Utils::FORMAT
+{
+	int out, s;
+
+	// clamp to the valid range we are expecting
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		if (input & F16S_MASK) out = 0;	// sign bit set: clamp to zero
+		else if (input > F16MAX) out = F16MAX;	// patterns above F16MAX are pulled down to it
+		else out = input;
+		break;
+
+	case SIGNED_F16:
+		s = input & F16S_MASK;	// remember the sign bit
+		input &= F16EM_MASK;	// keep exponent and mantissa only
+		if (input > F16MAX) out = F16MAX;
+		else out = input;
+		out = s ? -out : out;	// sign-magnitude pattern becomes an ordinary signed int
+		break;
+	}
+	return out;	// set by both cases above; no other Format value exists
+}
+
+// convert a s17 value to u16 based on the format expected
+unsigned short Utils::format_to_ushort(int input)	// rebuilds the 16-bit pattern from the int form used by ushort_to_format
+{
+	unsigned short out;
+
+	// clamp to the valid range we are expecting
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		assert (input >= 0 && input <= F16MAX);	// only checked, not clamped
+		out = input;
+		break;
+
+	case SIGNED_F16:
+		assert (input >= -F16MAX && input <= F16MAX);	// only checked, not clamped
+		// convert to sign-magnitude
+		int s;
+		if (input < 0) { s = F16S_MASK; input = -input; }
+		else           { s = 0; }
+		out = s | input;	// sign bit on top of the magnitude bits
+		break;
+	}
+	return out;	// set by both cases above; no other Format value exists
+}
+
+// quantize the input range into equal-sized bins
+int Utils::quantize(float value, int prec)	// maps a clamped bit-pattern value onto prec bits (sign included for SIGNED_F16)
+{
+	int q, ivalue, s;
+
+	assert (prec > 1);	// didn't bother to make it work for 1
+
+	value = (float)floor(value + 0.5);	// round to the nearest integer first
+
+	int bias = (prec > 10) ? ((1<<(prec-1))-1) : 0;	// bias precisions 11..16 to get a more accurate quantization
+
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		assert (value >= 0 && value <= F16MAX);
+		ivalue = (int)value;
+		q = ((ivalue << prec) + bias) / (F16MAX+1);	// scale [0, F16MAX] onto [0, 2^prec)
+		assert (q >= 0 && q < (1 << prec));
+		break;
+
+	case SIGNED_F16:
+		assert (value >= -F16MAX && value <= F16MAX);
+		// convert to sign-magnitude
+		ivalue = (int)value;
+		if (ivalue < 0) { s = 1; ivalue = -ivalue; } else s = 0;
+
+		q = ((ivalue << (prec-1)) + bias) / (F16MAX+1);	// one bit of prec is spent on the sign
+		if (s)
+			q = -q;
+		assert (q > -(1 << (prec-1)) && q < (1 << (prec-1)));
+		break;
+	}
+
+	return q;
+}
+
+int Utils::finish_unquantize(int q, int prec)	// final scale from the 16-bit unquantized range to the half pattern range; prec is unused
+{
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:	return (q * 31) >> 6;	// magnitude times 31/64
+	case SIGNED_F16:	{ const int mag = ((q < 0 ? -q : q) * 31) >> 5; return (q < 0) ? -mag : mag; }	// magnitude times 31/32, sign restored afterwards
+	default:			return q;
+	}
+}
+
+// unquantize each bin to midpoint of original bin range, except
+// for the end bins which we push to an endpoint of the bin range.
+// we do this to ensure we can represent all possible original values.
+// the asymmetric end bins do not affect PSNR for the test images.
+//
+// code this function assuming an arbitrary bit pattern as the encoded block
+int Utils::unquantize(int q, int prec)	// expands a prec-bit quantized value to the 16-bit range; finish_unquantize applies the last scale
+{
+	int unq, s;
+
+	assert (prec > 1);	// not implemented for prec 1
+
+	switch (Utils::FORMAT)
+	{
+	// modify this case to move the multiplication by 31 after interpolation.
+	// Need to use finish_unquantize.
+
+	// since we have 16 bits available, let's unquantize this to 16 bits unsigned
+	// thus the scale factor is [0-7c00)/[0-10000) = 31/64
+	case UNSIGNED_F16:
+		if (prec >= 15) 
+			unq = q;	// already at (or above) the working precision: pass through
+		else if (q == 0) 
+			unq = 0;	// lowest bin maps to the bottom of the range
+		else if (q == ((1<<prec)-1)) 
+			unq = U16MAX;	// highest bin maps to the top of the range
+		else
+			unq = (q * (U16MAX+1) + (U16MAX+1)/2) >> prec;	// interior bins map to their midpoint
+		break;
+
+	// here, let's stick with S16 (no apparent quality benefit from going to S17)
+	// range is (-7c00..7c00)/(-8000..8000) = 31/32
+	case SIGNED_F16:
+		// don't remove this test even though it appears equivalent to the code below
+		// as it isn't -- the code below can overflow for prec = 16
+		if (prec >= 16)
+			unq = q;
+		else
+		{
+			if (q < 0) { s = 1; q = -q; } else s = 0;	// work on the magnitude, reapply the sign at the end
+
+			if (q == 0)
+				unq = 0;
+			else if (q >= ((1<<(prec-1))-1))
+				unq = s ? -S16MAX : S16MAX;	// end bins are pushed to the range limits
+			else
+			{
+				unq = (q * (S16MAX+1) + (S16MAX+1)/2) >> (prec-1);	// interior bins map to their midpoint
+				if (s)
+					unq = -unq;
+			}
+		}
+		break;
+	}
+	return unq;
+}
+
+static int clamp(double r, double low, double high)	// limit r to [low, high], then convert to int
+{
+	// a value inside the range passes through unchanged before the conversion
+	const double limited = (r < low) ? low : ((r > high) ? high : r);
+	return limited;
+}
+
+// match the tonemapping function used by exrdisplay
+static void tonemap(const Vec3 &in, double exposure, Vec3 &out)	// in holds half bit patterns; out receives display values clamped to [0, 255]
+{
+    double r,g,b;
+	half h;
+
+	// convert from bit pattern back to half and then to double
+	h.setBits(in.X());	r = h;
+	h.setBits(in.Y());	g = h;
+	h.setBits(in.Z());	b = h;
+
+    //  1) Compensate for fogging by subtracting defog
+    //     from the raw pixel values.
+    // Response: We work with defog of 0.0, so this is a no-op
+
+    //  2) Multiply the defogged pixel values by
+    //     2^(exposure + 2.47393).
+	double exposure_scale = pow(2.0, exposure + 2.47393);
+    r *= exposure_scale;
+    g *= exposure_scale;
+    b *= exposure_scale;
+
+    //  3) Values, which are now 1.0, are called "middle gray".
+    //     If defog and exposure are both set to 0.0, then
+    //     middle gray corresponds to a raw pixel value of 0.18.
+    //     In step 6, middle gray values will be mapped to an
+    //     intensity 3.5 f-stops below the display's maximum
+    //     intensity.
+    // Response: no apparent content.
+
+    //  4) Apply a knee function.  The knee function has two
+    //     parameters, kneeLow and kneeHigh.  Pixel values
+    //     below 2^kneeLow are not changed by the knee
+    //     function.  Pixel values above kneeLow are lowered
+    //     according to a logarithmic curve, such that the
+    //     value 2^kneeHigh is mapped to 2^3.5 (in step 6,
+    //     this value will be mapped to the display's
+    //     maximum intensity).
+    // Response: kneeLow = 0.0 (2^0.0 => 1); kneeHigh = 5.0 (2^5 =>32)
+    if (r > 1.0)
+        r = 1.0 + log ((r-1.0) * 0.184874 + 1) / 0.184874;
+    if (g > 1.0)
+        g = 1.0 + log ((g-1.0) * 0.184874 + 1) / 0.184874;
+    if (b > 1.0)
+        b = 1.0 + log ((b-1.0) * 0.184874 + 1) / 0.184874;
+//
+//  5) Gamma-correct the pixel values, assuming that the
+//     screen's gamma is 0.4545 (or 1/2.2).
+    r = pow (r, 0.4545);
+    g = pow (g, 0.4545);
+    b = pow (b, 0.4545);
+
+//  6) Scale the values such that pixels middle gray
+//     pixels are mapped to 84.66 (or 3.5 f-stops below
+//     the display's maximum intensity).
+//
+//  7) Clamp the values to [0, 255].
+    r *= 84.66f;
+    g *= 84.66f;
+    b *= 84.66f;
+
+    out.X() = clamp (r, 0, 255);	// the file-local clamp also converts to int
+    out.Y() = clamp (g, 0, 255);
+    out.Z() = clamp (b, 0, 255);
+}
+
+static void mpsnrmap(const Vec3 &in, int exposure, Vec3 &out)
+{
+	// Map the half bit patterns in "in" to 8-bit display values in "out":
+	// scale by 2^exposure, apply a 0.4545 gamma, scale to 255 and clamp.
+    double r,g,b;
+	half h;
+
+	// convert from bit pattern back to half and then to double
+	h.setBits(in.X());	r = h;
+	h.setBits(in.Y());	g = h;
+	h.setBits(in.Z());	b = h;
+
+	assert (exposure > -32 && exposure < 32);
+
+	// ldexp scales by an exact power of two for either sign of exposure.
+	// An int shift such as "1 << exposure" overflows at |exposure| == 31,
+	// which the assert above allows, and would flip the sign of the scale.
+	r = ldexp (r, exposure);
+	g = ldexp (g, exposure);
+	b = ldexp (b, exposure);
+
+	// gamma-encode, then stretch to the 8-bit range
+    r = 255 * pow (r, 0.4545);
+    g = 255 * pow (g, 0.4545);
+    b = 255 * pow (b, 0.4545);
+
+	// the file-local clamp limits to [0, 255] and converts to int
+    out.X() = clamp (r, 0, 255);
+    out.Y() = clamp (g, 0, 255);
+    out.Z() = clamp (b, 0, 255);
+
+}
+
+// pick a norm!
+#define	NORM_EUCLIDEAN 1
+
+double Utils::norm(const Vec3 &a, const Vec3 &b)	// error between a and b under whichever NORM_* macro is defined
+{
+#ifdef	NORM_EUCLIDEAN
+	Vec3 err = a - b;
+	return err * err;	// Vec3 * Vec3 yields a scalar here (presumably the dot product, i.e. squared distance)
+#endif
+#ifdef	NORM_ABS
+	Vec3 err = a - b;
+	return fabs(err.X()) + fabs(err.Y()) + fabs(err.Z());	// sum of absolute channel differences
+#endif
+#ifdef	NORM_EUCLIDEAN_EXPOSURE_UNWEIGHED
+	double toterr = 0;
+	Vec3 mapa, mapb, err;
+	for (int i=-6; i <= 6; i += 3)			// figure how many exposure samples needed. I'd argue if you take too many it's same as euclidean
+	{
+		tonemap(a, i, mapa);
+		tonemap(b, i, mapb);
+		err = mapa - mapb;
+		toterr += err * err;	// accumulate over exposures -6, -3, 0, 3, 6
+	}
+	return toterr;
+#endif
+#ifdef	NORM_EUCLIDEAN_EXPOSURE_WEIGHED
+	double toterr = 0;
+	Vec3 mapa, mapb, err;
+	double rwt = 0.299;	// per-channel weights applied to the tone-mapped values
+	double gwt = 0.587;
+	double bwt = 0.114;
+	for (int i=-6; i <= 6; i += 3)			// figure how many exposure samples needed. I'd argue if you take too many it's same as euclidean
+	{
+		tonemap(a, i, mapa);
+		tonemap(b, i, mapb);
+		mapa.X() *= rwt; mapa.Y() *= gwt; mapa.Z() *= bwt;
+		mapb.X() *= rwt; mapb.Y() *= gwt; mapb.Z() *= bwt;
+		err = mapa - mapb;
+		toterr += err * err;
+	}
+	return toterr;
+#endif
+}	// NOTE(review): exactly one NORM_* macro must be defined, otherwise no return statement is compiled in
+
+double Utils::mpsnr_norm(const Vec3 &a, int exposure, const Vec3 &b)
+{
+	// error between a and b after both have been pushed through
+	// mpsnrmap at the given exposure
+	Vec3 shown_a, shown_b;
+	mpsnrmap(a, exposure, shown_a);
+	mpsnrmap(b, exposure, shown_b);
+
+	Vec3 delta = shown_a - shown_b;
+	double total = 0;
+	total += delta * delta;	// same scalar product that Utils::norm uses
+	return total;
+}
+
+// parse <name>[<start>{:<end>}]{,}	
+// the pointer starts here         ^
+// name is 1 or 2 chars and matches field names. start and end are decimal numbers
+void Utils::parse(char *encoding, int &ptr, Field &field, int &endbit, int &len)	// reads one field descriptor backwards from ptr
+{
+	if (ptr <= 0) return;	// nothing left to parse: field, endbit and len are left untouched
+	--ptr;
+	if (encoding[ptr] == ',') --ptr;	// skip the optional separator
+	assert (encoding[ptr] == ']');
+	--ptr;
+	endbit = 0;
+	int scale = 1;
+	while (encoding[ptr] != ':' && encoding[ptr] != '[')	// digits arrive least-significant first
+	{
+		assert(encoding[ptr] >= '0' && encoding[ptr] <= '9');
+		endbit += (encoding[ptr--] - '0') * scale;
+		scale *= 10;
+	}
+	int startbit = 0; scale = 1;
+	if (encoding[ptr] == '[')
+		startbit = endbit;	// single-bit form "[n]"
+	else  
+	{
+		ptr--;	// step over ':' and read the start number of "[start:end]"
+		while (encoding[ptr] != '[')
+		{
+			assert(encoding[ptr] >= '0' && encoding[ptr] <= '9');
+			startbit += (encoding[ptr--] - '0') * scale;
+			scale *= 10;
+		}
+	}
+	len = startbit - endbit + 1;	// startbit>=endbit note
+	--ptr;	// step over '[' onto the last character of the name
+	if (encoding[ptr] == 'm')		field = FIELD_M;
+	else if (encoding[ptr] == 'd')	field = FIELD_D;
+	else {
+		// it's wxyz
+		assert (encoding[ptr] >= 'w' && encoding[ptr] <= 'z');
+		int foo = encoding[ptr--] - 'w';	// 0..3 for w..z
+		// now it is r g or b
+		if (encoding[ptr] == 'r')		foo += 10;
+		else if (encoding[ptr] == 'g')	foo += 20;
+		else if (encoding[ptr] == 'b')	foo += 30;
+		else assert(0);
+		field = (Field) foo;	// relies on the numeric layout of the Field enum
+	}
+}	// on return ptr indexes the first character of the field name just parsed
+
+
diff --git a/src/nvtt/bc6h/utils.h b/src/nvtt/bc6h/utils.h
new file mode 100755
index 0000000..308f3e6
--- /dev/null
+++ b/src/nvtt/bc6h/utils.h
@@ -0,0 +1,79 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// utility class holding common routines
+#ifndef _UTILS_H
+#define _UTILS_H
+
+#include "arvo/Vec3.h"
+
+using namespace ArvoMath;
+
+// MIN and MAX are function-like macros: each argument may be evaluated twice,
+// so do not pass expressions with side effects.
+#ifndef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+
+#ifndef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#endif
+
+// palette interpolation simply forwards to Utils::lerp
+#define	PALETTE_LERP(a, b, i, denom)	Utils::lerp(a, b, i, denom)
+
+// treat x as an nb-bit two's complement number: when bit nb-1 of x is set, all bits
+// from nb upwards are set as well; otherwise x is passed through. x is evaluated twice.
+#define	SIGN_EXTEND(x,nb)	((((signed(x))&(1<<((nb)-1)))?((~0)<<(nb)):0)|(signed(x)))
+
+// Identifies one field of a block encoding.
+// The numeric values matter: Utils::parse builds a Field as 10/20/30 for r/g/b
+// plus 0..3 for w/x/y/z and casts the sum to Field, so keep the two in step.
+enum Field { FIELD_M = 1,	// mode
+				FIELD_D = 2,	// distribution/shape
+				FIELD_RW = 10+0, FIELD_RX = 10+1, FIELD_RY = 10+2, FIELD_RZ = 10+3,	// red channel endpoints or deltas
+				FIELD_GW = 20+0, FIELD_GX = 20+1, FIELD_GY = 20+2, FIELD_GZ = 20+3,	// green channel endpoints or deltas
+				FIELD_BW = 30+0, FIELD_BX = 30+1, FIELD_BY = 30+2, FIELD_BZ = 30+3,	// blue channel endpoints or deltas
+};
+
+// some constants
+#define	F16S_MASK	0x8000		// f16 sign mask
+#define	F16EM_MASK	0x7fff		// f16 exp & mantissa mask
+#define	U16MAX		0xffff		// largest unsigned 16 bit value
+#define	S16MIN		(-0x8000)	// smallest signed 16 bit value
+#define	S16MAX		0x7fff		// largest signed 16 bit value
+#define	INT16_MASK	0xffff		// low 16 bits
+#define	F16MAX	(0x7bff)		// MAXFLT bit pattern for halfs
+
+// how the half values are interpreted; selects the value stored in Utils::FORMAT
+enum Format { UNSIGNED_F16, SIGNED_F16 };
+
+// Collection of static helper routines shared by the compressor and decompressor.
+// There is no per-object state; the only data member is the global FORMAT selector.
+class Utils
+{
+public:
+	static Format FORMAT;	// this is a global -- we're either handling signed or unsigned half values
+
+	// error metrics
+	static double norm(const Vec3 &a, const Vec3 &b);
+	static double mpsnr_norm(const Vec3 &a, int exposure, const Vec3 &b);
+
+	// conversion & clamp
+	static int ushort_to_format(unsigned short input);
+	static unsigned short format_to_ushort(int input);
+
+	// clamp to format
+	// (declared without the Utils:: qualifier: a qualified name is not allowed on an in-class member declaration)
+	static void clamp(Vec3 &v);
+
+	// quantization and unquantization
+	static int finish_unquantize(int q, int prec);
+	static int unquantize(int q, int prec);
+	static int quantize(float value, int prec);
+
+	// parse one field description out of an encoding string, walking it from right to left;
+	// ptr is updated so that successive calls step through the fields
+	static void parse(char *encoding, int &ptr, Field &field, int &endbit, int &len);
+
+	// lerping
+	static int lerp(int a, int b, int i, int denom);
+	static Vec3 lerp(const Vec3& a, const Vec3 &b, int i, int denom);
+};
+
+#endif
\ No newline at end of file
diff --git a/src/nvtt/bc6h/zoh.cpp b/src/nvtt/bc6h/zoh.cpp
new file mode 100755
index 0000000..224ee74
--- /dev/null
+++ b/src/nvtt/bc6h/zoh.cpp
@@ -0,0 +1,205 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// the zoh compressor and decompressor
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <assert.h>
+
+#include "ImfArray.h"
+#include "ImfRgba.h"
+
+#include "tile.h"
+#include "zoh.h"
+#include "exr.h"
+
+#ifndef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+
+using namespace std;
+
+// True when the mode code in the low five bits of the first block byte is one of
+// 0x03, 0x07, 0x0b or 0x0f, i.e. the block was written by the one-region compressor.
+bool ZOH::isone(const char *block)
+{
+	switch (block[0] & 0x1F)
+	{
+	case 0x03:
+	case 0x07:
+	case 0x0b:
+	case 0x0f:
+		return true;
+	default:
+		return false;
+	}
+}
+
+// Compress one tile with both the one-region and the two-region compressor and
+// copy the candidate with the smaller returned error into block (BLOCKSIZE bytes).
+// A tie goes to the one-region result.
+void ZOH::compress(const Tile &t, char *block)
+{
+	char candidate_one[ZOH::BLOCKSIZE];
+	char candidate_two[ZOH::BLOCKSIZE];
+
+	const double err_one = ZOH::compressone(t, candidate_one);
+	const double err_two = ZOH::compresstwo(t, candidate_two);
+
+	const char *winner = (err_one <= err_two) ? candidate_one : candidate_two;
+	memcpy(block, winner, ZOH::BLOCKSIZE);
+}
+
+// Decompress one block into tile t, dispatching on the block's mode:
+// one-region blocks go to decompressone, everything else to decompresstwo.
+void ZOH::decompress(const char *block, Tile &t)
+{
+	if (!ZOH::isone(block))
+	{
+		ZOH::decompresstwo(block, t);
+		return;
+	}
+	ZOH::decompressone(block, t);
+}
+
+// Compress the image file inf into the block file zohf.
+// The image is read with Exr::readRgba, cut into tiles of at most TILE_W x TILE_H pixels
+// (edge tiles are smaller), and every tile is written as one BLOCKSIZE-byte block in
+// row-major tile order. A text progress bar is printed to stdout while working.
+// Throws a string literal (const char *) when the output file cannot be opened, written or closed.
+void ZOH::compress(string inf, string zohf)
+{
+	Array2D<Rgba> pixels;
+	int w, h;
+	char block[ZOH::BLOCKSIZE];
+
+	Exr::readRgba(inf, pixels, w, h);
+	FILE *zohfile = fopen(zohf.c_str(), "wb");
+	if (zohfile == NULL) throw "Unable to open .zoh file for write";
+
+	// stuff for progress bar O.o
+	int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W);
+	int tilecnt = 0;
+	int ndots = 25;
+	int dotcnt = 0;
+	printf("Progress [");
+	for (int i=0; i<ndots;++i) printf(" ");
+	printf("]\rProgress ["); fflush(stdout);
+
+	try
+	{
+		// convert to tiles and compress each tile
+		for (int y=0; y<h; y+=Tile::TILE_H)
+		{
+			int ysize = MIN(Tile::TILE_H, h-y);
+			for (int x=0; x<w; x+=Tile::TILE_W)
+			{
+				int xsize = MIN(Tile::TILE_W, w-x);
+				Tile t(xsize, ysize);
+
+				t.insert(pixels, x, y);
+
+				ZOH::compress(t, block);
+				if (fwrite(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
+					throw "File error on write";
+
+				// progress bar
+				++tilecnt;
+				if (tilecnt > (ntiles * dotcnt)/ndots) { printf("."); fflush(stdout); ++dotcnt; }
+			}
+		}
+	}
+	catch (...)
+	{
+		// don't leak the file handle when a tile fails; the original error is passed on unchanged
+		fclose(zohfile);
+		throw;
+	}
+
+	printf("]\n");		// advance to next line finally
+
+	if (fclose(zohfile)) throw "Close failed on .zoh file";
+}
+
+// Convert the leading decimal integer in s using stream extraction.
+// Returns 0 when s does not start with a number (the result used to be left
+// uninitialized in that case).
+static int str2int(std::string s) 
+{
+	int thing = 0;
+	std::istringstream str (s);
+	str >> thing;
+	return thing;
+}
+
+// zoh file name is ...-w-h.zoh, extract width and height
+// The name is searched from the right for the last '.', the '-' before it and the '-'
+// before that; the text between the separators gives the width and the height.
+// Throws a string literal when the name does not follow that pattern or when either
+// number is not a plain run of decimal digits, instead of returning garbage dimensions.
+static void extract(string zohf, int &w, int &h)
+{
+	static const char *badname = "Unable to extract width and height from .zoh file name";
+
+	size_t n = zohf.rfind('.');
+	if (n == string::npos || n == 0) throw badname;
+	size_t n1 = zohf.rfind('-', n-1);
+	if (n1 == string::npos || n1 == 0) throw badname;
+	size_t n2 = zohf.rfind('-', n1-1);
+	if (n2 == string::npos) throw badname;
+
+	string width = zohf.substr(n2+1, n1-n2-1);
+	string height = zohf.substr(n1+1, n-n1-1);
+	if (width.empty() || width.find_first_not_of("0123456789") != string::npos) throw badname;
+	if (height.empty() || height.find_first_not_of("0123456789") != string::npos) throw badname;
+
+	w = str2int(width);
+	h = str2int(height);
+}
+
+// precision for each 5 bit mode value; -1 marks the unused modes 0x13, 0x17, 0x1b and 0x1f
+static int mode_to_prec[] = {
+	10,7,11,10,
+	10,7,11,11,
+	10,7,11,12,
+	10,7,9,16,
+	10,7,8,-1,
+	10,7,8,-1,
+	10,7,8,-1,
+	10,7,6,-1,
+};
+
+// decode statistics. The precision histograms are indexed by the precision itself,
+// which goes up to 16 (mode 0x0f), so they need 17 entries.
+static int shapeindexhist[32], modehist[32], prechistone[17], prechisttwo[17], oneregion, tworegions;
+
+// Update the statistics with one compressed block.
+static void stats(char block[ZOH::BLOCKSIZE])
+{
+	char mode = block[0] & 0x1F;
+	// every code ending in binary 00 is counted as mode 0 and every code ending in 01 as mode 1
+	// (presumably because those two modes only use two mode bits)
+	if ((mode & 0x3) == 0) mode = 0;
+	if ((mode & 0x3) == 1) mode = 1;
+	modehist[mode]++;
+	int prec = mode_to_prec[mode];
+	assert (prec != -1);
+	if (prec < 0) return;	// unused mode: never index the histograms with -1 when asserts are compiled out
+	if (!ZOH::isone(block))
+	{
+		tworegions++;
+		prechisttwo[prec]++;
+		// five bit shape index: top three bits of byte 0 and low two bits of byte 1
+		int shapeindex = ((block[0] & 0xe0) >> 5) | ((block[1] & 0x3) << 3);
+		shapeindexhist[shapeindex]++;
+	}
+	else
+	{
+		oneregion++;
+		prechistone[prec]++;
+	}
+}
+
+// Print the statistics gathered by stats() to stdout.
+static void printstats()
+{
+	printf("\nPrecision histogram 10b to 16b one region: "); for (int i=10; i<=16; ++i) printf("%d,", prechistone[i]);
+	printf("\nPrecision histogram 6b to 11b two regions: "); for (int i=6; i<=11; ++i) printf("%d,", prechisttwo[i]);
+	printf("\nMode histogram: "); for (int i=0; i<32; ++i) printf("%d,", modehist[i]);
+	printf("\nShape index histogram: "); for (int i=0; i<32; ++i) printf("%d,", shapeindexhist[i]);
+	printf("\nOne region %5.2f%%  Two regions %5.2f%%", 100.0*oneregion/float(oneregion+tworegions), 100.0*tworegions/float(oneregion+tworegions));
+	printf("\n");
+}
+
+// Decompress the block file zohf into the image file outf.
+// Width and height are taken from the file name (see extract); the file is read one
+// BLOCKSIZE-byte block per tile in row-major tile order, each block is decompressed and
+// copied into the image, and the image is written with Exr::writeRgba.
+// Statistics are collected for every block and printed unless EXTERNAL_RELEASE is defined.
+// Throws a string literal (const char *) when the input file cannot be opened, read or closed.
+void ZOH::decompress(string zohf, string outf)
+{
+	Array2D<Rgba> pixels;
+	int w, h;
+	char block[ZOH::BLOCKSIZE];
+
+	extract(zohf, w, h);
+	FILE *zohfile = fopen(zohf.c_str(), "rb");
+	if (zohfile == NULL) throw "Unable to open .zoh file for read";
+
+	try
+	{
+		pixels.resizeErase(h, w);
+
+		// convert to tiles and decompress each tile
+		for (int y=0; y<h; y+=Tile::TILE_H)
+		{
+			int ysize = MIN(Tile::TILE_H, h-y);
+			for (int x=0; x<w; x+=Tile::TILE_W)
+			{
+				int xsize = MIN(Tile::TILE_W, w-x);
+				Tile t(xsize, ysize);
+
+				if (fread(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
+					throw "File error on read";
+
+				stats(block);	// collect statistics
+
+				ZOH::decompress(block, t);
+
+				t.extract(pixels, x, y);
+			}
+		}
+	}
+	catch (...)
+	{
+		// don't leak the file handle on a short read or any other failure; rethrow unchanged
+		fclose(zohfile);
+		throw;
+	}
+	if (fclose(zohfile)) throw "Close failed on .zoh file";
+	Exr::writeRgba(outf, pixels, w, h);
+
+#ifndef EXTERNAL_RELEASE
+	printstats();	// print statistics
+#endif
+}
diff --git a/src/nvtt/bc6h/zoh.h b/src/nvtt/bc6h/zoh.h
new file mode 100755
index 0000000..96ce84d
--- /dev/null
+++ b/src/nvtt/bc6h/zoh.h
@@ -0,0 +1,78 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _ZOH_H
+#define _ZOH_H
+
+#include <string>
+
+#include "tile.h"
+
+using namespace std;
+
+// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f
+
+// Only whether this macro is defined matters, not its value: zoh.cpp prints the decode
+// statistics when it is NOT defined, and zohc.cpp prints the version banner when it is.
+#define	EXTERNAL_RELEASE	1	// define this if we're releasing this code externally
+
+// array sizes for the endpoint parameters of the two-region and one-region routines
+#define	NREGIONS_TWO	2
+#define	NREGIONS_ONE	1
+// size of the per-channel arrays in IntEndpts and ComprEndpts
+#define	NCHANNELS	3
+
+// Note: this code only reads OpenEXR files, which are only in F16 format.
+// if unsigned is selected, the input is clamped to >= 0.
+// if f16 is selected, the range is clamped to 0..0x7bff.
+
+// A pair of endpoints, A and B, each held as a Vec3.
+// Passed, one entry per region, to the rough*/refine* routines of class ZOH.
+struct FltEndpts
+{
+	Vec3	A;
+	Vec3	B;
+};
+
+// A pair of endpoints, A and B, held as one signed integer per channel.
+struct IntEndpts
+{
+	int		A[NCHANNELS];
+	int		B[NCHANNELS];
+};
+
+// A pair of endpoints, A and B, held as one unsigned integer per channel
+// (presumably the values as they are stored in the compressed block -- confirm in zohone.cpp/zohtwo.cpp).
+struct ComprEndpts
+{
+	unsigned int	A[NCHANNELS];
+	unsigned int	B[NCHANNELS];
+};
+
+// The zoh compressor and decompressor. All members are static; the class is used as a namespace.
+class ZOH
+{
+public:
+	static const int BLOCKSIZE=16;		// size of one compressed block in bytes
+	static const int BITSIZE=128;		// the same size in bits
+	static Format FORMAT;				// NOTE(review): zohc.cpp only defines Utils::FORMAT -- confirm this member is defined and used somewhere
+
+	// whole-file entry points: image file -> block file and block file -> image file
+	static void compress(string inf, string zohf);
+	static void decompress(string zohf, string outf);
+	// single tile <-> single BLOCKSIZE-byte block; compress keeps whichever of
+	// compressone/compresstwo reports the smaller value, decompress dispatches on isone()
+	static void compress(const Tile &t, char *block);
+	static void decompress(const char *block, Tile &t);
+
+	// one-region and two-region variants; the compressors return the error used by compress() to choose
+	static double compressone(const Tile &t, char *block);
+	static double compresstwo(const Tile &t, char *block);
+	static void decompressone(const char *block, Tile &t);
+	static void decompresstwo(const char *block, Tile &t);
+
+	static double refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block);
+	static double roughtwo(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_TWO]);
+
+	static double refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block);
+	static double roughone(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_ONE]);
+
+	// true when the mode bits of the block are one of the one-region codes 0x03, 0x07, 0x0b, 0x0f
+	static bool isone(const char *block);
+};
+
+#endif
\ No newline at end of file
diff --git a/src/nvtt/bc6h/zoh.sln b/src/nvtt/bc6h/zoh.sln
new file mode 100755
index 0000000..74b9fc2
--- /dev/null
+++ b/src/nvtt/bc6h/zoh.sln
@@ -0,0 +1,21 @@
+Microsoft Visual Studio Solution File, Format Version 8.00
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zoh", "zoh.vcproj", "{C6F6CD96-D0C0-4A35-B1BC-53E0A3CB712F}"
+	ProjectSection(ProjectDependencies) = postProject
+	EndProjectSection
+EndProject
+Global
+	GlobalSection(SolutionConfiguration) = preSolution
+		Debug = Debug
+		Release = Release
+	EndGlobalSection
+	GlobalSection(ProjectConfiguration) = postSolution
+		{C6F6CD96-D0C0-4A35-B1BC-53E0A3CB712F}.Debug.ActiveCfg = Debug|Win32
+		{C6F6CD96-D0C0-4A35-B1BC-53E0A3CB712F}.Debug.Build.0 = Debug|Win32
+		{C6F6CD96-D0C0-4A35-B1BC-53E0A3CB712F}.Release.ActiveCfg = Release|Win32
+		{C6F6CD96-D0C0-4A35-B1BC-53E0A3CB712F}.Release.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+	EndGlobalSection
+	GlobalSection(ExtensibilityAddIns) = postSolution
+	EndGlobalSection
+EndGlobal
diff --git a/src/nvtt/bc6h/zoh.vcproj b/src/nvtt/bc6h/zoh.vcproj
new file mode 100755
index 0000000..31a7f30
--- /dev/null
+++ b/src/nvtt/bc6h/zoh.vcproj
@@ -0,0 +1,281 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="7.10"
+	Name="zoh"
+	ProjectGUID="{C6F6CD96-D0C0-4A35-B1BC-53E0A3CB712F}"
+	SccProjectName=""
+	SccLocalPath="">
+	<Platforms>
+		<Platform
+			Name="Win32"/>
+	</Platforms>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory=".\Debug"
+			IntermediateDirectory=".\Debug"
+			ConfigurationType="1"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="FALSE"
+			CharacterSet="2">
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="../include/OpenEXR"
+				PreprocessorDefinitions="_DEBUG;WIN32;_CONSOLE"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				ForceConformanceInForLoopScope="TRUE"
+				RuntimeTypeInfo="TRUE"
+				UsePrecompiledHeader="0"
+				ProgramDataBaseFileName="$(IntDir)/$(ProjectName)_d.pdb"
+				WarningLevel="1"
+				SuppressStartupBanner="TRUE"
+				Detect64BitPortabilityProblems="TRUE"
+				DebugInformationFormat="4"
+				CompileAs="0"
+				DisableSpecificWarnings="4290"/>
+			<Tool
+				Name="VCCustomBuildTool"/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="IlmImf.lib IMath.lib Half.lib zlib.lib comctl32.lib"
+				OutputFile="../test/zohc_d.exe"
+				LinkIncremental="2"
+				SuppressStartupBanner="TRUE"
+				AdditionalLibraryDirectories="../lib/OpenEXR"
+				GenerateDebugInformation="TRUE"
+				SubSystem="1"
+				TargetMachine="1"/>
+			<Tool
+				Name="VCMIDLTool"
+				TypeLibraryName="./Debug/zoh.tlb"
+				HeaderFileName=""/>
+			<Tool
+				Name="VCPostBuildEventTool"/>
+			<Tool
+				Name="VCPreBuildEventTool"/>
+			<Tool
+				Name="VCPreLinkEventTool"/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="_DEBUG"
+				Culture="1033"/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"/>
+			<Tool
+				Name="VCWebDeploymentTool"/>
+			<Tool
+				Name="VCManagedWrapperGeneratorTool"/>
+			<Tool
+				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory=".\Release"
+			IntermediateDirectory=".\Release"
+			ConfigurationType="1"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="FALSE"
+			CharacterSet="2">
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				InlineFunctionExpansion="1"
+				AdditionalIncludeDirectories="../include/OpenEXR"
+				PreprocessorDefinitions="NDEBUG;WIN32;_CONSOLE"
+				StringPooling="TRUE"
+				RuntimeLibrary="2"
+				ForceConformanceInForLoopScope="TRUE"
+				RuntimeTypeInfo="TRUE"
+				UsePrecompiledHeader="0"
+				ProgramDataBaseFileName="$(IntDir)/$(ProjectName).pdb"
+				WarningLevel="1"
+				SuppressStartupBanner="TRUE"
+				Detect64BitPortabilityProblems="TRUE"
+				DebugInformationFormat="3"
+				CompileAs="0"
+				DisableSpecificWarnings="4290"/>
+			<Tool
+				Name="VCCustomBuildTool"/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="IlmImf.lib IMath.lib Half.lib zlib.lib comctl32.lib"
+				OutputFile="../test/zohc.exe"
+				LinkIncremental="1"
+				SuppressStartupBanner="TRUE"
+				AdditionalLibraryDirectories="../lib/OpenEXR"
+				GenerateDebugInformation="FALSE"
+				SubSystem="1"
+				EntryPointSymbol="mainCRTStartup"
+				TargetMachine="1"/>
+			<Tool
+				Name="VCMIDLTool"
+				TypeLibraryName="./Release/zoh.tlb"
+				HeaderFileName=""/>
+			<Tool
+				Name="VCPostBuildEventTool"/>
+			<Tool
+				Name="VCPreBuildEventTool"/>
+			<Tool
+				Name="VCPreLinkEventTool"/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="NDEBUG"
+				Culture="1033"/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"/>
+			<Tool
+				Name="VCWebDeploymentTool"/>
+			<Tool
+				Name="VCManagedWrapperGeneratorTool"/>
+			<Tool
+				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat">
+			<File
+				RelativePath=".\exr.cpp">
+			</File>
+			<File
+				RelativePath=".\utils.cpp">
+			</File>
+			<File
+				RelativePath=".\zoh.cpp">
+			</File>
+			<File
+				RelativePath=".\zohc.cpp">
+			</File>
+			<File
+				RelativePath=".\zohone.cpp">
+			</File>
+			<File
+				RelativePath=".\zohtwo.cpp">
+			</File>
+			<Filter
+				Name="arvo"
+				Filter="">
+				<File
+					RelativePath=".\arvo\ArvoMath.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Char.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Complex.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Matrix.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Perm.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Rand.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\SphTri.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\SVD.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Token.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Vec2.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Vec3.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Vector.cpp">
+				</File>
+			</Filter>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl">
+			<File
+				RelativePath=".\bits.h">
+			</File>
+			<File
+				RelativePath=".\exr.h">
+			</File>
+			<File
+				RelativePath=".\shapes_two.h">
+			</File>
+			<File
+				RelativePath=".\tile.h">
+			</File>
+			<File
+				RelativePath=".\utils.h">
+			</File>
+			<File
+				RelativePath=".\zoh.h">
+			</File>
+			<Filter
+				Name="arvo"
+				Filter="">
+				<File
+					RelativePath=".\arvo\ArvoMath.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Char.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Complex.h">
+				</File>
+				<File
+					RelativePath=".\arvo\form.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Matrix.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Perm.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Rand.h">
+				</File>
+				<File
+					RelativePath=".\arvo\SI_units.h">
+				</File>
+				<File
+					RelativePath=".\arvo\SphTri.h">
+				</File>
+				<File
+					RelativePath=".\arvo\SVD.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Token.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Vec2.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Vec3.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Vector.h">
+				</File>
+			</Filter>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe">
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
diff --git a/src/nvtt/bc6h/zohc.cpp b/src/nvtt/bc6h/zohc.cpp
new file mode 100755
index 0000000..79cbb30
--- /dev/null
+++ b/src/nvtt/bc6h/zohc.cpp
@@ -0,0 +1,301 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// WORK: the widecolorgamut.exr image is the poster child for not doing independent compression of each 4x4 tile. (The image is available from www.openexr.org.)
+// At the lower-left vertex, the companded image shows a visible artifact due to the vertex being compressed with 6 bit endpoint accuracy
+// but the constant tile right next to it being compressed with 16 bit endpoint accuracy. It's an open problem to figure out how to deal with that in the best possible way.
+//
+// WORK: we removed 4 codes since we couldn't come up with anything to use them for that showed a worthwhile improvement in PSNR. Clearly the compression format can be improved since
+// we're only using 7/8 of the available code space. But how?
+// 
+// NOTE: HDR compression formats that compress luminance and chrominance separately and multiply them together to get the final decompressed channels tend to do poorly at extreme values.
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+
+#include <ImfArray.h>
+#include "exr.h"
+#include "zoh.h"
+#include "utils.h"
+
+using namespace std;
+
+// inclusive exposure range over which analyze() accumulates the mPSNR error;
+// main() overrides the defaults when two extra arguments are given in compare mode
+static int mpsnr_low = -10, mpsnr_high = 10;
+
+// Debug helper: print tag, then the 4x4 pixels of in1 whose top-left corner is (x,y),
+// one row per line, with each channel converted by Utils::ushort_to_format.
+static void dump(char *tag, Array2D<Rgba> &in1, int x, int y)
+{
+	printf("\n%s\n", tag);
+	for (int row = 0; row < 4; ++row)
+	{
+		for (int col = 0; col < 4; ++col)
+		{
+			Rgba &px = in1[y+row][x+col];
+			int r = Utils::ushort_to_format((px.r).bits());
+			int g = Utils::ushort_to_format((px.g).bits());
+			int b = Utils::ushort_to_format((px.b).bits());
+			printf("%6d%6d%6d   ", r, g, b);
+		}
+		printf("\n");
+	}
+}
+
+static void analyze(string in1, string in2)
+{
+	Array2D<Rgba> pin1, pin2;
+	int w1, h1, w2, h2;
+
+	Exr::readRgba(in1, pin1, w1, h1);
+	Exr::readRgba(in2, pin2, w2, h2);
+
+	// choose the smaller of the two dimensions (since the old compressor would truncate to multiple-of-4 sizes)
+	int w = MIN(w1, w2);
+	int h = MIN(h1, h2);
+
+	double nsamples = 0;
+	double mabse = 0, mse = 0, mpsnre = 0;
+	int errdist[17];
+	int errs[3*16];
+
+	for (int i=0; i<17; ++i)
+		errdist[i] = 0;
+
+	int psnrhist[100];
+	for (int i=0; i<100; ++i)
+		psnrhist[i] = 0;
+	bool first = true;
+
+	for (int y = 0; y < h; y+=4)
+	for (int x = 0; x < w; x+=4)
+	{
+		int xw = MIN(w-x, 4);
+		int yw = MIN(h-y, 4);
+		int np = 0;
+
+		Vec3 a, b;
+
+		for (int y0=0; y0<yw; ++y0)
+		for (int x0=0; x0<xw; ++x0)
+		{
+			a.X() = Utils::ushort_to_format(((pin1[y+y0][x+x0]).r).bits());
+			a.Y() = Utils::ushort_to_format(((pin1[y+y0][x+x0]).g).bits());
+			a.Z() = Utils::ushort_to_format(((pin1[y+y0][x+x0]).b).bits());
+
+			b.X() = Utils::ushort_to_format(((pin2[y+y0][x+x0]).r).bits());
+			b.Y() = Utils::ushort_to_format(((pin2[y+y0][x+x0]).g).bits());
+			b.Z() = Utils::ushort_to_format(((pin2[y+y0][x+x0]).b).bits());
+
+			for (int exposure = mpsnr_low; exposure <= mpsnr_high; ++exposure)
+				mpsnre += Utils::mpsnr_norm(a, exposure, b);
+
+			errs[np+0] = a.X() - b.X();
+			errs[np+1] = a.Y() - b.Y();
+			errs[np+2] = a.Z() - b.Z();
+			np += 3;
+		}
+
+		double msetile = 0.0;
+
+		for (int i = 0; i < np; ++i)
+		{
+			int err = errs[i];
+			int abse = err > 0 ? err : -err;
+			mabse += (double)abse;
+			mse += (double)abse * abse;
+			msetile += (double)abse * abse;
+
+			int lsb;
+
+			for (lsb=0; abse>0; ++lsb, abse >>= 1)
+				;
+
+			errdist[lsb]++;
+		}
+
+		double psnrtile, rmsetile;
+
+		rmsetile = sqrt(msetile / double(np));
+		psnrtile = (rmsetile == 0) ? 99.0 : 20.0 * log10(32767.0/rmsetile);
+
+		int psnrquant = (int) floor (psnrtile);		// 10 means [10,11) psnrs, e.g.
+		// clamp just in case
+		psnrquant = (psnrquant < 0) ? 0 : (psnrquant > 99) ? 99 : psnrquant;
+		psnrhist[psnrquant]++;
+		if (first && psnrquant < 20)
+		{
+			first = false;
+			printf("Tiles with PSNR's worse than 20dB\n");
+		}
+		if (psnrquant < 20)
+			printf("X %4d Y %4d PSNR %7.2f\n", x, y, psnrtile);
+	}
+	
+	nsamples = w * h * 3;
+
+	mabse /= nsamples;
+	mse /= nsamples;
+
+	double rmse, psnr;
+
+	rmse = sqrt(mse);
+	psnr = (rmse == 0) ? 999.0 : 20.0 * log10(32767.0/rmse);
+
+	mpsnre /= (mpsnr_high-mpsnr_low+1) * w * h;
+
+	double mpsnr = (mpsnre == 0) ? 999.0 : 10.0 * log10(3.0 * 255.0 * 255.0 / mpsnre);
+
+	printf("Image size compared: %dw x %dh\n", w, h);
+	if (w != w1 || w != w2 || h != h1 || h != h2)
+		printf("--- NOTE: only the overlap between the 2 images (%d,%d) and (%d,%d) was compared\n", w1, h1, w2, h2);
+	printf("Total pixels: %12.0f\n", nsamples/3);
+	printf("Mean absolute error: %f\n", mabse);
+	printf("Root mean squared error: %f\n", rmse);
+	printf("Peak signal to noise ratio in dB: %f\n", psnr);
+	printf("mPSNR for exposure range %d..%d: %8.3f\n", mpsnr_low, mpsnr_high, mpsnr);
+	printf("Histogram of number of channels with indicated LSB error\n");
+	for (int i = 0; i < 17; ++i)
+		if (errdist[i])
+			printf("%2d LSB error: %10d\n", i, errdist[i]);
+#if 0
+	printf("Histogram of per-tile PSNR\n");
+	for (int i = 0; i < 100; ++i)
+		if (psnrhist[i])
+			printf("[%2d,%2d) %6d\n", i, i+1, psnrhist[i]);
+#endif
+}
+
+// True when the text of inf from its last '.' onwards equals extension (which includes the '.').
+// A name without any '.' only matches the empty extension "".
+// extension is const so that string literals can be passed.
+static bool ext(string inf, const char *extension)
+{
+	size_t n = inf.rfind('.');
+	if (n != string::npos)
+		return inf.substr(n) == extension;
+	// extension is null and we didn't find a . -> match
+	return *extension == '\0';
+}
+
+// Format any streamable value as text using operator<<.
+template <typename T>
+std::string toString(const T &thing) 
+{
+	std::stringstream formatted;
+	formatted << thing;
+	std::string text = formatted.str();
+	return text;
+}
+
+// Convert the leading decimal integer in s using stream extraction.
+// Returns 0 when s does not start with a number (the result used to be left
+// uninitialized in that case).
+static int str2int(std::string s) 
+{
+	int thing = 0;
+	std::istringstream str (s);
+	str >> thing;
+	return thing;
+}
+
+// Print the command line help to stdout.
+static void usage()
+{
+	cout << endl;
+	cout << "Usage:" << endl;
+	cout << "zohc infile.exr outroot             generates outroot-w-h.bc6, outroot-bc6.exr" << endl;
+	cout << "zohc foo-w-h.bc6 outroot            generates outroot-bc6.exr" << endl;
+	cout << "zohc infile.exr outfile.exr [e1 e2] compares the two images; optionally specify the mPSNR exposure range" << endl;
+	cout << endl;
+	cout << "Flags:" << endl;
+	cout << "-u     treat the input as unsigned. negative values are clamped to zero. (default)" << endl;
+	cout << "-s     treat the input as signed." << endl;
+}
+
+// definition of the global format selector; main() overwrites it from the -u / -s flags
+Format Utils::FORMAT = UNSIGNED_F16;
+
+// Command line driver. Depending on the file arguments it either
+//   - compresses an .exr to outroot-w-h.bc6, decompresses it again and compares the result,
+//   - decompresses a .bc6 to outroot-bc6.exr, or
+//   - compares two .exr files, optionally with an explicit mPSNR exposure range.
+// Flags -u / -s select unsigned (default) or signed handling of the half values.
+// Returns 0 on success and 1 after printing the error and the usage text.
+int main(int argc, char* argv[])
+{
+#ifdef EXTERNAL_RELEASE
+	cout << "BC6H OpenEXR RGB Compressor/Decompressor version 1.61 (May 27 2010)." << endl <<
+			"Bug reports, questions, and suggestions to wdonovan a t nvidia d o t com." << endl << endl;
+#endif
+	try
+	{
+		const int MAXARGS = 4;		// two file names plus the optional exposure pair
+		char * args[MAXARGS];
+		int nargs = 0;
+		bool is_unsigned = true;
+
+		// process flags, copy any non flag arg to args[]
+		for (int i = 1; i < argc; ++i)
+		{
+			if ((argv[i])[0] == '-')
+				switch ((argv[i])[1]) {
+					case 'u': is_unsigned = true; break;
+					case 's': is_unsigned = false; break;
+					default:  throw "bad flag arg";
+				}
+			else
+			{
+				// the limit must match the size of args[], otherwise extra arguments overrun it
+				if (nargs >= MAXARGS) throw "Incorrect number of args";
+				args[nargs++] = argv[i];
+			}
+		}
+
+		Utils::FORMAT = (!is_unsigned) ? SIGNED_F16 : UNSIGNED_F16;
+
+		if (nargs < 2) throw "Incorrect number of args";
+
+		string inf(args[0]), outroot(args[1]);
+
+		cout << "Input format is: " << (is_unsigned ? "UNSIGNED FLOAT_16" : "SIGNED FLOAT_16") << endl;
+
+		if (ext(outroot, ""))
+		{
+			if (ext(inf, ".exr"))
+			{
+				int width, height;
+				Exr::fileinfo(inf, width, height);
+				string outf, zohf;
+				outf = outroot + "-bc6.exr";
+				zohf = outroot + "-" + toString(width) + "-" + toString(height) + ".bc6";
+				cout << "Compressing " << inf << " to " << zohf << endl;
+				ZOH::compress(inf, zohf);
+				cout << "Decompressing " << zohf << " to " << outf << endl;
+				ZOH::decompress(zohf, outf);
+				analyze(inf, outf);
+			}
+			else if (ext(inf, ".bc6"))
+			{
+				string outf;
+				outf = outroot + "-bc6.exr";
+				cout << "Decompressing " << inf << " to " << outf << endl;
+				ZOH::decompress(inf, outf);
+			}
+			else throw "Invalid file args";
+		}
+		else if (ext(inf, ".exr") && ext(outroot, ".exr"))
+		{
+			if (nargs == 4)
+			{
+				string low(args[2]), high(args[3]);
+				mpsnr_low = str2int(low);
+				mpsnr_high = str2int(high);
+				if (mpsnr_low > mpsnr_high) throw "Invalid exposure range";
+			}
+			analyze(inf, outroot);
+		}
+		else throw "Invalid file args";
+	}
+	catch(const exception& e)
+	{
+		// Print error message and usage instructions
+		cerr << e.what() << endl;
+		usage();
+		return 1;
+	}
+	catch(const char * msg)
+	{
+		// the string literals thrown throughout this program have type const char *,
+		// which a handler for plain char * would not catch
+		cerr << msg << endl;
+		usage();
+		return 1;
+	}
+	return 0;
+}
diff --git a/src/nvtt/bc6h/zohone.cpp b/src/nvtt/bc6h/zohone.cpp
new file mode 100755
index 0000000..e6d2b87
--- /dev/null
+++ b/src/nvtt/bc6h/zohone.cpp
@@ -0,0 +1,804 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// one region zoh compress/decompress code
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+#include "bits.h"
+#include "tile.h"
+#include "zoh.h"
+#include "arvo/Vec3.h"
+#include "arvo/Matrix.h"
+#include "arvo/SVD.h"
+#include "utils.h"
+
+#include <assert.h>
+
+using namespace ArvoMath;
+
+#define NINDICES	16	// palette entries per region
+#define	INDEXBITS	4	// bits used to store one palette index
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// top bit of an index; swap_indices keeps it clear at position 0
+#define	DENOM		(NINDICES-1)	// denominator handed to PALETTE_LERP when interpolating the palette
+
+#define	NSHAPES	1
+// partition masks, one bit per pixel; the single all-zero mask assigns every pixel to region 0
+static int shapes[NSHAPES] =
+{
+	0x0000
+};	// only 1 shape
+// region of pixel (x,y): bit (15 - x - 4*y) of the selected shape mask
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+// pixel position 0..15 to tile coordinates (pos = x + 4*y)
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NDELTA	2	// number of precisions kept per channel (see Chanpat::prec)
+
+struct Chanpat
+{
+	int prec[NDELTA];		// bit widths for one channel: prec[0] masks endpoint A, prec[1] masks endpoint B (or the B-A delta)
+};
+
+struct Pattern
+{
+	Chanpat chan[NCHANNELS];// allow different bit patterns per channel -- but we still want constant precision per channel
+	int transformed;		// if 0, endpoint B is stored as-is; otherwise B is stored as a signed delta from endpoint A
+	int mode;				// mode value written into the m field of the block header
+	int modebits;			// number of mode bits
+	char *encoding;			// verilog-style field list for this mode's header; write_header/read_header parse it starting from the end
+};
+
+#define MAXMODEBITS	5
+#define	MAXMODES (1<<MAXMODEBITS)
+// MAXMODES sizes the mode_to_pat lookup table; NPATTERNS is the number of one-region modes listed below
+#define	NPATTERNS 4
+// one row per mode, highest endpoint precision first: {prec A, prec B} for three channels, transformed, mode, modebits, encoding
+static Pattern patterns[NPATTERNS] =
+{
+	16,4,	16,4,	16,4,		1,	0x0f, 5, "bw[10],bw[11],bw[12],bw[13],bw[14],bw[15],bx[3:0],gw[10],gw[11],gw[12],gw[13],gw[14],gw[15],gx[3:0],rw[10],rw[11],rw[12],rw[13],rw[14],rw[15],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+	12,8,	12,8,	12,8,		1,  0x0b, 5, "bw[10],bw[11],bx[7:0],gw[10],gw[11],gx[7:0],rw[10],rw[11],rx[7:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+	11,9,	11,9,	11,9,		1,	0x07, 5, "bw[10],bx[8:0],gw[10],gx[8:0],rw[10],rx[8:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+	10,10,	10,10,	10,10,		0,	0x03, 5, "bx[9:0],gx[9:0],rx[9:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+};
+
+// mapping of mode to the corresponding index in patterns[]; -1 marks a mode value that has no entry in patterns[]
+static int mode_to_pat[MAXMODES] = {
+	-1,-1,-1,
+	3,	// 0x03
+	-1,-1,-1,
+	2,	// 0x07
+	-1,-1,-1,
+	1,	// 0x0b
+	-1,-1,-1,
+	0,	// 0x0f
+	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+};
+
+#define	R_0(ep)	(ep)[0].A[i]	// endpoint A of region 0, channel i -- relies on a variable named i at the point of use
+#define	R_1(ep)	(ep)[0].B[i]	// endpoint B of region 0, channel i -- relies on a variable named i at the point of use
+#define	MASK(n)	((1<<(n))-1)	// value with the low n bits set
+
+// compress endpoints: keep the low prec[0] bits of A and the low prec[1] bits of B (of B-A when the pattern is transformed)
+static void compress_endpts(const IntEndpts in[NREGIONS_ONE], ComprEndpts out[NREGIONS_ONE], const Pattern &p)
+{
+	const bool as_delta = (p.transformed != 0);
+
+	for (int i=0; i<NCHANNELS; ++i)
+	{
+		const int mask_a = MASK(p.chan[i].prec[0]);
+		const int mask_b = MASK(p.chan[i].prec[1]);
+
+		// endpoint A is always stored directly
+		R_0(out) = R_0(in) & mask_a;
+
+		// endpoint B is stored directly, or as its offset from A for transformed patterns
+		if (as_delta)
+			R_1(out) = (R_1(in) - R_0(in)) & mask_b;
+		else
+			R_1(out) = R_1(in) & mask_b;
+	}
+}
+
+// decompress endpoints: rebuild A and B from their packed form; results are sign-extended only when FORMAT is SIGNED_F16
+static void decompress_endpts(const ComprEndpts in[NREGIONS_ONE], IntEndpts out[NREGIONS_ONE], const Pattern &p)
+{
+	bool issigned = Utils::FORMAT == SIGNED_F16;
+
+	if (p.transformed)
+	{
+		for (int i=0; i<NCHANNELS; ++i)
+		{
+			R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+			int t;
+			t = SIGN_EXTEND(R_1(in), p.chan[i].prec[1]);		// B was packed as a prec[1]-bit delta from A
+			t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);		// add A back and wrap the sum to prec[0] bits
+			R_1(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+		}
+	}
+	else
+	{
+		for (int i=0; i<NCHANNELS; ++i)
+		{
+			R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+			R_1(out) = issigned ? SIGN_EXTEND(R_1(in),p.chan[i].prec[1]) : R_1(in);
+		}
+	}
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_ONE], int prec, IntEndpts q_endpts[NREGIONS_ONE])
+{
+	// quantize both float endpoints of every region to prec bits; X, Y, Z go to channels 0, 1, 2
+	for (int r = 0; r < NREGIONS_ONE; ++r)
+	{
+		const FltEndpts &src = endpts[r];
+		IntEndpts &dst = q_endpts[r];
+
+		dst.A[0] = Utils::quantize(src.A.X(), prec); dst.A[1] = Utils::quantize(src.A.Y(), prec); dst.A[2] = Utils::quantize(src.A.Z(), prec);
+		dst.B[0] = Utils::quantize(src.B.X(), prec); dst.B[1] = Utils::quantize(src.B.Y(), prec); dst.B[2] = Utils::quantize(src.B.Z(), prec);
+	}
+}
+
+// swap endpoints as needed to ensure that the index stored at each region's fixed position has a 0 high-order bit
+// positions run from 0 at x=0 y=0 to 15 at x=3 y=3, so x = pos & 3 and y = (pos >> 2) & 3
+static void swap_indices(IntEndpts endpts[NREGIONS_ONE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	int index_positions[NREGIONS_ONE];
+
+	index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+	for (int region = 0; region < NREGIONS_ONE; ++region)
+	{
+		const int fixed_x = POS_TO_X(index_positions[region]);
+		const int fixed_y = POS_TO_Y(index_positions[region]);
+		assert(REGION(fixed_x,fixed_y,shapeindex) == region);		// double check the table
+		if ((indices[fixed_y][fixed_x] & HIGH_INDEXBIT) == 0)
+			continue;		// high bit already clear, nothing to do for this region
+
+		// exchange endpoints A and B of this region, channel by channel...
+		for (int ch=0; ch<NCHANNELS; ++ch)
+		{ const int held = endpts[region].A[ch]; endpts[region].A[ch] = endpts[region].B[ch]; endpts[region].B[ch] = held; }
+		// ...and mirror every index of the region so each pixel keeps the same palette color
+		for (int py = 0; py < Tile::TILE_H; py++)
+		for (int px = 0; px < Tile::TILE_W; px++)
+			if (REGION(px,py,shapeindex) == region)
+				indices[py][px] = NINDICES - 1 - indices[py][px];
+	}
+}
+
+// the endpoints fit pattern p only if decompressing their packed form reproduces the originals exactly
+static bool endpts_fit(const IntEndpts orig[NREGIONS_ONE], const ComprEndpts compressed[NREGIONS_ONE], const Pattern &p)
+{
+	IntEndpts roundtrip[NREGIONS_ONE];
+	decompress_endpts(compressed, roundtrip, p);
+
+	bool lossless = true;
+	for (int r=0; lossless && r<NREGIONS_ONE; ++r)
+	{
+		for (int ch=0; lossless && ch<NCHANNELS; ++ch)
+			lossless = orig[r].A[ch] == roundtrip[r].A[ch]
+					&& orig[r].B[ch] == roundtrip[r].B[ch];
+	}
+	return lossless;
+}
+
+static void write_header(const ComprEndpts endpts[NREGIONS_ONE], const Pattern &p, Bits &out)
+{
+	// interpret the verilog backwards and process it: each field parsed from the end of p.encoding is written to out in turn
+	int m = p.mode;
+	int rw = endpts[0].A[0], rx = endpts[0].B[0];	// w = endpoint A, x = endpoint B; channels 0,1,2 are named r,g,b
+	int gw = endpts[0].A[1], gx = endpts[0].B[1];
+	int bw = endpts[0].A[2], bx = endpts[0].B[2];
+	int ptr = strlen(p.encoding);	// start at the end of the string; parse receives ptr by reference and the loop stops once it is 0
+	while (ptr)
+	{
+		Field field;
+		int endbit, len;
+		// judging by the shifts below, endbit is the lowest bit of the parsed slice and len its width -- confirm in Utils::parse
+		Utils::parse(p.encoding, ptr, field, endbit, len);
+		switch(field)
+		{
+		case FIELD_M:	out.write( m >> endbit, len); break;
+		case FIELD_RW:	out.write(rw >> endbit, len); break;
+		case FIELD_RX:	out.write(rx >> endbit, len); break;
+		case FIELD_GW:	out.write(gw >> endbit, len); break;
+		case FIELD_GX:	out.write(gx >> endbit, len); break;
+		case FIELD_BW:	out.write(bw >> endbit, len); break;
+		case FIELD_BX:	out.write(bx >> endbit, len); break;
+		// none of the encodings in patterns[] uses the remaining fields, so reaching them is a bug
+		case FIELD_D:
+		case FIELD_RY:
+		case FIELD_RZ:
+		case FIELD_GY:
+		case FIELD_GZ:
+		case FIELD_BY:
+		case FIELD_BZ:
+		default: assert(0);
+		}
+	}
+}
+
+static void read_header(Bits &in, ComprEndpts endpts[NREGIONS_ONE], Pattern &p)
+{
+	// reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
+	// two mode bits are read first; the values 0 and 1 are complete as they stand, anything else is followed by 3 more bits
+	int mode = in.read(2);
+	if (mode != 0x00 && mode != 0x01)
+		mode = (in.read(3) << 2) | mode;
+
+	int pat_index = mode_to_pat[mode];
+
+	assert (pat_index >= 0 && pat_index < NPATTERNS);
+	// asserts vanish under NDEBUG, so reject unknown modes explicitly instead of indexing patterns[] out of range
+	if (pat_index < 0 || pat_index >= NPATTERNS) throw "Invalid mode for a one region block (read_header.)";
+	assert (in.getptr() == patterns[pat_index].modebits);
+
+	p = patterns[pat_index];
+
+	// accumulators for the endpoint fields: w = endpoint A, x = endpoint B
+	int rw = 0, rx = 0;
+	int gw = 0, gx = 0;
+	int bw = 0, bx = 0;
+
+	// walk the encoding string from its end, the same direction write_header uses
+	int ptr = strlen(p.encoding);
+
+	while (ptr)
+	{
+		Field field;
+		int endbit, len;
+
+		Utils::parse(p.encoding, ptr, field, endbit, len);
+
+		switch(field)
+		{
+		case FIELD_M:	break;	// already processed so ignore
+		case FIELD_RW:	rw |= in.read(len) << endbit; break;
+		case FIELD_RX:	rx |= in.read(len) << endbit; break;
+		case FIELD_GW:	gw |= in.read(len) << endbit; break;
+		case FIELD_GX:	gx |= in.read(len) << endbit; break;
+		case FIELD_BW:	bw |= in.read(len) << endbit; break;
+		case FIELD_BX:	bx |= in.read(len) << endbit; break;
+
+		case FIELD_D:	
+		case FIELD_RY:	
+		case FIELD_RZ:	
+		case FIELD_GY:	
+		case FIELD_GZ:	
+		case FIELD_BY:	
+		case FIELD_BZ:	
+		default: assert(0);
+		}
+	}
+
+	// the header (mode plus endpoints) must have consumed exactly 128 - 63 = 65 bits
+	assert (in.getptr() == 128 - 63);
+
+	endpts[0].A[0] = rw; endpts[0].B[0] = rx;
+	endpts[0].A[1] = gw; endpts[0].B[1] = gx;
+	endpts[0].A[2] = bw; endpts[0].B[2] = bx;
+}
+
+// write the indices in position order; position 0 is written with one bit fewer than the others
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	// position 0 first, using INDEXBITS-1 bits
+	out.write(indices[POS_TO_Y(0)][POS_TO_X(0)], INDEXBITS - 1);
+
+	for (int pos = 1; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		out.write(indices[POS_TO_Y(pos)][POS_TO_X(pos)], INDEXBITS);
+	}
+}
+
+static void emit_block(const ComprEndpts endpts[NREGIONS_ONE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, ZOH::BITSIZE);
+	// the header (mode and packed endpoints) goes first...
+	write_header(endpts, p, out);
+	// ...followed by the palette indices
+	write_indices(indices, shapeindex, out);
+	// together the two parts must account for exactly ZOH::BITSIZE bits
+	assert(out.getptr() == ZOH::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vec3 palette[NINDICES])
+{
+	// build the NINDICES-entry palette one channel at a time: unquantize both endpoints, interpolate, then finish_unquantize
+	int a, b;			// really need a IntVec3...
+	// channel 0 -> palette[i].X()
+	a = Utils::unquantize(endpts.A[0], prec); 
+	b = Utils::unquantize(endpts.B[0], prec);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].X() = Utils::finish_unquantize(PALETTE_LERP(a, b, i, DENOM), prec);
+	// channel 1 -> palette[i].Y()
+	a = Utils::unquantize(endpts.A[1], prec); 
+	b = Utils::unquantize(endpts.B[1], prec);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].Y() = Utils::finish_unquantize(PALETTE_LERP(a, b, i, DENOM), prec);
+	// channel 2 -> palette[i].Z()
+	a = Utils::unquantize(endpts.A[2], prec); 
+	b = Utils::unquantize(endpts.B[2], prec);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].Z() = Utils::finish_unquantize(PALETTE_LERP(a, b, i, DENOM), prec);
+}
+
+// read the indices in position order; position 0 was stored with one bit fewer than the others
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	// position 0 first, using INDEXBITS-1 bits
+	indices[POS_TO_Y(0)][POS_TO_X(0)] = in.read(INDEXBITS - 1);
+
+	for (int pos = 1; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		indices[POS_TO_Y(pos)][POS_TO_X(pos)] = in.read(INDEXBITS);
+	}
+}
+
+void ZOH::decompressone(const char *block, Tile &t)
+{
+	Bits in(block, ZOH::BITSIZE);
+	// decode order follows the bitstream: header (mode + packed endpoints) first, indices afterwards
+	Pattern p;
+	IntEndpts endpts[NREGIONS_ONE];
+	ComprEndpts compr_endpts[NREGIONS_ONE];
+
+	read_header(in, compr_endpts, p);
+	int shapeindex = 0;		// only one shape
+	// expand the packed endpoints according to the pattern read_header selected
+	decompress_endpts(compr_endpts, endpts, p);
+	// palette precision is prec[0] of channel 0 (refineone asserts that prec[0] is the same for all channels)
+	Vec3 palette[NREGIONS_ONE][NINDICES];
+	for (int r = 0; r < NREGIONS_ONE; ++r)
+		generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]);
+
+	// read indices
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+	// header plus indices must have consumed the whole block
+	assert(in.getptr() == ZOH::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static double map_colors(const Vec3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
+{
+	Vec3 palette[NINDICES];
+	double toterr = 0;
+
+	generate_palette_quantized(endpts, prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		// start from palette entry 0; every error is scaled by the importance of color i
+		double besterr = Utils::norm(colors[i], palette[0]) * importance[i];
+
+		// walk the palette in order and stop at the first entry that is worse than the best so far
+		// (an exact match, besterr == 0, also ends the search)
+		for (int j = 1; j < NINDICES && besterr > 0; ++j)
+		{
+			const double err = Utils::norm(colors[i], palette[j]) * importance[i];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_ONE], int prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], double toterr[NREGIONS_ONE])
+{
+	// build list of possibles
+	Vec3 palette[NREGIONS_ONE][NINDICES];
+
+	for (int region = 0; region < NREGIONS_ONE; ++region)
+	{
+		generate_palette_quantized(endpts[region], prec, &palette[region][0]);
+		toterr[region] = 0;
+	}
+	// the loop below only visits tile.size_x by tile.size_y pixels, but swap_indices and write_indices read every entry: zero them all first
+	for (int y = 0; y < Tile::TILE_H; y++) for (int x = 0; x < Tile::TILE_W; x++) indices[y][x] = 0;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double err, besterr;
+
+		besterr = Utils::norm(tile.data[y][x], palette[region][0]);	// NOTE(review): not scaled by tile.importance_map, unlike map_colors -- confirm intended
+		indices[y][x] = 0;
+
+		for (int i = 1; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::norm(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+static double perturb_one(const Vec3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts, 
+						  double old_err, int do_b)
+{
+	// search for a better value of one endpoint (A if do_b == 0, otherwise B) in channel ch; every other value stays fixed
+	// old_endpts is the starting point and new_endpts receives the result (a plain copy of old_endpts if nothing improved)
+	// returns the map_colors error of new_endpts, or old_err unchanged when no step beat it
+
+	IntEndpts temp_endpts;
+	double min_err = old_err;		// start with the best current error; kept in double so it is not rounded before being compared
+	int beststep = 0;				// only read after an improvement has assigned it
+
+	// copy real endpoints so we can perturb them
+	for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
+
+	// do a logarithmic search for the best error for this endpoint: try -step and +step, then halve step
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+			double err = map_colors(colors, importance, np, temp_endpts, prec);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+static void optimize_one(const Vec3 colors[], const float importance[], int np, double orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
+{
+	// per-channel refinement of one region's endpoints: opt_endpts starts as orig_endpts and changes only when the error drops
+	double opt_err = orig_err;
+	for (int ch = 0; ch < NCHANNELS; ++ch)
+	{
+		opt_endpts.A[ch] = orig_endpts.A[ch];
+		opt_endpts.B[ch] = orig_endpts.B[ch];
+	}
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndpts new_a, new_b, new_endpt;
+	int do_b = 0;
+
+	// now optimize each channel separately
+	for (int ch = 0; ch < NCHANNELS; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		// errors stay in double: rounding them to float could make an unchanged error compare as an improvement
+		double err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0);	// perturb endpt A
+		double err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+			double err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b);
+			if (err >= opt_err)
+				break;
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+	}
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const double orig_err[NREGIONS_ONE], 
+							const IntEndpts orig_endpts[NREGIONS_ONE], int prec, IntEndpts opt_endpts[NREGIONS_ONE])
+{
+	// optimize each region on its own: gather its pixels and importance weights, then refine that region's endpoints
+	// only the tile.size_x by tile.size_y pixels of the tile are gathered
+	Vec3 pixels[Tile::TILE_TOTAL];
+	float importance[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS_ONE; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				pixels[np] = tile.data[y][x];
+				importance[np] = tile.importance_map[y][x];
+				++np;
+			}
+
+		optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+double ZOH::refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block)
+{
+	double orig_err[NREGIONS_ONE], opt_err[NREGIONS_ONE], orig_toterr, opt_toterr;
+	IntEndpts orig_endpts[NREGIONS_ONE], opt_endpts[NREGIONS_ONE];
+	ComprEndpts compr_orig[NREGIONS_ONE], compr_opt[NREGIONS_ONE];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+	// patterns[] are tried in table order; the first one whose packed endpoints round-trip is emitted and ends the search
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		// precisions for all channels need to be the same
+		for (int i=1; i<NCHANNELS; ++i) assert (patterns[sp].chan[0].prec[0] == patterns[sp].chan[i].prec[0]);
+		// baseline for this pattern: quantize, choose indices, normalize the index at position 0, pack the endpoints
+		quantize_endpts(endpts, patterns[sp].chan[0].prec[0], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, patterns[sp].chan[0].prec[0], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		compress_endpts(orig_endpts, compr_orig, patterns[sp]);
+		if (endpts_fit(orig_endpts, compr_orig, patterns[sp]))
+		{
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, patterns[sp].chan[0].prec[0], opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, patterns[sp].chan[0].prec[0], opt_indices, opt_err);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			compress_endpts(opt_endpts, compr_opt, patterns[sp]);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS_ONE; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			// keep the optimized endpoints only if they still fit the pattern and strictly lower the total error
+			if (endpts_fit(opt_endpts, compr_opt, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(compr_opt, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				emit_block(compr_orig, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	throw "No candidate found, should never happen (refineone.)";
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_ONE], Vec3 palette[NREGIONS_ONE][NINDICES])
+{	// fill every region's palette with NINDICES entries interpolated between its float endpoints A and B
+	for (int region = 0; region < NREGIONS_ONE; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = PALETTE_LERP(endpts[region].A, endpts[region].B, i, DENOM);	// entry i of DENOM+1
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static double map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_ONE])
+{
+	// build list of possibles
+	Vec3 palette[NREGIONS_ONE][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	double toterr = 0;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		// start from palette entry 0; every error is scaled by the pixel's importance
+		double besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
+
+		// walk the palette in order and stop at the first entry that is worse than the best so far
+		// (an exact match, besterr == 0, also ends the search)
+		for (int i = 1; i < NINDICES && besterr > 0; ++i)
+		{
+			const double err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+double ZOH::roughone(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_ONE])
+{	// choose rough float endpoints for every region of the given shape and return the error map_colors reports for them
+	for (int region=0; region<NREGIONS_ONE; ++region)
+	{
+		int np = 0;
+		Vec3 colors[Tile::TILE_TOTAL];
+		Vec3 mean(0,0,0);
+		// gather the region's pixels; mean holds their sum here and is divided by np further down
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x];
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases: with 0, 1 or 2 pixels the endpoints are set directly and no clamping is applied
+		if (np == 0)
+		{
+			Vec3 zero(0,0,0);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[0];
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[1];
+			continue;
+		}
+
+		Matrix rdq(np, np);
+		// NOTE(review): only columns 0..2 are filled below; the other columns keep whatever Matrix(np, np) starts with -- confirm
+		mean /= float(np);
+		// one row per pixel: its color minus the region mean
+		for (int i = 0; i < np; ++i)
+		{
+			rdq(i,0) = colors[i].X() - mean.X();
+			rdq(i,1) = colors[i].Y() - mean.Y();
+			rdq(i,2) = colors[i].Z() - mean.Z();
+		}
+				
+		// perform a singular value decomposition
+		SVD svd(rdq);			// decompose matrix rdq to R*D*Q (== U*W*V in standard nomenclature)
+
+		// get the principal component direction (well, the one with the largest weight)
+		Vec3 direction(svd.R()(0,0), svd.R()(0,1), svd.R()(0,2));
+
+		// project each pixel value along the principal direction
+		double minp = DBL_MAX, maxp = -DBL_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = rdq(i,0) * direction.X() + rdq(i,1)*direction.Y() + rdq(i,2)*direction.Z();
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*direction;
+		endpts[region].B = mean + maxp*direction;
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		Utils::clamp(endpts[region].A);
+		Utils::clamp(endpts[region].B);
+	}
+	// score the chosen endpoints with the unquantized palette
+	return map_colors(tile, shapeindex, endpts);
+}
+
+double ZOH::compressone(const Tile &t, char *block)
+{
+	FltEndpts best_endpts[NREGIONS_ONE], trial_endpts[NREGIONS_ONE];
+	double best_mse = DBL_MAX;
+	int best_shape = 0;
+	/*
+		collect the mse values that are within 5% of the best values
+		optimize each one and choose the best
+	*/
+	// hack for now -- just use the best value WORK
+	// rough-fit each shape in turn, remembering the lowest error; stop early once the error reaches 0
+	int shape = 0;
+	while (shape < NSHAPES && best_mse > 0.0)
+	{
+		const double trial_mse = roughone(t, shape, trial_endpts);
+		if (trial_mse < best_mse)
+		{
+			best_mse = trial_mse;
+			best_shape = shape;
+			memcpy(best_endpts, trial_endpts, sizeof(best_endpts));
+		}
+		++shape;
+	}
+	return refineone(t, best_shape, best_endpts, block);
+}
diff --git a/src/nvtt/bc6h/zohtwo.cpp b/src/nvtt/bc6h/zohtwo.cpp
new file mode 100755
index 0000000..8a692bd
--- /dev/null
+++ b/src/nvtt/bc6h/zohtwo.cpp
@@ -0,0 +1,892 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// two regions zoh compress/decompress code
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+/* optimization algorithm
+
+	get initial float endpoints
+	convert endpoints using 16 bit precision, transform, and get bit delta. choose likely endpoint compression candidates.
+		note that there will be 1 or 2 candidates; 2 will be chosen when the delta values are close to the max possible.
+	for each EC candidate in order from max precision to smaller precision
+		convert endpoints using the appropriate precision.
+		optimize the endpoints and minimize square error. save the error and index assignments. apply index compression as well.
+			(thus the endpoints and indices are in final form.)
+		transform and get bit delta.
+		if the bit delta fits, exit
+	if we ended up with no candidates somehow, choose the tail set of EC candidates and retry. this should happen hardly ever.
+		add a state variable to assert we only do this once.
+	convert to bit stream.
+	return the error.
+
+	Global optimization
+		order all tiles based on their errors
+		do something special for high-error tiles
+			the goal here is to try to avoid tiling artifacts. but I think this is a research problem. let's just generate an error image...
+
+	display an image that shows partitioning and precision selected for each tile
+*/
+
+#include "bits.h"
+#include "tile.h"
+#include "zoh.h"
+#include "arvo/Vec3.h"
+#include "arvo/Matrix.h"
+#include "arvo/SVD.h"
+#include "utils.h"
+
+#include <assert.h>
+
+using namespace ArvoMath;
+
+#define NINDICES	8						// palette entries per region
+#define	INDEXBITS	3						// bits written per palette index (one less at each region's anchor position)
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// most significant bit of an index; swap_indices keeps it 0 at the anchor positions
+#define	DENOM		(NINDICES-1)			// last palette index, passed to PALETTE_LERP as its final argument
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minimum easily and
+// stop without having to touch all shapes?
+
+#include "shapes_two.h"
+// use only the first 32 available shapes
+#undef NSHAPES
+#undef SHAPEBITS
+#define NSHAPES 32
+#define SHAPEBITS 5
+
+#define	POS_TO_X(pos)	((pos)&3)			// column of a linear texel position (its low 2 bits)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)		// row of a linear texel position (the next 2 bits)
+
+#define	NDELTA	4							// endpoint values per channel, accessed through R_0..R_3
+
+struct Chanpat	// per-channel bit widths used by one Pattern
+{
+	int prec[NDELTA];		// precision in bits of the channel's four endpoint values, in R_0..R_3 order: [0].A, [0].B, [1].A, [1].B
+};
+
+struct Pattern	// one two-region block mode: endpoint precisions plus the bit layout of its header
+{
+	Chanpat chan[NCHANNELS];// allow different bit patterns per channel -- but we still want constant precision per channel (refinetwo asserts that prec[0] is the same for all channels)
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed (endpoints 1..3 are stored relative to endpoint 0)
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	char *encoding;			// verilog description of encoding for this mode; write_header/read_header process it starting from the end of the string
+};
+
+#define MAXMODEBITS	5				// widest mode field: read_header reads 2 bits, then 3 more when needed
+#define	MAXMODES (1<<MAXMODEBITS)	// number of entries in mode_to_pat
+
+#define	NPATTERNS 10				// number of entries in patterns[]
+
+static Pattern patterns[NPATTERNS] =	// listed from the highest to the lowest base precision; refinetwo walks this table in order and emits the first pattern that fits
+{	// columns: chan[0].prec[0..3], chan[1].prec[0..3], chan[2].prec[0..3], transformed, mode, modebits, encoding
+	11,5,5,5,	11,4,4,4,	11,4,4,4,	1,	0x02, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],rw[10],rx[4:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+	11,4,4,4,	11,5,5,5,	11,4,4,4,	1,	0x06, 5, "d[4:0],bz[3],gy[4],rz[3:0],bz[2],bz[0],ry[3:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],gw[10],gx[4:0],gy[3:0],gz[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+	11,4,4,4,	11,4,4,4,	11,5,5,5,	1,	0x0a, 5, "d[4:0],bz[3],bz[4],rz[3:0],bz[2:1],ry[3:0],by[3:0],bw[10],bx[4:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],by[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+	10,5,5,5,	10,5,5,5,	10,5,5,5,	1,	0x00, 2, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bw[9:0],gw[9:0],rw[9:0],bz[4],by[4],gy[4],m[1:0]",
+	9,5,5,5,	9,5,5,5,	9,5,5,5,	1,	0x0e, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bw[8:0],gy[4],gw[8:0],by[4],rw[8:0],m[4:0]",
+	8,6,6,6,	8,5,5,5,	8,5,5,5,	1,	0x12, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],rx[5:0],bz[4:3],bw[7:0],gy[4],bz[2],gw[7:0],by[4],gz[4],rw[7:0],m[4:0]",
+	8,5,5,5,	8,6,6,6,	8,5,5,5,	1,	0x16, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],gx[5:0],gy[3:0],gz[4],rx[4:0],bz[4],gz[5],bw[7:0],gy[4],gy[5],gw[7:0],by[4],bz[0],rw[7:0],m[4:0]",
+	8,5,5,5,	8,5,5,5,	8,6,6,6,	1,	0x1a, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bx[5:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bz[5],bw[7:0],gy[4],by[5],gw[7:0],by[4],bz[1],rw[7:0],m[4:0]",
+	7,6,6,6,	7,6,6,6,	7,6,6,6,	1,	0x01, 2, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],bw[6:0],gy[4],bz[2],by[5],gw[6:0],by[4],bz[1:0],rw[6:0],gz[5:4],gy[5],m[1:0]",
+	6,6,6,6,	6,6,6,6,	6,6,6,6,	0,	0x1e, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],gz[5],bw[5:0],gy[4],bz[2],by[5],gy[5],gw[5:0],by[4],bz[1:0],gz[4],rw[5:0],m[4:0]",
+};
+
+// mapping of a decoded mode value (0..MAXMODES-1) to the index of the corresponding entry in patterns[]
+// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f -- return -2 for these (read_header reports them as reserved); -1 marks every other mode value that has no entry in patterns[]
+static int mode_to_pat[MAXMODES] = {	// indexed by mode value
+	3,	// 0x00
+	8,	// 0x01
+	0,	// 0x02
+	-1,-1,-1,
+	1,	// 0x06
+	-1,-1,-1,
+	2,	// 0x0a
+	-1,-1,-1,
+	4,	// 0x0e
+	-1,-1,-1,
+	5,	// 0x12
+	-2,-1,-1,
+	6,	// 0x16
+	-2,-1,-1,
+	7,	// 0x1a
+	-2,-1,-1,
+	9,	// 0x1e
+	-2
+};
+
+#define	R_0(ep)	(ep)[0].A[i]		// region 0, endpoint A -- NOTE: all four R_n macros use a variable named i from the calling scope as the channel index
+#define	R_1(ep)	(ep)[0].B[i]		// region 0, endpoint B
+#define	R_2(ep)	(ep)[1].A[i]		// region 1, endpoint A
+#define	R_3(ep)	(ep)[1].B[i]		// region 1, endpoint B
+#define	MASK(n)	((1<<(n))-1)		// value with the low n bits set
+
+// compress endpoints: mask every endpoint value to its pattern precision, storing endpoints 1..3 relative to endpoint 0 when the pattern is transformed
+static void compress_endpts(const IntEndpts in[NREGIONS_TWO], ComprEndpts out[NREGIONS_TWO], const Pattern &p)
+{
+	// NOTE: the R_n() accessor macros implicitly index with the loop variable named i,
+	// so the channel loop below has to keep that name
+	for (int i=0; i<NCHANNELS; ++i)
+	{
+		const int *prec = p.chan[i].prec;
+
+		// region 0's A endpoint is the base value and is always stored directly
+		R_0(out) = R_0(in) & MASK(prec[0]);
+
+		// transformed patterns store the remaining three endpoints as offsets from the base;
+		// untransformed patterns store them as they are, i.e. as offsets from 0
+		int base = 0;
+		if (p.transformed)
+			base = R_0(in);
+
+		// only the low prec[n] bits of each value are kept
+		R_1(out) = (R_1(in) - base) & MASK(prec[1]);
+		R_2(out) = (R_2(in) - base) & MASK(prec[2]);
+		R_3(out) = (R_3(in) - base) & MASK(prec[3]);
+	}
+}
+
+// decompress endpoints: rebuild the integer endpoints from their stored fields (adds endpoint 0 back onto endpoints 1..3 when p.transformed is set)
+static void decompress_endpts(const ComprEndpts in[NREGIONS_TWO], IntEndpts out[NREGIONS_TWO], const Pattern &p)
+{
+	bool issigned = Utils::FORMAT == SIGNED_F16;	// final values go through SIGN_EXTEND only for the signed format
+
+	if (p.transformed)
+	{
+		for (int i=0; i<NCHANNELS; ++i)		// the R_n() macros index with this i
+		{
+			R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+			int t;
+			t = SIGN_EXTEND(R_1(in), p.chan[i].prec[1]);		// the stored offset goes through SIGN_EXTEND regardless of the format
+			t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);		// add the stored base value and keep prec[0] bits
+			R_1(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+			t = SIGN_EXTEND(R_2(in), p.chan[i].prec[2]);		// same three steps for region 1, endpoint A
+			t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+			R_2(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+			t = SIGN_EXTEND(R_3(in), p.chan[i].prec[3]);		// and for region 1, endpoint B
+			t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+			R_3(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+		}
+	}
+	else
+	{
+		for (int i=0; i<NCHANNELS; ++i)		// untransformed: every field already holds the endpoint value itself
+		{
+			R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+			R_1(out) = issigned ? SIGN_EXTEND(R_1(in),p.chan[i].prec[1]) : R_1(in);
+			R_2(out) = issigned ? SIGN_EXTEND(R_2(in),p.chan[i].prec[2]) : R_2(in);
+			R_3(out) = issigned ? SIGN_EXTEND(R_3(in),p.chan[i].prec[3]) : R_3(in);
+		}
+	}
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_TWO], int prec, IntEndpts q_endpts[NREGIONS_TWO])
+{
+	// quantize each float endpoint component (X,Y,Z -> channel 0,1,2) of every region to prec bits
+	for (int r = 0; r < NREGIONS_TWO; ++r)
+	{
+		const FltEndpts &src = endpts[r];
+		IntEndpts &dst = q_endpts[r];
+
+		dst.A[0] = Utils::quantize(src.A.X(), prec); dst.A[1] = Utils::quantize(src.A.Y(), prec); dst.A[2] = Utils::quantize(src.A.Z(), prec);
+		dst.B[0] = Utils::quantize(src.B.X(), prec); dst.B[1] = Utils::quantize(src.B.Y(), prec); dst.B[2] = Utils::quantize(src.B.Z(), prec);
+	}
+}
+
+// swap endpoints as needed so that the index at each region's anchor position has a 0 high-order bit (write_indices stores one bit less there)
+static void swap_indices(IntEndpts endpts[NREGIONS_TWO], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS_TWO; ++region)
+	{
+		const int anchor = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+		const int ax = POS_TO_X(anchor), ay = POS_TO_Y(anchor);
+		assert(REGION(ax,ay,shapeindex) == region);		// double check the table
+		if ((indices[ay][ax] & HIGH_INDEXBIT) == 0)
+			continue;									// high bit already clear, nothing to do for this region
+
+		// high bit is set: exchange the two endpoints of this region, channel by channel
+		for (int ch=0; ch<NCHANNELS; ++ch)
+		{
+			const int old_a = endpts[region].A[ch];
+			endpts[region].A[ch] = endpts[region].B[ch];
+			endpts[region].B[ch] = old_a;
+		}
+
+		// ...and mirror every index that belongs to the region to match the exchanged endpoints
+		for (int py = 0; py < Tile::TILE_H; py++)
+		for (int px = 0; px < Tile::TILE_W; px++)
+			if (REGION(px,py,shapeindex) == region)
+				indices[py][px] = NINDICES - 1 - indices[py][px];
+	}
+}
+
+// endpoints fit only if the compression was lossless
+static bool endpts_fit(const IntEndpts orig[NREGIONS_TWO], const ComprEndpts compressed[NREGIONS_TWO], const Pattern &p)
+{
+	// round-trip check: expand the compressed endpoints again and require every
+	// component of both endpoints of every region to match the original exactly
+	IntEndpts roundtrip[NREGIONS_TWO];
+	decompress_endpts(compressed, roundtrip, p);
+
+	bool lossless = true;
+	for (int r=0; r<NREGIONS_TWO && lossless; ++r)
+		for (int ch=0; ch<NCHANNELS && lossless; ++ch)
+			lossless = (orig[r].A[ch] == roundtrip[r].A[ch]) && (orig[r].B[ch] == roundtrip[r].B[ch]);
+
+	return lossless;
+}
+
+static void write_header(const ComprEndpts endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, Bits &out)	// writes the mode, the shape index and the compressed endpoints to out, field by field as listed in p.encoding
+{
+	// interpret the verilog backwards and process it
+	int m = p.mode;
+	int d = shapeindex;
+	int rw = endpts[0].A[0], rx = endpts[0].B[0], ry = endpts[1].A[0], rz = endpts[1].B[0];	// channel 0: w,x,y,z = [0].A, [0].B, [1].A, [1].B
+	int gw = endpts[0].A[1], gx = endpts[0].B[1], gy = endpts[1].A[1], gz = endpts[1].B[1];	// channel 1
+	int bw = endpts[0].A[2], bx = endpts[0].B[2], by = endpts[1].A[2], bz = endpts[1].B[2];	// channel 2
+	int ptr = strlen(p.encoding);	// start at the end of the encoding string
+	while (ptr)
+	{
+		Field field;
+		int endbit, len;
+
+		Utils::parse(p.encoding, ptr, field, endbit, len);	// presumably moves ptr towards the start of the string by one field and reports which field it is, its lowest bit and its bit count -- TODO confirm against Utils::parse
+		switch(field)
+		{
+		case FIELD_M:	out.write( m >> endbit, len); break;	// each case writes len bits of its value, starting at bit endbit
+		case FIELD_D:	out.write( d >> endbit, len); break;
+		case FIELD_RW:	out.write(rw >> endbit, len); break;
+		case FIELD_RX:	out.write(rx >> endbit, len); break;
+		case FIELD_RY:	out.write(ry >> endbit, len); break;
+		case FIELD_RZ:	out.write(rz >> endbit, len); break;
+		case FIELD_GW:	out.write(gw >> endbit, len); break;
+		case FIELD_GX:	out.write(gx >> endbit, len); break;
+		case FIELD_GY:	out.write(gy >> endbit, len); break;
+		case FIELD_GZ:	out.write(gz >> endbit, len); break;
+		case FIELD_BW:	out.write(bw >> endbit, len); break;
+		case FIELD_BX:	out.write(bx >> endbit, len); break;
+		case FIELD_BY:	out.write(by >> endbit, len); break;
+		case FIELD_BZ:	out.write(bz >> endbit, len); break;
+		default: assert(0);	// a field name this function does not know
+		}
+	}
+}
+
+static bool read_header(Bits &in, ComprEndpts endpts[NREGIONS_TWO], int &shapeindex, Pattern &p)	// reads mode, shape index and compressed endpoints; returns false (outputs untouched) for a reserved mode
+{
+	// reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
+	int mode = in.read(2);
+	if (mode != 0x00 && mode != 0x01)			// 0 and 1 are complete 2-bit modes
+		mode = (in.read(3) << 2) | mode;		// otherwise 3 more bits form the upper part of a 5-bit mode
+
+	int pat_index = mode_to_pat[mode];
+
+	if (pat_index == -2)
+		return false;		// reserved mode found
+
+	assert (pat_index >= 0 && pat_index < NPATTERNS);		// NOTE(review): a -1 entry (mode without a pattern in this file) is caught only by this assert; with asserts compiled out patterns[-1] would be read below
+	assert (in.getptr() == patterns[pat_index].modebits);	// bits consumed so far must equal the mode width of the pattern
+
+	p = patterns[pat_index];
+
+	int d;
+	int rw, rx, ry, rz;
+	int gw, gx, gy, gz;
+	int bw, bx, by, bz;
+
+	d = 0;						// fields are assembled with |= below, so everything starts at 0
+	rw = rx = ry = rz = 0;
+	gw = gx = gy = gz = 0;
+	bw = bx = by = bz = 0;
+
+	int ptr = strlen(p.encoding);	// start at the end of the encoding string, as write_header does
+
+	while (ptr)
+	{
+		Field field;
+		int endbit, len;
+
+		Utils::parse(p.encoding, ptr, field, endbit, len);	// presumably moves ptr towards the start of the string by one field -- TODO confirm against Utils::parse
+
+		switch(field)
+		{
+		case FIELD_M:	break;	// already processed so ignore
+		case FIELD_D:	 d |= in.read(len) << endbit; break;	// each case reads len bits and places them at bit endbit of its value
+		case FIELD_RW:	rw |= in.read(len) << endbit; break;
+		case FIELD_RX:	rx |= in.read(len) << endbit; break;
+		case FIELD_RY:	ry |= in.read(len) << endbit; break;
+		case FIELD_RZ:	rz |= in.read(len) << endbit; break;
+		case FIELD_GW:	gw |= in.read(len) << endbit; break;
+		case FIELD_GX:	gx |= in.read(len) << endbit; break;
+		case FIELD_GY:	gy |= in.read(len) << endbit; break;
+		case FIELD_GZ:	gz |= in.read(len) << endbit; break;
+		case FIELD_BW:	bw |= in.read(len) << endbit; break;
+		case FIELD_BX:	bx |= in.read(len) << endbit; break;
+		case FIELD_BY:	by |= in.read(len) << endbit; break;
+		case FIELD_BZ:	bz |= in.read(len) << endbit; break;
+		default: assert(0);	// a field name this function does not know
+		}
+	}
+
+	assert (in.getptr() == 128 - 46);	// the header must end where the index data starts; 46 is presumably 16 texels * INDEXBITS minus the 2 anchor bits -- TODO confirm tile size
+
+	shapeindex = d;
+	endpts[0].A[0] = rw; endpts[0].B[0] = rx; endpts[1].A[0] = ry; endpts[1].B[0] = rz;	// channel 0
+	endpts[0].A[1] = gw; endpts[0].B[1] = gx; endpts[1].A[1] = gy; endpts[1].B[1] = gz;	// channel 1
+	endpts[0].A[2] = bw; endpts[0].B[2] = bx; endpts[1].A[2] = by; endpts[1].B[2] = bz;	// channel 2
+
+	return true;
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	// anchor position of each region: the index stored there gets one bit less than the others
+	int anchors[NREGIONS_TWO];
+	for (int region = 0; region < NREGIONS_TWO; ++region)
+		anchors[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+	// emit one index per texel position, in increasing position order
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int nbits = INDEXBITS;
+		for (int region = 0; region < NREGIONS_TWO; ++region)
+			if (anchors[region] == pos)
+			{
+				nbits = INDEXBITS - 1;
+				break;
+			}
+		out.write(indices[POS_TO_Y(pos)][POS_TO_X(pos)], nbits);
+	}
+}
+
+static void emit_block(const ComprEndpts compr_endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)	// serializes one finished block into block
+{
+	Bits out(block, ZOH::BITSIZE);
+
+	write_header(compr_endpts, shapeindex, p, out);		// header first: mode, shape index, endpoints
+
+	write_indices(indices, shapeindex, out);			// then the per-texel palette indices
+
+	assert(out.getptr() == ZOH::BITSIZE);				// header plus indices must fill the block exactly
+}
+
+static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vec3 palette[NINDICES])
+{
+	// scale endpoints: unquantize the A and B value of each of the three channels up front
+	// (an IntVec3 type would be the natural container for these)
+	const int a0 = Utils::unquantize(endpts.A[0], prec);
+	const int b0 = Utils::unquantize(endpts.B[0], prec);
+
+	const int a1 = Utils::unquantize(endpts.A[1], prec);
+	const int b1 = Utils::unquantize(endpts.B[1], prec);
+
+	const int a2 = Utils::unquantize(endpts.A[2], prec);
+	const int b2 = Utils::unquantize(endpts.B[2], prec);
+
+	// interpolate: every palette entry gets, per channel, the PALETTE_LERP() of the channel's
+	// two scaled endpoints at step i of DENOM, passed through Utils::finish_unquantize()
+	for (int i = 0; i < NINDICES; ++i)
+	{
+		Vec3 &entry = palette[i];
+
+		entry.X() = Utils::finish_unquantize(PALETTE_LERP(a0, b0, i, DENOM), prec);
+		entry.Y() = Utils::finish_unquantize(PALETTE_LERP(a1, b1, i, DENOM), prec);
+		entry.Z() = Utils::finish_unquantize(PALETTE_LERP(a2, b2, i, DENOM), prec);
+	}
+	// palette[0..NINDICES-1] is now completely filled in
+	return;
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	// anchor position of each region: the index stored there has one bit less than the others
+	int anchors[NREGIONS_TWO];
+	for (int region = 0; region < NREGIONS_TWO; ++region)
+		anchors[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+	// read one index per texel position, in increasing position order
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int nbits = INDEXBITS;
+		for (int region = 0; region < NREGIONS_TWO; ++region)
+			if (anchors[region] == pos)
+			{
+				nbits = INDEXBITS - 1;
+				break;
+			}
+		indices[POS_TO_Y(pos)][POS_TO_X(pos)] = in.read(nbits);
+	}
+}
+
+void ZOH::decompresstwo(const char *block, Tile &t)	// decodes one two-region block into t.data
+{
+	Bits in(block, ZOH::BITSIZE);
+
+	Pattern p;
+	IntEndpts endpts[NREGIONS_TWO];
+	ComprEndpts compr_endpts[NREGIONS_TWO];
+	int shapeindex;
+
+	if (!read_header(in, compr_endpts, shapeindex, p))
+	{
+		// reserved mode, return all zeroes
+		Vec3 zero(0);
+
+		for (int y = 0; y < Tile::TILE_H; y++)
+		for (int x = 0; x < Tile::TILE_W; x++)
+			t.data[y][x] = zero;
+
+		return;
+	}
+	// rebuild the integer endpoints from the stored fields, then expand each region's endpoints into its palette
+	decompress_endpts(compr_endpts, endpts, p);
+
+	Vec3 palette[NREGIONS_TWO][NINDICES];
+	for (int r = 0; r < NREGIONS_TWO; ++r)
+		generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]);	// prec[0] of channel 0 is used as the precision of all channels
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	assert(in.getptr() == ZOH::BITSIZE);	// the whole block must have been consumed
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];	// the shape decides which region's palette a texel uses
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+// (toterr is the sum over all np colors of Utils::norm(color, chosen palette entry) scaled by that color's importance)
+static double map_colors(const Vec3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
+{
+	Vec3 palette[NINDICES];
+	double toterr = 0;
+
+	generate_palette_quantized(endpts, prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		double besterr = Utils::norm(colors[i], palette[0]) * importance[i];
+
+		// walk the palette from entry 0 upwards; stop at an exact match or as soon as the error grows again
+		for (int j = 1; j < NINDICES && besterr > 0; ++j)
+		{
+			const double err = Utils::norm(colors[i], palette[j]) * importance[i];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_TWO], int prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], double toterr[NREGIONS_TWO])
+{
+	// build list of possibles
+	Vec3 palette[NREGIONS_TWO][NINDICES];
+
+	for (int region = 0; region < NREGIONS_TWO; ++region)
+	{
+		generate_palette_quantized(endpts[region], prec, &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	// for every texel pick the palette entry of its region with the smallest Utils::norm error
+	// NOTE: the errors summed here are not scaled by tile.importance_map, unlike in the map_colors() functions
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		// start from entry 0 and walk upwards; stop at an exact match or as soon as the error grows again
+
+		double besterr = Utils::norm(tile.data[y][x], palette[region][0]);
+		indices[y][x] = 0;
+
+		for (int i = 1; i < NINDICES && besterr > 0; ++i)
+		{
+			const double err = Utils::norm(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+static double perturb_one(const Vec3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts, 
+						  double old_err, int do_b)
+{
+	// Searches for a better value of ONE endpoint component: channel ch of endpoint A (do_b == 0) or of endpoint B (otherwise).
+	// old_endpts is the starting point, new_endpts receives the best endpoints found, temp_endpts holds the candidate under test.
+	// Returns the lowest map_colors() error seen, which is old_err itself when no candidate beats it.
+	// Errors are kept in double throughout (they used to be truncated to float) so they compare exactly with the caller's values.
+	IntEndpts temp_endpts;
+	double min_err = old_err;		// start with the best current error
+	int beststep = 0;				// only read after some candidate has improved on min_err
+
+	// copy real endpoints so we can perturb them
+	for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
+
+	// do a logarithmic search for the best error for this endpoint: the step starts at 1<<(prec-1) and halves down to 1
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;		// candidate outside [0, 1<<prec), skip it
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;		// candidate outside [0, 1<<prec), skip it
+			}
+
+			const double err = map_colors(colors, importance, np, temp_endpts, prec);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+static void optimize_one(const Vec3 colors[], const float importance[], int np, double orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
+{
+	double opt_err = orig_err;
+	for (int ch = 0; ch < NCHANNELS; ++ch)
+	{
+		opt_endpts.A[ch] = orig_endpts.A[ch];
+		opt_endpts.B[ch] = orig_endpts.B[ch];
+	}
+	/* Optimizes the endpoints of one region, one channel at a time; opt_endpts starts as a copy of orig_endpts. Sketch:
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndpts new_a, new_b;
+	IntEndpts new_endpt;
+	int do_b = 0;		// which endpoint the alternating loop perturbs next; always assigned before that loop runs
+
+	// now optimize each channel separately (errors are held in double, the type perturb_one returns, so comparisons with opt_err are exact)
+	for (int ch = 0; ch < NCHANNELS; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minimum
+		double err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0);	// perturb endpt A
+		double err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;	// no improvement for this channel
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;	// no improvement for this channel
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+			double err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b);
+			if (err >= opt_err)
+				break;
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+	}
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const double orig_err[NREGIONS_TWO], 
+							const IntEndpts orig_endpts[NREGIONS_TWO], int prec, IntEndpts opt_endpts[NREGIONS_TWO])
+{
+	// Optimizes the endpoints of each region independently: the region's pixels and importance values are
+	// gathered into flat arrays and handed to optimize_one(), which writes its result to opt_endpts[region].
+	Vec3 pixels[Tile::TILE_TOTAL];
+	float importance[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS_TWO; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				pixels[np] = tile.data[y][x];
+				importance[np] = tile.importance_map[y][x];
+				++np;
+			}
+
+		optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+double ZOH::refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block)	// encodes tile into block using the first pattern whose endpoints fit; returns the total error of the emitted data
+{
+	double orig_err[NREGIONS_TWO], opt_err[NREGIONS_TWO], orig_toterr, opt_toterr;
+	IntEndpts orig_endpts[NREGIONS_TWO], opt_endpts[NREGIONS_TWO];
+	ComprEndpts compr_orig[NREGIONS_TWO], compr_opt[NREGIONS_TWO];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)		// patterns[] is walked in table order and the first fit wins
+	{
+		// precisions for all channels need to be the same
+		for (int i=1; i<NCHANNELS; ++i) assert (patterns[sp].chan[0].prec[0] == patterns[sp].chan[i].prec[0]);
+
+		quantize_endpts(endpts, patterns[sp].chan[0].prec[0], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, patterns[sp].chan[0].prec[0], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);	// may exchange A and B of a region and mirror its indices
+		compress_endpts(orig_endpts, compr_orig, patterns[sp]);
+		if (endpts_fit(orig_endpts, compr_orig, patterns[sp]))		// only patterns that can store these endpoints losslessly are used
+		{
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, patterns[sp].chan[0].prec[0], opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, patterns[sp].chan[0].prec[0], opt_indices, opt_err);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			compress_endpts(opt_endpts, compr_opt, patterns[sp]);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS_TWO; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }	// both totals come from assign_indices
+			if (endpts_fit(opt_endpts, compr_opt, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(compr_opt, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				emit_block(compr_orig, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	throw "No candidate found, should never happen (refinetwo.)";	// the thrown object is a string literal
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_TWO], Vec3 palette[NREGIONS_TWO][NINDICES])
+{
+	// fill every region's palette: entry k is PALETTE_LERP() of the region's float endpoints A and B at step k of DENOM
+	for (int r = 0; r < NREGIONS_TWO; ++r)
+		for (int k = 0; k < NINDICES; ++k) palette[r][k] = PALETTE_LERP(endpts[r].A, endpts[r].B, k, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static double map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_TWO])
+{
+	// build list of possibles
+	Vec3 palette[NREGIONS_TWO][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	// toterr accumulates, for every texel, the smallest importance-scaled Utils::norm error
+	// against the palette of the region that texel belongs to
+	double toterr = 0;
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+
+		double besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
+
+		// walk the palette from entry 0 upwards; stop at an exact match or as soon as the error grows again
+		for (int i = 1; i < NINDICES && besterr > 0; ++i)
+		{
+			const double err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+double ZOH::roughtwo(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_TWO])	// fits rough float endpoints to each region of the given shape, stores them in endpts and returns the resulting map_colors() error
+{
+	for (int region=0; region<NREGIONS_TWO; ++region)
+	{
+		int np = 0;							// number of pixels that belong to this region
+		Vec3 colors[Tile::TILE_TOTAL];
+		Vec3 mean(0,0,0);					// holds the sum of the colors until it is divided by np below
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x];
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vec3 zero(0,0,0);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = colors[0];	// a single color serves as both endpoints
+			endpts[region].B = colors[0];
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = colors[0];	// two colors are used directly as the endpoints (no clamp on this path)
+			endpts[region].B = colors[1];
+			continue;
+		}
+
+		Matrix rdq(np, np);					// only columns 0..2 are filled in below; NOTE(review): presumably the Matrix constructor zero-fills the rest -- confirm
+
+		mean /= float(np);
+
+		for (int i = 0; i < np; ++i)		// row i = color i with the mean subtracted
+		{
+			rdq(i,0) = colors[i].X() - mean.X();
+			rdq(i,1) = colors[i].Y() - mean.Y();
+			rdq(i,2) = colors[i].Z() - mean.Z();
+		}
+				
+		// perform a singular value decomposition
+		SVD svd(rdq);			// decompose matrix rdq to R*D*Q (== U*W*V in standard nomenclature)
+
+		// get the principal component direction (well, the one with the largest weight)
+		Vec3 direction(svd.R()(0,0), svd.R()(0,1), svd.R()(0,2));
+
+		// project each pixel value along the principal direction
+		double minp = DBL_MAX, maxp = -DBL_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = rdq(i,0) * direction.X() + rdq(i,1)*direction.Y() + rdq(i,2)*direction.Z();	// dot product of the mean-subtracted color with direction
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*direction;
+		endpts[region].B = mean + maxp*direction;
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		Utils::clamp(endpts[region].A);
+		Utils::clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);	// error of the tile against the unquantized palettes of these endpoints
+}
+
+double ZOH::compresstwo(const Tile &t, char *block)
+{
+	// Rough pass: fit float endpoints for each of the NSHAPES shapes and remember the shape with the smallest error.
+	// Only that single winner is refined and emitted. (The plan still marked WORK was to refine every shape
+	// whose error is within 5% of the best and keep the best refined result.)
+	FltEndpts candidate[NREGIONS_TWO];
+	FltEndpts winner[NREGIONS_TWO];
+	double lowest = DBL_MAX;
+	int winning_shape = 0;
+
+	int shape = 0;
+	while (shape < NSHAPES && lowest > 0.0)		// an error of exactly 0 cannot be beaten, so stop early
+	{
+		const double rough = roughtwo(t, shape, candidate);
+		if (rough < lowest)
+		{
+			lowest = rough;
+			winning_shape = shape;
+			memcpy(winner, candidate, sizeof(winner));
+		}
+		++shape;
+	}
+	return refinetwo(t, winning_shape, winner, block);
+}
+
diff --git a/src/nvtt/bc7/ImfArray.h b/src/nvtt/bc7/ImfArray.h
new file mode 100644
index 0000000..5160fa4
--- /dev/null
+++ b/src/nvtt/bc7/ImfArray.h
@@ -0,0 +1,261 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2002, Industrial Light & Magic, a division of Lucas
+// Digital Ltd. LLC
+// 
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// *       Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// *       Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// *       Neither the name of Industrial Light & Magic nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission. 
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////
+
+
+
+#ifndef INCLUDED_IMF_ARRAY_H
+#define INCLUDED_IMF_ARRAY_H
+
+//-------------------------------------------------------------------------
+//
+// class Array
+// class Array2D
+//
+// "Arrays of T" whose sizes are not known at compile time.
+// When an array goes out of scope, its elements are automatically
+// deleted.
+//
+// Usage example:
+//
+//	struct C
+//	{
+//	    C ()		{std::cout << "C::C  (" << this << ")\n";};
+//	    virtual ~C ()	{std::cout << "C::~C (" << this << ")\n";};
+//	};
+// 
+//	int
+//	main ()
+//	{
+//	    Array <C> a(3);
+// 
+//	    C &b = a[1];
+//	    const C &c = a[1];
+//	    C *d = a + 2;
+//	    const C *e = a;
+// 
+//	    return 0;
+//	}
+//
+//-------------------------------------------------------------------------
+
+namespace Imf {
+
+
+template <class T>
+class Array
+{
+  public:
+
+    //-----------------------------
+    // Constructors and destructors
+    //-----------------------------
+    // The element count is not stored; callers have to keep track of it.
+     Array ()				{_data = 0;}
+     Array (long size)			{_data = new T[size];}
+    ~Array ()				{delete [] _data;}
+
+
+    //-----------------------------
+    // Access to the array elements
+    //-----------------------------
+    // Implicit conversion to a raw pointer, so a[i] and a + i work as usual.
+    operator T * ()			{return _data;}
+    operator const T * () const		{return _data;}
+
+
+    //------------------------------------------------------
+    // Resize and clear the array (the contents of the array
+    // are not preserved across the resize operation).
+    //
+    // resizeEraseUnsafe() is more memory efficient than
+    // resizeErase() because it deletes the old memory block
+    // before allocating a new one, but if allocating the
+    // new block throws an exception, resizeEraseUnsafe()
+    // leaves the array in an unusable state.
+    //
+    //------------------------------------------------------
+
+    void resizeErase (long size);
+    void resizeEraseUnsafe (long size);
+
+
+  private:
+
+    Array (const Array &);		// Copying and assignment
+    Array & operator = (const Array &);	// are not implemented
+    // Buffer obtained from new[]; 0 when the array is empty.
+    T * _data;
+};
+
+
+template <class T>
+class Array2D
+{
+  public:
+
+    //-----------------------------
+    // Constructors and destructors
+    //-----------------------------
+
+     Array2D ();			// empty array, 0 by 0 elements
+     Array2D (long sizeX, long sizeY);	// sizeX by sizeY elements
+    ~Array2D ();
+
+
+    //-----------------------------
+    // Access to the array elements
+    //-----------------------------
+    // a[x] points at the start of row x, so a[x][y] addresses one element.
+    T *		operator [] (long x);
+    const T *	operator [] (long x) const;
+
+
+    //------------------------------------------------------
+    // Resize and clear the array (the contents of the array
+    // are not preserved across the resize operation).
+    //
+    // resizeEraseUnsafe() is more memory efficient than
+    // resizeErase() because it deletes the old memory block
+    // before allocating a new one, but if allocating the
+    // new block throws an exception, resizeEraseUnsafe()
+    // leaves the array in an unusable state.
+    //
+    //------------------------------------------------------
+
+    void resizeErase (long sizeX, long sizeY);
+    void resizeEraseUnsafe (long sizeX, long sizeY);
+
+
+  private:
+
+    Array2D (const Array2D &);			// Copying and assignment
+    Array2D & operator = (const Array2D &);	// are not implemented
+    // _sizeY is the row stride of the flat sizeX * sizeY buffer; sizeX itself is not kept.
+    long	_sizeY;
+    T *		_data;
+};
+
+
+//---------------
+// Implementation
+//---------------
+
+template <class T>
+inline void
+Array<T>::resizeErase (long size)
+{
+    T *replacement = new T[size];	// allocate first: if this throws, _data is untouched
+    delete [] _data;
+    _data = replacement;
+}
+
+
+template <class T>
+inline void
+Array<T>::resizeEraseUnsafe (long size)
+{
+    delete [] _data;		// release the old block before asking for the new one
+    _data = 0;			// if the new[] below throws, _data must not point at freed memory
+    _data = new T[size];
+}
+
+
+template <class T>
+inline
+Array2D<T>::Array2D ():
+    _sizeY (0), _data (0)
+{
+    // empty: both members are set by the initializer list
+}
+
+
+template <class T>
+inline
+Array2D<T>::Array2D (long sizeX, long sizeY):
+    _sizeY (sizeY), _data (new T[sizeX * sizeY])
+{
+    // empty: the flat sizeX * sizeY buffer is allocated by the initializer list
+}
+
+
+template <class T>
+inline
+Array2D<T>::~Array2D ()
+{
+    delete [] _data;	// _data is 0 for an empty array; delete [] on a null pointer does nothing
+}
+
+
+template <class T>
+inline T *	
+Array2D<T>::operator [] (long x)
+{
+    return _data + x * _sizeY;	// row x starts x * _sizeY elements into the flat buffer; no bounds check
+}
+
+
+template <class T>
+inline const T *
+Array2D<T>::operator [] (long x) const
+{
+    return _data + x * _sizeY;	// read-only view of row x; same addressing as the non-const overload
+}
+
+
+template <class T>
+inline void
+Array2D<T>::resizeErase (long sizeX, long sizeY)
+{
+    T *replacement = new T[sizeX * sizeY];	// allocate first: if this throws, the array is left as it was
+    delete [] _data;
+    _data = replacement;
+    _sizeY = sizeY;
+}
+
+
+template <class T>
+inline void
+Array2D<T>::resizeEraseUnsafe (long sizeX, long sizeY)
+{
+    delete [] _data;		// release the old block before asking for the new one
+    _data = 0;			// reset to the empty state in case the new[] below throws
+    _sizeY = 0;
+    _data = new T[sizeX * sizeY];
+    _sizeY = sizeY;		// only recorded once the allocation has succeeded
+}
+
+
+} // namespace Imf
+
+#endif
diff --git a/src/nvtt/bc7/arvo/ArvoMath.cpp b/src/nvtt/bc7/arvo/ArvoMath.cpp
new file mode 100644
index 0000000..95d1a7d
--- /dev/null
+++ b/src/nvtt/bc7/arvo/ArvoMath.cpp
@@ -0,0 +1,342 @@
+/***************************************************************************
+* Math.C                                                                   *
+*                                                                          *
+* Some basic math functions.                                               *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    06/21/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <math.h>
+#include <stdlib.h>
+#include <iostream>
+#include <assert.h>
+#include "ArvoMath.h"
+#include "form.h"
+
+namespace ArvoMath {
+	static const float  Epsilon = 1.0E-5;
+	static const double LogTwo  = log( 2.0 );
+
+#define BinCoeffMax 500
+
+	double RelErr( double x, double y )
+	{
+		const double diff = x - y;         // signed: the result keeps the sign of x - y
+		if( diff == 0.0 ) return 0.0;      // equal inputs, including 0 and 0, where 0/0 used to give NaN
+		if( x < 0.0 ) x = -x;
+		if( y < 0.0 ) y = -y;
+		return diff / ( x > y ? x : y );   // difference relative to the larger magnitude
+	}
+
+	/***************************************************************************
+	*  A R C   Q U A D                                                         *
+	*                                                                          *
+	* Returns theta / ( 2*PI ) where the input variables x and y are           *
+	* such that  x == COS( theta ) and  y == SIN( theta ).                     *
+	*                                                                          *
+	***************************************************************************/
+	float ArcQuad( float x, float y )
+	{
+		if( !( Abs( x ) > Epsilon ) )  // x is within Epsilon of zero: only the sign of y matters
+		{
+			if( y >  Epsilon ) return( 0.25 );
+			if( y < -Epsilon ) return( 0.75 );
+			return( 0.0 );
+		}
+		float frac = OverTwoPi * atan( Abs( y ) / Abs( x ) );  // first-quadrant fraction of a turn
+		if( x < 0.0 ) frac = 0.5 - frac;                        // mirror into the left half plane
+		if( y < 0.0 ) frac = 1.0 - frac;                        // mirror into the lower half plane
+		return( frac );
+	}
+
+	/***************************************************************************
+	*  A R C   T A N                                                           *
+	*                                                                          *
+	* Returns the angle theta such that x = COS( theta ) & y = SIN( theta ).   *
+	*                                                                          *
+	***************************************************************************/
+	float ArcTan( float x, float y )
+	{
+		if( !( Abs( x ) > Epsilon ) )  // x is within Epsilon of zero: only the sign of y matters
+		{
+			if( y >  Epsilon ) return(     PiOverTwo );
+			if( y < -Epsilon ) return( 3 * PiOverTwo );
+			return( 0.0 );
+		}
+		float angle = atan( Abs( y ) / Abs( x ) );  // first-quadrant angle from the magnitudes
+		if( x < 0.0 ) angle = Pi    - angle;          // mirror into the left half plane
+		if( y < 0.0 ) angle = TwoPi - angle;          // mirror into the lower half plane
+		return( angle );
+	}
+
+	/***************************************************************************
+	*  M A C H I N E   E P S I L O N                                           *
+	*                                                                          *
+	* Returns the machine epsilon.                                             *
+	*                                                                          *
+	***************************************************************************/
+	float MachineEpsilon()
+	{
+		float eps   = 1.0;          // last candidate for which 1 + candidate was still above 1
+		float probe = 1.0;          // current candidate, halved on every pass
+		float sum   = 1.0 + probe;
+		while( sum > 1.0 )
+		{
+			eps = probe;
+			probe /= 2.0;
+			sum = (float)( 1.0 + (float)probe );  // Avoid double precision!
+		}
+		return eps;
+	}
+
+	/***************************************************************************
+	*  L O G   G A M M A                                                       *
+	*                                                                          *
+	*  Computes the natural log of the gamma function using the Lanczos        *
+	*  approximation formula.  Gamma is defined by                             *
+	*                                                                          *
+	*                                 ( z - 1 )   -t                           *
+	*         gamma( z ) = Integral[ t           e    dt ]                     *
+	*                                                                          *
+	*                                                                          *
+	*  where the integral ranges from 0 to infinity.  The gamma function       *
+	*  satisfies                                                               *
+	*                    gamma( n + 1 ) = n!                                   *
+	*                                                                          *
+	*  This algorithm has been adapted from "Numerical Recipes", p. 157.       *
+	*                                                                          *
+	***************************************************************************/
+	double LogGamma( double x )
+	{
+		// Lanczos series coefficients, listed in the order they are summed.
+		static const double coeff[6] =
+		{
+			 7.61800917300E+1,
+			-8.65053203300E+1,
+			 2.40140982200E+1,
+			-1.23173951600E+0,
+			 1.20858003000E-3,
+			-5.36382000000E-6
+		};
+		static const double stp = 2.50662827465E+0;
+
+		// series = coeff[0]/x + coeff[1]/(x+1) + ... + coeff[5]/(x+5), accumulated
+		// left to right.
+		double series = coeff[0] / x;
+		for( int i = 1; i < 6; i++ ) series += coeff[i] / ( x + (double)i );
+
+		// Combine the power/exponential prefactor with the series, all in log space.
+		const double s = x + 4.5;
+		const double t = ( x - 0.5 ) * log( s ) - s;
+		return t + log( stp * ( series + 1.0 ) );
+	}
+
+	/***************************************************************************
+	*  L O G   F A C T                                                         *
+	*                                                                          *
+	*  Returns the natural logarithm of n factorial.  For efficiency, some     *
+	*  of the values are cached, so they need be computed only once.           *
+	*                                                                          *
+	***************************************************************************/
+	double LogFact( int n )
+	{
+		static const int Cache_Size = 100;
+		static double cache[ Cache_Size ] = { 0.0 }; // 0.0 means "not computed yet"
+
+		if( n <= 1 ) return 0.0;  // log(0!) == log(1!) == 0; negative n also ends up here
+		if( n >= Cache_Size ) return LogGamma((double)(n+1)); // gamma(n+1) == n!
+
+		// Small n: compute on first use, then answer from the cache.
+		if( cache[n] == 0.0 ) cache[n] = LogGamma((double)(n+1));
+		return cache[n];
+	}
+
+	/***************************************************************************
+	*  M U L T I N O M I A L    C O E F F                                      *
+	*                                                                          *
+	*  Returns the multinomial coefficient ( n; X1 X2 ... Xk ) which is        *
+	*  defined to be n! / ( X1! X2! ... Xk! ).  This is done by computing      *
+	*  exp( log(n!) - log(X1!) - log(X2!) - ... - log(Xk!) ).  The value of    *
+	*  n is obtained by summing the Xi's.                                      *
+	*                                                                          *
+	***************************************************************************/
+	double MultinomialCoeff( int k, int X[] )
+	{
+		int i;
+		// Find n by summing the coefficients.  The sum starts from zero rather
+		// than from X[0], so an empty list (k <= 0) never reads X and yields 1.
+		int  n = 0;
+		for( i = 0; i < k; i++ ) n += X[i];
+
+		// Compute log(n!) then subtract log(X!) for each X.
+
+		double LogCoeff = LogFact( n );
+		for( i = 0; i < k; i++ ) LogCoeff -= LogFact( X[i] );
+
+		// Round the exponential of the result to the nearest integer.
+
+		return floor( exp( LogCoeff ) + 0.5 );
+	}
+
+
+	double MultinomialCoeff( int i, int j, int k )
+	{
+		double LogCoeff = LogFact( i + j + k );   // log( n! ) with n = i + j + k
+		LogCoeff = LogCoeff - LogFact( i ) - LogFact( j ) - LogFact( k );
+		return floor( exp( LogCoeff ) + 0.5 );    // round to the nearest integer
+	}
+
+	/***************************************************************************
+	*  B I N O M I A L    C O E F F S                                          *
+	*                                                                          *
+	*  Generate all n+1 binomial coefficients for a given n.  This is done by  *
+	*  computing the n'th row of Pascal's triangle, starting from the top.     *
+	*  No additional storage is required.                                      *
+	*                                                                          *
+	***************************************************************************/
+	void BinomialCoeffs( int n, long *coeff )
+	{
+		// Builds row n of Pascal's triangle in place, one row per outer pass.
+		// Each row is updated from right to left, so every entry that is read
+		// still holds the previous row and has already been written; the
+		// caller's buffer of n+1 entries may therefore start out uninitialized.
+		coeff[0] = 1;
+		for( int i = 1; i <= n; i++ )
+		{
+			coeff[i] = 1;  // The last entry in any row is always 1.
+			for( int j = i - 1; j >= 1; j-- )
+			{
+				coeff[j] += coeff[j-1]; // C(i,j) = C(i-1,j) + C(i-1,j-1)
+			}
+		}
+	}
+
+	void BinomialCoeffs( int n, double *coeff )
+	{
+		// Builds row n of Pascal's triangle in place, one row per outer pass.
+		// Each row is updated from right to left, so every entry that is read
+		// still holds the previous row and has already been written; the
+		// caller's buffer of n+1 entries may therefore start out uninitialized.
+		coeff[0] = 1.0;
+		for( int i = 1; i <= n; i++ )
+		{
+			coeff[i] = 1.0;  // The last entry in any row is always 1.
+			for( int j = i - 1; j >= 1; j-- )
+			{
+				coeff[j] += coeff[j-1]; // C(i,j) = C(i-1,j) + C(i-1,j-1)
+			}
+		}
+	}
+
+	const double *BinomialCoeffs( int n )
+	{
+		static double *coeff[ BinCoeffMax + 1 ] = { 0 }; // one lazily built row per n; rows are never freed
+		if( n > BinCoeffMax || n < 0 ) 
+		{
+			std::cerr << form( "%d is outside of (0,%d) in BinomialCoeffs", n, BinCoeffMax );
+			return NULL;
+		}
+		if( coeff[n] == NULL ) // Fill in this entry.
+		{
+			double *c = new double[ n + 1 ];
+			if( c == NULL ) // NOTE(review): plain new reports failure by throwing, so this branch looks unreachable -- confirm
+			{
+				std::cerr << form( "Could not allocate for BinomialCoeffs(%d)", n );
+				return NULL;
+			}
+			BinomialCoeffs( n, c ); // fills c[0..n] with row n of Pascal's triangle
+			coeff[n] = c;
+		}
+		return coeff[n]; // the returned row stays owned by the static table
+	}
+
+	/***************************************************************************
+	*  B I N O M I A L    C O E F F                                            *
+	*                                                                          *
+	*  Compute a given binomial coefficient.  Several rows of Pascal's         *
+	*  triangle are stored for efficiently computing the small coefficients.   *
+	*  Higher-order terms are computed using LogFact.                          *
+	*                                                                          *
+	***************************************************************************/
+	double BinomialCoeff( int n, int k )
+	{
+		double b;
+		int    p = n - k; // the complementary index: C(n,k) == C(n,p)
+		if( k <= 1 || p <= 1 )  // Check for errors and special cases.
+		{
+			if( k == 0 || p == 0 ) return 1; // C(n,0) == C(n,n) == 1
+			if( k == 1 || p == 1 ) return n; // C(n,1) == C(n,n-1) == n
+			std::cerr << form( "BinomialCoeff(%d,%d) is undefined", n, k ); // only reached when k or p is negative
+			return 0;
+		}
+		static const int  // Store part of Pascal's triangle for small coeffs.
+			n0[] = { 1 },
+			n1[] = { 1, 1 },
+			n2[] = { 1, 2, 1 },
+			n3[] = { 1, 3, 3, 1 },
+			n4[] = { 1, 4, 6, 4, 1 },
+			n5[] = { 1, 5, 10, 10, 5, 1 },
+			n6[] = { 1, 6, 15, 20, 15, 6, 1 },
+			n7[] = { 1, 7, 21, 35, 35, 21, 7, 1 },
+			n8[] = { 1, 8, 28, 56, 70, 56, 28, 8, 1 },
+			n9[] = { 1, 9, 36, 84, 126, 126, 84, 36, 9, 1 };
+		switch( n ) // k >= 2 and p >= 2 at this point, so n >= 4 and cases 0..3 are never taken
+		{
+		case 0 : b = n0[k]; break;
+		case 1 : b = n1[k]; break;
+		case 2 : b = n2[k]; break;
+		case 3 : b = n3[k]; break;
+		case 4 : b = n4[k]; break;
+		case 5 : b = n5[k]; break;
+		case 6 : b = n6[k]; break;
+		case 7 : b = n7[k]; break;
+		case 8 : b = n8[k]; break;
+		case 9 : b = n9[k]; break;
+		default: // n >= 10: go through log-factorials and round to the nearest integer
+			{
+				double x = LogFact( n ) - LogFact( p ) - LogFact( k );
+				b = floor( exp( x ) + 0.5 );
+			}
+		}
+		return b;
+	}
+
+
+	/***************************************************************************
+	*  L O G   D O U B L E   F A C T   (Log of double factorial)               *
+	*                                                                          *
+	*  Return log( n!! ) where the double factorial is defined by              *
+	*                                                                          *
+	*      (2 n + 1)!! = 1 * 3 * 5 * ... * (2n + 1)    (Odd integers)          *
+	*                                                                          *
+	*      (2 n)!!     = 2 * 4 * 6 * ... * 2n          (Even integers)         *
+	*                                                                          *
+	*  and is related to the single factorial via                              *
+	*                                                                          *
+	*      (2 n + 1)!! = (2 n + 1)! / ( 2^n n! )       (Odd integers)          *
+	*                                                                          *
+	*      (2 n)!!     = 2^n n!                        (Even integers)         *
+	*                                                                          *
+	***************************************************************************/
+	double LogDoubleFact( int n )   // log( n!! )
+	{
+		const int    half = n / 2;
+		const double evenPart = LogFact( half ) + half * LogTwo; // log( 2^half * half! )
+		if( !Odd(n) ) return evenPart;     // even n: n!! == 2^half * half!
+		return LogFact( n ) - evenPart;    // odd n:  n!! == n! / ( 2^half * half! )
+	}
+};
diff --git a/src/nvtt/bc7/arvo/ArvoMath.h b/src/nvtt/bc7/arvo/ArvoMath.h
new file mode 100644
index 0000000..e9edd7e
--- /dev/null
+++ b/src/nvtt/bc7/arvo/ArvoMath.h
@@ -0,0 +1,212 @@
+/***************************************************************************
+* Math.h                                                                   *
+*                                                                          *
+* Convenient constants, macros, and inline functions for basic math        *
+* functions.                                                               *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    06/17/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __MATH_INCLUDED__
+#define __MATH_INCLUDED__
+
+#include <math.h>
+#include <stdlib.h>
+
+namespace ArvoMath {
+
+#ifndef MAXFLOAT
+#define MAXFLOAT 1.0E+20
+#endif
+
+	static const double
+		Pi            = 3.14159265358979,
+		PiSquared     = Pi * Pi,
+		TwoPi         = 2.0 * Pi,
+		FourPi        = 4.0 * Pi,
+		PiOverTwo     = Pi / 2.0,
+		PiOverFour    = Pi / 4.0,
+		OverPi        = 1.0 / Pi,
+		OverTwoPi     = 1.0 / TwoPi,
+		OverFourPi    = 1.0 / FourPi,
+		Infinity      = MAXFLOAT,
+		Tiny          = 1.0 / MAXFLOAT,
+		DegreesToRad  = Pi / 180.0,
+		RadToDegrees  = 180.0 / Pi;
+
+	inline int    Odd   ( int    k           ) { return k & 1; }
+	inline int    Even  ( int    k           ) { return !(k & 1); }
+	inline float  Abs   ( int    x           ) { return x > 0  ? x : -x; }
+	inline float  Abs   ( float  x           ) { return x > 0. ? x : -x; }
+	inline float  Abs   ( double x           ) { return x > 0. ? x : -x; } // NOTE: the double result is narrowed to float
+	inline float  Min   ( float  x, float  y ) { return x < y ? x : y; }
+	inline float  Max   ( float  x, float  y ) { return x > y ? x : y; }
+	inline double dMin  ( double x, double y ) { return x < y ? x : y; }
+	inline double dMax  ( double x, double y ) { return x > y ? x : y; }
+	inline float  Sqr   ( int    x           ) { return x * x; }
+	inline float  Sqr   ( float  x           ) { return x * x; }
+	inline float  Sqr   ( double x           ) { return x * x; } // NOTE: the double result is narrowed to float
+	inline float  Sqrt  ( double x           ) { return x > 0. ? sqrt(x) : 0.; } // 0 for non-positive x; narrowed to float
+	inline float  Cubed ( float  x           ) { return x * x * x; }
+	inline int    Sign  ( float  x           ) { return x > 0. ? 1 : (x < 0. ? -1 : 0); }
+	inline void   Swap  ( float &a, float &b ) { float c = a; a = b; b = c; }
+	inline void   Swap  ( int   &a, int   &b ) { int   c = a; a = b; b = c; }
+	inline double Sin   ( double x, int    n ) { return pow( sin(x), n ); } // sin(x) raised to the n'th power
+	inline double Cos   ( double x, int    n ) { return pow( cos(x), n ); } // cos(x) raised to the n'th power
+	inline float  ToSin ( double x           ) { return Sqrt( 1.0 - Sqr(x) ); } // sqrt( 1 - x^2 ), non-negative root
+	inline float  ToCos ( double x           ) { return Sqrt( 1.0 - Sqr(x) ); } // same formula as ToSin
+	inline float  MaxAbs( float  x, float  y ) { return Max( Abs(x), Abs(y) ); }
+	inline float  MinAbs( float  x, float  y ) { return Min( Abs(x), Abs(y) ); }
+	inline float  Pythag( double x, double y ) { return Sqrt( x*x + y*y ); } // NOTE: the result is narrowed to float
+
+	inline double ArcCos( double x )
+	{
+		// Arguments outside [-1,1] are clamped: acos(1) == 0 and acos(-1) == Pi.
+		if( x >  1.0 ) return 0.0;
+		if( x < -1.0 ) return Pi;
+		// In range or NaN: acos() passes NaN through (y used to stay uninitialized).
+		return acos( x );
+	}
+
+	inline double ArcSin( double x )
+	{
+		// Clamp into [-1,1] so asin() always gets an in-range argument (NaN passes through).
+		const double clamped = x < -1.0 ? -1.0 : ( x > 1.0 ? 1.0 : x );
+		return asin( clamped );
+	}
+
+	inline float Clamp( float min, float &x, float max )
+	{
+		// x is clamped in place and also returned; the lower bound is tested first.
+		if( x < min ) { x = min; return x; }
+		if( x > max ) x = max;
+		return x;
+	}
+
+	inline double Clamp( float min, double &x, float max )
+	{
+		// x is clamped in place and also returned; the lower bound is tested first.
+		if( x < min ) { x = min; return x; }
+		if( x > max ) x = max;
+		return x;
+	}
+
+	inline float Max( float x, float y, float z )
+	{
+		// Largest of three values: x is tried first, then y; z is the
+		// fallback.
+		if( x >= y && x >= z ) return x;
+		if( y >= z ) return y;
+		return z;
+	}
+
+	inline float Min( float x, float y, float z )
+	{
+		// Smallest of three values: x is tried first, then y; z is the
+		// fallback.
+		if( x <= y && x <= z ) return x;
+		if( y <= z ) return y;
+		return z;
+	}
+
+	inline float Max( float x, float y, float z, float w )
+	{
+		// Largest of four values: x is tried first, then y, then z; w is the
+		// fallback.
+		if( x >= y && x >= z && x >= w ) return x;
+		if( y >= z && y >= w ) return y;
+		if( z >= w ) return z;
+		return w;
+	}
+
+	inline float Min( float x, float y, float z, float w )
+	{
+		// Smallest of four values: x is tried first, then y, then z; w is the
+		// fallback.
+		if( x <= y && x <= z && x <= w ) return x;
+		if( y <= z && y <= w ) return y;
+		if( z <= w ) return z;
+		return w;
+	}
+
+	inline double dMax( double x, double y, double z )
+	{
+		// Largest of three doubles: x is tried first, then y; z is the
+		// fallback.
+		if( x >= y && x >= z ) return x;
+		if( y >= z ) return y;
+		return z;
+	}
+
+	inline double dMin( double x, double y, double z )
+	{
+		// Smallest of three doubles: x is tried first, then y; z is the
+		// fallback.
+		if( x <= y && x <= z ) return x;
+		if( y <= z ) return y;
+		return z;
+	}
+
+	inline float MaxAbs( float x, float y, float z )
+	{
+		return Max( Abs( x ), Abs( y ), Abs( z ) ); // largest magnitude among the three arguments
+	}
+
+	inline float MaxAbs( float x, float y, float z, float w )
+	{
+		return Max( Abs( x ), Abs( y ), Abs( z ), Abs( w ) ); // largest magnitude among the four arguments
+	}
+
+	inline float Pythag( float x, float y, float z )
+	{
+		return sqrt( x * x  +  y * y  +  z * z ); // Euclidean length of (x, y, z); calls sqrt directly, not Sqrt()
+	}
+
+	extern float  ArcTan          ( float x, float y      );
+	extern float  ArcQuad         ( float x, float y      );
+	extern float  MachineEpsilon  (                       );
+	extern double LogGamma        ( double x              );
+	extern double LogFact         ( int n                 );
+	extern double LogDoubleFact   ( int n                 );   // log( n!! )
+	extern double BinomialCoeff   ( int n, int k          );
+	extern void   BinomialCoeffs  ( int n, long   *coeffs );
+	extern void   BinomialCoeffs  ( int n, double *coeffs );
+	extern double MultinomialCoeff( int i, int j, int k   );
+	extern double MultinomialCoeff( int k, int N[]        );
+	extern double RelErr          ( double x, double y    );
+
+#ifndef ABS
+#define ABS( x ) ((x) > 0 ? (x) : -(x))
+#endif
+
+#ifndef MAX
+#define MAX( x, y ) ((x) > (y) ? (x) : (y))
+#endif
+
+#ifndef MIN
+#define MIN( x, y ) ((x) < (y) ? (x) : (y))
+#endif
+
+};
+
+#endif
+
+
+
+
+
+
+
diff --git a/src/nvtt/bc7/arvo/Char.cpp b/src/nvtt/bc7/arvo/Char.cpp
new file mode 100644
index 0000000..cc450a5
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Char.cpp
@@ -0,0 +1,420 @@
+/***************************************************************************
+* Char.C                                                                   *
+*                                                                          *
+* Convenient constants, macros, and inline functions for manipulation of   *
+* characters and strings.                                                  *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    07/01/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "Char.h"
+
+namespace ArvoMath {
+
+	typedef char *charPtr;
+
+	// Treat "str" as a file name and copy just its directory portion
+	// into "buff" -- i.e. strip off the name of the leaf object (but
+	// leave the final "/").  Returns "buff".
+	const char *getPath( const char *str, char *buff )
+	{
+		// Scan backwards for the last slash; "last" ends at -1 if none exists.
+		int last = strlen( str ) - 1;
+		while( last >= 0 && str[last] != Slash ) last--;
+		// Copy everything up to and including that slash, then terminate.
+		const int keep = last + 1;
+		for( int i = 0; i < keep; i++ ) buff[i] = str[i];
+		buff[keep] = NullChar;
+		return buff;
+	}
+
+	// Treat "str" as a file name and copy just the leaf name into
+	// "buff" -- i.e. drop everything up to and including the final
+	// "/".  With no "/" the whole string is copied.  Returns "buff".
+	const char *getFile( const char *str, char *buff )
+	{
+		const int len = strlen( str );
+		// Find the last slash; "cut" becomes -1 if there is none.
+		int cut = len - 1;
+		while( cut >= 0 && str[cut] != Slash ) cut--;
+		// Copy the tail, including the terminating null of "str".
+		const char *leaf = str + cut + 1;
+		for( int i = 0; i < len - cut; i++ ) buff[i] = leaf[i];
+		return buff;
+	}
+
+	int getPrefix( const char *str, char *buff )
+	{
+		// Copy the part of "str" preceding its first period (or all of it
+		// when there is no period) into "buff"; return the copied length.
+		int len = 0;
+		for( ; str[len] != NullChar && str[len] != Period; len++ )
+			buff[len] = str[len];
+		buff[len] = NullChar;
+		return len;
+	}
+
+	int getSuffix( const char *str, char *buff )
+	{
+		// Copies the text after the last period (or all of "str" if it has none).
+		const char *end  = str + strlen( str );
+		const char *tail = end;
+		while( tail > str && tail[-1] != Period ) tail--;
+		for( int i = 0; tail + i <= end; i++ ) buff[i] = tail[i];  // Includes the null.
+		return (int)( end - tail );
+	}
+
+	const char* toString( int number, char *buff )
+	{
+		static char local_buff[32];  // Shared fallback, used when "buff" is NULL.
+		if( buff == NULL ) buff = local_buff;
+		sprintf( buff, "%d", number );
+		return buff;
+	}
+
+	const char* toString( float number, char *buff )
+	{
+		static char local_buff[32];  // Shared fallback, used when "buff" is NULL.
+		if( buff == NULL ) buff = local_buff;
+		sprintf( buff, "%g", number );
+		return buff;
+	}
+
+	int isInteger( const char *str )
+	{
+		// True when "str" consists solely of digits, '+', '-' and spaces
+		// (in any order or quantity, so the empty string also qualifies).
+		for( const char *p = str; *p != NullChar; p++ )
+		{
+			// Reject on the first character outside the permitted set.
+			const char c = *p;
+			const int allowed = isDigit( c ) || c == Plus || c == Minus || c == Space;
+			if( !allowed ) return 0;
+		}
+		return 1;
+	}
+
+	// Test whether "string" ends with "suffix".  A match yields the (non-zero)
+	// offset at which the suffix begins; a string equal to its suffix fails.
+	int hasSuffix( const char *string, const char *suffix )
+	{
+		// A null suffix matches anything; a null string matches nothing else.
+		if( suffix == NULL ) return 1;
+		if( string == NULL ) return 0;
+		const int suffix_len = strlen( suffix );
+		if( suffix_len <= 0 ) return 1;
+
+		// Require at least one character ahead of the suffix.
+		const int start = (int) strlen( string ) - suffix_len;
+		if( start < 1 ) return 0;
+		// Both tails have equal length here, so strcmp decides the match.
+		return ( strcmp( string + start, suffix ) == 0 ) ? start : 0;
+	}
+
+	// Test whether "string" begins with "prefix".
+	int hasPrefix( const char *string, const char *prefix )
+	{
+		// The null prefix always matches; a null string has no other prefix.
+		if( prefix == NULL ) return 1;
+		if( string == NULL ) return 0;
+
+		// Compare only as many characters as the prefix holds; strncmp stops
+		// early if "string" ends first, which counts as a mismatch.
+		return strncmp( string, prefix, strlen( prefix ) ) == 0;
+	}
+
+	// Test whether "str" contains "c"; null and empty strings contain nothing.
+	int inString( char c, const char *str )
+	{
+		if( str == NULL ) return 0;
+		for( const char *p = str; *p != NullChar; p++ )
+			if( *p == c ) return 1;
+		return 0;
+	}
+
+	int nullString( const char *str )
+	{
+		return str == NULL || str[0] == NullChar;  // Non-zero for a NULL pointer or an empty string.
+	}
+
+	const char *stripSuffix( const char *string, const char *suffix, char *buff )
+	{
+		// Returns NULL on mismatch, NULL "string", or static-buffer overflow.
+		static char local_buff[256];
+		if( buff == NULL ) buff = local_buff;
+		buff[0] = NullChar;
+		if( string == NULL || !hasSuffix( string, suffix ) ) return NULL;
+		// hasSuffix accepts a NULL suffix; strip nothing rather than strlen( NULL ).
+		int s = (int) strlen( string ) - ( suffix != NULL ? (int) strlen( suffix ) : 0 );
+		if( buff == local_buff && s >= (int) sizeof( local_buff ) ) return NULL;
+		for( int i = 0; i < s; i++ ) buff[i] = string[i];
+		buff[s] = NullChar;
+		return buff;
+	}
+
+	int getIndex( const char *pat, const char *str )
+	{
+		// Locate the first occurrence of "pat" inside "str" and return its
+		// offset from the start of "str", or -1 if it does not occur.
+
+		// An empty pattern or an empty string never matches.  (strstr would
+		// report a match at offset 0 for an empty pattern, so test first.)
+		if( *pat == '\0' || *str == '\0' ) return -1;
+
+		// strstr yields a pointer to the first match, or NULL.
+		const char *hit = strstr( str, pat );
+		if( hit == NULL ) return -1;
+
+		// Convert the pointer back into an index.
+		return (int)( hit - str );
+	}
+
+	int getSubstringAfter( const char *pat, const char *str, char *buff )
+	{
+		// Copy whatever follows the first occurrence of "pat" in "str" into
+		// "buff".  Returns -1 if "pat" is absent, otherwise the number of
+		// bytes written, counting the terminating null.
+		const int ind = getIndex( pat, str );
+		if( ind < 0 ) return -1;
+
+		const char *rest = str + ind + strlen( pat );
+		int k = 0;
+		do { buff[k] = rest[k]; } while( rest[k++] != NullChar );
+		return k;
+	}
+
+	const char *SubstringAfter( const char *pat, const char *str, char *user_buff )
+	{
+		// Falls back to returning "str" itself when "pat" does not occur in it.
+		// The static fallback buffer holds 128 bytes and is shared between calls.
+		static char temp[128];
+		char *buff = ( user_buff == NULL ) ? temp : user_buff;
+		return ( getSubstringAfter( pat, str, buff ) > 0 ) ? buff : str;
+	}
+
+	const char *metaString( const char *str, char *user_buff )
+	{
+		static char temp[128];  // Fallback buffer used when no user buffer is given.
+		char *buff = ( user_buff != NULL ) ? user_buff : temp;
+		sprintf( buff, "\"%s\"", str );  // Wrap "str" in double quotes; no length check is made.
+		return buff;
+	}
+
+	// The opposite of metaString: copy "str" with every double quote removed.
+	const char *stripQuotes( const char *str, char *user_buff )
+	{
+		static char temp[128];
+		char *buff = ( user_buff == NULL ) ? temp : user_buff;
+		char *out = buff;
+		for( ; *str != NullChar; str++ )
+		{
+			// Keep every character except the quotes.
+			if( *str != DoubleQuote ) *out++ = *str;
+		}
+		*out = NullChar;
+		return buff;
+	}
+
+	int getIntFlag( const char *flags, const char *flag, int &value )
+	{
+		// Scan "flags" for the text "<flag>=" and, at the first such place,
+		// store the integer that follows in "value" and return 1.  Returns 0
+		// (leaving "value" untouched) when no such assignment is present.
+		for( const char *p = flags; *p; p++ )
+		{
+			if( !hasPrefix( p, flag ) ) continue;
+
+			// The flag name must be followed directly by '='.
+			const char *after = p + strlen( flag );
+			if( *after != '=' ) continue;
+			value = atoi( after + 1 );
+			return 1;
+		}
+		return 0;
+	}
+
+	int getFloatFlag( const char *flags, const char *flag, float &value )
+	{
+		// Scan "flags" for the text "<flag>=" and, at the first such place,
+		// store the number that follows in "value" and return 1.  Returns 0
+		// (leaving "value" untouched) when no such assignment is present.
+		for( const char *p = flags; *p; p++ )
+		{
+			if( !hasPrefix( p, flag ) ) continue;
+
+			// The flag name must be followed directly by '='.
+			const char *after = p + strlen( flag );
+			if( *after != '=' ) continue;
+			value = atof( after + 1 );
+			return 1;
+		}
+		return 0;
+	}
+
+	SortedList::SortedList( sort_type type_, int ascend_ )
+		: num_elements( 0 ),
+		  max_elements( 0 ),
+		  sorted( 1 ),          // An empty list is trivially sorted.
+		  ascend( ascend_ ),
+		  type( type_ ),
+		  list( NULL )          // Storage is first allocated by Expand().
+	{
+	}
+
+	SortedList::~SortedList()
+	{
+		Clear();        // Release the stored strings first...
+		delete[] list;  // ...then the pointer array itself.
+	}
+
+	void SortedList::Clear()
+	{
+		// Release the private string copies and reset the list, keeping its storage.
+		// The copies come from strdup() in operator<<, so free() them, not delete.
+		for( int i = 0; i < num_elements; i++ ) 
+		{
+			free( list[i] );
+			list[i] = NULL;
+		}
+		num_elements = 0;
+		sorted       = 1;
+	}
+
+	SortedList &SortedList::operator<<( const char *str )
+	{
+		// Add a new string to the end of the list, expanding the list if necessary.
+		// Mark the list as unsorted, so that the next reference to an element will
+		// cause the list to be sorted again.
+		if( num_elements == max_elements ) Expand();
+		list[ num_elements++ ] = strdup( str );  // Store a private copy made by strdup().
+		sorted = 0;
+		return *this;  // Returning the list allows chained insertions.
+	}
+
+	const char *SortedList::operator()( int i )
+	{
+		// Return the i'th element (sorting first if needed), or "" when out of range.
+		static const char *null = "";  // const: a string literal must not bind to char*.
+		if( num_elements == 0 || i < 0 || i >= num_elements ) return null;
+		if( !sorted ) Sort();
+		return list[i];
+	}
+
+	void SortedList::Expand()
+	{
+		// Double the capacity (an empty list grows to 2 slots) and move the
+		// existing string pointers across; "num_elements" is unchanged.
+		max_elements = 2 * ( max_elements == 0 ? 1 : max_elements );
+		charPtr *grown = new charPtr[ max_elements ];
+		int i = 0;
+		for( ; i < num_elements; i++ ) grown[i] = list[i];
+		for( ; i < max_elements; i++ ) grown[i] = NULL;
+		delete[] list;
+		list = grown;
+	}
+
+	void SortedList::Swap( int i, int j )
+	{
+		char *temp = list[i];  // Exchange the two string pointers; no characters are copied.
+		list[i] = list[j];
+		list[j] = temp;
+	}
+
+	int SortedList::inOrder( int p, int q ) const
+	{
+		// Decide whether element p may precede element q in ascending order;
+		// the answer is simply negated for descending lists.
+		const int by_name = ( strcmp( list[p], list[q] ) <= 0 );
+		int test = by_name;
+		if( type != sort_alphabetic )
+		{
+			// Shorter strings come first; names only break length ties.
+			const int len_p = strlen( list[p] );
+			const int len_q = strlen( list[q] );
+			if( len_p != len_q ) test = ( len_p < len_q );
+		}
+		return ascend ? test : !test;
+	}
+
+	// This is an insertion sort that operates on subsets of the
+	// input defined by the step length.
+	void SortedList::InsertionSort( int start, int size, int step ) 
+	{
+		for( int i = 0; i + step < size; i += step )
+		{
+			for( int j = i; j >= 0; j -= step )  // Walk back toward "start" in strides of "step".
+			{
+				int p = start + j;
+				int q = p + step;
+				if( inOrder( p, q ) ) break;  // Pair already ordered; stop walking back.
+				Swap( p, q );
+			}
+		}
+	}
+
+	// This is a Shell sort.
+	void SortedList::Sort()
+	{
+		for( int step  = num_elements / 2; step > 1; step /= 2 )  // Halve the stride on each pass.
+			for( int start = 0; start < step; start++ )
+				InsertionSort( start, num_elements  - start, step );  // One strided subsequence.
+		InsertionSort( 0, num_elements, 1 );  // Final pass with stride 1 over the whole list.
+		sorted = 1;
+	}
+
+	void SortedList::SetOrder( sort_type type_, int ascend_ )
+	{
+		// Nothing to do when the requested ordering is already in effect.
+		if( type_ == type && ascend_ == ascend ) return;
+		// Record the new ordering and mark the list as needing a re-sort.
+		type   = type_;
+		ascend = ascend_;
+		sorted = 0;
+	}
+
+	int getstring( std::istream &in, const char *str )
+	{
+		// Returns 1 if the next characters extracted from "in" spell "str" (or it is NULL).
+		char ch;
+		if( str == NULL ) return 1;
+		for( ; *str != NullChar; str++ )
+		{
+			// A failed extraction leaves "ch" unset, so test the stream first.
+			if( !( in >> ch ) || *str != ch ) return 0;
+		}
+		return 1;
+	}
+
+	std::istream &skipWhite( std::istream &in )
+	{
+		// Discard leading whitespace (as defined by isWhite); the first
+		// other character is pushed back so the caller reads it next.
+		char c;
+		while( in.get(c) )
+		{
+			if( isWhite( c ) ) continue;
+			in.putback(c);
+			break;
+		}
+		return in;
+	}
+};
diff --git a/src/nvtt/bc7/arvo/Char.h b/src/nvtt/bc7/arvo/Char.h
new file mode 100644
index 0000000..2742c1d
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Char.h
@@ -0,0 +1,245 @@
+/***************************************************************************
+* Char.h                                                                   *
+*                                                                          *
+* Convenient constants, macros, and inline functions for manipulation of   *
+* characters and strings.                                                  *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    07/01/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __CHAR_INCLUDED__
+#define __CHAR_INCLUDED__
+
+#include <string>
+#include <iostream>
+
+namespace ArvoMath {
+
+	static const char 
+		Apostrophe  = '\'' ,
+		Asterisk    = '*'  ,
+		Atsign      = '@'  ,
+		Backslash   = '\\' ,
+		Bell        = '\7' ,
+		Colon       = ':'  ,
+		Comma       = ','  ,
+		Dash        = '-'  ,
+		DoubleQuote = '"'  ,
+		EqualSign   = '='  ,
+		Exclamation = '!'  ,
+		GreaterThan = '>'  ,
+		Hash        = '#'  ,
+		Lbrack      = '['  ,
+		Lcurley     = '{'  ,
+		LessThan    = '<'  ,
+		Lparen      = '('  ,
+		Minus       = '-'  ,
+		NewLine     = '\n' ,
+		NullChar    = '\0' ,
+		Percent     = '%'  ,
+		Period      = '.'  ,
+		Pound       = '#'  ,
+		Plus        = '+'  ,
+		Rbrack      = ']'  ,
+		Rcurley     = '}'  ,
+		Rparen      = ')'  ,
+		Semicolon   = ';'  ,
+		Space       = ' '  ,
+		Slash       = '/'  ,
+		Star        = '*'  ,
+		Tab         = '\t' ,
+		Tilde       = '~'  ,
+		Underscore  = '_'  ;
+
+	inline int  isWhite( char c ) { return c == Space || c == NewLine || c == Tab; }  // Space, newline or tab only.
+	inline int  isUcase( char c ) { return 'A' <= c && c <= 'Z'; }  // Tests the 'A'..'Z' range.
+	inline int  isLcase( char c ) { return 'a' <= c && c <= 'z'; }  // Tests the 'a'..'z' range.
+	inline int  isAlpha( char c ) { return isUcase( c ) || isLcase( c ); }  // Either letter range.
+	inline int  isDigit( char c ) { return '0' <= c && c <= '9'; }  // Tests the '0'..'9' range.
+	inline char ToLower( char c ) { return isUcase( c ) ? c + ( 'a' - 'A' ) : c; }  // Other characters pass through.
+	inline char ToUpper( char c ) { return isLcase( c ) ? c + ( 'A' - 'a' ) : c; }  // Other characters pass through.
+
+	extern const char *getPath( 
+		const char *str, 
+		char *buff 
+		);
+
+	extern const char *getFile( 
+		const char *str, 
+		char *buff 
+		);
+
+	extern int getPrefix( 
+		const char *str, 
+		char *buff 
+		);
+
+	extern int getSuffix( 
+		const char *str, 
+		char *buff 
+		);
+
+	extern int isInteger( 
+		const char *str
+		);
+
+	extern int hasSuffix( 
+		const char *string, 
+		const char *suffix 
+		);
+
+	extern int hasPrefix( 
+		const char *string, 
+		const char *prefix 
+		);
+
+	extern int inString( 
+		char c, 
+		const char *str 
+		);
+
+	extern int nullString( 
+		const char *str 
+		);
+
+	extern const char *stripSuffix(  // Return NULL if unsuccessful.
+		const char *string,  // The string to truncate.
+		const char *suffix,  // The suffix to remove.
+		char  *buff = NULL   // Defaults to internal buffer.
+		);
+
+	extern const char* toString( 
+		int  n,            // An integer to convert to a string.
+		char *buff = NULL  // Defaults to internal buffer.
+		);
+
+	extern const char* toString( 
+		float x,           // A float to convert to a string.
+		char *buff = NULL  // Defaults to internal buffer.
+		);
+
+	extern int getIndex( // The index of the start of a pattern in a string.
+		const char *pat, // The pattern to look for.
+		const char *str  // The string to search.
+		);
+
+	extern int getSubstringAfter( 
+		const char *pat, 
+		const char *str, 
+		char *buff 
+		);
+
+	extern const char *SubstringAfter( 
+		const char *pat, 
+		const char *str,
+		char *buff = NULL  // Defaults to internal buffer.
+		);
+
+	extern const char *metaString(
+		const char *str,   // Make this a string within a string.
+		char *buff = NULL  // Defaults to internal buffer.
+		);
+
+	extern const char *stripQuotes(
+		const char *str,   // This is the opposite of metaString.
+		char *buff = NULL  // Defaults to internal buffer.
+		);
+
+	extern int getIntFlag( 
+		const char *flags, // List of assignment statements.
+		const char *flag,  // A specific flag to look for.
+		int &value         // The variable to assign the value to.
+		);
+
+	extern int getFloatFlag( 
+		const char *flags, // List of assignment statements.
+		const char *flag,  // A specific flag to look for.
+		float &value       // The variable to assign the value to.
+		);
+
+	extern int getstring( 
+		std::istream &in, 
+		const char *str 
+		);
+
+	enum sort_type {
+		sort_alphabetic,    // Standard dictionary ordering.
+		sort_lexicographic  // Sort first by length, then alphabetically.
+	};
+
+	class SortedList {
+
+	public:
+		SortedList( sort_type = sort_alphabetic, int ascending = 1 );
+		~SortedList();
+		SortedList &operator<<( const char * );  // Append a copy of the string.
+		int Size() const { return num_elements; }  // Number of strings currently stored.
+		const char *operator()( int i );  // i'th string in sorted order; "" if out of range.
+		void Clear();  // Drop every string but keep the allocated array.
+		void SetOrder( sort_type = sort_alphabetic, int ascending = 1 );
+
+	private:
+		void Sort();
+		void InsertionSort( int start, int size, int step );
+		void Swap( int i, int j );
+		void Expand();  // Doubles the capacity of "list".
+		int  inOrder( int i, int j ) const;
+		int  num_elements;  // Strings currently held.
+		int  max_elements;  // Allocated length of "list".
+		int  sorted;        // Non-zero while "list" is known to be in order.
+		int  ascend;        // Non-zero for ascending order.
+		sort_type type;
+		char **list;  // Owned array.  NOTE(review): no copy ctor/assignment is declared, so copies would share it.
+	};
+
+
+	inline int Match( const char *s, const char *t )
+	{
+		// True only when both strings are non-NULL and identical.
+		return ( s != NULL && t != NULL ) ? ( strcmp( s, t ) == 0 ) : 0;
+	}
+
+	inline int Match( const char *s, const char *t1, const char *t2 )
+	{
+		if( s == NULL ) return 0;  // A NULL subject matches nothing.
+		if( t1 != NULL && strcmp( s, t1 ) == 0 ) return 1;
+		return t2 != NULL && strcmp( s, t2 ) == 0;
+	}
+
+	union long_union_float {
+		long  i;   // Listed first so that "= { 0 }" clears every byte of it.
+		float f;
+	};
+
+	inline long float_as_long( float x )
+	{
+		long_union_float u = { 0 };  // "long" may be wider than "float"; avoid stray high bytes.
+		u.f = x;
+		return u.i;
+	}
+
+	inline float long_as_float( long i )
+	{
+		long_union_float u;
+		u.i = i;   // Only the bytes shared with "f" are reinterpreted.
+		return u.f;
+	}
+
+	extern std::istream &skipWhite( std::istream &in );
+};
+#endif
diff --git a/src/nvtt/bc7/arvo/Complex.cpp b/src/nvtt/bc7/arvo/Complex.cpp
new file mode 100644
index 0000000..468704f
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Complex.cpp
@@ -0,0 +1,76 @@
+/***************************************************************************
+* Complex.C                                                                *
+*                                                                          *
+* Complex numbers, complex arithmetic, and functions of a complex          *
+* variable.                                                                *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    03/02/2000  Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include "Complex.h"
+#include "form.h"
+
+namespace ArvoMath {
+	const Complex Complex::i( 0.0, 1.0 );
+
+	std::ostream &operator<<( std::ostream &out, const Complex &z )
+	{
+		out << form( "(%f,%f) ", z.Real(), z.Imag() );  // form() comes from form.h; presumably printf-style -- confirm.
+		return out;
+	}
+
+	Complex cos( const Complex &z )
+	{
+		// cos( a + ib ) = cos a cosh b  -  i sin a sinh b
+		const float a = z.Real();
+		const float b = z.Imag();
+		return Complex( ::cos( a ) * ::cosh( b ), -::sin( a ) * ::sinh( b ) );
+	}
+
+	Complex sin( const Complex &z )
+	{
+		// sin( a + ib ) = sin a cosh b  +  i cos a sinh b
+		const float a = z.Real();
+		const float b = z.Imag();
+		return Complex( ::sin( a ) * ::cosh( b ), ::cos( a ) * ::sinh( b ) );
+	}
+
+	Complex cosh( const Complex &z )
+	{
+		// cosh( a + ib ) = cosh a cos b  +  i sinh a sin b
+		const float a = z.Real();
+		const float b = z.Imag();
+		return Complex( ::cosh( a ) * ::cos( b ), ::sinh( a ) * ::sin( b ) );
+	}
+
+	Complex sinh( const Complex &z )
+	{
+		// sinh( a + ib ) = sinh a cos b  +  i cosh a sin b
+		const float a = z.Real();
+		const float b = z.Imag();
+		return Complex( ::sinh( a ) * ::cos( b ), ::cosh( a ) * ::sin( b ) );
+	}
+
+	Complex log( const Complex &z )
+	{
+		float r = ::sqrt( z.Real() * z.Real() + z.Imag() * z.Imag() );  // |z|
+		float t = ::atan2( z.Imag(), z.Real() );  // Accurate near the real axis, unlike acos( Re/r ).
+		if( t < 0.0 ) t += 2.0 * 3.14159265358979;  // Keep the angle in [0, 2 pi) as before.
+		return Complex( ::log(r), t );  // log z = ln|z| + i arg z
+	}
+};
diff --git a/src/nvtt/bc7/arvo/Complex.h b/src/nvtt/bc7/arvo/Complex.h
new file mode 100644
index 0000000..671fd57
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Complex.h
@@ -0,0 +1,187 @@
+/***************************************************************************
+* Complex.h                                                                *
+*                                                                          *
+* Complex numbers, complex arithmetic, and functions of a complex          *
+* variable.                                                                *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    03/02/2000  Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __COMPLEX_INCLUDED__
+#define __COMPLEX_INCLUDED__
+
+#include <math.h>
+#include <iostream>
+
+namespace ArvoMath {
+
+	class Complex {
+	public:
+		Complex()                   { x = 0; y = 0; }  // Zero.
+		Complex( float a          ) { x = a; y = 0; }  // Real number a; also an implicit float conversion.
+		Complex( float a, float b ) { x = a; y = b; }  // a + ib.
+		Complex( const Complex &z ) { *this = z; }
+		float &Real() { return x; }  // Writable access to the real part.
+		float &Imag() { return y; }  // Writable access to the imaginary part.
+		float Real() const { return x; }
+		float Imag() const { return y; }
+		inline Complex &operator=( const Complex &z );
+		static const Complex i;  // Defined in Complex.cpp as ( 0, 1 ).
+	private:
+		float x;  // Real part.
+		float y;  // Imaginary part.
+	};
+
+	inline Complex &Complex::operator=( const Complex &z ) 
+	{ 
+		x = z.Real();  // Copy both components.
+		y = z.Imag(); 
+		return *this;
+	}
+
+	inline float Real( const Complex &z )
+	{
+		return z.Real();  // Free-function form of the member accessor.
+	}
+
+	inline float Imag( const Complex &z )
+	{
+		return z.Imag();  // Free-function form of the member accessor.
+	}
+
+	inline Complex conj( const Complex &z )
+	{
+		return Complex( z.Real(), -z.Imag() );  // Complex conjugate: negate the imaginary part.
+	}
+
+	inline double modsqr( const Complex &z )
+	{
+		return z.Real() * z.Real() + z.Imag() * z.Imag();  // Squared modulus |z|^2.
+	}
+
+	inline double modulus( const Complex &z )
+	{
+		return sqrt( z.Real() * z.Real() + z.Imag() * z.Imag() );  // Modulus |z|.
+	}
+
+	inline double arg( const Complex &z )
+	{
+		// Argument of z in [0, 2 pi).
+		float t = ::atan2( z.Imag(), z.Real() );  // Accurate near the real axis; 0 at z = 0.
+		if( t < 0.0 ) t += 2.0 * 3.14159265358979;  // Map negative angles onto ( pi, 2 pi ).
+		return t;
+	}
+
+	inline Complex operator*( const Complex &z, float a )
+	{
+		return Complex( a * z.Real(), a * z.Imag() );  // Scale both components by a.
+	}
+
+	inline Complex operator*( float a, const Complex &z )
+	{
+		return Complex( a * z.Real(), a * z.Imag() );  // Scale both components by a.
+	}
+
+	inline Complex operator*( const Complex &z, const Complex &w )
+	{
+		return Complex( 
+			z.Real() * w.Real() - z.Imag() * w.Imag(),  // Real part of the product.
+			z.Real() * w.Imag() + z.Imag() * w.Real()   // Imaginary part of the product.
+			);
+	}
+
+	inline Complex operator+( const Complex &z, const Complex &w )
+	{
+		return Complex( z.Real() + w.Real(), z.Imag() + w.Imag() );  // Component-wise sum.
+	}
+
+	inline Complex operator-( const Complex &z, const Complex &w )
+	{
+		return Complex( z.Real() - w.Real(), z.Imag() - w.Imag() );  // Component-wise difference.
+	}
+
+	inline Complex operator-( const Complex &z )
+	{
+		return Complex( -z.Real(), -z.Imag() );  // Negate both components.
+	}
+
+	inline Complex operator/( const Complex &z, float w )
+	{
+		return Complex( z.Real() / w, z.Imag() / w );  // Divide both components; w == 0 is not checked.
+	}
+
+	inline Complex operator/( const Complex &z, const Complex &w )
+	{
+		return ( z * conj(w) ) / modsqr(w);  // z / w = z conj(w) / |w|^2; w == 0 is not checked.
+	}
+
+	inline Complex operator/( float a, const Complex &w )
+	{
+		return conj(w) * ( a / modsqr(w) );  // a / w = a conj(w) / |w|^2; w == 0 is not checked.
+	}
+
+	inline Complex &operator+=( Complex &z, const Complex &w )
+	{
+		z.Real() += w.Real();  // Uses the non-const accessors to update z in place.
+		z.Imag() += w.Imag();
+		return z;
+	}
+
+	inline Complex &operator*=( Complex &z, const Complex &w )
+	{
+		return z = ( z * w );  // The product is formed in a temporary before z is overwritten.
+	}
+
+	inline Complex &operator-=( Complex &z, const Complex &w )
+	{
+		z.Real() -= w.Real();  // Uses the non-const accessors to update z in place.
+		z.Imag() -= w.Imag();
+		return z;
+	}
+
+	inline Complex exp( const Complex &z )
+	{
+		float r = ::exp( z.Real() );  // exp( a + ib ) = e^a ( cos b + i sin b )
+		return Complex( r * cos( z.Imag() ), r * sin( z.Imag() ) );
+	}
+
+	inline Complex pow( const Complex &z, int n )
+	{
+		float r = ::pow( modulus( z ), (double)n );  // |z|^n
+		float t = arg( z );  // Angle of z as computed by arg().
+		return Complex( r * cos( n * t ), r * sin( n * t ) );  // z^n = |z|^n ( cos nt + i sin nt )
+	}
+
+	inline Complex polar( float r, float theta )
+	{
+		return Complex( r * cos( theta ), r * sin( theta ) );  // Builds r ( cos theta + i sin theta ).
+	}
+
+
+	extern Complex cos ( const Complex &z );
+	extern Complex sin ( const Complex &z );
+	extern Complex cosh( const Complex &z );
+	extern Complex sinh( const Complex &z );
+	extern Complex log ( const Complex &z );
+
+	extern std::ostream &operator<<( 
+		std::ostream &out, 
+		const Complex & 
+		);
+};
+#endif
+
diff --git a/src/nvtt/bc7/arvo/Matrix.cpp b/src/nvtt/bc7/arvo/Matrix.cpp
new file mode 100644
index 0000000..d84b7ef
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Matrix.cpp
@@ -0,0 +1,1201 @@
+/***************************************************************************
+* Matrix.C                                                                 *
+*                                                                          *
+* General Vector and Matrix classes, with all the associated methods.      *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    08/16/2000    Revamped for CIT tools.                       *
+*      arvo    10/31/1994    Combined RowVec & ColVec into Vector.         *
+*      arvo    06/30/1993    Added singular value decomposition class.     *
+*      arvo    06/25/1993    Major revisions.                              *
+*      arvo    09/08/1991    Initial implementation.                       *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <iostream>
+#include <assert.h>
+#include <math.h>
+#include "ArvoMath.h"
+#include "Vector.h"
+#include "Matrix.h"
+#include "form.h"
+
+namespace ArvoMath {
+	const Matrix Matrix::Null(0);   // the shared empty matrix, built with zero rows
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  C O N S T R U C T O R S                                                *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+
+	// Construct an n_rows x n_cols matrix with every entry set to "value"
+	// (zero by default).  Passing zero for n_cols (the default) requests a
+	// square n_rows x n_rows matrix; SetSize performs that substitution.
+	Matrix::Matrix( int n_rows, int n_cols, float value )
+	{
+		assert( n_rows >= 0 && n_cols >= 0 );
+		// Begin as an empty matrix so that SetSize allocates fresh storage.
+		elem = NULL;
+		rows = cols = 0;
+		SetSize( n_rows, n_cols );
+		const int count = rows * cols;
+		for( int k = 0; k < count; k++ ) elem[k] = value;
+	}
+
+	// Copy constructor: allocates storage with the same shape as M and then
+	// duplicates its elements in storage order.
+	Matrix::Matrix( const Matrix &M )
+	{
+		elem = NULL;
+		rows = cols = 0;
+		SetSize( M.Rows(), M.Cols() );
+		const float *src = M.Array();
+		const int count = rows * cols;
+		for( int k = 0; k < count; k++ ) elem[k] = src[k];
+	}
+	// Destructor: shrinking to 0 x 0 makes SetSize release the element array, if any.
+	Matrix::~Matrix() 
+	{
+		SetSize( 0, 0 );
+	}
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  M I S C E L L A N E O U S   M E T H O D S                              *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+
+	// Re-shape the matrix to new_rows x new_cols (square if new_cols is zero, the
+	// default).  Storage and contents are kept when the element count is
+	// unchanged; otherwise a new, uninitialized array replaces the old one.
+	void Matrix::SetSize( int new_rows, int new_cols )
+	{
+		if( new_cols == 0 ) new_cols = new_rows;
+		const int n = new_rows * new_cols;
+		if( rows * cols != n )
+		{
+			// Allocate first: if "new" throws, elem must not be left dangling.
+			float *fresh = ( n == 0 ) ? NULL : new float[ n ];
+			delete[] elem;
+			elem = fresh;
+		}
+		rows = new_rows;  cols = new_cols;
+	}
+	// Returns a copy of column j as a Vector with one entry per row.
+	// No range check is made on j.
+	Vector Matrix::GetCol( int j ) const
+	{
+		Vector column( rows );
+		float *dst = column.Array();
+		// Successive entries of a column lie "cols" elements apart in storage.
+		for( int i = 0; i < rows; i++ )
+		{
+			dst[i] = elem[ i * cols + j ];
+		}
+		return column;
+	}
+	// Returns a copy of row i as a Vector with one entry per column.  No range check on i.
+	Vector Matrix::GetRow( int i ) const
+	{
+		Vector row( cols );
+		float *dst = row.Array();
+		const float *src = elem + ( i * cols );   // row i is contiguous in storage
+		for( int j = 0; j < cols; j++ ) dst[j] = src[j];
+		return row;
+	}
+	// Overwrites column j with the entries of C, which must have one element
+	// per row of the matrix.  No range check is made on j.
+	void Matrix::SetCol( int j, const Vector &C )
+	{
+		assert( rows == C.Size() );
+		const float *src = C.Array();
+		// Walk down the column: consecutive entries lie "cols" elements apart.
+		for( int i = 0; i < rows; i++ )
+		{
+			elem[ i * cols + j ] = src[i];
+		}
+	}
+	// Overwrites row i with the entries of R, which must have one element per column.
+	void Matrix::SetRow( int i, const Vector &R )
+	{
+		assert( cols == R.Size() );
+		const float *src = R.Array();
+		float *dst = elem + ( i * cols );
+		for( int j = 0; j < cols; j++ ) dst[j] = src[j];
+	}
+	// Copies rows imin..imax and columns jmin..jmax (inclusive); an inverted range gives a 0 x 0 matrix.
+	Matrix Matrix::GetBlock( int imin, int imax, int jmin, int jmax ) const
+	{
+		if( imax < imin || jmax < jmin ) return Matrix(0,0);
+		const int n_rows = imax - imin + 1;
+		const int n_cols = jmax - jmin + 1;
+		Matrix block( n_rows, n_cols );
+		for( int r = 0; r < n_rows; r++ )
+			for( int c = 0; c < n_cols; c++ )
+				block( r, c ) = (*this)( imin + r, jmin + c );
+		return block;
+	}
+	// Copies B into rows imin..imax and columns jmin..jmax (inclusive) of this
+	// matrix.  B must have exactly the dimensions of that block.
+	void Matrix::SetBlock( int imin, int imax, int jmin, int jmax, const Matrix &B )
+	{
+		const int n_rows = imax - imin + 1;
+		const int n_cols = jmax - jmin + 1;
+		assert( n_rows == B.Rows() );
+		assert( n_cols == B.Cols() );
+		for( int r = 0; r < n_rows; r++ )
+		{
+			float *dst = elem + ( imin + r ) * cols + jmin;   // start of block row r
+			for( int c = 0; c < n_cols; c++ ) dst[c] = B(r,c);
+		}
+	}
+	// Copies the vector V into part of a single row (imin == imax) or of a single
+	// column (jmin == jmax) of this matrix; any other block shape trips an assertion.
+	void Matrix::SetBlock( int imin, int imax, int jmin, int jmax, const Vector &V )
+	{
+		float *dst = elem + ( imin * cols + jmin );   // first element of the block
+		if( imin == imax )
+		{
+			const int count = jmax - jmin + 1;
+			assert( count == V.Size() );
+			for( int j = 0; j < count; j++ ) dst[j] = V(j);
+		}
+		else if( jmin == jmax )
+		{
+			const int count = imax - imin + 1;
+			assert( count == V.Size() );
+			for( int i = 0; i < count; i++ ) dst[ i * cols ] = V(i);
+		}
+		else
+		{
+			assert( imin == imax || jmin == jmax );   // always false here: signals the error
+		}
+	}
+	// Exchanges rows i1 and i2 of the matrix in place and returns *this, so
+	// that calls can be chained.  Neither index is range-checked; swapping a
+	// row with itself leaves the matrix unchanged.
+	Matrix &Matrix::SwapRows( int i1, int i2 )
+	{
+		float *row_a = elem + ( i1 * cols );
+		float *row_b = elem + ( i2 * cols );
+		// Each row is contiguous, so exchange the two runs element by element.
+		for( int j = 0; j < cols; j++ )
+		{
+			const float saved = row_a[j];
+			row_a[j] = row_b[j];
+			row_b[j] = saved;
+		}
+		return *this;
+	}
+	// Exchanges columns j1 and j2 of the matrix in place and returns *this, so
+	// that calls can be chained.  Neither index is range-checked; swapping a
+	// column with itself leaves the matrix unchanged.
+	Matrix &Matrix::SwapCols( int j1, int j2 )
+	{
+		for( int i = 0; i < rows; i++ )
+		{
+			// The two entries to exchange both live in row i of the storage.
+			float &a = elem[ i * cols + j1 ];
+			float &b = elem[ i * cols + j2 ];
+			const float saved = a;
+			a = b;
+			b = saved;
+		}
+		return *this;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  A S S I G N M E N T    O P E R A T O R S                               *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Matrix& Matrix::operator=( const Matrix &M )
+	{
+		// Take on M's shape, then copy its elements; assigning a matrix to itself is a harmless self-copy.
+		SetSize( M.Rows(), M.Cols() );
+		const float *src = M.Array();
+		for( int k = 0; k < rows * cols; k++ ) elem[k] = src[k];
+		return *this;
+	}
+	// Scalar assignment: sets every element to s, leaving the shape unchanged.
+	Matrix& Matrix::operator=( float s )
+	{
+		const int count = rows * cols;
+		for( int k = 0; k < count; k++ ) elem[k] = s;
+		return *this;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  O P E R A T O R S                                                      *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Vector operator*( const Matrix &M, const Vector &A )
+	{
+		// Special case: a 4x4 matrix acting on a 3-vector, with the translation taken from column 3.
+		if( M.Cols() == 4 && M.Rows() == 4 && A.Size() == 3 )
+		{
+			Vector C(3);
+			for( int i = 0; i < 3; i++ )
+				C(i) = M(i,0) * A(0) + M(i,1) * A(1) + M(i,2) * A(2) + M(i,3);
+			return C;
+		}
+		assert( M.Cols() == A.Size() );
+		const int n_rows = M.Rows();
+		const int n_cols = M.Cols();
+		Vector C( n_rows );
+		const float *a = A.Array();
+		for( int i = 0; i < n_rows; i++ )
+		{
+			const float *row = M.Array() + i * n_cols;
+			double sum = row[0] * a[0];   // the dot product is accumulated in double precision
+			for( int j = 1; j < n_cols; j++ ) sum += row[j] * a[j];
+			C(i) = sum;
+		}
+		return C;
+	}
+	// Row-vector times matrix: C(j) is the dot product of A with column j of M.
+	Vector operator*( const Vector &A, const Matrix &M )
+	{
+		assert( A.Size() == M.Rows() );
+		const int n_cols = M.Cols();
+		Vector C( n_cols );
+		const float *a = A.Array();
+		for( int j = 0; j < n_cols; j++ )
+		{
+			double sum = 0.0;   // accumulate in double precision
+			for( int i = 0; i < M.Rows(); i++ ) sum += a[i] * M(i,j);
+			C(j) = sum;
+		}
+		return C;
+	}
+	// In-place transform: replaces A with M * A (the matrix multiplies from the
+	// left, despite the operand order) and returns A.
+	Vector& operator*=( Vector &A, const Matrix &M )
+	{
+		// Special case: a 4x4 matrix acting on a 3-vector, with the translation taken from column 3.
+		if( M.Cols() == 4 && M.Rows() == 4 && A.Size() == 3 )
+		{
+			float out[3];
+			for( int i = 0; i < 3; i++ )
+				out[i] = M(i,0) * A(0) + M(i,1) * A(1) + M(i,2) * A(2) + M(i,3);
+			for( int i = 0; i < 3; i++ ) A(i) = out[i];
+			return A;
+		}
+		assert( M.Cols() == A.Size() );
+		const int n = A.Size();
+		Vector C( M.Rows() );
+		const float *m = M.Array();
+		// General case: one double-precision dot product per row of M.
+		for( int i = 0; i < M.Rows(); i++ )
+		{
+			double sum = 0.0;
+			for( int j = 0; j < n; j++ ) sum += m[ i * n + j ] * A(j);
+			C(i) = sum;
+		}
+		return A = C;
+	}
+	// Scales every element of M by s in place and returns M.
+	Matrix& operator*=( Matrix &M, float s )
+	{
+		float *m = M.Array();
+		for( int k = 0; k < M.Rows() * M.Cols(); k++ ) m[k] *= s;
+		return M;
+	}
+	// Divides every element of M by s in place and returns M; s must be non-zero.
+	Matrix& operator/=( Matrix &M, float s )
+	{
+		assert( s != 0.0 );
+		float *m = M.Array();
+		for( int k = 0; k < M.Rows() * M.Cols(); k++ ) m[k] /= s;
+		return M;
+	}
+	// Element-wise sum of two matrices, which must have identical shapes.
+	Matrix operator+( const Matrix &A, const Matrix &B )
+	{
+		assert( A.Rows() == B.Rows() );
+		assert( A.Cols() == B.Cols() );
+		Matrix C( A.Rows(), A.Cols() );
+		const float *a = A.Array();
+		const float *b = B.Array();
+		float *c = C.Array();
+		for( int k = 0; k < A.Rows() * A.Cols(); k++ ) c[k] = a[k] + b[k];
+		return C;
+	}
+	// Element-wise difference A - B of two matrices, which must have identical shapes.
+	Matrix operator-( const Matrix &A, const Matrix &B )
+	{
+		assert( A.Rows() == B.Rows() );
+		assert( A.Cols() == B.Cols() );
+		Matrix C( A.Rows(), A.Cols() );
+		const float *a = A.Array();
+		const float *b = B.Array();
+		float *c = C.Array();
+		for( int k = 0; k < A.Rows() * A.Cols(); k++ ) c[k] = a[k] - b[k];
+		return C;
+	}
+	// Unary minus: returns a matrix of the same shape as A with every element
+	// negated.
+	Matrix operator-( const Matrix &A )
+	{
+		// The result is A.Rows() x A.Cols(); building it with the dimensions
+		// swapped would hand back a transposed shape for non-square input.
+		Matrix B( A.Rows(), A.Cols() );
+		const float *a = A.Array();
+		float *b = B.Array();
+		for( int k = 0; k < A.Rows() * A.Cols(); k++ ) b[k] = -a[k];
+		return B;
+	}
+	// Adds B to A element by element, in place, and returns A; the shapes must agree.
+	Matrix& operator+=( Matrix &A, const Matrix &B )
+	{
+		assert( A.Rows() == B.Rows() );
+		assert( A.Cols() == B.Cols() );
+		float *a = A.Array();
+		const float *b = B.Array();
+		for( int k = 0; k < A.Rows() * A.Cols(); k++ ) a[k] += b[k];
+		return A;
+	}
+	// Matrix product A * B, with each dot product accumulated in double precision.
+	Matrix operator*( const Matrix &A, const Matrix &B )
+	{
+		assert( A.Cols() == B.Rows() );
+		Matrix M( A.Rows(), B.Cols() );
+		for( int i = 0; i < A.Rows(); i++ )
+			for( int j = 0; j < B.Cols(); j++ )
+			{
+				double sum = 0.0;
+				for( int k = 0; k < A.Cols(); k++ ) sum += A(i,k) * B(k,j);
+				M(i,j) = sum;
+			}
+		return M;
+	}
+	// Scalar times matrix: returns a matrix of the same shape as A with every
+	// element multiplied by s.
+	Matrix operator*( float s, const Matrix &A )
+	{
+		// The result is A.Rows() x A.Cols(); building it with the dimensions
+		// swapped would hand back a transposed shape for non-square input.
+		Matrix B( A.Rows(), A.Cols() );
+		const float *a = A.Array();
+		float *b = B.Array();
+		for( int k = 0; k < A.Rows() * A.Cols(); k++ ) b[k] = s * a[k];
+		return B;
+	}
+	// Matrix times scalar: returns a matrix of the same shape as A with every
+	// element multiplied by s.
+	Matrix operator*( const Matrix &A, float s )
+	{
+		// The result is A.Rows() x A.Cols(); building it with the dimensions
+		// swapped would hand back a transposed shape for non-square input.
+		Matrix B( A.Rows(), A.Cols() );
+		const float *a = A.Array();
+		float *b = B.Array();
+		for( int k = 0; k < A.Rows() * A.Cols(); k++ ) b[k] = s * a[k];
+		return B;
+	}
+	// Matrix divided by scalar: returns a matrix of the same shape as A with
+	// every element divided by s, which must be non-zero.
+	Matrix operator/( const Matrix &A, float s )
+	{
+		assert( s != 0.0 );
+		// The result is A.Rows() x A.Cols(); building it with the dimensions
+		// swapped would hand back a transposed shape for non-square input.
+		Matrix B( A.Rows(), A.Cols() );
+		const float *a = A.Array();
+		float *b = B.Array();
+		for( int k = 0; k < A.Rows() * A.Cols(); k++ ) b[k] = a[k] / s;
+		return B;
+	}
+	// In-place product: replaces A with A * B and returns A.  The result has
+	// A.Rows() rows and B.Cols() columns, so A changes shape when B is not
+	// square.  A.Cols() must equal B.Rows().
+	Matrix& operator*=( Matrix &A, const Matrix &B )
+	{
+		assert( A.Cols() == B.Rows() );
+		// Build the product in a temporary: each result row has B.Cols() entries,
+		// which need not fit in (or fill) the corresponding row of A.
+		Matrix P( A.Rows(), B.Cols() );
+		for( int i = 0; i < A.Rows(); i++ )
+			for( int j = 0; j < B.Cols(); j++ )
+			{
+				double sum = A(i,0) * B(0,j);   // accumulate in double precision
+				for( int k = 1; k < A.Cols(); k++ ) sum += A(i,k) * B(k,j);
+				P(i,j) = sum;
+			}
+		return A = P;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  M I S C E L L A N E O U S   F U N C T I O N S                          *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Matrix Transp( const Matrix &M )
+	{
+		Matrix T( M.Cols(), M.Rows() );   // the transpose: T(j,i) = M(i,j)
+		const float *m = M.Array();
+		for( int i = 0; i < M.Rows(); i++ )
+			for( int j = 0; j < M.Cols(); j++ ) T(j,i) = m[ i * M.Cols() + j ];
+		return T;
+	}
+	// Computes the n x n product A * Transp(A), where n = A.Rows(); entry (i,j)
+	// is the dot product of rows i and j of A, accumulated in double precision.
+	Matrix AATransp( const Matrix &A )
+	{
+		const int n = A.Rows();
+		const int m = A.Cols();
+		Matrix B( n, n );
+		for( int i = 0; i < n; i++ )
+			for( int j = 0; j < n; j++ )
+			{
+				double sum = 0.0;
+				for( int k = 0; k < m; k++ ) sum += A(i,k) * A(j,k);
+				B(i,j) = sum;
+			}
+		return B;
+	}
+	// Computes the n x n product Transp(A) * A, where n = A.Cols(); entry (i,j)
+	// is the dot product of columns i and j of A, accumulated in double precision.
+	Matrix ATranspA( const Matrix &A )
+	{
+		const int n = A.Cols();
+		const int m = A.Rows();
+		Matrix B( n, n );
+		for( int i = 0; i < n; i++ )
+			for( int j = 0; j < n; j++ )
+			{
+				double sum = 0.0;
+				for( int k = 0; k < m; k++ ) sum += A(k,i) * A(k,j);
+				B(i,j) = sum;
+			}
+		return B;
+	}
+	// Computes the outer product of the vectors A and B: the matrix with one
+	// row per element of A and one column per element of B, whose (i,j) entry
+	// is A(i) * B(j).
+	Matrix Outer( const Vector &A, const Vector &B )
+	{
+		const int n_rows = A.Size();
+		const int n_cols = B.Size();
+		Matrix M( n_rows, n_cols );
+		for( int i = 0; i < n_rows; i++ )
+			for( int j = 0; j < n_cols; j++ ) M(i,j) = A(i) * B(j);
+		return M;
+	}
+	// Computes the 1-norm of the matrix A, which is the maximum absolute
+	// column sum.  (The maximum absolute row sum would instead be the
+	// infinity norm.)
+	double OneNorm( const Matrix &A )
+	{
+		double norm = 0.0;
+		for( int j = 0; j < A.Cols(); j++ )
+		{
+			double sum = 0.0;
+			for( int i = 0; i < A.Rows(); i++ ) sum += Abs( A(i,j) );
+			if( sum > norm ) norm = sum;
+		}
+		return norm;
+	}
+	// Computes the infinity norm ("sup norm") of the matrix A, which is the
+	// maximum absolute row sum.  (The maximum absolute column sum would
+	// instead be the 1-norm.)
+	double SupNorm( const Matrix &A )
+	{
+		double norm = 0.0;
+		for( int i = 0; i < A.Rows(); i++ )
+		{
+			double sum = 0.0;
+			for( int j = 0; j < A.Cols(); j++ ) sum += Abs( A(i,j) );
+			if( sum > norm ) norm = sum;
+		}
+		return norm;
+	}
+	// Builds the square matrix having the entries of d along its diagonal; the
+	// off-diagonal entries keep the constructor's default fill.
+	Matrix Diag( const Vector &d )
+	{
+		const int n = d.Size();
+		Matrix D( n );
+		for( int k = 0; k < n; k++ ) D(k,k) = d(k);
+		return D;
+	}
+	// Returns the 3 x 3 diagonal matrix with x, y, and z as its diagonal
+	// elements, in that order from the top-left corner.  All other entries
+	// keep the constructor's default fill.
+	Matrix Diag( float x, float y, float z )
+	{
+		const float diagonal[3] = { x, y, z };
+		Matrix D(3,3);
+		// D(k,k) receives the k-th of the three arguments.
+		for( int k = 0; k < 3; k++ ) D(k,k) = diagonal[k];
+		return D;
+	}
+	// Extracts the main diagonal of M as a vector.  M need not be square:
+	// the result has as many entries as the smaller of the row and column
+	// counts.
+	Vector Diag( const Matrix &M )
+	{
+		const int count = Min( M.Rows(), M.Cols() );
+		Vector V( count );
+		for( int k = 0; k < count; k++ ) V(k) = M(k,k);
+		return V;
+	}
+	// Returns the n x n identity matrix: ones are written along the diagonal
+	// and every other entry keeps the constructor's default fill.
+	Matrix Ident( int n )
+	{
+		Matrix I( n );
+		for( int k = 0; k < n; k++ ) I(k,k) = 1.0;
+		return I;
+	}
+	// Reports (1 or 0) whether M is "Null", meaning it has no rows or no columns.
+	int Null( const Matrix &M )
+	{
+		if( M.Rows() == 0 ) return 1;
+		if( M.Cols() == 0 ) return 1;
+		return 0;
+	}
+	// Reports (1 or 0) whether M has as many rows as columns.
+	int Square( const Matrix &M )
+	{
+		return ( M.Rows() == M.Cols() ) ? 1 : 0;
+	}
+	// Converts a "vector-shaped" matrix -- one with a single row or a single
+	// column -- into a Vector holding the same entries in order.  Any other
+	// shape trips an assertion; if assertions are compiled out, a
+	// default-constructed Vector is returned instead.
+	Vector ToVector( const Matrix &M )
+	{
+		const bool one_row = ( M.Rows() == 1 );
+		const bool one_col = ( M.Cols() == 1 );
+		if( !one_row && !one_col )
+		{
+			// Report an error.
+			assert( M.Rows() == 1 || M.Cols() == 1 );
+			return Vector();
+		}
+		// The row test takes precedence, so a 1 x 1 matrix is read as a row.
+		const int count = one_row ? M.Cols() : M.Rows();
+		Vector V( count );
+		for( int k = 0; k < count; k++ )
+		{
+			if( one_row ) V(k) = M(0,k);
+			else          V(k) = M(k,0);
+		}
+		return V;
+	}
+	// Writes M one row per line, each prefixed by its row index; an empty matrix prints as "NULL".
+	std::ostream &operator<<( std::ostream &out, const Matrix &M )
+	{
+		if( M.Rows() == 0 || M.Cols() == 0 )
+		{
+			out << "NULL" << std::endl;
+			return out;
+		}
+		for( int i = 0; i < M.Rows(); i++ )
+		{
+			out << form( "%3d: ", i );
+			for( int j = 0; j < M.Cols(); j++ ) out << form( " %10.5g", M(i,j) );
+			out << std::endl;
+		}
+		return out;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* R O T A T I O N                                                         *
+	*                                                                         * 
+	* Builds a 3x3 modeling matrix that performs a rotation about an          *
+	* arbitrary axis.  The rotation is right-handed about this axis and       *
+	* "angle" is taken to be in radians.  The only error that can occur is    *
+	* when "axis" is the zero-vector.                                         *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	Matrix Rotation( const Vector &Axis, float angle )
+	{
+		// Compute a unit quaternion (a,b,c,d) that performs the rotation.
+		// t starts as TwoNormSqr( Axis ) and is then reused as the scale applied to Axis.
+		float t = TwoNormSqr( Axis );
+		if( t == 0.0 ) return Matrix(3,3);   // zero axis: hand back the default-filled 3x3 matrix
+		t = sin( angle * 0.5 ) / sqrt( t );
+
+		// Fill in the entries of the quaternion.
+
+		float a = cos( angle * 0.5 );
+		float b = t * Axis(0);
+		float c = t * Axis(1);
+		float d = t * Axis(2);
+
+		// Compute all the double products of a, b, c, and d, except a * a.
+
+		float bb = b * b;
+		float cc = c * c;
+		float dd = d * d;
+		float ab = a * b;
+		float ac = a * c;
+		float ad = a * d;
+		float bc = b * c;
+		float bd = b * d;
+		float cd = c * d;
+
+		// Fill in the entries of the rotation matrix.
+		// NOTE(review): for Axis = Z this gives R(0,1) = +sin(angle), whereas Zrotation sets M(0,1) = -s; confirm the convention.
+		Matrix R(3,3);
+
+		R(0,0) = 1.0 - 2.0 * ( cc + dd );
+		R(0,1) =       2.0 * ( bc + ad );
+		R(0,2) =       2.0 * ( bd - ac );
+
+		R(1,0) =       2.0 * ( bc - ad );
+		R(1,1) = 1.0 - 2.0 * ( bb + dd );
+		R(1,2) =       2.0 * ( cd + ab );
+
+		R(2,0) =       2.0 * ( bd + ac );
+		R(2,1) =       2.0 * ( cd - ab );
+		R(2,2) = 1.0 - 2.0 * ( bb + cc );
+
+		return R;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* R O T A T I O N                                                         *
+	*                                                                         * 
+	* Builds a 4x4 modeling matrix that performs a rotation about an          *
+	* arbitrary axis through an arbitrary point.  The rotation is             *
+	* right-handed about this axis and "angle" is taken to be in radians.     *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	Matrix Rotation( const Vector &Axis, const Vector &Origin, float angle )
+	{
+		// Rotating about an axis through the point p (= Origin) is the composite
+		// Translate( p ) * Rotate * Translate( -p ):
+		//
+		//       | I   p | | R   0 | | I  -p |   | R   p - Rp |
+		//       |       | |       | |       | = |            |
+		//       | 0   1 | | 0   1 | | 0   1 |   | 0      1   |
+		//
+		// so the upper-left 3x3 block is R and the fourth column is p - R p.
+
+		const Matrix R = Rotation( Axis, angle );    // the plain 3x3 rotation
+		const Vector shift( Origin - R * Origin );   // p - R p
+
+		// Start from the identity so that the last row is already ( 0 0 0 1 ).
+		Matrix M = Ident(4);
+		for( int i = 0; i < 3; i++ )
+		{
+			for( int j = 0; j < 3; j++ ) M(i,j) = R(i,j);
+			M(i,3) = shift(i);
+		}
+
+		return M;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* X  R O T A T I O N                                                      *
+	*                                                                         * 
+	* Builds a 3x3 modeling matrix that performs a rotation about the X-axis. *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	Matrix Xrotation( float angle )
+	{
+		const float s = sin( angle );
+		const float c = cos( angle );
+		Matrix M = Ident(3);   // row and column 0 (the X axis) stay as in the identity
+		M(1,1) = c;  M(1,2) = -s;
+		M(2,1) = s;  M(2,2) =  c;
+		return M;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Y  R O T A T I O N                                                      *
+	*                                                                         * 
+	* Builds a 3x3 modeling matrix that performs a rotation about the Y-axis. *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	Matrix Yrotation( float angle )
+	{
+		// NOTE(review): with the handedness of Xrotation/Zrotation one would expect
+		// M(0,2) = +s and M(2,0) = -s here; confirm which direction is intended.
+		const float c = cos( angle ), s = sin( angle );
+		Matrix M = Ident(3);
+		M(0,0) = c;  M(0,2) = -s;  M(2,0) = s;  M(2,2) = c;
+		return M;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Z  R O T A T I O N                                                      *
+	*                                                                         * 
+	* Builds a 3x3 modeling matrix that performs a rotation about the Z-axis. *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	Matrix Zrotation( float angle )
+	{
+		const float s = sin( angle );
+		const float c = cos( angle );
+		Matrix M = Ident(3);   // row and column 2 (the Z axis) stay as in the identity
+		M(0,0) = c;  M(0,1) = -s;
+		M(1,0) = s;  M(1,1) =  c;
+		return M;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* H O U S E H O L D E R                                                   *
+	*                                                                         * 
+	* Returns the Householder reflection matrix that reflects through the     *  
+	* plane orthogonal to V.  The vector V is not assumed to be normalized.   *  
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	Matrix Householder( const Vector &V )
+	{
+		// H = I - ( 2 / V.V ) V Transp(V).  A zero V makes the scale a division by zero.
+		const float scale = 2.0 / ( V * V );
+		return Ident( V.Size() ) - Outer( scale * V, V );
+	}
+
+	/*=========================================================================*
+	*  R O T A T I O N                Author: Jim Arvo, 1991                  *
+	*                                                                         *
+	*  This routine maps three values (x1, x2, x3) in the range [0,1] into    *
+	*  a 3x3 rotation matrix, M.  Uniformly distributed random variables      *
+	*  x1, x2, and x3 create uniformly distributed random rotation matrices.  *
+	*  To create small uniformly distributed "perturbations", supply          *
+	*  samples in the following ranges                                        *
+	*                                                                         *
+	*      x1 in [ 0, d ]                                                     *
+	*      x2 in [ 0, 1 ]                                                     *
+	*      x3 in [ 0, d ]                                                     *
+	*                                                                         *
+	* where 0 < d < 1 controls the size of the perturbation.  Any of the      *
+	* random variables may be stratified (or "jittered") for a slightly more  *
+	* even distribution.                                                      *
+	*                                                                         *
+	*=========================================================================*/
+	Matrix Rotation( float x1, float x2, float x3 )
+	{
+		Matrix M(3,3);
+		float theta = x1 * TwoPi; // Rotation about the pole (Z).
+		float phi   = x2 * TwoPi; // For direction of pole deflection.
+		float z     = x3 * 2.0;   // For magnitude of pole deflection.
+
+		// Compute a vector V used for distributing points over the sphere
+		// via the reflection I - V Transpose(V).  This formulation of V
+		// will guarantee that if x1 and x2 are uniformly distributed,
+		// the reflected points will be uniform on the sphere.  Note that V
+		// has length sqrt(2) to eliminate the 2 in the Householder matrix.
+
+		float r  = sqrt( z );          // length of the (Vx, Vy) part of V
+		float Vx = sin( phi ) * r;
+		float Vy = cos( phi ) * r;
+		float Vz = sqrt( 2.0 - z );    // so that |V|^2 = z + ( 2 - z ) = 2
+
+		// Compute the row vector S = Transpose(V) * R, where R is a simple
+		// rotation by theta about the z-axis.  No need to compute Sz since
+		// it's just Vz.
+
+		float st = sin( theta );
+		float ct = cos( theta );
+		float Sx = Vx * ct - Vy * st;
+		float Sy = Vx * st + Vy * ct;
+
+		// Construct the rotation matrix  ( V Transpose(V) - I ) R, which
+		// is equivalent to V S - R.  Here R has R(0,0) = R(1,1) = ct, R(0,1) = st, R(1,0) = -st.
+
+		M(0,0) = Vx * Sx - ct;
+		M(0,1) = Vx * Sy - st;
+		M(0,2) = Vx * Vz;
+
+		M(1,0) = Vy * Sx + st;
+		M(1,1) = Vy * Sy - ct;
+		M(1,2) = Vy * Vz;
+
+		M(2,0) = Vz * Sx;
+		M(2,1) = Vz * Sy;
+		M(2,2) = 1.0 - z;   // This equals Vz * Vz - 1.0 
+
+		return M;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* P A R T I A L   P I V O T                                               *
+	*                                                                         * 
+	* Look for the element with the largest magnitude on or below the         *
+	* diagonal in column "col" of the matrix A.  Bring this element to the    *
+	* diagonal by a row interchange.  Perform the same row interchange on b.  *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	static int PartialPivot( int col, Matrix &A, Vector &b )
+	{
+		// Find the entry of largest magnitude in column "col", looking only at
+		// the rows from the diagonal downward; the scan is bounded by the row count.
+		const int n_rows = A.Rows();
+		int   best_row = col;
+		float best_mag = Abs( A( col, col ) );
+		for( int i = col + 1; i < n_rows; i++ )
+		{
+			const float mag = Abs( A( i, col ) );
+			if( mag > best_mag ) { best_mag = mag; best_row = i; }
+		}
+		// Returns 0 when the whole sub-column is zero (no usable pivot); otherwise
+		// the pivot row is swapped into place, in A and in b alike, and 1 is returned.
+		if( best_mag == 0.0 ) return 0;
+		if( best_row != col )
+		{
+			A.SwapRows( col, best_row );
+			b.Swap    ( col, best_row );
+		}
+		return 1;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* G A U S S I A N   E L I M I N A T I O N                                 *
+	*                                                                         * 
+	* Solves the linear system A x = b using Gaussian elimination, with or    *
+	* without partial pivoting.                                               *
+	*                                                                         *  
+	*-------------------------------------------------------------------------*/
+	int GaussElimination( const Matrix &A, const Vector &b, Vector &x, pivot_type pivot )
+	{
+		// Returns 1 on success, or 0 when a zero pivot shows that the system is singular
+		// (or needs row exchanges that were not requested); x is then unspecified.
+		assert( Square( A ) );
+		assert( A.Rows() == b.Size() );
+		Matrix B( A );   // work on copies so the inputs are left untouched
+		Vector c( b );
+		x.SetSize( A.Cols() );
+		const int m = B.Rows();
+		int i, j, k;
+
+		// Forward elimination: reduce B to upper-triangular form, applying the
+		// same row operations to c.
+		for( i = 0; i < m; i++ )
+		{
+			if( pivot == pivot_partial && !PartialPivot( i, B, c ) ) return 0;
+			if( B(i,i) == 0.0 ) return 0;   // the elimination below divides by B(i,i)
+			for( j = i + 1; j < m; j++ )
+			{
+				double scale = -B(j,i) / B(i,i);
+				for( k = i; k < m; k++ )
+					B(j,k) += scale * B(i,k);
+				B(j,i) = 0.0;
+				c(j) += scale * c(i);
+			}
+		}
+
+		// Back substitution; every diagonal entry was checked to be non-zero above.
+		for( i = m - 1; i >= 0; i-- )
+		{
+			double a = 0.0;
+			for( j = i + 1; j < m; j++ ) a += B(i,j) * x(j);
+			x(i) = ( c(i) - a ) / B(i,i);
+		}
+		return 1;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  L E A S T   S Q U A R E S                                              *
+	*                                                                         *
+	* Solves the normal equations associated with the system A x = b, which   *
+	* are given by  Transp(A) A x = Transp(A) b.                              *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int LeastSquares( const Matrix &A, const Vector &b, Vector &x )
+	{
+		// Set up and solve the normal equations Transp(A) A x = Transp(A) b.
+		// Note that Transp(A) * b is computed here as b * A.
+		//
+		// The solver's status is returned instead of an unconditional 1, so
+		// that a failed solve is not reported to the caller as a success.
+		return GaussElimination( ATranspA(A), b * A, x );
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  D E T E R M I N A N T                                                  *
+	*                                                                         *
+	* Computes the determinant of the n by n matrix M using Householder       *
+	* transformations.                                                        *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	double Determinant( const Matrix &M )
+	{
+		static const float MachEps = MachineEpsilon();
+		assert( Square(M) );
+		// Reduce a copy of M to upper-triangular form with Householder reflections;
+		// the determinant is the product of the diagonal, negated once per reflection.
+		double dot;
+		int    k;
+		Matrix A    = M;    // Make a copy that we can destroy.
+		double det  = 1.0;  // Multiply diagonal elements as they are generated.
+		int    sign = 1;	// Keep track of sign (each reflection has det -1).
+		int    n    = M.Cols();
+		if( n == 0 ) return 1.0;  // Empty matrix: empty product, and A(n-1,n-1) would be out of range.
+
+		for( int i = 0; i < n - 1; i++ ) 
+		{
+			// Compute the 2-norm of the first column of the (n-i)x(n-i) submatrix.
+
+			dot = 0.0;
+			for( k = i; k < n; k++ ) dot += Sqr( A(k,i) );
+
+			double Xnorm = sqrt( dot );
+			if( Xnorm == 0.0 ) return 0.0;
+
+			// This norm is another diagonal element of the upper triangular
+			// matrix, so we multiply it into the running product for det.
+
+			det *= Xnorm;		
+
+			// If X is already of the right form, ( +Xnorm, 0, 0, ... ), we must not
+			// perform the processing because V will be zero.  The test is signed on
+			// purpose: a column ( -Xnorm, 0, 0, ... ) still needs its reflection and
+			// sign flip, or a negative diagonal entry would be counted as positive.
+			float x1   = Abs( A(i,i) );
+			float diff = Abs( Xnorm - A(i,i) );
+			if( diff < MachEps * Max( Xnorm, x1 ) ) continue;  // This column is okay as is.
+
+			// Each Householder transformation has a determinant of -1,
+			// so we must keep track of how many we apply.
+
+			sign *= -1;
+
+			// Compute the V vector, which will define the Householder
+			// transformation via  H = I - V transp(V).  Leave it in the
+			// i'th column of A.  V = sqrt(2) * Normalized( X - ( Xnorm, 0, 0,... ) ).
+
+			float scale = 1.0 / sqrt( Xnorm * Abs( A(i,i) - Xnorm ) );  // sqrt(2) / || p ||
+			A(i,i) = ( A(i,i) - Xnorm ) * scale;        
+			for( k = i + 1; k < n; k++ ) A(k,i) *= scale;
+
+			// Now apply the transformation I - V Transp(V) to all the remaining columns, 
+			// except for the first row.
+
+			for( int j = i + 1; j < n; j++ ) 
+			{
+				// Compute Y dot V.
+				dot = 0.0;
+				for( k = i; k < n; k++ ) dot += A(k,i) * A(k,j);
+
+				// Subtract V ( V dot A(*,j) ) from A(*,j), ignoring the first row.
+				for( k = i + 1; k < n; k++ ) A(k,j) -= A(k,i) * dot;
+			} // for j
+
+		} // for i
+
+		// Now multiply in the very last element of the matrix and
+		// the accumulated sign.
+
+		return det * A(n-1,n-1) * sign;
+	}	
+
+	/*-------------------------------------------------------------------------*
+	*  C O F A C T O R                                                        *
+	*                                                                         *
+	* Computes the unsigned (i,j) minor of M; the caller applies the sign.    *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	double Matrix::Cofactor( int omit_i, int omit_j ) const
+	{
+		assert( Square( *this ) );
+		assert( omit_i >= 0 && omit_j >= 0 );
+		assert( omit_i < Rows() );
+		assert( omit_j < Cols() );
+
+		// Build the submatrix that remains after striking out row omit_i and
+		// column omit_j: source indices at or past the omitted one shift by one.
+
+		const int sub_rows = Rows() - 1;
+		const int sub_cols = Cols() - 1;
+		Matrix Minor( sub_rows, sub_cols );
+		for( int r = 0; r < sub_rows; r++ )
+		{
+			const int src_r = r + ( r >= omit_i ? 1 : 0 );
+			for( int c = 0; c < sub_cols; c++ )
+			{
+				const int src_c = c + ( c >= omit_j ? 1 : 0 );
+				Minor( r, c ) = (*this)( src_r, src_c );
+			}
+		}
+		// The determinant of the submatrix is returned as is; no sign is applied.
+		return Determinant( Minor );
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  A D J O I N T                                                          *
+	*                                                                         *
+	* Computes the matrix of signed cofactors of M (it is not transposed).    *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Matrix Adjoint( const Matrix &M )
+	{
+		double unused_det;  // Required by the two-argument overload; ignored here.
+		return Adjoint( M, unused_det );
+	}
+
+	Matrix Adjoint( const Matrix &M, double &det )
+	{
+		// Returns the matrix of signed cofactors of M (not transposed) and sets
+		// det to the cofactor expansion of M along its first column.
+		int n = M.Rows();
+		det   = 0.0;
+		Matrix A( n, n );
+		assert( Square(M) );
+		if( n == 3 )
+		{
+			A(0,0) = M(1,1) * M(2,2) - M(1,2) * M(2,1);
+			A(0,1) = M(1,2) * M(2,0) - M(1,0) * M(2,2);
+			A(0,2) = M(1,0) * M(2,1) - M(1,1) * M(2,0);
+
+			A(1,0) = M(0,2) * M(2,1) - M(0,1) * M(2,2);
+			A(1,1) = M(0,0) * M(2,2) - M(0,2) * M(2,0);
+			A(1,2) = M(0,1) * M(2,0) - M(0,0) * M(2,1);
+
+			A(2,0) = M(0,1) * M(1,2) - M(0,2) * M(1,1);
+			A(2,1) = M(0,2) * M(1,0) - M(0,0) * M(1,2);
+			A(2,2) = M(0,0) * M(1,1) - M(0,1) * M(1,0);
+
+			det = A(0,0) * M(0,0) + A(1,0) * M(1,0) + A(2,0) * M(2,0);
+		}
+		else if( n == 1 ) { A(0,0) = 1.0; det = M(0,0); }  // Empty minor has determinant 1.
+		else
+		{
+			// General case: M.Cofactor yields the unsigned minor, signed here.
+			for( int i = 0; i < n; i++ )  // 'register' removed: ill-formed since C++17.
+			{
+				for( int j = 0; j < n; j++ )
+					A(i,j) = Odd( i + j ) ? -M.Cofactor(i,j) : M.Cofactor(i,j);
+				det += M(i,0) * A(i,0);
+			}
+		}
+		return A;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  I N V E R S E                                                          *
+	*                                                                         *
+	* Computes the inverse of a square matrix.                                *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Matrix Inverse( const Matrix &M )
+	{
+		assert( Square( M ) );
+		const int order = M.Cols();
+		Matrix Result( order, order );
+		Vector rhs( order ), col( order );
+		// Column c of the result solves M * col = rhs with only entry c set to 1.
+		for( int c = 0; c < order; c++ )
+		{
+			rhs(c) = 1.0;
+			GaussElimination( M, rhs, col );
+			Result.SetCol( c, col );
+			rhs(c) = 0.0;  // Switch the entry off again before the next column.
+		}
+		return Result;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  T R A C E                                                              *
+	*                                                                         *
+	* Computes the trace of a square matrix.                                  *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	extern double Trace( const Matrix &M )
+	{
+		assert( Square(M) );
+		double trace = 0.0;  // Start from zero so that an empty matrix reads nothing.
+		for( int i = 0; i < M.Cols(); i++ ) trace += M(i,i);
+		return trace;
+	}
+};
+
+
+
+/*
+
+C
+C  Subroutine GAUSS solves the system Ax = b by using Gaussian elimination.
+C
+
+SUBROUTINE GAUSS( A, B, X, LDA, N, IFLAG )
+REAL A( LDA, N ), B( N ), X( N )
+
+DO 300 I = 1 , N - 1
+I2 = I
+CALL PIVOT( A, B, LDA, N, I2, IFLAG )
+IF ( IFLAG .LT. 0 ) RETURN
+DO 200 J = I + 1 , N
+TEMP = A( J , I ) / A( I , I )
+A( J , I ) = 0.0
+B( J ) = B( J ) - TEMP * B( I )
+DO 100 K = I + 1 , N
+A( J , K ) = A( J , K ) - TEMP * A( I , K )
+100           CONTINUE
+200       CONTINUE
+300   CONTINUE
+
+X( N ) = B( N ) / A( N , N )
+DO 500 I = N - 1 , 1 , -1
+TEMP = 0.0
+DO 400 J = I + 1 , N
+TEMP = TEMP + A( I , J ) * X( J )
+400       CONTINUE
+X( I ) = ( B( I ) - TEMP ) / A( I , I )
+500   CONTINUE
+
+RETURN
+END
+
+
+
+SUBROUTINE PIVOT( A, B, LDA, N, J, IFLAG )
+REAL A( LDA, N ), B( N ), AMAX, TEMP
+DATA TOL / 1.0E-6 /
+
+IFLAG = -1
+IF ( J .GT. N ) RETURN
+IF ( J .EQ. N .AND. ABS( A(N,N) ) .LT. TOL ) RETURN
+IF ( J .EQ. N ) GO TO 40
+
+AMAX  = ABS( A( J , J ) )
+INDEX = J
+10   DO 20 I = J + 1 , N
+IF ( ABS( A( I , J ) ) .LE. AMAX ) GO TO 20
+AMAX = ABS( A( I , J ) )
+INDEX = I
+20   CONTINUE
+
+IF ( AMAX .LT. TOL ) RETURN
+
+TEMP = B( J )
+B( J ) = B( INDEX )
+B( INDEX ) = TEMP
+
+DO 30 K = 1 , N
+TEMP = A( J , K )
+A( J , K ) = A( INDEX , K )
+A( INDEX , K ) = TEMP
+30   CONTINUE
+
+40   IFLAG = 1
+RETURN
+END
+
+
+*/
+
+
+
+
+
diff --git a/src/nvtt/bc7/arvo/Matrix.h b/src/nvtt/bc7/arvo/Matrix.h
new file mode 100644
index 0000000..1832c8f
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Matrix.h
@@ -0,0 +1,142 @@
+/***************************************************************************
+* Matrix.h                                                                 *
+*                                                                          *
+* General Vector and Matrix classes, with all the associated methods.      *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    08/16/2000    Revamped for CIT tools.                       *
+*      arvo    10/31/1994    Combined RowVec & ColVec into Vector.         *
+*      arvo    06/30/1993    Added singular value decomposition class.     *
+*      arvo    06/25/1993    Major revisions.                              *
+*      arvo    09/08/1991    Initial implementation.                       *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __MATRIX_INCLUDED__
+#define __MATRIX_INCLUDED__
+
+#include <iostream>
+#include "Vector.h"
+
+namespace ArvoMath {
+
+	class Matrix {  // Matrix of floats kept row by row in a single array.
+	public:
+		Matrix( const Matrix & );
+		Matrix( int num_rows = 0, int num_cols = 0, float value = 0.0 );
+		~Matrix();
+		Matrix &operator=( const Matrix &M );
+		Matrix &operator=( float s );
+		Vector  GetCol( int col ) const;
+		Vector  GetRow( int row ) const;
+		void    SetCol( int col, const Vector & );
+		void    SetRow( int row, const Vector & );
+		Matrix  GetBlock( int imin, int imax, int jmin, int jmax ) const;
+		void    SetBlock( int imin, int imax, int jmin, int jmax, const Matrix & );
+		void    SetBlock( int imin, int imax, int jmin, int jmax, const Vector & );
+		Matrix &SwapRows( int i1, int i2 );
+		Matrix &SwapCols( int j1, int j2 );
+		void    SetSize( int rows, int cols = 0 );
+		double  Cofactor( int i, int j ) const;  // Determinant with row i, column j removed; no sign applied.
+		static  const Matrix Null;
+
+	public: // Inlined functions.
+		inline float  operator()( int i, int j ) const { return elem[ i * cols + j ]; }  // Element (i,j); no range check.
+		inline float &operator()( int i, int j )       { return elem[ i * cols + j ]; }  // Writable element (i,j); no range check.
+		inline int    Rows  () const { return rows; }
+		inline int    Cols  () const { return cols; }
+		inline float *Array () const { return elem; }  // Raw storage; writable even through a const Matrix.
+
+	private:
+		int    rows; // Number of rows in the matrix.
+		int    cols; // Number of columns in the matrix.
+		float *elem; // Pointer to the actual data.
+	};
+
+
+	extern Vector  operator *  ( const Matrix &, const Vector & );
+	extern Vector  operator *  ( const Vector &, const Matrix & );
+	extern Vector& operator *= (       Vector &, const Matrix & );
+	extern Matrix  Outer       ( const Vector &, const Vector & );  // Outer product.
+	extern Matrix  operator +  ( const Matrix &, const Matrix & );
+	extern Matrix  operator -  ( const Matrix &                 );
+	extern Matrix  operator -  ( const Matrix &, const Matrix & );
+	extern Matrix  operator *  ( const Matrix &, const Matrix & );
+	extern Matrix  operator *  ( const Matrix &,       float    );
+	extern Matrix  operator *  (       float  ,  const Matrix & );
+	extern Matrix  operator /  ( const Matrix &,       float    );
+	extern Matrix& operator += (       Matrix &, const Matrix & );
+	extern Matrix& operator *= (       Matrix &,       float    );
+	extern Matrix& operator *= (       Matrix &, const Matrix & );
+	extern Matrix& operator /= (       Matrix &,       float    );
+	extern Matrix  Ident       (       int    n );
+	extern Matrix  Householder ( const Vector & );
+	extern Matrix  Rotation    ( const Vector &Axis, float angle );
+	extern Matrix  Rotation    ( const Vector &Axis, const Vector &Origin, float angle );
+	extern Matrix  Rotation    (       float, float, float ); // For random 3D rotations.
+	extern Matrix  Xrotation   (       float    );
+	extern Matrix  Yrotation   (       float    );
+	extern Matrix  Zrotation   (       float    );
+	extern Matrix  Diag        ( const Vector & );
+	extern Vector  Diag        ( const Matrix & );
+	extern Matrix  Diag        ( float, float, float );
+	extern Matrix  Adjoint     ( const Matrix & );              // Matrix of signed cofactors (not transposed).
+	extern Matrix  Adjoint     ( const Matrix &, double &det ); // Same, and also stores the determinant in det.
+	extern Matrix  AATransp    ( const Matrix & );
+	extern Matrix  ATranspA    ( const Matrix & );
+	extern double  OneNorm     ( const Matrix & );
+	extern double  SupNorm     ( const Matrix & );
+	extern double  Determinant ( const Matrix & ); // Computed with Householder transformations.
+	extern double  Trace       ( const Matrix & ); // Sum of the diagonal elements.
+	extern Matrix  Transp      ( const Matrix & );
+	extern Matrix  Inverse     ( const Matrix & ); // Built column by column with GaussElimination.
+	extern int     Null        ( const Matrix & );
+	extern int     Square      ( const Matrix & );
+	extern Vector  ToVector    ( const Matrix & ); // Only for vector-shaped matrices.
+
+	enum pivot_type {  // Pivoting mode accepted by GaussElimination.
+		pivot_off,     // The default used by GaussElimination's declaration.
+		pivot_partial, // NOTE(review): presumably row pivoting only -- confirm in Matrix.cpp.
+		pivot_total    // NOTE(review): presumably row and column pivoting -- confirm.
+	};
+
+	extern int GaussElimination( 
+		const Matrix &A, 
+		const Vector &b, // This is the right-hand side.
+		Vector       &x, // This is the vector we are solving for.
+		pivot_type = pivot_off // Pivoting mode; pivot_off unless the caller says otherwise.
+		);
+
+	extern int LeastSquares( // Solves the normal equations Transp(A) A x = Transp(A) b.
+		const Matrix &A, 
+		const Vector &b, 
+		Vector       &x  // Receives the solution.
+		);
+
+	extern int WeightedLeastSquares( 
+		const Matrix &A, 
+		const Vector &b, 
+		const Vector &w, // NOTE(review): presumably one weight per equation -- confirm in Matrix.cpp.
+		Vector       &x 
+		);
+
+	std::ostream &operator<<( // NOTE(review): presumably writes the matrix as text -- format defined in Matrix.cpp.
+		std::ostream &out, 
+		const Matrix &
+		);
+};
+
+#endif
diff --git a/src/nvtt/bc7/arvo/Perm.cpp b/src/nvtt/bc7/arvo/Perm.cpp
new file mode 100644
index 0000000..87e98e3
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Perm.cpp
@@ -0,0 +1,503 @@
+/***************************************************************************
+* Perm.C                                                                   *
+*                                                                          *
+* This file defines a permutation class: that is, a class for creating and *
+* manipulating finite sequences of distinct integers.  The main feature    *
+* of the class is the "++" operator that can be used to step through all   *
+* N! permutations of a sequence of N integers.  As the set of permutations *
+* forms a multiplicative group, a multiplication operator and an           *
+* exponentiation operator are also defined.                                *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    07/01/93    Added the Partition class.                      *
+*      arvo    03/23/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <stdio.h>
+#include <string.h>
+#include "Perm.h"
+#include "ArvoMath.h"
+#include "Char.h"
+
+namespace ArvoMath {
+
+	/***************************************************************************
+	*                                                              
+	*  L O C A L   F U N C T I O N S
+	*
+	***************************************************************************/
+
+	// Reverses the first n entries of p in place.
+	static void Reverse( int *p, int n )
+	{
+		for( int lo = 0, hi = n - 1; lo < hi; lo++, hi-- )
+			Swap( p[lo], p[hi] );
+	}
+
+	static void Error( const char *msg )  // Reports a Perm failure on stderr; callers pass literals.
+	{
+		fprintf( stderr, "ERROR: Perm, %s.\n", msg );
+	}
+
+	/***************************************************************************
+	**
+	**  M E M B E R   F U N C T I O N S
+	**
+	***************************************************************************/
+
+	Perm::Perm( int Left, int Right )  // Identity permutation over the given range.
+	{
+		if( Left <= Right ) { a = Left;  b = Right; }  // The bounds are accepted
+		else                { a = Right; b = Left;  }  // in either order.
+		p = new int[ Size() ];
+		Reset( *this );
+	}
+
+	Perm::Perm( const Perm &Q )  // Deep copy of Q into freshly allocated storage.
+	{
+		a = Q.Min(); b = Q.Max();
+		p = new int[ Q.Size() ];
+		int k = 0;
+		while( k < Size() ) { p[k] = Q[k]; k++; }
+	}
+
+	Perm::Perm( const char *str )  // Parses text such as "2 0 1" via operator=.
+	{
+		p = NULL; a = b = 0; (*this) = str;  // operator= deletes p, so null it first.
+	}
+
+	Perm &Perm::operator=( const char *str )
+	{
+		// Replaces this permutation with the one spelled out in str: each run
+		// of decimal digits is an entry, any other character is a separator.
+		int  k, m = 0, n = 0;  // Parsed value, digits buffered, entries stored.
+		char dig[10];
+		char c;
+		if( p != NULL ) delete[] p;
+		p = new int[ strlen(str)/2 + 1 ];  // Room for one entry per two characters.
+		for(;;)
+		{
+			c = *str++;
+			if( isDigit(c) )
+			{
+				// Leave room for the terminator; this store used to overrun dig.
+				if( m < 9 ) dig[m++] = c;
+			}
+			else if( m > 0 )
+			{
+				dig[m] = NullChar;
+				sscanf( dig, "%d", &k );
+				if( n == 0 || k < a ) a = k;
+				if( n == 0 || k > b ) b = k;
+				p[n++] = k;
+				m = 0;
+			}
+			if( c == NullChar ) break;
+		}
+		for( int i = 0; i < n; i++ )  // Each of a .. a+n-1 must appear somewhere.
+		{
+			int okay = 0;
+			for( int j = 0; j < n && !okay; j++ ) okay = ( p[j] == i + a );
+			if( !okay ) { Error( "string is not a valid permutation" ); return *this; }
+		}
+		return *this;
+	}
+
+	void Perm::Get( char *str ) const  // Writes the entries as "%d " fields into str.
+	{
+		char *out = str;
+		for( int k = 0; k < Size(); k++ ) out += sprintf( out, "%d ", p[k] );
+		*out = NullChar;
+	}
+
+	int Perm::Next()  // Steps to the following permutation in place; returns 0 if there is none.
+	{
+		int i, m, k = 0;  // k: position of the first ascent, 0 while none is found.
+		int N, M = 0;
+
+		// Look for the first element of p that is larger than its predecessor.
+		// If no such element exists, we are done.
+
+		M = p[0];                      // M is always the "previous" value.
+		for( i = 1; i < Size(); i++ )  // Now start with second element.
+		{
+			if( p[i] > M ) { k = i; break; }
+			M = p[i];
+		}
+		if( k == 0 ) return 0; // Already in descending order.
+		m = k - 1;             // Default swap partner is p[k-1].
+
+		// Find the largest entry before k that is less than p[k].
+		// One exists because p[k] is bigger than M, i.e. p[k-1].
+
+		N = p[k];
+		for( i = 0; i < k - 1; i++ )
+		{
+			if( p[i] < N && p[i] > M ) { M = p[i]; m = i; }
+		}
+		Swap( p[m], p[k] ); // Entries 0..k-1 are still decreasing.
+		Reverse( p, k );    // Make first k elements increasing.
+		return 1;
+	}
+
+	int Perm::Prev()  // Steps back to the preceding permutation in place; returns 0 if there is none.
+	{
+		int i, m, k = 0;  // k: position of the first descent, 0 while none is found.
+		int N, M = 0;
+
+		// Look for the first element of p that is less than its predecessor.
+		// If no such element exists, we are done.
+
+		M = p[0];                      // M will always be the "previous" value.
+		for( i = 1; i < Size(); i++ )  // Start with the second element.
+		{
+			if( p[i] < M ) { k = i; break; }
+			M = p[i];
+		}
+		if( k == 0 ) return 0; // Already in ascending order.
+		m = k - 1;             // Default swap partner is p[k-1].
+
+		// Find the smallest entry before k that is greater than p[k].
+		// One exists because p[k] is less than M, i.e. p[k-1].
+
+		N = p[k];
+		for( i = 0; i < k - 1; i++ )
+		{
+			if( p[i] > N && p[i] < M ) { M = p[i]; m = i; }
+		}
+		Swap( p[m], p[k] ); // Entries 0..k-1 are still increasing.
+		Reverse( p, k );    // Make first k elements decreasing.
+		return 1;
+	}
+
+
+	/***************************************************************************
+	**
+	**  O P E R A T O R S
+	**
+	***************************************************************************/
+
+	int Perm::operator++()
+	{
+		return this->Next();  // 1 if the permutation advanced, 0 if it was the last.
+	}
+
+	int Perm::operator--()
+	{
+		return this->Prev();  // 1 if the permutation stepped back, 0 if it was the first.
+	}
+
+	Perm &Perm::operator+=( int n )  // Steps n permutations forward (backward if n < 0).
+	{
+		int left = n;
+		while( left > 0 && Next() ) left--;  // Stops early at the last permutation.
+		while( left < 0 && Prev() ) left++;  // Stops early at the first permutation.
+		return *this;
+	}
+
+	Perm &Perm::operator-=( int n )  // Steps n permutations backward (forward if n < 0).
+	{
+		int left = n;
+		while( left > 0 && Prev() ) left--;  // Stops early at the first permutation.
+		while( left < 0 && Next() ) left++;  // Stops early at the last permutation.
+		return *this;
+	}
+
+	int Perm::operator[]( int n ) const  // Entry at zero-based position n.
+	{
+		const bool in_range = ( n >= 0 ) && ( n < Size() );
+		if( in_range ) return p[ n ];
+
+		// Out of range: complain and hand back 0.
+		Error( "permutation index[] out of range" );
+		return 0;
+	}
+
+	int Perm::operator()( int n ) const  // Image of the value n, Min() <= n <= Max().
+	{
+		const bool in_range = ( n >= Min() ) && ( n <= Max() );
+		if( in_range ) return p[ n - Min() ];
+
+		// Out of range: complain and hand back 0.
+		Error( "permutation index() out of range" );
+		return 0;
+	}
+
+	Perm &Perm::operator=( const Perm &Q )
+	{
+		const int wanted = Q.Size();
+		if( Size() != wanted )  // The old array is kept when the length matches.
+		{
+			delete[] p;
+			p = new int[ wanted ];
+		}
+		a = Q.Min(); b = Q.Max();
+		for( int k = 0; k < Size(); k++ ) p[k] = Q[k];
+		return *this;
+	}
+
+	Perm Perm::operator*( const Perm &Q ) const  // Composition: slot k gets P applied to Q[k].
+	{
+		// Operands over different ranges yield Perm(0) instead of a product.
+		if( Min() != Q.Min() || Max() != Q.Max() ) return Perm(0);
+		Perm Product( Min(), Max() );
+		for( int k = 0; k < Size(); k++ ) Product.Elem(k) = p[ Q[k] - Min() ];
+		return Product;
+	}
+
+	Perm Perm::operator^( int n ) const
+	{
+		// Raises the permutation to the n'th power.  The old in-place scheme
+		// returned P itself for n == 0 and re-applied P, not its inverse, for
+		// n < -1; now the base (P or its inverse) is applied |n| times to each
+		// element, starting from the identity.
+		Perm Base( *this );
+		if( n < 0 ) for( int i = 0; i < Size(); i++ ) Base.Elem( p[i] - Min() ) = i + Min();
+		const int reps = ( n < 0 ) ? -n : n;
+		Perm A( Min(), Max() );
+		for( int i = 0; i < Size(); i++ )
+		{
+			int k = i + Min();
+			for( int j = 0; j < reps; j++ ) k = Base[ k - Min() ];
+			A.Elem(i) = k;
+		}
+		return A;
+	}
+
+	Perm &Perm::operator()( int i, int j )  // Exchanges the images of i and j.
+	{
+		const int base = Min();  Swap( p[ i - base ], p[ j - base ] );
+		return *this;
+	}
+
+	int Perm::operator==( const Perm &Q ) const  // 1 when range and entries all match.
+	{
+		if( Min() != Q.Min() || Max() != Q.Max() ) return 0;
+		// Walk the common prefix; equality means it covers every entry.
+		int k = 0;
+		while( k < Size() && p[k] == Q[k] ) k++;
+		return k >= Size();
+	}
+
+	int Perm::operator<=( const Perm &Q ) const  // Decided by the first differing entry.
+	{
+		if( Min() != Q.Min() || Max() != Q.Max() ) return 0;  // Different ranges.
+		int k = 0;
+		while( k < Size() && p[k] == Q[k] ) k++;  // Skip the common prefix.
+		if( k < Size() ) return p[k] < Q[k];
+		return 1;
+	}
+
+	void Reset( Perm &P )  // Restores the identity: slot k holds Min() + k.
+	{
+		for( int k = P.Size() - 1; k >= 0; k-- ) P.Elem(k) = P.Min() + k;
+	}
+
+	int End( const Perm &P )  // 1 when no entry exceeds the one before it.
+	{
+		int prev = P[0], k = 1;
+		while( k < P.Size() )
+		{
+			const int cur = P[k++];
+			if( prev < cur ) return 0;  // Found an ascent.
+			prev = cur;
+		}
+		return 1;
+	}
+
+	void Print( const Perm &P )  // Entries on one line of stdout, space separated.
+	{
+		if( P.Size() <= 0 ) return;  // Nothing is printed, not even a newline.
+
+		printf( "%d", P[0] );
+		for( int k = 1; k < P.Size(); k++ )
+			printf( " %d", P[k] );
+		printf( "\n" );
+	}
+
+	int Even( const Perm &P )  // Parity test; the complement of Odd( P ).
+	{
+		return Odd( P ) ? 0 : 1;
+	}
+
+	int Odd( const Perm &P )
+	{
+		// Sorts a copy into the identity by transpositions and counts them.
+		// Each exchange is one transposition whatever the distance j - i, so
+		// the count grows by exactly one per swap (adding j - i misreported
+		// permutations such as "2 1 0" as even).
+		int swaps = 0;
+		Perm Q( P );
+		for( int i = P.Min(); i < P.Max(); i++ )
+		{
+			if( Q(i) == i ) continue;
+			// Everything below i is already in place, so i sits further on.
+			for( int j = i + 1; j <= P.Max(); j++ )
+			{
+				if( Q(j) == i ) { Q(i,j); swaps++; break; }
+			}
+		}
+		return swaps % 2;
+	}
+
+
+	/***************************************************************************
+	**
+	**  P A R T I T I O N S
+	**
+	***************************************************************************/
+
+	Partition::Partition( )
+	{
+		// An empty partition: no bins, no balls, and no storage behind Bin.
+		bins = balls = 0;
+		Bin  = NULL;
+	}
+
+	Partition::Partition( const Partition &Q )  // Deep copy of Q's bin counts.
+	{
+		bins  = Q.Bins();
+		balls = Q.Balls();
+		Bin   = new int[ bins ];
+		for( int k = bins - 1; k >= 0; k-- ) Bin[k] = Q[k];
+	}
+
+	Partition::Partition( int bins_, int balls_ )  // All balls start out in bin 0.
+	{
+		Bin   = new int[ bins_ ];
+		bins  = bins_;
+		balls = balls_;
+		Reset( *this );  // Fills Bin from the counts stored just above.
+	}
+
+	void Partition::operator+=( int bin )  // Add a ball to this bin.
+	{
+		if( bin < 0 || bin >= bins )       // Report and refuse; never write out of bounds.
+		{ fprintf( stderr, "ERROR -- bin number out of range.\n" ); return; }
+		balls++; Bin[ bin ]++;
+	}
+
+	int Partition::operator==( const Partition &P ) const  // Compare two partitions.
+	{
+		// Totals must agree before the bins are compared one by one.
+		const bool same_shape = ( Balls() == P.Balls() ) && ( Bins() == P.Bins() );
+		if( !same_shape ) return 0;
+
+		int k = 0;
+		while( k < bins && Bin[k] == P[k] ) k++;
+		return k >= bins;
+	}
+
+	void Partition::operator=( int n )  // Set to the n'th configuration.
+	{
+		Reset( *this );                                   // Configuration zero ...
+		for( int left = n; left > 0; left-- ) ++(*this);  // ... then n steps on.
+	}
+
+	int Partition::operator!=( const Partition &P ) const
+	{
+		return ( *this == P ) ? 0 : 1;  // Exact negation of operator==.
+	}
+
+	void Partition::operator=( const Partition &Q )
+	{
+		const int wanted = Q.Bins();
+		if( bins != wanted )  // The old array is kept when the bin count matches.
+		{
+			delete[] Bin;
+			Bin = new int[ wanted ];
+		}
+		bins = wanted; balls = Q.Balls();
+		for( int k = 0; k < bins; k++ ) Bin[k] = Q[k];
+	}
+
+	void Partition::Get( char *str ) const  // Writes the counts as "%d " fields into str.
+	{
+		char *out = str;
+		for( int k = 0; k < bins; k++ ) out += sprintf( out, "%d ", Bin[k] );
+		*out = NullChar;
+	}
+
+	int Partition::operator[]( int i ) const  // Count in bin i; 0 when i is out of range.
+	{
+		const bool valid = ( i >= 0 ) && ( i < bins );
+		return valid ? Bin[i] : 0;
+	}
+
+	long Partition::NumCombinations() const  // How many distinct configurations.
+	{
+		// Placing n balls in k bins is the same as arranging the n balls
+		// together with k - 1 dividers.  Were all n + k - 1 objects
+		// distinguishable there would be (n + k - 1)! orderings; since balls
+		// are interchangeable and so are dividers, that is divided by
+		// n! (k - 1)!, leaving the binomial coefficient ( n + k - 1, n ).
+		//
+		if( balls == 0 ) return 0;
+		if( bins  == 1 ) return 1;
+		const double exact = BinomialCoeff( balls + bins - 1, balls );
+		return (long)floor( exact + 0.5 );  // Round to the nearest integer.
+	}
+
+	/***************************************************************************
+	*  O P E R A T O R + +   (Next Partition)                                  *
+	*                                                                          *
+	*  Rearranges the n "balls" in k "bins" into the next configuration.       *
+	*  The first config is assumed to be all balls in the first bin -- i.e.    *
+	*  Bin[0].  All possible groupings are generated, each exactly once.  The  *
+	*  function returns 1 if successful, 0 if the last config has already been *
+	*  reached.  (Algorithm by Harold Zatz)                                    *
+	*                                                                          *
+	***************************************************************************/
+	int Partition::operator++()
+	{
+		// With fewer than two bins there is no next config (and no Bin[1]).
+		if( bins < 2 ) return 0;
+		if( Bin[0] > 0 )
+		{
+			Bin[1] += 1;
+			Bin[0] -= 1;
+			return 1;
+		}
+		int i = 1;  // First non-empty bin; bounded so an all-empty Bin cannot overrun.
+		while( i < bins - 1 && Bin[i] == 0 ) i++;
+		if( i == bins - 1 ) return 0;  // Only the last bin is left to hold anything.
+		Bin[i+1] += 1;
+		Bin[0] = Bin[i] - 1;
+		Bin[i] = 0;
+		return 1;
+	}
+
+	void Reset( Partition &P )  // Every ball goes back into bin 0.
+	{
+		// With no bins nothing is written (Bin is null after default construction).
+		for( int i = 0; i < P.Bins(); i++ ) P.Bin[i] = ( i == 0 ) ? P.Balls() : 0;
+	}
+
+	int End( const Partition &P )  // 1 once the last bin holds every ball.
+	{
+		return ( P.Balls() == P[ P.Bins() - 1 ] ) ? 1 : 0;
+	}
+
+	void Print( const Partition &P )  // Bin counts on one line of stdout, space separated.
+	{
+		if( P.Bins() <= 0 ) return;  // Nothing is printed, not even a newline.
+
+		printf( "%d", P[0] );
+		for( int k = 1; k < P.Bins(); k++ )
+			printf( " %d", P[k] );
+		printf( "\n" );
+	}
+};
diff --git a/src/nvtt/bc7/arvo/Perm.h b/src/nvtt/bc7/arvo/Perm.h
new file mode 100644
index 0000000..2af4776
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Perm.h
@@ -0,0 +1,111 @@
+/***************************************************************************
+* Perm.h                                                                   *
+*                                                                          *
+* This file defines a permutation class: that is, a class for creating and *
+* manipulating finite sequences of distinct integers.  The main feature    *
+* of the class is the "++" operator that can be used to step through all   *
+* N! permutations of a sequence of N integers.  As the set of permutations *
+* forms a multiplicative group, a multiplication operator and an           *
+* exponentiation operator are also defined.                                *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    07/01/93    Added the Partition class.                      *
+*      arvo    03/23/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __PERM_INCLUDED__
+#define __PERM_INCLUDED__
+
+namespace ArvoMath {
+
+	class Perm {                                // A permutation of the integers a...b.
+	public:
+		Perm( const Perm & );                   // Initialize from a permutation.
+		Perm( int a = 0, int b = 0 );           // Create permutation of ints a...b.
+		Perm( const char * );                   // Create from string of numbers.
+		~Perm() { delete[] p; }                 // p is used as an array: array delete.
+		void  Get( char * ) const;              // Gets a string representation.
+		int   Size() const { return b - a + 1;} // The number of elements.
+		int   Min () const { return a; }        // The smallest value.
+		int   Max () const { return b; }        // The largest value.
+		int   operator++();                     // Make "next" permutation.
+		int   operator--();                     // Make "previous" permutation.
+		Perm &operator+=( int n );              // Advances by n permutations.
+		Perm &operator-=( int n );              // Decrement by n permutations.
+		Perm &operator =( const char * ) ;      // Resets from string of numbers.
+		Perm &operator =( const Perm & ) ;      // Copy from another permutation.
+		Perm &operator()( int i, int j ) ;      // Swap entries i and j.
+		int   operator()( int n        ) const; // Index from Min() to Max().
+		int   operator[]( int n        ) const; // Index from 0 to Size() - 1.
+		Perm  operator ^( int n        ) const; // Exponentiation: -1 means inverse.
+		Perm  operator *( const Perm & ) const; // Multiplication means composition.
+		int   operator==( const Perm & ) const; // True if all elements match.
+		int   operator<=( const Perm & ) const; // Lexicographic order relation.
+	private:
+		int& Elem( int i ) { return p[i]; }     // Unchecked reference to entry i of p.
+		int  Next();                            // Presumably the worker behind ++ (see Perm.cpp).
+		int  Prev();                            // Presumably the worker behind -- (see Perm.cpp).
+		int  a, b;                              // Smallest and largest value (Min(), Max()).
+		int  *p;                                // The entries; released by the destructor.
+		friend void Reset( Perm & );
+	};
+
+
+	// A "Partition" is a collection of k indistinguishable "balls" in n "bins".  
+	// The Partition class encapsulates this notion and provides a convenient means 
+	// of generating all possible partitions of k objects among n bins exactly once.  
+	// Starting with all objects in bin zero, the ++ operator creates new and distinct
+	// distributions among the bins until all objects are in the last bin.
+
+	class Partition {                              // Counts of balls held in each bin.
+	public:
+		Partition( );                              // Creates a null partition.
+		Partition( const Partition & );            // Initialize from another partition.
+		Partition( int bins, int balls );          // Specify # of bins & balls.
+		~Partition() { delete[] Bin; }             // Destructor; Bin is an array.
+		void Get( char * ) const;                  // Gets a string representation.
+		int  Bins () const { return bins;  }       // The number of bins.
+		int  Balls() const { return balls; }       // The number of balls.
+		void operator+=( int bin );                // Add a ball to this bin.
+		void operator =( int n   );                // Set to the n'th configuration.
+		void operator =( const Partition& );       // Copy from another partition.
+		int  operator==( const Partition& ) const; // Compare two partitions.
+		int  operator!=( const Partition& ) const; // Compare two partitions.
+		int  operator++();                         // Make "next" partition.
+		int  operator[]( int i ) const;            // Return # of balls in bin i.
+		long NumCombinations() const;              // Number of distinct configurations.
+	private:
+		int  bins;                                 // Value returned by Bins().
+		int  balls;                                // Value returned by Balls().
+		int* Bin;                                  // Per-bin counts, indexed from 0.
+		friend void Reset( Partition & );          // Writes Bin directly.
+	};
+
+
+	// Predicates for determining when a permutation or partition is the last of
+	// the sequence, functions for printing, resetting, and miscellaneous operations.
+
+	extern int  End  ( const Partition & );  // True if all balls are in the last bin.
+	extern int  End  ( const Perm      & );  // True if descending.
+	extern int  Even ( const Perm      & );  // True if even # of 2-cycles.
+	extern int  Odd  ( const Perm      & );  // True if odd # of 2-cycles.
+	extern void Print( const Partition & );  // Write the bin counts to standard out.
+	extern void Print( const Perm      & );  // Write to standard out.
+	extern void Reset(       Partition & );  // Reset to all balls in bin 0.
+	extern void Reset(       Perm      & );  // Reset to ascending order.
+};
+#endif
diff --git a/src/nvtt/bc7/arvo/Rand.cpp b/src/nvtt/bc7/arvo/Rand.cpp
new file mode 100644
index 0000000..5f3025b
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Rand.cpp
@@ -0,0 +1,230 @@
+/***************************************************************************
+* Rand.C  (Random Number Generators)                                       *
+*                                                                          *
+* Source file for pseudo-random number utilities.  Rand is the             *
+* base class for several different algorithms for generating pseudo-random *
+* numbers.  Any method can generate individual samples or arrays of        *
+* samples using "Eval".  The random seed can be reset at any time by       *
+* calling "Seed" with any integer.  Random permutations of the integers    *
+* 0,1,...(n-1) are generated by "Perm(n,P)".                               *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    08/04/97    Changed to virtual functions.                   *
+*      arvo    06/06/93    Optimization, especially for array evaluators.  *
+*      arvo    10/06/91    Converted to C++                                *
+*      arvo    11/20/89    Added "gen_seed" function to handle.            *
+*      arvo    10/30/89    "state" allocation now done in rand_alloc.      *
+*      arvo    07/08/89    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1989, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <stdio.h>
+#include <math.h>
+#include "Rand.h"
+
+namespace ArvoMath {
+#ifndef ABS                              // Leave an already-defined ABS macro alone.
+#define ABS( x ) ((x) > 0 ? (x) : -(x))  // Absolute value; the argument is evaluated twice.
+#endif
+
+	/*-------------------------------------------------------------------------*
+	* M E T H O D 1                                                           *
+	*                                                                         *
+	* From "Numerical Recipes," by William H. Press, Brian P. Flannery,       *
+	* Saul A. Teukolsky, and William T. Vetterling, p. 197.                   *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	static const long   M1 = 714025,         // Modulus of the congruential step.
+	                    IA =   1366,         // Multiplier of the congruential step.
+	                    IC = 150889;         // Increment of the congruential step.
+	static const double RM = 1.400512E-6;    // Scales a table entry to a float; roughly 1 / M1.
+
+	float RandGen_1::Eval()                  // Returns the next sample.
+	{
+		// The previous output selects one of the shuffle-table slots 1..97.
+		long slot = 1 + ( 97 * index ) / M1;
+		if( slot > 97 ) slot = 97;
+		if( slot <  1 ) slot =  1;
+		long &entry = shuffle[ slot ];
+		index = entry;                       // The chosen entry becomes the output ...
+		const float sample = index * RM;     // ... once it has been scaled by RM.
+		seed  = ( IA * seed + IC ) % M1;     // Advance the congruential sequence and
+		entry = seed;                        // refill the slot that was just used.
+		return sample;
+	}
+
+	void RandGen_1::Eval( int n, float *array )
+	{
+		// Stores n samples in array[0..n-1], each produced by the same step as
+		// the single-sample Eval().  Nothing is written when n <= 0.
+		for( int done = 0; done < n; done++ ) 
+		{
+			long slot = 1 + ( 97 * index ) / M1;    // Slot picked by the last output.
+			if( slot > 97 ) slot = 97;
+			if( slot <  1 ) slot =  1;
+			index = shuffle[ slot ];
+			array[ done ] = index * RM;
+			seed = ( IA * seed + IC ) % M1;         // Advance the sequence ...
+			shuffle[ slot ] = seed;                 // ... and refill the used slot.
+		}
+	}
+
+	void RandGen_1::Seed( long seed )
+	{
+		long t = ( IC + ABS( seed ) + 1 ) % M1;   // Scramble the caller's value.
+		for( int k = 1; k <= 97; k++ )            // Fill shuffle[1..97] from the sequence.
+		{
+			t = ( IA * t + IC ) % M1;
+			shuffle[k] = ABS( t );
+		}
+		t = ( IA * t + IC ) % M1;
+		this->seed = ABS( t );   // "this->" is required: the parameter hides the member.
+		index      = ABS( t );
+	}
+
+	/*-------------------------------------------------------------------------*
+	* M E T H O D 2                                                           *
+	*                                                                         *
+	* From "The Multiple Prime Random Number Generator," by Alexander Haas,   *
+	* ACM Transactions on Mathematical Software, Vol. 13, No. 4, December     *
+	* 1987, pp. 368-381.                                                      *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	float RandGen_2::Eval()
+	{
+		m += 7;      if( m >=   9973 ) m -=  9871;   // Each counter moves on by a fixed
+		i += 1907;   if( i >=  99991 ) i -= 89989;   // stride and is pulled back once it
+		j += 73939;  if( j >= 224729 ) j -= 96233;   // reaches its limit.
+		const long mixed = ( r * m + i + j ) % 100000;
+		return ( r = mixed / 10 ) * 1.00010001E-4;   // New r, scaled, is the sample.
+	}
+
+	void RandGen_2::Eval( int n, float *array )   // Stores n samples in array[0..n-1].
+	{
+		for( int k = 0; k < n; k++ )              // Nothing is written when n <= 0.
+		{
+			m += 7;      if( m >=   9973 ) m -=  9871;
+			i += 1907;   if( i >=  99991 ) i -= 89989;
+			j += 73939;  if( j >= 224729 ) j -= 96233;
+			r = ( ( r * m + i + j ) % 100000 ) / 10;
+			array[k] = r * 1.00010001E-4;
+		}
+	}
+
+	void RandGen_2::Seed( long seed )
+	{
+		// The four state words start from the magnitudes of seed, 7 * seed,
+		// 11 * seed and 13 * seed.  Each of the three counters is then raised
+		// by a fixed amount if it starts out below that amount.
+		r = ABS( seed      );
+		m = ABS( seed *  7 );  if( m < 100    ) m += 100;
+		i = ABS( seed * 11 );  if( i < 10000  ) i += 10000;
+		j = ABS( seed * 13 );  if( j < 128000 ) j += 128000;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* M E T H O D 3                                                           *
+	*                                                                         *
+	* From "A More Portable Fortran Random Number Generator," by Linus        *
+	* Schrage, ACM Transactions on Mathematical Software, Vol. 5, No. 2,      *
+	* June 1979, pp. 132-138.                                                 *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	// Constants of the generator below: A3 (7^5) multiplies ix, P3 is 2^31 - 1.
+	static const long A3 = 16807, P3 = 2147483647;
+
+	float RandGen_3::Eval()
+	{
+		// The product ix * A3 is assembled from the 16-bit halves of ix.
+		const long high    = ix >> 16;
+		const long lowProd = ( ix & 0xFFFF ) * A3;
+		const long carry   = lowProd >> 16;
+		const long upper   = high * A3 + carry;
+		const long k       = upper >> 15;
+		ix = ( ((lowProd - (carry << 16)) - P3) + ((upper - (k << 15)) << 16) ) + k;
+		if( ix < 0 ) ix += P3;               // Bring the state back to a non-negative value.
+		return ix * 4.656612875E-10;         // The scale factor is roughly 2^-31.
+	}
+
+	void RandGen_3::Eval( int n, float *array )   // Stores n samples in array[0..n-1].
+	{
+		for( int done = 0; done < n; done++ )     // Nothing is written when n <= 0.
+		{
+			// Same update as the single-sample Eval(): ix * A3 is assembled
+			// from the 16-bit halves of ix.
+			const long high    = ix >> 16;
+			const long lowProd = ( ix & 0xFFFF ) * A3;
+			const long carry   = lowProd >> 16;
+			const long upper   = high * A3 + carry;
+			const long k       = upper >> 15;
+			ix = ( ((lowProd - (carry << 16)) - P3) +
+				((upper - (k << 15)) << 16) ) + k;
+			if( ix < 0 ) ix += P3;
+			array[ done ] = ix * 4.656612875E-10;
+		}
+	}
+
+	void RandGen_3::Seed( long seed )   // State 0 (mod P3) makes Eval() return 0 forever.
+	{
+		if( ( ix = ABS( seed ) % P3 ) == 0 ) ix = 1;   // So such seeds start from 1 instead.
+	}
+
+	/*-------------------------------------------------------------------------*
+	* R A N D : : P E R M        (Permutation)                                *
+	*                                                                         *
+	* This routine fills an integer array of length "len" with a random       *
+	* permutation of the integers 0, 1, 2, ... (len-1).                       *
+	*                                                                         *
+	* For efficiency, the random numbers are generated in batches of up to    *
+	* "Nmax" at a time.  The constant Nmax can be set to any value >= 1.      *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	static const int Nmax = 20;       // Largest batch of samples drawn at once.
+
+	// Fills perm[0..len-1] with a random permutation of 0..len-1, drawing its
+	// samples in batches through Eval( int, float* ).
+	void RandGen::Perm( int len, int perm[] )
+	{
+		float batch[ Nmax ];          // Holding area for freshly drawn samples.
+		int   owed   = len - 1;       // Samples still to be requested from Eval.
+		int   filled = 0;             // Number of valid entries in batch.
+		int   used   = 0;             // Entries of batch consumed so far.
+
+		// Begin with the identity permutation.
+
+		for( int j = 0; j < len; j++ ) perm[j] = j;
+
+		// Walk down from the back: position "top" trades places with a randomly
+		// chosen position in front of it, or stays where it is.
+
+		for( int top = len - 1; top > 0; top-- )
+		{
+			if( used == filled )          // Batch exhausted: draw another one.
+			{
+				filled = ( owed < Nmax ) ? owed : Nmax;
+				Eval( filled, batch );
+				owed -= filled;
+				used  = 0;
+			}
+			const float scaled = ( top + 1 ) * batch[ used++ ];  // A float in [0,top+1].
+			const int   pick   = (int)scaled;                    // Truncated to an index.
+			if( pick >= top ) continue;   // pick == top or top+1: leave this entry alone.
+			const int held = perm[top];
+			perm[top]  = perm[pick];
+			perm[pick] = held;
+		}
+	}
+};
diff --git a/src/nvtt/bc7/arvo/Rand.h b/src/nvtt/bc7/arvo/Rand.h
new file mode 100644
index 0000000..a8ef5d9
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Rand.h
@@ -0,0 +1,114 @@
+/***************************************************************************
+* Rand.h  (Random Number Generators)                                       *
+*                                                                          *
+* Header file for Rand.C, pseudo-random number utilities.  Rand is the     *
+* base class for several different algorithms for generating pseudo-random *
+* numbers.  Any method can generate individual samples or arrays of        *
+* samples using "Eval".  The random seed can be reset at any time by       *
+* calling "Seed" with any integer.  Random permutations of the integers    *
+* 0,1,...(n-1) are generated by "Perm(n,P)".                               *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    08/04/97    Changed to virtual functions.                   *
+*      arvo    06/06/93    Optimization, especially for array evaluators.  *
+*      arvo    10/06/91    Converted to C++                                *
+*      arvo    11/20/89    Added "gen_seed" function to handle.            *
+*      arvo    10/30/89    "state" allocation now done in rand_alloc.      *
+*      arvo    07/08/89    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1989, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __RAND_INCLUDED__
+#define __RAND_INCLUDED__
+
+namespace ArvoMath {
+
+	// Base class for random number generators.  This class contains
+	// several pure virtual functions, so it cannot be instanced directly.
+
+	class RandGen {
+	public:
+		RandGen() {}
+		virtual ~RandGen() {}                       // Allows deletion through a RandGen*.
+		virtual float Eval(                  ) = 0; // Returns one sample.
+		virtual void  Eval( int n, float x[] ) = 0; // Stores n samples in x.
+		virtual void  Seed( long seed        ) = 0; // Re-seeds the generator.
+		void  Perm( int n, int P[] );               // Random permutation of 0..n-1 in P.
+		float Interval( float a, float b );         // Sample scaled between a and b.
+		void  Eval( float &x ) { x = Eval(); }      // Stores one sample in x.
+	};
+
+
+	// Method 1: From "Numerical Recipes," by William H. Press, Brian P. 
+	// Flannery, Saul A. Teukolsky, and William T. Vetterling, p. 197.
+
+	class RandGen_1 : public RandGen {
+	public:
+		RandGen_1(           ) { Seed( 1    ); }   // Default seed is 1.
+		RandGen_1( long seed ) { Seed( seed ); }   // Start from the given seed.
+		virtual float Eval(                  );    // Returns one sample.
+		virtual void  Eval( int n, float x[] );    // Stores n samples in x[0..n-1].
+		virtual void  Seed( long seed        );    // Refills the shuffle table from seed.
+	private: 
+		long index;          // Most recent table entry returned; selects the next slot.
+		long seed;           // Current value of the congruential sequence.
+		long shuffle[ 98 ];  // Shuffle table; Seed() and Eval() touch slots 1..97 only.
+	};
+
+
+	// Method 2: From "The Multiple Prime Random Number Generator," by 
+	// Alexander Haas, ACM Transactions on Mathematical Software, 
+	// Vol. 13, No. 4, December 1987, pp. 368-381.
+
+	class RandGen_2 : public RandGen {
+	public:
+		RandGen_2(           ) { Seed( 1    ); }   // Default seed is 1.
+		RandGen_2( long seed ) { Seed( seed ); }   // Start from the given seed.
+		virtual float Eval(                  );    // Returns one sample.
+		virtual void  Eval( int n, float x[] );    // Stores n samples in x[0..n-1].
+		virtual void  Seed( long seed        );    // Derives r, m, i and j from seed.
+	private: 
+		long r;   // Running value; Eval() returns it scaled by 1.00010001E-4.
+		long m;   // Counter advanced by 7 per sample.
+		long i;   // Counter advanced by 1907 per sample.
+		long j;   // Counter advanced by 73939 per sample.
+	};
+
+
+	// Method 3: From "A More Portable Fortran Random Number Generator," 
+	// by Linus Schrage, ACM Transactions on Mathematical Software, 
+	// Vol. 5, No. 2, June 1979, pp. 132-138.
+
+	class RandGen_3 : public RandGen {
+	public:
+		RandGen_3(           ) { Seed( 1    ); }   // Default seed is 1.
+		RandGen_3( long seed ) { Seed( seed ); }   // Start from the given seed.
+		virtual float Eval(                  );    // Returns one sample.
+		virtual void  Eval( int n, float x[] );    // Stores n samples in x[0..n-1].
+		virtual void  Seed( long seed        );    // Sets ix from the magnitude of seed.
+	private:
+		long ix;   // Generator state; Eval() returns it scaled by 4.656612875E-10.
+	};
+
+
+	inline float RandGen::Interval( float a, float b )
+	{
+		const float low  = ( a < b ) ? a : b;   // Start of the range (b unless a < b).
+		const float high = ( a < b ) ? b : a;   // Other end of the range.
+		return low + Eval() * ( high - low );   // One sample stretched over the range.
+	}
+};
+#endif
diff --git a/src/nvtt/bc7/arvo/SI_units.h b/src/nvtt/bc7/arvo/SI_units.h
new file mode 100644
index 0000000..69cc8cc
--- /dev/null
+++ b/src/nvtt/bc7/arvo/SI_units.h
@@ -0,0 +1,232 @@
+/*****************************************************************************
+** 
+**   MODULE NAME  SI_units.h       International System of Units (SI)
+**
+**   DESCRIPTION
+**       The purpose of this header file is to provide a simple and efficient
+**       mechanism for associating physically meaningful units with floating
+**       point numbers.  No extra space is required, and no runtime overhead
+**       is introduced; all type-checking occurs at compile time.
+**
+**
+**   HISTORY
+**      Name	Date	    Description
+**
+**      arvo    02/09/92    Replaced conversion macros with inline functions.
+**      arvo    10/16/91    Initial implementation.
+**
+**
+**   (c) Copyright 1991, 1992
+**       Program of Computer Graphics, Cornell University, Ithaca, NY
+**       ALL RIGHTS RESERVED
+**
+*****************************************************************************/
+
+#ifndef SI_UNITS_H
+#define SI_UNITS_H
+
+#include <iostream.h>
+
+namespace ArvoMath {
+
+	// Decimal scale factors named after the SI prefixes.
+	const float SI_deci  = 1.0E-1;
+	const float SI_centi = 1.0E-2;
+	const float SI_milli = 1.0E-3;
+	const float SI_micro = 1.0E-6;
+	const float SI_nano  = 1.0E-9;
+	const float SI_kilo  = 1.0E+3;
+	const float SI_mega  = 1.0E+6;
+	const float SI_giga  = 1.0E+9;
+	const float SI_tera  = 1.0E+12;
+
+	/*******************************************************************************
+	*                                                                              *
+	*   I N T E R N A T I O N A L    S Y S T E M    O F    U N I T S               *
+	*                                                                              *
+	********************************************************************************
+	*                                                                              *
+	* DIMENSION           CLASS           INITIALIZER     SYMBOL   BASE UNITS      *
+	*                                                                              *
+	* length              SI_length        meter            m        m             *
+	* time                SI_time          second           s        s             *
+	* mass                SI_mass          kilogram         kg       kg            *
+	* angle               SI_angle         radian           rad      rad           *
+	* solid angle         SI_solid_angle   steradian        sr       sr            *
+	* temperature         SI_temperature   kelvin           K        K             *
+	* luminous intensity  SI_lum_inten     candela          cd       cd            *
+	* area                SI_area          meter2           m2       m2            *
+	* volume              SI_volume        meter3           m3       m3            *
+	* frequency           SI_frequency     hertz            Hz       1/s           *
+	* force               SI_force         newton           N        m kg/s2       *
+	* energy              SI_energy        joule            J        m2 kg/s2      *
+	* power               SI_power         watt             W        m2 kg/s3      *
+	* radiance            SI_radiance      watts_per_m2sr   W/m2sr   kg/(s3 sr)    *
+	* irradiance          SI_irradiance    watts_per_m2     W/m2     kg/s3         *
+	* radiant intensity   SI_rad_inten     watts_per_sr     W/sr     m2 kg/(s3 sr) *
+	* luminance           SI_luminance     candela_per_m2   cd/m2    cd/m2         *
+	* illuminance         SI_illuminance   lux              lx       cd sr/m2      *
+	* luminous flux       SI_lum_flux      lumen            lm       cd sr         *
+	* luminous energy     SI_lum_energy    talbot           tb       cd sr s       *
+	*                                                                              *
+	*******************************************************************************/
+
+	class SI_dimensionless {           // Base of every quantity: stores the bare number.
+	public:
+		float Value() const { return value; }       // The stored number, without its unit.
+		ostream& Put( ostream &s, const char *a ) const { return s << value << " " << a; } // value, blank, a
+	protected:
+		SI_dimensionless() { value = 0; }           // Default quantity is zero.
+		SI_dimensionless( float x ){ value = x; }
+		float value;
+	};
+
+	/*******************************************************************************
+	* SI_Make( C, Initializer, Symbol ) makes a quantity type: a struct C derived  *
+	* from SI_dimensionless with scaling, sum/difference and comparison operators, *
+	* an inline Initializer( float ) returning a C, and an operator<< that prints  *
+	* the value followed by Symbol.  The assignment operators return a new C by    *
+	* value, not a reference.  Not intended to be used outside of this file.       *
+	*******************************************************************************/
+#define SI_Make( C, Initializer, Symbol )                                  \
+	struct C : SI_dimensionless {                                          \
+	C                 (         ) : SI_dimensionless(   ) {};          \
+	C                 ( float x ) : SI_dimensionless( x ) {};          \
+	C     operator *  ( float x ) { return C( value *  x         ); }  \
+	C     operator /  ( float x ) { return C( value /  x         ); }  \
+	C     operator /= ( float x ) { return C( value /= x         ); }  \
+	C     operator *= ( float x ) { return C( value *= x         ); }  \
+	C     operator +  ( C     x ) { return C( value +  x.Value() ); }  \
+	C     operator -  (         ) { return C(-value              ); }  \
+	C     operator -  ( C     x ) { return C( value -  x.Value() ); }  \
+	C     operator += ( C     x ) { return C( value += x.Value() ); }  \
+	C     operator -= ( C     x ) { return C( value -= x.Value() ); }  \
+	C     operator =  ( C     x ) { return C( value =  x.Value() ); }  \
+	int   operator >  ( C     x ) { return  ( value >  x.Value() ); }  \
+	int   operator <  ( C     x ) { return  ( value <  x.Value() ); }  \
+	int   operator >= ( C     x ) { return  ( value >= x.Value() ); }  \
+	int   operator <= ( C     x ) { return  ( value <= x.Value() ); }  \
+	float operator /  ( C     x ) { return  ( value /  x.Value() ); }  \
+	};                                                                 \
+	inline ostream& operator<<(ostream &s, C x) {return x.Put(s,Symbol);}  \
+	inline C Initializer( float x      )   { return C( x );             }  \
+	inline C operator * ( float x, C y )   { return C( x * y.Value() ); }
+
+	/*******************************************************************************
+	* The following macros define permissible arithmetic operations among          *
+	* variables with different physical meanings.  This ensures that the           *
+	* result of any such operation is ALWAYS another meaningful quantity.          *
+	*******************************************************************************/
+
+#define SI_Square( A, B )          /* A * A gives B, and B / A gives A. */    \
+	inline B operator*( A lhs, A rhs ) { return B( lhs.Value() * rhs.Value() ); }  \
+	inline A operator/( B num, A den ) { return A( num.Value() / den.Value() ); }
+
+#define SI_Recip( A, B )    /* float / A gives B and back; A * B is a float. */ \
+	inline B operator/( float num, A den ) { return B( num / den.Value() ); }      \
+	inline A operator/( float num, B den ) { return A( num / den.Value() ); }      \
+	inline float operator*( A lhs, B rhs ) { return lhs.Value() * rhs.Value(); }   \
+	inline float operator*( B lhs, A rhs ) { return lhs.Value() * rhs.Value(); }
+
+#define SI_Times( A, B, C )   /* A * B (either order) gives C; C / one gives the other. */ \
+	inline C operator*( A lhs, B rhs ) { return C( lhs.Value() * rhs.Value() ); }  \
+	inline C operator*( B lhs, A rhs ) { return C( lhs.Value() * rhs.Value() ); }  \
+	inline A operator/( C num, B den ) { return A( num.Value() / den.Value() ); }  \
+	inline B operator/( C num, A den ) { return B( num.Value() / den.Value() ); }
+
+	/*******************************************************************************
+	* The following macros create classes for a variety of quantities.  These      *
+	* include base quantities such as "time" and "length" as well as derived       *
+	* quantities such as "power" and "volume".  Each quantity is provided with     *
+	* an initialization function in SI units and an abbreviation for printing.     *
+	*******************************************************************************/
+
+	SI_Make( SI_length         , meter           , "m"      ); // Base Units:
+	SI_Make( SI_mass           , kilogram        , "kg"     );
+	SI_Make( SI_time           , second          , "s"      );
+	SI_Make( SI_lum_inten      , candela         , "cd"     );
+	SI_Make( SI_temperature    , kelvin          , "K"      );
+	SI_Make( SI_angle          , radian          , "rad"    ); // Supplementary:
+	SI_Make( SI_solid_angle    , steradian       , "sr"     );
+	SI_Make( SI_area           , meter2          , "m2"     ); // Derived units:
+	SI_Make( SI_volume         , meter3          , "m3"     ); 
+	SI_Make( SI_frequency      , hertz           , "Hz"     ); 
+	SI_Make( SI_force          , newton          , "N"      );
+	SI_Make( SI_energy         , joule           , "J"      );
+	SI_Make( SI_power          , watt            , "W"      );
+	SI_Make( SI_radiance       , watts_per_m2sr  , "W/m2sr" );
+	SI_Make( SI_irradiance     , watts_per_m2    , "W/m2"   );
+	SI_Make( SI_rad_inten      , watts_per_sr    , "W/sr"   );
+	SI_Make( SI_luminance      , candela_per_m2  , "cd/m2"  );
+	SI_Make( SI_illuminance    , lux             , "lx"     );
+	SI_Make( SI_lum_flux       , lumen           , "lm"     );
+	SI_Make( SI_lum_energy     , talbot          , "tb"     );
+	SI_Make( SI_time2          , second2         , "s2"     ); // Intermediate results of the relations below:
+	SI_Make( SI_sa_area        , meter2_sr       , "m2sr"   );
+	SI_Make( SI_inv_area       , inv_meter2      , "1/m2"   ); 
+	SI_Make( SI_inv_solid_angle, inv_steradian   , "1/sr"   );
+	SI_Make( SI_length_temp    , meters_kelvin   , "m K"    );
+	SI_Make( SI_power_area     , watts_m2        , "W m2"   );
+	SI_Make( SI_power_per_volume, watts_per_m3   , "W/m3"   ); // Not used by any relation below.
+
+	SI_Square( SI_length       , SI_area            );   // m * m = m2
+	SI_Square( SI_time         , SI_time2           );   // s * s = s2
+	SI_Recip ( SI_time         , SI_frequency       );   // 1 / s = Hz
+	SI_Recip ( SI_area         , SI_inv_area        );   // 1 / m2 = 1/m2
+	SI_Recip ( SI_solid_angle  , SI_inv_solid_angle );   // 1 / sr = 1/sr
+
+	SI_Times( SI_area          , SI_length         , SI_volume      );   // m2 * m = m3
+	SI_Times( SI_force         , SI_length         , SI_energy      );   // N * m = J
+	SI_Times( SI_power         , SI_time           , SI_energy      );   // W * s = J
+	SI_Times( SI_lum_flux      , SI_time           , SI_lum_energy  );   // lm * s = tb
+	SI_Times( SI_lum_inten     , SI_solid_angle    , SI_lum_flux    );   // cd * sr = lm
+	SI_Times( SI_radiance      , SI_solid_angle    , SI_irradiance  );   // W/m2sr * sr = W/m2
+	SI_Times( SI_rad_inten     , SI_solid_angle    , SI_power       );   // W/sr * sr = W
+	SI_Times( SI_irradiance    , SI_area           , SI_power       );   // W/m2 * m2 = W
+	SI_Times( SI_illuminance   , SI_area           , SI_lum_flux    );   // lx * m2 = lm
+	SI_Times( SI_solid_angle   , SI_area           , SI_sa_area     );   // sr * m2 = m2sr
+	SI_Times( SI_radiance      , SI_sa_area        , SI_power       );   // W/m2sr * m2sr = W
+	SI_Times( SI_irradiance    , SI_inv_solid_angle, SI_radiance    );   // W/m2 * 1/sr = W/m2sr
+	SI_Times( SI_power         , SI_inv_solid_angle, SI_rad_inten   );   // W * 1/sr = W/sr
+	SI_Times( SI_length        , SI_temperature    , SI_length_temp );   // m * K = m K
+	SI_Times( SI_power         , SI_area           , SI_power_area  );   // W * m2 = W m2
+
+	/*******************************************************************************
+	* Following are some useful non-SI units.  These units can be used in place of *
+	* the unit-initializers above.  Thus, a variable of type SI_length, for example*
+	* may be initialized in "meters", "inches", or "centimeters".  In all cases,   *
+	* however, the value is converted to the underlying SI unit (e.g. meters).     *
+	*******************************************************************************/
+
+#define SI_Convert( SI, New, Old ) inline SI New( float x ) { return x * Old; }  // New( x ) is x times Old.
+
+	SI_Convert( SI_time        , minute     ,         second(     60.0 ) );
+	SI_Convert( SI_time        , hour       ,         minute(     60.0 ) );
+	SI_Convert( SI_force       , dyne       ,         newton(   1.0E-5 ) );
+	SI_Convert( SI_energy      , erg        ,          joule(   1.0E-7 ) );
+	SI_Convert( SI_power       , kilowatt   ,           watt(  SI_kilo ) );
+	SI_Convert( SI_mass        , gram       ,       kilogram( SI_milli ) );
+	SI_Convert( SI_length      , inch       ,          meter(  2.54E-2 ) );
+	SI_Convert( SI_length      , foot       ,           inch(     12.0 ) );
+	SI_Convert( SI_length      , centimeter ,          meter( SI_centi ) );
+	SI_Convert( SI_length      , micron     ,          meter( SI_micro ) );
+	SI_Convert( SI_length      , angstrom   ,          meter(  1.0E-10 ) );
+	SI_Convert( SI_area        , barn       ,         meter2(  1.0E-28 ) );
+	SI_Convert( SI_angle       , degree     ,         radian( 0.0174532925 ) ); // pi / 180.
+	SI_Convert( SI_illuminance , phot       ,            lux(   1.0E+4 ) );
+	SI_Convert( SI_illuminance , footcandle ,            lux( 10.76391 ) ); // 1 lm/ft2; 1 ft2 = 0.09290304 m2.
+	SI_Convert( SI_luminance   , stilb      , candela_per_m2(   1.0E+4 ) );
+
+	/*******************************************************************************
+	* Often there are multiple names for a single quantity.  Below are some        *
+	* synonyms for the quantities defined above.  These can be used in place of    *
+	* the original quantities and may be clearer in some contexts.                 *
+	*******************************************************************************/
+
+	typedef SI_power       SI_radiant_flux;          // Printed with "W".
+	typedef SI_irradiance  SI_radiant_flux_density;  // Printed with "W/m2".
+	typedef SI_irradiance  SI_radiant_exitance;      // Printed with "W/m2".
+	typedef SI_radiance    SI_intensity;             // Printed with "W/m2sr".
+	typedef SI_irradiance  SI_radiosity;             // Printed with "W/m2".
+};
+#endif
\ No newline at end of file
diff --git a/src/nvtt/bc7/arvo/SVD.cpp b/src/nvtt/bc7/arvo/SVD.cpp
new file mode 100644
index 0000000..36f0ea6
--- /dev/null
+++ b/src/nvtt/bc7/arvo/SVD.cpp
@@ -0,0 +1,398 @@
+/***************************************************************************
+* SVD.C                                                                    *
+*                                                                          *
+* Singular Value Decomposition.                                            *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date          Description                                   *
+*                                                                          *
+*      arvo    08/22/2000    Copied to CIT library.                        *
+*      arvo    06/28/1993    Rewritten from "Numerical Recipes" C-code.    *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <math.h>
+#include <assert.h>
+#include "ArvoMath.h"
+#include "Vector.h"
+#include "Matrix.h"
+#include "SVD.h"
+
+namespace ArvoMath {
+	static const int MaxIterations = 30;  // Iteration limit per singular value in SVD::Decompose.
+
+	// Returns sqrt( a*a + b*b ).  The smaller magnitude is divided by the larger
+	// one before squaring, so the squared ratio never exceeds 1.
+	static double svd_pythag( double a, double b )
+	{
+		const double mag_a = Abs(a);
+		const double mag_b = Abs(b);
+		if( mag_a > mag_b ) return mag_a * sqrt( 1.0 + Sqr( mag_b / mag_a ) );
+		if( mag_b > 0.0   ) return mag_b * sqrt( 1.0 + Sqr( mag_a / mag_b ) );
+		return 0.0;
+	}
+
+	// Returns |a| when b >= 0 and -|a| otherwise, i.e. the magnitude of a
+	// combined with the sign of b.
+	static inline double SameSign( double a, double b )
+	{
+		const double magnitude = Abs( a );
+		return ( b >= 0.0 ) ? magnitude : -magnitude;
+	}
+
+	// Counts the diagonal entries of D whose magnitude is greater than epsilon.
+	static int ComputeRank( const Matrix &D, double epsilon )
+	{
+		int count = 0;
+		for( int r = 0; r < D.Rows(); ++r ) count += ( Abs(D(r,r)) > epsilon ) ? 1 : 0;
+		return count;
+	}
+
+	// Default constructor: Q_, D_ and R_ are built as Matrix(0), the error flag is
+	// cleared (it used to be left uninitialized), and nothing is decomposed.
+	SVD::SVD( ) : Q_(0), D_(0), R_(0), error(0) {}
+
+	// Performs the decomposition of M by delegating to operator=( const Matrix & ).
+	SVD::SVD( const Matrix &M ) : Q_(0), D_(0), R_(0), error(0)
+	{   (*this) = M;
+	}
+
+	void SVD::operator=( const Matrix &A )  // Performs the decomposition of A.
+	{
+		if( A.Rows() >= A.Cols() ) Q_ = A;
+		else  // Fewer rows than columns: copy A into a fresh Matrix( A.Cols() ).
+		{
+			Q_ = Matrix( A.Cols() );
+			for( int i = 0; i < A.Rows(); i++ )
+				for( int j = 0; j < A.Cols(); j++ ) Q_(i,j) = A(i,j);
+		}
+		R_ = Matrix( A.Cols() );  P_ = Matrix();  // P_ was cached for the previous matrix; drop it.
+		error = !Decompose( Q_, D_, R_ );         // Decompose returns 0 when it fails.
+	}
+
+	// Returns the stored Q_ factor as-is.  epsilon is accepted but has no effect:
+	// the rank that used to be computed from it here was never used.
+	const Matrix &SVD::Q( double epsilon ) const
+	{
+		(void) epsilon;  return Q_;
+	}
+
+	// Returns the stored D_ factor as-is.  epsilon is accepted but has no effect:
+	// the rank that used to be computed from it here was never used.
+	const Matrix &SVD::D( double epsilon ) const
+	{
+		(void) epsilon;  return D_;
+	}
+
+	// Returns the stored R_ factor as-is.  epsilon is accepted but has no effect:
+	// the rank that used to be computed from it here was never used.
+	const Matrix &SVD::R( double epsilon ) const
+	{
+		(void) epsilon;  return R_;
+	}
+
+	// Returns how many diagonal entries of D_ have magnitude greater than
+	// epsilon, as counted by ComputeRank.
+	int SVD::Rank( double epsilon ) const
+	{ const int rank = ComputeRank( D_, epsilon ); return rank; }
+
+	int SVD::Decompose( Matrix &Q, Matrix &D, Matrix &R )  // Returns 1 on success, 0 if the iteration does not converge.
+	{
+		int    i, j, k, l, m, n, p, q, iter;
+		double c, f, h, s, x, y, z;
+		double norm  = 0.0;
+		double g     = 0.0;
+		double scale = 0.0;
+		// Q arrives holding the matrix to decompose and is overwritten in place; R and D are outputs.
+		m = Q.Rows();
+		n = Q.Cols();
+		// Temp and diag are length-n work vectors; diag ends up holding the singular values.
+		Vector Temp( n );
+		Vector diag( n );
+		// Pass 1: sweep i over the columns, rewriting Q and filling diag(i) and Temp(i).
+		for( i = 0; i < n; i++ ) 
+		{
+			// Temp(i) receives the scale * g left over from the previous iteration (0 when i == 0).
+			Temp(i) = scale * g;
+			scale   = 0.0;
+			g       = 0.0;
+			s       = 0.0;
+			l       = i + 1;
+			// Column step: operates on rows i..m-1 of column i, then updates columns l..n-1.
+			if( i < m )
+			{
+				for( k = i; k < m; k++ ) scale += Abs( Q(k,i) );
+				if( scale != 0.0 ) 
+				{
+					for( k = i; k < m; k++ ) 
+					{
+						Q(k,i) /= scale;
+						s += Sqr( Q(k,i) );
+					}
+					f = Q(i,i);
+					g = -SameSign( sqrt(s), f );
+					h = f * g - s;
+					Q(i,i) = f - g;
+					if( i != n - 1 )
+					{
+						for( j = l; j < n; j++ ) 
+						{
+							s = 0.0;
+							for( k = i; k < m; k++ ) s += Q(k,i) * Q(k,j);
+							f = s / h;
+							for( k = i; k < m; k++ ) Q(k,j) += f * Q(k,i);
+						}
+					}
+					for( k = i; k < m; k++ ) Q(k,i) *= scale;
+				}
+			}
+			// Record diag(i), then repeat the same construction along row i (columns l..n-1).
+			diag(i) = scale * g;
+			g       = 0.0;
+			s       = 0.0;
+			scale   = 0.0;
+
+			if( i < m && i != n - 1 ) 
+			{
+				for( k = l; k < n; k++ ) scale += Abs( Q(i,k) );
+				if( scale != 0.0 ) 
+				{
+					for( k = l; k < n; k++ ) 
+					{
+						Q(i,k) /= scale;
+						s += Sqr( Q(i,k) );
+					}
+					f = Q(i,l);
+					g = -SameSign( sqrt(s), f );
+					h = f * g - s;
+					Q(i,l) = f - g;
+					for( k = l; k < n; k++ ) Temp(k) = Q(i,k) / h;
+					if( i != m - 1 ) 
+					{
+						for( j = l; j < m; j++ ) 
+						{
+							s = 0.0;
+							for( k = l; k < n; k++ ) s += Q(j,k) * Q(i,k);
+							for( k = l; k < n; k++ ) Q(j,k) += s * Temp(k);
+						}
+					}
+					for( k = l; k < n; k++ ) Q(i,k) *= scale;
+				}
+			}
+			norm = Max( norm, Abs( diag(i) ) + Abs( Temp(i) ) );  // Largest |diag(i)| + |Temp(i)|; the reference magnitude for the tests in pass 4.
+		}
+
+		// Pass 2: fill R from i = n-1 down to 0; g and l are set at the bottom of each iteration for the next one.
+		for( i = n - 1; i >= 0; i-- ) 
+		{
+			if( i < n - 1 ) 
+			{
+				if( g != 0.0 ) 
+				{
+					for( j = l; j < n; j++ ) R(i,j) = ( Q(i,j) / Q(i,l) ) / g;
+					for( j = l; j < n; j++ ) 
+					{
+						s = 0.0;
+						for( k = l; k < n; k++ ) s += Q(i,k) * R(j,k);
+						for( k = l; k < n; k++ ) R(j,k) += s * R(i,k);
+					}
+				}
+				for( j = l; j < n; j++ ) 
+				{
+					R(i,j) = 0.0;
+					R(j,i) = 0.0;
+				}
+			}
+			R(i,i) = 1.0;
+			g = Temp(i);
+			l = i;
+		}
+
+		// Pass 3: finish Q in place, again walking i from n-1 down to 0 and using diag(i).
+		for( i = n - 1; i >= 0; i-- ) 
+		{
+			l = i + 1;
+			g = diag(i);
+			if( i < n - 1 ) for( j = l; j < n; j++ ) Q(i,j) = 0.0;
+			if( g != 0.0 ) 
+			{
+				g = 1.0 / g;
+				if( i != n - 1 ) 
+				{
+					for( j = l; j < n; j++ ) 
+					{
+						s = 0.0;
+						for( k = l; k < m; k++ ) s += Q(k,i) * Q(k,j);
+						f = ( s / Q(i,i) ) * g;
+						for( k = i; k < m; k++ ) Q(k,j) += f * Q(k,i);
+					}
+				}
+				for( j = i; j < m; j++ ) Q(j,i) *= g;
+			} 
+			else 
+			{
+				for( j = i; j < m; j++ ) Q(j,i) = 0.0;
+			}
+			Q(i,i) += 1.0;
+		}
+
+		// Pass 4: iterate on each diag(k), k = n-1..0, using at most MaxIterations rounds apiece.
+		for( k = n - 1; k >= 0; k-- ) 
+		{
+			for( iter = 1; iter <= MaxIterations; iter++ ) 
+			{
+				int jump;  // NOTE(review): only assigned inside the search loop below; relies on that loop always breaking.
+				// Search downward from k for a negligible Temp(l) (jump = 1) or a negligible diag(l-1) (jump = 0).
+				for( l = k; l >= 0; l-- )
+				{
+					q = l - 1;
+					if( Abs( Temp(l) ) + norm == norm ) { jump = 1; break; }
+					if( Abs( diag(q) ) + norm == norm ) { jump = 0; break; }  // NOTE(review): q is -1 when l == 0; depends on the Temp(l) test breaking first.
+				}
+				// jump == 0: apply rotations to columns q and i of Q while updating Temp(i) and diag(i).
+				if( !jump )
+				{
+					c = 0.0;
+					s = 1.0;
+					for( i = l; i <= k; i++ )
+					{
+						f = s * Temp(i);
+						Temp(i) *= c;
+						if( Abs( f ) + norm == norm ) break;
+						g = diag(i);
+						h = svd_pythag( f, g );
+						diag(i) = h;
+						h = 1.0 / h;
+						c = g * h;
+						s = -f * h;
+						for( j = 0; j < m; j++ ) 
+						{
+							y = Q(j,q);
+							z = Q(j,i);
+							Q(j,q) = y * c + z * s;
+							Q(j,i) = z * c - y * s;
+						}
+					}
+				}
+				// l == k: this value is finished; make it non-negative (negating row k of R) and move on.
+				z = diag(k);
+				if( l == k ) 
+				{
+					if( z < 0.0 ) 
+					{
+						diag(k) = -z;
+						for( j = 0; j < n; j++ ) R(k,j) *= -1.0; 
+					}
+					break;
+				}
+				if( iter >= MaxIterations ) return 0;  // Still not finished after the last allowed round.
+				x = diag(l);  // x, y, z, g, h below are combined into the value f that starts the sweep.
+				q = k - 1;
+				y = diag(q);
+				g = Temp(q);
+				h = Temp(k);
+				f = ( ( y - z ) * ( y + z ) + ( g - h ) * ( g + h ) ) / ( 2.0 * h * y );
+				g = svd_pythag( f, 1.0 );
+				f = ( ( x - z ) * ( x + z ) + h * ( ( y / ( f + SameSign( g, f ) ) ) - h ) ) / x;
+				c = 1.0;
+				s = 1.0;
+				for( j = l; j <= q; j++ )  // Sweep j = l..k-1, rotating rows j, j+1 of R and columns j, j+1 of Q.
+				{
+					i = j + 1;
+					g = Temp(i);
+					y = diag(i);
+					h = s * g;
+					g = c * g;
+					z = svd_pythag( f, h );
+					Temp(j) = z;
+					c = f / z;
+					s = h / z;
+					f = x * c + g * s;
+					g = g * c - x * s;
+					h = y * s;
+					y = y * c;
+					for( p = 0; p < n; p++ ) 
+					{
+						x = R(j,p);
+						z = R(i,p);
+						R(j,p) = x * c + z * s;
+						R(i,p) = z * c - x * s;
+					}
+					z = svd_pythag( f, h );
+					diag(j) = z;
+					if( z != 0.0 ) 
+					{
+						z = 1.0 / z;
+						c = f * z;
+						s = h * z;
+					}
+					f = c * g + s * y;
+					x = c * y - s * g;
+					for( p = 0; p < m; p++ ) 
+					{
+						y = Q(p,j);
+						z = Q(p,i);
+						Q(p,j) = y * c + z * s;
+						Q(p,i) = z * c - y * s;
+					}
+				}
+				Temp(l) = 0.0;
+				Temp(k) = f;
+				diag(k) = x;
+			}
+		}
+
+		// Sort the singular values into descending order.
+		// (Selection sort: each swap moves a column of Q, a row of R and an entry of diag together.)
+		for( i = 0; i < n - 1; i++ )
+		{
+			double biggest = diag(i);  // Biggest singular value so far.
+			int    bindex  = i;        // The row/col it occurred in.
+			for( j = i + 1; j < n; j++ )
+			{
+				if( diag(j) > biggest ) 
+				{
+					biggest = diag(j);
+					bindex  = j;
+				}            
+			}
+			if( bindex != i )  // Need to swap rows and columns.
+			{
+				Q.SwapCols( i, bindex );  // Swap columns in Q.
+				R.SwapRows( i, bindex );  // Swap rows in R.
+				diag.Swap ( i, bindex );  // Swap elements in diag.
+			}
+		}
+
+		D = Diag( diag );  // D receives Diag( diag ), built from the sorted values.
+		return 1;
+	}
+
+
+	// Builds P_ = Q_ * D' * R_ on first use and returns the cached P_ afterwards.  D' is
+	// D_ with each diagonal entry larger than epsilon in magnitude inverted, the rest zeroed.
+	// NOTE(review): once cached, later calls ignore their epsilon; the factor order is also worth confirming.
+	const Matrix &SVD::PseudoInverse( double epsilon )
+	{
+		if( !Null(P_) ) return P_;
+		Matrix Dinv( D_ );
+		for( int k = 0; k < Dinv.Rows(); ++k )
+		{
+			const bool keep = Abs( Dinv(k,k) ) > epsilon;
+			Dinv(k,k) = keep ? 1.0 / Dinv(k,k) : 0.0;
+		}
+		P_ = Q_ * Dinv * R_;
+		return P_;
+	}
+};
diff --git a/src/nvtt/bc7/arvo/SVD.h b/src/nvtt/bc7/arvo/SVD.h
new file mode 100644
index 0000000..d6bf850
--- /dev/null
+++ b/src/nvtt/bc7/arvo/SVD.h
@@ -0,0 +1,54 @@
+/***************************************************************************
+* SVD.h                                                                    *
+*                                                                          *
+* Singular Value Decomposition.                                            *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date          Description                                   *
+*                                                                          *
+*      arvo    08/22/2000    Split off from Matrix.h                       *
+*      arvo    06/28/1993    Rewritten from "Numerical Recipes" C-code.    *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __SVD_INCLUDED__
+#define __SVD_INCLUDED__
+
+#include "Vector.h"
+#include "Matrix.h"
+
+namespace ArvoMath {
+
+	class SVD {  // Decomposes a Matrix and hands out the resulting factors Q, D and R.
+	public:
+		SVD( );
+		// Copying uses the implicit member-wise copy; the SVD( const SVD & ) formerly declared here had no definition in SVD.cpp.
+		SVD( const Matrix & );  // Performs the decomposition.
+		~SVD() {}
+		const Matrix &Q( double epsilon = 0.0 ) const;
+		const Matrix &D( double epsilon = 0.0 ) const;
+		const Matrix &R( double epsilon = 0.0 ) const;
+		const Matrix &PseudoInverse( double epsilon = 0.0 );  // Computed on first call, then cached in P_.
+		int   Rank( double epsilon = 0.0 ) const;              // Diagonal entries of D_ larger than epsilon in magnitude.
+		void  operator=( const Matrix & );  // Performs the decomposition.
+	private:
+		int Decompose( Matrix &Q, Matrix &D, Matrix &R );      // Returns 1 on success, 0 on failure.
+		Matrix Q_;
+		Matrix D_;
+		Matrix R_;
+		Matrix P_; // Pseudo inverse.
+		int    error;  // Status flag (nonzero presumably means a failed decomposition -- confirm).
+	};
+};
+#endif
diff --git a/src/nvtt/bc7/arvo/SphTri.cpp b/src/nvtt/bc7/arvo/SphTri.cpp
new file mode 100644
index 0000000..40de956
--- /dev/null
+++ b/src/nvtt/bc7/arvo/SphTri.cpp
@@ -0,0 +1,292 @@
+/***************************************************************************
+* SphTri.C                                                                 *
+*                                                                          *
+* This file defines the SphericalTriangle class definition, which          *
+* supports member functions for Monte Carlo sampling, point containment,   *
+* and other basic operations on spherical triangles.                       *
+*                                                                          *
+*   Changes:                                                               *
+*     01/01/2000  arvo  Added New_{Alpha,Beta,Gamma} methods.              *
+*     12/30/1999  arvo  Added VecIrrad method for "Vector Irradiance".     *
+*     04/08/1995  arvo  Further optimized sampling algorithm.              *
+*     10/11/1994  arvo  Added analytic sampling algorithm.                 *
+*     06/14/1994  arvo  Initial implementation.                            *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1995, 2000, James Arvo                                     *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <iostream>
+#include <math.h>
+#include "SphTri.h"
+#include "form.h"
+namespace ArvoMath {
+	/*-------------------------------------------------------------------------*
+	* Constructor                                                             *
+	*                                                                         *
+	* Builds the triangle whose vertices lie along A0, B0 and C0.  All of     *
+	* the work is delegated to Init, which normalizes the three vectors.      *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	SphericalTriangle::SphericalTriangle( const Vec3 &A0, const Vec3 &B0, const Vec3 &C0 )
+	{
+		this->Init( A0, B0, C0 );
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Init                                                                    *
+	*                                                                         *
+	* Set up the spherical triangle from three vertices on a sphere that is   *
+	* centered at the origin.  A0, B0 and C0 are normalized here, so they     *
+	* may be passed in with any non-zero length.                              *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	void SphericalTriangle::Init( const Vec3 &A0, const Vec3 &B0, const Vec3 &C0 )
+	{
+		// Vertices: unit-length versions of the three inputs.  Everything
+		// below is derived from these.
+		A_ = Unit( A0 );
+		B_ = Unit( B0 );
+		C_ = Unit( C0 );
+
+		// Edge a (between B and C): cosine first, then the arc length.
+		cos_a = B_ * C_;
+		a_    = ArcCos( cos_a );
+
+		// Edge b (between A and C).
+		cos_b = A_ * C_;
+		b_    = ArcCos( cos_b );
+
+		// Edge c (between A and B).
+		cos_c = A_ * B_;
+		c_    = ArcCos( cos_c );
+
+		// Internal (dihedral) angle alpha at vertex A: cosine, then the angle.
+		cos_alpha = CosDihedralAngle( C_, A_, B_ );
+		alpha     = ArcCos( cos_alpha );
+
+		// Internal (dihedral) angle beta at vertex B.
+		cos_beta = CosDihedralAngle( A_, B_, C_ );
+		beta     = ArcCos( cos_beta );
+
+		// Internal (dihedral) angle gamma at vertex C.
+		cos_gamma = CosDihedralAngle( A_, C_, B_ );
+		gamma     = ArcCos( cos_gamma );
+
+		// Solid angle of the spherical triangle: the sum of the three
+		// internal angles minus Pi.
+		area = alpha + beta + gamma - Pi;
+
+		// Orientation of the triangle: the sign of A_ * ( B_ ^ C_ ),
+		// as reported by Sign.
+		orient = Sign( A_ * ( B_ ^ C_ ) );
+
+		// Three values cached for the sampling code in Chart.
+		// U lies in the plane of AC and is orthogonal to A.
+		U         = Unit( C_ / A_ );
+		sin_alpha = sin( alpha );
+		product   = sin_alpha * cos_c;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Init                                                                    *
+	*                                                                         *
+	* Reset every field to zero, giving a null spherical triangle.            *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	void SphericalTriangle::Init()
+	{
+		// Fields are grouped by kind; each one is assigned 0 exactly once.
+		A_ = 0;  B_ = 0;  C_ = 0;  U = 0;               // vertices and temp vector
+		a_ = 0;  b_ = 0;  c_ = 0;                        // edge lengths
+		cos_a = 0;  cos_b = 0;  cos_c = 0;               // cosines of the edge lengths
+		alpha = 0;  beta = 0;  gamma = 0;                // angles
+		cos_alpha = 0;  cos_beta = 0;  cos_gamma = 0;    // cosines of the angles
+		area = 0;  sin_alpha = 0;  product = 0;          // solid angle and sampling values
+		orient = 0;                                      // orientation
+	}
+
+	/*-------------------------------------------------------------------------*
+	* "( A, B, C )" operator.                                                 *
+	*                                                                         *
+	* Re-initialize this triangle from three new vertices and return a        *
+	* reference to it.  The sphere is assumed to be centered at the origin    *
+	* and the vectors A, B, and C need not be normalized.                     *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	SphericalTriangle &
+	SphericalTriangle::operator()( const Vec3 &A0, const Vec3 &B0, const Vec3 &C0 )
+	{
+		// Same work as the three-vector constructor.
+		this->Init( A0, B0, C0 );
+		// Hand back the object itself so the call can be used in an expression.
+		return *this;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Inside                                                                  *
+	*                                                                         *
+	* Test whether the direction W falls inside the triangle.  W may have     *
+	* any length.  Returns 1 if it is inside and 0 otherwise.                 *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int SphericalTriangle::Inside( const Vec3 &W ) const
+	{
+		Vec3 Z = Orient() * W;
+		const bool outside = ( Z * ( A() ^ B() ) < 0.0 )
+		                  || ( Z * ( B() ^ C() ) < 0.0 )
+		                  || ( Z * ( C() ^ A() ) < 0.0 );
+		return outside ? 0 : 1;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Chart                                                                   *
+	*                                                                         *
+	* Generate samples from the current spherical triangle.  If x1 and x2 are *
+	* random variables uniformly distributed over [0,1], then the returned    *
+	* points are uniformly distributed over the solid angle.                  *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Vec3 SphericalTriangle::Chart( float x1, float x2 ) const
+	{
+		// Use one random variable to select the area of the sub-triangle.
+		// Save the sine and cosine of the angle phi.
+		// (No "register" on these locals: it is deprecated in C++11 and ill-formed in C++17.)
+		const float phi = x1 * area - Alpha();
+		const float s   = sin( phi );
+		const float t   = cos( phi );
+
+		// Compute the pair (u,v) that determines the new angle beta.
+
+		const float u = t - cos_alpha;
+		const float v = s + product  ;  // sin_alpha * cos_c
+
+		// Compute the cosine of the new edge b.
+
+		float q = ( cos_alpha * ( v * t - u * s ) - v ) / 
+			( sin_alpha * ( u * t + v * s )     );
+
+		// Compute the third vertex of the sub-triangle.
+
+		Vec3 C_new = q * A() + Sqrt( 1.0 - q * q ) * U;
+
+		// Use the other random variable to select the height z.
+
+		float z = 1.0 - x2 * ( 1.0 - C_new * B() );
+
+		// Construct the corresponding point on the sphere.
+
+		Vec3 D = C_new / B();  // Remove B component of C_new.
+		return z * B() + Sqrt( ( 1.0 - z * z ) / ( D * D ) ) * D;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Coord                                                                   *
+	*                                                                         *
+	* Map a point of the spherical triangle back to the pair (x1,x2) that     *
+	* "Chart" would turn into it; each coordinate is clamped to [0,1].        *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Vec2 SphericalTriangle::Coord( const Vec3 &P1 ) const
+	{
+		Vec3 dir = Unit( P1 );
+
+		// Find the replacement for vertex C: it lies both on the arc through
+		// B and the point, and on the arc through A and C.
+
+		Vec3 pivot = Unit( ( B() ^ dir ) ^ ( C() ^ A() ) );
+
+		// Pick the sign that puts the new vertex on the arc between A and C.
+
+		if( pivot * ( A() + C() ) < 0.0 ) pivot = -pivot;
+
+		// First coordinate: area of the sub-triangle over the full area.
+
+		float cosB = CosDihedralAngle( A(), B(), pivot  );
+		float cosC = CosDihedralAngle( A(), pivot , B() );
+		float part = Alpha() + acos( cosB ) + acos( cosC ) - Pi;
+		float u    = part / SolidAngle();
+
+		// Second coordinate: derived from the height of the point along B.
+
+		float height = dir * B();
+		float v      = ( 1.0 - height ) / ( 1.0 - pivot * B() );
+
+		if( u < 0.0 ) u = 0.0; else if( u > 1.0 ) u = 1.0;
+		if( v < 0.0 ) v = 0.0; else if( v > 1.0 ) v = 1.0;
+		return Vec2( u, v );
+	}
+
+	/*-------------------------------------------------------------------------*
+	* Dual                                                                    *
+	*                                                                         *
+	* Build the dual of this triangle, itself a spherical triangle.           *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	SphericalTriangle SphericalTriangle::Dual() const
+	{
+		Vec3 pole_A = B() ^ C(), pole_B = A() ^ C(), pole_C = A() ^ B();
+		if( pole_A * A() < 0.0 ) pole_A *= -1.0;  // Flip each pole that points away
+		if( pole_B * B() < 0.0 ) pole_B *= -1.0;  // from its matching vertex.
+		if( pole_C * C() < 0.0 ) pole_C *= -1.0;
+		return SphericalTriangle( pole_A, pole_B, pole_C );
+	}
+
+	/*-------------------------------------------------------------------------*
+	* VecIrrad                                                                *
+	*                                                                         *
+	* Return the "vector irradiance" due to a light source of unit brightness *
+	* whose spherical projection is this spherical triangle.  The negative of *
+	* this vector dotted with the surface normal gives the (scalar)           *
+	* irradiance at the origin.                                               *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Vec3 SphericalTriangle::VecIrrad() const
+	{
+		// NOTE(review): if Orient() is +/-1, as SignedSolidAngle suggests, the flip below fires for either sign -- confirm.
+		Vec3 Phi = a() * Unit( B() ^ C() );   // Edge a times the unit normal of plane BC,
+		Phi = Phi + b() * Unit( C() ^ A() );  // plus edge b times the unit normal of plane CA,
+		Phi = Phi + c() * Unit( A() ^ B() );  // plus edge c times the unit normal of plane AB.
+		if( Orient() ) Phi *= -1.0;
+		return Phi;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* New_Alpha                                                               *
+	*                                                                         *
+	* Returns a new spherical triangle derived from the original one by       *
+	* moving the "C" vertex along the edge "BC" until the new "alpha" angle   *
+	* equals the given argument.                                              *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	SphericalTriangle SphericalTriangle::New_Alpha( float alpha ) const
+	{
+		Vec3 vA( A() ), vB( B() ), vC( C() );  // Local copies of the three vertices.
+		Vec3 axis1 = Unit( vB ^ vA );
+		Vec3 axis2 = axis1 ^ vA;
+		Vec3 G     = ( cos(alpha) * axis1 ) + ( sin(alpha) * axis2 );
+		Vec3 perp  = Unit( vC / vB );
+		Vec3 moved = ((G * perp) * vB) - ((G * vB) * perp);
+		if( Triple( vA, vB, moved ) > 0.0 ) moved *= -1.0;  // Flip the new vertex when the triple product is positive.
+		return SphericalTriangle( vA, vB, moved );
+	}
+
+	// Prints a "SphericalTriangle:" heading followed by the vertices A, B and C, one per line; ends with std::endl.
+	std::ostream &operator<<( std::ostream &out, const SphericalTriangle &T )
+	{
+		out << "SphericalTriangle:\n";
+		out << "  " << T.A() << "\n";
+		out << "  " << T.B() << "\n";
+		return out << "  " << T.C() << std::endl;
+	}
+
+};
diff --git a/src/nvtt/bc7/arvo/SphTri.h b/src/nvtt/bc7/arvo/SphTri.h
new file mode 100644
index 0000000..7336dc7
--- /dev/null
+++ b/src/nvtt/bc7/arvo/SphTri.h
@@ -0,0 +1,124 @@
+/***************************************************************************
+* SphTri.h                                                                 *
+*                                                                          *
+* This file defines the SphericalTriangle class definition, which          *
+* supports member functions for Monte Carlo sampling, point containment,   *
+* and other basic operations on spherical triangles.                       *
+*                                                                          *
+*   Changes:                                                               *
+*     01/01/2000  arvo  Added New_{Alpha,Beta,Gamma} methods.              *
+*     12/30/1999  arvo  Added VecIrrad method for "Vector Irradiance".     *
+*     04/08/1995  arvo  Further optimized sampling algorithm.              *
+*     10/11/1994  arvo  Added analytic sampling algorithm.                 *
+*     06/14/1994  arvo  Initial implementation.                            *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1995, 2000, James Arvo                                     *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __SPHTRI_INCLUDED__
+#define __SPHTRI_INCLUDED__
+
+#include "Vec3.h"
+#include "Vec2.h"
+
+namespace ArvoMath {
+
+	/*
+	*  The (Oblique) Spherical Triangle ABC.  Edge lengths (segments of great 
+	*  circles) are a, b, and c.  The (dihedral) angles are Alpha, Beta, and Gamma.
+	*
+	*                      B
+	*                      o
+	*                     / \
+	*                    /   \
+	*                   /Beta \
+	*                  /       \
+	*               c /         \ a
+	*                /           \ 
+	*               /             \
+	*              /               \
+	*             /                 \
+	*            /                   \
+	*           /Alpha          Gamma \
+	*          o-----------------------o
+	*         A            b            C
+	*
+	*/
+
+	class SphericalTriangle {
+
+	public: // methods
+		SphericalTriangle() { Init(); }
+		// Copy construction and assignment are the compiler-generated member-wise versions.
+		SphericalTriangle( const Vec3 &, const Vec3 &, const Vec3 & );
+		SphericalTriangle & operator()( const Vec3 &, const Vec3 &, const Vec3 & );
+		~SphericalTriangle( ) {}
+		// (The hand-written operator= ran "*this = T" inside itself and so recursed without end.)
+		Vec3   Chart    ( float x, float y ) const;  // Const-Jacobian map from square.
+		Vec2   Coord    ( const Vec3 &P    ) const;  // Get 2D coords of a point.
+		int    Orient( ) const { return orient; }
+		int    Inside( const Vec3 & ) const;
+		float  SolidAngle() const { return area; }
+		float  SignedSolidAngle() const { return -orient * area; } // CC is pos.
+		const  Vec3 &A()  const { return A_       ; }
+		const  Vec3 &B()  const { return B_       ; }
+		const  Vec3 &C()  const { return C_       ; }
+		float  a()        const { return a_       ; }
+		float  b()        const { return b_       ; }
+		float  c()        const { return c_       ; }
+		float  Cos_a()    const { return cos_a    ; }
+		float  Cos_b()    const { return cos_b    ; }
+		float  Cos_c()    const { return cos_c    ; }
+		float  Alpha()    const { return alpha    ; }
+		float  Beta ()    const { return beta     ; }
+		float  Gamma()    const { return gamma    ; }
+		float  CosAlpha() const { return cos_alpha; }
+		float  CosBeta () const { return cos_beta ; }
+		float  CosGamma() const { return cos_gamma; }
+		Vec3   VecIrrad() const; // Returns the vector irradiance.
+		SphericalTriangle Dual() const;
+		SphericalTriangle New_Alpha( float alpha ) const;
+		SphericalTriangle New_Beta ( float beta  ) const;  // NOTE(review): declared only; SphTri.cpp defines New_Alpha alone.
+		SphericalTriangle New_Gamma( float gamma ) const;  // NOTE(review): declared only; SphTri.cpp defines New_Alpha alone.
+
+	private: // methods
+		void Init( );
+		void Init( const Vec3 &A, const Vec3 &B, const Vec3 &C );
+
+	private: // data
+		Vec3  A_, B_, C_, U;       // The vertices (and a temp vector).
+		float a_, b_, c_;          // The edge lengths.
+		float alpha, beta, gamma;  // The angles.
+		float cos_a, cos_b, cos_c;
+		float cos_alpha, cos_beta, cos_gamma;
+		float area;
+		float sin_alpha, product;  // Used in sampling algorithm.
+		int   orient;              // Orientation.
+	};
+
+	// Returns Unit( A ^ B ) * Unit( C ^ B ), evaluated as a float and then
+	// limited to the range [-1,1].
+	inline double CosDihedralAngle( const Vec3 &A, const Vec3 &B, const Vec3 &C )
+	{
+		const float dot = Unit( A ^ B ) * Unit( C ^ B );
+		return ( dot < -1.0 ) ? -1.0 : ( ( dot > 1.0 ) ? 1.0 : dot );
+	}
+
+	// Returns the angle whose cosine is CosDihedralAngle( A, B, C ),
+	// obtained with acos.
+	inline double DihedralAngle( const Vec3 &A, const Vec3 &B, const Vec3 &C )
+	{ const double cosine = CosDihedralAngle( A, B, C ); return acos( cosine ); }
+
+	extern std::ostream &operator<<( std::ostream &out, const SphericalTriangle & );
+};
+#endif
diff --git a/src/nvtt/bc7/arvo/Token.cpp b/src/nvtt/bc7/arvo/Token.cpp
new file mode 100644
index 0000000..9575d92
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Token.cpp
@@ -0,0 +1,913 @@
+/***************************************************************************
+* Token.cpp                                                                *
+*                                                                          *
+* The Token class encapsulates a lexical analyzer for C++-like syntax.     *
+* A token instance is associated with one or more text files, and          *
+* grabs C++ tokens from them sequentially.  There are many member          *
+* functions designed to make parsing easy, such as "==" operators for      *
+* strings and characters, and automatic conversion of numeric tokens       *
+* into numeric values.                                                     *
+*                                                                          *
+* Files can be nested via #include directives, and both styles of C++      *
+* comments are supported.                                                  *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    10/05/99    Fixed bug in TokFrame string allocation.        *
+*      arvo    01/15/95    Added ifdef, ifndef, else, and endif.           *
+*      arvo    02/13/94    Added Debug() member function.                  *
+*      arvo    01/22/94    Several sections rewritten.                     *
+*      arvo    06/19/93    Converted to C++                                *
+*      arvo    07/15/89    Rewritten for scene description parser.         *
+*      arvo    01/22/89    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "Token.h"
+#include "Char.h"
+
+namespace ArvoMath {
+
+	FILE*  Token::debug = NULL;  // Static data member of Token class.
+	int    Token::argc  = 0;     // Command-line arguments recorded by Args() and
+	char** Token::argv  = NULL;  // scanned for "-macro" options by SearchArgs().
+
+	typedef TokMacro *TokMacroPtr;  // Element type of the macro hash table.
+
+	static const int True      = 1;
+	static const int False     = 0;
+	static const int HashConst = 217;  // Size of hash-table for macros.
+
+
+	TokFrame::TokFrame()
+	{
+		// An empty frame: no file, no name, no enclosing frame, position 0:0.
+		source = NULL;
+		fname  = NULL;
+		next   = NULL;
+		line   = column = 0;
+	}
+
+	TokFrame::~TokFrame()
+	{
+		free( (void *) fname );  // strdup() memory needs free(), not delete[]; NULL is ok.
+		if( source != NULL ) fclose( source );
+	}
+
+	void TokFrame::operator=( const TokFrame &frame )
+	{
+		char *copy = ( frame.fname != NULL ) ? strdup( frame.fname ) : NULL;
+		free( (void *) fname );  // Old name came from strdup(); it is freed only after
+		fname  = copy;           // the copy is made, so self-assignment stays safe.
+		next   = frame.next;    source = frame.source;  // Shared, not duplicated.
+		line   = frame.line;    column = frame.column;
+	}
+
+	static int HashName( const char *str )
+	{
+		// Weighted sum of the characters, cycling through five small weights.
+		static int prime[5] = { 7, 11, 17, 23, 3 };
+		int sum = 0;
+		for( int slot = 0; *str != NullChar; str++ )
+		{
+			sum += (*str) * prime[slot];
+			if( ++slot == 5 ) slot = 0;
+		}
+		// A negative sum is folded to zero so the bucket index is never negative.
+		return ( sum < 0 ? 0 : sum ) % HashConst;
+	}
+
+	TokMacro *Token::MacroLookup( const char *str ) const
+	{
+		// No table means no macro has ever been defined.
+		if( table == NULL ) return NULL;
+		// Walk the bucket's chain; falling off the end leaves NULL (not found).
+		TokMacro *entry = table[ HashName( str ) ];
+		while( entry != NULL && strcmp( str, entry->macro ) != 0 )
+			entry = entry->next;
+		return entry;
+	}
+
+	int Token::MacroReplace( char *str, int &length, TokType &type ) const
+	{
+		const TokMacro *found = MacroLookup( str );
+		if( found == NULL ) return 0;  // Not a macro: leave everything untouched.
+		strcpy( str, found->repl );    // NOTE(review): assumes "str" is large enough.
+		type   = found->type;
+		length = strlen( str );
+		return 1;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  D e b u g  P r i n t                                                   *
+	*                                                                         *
+	*  This routine is used to record the entire token stream in a file to    *
+	*  use as a debugging aid.  It does not affect the action of the lexer;   *
+	*  it merely records a "shadow" copy of all the tokens that are read by   *
+	*  ANY Token instance.  The data that is written to the file is           *
+	*                                                                         *
+	*  <Line number>  <Column number>  <File name>  <Token>                   *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	static void DebugPrint( const Token &tok, FILE *fp )
+	{
+		fprintf( fp, "%3d %3d  ", tok.Line(), tok.Column() );  // Position of the token.
+		fprintf( fp, "%s  "     , tok.FileName() ); 
+		fprintf( fp, "%s\n"     , tok.Spelling() );
+		fflush ( fp );  // Flush after every token so nothing stays buffered.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  T o k e n   (Constructors)                                             *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Token::Token( const char *file_name )
+	{
+		Init();
+		Open( file_name );  // A file that cannot be opened is silently ignored.
+	}
+
+	Token::Token( FILE *fp )
+	{
+		Init();
+		Open( fp );         // Reads from a stream the caller has already opened.
+	}
+
+	Token::Token( )
+	{
+		Init();             // No source is attached by this constructor.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  T o k e n   (Destructor)                                               *
+	*                                                                         *
+	*  Closes all files and deletes all frames and paths.                     *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Token::~Token( )
+	{
+		// "frame" itself is a data member and is destroyed automatically; only
+		// the heap-allocated frames chained behind it are released here.
+		for( TokFrame *cur = frame.next; cur != NULL; )
+		{
+			TokFrame *following = cur->next;  // Read the link before deleting.
+			delete cur;
+			cur = following;
+		}
+		// Release the include search paths as well.
+		ClearPaths();
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  O p e n                                                                *
+	*                                                                         *
+	*  Establish a new file to read from, either by name, or by pointer.      *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	void Token::Open( const char *file_name )
+	{
+		FILE *stream = fopen( file_name, "r" );
+		if( stream == NULL ) return;  // Silently ignore a file that cannot be opened.
+		frame.fname = strdup( file_name );
+		Open( stream );
+	}
+
+	void Token::Open( FILE *fp )
+	{
+		pushed       = NullChar;  // Discard any pushed-back character.
+		frame.column = 0;
+		frame.line   = 1;
+		frame.source = fp;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  O p e r a t o r  ==                                                    *
+	*                                                                         *
+	*  A token can be compared with a string, a single character, or a type.  *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::operator==( const char *s ) const
+	{
+		// Walk both strings in step.  The terminator is compared like any other
+		// character, so a proper prefix of the spelling does not count as equal.
+		// The comparison folds case through ToUpper() unless case_sensitive is set.
+		const char *mine = spelling;
+		for( ;; s++, mine++ )
+		{
+			const int same = case_sensitive
+				? ( *s == *mine )
+				: ( ToUpper(*s) == ToUpper(*mine) );
+			if( !same ) return False;
+			if( !*s || !*mine ) return True;  // Reached the end with no mismatch.
+		}
+	}
+
+	int Token::operator==( char c ) const
+	{
+		if( length != 1 ) return False;  // Only one-character tokens can match.
+		const char mine = spelling[0];
+		return case_sensitive ? ( mine == c ) : ( ToUpper(mine) == ToUpper(c) );
+	}
+
+	int Token::operator==( TokType _type_ ) const 
+	{
+		// T_char and T_numeric are matched by rule; every other type matches exactly.
+		switch( _type_ )
+		{ 
+		case T_char   : return type == T_string  && Len() == 1;
+		case T_numeric: return type == T_integer || type == T_float;
+		default       : break;
+		}
+		return type == _type_;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  O p e r a t o r  !=                                                    *
+	*                                                                         *
+	*  Define negations of the three types of "==" tests.                     *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::operator!=( const char *s ) const { return ( *this == s ) ? False : True; }
+	int Token::operator!=( char        c ) const { return ( *this == c ) ? False : True; }
+	int Token::operator!=( TokType     t ) const { return ( *this == t ) ? False : True; }
+
+	/*-------------------------------------------------------------------------*
+	*  E r r o r                                                              *
+	*                                                                         *
+	*  Print error message to "stderr" followed by optional "name".           *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	void Token::Error( TokError error, const char *name )
+	{
+		const char *s;  // Literals are const; binding them to "char *" is ill-formed in C++11.
+		switch( error )
+		{
+		case T_malformed_float   : s = "malformed real number   "; break;
+		case T_unterm_string     : s = "unterminated string     "; break;
+		case T_unterm_comment    : s = "unterminated comment    "; break;
+		case T_file_not_found    : s = "include file not found: "; break;
+		case T_unknown_directive : s = "unknown # directive     "; break;
+		case T_string_expected   : s = "string expected         "; break;
+		case T_putback_error     : s = "putback overflow        "; break;
+		case T_name_too_long     : s = "file name is too long   "; break;
+		case T_no_endif          : s = "#endif directive missing"; break;
+		case T_extra_endif       : s = "#endif with no #ifdef   "; break;
+		case T_extra_else        : s = "#else with no #ifdef    "; break;
+		default                  : s = "unknown error type      "; break;
+		}
+		fprintf( stderr, "LEXICAL ERROR, line %d, column %d: %s", 
+			frame.line, frame.column, s );  // Position in the file currently being read.
+		if( name == NULL )
+			fprintf( stderr, "  \n"       );
+		else fprintf( stderr, "%s\n", name );
+		exit( 1 );  // Lexical errors are fatal: this function does not return.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  G e t c                                                                *
+	*                                                                         *
+	*  This routine fetches one character at a time from the current file     *
+	*  being read.  It is responsible for keeping track of the column number  *
+	*  and for handling single characters that have been "put back".          *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::Getc( int &c )
+	{
+		if( pushed == NullChar )
+		{
+			// Nothing pending: read from the current file and advance the column.
+			c = getc( frame.source );
+			frame.column++;
+			return c;
+		}
+		// Hand back the pushed-back character and clear the slot.
+		c = pushed;
+		pushed = NullChar;
+		return c;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  N o n W h i t e                                                        *
+	*                                                                         *
+	*  This routine implements a simple finite state machine that skips       *
+	*  white space and recognizes the two styles of comments used in C++.     *
+	*  It returns the first non-white character not part of a comment.        *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::NonWhite( int &c )
+	{
+start_state:  // Skipping blanks between tokens.
+		Getc( c );
+		if( c == Space   ) goto start_state;
+		if( c == Tab     ) goto start_state;
+		if( c == NewLine ) goto start_new_line;
+		if( c == Slash   ) goto start_comment;
+		goto return_char;
+
+start_comment:  // Seen one '/': a comment opener, or just a slash?
+		Getc( c );
+		if( c == Star    ) goto in_comment1;  
+		if( c == Slash   ) goto in_comment2;  
+		Unget( c );
+		c = Slash;
+		goto return_char;
+
+in_comment1:  // Inside a /* ... */ comment.
+		Getc( c );
+		if( c == Star    ) goto end_comment1;
+		if( c == NewLine ) goto newline_in_comment;
+		if( c == EOF     ) goto unterm_comment;
+		goto in_comment1;
+
+end_comment1:  // Seen '*' inside a /* comment; a '/' now closes it.
+		Getc( c );
+		if( c == Slash   ) goto start_state;
+		if( c == Star    ) goto end_comment1;  // Stay here so that "**/" still closes.
+		if( c == NewLine ) goto newline_in_comment;
+		if( c == EOF     ) goto unterm_comment;
+		goto in_comment1;
+
+in_comment2:  // Inside a // comment, which runs to the end of the line.
+		Getc( c );
+		if( c == NewLine ) goto start_new_line;
+		if( c == EOF     ) goto return_char;
+		goto in_comment2;
+
+unterm_comment:  // End of file inside /* ... */; c is already EOF here.
+		Error( T_unterm_comment );
+		goto return_char;
+
+start_new_line:
+		frame.line++;
+		frame.column = 0;
+		goto start_state;
+
+newline_in_comment:
+		frame.line++;
+		frame.column = 0;
+		goto in_comment1;
+
+return_char:
+		Tcolumn = frame.column;  // This is where the token starts.
+		return c;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  N e x t R a w T o k                                                    *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::NextRawTok( )
+	{
+		static int Trans0[] = { 0, 1, 3, 3, 3 };  // Found a digit.
+		static int Trans1[] = { 5, 6, 4, 6, 7 };  // Found a sign.
+		static int Trans2[] = { 1, 6, 7, 6, 7 };  // Found decimal point.
+		static int Trans3[] = { 2, 2, 7, 6, 7 };  // Found an exponent.
+		static int Trans4[] = { 5, 6, 7, 6, 7 };  // Found something else.
+		char       *tok     = spelling;  // Write cursor into the spelling buffer.
+		int        state;  // 0 integer part, 1 fraction, 2 after 'e', 3 exponent digits, 4 exponent sign;
+		int        c;      // 5 = integer finished, 6 = float finished, 7 = malformed number.
+
+		length = 0;
+		type   = T_null;
+
+		// Skip comments and whitespace.
+
+		if( NonWhite( c ) == EOF ) goto endtok;
+
+		// Is this the beginning of an identifier?  If so, get the rest. 
+
+		if( isAlpha( c ) )
+		{
+			type = T_ident;
+			do  {
+				*tok++ = c;  // NOTE(review): no length check here, unlike the string case below.
+				length++;
+				if( Getc( c ) == EOF ) goto endtok;
+			}
+			while( isAlpha( c ) || isDigit( c ) || c == Underscore );
+			Unget( c );
+			goto endtok;
+		}
+
+		// Is this the beginning of a number?
+
+		else if( isDigit( c ) || c == Minus || c == Period )
+		{
+			char c1 = c;  // First character; a leading '.' makes state 5 a float.
+			state = 0;
+			for(;;)
+			{
+				*tok++ = c;
+				length++;
+				switch( Getc( c ) )  // Index the table for this character class by the current state.
+				{
+				case '0':
+				case '1':
+				case '2':
+				case '3':
+				case '4':
+				case '5':
+				case '6':
+				case '7':
+				case '8':
+				case '9': state = Trans0[ state ]; break;
+				case '+': 
+				case '-': state = Trans1[ state ]; break;
+				case '.': state = Trans2[ state ]; break;
+				case 'e':
+				case 'E': state = Trans3[ state ]; break;
+				default : state = Trans4[ state ]; break;
+				}
+				switch( state )
+				{
+				case 5 : Unget( c ); 
+					type = ( c1 == Period ) ? T_float : T_integer; 
+					goto endtok;
+				case 6 : Unget( c ); type = T_float  ; goto endtok;
+				case 7 : Error( T_malformed_float ) ; break;
+				default: continue;
+				}
+			} // for
+		} // if numeric 
+
+		// Is this the beginning of an operator?
+
+		if( c == '*' || c == '>' || c == '<' || c == '+' || c == '-' || c == '!' )
+		{
+			char oldc = c;
+			type = T_other;
+			*tok++ = c;
+			length++;
+			if( Getc( c ) == EOF ) goto endtok;
+			if( c == oldc || c == EqualSign )  // Doubled operator, or operator followed by '='.
+			{
+				*tok++ = c;
+				length++;
+			}
+			else Unget( c );
+			goto endtok;
+		}
+
+		// Is this the beginning of a string?
+
+		else if( c == DoubleQuote )
+		{
+			type = T_string;
+			while( Getc( c ) != EOF && length < MaxTokenLen )
+			{
+				if( c == DoubleQuote ) goto endtok;  // The quotes themselves are not stored.
+				*tok++ = c;
+				length++;
+			}
+			Error( T_unterm_string );  // Reached on EOF or when the string hits MaxTokenLen.
+		}
+
+		// Is this the beginning of a "#" directive?
+
+		else if( c == Hash )
+		{
+			type = T_directive;
+			NonWhite( c );  // Blanks between '#' and the directive name are skipped.
+			while( isAlpha( c ) )
+			{
+				*tok++ = c;
+				length++;
+				Getc( c );
+			}
+			Unget( c );
+			goto endtok;
+		}
+
+		// This must be a one-character token. 
+
+		else
+		{
+			*tok++ = c;
+			length = 1;
+			type   = T_other;
+		}
+
+endtok: // Jump to here when token is completed.
+
+		*tok = NullChar;  // Terminate the string.
+		if( debug != NULL ) DebugPrint( *this, debug );
+
+		return length;  // Zero at end of file, where type is still T_null.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  N e x t T o k                                                          *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::NextTok( )
+	{
+		NextRawTok();
+		if( type != T_ident ) return length;
+
+		// An identifier may name a macro; if so its spelling, length and type
+		// are replaced in place (and the replacement is logged when debugging).
+
+		if( table != NULL && MacroReplace( spelling, length, type ) )
+		{
+			if( debug != NULL ) DebugPrint( *this, debug );
+		}
+
+		// A macro that expands to nothing yields no token: fetch the next one.
+
+		if( type == T_nullmacro ) NextTok();
+		return length;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  O p e r a t o r  - -                                                   *
+	*                                                                         *
+	*  Puts back the last token found.  Only one token can be put back.       *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Token & Token::operator--( )  // Put the last token back. 
+	{
+		if( put_back ) Error( T_putback_error );  // Can only handle one putback.
+		put_back = 1;  // The next operator++ returns the current token again.
+		return *this;
+	}
+
+	Token & Token::operator--( int )  // Postfix decrement.
+	{
+		fprintf( stderr, "Postfix decrement is not implemented for the Token class.\n" );
+		return *this;  // Nothing is put back; only the warning above is printed.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  H a n d l e   D i r e c t i v e                                        *
+	*                                                                         *
+	*  Directive beginning with "#" must be handled by the lexer, as they     *
+	*  determine the current source file via "#include", etc.                 *
+	*                                                                         *
+	*  Returns 1 if, after handling this directive, we now have the next      *
+	*  token.                                                                 *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Token::HandleDirective( )
+	{
+		FILE *fp;
+		char name[128];  // Receives the file name of an "#include <...>".
+		if( *this == "define" )
+		{
+			NextRawTok(); 
+			strcpy( tempbuff, Spelling() );  // This is the macro name.
+			int line = Line();
+			NextRawTok();  // Raw, so the replacement text is not itself macro-expanded.
+			if( Line() == line )
+				AddMacro( tempbuff, Spelling(), Type() );
+			else
+			{
+				// If next token is on a different line; we went too far.
+				AddMacro( tempbuff, "", T_nullmacro );
+				return 1;  // Signal that we already have the next token.
+			}
+		}
+		else if( *this == "include" )
+		{
+			NextRawTok();
+			if( *this == "<" )
+			{
+				GetName( name, sizeof(name) );  // Reads up to the closing '>'.
+				PushFrame( ResolveName( name ), name );  // Searched along the AddPath() list.
+			}
+			else if( type == T_string )
+			{
+				fp = fopen( spelling, "r" );  // Quoted names are opened as given.
+				if( fp == NULL ) Error( T_file_not_found, spelling );
+				else PushFrame( fp, spelling );
+			}
+			else Error( T_string_expected );
+		}
+		else if( *this == "ifdef" )
+		{
+			NextRawTok();
+			TokMacro *m = MacroLookup( Spelling() );
+			if( m == NULL )  // Skip until else or endif.
+			{
+				// NOTE(review): the skip stops at the first #endif/#else it meets,
+				// so nested conditionals inside the skipped text are not tracked.
+				while( *this != T_null )
+				{
+					NextRawTok();
+					if( *this != T_directive ) continue;
+					if( *this == "endif" ) break;
+					if( *this == "else"  ) { if_nesting++; break; }  // Like m != NULL.
+				}
+				if( *this == T_null ) Error( T_no_endif );
+				return 0; // Ready to get the next token.
+			}
+			else if_nesting++;
+		}
+		else if( *this == "ifndef" )
+		{
+			NextRawTok();
+			TokMacro *m = MacroLookup( Spelling() );
+			if( m != NULL )  // Skip until else or endif.
+			{
+				while( *this != T_null )
+				{
+					NextRawTok();
+					if( *this != T_directive ) continue;
+					if( *this == "endif" ) break;
+					if( *this == "else"  ) { if_nesting++; break; }  // Like m == NULL.
+				}
+				if( *this == T_null ) Error( T_no_endif );
+				return 0; // Ready to get the next token.
+			}
+			else if_nesting++;
+		}
+		else if( *this == "else" )  // Skip until #endif.
+		{
+			if( if_nesting == 0 ) Error( T_extra_else );
+			while( *this != T_null )
+			{
+				NextRawTok();
+				if( *this == T_directive && *this == "endif" ) break;
+			}
+			if( *this == T_null ) Error( T_no_endif );
+			if_nesting--;  // The #endif just consumed closes this conditional.
+			return 0; // Ready to get next token.
+		}
+		else if( *this == "endif" )
+		{
+			if( if_nesting == 0 ) Error( T_extra_endif );
+			if_nesting--;
+			return 0; // Ready to get next token.
+		}
+		else if( *this == "error" )
+		{
+			int line = Line();
+			NextTok(); // Allow macro substitution.
+			if( Line() == line )
+			{
+				fprintf( stderr, "(preprocessor, line %d) %s\n", line, Spelling() );
+				return 0; // Ready to get next token.
+			}
+			else
+			{
+				// If next token is on a different line; we went too far.
+				fprintf( stderr, "(null preprocessor message, line %d)\n", line );
+				return 1;  // Signal that we already have the next token.
+			}
+		}
+		return 0;  // Also reached for directive names not handled above.
+	}
+
+
+	/*-------------------------------------------------------------------------*
+	*  O p e r a t o r  + +                                                   *
+	*                                                                         *
+	*  Grab the next token from the current source file.  If at end of file,  *
+	*  pick up where we left off in the previous file.  If there is no        *
+	*  previous file, return "T_null".                                        *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Token & Token::operator++( )
+	{
+		if( put_back ) 
+		{
+			put_back = 0;
+			return *this;
+		}
+
+		// If we've reached the end of an include file, pop the stack.
+
+		for(;;)
+		{
+			// No open file (failed Open(), or input exhausted): report T_null, do not read.
+			if( frame.source == NULL ) { type = T_null; length = 0; *spelling = NullChar; break; }
+			NextTok();  
+			if( type == T_directive ) 
+			{
+				if( HandleDirective() ) break;
+			}
+			else if( type == T_null ) 
+			{
+				fclose( frame.source );
+				frame.source = NULL;  // Closed here, so ~TokFrame must not close it again.
+				if( !PopFrame() ) break;
+			}
+			else break;  // We have a real token.
+		}
+		// Now fill in the value fields if the token is a number. 
+		switch( type )
+		{
+		case T_integer : ivalue = atoi( spelling ); break;
+		case T_float   : fvalue = atof( spelling ); break;
+		case T_null    : if( if_nesting > 0 ) Error( T_no_endif ); break;
+		default        : break;
+		}
+		return *this;
+	}
+
+	Token & Token::operator++( int )
+	{
+		fprintf( stderr, "Postfix increment is not implemented for the Token class.\n" );
+		return *this;  // No token is read; only the warning above is printed.
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  T o k e n   Push & Pop Frame                                           *
+	*                                                                         *
+	*  These functions are used to create and destroy the context "frames"    *
+	*  that are used to handle nested files (via "include").                  *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	void Token::PushFrame( FILE *fp, char *fname )
+	{
+		// Create a copy of the current (top-level) frame.
+
+		TokFrame *n = new TokFrame;
+		*n = frame;
+
+		// Now overwrite the top-level frame with the new state.
+		const char *old_name = frame.fname;  // "n" holds its own duplicate of this.
+		frame.fname  = strdup( fname );
+		free( (void *) old_name );           // Freed after the strdup in case of aliasing.
+		frame.next   = n;
+		frame.source = fp;
+		frame.line   = 1;    frame.column = 0;
+		pushed       = NullChar;
+	}
+
+	int Token::PopFrame()
+	{
+		if( frame.next == NULL ) return 0;  // Already at the outermost file.
+		TokFrame *old = frame.next;
+		frame = *old;  // Restores the including file's stream, name, position and link.
+		old->source = NULL;  delete old;  // "frame" owns the FILE now; ~TokFrame must not close it.
+		return 1;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*  Miscellaneous Functions                                                *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	void Token::Init()
+	{
+		// Lexer state: case-sensitive compares, nothing put back or pushed back.
+		case_sensitive = 1;
+		put_back       = 0;
+		pushed         = NullChar;
+		if_nesting     = 0;
+		// No source file yet, no include frames, no search paths, no macros.
+		frame.source   = NULL;
+		frame.next     = NULL;
+		frame.fname    = NULL;
+		first = last   = NULL;
+		table          = NULL;
+		SearchArgs();  // May call AddMacro(), which tests "table" against NULL.
+	}
+
+	const char* Token::Spelling() const 
+	{ 
+		return spelling;    // The buffer itself, not a copy.
+	}
+
+	char Token::Char() const 
+	{ 
+		return spelling[0];  // First character of the current spelling.
+	}
+
+	const char* Token::FileName() const
+	{ 
+		static const char *null_string = "";  // const: a literal cannot bind to "char *".
+		if( frame.fname == NULL ) return null_string;  // No name recorded for this source.
+		else return frame.fname; 
+	}
+
+	float Token::Fvalue() const
+	{
+		// Numeric value of the token as a float; zero for non-numeric tokens.
+		if( type == T_integer ) return ivalue;
+		if( type == T_float   ) return fvalue;
+		return 0.0;
+	}
+
+	void Token::GetName( char *name, int max )
+	{
+		// Copy characters into "name" until the closing '>' is seen.  Blanks
+		// and comments never reach the name, because NonWhite() skips them.
+		// At most max-1 characters are stored, leaving room for the terminator.
+		int ch;
+		for( int room = max - 1; room > 0; room-- )
+		{
+			if( NonWhite( ch ) != '>' ) { *name++ = ch; continue; }
+			*name = NullChar;
+			return;
+		}
+		Error( T_name_too_long );
+	}
+
+	void Token::AddPath( const char *new_path )
+	{
+		// Append a private copy of "new_path" to the end of the search list.
+		char *copy = strdup( new_path );
+		if( copy == NULL ) return;
+		TokPath *node = new TokPath;
+		node->path = copy;
+		node->next = NULL;
+		if( first != NULL ) last->next = node; else first = node;
+		last = node;
+	}
+
+	void Token::ClearPaths()
+	{
+		TokPath *p = first;
+		while( p != NULL )
+		{
+			TokPath *q = p->next;      // Read the link before the node is deleted.
+			free( (void *) p->path );  // The string came from strdup(), so free() it.
+			delete   p;                // delete the path structure.
+			p = q;
+		}
+		first = NULL;
+		last  = NULL;
+	}
+
+	FILE *Token::ResolveName( const char *name )
+	{
+		char resolved[128];
+		for( const TokPath *p = first; p != NULL; p = p->next )
+		{
+			if( strlen( p->path ) + strlen( name ) + 2 > sizeof(resolved) )  // '/' and NUL.
+				Error( T_name_too_long, name );  // Would overflow "resolved"; Error() exits.
+			strcat( strcat( strcpy( resolved, p->path ), "/" ), name );
+			FILE *fp = fopen( resolved, "r" );
+			if( fp != NULL ) return fp;  // First search path that works wins.
+		}
+		Error( T_file_not_found, name );
+		return NULL;
+	}
+
+	void Token::CaseSensitive( int on_off = 1 )  // NOTE(review): default repeated here; confirm the header declares none.
+	{ 
+		case_sensitive = on_off;  // Non-zero: the "==" operators compare case exactly.
+	}
+
+	void Token::Debug( FILE *fp ) 
+	{ 
+		debug = fp;  // Static: enables the token log for every Token; NULL disables it.
+	}
+
+	void Token::AddMacro( const char *macro, const char *repl, TokType t )
+	{
+		if( table == NULL ) // First macro ever: build an empty hash table.
+		{
+			table = new TokMacroPtr[ HashConst ];
+			for( int slot = 0; slot < HashConst; slot++ ) table[slot] = NULL;
+		}
+		TokMacro **bucket = &table[ HashName( macro ) ];
+		TokMacro *entry   = new TokMacro;  // Goes on the front of the chain, so a
+		entry->macro = strdup( macro );    // later definition of the same name is
+		entry->repl  = strdup( repl  );    // the one that MacroLookup() finds.
+		entry->type  = t;
+		entry->next  = *bucket;
+		*bucket      = entry;
+	}
+
+	void Token::Args( int argc_, char *argv_[] )
+	{
+		argc = argc_;  // Set the static variables.
+		argv = argv_;  // The pointer is stored, not copied; SearchArgs() reads it later.
+	}
+
+	void Token::SearchArgs( )
+	{
+		// Every "-macro NAME REPL" triple on the command line defines one macro.
+		for( int i = 1; i < argc; i++ )
+		{
+			if( strcmp( argv[i], "-macro" ) != 0 ) continue;
+			if( i+2 >= argc ) 
+			{
+				fprintf( stderr, "(Token) ERROR macro argument(s) missing\n" );
+				return;
+			}
+			char *name = argv[++i];
+			char *repl = argv[++i];
+
+			// Classify the replacement text: identifier, integer, or else float.
+			TokType kind;
+			if     ( isAlpha  ( repl[0] ) ) kind = T_ident;
+			else if( isInteger( repl    ) ) kind = T_integer;
+			else                            kind = T_float;
+			AddMacro( name, repl, kind );
+		}
+	}
+};
diff --git a/src/nvtt/bc7/arvo/Token.h b/src/nvtt/bc7/arvo/Token.h
new file mode 100644
index 0000000..eabdacc
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Token.h
@@ -0,0 +1,203 @@
+/***************************************************************************
+* Token.h                                                                  *
+*                                                                          *
+* The Token class encapsulates a lexical analyzer for C++-like syntax.     *
+* A token instance is associated with one or more text files, and          *
+* grabs C++ tokens from them sequentially.  There are many member          *
+* functions designed to make parsing easy, such as "==" operators for      *
+* strings and characters, and automatic conversion of numeric tokens       *
+* into numeric values.                                                     *
+*                                                                          *
+* Files can be nested via #include directives, and both styles of C++      *
+* comments are supported.                                                  *
+*                                                                          *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    10/05/99    Fixed bug in TokFrame string allocation.        *
+*      arvo    01/15/95    Added ifdef, ifndef, else, and endif.           *
+*      arvo    02/13/94    Added Debug() member function.                  *
+*      arvo    01/22/94    Several sections rewritten.                     *
+*      arvo    06/19/93    Converted to C++                                *
+*      arvo    07/15/89    Rewritten for scene description parser.         *
+*      arvo    01/22/89    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __TOKEN_INCLUDED__
+#define __TOKEN_INCLUDED__
+
+#include <iostream>
+#include <stdio.h>
+
+namespace ArvoMath {
+
+	const int MaxTokenLen = 128;  // Size of Token's spelling and tempbuff arrays.
+
+	typedef enum {    // Classification of a token, as reported by Token::Type().
+		T_null,   
+		T_char,       // A string of length 1.
+		T_string,
+		T_integer,
+		T_float,
+		T_ident,
+		T_other,
+		T_numeric,    // Either T_float or T_integer (use with == operator).
+		T_directive,  // Directives like #include are not returned to the user.
+		T_nullmacro
+	} TokType;
+
+	typedef enum {    // Error codes accepted by Token::Error().
+		T_malformed_float,
+		T_unterm_string,
+		T_unterm_comment,
+		T_file_not_found,
+		T_unknown_directive,
+		T_string_expected,
+		T_putback_error,
+		T_name_too_long,
+		T_no_endif,
+		T_extra_endif,
+		T_extra_else
+	} TokError;
+
+	class TokFrame {  // One input source: its stream, name and position; frames link through next.
+	public:
+		TokFrame();
+		TokFrame( const TokFrame &frame ) : next( NULL ), source( NULL ), fname( NULL ), line( 0 ), column( 0 ) { *this = frame; }  // Members are cleared first so operator= never reads indeterminate values.
+		~TokFrame();
+		void operator=( const TokFrame & );
+	public:
+		TokFrame *next;    // Link to another frame (NULL-terminated chain).
+		FILE     *source;  // Stream being read.
+		char     *fname;   // File name; NOTE(review): ownership is decided by ~TokFrame/operator=, not visible here.
+		int       line;    // Current line position.
+		int       column;  // Current column position.
+	};
+
+	struct TokPath {      // One entry in Token's singly linked list of search paths.
+		char    *path;    // Directory string; released with delete[] by ClearPaths().
+		TokPath *next;    // Following entry, or NULL at the end of the list.
+	};
+
+	struct TokMacro {     // One macro definition, chained inside a hash bucket of Token::table.
+		char     *macro;  // Macro name (a strdup'ed copy, see Token::AddMacro).
+		char     *repl;   // Replacement text (also a strdup'ed copy).
+		TokType   type;   // Token type assigned to the replacement.
+		TokMacro *next;   // Next macro in the same bucket, or NULL.
+	};
+
+	class Token {  // Lexical analyzer: each instance yields tokens sequentially from its input file(s).
+
+	public:
+		// Constructors and destructor.
+
+		Token();
+		Token( const char *file_name );
+		Token( FILE *file_pointer    );
+		~Token();
+
+		// Const member functions for querying token information.
+
+		TokType Type()    const { return type;       }  // The type of token found.
+		int     Len()     const { return length;     }  // The length of the token.
+		int     Line()    const { return frame.line; }  // The line it was found on.
+		int     Column()  const { return Tcolumn;    }  // The column it began in.
+		long    Ivalue()  const { return ivalue;     }  // Token value if an integer.
+		float   Fvalue()  const;                        // Token value if int or float.
+		char    Char()    const;                        // The token (if Len() == 1).
+
+		// Operators.
+
+		int     operator == ( const char* ) const;      // 1 if strings match.
+		int     operator != ( const char* ) const;      // 0 if strings match.
+		int     operator == ( char        ) const;      // 1 if token is this char.
+		int     operator != ( char        ) const;      // 0 if token is this char.
+		int     operator == ( TokType     ) const;      // 1 if token is of this type.
+		int     operator != ( TokType     ) const;      // 0 if token is of this type.
+		Token & operator ++ (             );            // (prefix) Get the next token.
+		Token & operator -- (             );            // (prefix) Put back one token.
+		Token & operator ++ ( int         );            // (postfix) Undefined.
+		Token & operator -- ( int         );            // (postfix) Undefined.
+
+		// State-setting member functions.
+
+		void Open( FILE * );                            // Read already opened file.
+		void Open( const char * );                      // Open the named file.
+		void CaseSensitive( int on_off );               // Applies to == and != operators.
+		void AddPath( const char * );                   // Adds path for <...> includes.
+		void ClearPaths();                              // Remove all search paths.
+
+		// Miscellaneous.
+
+		const char* Spelling() const;                   // The token itself.
+		const char* FileName() const;                   // Current file being lexed.
+		static void Debug( FILE * );                    // Write all token streams to a file.
+		static void Args ( int argc, char *argv[] );    // Save the command line for SearchArgs().
+		void AddMacro( const char*, const char*, TokType type );  // Define macro -> replacement of the given type.
+		void SearchArgs();                              // Register every "-macro NAME REPL" found in the saved args.
+
+	private:
+
+		// Private member functions.       
+
+		void     Init();
+		int      Getc ( int & );
+		void     Unget( int c ) { pushed = c; }        // Holds a single pushed-back character.
+		void     Error( TokError error, const char *name = NULL );
+		int      NonWhite( int & );
+		int      HandleDirective();
+		int      NextRawTok();  // No macro substitutions.
+		int      NextTok();
+		void     PushFrame( FILE *fp, char *fname = NULL );
+		int      PopFrame();
+		void     GetName( char *name, int max );
+		FILE     *ResolveName( const char *name );      // Open name via the search paths; NULL if not found.
+		TokMacro *MacroLookup( const char *str ) const;
+		int      MacroReplace( char *str, int &length, TokType &type ) const;
+
+		// Private data members.       
+
+		TokPath  *first;           // Head of the search-path list.
+		TokPath  *last;            // Tail of the search-path list.
+		TokMacro **table;          // Macro hash table; NULL until the first AddMacro().
+		TokFrame frame;
+		TokType  type;
+		long     ivalue;  
+		float    fvalue;  
+		int      length;  
+		int      Tcolumn;  
+		int      put_back;    
+		int      case_sensitive;
+		int      pushed;
+		int      if_nesting;
+		char     spelling[ MaxTokenLen ];
+		char     tempbuff[ MaxTokenLen ];
+
+		// Static data members.       
+
+		static int  argc;          // Set by Args().
+		static char **argv;        // Set by Args().
+		static FILE *debug;        // Set by Debug().
+	};
+
+
+	// Predicate-style functions for testing token types.
+
+	inline int Null   ( const Token &t ) { return t.Type() == T_null;    }
+	inline int Numeric( const Token &t ) { return t == T_numeric;        }  // Goes through Token's == operator: per the TokType comment, T_numeric is meant for that operator rather than being a value Type() reports.
+	inline int StringP( const Token &t ) { return t.Type() == T_string;  }
+};
+#endif
diff --git a/src/nvtt/bc7/arvo/Vec2.cpp b/src/nvtt/bc7/arvo/Vec2.cpp
new file mode 100644
index 0000000..cca6723
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Vec2.cpp
@@ -0,0 +1,94 @@
+/***************************************************************************
+* Vec2.C                                                                   *
+*                                                                          *
+* Basic operations on 2-dimensional vectors.  This special case is useful  *
+* because nearly all operations are performed inline.                      *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    05/22/98    Added TimedVec2, extending Vec2.                *
+*      arvo    06/17/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <math.h>
+#include "ArvoMath.h"
+#include "Vec2.h"
+#include "form.h"
+
+namespace ArvoMath {
+
+	const Vec2 Vec2::Zero;            // Default-constructed, i.e. (0, 0).
+	const Vec2 Vec2::Xaxis( 1, 0 );   // Unit vector along x.
+	const Vec2 Vec2::Yaxis( 0, 1 );   // Unit vector along y.
+
+	// Most routines are now inline.
+
+	float Normalize( Vec2 &A )
+	{
+		// Scale A to unit length in place; a zero-length vector is left untouched.
+		const float length = Len( A );
+		if( length == 0.0 ) return length;
+		A.X() /= length;
+		A.Y() /= length;
+		// The length A had before scaling is handed back to the caller.
+		return length;
+	}
+
+	Vec2 Min( const Vec2 &A, const Vec2 &B )
+	{   // Component-wise minimum of A and B.
+		return Vec2().Set( Min( A.X(), B.X() ), Min( A.Y(), B.Y() ) );
+	}
+
+	Vec2 Max( const Vec2 &A, const Vec2 &B )
+	{   // Component-wise maximum of A and B.
+		return Vec2().Set( Max( A.X(), B.X() ), Max( A.Y(), B.Y() ) );
+	}
+
+	std::ostream &operator<<( std::ostream &out, const Vec2 &A )
+	{
+		// Both components in " %9.5f" format; the newline is part of the format string.
+		return out << form( " %9.5f %9.5f\n", A.X(), A.Y() );
+	}
+
+	std::ostream &operator<<( std::ostream &out, const Mat2x2 &M )
+	{
+		// One text line per matrix row, written as separate statements so each row is formatted and emitted in turn.
+		out << form( " %9.5f %9.5f\n", M(0,0), M(0,1) );
+		out << form( " %9.5f %9.5f\n", M(1,0), M(1,1) );
+		return out << std::endl;  // Trailing blank line (std::endl also flushes).
+	}
+
+	Mat2x2::Mat2x2( const Vec2 &c1, const Vec2 &c2 )
+	{
+		// c1 and c2 become the first and second COLUMNS of the matrix, while
+		// Set() takes its arguments row by row -- hence the interleaving.
+		Set( c1.X(), c2.X(),
+		     c1.Y(), c2.Y() );
+	}
+
+	// Solve the 2x2 system Ax = b (adjugate over determinant); a numerically singular A yields Vec2::Zero.
+	Vec2 Solve( const Mat2x2 &A, const Vec2 &b )
+	{
+		const float  eps   = MachineEpsilon();
+		const double scale = Norm1( A );
+		const double D     = det( A );
+		const bool singular = ( scale <= eps ) || ( Abs(D) <= eps * scale );
+		if( singular ) return Vec2::Zero;
+		const Vec2 numer(  A(1,1) * b.X() - A(0,1) * b.Y(),
+		                  -A(1,0) * b.X() + A(0,0) * b.Y() );
+		return numer / D;
+	}
+};
diff --git a/src/nvtt/bc7/arvo/Vec2.h b/src/nvtt/bc7/arvo/Vec2.h
new file mode 100644
index 0000000..7aca458
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Vec2.h
@@ -0,0 +1,358 @@
+/***************************************************************************
+* Vec2.h                                                                   *
+*                                                                          *
+* Basic operations on 2-dimensional vectors.  This special case is useful  *
+* because nearly all operations are performed inline.                      *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    05/22/98    Added TimedVec2, extending Vec2.                *
+*      arvo    06/17/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1999, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __VEC2_INCLUDED__
+#define __VEC2_INCLUDED__
+
+#include <math.h>
+#include <iostream>
+#include "ArvoMath.h"
+
+namespace ArvoMath {
+
+	class Vec2;       // 2-D floating-point vector.
+	class TimedVec2;  // 2-D vector with a time stamp.
+	class Mat2x2;     // 2x2 floating-point matrix.
+
+	class Vec2 {  // Two float components with accessors; arithmetic lives in free functions.
+	public:
+		Vec2(                  ) : x( 0.0 ), y( 0.0 ) {}   // Zero vector.
+		Vec2( float a, float b ) : x( a   ), y( b   ) {}
+		Vec2( const Vec2 &A    ) : x( A.x ), y( A.y ) {}
+		~Vec2() {}
+		Vec2 &operator=( float s       ) { x = y = s;        return *this; }  // Both components become s.
+		Vec2 &operator=( const Vec2 &A ) { x = A.x; y = A.y; return *this; }
+		float  X() const { return x; }
+		float  Y() const { return y; }
+		float &X()       { return x; }
+		float &Y()       { return y; }
+		float  operator[]( int i ) const { return ( &x )[i]; }  // 0 selects x, 1 selects y (relies on x and y being adjacent).
+		float &operator[]( int i )       { return ( &x )[i]; }
+		Vec2  &Set( float a, float b ) { x = a;   y = b;   return *this; }
+		Vec2  &Set( const Vec2 &A    ) { x = A.x; y = A.y; return *this; }
+	public:
+		static const Vec2 Zero;
+		static const Vec2 Xaxis;
+		static const Vec2 Yaxis;
+	protected:
+		float x, y;
+	};
+
+	// This class simply adds a time field to the Vec2 class so that time-stamped
+	// coordinates can be easily inserted into objects such as Polylines.
+
+	class TimedVec2 : public Vec2 {
+	public:
+		TimedVec2() : time( 0 ) {}                                           // Zero vector stamped with time 0.
+		TimedVec2( const Vec2 &p   , long u = 0 ) : Vec2( p ),    time( u ) {}
+		TimedVec2( float x, float y, long u = 0 ) : Vec2( x, y ), time( u ) {}
+		~TimedVec2() {}
+		Vec2 &Coord()       { return *this; }   // The coordinate part, by reference.
+		Vec2  Coord() const { return *this; }   // The coordinate part, as a copy.
+		long  Time () const { return  time; }
+		void  SetTime( long u ) { time = u; }
+	protected:
+		long time;
+	};
+
+	class Mat2x2 {
+	public:
+		Mat2x2( ) { m[0][0] = m[0][1] = m[1][0] = m[1][1] = 0; }            // Zero matrix.
+		Mat2x2( float a, float b, float c, float d ) { Set( a, b, c, d ); }  // Entries given row by row.
+		Mat2x2( const Vec2 &c1, const Vec2 &c2 );                            // Built from two columns.
+		~Mat2x2( ) {}
+		Mat2x2 &operator*=( float scale );
+		Mat2x2  operator* ( float scale ) const;
+		void Set( float a, float b, float c, float d )  // First row is (a, b), second row is (c, d).
+		{ m[0][0] = a; m[0][1] = b; m[1][0] = c; m[1][1] = d; }
+		float  operator()( int i, int j ) const { return m[i][j]; }  // Row i, column j.
+		float &operator()( int i, int j )       { return m[i][j]; }
+	private:
+		float m[2][2];
+	};
+
+
+	//==========================================
+	//===  Miscellaneous external functions  ===                        
+	//==========================================
+
+	extern float Normalize( Vec2 &A );                      // Scales A to unit length in place; returns its prior length.
+	extern Vec2  Min ( const Vec2 &A, const Vec2 &B );      // Component-wise minimum.
+	extern Vec2  Max ( const Vec2 &A, const Vec2 &B );      // Component-wise maximum.
+
+
+	//==========================================
+	//===  Norm-related functions           ===                        
+	//==========================================
+
+	inline double LenSqr ( const Vec2 &A ) { return Sqr(A[0]) + Sqr(A[1]); }        // Squared Euclidean length.
+	inline double Len    ( const Vec2 &A ) { return sqrt( LenSqr( A ) ); }          // Euclidean length.
+	inline double OneNorm( const Vec2 &A ) { return Abs( A.X() ) + Abs( A.Y() ); }  // Sum of absolute components.
+	inline double TwoNorm( const Vec2 &A ) { return Len(A); }                       // Alias for Len().
+	inline float  SupNorm( const Vec2 &A ) { return MaxAbs( A.X(), A.Y() ); }       // Delegates to MaxAbs(); presumably the larger absolute component -- confirm in ArvoMath.h.
+
+
+	//==========================================
+	//===  Addition                          ===                        
+	//==========================================
+
+	inline Vec2 operator+( const Vec2 &A, const Vec2 &B )
+	{   // Component-wise sum, returned as a new vector.
+		return Vec2().Set( A.X() + B.X(), A.Y() + B.Y() );
+	}
+
+	inline Vec2& operator+=( Vec2 &A, const Vec2 &B )
+	{
+		// Accumulate B into A in place and hand A back so that calls can be
+		// chained.
+		return A.Set( A.X() + B.X(), A.Y() + B.Y() );
+	}
+
+
+	//==========================================
+	//===  Subtraction                       ===                        
+	//==========================================
+
+	inline Vec2 operator-( const Vec2 &A, const Vec2 &B )
+	{   // Component-wise difference A - B, returned as a new vector.
+		return Vec2().Set( A.X() - B.X(), A.Y() - B.Y() );
+	}
+
+	inline Vec2 operator-( const Vec2 &A )
+	{   // Negation: both components change sign.
+		return Vec2().Set( -A.X(), -A.Y() );
+	}
+
+	inline Vec2& operator-=( Vec2 &A, const Vec2 &B )
+	{
+		// Subtract B from A in place and hand A back so that calls can be
+		// chained.
+		return A.Set( A.X() - B.X(), A.Y() - B.Y() );
+	}
+
+
+	//==========================================
+	//===  Multiplication                    ===                        
+	//==========================================
+
+	inline Vec2 operator*( float c, const Vec2 &A )
+	{   // Scalar times vector.
+		return Vec2().Set( c * A.X(), c * A.Y() );
+	}
+
+	inline Vec2 operator*( const Vec2 &A, float c )
+	{   // Vector times scalar: same products as the scalar-first overload.
+		return c * A;
+	}
+
+	inline float operator*( const Vec2 &A, const Vec2 &B )  // Inner (dot) product
+	{
+		return A[0] * B[0] + A[1] * B[1];
+	}
+
+	inline Vec2& operator*=( Vec2 &A, float c )
+	{
+		// Scale A in place by c and hand A back so that calls can be
+		// chained.
+		return A.Set( A.X() * c, A.Y() * c );
+	}
+
+	//==========================================
+	//===  Division                          ===                        
+	//==========================================
+
+	inline Vec2 operator/( const Vec2 &A, float c )
+	{   // Each component divided by c (no check for c == 0).
+		return Vec2().Set( A.X() / c, A.Y() / c );
+	}
+
+	inline Vec2 operator/( const Vec2 &A, const Vec2 &B )
+	{   // A minus its projection onto B, i.e. the part of A orthogonal to B (no check for a zero B).
+		return A - (( A * B ) / LenSqr( B )) * B;
+	}
+
+
+	//==========================================
+	//===  Comparison                        ===                        
+	//==========================================
+
+	inline int operator==( const Vec2 &A, const Vec2 &B )
+	{   // Exact equality of both components.
+		return A.X() == B.X() ? A.Y() == B.Y() : 0;
+	}
+
+	inline int operator!=( const Vec2 &A, const Vec2 &B )
+	{   // True as soon as either component differs.
+		return A.X() != B.X() ? 1 : A.Y() != B.Y();
+	}
+
+	inline int operator<=( const Vec2 &A, const Vec2 &B )
+	{   // Holds only when BOTH components satisfy <= (a partial order).
+		return A.X() <= B.X() ? A.Y() <= B.Y() : 0;
+	}
+
+	inline int operator<( const Vec2 &A, const Vec2 &B )
+	{   // Holds only when BOTH components satisfy < (a partial order).
+		return A.X() < B.X() ? A.Y() < B.Y() : 0;
+	}
+
+	inline int operator>=( const Vec2 &A, const Vec2 &B )
+	{   // Holds only when BOTH components satisfy >= (a partial order).
+		return A.X() >= B.X() ? A.Y() >= B.Y() : 0;
+	}
+
+	inline int operator>( const Vec2 &A, const Vec2 &B )
+	{   // Holds only when BOTH components satisfy > (a partial order).
+		return A.X() > B.X() ? A.Y() > B.Y() : 0;
+	}
+
+	//==========================================
+	//===  Miscellaneous                     ===                        
+	//==========================================
+
+	inline float operator|( const Vec2 &A, const Vec2 &B )  // Inner product (same value as A * B)
+	{
+		return A.X() * B.X() + A.Y() * B.Y();
+	}
+
+	inline Vec2 Unit( const Vec2 &A )
+	{
+		const float sq    = LenSqr( A );                       // Squared length of A.
+		const float scale = sq > 0.0 ? 1.0 / sqrt( sq ) : sq;  // Inverse length; left as-is when not positive.
+		return scale * A;
+	}
+
+	inline Vec2 Unit( const Vec2 &A, float &len )
+	{
+		const float sq = LenSqr( A );
+		if( !( sq > 0.0 ) )  // No positive length: report 0 and return A unchanged.
+		{
+			len = 0.0;
+			return A;
+		}
+		len = sqrt( sq );
+		return A / len;
+	}
+
+	inline Vec2 Unit( float x, float y )
+	{   // Convenience form: unit vector in the direction (x, y).
+		return Unit( Vec2().Set( x, y ) );
+	}
+
+	inline double dist( const Vec2 &A, const Vec2 &B )
+	{   // Euclidean distance between the points A and B.
+		return sqrt( LenSqr( A - B ) );
+	}
+
+	inline float operator^( const Vec2 &A, const Vec2 &B )
+	{   // Scalar 2-D cross product: the determinant of the matrix whose rows are A and B.
+		return A[0] * B[1] - A[1] * B[0];
+	}
+
+	inline int Quadrant( const Vec2 &A )
+	{
+		const bool right = A.X() >= 0.0, upper = A.Y() >= 0.0;  // Zero counts as non-negative.
+		return upper ? ( right ? 1 : 2 ) : ( right ? 4 : 3 );
+	}
+
+	inline Vec2 OrthogonalTo( const Vec2 &A ) // A vector orthogonal to that given.
+	{   // (x, y) maps to (-y, x), which has the same length as A.
+		return Vec2().Set( -A.Y(), A.X() );
+	}
+
+	inline Vec2 Interpolate( const Vec2 &A, const Vec2 &B, float t )
+	{
+		// Point on the line through A and B: t = 0 gives A, t = 1 gives B.
+		const float s = 1.0 - t;
+		return s * A + t * B;
+	}
+
+	//==========================================
+	//===  Operations involving Matrices     ===                        
+	//==========================================
+
+	inline Mat2x2 Outer( const Vec2 &A, const Vec2 &B )  // Outer product.
+	{
+		// Entry (i, j) of the result is A[i] * B[j]; the four-argument
+		// constructor takes the entries row by row.
+		return Mat2x2(
+			A.X() * B.X(), A.X() * B.Y(),
+			A.Y() * B.X(), A.Y() * B.Y()
+			);
+	}
+
+	inline Vec2 operator*( const Mat2x2 &M, const Vec2 &A )
+	{
+		// Matrix-vector product: each result component is one row of M dotted with A.
+		const float row0 = M(0,0) * A.X() + M(0,1) * A.Y();
+		const float row1 = M(1,0) * A.X() + M(1,1) * A.Y();
+		return Vec2( row0, row1 );
+	}
+
+	inline Mat2x2 &Mat2x2::operator*=( float scale )
+	{
+		// Multiply every entry by scale, in place.
+		for( int i = 0; i < 2; i++ )
+			for( int j = 0; j < 2; j++ )
+				m[i][j] *= scale;
+		return *this;
+	}
+
+	inline Mat2x2 Mat2x2::operator*( float scale ) const
+	{
+		// Scale a copy and return it; this matrix itself is not modified.
+		Mat2x2 scaled( *this );
+		scaled *= scale;
+		return scaled;
+	}
+
+	inline Mat2x2 operator*( float scale, const Mat2x2 &M )
+	{   // Scalar-first form of matrix scaling; M itself is not modified.
+		return Mat2x2( M ) *= scale;
+	}
+
+	inline float Norm1( const Mat2x2 &A )  // Larger of the two absolute ROW sums of A.
+	{   // NOTE(review): the matrix 1-norm is conventionally the max COLUMN sum -- confirm the name is intended.
+		return Max( Abs(A(0,0)) + Abs(A(0,1)), Abs(A(1,0)) + Abs(A(1,1)) );
+	}
+
+	inline double det( const Mat2x2 &A )
+	{   // Determinant: main-diagonal product minus off-diagonal product.
+		return A(0,0) * A(1,1) - A(0,1) * A(1,0);
+	}
+
+	extern Vec2 Solve(  // Return solution x of the system Ax = b (Vec2::Zero if A is numerically singular).
+		const Mat2x2 &A, 
+		const Vec2 &b 
+		);
+
+	//==========================================
+	//===  Output routines                   ===                        
+	//==========================================
+
+	extern std::ostream &operator<<( std::ostream &out, const Vec2   & );  // Components on one line, " %9.5f" each.
+	extern std::ostream &operator<<( std::ostream &out, const Mat2x2 & );  // One line per row, then a blank line.
+};
+#endif
diff --git a/src/nvtt/bc7/arvo/Vec3.cpp b/src/nvtt/bc7/arvo/Vec3.cpp
new file mode 100644
index 0000000..1033f84
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Vec3.cpp
@@ -0,0 +1,119 @@
+/***************************************************************************
+* Vec3.C                                                                   *
+*                                                                          *
+* Basic operations on 3-dimensional vectors.  This special case is useful  *
+* because many operations are performed inline.                            *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    10/27/94    Reorganized (removed Col & Row distinction).    *
+*      arvo    06/14/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1994, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <stdio.h>
+#include <math.h>
+#include "ArvoMath.h"
+#include "Vec3.h"
+#include "form.h"
+
+namespace ArvoMath {
+
+	float Normalize( Vec3 &A )
+	{
+		const float length = Len( A );
+		if( !( length > 0.0 ) ) return length;  // Nothing to scale.
+		// Multiply by the reciprocal, held in double precision, rather than
+		// dividing each component separately.
+		const double inv = 1.0 / length;
+		A.X() *= inv;
+		A.Y() *= inv;
+		A.Z() *= inv;
+		return length;  // The length A had before scaling.
+	}
+
+	double Angle( const Vec3 &A, const Vec3 &B )
+	{
+		// Angle between A and B from the normalized dot product; 0 when either vector has no length.
+		const double len_product_sq = LenSqr(A) * LenSqr(B);
+		return len_product_sq <= 0.0 ? 0.0 : ArcCos( (A * B) / sqrt(len_product_sq) );
+	}
+
+	/*-------------------------------------------------------------------------*
+	* O R T H O N O R M A L                                                   *
+	*                                                                         *
+	* On Input  A, B....: Two linearly independent 3-space vectors.           *
+	*                                                                         *
+	* On Return A.......: Unit vector pointing in original A direction.       *
+	*           B.......: Unit vector orthogonal to A and in subspace spanned *
+	*                     by original A and B vectors.                        *
+	*           C.......: Unit vector orthogonal to both A and B, chosen so   *
+	*                     that A-B-C forms a right-handed coordinate system.  *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	int Orthonormal( Vec3 &A, Vec3 &B, Vec3 &C )
+	{
+		// Returns 0 on success, 1 if A or the reduced B turns out to have zero length.
+		int failed = ( Normalize( A ) == 0.0 );
+		if( !failed ) { B /= A; failed = ( Normalize( B ) == 0.0 ); }
+		if( !failed ) C = A ^ B;  // C is only written on success.
+		return failed;
+	}
+
+	int Orthonormal( Vec3 &A, Vec3 &B )
+	{
+		// Two-vector form: same steps as the three-vector version, without producing C.
+		if( Normalize( A ) == 0.0 ) return 1;
+		B /= A;
+		return Normalize( B ) == 0.0 ? 1 : 0;
+	}
+
+	/*-------------------------------------------------------------------------*
+	* O R T H O G O N A L  T O                                                *
+	*                                                                         *
+	* Returns a vector that is orthogonal to A (but of arbitrary length).     *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	Vec3 OrthogonalTo( const Vec3 &A )
+	{
+		const float half = 0.5 * SupNorm( A );  // A component at least this large is safe to build on.
+		if     ( half ==       0.0  ) return Vec3(    1.0,    0.0,    0.0 );  // Threshold is zero: fall back to a fixed axis.
+		else if( half <= Abs(A.X()) ) return Vec3( -A.Y(),  A.X(),    0.0 );  // Swap within the XY plane.
+		else if( half <= Abs(A.Y()) ) return Vec3(    0.0, -A.Z(),  A.Y() );  // Swap within the YZ plane.
+		else                          return Vec3(  A.Z(),    0.0, -A.X() );  // Otherwise use the ZX plane.
+	}
+
+	Vec3 Min( const Vec3 &A, const Vec3 &B )
+	{
+		// Component-wise minimum: each coordinate of the result is the smaller
+		// of the corresponding coordinates of A and B.
+		const Vec3 lowest( Min( A.X(), B.X() ), Min( A.Y(), B.Y() ), Min( A.Z(), B.Z() ) );
+		return lowest;
+	}
+
+	Vec3 Max( const Vec3 &A, const Vec3 &B )
+	{
+		// Component-wise maximum: each coordinate of the result is the larger
+		// of the corresponding coordinates of A and B.
+		const Vec3 highest( Max( A.X(), B.X() ), Max( A.Y(), B.Y() ), Max( A.Z(), B.Z() ) );
+		return highest;
+	}
+
+	std::ostream &operator<<( std::ostream &out, const Vec3 &A )
+	{
+		// Three components in " %9.5f" format; std::endl ends the line and flushes.
+		return out << form( " %9.5f %9.5f %9.5f", A.X(), A.Y(), A.Z() ) << std::endl;
+	}
+};
diff --git a/src/nvtt/bc7/arvo/Vec3.h b/src/nvtt/bc7/arvo/Vec3.h
new file mode 100644
index 0000000..b9d539f
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Vec3.h
@@ -0,0 +1,517 @@
+/***************************************************************************
+* Vec3.h                                                                   *
+*                                                                          *
+* Basic operations on 3-dimensional vectors.  This special case is useful  *
+* because many operations are performed inline.                            *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    10/27/94    Reorganized (removed Col & Row distinction).    *
+*      arvo    06/14/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1994, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __VEC3_INCLUDED__
+#define __VEC3_INCLUDED__
+
+#include <math.h>
+#include <iostream>
+#include "Vec2.h"
+
+namespace ArvoMath {
+
+	class Vec3 {  // Three floats (x, y, z); a single scalar broadcasts to all components.
+	public:
+		Vec3( float c = 0.0             ) : x( c ),   y( c ),   z( c )   {}
+		Vec3( float a, float b, float c ) : x( a ),   y( b ),   z( c )   {}
+		Vec3( const Vec3 &A             ) : x( A.x ), y( A.y ), z( A.z ) {}
+		void operator=( float c         ) { x = y = z = c; }
+		void operator=( const Vec3 &A   ) { x = A.x; y = A.y; z = A.z; }
+		void operator=( const Vec2 &A   ) { x = A.X();  y = A.Y();  z = 0.0; }  // z is cleared.
+		~Vec3() {}
+		float   X() const { return x; }  // Read-only component access.
+		float   Y() const { return y; }
+		float   Z() const { return z; }
+		float & X()       { return x; }  // Writable component access.
+		float & Y()       { return y; }
+		float & Z()       { return z; }
+		float   operator[]( int i ) const { return ( &x )[i]; }  // Indexes from &x; no range check.
+		float & operator[]( int i )       { return ( &x )[i]; }  // Relies on x, y, z being adjacent.
+	private:
+		float x, y, z;
+	};
+
+	//class Mat3x3 {
+	//public:
+	//	inline Mat3x3( );
+	//	Mat3x3( const Mat3x3 &M ) { *this = M; }
+	//	Mat3x3( const Vec3 &, const Vec3 &, const Vec3 & );  // Three columns.
+	//	~Mat3x3( ) {}
+	//	float    operator()( int i, int j ) const { return m[i][j]; }
+	//	float  & operator()( int i, int j )       { return m[i][j]; }
+	//	Mat3x3 & operator=( float          );
+	//	Mat3x3 & operator=( const Mat3x3 & );
+	//	inline   void ScaleRows( float, float, float );
+	//	inline   void ScaleCols( float, float, float );
+	//	void     Col( int n, const Vec3 & );
+	//	const    float *Base() const { return &(m[0][0]); }
+	//private:
+	//	float m[3][3];
+	//};
+
+	//class Mat4x4 {
+	//public:
+	//	Mat4x4( );
+	//	Mat4x4( const Mat4x4 &M ) { *this = M; }
+	//	Mat4x4( const Mat3x3 &M ) ;
+	//	~Mat4x4( ) {}
+	//	float    operator()( int i, int j ) const { return m[i][j]; }
+	//	float  & operator()( int i, int j )       { return m[i][j]; }
+	//	Mat4x4 & operator=( float          );
+	//	Mat4x4 & operator=( const Mat4x4 & );
+	//	void     Row( int i, int j, const Vec3 & );
+	//	void     Col( int i, int j, const Vec3 & );
+	//	void     ScaleRows( float, float, float, float );
+	//	void     ScaleCols( float, float, float, float );
+	//	const    float *Base() const { return &(m[0][0]); }
+	//private:
+	//	float m[4][4];
+	//};
+
+
+	//==========================================
+	//===  External operators                ===                        
+	//==========================================
+
+	//extern Vec3     operator * ( const Mat4x4 &, const Vec3   & );
+	//extern Vec3     operator * ( const Vec3   &, const Mat4x4 & );
+	//extern Mat3x3   operator * (        float  , const Mat3x3 & );
+	//extern Mat3x3   operator * ( const Mat3x3 &,       float    );
+	//extern Mat3x3   operator / ( const Mat3x3 &,       double   );
+	//extern Mat3x3 & operator *=(       Mat3x3 &,       float    );
+	//extern Mat3x3 & operator *=(       Mat3x3 &, const Mat3x3 & );
+	//extern Mat3x3   operator * ( const Mat3x3 &, const Mat3x3 & );
+	//extern Mat3x3   operator + ( const Mat3x3 &, const Mat3x3 & );
+	//extern Mat3x3 & operator +=(       Mat3x3 &, const Mat3x3 & );
+	//extern Mat3x3   operator - ( const Mat3x3 &, const Mat3x3 & );
+	//extern Mat3x3 & operator -=(       Mat3x3 &, const Mat3x3 & );
+	//extern Mat4x4   operator * (        float  , const Mat4x4 & );
+	//extern Mat4x4   operator * ( const Mat4x4 &,       float    );
+	//extern Mat4x4   operator / ( const Mat4x4 &,       float    );
+	//extern Mat4x4 & operator *=(       Mat4x4 &,       float    );
+	//extern Mat4x4   operator * ( const Mat4x4 &, const Mat4x4 & );
+	//extern Mat4x4   operator + ( const Mat4x4 &, const Mat4x4 & );
+	//extern Mat4x4 & operator +=(       Mat4x4 &, const Mat4x4 & );
+	//extern Mat4x4   operator - ( const Mat4x4 &, const Mat4x4 & );
+	//extern Mat4x4 & operator -=(       Mat4x4 &, const Mat4x4 & );
+
+
+	//==========================================
+	//===  Miscellaneous external functions  ===                        
+	//==========================================
+
+	//extern Vec3   OrthogonalTo( const Vec3   & ); // A vector orthogonal to that given.
+	//extern Vec3   Min         ( const Vec3   &, const Vec3 &         );
+	//extern Vec3   Max         ( const Vec3   &, const Vec3 &         );
+	//extern double Angle       ( const Vec3   &, const Vec3 &         );
+	//extern int    Orthonormal (       Vec3   &,       Vec3 &         );
+	//extern int    Orthonormal (       Vec3   &,       Vec3 &, Vec3 & );
+	//extern float  Trace       ( const Mat3x3 & );
+	//extern float  Normalize   (       Vec3   & );
+	//extern float  Norm1       ( const Mat3x3 & );
+	//extern float  SupNorm     ( const Mat3x3 & );
+	//extern double Determinant ( const Mat3x3 & );
+	//extern Mat3x3 Transp      ( const Mat3x3 & );
+	//extern Mat3x3 Householder ( const Vec3   &, const Vec3 & );
+	//extern Mat3x3 Householder ( const Vec3   & );
+	//extern Mat3x3 Rotation3x3 (       float, float, float ); // Values in [0,1].
+	//extern Mat3x3 Inverse     ( const Mat3x3 & );
+	//extern Mat3x3 Diag3x3     ( const Vec3   & );
+	//extern Mat3x3 Diag3x3     (       float, float, float );
+	//extern Mat3x3 Rotation3x3 ( const Vec3   &Axis,                     float angle );
+	//extern Mat4x4 Rotation4x4 ( const Vec3   &Axis, const Vec3 &Origin, float angle );
+
+
+	//==========================================
+	//===      Norm-related functions        ===                        
+	//==========================================
+
+	inline double LenSqr ( const Vec3 &A ) { return Sqr(A.X()) + Sqr(A.Y()) + Sqr(A.Z()); }  // Sum of squared components.
+	inline double Len    ( const Vec3 &A ) { return Sqrt( LenSqr( A ) ); }                    // Sqrt of LenSqr.
+	inline double Norm1  ( const Vec3 &A ) { return Abs(A.X()) + Abs(A.Y()) + Abs(A.Z()); }  // Sum of Abs of components.
+	inline double Norm2  ( const Vec3 &A ) { return Sqrt( LenSqr( A ) ); }                    // Same value as Len.
+	inline float  SupNorm( const Vec3 &A ) { return MaxAbs( A.X(), A.Y(), A.Z() ); }         // MaxAbs over the components.
+
+
+	//==========================================
+	//===            Addition                ===                        
+	//==========================================
+
+	inline Vec3 operator+( const Vec3 &A, const Vec3 &B )
+	{	// Component-wise sum, returned by value.
+		return Vec3( A[0] + B[0], A[1] + B[1], A[2] + B[2] );
+	}
+
+	inline Vec3& operator+=( Vec3 &A, const Vec3 &B )
+	{
+		// Add B into A component by component; A itself is returned.
+		for( int i = 0; i < 3; i++ )
+			A[i] += B[i];
+		return A;
+	}
+
+
+	//==========================================
+	//===            Subtraction             ===                        
+	//==========================================
+
+	inline Vec3 operator-( const Vec3 &A, const Vec3 &B )
+	{	// Component-wise difference A - B, returned by value.
+		return Vec3( A[0] - B[0], A[1] - B[1], A[2] - B[2] );
+	}
+
+	inline Vec3 operator-( const Vec3 &A )
+	{	// Unary minus: each component negated.
+		return Vec3( -A[0], -A[1], -A[2] );
+	}
+
+	inline Vec3& operator-=( Vec3 &A, const Vec3 &B )
+	{
+		// Subtract B from A component by component; A itself is returned.
+		for( int i = 0; i < 3; i++ )
+			A[i] -= B[i];
+		return A;
+	}
+
+
+	//==========================================
+	//===         Multiplication             ===                        
+	//==========================================
+
+	inline Vec3 operator*( float a, const Vec3 &x )
+	{	// Scalar times vector: every component scaled by a.
+		return Vec3( a * x[0], a * x[1], a * x[2] );
+	}
+
+	inline Vec3 operator*( const Vec3 &x, float a )
+	{	// Vector times scalar: forwards to the (float, Vec3) overload declared just above.
+		return a * x;
+	}
+
+	inline float operator*( const Vec3 &A, const Vec3 &B )  // Inner product.
+	{	// The three products are summed in x, y, z order.
+		return A[0] * B[0] + A[1] * B[1] + A[2] * B[2];
+	}
+
+	inline Vec3& operator*=( Vec3 &A, float a )
+	{
+		// Scale A in place by a; A itself is returned.
+		for( int i = 0; i < 3; i++ )
+			A[i] *= a;
+		return A;
+	}
+
+	//inline Vec3& operator*=( Vec3 &A, const Mat3x3 &M )  // A = M * A
+	//{
+	//	float x = M(0,0) * A.X() + M(0,1) * A.Y() + M(0,2) * A.Z();
+	//	float y = M(1,0) * A.X() + M(1,1) * A.Y() + M(1,2) * A.Z();
+	//	float z = M(2,0) * A.X() + M(2,1) * A.Y() + M(2,2) * A.Z();
+	//	A.X() = x;
+	//	A.Y() = y;
+	//	A.Z() = z;
+	//	return A;
+	//}
+
+	//inline Vec3& operator*=( Vec3 &A, const Mat4x4 &M )  // A = M * A
+	//{
+	//	float x = M(0,0) * A.X() + M(0,1) * A.Y() + M(0,2) * A.Z() + M(0,3);
+	//	float y = M(1,0) * A.X() + M(1,1) * A.Y() + M(1,2) * A.Z() + M(1,3);
+	//	float z = M(2,0) * A.X() + M(2,1) * A.Y() + M(2,2) * A.Z() + M(2,3);
+	//	A.X() = x;
+	//	A.Y() = y;
+	//	A.Z() = z;
+	//	return A;
+	//}
+
+
+	//==========================================
+	//===             Division               ===                        
+	//==========================================
+
+	inline Vec3 operator/( const Vec3 &A, double c )
+	{
+		const double inv = 1.0 / c;  // One divide, then three multiplies; c is not checked for zero.
+		return Vec3( inv * A[0], inv * A[1], inv * A[2] );
+	}
+
+	inline Vec3& operator/=( Vec3 &A, double a )
+	{
+		// Divide each component of A by a in place; a is not checked for zero.
+		for( int i = 0; i < 3; i++ )
+			A[i] /= a;
+		return A;
+	}
+
+	inline Vec3 operator/( const Vec3 &A, const Vec3 &B )  // Remove component parallel to B.
+	{
+		// Subtract from A its projection onto B; when LenSqr( B ) is not positive, A is returned as is.
+		const double lenSqrB = LenSqr( B );
+		if( !( lenSqrB > 0.0 ) ) return A;
+		return A - B * (( A * B ) / lenSqrB);
+	}
+
+	inline void operator/=( Vec3 &A, const Vec3 &B ) // Remove component parallel to B.
+	{
+		const double lenSqrB = LenSqr( B );  // When this is not positive, A is left untouched.
+		if( lenSqrB > 0.0 ) { A -= B * (( A * B ) / lenSqrB); }
+	}
+
+
+	//==========================================
+	//===          Miscellaneous             ===                        
+	//==========================================
+
+	inline float operator|( const Vec3 &A, const Vec3 &B )  // Inner product.
+	{	// Same sum of products that A * B evaluates, written out here.
+		return A.X() * B.X() + A.Y() * B.Y() + A.Z() * B.Z();
+	}
+
+	inline Vec3 Unit( const Vec3 &A )
+	{
+		const double lenSqr = LenSqr( A );  // A squared length that is not positive maps to the zero vector.
+		if( lenSqr > 0.0 ) return A / sqrt( lenSqr ); else return Vec3( 0, 0, 0 );
+	}
+
+	inline Vec3 Unit( float x, float y, float z )  // Convenience overload taking raw components.
+	{	// Packs the three scalars into a Vec3 and defers to Unit( const Vec3 & ).
+		return Unit( Vec3( x, y, z ) );
+	}
+
+	inline Vec3 Ortho( const Vec3 &A, const Vec3 &B )  // Unit vector along the part of A not parallel to B.
+	{	// A / B removes the component of A parallel to B; Unit then rescales the remainder.
+		return Unit( A / B );
+	}
+
+	inline int operator==( const Vec3 &A, float x )
+	{	// Nonzero only when all three components compare equal to x.
+		return ( A.X() == x ) && ( A.Y() == x ) && ( A.Z() == x );
+	}
+
+	inline Vec3 operator^( const Vec3 &A, const Vec3 &B )
+	{
+		const float cx = A.Y() * B.Z() - A.Z() * B.Y();  // Cross product A x B, one component per line.
+		const float cy = A.Z() * B.X() - A.X() * B.Z();
+		const float cz = A.X() * B.Y() - A.Y() * B.X();
+		return Vec3( cx, cy, cz );
+	}
+
+	inline double dist( const Vec3 &A, const Vec3 &B )  // Len of the difference A - B.
+	{	// Builds the temporary A - B and measures it with Len.
+		return Len( A - B ); 
+	}
+
+	inline double Dihedral( const Vec3 &A, const Vec3 &B, const Vec3 &C )  // B appears in both cross products.
+	{	// ArcCos of the inner product of the Unit vectors of A ^ B and C ^ B.
+		return ArcCos( Unit( A ^ B ) * Unit( C ^ B ) );
+	}
+
+	inline Vec3 operator>>( const Vec3 &A, const Vec3 &B )  // Project A onto B.
+	{
+		// B scaled by (A . B) / LenSqr( B ); the zero vector when LenSqr( B ) is not positive.
+		const double lenSqrB = LenSqr( B );
+		if( lenSqrB > 0.0 ) return B * (( A * B ) / lenSqrB);
+		return Vec3();
+	}
+
+	inline Vec3 operator<<( const Vec3 &A, const Vec3 &B ) // Project B onto A.
+	{	// Mirror image of operator>>: the same projection with the operands swapped.
+		return B >> A;
+	}
+
+	inline double Triple( const Vec3 &A, const Vec3 &B, const Vec3 &C )  // Scalar triple product.
+	{	// Cross product of A and B, then the inner product of that with C.
+		return ( A ^ B ) * C;
+	}
+
+
+	//==========================================
+	//===  Operations involving Matrices     ===                        
+	//==========================================
+
+	//inline Mat3x3 Outer( const Vec3 &A, const Vec3 &B )  // Outer product.
+	//{
+	//	Mat3x3 C;
+	//	C(0,0) = A.X() * B.X();
+	//	C(0,1) = A.X() * B.Y();
+	//	C(0,2) = A.X() * B.Z();
+	//	C(1,0) = A.Y() * B.X();
+	//	C(1,1) = A.Y() * B.Y();
+	//	C(1,2) = A.Y() * B.Z();
+	//	C(2,0) = A.Z() * B.X();
+	//	C(2,1) = A.Z() * B.Y();
+	//	C(2,2) = A.Z() * B.Z();
+	//	return C;
+	//}
+
+	//inline Vec3 operator*( const Mat3x3 &M, const Vec3 &A )
+	//{
+	//	return Vec3(
+	//		M(0,0) * A[0] + M(0,1) * A[1] + M(0,2) * A[2],
+	//		M(1,0) * A[0] + M(1,1) * A[1] + M(1,2) * A[2],
+	//		M(2,0) * A[0] + M(2,1) * A[1] + M(2,2) * A[2]);
+	//}
+
+	//inline Vec3 operator*( const Vec3 &A, const Mat3x3 &M )
+	//{
+	//	return Vec3( 
+	//		A[0] * M(0,0) + A[1] * M(1,0) + A[2] * M(2,0),
+	//		A[0] * M(0,1) + A[1] * M(1,1) + A[2] * M(2,1),
+	//		A[0] * M(0,2) + A[1] * M(1,2) + A[2] * M(2,2));
+	//}
+
+	////==========================================
+	////===      Operations on Matrices        ===                        
+	////==========================================
+
+	//inline Mat3x3 operator+( const Mat3x3 &A, const Mat3x3 &B )
+	//{
+	//	Mat3x3 C;
+	//	C(0,0) = A(0,0) + B(0,0);  C(0,1) = A(0,1) + B(0,1);  C(0,2) = A(0,2) + B(0,2);
+	//	C(1,0) = A(1,0) + B(1,0);  C(1,1) = A(1,1) + B(1,1);  C(1,2) = A(1,2) + B(1,2);
+	//	C(2,0) = A(2,0) + B(2,0);  C(2,1) = A(2,1) + B(2,1);  C(2,2) = A(2,2) + B(2,2);
+	//	return C;
+	//}
+
+	//inline Mat3x3 operator-( const Mat3x3 &A, const Mat3x3 &B )
+	//{
+	//	Mat3x3 C;
+	//	C(0,0) = A(0,0) - B(0,0);  C(0,1) = A(0,1) - B(0,1);  C(0,2) = A(0,2) - B(0,2);
+	//	C(1,0) = A(1,0) - B(1,0);  C(1,1) = A(1,1) - B(1,1);  C(1,2) = A(1,2) - B(1,2);
+	//	C(2,0) = A(2,0) - B(2,0);  C(2,1) = A(2,1) - B(2,1);  C(2,2) = A(2,2) - B(2,2);
+	//	return C;
+	//}
+
+	//inline Mat3x3 operator*( const Mat3x3 &A, const Mat3x3 &B )
+	//{
+	//	Mat3x3 C;
+	//	C(0,0) = A(0,0) * B(0,0) + A(0,1) * B(1,0) + A(0,2) * B(2,0);
+	//	C(0,1) = A(0,0) * B(0,1) + A(0,1) * B(1,1) + A(0,2) * B(2,1);
+	//	C(0,2) = A(0,0) * B(0,2) + A(0,1) * B(1,2) + A(0,2) * B(2,2);
+	//	C(1,0) = A(1,0) * B(0,0) + A(1,1) * B(1,0) + A(1,2) * B(2,0);
+	//	C(1,1) = A(1,0) * B(0,1) + A(1,1) * B(1,1) + A(1,2) * B(2,1);
+	//	C(1,2) = A(1,0) * B(0,2) + A(1,1) * B(1,2) + A(1,2) * B(2,2);
+	//	C(2,0) = A(2,0) * B(0,0) + A(2,1) * B(1,0) + A(2,2) * B(2,0);
+	//	C(2,1) = A(2,0) * B(0,1) + A(2,1) * B(1,1) + A(2,2) * B(2,1);
+	//	C(2,2) = A(2,0) * B(0,2) + A(2,1) * B(1,2) + A(2,2) * B(2,2);
+	//	return C;
+	//}
+
+	//inline void Mat3x3::ScaleRows( float a, float b, float c )
+	//{
+	//	m[0][0] *= a;  m[0][1] *= a;  m[0][2] *= a;
+	//	m[1][0] *= b;  m[1][1] *= b;  m[1][2] *= b;
+	//	m[2][0] *= c;  m[2][1] *= c;  m[2][2] *= c;
+	//}
+
+	//inline void Mat3x3::ScaleCols( float a, float b, float c )
+	//{
+	//	m[0][0] *= a;  m[0][1] *= b;  m[0][2] *= c;
+	//	m[1][0] *= a;  m[1][1] *= b;  m[1][2] *= c;
+	//	m[2][0] *= a;  m[2][1] *= b;  m[2][2] *= c;
+	//}
+
+
+	//==========================================
+	//===       Special Matrices             ===                        
+	//==========================================
+
+	//inline Mat3x3::Mat3x3() 
+	//{
+	//	m[0][0] = 0;  m[0][1] = 0;  m[0][2] = 0;
+	//	m[1][0] = 0;  m[1][1] = 0;  m[1][2] = 0;
+	//	m[2][0] = 0;  m[2][1] = 0;  m[2][2] = 0; 
+	//}
+
+	//inline Mat3x3 Ident3x3()
+	//{
+	//	Mat3x3 I;
+	//	I(0,0) = 1.0;
+	//	I(1,1) = 1.0;
+	//	I(2,2) = 1.0;
+	//	return I;
+	//}
+
+	//inline Mat4x4 Ident4x4()
+	//{
+	//	Mat4x4 I;
+	//	I(0,0) = 1.0;
+	//	I(1,1) = 1.0;
+	//	I(2,2) = 1.0;
+	//	I(3,3) = 1.0;
+	//	return I;
+	//}
+
+	//inline void Adjoint( const Mat3x3 &M, Mat3x3 &A )
+	//{
+	//	A(0,0) = M(1,1) * M(2,2) - M(1,2) * M(2,1);
+	//	A(0,1) = M(1,2) * M(2,0) - M(1,0) * M(2,2);
+	//	A(0,2) = M(1,0) * M(2,1) - M(1,1) * M(2,0);
+
+	//	A(1,0) = M(0,2) * M(2,1) - M(0,1) * M(2,2);
+	//	A(1,1) = M(0,0) * M(2,2) - M(0,2) * M(2,0);
+	//	A(1,2) = M(0,1) * M(2,0) - M(0,0) * M(2,1);
+
+	//	A(2,0) = M(0,1) * M(1,2) - M(0,2) * M(1,1);
+	//	A(2,1) = M(0,2) * M(1,0) - M(0,0) * M(1,2);
+	//	A(2,2) = M(0,0) * M(1,1) - M(0,1) * M(1,0);
+	//}
+
+	//inline void TranspAdjoint( const Mat3x3 &M, Mat3x3 &A )
+	//{
+	//	A(0,0) = M(1,1) * M(2,2) - M(1,2) * M(2,1);
+	//	A(1,0) = M(1,2) * M(2,0) - M(1,0) * M(2,2);
+	//	A(2,0) = M(1,0) * M(2,1) - M(1,1) * M(2,0);
+
+	//	A(0,1) = M(0,2) * M(2,1) - M(0,1) * M(2,2);
+	//	A(1,1) = M(0,0) * M(2,2) - M(0,2) * M(2,0);
+	//	A(2,1) = M(0,1) * M(2,0) - M(0,0) * M(2,1);
+
+	//	A(0,2) = M(0,1) * M(1,2) - M(0,2) * M(1,1);
+	//	A(1,2) = M(0,2) * M(1,0) - M(0,0) * M(1,2);
+	//	A(2,2) = M(0,0) * M(1,1) - M(0,1) * M(1,0);
+	//}
+
+	//inline void Adjoint( const Mat3x3 &M, Mat3x3 &A, double &det )
+	//{
+	//	Adjoint( M, A );
+	//	det = A(0,0) * M(0,0) + A(1,0) * M(1,0) + A(2,0) * M(2,0);
+	//}
+
+	//inline void TranspAdjoint( const Mat3x3 &M, Mat3x3 &A, double &det )
+	//{
+	//	TranspAdjoint( M, A );
+	//	det = A(0,0) * M(0,0) + A(0,1) * M(1,0) + A(0,2) * M(2,0);
+	//}
+
+
+	//==========================================
+	//===  Output routines                   ===                        
+	//==========================================
+
+	extern std::ostream &operator<<( std::ostream &out, const Vec3   & );
+	//extern std::ostream &operator<<( std::ostream &out, const Mat3x3 & );
+	//extern std::ostream &operator<<( std::ostream &out, const Mat4x4 & );
+};
+#endif
diff --git a/src/nvtt/bc7/arvo/Vec4.cpp b/src/nvtt/bc7/arvo/Vec4.cpp
new file mode 100644
index 0000000..286a203
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Vec4.cpp
@@ -0,0 +1,79 @@
+/***************************************************************************
+* Vec4.C                                                                   *
+*                                                                          *
+* Basic operations on 4-dimensional vectors.  This special case is useful  *
+* because many operations are performed inline.                            *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      walt    6/26/07     Edited Vec3 to make this new class              *
+*      arvo    10/27/94    Reorganized (removed Col & Row distinction).    *
+*      arvo    06/14/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1994, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <stdio.h>
+#include <math.h>
+#include "ArvoMath.h"
+#include "Vec4.h"
+#include "form.h"
+
+namespace ArvoMath {
+
+	float Normalize( Vec4 &A )
+	{
+		// Rescale A to unit length in place and hand back the length it had beforehand.
+		// If that length (held as a float) is not positive, A is left exactly as it was.
+		const float length = Len( A );
+		if( !( length > 0.0 ) ) return length;
+		const double scale = 1.0 / length;
+		for( int i = 0; i < 4; i++ )
+		{
+			A[i] *= scale;
+		}
+		return length;
+	}
+
+	double Angle( const Vec4 &A, const Vec4 &B )
+	{	// ArcCos of (A . B) / sqrt( LenSqr(A) * LenSqr(B) ); 0.0 when that product is <= 0.
+		const double lenSqrProd = LenSqr(A) * LenSqr(B);
+		if( !( lenSqrProd <= 0.0 ) ) return ArcCos( (A * B) / sqrt(lenSqrProd) );
+		return 0.0;
+	}
+
+	Vec4 Min( const Vec4 &A, const Vec4 &B )
+	{
+		const float lowX = Min( A.X(), B.X() );  // Scalar Min, one component at a time.
+		const float lowY = Min( A.Y(), B.Y() );
+		const float lowZ = Min( A.Z(), B.Z() );
+		const float lowW = Min( A.W(), B.W() );
+		return Vec4( lowX, lowY, lowZ, lowW );
+	}
+
+	Vec4 Max( const Vec4 &A, const Vec4 &B )
+	{
+		const float highX = Max( A.X(), B.X() );  // Scalar Max, one component at a time.
+		const float highY = Max( A.Y(), B.Y() );
+		const float highZ = Max( A.Z(), B.Z() );
+		const float highW = Max( A.W(), B.W() );
+		return Vec4( highX, highY, highZ, highW );
+	}
+
+	std::ostream &operator<<( std::ostream &out, const Vec4 &A )
+	{
+		// One row of four %9.5f fields, then std::endl (newline plus flush).
+		return out << form( " %9.5f %9.5f %9.5f %9.5f", A.X(), A.Y(), A.Z(), A.W() ) << std::endl;
+	}
+};
diff --git a/src/nvtt/bc7/arvo/Vec4.h b/src/nvtt/bc7/arvo/Vec4.h
new file mode 100644
index 0000000..efe1f3f
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Vec4.h
@@ -0,0 +1,238 @@
+/***************************************************************************
+* Vec4.h                                                                   *
+*                                                                          *
+* Basic operations on 4-dimensional vectors.  This special case is useful  *
+* because many operations are performed inline.                            *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      walt    6/26/07     Edited Vec3 to make this new class              *
+*      arvo    10/27/94    Reorganized (removed Col & Row distinction).    *
+*      arvo    06/14/93    Initial coding.                                 *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 1994, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __Vec4_INCLUDED__
+#define __Vec4_INCLUDED__
+
+#include <math.h>
+#include <iostream>
+#include "Vec2.h"
+#include "Vec3.h"
+
+namespace ArvoMath {
+
+	class Vec4 {  // Four floats (x, y, z, w); a single scalar broadcasts to all components.
+	public:
+		Vec4( float c = 0.0                      ) : x( c ),   y( c ),   z( c ),   w( c )   {}
+		Vec4( float a, float b, float c, float d ) : x( a ),   y( b ),   z( c ),   w( d )   {}
+		Vec4( const Vec4 &A                      ) : x( A.x ), y( A.y ), z( A.z ), w( A.w ) {}
+		Vec4( const Vec3 &A, float d             ) : x( A.X() ), y( A.Y() ), z( A.Z() ), w( d ) {}
+		void operator=( float c         ) { x = y = z = w = c; }
+		void operator=( const Vec4 &A   ) { x = A.x; y = A.y; z = A.z; w = A.w; }
+		void operator=( const Vec3 &A   ) { x = A.X();  y = A.Y();  z = A.Z();  w = 0.0; }  // w is cleared.
+		void operator=( const Vec2 &A   ) { x = A.X();  y = A.Y();  z = 0.0;    w = 0.0; }  // z and w are cleared.
+		~Vec4() {}
+		float   X() const { return x; }  // Read-only component access.
+		float   Y() const { return y; }
+		float   Z() const { return z; }
+		float   W() const { return w; }
+		float & X()       { return x; }  // Writable component access.
+		float & Y()       { return y; }
+		float & Z()       { return z; }
+		float & W()       { return w; }
+		float   operator[]( int i ) const { return ( &x )[i]; }  // Indexes from &x; no range check.
+		float & operator[]( int i )       { return ( &x )[i]; }  // Relies on x, y, z, w being adjacent.
+	private:
+		float x, y, z, w;
+	};
+
+	//==========================================
+	//===      Norm-related functions        ===                        
+	//==========================================
+
+	inline double LenSqr ( const Vec4 &A ) { return Sqr(A.X()) + Sqr(A.Y()) + Sqr(A.Z()) + Sqr(A.W()); }  // Sum of squared components.
+	inline double Len    ( const Vec4 &A ) { return Sqrt( LenSqr( A ) ); }                                 // Sqrt of LenSqr.
+	inline double Norm1  ( const Vec4 &A ) { return Abs(A.X()) + Abs(A.Y()) + Abs(A.Z()) + Abs(A.W()); }  // Sum of Abs of components.
+	inline double Norm2  ( const Vec4 &A ) { return Sqrt( LenSqr( A ) ); }                                 // Same value as Len.
+	inline float  SupNorm( const Vec4 &A ) { return MaxAbs( A.X(), A.Y(), A.Z(), A.W() ); }               // MaxAbs over the components.
+
+
+	//==========================================
+	//===            Addition                ===                        
+	//==========================================
+
+	inline Vec4 operator+( const Vec4 &A, const Vec4 &B )
+	{	// Component-wise sum, returned by value.
+		return Vec4( A[0] + B[0], A[1] + B[1], A[2] + B[2], A[3] + B[3] );
+	}
+
+	inline Vec4& operator+=( Vec4 &A, const Vec4 &B )
+	{
+		for( int i = 0; i < 4; i++ )
+		{
+			A[i] += B[i];  // Add B into A, one component per pass.
+		}
+		return A;
+	}
+
+
+	//==========================================
+	//===            Subtraction             ===                        
+	//==========================================
+
+	inline Vec4 operator-( const Vec4 &A, const Vec4 &B )
+	{	// Component-wise difference A - B, returned by value.
+		return Vec4( A[0] - B[0], A[1] - B[1], A[2] - B[2], A[3] - B[3] );
+	}
+
+	inline Vec4 operator-( const Vec4 &A )
+	{	// Unary minus: each component negated.
+		return Vec4( -A[0], -A[1], -A[2], -A[3] );
+	}
+
+	inline Vec4& operator-=( Vec4 &A, const Vec4 &B )
+	{
+		for( int i = 0; i < 4; i++ )
+		{
+			A[i] -= B[i];  // Subtract B from A, one component per pass.
+		}
+		return A;
+	}
+
+
+	//==========================================
+	//===         Multiplication             ===                        
+	//==========================================
+
+	inline Vec4 operator*( float a, const Vec4 &x )
+	{	// Scalar times vector: every component scaled by a.
+		return Vec4( a * x[0], a * x[1], a * x[2], a * x[3] );
+	}
+
+	inline Vec4 operator*( const Vec4 &x, float a )
+	{	// Vector times scalar: forwards to the (float, Vec4) overload declared just above.
+		return a * x;
+	}
+
+	inline float operator*( const Vec4 &A, const Vec4 &B )  // Inner product.
+	{	// The four products are summed in x, y, z, w order.
+		return A[0] * B[0] + A[1] * B[1] + A[2] * B[2] + A[3] * B[3];
+	}
+
+	inline Vec4& operator*=( Vec4 &A, float a )
+	{
+		for( int i = 0; i < 4; i++ )
+		{
+			A[i] *= a;  // Scale A in place, one component per pass.
+		}
+		return A;
+	}
+
+	//==========================================
+	//===             Division               ===                        
+	//==========================================
+
+	inline Vec4 operator/( const Vec4 &A, double c )
+	{
+		const double inv = 1.0 / c;  // One divide, then four multiplies; c is not checked for zero.
+		return Vec4( inv * A[0], inv * A[1], inv * A[2], inv * A[3] );
+	}
+
+	inline Vec4& operator/=( Vec4 &A, double a )
+	{
+		for( int i = 0; i < 4; i++ )
+		{
+			A[i] /= a;  // Divide A in place; a is not checked for zero.
+		}
+		return A;
+	}
+
+	inline Vec4 operator/( const Vec4 &A, const Vec4 &B )  // Remove component parallel to B.
+	{
+		// Subtract from A its projection onto B; when LenSqr( B ) is not positive, A is returned as is.
+		const double lenSqrB = LenSqr( B );
+		if( !( lenSqrB > 0.0 ) ) return A;
+		return A - B * (( A * B ) / lenSqrB);
+	}
+
+	inline void operator/=( Vec4 &A, const Vec4 &B ) // Remove component parallel to B.
+	{
+		const double lenSqrB = LenSqr( B );  // When this is not positive, A is left untouched.
+		if( lenSqrB > 0.0 ) { A -= B * (( A * B ) / lenSqrB); }
+	}
+
+
+	//==========================================
+	//===          Miscellaneous             ===                        
+	//==========================================
+
+	inline float operator|( const Vec4 &A, const Vec4 &B )  // Inner product.
+	{	// Same sum of products that A * B evaluates, written out here.
+		return A.X() * B.X() + A.Y() * B.Y() + A.Z() * B.Z() + A.W() * B.W();
+	}
+
+	inline Vec4 Unit( const Vec4 &A )
+	{
+		const double lenSqr = LenSqr( A );  // A squared length that is not positive maps to the zero vector.
+		if( lenSqr > 0.0 ) return A / sqrt( lenSqr ); else return Vec4( 0, 0, 0, 0 );
+	}
+
+	inline Vec4 Unit( float x, float y, float z, float w )  // Convenience overload taking raw components.
+	{	// Packs the four scalars into a Vec4 and defers to Unit( const Vec4 & ).
+		return Unit( Vec4( x, y, z, w ) );
+	}
+
+	inline Vec4 Ortho( const Vec4 &A, const Vec4 &B )  // Unit vector along the part of A not parallel to B.
+	{	// A / B removes the component of A parallel to B; Unit then rescales the remainder.
+		return Unit( A / B );
+	}
+
+	inline int operator==( const Vec4 &A, float x )
+	{	// Nonzero only when all four components compare equal to x.
+		return ( A.X() == x ) && ( A.Y() == x ) && ( A.Z() == x ) && ( A.W() == x );
+	}
+
+//	inline Vec4 operator^( const Vec4 &A, const Vec4 &B ) there is no 4D "cross product" of 2 4D vectors -- we need six dimensions
+
+	inline double dist( const Vec4 &A, const Vec4 &B )  // Len of the difference A - B.
+	{	// Builds the temporary A - B and measures it with Len.
+		return Len( A - B ); 
+	}
+
+//	inline double Dihedral( const Vec4 &A, const Vec4 &B, const Vec4 &C )
+
+	inline Vec4 operator>>( const Vec4 &A, const Vec4 &B )  // Project A onto B.
+	{
+		// B scaled by (A . B) / LenSqr( B ); the zero vector when LenSqr( B ) is not positive.
+		const double lenSqrB = LenSqr( B );
+		if( lenSqrB > 0.0 ) return B * (( A * B ) / lenSqrB);
+		return Vec4();
+	}
+
+	inline Vec4 operator<<( const Vec4 &A, const Vec4 &B ) // Project B onto A.
+	{	// Mirror image of operator>>: the same projection with the operands swapped.
+		return B >> A;
+	}
+
+//	inline double Triple( const Vec4 &A, const Vec4 &B, const Vec4 &C )
+
+	//==========================================
+	//===  Output routines                   ===                        
+	//==========================================
+
+	extern std::ostream &operator<<( std::ostream &out, const Vec4   & );
+};
+#endif
diff --git a/src/nvtt/bc7/arvo/Vector.cpp b/src/nvtt/bc7/arvo/Vector.cpp
new file mode 100644
index 0000000..af3bc11
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Vector.cpp
@@ -0,0 +1,366 @@
+/***************************************************************************
+* Vector.C                                                                 *
+*                                                                          *
+* General Vector and Matrix classes, with all the associated methods.      *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    08/16/2000    Revamped for CIT tools.                       *
+*      arvo    10/31/1994    Combined RowVec & ColVec into Vector.         *
+*      arvo    06/30/1993    Added singular value decomposition class.     *
+*      arvo    06/25/1993    Major revisions.                              *
+*      arvo    09/08/1991    Initial implementation.                       *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#include <iostream>
+#include <assert.h>
+#include "ArvoMath.h"
+#include "Vector.h"
+#include "form.h"
+
+namespace ArvoMath {
+
+	// Shared constant vector constructed with size 0.
+	const Vector Vector::Null(0);
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  C O N S T R U C T O R S                                                *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	// Builds an n-element vector whose entries are copied from x[0..n-1].
+	Vector::Vector( const float *x, int n )
+	{
+		Create( n );
+		int k = 0;
+		while( k < size )
+		{
+			elem[k] = x[k];
+			++k;
+		}
+	}
+
+	// Copy constructor: allocates fresh storage and duplicates every element of A.
+	Vector::Vector( const Vector &A )
+	{
+		const int count = A.Size();
+		Create( count );
+		for( int k = 0; k < count; ++k ) elem[k] = A(k);
+	}
+
+	// Builds an n-element vector with every entry set to zero.
+	Vector::Vector( int n )
+	{
+		Create( n );
+		int k = n;
+		while( k-- > 0 ) elem[k] = 0.0f;
+	}
+
+	// Builds the 2-element vector (x, y).
+	Vector::Vector( float x, float y )
+	{
+		Create( 2 );
+		float *slot = elem;
+		*slot++ = x;
+		*slot   = y;
+	}
+
+	// Builds the 3-element vector (x, y, z).
+	Vector::Vector( float x, float y, float z )
+	{
+		Create( 3 );
+		float *slot = elem;
+		*slot++ = x;
+		*slot++ = y;
+		*slot   = z;
+	}
+
+	// Resizes the vector to new_size elements. When the size actually changes
+	// the old contents are discarded and every element is set to zero; when
+	// new_size equals the current size nothing happens.
+	//
+	// The replacement buffer is allocated before the old one is released, so
+	// a failed allocation leaves the vector untouched instead of leaving elem
+	// pointing at freed memory.
+	void Vector::SetSize( int new_size )
+	{
+		if( size == new_size ) return;
+		assert( new_size >= 0 );
+		float *fresh = new float[new_size];
+		for( int i = 0; i < new_size; i++ ) fresh[i] = 0.0f;
+		delete[] elem;
+		elem = fresh;
+		size = new_size;
+	}
+
+	// Exchanges elements i and j (indices are not checked) and returns *this.
+	Vector &Vector::Swap( int i, int j )
+	{
+		const float held = elem[j];
+		elem[j] = elem[i];
+		elem[i] = held;
+		return *this;
+	}
+
+	// Returns a copy of elements i..j (both inclusive) as a new vector of
+	// j - i + 1 elements. The index range is checked by assert only.
+	Vector Vector::GetBlock( int i, int j ) const
+	{
+		assert( 0 <= i && i <= j && j < size );
+		const int count = j - i + 1;
+		Vector block( count );
+		for( int k = 0; k < count; ++k ) block(k) = elem[i + k];
+		return block;
+	}
+
+	// Overwrites elements i..j (both inclusive) with the contents of V, which
+	// must have exactly j - i + 1 elements. Both conditions are assert-checked.
+	void Vector::SetBlock( int i, int j, const Vector &V )
+	{
+		assert( 0 <= i && i <= j && j < size );
+		const int count = j - i + 1;
+		assert( count == V.Size() );
+		for( int k = 0; k < count; ++k ) elem[i + k] = V(k);
+	}
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  O P E R A T O R S                                                      *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	// Inner product of A and B, accumulated in double precision.
+	// Both vectors must have the same size (assert-checked). The sum starts
+	// from zero, so two empty vectors yield 0 instead of reading element 0 of
+	// a zero-length array.
+	double operator*( const Vector &A, const Vector &B )
+	{
+		assert( A.Size() == B.Size() );
+		double sum = 0.0;
+		for( int i = 0; i < A.Size(); i++ ) sum += A(i) * B(i);
+		return sum;
+	}
+
+	// Fills every element with the constant c; the size is unchanged.
+	void Vector::operator=( float c )
+	{
+		int k = 0;
+		while( k < size )
+		{
+			elem[k] = c;
+			++k;
+		}
+	}
+
+	// Returns a new vector holding every element of A multiplied by s.
+	Vector operator*( const Vector &A, float s )
+	{
+		const int count = A.Size();
+		Vector scaled( count );
+		for( int k = 0; k < count; ++k ) scaled(k) = A(k) * s;
+		return scaled;
+	}
+
+	// Scalar-on-the-left form of scaling: same result as A * s.
+	Vector operator*( float s, const Vector &A )
+	{
+		const int count = A.Size();
+		Vector scaled( count );
+		int k = 0;
+		while( k < count )
+		{
+			scaled(k) = A(k) * s;
+			++k;
+		}
+		return scaled;
+	}
+
+	// Returns a new vector holding every element of A divided by s.
+	// A zero divisor is rejected by assert.
+	Vector operator/( const Vector &A, float s )
+	{
+		assert( s != 0.0 );
+		const int count = A.Size();
+		Vector quotient( count );
+		for( int k = 0; k < count; ++k ) quotient(k) = A(k) / s;
+		return quotient;
+	}
+
+	// Adds B to A element by element, in place, and returns A.
+	// The sizes must match (assert-checked).
+	Vector& operator+=( Vector &A, const Vector &B )
+	{
+		assert( A.Size() == B.Size() );
+		int k = 0;
+		while( k < A.Size() )
+		{
+			A(k) += B(k);
+			++k;
+		}
+		return A;
+	}
+
+	// Multiplies every element of A by scale, in place, and returns A.
+	Vector& operator*=( Vector &A, float scale )
+	{
+		int k = 0;
+		while( k < A.Size() )
+		{
+			A(k) *= scale;
+			++k;
+		}
+		return A;
+	}
+
+	// Divides every element of A by scale, in place, and returns A.
+	// A zero divisor is rejected by assert, matching the check already made by
+	// the non-mutating operator/( const Vector &, float ).
+	Vector& operator/=( Vector &A, float scale )
+	{
+		assert( scale != 0.0 );
+		for( int i = 0; i < A.Size(); i++ ) A(i) /= scale;
+		return A;
+	}
+
+	// Copy assignment: takes on the size of A, then copies its elements.
+	Vector& Vector::operator=( const Vector &A )
+	{
+		SetSize( A.Size() );
+		int k = 0;
+		while( k < size )
+		{
+			elem[k] = A(k);
+			++k;
+		}
+		return *this;
+	}
+
+	// Element-wise sum of two equally sized vectors (size is assert-checked).
+	Vector operator+( const Vector &A, const Vector &B )
+	{
+		assert( A.Size() == B.Size() );
+		const int count = A.Size();
+		Vector total( count );
+		for( int k = 0; k < count; ++k ) total(k) = A(k) + B(k);
+		return total;
+	}
+
+	// Element-wise difference A - B of two equally sized vectors
+	// (size is assert-checked).
+	Vector operator-( const Vector &A, const Vector &B )
+	{
+		assert( A.Size() == B.Size() );
+		const int count = A.Size();
+		Vector delta( count );
+		for( int k = 0; k < count; ++k ) delta(k) = A(k) - B(k);
+		return delta;
+	}
+
+	// Unary minus: returns a new vector with every element of A negated.
+	Vector operator-( const Vector &A )
+	{
+		const int count = A.Size();
+		Vector negated( count );
+		for( int k = 0; k < count; ++k ) negated(k) = -A(k);
+		return negated;
+	}
+
+	// Cross product, always returned as a 3-vector. Two 2-vectors are treated
+	// as lying in the z = 0 plane, so only the z component can be non-zero.
+	// Any other size than 2 or 3 is rejected by assert.
+	Vector operator^( const Vector &A, const Vector &B )
+	{
+		Vector cross(3);
+		assert( A.Size() == B.Size() );
+		if( A.Size() == 2 )
+		{
+			// Assume z components of A and B are zero.
+			cross(0) = 0.0;
+			cross(1) = 0.0;
+			cross(2) = A(0) * B(1) - A(1) * B(0);
+			return cross;
+		}
+		assert( A.Size() == 3 );
+		cross(0) = A(1) * B(2) - A(2) * B(1);
+		cross(1) = A(2) * B(0) - A(0) * B(2);
+		cross(2) = A(0) * B(1) - A(1) * B(0);
+		return cross;
+	}
+
+	/*-------------------------------------------------------------------------*
+	*                                                                         *
+	*  M I S C E L L A N E O U S   F U N C T I O N S                          *
+	*                                                                         *
+	*-------------------------------------------------------------------------*/
+	// Element-wise minimum of two equally sized vectors (size is assert-checked).
+	Vector Min( const Vector &A, const Vector &B )
+	{
+		assert( A.Size() == B.Size() );
+		const int count = A.Size();
+		Vector lowest( count );
+		for( int k = 0; k < count; ++k ) lowest(k) = Min( A(k), B(k) );
+		return lowest;
+	}
+
+	// Element-wise maximum of two equally sized vectors (size is assert-checked).
+	Vector Max( const Vector &A, const Vector &B )
+	{
+		assert( A.Size() == B.Size() );
+		const int count = A.Size();
+		Vector highest( count );
+		for( int k = 0; k < count; ++k ) highest(k) = Max( A(k), B(k) );
+		return highest;
+	}
+
+	// Returns A scaled by the reciprocal of its two-norm.
+	// A non-positive norm is rejected by assert.
+	Vector Unit( const Vector &A )
+	{
+		const double length = TwoNorm( A );
+		assert( length > 0.0 );
+		const double reciprocal = 1.0 / length;
+		return A * reciprocal;
+	}
+
+	// Divides A in place by its two-norm and returns the norm it had on entry.
+	// A non-positive norm is rejected by assert.
+	double Normalize( Vector &A )
+	{
+		const double length = TwoNorm( A );
+		assert( length > 0.0 );
+		int k = 0;
+		while( k < A.Size() )
+		{
+			A(k) /= length;
+			++k;
+		}
+		return length;
+	}
+
+	// Returns 1 when A has no elements, otherwise 0.
+	int Null( const Vector &A )
+	{
+		if( A.Size() == 0 ) return 1;
+		return 0;
+	}
+
+	// Sum of the squares of the elements of A, accumulated in double precision.
+	// The sum starts from zero, so an empty vector yields 0 instead of reading
+	// element 0 of a zero-length array.
+	double TwoNormSqr( const Vector &A )
+	{
+		double sum = 0.0;
+		for( int i = 0; i < A.Size(); i++ ) sum += A(i) * A(i);
+		return sum;
+	}
+
+	// Euclidean length of A: the square root of TwoNormSqr( A ).
+	double TwoNorm( const Vector &A )
+	{
+		const double squared = TwoNormSqr( A );
+		return sqrt( squared );
+	}
+
+	// Euclidean distance between A and B: the two-norm of A - B.
+	double dist( const Vector &A, const Vector &B )
+	{
+		const Vector delta = A - B;
+		return TwoNorm( delta );
+	}
+
+	// Sum of the absolute values of the elements of A.
+	// The sum starts from zero, so an empty vector yields 0 instead of reading
+	// element 0 of a zero-length array.
+	double OneNorm( const Vector &A )
+	{
+		double norm = 0.0;
+		for( int i = 0; i < A.Size(); i++ ) norm += Abs( A(i) );
+		return norm;
+	}
+
+	// Largest absolute value among the elements of A.
+	// An empty vector yields 0; without that guard the seed value below would
+	// be read from element 0 of a zero-length array.
+	double SupNorm( const Vector &A )
+	{
+		if( A.Size() == 0 ) return 0.0;
+		double norm = Abs( A(0) );
+		for( int i = 1; i < A.Size(); i++ )
+		{
+			double a = Abs( A(i) );
+			if( a > norm ) norm = a;
+		}
+		return norm;
+	}
+
+	// Converts a 2-element Vector into a Vec2 (size is assert-checked).
+	Vec2 ToVec2( const Vector &V )
+	{
+		assert( V.Size() == 2 );
+		const Vec2 converted( V(0), V(1) );
+		return converted;
+	}
+
+	// Converts a 3-element Vector into a Vec3 (size is assert-checked).
+	Vec3 ToVec3( const Vector &V )
+	{
+		assert( V.Size() == 3 );
+		const Vec3 converted( V(0), V(1), V(2) );
+		return converted;
+	}
+
+	// Converts a Vec2 into a 2-element Vector.
+	Vector ToVector( const Vec2 &V )
+	{
+		const Vector converted( V.X(), V.Y() );
+		return converted;
+	}
+
+	// Converts a Vec3 into a 3-element Vector.
+	Vector ToVector( const Vec3 &V )
+	{
+		const Vector converted( V.X(), V.Y(), V.Z() );
+		return converted;
+	}
+
+	//
+	// Returns a vector that is orthogonal to A (but of arbitrary length).
+	//
+	// The result has the same size as A. For fewer than two elements it is the
+	// zero vector; for an all-zero A it is the first unit vector. Otherwise an
+	// element of A larger in magnitude than half the sup-norm is paired with
+	// a neighbouring element, and the two are swapped with one sign flipped.
+	//
+	// The size test comes first so that SupNorm is never asked to examine a
+	// zero-length vector.
+	//
+	Vector OrthogonalTo( const Vector &A )
+	{
+		Vector B( A.Size() );
+
+		if( A.Size() < 2 )
+		{
+			// Just return the zero-vector.
+			return B;
+		}
+
+		const double c = 0.5 * SupNorm( A );
+
+		if( c == 0.0 )
+		{
+			B(0) = 1.0;
+			return B;
+		}
+
+		for( int i = 0; i < A.Size(); i++ )
+		{
+			if( Abs( A(i)) > c )
+			{
+				// Pair element i with a neighbour that is guaranteed to exist.
+				int k = ( i > 0 ) ? i - 1 : i + 1;
+				B(k) = -A(i);
+				B(i) =  A(k);
+				break;
+			}
+		}
+		return B;
+	}
+
+	// Writes A to out, one "index:  value" line per element, or the word NULL
+	// for an empty vector, and finishes with std::endl in both cases.
+	std::ostream &operator<<( std::ostream &out, const Vector &A )
+	{
+		const int count = A.Size();
+		if( count == 0 ) out << "NULL";
+		for( int k = 0; k < count; ++k )
+		{
+			out << form( "%3d:  %10.5g\n", k, A(k) );
+		}
+		out << std::endl;
+		return out;
+	}
+
+
+};
diff --git a/src/nvtt/bc7/arvo/Vector.h b/src/nvtt/bc7/arvo/Vector.h
new file mode 100644
index 0000000..01e66df
--- /dev/null
+++ b/src/nvtt/bc7/arvo/Vector.h
@@ -0,0 +1,103 @@
+/***************************************************************************
+* Vector.h                                                                 *
+*                                                                          *
+* General Vector and Matrix classes, with all the associated methods.      *
+*                                                                          *
+*   HISTORY                                                                *
+*      Name    Date        Description                                     *
+*                                                                          *
+*      arvo    08/16/2000    Revamped for CIT tools.                       *
+*      arvo    10/31/1994    Combined RowVec & ColVec into Vector.         *
+*      arvo    06/30/1993    Added singular value decomposition class.     *
+*      arvo    06/25/1993    Major revisions.                              *
+*      arvo    09/08/1991    Initial implementation.                       *
+*                                                                          *
+*--------------------------------------------------------------------------*
+* Copyright (C) 2000, James Arvo                                           *
+*                                                                          *
+* This program is free software; you can redistribute it and/or modify it  *
+* under the terms of the GNU General Public License as published by the    *
+* Free Software Foundation.  See http://www.fsf.org/copyleft/gpl.html      *
+*                                                                          *
+* This program is distributed in the hope that it will be useful, but      *
+* WITHOUT EXPRESS OR IMPLIED WARRANTY of merchantability or fitness for    *
+* any particular purpose.  See the GNU General Public License for more     *
+* details.                                                                 *
+*                                                                          *
+***************************************************************************/
+#ifndef __VECTOR_INCLUDED__
+#define __VECTOR_INCLUDED__
+
+#include <istream>
+#include "Vec2.h"
+#include "Vec3.h"
+
+namespace ArvoMath {
+	// Variable-length vector of floats that owns a heap-allocated element array.
+	// Element access through operator() performs no bounds checking.
+	class Vector {
+	public:
+		// Not declared explicit, so an int converts implicitly to a Vector.
+		Vector( int size = 0   );
+		Vector( const Vector & );
+		Vector( float, float );
+		Vector( float, float, float );
+		Vector( const float *x, int n );
+		Vector &operator=( const Vector & );
+		void    operator=( float );
+		void    SetSize( int );
+		Vector &Swap( int i, int j );
+		Vector  GetBlock( int i, int j ) const;
+		void    SetBlock( int i, int j, const Vector & );
+		static  const Vector Null;
+
+	public: // Inlined functions.
+		inline float  operator()( int i ) const { return elem[i]; }
+		inline float& operator()( int i )       { return elem[i]; }
+		// Hands out the internal array itself (writable even through a const Vector).
+		inline float* Array() const { return elem; }
+		inline int    Size () const { return size; }
+		inline ~Vector() { delete[] elem; }
+
+	private:
+		// Records the size and allocates n floats; the elements are left uninitialized.
+		void   Create( int n = 0 ) { size = n; elem = new float[n]; }
+		int    size;
+		float* elem;
+	};
+
+	// Free operators and helper functions on Vector.
+	extern Vector  operator +  ( const Vector &, const Vector & );
+	extern Vector  operator -  ( const Vector &, const Vector & ); // Binary minus.
+	extern Vector  operator -  ( const Vector &                 ); // Unary minus.
+	extern Vector  operator *  ( const Vector &,        float   );
+	extern Vector  operator *  (       float   , const Vector & );
+	extern Vector  operator /  ( const Vector &,        float   );
+	// NOTE(review): Vector.cpp contains no definition of the Vector / Vector
+	// overload below -- confirm it is defined elsewhere before relying on it.
+	extern Vector  operator /  ( const Vector &, const Vector & );
+	extern Vector  operator ^  ( const Vector &, const Vector & );
+	extern Vector& operator += (       Vector &, const Vector & );
+	extern Vector& operator *= (       Vector &,        float   );
+	extern Vector& operator /= (       Vector &,        float   );
+	extern Vector  Min         ( const Vector &, const Vector & );
+	extern Vector  Max         ( const Vector &, const Vector & );
+	extern double  operator *  ( const Vector &, const Vector & );  // Inner product.
+	extern double  dist        ( const Vector &, const Vector & );
+	extern Vector  OrthogonalTo( const Vector & );  // Returns some orthogonal vector.
+	extern Vector  Unit        ( const Vector & );
+	extern double  Normalize   (       Vector & );
+	extern double  OneNorm     ( const Vector & );
+	extern double  TwoNorm     ( const Vector & );
+	extern double  TwoNormSqr  ( const Vector & );
+	extern double  SupNorm     ( const Vector & );
+	extern int     Null        ( const Vector & );
+	extern Vec2    ToVec2      ( const Vector & );
+	extern Vec3    ToVec3      ( const Vector & );
+	extern Vector  ToVector    ( const Vec2   & );
+	extern Vector  ToVector    ( const Vec3   & );
+
+	// Stream output of a Vector.
+	std::ostream &operator<<( 
+		std::ostream &out, 
+		const Vector &
+		);
+};
+#endif
+
+
+
+
+
+
diff --git a/src/nvtt/bc7/arvo/form.h b/src/nvtt/bc7/arvo/form.h
new file mode 100644
index 0000000..48aef94
--- /dev/null
+++ b/src/nvtt/bc7/arvo/form.h
@@ -0,0 +1,26 @@
+#ifndef __FORM_INCLUDED__
+#define __FORM_INCLUDED__
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+namespace ArvoMath {
+
+	// printf-style formatting into a buffer that lives inside this function.
+	//
+	// fmt and the variadic arguments follow the printf conventions. The
+	// returned pointer refers to a single static buffer, so every call
+	// overwrites the text produced by the previous one and the function is
+	// neither reentrant nor safe to call from several threads at once.
+	//
+	// Output is bounded by the buffer size. The assert fires when the text
+	// did not fit or formatting failed; the previous check was inverted and
+	// could only pass after the buffer had already been overrun.
+	inline const char *form(const char *fmt, ...)
+	{
+		static char printbfr[65536];
+		va_list arglist;
+
+		va_start(arglist,fmt);
+		const int length = vsnprintf(printbfr,sizeof(printbfr),fmt,arglist);
+		va_end(arglist);
+
+		assert(length >= 0 && length < static_cast<int>(sizeof(printbfr)));
+
+		// A negative result means formatting failed; hand back an empty string.
+		if (length < 0) printbfr[0] = '\0';
+
+		return printbfr;
+	}
+};
+
+#endif
diff --git a/src/nvtt/bc7/avpcl.cpp b/src/nvtt/bc7/avpcl.cpp
new file mode 100644
index 0000000..6cbb972
--- /dev/null
+++ b/src/nvtt/bc7/avpcl.cpp
@@ -0,0 +1,263 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// the avpcl compressor and decompressor
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <assert.h>
+#include <time.h>
+
+#include "ImfArray.h"
+#include "RGBA.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "targa.h"
+
+#ifndef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+
+using namespace std;
+
+// Compresses one tile with each of the eight mode compressors in turn and
+// leaves in `block` the encoding of the first mode that achieved the lowest
+// returned error. When errfile is non-null the eight per-mode errors are
+// appended to it as raw doubles; a short write throws a string literal.
+void AVPCL::compress(const Tile &t, char *block, FILE *errfile)
+{
+	typedef double (*ModeCompressor)(const Tile &, char *);
+	const int nmodes = 8;
+	const ModeCompressor compressors[nmodes] =
+	{
+		&AVPCL::compress_mode0, &AVPCL::compress_mode1,
+		&AVPCL::compress_mode2, &AVPCL::compress_mode3,
+		&AVPCL::compress_mode4, &AVPCL::compress_mode5,
+		&AVPCL::compress_mode6, &AVPCL::compress_mode7,
+	};
+
+	char candidate[AVPCL::BLOCKSIZE];
+	double mse[nmodes];
+	double lowest = DBL_MAX;
+
+	for (int mode = 0; mode < nmodes; ++mode)
+	{
+		mse[mode] = compressors[mode](t, candidate);
+		if (mse[mode] < lowest)
+		{
+			lowest = mse[mode];
+			memcpy(block, candidate, AVPCL::BLOCKSIZE);
+		}
+	}
+
+	if (!errfile)
+		return;
+
+	if (fwrite(mse, sizeof(double), nmodes, errfile) != static_cast<size_t>(nmodes))
+		throw "Write error on error file";
+}
+
+// Returns bit number `start` (0..127, least significant bit of each byte
+// first) of the 16-byte buffer b, or 0 when start is out of range.
+static int getbit(char *b, int start)
+{
+	const bool in_range = (start >= 0) && (start < 128);
+	if (!in_range) return 0; // out of range
+
+	const int byte_index = start >> 3;
+	const int mask = 1 << (start & 7);
+	return (b[byte_index] & mask) ? 1 : 0;
+}
+
+// Gathers `len` consecutive bits beginning at bit `start` into an int, with
+// the first bit read landing in bit 0 of the result.
+static int getbits(char *b, int start, int len)
+{
+	int value = 0;
+	int k = 0;
+	while (k < len)
+	{
+		value |= getbit(b, start+k) << k;
+		++k;
+	}
+	return value;
+}
+
+// Sets or clears bit number `start` (0..127) of the 16-byte buffer b
+// according to the lowest bit of `bit`. Out-of-range positions are ignored.
+static void setbit(char *b, int start, int bit)
+{
+	const bool in_range = (start >= 0) && (start < 128);
+	if (!in_range) return; // out of range
+
+	char &target = b[start >> 3];
+	const int mask = 1 << (start & 7);
+
+	if (bit & 1)
+		target |= mask;
+	else
+		target &= ~mask;
+}
+
+// Stores the low `len` bits of `bits` into b starting at bit `start`;
+// bit 0 of `bits` goes to position `start`.
+static void setbits(char *b, int start, int len, int bits)
+{
+	int k = 0;
+	while (k < len)
+	{
+		setbit(b, start+k, bits >> k);
+		++k;
+	}
+}
+
+// Decodes one 16-byte block into tile t. The mode returned by getmode()
+// selects the matching mode decompressor; the reserved mode (8) produces an
+// all-zero tile.
+void AVPCL::decompress(const char *cblock, Tile &t)
+{
+	typedef void (*ModeDecompressor)(const char *, Tile &);
+	const ModeDecompressor decompressors[8] =
+	{
+		&AVPCL::decompress_mode0, &AVPCL::decompress_mode1,
+		&AVPCL::decompress_mode2, &AVPCL::decompress_mode3,
+		&AVPCL::decompress_mode4, &AVPCL::decompress_mode5,
+		&AVPCL::decompress_mode6, &AVPCL::decompress_mode7,
+	};
+
+	Vec4 zero(0);
+
+	// Work on a private copy of the block.
+	char block[16];
+	for (int k=0; k<16; ++k) block[k] = cblock[k];
+
+	const int mode = getmode(block);
+
+	if (mode >= 0 && mode < 8)
+	{
+		decompressors[mode](block, t);
+		return;
+	}
+
+	if (mode == 8)
+	{
+		// return a black tile if you get a reserved mode
+		for (int y=0; y<Tile::TILE_H; ++y)
+			for (int x=0; x<Tile::TILE_W; ++x)
+				t.data[y][x] = zero;
+		return;
+	}
+
+	assert(0);
+}
+
+// Compresses the Targa image `inf` tile by tile into the file `avpclf`.
+// When `errf` is non-empty the per-mode errors of every tile are written to
+// that file as well. Progress and timing go to stdout. Every failure is
+// reported by throwing a string literal.
+//
+// Both output files are held by a small scope guard, so they are closed on
+// every exit path, including the throws raised while opening the second file
+// or while writing.
+void AVPCL::compress(string inf, string avpclf, string errf)
+{
+	// Closes the wrapped FILE when the scope is left, unless close() already did.
+	struct ScopedFile
+	{
+		FILE *fp;
+		explicit ScopedFile(FILE *f) : fp(f) {}
+		~ScopedFile() { if (fp) fclose(fp); }
+		// Explicit close so the caller can inspect the fclose() result.
+		int close() { FILE *f = fp; fp = NULL; return f ? fclose(f) : 0; }
+	private:
+		ScopedFile(const ScopedFile &);
+		ScopedFile &operator=(const ScopedFile &);
+	};
+
+	Array2D<RGBA> pixels;
+	int w, h;
+	char block[AVPCL::BLOCKSIZE];
+
+	Targa::read(inf, pixels, w, h);
+	ScopedFile avpclfile(fopen(avpclf.c_str(), "wb"));
+	if (avpclfile.fp == NULL) throw "Unable to open .avpcl file for write";
+	ScopedFile errfile(NULL);
+	if (errf != "")
+	{
+		errfile.fp = fopen(errf.c_str(), "wb");
+		if (errfile.fp == NULL) throw "Unable to open error file for write";
+	}
+
+	// Override the premult flag when mode_rgb is set (but only if premult is set)
+	if (AVPCL::flag_premult)
+	{
+		if (AVPCL::mode_rgb)
+		{
+			AVPCL::flag_premult = false;
+			cout << endl << "NOTE: Source image alpha is constant 255, turning off premultiplied-alpha error metric." << endl << endl;
+		}
+	}
+
+	// stuff for progress bar O.o
+	int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W);
+	int tilecnt = 0;
+	clock_t start, prev, cur;
+
+	start = prev = clock();
+
+	// convert to tiles and compress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = MIN(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			if ((tilecnt%100) == 0) { cur = clock(); printf("Progress %d of %d, %5.2f seconds per 100 tiles\r", tilecnt, ntiles, double(cur-prev)/CLOCKS_PER_SEC); fflush(stdout); prev = cur; }
+
+			int xsize = MIN(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			t.insert(pixels, x, y);
+
+			AVPCL::compress(t, block, errfile.fp);
+			if (fwrite(block, sizeof(char), AVPCL::BLOCKSIZE, avpclfile.fp) != size_t(AVPCL::BLOCKSIZE))
+				throw "File error on write";
+
+			// progress bar
+			++tilecnt;
+		}
+	}
+
+	cur = clock();
+	printf("\nTotal time to compress: %.2f seconds\n\n", double(cur-start)/CLOCKS_PER_SEC);		// advance to next line finally
+
+	// Close explicitly so that a failed flush is still reported.
+	if (avpclfile.close()) throw "Close failed on .avpcl file";
+	if (errfile.fp && errfile.close()) throw "Close failed on error file";
+}
+
+// Parses the leading decimal integer of s using stream extraction.
+// Returns 0 when s is empty or does not start with a number; previously the
+// result was an uninitialized int in that case.
+static int str2int(std::string s) 
+{
+	int thing = 0;
+	std::stringstream str (stringstream::in | stringstream::out);
+	str << s;
+	str >> thing;
+	return thing;
+}
+
+// avpcl file name is ...-w-h-RGB[A].avpcl, extract width and height
+// and report through mode_rgb whether the last field is exactly "RGB".
+//
+// The name is scanned backwards from its last '.'. A name that lacks the
+// extension, lacks one of the two dashes around the height, or has an empty
+// width or height field is rejected by throwing a string literal, instead of
+// feeding wrapped-around npos offsets to substr().
+static void extract(string avpclf, int &w, int &h, bool &mode_rgb)
+{
+	const size_t npos = string::npos;
+
+	//	...-wwww-hhhh-RGB[A].avpcl
+	//     ^    ^    ^      ^
+	//     n3   n2   n1     n n3<n2<n1<n
+	const size_t n  = avpclf.rfind('.');
+	const size_t n1 = (n  != npos && n  > 0) ? avpclf.rfind('-', n-1)  : npos;
+	const size_t n2 = (n1 != npos && n1 > 0) ? avpclf.rfind('-', n1-1) : npos;
+	if (n2 == npos)
+		throw "Malformed .avpcl file name, expected ...-w-h-RGB[A].avpcl";
+
+	// The leading "...-" part is optional: without a third dash the width
+	// field starts at the beginning of the name.
+	const size_t n3 = (n2 > 0) ? avpclf.rfind('-', n2-1) : npos;
+	const size_t wbegin = (n3 == npos) ? 0 : n3+1;
+
+	string width = avpclf.substr(wbegin, n2-wbegin);
+	string height = avpclf.substr(n2+1, n1-n2-1);
+	if (width.empty() || height.empty())
+		throw "Malformed .avpcl file name, expected ...-w-h-RGB[A].avpcl";
+
+	w = str2int(width);
+	h = str2int(height);
+	string mode = avpclf.substr(n1+1, n-n1-1);
+	mode_rgb = mode == "RGB";
+}
+
+// Number of decoded blocks seen per mode; the index is the mode number 0..7.
+static int modehist[8];
+
+// Counts the mode of one compressed block in modehist.
+// AVPCL::getmode() returns 8 for a reserved encoding; that value has no slot
+// in the histogram and is skipped rather than written past the end of the array.
+static void stats(char block[AVPCL::BLOCKSIZE])
+{
+	const int nmodes = sizeof(modehist) / sizeof(modehist[0]);
+	const int m = AVPCL::getmode(block);
+	if (m >= 0 && m < nmodes)
+		modehist[m]++;
+}
+
+// Prints the eight mode counters on one line, each followed by a comma.
+static void printstats()
+{
+	printf("\nMode histogram: ");
+	int mode = 0;
+	while (mode < 8)
+	{
+		printf("%d,", modehist[mode]);
+		++mode;
+	}
+	printf("\n");
+}
+
+// Decompresses the file `avpclf` into the Targa image `outf`.
+// Width, height and RGB/RGBA mode are taken from the file name (see
+// extract()). The mode of every block is tallied and the histogram is printed
+// at the end. Every failure is reported by throwing a string literal.
+//
+// The input file is held by a small scope guard, so it is closed on every
+// exit path, including the throw raised on a short read.
+void AVPCL::decompress(string avpclf, string outf)
+{
+	// Closes the wrapped FILE when the scope is left, unless close() already did.
+	struct ScopedFile
+	{
+		FILE *fp;
+		explicit ScopedFile(FILE *f) : fp(f) {}
+		~ScopedFile() { if (fp) fclose(fp); }
+		// Explicit close so the caller can inspect the fclose() result.
+		int close() { FILE *f = fp; fp = NULL; return f ? fclose(f) : 0; }
+	private:
+		ScopedFile(const ScopedFile &);
+		ScopedFile &operator=(const ScopedFile &);
+	};
+
+	Array2D<RGBA> pixels;
+	int w, h;
+	char block[AVPCL::BLOCKSIZE];
+
+	extract(avpclf, w, h, AVPCL::mode_rgb);
+	ScopedFile avpclfile(fopen(avpclf.c_str(), "rb"));
+	if (avpclfile.fp == NULL) throw "Unable to open .avpcl file for read";
+	pixels.resizeErase(h, w);
+
+	// convert to tiles and decompress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = MIN(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = MIN(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			if (fread(block, sizeof(char), AVPCL::BLOCKSIZE, avpclfile.fp) != size_t(AVPCL::BLOCKSIZE))
+				throw "File error on read";
+
+			stats(block);	// collect statistics
+		
+			AVPCL::decompress(block, t);
+
+			t.extract(pixels, x, y);
+		}
+	}
+	if (avpclfile.close()) throw "Close failed on .avpcl file";
+
+	Targa::write(outf, pixels, w, h);
+
+	printstats();	// print statistics
+}
diff --git a/src/nvtt/bc7/avpcl.h b/src/nvtt/bc7/avpcl.h
new file mode 100644
index 0000000..3cf7527
--- /dev/null
+++ b/src/nvtt/bc7/avpcl.h
@@ -0,0 +1,107 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_H
+#define _AVPCL_H
+
+#include <string>
+#include <assert.h>
+
+#include "tile.h"
+#include "bits.h"
+
+using namespace std;
+
+#define	EXTERNAL_RELEASE	1	// define this if we're releasing this code externally
+#define	DISABLE_EXHAUSTIVE	1	// define this if you don't want to spend a lot of time on exhaustive compression
+#define	USE_ZOH_INTERP		1	// use zoh interpolator, otherwise use exact avpcl interpolators
+#define	USE_ZOH_INTERP_ROUNDED 1	// use the rounded versions!
+
+#define	NREGIONS_TWO	2
+#define	NREGIONS_THREE	3
+// <float.h> already defines DBL_MAX; only supply the stand-in when no
+// definition is visible, so the standard macro is not redefined.
+#ifndef DBL_MAX
+#define	DBL_MAX	(1.0e37)		// doesn't have to be really dblmax, just bigger than any possible squared error
+#endif
+
+// Static-only interface of the avpcl compressor and decompressor: whole-file
+// and single-tile entry points, one compress/decompress pair per block mode,
+// and helpers that read the mode of a block.
+class AVPCL
+{
+public:
+	static const int BLOCKSIZE=16;		// bytes per compressed block
+	static const int BITSIZE=128;		// bits per compressed block
+
+	// global flags
+	static bool flag_premult;
+	static bool flag_nonuniform;
+	static bool flag_nonuniform_ati;
+
+	// global mode
+	static bool mode_rgb;		// true if image had constant alpha = 255
+
+	// Whole-file and single-tile entry points.
+	static void compress(string inf, string zohf, string errf);
+	static void decompress(string zohf, string outf);
+	static void compress(const Tile &t, char *block, FILE *errfile);
+	static void decompress(const char *block, Tile &t);
+
+	// Per-mode codecs; each compress_modeN returns a double error value.
+	static double compress_mode0(const Tile &t, char *block);
+	static void decompress_mode0(const char *block, Tile &t);
+
+	static double compress_mode1(const Tile &t, char *block);
+	static void decompress_mode1(const char *block, Tile &t);
+
+	static double compress_mode2(const Tile &t, char *block);
+	static void decompress_mode2(const char *block, Tile &t);
+
+	static double compress_mode3(const Tile &t, char *block);
+	static void decompress_mode3(const char *block, Tile &t);
+
+	static double compress_mode4(const Tile &t, char *block);
+	static void decompress_mode4(const char *block, Tile &t);
+
+	static double compress_mode5(const Tile &t, char *block);
+	static void decompress_mode5(const char *block, Tile &t);
+
+	static double compress_mode6(const Tile &t, char *block);
+	static void decompress_mode6(const char *block, Tile &t);
+
+	static double compress_mode7(const Tile &t, char *block);
+	static void decompress_mode7(const char *block, Tile &t);
+
+	// Reads single bits from `in` until one of them is set and returns how
+	// many clear bits came before it (0..7). Returns 8 (reserved) after
+	// eight clear bits.
+	static int getmode(Bits &in)
+	{
+		for (int mode = 0; mode < 8; ++mode)
+		{
+			if (in.read(1)) return mode;
+		}
+		return 8;	// reserved
+	}
+
+	// Same decoding applied to the first byte of a block: the mode is the
+	// index of the lowest set bit, or 8 (reserved) when the byte is zero.
+	static int getmode(const char *block)
+	{
+		const int bits = block[0];
+
+		for (int mode = 0; mode < 8; ++mode)
+		{
+			if (bits & (1 << mode)) return mode;
+		}
+		return 8;	// reserved
+	}
+};
+#endif
\ No newline at end of file
diff --git a/src/nvtt/bc7/avpcl.sln b/src/nvtt/bc7/avpcl.sln
new file mode 100644
index 0000000..395b1ce
--- /dev/null
+++ b/src/nvtt/bc7/avpcl.sln
@@ -0,0 +1,21 @@
+Microsoft Visual Studio Solution File, Format Version 8.00
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "avpcl", "avpcl.vcproj", "{C6F6CD96-D0C0-4A35-B1BC-53E0A3CB712F}"
+	ProjectSection(ProjectDependencies) = postProject
+	EndProjectSection
+EndProject
+Global
+	GlobalSection(SolutionConfiguration) = preSolution
+		Debug = Debug
+		Release = Release
+	EndGlobalSection
+	GlobalSection(ProjectConfiguration) = postSolution
+		{C6F6CD96-D0C0-4A35-B1BC-53E0A3CB712F}.Debug.ActiveCfg = Debug|Win32
+		{C6F6CD96-D0C0-4A35-B1BC-53E0A3CB712F}.Debug.Build.0 = Debug|Win32
+		{C6F6CD96-D0C0-4A35-B1BC-53E0A3CB712F}.Release.ActiveCfg = Release|Win32
+		{C6F6CD96-D0C0-4A35-B1BC-53E0A3CB712F}.Release.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+	EndGlobalSection
+	GlobalSection(ExtensibilityAddIns) = postSolution
+	EndGlobalSection
+EndGlobal
diff --git a/src/nvtt/bc7/avpcl.vcproj b/src/nvtt/bc7/avpcl.vcproj
new file mode 100644
index 0000000..4857f78
--- /dev/null
+++ b/src/nvtt/bc7/avpcl.vcproj
@@ -0,0 +1,314 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="7.10"
+	Name="avpcl"
+	ProjectGUID="{3d7401c5-23e7-4280-bfa2-a51073587cf3}"
+	SccProjectName=""
+	SccLocalPath="">
+	<Platforms>
+		<Platform
+			Name="Win32"/>
+	</Platforms>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory=".\Debug"
+			IntermediateDirectory=".\Debug"
+			ConfigurationType="1"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="FALSE"
+			CharacterSet="2">
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories=""
+				PreprocessorDefinitions="_DEBUG;WIN32;_CONSOLE"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				ForceConformanceInForLoopScope="TRUE"
+				RuntimeTypeInfo="TRUE"
+				UsePrecompiledHeader="0"
+				ProgramDataBaseFileName="$(IntDir)/$(ProjectName)_d.pdb"
+				WarningLevel="1"
+				SuppressStartupBanner="TRUE"
+				Detect64BitPortabilityProblems="TRUE"
+				DebugInformationFormat="4"
+				CompileAs="0"
+				DisableSpecificWarnings="4290"/>
+			<Tool
+				Name="VCCustomBuildTool"/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="comctl32.lib"
+				OutputFile="../test/avpclc_d.exe"
+				LinkIncremental="2"
+				SuppressStartupBanner="TRUE"
+				AdditionalLibraryDirectories=""
+				GenerateDebugInformation="TRUE"
+				SubSystem="1"
+				TargetMachine="1"/>
+			<Tool
+				Name="VCMIDLTool"
+				TypeLibraryName="./Debug/avpcl.tlb"
+				HeaderFileName=""/>
+			<Tool
+				Name="VCPostBuildEventTool"/>
+			<Tool
+				Name="VCPreBuildEventTool"/>
+			<Tool
+				Name="VCPreLinkEventTool"/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="_DEBUG"
+				Culture="1033"/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"/>
+			<Tool
+				Name="VCWebDeploymentTool"/>
+			<Tool
+				Name="VCManagedWrapperGeneratorTool"/>
+			<Tool
+				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory=".\Release"
+			IntermediateDirectory=".\Release"
+			ConfigurationType="1"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="FALSE"
+			CharacterSet="2">
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				InlineFunctionExpansion="1"
+				AdditionalIncludeDirectories=""
+				PreprocessorDefinitions="NDEBUG;WIN32;_CONSOLE"
+				StringPooling="TRUE"
+				RuntimeLibrary="2"
+				ForceConformanceInForLoopScope="TRUE"
+				RuntimeTypeInfo="TRUE"
+				UsePrecompiledHeader="0"
+				ProgramDataBaseFileName="$(IntDir)/$(ProjectName).pdb"
+				WarningLevel="1"
+				SuppressStartupBanner="TRUE"
+				Detect64BitPortabilityProblems="TRUE"
+				DebugInformationFormat="3"
+				CompileAs="0"
+				DisableSpecificWarnings="4290"/>
+			<Tool
+				Name="VCCustomBuildTool"/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="comctl32.lib"
+				OutputFile="../test/avpclc.exe"
+				LinkIncremental="1"
+				SuppressStartupBanner="TRUE"
+				AdditionalLibraryDirectories=""
+				GenerateDebugInformation="FALSE"
+				SubSystem="1"
+				EntryPointSymbol="mainCRTStartup"
+				TargetMachine="1"/>
+			<Tool
+				Name="VCMIDLTool"
+				TypeLibraryName="./Release/avpcl.tlb"
+				HeaderFileName=""/>
+			<Tool
+				Name="VCPostBuildEventTool"/>
+			<Tool
+				Name="VCPreBuildEventTool"/>
+			<Tool
+				Name="VCPreLinkEventTool"/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="NDEBUG"
+				Culture="1033"/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"/>
+			<Tool
+				Name="VCWebDeploymentTool"/>
+			<Tool
+				Name="VCManagedWrapperGeneratorTool"/>
+			<Tool
+				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat">
+			<File
+				RelativePath=".\avpcl.cpp">
+			</File>
+			<File
+				RelativePath=".\avpcl_mode0.cpp">
+			</File>
+			<File
+				RelativePath=".\avpcl_mode1.cpp">
+			</File>
+			<File
+				RelativePath=".\avpcl_mode2.cpp">
+			</File>
+			<File
+				RelativePath=".\avpcl_mode3.cpp">
+			</File>
+			<File
+				RelativePath=".\avpcl_mode4.cpp">
+			</File>
+			<File
+				RelativePath=".\avpcl_mode5.cpp">
+			</File>
+			<File
+				RelativePath=".\avpcl_mode6.cpp">
+			</File>
+			<File
+				RelativePath=".\avpcl_mode7.cpp">
+			</File>
+			<File
+				RelativePath=".\avpclc.cpp">
+			</File>
+			<File
+				RelativePath=".\targa.cpp">
+			</File>
+			<File
+				RelativePath=".\utils.cpp">
+			</File>
+			<Filter
+				Name="arvo"
+				Filter="">
+				<File
+					RelativePath=".\arvo\ArvoMath.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Char.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Complex.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Matrix.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Perm.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Rand.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\SphTri.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\SVD.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Token.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Vec2.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Vec3.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Vec4.cpp">
+				</File>
+				<File
+					RelativePath=".\arvo\Vector.cpp">
+				</File>
+			</Filter>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl">
+			<File
+				RelativePath=".\avpcl.h">
+			</File>
+			<File
+				RelativePath=".\bits.h">
+			</File>
+			<File
+				RelativePath=".\endpts.h">
+			</File>
+			<File
+				RelativePath=".\rgba.h">
+			</File>
+			<File
+				RelativePath=".\shapes_three.h">
+			</File>
+			<File
+				RelativePath=".\shapes_two.h">
+			</File>
+			<File
+				RelativePath=".\targa.h">
+			</File>
+			<File
+				RelativePath=".\tile.h">
+			</File>
+			<File
+				RelativePath=".\utils.h">
+			</File>
+			<Filter
+				Name="arvo"
+				Filter="">
+				<File
+					RelativePath=".\arvo\ArvoMath.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Char.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Complex.h">
+				</File>
+				<File
+					RelativePath=".\arvo\form.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Matrix.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Perm.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Rand.h">
+				</File>
+				<File
+					RelativePath=".\arvo\SI_units.h">
+				</File>
+				<File
+					RelativePath=".\arvo\SphTri.h">
+				</File>
+				<File
+					RelativePath=".\arvo\SVD.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Token.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Vec2.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Vec3.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Vec4.h">
+				</File>
+				<File
+					RelativePath=".\arvo\Vector.h">
+				</File>
+			</Filter>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe">
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
diff --git a/src/nvtt/bc7/avpcl_mode0.cpp b/src/nvtt/bc7/avpcl_mode0.cpp
new file mode 100644
index 0000000..7583b70
--- /dev/null
+++ b/src/nvtt/bc7/avpcl_mode0.cpp
@@ -0,0 +1,1068 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+//  x1		444.1x6 16p 45b (3bi)
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "arvo/Vec4.h"
+#include "arvo/Matrix.h"
+#include "arvo/SVD.h"
+#include "utils.h"
+#include "endpts.h"
+
+#include <assert.h>
+
+#include "shapes_three.h"
+
+// use only the first 16 available shapes
+#undef NSHAPES
+#undef SHAPEBITS
+#define NSHAPES 16
+#define SHAPEBITS 4
+
+using namespace ArvoMath;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	8	// palette entries per region (palette arrays below are sized with it)
+#define	INDEXBITS	3	// bits written per index; write_indices uses one bit less at one position per region
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// top bit of an INDEXBITS-wide index, tested by swap_indices
+#define	DENOM		(NINDICES-1)	// passed to PALETTE_LERP together with BIAS
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minimum easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)	// x of a linear pixel position: its low two bits
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)	// y of a linear pixel position: the next two bits
+
+#define	NBITSIZES	(NREGIONS*2)	// two entries (endpoint A, endpoint B) per region
+#define	ABITINDEX(region)	(2*(region)+0)	// slot of the region's endpoint A in ChanBits::nbitsizes
+#define	BBITINDEX(region)	(2*(region)+1)	// slot of the region's endpoint B in ChanBits::nbitsizes
+
+struct ChanBits	// endpoint field widths for a single color channel
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel, addressed with ABITINDEX(region) / BBITINDEX(region)
+};
+
+struct Pattern	// one header layout for this mode; the instances live in patterns[]
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value; write_header emits it using modebits bits
+	int modebits;			// number of mode bits
+	char *encoding;			// verilog description of encoding for this mode. NOTE(review): initialized from a string literal in patterns[], so const char * looks safer -- confirm nothing writes through it
+};
+
+#define	NPATTERNS 1	// a single pattern is defined for this mode
+// each color column below holds NBITSIZES entries, read back through ABITINDEX/BBITINDEX
+static Pattern patterns[NPATTERNS] =
+{
+	// red			green			blue			xfm	mode  mb
+	4,4,4,4,4,4,	4,4,4,4,4,4,	4,4,4,4,4,4,	0,	0x1, 1, "",	// really 444.1 x 6
+};
+
+struct RegionPrec	// per-channel precision of the two endpoints of one region
+{
+	int	endpt_a_prec[NCHANNELS_RGB];	// endpoint A; callers add 1 when working on uncompressed (lsb restored) values
+	int endpt_b_prec[NCHANNELS_RGB];	// endpoint B, same convention
+};
+
+struct PatternPrec	// the precisions of every region for one pattern
+{
+	RegionPrec region_precs[NREGIONS];	// indexed by region number
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO assert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	4,4,4, 4,4,4, 4,4,4, 4,4,4, 4,4,4, 4,4,4, 	// fills each RegionPrec in order: endpt_a_prec[] then endpt_b_prec[]
+};
+
+// number of bits required to store n; a signed representation of a positive value costs one extra bit
+static int nbits(int n, bool issigned)
+{
+	if (n == 0)
+		return 0;	// zero takes no bits at all, signed or not
+
+	int count = 0;
+	if (n < 0)
+	{
+		assert (issigned);	// a negative value only makes sense for the signed case
+		while (n < -1) { ++count; n >>= 1; }	// shift until only -1 is left
+		return count + 1;
+	}
+
+	// positive: count the magnitude bits, then add one more when signed
+	while (n != 0) { ++count; n >>= 1; }
+	return issigned ? count + 1 : count;
+}
+
+static void transform_forward(IntEndptsRGB_2 ep[NREGIONS])	// placeholder: no forward transform is implemented here
+{
+	assert(0);	// must never run; the only entry in patterns[] has transformed == 0
+}
+
+static void transform_inverse(IntEndptsRGB_2 ep[NREGIONS])	// placeholder: decompress_mode0 calls it only when p.transformed is set
+{
+	assert(0);	// must never run; the only entry in patterns[] has transformed == 0
+}
+
+// endpoints arrive as 555,555; keep the upper 4 bits of every channel and store one shared lsb per endpoint
+static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_2& compr_endpts)
+{
+	int a_ones = 0;		// how many channels of A have their lsb set
+	int b_ones = 0;		// how many channels of B have their lsb set
+
+	for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+	{
+		const int a_full = endpts.A[ch];
+		const int b_full = endpts.B[ch];
+
+		a_ones += a_full & 1;
+		b_ones += b_full & 1;
+
+		compr_endpts.A[ch] = a_full >> 1;
+		compr_endpts.B[ch] = b_full >> 1;
+		assert (compr_endpts.A[ch] < 16);
+		assert (compr_endpts.B[ch] < 16);
+	}
+
+	compr_endpts.a_lsb = a_ones >= 2;	// shared lsb is set when at least two channel lsbs were set
+	compr_endpts.b_lsb = b_ones >= 2;
+}
+
+static void uncompress_one(const IntEndptsRGB_2& compr_endpts, IntEndptsRGB& endpts)
+{
+	// widen by one bit: every channel gets its endpoint's shared lsb as the new low bit
+	for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+		endpts.A[ch] = (compr_endpts.A[ch] << 1) | compr_endpts.a_lsb;
+	for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+		endpts.B[ch] = (compr_endpts.B[ch] << 1) | compr_endpts.b_lsb;
+}
+
+static void uncompress_endpoints(const IntEndptsRGB_2 compr_endpts[NREGIONS], IntEndptsRGB endpts[NREGIONS])
+{
+	for (int region=0; region!=NREGIONS; ++region)	// every region is expanded on its own
+		uncompress_one(compr_endpts[region], endpts[region]);
+}
+
+static void compress_endpoints(const IntEndptsRGB endpts[NREGIONS], IntEndptsRGB_2 compr_endpts[NREGIONS])
+{
+	for (int region=0; region!=NREGIONS; ++region)	// every region is reduced on its own
+		compress_one(endpts[region], compr_endpts[region]);
+}
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGB_2 q_endpts[NREGIONS])
+{
+	for (int r = 0; r < NREGIONS; ++r)
+	{
+		const RegionPrec &prec = pattern_prec.region_precs[r];
+		IntEndptsRGB wide;	// quantized with one extra bit (+1 since we are in uncompressed space); compress_one reduces it again
+		wide.A[0] = Utils::quantize(endpts[r].A.X(), prec.endpt_a_prec[0]+1);
+		wide.A[1] = Utils::quantize(endpts[r].A.Y(), prec.endpt_a_prec[1]+1);
+		wide.A[2] = Utils::quantize(endpts[r].A.Z(), prec.endpt_a_prec[2]+1);
+		wide.B[0] = Utils::quantize(endpts[r].B.X(), prec.endpt_b_prec[0]+1);
+		wide.B[1] = Utils::quantize(endpts[r].B.Y(), prec.endpt_b_prec[1]+1);
+		wide.B[2] = Utils::quantize(endpts[r].B.Z(), prec.endpt_b_prec[2]+1);
+		compress_one(wide, q_endpts[r]);
+	}
+}
+
+// for each region, make sure the index at its compressed-index position has a 0 high-order bit, swapping endpoints if needed
+static void swap_indices(IntEndptsRGB_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const int anchor = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+		const int ax = POS_TO_X(anchor);
+		const int ay = POS_TO_Y(anchor);
+		assert(REGION(ax,ay,shapeindex) == region);		// double check the table
+		if ((indices[ay][ax] & HIGH_INDEXBIT) == 0)
+			continue;		// high bit already clear, nothing to do for this region
+
+		// high bit is set: exchange endpoints A and B (channel values and lsbs) ...
+		IntEndptsRGB_2 &ep = endpts[region];
+		for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+		{
+			const int held = ep.A[ch]; ep.A[ch] = ep.B[ch]; ep.B[ch] = held;
+		}
+		const int held_lsb = ep.a_lsb; ep.a_lsb = ep.b_lsb; ep.b_lsb = held_lsb;
+
+		// ... and mirror every index of this region to match the exchanged endpoints
+		for (int py = 0; py < Tile::TILE_H; py++)
+		for (int px = 0; px < Tile::TILE_W; px++)
+			if (REGION(px,py,shapeindex) == region)
+				indices[py][px] = NINDICES - 1 - indices[py][px];
+	}
+}
+
+static bool endpts_fit(IntEndptsRGB_2 endpts[NREGIONS], const Pattern &p)	// both arguments are ignored
+{
+	return true;	// unconditionally reports a fit; no check is performed in this mode
+}
+
+static void write_header(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	// field order: mode, shape index, then channel by channel the A/B value of every region, then one lsb per endpoint
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+	for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+		for (int region=0; region<NREGIONS; ++region)
+		{
+			out.write(endpts[region].A[ch], p.chan[ch].nbitsizes[ABITINDEX(region)]);
+			out.write(endpts[region].B[ch], p.chan[ch].nbitsizes[BBITINDEX(region)]);
+		}
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		out.write(endpts[region].a_lsb, 1);
+		out.write(endpts[region].b_lsb, 1);
+	}
+
+	assert (out.getptr() == 83);	// the complete header must occupy exactly 83 bits
+}
+
+static void read_header(Bits &in, IntEndptsRGB_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	AVPCL::getmode(in);		// consumes the mode bits; the decoded value itself is not used here
+
+	pat_index = 0;			// always the first (and only) pattern
+	assert (pat_index >= 0 && pat_index < NPATTERNS);
+	assert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+	// endpoint values come channel by channel, A then B for each region
+	for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+		for (int region=0; region<NREGIONS; ++region)
+		{
+			endpts[region].A[ch] = in.read(p.chan[ch].nbitsizes[ABITINDEX(region)]);
+			endpts[region].B[ch] = in.read(p.chan[ch].nbitsizes[BBITINDEX(region)]);
+		}
+	// then one lsb for each endpoint
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		endpts[region].a_lsb  = in.read(1);
+		endpts[region].b_lsb  = in.read(1);
+	}
+
+	assert (in.getptr() == 83);	// same header size that write_header asserts
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	// one position per region (looked up from the shape) is written with one bit fewer than the rest
+	int anchors[NREGIONS];
+
+	for (int region = 0; region < NREGIONS; ++region)
+		anchors[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int width = INDEXBITS;
+
+		// shorten the field when pos is one of those positions
+		for (int region = 0; region < NREGIONS; ++region)
+			if (anchors[region] == pos) { width = INDEXBITS - 1; break; }
+
+		// x is the low two bits of pos, y the next two
+		out.write(indices[POS_TO_Y(pos)][POS_TO_X(pos)], width);
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	// one position per region (looked up from the shape) is stored with one bit fewer than the rest
+	int anchors[NREGIONS];
+
+	for (int region = 0; region < NREGIONS; ++region)
+		anchors[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int width = INDEXBITS;
+
+		// shorten the field when pos is one of those positions
+		for (int region = 0; region < NREGIONS; ++region)
+			if (anchors[region] == pos) { width = INDEXBITS - 1; break; }
+
+		// x is the low two bits of pos, y the next two
+		indices[POS_TO_Y(pos)][POS_TO_X(pos)] = in.read(width);
+	}
+}
+
+static void emit_block(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+	// the header (mode, shape, endpoints) goes first ...
+	write_header(endpts, shapeindex, p, out);
+	// ... followed by the per-pixel indices
+	write_indices(indices, shapeindex, out);
+	// together the two parts must fill the block exactly
+	assert(out.getptr() == AVPCL::BITSIZE);
+}
+
+// builds the NINDICES-entry palette of one region from its compressed endpoints:
+// X/Y/Z are interpolated between endpoint A and endpoint B, W is always RGBA_MAX
+static void generate_palette_quantized(const IntEndptsRGB_2 &endpts_2, const RegionPrec &region_prec, Vec4 palette[NINDICES])
+{
+	// work on the uncompressed form of the endpoints
+	IntEndptsRGB full;
+
+	uncompress_one(endpts_2, full);
+
+	// scale endpoints; +1 since we are in uncompressed space
+	const int red_a = Utils::unquantize(full.A[0], region_prec.endpt_a_prec[0]+1);
+	const int red_b = Utils::unquantize(full.B[0], region_prec.endpt_b_prec[0]+1);
+
+	const int green_a = Utils::unquantize(full.A[1], region_prec.endpt_a_prec[1]+1);
+	const int green_b = Utils::unquantize(full.B[1], region_prec.endpt_b_prec[1]+1);
+
+	const int blue_a = Utils::unquantize(full.A[2], region_prec.endpt_a_prec[2]+1);
+	const int blue_b = Utils::unquantize(full.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate: entry i of every channel comes from PALETTE_LERP
+	// with the shared BIAS and DENOM constants
+	for (int i = 0; i < NINDICES; ++i)
+	{
+		Vec4 &entry = palette[i];
+
+		entry.X() = PALETTE_LERP(red_a, red_b, i, BIAS, DENOM);
+		entry.Y() = PALETTE_LERP(green_a, green_b, i, BIAS, DENOM);
+		entry.Z() = PALETTE_LERP(blue_a, blue_b, i, BIAS, DENOM);
+	}
+
+	// constant alpha
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].W() = RGBA_MAX;
+}
+
+static void sign_extend(Pattern &p, IntEndptsRGB_2 endpts[NREGIONS])	// placeholder: decompress_mode0 calls it only when p.transformed is set
+{
+	assert(0);	// must never run; the only entry in patterns[] has transformed == 0
+}
+
+void AVPCL::decompress_mode0(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+	// header: endpoints, shape index and the pattern describing the layout
+	Pattern pattern;
+	IntEndptsRGB_2 endpts[NREGIONS];
+	int shapeindex, pat_index;
+	read_header(in, endpts, shapeindex, pattern, pat_index);
+
+	if (pattern.transformed)
+	{
+		sign_extend(pattern, endpts);
+		transform_inverse(endpts);
+	}
+
+	// one palette per region
+	const PatternPrec &precs = pattern_precs[pat_index];
+	Vec4 palette[NREGIONS][NINDICES];
+	for (int region = 0; region < NREGIONS; ++region)
+		generate_palette_quantized(endpts[region], precs.region_precs[region], palette[region]);
+
+	// the remainder of the block holds the per-pixel indices
+	int indices[Tile::TILE_H][Tile::TILE_W];
+	read_indices(in, shapeindex, indices);
+	assert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static double map_colors(const Vec4 colors[], int np, const IntEndptsRGB_2 &endpts, const RegionPrec &region_prec, double current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vec4 palette[NINDICES];
+	double toterr = 0;
+	// returns DBL_MAX instead of toterr as soon as the running total exceeds current_err
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		double err, besterr = DBL_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			err = Utils::metric4(colors[i], palette[j]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;		// best palette entry seen so far for color i
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return DBL_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], double toterr[NREGIONS])
+{
+	// build list of possibles
+	Vec4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	// every pixel picks an entry from the palette of the region it belongs to;
+	// only the tile.size_x by tile.size_y part of the tile is visited
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double err, besterr = DBL_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static double perturb_one(const Vec4 colors[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, 
+						  double old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGB_2 temp_endpts;
+	double min_err = old_err;		// start with the best current error (kept in double so "no improvement" returns old_err exactly)
+	int beststep = 0;				// overwritten before use whenever a step improves the error
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;		// stepped outside the representable range
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;		// stepped outside the representable range
+			}
+
+			double err = map_colors(colors, np, temp_endpts, region_prec, min_err, temp_indices);	// can be DBL_MAX, which does not fit a float
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static double exhaustive(const Vec4 colors[], int np, int ch, const RegionPrec &region_prec, double &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB_2 temp_endpts;
+	double best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	double thr_scale = (double)np / (double)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = MAX(adelta, 3);
+	bdelta = MAX(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B (all four bounds are inclusive)
+	int alow = MAX(0, opt_endpts.A[ch] - adelta);
+	int ahigh = MIN((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = MAX(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = MIN((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];	// replaced by the best pair found, if any
+
+	if (a_le_b)
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = MAX(a, blow); b <= bhigh; ++b)	// bhigh itself is a valid candidate
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices);	// can be DBL_MAX, which does not fit a float
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b <= bhigh; ++b)	// bhigh itself is a valid candidate
+		for (int a = MAX(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices);	// can be DBL_MAX, which does not fit a float
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;	// note: the caller's error variable is updated through the reference
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static double optimize_one(const Vec4 colors[], int np, double orig_err, const IntEndptsRGB_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB_2 &opt_endpts)
+{
+	double opt_err = orig_err;	// best error so far; this is the value returned
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB_2 new_a, new_b;
+	IntEndptsRGB_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		// (errors stay in double: they are compared against the double opt_err below)
+		double err0 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+		double err1 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				assert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				assert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+			double err = perturb_one(colors, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				assert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		double new_err = exhaustive(colors, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)	// NOTE(review): exhaustive() takes opt_err by reference and already lowers it to the value it returns, so this test looks like it can never be true -- confirm
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					assert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+// this will return a valid set of endpoints in opt_endpts regardless of whether it improves orig_endpts or not
+static void optimize_endpts(const Tile &tile, int shapeindex, const double orig_err[NREGIONS], 
+							const IntEndptsRGB_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, double opt_err[NREGIONS], IntEndptsRGB_2 opt_endpts[NREGIONS])
+{
+	Vec4 pixels[Tile::TILE_TOTAL];
+	int scratch_indices[Tile::TILE_TOTAL];	// map_colors needs an index buffer; its contents are not used here
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		const RegionPrec &prec = pattern_prec.region_precs[region];
+		// collect the pixels in the region
+		int np = 0;
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+		{
+			if (REGION(x,y,shapeindex) != region)
+				continue;
+			pixels[np++] = tile.data[y][x];
+		}
+
+		// begin with the caller's endpoints and error; they stay in place unless an lsb mode beats them
+		IntEndptsRGB_2 trial_in = orig_endpts[region], trial_out;
+		double best_err = orig_err[region];
+		opt_endpts[region] = trial_in;
+		opt_err[region] = best_err;
+
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			// bit 0 of lsbmode selects A's lsb, bit 1 selects B's
+			trial_in.a_lsb = lsbmode & 1;
+			trial_in.b_lsb = (lsbmode >> 1) & 1;
+
+			// exact error for trial_in: with DBL_MAX as the limit map_colors never takes its early exit
+			const double trial_in_err = map_colors(pixels, np, trial_in, prec, DBL_MAX, scratch_indices);
+			// now try to optimize these endpoints
+			const double trial_out_err = optimize_one(pixels, np, trial_in_err, trial_in, prec, trial_out);
+			if (!(trial_out_err < best_err))
+				continue;
+
+			// improvement: update the best so far and the output endpoints and errors
+			best_err = trial_out_err;
+			opt_err[region] = trial_out_err;
+			opt_endpts[region] = trial_out;
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static double refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	double base_err[NREGIONS], opt_err[NREGIONS], expected_opt_err[NREGIONS];
+	IntEndptsRGB_2 base_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int base_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		// baseline for this pattern: quantize, pick indices, put the endpoints in their stored order
+		quantize_endpts(endpts, pattern_precs[sp], base_endpts);
+		assign_indices(tile, shapeindex_best, base_endpts, pattern_precs[sp], base_indices, base_err);
+		swap_indices(base_endpts, base_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(base_endpts);
+		// heuristic: endpoints that do not fit the pattern now are assumed not to fit after optimizing either
+		if (!endpts_fit(base_endpts, patterns[sp]))
+			continue;
+
+		// the optimizer is handed untransformed endpoints
+		if (patterns[sp].transformed)
+			transform_inverse(base_endpts);
+		optimize_endpts(tile, shapeindex_best, base_err, base_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+		assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+		for (int i=0; i<NREGIONS; ++i)
+			assert(expected_opt_err[i] == opt_err[i]);
+		swap_indices(opt_endpts, opt_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(opt_endpts);
+		// total error over all regions, before and after optimization
+		double base_total = 0, opt_total = 0;
+		for (int region = 0; region < NREGIONS; ++region)
+		{
+			base_total += base_err[region];
+			opt_total += opt_err[region];
+		}
+		const bool use_optimized = endpts_fit(opt_endpts, patterns[sp]) && opt_total < base_total;
+		if (!use_optimized && patterns[sp].transformed)
+			transform_forward(base_endpts);	// falling back to the unoptimized endpoints, which we know will fit
+		if (use_optimized)
+			emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+		else
+			emit_block(base_endpts, shapeindex_best, patterns[sp], base_indices, block);
+		return use_optimized ? opt_total : base_total;
+	}
+	throw "No candidate found, should never happen (mode avpcl 0).";
+}
+
+static void clamp(Vec4 &v)
+{
+	// the fourth component is not clamped but overwritten with RGBA_MAX
+	v.W() = RGBA_MAX;
+
+	// limit the first three components to [RGBA_MIN, RGBA_MAX]; lower bound first, then upper bound
+	if (RGBA_MIN > v.X()) v.X() = RGBA_MIN;		if (RGBA_MAX < v.X()) v.X() = RGBA_MAX;
+	if (RGBA_MIN > v.Y()) v.Y() = RGBA_MIN;		if (RGBA_MAX < v.Y()) v.Y() = RGBA_MAX;
+	if (RGBA_MIN > v.Z()) v.Z() = RGBA_MIN;		if (RGBA_MAX < v.Z()) v.Z() = RGBA_MAX;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vec4 palette[NREGIONS][NINDICES])
+{
+	for (int r = 0; r < NREGIONS; ++r)	// one palette per region, built directly from the float endpoints
+		for (int entry = 0; entry < NINDICES; ++entry)
+			palette[r][entry] = PALETTE_LERP(endpts[r].A, endpts[r].B, entry, 0.0, float(DENOM));
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static double map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vec4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	// sum, over the pixels of the tile, of the smallest metric4 error found in the pixel's region palette
+	double toterr = 0;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double err, besterr = DBL_MAX;
+
+		// the scan also stops as soon as an exact match (error 0) has been found
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+// for this mode, we assume alpha = 255 constant and compress only the RGB portion.
+// however, we do the error check against the actual alpha values supplied for the tile.
+// fills endpts with a rough (unquantized) fit for every region and returns the total error of that fit.
+static double rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		Vec4 colors[Tile::TILE_TOTAL];
+		Vec4 mean(0,0,0,0);
+		int np = 0;
+
+		// collect the region's pixels; mean holds their running sum until it is divided below
+		for (int y = 0; y < tile.size_y; y++)
+		{
+			for (int x = 0; x < tile.size_x; x++)
+			{
+				if (REGION(x,y,shapeindex) != region)
+					continue;
+				colors[np] = tile.data[y][x];
+				mean += tile.data[y][x];
+				++np;
+			}
+		}
+
+		// an empty region gets two identical endpoints: (0,0,0) with the fourth component at RGBA_MAX
+		if (np == 0)
+		{
+			Vec4 zero(0,0,0,RGBA_MAX);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+
+		// with one or two pixels the pixels themselves are the endpoints
+		if (np <= 2)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[np-1];
+			continue;
+		}
+
+		// from here on there are at least three pixels: fit a line through them
+		mean /= float(np);
+		// rows of rdq are the pixels' RGB values relative to the mean; only look at RGB, ignore A
+		Matrix rdq(np, 3);
+		for (int i = 0; i < np; ++i)
+		{
+			rdq(i,0) = colors[i].X() - mean.X();
+			rdq(i,1) = colors[i].Y() - mean.Y();
+			rdq(i,2) = colors[i].Z() - mean.Z();
+		}
+
+		// perform a singular value decomposition
+		SVD svd(rdq);
+
+		// get the principal component direction (the one with the largest weight)
+		Vec4 direction(svd.R()(0,0), svd.R()(0,1), svd.R()(0,2), 0.0);
+
+		// smallest and largest projection of the centered pixels onto that direction
+		double minp = DBL_MAX, maxp = -DBL_MAX;
+		for (int i = 0; i < np; i++)
+		{
+			float dp = rdq(i,0) * direction.X() + rdq(i,1)*direction.Y() + rdq(i,2)*direction.Z();
+			minp = (dp < minp) ? dp : minp;
+			maxp = (dp > maxp) ? dp : maxp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*direction;
+		endpts[region].B = mean + maxp*direction;
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(double *list1, int *list2, int i, int j)
+{
+	const double key_i = list1[i];	const int tag_i = list2[i];	// remember slot i of both parallel lists
+	list1[i] = list1[j];			list2[i] = list2[j];
+	list1[j] = key_i;				list2[j] = tag_i;
+}
+
+double AVPCL::compress_mode0(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=NSHAPES/4;
+
+	FltEndpts candidates[NSHAPES][NREGIONS];	// rough endpoints of every shape, indexed by shape
+	double rough_err[NSHAPES];					// rough error of the shape named in order[] at the same slot
+	int order[NSHAPES];
+	char scratch[AVPCL::BLOCKSIZE];
+
+	// rough-fit every shape
+	for (int shape = 0; shape < NSHAPES; ++shape)
+	{
+		order[shape] = shape;
+		rough_err[shape] = rough(t, shape, candidates[shape]);
+	}
+
+	// partial sort: afterwards slots 0..NITEMS-1 hold the smallest rough errors in ascending order
+	for (int i = 0; i < NITEMS; ++i)
+		for (int j = i + 1; j < NSHAPES; ++j)
+			if (rough_err[i] > rough_err[j])
+				swap(rough_err, order, i, j);
+
+	// refine the NITEMS most promising shapes and keep the block with the lowest error;
+	// the search ends early once a block with zero error has been produced
+	double msebest = DBL_MAX;
+	for (int i = 0; i < NITEMS && msebest > 0; ++i)
+	{
+		const double mse = refine(t, order[i], candidates[order[i]], scratch);
+		if (mse < msebest)
+		{
+			msebest = mse;
+			memcpy(block, scratch, sizeof(scratch));
+		}
+	}
+
+	return msebest;
+}
+
diff --git a/src/nvtt/bc7/avpcl_mode1.cpp b/src/nvtt/bc7/avpcl_mode1.cpp
new file mode 100644
index 0000000..bee9daa
--- /dev/null
+++ b/src/nvtt/bc7/avpcl_mode1.cpp
@@ -0,0 +1,1049 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x10	(666x2).1 (666x2).1 64p 3bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "arvo/Vec4.h"
+#include "arvo/Matrix.h"
+#include "arvo/SVD.h"
+#include "utils.h"
+#include "endpts.h"
+
+#include <assert.h>
+
+#include "shapes_two.h"
+
+using namespace ArvoMath;
+
+#define	NLSBMODES	2		// number of different lsb modes per region. since we have one .1 per region, that can have 2 values
+
+#define NINDICES	8		// palette entries per region
+#define	INDEXBITS	3		// bits of a stored index that is not shortened (NINDICES == 1<<INDEXBITS)
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// top bit of an index; swap_indices keeps it clear at each region's anchor position
+#define	DENOM		(NINDICES-1)	// denominator passed to PALETTE_LERP
+#define	BIAS		(DENOM/2)	// bias passed to PALETTE_LERP when interpolating quantized endpoints
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)		// column of linear pixel position pos (its low two bits)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)	// row of linear pixel position pos (the next two bits)
+
+#define	NBITSIZES	(NREGIONS*2)	// one bit size for endpoint A and one for endpoint B of every region
+#define	ABITINDEX(region)	(2*(region)+0)	// slot of the region's endpoint A in ChanBits::nbitsizes
+#define	BBITINDEX(region)	(2*(region)+1)	// slot of the region's endpoint B in ChanBits::nbitsizes
+
+struct ChanBits	// field widths, in bits, of the endpoint values of one color channel
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel; entry ABITINDEX(r) is region r's endpoint A, BBITINDEX(r) its endpoint B
+};
+
+struct Pattern	// one way of laying out the header of a block: field widths plus the mode value that announces it
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;	// verilog description of encoding for this mode; const because the table below initializes it from a string literal
+};
+
+#define	NPATTERNS 1	// number of entries in patterns[] and pattern_precs[]
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		xfm	mode  mb	(each color: four bit sizes, see ABITINDEX/BBITINDEX)
+	6,6,6,6,	6,6,6,6,	6,6,6,6,	0,	0x2, 2, "",	// 6 bits for every endpoint value, not transformed, mode value 0x2 written in 2 bits
+};
+
+struct RegionPrec	// stored precision, in bits, of the two endpoints of one region
+{
+	int	endpt_a_prec[NCHANNELS_RGB];	// endpoint A, one entry per color channel
+	int endpt_b_prec[NCHANNELS_RGB];	// endpoint B, one entry per color channel
+};
+
+struct PatternPrec	// endpoint precisions of every region for one pattern
+{
+	RegionPrec region_precs[NREGIONS];	// indexed by region
+};
+
+
+// this is the precision for each channel and region; entry i belongs to patterns[i]
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO assert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	6,6,6, 6,6,6, 6,6,6, 6,6,6,		// each group of three is one endpoint's per-channel precision: A then B, region after region
+};
+
+// number of bits required to store n; a signed field spends one additional bit on the sign
+static int nbits(int n, bool issigned)
+{
+	// zero occupies no bits at all, signed or not
+	if (n == 0)
+		return 0;
+
+	int count = 0;
+	if (n < 0)
+	{
+		// negative values are only legal in a signed field; shift until nothing but the sign (-1) is left
+		assert (issigned);
+		while (n < -1) { n >>= 1; ++count; }
+		return count + 1;
+	}
+	while (n != 0) { n >>= 1; ++count; }	// count the magnitude bits of a positive value
+	return issigned ? count + 1 : count;
+}
+
+
+static void transform_forward(IntEndptsRGB_1 ep[NREGIONS])	// not implemented for this mode; ep is left untouched
+{
+	assert(0);	// trips whenever called; the only entry of patterns has transformed == 0
+}
+
+static void transform_inverse(IntEndptsRGB_1 ep[NREGIONS])	// not implemented for this mode; ep is left untouched
+{
+	assert(0);	// trips whenever called; the only entry of patterns has transformed == 0
+}
+
+// endpoints arrive as 777,777; keep the upper bits of every value (666,666) and replace the six
+// dropped low bits by a single shared lsb, which is set when at least 3 of them were 1
+static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_1& compr_endpts)
+{
+	int ones = 0;	// how many of the dropped low bits were set
+
+	for (int j = 0; j != NCHANNELS_RGB; ++j)
+	{
+		compr_endpts.A[j] = endpts.A[j] >> 1;
+		compr_endpts.B[j] = endpts.B[j] >> 1;
+		assert (compr_endpts.A[j] < 64);
+		assert (compr_endpts.B[j] < 64);
+
+		ones += (endpts.A[j] & 1) + (endpts.B[j] & 1);
+	}
+	compr_endpts.lsb = (ones >= 3);
+}
+
+static void uncompress_one(const IntEndptsRGB_1& compr_endpts, IntEndptsRGB& endpts)
+{
+	for (int ch = NCHANNELS_RGB - 1; ch >= 0; --ch)	// append the shared lsb below every stored value of A and B
+	{
+		endpts.B[ch] = (compr_endpts.B[ch] << 1) | compr_endpts.lsb;
+		endpts.A[ch] = (compr_endpts.A[ch] << 1) | compr_endpts.lsb;
+	}
+}
+
+static void uncompress_endpoints(const IntEndptsRGB_1 compr_endpts[NREGIONS], IntEndptsRGB endpts[NREGIONS])
+{
+	for (int region = 0; region != NREGIONS; ++region)	// expand the endpoint pair of every region in turn
+		uncompress_one(compr_endpts[region], endpts[region]);
+}
+
+static void compress_endpoints(const IntEndptsRGB endpts[NREGIONS], IntEndptsRGB_1 compr_endpts[NREGIONS])
+{
+	for (int region = 0; region != NREGIONS; ++region)	// reduce the endpoint pair of every region in turn
+		compress_one(endpts[region], compr_endpts[region]);
+}
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGB_1 q_endpts[NREGIONS])
+{
+	for (int region = 0; region != NREGIONS; ++region)
+	{
+		const RegionPrec &prec = pattern_prec.region_precs[region];
+		IntEndptsRGB wide;	// quantized with one extra bit (+1, uncompressed space); compress_one then reduces it to the stored form
+		wide.A[0] = Utils::quantize(endpts[region].A.X(), prec.endpt_a_prec[0]+1);
+		wide.A[1] = Utils::quantize(endpts[region].A.Y(), prec.endpt_a_prec[1]+1);
+		wide.A[2] = Utils::quantize(endpts[region].A.Z(), prec.endpt_a_prec[2]+1);
+		wide.B[0] = Utils::quantize(endpts[region].B.X(), prec.endpt_b_prec[0]+1);
+		wide.B[1] = Utils::quantize(endpts[region].B.Y(), prec.endpt_b_prec[1]+1);
+		wide.B[2] = Utils::quantize(endpts[region].B.Z(), prec.endpt_b_prec[2]+1);
+		compress_one(wide, q_endpts[region]);
+	}
+}
+
+// the index at each region's anchor position must have a 0 high-order bit: where it does not, swap the region's endpoints and invert its indices
+static void swap_indices(IntEndptsRGB_1 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const int anchor = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+		const int x = POS_TO_X(anchor), y = POS_TO_Y(anchor);
+		assert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (!(indices[y][x] & HIGH_INDEXBIT))
+			continue;	// high bit already clear, this region needs no change
+		for (int ch = 0; ch < NCHANNELS_RGB; ++ch)	// exchange A and B channel by channel; lsb is not touched
+		{
+			const int held = endpts[region].A[ch];
+			endpts[region].A[ch] = endpts[region].B[ch];
+			endpts[region].B[ch] = held;
+		}
+		// invert every index of the region so that it refers to the same color under the swapped endpoints
+		for (int row = 0; row < Tile::TILE_H; row++)
+		for (int col = 0; col < Tile::TILE_W; col++)
+			if (REGION(col,row,shapeindex) == region)
+				indices[row][col] = NINDICES - 1 - indices[row][col];
+	}
+}
+
+static bool endpts_fit(IntEndptsRGB_1 endpts[NREGIONS], const Pattern &p)	// do the endpoints fit pattern p?
+{
+	return true;	// unconditionally yes in this mode: neither endpts nor p is examined
+}
+
+
+static void write_header(const IntEndptsRGB_1 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	// endpoints are written channel by channel: within a channel, A then B of each region in turn
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+		for (int region = 0; region < NREGIONS; ++region)
+		{
+			out.write(endpts[region].A[ch], p.chan[ch].nbitsizes[ABITINDEX(region)]);
+			out.write(endpts[region].B[ch], p.chan[ch].nbitsizes[BBITINDEX(region)]);
+		}
+	// the header ends with one shared lsb per region
+	for (int region = 0; region < NREGIONS; ++region)
+		out.write(endpts[region].lsb, 1);
+	assert (out.getptr() == 82);
+}
+
+static void read_header(Bits &in, IntEndptsRGB_1 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	// getmode is called only to advance past the mode field; with a single pattern its value is not needed
+	(void) AVPCL::getmode(in);
+	pat_index = 0;
+	assert (pat_index >= 0 && pat_index < NPATTERNS);
+	assert (in.getptr() == patterns[pat_index].modebits);
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+	// endpoints come in the order write_header stores them: per channel, A then B of each region
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		const ChanBits &widths = p.chan[ch];
+		for (int region = 0; region < NREGIONS; ++region)
+		{
+			endpts[region].A[ch] = in.read(widths.nbitsizes[ABITINDEX(region)]);
+			endpts[region].B[ch] = in.read(widths.nbitsizes[BBITINDEX(region)]);
+		}
+	}
+	for (int region = 0; region < NREGIONS; ++region)	// then one shared lsb per region
+		endpts[region].lsb  = in.read(1);
+	assert (in.getptr() == 82);
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	// anchor position of every region for this shape
+	int anchors[NREGIONS];
+	for (int region = 0; region < NREGIONS; ++region)
+		anchors[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		// an index takes INDEXBITS bits, except at an anchor position where one bit fewer is written
+		int width = INDEXBITS;
+		for (int region = 0; region < NREGIONS; ++region)
+			if (anchors[region] == pos)
+				width = INDEXBITS - 1;
+
+		const int col = POS_TO_X(pos);
+		const int row = POS_TO_Y(pos);
+		out.write(indices[row][col], width);
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	// anchor position of every region for this shape
+	int anchors[NREGIONS];
+	for (int region = 0; region < NREGIONS; ++region)
+		anchors[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		// an index takes INDEXBITS bits, except at an anchor position where one bit fewer is read
+		int width = INDEXBITS;
+		for (int region = 0; region < NREGIONS; ++region)
+			if (anchors[region] == pos)
+				width = INDEXBITS - 1;
+
+		const int col = POS_TO_X(pos);
+		const int row = POS_TO_Y(pos);
+		indices[row][col] = in.read(width);
+	}
+}
+
+static void emit_block(const IntEndptsRGB_1 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+	// the header comes first: mode, shape index, endpoints and the per-region lsb
+	write_header(endpts, shapeindex, p, out);
+	// the indices follow the header directly
+	write_indices(indices, shapeindex, out);
+	// header and indices together must account for exactly AVPCL::BITSIZE bits
+	assert(out.getptr() == AVPCL::BITSIZE);
+}
+
+// builds the NINDICES-entry palette of one region from its stored endpoints:
+// the endpoints are expanded with their shared lsb, passed through Utils::unquantize and then
+// interpolated per channel with PALETTE_LERP; the fourth component of every entry is RGBA_MAX
+static void generate_palette_quantized(const IntEndptsRGB_1 &endpts_1, const RegionPrec &region_prec, Vec4 palette[NINDICES])
+{
+	IntEndptsRGB wide;
+
+	uncompress_one(endpts_1, wide);
+
+	// scale endpoints
+	// +1 since we are in uncompressed space
+	int lo[NCHANNELS_RGB], hi[NCHANNELS_RGB];			// really need a IntVec4...
+	// lo holds the channels of endpoint A, hi those of endpoint B
+
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		lo[ch] = Utils::unquantize(wide.A[ch], region_prec.endpt_a_prec[ch]+1);
+		hi[ch] = Utils::unquantize(wide.B[ch], region_prec.endpt_b_prec[ch]+1);
+	}
+
+	// note: don't simplify to a + ((b-a)*i + BIAS)/DENOM as that doesn't work due to the way C handles integer division of negatives
+
+	// fill the palette entry by entry
+	for (int i = 0; i < NINDICES; ++i)
+	{
+		Vec4 &entry = palette[i];
+
+		// interpolate
+		entry.X() = PALETTE_LERP(lo[0], hi[0], i, BIAS, DENOM);
+		entry.Y() = PALETTE_LERP(lo[1], hi[1], i, BIAS, DENOM);
+		entry.Z() = PALETTE_LERP(lo[2], hi[2], i, BIAS, DENOM);
+
+		// constant alpha
+		entry.W() = RGBA_MAX;
+	}
+}
+
+// sign extend but only if it was transformed; not implemented for this mode, whose only pattern has transformed == 0
+static void sign_extend(Pattern &p, IntEndptsRGB_1 endpts[NREGIONS])
+{
+	assert(0);	// trips whenever called; p and endpts are left untouched
+}
+
+void AVPCL::decompress_mode1(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+	// header: pattern, shape index and the stored endpoints of every region
+	Pattern p;
+	IntEndptsRGB_1 endpts[NREGIONS];
+	int shapeindex, pat_index;
+	read_header(in, endpts, shapeindex, p, pat_index);
+	if (p.transformed)
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+	// one palette per region
+	Vec4 palette[NREGIONS][NINDICES];
+	for (int region = 0; region < NREGIONS; ++region)
+		generate_palette_quantized(endpts[region], pattern_precs[pat_index].region_precs[region], palette[region]);
+
+	// the indices make up the remainder of the block
+	int indices[Tile::TILE_H][Tile::TILE_W];
+	read_indices(in, shapeindex, indices);
+	assert(in.getptr() == AVPCL::BITSIZE);
+
+	// every pixel takes the entry its index selects in the palette of the region it belongs to
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+	{
+		const int region = REGION(x,y,shapeindex);
+		t.data[y][x] = palette[region][indices[y][x]];
+	}
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+// gives up and returns DBL_MAX as soon as the running total exceeds current_err; indices[] then holds -1 from the current color on
+static double map_colors(const Vec4 colors[], int np, const IntEndptsRGB_1 &endpts, const RegionPrec &region_prec, double current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vec4 palette[NINDICES];
+	double toterr = 0;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		double err, besterr = DBL_MAX;
+
+		// the scan also stops as soon as an exact match (error 0) has been found
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			err = Utils::metric4(colors[i], palette[j]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return DBL_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_1 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], double toterr[NREGIONS])
+{
+	// build list of possibles
+	Vec4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	for (int y = 0; y < Tile::TILE_H; y++)	// give every slot a defined index: slots outside a partial tile are not visited below but are read by swap_indices/write_indices
+	for (int x = 0; x < Tile::TILE_W; x++)
+		indices[y][x] = 0;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double err, besterr = DBL_MAX;
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static double perturb_one(const Vec4 colors[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB_1 &old_endpts, IntEndptsRGB_1 &new_endpts, 
+						  double old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGB_1 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+			float err = map_colors(colors, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if the value returned is less than orig_err, opt_endpts has been updated and indices[] holds valid indices; otherwise indices[] is all -1
+static double exhaustive(const Vec4 colors[], int np, int ch, const RegionPrec &region_prec, double orig_err, IntEndptsRGB_1 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB_1 temp_endpts;
+	double best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	double thr_scale = (double)np / (double)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = MAX(adelta, 3);
+	bdelta = MAX(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B; all four bounds are inclusive
+	int alow = MAX(0, opt_endpts.A[ch] - adelta);
+	int ahigh = MIN((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = MAX(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = MIN((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];	// best pair found so far; starts at the current endpoints
+
+	if (a_le_b)
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = MAX(a, blow); b <= bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b <= bhigh; ++b)
+		for (int a = MAX(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static double optimize_one(const Vec4 colors[], int np, double orig_err, const IntEndptsRGB_1 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB_1 &opt_endpts)
+{
+	double opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB_1 new_a, new_b;
+	IntEndptsRGB_1 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		float err0 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+		float err1 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				assert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				assert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+			float err = perturb_one(colors, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				assert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		double new_err = exhaustive(colors, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					assert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const double orig_err[NREGIONS], 
+							IntEndptsRGB_1 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, double opt_err[NREGIONS], IntEndptsRGB_1 opt_endpts[NREGIONS])
+{
+	// Refine the endpoints of each region on its own. Every lsb mode is used once as the starting point for
+	// optimize_one; the lowest-error result per region is returned in opt_endpts[] and opt_err[]. When nothing
+	// beats orig_err[], the original endpoints and error are passed through unchanged.
+	Vec4 region_pixels[Tile::TILE_TOTAL];
+	IntEndptsRGB_1 start, result;
+	int scratch_indices[Tile::TILE_TOTAL];		// filled in by map_colors, not read afterwards
+
+	for (int r = 0; r < NREGIONS; ++r)
+	{
+		// gather the pixels that belong to this region
+		int count = 0;
+		for (int row = 0; row < tile.size_y; ++row)
+		for (int col = 0; col < tile.size_x; ++col)
+		{
+			if (REGION(col,row,shapeindex) != r)
+				continue;
+			region_pixels[count++] = tile.data[row][col];
+		}
+
+		// the unmodified input is the answer until something better turns up
+		start = orig_endpts[r];
+		opt_endpts[r] = start;
+		opt_err[r] = orig_err[r];
+		double lowest = orig_err[r];
+
+		for (int lsb = 0; lsb < NLSBMODES; ++lsb)
+		{
+			start.lsb = lsb;
+
+			// the lsb was just changed, so the starting error has to be measured again; DBL_MAX is passed as
+			// the cutoff so that map_colors cannot stop early and we get the exact figure
+			const double start_err = map_colors(region_pixels, count, start, pattern_prec.region_precs[r], DBL_MAX, scratch_indices);
+			// try to improve on that starting point
+			const double result_err = optimize_one(region_pixels, count, start_err, start, pattern_prec.region_precs[r], result);
+			if (!(result_err < lowest))
+				continue;		// this lsb mode did not help
+			opt_err[r] = lowest = result_err;
+			opt_endpts[r] = result;
+		}
+	}
+}
+
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static double refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	double orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGB_1 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+	// Encode the tile for the given shape: the first pattern whose endpoints fit is written to block and its total error is returned.
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);		// the optimizer works on untransformed endpoints
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			for (int i=0; i<NREGIONS; ++i)
+				assert(expected_opt_err[i] == opt_err[i]);	// the optimizer's error must match the recomputed one exactly (floating-point ==)
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			assert (opt_toterr <= orig_toterr);		// optimizing is never expected to make the total worse
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);		// orig_endpts were inverse-transformed above, so redo the forward transform
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	throw "No candidate found, should never happen (mode avpcl 1).";	// note: the thrown object is a string literal (const char *)
+}
+
+static void clamp(Vec4 &v)
+{	// confine X, Y and Z to [RGBA_MIN, RGBA_MAX]; the lower bound is applied first, then the upper bound
+	v.X() = (v.X() < RGBA_MIN) ? RGBA_MIN : v.X();
+	v.X() = (v.X() > RGBA_MAX) ? RGBA_MAX : v.X();
+	v.Y() = (v.Y() < RGBA_MIN) ? RGBA_MIN : v.Y();
+	v.Y() = (v.Y() > RGBA_MAX) ? RGBA_MAX : v.Y();
+	v.Z() = (v.Z() < RGBA_MIN) ? RGBA_MIN : v.Z();
+	v.Z() = (v.Z() > RGBA_MAX) ? RGBA_MAX : v.Z();
+	v.W() = RGBA_MAX;	// W is not clamped, it is always overwritten with the maximum
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vec4 palette[NREGIONS][NINDICES])
+{
+	// every region gets NINDICES colors interpolated between its unquantized endpoints A and B (bias 0.0, float denominator)
+	for (int r = 0; r < NREGIONS; ++r)
+		for (int k = 0; k < NINDICES; ++k) palette[r][k] = PALETTE_LERP(endpts[r].A, endpts[r].B, k, 0.0, float(DENOM));
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static double map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// one palette per region, interpolated straight from the unquantized endpoints
+	Vec4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	// toterr accumulates, over every pixel of the tile, the error to the closest entry of that pixel's region palette
+	double toterr = 0;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double err, besterr = DBL_MAX;
+		// the scan also stops once an exact match (error 0) has been found
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static double rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{	// estimate unquantized endpoints for every region of this shape (written to endpts[]) and return the resulting map_colors error
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vec4 colors[Tile::TILE_TOTAL];
+		Vec4 mean(0,0,0,0);
+		// collect the region's pixels and accumulate their sum (divided by np further down)
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x];
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases: with 0, 1 or 2 pixels the endpoints are set directly (no SVD and no clamp call)
+		if (np == 0)
+		{
+			Vec4 zero(0,0,0,RGBA_MAX);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[0];
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[1];
+			continue;
+		}
+
+		Matrix rdq(np, 3);		// one row per pixel: R, G, B with the mean subtracted
+
+		mean /= float(np);
+
+		// only look at RGB; ignore A
+		for (int i = 0; i < np; ++i)
+		{
+			rdq(i,0) = colors[i].X() - mean.X();
+			rdq(i,1) = colors[i].Y() - mean.Y();
+			rdq(i,2) = colors[i].Z() - mean.Z();
+		}
+
+		// perform a singular value decomposition
+		SVD svd(rdq);
+
+		// get the principal component direction (well, the one with the largest weight)
+		Vec4 direction(svd.R()(0,0), svd.R()(0,1), svd.R()(0,2), 0.0);
+
+		// project each pixel value along the principal direction
+		double minp = DBL_MAX, maxp = -DBL_MAX;
+		for (int i = 0; i < np; i++)
+		{
+			float dp = rdq(i,0) * direction.X() + rdq(i,1)*direction.Y() + rdq(i,2)*direction.Z();
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*direction;
+		endpts[region].B = mean + maxp*direction;
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+	// score the shape using the unquantized endpoints just computed
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(double *list1, int *list2, int i, int j)
+{	// exchange elements i and j in both arrays so the two lists stay paired
+	const double held_value = list1[j]; list1[j] = list1[i]; list1[i] = held_value;
+	const int held_index = list2[j]; list2[j] = list2[i]; list2[i] = held_index;
+}
+
+double AVPCL::compress_mode1(const Tile &t, char *block)
+{
+	// How many of the roughly-ranked shapes get the expensive refinement. Reasonable values are 1, NSHAPES/4 and NSHAPES;
+	// NSHAPES/4 gets nearly all the cases, and raising it a bit (say by 3 or 4) squeezes the last bit out.
+	const int NITEMS=NSHAPES/4;
+
+	FltEndpts rough_endpts[NSHAPES][NREGIONS];	// rough endpoints of every shape
+	double rough_err[NSHAPES];					// rough error of every shape, partially sorted below
+	int shape_of[NSHAPES];						// shape index that travels with rough_err[]
+	char candidate[AVPCL::BLOCKSIZE];
+	double best = DBL_MAX;
+
+	// rough pass over every shape
+	for (int s = 0; s < NSHAPES; ++s)
+	{
+		shape_of[s] = s;
+		rough_err[s] = rough(t, s, rough_endpts[s]);
+	}
+
+	// partial sort: only the first NITEMS slots have to end up holding the smallest rough errors in ascending order
+	for (int slot = 0; slot < NITEMS; ++slot)
+	for (int other = slot + 1; other < NSHAPES; ++other)
+		if (rough_err[other] < rough_err[slot])
+			swap(rough_err, shape_of, slot, other);
+
+	// refine the most promising shapes and keep the block with the lowest error;
+	// the loop ends early once the best error has reached zero
+	for (int rank = 0; rank < NITEMS && best > 0; ++rank)
+	{
+		const int shape = shape_of[rank];
+		const double err = refine(t, shape, rough_endpts[shape], candidate);
+		if (!(err < best))
+			continue;
+		memcpy(block, candidate, sizeof(candidate));
+		best = err;
+	}
+
+	return best;
+}
+
diff --git a/src/nvtt/bc7/avpcl_mode2.cpp b/src/nvtt/bc7/avpcl_mode2.cpp
new file mode 100644
index 0000000..ef37dbe
--- /dev/null
+++ b/src/nvtt/bc7/avpcl_mode2.cpp
@@ -0,0 +1,1005 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x100 555x6 64p 2bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "arvo/Vec4.h"
+#include "arvo/Matrix.h"
+#include "arvo/SVD.h"
+#include "utils.h"
+#include "endpts.h"
+
+#include <assert.h>
+
+#include "shapes_three.h"
+
+using namespace ArvoMath;
+
+#define NINDICES	4	// palette entries per region
+#define	INDEXBITS	2	// bits written per pixel index (one less for each region's anchor pixel, see write_indices)
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// mask selecting the top bit of an index
+#define	DENOM		(NINDICES-1)	// denominator handed to PALETTE_LERP
+#define	BIAS		(DENOM/2)	// bias handed to PALETTE_LERP when interpolating quantized endpoints
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)	// x of a linear pixel position: its low two bits
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)	// y of a linear pixel position: the next two bits
+
+#define	NBITSIZES	6	// number of field widths kept per channel (see ChanBits)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel; write_header/read_header index it as region*2 (endpoint A) and region*2+1 (endpoint B)
+};
+
+struct Pattern
+{	// describes one block layout: per-channel field widths plus the mode value that identifies it
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;	// verilog description of encoding for this mode; const because it is initialized from a string literal
+};
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red			green			blue			xfm	mode  mb  encoding	(each color: widths for endpoint A,B of regions 0,1,2)
+	5,5,5,5,5,5,	5,5,5,5,5,5,	5,5,5,5,5,5,	0,	0x4, 3, "",
+};
+
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGB];	// precision of endpoint A per channel, as passed to Utils::quantize/unquantize
+	int endpt_b_prec[NCHANNELS_RGB];	// same for endpoint B
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS_THREE];	// one set of endpoint precisions per region
+};
+
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO assert to check this!
+
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	5,5,5, 5,5,5, 5,5,5, 5,5,5, 5,5,5, 5,5,5, 	// for regions 0, 1, 2 in turn: endpt_a_prec[3] then endpt_b_prec[3]
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	if (n == 0)
+		return 0;	// zero needs no bits at all, signed or not
+
+	int count = 0;
+	if (n < 0)
+	{
+		// negative: shift until only -1 is left, then add one bit for the sign
+		assert (issigned);
+		while (n < -1) { n >>= 1; ++count; }
+		return count + 1;
+	}
+	// positive: count the magnitude bits, plus a sign bit when the value is signed
+	while (n != 0) { n >>= 1; ++count; }
+	return issigned ? count + 1 : count;
+}
+
+#define	R_0	ep[0].A[i]	// shorthands used by transform_forward/transform_inverse; they rely on locals named ep and i
+#define	R_1 ep[0].B[i]
+#define	R_2 ep[1].A[i]
+#define	R_3	ep[1].B[i]	// NOTE(review): there is no shorthand for ep[2] although this mode has three regions -- confirm
+
+static void transform_forward(IntEndptsRGB ep[NREGIONS])
+{
+	// per channel, turn R_0, R_1 and R_2 into differences from R_3 (ep[1].B), which keeps its value
+	// NOTE(review): only ep[0] and ep[1] are touched -- confirm this is intended for a three-region mode
+	for (int i = 0; i != NCHANNELS_RGB; ++i)
+		{ R_0 -= R_3; R_1 -= R_3; R_2 -= R_3; }
+}
+
+static void transform_inverse(IntEndptsRGB ep[NREGIONS])
+{
+	// per channel, add R_3 (ep[1].B) back onto R_0, R_1 and R_2; R_3 itself is not modified
+	// NOTE(review): only ep[0] and ep[1] are touched -- confirm this is intended for a three-region mode
+	for (int i = 0; i != NCHANNELS_RGB; ++i)
+		{ R_0 += R_3; R_1 += R_3; R_2 += R_3; }
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, IntEndptsRGB q_endpts[NREGIONS_THREE])
+{
+	for (int r = 0; r < NREGIONS_THREE; ++r)	// quantize X/Y/Z of both endpoints of each region to that region's per-channel precision
+	{
+		const FltEndpts &flt = endpts[r];
+		const RegionPrec &prec = pattern_prec.region_precs[r];
+		IntEndptsRGB &q = q_endpts[r];
+		q.A[0] = Utils::quantize(flt.A.X(), prec.endpt_a_prec[0]);	q.B[0] = Utils::quantize(flt.B.X(), prec.endpt_b_prec[0]);
+		q.A[1] = Utils::quantize(flt.A.Y(), prec.endpt_a_prec[1]);	q.B[1] = Utils::quantize(flt.B.Y(), prec.endpt_b_prec[1]);
+		q.A[2] = Utils::quantize(flt.A.Z(), prec.endpt_a_prec[2]);	q.B[2] = Utils::quantize(flt.B.Z(), prec.endpt_b_prec[2]);
+	}
+}
+
+// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
+static void swap_indices(IntEndptsRGB endpts[NREGIONS_THREE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+	{
+		// the anchor is the pixel of this region whose index must end up with a zero high bit
+		const int anchor = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+		const int ax = POS_TO_X(anchor);
+		const int ay = POS_TO_Y(anchor);
+		assert(REGION(ax,ay,shapeindex) == region);		// double check the table
+		if (!(indices[ay][ax] & HIGH_INDEXBIT))
+			continue;		// high bit already clear, nothing to do for this region
+
+		// exchange endpoints A and B of the region ...
+		for (int c = 0; c < NCHANNELS_RGB; ++c)
+			{ const int held = endpts[region].A[c]; endpts[region].A[c] = endpts[region].B[c]; endpts[region].B[c] = held; }
+		// ... and mirror every index of the region (i becomes NINDICES-1-i) to go with the exchanged endpoints
+		for (int row = 0; row < Tile::TILE_H; ++row)
+		for (int col = 0; col < Tile::TILE_W; ++col)
+			if (REGION(col,row,shapeindex) == region)
+				indices[row][col] = NINDICES - 1 - indices[row][col];
+	}
+}
+
+static bool endpts_fit(IntEndptsRGB endpts[NREGIONS_THREE], const Pattern &p)
+{
+	return true;	// always reports a fit; neither argument is examined
+}
+
+
+static void write_header(const IntEndptsRGB endpts[NREGIONS_THREE], int shapeindex, const Pattern &p, Bits &out)
+{
+	// layout: mode, shape index, then channel by channel the A and B endpoint of every region
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+		for (int region = 0; region < NREGIONS_THREE; ++region)
+		{
+			out.write(endpts[region].A[ch], p.chan[ch].nbitsizes[2*region]);
+			out.write(endpts[region].B[ch], p.chan[ch].nbitsizes[2*region + 1]);
+		}
+	assert (out.getptr() == 99);	// the header is expected to end at bit 99
+}
+
+static void read_header(Bits &in, IntEndptsRGB endpts[NREGIONS_THREE], int &shapeindex, Pattern &p, int &pat_index)
+{
+	// getmode's result is not needed (there is only pattern 0); it is still called, and the read position is checked right after
+	(void) AVPCL::getmode(in);
+
+	pat_index = 0;
+	assert (pat_index >= 0 && pat_index < NPATTERNS);
+	assert (in.getptr() == patterns[pat_index].modebits);
+	p = patterns[pat_index];
+
+	shapeindex = in.read(SHAPEBITS);
+	// endpoints come channel by channel, A then B for every region (same order as write_header)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+		for (int region = 0; region < NREGIONS_THREE; ++region)
+		{
+			endpts[region].A[ch] = in.read(p.chan[ch].nbitsizes[2*region]);
+			endpts[region].B[ch] = in.read(p.chan[ch].nbitsizes[2*region + 1]);
+		}
+	assert (in.getptr() == 99);
+}
+
+
+// WORK PLACEHOLDER -- keep it simple for now
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	// the anchor position of each region is written with one bit less than the other pixels
+	int anchors[NREGIONS_THREE];
+	for (int r = 0; r < NREGIONS_THREE; ++r)
+		anchors[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int width = INDEXBITS;
+		for (int r = 0; r < NREGIONS_THREE; ++r)
+		{
+			if (anchors[r] != pos)
+				continue;
+			--width;		// this position is an anchor
+			break;
+		}
+		out.write(indices[POS_TO_Y(pos)][POS_TO_X(pos)], width);
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	// the anchor position of each region is read with one bit less than the other pixels
+	int anchors[NREGIONS_THREE];
+	for (int r = 0; r < NREGIONS_THREE; ++r)
+		anchors[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int width = INDEXBITS;
+		for (int r = 0; r < NREGIONS_THREE; ++r)
+		{
+			if (anchors[r] != pos)
+				continue;
+			--width;		// this position is an anchor
+			break;
+		}
+		indices[POS_TO_Y(pos)][POS_TO_X(pos)] = in.read(width);
+	}
+}
+
+static void emit_block(const IntEndptsRGB endpts[NREGIONS_THREE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);	// everything below is written into block through this object
+	// first the header (mode, shape index, endpoints) ...
+	write_header(endpts, shapeindex, p, out);
+	// ... then the per-pixel indices
+	write_indices(indices, shapeindex, out);
+	// the two writers together must have produced exactly AVPCL::BITSIZE bits
+	assert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGB &endpts, const RegionPrec &region_prec, Vec4 palette[NINDICES])
+{
+	// Build the NINDICES-entry palette of one region from its quantized endpoints.
+	//   endpts       quantized endpoints A and B
+	//   region_prec  per-channel precision of each endpoint, handed to Utils::unquantize
+	//   palette      receives the interpolated colors; W is set to RGBA_MAX in every entry
+
+	// red
+	const int red_a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]);
+	const int red_b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]);
+
+	// green
+	const int green_a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]);
+	const int green_b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]);
+
+	// blue
+	const int blue_a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]);
+	const int blue_b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]);
+
+	// interpolate the three channels between the scaled endpoints;
+	// the fourth component is the same constant for every entry
+	for (int i = 0; i < NINDICES; ++i)
+	{
+		Vec4 &entry = palette[i];
+		entry.X() = PALETTE_LERP(red_a, red_b, i, BIAS, DENOM);
+		entry.Y() = PALETTE_LERP(green_a, green_b, i, BIAS, DENOM);
+		entry.Z() = PALETTE_LERP(blue_a, blue_b, i, BIAS, DENOM);
+		entry.W() = RGBA_MAX;
+	}
+}
+
+// sign extend but only if it was transformed
+static void sign_extend(Pattern &p, IntEndptsRGB endpts[NREGIONS_THREE])
+{
+	assert (p.transformed != 0);	// must only be called for transformed patterns
+	// every endpoint except endpts[0].A goes through SIGN_EXTEND with its own field width from p.chan[i].nbitsizes
+	for (int i=0; i<NCHANNELS_RGB; ++i)
+	{
+		// endpts[0].A[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]);	// always positive here
+		endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[1]);
+		endpts[1].A[i] = SIGN_EXTEND(endpts[1].A[i], p.chan[i].nbitsizes[2]);
+		endpts[1].B[i] = SIGN_EXTEND(endpts[1].B[i], p.chan[i].nbitsizes[3]);
+		endpts[2].A[i] = SIGN_EXTEND(endpts[2].A[i], p.chan[i].nbitsizes[4]);
+		endpts[2].B[i] = SIGN_EXTEND(endpts[2].B[i], p.chan[i].nbitsizes[5]);
+	}
+}
+
+void AVPCL::decompress_mode2(const char *block, Tile &t)
+{
+	// header: pattern, shape index and the stored endpoints
+	Bits bits(block, AVPCL::BITSIZE);
+	Pattern pattern;
+	IntEndptsRGB ep[NREGIONS_THREE];
+	int shape, which_pattern;
+	read_header(bits, ep, shape, pattern, which_pattern);
+
+	// transformed patterns store deltas: sign extend them and undo the transform
+	if (pattern.transformed != 0)
+	{
+		sign_extend(pattern, ep);
+		transform_inverse(ep);
+	}
+
+	// one palette per region
+	Vec4 colors[NREGIONS_THREE][NINDICES];
+	const PatternPrec &precs = pattern_precs[which_pattern];
+	for (int r = 0; r < NREGIONS_THREE; ++r)
+		generate_palette_quantized(ep[r], precs.region_precs[r], colors[r]);
+
+	// the indices take up the rest of the block
+	int idx[Tile::TILE_H][Tile::TILE_W];
+	read_indices(bits, shape, idx);
+	assert(bits.getptr() == AVPCL::BITSIZE);
+	// lookup: every pixel takes the palette entry of its region selected by its index
+	for (int row = 0; row < Tile::TILE_H; ++row)
+	for (int col = 0; col < Tile::TILE_W; ++col)
+		t.data[row][col] = colors[REGION(col,row,shape)][idx[row][col]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static double map_colors(const Vec4 colors[], int np, const IntEndptsRGB &endpts, const RegionPrec &region_prec, double current_err, int indices[Tile::TILE_TOTAL])
+{
+	// indices[i] receives the palette entry chosen for colors[i]; the return value is the summed error,
+	// or DBL_MAX as soon as the running sum exceeds current_err (the remaining indices are then set to -1)
+	Vec4 palette[NINDICES];
+	double toterr = 0;
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		double err, besterr = DBL_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			err = Utils::metric4(colors[i], palette[j]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return DBL_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], double toterr[NREGIONS_THREE])
+{
+	// build list of possibles: one quantized palette per region; the per-region error sums start at zero
+	Vec4 palette[NREGIONS_THREE][NINDICES];
+
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	// for every pixel, store the best entry of its region's palette in indices[y][x] and add its error to toterr[region]
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double err, besterr = DBL_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static double perturb_one(const Vec4 colors[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB &old_endpts, IntEndptsRGB &new_endpts, 
+						  double old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// Search for a better value of one endpoint (A if do_b == 0, otherwise B) in channel ch.
+	// old_endpts is the starting point, new_endpts receives the best endpoints found, and temp_endpts is the
+	// candidate currently being tried. Returns old_err if nothing better was found, otherwise the improved error.
+
+	IntEndptsRGB temp_endpts;
+	double min_err = old_err;		// start with the best current error; kept in double so it is never rounded to float
+	int beststep = 0;				// only meaningful after an improvement has been recorded
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;			// stays -1 unless an improvement is found
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which): the step halves on every pass
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;		// candidate is outside the representable range
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;		// candidate is outside the representable range
+			}
+
+			double err = map_colors(colors, np, temp_endpts, region_prec, min_err, temp_indices);	// DBL_MAX when the cutoff min_err is exceeded
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static double exhaustive(const Vec4 colors[], int np, int ch, const RegionPrec &region_prec, double orig_err, IntEndptsRGB &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB temp_endpts;
+	double best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;	// stays -1 unless the search below improves on orig_err
+
+	double thr_scale = (double)np / (double)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+	// the search radius grows with the error; the thresholds are scaled by the fraction of the tile in this region
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = MAX(adelta, 3);
+	bdelta = MAX(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B; all four bounds are inclusive
+	int alow = MAX(0, opt_endpts.A[ch] - adelta);
+	int ahigh = MIN((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = MAX(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = MIN((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];	// best pair so far; starts at the current endpoints
+
+	if (a_le_b)
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = MAX(a, blow); b <= bhigh; ++b)		// bhigh is inclusive, like ahigh
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices);	// DBL_MAX when cut off
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b <= bhigh; ++b)		// bhigh is inclusive, like ahigh
+		for (int a = MAX(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices);	// DBL_MAX when cut off
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static double optimize_one(const Vec4 colors[], int np, double orig_err, const IntEndptsRGB &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB &opt_endpts)
+{
+	double opt_err = orig_err;
+	// opt_endpts starts as a copy of orig_endpts and is improved in place; the error of the final opt_endpts is returned
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB new_a, new_b;
+	IntEndptsRGB new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minimum
+		double err0 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+		double err1 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+		// errors stay in double: rounding them to float could make an unchanged error compare as an improvement
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				assert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				assert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+			double err = perturb_one(colors, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				assert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		double new_err = exhaustive(colors, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					assert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const double orig_err[NREGIONS_THREE],
+							const IntEndptsRGB orig_endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, double opt_err[NREGIONS_THREE], IntEndptsRGB opt_endpts[NREGIONS_THREE])
+{
+	// Optimize the endpoints of every region of a three-region shape independently.
+	// For each region, opt_endpts/opt_err receive optimize_one()'s result when it strictly lowers
+	// the error; otherwise they are copies of orig_endpts/orig_err, so the outputs are always valid.
+	Vec4 pixels[Tile::TILE_TOTAL];
+	IntEndptsRGB temp_in, temp_out;
+
+	for (int region=0; region<NREGIONS_THREE; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+				pixels[np++] = tile.data[y][x];
+
+		// default outcome: hand back the caller's endpoints and error unchanged
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		// temp_in is an unmodified copy, so orig_err[region] is still its error
+		const double temp_in_err = orig_err[region];
+
+		// now try to optimize these endpoints
+		const double temp_out_err = optimize_one(pixels, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+		// accept the optimized endpoints only when they are a strict improvement
+		if (temp_out_err < temp_in_err)
+		{
+			opt_err[region] = temp_out_err;
+			opt_endpts[region] = temp_out;
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static double refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_THREE], char *block)
+{
+	// Encode the tile with shape shapeindex_best using the first pattern the quantized endpoints fit: emit the
+	// optimized endpoints if they still fit and lower the total error, else the unoptimized ones. Returns the emitted error.
+	double orig_err[NREGIONS_THREE], opt_err[NREGIONS_THREE], orig_toterr, opt_toterr, expected_opt_err[NREGIONS_THREE];	// optimize_endpts() fills one expected error per region
+	IntEndptsRGB orig_endpts[NREGIONS_THREE], opt_endpts[NREGIONS_THREE];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed) transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed) transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			for (int i=0; i<NREGIONS_THREE; ++i)	// check every region that optimize_endpts() reported on
+				assert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS_THREE; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	throw "No candidate found, should never happen (avpcl mode 2).";
+}
+
+// limit the X, Y and Z components of v to [RGBA_MIN, RGBA_MAX]; W is not clamped, it is forced to RGBA_MAX
+static void clamp(Vec4 &v)
+{
+	// per component: lower-bound test first, then upper-bound test
+	if (RGBA_MIN > v.X()) v.X() = RGBA_MIN;	if (RGBA_MAX < v.X()) v.X() = RGBA_MAX;
+	if (RGBA_MIN > v.Y()) v.Y() = RGBA_MIN;	if (RGBA_MAX < v.Y()) v.Y() = RGBA_MAX;
+	if (RGBA_MIN > v.Z()) v.Z() = RGBA_MIN;	if (RGBA_MAX < v.Z()) v.Z() = RGBA_MAX;
+
+	v.W() = RGBA_MAX;
+}
+
+// fill palette[region][i] with the NINDICES colors interpolated between each region's unquantized A and B endpoints
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_THREE], Vec4 palette[NREGIONS_THREE][NINDICES])
+{
+	for (int r = 0; r < NREGIONS_THREE; ++r)
+		for (int idx = 0; idx < NINDICES; ++idx) palette[r][idx] = PALETTE_LERP(endpts[r].A, endpts[r].B, idx, 0.0, float(DENOM));
+}
+
+// generate a palette from unquantized endpoints, then pick the best palette color for every pixel of each region; returns the total error over all regions
+static double map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_THREE])
+{
+	// build list of possibles
+	Vec4 palette[NREGIONS_THREE][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	double toterr = 0;
+
+	// only the pixels inside the tile's actual size contribute to the error
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double besterr = DBL_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			const double err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static double rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_THREE])	// fills endpts with a first estimate per region; returns the map_colors() error of that estimate
+{
+	for (int region=0; region<NREGIONS_THREE; ++region)
+	{
+		int np = 0;
+		Vec4 colors[Tile::TILE_TOTAL];
+		Vec4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x];
+				mean += tile.data[y][x];	// running sum; divided by np further down
+				++np;
+			}
+
+		// handle simple cases: with fewer than three pixels the endpoints are taken directly (this path does not call clamp)
+		if (np == 0)
+		{
+			Vec4 zero(0,0,0,RGBA_MAX);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[0];
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[1];
+			continue;
+		}
+
+		Matrix rdq(np, 3);	// one row per pixel: its mean-subtracted X, Y, Z
+
+		mean /= float(np);
+
+		// only look at RGB; ignore A
+		for (int i = 0; i < np; ++i)
+		{
+			rdq(i,0) = colors[i].X() - mean.X();
+			rdq(i,1) = colors[i].Y() - mean.Y();
+			rdq(i,2) = colors[i].Z() - mean.Z();
+		}
+				
+		// perform a singular value decomposition
+		SVD svd(rdq);
+
+		// get the principal component direction (well, the one with the largest weight)
+		Vec4 direction(svd.R()(0,0), svd.R()(0,1), svd.R()(0,2), 0.0);
+
+		// project each pixel value along the principal direction
+		double minp = DBL_MAX, maxp = -DBL_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = rdq(i,0) * direction.X() + rdq(i,1)*direction.Y() + rdq(i,2)*direction.Z();
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*direction;
+		endpts[region].B = mean + maxp*direction;
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(double *list1, int *list2, int i, int j)	// exchange entries i and j in both parallel arrays
+{
+	const double d_i = list1[i], d_j = list1[j]; list1[i] = d_j; list1[j] = d_i;
+	const int k_i = list2[i], k_j = list2[j]; list2[i] = k_j; list2[j] = k_i;
+}
+
+double AVPCL::compress_mode2(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int nrefine = NSHAPES/4;
+
+	// rough endpoint estimates and errors for every shape; order[] ends up listing the nrefine best shapes first
+	struct {
+		FltEndpts endpts[NREGIONS_THREE];
+	} candidates[NSHAPES];
+	double rough_err[NSHAPES];
+	int order[NSHAPES];
+	char trial[AVPCL::BLOCKSIZE];
+	double best = DBL_MAX;
+
+	for (int s=0; s<NSHAPES; ++s)
+	{
+		order[s] = s;
+		rough_err[s] = rough(t, s, candidates[s].endpts);
+	}
+
+	// partial selection sort -- only the first nrefine slots need to hold the smallest errors
+	for (int lo=0; lo<nrefine; ++lo)
+	for (int hi=lo+1; hi<NSHAPES; ++hi)
+		if (rough_err[hi] < rough_err[lo])
+			swap(rough_err, order, lo, hi);
+	// refine the most promising shapes and keep the best block; stop as soon as the error is no longer positive
+	for (int k=0; k<nrefine; ++k)
+	{
+		if (!(best > 0)) break;
+		const double mse = refine(t, order[k], candidates[order[k]].endpts, trial);
+		if (mse < best)
+		{
+			memcpy(block, trial, sizeof(trial));
+			best = mse;
+		}
+	}
+	return best;
+}
+
diff --git a/src/nvtt/bc7/avpcl_mode3.cpp b/src/nvtt/bc7/avpcl_mode3.cpp
new file mode 100644
index 0000000..cf19759
--- /dev/null
+++ b/src/nvtt/bc7/avpcl_mode3.cpp
@@ -0,0 +1,1061 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x1000 777.1x4 64p 2bi (30b)
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "arvo/Vec4.h"
+#include "arvo/Matrix.h"
+#include "arvo/SVD.h"
+#include "utils.h"
+#include "endpts.h"
+
+#include <assert.h>
+
+#include "shapes_two.h"
+
+using namespace ArvoMath;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	4		// palette entries per region
+#define	INDEXBITS	2		// bits used to store a full-width index
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// mask for the top bit of an index
+#define	DENOM		(NINDICES-1)	// denominator handed to PALETTE_LERP
+#define	BIAS		(DENOM/2)	// bias handed to PALETTE_LERP when building the quantized palette
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minimum easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)	// column: low two bits of a linear pixel position
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)	// row: the next two bits
+
+#define	NBITSIZES	(NREGIONS*2)	// one A and one B bit size per region
+#define	ABITINDEX(region)	(2*(region)+0)	// slot of a region's A endpoint in ChanBits::nbitsizes
+#define	BBITINDEX(region)	(2*(region)+1)	// slot of a region's B endpoint in ChanBits::nbitsizes
+
+struct ChanBits	// storage widths of one color channel, indexed with ABITINDEX/BBITINDEX
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern	// describes one bit layout (header fields and endpoint widths) of this mode
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;	// verilog description of encoding for this mode; const because the table initializes it from a string literal
+};
+
+#define	NPATTERNS 1	// this mode has a single bit layout
+#define	NREGIONS  2	// regions per shape
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		xfm	mode  mb
+	7,7,7,7,	7,7,7,7,	7,7,7,7,	0,	0x8, 4, "",	// every endpoint channel is 7 bits wide; not transformed; mode value 0x8 written in 4 bits; empty encoding string
+};
+
+struct RegionPrec	// per-channel precision of the two endpoints of one region
+{
+	int	endpt_a_prec[NCHANNELS_RGB];	// precision of endpoint A, one entry per channel
+	int endpt_b_prec[NCHANNELS_RGB];	// precision of endpoint B, one entry per channel
+};
+
+struct PatternPrec	// endpoint precisions of every region for one pattern
+{
+	RegionPrec region_precs[NREGIONS];	// indexed by region number
+};
+
+
+// this is the precision for each channel and region, in the compressed (lsb removed) space
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO assert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	7,7,7, 7,7,7, 7,7,7, 7,7,7,	// in member order: region 0 endpoint A, region 0 endpoint B, region 1 endpoint A, region 1 endpoint B
+};
+
+// return # of bits needed to store n; a signed representation costs one extra (sign) bit. a negative n requires issigned
+static int nbits(int n, bool issigned)
+{
+	if (n == 0)
+		return 0;	// no bits needed for 0, signed or not
+
+	int count = 0;
+	if (n < 0)
+	{
+		assert (issigned);
+		while (n < -1) { ++count; n >>= 1; }	// shift until only the sign remains
+		return count + 1;	// plus the sign bit
+	}
+
+	while (n != 0) { ++count; n >>= 1; }	// position of the highest set bit
+	if (issigned) ++count;	// plus the sign bit
+	return count;
+}
+
+static void transform_forward(IntEndptsRGB_2 ep[NREGIONS])	// not implemented: the only pattern in this file has transformed == 0
+{
+	assert(0);	// trips if a transformed pattern ever reaches this point
+}
+
+static void transform_inverse(IntEndptsRGB_2 ep[NREGIONS])	// not implemented: the only pattern in this file has transformed == 0
+{
+	assert(0);	// trips if a transformed pattern ever reaches this point
+}
+
+// endpoints are 888,888; reduce to 777,777 by dropping each channel's low bit, and store the majority of the dropped bits in a_lsb / b_lsb
+static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_2& compr_endpts)
+{
+	// endpoint A
+	int a_ones = 0;
+	for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+	{
+		if (endpts.A[ch] & 1) ++a_ones;
+		compr_endpts.A[ch] = endpts.A[ch] >> 1;
+		assert (compr_endpts.A[ch] < 128);
+	}
+	compr_endpts.a_lsb = (a_ones >= 2);
+
+	// endpoint B
+	int b_ones = 0;
+	for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+	{
+		if (endpts.B[ch] & 1) ++b_ones;
+		compr_endpts.B[ch] = endpts.B[ch] >> 1;
+		assert (compr_endpts.B[ch] < 128);
+	}
+	compr_endpts.b_lsb = (b_ones >= 2);
+}
+
+static void uncompress_one(const IntEndptsRGB_2& compr_endpts, IntEndptsRGB& endpts)	// widen each channel by one bit, using the endpoint's shared lsb as bit 0
+{
+	for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+		endpts.A[ch] = (compr_endpts.A[ch] << 1) | compr_endpts.a_lsb;	// endpoint A
+
+	for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+		endpts.B[ch] = (compr_endpts.B[ch] << 1) | compr_endpts.b_lsb;	// endpoint B
+}
+
+static void uncompress_endpoints(const IntEndptsRGB_2 compr_endpts[NREGIONS], IntEndptsRGB endpts[NREGIONS])
+{
+	int region = 0;	// expand each region's endpoint pair in turn
+	while (region < NREGIONS) { uncompress_one(compr_endpts[region], endpts[region]); ++region; }
+}
+
+static void compress_endpoints(const IntEndptsRGB endpts[NREGIONS], IntEndptsRGB_2 compr_endpts[NREGIONS])
+{
+	int region = 0;	// compress each region's endpoint pair in turn
+	while (region < NREGIONS) { compress_one(endpts[region], compr_endpts[region]); ++region; }
+}
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGB_2 q_endpts[NREGIONS])
+{
+	for (int r = 0; r < NREGIONS; ++r)	// quantize at pattern precision +1 (uncompressed space), then let compress_one fold the extra bit away
+	{
+		const RegionPrec &prec = pattern_prec.region_precs[r]; const FltEndpts &src = endpts[r];
+		IntEndptsRGB full;
+		full.A[0] = Utils::quantize(src.A.X(), prec.endpt_a_prec[0]+1);
+		full.A[1] = Utils::quantize(src.A.Y(), prec.endpt_a_prec[1]+1);
+		full.A[2] = Utils::quantize(src.A.Z(), prec.endpt_a_prec[2]+1);
+		full.B[0] = Utils::quantize(src.B.X(), prec.endpt_b_prec[0]+1);
+		full.B[1] = Utils::quantize(src.B.Y(), prec.endpt_b_prec[1]+1);
+		full.B[2] = Utils::quantize(src.B.Z(), prec.endpt_b_prec[2]+1);
+		compress_one(full, q_endpts[r]);
+	}
+}
+
+// swap endpoints as needed so that each region's anchor index (the one written with one bit less) has a 0 high-order bit
+static void swap_indices(IntEndptsRGB_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const int anchor = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+		const int ax = POS_TO_X(anchor), ay = POS_TO_Y(anchor);
+		assert(REGION(ax,ay,shapeindex) == region);		// double check the table
+
+		if ((indices[ay][ax] & HIGH_INDEXBIT) == 0)
+			continue;	// high bit already clear, nothing to do for this region
+
+		// high bit is set: exchange endpoints A and B, including their lsbs ...
+		for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+		{
+			const int a = endpts[region].A[ch]; endpts[region].A[ch] = endpts[region].B[ch]; endpts[region].B[ch] = a;
+		}
+		const int a_bit = endpts[region].a_lsb;
+		endpts[region].a_lsb = endpts[region].b_lsb;
+		endpts[region].b_lsb = a_bit;
+
+		// ... and mirror every index of the region to match the exchanged endpoints
+		for (int py = 0; py < Tile::TILE_H; py++)
+		for (int px = 0; px < Tile::TILE_W; px++)
+			if (REGION(px,py,shapeindex) == region) indices[py][px] = NINDICES - 1 - indices[py][px];
+	}
+}
+
+static bool endpts_fit(IntEndptsRGB_2 endpts[NREGIONS], const Pattern &p)	// always reports a fit; neither argument is inspected
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	// layout: mode, shape index, endpoint values (channel by channel, region by region, A then B), then one lsb per endpoint
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+		for (int region=0; region<NREGIONS; ++region)
+		{
+			out.write(endpts[region].A[ch], p.chan[ch].nbitsizes[ABITINDEX(region)]);
+			out.write(endpts[region].B[ch], p.chan[ch].nbitsizes[BBITINDEX(region)]);
+		}
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		out.write(endpts[region].a_lsb, 1);
+		out.write(endpts[region].b_lsb, 1);
+	}
+	assert (out.getptr() == 98);	// the header is expected to end at bit 98
+}
+
+static void read_header(Bits &in, IntEndptsRGB_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);	// value is not used further; the assert below expects the read pointer to sit right after the mode bits
+
+	pat_index = 0;	// this mode has exactly one pattern
+	assert (pat_index >= 0 && pat_index < NPATTERNS);
+	assert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+
+	for (int ch=0; ch<NCHANNELS_RGB; ++ch)	// same order as write_header: channel by channel, region by region, A then B
+		for (int region=0; region<NREGIONS; ++region)
+		{
+			endpts[region].A[ch] = in.read(p.chan[ch].nbitsizes[ABITINDEX(region)]);
+			endpts[region].B[ch] = in.read(p.chan[ch].nbitsizes[BBITINDEX(region)]);
+		}
+
+	for (int region=0; region<NREGIONS; ++region)	// one shared lsb per endpoint
+	{
+		endpts[region].a_lsb  = in.read(1);
+		endpts[region].b_lsb  = in.read(1);
+	}
+
+	assert (in.getptr() == 98);	// the header is expected to end at bit 98
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	// the anchor position of each region is written with one bit less than every other index
+	int anchors[NREGIONS];
+	for (int r = 0; r < NREGIONS; ++r)
+		anchors[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int width = INDEXBITS;
+		for (int r = 0; r < NREGIONS; ++r)
+			if (anchors[r] == pos)
+			{
+				width = INDEXBITS - 1;
+				break;
+			}
+
+		out.write(indices[POS_TO_Y(pos)][POS_TO_X(pos)], width);
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	// the anchor position of each region is read with one bit less than every other index
+	int anchors[NREGIONS];
+	for (int r = 0; r < NREGIONS; ++r)
+		anchors[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int width = INDEXBITS;
+		for (int r = 0; r < NREGIONS; ++r)
+			if (anchors[r] == pos)
+			{
+				width = INDEXBITS - 1;
+				break;
+			}
+
+		indices[POS_TO_Y(pos)][POS_TO_X(pos)] = in.read(width);
+	}
+}
+
+// serialize one compressed block: the header (mode, shape, endpoints) followed by the per-pixel indices
+static void emit_block(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits bitstream(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, bitstream);
+	write_indices(indices, shapeindex, bitstream);
+
+	assert(bitstream.getptr() == AVPCL::BITSIZE);	// the write pointer must end exactly at the block size
+}
+
+static void generate_palette_quantized(const IntEndptsRGB_2 &endpts_2, const RegionPrec &region_prec, Vec4 palette[NINDICES])
+{
+	// Build the NINDICES palette colors of one region from its compressed endpoints:
+	// put the shared lsbs back, unquantize every channel of A and B, then interpolate.
+	// Alpha is not stored in this mode; every palette entry gets RGBA_MAX.
+	IntEndptsRGB full;
+	uncompress_one(endpts_2, full);
+
+	// scale endpoints; the precision is +1 since we are in uncompressed space
+	// red
+	int red_a = Utils::unquantize(full.A[0], region_prec.endpt_a_prec[0]+1);
+	int red_b = Utils::unquantize(full.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// green
+	int green_a = Utils::unquantize(full.A[1], region_prec.endpt_a_prec[1]+1);
+	int green_b = Utils::unquantize(full.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// blue
+	int blue_a = Utils::unquantize(full.A[2], region_prec.endpt_a_prec[2]+1);
+	int blue_b = Utils::unquantize(full.B[2], region_prec.endpt_b_prec[2]+1);
+
+	for (int i = 0; i < NINDICES; ++i)
+	{
+		Vec4 &entry = palette[i];
+
+		// interpolate the color channels
+		entry.X() = PALETTE_LERP(red_a, red_b, i, BIAS, DENOM);
+		entry.Y() = PALETTE_LERP(green_a, green_b, i, BIAS, DENOM);
+		entry.Z() = PALETTE_LERP(blue_a, blue_b, i, BIAS, DENOM);
+
+		// constant alpha
+		entry.W() = RGBA_MAX;
+	}
+}
+
+static void sign_extend(Pattern &p, IntEndptsRGB_2 endpts[NREGIONS])	// not implemented: the only pattern in this file has transformed == 0
+{
+	assert(0);	// trips if a transformed pattern ever reaches this point
+}
+
+void AVPCL::decompress_mode3(const char *block, Tile &t)
+{
+	Bits bitstream(block, AVPCL::BITSIZE);
+
+	// header: pattern, shape and the compressed endpoints of every region
+	Pattern pattern;
+	IntEndptsRGB_2 endpts[NREGIONS];
+	int shapeindex, pat_index;
+	read_header(bitstream, endpts, shapeindex, pattern, pat_index);
+
+	if (pattern.transformed)
+	{
+		sign_extend(pattern, endpts);
+		transform_inverse(endpts);
+	}
+
+	// one palette per region
+	Vec4 palette[NREGIONS][NINDICES];
+	for (int region = 0; region < NREGIONS; ++region)
+		generate_palette_quantized(endpts[region], pattern_precs[pat_index].region_precs[region], palette[region]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+	read_indices(bitstream, shapeindex, indices);
+
+	assert(bitstream.getptr() == AVPCL::BITSIZE);	// the whole block must have been consumed
+
+	// lookup: each pixel takes the palette entry of its region selected by its index
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static double map_colors(const Vec4 colors[], int np, const IntEndptsRGB_2 &endpts, const RegionPrec &region_prec, double current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vec4 palette[NINDICES];
+	double toterr = 0;
+	// returns DBL_MAX (not the true error) as soon as the running total exceeds current_err
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		double besterr = DBL_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			const double err = Utils::metric4(colors[i], palette[j]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values from the current color onwards so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return DBL_MAX;
+		}
+	}
+	return toterr;
+}
+
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], double toterr[NREGIONS])
+{
+	// build list of possibles; toterr[region] accumulates the error of the pixels mapped in that region
+	Vec4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	// positions outside tile.size_x/size_y are not visited below; start every entry at 0 so none is left uninitialized
+	for (int y = 0; y < Tile::TILE_H; y++) for (int x = 0; x < Tile::TILE_W; x++) indices[y][x] = 0;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double besterr = DBL_MAX;
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			const double err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static double perturb_one(const Vec4 colors[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, 
+						  double old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGB_2 temp_endpts;
+	double min_err = old_err;		// start with the best current error; kept in double so "no improvement" returns old_err exactly
+	int beststep = 0;				// only read after an improving step has set it
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (A when do_b is 0, otherwise B)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+			double err = map_colors(colors, np, temp_endpts, region_prec, min_err, temp_indices);	// DBL_MAX when the candidate is rejected early
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3 (both bounds inclusive, for A and for B).
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// on improvement orig_err and opt_endpts are updated and indices[] holds valid indices; otherwise the first np indices are -1
+static double exhaustive(const Vec4 colors[], int np, int ch, const RegionPrec &region_prec, double &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB_2 temp_endpts;
+	double best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	double thr_scale = (double)np / (double)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = MAX(adelta, 3);
+	bdelta = MAX(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B; all four bounds are inclusive
+	int alow = MAX(0, opt_endpts.A[ch] - adelta);
+	int ahigh = MIN((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = MAX(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = MIN((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];	// only used after an improvement has overwritten them
+
+	if (a_le_b)
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = MAX(a, blow); b <= bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+
+			double err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b <= bhigh; ++b)
+		for (int a = MAX(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+
+			double err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static double optimize_one(const Vec4 colors[], int np, double orig_err, const IntEndptsRGB_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB_2 &opt_endpts)
+{
+	// search for endpoints with a lower error than orig_err; opt_endpts always ends up valid and the error that goes with it is returned
+	double opt_err = orig_err;
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB_2 new_a, new_b;
+	IntEndptsRGB_2 new_endpt;
+	int do_b = 0;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minimum
+		double err0 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+		double err1 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				assert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				assert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+			double err = perturb_one(colors, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				assert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		const double prev_err = opt_err;	// exhaustive() takes opt_err by reference and lowers it itself, so remember the value to compare against
+		double new_err = exhaustive(colors, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+		if (new_err < prev_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					assert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+// Per-region endpoint optimization. Each region's output slot (opt_endpts / opt_err) is seeded with the
+// caller's original values and is only overwritten by a strictly better candidate, so opt_endpts holds a
+// valid set of endpoints on return whether or not anything improved on orig_endpts.
+static void optimize_endpts(const Tile &tile, int shapeindex, const double orig_err[NREGIONS], 
+							const IntEndptsRGB_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, double opt_err[NREGIONS], IntEndptsRGB_2 opt_endpts[NREGIONS])
+{
+	Vec4 region_pixels[Tile::TILE_TOTAL];
+	int scratch_indices[Tile::TILE_TOTAL];
+	IntEndptsRGB_2 candidate, improved;
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// gather the pixels that belong to this region, in raster order
+		int count = 0;
+
+		for (int row = 0; row < tile.size_y; ++row)
+		{
+			for (int col = 0; col < tile.size_x; ++col)
+			{
+				if (REGION(col,row,shapeindex) == region)
+				{
+					region_pixels[count] = tile.data[row][col];
+					++count;
+				}
+			}
+		}
+
+		// start from the unoptimized data so the outputs are valid even if nothing below helps
+		candidate = orig_endpts[region];
+		opt_endpts[region] = candidate;
+		opt_err[region] = orig_err[region];
+
+		double lowest_err = orig_err[region];
+
+		// walk every lsbmode: bit 0 of lsbmode selects a_lsb, bit 1 selects b_lsb
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			candidate.a_lsb = lsbmode & 1;
+			candidate.b_lsb = (lsbmode >> 1) & 1;
+
+			// get an exact error for this candidate; DBL_MAX is passed in the cutoff position so that
+			// map_colors takes no early-exit shortcut
+			const double candidate_err = map_colors(region_pixels, count, candidate, pattern_prec.region_precs[region], DBL_MAX, scratch_indices);
+
+			// then let optimize_one try to improve on it
+			const double improved_err = optimize_one(region_pixels, count, candidate_err, candidate, pattern_prec.region_precs[region], improved);
+
+			// keep the result only when it beats the best seen so far for this region
+			if (improved_err < lowest_err)
+			{
+				lowest_err = improved_err;
+				opt_err[region] = improved_err;
+				opt_endpts[region] = improved;
+			}
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+// Encode the tile with the given shape using the first pattern whose (possibly transformed) endpoints fit.
+//   tile            - pixels to encode
+//   shapeindex_best - shape (partition) to use
+//   endpts          - unquantized endpoints per region
+//   block           - receives the emitted compressed block
+// Returns the summed per-region error of whichever endpoint set (optimized or original) was emitted.
+// Throws a string literal if no pattern accepts the endpoints.
+static double refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	double orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGB_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		// quantize to this pattern's precision, pick indices, then put endpoints/indices in canonical order
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			// the optimizer works on untransformed endpoints, so undo the forward transform first
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// sanity check: re-assigning indices must reproduce exactly the error the optimizer reported
+			for (int i=0; i<NREGIONS; ++i)
+				assert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			// total the per-region errors for both candidates
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				// (orig_endpts were inverse-transformed above, so they must be transformed again before emitting)
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	// reached only when no pattern accepted the quantized endpoints
+	throw "No candidate found, should never happen (avpcl mode 3).";
+}
+
+// Limit the X, Y and Z components of v to the range [RGBA_MIN, RGBA_MAX].
+// The W component is not clamped: it is unconditionally overwritten with RGBA_MAX.
+static void clamp(Vec4 &v)
+{
+	// X component
+	if (RGBA_MIN > v.X()) { v.X() = RGBA_MIN; }
+	if (RGBA_MAX < v.X()) { v.X() = RGBA_MAX; }
+
+	// Y component
+	if (RGBA_MIN > v.Y()) { v.Y() = RGBA_MIN; }
+	if (RGBA_MAX < v.Y()) { v.Y() = RGBA_MAX; }
+
+	// Z component
+	if (RGBA_MIN > v.Z()) { v.Z() = RGBA_MIN; }
+	if (RGBA_MAX < v.Z()) { v.Z() = RGBA_MAX; }
+
+	// W component is forced, not clamped
+	v.W() = RGBA_MAX;
+}
+
+// Fill palette[region][0 .. NINDICES-1] by interpolating (PALETTE_LERP, bias 0.0, denominator DENOM)
+// between each region's unquantized endpoints A and B.
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vec4 palette[NREGIONS][NINDICES])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const FltEndpts &pair = endpts[region];
+		Vec4 *entries = palette[region];
+
+		for (int idx = 0; idx < NINDICES; ++idx)
+		{
+			entries[idx] = PALETTE_LERP(pair.A, pair.B, idx, 0.0, float(DENOM));
+		}
+	}
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+//   tile       - pixels to map
+//   shapeindex - shape used to decide which region (and therefore which palette) each pixel belongs to
+//   endpts     - unquantized endpoints, one pair per region
+static double map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vec4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	double toterr = 0;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double besterr = DBL_MAX;
+
+		// walk the palette in order; stop as soon as an exact match is found (besterr == 0)
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			const double err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+// Produce a quick, unquantized endpoint estimate for every region of the given shape and return the
+// total error map_colors reports for that estimate.
+//   tile       - pixels to fit
+//   shapeindex - shape whose regions are fitted
+//   endpts     - receives one endpoint pair per region
+static double rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		Vec4 samples[Tile::TILE_TOTAL];
+		Vec4 centroid(0,0,0,0);
+		int count = 0;
+
+		// collect the region's pixels and accumulate their sum
+		for (int row = 0; row < tile.size_y; ++row)
+		for (int col = 0; col < tile.size_x; ++col)
+		{
+			if (REGION(col,row,shapeindex) == region)
+			{
+				samples[count] = tile.data[row][col];
+				centroid += tile.data[row][col];
+				++count;
+			}
+		}
+
+		// regions with at most two pixels need no fitting (and are not clamped)
+		if (count == 0)
+		{
+			Vec4 zero(0,0,0,RGBA_MAX);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		if (count == 1 || count == 2)
+		{
+			// one pixel: both endpoints are that pixel; two pixels: one endpoint each
+			endpts[region].A = samples[0];
+			endpts[region].B = samples[count - 1];
+			continue;
+		}
+
+		Matrix centered(count, 3);
+
+		centroid /= float(count);
+
+		// only the X, Y, Z (RGB) components take part in the fit; the fourth component is ignored
+		for (int i = 0; i < count; ++i)
+		{
+			centered(i,0) = samples[i].X() - centroid.X();
+			centered(i,1) = samples[i].Y() - centroid.Y();
+			centered(i,2) = samples[i].Z() - centroid.Z();
+		}
+
+		// singular value decomposition of the mean-centered samples
+		SVD decomposition(centered);
+
+		// row 0 of R is used as the principal direction (the one with the largest weight)
+		Vec4 axis(decomposition.R()(0,0), decomposition.R()(0,1), decomposition.R()(0,2), 0.0);
+
+		// find the extent of the samples' projections onto that direction
+		double lo = DBL_MAX, hi = -DBL_MAX;
+		for (int i = 0; i < count; i++) 
+		{
+			float proj = centered(i,0) * axis.X() + centered(i,1)*axis.Y() + centered(i,2)*axis.Z();
+			if (proj < lo) lo = proj;
+			if (proj > hi) hi = proj;
+		}
+
+		// the endpoints are the two points on the principal line that bracket every projection
+		endpts[region].A = centroid + lo*axis;
+		endpts[region].B = centroid + hi*axis;
+
+		// clamp, because the real (encoded) endpoints will be clamped too and the shape choice
+		// should be based on what can actually be represented
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+// Exchange entries i and j in two parallel arrays at once (list1 holds the keys, list2 the matching payload).
+static void swap(double *list1, int *list2, int i, int j)
+{
+	const double key_j = list1[j];
+	list1[j] = list1[i];
+	list1[i] = key_j;
+
+	const int payload_j = list2[j];
+	list2[j] = list2[i];
+	list2[i] = payload_j;
+}
+
+// Compress tile t as a mode 3 block: rank every shape with the cheap rough() fit, refine only the most
+// promising ones, and leave the best refined block in `block`. Returns the error of that block.
+double AVPCL::compress_mode3(const Tile &t, char *block)
+{
+	// how many of the roughly-ranked shapes get refined. reasonable values are 1, NSHAPES/4, and NSHAPES.
+	// NSHAPES/4 catches nearly every case; raise it slightly (by 3 or 4) to squeeze out the last bit of quality
+	const int NITEMS=NSHAPES/4;
+
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} rough_fit[NSHAPES];				// rough endpoints, addressed by shape number
+	double rough_err[NSHAPES];			// rough error, partially sorted below
+	int shape_order[NSHAPES];			// shape number that goes with each rough_err slot
+	char candidate[AVPCL::BLOCKSIZE];
+	double best_err = DBL_MAX;
+
+	// rough fit for every shape
+	for (int shape=0; shape<NSHAPES; ++shape)
+	{
+		shape_order[shape] = shape;
+		rough_err[shape] = rough(t, shape, rough_fit[shape].endpts);
+	}
+
+	// partial exchange sort: only the first NITEMS slots need to end up in ascending order
+	for (int slot=0; slot<NITEMS; ++slot)
+	{
+		for (int other=slot+1; other<NSHAPES; ++other)
+		{
+			if (rough_err[slot] > rough_err[other])
+				swap(rough_err, shape_order, slot, other);
+		}
+	}
+
+	// refine the best candidates, stopping early once an exact (zero error) block has been found
+	for (int slot=0; slot<NITEMS && best_err>0; ++slot)
+	{
+		const int shape = shape_order[slot];
+		const double err = refine(t, shape, rough_fit[shape].endpts, candidate);
+		if (err < best_err)
+		{
+			memcpy(block, candidate, sizeof(candidate));
+			best_err = err;
+		}
+	}
+	return best_err;
+}
+
diff --git a/src/nvtt/bc7/avpcl_mode4.cpp b/src/nvtt/bc7/avpcl_mode4.cpp
new file mode 100644
index 0000000..cd6a5e5
--- /dev/null
+++ b/src/nvtt/bc7/avpcl_mode4.cpp
@@ -0,0 +1,1220 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x10000 2r 1i 555x2 6x2 2bi 3bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "arvo/Vec4.h"
+#include "arvo/Matrix.h"
+#include "arvo/SVD.h"
+#include "utils.h"
+#include "endpts.h"
+
+#include <assert.h>
+
+using namespace ArvoMath;
+
+// there are 2 index arrays. INDEXMODE selects between the arrays being 2 & 3 bits or 3 & 2 bits
+// array 0 is always the RGB array and array 1 is always the A array
+// NOTE: every function-like macro below parenthesizes its argument, so an arbitrary expression
+// (e.g. one using & or ?:) can be passed without changing how the comparison is parsed.
+#define	NINDEXARRAYS	2
+#define	INDEXARRAY_RGB	0
+#define INDEXARRAY_A	1
+// which of the two arrays holds the 2-bit (resp. 3-bit) indices for a given index mode
+#define INDEXARRAY_2BITS(indexmode)	(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+#define INDEXARRAY_3BITS(indexmode)	(((indexmode) == INDEXMODE_ALPHA_IS_3BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+
+// constants for 3-bit indices
+#define NINDICES3	8
+#define	INDEXBITS3	3
+#define	HIGH_INDEXBIT3	(1<<(INDEXBITS3-1))
+#define	DENOM3		(NINDICES3-1)
+#define	BIAS3		(DENOM3/2)
+
+// constants for 2-bit indices
+#define NINDICES2	4
+#define	INDEXBITS2	2
+#define	HIGH_INDEXBIT2	(1<<(INDEXBITS2-1))
+#define	DENOM2		(NINDICES2-1)
+#define	BIAS2		(DENOM2/2)
+
+// RGB array: 3-bit constants when alpha uses 2 bits, 2-bit constants otherwise
+#define	NINDICES_RGB(indexmode)		(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES3 : NINDICES2)
+#define	INDEXBITS_RGB(indexmode)	(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS3 : INDEXBITS2)
+#define	HIGH_INDEXBIT_RGB(indexmode)	(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT3 : HIGH_INDEXBIT2)
+#define	DENOM_RGB(indexmode)		(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? DENOM3 : DENOM2)
+#define	BIAS_RGB(indexmode)			(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? BIAS3 : BIAS2)
+
+// A array: the complementary choice
+#define	NINDICES_A(indexmode)		(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES2 : NINDICES3)
+#define	INDEXBITS_A(indexmode)		(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS2 : INDEXBITS3)
+#define	HIGH_INDEXBIT_A(indexmode)	(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT2 : HIGH_INDEXBIT3)
+#define	DENOM_A(indexmode)			(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? DENOM2 : DENOM3)
+#define	BIAS_A(indexmode)			(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? BIAS2 : BIAS3)
+
+// this mode has a single shape
+#define	NSHAPES	1
+
+// one 16-bit mask per shape; the single entry is all zeros, so REGION() below yields 0 for every pixel
+static int shapes[NSHAPES] =
+{
+	0x0000,
+};
+
+// region of pixel (x,y): tests bit (15 - x - 4*y) of the shape mask, giving 0 or 1
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+
+#define NREGIONS	1			// keep the region stuff in just in case...
+
+// encoded index compression location: region 0 is always at 0,0.
+
+#define	NBITSIZES	2			// one endpoint pair
+
+// bit widths of the stored endpoint values of one channel: [0] for endpoint A, [1] for endpoint B
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+// describes how a block of this mode is laid out
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int transform_mode;		// x0 means alpha channel not transformed, x1 otherwise. 0x rgb not transformed, 1x otherwise.
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;	// verilog description of encoding for this mode (points at a string literal, hence const)
+};
+
+// bit flags tested against Pattern::transform_mode
+#define	TRANSFORM_MODE_ALPHA	1
+#define	TRANSFORM_MODE_RGB	2
+
+#define	NPATTERNS 1
+
+// the one pattern of this mode. the initializer is positional: two bit sizes (endpoint A, endpoint B)
+// per channel, followed by transform_mode, mode, modebits and encoding
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		alpha	xfm	mode  mb encoding
+	5,5,		5,5,		5,5,		6,6,	0x0, 0x10, 5, "",
+};
+
+// per-channel precision of the two endpoints of one region
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];
+	int endpt_b_prec[NCHANNELS_RGBA];
+};
+
+// endpoint precisions for every region of one pattern
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO assert to check this!
+// (positional initializer: the endpt_a_prec values for each channel first, then the endpt_b_prec values)
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	5,5,5,6,	5,5,5,6,
+};
+
+
+// Number of bits required to store n.
+//   n == 0 : 0 bits, signed or not
+//   n  > 0 : the magnitude bits, plus one sign bit when issigned is true
+//   n  < 0 : only legal when issigned is true (asserted); the count includes the sign bit
+static int nbits(int n, bool issigned)
+{
+	if (n == 0)
+		return 0;
+
+	if (n < 0)
+	{
+		assert (issigned);
+		int count = 1;				// the sign bit
+		while (n < -1)
+		{
+			n >>= 1;
+			++count;
+		}
+		return count;
+	}
+
+	// positive: count shifts until nothing is left, starting at 1 if a sign bit is needed
+	int count = issigned ? 1 : 0;
+	while (n != 0)
+	{
+		n >>= 1;
+		++count;
+	}
+	return count;
+}
+
+// shorthand for channel i of region 0's endpoints A and B (expects `ep` and `i` to be in scope)
+#define	R_0	ep[0].A[i]
+#define	R_1 ep[0].B[i]
+
+// Forward transform: in every channel selected by transform_mode, replace endpoint B of region 0
+// by its difference from endpoint A (B -= A). Endpoint A is left untouched.
+static void transform_forward(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	if (transform_mode & TRANSFORM_MODE_RGB)
+	{
+		// all channels from CHANNEL_R up to, but not including, CHANNEL_A
+		for (int ch=CHANNEL_R; ch<CHANNEL_A; ++ch)
+			ep[0].B[ch] -= ep[0].A[ch];
+	}
+
+	if (transform_mode & TRANSFORM_MODE_ALPHA)
+		ep[0].B[CHANNEL_A] -= ep[0].A[CHANNEL_A];
+}
+
+// Inverse transform: in every channel selected by transform_mode, turn endpoint B of region 0 from a
+// difference back into an absolute value (B += A). Endpoint A is left untouched.
+static void transform_inverse(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	if (transform_mode & TRANSFORM_MODE_RGB)
+	{
+		// all channels from CHANNEL_R up to, but not including, CHANNEL_A
+		for (int ch=CHANNEL_R; ch<CHANNEL_A; ++ch)
+			ep[0].B[ch] += ep[0].A[ch];
+	}
+
+	if (transform_mode & TRANSFORM_MODE_ALPHA)
+		ep[0].B[CHANNEL_A] += ep[0].A[CHANNEL_A];
+}
+
+// Quantize the floating point endpoints of every region to the per-channel precisions in pattern_prec.
+// Components X, Y, Z, W of each endpoint go to channels 0, 1, 2, 3 of q_endpts.
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA q_endpts[NREGIONS])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const FltEndpts &src = endpts[region];
+		const RegionPrec &prec = pattern_prec.region_precs[region];
+		IntEndptsRGBA &dst = q_endpts[region];
+
+		// endpoint A
+		dst.A[0] = Utils::quantize(src.A.X(), prec.endpt_a_prec[0]);
+		dst.A[1] = Utils::quantize(src.A.Y(), prec.endpt_a_prec[1]);
+		dst.A[2] = Utils::quantize(src.A.Z(), prec.endpt_a_prec[2]);
+		dst.A[3] = Utils::quantize(src.A.W(), prec.endpt_a_prec[3]);
+
+		// endpoint B
+		dst.B[0] = Utils::quantize(src.B.X(), prec.endpt_b_prec[0]);
+		dst.B[1] = Utils::quantize(src.B.Y(), prec.endpt_b_prec[1]);
+		dst.B[2] = Utils::quantize(src.B.Z(), prec.endpt_b_prec[2]);
+		dst.B[3] = Utils::quantize(src.B.W(), prec.endpt_b_prec[3]);
+	}
+}
+
+// Make sure the index stored at each region's anchor position has a 0 high-order bit, in both index
+// arrays. When it does not, the region's endpoints A and B are exchanged for the affected channels and
+// every index of the region is mirrored (index -> NINDICES-1-index), which selects the same palette entries.
+// An anchor position p maps to x = p & 3, y = (p >> 2) & 3 (0 is x=0 y=0, 15 is x=3 y=3).
+static void swap_indices(int shapeindex, int indexmode, IntEndptsRGBA endpts[NREGIONS], int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	int index_positions[NREGIONS];
+
+	index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const int anchor_x = index_positions[region] & 3;
+		const int anchor_y = (index_positions[region] >> 2) & 3;
+		assert(REGION(anchor_x,anchor_y,shapeindex) == region);		// double check the table
+
+		// RGB index array together with the R, G and B endpoint channels
+		if (indices[INDEXARRAY_RGB][anchor_y][anchor_x] & HIGH_INDEXBIT_RGB(indexmode))
+		{
+			for (int ch=CHANNEL_R; ch<=CHANNEL_B; ++ch)
+			{
+				const int saved = endpts[region].A[ch];
+				endpts[region].A[ch] = endpts[region].B[ch];
+				endpts[region].B[ch] = saved;
+			}
+
+			for (int row = 0; row < Tile::TILE_H; row++)
+			for (int col = 0; col < Tile::TILE_W; col++)
+			{
+				if (REGION(col,row,shapeindex) == region)
+					indices[INDEXARRAY_RGB][row][col] = NINDICES_RGB(indexmode) - 1 - indices[INDEXARRAY_RGB][row][col];
+			}
+		}
+
+		// A index array together with the alpha endpoint channel
+		if (indices[INDEXARRAY_A][anchor_y][anchor_x] & HIGH_INDEXBIT_A(indexmode))
+		{
+			const int saved = endpts[region].A[CHANNEL_A];
+			endpts[region].A[CHANNEL_A] = endpts[region].B[CHANNEL_A];
+			endpts[region].B[CHANNEL_A] = saved;
+
+			for (int row = 0; row < Tile::TILE_H; row++)
+			for (int col = 0; col < Tile::TILE_W; col++)
+			{
+				if (REGION(col,row,shapeindex) == region)
+					indices[INDEXARRAY_A][row][col] = NINDICES_A(indexmode) - 1 - indices[INDEXARRAY_A][row][col];
+			}
+		}
+	}
+}
+
+// Reports whether the endpoints fit pattern p. In this mode the answer is always true:
+// neither argument is inspected.
+static bool endpts_fit(IntEndptsRGBA endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+// Write the block header to `out`: the mode bits, the rotate mode, the index mode, and then, per region
+// and per channel, endpoint A followed by endpoint B at the widths given by pattern p.
+// shapeindex is accepted but not written. The finished header must be exactly 50 bits long (asserted).
+static void write_header(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, int rotatemode, int indexmode, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(rotatemode, ROTATEMODE_BITS);
+	out.write(indexmode, INDEXMODE_BITS);
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		const IntEndptsRGBA &pair = endpts[region];
+
+		for (int ch=0; ch<NCHANNELS_RGBA; ++ch)
+		{
+			const ChanBits &widths = p.chan[ch];
+			out.write(pair.A[ch], widths.nbitsizes[0]);
+			out.write(pair.B[ch], widths.nbitsizes[1]);
+		}
+	}
+
+	assert (out.getptr() == 50);
+}
+
+// Read the block header from `in` (the counterpart of write_header).
+//   endpts     - receives the raw endpoint fields, A then B for each channel of each region
+//   shapeindex - always set to 0; this mode stores no shape
+//   rotatemode - receives the rotate mode field
+//   indexmode  - receives the index mode field
+//   p          - receives a copy of the pattern that was used
+//   pat_index  - always set to 0; this mode has a single pattern
+// After the header has been consumed the read position must be exactly bit 50 (asserted).
+static void read_header(Bits &in, IntEndptsRGBA endpts[NREGIONS], int &shapeindex, int &rotatemode, int &indexmode, Pattern &p, int &pat_index)
+{
+	// the returned mode value is not needed here; the call is kept because the assert below expects
+	// the read position to sit just past the mode bits afterwards (presumably getmode consumes them)
+	(void) AVPCL::getmode(in);
+
+	pat_index = 0;
+
+	assert (pat_index >= 0 && pat_index < NPATTERNS);
+	assert (in.getptr() == patterns[pat_index].modebits);
+
+	p = patterns[pat_index];
+
+	shapeindex = 0;		// we don't have any
+
+	rotatemode = in.read(ROTATEMODE_BITS);
+	indexmode = in.read(INDEXMODE_BITS);
+	for (int i=0; i<NREGIONS; ++i)
+		for (int j=0; j<NCHANNELS_RGBA; ++j)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[0]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[1]);
+		}
+	assert (in.getptr() == 50);
+}
+
+// Write both index arrays to `out`: all 16 two-bit indices first, then all 16 three-bit indices, each in
+// raster order. In both arrays the index at position 0 is stored with one bit less, because its
+// high-order bit must be 0 (asserted). shapeindex is accepted but not used.
+static void write_indices(const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int shapeindex, int indexmode, Bits &out)
+{
+	const int array2 = INDEXARRAY_2BITS(indexmode);
+	const int array3 = INDEXARRAY_3BITS(indexmode);
+
+	// 2 bit indices: position 0 is written as i..[0], all others as i..[1:0]
+	assert ((indices[array2][0][0] & HIGH_INDEXBIT2) == 0);
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		const int width = (pos == 0) ? INDEXBITS2 - 1 : INDEXBITS2;
+		out.write(indices[array2][pos>>2][pos&3], width);
+	}
+
+	// 3 bit indices: position 0 is written as i..[1:0], all others as i..[2:0]
+	assert ((indices[array3][0][0] & HIGH_INDEXBIT3) == 0);
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		const int width = (pos == 0) ? INDEXBITS3 - 1 : INDEXBITS3;
+		out.write(indices[array3][pos>>2][pos&3], width);
+	}
+}
+
+// Read both index arrays from `in` (the counterpart of write_indices): all 16 two-bit indices first, then
+// all 16 three-bit indices, each in raster order. In both arrays the index at position 0 was stored with
+// one bit less. shapeindex is accepted but not used.
+static void read_indices(Bits &in, int shapeindex, int indexmode, int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	const int array2 = INDEXARRAY_2BITS(indexmode);
+	const int array3 = INDEXARRAY_3BITS(indexmode);
+
+	// 2 bit indices: position 0 is read as i..[0], all others as i..[1:0]
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		const int width = (pos == 0) ? INDEXBITS2 - 1 : INDEXBITS2;
+		indices[array2][pos>>2][pos&3] = in.read(width);
+	}
+
+	// 3 bit indices: position 0 is read as i..[1:0], all others as i..[2:0]
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		const int width = (pos == 0) ? INDEXBITS3 - 1 : INDEXBITS3;
+		indices[array3][pos>>2][pos&3] = in.read(width);
+	}
+}
+
+// Serialize one compressed block into `block`: the header first, then both index arrays.
+// The two parts together must fill exactly AVPCL::BITSIZE bits (asserted).
+static void emit_block(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int rotatemode, int indexmode, char *block)
+{
+	Bits writer(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, rotatemode, indexmode, writer);
+	write_indices(indices, shapeindex, indexmode, writer);
+
+	assert(writer.getptr() == AVPCL::BITSIZE);
+}
+
+// Build the RGB palette and the A palette for one region from its quantized endpoints.
+// Each channel's endpoints are unquantized with that channel's precision and then interpolated with
+// PALETTE_LERP. The number of entries, the bias and the denominator come from indexmode, independently
+// for the RGB palette and for the A palette. Both output arrays have room for NINDICES3 entries; only
+// the first NINDICES_RGB(indexmode) / NINDICES_A(indexmode) of them are written.
+static void generate_palette_quantized_rgb_a(const IntEndptsRGBA &endpts, const RegionPrec &region_prec, int indexmode, Vec3 palette_rgb[NINDICES3], float palette_a[NINDICES3])
+{
+	const int count_rgb = NINDICES_RGB(indexmode);
+	const int bias_rgb = BIAS_RGB(indexmode);
+	const int denom_rgb = DENOM_RGB(indexmode);
+
+	int lo, hi;
+
+	// R goes to the X component
+	lo = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]); 
+	hi = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]);
+	for (int k = 0; k < count_rgb; ++k)
+		palette_rgb[k].X() = PALETTE_LERP(lo, hi, k, bias_rgb, denom_rgb);
+
+	// G goes to the Y component
+	lo = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]); 
+	hi = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]);
+	for (int k = 0; k < count_rgb; ++k)
+		palette_rgb[k].Y() = PALETTE_LERP(lo, hi, k, bias_rgb, denom_rgb);
+
+	// B goes to the Z component
+	lo = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]); 
+	hi = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]);
+	for (int k = 0; k < count_rgb; ++k)
+		palette_rgb[k].Z() = PALETTE_LERP(lo, hi, k, bias_rgb, denom_rgb);
+
+	// A has its own palette with its own entry count, bias and denominator
+	const int count_a = NINDICES_A(indexmode);
+	const int bias_a = BIAS_A(indexmode);
+	const int denom_a = DENOM_A(indexmode);
+
+	lo = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]); 
+	hi = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]);
+	for (int k = 0; k < count_a; ++k)
+		palette_a[k] = PALETTE_LERP(lo, hi, k, bias_a, denom_a);
+}
+
+// Sign-extend the endpoint fields that hold signed deltas when pattern p is transformed.
+// Only region 0's endpoint B is a delta (endpoint A is always positive), so it is the only field touched,
+// using the B width nbitsizes[1] of each channel. Nothing is done when p.transform_mode is 0.
+//
+// The previous version also touched endpts[1] and nbitsizes[2]/[3]; with NREGIONS == 1 and
+// NBITSIZES == 2 those accesses were out of bounds, so they have been removed.
+//
+// NOTE(review): every channel is extended as soon as any transform bit is set, as before; if only one of
+// TRANSFORM_MODE_RGB / TRANSFORM_MODE_ALPHA can be set this may need to become per-channel -- confirm.
+static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS])
+{
+	if (!p.transform_mode)
+		return;
+
+	for (int i=0; i<NCHANNELS_RGBA; ++i)
+	{
+		// endpts[0].A[i] needs no sign extension: always positive here
+		endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[1]);
+	}
+}
+
+// Copy tile `in` to `out`, exchanging the W component of every pixel with the component selected by
+// rotatemode: none (RGBA_RGBA), X (RGBA_AGBR), Y (RGBA_RABG) or Z (RGBA_RGAB).
+// Any other rotatemode trips an assert. The size of `out` is set to the size of `in`.
+static void rotate_tile(const Tile &in, int rotatemode, Tile &out)
+{
+	out.size_x = in.size_x;
+	out.size_y = in.size_y;
+
+	for (int y=0; y<in.size_y; ++y)
+	for (int x=0; x<in.size_x; ++x)
+	{
+		Vec4 &px = out.data[y][x];
+		px = in.data[y][x];
+
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA:
+			// identity, nothing to exchange
+			break;
+		case ROTATEMODE_RGBA_AGBR:
+			{
+				const float keep = px.X();
+				px.X() = px.W();
+				px.W() = keep;
+			}
+			break;
+		case ROTATEMODE_RGBA_RABG:
+			{
+				const float keep = px.Y();
+				px.Y() = px.W();
+				px.W() = keep;
+			}
+			break;
+		case ROTATEMODE_RGBA_RGAB:
+			{
+				const float keep = px.Z();
+				px.Z() = px.W();
+				px.W() = keep;
+			}
+			break;
+		default:
+			assert(0);
+		}
+	}
+}
+
+// Decode one mode 4 block into tile t.
+// Steps: read the header, undo sign extension / endpoint transform as the pattern requires, build the
+// RGB and A palettes, read both index arrays, look every pixel up, and finally apply the channel
+// rotation stored in the block.
+void AVPCL::decompress_mode4(const char *block, Tile &t)
+{
+	Bits bits(block, AVPCL::BITSIZE);
+
+	Pattern pattern;
+	IntEndptsRGBA endpts[NREGIONS];
+	int shapeindex, pat_index, rotatemode, indexmode;
+
+	read_header(bits, endpts, shapeindex, rotatemode, indexmode, pattern, pat_index);
+	
+	sign_extend(pattern, endpts);
+
+	if (pattern.transform_mode)
+		transform_inverse(pattern.transform_mode, endpts);
+
+	// both palettes are sized for 3-bit indices; the 2-bit one only uses its first NINDICES2 entries
+	Vec3 palette_rgb[NREGIONS][NINDICES3];
+	float palette_a[NREGIONS][NINDICES3];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const RegionPrec &prec = pattern_precs[pat_index].region_precs[region];
+		generate_palette_quantized_rgb_a(endpts[region], prec, indexmode, palette_rgb[region], palette_a[region]);
+	}
+
+	int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(bits, shapeindex, indexmode, indices);
+
+	// the whole block must have been consumed
+	assert(bits.getptr() == AVPCL::BITSIZE);
+
+	Tile unrotated(t.size_x, t.size_y);
+
+	// palette lookup: RGB and A come from separate palettes through separate indices
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+	{
+		const int region = REGION(x,y,shapeindex);
+		const int rgb_index = indices[INDEXARRAY_RGB][y][x];
+		const int a_index = indices[INDEXARRAY_A][y][x];
+
+		unrotated.data[y][x] = Vec4(palette_rgb[region][rgb_index], palette_a[region][a_index]);
+	}
+
+	rotate_tile(unrotated, rotatemode, t);
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+// we already have a candidate mapping when we call this function, thus an error. take an early exit if the accumulated error so far
+// exceeds what we already have
+//   colors, np       - the pixels to map and how many there are
+//   rotatemode       - channel rotation in effect; decides which component carries the real alpha
+//   indexmode        - selects the number of RGB and A palette entries
+//   endpts           - quantized endpoints of the region
+//   region_prec      - endpoint precisions used to unquantize them
+//   current_besterr  - error to beat; once the running total exceeds it the search is abandoned
+//   indices          - receives the chosen RGB and A index of every pixel
+// returns the total error, or DBL_MAX after an early exit (in which case indices[..][i..np-1] are set to -1)
+static double map_colors(const Vec4 colors[], int np, int rotatemode, int indexmode, const IntEndptsRGBA &endpts, const RegionPrec &region_prec, double current_besterr, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	Vec3 palette_rgb[NINDICES3];	// could be nindices2
+	float palette_a[NINDICES3];	// could be nindices2
+	double toterr = 0;
+
+	generate_palette_quantized_rgb_a(endpts, region_prec, indexmode, &palette_rgb[0], &palette_a[0]);
+
+	Vec3 rgb;
+	float a;
+
+	for (int i = 0; i < np; ++i)
+	{
+		double err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		// with premultiplied alpha we need the pixel's real alpha, which the rotation may have moved into X, Y or Z
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (colors[i]).X() :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (colors[i]).Y() :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (colors[i]).Z() : (colors[i]).W();
+
+		rgb.X() = (colors[i]).X();
+		rgb.Y() = (colors[i]).Y();
+		rgb.Z() = (colors[i]).Z();
+		a = (colors[i]).W();
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = DBL_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = Utils::metric1(a, palette_a[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					palette_alpha = palette_a[j];
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = DBL_MAX;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[j], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			toterr += besterr;
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return DBL_MAX;
+			}
+		}
+		else
+		{
+			// do RGB index first: with a rotation in effect one of the RGB components carries the alpha
+			besterr = DBL_MAX;
+			// start at entry 0 so that palette_alpha below never reads an uninitialized index,
+			// even if no palette entry compares below DBL_MAX (e.g. the metric returns NaN)
+			int bestindex = 0;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					bestindex = j;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			// alpha of the chosen palette entry, taken from the component the rotation put it in
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).X() :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).Y() :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).Z() : (assert(0),0);
+			toterr += besterr;
+
+			// do A index
+			besterr = DBL_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[j], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[j], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return DBL_MAX;
+			}
+		}
+	}
+	return toterr;
+}
+
+// Assign every pixel of the tile the RGB and A palette indices with the smallest error, given a shape and
+// quantized endpoints. The winners go to indices[INDEXARRAY_RGB] / indices[INDEXARRAY_A], and toterr[region]
+// receives the sum of both per-pixel errors over the pixels of that region.
+static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int indexmode, IntEndptsRGBA endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], double toterr[NREGIONS])
+{
+	Vec3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized_rgb_a(endpts[region], pattern_prec.region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vec3 rgb;
+	float a;
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		rgb.X() = (tile.data[y][x]).X();
+		rgb.Y() = (tile.data[y][x]).Y();
+		rgb.Z() = (tile.data[y][x]).Z();
+		a = (tile.data[y][x]).W();
+		// with premultiplied alpha the metrics need the pixel's alpha, which the rotation may have moved to X, Y or Z
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (tile.data[y][x]).X() :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (tile.data[y][x]).Y() :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (tile.data[y][x]).Z() : (tile.data[y][x]).W();
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = DBL_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = Utils::metric1(a, palette_a[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+					palette_alpha = palette_a[region][i];
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = DBL_MAX;
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[region][i], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;
+		}
+		else
+		{
+			// do RGB index first as it has the alpha
+			besterr = DBL_MAX;
+			int bestindex = 0;	// stays a valid palette slot even if no candidate ever compares below besterr (e.g. NaN error)
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+					bestindex = i;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).X() :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).Y() :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).Z() : (assert(0),0);
+			toterr[region] += besterr;
+
+			// do A index
+			besterr = DBL_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[region][i], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[region][i], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+		}
+	}
+}
+
+// Step-halving search on channel ch of endpoint A (do_b == 0) or endpoint B (do_b != 0), starting from old_endpts.
+// Returns old_err or a smaller error; indices are valid only if the result is < old_err, otherwise they contain -1's.
+static double perturb_one(const Vec4 colors[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts, 
+						  double old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	// old_endpts:  the endpoints we start from (never modified)
+	// new_endpts:  output; old_endpts with the searched component moved to the best position found
+	// temp_endpts: scratch copy holding the candidate that is currently being evaluated
+
+	IntEndptsRGBA temp_endpts;
+	double min_err = old_err;		// start with the best current error; double, so old_err is never rounded below itself
+	int beststep = 0;
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint: try +/-step, halving step each round
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+			double err = map_colors(colors, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					indices[j][i] = temp_indices[j][i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// Exhaustively search a window around both endpoints of channel ch for a lower error.
+// The larger the error the more time it is worth spending, so the window half-width grows with orig_err:
+// at least -3 to 3, and 6.25%, 12.5%, 25% or 50% of the endpoint range once orig_err exceeds 40, 200, 1000 or 5000.
+// Those thresholds are for np = 16; they are scaled by np / Tile::TILE_TOTAL.
+// Endpoint ordering is always preserved (no need to overlap the scan).
+// On improvement opt_endpts is updated and indices holds the matching indices; otherwise opt_endpts is
+// left untouched and indices contains -1's.
+// Returns the best error found, which is orig_err when nothing better exists.
+static double exhaustive(const Vec4 colors[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, double orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA temp_endpts;
+	double best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	double thr_scale = (double)np / (double)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = MAX(adelta, 3);
+	bdelta = MAX(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B; all four bounds are inclusive
+	int alow = MAX(0, opt_endpts.A[ch] - adelta);
+	int ahigh = MIN((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = MAX(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = MIN((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];
+
+	if (a_le_b)
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = MAX(a, blow); b <= bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b <= bhigh; ++b)
+		for (int a = MAX(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		// publish the indices that belong to the winning endpoint pair
+		for (int j=0; j<NINDEXARRAYS; ++j)
+		for (int i=0; i<np; ++i)
+			indices[j][i] = good_indices[j][i];
+	}
+
+	return best_err;
+}
+
+// Optimize the endpoints of one region. Per channel, endpoints A and B are perturbed in alternation (perturb_one)
+// until the error stops improving; if that alternation changed the indices, the channel scan starts over. A small
+// exhaustive search per channel finishes. opt_endpts receives the result; the return value is its error.
+static double optimize_one(const Vec4 colors[], int np, int rotatemode, int indexmode, double orig_err, const IntEndptsRGBA &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA &opt_endpts)
+{
+	double opt_err = orig_err;
+	opt_endpts = orig_endpts;
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA new_a, new_b;
+	IntEndptsRGBA new_endpt;
+	int do_b;
+	int orig_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int new_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices0[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices1[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		double err0 = perturb_one(colors, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+		double err1 = perturb_one(colors, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices0[j][i];
+				assert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices1[j][i];
+				assert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+			double err = perturb_one(colors, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = temp_indices0[j][i];
+				assert (new_indices[j][i] != -1);	// an improvement must come with valid indices
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[INDEXARRAY_RGB][i] != new_indices[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != new_indices[INDEXARRAY_A][i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		double new_err = exhaustive(colors, np, rotatemode, indexmode, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[j][i] = temp_indices0[j][i];
+					assert (orig_indices[j][i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[INDEXARRAY_RGB][i] != temp_indices0[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != temp_indices0[INDEXARRAY_A][i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+// Optimize the endpoints of every region independently. For each region opt_endpts/opt_err receive the
+// optimized endpoints and their error, or the original ones when optimize_one finds nothing strictly better.
+static void optimize_endpts(const Tile &tile, int shapeindex, int rotatemode, int indexmode, const double orig_err[NREGIONS], 
+							const IntEndptsRGBA orig_endpts[NREGIONS], const PatternPrec &pattern_prec, double opt_err[NREGIONS], IntEndptsRGBA opt_endpts[NREGIONS])
+{
+	Vec4 region_pixels[Tile::TILE_TOTAL];
+	IntEndptsRGBA start_endpts, candidate;
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// gather the pixels that the shape assigns to this region
+		int count = 0;
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+		{
+			if (REGION(x,y,shapeindex) != region)
+				continue;
+			region_pixels[count] = tile.data[y][x];
+			++count;
+		}
+
+		// default result: the unoptimized endpoints, whose error orig_err[region] is still valid
+		const double start_err = orig_err[region];
+		start_endpts = orig_endpts[region];
+		opt_endpts[region] = start_endpts;
+		opt_err[region] = start_err;
+
+		// now try to optimize these endpoints
+		const double candidate_err = optimize_one(region_pixels, count, rotatemode, indexmode, start_err, start_endpts, pattern_prec.region_precs[region], candidate);
+		// adopt the candidate only if it is a strict improvement
+		if (candidate_err < start_err)
+		{
+			opt_endpts[region] = candidate;
+			opt_err[region] = candidate_err;
+		}
+	}
+}
+
+/* refine(): quantize the float endpoints, optionally optimize them, and emit the block. Optimization algorithm:
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+// Returns the total error of the block written to `block`; throws a string literal if no pattern fits.
+static double refine(const Tile &tile, int shapeindex_best, int rotatemode, int indexmode, const FltEndpts endpts[NREGIONS], char *block)
+{
+	double orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], opt_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+	int temp_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];	// NOTE(review): not referenced anywhere in this function
+	// the first pattern whose (transformed) endpoints fit decides the result: both branches below return
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		// indices and per-region error of the unoptimized endpoints; swap_indices may exchange endpoints A and B
+		assign_indices(tile, shapeindex_best, rotatemode, indexmode, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(shapeindex_best, indexmode, orig_endpts, orig_indices);
+
+		if (patterns[sp].transform_mode)
+			transform_forward(patterns[sp].transform_mode, orig_endpts);
+
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transform_mode)
+				transform_inverse(patterns[sp].transform_mode, orig_endpts);
+			// orig_endpts are untransformed again here, which is the form optimize_endpts is given
+			optimize_endpts(tile, shapeindex_best, rotatemode, indexmode, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			// recompute indices and error for the optimized endpoints; must agree with what the optimizer reported
+			assign_indices(tile, shapeindex_best, rotatemode, indexmode, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			for (int i=0; i<NREGIONS; ++i)
+				assert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(shapeindex_best, indexmode, opt_endpts, opt_indices);
+
+			if (patterns[sp].transform_mode)
+				transform_forward(patterns[sp].transform_mode, opt_endpts);
+			// compare the summed per-region errors of both candidates
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, rotatemode, indexmode, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transform_mode)
+					transform_forward(patterns[sp].transform_mode, orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, rotatemode, indexmode, block);
+				return orig_toterr;
+			}
+		}
+	}
+	throw "No candidate found, should never happen (avpcl mode 4).";
+}
+
+// Clamp the four components of v, in place, to the range [RGBA_MIN, RGBA_MAX].
+static void clamp(Vec4 &v)
+{
+	// per component: raise to RGBA_MIN first, then lower to RGBA_MAX
+#define	CLAMP_COMPONENT(c)	do { if ((c) < RGBA_MIN) (c) = RGBA_MIN; if ((c) > RGBA_MAX) (c) = RGBA_MAX; } while (0)
+	CLAMP_COMPONENT(v.X());
+	CLAMP_COMPONENT(v.Y());
+	CLAMP_COMPONENT(v.Z());
+	CLAMP_COMPONENT(v.W());
+#undef	CLAMP_COMPONENT
+}
+
+// Compute initial ("rough") floating-point endpoints for each region of the tile. The X/Y/Z part is fitted
+// along the principal direction of the region's colors (via SVD); the W part just spans the min/max of W.
+// Note these channels may have been rotated.
+static void rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int count = 0;
+		Vec4 samples[Tile::TILE_TOTAL];
+		Vec4 centroid(0,0,0,0);		// holds the running sum first, the mean after the division below
+
+		// collect the pixels that belong to this region
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+		{
+			if (REGION(x,y,shapeindex) != region)
+				continue;
+			samples[count] = tile.data[y][x];
+			centroid += tile.data[y][x];
+			++count;
+		}
+
+		// regions with fewer than three pixels need no fitting
+		if (count == 0)
+		{
+			Vec4 zero(0,0,0,RGBA_MAX);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		if (count <= 2)
+		{
+			// one pixel: both endpoints are that pixel; two pixels: one endpoint each
+			endpts[region].A = samples[0];
+			endpts[region].B = samples[count - 1];
+			continue;
+		}
+
+		Matrix centered(count, 3);
+		float alpha_dev[Tile::TILE_TOTAL];
+		centroid /= float(count);
+
+		// subtract the mean: X/Y/Z go into the matrix handed to the SVD, W is kept aside
+		for (int i = 0; i < count; ++i)
+		{
+			const Vec4 &s = samples[i];
+			centered(i,0) = s.X() - centroid.X();
+			centered(i,1) = s.Y() - centroid.Y();
+			centered(i,2) = s.Z() - centroid.Z();
+			alpha_dev[i] = s.W() - centroid.W();
+		}
+
+		// perform a singular value decomposition
+		SVD svd(centered);
+
+		// take row 0 of R as the principal component direction;
+		// W is handled separately below, so its component of the direction is forced to zero
+		Vec4 axis(svd.R()(0,0), svd.R()(0,1), svd.R()(0,2), 0.0);
+
+		// project each pixel value along the principal direction, and track the range of W as well
+		double lo_proj = DBL_MAX, hi_proj = -DBL_MAX;
+		double lo_alpha = DBL_MAX, hi_alpha = -DBL_MAX;
+		for (int i = 0; i < count; i++) 
+		{
+			float proj = centered(i,0) * axis.X() + centered(i,1)*axis.Y() + centered(i,2)*axis.Z();
+			if (proj < lo_proj) lo_proj = proj;
+			if (proj > hi_proj) hi_proj = proj;
+
+			float dev = alpha_dev[i];
+			if (dev < lo_alpha) lo_alpha = dev;
+			if (dev > hi_alpha) hi_alpha = dev;
+		}
+
+		// endpoints are the two points on the principal direction that span all projections
+		FltEndpts &result = endpts[region];
+		result.A = centroid + lo_proj*axis; 
+		result.B = centroid + hi_proj*axis;
+		result.A.W() = centroid.W() + lo_alpha;
+		result.B.W() = centroid.W() + hi_alpha;
+
+		// clamp endpoints
+		// WORK: is [0,255] the right range, or should it be [0,255.5) or even [0,256) ?
+		clamp(result.A);
+		clamp(result.B);
+	}
+}
+
+// Compress tile t as a mode 4 block. Every rotation is combined with every index mode; the candidate with
+// the smallest error is copied into block and that error is returned.
+double AVPCL::compress_mode4(const Tile &t, char *block)
+{
+	FltEndpts endpts[NREGIONS];
+	char candidate[AVPCL::BLOCKSIZE];
+	double best = DBL_MAX;
+	const int shape = 0;
+	Tile rotated;
+	// both loops stop as soon as an error of zero has been reached
+	for (int rot = 0; rot < NROTATEMODES && best > 0; ++rot)
+	{
+		rotate_tile(t, rot, rotated);
+		rough(rotated, shape, endpts);
+		for (int im = 0; im < NINDEXMODES && best > 0; ++im)
+		{
+			const double err = refine(rotated, shape, rot, im, endpts, candidate);
+			if (!(err < best))
+				continue;
+			best = err;
+			memcpy(block, candidate, sizeof(candidate));
+		}
+	}
+	return best;
+}
diff --git a/src/nvtt/bc7/avpcl_mode5.cpp b/src/nvtt/bc7/avpcl_mode5.cpp
new file mode 100644
index 0000000..d7f04da
--- /dev/null
+++ b/src/nvtt/bc7/avpcl_mode5.cpp
@@ -0,0 +1,1222 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x100000 2r 777x2 8x2 2bi 2bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "arvo/Vec4.h"
+#include "arvo/Matrix.h"
+#include "arvo/SVD.h"
+#include "utils.h"
+#include "endpts.h"
+
+#include <assert.h>
+
+using namespace ArvoMath;
+
+// there are 2 index arrays: array 0 is always the RGB array and array 1 is always the A array.
+// NOTE: in this file the "2" and "3" constant sets below are identical (4 indices, 2 bits each); indexmode only picks which array each macro names.
+#define	NINDEXARRAYS	2
+#define	INDEXARRAY_RGB	0
+#define INDEXARRAY_A	1
+#define INDEXARRAY_2BITS(indexmode)	(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+#define INDEXARRAY_3BITS(indexmode)	(((indexmode) == INDEXMODE_ALPHA_IS_3BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+
+#define NINDICES3	4
+#define	INDEXBITS3	2
+#define	HIGH_INDEXBIT3	(1<<(INDEXBITS3-1))
+#define	DENOM3		(NINDICES3-1)
+#define	BIAS3		(DENOM3/2)
+
+#define NINDICES2	4
+#define	INDEXBITS2	2
+#define	HIGH_INDEXBIT2	(1<<(INDEXBITS2-1))
+#define	DENOM2		(NINDICES2-1)
+#define	BIAS2		(DENOM2/2)
+
+#define	NINDICES_RGB(indexmode)		(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES3 : NINDICES2)
+#define	INDEXBITS_RGB(indexmode)	(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS3 : INDEXBITS2)
+#define	HIGH_INDEXBIT_RGB(indexmode)(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT3 : HIGH_INDEXBIT2)
+#define	DENOM_RGB(indexmode)		(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? DENOM3 : DENOM2)
+#define	BIAS_RGB(indexmode)			(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? BIAS3 : BIAS2)
+
+#define	NINDICES_A(indexmode)		(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES2 : NINDICES3)
+#define	INDEXBITS_A(indexmode)		(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS2 : INDEXBITS3)
+#define	HIGH_INDEXBIT_A(indexmode)	(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT2 : HIGH_INDEXBIT3)
+#define	DENOM_A(indexmode)			(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? DENOM2 : DENOM3)
+#define	BIAS_A(indexmode)			(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? BIAS2 : BIAS3)
+
+#define	NSHAPES	1
+// one 16-bit mask per shape, one bit per pixel of the 4x4 tile; the bit gives the pixel's region
+static int shapes[NSHAPES] =
+{
+	0x0000,
+};
+// region (0 or 1) of pixel (x,y): bit 15 of the mask is pixel (0,0), bit 0 is pixel (3,3)
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+
+#define NREGIONS	1			// keep the region stuff in just in case...
+
+// encoded index compression location: region 0 is always at 0,0.
+
+#define	NBITSIZES	2			// one endpoint pair
+
+struct ChanBits	// endpoint bit widths of a single channel
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel: [0] is used for endpoint A, [1] for endpoint B
+};
+
+struct Pattern	// describes how one block layout stores its mode and endpoints
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int transform_mode;		// x0 means alpha channel not transformed, x1 otherwise. 0x rgb not transformed, 1x otherwise.
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;	// verilog description of encoding for this mode; const because it is initialized from a string literal
+};
+
+#define	TRANSFORM_MODE_ALPHA	1	// Pattern::transform_mode bit: alpha endpoint B is stored relative to endpoint A
+#define	TRANSFORM_MODE_RGB	2	// Pattern::transform_mode bit: RGB endpoint B is stored relative to endpoint A
+
+#define	NPATTERNS 1
+// the single pattern of this mode: 7-bit R, G and B endpoints, 8-bit alpha endpoints, no transform, 6 mode bits
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		alpha	xfm	mode  mb encoding
+	7,7,		7,7,		7,7,		8,8,	0x0, 0x20, 6, "",
+};
+
+struct RegionPrec	// per-channel endpoint bit precision for one region
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];	// bits of precision of endpoint A, indexed by channel
+	int endpt_b_prec[NCHANNELS_RGBA];	// bits of precision of endpoint B, indexed by channel
+};
+
+struct PatternPrec	// endpoint precisions of one pattern
+{
+	RegionPrec region_precs[NREGIONS];	// one entry per region
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO assert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	7,7,7,8,	7,7,7,8,	// region 0: endpt_a_prec for R,G,B,A, then endpt_b_prec for R,G,B,A
+};
+
+
+// return # of bits needed to store n: zero needs none, and a signed value needs one extra bit for the sign
+static int nbits(int n, bool issigned)
+{
+	if (n == 0)
+		return 0;	// no bits needed for 0, signed or not
+
+	int count = 0;
+	if (n < 0)
+	{
+		// negative: shift until only the sign extension (-1) is left, then account for the sign bit
+		assert (issigned);
+		while (n < -1) { n >>= 1; ++count; }
+		return count + 1;
+	}
+	// positive: count the magnitude bits
+	while (n != 0) { n >>= 1; ++count; }
+	return issigned ? count + 1 : count;
+}
+
+#define	R_0	ep[0].A[i]	// channel i of endpoint A of region 0; needs ep and i in scope
+#define	R_1 ep[0].B[i]	// channel i of endpoint B of region 0; needs ep and i in scope
+
+// Forward transform: for the channel groups selected by transform_mode (RGB and/or alpha),
+// replace endpoint B of region 0 by its difference from endpoint A.
+static void transform_forward(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	const bool rgb_selected = (transform_mode & TRANSFORM_MODE_RGB) != 0;
+	const bool alpha_selected = (transform_mode & TRANSFORM_MODE_ALPHA) != 0;
+
+	for (int i = CHANNEL_R; rgb_selected && i < CHANNEL_A; ++i)
+		R_1 -= R_0;
+	// alpha is a single channel, so no loop is needed
+	if (alpha_selected)
+		ep[0].B[CHANNEL_A] -= ep[0].A[CHANNEL_A];
+}
+
+// Inverse transform: for the channel groups selected by transform_mode, rebuild endpoint B of region 0 by adding endpoint A back.
+static void transform_inverse(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	IntEndptsRGBA &r0 = ep[0];
+
+	if (transform_mode & TRANSFORM_MODE_RGB)
+	{
+		for (int ch = CHANNEL_R; ch < CHANNEL_A; ++ch)
+			r0.B[ch] += r0.A[ch];
+	}
+	if (transform_mode & TRANSFORM_MODE_ALPHA)
+		r0.B[CHANNEL_A] += r0.A[CHANNEL_A];
+}
+
+// Quantize both float endpoints of every region, channel by channel (X,Y,Z,W -> 0..3), to the precision pattern_prec gives for that endpoint and channel.
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA q_endpts[NREGIONS])
+{
+	for (int r = 0; r < NREGIONS; ++r)
+	{
+		q_endpts[r].A[0] = Utils::quantize(endpts[r].A.X(), pattern_prec.region_precs[r].endpt_a_prec[0]);
+		q_endpts[r].B[0] = Utils::quantize(endpts[r].B.X(), pattern_prec.region_precs[r].endpt_b_prec[0]);
+		q_endpts[r].A[1] = Utils::quantize(endpts[r].A.Y(), pattern_prec.region_precs[r].endpt_a_prec[1]);
+		q_endpts[r].B[1] = Utils::quantize(endpts[r].B.Y(), pattern_prec.region_precs[r].endpt_b_prec[1]);
+		q_endpts[r].A[2] = Utils::quantize(endpts[r].A.Z(), pattern_prec.region_precs[r].endpt_a_prec[2]);
+		q_endpts[r].B[2] = Utils::quantize(endpts[r].B.Z(), pattern_prec.region_precs[r].endpt_b_prec[2]);
+		q_endpts[r].A[3] = Utils::quantize(endpts[r].A.W(), pattern_prec.region_precs[r].endpt_a_prec[3]);
+		q_endpts[r].B[3] = Utils::quantize(endpts[r].B.W(), pattern_prec.region_precs[r].endpt_b_prec[3]);
+	}
+}
+
+// Ensure that the anchor index of each region (position 0 here) has a 0 high-order bit, separately for the RGB and
+// the A index array: if the bit is set, swap the endpoints of those channels and mirror that region's indices.
+static void swap_indices(int shapeindex, int indexmode, IntEndptsRGBA endpts[NREGIONS], int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	int anchor[NREGIONS];
+	anchor[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		// anchor is 0 at x=0 y=0 and 15 at x=3 y=3, so y = (anchor >> 2) & 3 and x = anchor & 3
+		const int ax = anchor[region] & 3;
+		const int ay = (anchor[region] >> 2) & 3;
+		assert(REGION(ax,ay,shapeindex) == region);		// double check the table
+
+		if (indices[INDEXARRAY_RGB][ay][ax] & HIGH_INDEXBIT_RGB(indexmode))
+		{
+			// high bit is set: exchange the RGB channels of A and B, then mirror the RGB indices
+			for (int ch=CHANNEL_R; ch<=CHANNEL_B; ++ch)
+			{
+				const int saved = endpts[region].A[ch];
+				endpts[region].A[ch] = endpts[region].B[ch];
+				endpts[region].B[ch] = saved;
+			}
+			for (int py = 0; py < Tile::TILE_H; py++)
+			for (int px = 0; px < Tile::TILE_W; px++)
+				if (REGION(px,py,shapeindex) == region)
+					indices[INDEXARRAY_RGB][py][px] = NINDICES_RGB(indexmode) - 1 - indices[INDEXARRAY_RGB][py][px];
+		}
+
+		if (indices[INDEXARRAY_A][ay][ax] & HIGH_INDEXBIT_A(indexmode))
+		{
+			// high bit is set: exchange the alpha channel of A and B, then mirror the A indices
+			const int saved = endpts[region].A[CHANNEL_A];
+			endpts[region].A[CHANNEL_A] = endpts[region].B[CHANNEL_A];
+			endpts[region].B[CHANNEL_A] = saved;
+			for (int py = 0; py < Tile::TILE_H; py++)
+			for (int px = 0; px < Tile::TILE_W; px++)
+				if (REGION(px,py,shapeindex) == region)
+					indices[INDEXARRAY_A][py][px] = NINDICES_A(indexmode) - 1 - indices[INDEXARRAY_A][py][px];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA endpts[NREGIONS], const Pattern &p)
+{
+	return true;	// unconditionally reports a fit; neither endpts nor p is examined
+}
+
+// Write the block header: mode bits, rotation bits, then endpoints A and B per channel (shapeindex and indexmode are not written).
+static void write_header(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, int rotatemode, int indexmode, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(rotatemode, ROTATEMODE_BITS);
+	for (int region=0; region<NREGIONS; ++region)
+		for (int ch=0; ch<NCHANNELS_RGBA; ++ch)
+		{
+			const ChanBits &width = p.chan[ch];
+			out.write(endpts[region].A[ch], width.nbitsizes[0]);
+			out.write(endpts[region].B[ch], width.nbitsizes[1]);
+		}
+	assert (out.getptr() == 66);	// the header is expected to end at bit 66
+}
+
+// Read a block header: consume the mode bits, then read the rotation and the endpoints of every channel.
+// Only one pattern exists and neither a shape nor an index mode is read, so pat_index, shapeindex and
+// indexmode are always returned as 0.
+static void read_header(Bits &in, IntEndptsRGBA endpts[NREGIONS], int &shapeindex, int &rotatemode, int &indexmode, Pattern &p, int &pat_index)
+{
+	AVPCL::getmode(in);		// result unused; the assert below expects the read position right after the mode bits
+
+	pat_index = 0;
+	assert (pat_index >= 0 && pat_index < NPATTERNS);
+	assert (in.getptr() == patterns[pat_index].modebits);
+	p = patterns[pat_index];
+
+	shapeindex = 0;		// we don't have any
+	rotatemode = in.read(ROTATEMODE_BITS);
+	indexmode = 0;		// we don't have any
+
+	for (int region=0; region<NREGIONS; ++region)
+		for (int ch=0; ch<NCHANNELS_RGBA; ++ch)
+		{
+			endpts[region].A[ch] = in.read(p.chan[ch].nbitsizes[0]);
+			endpts[region].B[ch] = in.read(p.chan[ch].nbitsizes[1]);
+		}
+
+	assert (in.getptr() == 66);	// the header is expected to end at bit 66
+}
+
+// Write both index arrays, the INDEXARRAY_2BITS(indexmode) array first. Index 0 of each array is written
+// with one bit less than the others, which is why its high-order bit must be zero.
+static void write_indices(const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int shapeindex, int indexmode, Bits &out)
+{
+	const int arrays[NINDEXARRAYS] = { INDEXARRAY_2BITS(indexmode), INDEXARRAY_3BITS(indexmode) };
+	const int width[NINDEXARRAYS] = { INDEXBITS2, INDEXBITS3 };
+	for (int k = 0; k < NINDEXARRAYS; ++k)
+	{
+		// 1 << (width - 1) is the high-order index bit (HIGH_INDEXBIT2 / HIGH_INDEXBIT3)
+		assert ((indices[arrays[k]][0][0] & (1 << (width[k] - 1))) == 0);
+		for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+			out.write(indices[arrays[k]][pos>>2][pos&3], (pos == 0) ? width[k] - 1 : width[k]);
+	}
+}
+
+// Read both index arrays, the INDEXARRAY_2BITS(indexmode) array first; index 0 of each array is read with one bit less.
+static void read_indices(Bits &in, int shapeindex, int indexmode, int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	const int arrays[NINDEXARRAYS] = { INDEXARRAY_2BITS(indexmode), INDEXARRAY_3BITS(indexmode) };
+	const int width[NINDEXARRAYS] = { INDEXBITS2, INDEXBITS3 };
+	for (int k = 0; k < NINDEXARRAYS; ++k)
+	{
+		// pos runs over the tile row by row: y = pos >> 2, x = pos & 3
+		for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+			indices[arrays[k]][pos>>2][pos&3] = in.read((pos == 0) ? width[k] - 1 : width[k]);
+	}
+}
+
+// Serialize one compressed block into 'block': the header first, then the index arrays.
+// The two parts together must fill exactly AVPCL::BITSIZE bits.
+static void emit_block(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int rotatemode, int indexmode, char *block)
+{
+	Bits bitstream(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, rotatemode, indexmode, bitstream);
+	write_indices(indices, shapeindex, indexmode, bitstream);
+
+	// every bit of the block has to be accounted for
+	assert(bitstream.getptr() == AVPCL::BITSIZE);
+}
+
+// Build the palettes for one region from its quantized endpoints. Each channel's A and B endpoints are
+// unquantized with that channel's precision and then interpolated with PALETTE_LERP. R, G and B use the
+// entry count, bias and denominator of the RGB index array; alpha uses those of the A index array.
+static void generate_palette_quantized_rgb_a(const IntEndptsRGBA &endpts, const RegionPrec &region_prec, int indexmode, Vec3 palette_rgb[NINDICES3], float palette_a[NINDICES3])
+{
+	// unquantize both endpoints of every channel up front
+	const int r_lo = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]);
+	const int r_hi = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]);
+	const int g_lo = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]);
+	const int g_hi = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]);
+	const int b_lo = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]);
+	const int b_hi = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]);
+	const int a_lo = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]);
+	const int a_hi = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]);
+
+	// R, G and B have the same number of entries, so one pass fills all three
+	for (int i = 0; i < NINDICES_RGB(indexmode); ++i)
+	{
+		palette_rgb[i].X() = PALETTE_LERP(r_lo, r_hi, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode));
+		palette_rgb[i].Y() = PALETTE_LERP(g_lo, g_hi, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode));
+		palette_rgb[i].Z() = PALETTE_LERP(b_lo, b_hi, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode));
+	}
+
+	// alpha is interpolated with its own entry count, bias and denominator
+	for (int i = 0; i < NINDICES_A(indexmode); ++i)
+		palette_a[i] = PALETTE_LERP(a_lo, a_hi, i, BIAS_A(indexmode), DENOM_A(indexmode));
+}
+
+// Sign-extend the stored endpoint values of a pattern that has a transform mode; patterns without one
+// are left untouched. endpts[0].A is never extended (the original author notes it is always positive).
+// NOTE(review): this touches endpts[1] and nbitsizes[2..3], which only exist when NREGIONS >= 2 --
+// confirm that no pattern of this mode sets transform_mode.
+static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS])
+{
+	if (!p.transform_mode)
+		return;
+
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		endpts[0].B[ch] = SIGN_EXTEND(endpts[0].B[ch], p.chan[ch].nbitsizes[1]);
+		endpts[1].A[ch] = SIGN_EXTEND(endpts[1].A[ch], p.chan[ch].nbitsizes[2]);
+		endpts[1].B[ch] = SIGN_EXTEND(endpts[1].B[ch], p.chan[ch].nbitsizes[3]);
+	}
+}
+
+// Copy 'in' to 'out', exchanging the W component of every pixel with the component selected by
+// rotatemode (X for AGBR, Y for RABG, Z for RGAB, nothing for RGBA). Any other mode asserts.
+static void rotate_tile(const Tile &in, int rotatemode, Tile &out)
+{
+	out.size_x = in.size_x;
+	out.size_y = in.size_y;
+
+	for (int y=0; y<in.size_y; ++y)
+	for (int x=0; x<in.size_x; ++x)
+	{
+		Vec4 &px = out.data[y][x];
+		float saved;
+
+		px = in.data[y][x];
+
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA:
+			break;
+		case ROTATEMODE_RGBA_AGBR:
+			saved = px.X(); px.X() = px.W(); px.W() = saved;
+			break;
+		case ROTATEMODE_RGBA_RABG:
+			saved = px.Y(); px.Y() = px.W(); px.W() = saved;
+			break;
+		case ROTATEMODE_RGBA_RGAB:
+			saved = px.Z(); px.Z() = px.W(); px.W() = saved;
+			break;
+		default:
+			assert(0);
+		}
+	}
+}
+
+// Decode one mode 5 block into tile t: read header and indices, rebuild the per-region palettes,
+// look every pixel up, and finally apply the channel rotation stored in the block.
+void AVPCL::decompress_mode5(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	// header, then undo whatever endpoint transform the pattern uses
+	Pattern p;
+	IntEndptsRGBA endpts[NREGIONS];
+	int shapeindex, pat_index, rotatemode, indexmode;
+
+	read_header(in, endpts, shapeindex, rotatemode, indexmode, p, pat_index);
+	sign_extend(p, endpts);
+	if (p.transform_mode)
+		transform_inverse(p.transform_mode, endpts);
+
+	// the indices take up the rest of the block
+	int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indexmode, indices);
+	assert(in.getptr() == AVPCL::BITSIZE);
+
+	// one RGB palette and one alpha palette per region
+	Vec3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+
+	for (int region = 0; region < NREGIONS; ++region)
+		generate_palette_quantized_rgb_a(endpts[region], pattern_precs[pat_index].region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+
+	// palette lookup into a scratch tile that is still in rotated channel order
+	Tile temp(t.size_x, t.size_y);
+
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+	{
+		const int region = REGION(x,y,shapeindex);
+		const Vec3 &rgb = palette_rgb[region][indices[INDEXARRAY_RGB][y][x]];
+		const float alpha = palette_a[region][indices[INDEXARRAY_A][y][x]];
+
+		temp.data[y][x] = Vec4(rgb, alpha);
+	}
+
+	// the same swap that produced the rotated tile also restores the original channel order
+	rotate_tile(temp, rotatemode, t);
+}
+
+// Given the colors of one region and a pair of quantized endpoints, generate the palettes, choose the
+// best RGB and A palette entry for every color, and return the summed error.
+// The caller already has a candidate mapping, thus an error (current_besterr). As soon as the error
+// accumulated so far exceeds it, indices[..][i..np-1] are set to -1 and DBL_MAX is returned.
+//   colors, np      the region's colors (already in rotated channel order)
+//   rotatemode      which channel, if any, was exchanged with alpha
+//   indices         receives the chosen entry of each color for both index arrays
+static double map_colors(const Vec4 colors[], int np, int rotatemode, int indexmode, const IntEndptsRGBA &endpts, const RegionPrec &region_prec, double current_besterr, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	Vec3 palette_rgb[NINDICES3];	// could be nindices2
+	float palette_a[NINDICES3];	// could be nindices2
+	double toterr = 0;
+
+	generate_palette_quantized_rgb_a(endpts, region_prec, indexmode, &palette_rgb[0], &palette_a[0]);
+
+	Vec3 rgb;
+	float a;
+
+	for (int i = 0; i < np; ++i)
+	{
+		double err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		// with premultiplied alpha we need the real alpha, which sits in whichever channel the rotation put it
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (colors[i]).X() :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (colors[i]).Y() :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (colors[i]).Z() : (colors[i]).W();
+
+		rgb.X() = (colors[i]).X();
+		rgb.Y() = (colors[i]).Y();
+		rgb.Z() = (colors[i]).Z();
+		a = (colors[i]).W();
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = DBL_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = Utils::metric1(a, palette_a[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					palette_alpha = palette_a[j];
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = DBL_MAX;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[j], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			toterr += besterr;
+		}
+		else
+		{
+			// do RGB index first: after rotation the alpha value lives in one of the RGB channels
+			besterr = DBL_MAX;
+			// start at a valid entry so the palette lookup below never reads an indeterminate index,
+			// even if no candidate compares below DBL_MAX (e.g. the metric returned NaN)
+			int bestindex = 0;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					bestindex = j;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).X() :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).Y() :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).Z() : (assert(0),0);
+			toterr += besterr;
+
+			// do A index
+			besterr = DBL_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[j], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[j], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+		}
+
+		// early exit, shared by both branches: we can no longer beat the caller's candidate
+		if (toterr > current_besterr)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+			{
+				indices[INDEXARRAY_RGB][k] = -1;
+				indices[INDEXARRAY_A][k] = -1;
+			}
+			return DBL_MAX;
+		}
+	}
+	return toterr;
+}
+
+// Assign indices given a tile, shape, and quantized endpoints; return toterr for each region.
+// For every pixel the closest entry of its region's RGB palette and of its A palette is chosen
+// independently and the two errors are added to toterr[region]. The index that determines the alpha
+// value is always chosen first, because the premultiplied metrics of the other index depend on it.
+static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int indexmode, IntEndptsRGBA endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], double toterr[NREGIONS])
+{
+	Vec3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized_rgb_a(endpts[region], pattern_prec.region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vec3 rgb;
+	float a;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		rgb.X() = (tile.data[y][x]).X();
+		rgb.Y() = (tile.data[y][x]).Y();
+		rgb.Z() = (tile.data[y][x]).Z();
+		a = (tile.data[y][x]).W();
+
+		// with premultiplied alpha we need the real alpha, which sits in whichever channel the rotation put it
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (tile.data[y][x]).X() :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (tile.data[y][x]).Y() :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (tile.data[y][x]).Z() : (tile.data[y][x]).W();
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = DBL_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = Utils::metric1(a, palette_a[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+					palette_alpha = palette_a[region][i];
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = DBL_MAX;
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[region][i], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;
+		}
+		else
+		{
+			// do RGB index first as it has the alpha
+			besterr = DBL_MAX;
+			// start at a valid entry so the palette lookup below never reads an indeterminate index,
+			// even if no candidate compares below DBL_MAX (e.g. the metric returned NaN)
+			int bestindex = 0;
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+					bestindex = i;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).X() :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).Y() :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).Z() : (assert(0),0);
+			toterr[region] += besterr;
+
+			// do A index
+			besterr = DBL_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[region][i], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[region][i], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+		}
+	}
+}
+
+// Move one endpoint (A if do_b == 0, otherwise B) of channel ch in steps of decreasing size and keep
+// every move that lowers the error of mapping 'colors' onto the resulting palette.
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+//   old_endpts   starting endpoints, whose error is old_err
+//   new_endpts   receives the endpoints after all accepted moves
+// All errors are kept in double: map_colors returns double (DBL_MAX on early exit), and rounding
+// old_err to float could make an unchanged error look like an improvement to the caller.
+static double perturb_one(const Vec4 colors[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts, 
+						  double old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGBA temp_endpts;
+	double min_err = old_err;		// start with the best current error
+	int beststep = 0;				// only read after a step has been accepted
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	// mark the output as invalid until an improvement is found
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			// try new_endpts +/- step, skipping values outside [0, 2^prec)
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+			double err = map_colors(colors, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					indices[j][i] = temp_indices[j][i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// Exhaustively try every (A, B) pair of channel ch in a window around opt_endpts and keep the best one.
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+//
+// Returns the best error found, which is orig_err when nothing better exists. Only when the returned
+// value is below orig_err are opt_endpts updated and 'indices' valid; otherwise 'indices' holds -1's.
+// Errors are kept in double: map_colors returns double, and DBL_MAX does not fit in a float.
+static double exhaustive(const Vec4 colors[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, double orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA temp_endpts;
+	double best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	// mark the output as invalid until an improvement is found
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	double thr_scale = (double)np / (double)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = MAX(adelta, 3);
+	bdelta = MAX(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B; all four bounds are inclusive
+	int alow = MAX(0, opt_endpts.A[ch] - adelta);
+	int ahigh = MIN((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = MAX(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = MIN((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// best pair so far; only used once an improvement has been recorded
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];
+
+	// now there's no need to swap the ordering of A and B
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = MAX(a, blow); b <= bhigh; ++b)		// bhigh itself is a valid candidate
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b <= bhigh; ++b)				// bhigh itself is a valid candidate
+		for (int a = MAX(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+
+	// publish the result only if the search actually found something better
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		for (int j=0; j<NINDEXARRAYS; ++j)
+		for (int i=0; i<np; ++i)
+			indices[j][i] = good_indices[j][i];
+	}
+
+	return best_err;
+}
+
+static double optimize_one(const Vec4 colors[], int np, int rotatemode, int indexmode, double orig_err, const IntEndptsRGBA &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA &opt_endpts)
+{
+	double opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA new_a, new_b;
+	IntEndptsRGBA new_endpt;
+	int do_b;
+	int orig_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int new_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices0[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices1[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		float err0 = perturb_one(colors, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+		float err1 = perturb_one(colors, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices0[j][i];
+				assert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices1[j][i];
+				assert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+			float err = perturb_one(colors, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = temp_indices0[j][i];
+				assert (orig_indices[j][i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[INDEXARRAY_RGB][i] != new_indices[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != new_indices[INDEXARRAY_A][i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		double new_err = exhaustive(colors, np, rotatemode, indexmode, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[j][i] = temp_indices0[j][i];
+					assert (orig_indices[j][i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[INDEXARRAY_RGB][i] != temp_indices0[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != temp_indices0[INDEXARRAY_A][i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+// Run optimize_one on every region of the tile. For each region opt_endpts/opt_err receive the
+// optimized endpoints and their error when that error is lower than orig_err, and the original
+// endpoints and error otherwise.
+static void optimize_endpts(const Tile &tile, int shapeindex, int rotatemode, int indexmode, const double orig_err[NREGIONS], 
+							const IntEndptsRGBA orig_endpts[NREGIONS], const PatternPrec &pattern_prec, double opt_err[NREGIONS], IntEndptsRGBA opt_endpts[NREGIONS])
+{
+	Vec4 pixels[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// gather the pixels that belong to this region, in scan order
+		int np = 0;
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+		{
+			if (REGION(x,y,shapeindex) == region)
+				pixels[np++] = tile.data[y][x];
+		}
+
+		// the unoptimized endpoints are the result unless the optimizer does better
+		const IntEndptsRGBA start = orig_endpts[region];
+		const double start_err = orig_err[region];
+
+		opt_endpts[region] = start;
+		opt_err[region] = start_err;
+
+		// now try to optimize these endpoints
+		IntEndptsRGBA candidate;
+		const double candidate_err = optimize_one(pixels, np, rotatemode, indexmode, start_err, start, pattern_prec.region_precs[region], candidate);
+
+		// accept the candidate only on a strict improvement
+		if (candidate_err < start_err)
+		{
+			opt_err[region] = candidate_err;
+			opt_endpts[region] = candidate;
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		quantize the float endpoints with the pattern's precision
+		assign indices and get the initial error
+		swap_indices (compress indices, possibly reordering endpoints)
+		transform endpoints
+		if the transformed endpoints do not fit the pattern, try the next pattern
+		get the original endpoints back
+		optimize endpoints, then assign indices again for the new endpoints and get the new error
+		swap_indices and transform the new endpoints
+		if the new endpoints fit the pattern AND the error is improved
+			emit compressed block with the new data
+		else
+			emit compressed block with the original data // to try to preserve maximum endpoint precision
+
+	returns the total error of the block that was emitted; throws if no pattern fits
+*/
+
+static double refine(const Tile &tile, int shapeindex_best, int rotatemode, int indexmode, const FltEndpts endpts[NREGIONS], char *block)
+{
+	double orig_err[NREGIONS], opt_err[NREGIONS], expected_opt_err[NREGIONS];
+	IntEndptsRGBA orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], opt_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		const bool has_transform = (patterns[sp].transform_mode != 0);
+
+		// baseline: plain quantization of the rough endpoints
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+
+		assign_indices(tile, shapeindex_best, rotatemode, indexmode, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(shapeindex_best, indexmode, orig_endpts, orig_indices);
+
+		if (has_transform)
+			transform_forward(patterns[sp].transform_mode, orig_endpts);
+
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (!endpts_fit(orig_endpts, patterns[sp]))
+			continue;
+
+		// the optimizer works on untransformed endpoints
+		if (has_transform)
+			transform_inverse(patterns[sp].transform_mode, orig_endpts);
+
+		optimize_endpts(tile, shapeindex_best, rotatemode, indexmode, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+
+		// redo the index assignment for the optimized endpoints; it must reproduce the optimizer's error
+		assign_indices(tile, shapeindex_best, rotatemode, indexmode, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+		for (int region = 0; region < NREGIONS; ++region)
+			assert(expected_opt_err[region] == opt_err[region]);
+		swap_indices(shapeindex_best, indexmode, opt_endpts, opt_indices);
+
+		if (has_transform)
+			transform_forward(patterns[sp].transform_mode, opt_endpts);
+
+		// total the per-region errors of both candidates
+		double orig_toterr = 0, opt_toterr = 0;
+		for (int region = 0; region < NREGIONS; ++region)
+		{
+			orig_toterr += orig_err[region];
+			opt_toterr += opt_err[region];
+		}
+
+		if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+		{
+			emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, rotatemode, indexmode, block);
+			return opt_toterr;
+		}
+
+		// either it stopped fitting when we optimized it, or there was no improvement
+		// so go back to the unoptimized endpoints which we know will fit
+		if (has_transform)
+			transform_forward(patterns[sp].transform_mode, orig_endpts);
+		emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, rotatemode, indexmode, block);
+		return orig_toterr;
+	}
+	throw "No candidate found, should never happen (avpcl mode 5).";
+}
+
+// Force a single component into the range RGBA_MIN..RGBA_MAX (lower bound first, then upper bound).
+template <class T>
+static inline void clamp_component(T &c)
+{
+	if (c < RGBA_MIN) c = RGBA_MIN;
+	if (c > RGBA_MAX) c = RGBA_MAX;
+}
+
+// Clamp all four components of v to the range RGBA_MIN..RGBA_MAX.
+static void clamp(Vec4 &v)
+{
+	clamp_component(v.X());
+	clamp_component(v.Y());
+	clamp_component(v.Z());
+	clamp_component(v.W());
+}
+
+// Compute initial floating-point endpoints for every region of the tile.
+// The first three components ("RGB") get the two extreme projections of the region's pixels onto the
+// first row of svd.R(), taken as the principal direction; the fourth ("A") simply gets the minimum and
+// maximum of that component. Note these channels may have been rotated.
+// Regions with fewer than three pixels use the pixels themselves (or 0,0,0,RGBA_MAX when empty).
+static void rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		FltEndpts &ep = endpts[region];
+
+		// collect the region's pixels and their sum
+		Vec4 colors[Tile::TILE_TOTAL];
+		Vec4 mean(0,0,0,0);
+		int np = 0;
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+		{
+			if (REGION(x,y,shapeindex) != region)
+				continue;
+
+			colors[np++] = tile.data[y][x];
+			mean += tile.data[y][x];
+		}
+
+		// handle simple cases: too few points for a decomposition
+		if (np == 0)
+		{
+			Vec4 zero(0,0,0,RGBA_MAX);
+			ep.A = zero;
+			ep.B = zero;
+			continue;
+		}
+		if (np <= 2)
+		{
+			// one pixel: both endpoints are that pixel; two pixels: one endpoint each
+			ep.A = colors[0];
+			ep.B = colors[np-1];
+			continue;
+		}
+
+		mean /= float(np);
+
+		// mean-centered data: RGB goes into the matrix, alpha is kept aside
+		Matrix rdq(np, 3);
+		float alpha[Tile::TILE_TOTAL];
+
+		for (int i = 0; i < np; ++i)
+		{
+			const Vec4 &c = colors[i];
+
+			rdq(i,0) = c.X() - mean.X();
+			rdq(i,1) = c.Y() - mean.Y();
+			rdq(i,2) = c.Z() - mean.Z();
+			alpha[i] = c.W() - mean.W();
+		}
+
+		// perform a singular value decomposition
+		SVD svd(rdq);
+
+		// get the principal component direction (the one with the largest weight)
+		// hack the alpha channel: it takes no part in the direction
+		Vec4 direction(svd.R()(0,0), svd.R()(0,1), svd.R()(0,2), 0.0);
+
+		// extent of the pixels along the principal direction, and of alpha on its own
+		double minp = DBL_MAX, maxp = -DBL_MAX;
+		double mina = DBL_MAX, maxa = -DBL_MAX;
+		for (int i = 0; i < np; i++)
+		{
+			const float proj = rdq(i,0) * direction.X() + rdq(i,1)*direction.Y() + rdq(i,2)*direction.Z();
+
+			if (proj < minp) minp = proj;
+			if (proj > maxp) maxp = proj;
+
+			if (alpha[i] < mina) mina = alpha[i];
+			if (alpha[i] > maxa) maxa = alpha[i];
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		ep.A = mean + minp*direction;
+		ep.B = mean + maxp*direction;
+		ep.A.W() = mean.W() + mina;
+		ep.B.W() = mean.W() + maxa;
+
+		// clamp endpoints
+		// WORK: is [0,255] the right range, or should it be [0,255.5) or even [0,256) ?
+		clamp(ep.A);
+		clamp(ep.B);
+	}
+}
+
+double AVPCL::compress_mode5(const Tile &t, char *block)
+{
+	FltEndpts endpts[NREGIONS];
+	char tempblock[AVPCL::BLOCKSIZE];
+	double msebest = DBL_MAX;
+	int shape = 0;
+	Tile t1;
+
+	// try all rotations. refine tries the 2 different indexings.
+	for (int r = 0; r < NROTATEMODES && msebest > 0; ++r)
+	{
+		rotate_tile(t, r, t1);
+		rough(t1, shape, endpts);
+//		for (int i = 0; i < NINDEXMODES && msebest > 0; ++i)
+		for (int i = 0; i < 1 && msebest > 0; ++i)
+		{
+			double mse = refine(t1, shape, r, i, endpts, tempblock);
+			if (mse < msebest)
+			{
+				memcpy(block, tempblock, sizeof(tempblock));
+				msebest = mse;
+			}
+		}
+	}
+	return msebest;
+}
diff --git a/src/nvtt/bc7/avpcl_mode6.cpp b/src/nvtt/bc7/avpcl_mode6.cpp
new file mode 100644
index 0000000..13e07fb
--- /dev/null
+++ b/src/nvtt/bc7/avpcl_mode6.cpp
@@ -0,0 +1,1059 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x1000000 7777.1x2 4bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "arvo/Vec4.h"
+#include "arvo/Matrix.h"
+#include "arvo/SVD.h"
+#include "utils.h"
+#include "endpts.h"
+
+#include <assert.h>
+
+using namespace ArvoMath;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	16	// palette entries per region (2^INDEXBITS)
+#define	INDEXBITS	4
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// top bit of an index; must be 0 for the shortened (anchor) index
+#define	DENOM		(NINDICES-1)	// passed to PALETTE_LERP as the denominator
+#define	BIAS		(DENOM/2)	// passed to PALETTE_LERP as the bias for integer interpolation
+
+#define	NSHAPES	1
+
+static int shapes[NSHAPES] =
+{
+	0x0000,	// all 16 mask bits clear, so REGION() is 0 for every pixel
+};
+
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)	// bit 15 is pixel (0,0), bit 0 is pixel (3,3)
+
+#define	NREGIONS	1
+
+#define	NBITSIZES	(NREGIONS*2)	// one bit size for endpoint A and one for endpoint B, per region
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel, indexed with ABITINDEX(region) / BBITINDEX(region)
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;	// verilog description of encoding for this mode; initialised from a string literal, hence const
+};
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red	green	blue	alpha	mode  mb verilog   (each channel pair is the A,B endpoint bit size)
+	7,7,	7,7,	7,7,	7,7,	0x40, 7, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];	// per-channel precision (in bits) of endpoint A
+	int endpt_b_prec[NCHANNELS_RGBA];	// per-channel precision (in bits) of endpoint B
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];	// one set of endpoint precisions per region
+};
+
+// this is the precision for each channel and region (layout: endpt_a_prec r,g,b,a then endpt_b_prec r,g,b,a)
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO assert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	7,7,7,7,	7,7,7,7,
+};
+
+// number of bits required to hold n: 0 needs none, and a signed representation costs one extra (sign) bit
+static int nbits(int n, bool issigned)
+{
+	int width = 0;
+	if (n > 0)
+	{
+		// positive: count the shifts needed to exhaust the value
+		while (n != 0) { n >>= 1; ++width; }
+		return issigned ? width + 1 : width;
+	}
+	if (n < 0)
+	{
+		assert (issigned);
+		while (n < -1) { n >>= 1; ++width; }
+		return width + 1;
+	}
+	return 0;	// zero needs no bits, signed or not
+}
+
+/*
+we're using this table to assign lsbs
+abgr	>=2	correct
+0000	0	0
+0001	0	0
+0010	0	0
+0011	1	x1
+0100	0	0
+0101	1	x1
+0110	1	x1
+0111	1	1
+1000	0	0
+1001	1	x0
+1010	1	x0
+1011	1	1
+1100	1	x0
+1101	1	1
+1110	1	1
+1111	1	1
+
+we need 8 0's and 8 1's. the x's can be either 0 or 1 as long as you get 8/8.
+I choose to assign the lsbs so that the rgb channels are as good as possible.
+*/
+
+// 8888 ->7777.1: each channel keeps its top 7 bits; the single shared lsb per endpoint is set when at least 2 of the r,g,b lsbs are set
+static void compress_one(const IntEndptsRGBA& endpts, IntEndptsRGBA_2& compr_endpts)
+{
+	int onescnt;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		// ignore the alpha channel in the count
+		onescnt += (j==CHANNEL_A) ? 0 : (endpts.A[j] & 1);
+		compr_endpts.A[j] = endpts.A[j] >> 1;
+		assert (compr_endpts.A[j] < 128);	// must fit the 7-bit field
+	}
+	compr_endpts.a_lsb = onescnt >= 2;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		onescnt += (j==CHANNEL_A) ? 0 : (endpts.B[j] & 1);	// alpha is ignored here as well
+		compr_endpts.B[j] = endpts.B[j] >> 1;
+		assert (compr_endpts.B[j] < 128);	// must fit the 7-bit field
+	}
+	compr_endpts.b_lsb = onescnt >= 2;
+}
+
+static void uncompress_one(const IntEndptsRGBA_2& compr_endpts, IntEndptsRGBA& endpts)
+{
+	for (int chan = NCHANNELS_RGBA - 1; chan >= 0; --chan)	// every channel of an endpoint shares that endpoint's one lsb
+	{
+		endpts.B[chan] = compr_endpts.b_lsb | (compr_endpts.B[chan] << 1);
+		endpts.A[chan] = compr_endpts.a_lsb | (compr_endpts.A[chan] << 1);
+	}
+}
+
+static void uncompress_endpoints(const IntEndptsRGBA_2 compr_endpts[NREGIONS], IntEndptsRGBA endpts[NREGIONS])	// per-region wrapper around uncompress_one
+{
+	for (int i=0; i<NREGIONS; ++i)
+		uncompress_one(compr_endpts[i], endpts[i]);
+}
+
+static void compress_endpoints(const IntEndptsRGBA endpts[NREGIONS], IntEndptsRGBA_2 compr_endpts[NREGIONS])	// per-region wrapper around compress_one
+{
+	for (int i=0; i<NREGIONS; ++i)
+		compress_one(endpts[i], compr_endpts[i]);
+}
+
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA_2 q_endpts[NREGIONS])	// float endpoints -> 7777.1 integer endpoints
+{
+	IntEndptsRGBA full_endpts[NREGIONS];	// endpoints quantized at prec+1 bits, before the lsb is split off
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		full_endpts[region].A[0] = Utils::quantize(endpts[region].A.X(), pattern_prec.region_precs[region].endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+		full_endpts[region].A[1] = Utils::quantize(endpts[region].A.Y(), pattern_prec.region_precs[region].endpt_a_prec[1]+1);
+		full_endpts[region].A[2] = Utils::quantize(endpts[region].A.Z(), pattern_prec.region_precs[region].endpt_a_prec[2]+1);
+		full_endpts[region].A[3] = Utils::quantize(endpts[region].A.W(), pattern_prec.region_precs[region].endpt_a_prec[3]+1);
+
+		full_endpts[region].B[0] = Utils::quantize(endpts[region].B.X(), pattern_prec.region_precs[region].endpt_b_prec[0]+1);
+		full_endpts[region].B[1] = Utils::quantize(endpts[region].B.Y(), pattern_prec.region_precs[region].endpt_b_prec[1]+1);
+		full_endpts[region].B[2] = Utils::quantize(endpts[region].B.Z(), pattern_prec.region_precs[region].endpt_b_prec[2]+1);
+		full_endpts[region].B[3] = Utils::quantize(endpts[region].B.W(), pattern_prec.region_precs[region].endpt_b_prec[3]+1);
+
+		compress_one(full_endpts[region], q_endpts[region]);	// split into 7-bit fields plus one shared lsb per endpoint
+	}
+}
+
+// swap endpoints as needed to ensure that the index at each region's anchor position (index_positions[]) has a 0 high-order bit
+// a position is 0 at x=0 y=0 and 15 at x=3 y=3 so y = (index >> 2) & 3 and x = index & 3
+static void swap_indices(IntEndptsRGBA_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	int index_positions[NREGIONS];
+
+	index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int x = index_positions[region] & 3;
+		int y = (index_positions[region] >> 2) & 3;
+		assert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGBA; ++i) 
+			{
+				t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
+			}
+			t = endpts[region].a_lsb; endpts[region].a_lsb = endpts[region].b_lsb; endpts[region].b_lsb = t;	// the lsbs travel with their endpoints
+
+			for (int y = 0; y < Tile::TILE_H; y++)	// note: these x,y shadow the anchor x,y above
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];	// mirror the index so it refers to the same palette entry after the swap
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA_2 endpts[NREGIONS], const Pattern &p)	// both arguments are ignored
+{
+	return true;	// unconditionally reports a fit
+}
+
+static void write_header(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)	// shapeindex is not written
+{
+	out.write(p.mode, p.modebits);
+
+	for (int j=0; j<NCHANNELS_RGBA; ++j)	// channel-major order: A then B for each region, one channel at a time
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[ABITINDEX(i)]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)	// the shared lsbs follow all of the endpoint fields
+	{
+		out.write(endpts[i].a_lsb, 1);
+		out.write(endpts[i].b_lsb, 1);
+	}
+
+	assert (out.getptr() == 65);	// with the single pattern above: 7 mode bits + 4 channels * 2 endpoints * 7 bits + 2 lsbs
+}
+
+static void read_header(Bits &in, IntEndptsRGBA_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)	// reads fields in the order write_header emits them
+{
+	int mode = AVPCL::getmode(in);	// value is not used below; presumably the call consumes the mode bits (see the getptr assert) -- confirm
+
+	pat_index = 0;	// only one pattern exists for this mode
+
+	assert (pat_index >= 0 && pat_index < NPATTERNS);
+	assert (in.getptr() == patterns[pat_index].modebits);
+
+	p = patterns[pat_index];
+
+	shapeindex = 0;		// we don't have any
+
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[ABITINDEX(i)]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+	
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		endpts[i].a_lsb  = in.read(1);
+		endpts[i].b_lsb  = in.read(1);
+	}
+
+	assert (in.getptr() == 65);	// same header length that write_header asserts
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)	// shapeindex is not used
+{
+	assert ((indices[0][0] & HIGH_INDEXBIT) == 0);	// the dropped high bit of index 0 must be 0
+
+	// the index we shorten is always index 0
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+	{
+		if (i==0)
+			out.write(indices[i>>2][i&3], INDEXBITS-1);	// write i..[2:0]
+		else
+			out.write(indices[i>>2][i&3], INDEXBITS);	// write i..[3:0]
+	}
+
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])	// shapeindex is not used
+{
+	// the index we shorten is always index 0
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+	{
+		if (i==0)
+			indices[i>>2][i&3] = in.read(INDEXBITS-1);	// read i..[2:0] (3 bits, matching write_indices)
+		else
+			indices[i>>2][i&3] = in.read(INDEXBITS);	// read i..[3:0] (4 bits, matching write_indices)
+	}
+}
+
+static void emit_block(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)	// serialise header then indices into block
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	assert(out.getptr() == AVPCL::BITSIZE);	// header plus indices must fill the block exactly
+}
+
+static void generate_palette_quantized(const IntEndptsRGBA_2 &endpts_2, const RegionPrec &region_prec, Vec4 palette[NINDICES])	// fills all NINDICES entries, one channel at a time
+{
+	IntEndptsRGBA endpts;
+
+	uncompress_one(endpts_2, endpts);	// re-attach the shared lsb to every channel
+
+	// scale endpoints
+	int a, b;			// really need a IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space 
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].X() = PALETTE_LERP(a, b, i, BIAS, DENOM);
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].Y() = PALETTE_LERP(a, b, i, BIAS, DENOM);
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].Z() = PALETTE_LERP(a, b, i, BIAS, DENOM);
+
+	a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]+1); 
+	b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].W() = PALETTE_LERP(a, b, i, BIAS, DENOM);
+}
+
+void AVPCL::decompress_mode6(const char *block, Tile &t)	// decode one block into t.data
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGBA_2 endpts[NREGIONS];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	Vec4 palette[NREGIONS][NINDICES];
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	assert(in.getptr() == AVPCL::BITSIZE);	// the whole block must have been consumed
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr (or DBL_MAX once toterr exceeds current_err)
+static double map_colors(const Vec4 colors[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec &region_prec, double current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vec4 palette[NINDICES];
+	double toterr = 0;
+	Vec4 err;	// never referenced: the loop below declares its own double err that shadows this one
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		double err, besterr = DBL_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(colors[i], palette[j]) :
+									     Utils::metric4premult(colors[i], palette[j]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return DBL_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], double toterr[NREGIONS])
+{
+	// build list of possibles
+	Vec4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vec4 err;	// never referenced: shadowed by the double err declared in the pixel loop
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double err, besterr = DBL_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(tile.data[y][x], palette[region][i]) :
+										 Utils::metric4premult(tile.data[y][x], palette[region][i]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;	// accumulate this pixel's best error into its region's total
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static double perturb_one(const Vec4 colors[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts, 
+						  double old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGBA_2 temp_endpts;
+	double min_err = old_err;		// start with the best current error; kept as double so "no improvement" returns old_err exactly
+	int beststep = 0;				// only read when improved is true, but never left indeterminate
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;	// candidate falls outside the representable range
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;	// candidate falls outside the representable range
+			}
+
+			double err = map_colors(colors, np, temp_endpts, region_prec, min_err, temp_indices);	// double: map_colors may return DBL_MAX, which does not fit a float
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static double exhaustive(const Vec4 colors[], int np, int ch, const RegionPrec &region_prec, double orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA_2 temp_endpts;
+	double best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	double thr_scale = (double)np / (double)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = MAX(adelta, 3);
+	bdelta = MAX(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B (all four bounds are inclusive)
+	int alow = MAX(0, opt_endpts.A[ch] - adelta);
+	int ahigh = MIN((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = MAX(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = MIN((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];	// only read after an improvement, but never left indeterminate
+
+	if (a_le_b)
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = MAX(a, blow); b <= bhigh; ++b)	// bhigh is inclusive, just like ahigh
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices);	// double: map_colors may return DBL_MAX
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b <= bhigh; ++b)	// bhigh is inclusive, just like ahigh
+		for (int a = MAX(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices);	// double: map_colors may return DBL_MAX
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static double optimize_one(const Vec4 colors[], int np, double orig_err, const IntEndptsRGBA_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA_2 &opt_endpts)
+{
+	double opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA_2 new_a, new_b;
+	IntEndptsRGBA_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		float err0 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+		float err1 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				assert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				assert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+			float err = perturb_one(colors, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				assert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		double new_err = exhaustive(colors, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					assert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const double orig_err[NREGIONS], 
+							IntEndptsRGBA_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, double opt_err[NREGIONS], IntEndptsRGBA_2 opt_endpts[NREGIONS])
+{
+	Vec4 pixels[Tile::TILE_TOTAL];
+	IntEndptsRGBA_2 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];	// filled by map_colors; the values are not read here
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+				pixels[np++] = tile.data[y][x];
+
+		opt_endpts[region] = temp_in = orig_endpts[region];	// outputs default to the inputs if nothing better is found
+		opt_err[region] = orig_err[region];
+
+		double best_err = orig_err[region];
+
+		// try all lsb modes as we search for better endpoints
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.a_lsb = lsbmode & 1;
+			temp_in.b_lsb = (lsbmode >> 1) & 1;
+
+			// make sure we have a valid error for temp_in
+			// we use DBL_MAX here because we want an accurate temp_in_err, no shortcuts
+			// (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the DBL_MAX position)
+			double temp_in_err = map_colors(pixels, np, temp_in, pattern_prec.region_precs[region], DBL_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+			double temp_out_err = optimize_one(pixels, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+
+     simplify the above given that there is no transform now and that endpoints will always fit
+*/
+
+static double refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)	// emits the better of the quantized / optimized encodings, returns its total error
+{
+	double orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)	// both branches at the bottom return, so only the first pattern is ever processed
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+
+		optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+
+		assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+		for (int i=0; i<NREGIONS; ++i)
+			assert(expected_opt_err[i] == opt_err[i]);	// the optimizer's reported error must match a fresh index assignment
+		swap_indices(opt_endpts, opt_indices, shapeindex_best);
+
+		orig_toterr = opt_toterr = 0;
+		for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+		assert (opt_toterr <= orig_toterr);
+
+		if (opt_toterr < orig_toterr)
+		{
+			emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+			return opt_toterr;
+		}
+		else
+		{
+			emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+			return orig_toterr;
+		}
+	}
+	throw "No candidate found, should never happen (avpcl mode 6).";
+}
+
+static void clamp(Vec4 &v)	// pin each of the four components into [RGBA_MIN, RGBA_MAX], in place
+{
+	if (RGBA_MIN > v.X()) v.X() = RGBA_MIN;
+	if (RGBA_MAX < v.X()) v.X() = RGBA_MAX;
+	if (RGBA_MIN > v.Y()) v.Y() = RGBA_MIN;
+	if (RGBA_MAX < v.Y()) v.Y() = RGBA_MAX;
+	if (RGBA_MIN > v.Z()) v.Z() = RGBA_MIN;
+	if (RGBA_MAX < v.Z()) v.Z() = RGBA_MAX;
+	if (RGBA_MIN > v.W()) v.W() = RGBA_MIN;
+	if (RGBA_MAX < v.W()) v.W() = RGBA_MAX;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vec4 palette[NREGIONS][NINDICES])	// float palette straight from float endpoints
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = PALETTE_LERP(endpts[region].A, endpts[region].B, i, 0.0, float(DENOM));	// bias of 0.0 here, unlike the integer palette
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static double map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vec4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	double toterr = 0;
+	Vec4 err;	// never referenced: shadowed by the double err declared in the pixel loop
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double err, besterr;
+
+		besterr = Utils::metric4(tile.data[y][x], palette[region][0]);	// always metric4 here; flag_premult is not consulted
+
+		for (int i = 1; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static double rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])	// fills endpts[] per region and returns map_colors()'s total error for them
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vec4 colors[Tile::TILE_TOTAL];
+		Vec4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x];
+				mean += tile.data[y][x];	// running sum; divided by np further down
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vec4 zero(0,0,0,RGBA_MAX);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[0];
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[1];
+			continue;
+		}
+
+		Matrix rdq(np, 4);	// one row per pixel: its r,g,b,a offset from the mean
+
+		mean /= float(np);
+
+		for (int i = 0; i < np; ++i)
+		{
+			rdq(i,0) = colors[i].X() - mean.X();
+			rdq(i,1) = colors[i].Y() - mean.Y();
+			rdq(i,2) = colors[i].Z() - mean.Z();
+			rdq(i,3) = colors[i].W() - mean.W();
+		}
+				
+		// perform a singular value decomposition
+		SVD svd(rdq);
+
+		// get the principal component direction (the one with the largest weight)
+		Vec4 direction(svd.R()(0,0), svd.R()(0,1), svd.R()(0,2), svd.R()(0,3));
+
+		// project each pixel value along the principal direction
+		double minp = DBL_MAX, maxp = -DBL_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = rdq(i,0) * direction.X() + rdq(i,1)*direction.Y() + rdq(i,2)*direction.Z() + rdq(i,3)*direction.W();
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*direction;
+		endpts[region].B = mean + maxp*direction;
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(double *list1, int *list2, int i, int j)	// exchange slots i and j in both parallel arrays
+{
+	const double held_key = list1[j]; list1[j] = list1[i]; list1[i] = held_key;
+	const int held_tag = list2[j]; list2[j] = list2[i]; list2[i] = held_tag;
+}
+
+double AVPCL::compress_mode6(const Tile &t, char *block)	// writes the best encoding found into block and returns its error
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=1;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} all[NSHAPES];
+	double roughmse[NSHAPES];
+	int index[NSHAPES];	// shape index carried alongside roughmse while sorting
+	char tempblock[AVPCL::BLOCKSIZE];
+	double msebest = DBL_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// bubble sort -- only need to bubble up the first NITEMS items
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)	// stop early once an error of 0 has been reached
+	{
+		int shape = index[i];
+		double mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));	// keep the best block seen so far
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+
diff --git a/src/nvtt/bc7/avpcl_mode7.cpp b/src/nvtt/bc7/avpcl_mode7.cpp
new file mode 100644
index 0000000..b2813e8
--- /dev/null
+++ b/src/nvtt/bc7/avpcl_mode7.cpp
@@ -0,0 +1,1098 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x10000000 5555.1x4 64p 2bi (30b)
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "arvo/Vec4.h"
+#include "arvo/Matrix.h"
+#include "arvo/SVD.h"
+#include "utils.h"
+#include "endpts.h"
+
+#include <assert.h>
+
+#include "shapes_two.h"
+
+using namespace ArvoMath;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	4		// palette entries per region
+#define	INDEXBITS	2		// bits used to store one palette index
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// mask selecting the most significant bit of an index
+#define	DENOM		(NINDICES-1)	// denominator handed to PALETTE_LERP
+#define	BIAS		(DENOM/2)	// bias handed to PALETTE_LERP when interpolating quantized endpoints
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)		// low two bits of a pixel position
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)	// next two bits of a pixel position
+
+#define	NBITSIZES	(NREGIONS*2)	// two entries (endpoint A and endpoint B) per region
+#define	ABITINDEX(region)	(2*(region)+0)	// nbitsizes slot of a region's A endpoint
+#define	BBITINDEX(region)	(2*(region)+1)	// nbitsizes slot of a region's B endpoint
+
+struct ChanBits	// bit widths of the stored endpoint values of a single channel
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel; slot ABITINDEX(r) / BBITINDEX(r) is region r's A / B endpoint
+};
+
+struct Pattern	// describes how one mode lays out its header: per-channel endpoint widths plus the mode prefix
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;	// verilog description of encoding for this mode; const because it is initialized from a string literal
+};
+
+#define	NPATTERNS 1
+#define	NREGIONS  2
+
+static Pattern patterns[NPATTERNS] =
+{
+	//  red			green			blue			alpha			xfm	mode  mb  encoding
+	{ { {{5,5,5,5}},	{{5,5,5,5}},	{{5,5,5,5}},	{{5,5,5,5}} },	0,	0x80, 8, "" },
+};
+
+struct RegionPrec	// per-channel endpoint precisions for one region
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];	// precision of endpoint A, one entry per channel
+	int endpt_b_prec[NCHANNELS_RGBA];	// precision of endpoint B, one entry per channel
+};
+
+struct PatternPrec	// endpoint precisions of every region for one pattern
+{
+	RegionPrec region_precs[NREGIONS];	// indexed by region number
+};
+
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO assert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	{ { { {5,5,5,5}, {5,5,5,5} },  { {5,5,5,5}, {5,5,5,5} } } },	// single pattern: region 0 {A, B}, region 1 {A, B}
+};
+
+// number of bits required to hold n; a signed encoding costs one extra (sign) bit, and 0 needs no bits at all
+static int nbits(int n, bool issigned)
+{
+	int width = 0;
+	if (n > 0)
+	{
+		// count magnitude bits up to and including the highest set bit
+		while (n != 0) { n >>= 1; ++width; }
+		if (issigned) ++width;
+	}
+	else if (n < 0)
+	{
+		assert (issigned);	// a negative value only makes sense for a signed field
+		while (n < -1) { n >>= 1; ++width; }
+		++width;
+	}
+	return width;
+}
+
+static void transform_forward(IntEndptsRGBA_2 ep[NREGIONS])	// unimplemented stub; callers in this file only invoke it when a pattern's "transformed" flag is set
+{
+	assert(0);	// trips in debug builds if ever reached; ep is left untouched
+}
+
+static void transform_inverse(IntEndptsRGBA_2 ep[NREGIONS])	// unimplemented stub; callers in this file only invoke it when a pattern's "transformed" flag is set
+{
+	assert(0);	// trips in debug builds if ever reached; ep is left untouched
+}
+
+/*
+we're using this table to assign lsbs
+abgr	>=2	correct
+0000	0	0
+0001	0	0
+0010	0	0
+0011	1	x1
+0100	0	0
+0101	1	x1
+0110	1	x1
+0111	1	1
+1000	0	0
+1001	1	x0
+1010	1	x0
+1011	1	1
+1100	1	x0
+1101	1	1
+1110	1	1
+1111	1	1
+
+we need 8 0's and 8 1's. the x's can be either 0 or 1 as long as you get 8/8.
+I choose to assign the lsbs so that the rgb channels are as good as possible.
+*/
+
+// 6666 -> 5555.1: drop every channel's low bit and keep one shared lsb per endpoint ("correct" column above)
+static void compress_one(const IntEndptsRGBA& endpts, IntEndptsRGBA_2& compr_endpts)
+{
+	// endpoint A: shift out each channel's low bit; the r, g and b low bits vote on the shared lsb
+	// (alpha's low bit is dropped without getting a vote)
+	int a_votes = 0;
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		if (j != CHANNEL_A) a_votes += endpts.A[j] & 1;
+		compr_endpts.A[j] = endpts.A[j] >> 1;
+		assert (compr_endpts.A[j] < 32);
+	}
+	compr_endpts.a_lsb = (a_votes >= 2);
+
+	// endpoint B: same majority rule
+	int b_votes = 0;
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		if (j != CHANNEL_A) b_votes += endpts.B[j] & 1;
+		compr_endpts.B[j] = endpts.B[j] >> 1;
+		assert (compr_endpts.B[j] < 32);
+	}
+	compr_endpts.b_lsb = (b_votes >= 2);
+}
+
+static void uncompress_one(const IntEndptsRGBA_2& compr_endpts, IntEndptsRGBA& endpts)
+{
+	// 5555.1 -> 6666: every channel of an endpoint gets that endpoint's shared lsb appended as its low bit
+	int ch = 0;
+	while (ch < NCHANNELS_RGBA) {
+		endpts.A[ch] = (compr_endpts.A[ch] << 1) | compr_endpts.a_lsb; endpts.B[ch] = (compr_endpts.B[ch] << 1) | compr_endpts.b_lsb; ++ch;
+	}
+}
+static void uncompress_endpoints(const IntEndptsRGBA_2 compr_endpts[NREGIONS], IntEndptsRGBA endpts[NREGIONS])
+{
+	int region = 0;	// expand each region's endpoint pair in turn
+	while (region < NREGIONS) { uncompress_one(compr_endpts[region], endpts[region]); ++region; }
+}
+
+static void compress_endpoints(const IntEndptsRGBA endpts[NREGIONS], IntEndptsRGBA_2 compr_endpts[NREGIONS])
+{
+	int region = 0;	// compress each region's endpoint pair in turn
+	while (region < NREGIONS) { compress_one(endpts[region], compr_endpts[region]); ++region; }
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA_2 q_endpts[NREGIONS])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const RegionPrec &prec = pattern_prec.region_precs[region];
+		const FltEndpts &src = endpts[region];
+		IntEndptsRGBA full;	// quantized at pattern precision + 1 bit ("uncompressed space")
+		full.A[0] = Utils::quantize(src.A.X(), prec.endpt_a_prec[0]+1);
+		full.A[1] = Utils::quantize(src.A.Y(), prec.endpt_a_prec[1]+1);
+		full.A[2] = Utils::quantize(src.A.Z(), prec.endpt_a_prec[2]+1);
+		full.A[3] = Utils::quantize(src.A.W(), prec.endpt_a_prec[3]+1);
+
+		full.B[0] = Utils::quantize(src.B.X(), prec.endpt_b_prec[0]+1);
+		full.B[1] = Utils::quantize(src.B.Y(), prec.endpt_b_prec[1]+1);
+		full.B[2] = Utils::quantize(src.B.Z(), prec.endpt_b_prec[2]+1);
+		full.B[3] = Utils::quantize(src.B.W(), prec.endpt_b_prec[3]+1);
+		compress_one(full, q_endpts[region]);	// fold the extra bit of every channel into the shared lsb
+	}
+}
+
+// for each region, look at the pixel whose position SHAPEINDEX_TO_COMPRESSED_INDICES reports: if its index has the high bit
+// set, exchange the region's endpoints and flip its indices. positions run 0 (x=0,y=0) .. 15 (x=3,y=3): x = pos & 3, y = (pos >> 2) & 3
+static void swap_indices(IntEndptsRGBA_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+		const int x = POS_TO_X(position);
+		const int y = POS_TO_Y(position);
+		assert(REGION(x,y,shapeindex) == region);		// double check the table
+
+		if ((indices[y][x] & HIGH_INDEXBIT) == 0)
+			continue;		// high bit already clear, nothing to do for this region
+		// exchange the two endpoints, shared lsbs included
+		IntEndptsRGBA_2 &ep = endpts[region];
+		for (int i=0; i<NCHANNELS_RGBA; ++i)
+		{
+			const int old_a = ep.A[i]; ep.A[i] = ep.B[i]; ep.B[i] = old_a;
+		}
+		const int old_a_lsb = ep.a_lsb; ep.a_lsb = ep.b_lsb; ep.b_lsb = old_a_lsb;
+
+		// flip every index of this region (i -> NINDICES-1-i) to compensate for the exchanged endpoints
+		for (int py = 0; py < Tile::TILE_H; py++)
+		for (int px = 0; px < Tile::TILE_W; px++)
+			if (REGION(px,py,shapeindex) == region)
+				indices[py][px] = NINDICES - 1 - indices[py][px];
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA_2 endpts[NREGIONS], const Pattern &p)	// fit check used by refine(); neither argument is inspected
+{
+	return true;	// unconditionally reports that the endpoints fit
+}
+
+static void write_header(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	// mode prefix first, then the shape selector
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	// endpoint payload is channel-major: for each channel, region 0 A, region 0 B, region 1 A, region 1 B
+	for (int ch=0; ch<NCHANNELS_RGBA; ++ch)
+	{
+		const ChanBits &widths = p.chan[ch];
+		for (int region=0; region<NREGIONS; ++region)
+		{
+			out.write(endpts[region].A[ch], widths.nbitsizes[ABITINDEX(region)]);
+			out.write(endpts[region].B[ch], widths.nbitsizes[BBITINDEX(region)]);
+		}
+	}
+	// the shared lsbs come last: A then B for each region, one bit each
+	for (int region=0; region<NREGIONS; ++region) { out.write(endpts[region].a_lsb, 1); out.write(endpts[region].b_lsb, 1); }
+	assert (out.getptr() == 98);
+}
+
+static void read_header(Bits &in, IntEndptsRGBA_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	// fetch the mode; its value is not inspected here, the assert below checks the resulting read position
+	int mode = AVPCL::getmode(in);
+
+	// this mode has a single pattern
+	pat_index = 0;
+	assert (pat_index >= 0 && pat_index < NPATTERNS);
+	assert (in.getptr() == patterns[pat_index].modebits);
+	p = patterns[pat_index];
+	shapeindex = in.read(SHAPEBITS);
+
+	// endpoints arrive channel-major, A before B within each region
+	for (int ch=0; ch<NCHANNELS_RGBA; ++ch)
+	{
+		const ChanBits &widths = p.chan[ch];
+		for (int region=0; region<NREGIONS; ++region)
+		{
+			endpts[region].A[ch] = in.read(widths.nbitsizes[ABITINDEX(region)]);
+			endpts[region].B[ch] = in.read(widths.nbitsizes[BBITINDEX(region)]);
+		}
+	}
+	// then one shared lsb per endpoint: A then B for each region
+	for (int region=0; region<NREGIONS; ++region) { endpts[region].a_lsb = in.read(1); endpts[region].b_lsb = in.read(1); }
+	assert (in.getptr() == 98);
+}
+
+// WORK PLACEHOLDER -- keep it simple for now
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	// per region, the one pixel position whose index is written one bit short
+	int short_pos[NREGIONS];
+	for (int region = 0; region < NREGIONS; ++region)
+		short_pos[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+	// emit the pixels in position order 0 .. TILE_TOTAL-1
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		// full INDEXBITS unless this position is one of the shortened ones
+		int width = INDEXBITS;
+		for (int region = 0; region < NREGIONS; ++region)
+			if (short_pos[region] == pos) { width = INDEXBITS - 1; break; }
+
+		const int col = POS_TO_X(pos);
+		const int row = POS_TO_Y(pos);
+		out.write(indices[row][col], width);
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	// per region, the one pixel position whose index was written one bit short
+	int short_pos[NREGIONS];
+	for (int region = 0; region < NREGIONS; ++region)
+		short_pos[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+	// pixels are stored in position order 0 .. TILE_TOTAL-1
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		// full INDEXBITS unless this position is one of the shortened ones
+		int width = INDEXBITS;
+		for (int region = 0; region < NREGIONS; ++region)
+			if (short_pos[region] == pos) { width = INDEXBITS - 1; break; }
+
+		const int col = POS_TO_X(pos);
+		const int row = POS_TO_Y(pos);
+		indices[row][col] = in.read(width);
+	}
+}
+
+static void emit_block(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);	// bit writer over the caller's output block
+	// header: mode, shape, endpoints and shared lsbs
+	write_header(endpts, shapeindex, p, out);
+	// payload: one palette index per pixel
+	write_indices(indices, shapeindex, out);
+	// header plus indices must fill the block exactly
+	assert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGBA_2 &endpts_2, const RegionPrec &region_prec, Vec4 palette[NINDICES])
+{
+	IntEndptsRGBA endpts;
+	// builds the NINDICES-entry palette of one region from its compressed endpoints; start by re-attaching the shared lsbs
+	uncompress_one(endpts_2, endpts);
+
+	// scale endpoints
+	int a, b;			// really need a IntVec4...
+	// channel 0 -> palette X
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space 
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].X() = PALETTE_LERP(a, b, i, BIAS, DENOM);
+	// channel 1 -> palette Y
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].Y() = PALETTE_LERP(a, b, i, BIAS, DENOM);
+	// channel 2 -> palette Z
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].Z() = PALETTE_LERP(a, b, i, BIAS, DENOM);
+	// channel 3 -> palette W
+	a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]+1); 
+	b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].W() = PALETTE_LERP(a, b, i, BIAS, DENOM);
+}
+
+// sign extend but only if it was transformed (unimplemented stub: the decoder only calls it when p.transformed is set)
+static void sign_extend(Pattern &p, IntEndptsRGBA_2 endpts[NREGIONS])
+{
+	assert(0);	// trips in debug builds if ever reached; neither argument is touched
+}
+
+void AVPCL::decompress_mode7(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	// header: mode, shape, and the compressed endpoints of every region
+	int shape, which_pattern;
+	Pattern pattern;
+	IntEndptsRGBA_2 endpoints[NREGIONS];
+	read_header(in, endpoints, shape, pattern, which_pattern);
+
+	// undo the endpoint transform when the pattern uses one
+	if (pattern.transformed)
+	{
+		sign_extend(pattern, endpoints);
+		transform_inverse(endpoints);
+	}
+
+	// one palette of NINDICES colors per region
+	const PatternPrec &prec = pattern_precs[which_pattern];
+	Vec4 colors[NREGIONS][NINDICES];
+	for (int region = 0; region < NREGIONS; ++region)
+		generate_palette_quantized(endpoints[region], prec.region_precs[region], colors[region]);
+
+	// the per-pixel palette selectors fill the rest of the block
+	int selectors[Tile::TILE_H][Tile::TILE_W];
+	read_indices(in, shape, selectors);
+	assert(in.getptr() == AVPCL::BITSIZE);
+	for (int row = 0; row < Tile::TILE_H; row++)
+	for (int col = 0; col < Tile::TILE_W; col++)
+		t.data[row][col] = colors[REGION(col,row,shape)][selectors[row][col]];
+}
+
+// given np colors and quantized endpoints, build the palette, pick the best entry per color, and return the summed error (DBL_MAX on early exit)
+static double map_colors(const Vec4 colors[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec &region_prec, double current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vec4 palette[NINDICES];
+	double toterr = 0;
+	// (the unused function-scope "Vec4 err" that the per-color "double err" below used to shadow has been removed)
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		double err, besterr = DBL_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(colors[i], palette[j]) :
+									     Utils::metric4premult(colors[i], palette[j]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit: the caller only cares about totals that do not exceed current_err
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least (entries before i keep their real values)
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return DBL_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], double toterr[NREGIONS])
+{
+	// build list of possibles
+	Vec4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	// (the unused function-scope "Vec4 err" that the per-pixel "double err" below used to shadow has been removed)
+	// every pixel is matched against the palette of the region it belongs to
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double err, besterr = DBL_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(tile.data[y][x], palette[region][i]) :
+										 Utils::metric4premult(tile.data[y][x], palette[region][i]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static double perturb_one(const Vec4 colors[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts, 
+						  double old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGBA_2 temp_endpts;
+	double min_err = old_err;		// start with the best current error; kept in double so "no improvement" returns old_err exactly
+	int beststep = 0;				// only meaningful while "improved" is set
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+			double err = map_colors(colors, np, temp_endpts, region_prec, min_err, temp_indices);	// may be the DBL_MAX early-exit sentinel, which does not fit in a float
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if the value returned from this is less than orig_err, then indices[] will contain valid indices (otherwise -1's)
+static double exhaustive(const Vec4 colors[], int np, int ch, const RegionPrec &region_prec, double orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA_2 temp_endpts;
+	double best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	double thr_scale = (double)np / (double)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = MAX(adelta, 3);
+	bdelta = MAX(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B (all four bounds are inclusive)
+	int alow = MAX(0, opt_endpts.A[ch] - adelta);
+	int ahigh = MIN((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = MAX(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = MIN((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];	// best pair found so far; starts at the current endpoints
+
+	if (a_le_b)
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = MAX(a, blow); b <= bhigh; ++b)	// bhigh is inclusive, just like ahigh
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices);	// may be the DBL_MAX sentinel
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b <= bhigh; ++b)	// bhigh is inclusive, just like ahigh
+		for (int a = MAX(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			double err = map_colors(colors, np, temp_endpts, region_prec, best_err, temp_indices);	// may be the DBL_MAX sentinel
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		// (orig_err is passed by value, so there is nothing to write back to it)
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static double optimize_one(const Vec4 colors[], int np, double orig_err, const IntEndptsRGBA_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA_2 &opt_endpts)
+{
+	double opt_err = orig_err;
+	// searches, one channel at a time, for endpoints with a lower error than orig_err; returns the final error, endpoints go to opt_endpts
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA_2 new_a, new_b;
+	IntEndptsRGBA_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		double err0 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+		double err1 = perturb_one(colors, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				assert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				assert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+			double err = perturb_one(colors, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);	// kept in double so the comparison below sees the exact value
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				assert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		double new_err = exhaustive(colors, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					assert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const double orig_err[NREGIONS], 
+							IntEndptsRGBA_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, double opt_err[NREGIONS], IntEndptsRGBA_2 opt_endpts[NREGIONS])
+{
+	Vec4 region_pixels[Tile::TILE_TOTAL];
+	int scratch_indices[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		const RegionPrec &prec = pattern_prec.region_precs[region];
+		const IntEndptsRGBA_2 base = orig_endpts[region];
+
+		// gather this region's pixels in row-major order
+		int count = 0;
+		for (int row = 0; row < tile.size_y; row++)
+		for (int col = 0; col < tile.size_x; col++)
+		{
+			if (REGION(col,row,shapeindex) != region) continue;
+			region_pixels[count] = tile.data[row][col];
+			++count;
+		}
+
+		// the incoming endpoints remain the answer unless some lsb combination beats their error
+		opt_err[region] = orig_err[region];
+		opt_endpts[region] = base;
+
+		// try all NLSBMODES combinations of the two shared lsbs as starting points
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			IntEndptsRGBA_2 seed = base;
+			IntEndptsRGBA_2 tuned;
+			seed.a_lsb = lsbmode & 1;
+			seed.b_lsb = (lsbmode >> 1) & 1;
+
+			// exact starting error for this seed: passing DBL_MAX as the cutoff disables map_colors' early exit
+			const double seed_err = map_colors(region_pixels, count, seed, prec, DBL_MAX, scratch_indices);
+
+			// search from the seed; keep the outcome only when it is strictly better than the best so far
+			const double tuned_err = optimize_one(region_pixels, count, seed_err, seed, prec, tuned);
+			if (tuned_err < opt_err[region])
+			{
+				opt_err[region] = tuned_err;
+				opt_endpts[region] = tuned;
+			}
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static double refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	double orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+	// follows the "optimization algorithm" outline above; writes the chosen encoding to block and returns its total error
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);	// float endpoints -> compressed integer endpoints
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);	// baseline indices and per-region error
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);	// may exchange a region's endpoints and flip its indices
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);	// search for better endpoints per region
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);	// recompute indices and errors for the optimized endpoints
+			for (int i=0; i<NREGIONS; ++i)
+				assert(expected_opt_err[i] == opt_err[i]);	// the optimizer's reported error must match the recomputed one exactly
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }	// totals over all regions
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);	// optimized version wins
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	throw "No candidate found, should never happen (avpcl mode 7).";	// thrown as a string literal, only if no pattern passed endpts_fit
+}
+
+static void clamp(Vec4 &v)	// force each of the four components of v into [RGBA_MIN, RGBA_MAX]
+{
+	if (v.X() < RGBA_MIN) v.X() = RGBA_MIN;
+	else if (v.X() > RGBA_MAX) v.X() = RGBA_MAX;
+	if (v.Y() < RGBA_MIN) v.Y() = RGBA_MIN;
+	else if (v.Y() > RGBA_MAX) v.Y() = RGBA_MAX;
+	if (v.Z() < RGBA_MIN) v.Z() = RGBA_MIN;
+	else if (v.Z() > RGBA_MAX) v.Z() = RGBA_MAX;
+	if (v.W() < RGBA_MIN) v.W() = RGBA_MIN;
+	else if (v.W() > RGBA_MAX) v.W() = RGBA_MAX;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vec4 palette[NREGIONS][NINDICES])
+{
+	// fill each region's NINDICES entries straight from the float endpoints (PALETTE_LERP with bias 0.0, denominator DENOM)
+	for (int region = 0; region < NREGIONS; ++region) for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = PALETTE_LERP(endpts[region].A, endpts[region].B, i, 0.0, float(DENOM));
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static double map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vec4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	double toterr = 0;
+	// (the unused function-scope "Vec4 err" that the per-pixel "double err" below used to shadow has been removed)
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		double err, besterr = DBL_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static double rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])	// fits unquantized endpoints per region for one shape; returns the resulting total error
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vec4 colors[Tile::TILE_TOTAL];
+		Vec4 mean(0,0,0,0);
+		// collect the region's pixels and accumulate their sum (turned into the mean further down)
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x];
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vec4 zero(0,0,0,RGBA_MAX);	// empty region: both endpoints get (0,0,0,RGBA_MAX)
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = colors[0];	// single pixel: both endpoints are that color
+			endpts[region].B = colors[0];
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = colors[0];	// two pixels: use them directly as the endpoints
+			endpts[region].B = colors[1];
+			continue;
+		}
+
+		Matrix rdq(np, 4);	// one row per pixel, holding the mean-centered channels
+
+		mean /= float(np);
+
+		for (int i = 0; i < np; ++i)
+		{
+			rdq(i,0) = colors[i].X() - mean.X();
+			rdq(i,1) = colors[i].Y() - mean.Y();
+			rdq(i,2) = colors[i].Z() - mean.Z();
+			rdq(i,3) = colors[i].W() - mean.W();
+		}
+				
+		// perform a singular value decomposition
+		SVD svd(rdq);
+
+		// get the principal component direction (the one with the largest weight)
+		Vec4 direction(svd.R()(0,0), svd.R()(0,1), svd.R()(0,2), svd.R()(0,3));
+
+		// project each pixel value along the principal direction
+		double minp = DBL_MAX, maxp = -DBL_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = rdq(i,0) * direction.X() + rdq(i,1)*direction.Y() + rdq(i,2)*direction.Z() + rdq(i,3)*direction.W();
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*direction;
+		endpts[region].B = mean + maxp*direction;
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+	// score the fit: total error of mapping the tile onto the unquantized palettes
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(double *list1, int *list2, int i, int j)	// exchange slots i and j in both parallel lists
+{
+	const double saved_key = list1[j]; list1[j] = list1[i]; list1[i] = saved_key;
+	const int saved_tag = list2[j]; list2[j] = list2[i]; list2[i] = saved_tag;
+}
+
+double AVPCL::compress_mode7(const Tile &t, char *block)
+{
+	// how many of the roughly-ranked shapes get the expensive refinement pass.
+	// sensible settings are 1, NSHAPES/4 (catches nearly every case) and NSHAPES (refine them all)
+	const int NITEMS=NSHAPES/4;
+
+	// per-shape scratch: endpoints produced by the rough fit, the rough error, and the shape id
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} candidates[NSHAPES];
+	double rough_err[NSHAPES];
+	int shape_of[NSHAPES];
+	char scratch[AVPCL::BLOCKSIZE];
+	double best = DBL_MAX;
+
+	for (int s=0; s<NSHAPES; ++s)
+	{
+		shape_of[s] = s;
+		rough_err[s] = rough(t, s, candidates[s].endpts);
+	}
+
+	// partial selection pass: afterwards slots [0, NITEMS) hold the smallest rough errors in ascending order
+	for (int slot=0; slot<NITEMS; ++slot)
+	for (int probe=slot+1; probe<NSHAPES; ++probe)
+		if (rough_err[probe] < rough_err[slot])
+			swap(rough_err, shape_of, slot, probe);
+
+	for (int slot=0; slot<NITEMS && best>0; ++slot)
+	{
+		const int shape = shape_of[slot];
+		const double refined = refine(t, shape, candidates[shape].endpts, scratch);
+		if (refined < best)
+		{
+			best = refined;
+			memcpy(block, scratch, sizeof(scratch));
+		}
+	}
+	return best;
+}
+
diff --git a/src/nvtt/bc7/avpclc.cpp b/src/nvtt/bc7/avpclc.cpp
new file mode 100644
index 0000000..afa8903
--- /dev/null
+++ b/src/nvtt/bc7/avpclc.cpp
@@ -0,0 +1,348 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// NOTE: the compressor will compress RGB tiles where the input alpha is constant at 255
+// using modes where the alpha is variable if that mode gives a smaller mean squared error.
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+#include <assert.h>
+
+#include "ImfArray.h"
+#include "targa.h"
+#include "avpcl.h"
+
+using namespace std;
+
+static void analyze(string in1, string in2)	// compare two Targa images tile by tile and print error statistics to stdout
+{
+	Array2D<RGBA> pin1, pin2;
+	int w1, h1, w2, h2;
+
+	Targa::read(in1, pin1, w1, h1);
+	Targa::read(in2, pin2, w2, h2);
+
+	// choose the smaller of the two dimensions (since the old compressor would truncate to multiple-of-4 sizes)
+	int w = MIN(w1, w2);
+	int h = MIN(h1, h2);
+
+	double nsamples = 0;
+	double mabse_rgb = 0, mabse_a = 0, mabse_rgba = 0, mse_rgb = 0, mse_a = 0, mse_rgba = 0;
+	int errdist_rgb[9], errdist_a[9], errdist_rgba[9];	// histograms indexed by the bit length of the absolute error (0..8)
+	int errs[4*16];	// signed per-channel errors of the current tile: 4 channels x up to 16 pixels
+
+	for (int i=0; i<9; ++i)
+		errdist_rgb[i] = errdist_a[i] = errdist_rgba[i] = 0;
+
+	int psnrhist[100];
+	for (int i=0; i<100; ++i)
+		psnrhist[i] = 0;
+	bool first = true;
+
+	int worstx = 0, worsty = 0;	// initialized so the summary never prints indeterminate values when no tile was visited
+	double worstpsnr = 999.0;
+
+	bool constant_alpha = true;
+
+	for (int y = 0; y < h; y+=4)
+	for (int x = 0; x < w; x+=4)
+	{
+		int xw = MIN(w-x, 4);	// tiles on the right/bottom edge may be narrower than 4
+		int yw = MIN(h-y, 4);
+		int np = 0;	// number of channel errors stored in errs[] for this tile
+
+		float a[4], b[4];
+
+		for (int y0=0; y0<yw; ++y0)
+		for (int x0=0; x0<xw; ++x0)
+		{
+			a[0] = (pin1[y+y0][x+x0]).r;
+			a[1] = (pin1[y+y0][x+x0]).g;
+			a[2] = (pin1[y+y0][x+x0]).b;
+			a[3] = (pin1[y+y0][x+x0]).a;
+
+			b[0] = (pin2[y+y0][x+x0]).r;
+			b[1] = (pin2[y+y0][x+x0]).g;
+			b[2] = (pin2[y+y0][x+x0]).b;
+			b[3] = (pin2[y+y0][x+x0]).a;
+
+			if (AVPCL::flag_premult)
+			{
+				// premultiply
+				for (int i=0; i<3; ++i)
+				{
+					a[i] = Utils::premult(a[i], a[3]);
+					b[i] = Utils::premult(b[i], b[3]);
+				}
+			}
+
+			if (a[3] != RGBA_MAX || b[3] != RGBA_MAX)
+				constant_alpha = false;
+
+			for (int i=0; i<4; ++i)
+				errs[np+i] = a[i] - b[i];
+
+			np += 4;
+		}
+
+		double msetile = 0.0;
+
+		for (int i = 0; i < np; ++i)
+		{
+			int err = errs[i];
+			int abse = err > 0 ? err : -err;
+			int j = i & 3;	// channel of this sample: 0..2 = RGB, 3 = alpha
+			int lsb;
+
+			for (lsb=0; (abse>>lsb)>0; ++lsb)	// lsb ends up as the number of bits needed to hold abse
+				;
+			assert (lsb <= 8);
+
+			if (j == 3)
+			{
+				mabse_a += (double)abse;
+				mse_a += (double)abse * abse;
+				errdist_a[lsb]++;
+			}
+			else
+			{
+				mabse_rgb += (double)abse;
+				mse_rgb += (double)abse * abse;
+				errdist_rgb[lsb]++;
+			}
+			mabse_rgba += (double)abse;
+			mse_rgba += (double)abse * abse;
+			errdist_rgba[lsb]++;
+
+			msetile += (double)abse * abse;
+		}
+
+		double psnrtile, rmsetile;
+
+		rmsetile = sqrt(msetile / double(np));
+		psnrtile = (rmsetile == 0) ? 99.0 : 20.0 * log10(255.0/rmsetile);	// an exact tile is reported as 99 dB
+
+		if (psnrtile < worstpsnr)
+		{
+			worstx = x; worsty = y; worstpsnr = psnrtile;
+		}
+#ifdef EXTERNAL_RELEASE
+		int psnrquant = (int) floor (psnrtile);		// 10 means [10,11) psnrs, e.g.
+		// clamp just in case
+		psnrquant = (psnrquant < 0) ? 0 : (psnrquant > 99) ? 99 : psnrquant;
+		psnrhist[psnrquant]++;
+		if (first && psnrquant < 16)
+		{
+			first = false;
+			printf("Tiles with RGBA PSNR's worse than 16dB\n");
+		}
+		if (psnrquant < 16)
+			printf("X %4d Y %4d RGBA PSNR %7.2f\n", x, y, psnrtile);
+#endif
+	}
+
+	nsamples = double(w) * double(h);	// multiply as doubles so very large images cannot overflow int
+
+	mabse_a /= nsamples;
+	mse_a /= nsamples;
+	mabse_rgb /= (nsamples*3);
+	mse_rgb /= (nsamples*3);
+	mabse_rgba /= (nsamples*4);
+	mse_rgba /= (nsamples*4);
+
+	double rmse_a, psnr_a, rmse_rgb, psnr_rgb, rmse_rgba, psnr_rgba;
+
+	rmse_a = sqrt(mse_a);
+	psnr_a = (rmse_a == 0) ? 999.0 : 20.0 * log10(255.0/rmse_a);
+
+	rmse_rgb = sqrt(mse_rgb);
+	psnr_rgb = (rmse_rgb == 0) ? 999.0 : 20.0 * log10(255.0/rmse_rgb);
+
+	rmse_rgba = sqrt(mse_rgba);
+	psnr_rgba = (rmse_rgba == 0) ? 999.0 : 20.0 * log10(255.0/rmse_rgba);
+
+	printf("Image size compared: %dw x %dh\n", w, h);
+	printf("Image alpha is %s.\n", constant_alpha ? "CONSTANT" : "VARIABLE");
+	if (w != w1 || w != w2 || h != h1 || h != h2)
+		printf("--- NOTE: only the overlap between the 2 images (%d,%d) and (%d,%d) was compared\n", w1, h1, w2, h2);
+	printf("Total pixels: %12d\n", w * h);
+
+	const char *which = !AVPCL::flag_premult ? "RGB" : "aRaGaB";	// string literals are const; label for the color channels
+
+	printf("\n%s Mean absolute error: %f\n", which, mabse_rgb);
+	printf("%s Root mean squared error: %f (MSE %f)\n", which, rmse_rgb, rmse_rgb*rmse_rgb);
+	printf("%s Peak signal to noise ratio in dB: %f\n", which, psnr_rgb);
+	printf("%s Histogram of number of channels with indicated LSB error\n", which);
+	for (int i = 0; i < 9; ++i)
+		if (errdist_rgb[i]) printf("%2d LSB error: %10d\n", i, errdist_rgb[i]);
+
+	printf("\nAlpha Mean absolute error: %f\n", mabse_a);
+	printf("Alpha Root mean squared error: %f (MSE %f)\n", rmse_a, rmse_a*rmse_a);
+	printf("Alpha Peak signal to noise ratio in dB: %f\n", psnr_a);
+	printf("Alpha Histogram of number of channels with indicated LSB error\n");
+	for (int i = 0; i < 9; ++i)
+		if (errdist_a[i]) printf("%2d LSB error: %10d\n", i, errdist_a[i]);
+
+	printf("\nRGBA Mean absolute error: %f\n", mabse_rgba);
+	printf("RGBA Root mean squared error: %f (MSE %f)\n", rmse_rgba, rmse_rgba*rmse_rgba);
+	printf("RGBA Peak signal to noise ratio in dB: %f\n", psnr_rgba);
+	printf("RGBA Histogram of number of channels with indicated LSB error\n");
+	for (int i = 0; i < 9; ++i)
+		if (errdist_rgba[i]) printf("%2d LSB error: %10d\n", i, errdist_rgba[i]);
+
+	printf("\nWorst tile RGBA PSNR %f at x %d y %d\n", worstpsnr, worstx, worsty);
+#if 0
+	printf("Histogram of per-tile PSNR\n");
+	for (int i = 0; i < 100; ++i)
+		if (psnrhist[i])
+			printf("[%2d,%2d) %6d\n", i, i+1, psnrhist[i]);
+#endif
+}
+
+static bool ext(string inf, const char *extension)	// does inf end in extension? ("" asks: does inf contain no '.' at all)
+{
+	size_t n = inf.rfind('.');	// the text from the last '.' to the end is what gets compared
+	if (n != string::npos)
+		return inf.substr(n) == extension;
+	if (*extension != '\0')
+		return false;	// an extension was requested but inf has none
+	else
+		return true;	// extension is empty and we didn't find a .
+}
+
+template <typename T>
+std::string toString(const T &thing)	// text form of any value that can be streamed with operator<<
+{
+	std::ostringstream formatted;
+	formatted << thing;
+	return formatted.str();
+}
+
+static int str2int(std::string s)	// parse a leading decimal integer from s; returns 0 when s does not start with one
+{
+	int thing = 0;	// defined result even when extraction fails (pre-C++11 streams leave the target untouched)
+	std::stringstream str (std::stringstream::in | std::stringstream::out);
+	str << s;
+	str >> thing;
+	return thing;
+}
+
+static void usage()
+{
+	// command-line help: the three invocation forms first, then the optional flags
+	cout << endl << "Usage:" << endl;
+	cout << "avpclc infile.tga outroot       generates outroot-w-h.avpcl and outroot-avpcl.tga" << endl;
+	cout << "avpclc foo-w-h.avpcl outroot    generates outroot-avpcl.tga" << endl;
+	cout << "avpclc infile.tga outfile.tga   compares the two images" << endl << endl;
+	cout << "Flags:" << endl;
+	cout << "-p     use a metric based on AR AG AB A (note: if the image has alpha constant 255 this option is overridden)" << endl;
+	cout << "-n     use a non-uniformly-weighed metric (weights .299 .587 .114)" << endl;
+	cout << "-na	use a non-uniformly-weighed metric (ATI weights .3086 .6094 .0820)" << endl;
+	cout << "-e     dump squared errors for each tile to outroot-errors.bin" << endl;
+}
+
+bool AVPCL::flag_premult = false;	// turned on by the -p command-line flag
+bool AVPCL::flag_nonuniform = false;	// turned on by -n (which also clears flag_nonuniform_ati)
+bool AVPCL::flag_nonuniform_ati = false;	// turned on by -na (which also clears flag_nonuniform)
+
+bool AVPCL::mode_rgb = false;	// filled in by Targa::fileinfo() in main(), passed as its const_alpha argument
+
+int main(int argc, char* argv[])	// entry point: compress, decompress or compare depending on the two file arguments
+{
+	bool noerrfile = true;	// cleared by -e to request the per-tile error dump
+#ifdef EXTERNAL_RELEASE
+	cout << "avpcl/BC7L Targa RGBA Compressor/Decompressor version 1.41 (May 27, 2010)." << endl <<
+			"Bug reports, questions, and suggestions to wdonovan a t nvidia d o t com." << endl;
+#endif
+	try
+	{
+		char * args[2];
+		int nargs = 0;
+
+		// process flags, copy any non flag arg to args[]
+		for (int i = 1; i < argc; ++i)
+			if ((argv[i])[0] == '-')
+				switch ((argv[i])[1]) {
+					case 'p': AVPCL::flag_premult = true; break;
+					case 'n': if ((argv[i])[2] == 'a') { AVPCL::flag_nonuniform_ati = true; AVPCL::flag_nonuniform = false; }
+							  else { AVPCL::flag_nonuniform = true; AVPCL::flag_nonuniform_ati = false; }
+							  break;
+					case 'e': noerrfile = false; break;
+					default:  throw "bad flag arg";
+				}
+			else
+			{
+				if (nargs > 1) throw "Incorrect number of args";	// args[] only has room for two entries
+				args[nargs++] = argv[i];
+			}
+
+		if (nargs != 2) throw "Incorrect number of args";
+
+		string inf(args[0]), outroot(args[1]);
+
+		if (ext(outroot, ""))	// second argument has no extension: it is an output root name
+		{
+			if (ext(inf, ".tga"))
+			{
+				int width, height;
+
+				Targa::fileinfo(inf, width, height, AVPCL::mode_rgb);
+
+				string outf, avpclf, errf;
+				outf = outroot + "-avpcl.tga";
+				avpclf = outroot + "-" + toString(width) + "-" + toString(height) + "-" + (AVPCL::mode_rgb ? "RGB" : "RGBA") + ".avpcl";
+				cout << "Compressing " << (AVPCL::mode_rgb ? "RGB file " : "RGBA file ") << inf << " to " << avpclf << endl;
+				if (!noerrfile)
+				{
+					errf = outroot + "-errors" + ".bin";
+					cout << "Errors output file is " << errf << endl;
+				}
+				else
+					errf = "";
+				AVPCL::compress(inf, avpclf, errf);
+				cout << "Decompressing " << avpclf << " to " << outf << endl;
+				AVPCL::decompress(avpclf, outf);
+				analyze(inf, outf);	// report how the round-tripped image differs from the input
+			}
+			else if (ext(inf, ".avpcl"))
+			{
+				string outf;
+				outf = outroot + "-avpcl.tga";
+				cout << "Decompressing " << inf << " to " << outf << endl;
+				AVPCL::decompress(inf, outf);
+			}
+			else throw "Invalid file args";
+		}
+		else if (ext(inf, ".tga") && ext(outroot, ".tga"))
+		{
+			analyze(inf, outroot);
+		}
+		else throw "Invalid file args";
+
+	}
+	catch(const exception& e)
+	{
+		// Print error message and usage instructions
+		cerr << e.what() << endl;
+		usage();
+		return 1;
+	}
+	catch(const char * msg)	// string literals are thrown as const char*; a char* handler would not match them
+	{
+		cerr << msg << endl;
+		usage();
+		return 1;
+	}
+	return 0;
+}
diff --git a/src/nvtt/bc7/bits.h b/src/nvtt/bc7/bits.h
new file mode 100644
index 0000000..3fa4af2
--- /dev/null
+++ b/src/nvtt/bc7/bits.h
@@ -0,0 +1,73 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _BITS_H
+#define _BITS_H
+
+// read/write a bitstream
+
+#include <assert.h>
+
+class Bits	// bit-level reader/writer over a caller-owned byte buffer; bit k lives in byte k>>3 at position k&7
+{
+public:
+	// writable stream over data with room for maxdatabits bits; starts empty
+	Bits(char *data, int maxdatabits) : bptr(0), bend(0), bits(data), cbits(data), maxbits(maxdatabits), readonly(0) { assert (data && maxdatabits > 0); }
+	Bits(const char *data, int availdatabits) : bptr(0), bend(availdatabits), bits(0), cbits(data), maxbits(availdatabits), readonly(1) { assert (data && availdatabits > 0); }	// read-only stream
+
+	void write(int value, int nbits) {	// store the low nbits of value, least significant bit first
+		assert (nbits >= 0 && nbits < 32);
+		assert (sizeof(int)>= 4);
+		for (int i=0; i<nbits; ++i)
+			writeone(value>>i);
+	}
+	int read(int nbits) {	// fetch nbits bits and assemble them least significant bit first
+		assert (nbits >= 0 && nbits < 32);
+		assert (sizeof(int)>= 4);
+		int out = 0;
+		for (int i=0; i<nbits; ++i)
+			out |= readone() << i;
+		return out;
+	}
+	int getptr() const { return bptr; }
+	void setptr(int ptr) { assert (ptr >= 0 && ptr < maxbits); bptr = ptr; }
+	int getsize() const { return bend; }
+
+private:
+	int	bptr;		// next bit to read or write
+	int bend;		// last written bit + 1
+	char *bits;		// writable view of the user bit stream (null for a read-only stream)
+	const char *cbits;	// readable view of the user bit stream (set by both constructors)
+	int maxbits;	// max size of user bit stream
+	char readonly;	// 1 if this is a read-only stream
+
+	int readone() {	// next bit, or 0 once the end of the stream has been reached
+		assert (bptr < bend);
+		if (bptr >= bend) return 0;
+		int bit = cbits[bptr>>3] & (1 << (bptr & 7));
+		++bptr;
+		return bit != 0;
+	}
+	void writeone(int bit) {	// store bit 0 of the argument at the cursor and advance
+		if (readonly)
+			throw "Writing a read-only bit stream";
+		assert (bptr < maxbits);
+		if (bptr >= maxbits) return;	// with asserts compiled out, bits past the end are dropped
+		if (bit&1)
+			bits[bptr>>3] |= 1 << (bptr & 7);
+		else
+			bits[bptr>>3] &= ~(1 << (bptr & 7));
+		if (bptr++ >= bend) bend = bptr;	// grow the stream when writing at its current end
+	}
+};
+
+#endif
\ No newline at end of file
diff --git a/src/nvtt/bc7/endpts.h b/src/nvtt/bc7/endpts.h
new file mode 100644
index 0000000..0b33eef
--- /dev/null
+++ b/src/nvtt/bc7/endpts.h
@@ -0,0 +1,80 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _ENDPTS_H
+#define _ENDPTS_H
+
+// endpoint definitions and routines to search through endpoint space
+
+#include "arvo/Vec4.h"
+
+using namespace ArvoMath;
+
+#define	NCHANNELS_RGB	3
+#define	NCHANNELS_RGBA	4
+#define	CHANNEL_R	0
+#define	CHANNEL_G	1
+#define	CHANNEL_B	2
+#define	CHANNEL_A	3
+
+struct FltEndpts	// one pair of endpoints (A and B), each held as a Vec4
+{
+	Vec4	A;
+	Vec4	B;
+};
+
+struct IntEndptsRGB	// integer endpoint pair with NCHANNELS_RGB channels per endpoint
+{
+	int		A[NCHANNELS_RGB];
+	int		B[NCHANNELS_RGB];
+};
+
+struct IntEndptsRGB_1	// RGB integer endpoint pair plus a single lsb shared by both endpoints
+{
+	int		A[NCHANNELS_RGB];
+	int		B[NCHANNELS_RGB];
+	int		lsb;				// shared lsb for A and B
+};
+
+struct IntEndptsRGB_2	// RGB integer endpoint pair plus one lsb per endpoint
+{
+	int		A[NCHANNELS_RGB];
+	int		B[NCHANNELS_RGB];
+	int		a_lsb;				// lsb for A
+	int		b_lsb;				// lsb for B
+};
+
+
+struct IntEndptsRGBA	// integer endpoint pair with NCHANNELS_RGBA channels per endpoint
+{
+	int		A[NCHANNELS_RGBA];
+	int		B[NCHANNELS_RGBA];
+};
+
+struct IntEndptsRGBA_2	// RGBA integer endpoint pair plus one lsb per endpoint
+{
+	int		A[NCHANNELS_RGBA];
+	int		B[NCHANNELS_RGBA];
+	int		a_lsb;				// lsb for A
+	int		b_lsb;				// lsb for B
+};
+
+struct IntEndptsRGBA_2a	// RGBA integer endpoint pair plus one lsb per endpoint that applies to the RGB channels only
+{
+	int		A[NCHANNELS_RGBA];
+	int		B[NCHANNELS_RGBA];
+	int		a_lsb;				// lsb for RGB channels of A
+	int		b_lsb;				// lsb for RGB channels of B
+};
+
+#endif
+
diff --git a/src/nvtt/bc7/rgba.h b/src/nvtt/bc7/rgba.h
new file mode 100644
index 0000000..d356a10
--- /dev/null
+++ b/src/nvtt/bc7/rgba.h
@@ -0,0 +1,27 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _RGBA_H
+#define _RGBA_H
+
+#define	RGBA_MIN	0
+#define	RGBA_MAX	255		// range of RGBA
+
+class RGBA	// one pixel: four float channels, all zero when default-constructed
+{
+public:
+	float r, g, b, a;
+	RGBA() { r = g = b = a = 0; }
+	RGBA(float red, float green, float blue, float alpha) { r = red; g = green; b = blue; a = alpha; }
+};
+
+#endif
diff --git a/src/nvtt/bc7/shapes_three.h b/src/nvtt/bc7/shapes_three.h
new file mode 100644
index 0000000..c618d22
--- /dev/null
+++ b/src/nvtt/bc7/shapes_three.h
@@ -0,0 +1,132 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef	_SHAPES_THREE_H
+#define _SHAPES_THREE_H
+
+// shapes for 3 regions
+
+#define NREGIONS 3
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+static int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   0, 2, 2, 2,   
+0, 0, 1, 1,   0, 0, 1, 1,   2, 0, 0, 1,   0, 0, 2, 2,   
+0, 2, 2, 1,   2, 2, 1, 1,   2, 2, 1, 1,   0, 0, 1, 1,   
+2, 2, 2, 2,   2, 2, 2, 1,   2, 2, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 2, 2,   0, 0, 1, 1,   
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 2, 2,   0, 0, 1, 1,   
+1, 1, 2, 2,   0, 0, 2, 2,   1, 1, 1, 1,   2, 2, 1, 1,   
+1, 1, 2, 2,   0, 0, 2, 2,   1, 1, 1, 1,   2, 2, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 2,   
+0, 0, 0, 0,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 1, 2,   
+1, 1, 1, 1,   1, 1, 1, 1,   2, 2, 2, 2,   0, 0, 1, 2,   
+2, 2, 2, 2,   2, 2, 2, 2,   2, 2, 2, 2,   0, 0, 1, 2,   
+
+0, 1, 1, 2,   0, 1, 2, 2,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 2,   0, 1, 2, 2,   0, 1, 1, 2,   2, 0, 0, 1,   
+0, 1, 1, 2,   0, 1, 2, 2,   1, 1, 2, 2,   2, 2, 0, 0,   
+0, 1, 1, 2,   0, 1, 2, 2,   1, 2, 2, 2,   2, 2, 2, 0,   
+
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 0,   0, 0, 2, 2,   
+0, 0, 1, 1,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 2, 2,   
+0, 1, 1, 2,   2, 0, 0, 1,   1, 1, 2, 2,   0, 0, 2, 2,   
+1, 1, 2, 2,   2, 2, 0, 0,   1, 1, 2, 2,   1, 1, 1, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   1, 1, 0, 0,   
+0, 2, 2, 2,   2, 2, 2, 1,   0, 1, 2, 2,   2, 2, 1, 0,   
+0, 2, 2, 2,   2, 2, 2, 1,   0, 1, 2, 2,   2, 2, 1, 0,   
+
+0, 1, 2, 2,   0, 0, 1, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 2, 2,   0, 0, 1, 2,   1, 2, 2, 1,   0, 1, 1, 0,   
+0, 0, 1, 1,   1, 1, 2, 2,   1, 2, 2, 1,   1, 2, 2, 1,   
+0, 0, 0, 0,   2, 2, 2, 2,   0, 1, 1, 0,   1, 2, 2, 1,   
+
+0, 0, 2, 2,   0, 1, 1, 0,   0, 0, 1, 1,   0, 0, 0, 0,   
+1, 1, 0, 2,   0, 1, 1, 0,   0, 1, 2, 2,   2, 0, 0, 0,   
+1, 1, 0, 2,   2, 0, 0, 2,   0, 1, 2, 2,   2, 2, 1, 1,   
+0, 0, 2, 2,   2, 2, 2, 2,   0, 0, 1, 1,   2, 2, 2, 1,   
+
+0, 0, 0, 0,   0, 2, 2, 2,   0, 0, 1, 1,   0, 1, 2, 0,   
+0, 0, 0, 2,   0, 0, 2, 2,   0, 0, 1, 2,   0, 1, 2, 0,   
+1, 1, 2, 2,   0, 0, 1, 2,   0, 0, 2, 2,   0, 1, 2, 0,   
+1, 2, 2, 2,   0, 0, 1, 1,   0, 2, 2, 2,   0, 1, 2, 0,   
+
+0, 0, 0, 0,   0, 1, 2, 0,   0, 1, 2, 0,   0, 0, 1, 1,   
+1, 1, 1, 1,   1, 2, 0, 1,   2, 0, 1, 2,   2, 2, 0, 0,   
+2, 2, 2, 2,   2, 0, 1, 2,   1, 2, 0, 1,   1, 1, 2, 2,   
+0, 0, 0, 0,   0, 1, 2, 0,   0, 1, 2, 0,   0, 0, 1, 1,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 0, 0, 0,   0, 0, 2, 2,   
+1, 1, 2, 2,   0, 1, 0, 1,   0, 0, 0, 0,   1, 1, 2, 2,   
+2, 2, 0, 0,   2, 2, 2, 2,   2, 1, 2, 1,   0, 0, 2, 2,   
+0, 0, 1, 1,   2, 2, 2, 2,   2, 1, 2, 1,   1, 1, 2, 2,   
+
+0, 0, 2, 2,   0, 2, 2, 0,   0, 1, 0, 1,   0, 0, 0, 0,   
+0, 0, 1, 1,   1, 2, 2, 1,   2, 2, 2, 2,   2, 1, 2, 1,   
+0, 0, 2, 2,   0, 2, 2, 0,   2, 2, 2, 2,   2, 1, 2, 1,   
+0, 0, 1, 1,   1, 2, 2, 1,   0, 1, 0, 1,   2, 1, 2, 1,   
+
+0, 1, 0, 1,   0, 2, 2, 2,   0, 0, 0, 2,   0, 0, 0, 0,   
+0, 1, 0, 1,   0, 1, 1, 1,   1, 1, 1, 2,   2, 1, 1, 2,   
+0, 1, 0, 1,   0, 2, 2, 2,   0, 0, 0, 2,   2, 1, 1, 2,   
+2, 2, 2, 2,   0, 1, 1, 1,   1, 1, 1, 2,   2, 1, 1, 2,   
+
+0, 2, 2, 2,   0, 0, 0, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   1, 1, 1, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   1, 1, 1, 2,   0, 1, 1, 0,   2, 1, 1, 2,   
+0, 2, 2, 2,   0, 0, 0, 2,   2, 2, 2, 2,   2, 1, 1, 2,   
+
+0, 1, 1, 0,   0, 0, 2, 2,   0, 0, 2, 2,   0, 0, 0, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 0, 0,   
+2, 2, 2, 2,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 0, 0,   
+2, 2, 2, 2,   0, 0, 2, 2,   0, 0, 2, 2,   2, 1, 1, 2,   
+
+0, 0, 0, 2,   0, 2, 2, 2,   0, 1, 0, 1,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 2, 2, 2,   2, 2, 2, 2,   2, 0, 1, 1,   
+0, 0, 0, 2,   0, 2, 2, 2,   2, 2, 2, 2,   2, 2, 0, 1,   
+0, 0, 0, 1,   1, 2, 2, 2,   2, 2, 2, 2,   2, 2, 2, 0,
+};
+
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]	// entry for pixel (x,y) of shape si: 4 shapes sit side by side per 16-int table row, 4 rows (64 ints) per group of 4 shapes
+
+static int shapeindex_to_compressed_indices[NSHAPES*3] = 
+{
+	0, 3,15,  0, 3, 8,  0,15, 8,  0,15, 3,
+	0, 8,15,  0, 3,15,  0,15, 3,  0,15, 8,
+	0, 8,15,  0, 8,15,  0, 6,15,  0, 6,15,
+	0, 6,15,  0, 5,15,  0, 3,15,  0, 3, 8,
+
+	0, 3,15,  0, 3, 8,  0, 8,15,  0,15, 3,
+	0, 3,15,  0, 3, 8,  0, 6,15,  0,10, 8,
+	0, 5, 3,  0, 8,15,  0, 8, 6,  0, 6,10,
+	0, 8,15,  0, 5,15,  0,15,10,  0,15, 8,
+
+	0, 8,15,  0,15, 3,  0, 3,15,  0, 5,10,
+	0, 6,10,  0,10, 8,  0, 8, 9,  0,15,10,
+	0,15, 6,  0, 3,15,  0,15, 8,  0, 5,15,
+	0,15, 3,  0,15, 6,  0,15, 6,  0,15, 8,
+
+	0, 3,15,  0,15, 3,  0, 5,15,  0, 5,15,
+	0, 5,15,  0, 8,15,  0, 5,15,  0,10,15,
+	0, 5,15,  0,10,15,  0, 8,15,  0,13,15,
+	0,15, 3,  0,12,15,  0, 3,15,  0, 3, 8
+
+};
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*3+(region)]	// table holds 3 consecutive entries per shape, one per region
+
+#endif
diff --git a/src/nvtt/bc7/shapes_two.h b/src/nvtt/bc7/shapes_two.h
new file mode 100644
index 0000000..d9a52ef
--- /dev/null
+++ b/src/nvtt/bc7/shapes_two.h
@@ -0,0 +1,133 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _SHAPES_TWO_H
+#define _SHAPES_TWO_H
+
+// shapes for two regions
+
+#define NREGIONS 2
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+static int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   0, 0, 0, 1,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   1, 1, 1, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 0,   0, 1, 1, 1,   
+1, 0, 0, 0,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 1, 1,   
+1, 1, 1, 0,   0, 0, 0, 0,   1, 0, 0, 0,   0, 0, 0, 1,   
+1, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 0, 0, 0,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 0, 0,   1, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   0, 0, 0, 1,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 0,   1, 1, 0, 0,   0, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 1,   1, 0, 0, 1,   
+1, 1, 1, 0,   1, 1, 1, 1,   1, 0, 0, 0,   1, 0, 0, 1,   
+1, 0, 0, 0,   0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   0, 0, 1, 1,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   0, 0, 1, 1,   
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   1, 1, 0, 0,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 1, 1, 0,   0, 1, 0, 1,   
+1, 1, 0, 0,   0, 1, 0, 1,   1, 0, 0, 1,   1, 0, 1, 0,   
+0, 0, 1, 1,   1, 0, 1, 0,   0, 1, 1, 0,   1, 0, 1, 0,   
+1, 1, 0, 0,   1, 0, 1, 0,   1, 0, 0, 1,   0, 1, 0, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   1, 0, 1, 1,   
+1, 1, 0, 0,   1, 1, 0, 0,   0, 1, 0, 0,   1, 1, 0, 1,   
+1, 1, 1, 0,   1, 0, 0, 0,   1, 1, 0, 0,   1, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 0, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   0, 1, 1, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   0, 0, 0, 0,   
+
+0, 1, 0, 0,   0, 0, 1, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+1, 1, 1, 0,   0, 1, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 1, 0, 0,   0, 0, 1, 0,   0, 1, 1, 1,   1, 1, 1, 0,   
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 0,   0, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 1, 1,   
+1, 1, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   1, 1, 0, 0,   
+0, 0, 1, 1,   1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   
+
+0, 1, 1, 0,   0, 1, 1, 0,   0, 1, 1, 1,   0, 0, 0, 1,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 1, 1, 0,   1, 0, 0, 0,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 0, 0, 0,   1, 1, 1, 0,   
+1, 0, 0, 1,   1, 0, 0, 1,   0, 0, 0, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+1, 1, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 0,   0, 1, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 1, 1, 1,   
+
+};
+
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]	// entry for pixel (x,y) of shape si: 4 shapes sit side by side per 16-int table row, 4 rows (64 ints) per group of 4 shapes
+
+static int shapeindex_to_compressed_indices[NSHAPES*2] = 
+{
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+
+	0,15,  0, 2,  0, 8,  0, 2,
+	0, 2,  0, 8,  0, 8,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 8,  0, 8,  0, 2,  0, 2,
+
+	0,15,  0,15,  0, 6,  0, 8,
+	0, 2,  0, 8,  0,15,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 2,  0,15,  0,15,  0, 6,
+
+	0, 6,  0, 2,  0, 6,  0, 8,
+	0,15,  0,15,  0, 2,  0, 2,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0, 2,  0, 2,  0,15
+
+};
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*2+(region)]	// table holds 2 consecutive entries per shape, one per region
+
+#endif
diff --git a/src/nvtt/bc7/targa.cpp b/src/nvtt/bc7/targa.cpp
new file mode 100644
index 0000000..18a2ddf
--- /dev/null
+++ b/src/nvtt/bc7/targa.cpp
@@ -0,0 +1,179 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Quick and dirty Targa file I/O -- doesn't handle compressed format targa files, though.
+
+#include <stdexcept>
+#include <iostream>
+
+#include "ImfArray.h"
+#include "targa.h"
+#include "rgba.h"
+
+Targa::Targa() {}	// empty: this constructor performs no setup
+
+Targa::~Targa() {}	// empty: nothing to release
+
+// read either RGB or RGBA files
+static int readTgaHeader(FILE *fp, int& width, int& height, int &bpp, int &origin)
+{
+	unsigned char header[18];	// the header read here is 18 bytes long
+	if (fread(header, sizeof(header), 1, fp ) != 1)
+		return 0;
+
+	// byte 2 is the image type; only type 2 is accepted
+	const bool supported_type = (header[2] == 2);
+	if (!supported_type)
+		return 0;
+
+	// byte 16 is the pixel depth
+	bpp = header[16];
+	if (!(bpp == 24 || bpp == 32))
+		return 0;
+
+	// byte 17: low nibble = alpha bits, bits 4-5 = origin code
+	const int descriptor = header[17];
+	origin = (descriptor >> 4) & 0x3;
+	const int expected_alpha_bits = (bpp == 32) ? 8 : 0;
+	if ((descriptor & 0xF) != expected_alpha_bits)
+		return 0;
+
+	// dimensions are stored low byte first
+	width = header[12] | (header[13] << 8);
+	height = header[14] | (header[15] << 8);
+
+	// consume the image ID field (its length is header byte 0)
+	for (int remaining = header[0]; remaining > 0; --remaining)
+		(void) getc(fp);
+	return 1;
+}
+
+static void read_file(FILE *fp, Imf::Array2D<RGBA>& pixels, int width, int height, int bpp, int origin)	// load B,G,R(,A) pixel bytes into pixels
+{
+	pixels.resizeErase(height, width);
+
+	// pixels are visited in file order; origin says which image corner the first file pixel belongs to
+	for (int y = 0; y < height; ++y)
+	for (int x = 0; x < width; ++x)
+	{
+		float b = float(getc(fp));
+		float g = float(getc(fp));
+		float r = float(getc(fp));
+		float a = (bpp == 24) ? RGBA_MAX : float(getc(fp));	// 24 bpp files store no alpha byte
+
+		int xt, yt;
+
+		// transform based on origin: a bottom origin flips rows, a right origin flips columns
+		switch (origin)
+		{
+		case 0:	xt = x; yt = height-1-y; break;		// bottom left
+		case 1:	xt = width-1-x; yt = height-1-y; break;	// bottom right
+		case 2:	xt = x; yt = y;	break;				// top left
+		case 3: xt = width-1-x; yt = y;	break;		// top right
+		default:  throw "impossible origin value";
+		}
+
+		pixels[yt][xt].a = a;
+		pixels[yt][xt].r = r;
+		pixels[yt][xt].g = g;
+		pixels[yt][xt].b = b;
+	}
+}
+
+void Targa::fileinfo(const std::string& filename, int& width, int& height, bool& const_alpha)
+{
+	// Reports the image dimensions and whether alpha is constant at 255 (always true for 24 bpp files).
+	int bpp, origin;
+	FILE *fp = fopen(filename.c_str(), "rb");
+
+	if (fp == (FILE *) 0)
+		throw "Unable to open infile";
+
+	if (readTgaHeader(fp, width, height, bpp, origin) == 0)
+	{
+		fclose(fp);		// don't leak the handle on the error path
+		throw "Invalid or unimplemented format for infile, needs to be a 24 or 32 bit uncompressed TGA file";
+	}
+
+	// Assign the caller's flag directly: a same-named local declared here would shadow the
+	// out-parameter and leave it unset for 32 bpp files.
+	const_alpha = true;
+	if (bpp != 24)
+	{
+		// even if file is 32bpp the alpha may still be constant. so read file and check
+		Imf::Array2D<RGBA> pixels;
+		read_file(fp, pixels, width, height, bpp, origin);
+
+		for (int y=0; y<height && const_alpha; ++y)	// both loops stop at the first non-255 alpha
+		for (int x=0; x<width && const_alpha; ++x)
+			if (pixels[y][x].a != 255.0)
+				const_alpha = false;
+	}
+	fclose(fp);
+}
+
+
+void Targa::read(const std::string& filename, Imf::Array2D<RGBA>& pixels, int& width, int& height)
+{
+	// Loads a 24 or 32 bit uncompressed TGA file into pixels and reports its dimensions; throws a string on failure.
+	int bpp, origin;
+	FILE *fp = fopen(filename.c_str(), "rb");
+	if (fp == (FILE *) 0)
+		throw "Unable to open infile";
+
+	if (readTgaHeader(fp, width, height, bpp, origin) == 0)
+	{
+		fclose(fp);		// don't leak the handle on the error path
+		throw "Invalid or unimplemented format for infile, needs to be a 24 or 32 bit uncompressed TGA file";
+	}
+	read_file(fp, pixels, width, height, bpp, origin);
+	fclose(fp);
+}
+
+void Targa::write(const std::string& filename, const Imf::Array2D<RGBA>& pixels, int width, int height)
+{
+	FILE *fp = fopen(filename.c_str(), "wb");
+	if (fp == (FILE *) 0)
+		throw "Unable to open outfile";
+
+	// Build the 18-byte header. The file is always written as 32bpp, even if the alpha is constant 255.
+	unsigned char hdr[18];
+	memset(hdr, 0, sizeof(hdr));
+	hdr[2]  = 2;					// image type 2
+	hdr[12] = width & 0xFF;			// width, low byte then high byte
+	hdr[13] = width >> 8;
+	hdr[14] = height & 0xFF;		// height, low byte then high byte
+	hdr[15] = height >> 8;
+	hdr[16] = 32;					// bits per pixel
+	hdr[17] = 0x28;					// descriptor: 8 alpha bits, origin code 2
+	fwrite( hdr, sizeof(hdr), 1, fp );
+
+	// top to bottom order, each pixel stored as B, G, R, A bytes
+	for (int row = 0; row < height; ++row)
+	for (int col = 0; col < width; ++col)
+	{
+		const RGBA &px = pixels[row][col];
+		// add 0.5 and truncate each channel, then clamp into [RGBA_MIN, RGBA_MAX]
+		int chan[4];
+		chan[0] = int(px.b + 0.5f);
+		chan[1] = int(px.g + 0.5f);
+		chan[2] = int(px.r + 0.5f);
+		chan[3] = int(px.a + 0.5f);
+
+		for (int c = 0; c < 4; ++c)
+		{
+			const int v = chan[c];
+			fputc(v < RGBA_MIN ? RGBA_MIN : (v > RGBA_MAX ? RGBA_MAX : v), fp);
+		}
+	}
+	fclose(fp);
+}
diff --git a/src/nvtt/bc7/targa.h b/src/nvtt/bc7/targa.h
new file mode 100644
index 0000000..995df8e
--- /dev/null
+++ b/src/nvtt/bc7/targa.h
@@ -0,0 +1,30 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _targa_h_
+#define _targa_h_
+
+#include "ImfArray.h"
+#include "rgba.h"
+// TGA file access expressed as static functions over Imf::Array2D<RGBA> pixel arrays.
+class Targa
+{
+public:
+	Targa();
+	~Targa();
+	// fileinfo reports the image size and the const_alpha flag; read fills pixels and the size; write stores width x height pixels
+	static void fileinfo( const std::string& filename, int& width, int& height, bool& const_alpha);
+	static void read( const std::string& filename, Imf::Array2D<RGBA>& pixels, int& width, int& height );
+	static void write(const std::string& filename, const Imf::Array2D<RGBA>& pixels, int width, int height );
+};
+
+#endif /* _targa_h_ */
diff --git a/src/nvtt/bc7/tile.h b/src/nvtt/bc7/tile.h
new file mode 100644
index 0000000..620ae2b
--- /dev/null
+++ b/src/nvtt/bc7/tile.h
@@ -0,0 +1,67 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _TILE_H
+#define _TILE_H
+
+#include "ImfArray.h"
+#include <math.h>
+#include "arvo/Vec4.h"
+#include "utils.h"
+#include "rgba.h"
+
+using namespace Imf;
+using namespace ArvoMath;
+
+// A tile of up to TILE_W x TILE_H pixels held as Vec4 (X,Y,Z,W = r,g,b,a) and copied to or from an Array2D<RGBA>.
+// size_x/size_y give the used extent; a default-constructed tile is empty (0 x 0) until they are set.
+class Tile
+{
+public:
+	static const int TILE_H = 4;
+	static const int TILE_W = 4;
+	static const int TILE_TOTAL = TILE_H * TILE_W;
+	Vec4 data[TILE_H][TILE_W];
+	int	size_x, size_y;			// actual size of tile
+
+	Tile() : size_x(0), size_y(0) {}	// the extent used to be left indeterminate, which insert/extract then read as loop bounds
+	~Tile() {}
+	Tile(int xs, int ys) {size_x = xs; size_y = ys;}	// xs/ys are not range-checked against TILE_W/TILE_H
+
+	// pixels -> tile: copies the size_x x size_y region whose top-left corner is pixels[y][x] into data
+	void inline insert(const Array2D<RGBA> &pixels, int x, int y)
+	{
+		for (int y0=0; y0<size_y; ++y0)
+		for (int x0=0; x0<size_x; ++x0)
+		{
+			data[y0][x0].X() = (pixels[y+y0][x+x0]).r;
+			data[y0][x0].Y() = (pixels[y+y0][x+x0]).g;
+			data[y0][x0].Z() = (pixels[y+y0][x+x0]).b;
+			data[y0][x0].W() = (pixels[y+y0][x+x0]).a;
+		}
+	}
+
+	// tile -> pixels: writes the size_x x size_y region of data back with its top-left corner at pixels[y][x]
+	void inline extract(Array2D<RGBA> &pixels, int x, int y)
+	{
+		for (int y0=0; y0<size_y; ++y0)
+		for (int x0=0; x0<size_x; ++x0)
+		{
+			pixels[y+y0][x+x0].r = data[y0][x0].X();
+			pixels[y+y0][x+x0].g = data[y0][x0].Y();
+			pixels[y+y0][x+x0].b = data[y0][x0].Z();
+			pixels[y+y0][x+x0].a = data[y0][x0].W();
+		}
+	}
+};
+
+#endif
\ No newline at end of file
diff --git a/src/nvtt/bc7/utils.cpp b/src/nvtt/bc7/utils.cpp
new file mode 100644
index 0000000..c8036fe
--- /dev/null
+++ b/src/nvtt/bc7/utils.cpp
@@ -0,0 +1,391 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Utility and common routines
+
+#include "utils.h"
+#include "avpcl.h"
+#include <math.h>
+#include <assert.h>
+#include "rgba.h"
+#include "arvo/Vec3.h"
+#include "arvo/Vec4.h"
+
+static int denom7_weights[] = {0, 9, 18, 27, 37, 46, 55, 64};										// weights for steps 0..7, in units of 1/64 (consumers use >> 6 or / 64.0)
+static int denom15_weights[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};		// weights for steps 0..15, in units of 1/64; also used for denom 3 via index*5
+// Blends a (at i == 0) toward b (at i == denom). Default build: (a*(denom-i) + b*i + bias) / denom in integer arithmetic.
+int Utils::lerp(int a, int b, int i, int bias, int denom)
+{
+#ifdef	USE_ZOH_INTERP
+	assert (denom == 3 || denom == 7 || denom == 15);
+	assert (i >= 0 && i <= denom);
+	assert (bias >= 0 && bias <= denom/2);
+	assert (a >= 0 && b >= 0);
+	// table-driven variant: bias is only range-checked above; any rounding comes from 'round'
+	int round = 0;
+#ifdef	USE_ZOH_INTERP_ROUNDED
+	round = 32;		// half of 64, so the >> 6 below rounds to nearest instead of truncating
+#endif
+
+	switch (denom)
+	{
+	case 3:	denom *= 5; i *= 5;	// fall through to case 15
+	case 15:return (a*denom15_weights[denom-i] + b*denom15_weights[i] + round) >> 6;
+	case 7:	return (a*denom7_weights[denom-i] + b*denom7_weights[i] + round) >> 6;
+	default: assert(0); return 0;
+	}
+#else
+	return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom));		// simple exact interpolation
+#endif
+}
+// Vec4 overload of lerp: same weighting as the int version, evaluated through Vec4's arithmetic operators (declared in arvo/Vec4.h).
+Vec4 Utils::lerp(const Vec4& a, const Vec4 &b, int i, int bias, int denom)
+{
+#ifdef	USE_ZOH_INTERP
+	assert (denom == 3 || denom == 7 || denom == 15);
+	assert (i >= 0 && i <= denom);
+	assert (bias >= 0 && bias <= denom/2);
+//	assert (a >= 0 && b >= 0);
+
+	// no need to bias these as this is an exact division
+
+	switch (denom)
+	{
+	case 3:	denom *= 5; i *= 5;	// fall through to case 15
+	case 15:return (a*denom15_weights[denom-i] + b*denom15_weights[i]) / 64.0;
+	case 7:	return (a*denom7_weights[denom-i] + b*denom7_weights[i]) / 64.0;
+	default: assert(0); return 0;	// NOTE(review): relies on Vec4 being constructible from 0 -- confirm against arvo/Vec4.h
+	}
+#else
+	return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom));		// simple exact interpolation; NOTE(review): adds the int bias to a Vec4 expression -- confirm what that operator does
+#endif
+}
+
+// Expands a prec-bit code q back to a full-range value. Default (avpcl) build: bit replication into 8 bits.
+int Utils::unquantize(int q, int prec)
+{
+	int unq;
+
+	assert (prec > 3);	// we only want to do one replicate
+	assert (RGBA_MIN == 0);
+
+#ifdef USE_ZOH_QUANT
+	if (prec >= 8)		// nothing to expand
+		unq = q;
+	else if (q == 0) 	// the two end codes map exactly to 0 and RGBA_MAX
+		unq = 0;
+	else if (q == ((1<<prec)-1)) 
+		unq = RGBA_MAX;
+	else
+		unq = (q * (RGBA_MAX+1) + (RGBA_MAX+1)/2) >> prec;
+#else
+	// avpcl unquantizer -- bit replicate
+	unq = (q << (8-prec)) | (q >> (2*prec-8));	// NOTE(review): both shift counts are non-negative only for 4 <= prec <= 8; the assert above checks just the lower bound
+#endif
+
+	return unq;
+}
+
+// Rounds value to the nearest integer and maps it to a prec-bit code. Original note: quantize to the best value -- i.e., minimize unquantize error
+int Utils::quantize(float value, int prec)
+{
+	int q, unq;
+
+	assert (prec > 3);	// we only want to do one replicate
+	assert (RGBA_MIN == 0);
+
+	unq = (int)floor(value + 0.5);	// round to nearest integer
+	assert (unq >= RGBA_MIN && unq <= RGBA_MAX);	// the value is expected in range already; nothing is clamped here
+
+#ifdef USE_ZOH_QUANT
+	q = (prec >= 8) ? unq : (unq << prec) / (RGBA_MAX+1);
+#else
+	// avpcl quantizer -- scale properly for best possible bit-replicated result
+	q = (unq * ((1<<prec)-1) + RGBA_MAX/2)/RGBA_MAX;	// rescale 0..RGBA_MAX to 0..(2^prec - 1); + RGBA_MAX/2 rounds the division
+#endif
+
+	assert (q >= 0 && q < (1 << prec));
+
+	return q;
+}
+// Error between two 4-component values: the difference a - b, with X/Y/Z optionally scaled by channel weights, multiplied with itself.
+double Utils::metric4(const Vec4& a, const Vec4& b)
+{
+	Vec4 err = a - b;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		double rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299; gwt = 0.587; bwt = 0.114;
+		}
+		else if (AVPCL::flag_nonuniform_ati)	// the enclosing test guarantees this holds when the first flag is clear
+		{
+			rwt = 0.3086; gwt = 0.6094; bwt = 0.0820;
+		}
+
+		// weigh the components
+		err.X() *= rwt;
+		err.Y() *= gwt;
+		err.Z() *= bwt;	// W is left unweighted
+	}
+
+	return err * err;	// presumably Vec4's dot product, i.e. the squared length -- operator is declared in arvo/Vec4.h
+}
+// Error between two 3-component values; with a nonuniform flag set, X/Y/Z are scaled by weights that rotatemode can override.
+// WORK -- implement rotatemode for the below -- that changes where the rwt, gwt, and bwt's go.
+double Utils::metric3(const Vec3& a, const Vec3& b, int rotatemode)
+{
+	Vec3 err = a - b;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		double rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299; gwt = 0.587; bwt = 0.114;
+		}
+		else if (AVPCL::flag_nonuniform_ati)	// the enclosing test guarantees this holds when the first flag is clear
+		{
+			rwt = 0.3086; gwt = 0.6094; bwt = 0.0820;
+		}
+
+		// adjust weights based on rotatemode
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: break;	// keep the colour weights as chosen
+		case ROTATEMODE_RGBA_AGBR: rwt = 1.0; break;	// X slot gets weight 1.0 (presumably because it carries alpha in this mode)
+		case ROTATEMODE_RGBA_RABG: gwt = 1.0; break;	// likewise for the Y slot
+		case ROTATEMODE_RGBA_RGAB: bwt = 1.0; break;	// likewise for the Z slot
+		default: assert(0);
+		}
+
+		// weigh the components
+		err.X() *= rwt;
+		err.Y() *= gwt;
+		err.Z() *= bwt;
+	}
+
+	return err * err;	// presumably Vec3's dot product, i.e. the squared length -- operator is declared in arvo/Vec3.h
+}
+
+double Utils::metric1(const float a, const float b, int rotatemode)
+{
+	float err = a - b;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		double rwt, gwt, bwt, awt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299; gwt = 0.587; bwt = 0.114;
+		}
+		else if (AVPCL::flag_nonuniform_ati)
+		{
+			rwt = 0.3086; gwt = 0.6094; bwt = 0.0820;
+		}
+
+		// adjust weights based on rotatemode
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: awt = 1.0; break;
+		case ROTATEMODE_RGBA_AGBR: awt = rwt; break;
+		case ROTATEMODE_RGBA_RABG: awt = gwt; break;
+		case ROTATEMODE_RGBA_RGAB: awt = bwt; break;
+		default: assert(0);
+		}
+
+		// weigh the components
+		err *= awt;
+	}
+
+	return err * err;
+}
+// Scales channel r by alpha a in integer arithmetic: (R*A + RGBA_MAX/2) / RGBA_MAX, returned as a float.
+float Utils::premult(float r, float a)
+{
+	// note that the args are really integers stored in floats
+	int R = r, A = a;
+
+	assert ((R==r) && (A==a));	// checks the float -> int conversion lost nothing
+
+	return float((R*A + RGBA_MAX/2)/RGBA_MAX);	// + RGBA_MAX/2 rounds the integer division to nearest
+}
+// In-place premultiply: replaces X, Y and Z of rgba with Utils::premult(channel, W); W itself is not modified.
+static void premult4(Vec4& rgba)
+{
+	rgba.X() = Utils::premult(rgba.X(), rgba.W());
+	rgba.Y() = Utils::premult(rgba.Y(), rgba.W());
+	rgba.Z() = Utils::premult(rgba.Z(), rgba.W());
+}
+// In-place premultiply: replaces X, Y and Z of rgb with Utils::premult(channel, a), the alpha being passed separately.
+static void premult3(Vec3& rgb, float a)
+{
+	rgb.X() = Utils::premult(rgb.X(), a);
+	rgb.Y() = Utils::premult(rgb.Y(), a);
+	rgb.Z() = Utils::premult(rgb.Z(), a);
+}
+// Like metric4, but both inputs are copied and premultiplied by their own W (via premult4) before the difference is taken.
+double Utils::metric4premult(const Vec4& a, const Vec4& b)
+{
+	Vec4 pma = a, pmb = b;	// working copies; the arguments are not modified
+
+	premult4(pma);
+	premult4(pmb);
+
+	Vec4 err = pma - pmb;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		double rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299; gwt = 0.587; bwt = 0.114;
+		}
+		else if (AVPCL::flag_nonuniform_ati)	// the enclosing test guarantees this holds when the first flag is clear
+		{
+			rwt = 0.3086; gwt = 0.6094; bwt = 0.0820;
+		}
+
+		// weigh the components
+		err.X() *= rwt;
+		err.Y() *= gwt;
+		err.Z() *= bwt;	// W is left unweighted
+	}
+
+	return err * err;	// presumably Vec4's dot product, i.e. the squared length -- operator is declared in arvo/Vec4.h
+}
+// Error between two colours whose alpha is supplied separately: each rgb is premultiplied by its own alpha (premult3) first.
+double Utils::metric3premult_alphaout(const Vec3& rgb0, float a0, const Vec3& rgb1, float a1)
+{
+	Vec3 pma = rgb0, pmb = rgb1;	// working copies; the arguments are not modified
+
+	premult3(pma, a0);
+	premult3(pmb, a1);
+
+	Vec3 err = pma - pmb;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		double rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299; gwt = 0.587; bwt = 0.114;
+		}
+		else if (AVPCL::flag_nonuniform_ati)	// the enclosing test guarantees this holds when the first flag is clear
+		{
+			rwt = 0.3086; gwt = 0.6094; bwt = 0.0820;
+		}
+
+		// weigh the components
+		err.X() *= rwt;
+		err.Y() *= gwt;
+		err.Z() *= bwt;
+	}
+
+	return err * err;	// presumably Vec3's dot product, i.e. the squared length -- operator is declared in arvo/Vec3.h
+}
+// Error between two 3-component values where one slot (chosen by rotatemode) acts as the multiplier for the other two slots.
+double Utils::metric3premult_alphain(const Vec3& rgb0, const Vec3& rgb1, int rotatemode)
+{
+	Vec3 pma = rgb0, pmb = rgb1;	// working copies; the arguments are not modified
+
+	switch(rotatemode)
+	{
+	case ROTATEMODE_RGBA_RGBA:
+		// this function isn't supposed to be called for this rotatemode
+		assert(0);
+		break;
+	case ROTATEMODE_RGBA_AGBR:	// X is the multiplier; Y and Z are premultiplied by it
+		pma.Y() = premult(pma.Y(), pma.X());
+		pma.Z() = premult(pma.Z(), pma.X());
+		pmb.Y() = premult(pmb.Y(), pmb.X());
+		pmb.Z() = premult(pmb.Z(), pmb.X());
+		break;
+	case ROTATEMODE_RGBA_RABG:	// Y is the multiplier; X and Z are premultiplied by it
+		pma.X() = premult(pma.X(), pma.Y());
+		pma.Z() = premult(pma.Z(), pma.Y());
+		pmb.X() = premult(pmb.X(), pmb.Y());
+		pmb.Z() = premult(pmb.Z(), pmb.Y());
+		break;
+	case ROTATEMODE_RGBA_RGAB:	// Z is the multiplier; X and Y are premultiplied by it
+		pma.X() = premult(pma.X(), pma.Z());
+		pma.Y() = premult(pma.Y(), pma.Z());
+		pmb.X() = premult(pmb.X(), pmb.Z());
+		pmb.Y() = premult(pmb.Y(), pmb.Z());
+		break;
+	default: assert(0);
+	}
+
+	Vec3 err = pma - pmb;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		double rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299; gwt = 0.587; bwt = 0.114;
+		}
+		else if (AVPCL::flag_nonuniform_ati)	// the enclosing test guarantees this holds when the first flag is clear
+		{
+			rwt = 0.3086; gwt = 0.6094; bwt = 0.0820;
+		}
+
+		// weigh the components
+		err.X() *= rwt;	// NOTE(review): unlike metric3, the weights are not adjusted for rotatemode here -- confirm that is intended
+		err.Y() *= gwt;
+		err.Z() *= bwt;
+	}
+
+	return err * err;	// presumably Vec3's dot product, i.e. the squared length -- operator is declared in arvo/Vec3.h
+}
+
+double Utils::metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode)
+{
+	float err = premult(rgb0, a0) - premult(rgb1, a1);
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		double rwt, gwt, bwt, awt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299; gwt = 0.587; bwt = 0.114;
+		}
+		else if (AVPCL::flag_nonuniform_ati)
+		{
+			rwt = 0.3086; gwt = 0.6094; bwt = 0.0820;
+		}
+
+		// adjust weights based on rotatemode
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: awt = 1.0; break;
+		case ROTATEMODE_RGBA_AGBR: awt = rwt; break;
+		case ROTATEMODE_RGBA_RABG: awt = gwt; break;
+		case ROTATEMODE_RGBA_RGAB: awt = bwt; break;
+		default: assert(0);
+		}
+
+		// weigh the components
+		err *= awt;
+	}
+
+	return err * err;
+}
diff --git a/src/nvtt/bc7/utils.h b/src/nvtt/bc7/utils.h
new file mode 100644
index 0000000..5c08ffd
--- /dev/null
+++ b/src/nvtt/bc7/utils.h
@@ -0,0 +1,69 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// utility class holding common routines
+#ifndef _UTILS_H
+#define _UTILS_H
+
+#include "arvo/Vec4.h"
+
+using namespace ArvoMath;
+
+#ifndef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+
+#ifndef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#endif
+// PALETTE_LERP forwards its arguments unchanged to Utils::lerp (the int or Vec4 overload is picked by the argument types)
+#define	PALETTE_LERP(a, b, i, bias, denom)	Utils::lerp(a, b, i, bias, denom)
+// SIGN_EXTEND: if bit (nb-1) of x is set, ORs in every bit from nb upward. NOTE(review): (~0)<<(nb) left-shifts a negative int
+#define	SIGN_EXTEND(x,nb)	((((x)&(1<<((nb)-1)))?((~0)<<(nb)):0)|(x))
+
+#define	INDEXMODE_BITS 1		// 2 different index modes
+#define	NINDEXMODES	(1<<(INDEXMODE_BITS))
+#define	INDEXMODE_ALPHA_IS_3BITS 0
+#define	INDEXMODE_ALPHA_IS_2BITS 1
+
+#define	ROTATEMODE_BITS	2		// 4 different rotate modes
+#define	NROTATEMODES	(1<<(ROTATEMODE_BITS))
+#define	ROTATEMODE_RGBA_RGBA	0	// Utils::metric1 weights the scalar channel by 1.0 in this mode
+#define	ROTATEMODE_RGBA_AGBR	1	// Utils::metric3premult_alphain uses the X slot as the multiplier
+#define	ROTATEMODE_RGBA_RABG	2	// Utils::metric3premult_alphain uses the Y slot as the multiplier
+#define	ROTATEMODE_RGBA_RGAB	3	// Utils::metric3premult_alphain uses the Z slot as the multiplier
+
+class Utils
+{
+public:
+	// error metrics
+	static double metric4(const Vec4& a, const Vec4& b);
+	static double metric3(const Vec3& a, const Vec3& b, int rotatemode);
+	static double metric1(float a, float b, int rotatemode);
+
+	static double metric4premult(const Vec4& rgba0, const Vec4& rgba1);
+	static double metric3premult_alphaout(const Vec3& rgb0, float a0, const Vec3& rgb1, float a1);
+	static double metric3premult_alphain(const Vec3& rgb0, const Vec3& rgb1, int rotatemode);
+	static double metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode);
+
+	static float  Utils::premult(float r, float a);
+
+	// quantization and unquantization
+	static int unquantize(int q, int prec);
+	static int quantize(float value, int prec);
+
+	// lerping
+	static int lerp(int a, int b, int i, int bias, int denom);
+	static Vec4 lerp(const Vec4& a, const Vec4 &b, int i, int bias, int denom);
+};
+
+#endif
\ No newline at end of file